def run_rfe(df_feature, df_label):
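    # sweep the number of RFE-selected features from 1 to
    # max_feature_try_numbers with a decision tree, recording class-1
    # precision/recall for each feature count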
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current number of features: " + str(nof))
        dc_tree = DecisionTreeClassifier(criterion='entropy',
                                         min_samples_split=20,
                                         random_state=99)
        rfe = RFE(dc_tree, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'decision tree')
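
# dynamic_get_dummies, over_sample, and save_print are project helpers defined
# elsewhere; minimal sketches of what they are assumed to do (hypothetical
# reconstructions, not the project's actual code):
def save_print(msg):
    # assumption: echo to console (the real helper likely also appends to a log file)
    print(msg)


def dynamic_get_dummies(df_feature):
    # assumption: one-hot encode every object-typed column (cf. cat_data below)
    categorical_cols = [c for c, t in df_feature.dtypes.items() if t == 'O']
    return pd.get_dummies(df_feature, columns=categorical_cols, dummy_na=False)


def over_sample(train_x, train_y):
    # assumption: balance the classes with imblearn's RandomOverSampler
    from imblearn.over_sampling import RandomOverSampler
    sampler = RandomOverSampler(random_state=99)
    return sampler.fit_resample(train_x, train_y)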
Example #2
def screen_rows(df):
    save_print("STEP -- screen_rows")
    save_print(df['Status'].value_counts(normalize=True))
    row_filter = df["Status"].str.upper() == "UNKNOWN"
    index_names = df[row_filter].index
    # drop rows
    df.drop(index_names, axis=0, inplace=True)
Example #3
def analysis_where(df):
    save_print("\nAnalysis where -- location")
    # check missing
    check_missing(df, col_where)
    # check unique
    check_unique(df, col_where)

    # [Division]
    df['Division'] = df['Division'].astype(str)
    plot_top_n_info(df, 'Division')

    # [Location_Type]
    plot_top_n_info(df, 'Location_Type')

    # [Premise_Type]
    pt_vc = df['Premise_Type'].value_counts()
    plot_bar_value_counts(pt_vc, 'Premise_Type')
    # plot top n for Premise_Type == 'Other'
    row_filter = df['Premise_Type'] == 'Other'
    filter_df = df[row_filter]
    plot_top_n_info(filter_df, 'Location_Type', 'Other_PType')

    # [Hood_ID]
    df['Hood_ID'] = df['Hood_ID'].astype(str)
    plot_top_n_info(df, 'Hood_ID')

    # [Lat/Long]
    grouped = df.groupby(['Lat', 'Long'])['ObjectId'].count()
    plot_toronto_scatter(grouped, 'Long', 'Lat', 'Lat_Long')
Example #4
def fill_missing_bike_cost(df):
    save_print("\nSTEP -- fill_missing_bike_cost")
    median = df.groupby('Bike_Type')['Cost_of_Bike'].median()
    save_print(median)
    df['Cost_of_Bike'] = df['Cost_of_Bike'].fillna(
        df.groupby('Bike_Type')['Cost_of_Bike'].transform('median'))
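
# A minimal, hypothetical demo of the group-median fill used above: each NaN
# gets the median Cost_of_Bike of its own Bike_Type group.
def demo_fill_by_group_median():
    demo = pd.DataFrame({'Bike_Type': ['MT', 'MT', 'RD', 'RD'],
                         'Cost_of_Bike': [100.0, None, 900.0, None]})
    demo['Cost_of_Bike'] = demo['Cost_of_Bike'].fillna(
        demo.groupby('Bike_Type')['Cost_of_Bike'].transform('median'))
    return demo  # the MT NaN becomes 100.0, the RD NaN becomes 900.0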
Example #5
def run_rfe(df_feature, df_label):
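    # same RFE sweep as above, but with logistic regression and a custom
    # probability cut-off (lg_threshold) for the class-1 decision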
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current number of features: " + str(nof))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        rfe = RFE(lg_regression, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)
        labels = df_label.unique()
        # predict probs
        test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
        test_y_predict_prob = test_y_predict_probs[:, 1]
        prob_df = pd.DataFrame(test_y_predict_prob)
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'],
            labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'logistic regression')
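
# get_accuracy and plot_pre_recall are project helpers defined elsewhere; a
# minimal sketch of what get_accuracy is assumed to do (hypothetical
# reconstruction using sklearn.metrics, assuming class 1 = stolen is the
# positive class):
def get_accuracy(model_name, test_y, test_y_predict, labels):
    from sklearn.metrics import confusion_matrix, precision_score, recall_score
    save_print(model_name + " confusion matrix:")
    save_print(confusion_matrix(test_y, test_y_predict, labels=labels))
    class_1_precision = precision_score(test_y, test_y_predict, pos_label=1)
    class_1_recall = recall_score(test_y, test_y_predict, pos_label=1)
    return class_1_precision, class_1_recall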
Example #6
def fill_missing_bike_color(df):
    save_print("\nSTEP -- fill_missing_bike_color")
    grp_by_cols = ['Bike_Type']
    cal_col = 'Bike_Colour'
    tmp_df = group_by_most_frequent(df, grp_by_cols, cal_col)
    s = df['Bike_Type'].map(tmp_df.set_index('Bike_Type')['Bike_Colour'])
    df.loc[df['Bike_Colour'].isnull(), 'Bike_Colour'] = s
    save_print("\nSTEP -- fill_missing_bike_color")
Example #7
def cat_data(df):
    save_print("\nSTEP -- cat_data")
    categorical_cols = []
    for col, col_type in df.dtypes.items():
        if col_type == 'O':
            categorical_cols.append(col)
    save_print(categorical_cols)
    return pd.get_dummies(df, columns=categorical_cols, dummy_na=False)
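
# usage sketch (hypothetical frame): object columns become indicator columns,
# e.g. cat_data(pd.DataFrame({'Division': ['D1', 'D2'], 'Cost': [100.0, 900.0]}))
# yields the columns Cost, Division_D1, Division_D2.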
Example #8
def cal_corr_cost_speed(df):
    save_print('\n****STEP -- cal_corr_cost_speed****')
    # fig, axs = plt.subplots(nrows=7, ncols=2, figsize=(6, 6), facecolor='w', edgecolor='k')
    # fig.subplots_adjust(hspace=.5, wspace=.001)

    unique_types = df['Bike_Type'].unique()
    # for u_type, ax in zip(unique_types, axs.flat):
    for u_type in unique_types:
        filter_cond = df["Bike_Type"] == u_type
        tmp_df = df[filter_cond]
        corr = tmp_df['Bike_Speed'].corr(tmp_df['Cost_of_Bike'])
        save_print(u_type + " " + str(corr))
Example #9
def run_once(df_feature, df_label):
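    # train and persist one logistic-regression model on the best_nof_feature
    # RFE-selected features found by the sweeps above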
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict probs
    test_y_predict_probs = lg_regression.predict_proba(rfe_test_x)
    test_y_predict_prob = test_y_predict_probs[:, 1]
    prob_df = pd.DataFrame(test_y_predict_prob)
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y,
                 prob_df['predict'], labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
def run_once(df_feature, df_label):
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(df_ohe,
                                                        df_label,
                                                        test_size=0.3,
                                                        random_state=99)
    # dbg_recover = pd.concat([test_x, test_y], axis=1)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)
    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy',
                                     min_samples_split=20,
                                     random_state=99)
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()
    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)
    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)
    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
Example #11
def convert_value(df):
    save_print("\nSTEP -- convert_value")
    # col Occurrence_Date
    # -- convert
    convert_time(df)
    # col Bike_Make
    # -- trim from right
    df['Bike_Make'] = df['Bike_Make'].str.rstrip(' \n\t')
    # -- convert
    plot_top_n_info(df, 'Bike_Make')
    # col Bike_Colour
    # -- trim from right spaces
    df['Bike_Colour'] = df['Bike_Colour'].str.rstrip(' \n\t')
    # -- convert
    plot_top_n_info(df, 'Bike_Colour')
Example #12
def plot_top_n_info(df, col, msg=None):
    # this method will print top n index at console
    # also plot png for histogram on hard drive
    value_counts = df[col].value_counts()
    threshold = len(df) * histogram_percentile
    # count how many of the most frequent values it takes to reach the threshold
    sum_value = 0
    n = 0
    for value in value_counts.values.tolist():
        sum_value += value
        if sum_value >= threshold:
            break
        n += 1
    top_cols = value_counts[:n].index.tolist()
    console_str = ('Top ' + str(n) + '/' + str(df[col].nunique()) + ' [' + col +
                   '] values that contribute ' + str(histogram_percentile * 100) +
                   '% of ' + current_df_name + ' data')
    save_print(console_str)
    save_print(value_counts[:n])

    # plot
    plt.rcdefaults()
    # plt.autoscale(tight=False)
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 8)

    ax.barh(value_counts[:n].index.tolist(),
            value_counts[:n].values.tolist(),
            align='center',
            color=decide_color())
    ax.set_ylabel(col, labelpad=10, weight='bold', size=12)
    ax.set_xlabel('value count', labelpad=3, weight='bold', size=12)
    ax.set_title(console_str)
    # ax.yaxis.tick_right()
    ax.invert_yaxis()
    for p in ax.patches:
        ax.annotate(str(p.get_width()),
                    (p.get_width(), p.get_y() + p.get_height() * 0.75))
    if msg is None:
        file_name = 'output/' + col + '_top_n_' + current_df_name + '.png'
    else:
        file_name = 'output/' + msg + ' ' + col + '_top_n_' + current_df_name + '.png'
    fig.savefig(file_name)
    return top_cols
Example #13
def analysis_what(df):
    save_print("\nAnalysis what -- bike")
    # check missing
    check_missing(df, col_what)
    # check unique
    check_unique(df, col_what)

    # [Bike_Make]
    plot_top_n_info(df, 'Bike_Make')
    # [Bike_Type]
    plot_top_n_info(df, 'Bike_Type')
    # [Bike_Speed]
    plot_box(df, 'Bike_Speed')
    save_print('****Bike_Speed group by Bike_Type describe:****')
    save_print(df.groupby(['Bike_Type'])['Bike_Speed'].describe())
    plot_histogram(df, 'Bike_Speed')
    # [Bike_Colour]
    fill_missing_bike_color(df)
    plot_top_n_info(df, 'Bike_Colour')
    # [Cost_of_Bike]
    fill_missing_bike_cost(df)
    plot_box(df, 'Cost_of_Bike')
    plot_histogram(df, 'Cost_of_Bike', np.arange(0, 5000, 50), True)
    # [Bike_Speed, Cost_of_Bike]
    cal_corr_cost_speed(df)

    # recheck missing
    check_missing(df, col_what)
Example #14
def analysis_when(df):
    save_print("\nAnalysis when -- time")
    # check missing
    check_missing(df, col_when)
    # check unique
    check_unique(df, col_when)

    # [Occurrence_Year]
    annual_theft = df['Occurrence_Year'].value_counts()
    plot_bar_value_counts(annual_theft, 'Annual_Status')

    # [Occurrence_Month]
    month_theft = df['Occurrence_Month'].value_counts()
    plot_bar_value_counts(month_theft, 'Month_Status', 1.0)

    # [Occurrence_Day]
    day_theft = df['Occurrence_Day'].value_counts()
    plot_bar_value_counts(day_theft, 'Daily_Status', 2.0)

    # [Occurrence_Time]
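    # when convert_hour is set, keep only the hour component,
    # e.g. '01:30' -> 1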
    if convert_hour:
        df['Occurrence_Time'] = df['Occurrence_Time'].str.split(':').apply(
            lambda x: int(x[0]))
Example #15
def replace_status(df):
    save_print("STEP -- replace status")
    # replace STOLEN as 1 and RECOVERED as 0
    df.loc[df['Status'] == 'STOLEN', 'Status'] = status_stolen
    df.loc[df['Status'] == 'RECOVERED', 'Status'] = status_recover
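    # note: Status stays object dtype after this replacement; downstream code
    # may need to cast it, e.g. df['Status'].astype(int)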
Example #16
def screen_cols(df):
    save_print("STEP -- screen_cols")
    exclude_cols = ['Occurrence_Date']
    # check X -- Long is identical
    bool_x_long = df['X'].round(4).equals(df['Long'].round(4))
    save_print("Is col X and col Long identical? " + str(bool_x_long))
    if bool_x_long:
        save_print("Therefore remove col X")
        exclude_cols.append('X')

    # check Y -- Lat
    bool_y_lat = df['Y'].round(4).equals(df['Lat'].round(4))
    save_print("Is col Y and col Lat identical? " + str(bool_y_lat))
    if bool_y_lat:
        save_print("Therefore remove col Y")
        exclude_cols.append('Y')

    # check col Index is just id
    bool_is_index_id = df['Index_'].nunique() == len(df)
    save_print("Is col Index_ an id col? " + str(bool_is_index_id))
    if bool_is_index_id:
        save_print("Therefore remove col Index")
        exclude_cols.append('Index_')

    # check col event_unique_id is just id
    bool_is_event_uid_id = df['event_unique_id'].nunique() >= len(df) * 0.8
    save_print("Is col event_unique_id an id col? " + str(bool_is_event_uid_id))
    if bool_is_event_uid_id:
        save_print("Therefore remove col event_unique_id")
        exclude_cols.append('event_unique_id')

    # check col ObjectId is just id
    bool_is_obj_id = df['ObjectId'].nunique() == len(df)
    save_print("Is col ObjectId an id col? " + str(bool_is_obj_id))
    # if bool_is_obj_id:
    #     save_print("Therefore remove col ObjectId")
    #     exclude_cols.append('ObjectId')

    # check City is all Toronto
    bool_is_city_all_toronto = df['City'].nunique() == 1
    save_print("Is col City only one value Toronto? " + str(bool_is_city_all_toronto))
    if bool_is_city_all_toronto:
        save_print("Therefore remove col City")
        exclude_cols.append('City')

    # check Neighbourhood is one to one with Hood_ID
    bool_is_121 = len(df.groupby('Neighbourhood')['Hood_ID'].nunique().drop_duplicates()) == 1
    save_print('Is col Neighbourhood one to one with Hood_ID? ' + str(bool_is_121))
    if bool_is_121:
        save_print("Therefore remove col Neighbourhood")
        exclude_cols.append('Neighbourhood')

    # print all the removed columns
    save_print('\n***The columns to be removed are:***')
    save_print(exclude_cols)
    # drop columns
    df.drop(exclude_cols, axis=1, inplace=True)
Example #17
def print_basic_info(df):
    save_print("\nSTEP -- print_basic_info")
    save_print('****column values are:****')
    save_print(df.columns.values)
    save_print('\n****column type are:****')
    save_print(df.dtypes)
    # split into two df
    cond_stolen = df['Status'] == status_stolen
    df_stolen = df[cond_stolen]
    save_print('\n****shape of df_stolen:****')
    save_print(df_stolen.shape)
    cond_recover = df['Status'] == status_recover
    df_recover = df[cond_recover]
    save_print('\n****shape of df_recover:****')
    save_print(df_recover.shape)
    # save_print('\n****column describe are:****')
    # save_print(df.describe())
    # save_print('\n****top 5 rows are:****')
    # save_print(df.head(5))
    return df_stolen, df_recover
Example #18
def convert_time(df):
    save_print("\nSTEP -- convert_time")
    df['Occurrence_Time'] = df['Occurrence_Time'].str.split(':').apply(
        lambda x: int(x[0]) * 60 + int(x[1]))
    save_print(df['Occurrence_Time'].head(5))