def run_rfe(df_feature, df_label):
    """Sweep the RFE feature count for a decision tree and plot the results.

    The features are encoded with ``dynamic_get_dummies``, split 70/30 with a
    fixed ``random_state`` and the training part is passed through
    ``over_sample``.  For every feature count from 1 to
    ``max_feature_try_numbers`` an RFE selector is fitted, a fresh decision
    tree is trained on the selected columns and the two values returned by
    ``get_accuracy`` (stored as class-1 precision and recall) are collected
    for ``plot_pre_recall``.

    Args:
        df_feature: Feature table handed to ``dynamic_get_dummies``.
        df_label: Labels matching ``df_feature``; must provide ``unique()``.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)

    # over sample (only the training split is resampled)
    train_x, train_y = over_sample(train_x, train_y)

    # the label set does not depend on the feature count: compute it once
    labels = df_label.unique()

    # build model
    nof_list = np.arange(1, max_feature_try_numbers + 1)
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        dc_tree = DecisionTreeClassifier(
            criterion='entropy', min_samples_split=20, random_state=99)
        # n_features_to_select must be passed by keyword on
        # scikit-learn >= 1.0 (the positional form is rejected there)
        rfe = RFE(dc_tree, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)

        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)

    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'decision tree')
def screen_rows(df):
    """Log the ``Status`` distribution and drop the rows with status UNKNOWN.

    The match is case-insensitive.  ``df`` is modified in place and nothing
    is returned.
    """
    save_print("STEP -- screen_rows")
    status_share = df['Status'].value_counts(normalize=True)
    save_print(status_share)

    # drop rows whose upper-cased status equals "UNKNOWN"
    is_unknown = df["Status"].str.upper().eq("UNKNOWN")
    unknown_labels = df.loc[is_unknown].index
    df.drop(index=unknown_labels, inplace=True)
def analysis_where(df):
    """Run the location ("where") analysis: checks, top-n plots and a scatter.

    Side effect: ``Division`` and ``Hood_ID`` are converted to ``str`` in
    ``df`` itself before they are plotted.
    """
    save_print("\nAnalysis where -- location")

    # missing / unique checks over the location columns
    for check in (check_missing, check_unique):
        check(df, col_where)

    # [Division]
    df['Division'] = df['Division'].astype(str)
    plot_top_n_info(df, 'Division')

    # [Location_Type]
    plot_top_n_info(df, 'Location_Type')

    # [Premise_Type]
    premise_counts = df['Premise_Type'].value_counts()
    plot_bar_value_counts(premise_counts, 'Premise_Type')

    # top n Location_Type restricted to Premise_Type == 'Other':
    # every row with a different premise type is dropped from a copy
    not_other_labels = df.loc[df['Premise_Type'].ne('Other')].index
    other_premise_df = df.drop(index=not_other_labels)
    plot_top_n_info(other_premise_df, 'Location_Type', 'Other_PType')

    # [Hood_ID]
    df['Hood_ID'] = df['Hood_ID'].astype(str)
    plot_top_n_info(df, 'Hood_ID')

    # [Lat/Long] number of ObjectId entries per coordinate pair
    per_coordinate = df.groupby(['Lat', 'Long'])['ObjectId'].count()
    plot_toronto_scatter(per_coordinate, 'Long', 'Lat', 'Lat_Long')
def fill_missing_bike_cost(df):
    """Fill missing ``Cost_of_Bike`` values with the median of their ``Bike_Type``.

    The per-type medians are logged first.  ``df`` is modified in place and
    nothing is returned.
    """
    save_print("\nSTEP -- fill_missing_bike_cost")

    # Take the median of the cost column only: a median over the whole frame
    # raises on non-numeric columns with pandas >= 2.0.
    cost_by_type = df.groupby('Bike_Type')['Cost_of_Bike']
    save_print(cost_by_type.median())

    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: that chained form is not guaranteed to reach ``df``
    # (it has no effect under pandas copy-on-write).
    df['Cost_of_Bike'] = df['Cost_of_Bike'].fillna(
        cost_by_type.transform('median'))
def run_rfe(df_feature, df_label):
    """Sweep the RFE feature count for logistic regression and plot the results.

    The features are encoded with ``dynamic_get_dummies``, split 70/30 with a
    fixed ``random_state`` and the training part is passed through
    ``over_sample``.  For every feature count from 1 to
    ``max_feature_try_numbers`` an RFE selector is fitted, a fresh logistic
    regression is trained on the selected columns, and its second
    ``predict_proba`` column is thresholded at ``lg_threshold`` to obtain 0/1
    predictions.  The two values returned by ``get_accuracy`` (stored as
    class-1 precision and recall) are collected for ``plot_pre_recall``.

    Args:
        df_feature: Feature table handed to ``dynamic_get_dummies``.
        df_label: Labels matching ``df_feature``; must provide ``unique()``.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)

    # over sample (only the training split is resampled)
    train_x, train_y = over_sample(train_x, train_y)

    # the label set does not depend on the feature count: compute it once
    labels = df_label.unique()

    # build model
    nof_list = np.arange(1, max_feature_try_numbers + 1)
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        # n_features_to_select must be passed by keyword on
        # scikit-learn >= 1.0 (the positional form is rejected there)
        rfe = RFE(lg_regression, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)

        # predict probs: keep the second probability column and apply the
        # configured threshold
        test_y_predict_prob = lg_regression.predict_proba(rfe_test_x)[:, 1]
        test_y_predict = pd.Series(
            np.where(test_y_predict_prob >= lg_threshold, 1, 0),
            name='predict')
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, test_y_predict,
            labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)

    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list,
                    'logistic regression')
def fill_missing_bike_color(df):
    """Fill missing ``Bike_Colour`` values from a per-``Bike_Type`` lookup.

    The lookup table comes from ``group_by_most_frequent``; it is indexed by
    ``Bike_Type`` and its ``Bike_Colour`` column is mapped onto every row.
    Only rows whose colour is null are overwritten.  ``df`` is modified in
    place.
    """
    save_print("\nSTEP -- fill_missing_bike_color")

    frequent_colours = group_by_most_frequent(df, ['Bike_Type'], 'Bike_Colour')
    colour_lookup = frequent_colours.set_index('Bike_Type')['Bike_Colour']
    colour_for_row = df['Bike_Type'].map(colour_lookup)

    # the assignment aligns on the index, so only the null rows are written
    colour_missing = df['Bike_Colour'].isna()
    df.loc[colour_missing, 'Bike_Colour'] = colour_for_row

    save_print("\nSTEP -- fill_missing_bike_color")
def cat_data(df):
    """One-hot encode every object-dtype column of ``df``.

    The names of the encoded columns are logged.

    Returns:
        The frame produced by ``pd.get_dummies`` (no indicator column for
        missing values, ``dummy_na=False``).
    """
    save_print("\nSTEP -- cat_data")
    # Series.iteritems() was removed in pandas 2.0; items() is the
    # equivalent that exists in old and new versions alike.
    categorical_cols = [
        col for col, col_type in df.dtypes.items() if col_type == 'O'
    ]
    save_print(categorical_cols)
    return pd.get_dummies(df, columns=categorical_cols, dummy_na=False)
def cal_corr_cost_speed(df):
    """Log the ``Bike_Speed`` / ``Cost_of_Bike`` correlation per ``Bike_Type``.

    One line of the form ``"<type> <correlation>"`` is written through
    ``save_print`` for every distinct value of ``Bike_Type``.
    """
    save_print('****\nSTEP -- cal_corr_cost_speed***')
    for u_type in df['Bike_Type'].unique():
        # select only the two columns needed instead of masking the whole
        # frame; the correlation sees the same rows either way
        of_type = df['Bike_Type'] == u_type
        corr = df.loc[of_type, 'Bike_Speed'].corr(
            df.loc[of_type, 'Cost_of_Bike'])
        # str(): unique() can return non-string labels (e.g. NaN for a
        # missing type), which cannot be concatenated to a str directly
        save_print(str(u_type) + " " + str(corr))
def run_once(df_feature, df_label):
    """Train, evaluate and persist a logistic regression on RFE-selected features.

    The features are encoded with ``dynamic_get_dummies``, split 70/30 with a
    fixed ``random_state`` and the training part is passed through
    ``over_sample``.  RFE keeps ``best_nof_feature`` columns, the model is
    trained on them, and its second ``predict_proba`` column is thresholded
    at ``lg_threshold`` before ``get_accuracy`` is called.  The model and the
    selected column names are written with ``joblib.dump`` to
    ``root_folder``.

    Args:
        df_feature: Feature table handed to ``dynamic_get_dummies``.
        df_label: Labels matching ``df_feature``; must provide ``unique()``.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)

    # over sample (only the training split is resampled)
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    # n_features_to_select must be passed by keyword on
    # scikit-learn >= 1.0 (the positional form is rejected there)
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()

    # predict probs: keep the second probability column and apply the
    # configured threshold
    test_y_predict_prob = lg_regression.predict_proba(rfe_test_x)[:, 1]
    test_y_predict = pd.Series(
        np.where(test_y_predict_prob >= lg_threshold, 1, 0), name='predict')
    get_accuracy("logistic regression predict_probs", test_y, test_y_predict,
                 labels)

    # print features: names of the columns flagged in rfe.support_
    support = pd.Series(rfe.support_, index=list(df_ohe.columns))
    selected_features_rfe = support[support].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)

    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
def run_once(df_feature, df_label):
    """Train, evaluate and persist a decision tree on RFE-selected features.

    The features are encoded with ``dynamic_get_dummies``, split 70/30 with a
    fixed ``random_state`` and the training part is passed through
    ``over_sample``.  RFE keeps ``best_nof_feature`` columns, the tree is
    trained on them and its test predictions are handed to ``get_accuracy``.
    The model and the selected column names are written with ``joblib.dump``
    to ``root_folder``.

    Args:
        df_feature: Feature table handed to ``dynamic_get_dummies``.
        df_label: Labels matching ``df_feature``; must provide ``unique()``.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)

    # over sample (only the training split is resampled)
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    dc_tree = DecisionTreeClassifier(
        criterion='entropy', min_samples_split=20, random_state=99)
    # n_features_to_select must be passed by keyword on
    # scikit-learn >= 1.0 (the positional form is rejected there)
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()

    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)

    # print features: names of the columns flagged in rfe.support_
    support = pd.Series(rfe.support_, index=list(df_ohe.columns))
    selected_features_rfe = support[support].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)

    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
def convert_value(df):
    """Normalise column values in ``df`` in place and plot the cleaned columns.

    ``Occurrence_Time`` is converted through ``convert_time``; trailing
    spaces, newlines and tabs are stripped from ``Bike_Make`` and
    ``Bike_Colour`` before each is passed to ``plot_top_n_info``.
    """
    save_print("\nSTEP -- convert_value")

    # col Occurrence_Time
    # -- convert
    convert_time(df)

    # col Bike_Make
    # -- trim from right; str.rstrip returns a new Series, so the result has
    #    to be assigned back for the trim to take effect
    df['Bike_Make'] = df['Bike_Make'].str.rstrip(' \n\t')
    plot_top_n_info(df, 'Bike_Make')

    # col Bike_Colour
    # -- trim from right (same reason for the assignment)
    df['Bike_Colour'] = df['Bike_Colour'].str.rstrip(' \n\t')
    plot_top_n_info(df, 'Bike_Colour')
def plot_top_n_info(df, col, msg=None):
    """Log the most frequent values of ``df[col]`` and save them as a bar chart.

    ``n`` is the number of leading ``value_counts()`` entries whose running
    total stays below ``len(df) * histogram_percentile``.  Those entries are
    logged and drawn as a horizontal bar chart saved under ``output/``; the
    file name is built from ``col``, ``current_df_name`` and, when given,
    ``msg``.

    Args:
        df: Frame to analyse.
        col: Name of the column to count.
        msg: Optional prefix for the PNG file name.

    Returns:
        list: The ``n`` selected values of ``col``, in ``value_counts()`` order.
    """
    # count once and reuse: the counts are needed for n, the log and the plot
    counts = df[col].value_counts()
    threshold = len(df) * histogram_percentile

    # NOTE(review): the entry that makes the running total reach the
    # threshold is itself excluded, so the selected values cover less than
    # the percentile named in the title -- confirm this is intended.
    n = 0
    running_total = 0
    for count in counts.values.tolist():
        running_total += count
        if running_total >= threshold:
            break
        n += 1

    top_counts = counts.iloc[:n]
    top_cols = top_counts.index.tolist()
    console_str = 'Top ' + str(n) + '/' + str(df[col].nunique()) + ' [' + col + \
                  '] value that contributes ' + str(histogram_percentile * 100) + '% ' + current_df_name + ' data'
    save_print(console_str)
    save_print(top_counts)

    if msg is None:
        file_name = 'output/' + col + '_top_n_' + current_df_name + '.png'
    else:
        file_name = 'output/' + msg + ' ' + col + '_top_n_' + current_df_name + '.png'

    # plot
    plt.rcdefaults()
    fig, ax = plt.subplots()
    try:
        fig.set_size_inches(15, 8)
        ax.barh(top_cols, top_counts.values.tolist(), align='center',
                color=decide_color())
        ax.set_ylabel(col, labelpad=10, weight='bold', size=12)
        ax.set_xlabel('value count', labelpad=3, weight='bold', size=12)
        ax.set_title(console_str)
        ax.invert_yaxis()
        # write each bar's count at the end of the bar
        for p in ax.patches:
            ax.annotate(str(p.get_width()),
                        (p.get_width(), p.get_y() + p.get_height() * 0.75))
        fig.savefig(file_name)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each call would leave one more open figure behind
        plt.close(fig)
    return top_cols
def analysis_what(df):
    """Run the bike ("what") analysis: checks, plots and missing-value fills.

    ``Bike_Colour`` and ``Cost_of_Bike`` are passed to their fill helpers
    before being plotted, and the missing-value check is repeated at the end.
    """
    save_print("\nAnalysis what -- bike")

    # missing / unique checks over the bike columns
    for check in (check_missing, check_unique):
        check(df, col_what)

    # [Bike_Make], [Bike_Type]
    for column in ('Bike_Make', 'Bike_Type'):
        plot_top_n_info(df, column)

    # [Bike_Speed]
    plot_box(df, 'Bike_Speed')
    save_print('****Bike_Speed group by Bike_Type describe:****')
    speed_by_type = df.groupby(['Bike_Type'])['Bike_Speed']
    save_print(speed_by_type.describe())
    plot_histogram(df, 'Bike_Speed')

    # [Bike_Colour]
    fill_missing_bike_color(df)
    plot_top_n_info(df, 'Bike_Colour')

    # [Cost_of_Bike]
    fill_missing_bike_cost(df)
    plot_box(df, 'Cost_of_Bike')
    cost_steps = np.arange(0, 5000, 50)
    plot_histogram(df, 'Cost_of_Bike', cost_steps, True)

    # [Bike_Speed, Cost_of_Bike]
    cal_corr_cost_speed(df)

    # recheck missing after the fills
    check_missing(df, col_what)
def analysis_when(df):
    """Run the time ("when") analysis: checks plus year/month/day bar plots.

    When the module-level ``convert_hour`` flag is truthy, ``Occurrence_Time``
    is replaced in ``df`` by the integer before the first ':'.
    """
    def _leading_int(parts):
        # parts is the list produced by splitting the time text on ':'
        return int(parts[0])

    save_print("\nAnalysis when -- time")
    check_missing(df, col_when)
    check_unique(df, col_when)

    # (column, plot title, extra positional arguments for the plot helper)
    bar_plots = (
        ('Occurrence_Year', 'Annual_Status', ()),
        ('Occurrence_Month', 'Month_Status', (1.0,)),
        ('Occurrence_Day', 'Daily_Status', (2.0,)),
    )
    for column, title, extra_args in bar_plots:
        theft_counts = df[column].value_counts()
        plot_bar_value_counts(theft_counts, title, *extra_args)

    # [Occurrence_Time]
    if not convert_hour:
        return
    time_parts = df['Occurrence_Time'].str.split(':')
    df['Occurrence_Time'] = time_parts.apply(_leading_int)
def replace_status(df):
    """Recode the ``Status`` column of ``df`` in place.

    'STOLEN' becomes ``status_stolen`` and 'RECOVERED' becomes
    ``status_recover``; every other value is left as it is.
    """
    save_print("STEP -- replace status")
    replacements = (
        ('STOLEN', status_stolen),
        ('RECOVERED', status_recover),
    )
    # applied one after the other; each mask is computed on the current column
    for raw_value, coded_value in replacements:
        matches = df['Status'].eq(raw_value)
        df.loc[matches, 'Status'] = coded_value
def screen_cols(df):
    """Drop redundant columns from ``df`` in place, logging every decision.

    ``Occurrence_Date`` is always removed.  The other candidates are removed
    only when their check passes:

    * ``X`` / ``Y``: equal to ``Long`` / ``Lat`` after rounding to 4 decimals.
    * ``Index_``: as many distinct values as rows.
    * ``event_unique_id``: at least 80% as many distinct values as rows.
    * ``City``: a single distinct value.
    * ``Neighbourhood``: every neighbourhood maps to exactly one ``Hood_ID``.

    ``ObjectId`` is checked and logged but never removed.
    """
    save_print("STEP -- screen_cols")
    exclude_cols = ['Occurrence_Date']

    def _report(question, verdict, column=None, label=None):
        # Log the outcome of one check; when it passed and a column was
        # given, log the removal and queue the column for dropping.
        save_print(question + str(verdict))
        if verdict and column is not None:
            save_print("Therefore remove col " + (label or column))
            exclude_cols.append(column)

    # check X -- Long is identical
    _report("Is col X and col Long identical? ",
            df['X'].round(4).equals(df['Long'].round(4)), 'X')

    # check Y -- Lat
    _report("Is col Y and col Lat identical? ",
            df['Y'].round(4).equals(df['Lat'].round(4)), 'Y')

    # check col Index is just id
    _report("Is col Index_ an id col? ",
            df['Index_'].nunique() == len(df), 'Index_', label='Index')

    # check col event_unique_id is just id
    _report("Is col event_unique_id an id col? ",
            df['event_unique_id'].nunique() >= len(df) * 0.8,
            'event_unique_id')

    # check col ObjectId is just id -- reported only, removal is disabled
    _report("Is col ObjectId an id col? ",
            df['ObjectId'].nunique() == len(df))

    # check City is all Toronto
    _report("Is col City only one value Toronto? ",
            df['City'].nunique() == 1, 'City')

    # check Neighbourhood is one to one with Hood_ID: every neighbourhood
    # must map to exactly one Hood_ID (merely sharing the same number of
    # ids across neighbourhoods is not enough).
    # NOTE(review): the reverse direction (one Neighbourhood per Hood_ID)
    # is not verified -- confirm whether it should be.
    hood_ids_per_name = df.groupby('Neighbourhood')['Hood_ID'].nunique()
    bool_is_121 = bool(not hood_ids_per_name.empty
                       and hood_ids_per_name.eq(1).all())
    _report('Is col Neighbourhood one to one with Hood_ID? ',
            bool_is_121, 'Neighbourhood')

    # print all the removed columns
    save_print('\n***The columns will be removed are: ***')
    save_print(exclude_cols)

    # drop columns
    df.drop(exclude_cols, axis=1, inplace=True)
def print_basic_info(df):
    """Log column names, dtypes and the shapes of the two status subsets.

    Returns:
        tuple: ``(df_stolen, df_recover)`` -- the rows of ``df`` whose
        ``Status`` equals ``status_stolen`` and ``status_recover``.
    """
    save_print("\nSTEP -- print_basic_info")

    save_print('****column values are:****')
    save_print(df.columns.values)

    save_print('\n****column type are:****')
    save_print(df.dtypes)

    # split into two df, logging the shape of each
    shape_headers = (
        (status_stolen, '\n****column shape df_stolen:****'),
        (status_recover, '\n****column shape df_recover:****'),
    )
    subsets = []
    for status_value, header in shape_headers:
        subset = df.loc[df['Status'].eq(status_value)]
        save_print(header)
        save_print(subset.shape)
        subsets.append(subset)

    df_stolen, df_recover = subsets
    return df_stolen, df_recover
def convert_time(df):
    """Replace ``Occurrence_Time`` in ``df`` with ``first * 60 + second``.

    Each value is split on ':'; the first two parts are read as integers and
    combined as above (minutes, if the text is hours:minutes).  The first
    five converted values are logged.
    """
    def _combine(parts):
        # only the first two ':'-separated parts are used
        return 60 * int(parts[0]) + int(parts[1])

    save_print("\nSTEP -- convert_time")
    time_parts = df['Occurrence_Time'].str.split(':')
    df['Occurrence_Time'] = time_parts.apply(_combine)
    save_print(df['Occurrence_Time'].head(5))