def data_prepare():
    """Load the EDA-processed train/test frames and derive modelling inputs.

    Returns:
        df_train: training frame with the LABEL and ID columns dropped.
        df_test: test frame as produced by base_data_process.eda().
        label: the index-encoded label series extracted from df_train.
        label_one_hot: one-hot (dummy) encoding of ``label``.
        feats: df_train column names that are NOT in ``category_list``.
        category_encode_size_map: {categorical column -> distinct value count}.
    """
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=False, scale=True)

    # Convert raw label values to integer indices in place, then split the
    # label off from the feature frame.
    base_data_process.label2index(df_train, LABEL)
    label = df_train[LABEL]
    df_train.drop(columns=[LABEL], inplace=True)
    df_train.drop(columns=[ID], inplace=True)
    label_one_hot = pd.get_dummies(label)

    # Non-categorical columns are treated as plain features.
    feats = [f for f in df_train.columns if f not in category_list]
    log.info('feats are {}'.format(feats))

    # Cardinality of each categorical column (e.g. for sizing embeddings).
    category_encode_size_map = {}
    for c in category_list:
        if c not in df_train.columns:
            log.warn('{} not in df'.format(c))
            continue
        # Compute the distinct-value count once instead of twice.
        n_classes = len(df_train[c].unique())
        category_encode_size_map[c] = n_classes
        log.info('{} has {} classes'.format(c, n_classes))

    return df_train, df_test, label, label_one_hot, feats, category_encode_size_map
def data_prepare(df_train, df_test):
    """Normalize the continuous columns, index-encode the label, and write
    both frames out as modified CSV files under ../../origin_data/.
    """
    # Columns treated as continuous and run through normalize_process.
    continuous_columns = [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'contract_time', 'former_complaint_fee', 'former_complaint_num',
        'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
        'month_traffic', 'online_time', 'pay_num', 'pay_times',
        'service1_caller_time', 'service2_caller_time',
        'pay_num_per_time', 'll',
    ]
    normalize_process(df_train, df_test, continuous_columns)

    # label 2 index
    base_data_process.label2index(df_train, LABEL)

    log.info('current path: {}'.format(os.getcwd()))

    # Persist the prepared frames next to the original data.
    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
def data_prepare(df_train, df_test):
    """Normalize continuous features, index-encode the label, dump the
    label encode/decode mapping to a pickle, and save both frames as CSV.
    """
    # Continuous feature columns handed to normalize_process.
    feature_names = [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'contract_time', 'former_complaint_fee', 'former_complaint_num',
        'last_month_traffic', 'local_caller_time', 'local_trafffic_month',
        'month_traffic', 'online_time', 'pay_num', 'pay_times',
        'service1_caller_time', 'service2_caller_time',
        'pay_num_per_time', 'll',
    ]
    normalize_process(df_train, df_test, feature_names)

    # label 2 index
    base_data_process.label2index(df_train, LABEL)

    # Persist the label mapping so predictions can be decoded back later.
    base_util.pickle_dump(
        (base_data_process.encode_map, base_data_process.decode_list),
        '../../origin_data/label2index.pkl')

    with timer('save train data'):
        df_train.to_csv('../../origin_data/train_modified.csv', index=False)
    with timer('save test data'):
        df_test.to_csv('../../origin_data/test_modified.csv', index=False)
# NOTE(review): the next two statements are the tail of a function whose
# definition starts before this chunk — left byte-identical.
df['predict'] = index2label(y_pre)
df.to_csv('result{}.csv'.format(name), index=False)


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance across folds
    and save the figure to lgbm_importances01.png.
    """
    # Mean importance per feature, sorted descending; keep the top-40 names.
    cols = feature_importance_df_[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    # Keep every fold-level row for those features so the barplot shows spread.
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    # Create the artifact directory on first run.
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
# NOTE(review): `11, 'verbose': -1 }` is the tail of a hyperparameter
# search-space dict whose opening lines are outside this chunk; the
# statements after it appear to be the body of the optimization routine.
# Indentation reconstructed — confirm against the full file.
    11, 'verbose': -1
    }
    trials = Trials()
    with timer('optimization'):
        # Run optimization
        best = fmin(fn=objective, space=space, algo=tpe.suggest,
                    trials=trials, max_evals=config_dict['max_evals'])
    print('-' * 100)
    log.warn(best)
    # Persist the hyperopt trial history for later inspection.
    with open('model_trials.pkl', mode='wb') as mt:
        pickle.dump(trials, mt)


# Shared config; 'train' is replaced with the real frame before optimization().
config_dict = {'train': pd.DataFrame(), 'max_evals': 1000}

if __name__ == '__main__':
    df_train, df_test = eda(True, False)
    # NOTE(review): .iloc[:, :] copies df_train BEFORE label2index mutates it,
    # so config_dict['train'] keeps un-encoded labels — confirm this is intended.
    config_dict['train'] = df_train.iloc[:, :]
    label2index(df_train, 'current_service')
    optimization()