def data_prepare():
    """Load preprocessed train/test frames and derive the training inputs.

    Returns a 6-tuple:
        df_train      -- feature frame with the LABEL and ID columns dropped
        df_test       -- test frame exactly as returned by eda()
        label         -- integer-encoded label Series (after label2index)
        label_one_hot -- one-hot encoded DataFrame of the label
        feats         -- non-categorical feature column names
        category_encode_size_map -- {categorical column: number of classes}
    """
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=False,
                                              scale=True)
    # Encode string labels to integer indices in place.
    base_data_process.label2index(df_train, LABEL)
    label = df_train[LABEL]
    # Neither the target nor the identifier is a model feature; drop both
    # in a single pass instead of two separate drop() calls.
    df_train.drop(columns=[LABEL, ID], inplace=True)
    label_one_hot = pd.get_dummies(label)
    feats = [f for f in df_train.columns if f not in category_list]
    log.info('feats are {}'.format(feats))
    category_encode_size_map = {}
    for c in category_list:
        if c not in df_train.columns:
            log.warn('{} not in df'.format(c))
            continue
        # Compute the cardinality once; the original evaluated
        # len(df_train[c].unique()) twice per column.
        n_classes = len(df_train[c].unique())
        category_encode_size_map[c] = n_classes
        log.info('{} has {} classes'.format(c, n_classes))
    return df_train, df_test, label, label_one_hot, feats, category_encode_size_map
# NOTE(review): orphaned tail of a function whose `def` precedes this chunk;
# preserved verbatim for reference:
#     return base_util.pickle_load('../../origin_data/label2index.pkl')


def batch_yield(df, batch_size, label_col=None):
    """Shuffle *df* and yield (features, labels) batches.

    batch_size == -1 yields the whole frame as one batch.  Rows beyond the
    last full batch (len(df) % batch_size of them) are silently dropped.

    label_col defaults to the module-level LABEL column name; passing it
    explicitly makes the generator usable without that global.
    """
    if label_col is None:
        label_col = LABEL
    if batch_size == -1:
        batch_size = len(df)
    # sample(frac=1) is equivalent to a full shuffle.
    df = df.sample(frac=1)
    # len(df) // batch_size * batch_size <= len(df)
    total_batch = len(df) // batch_size
    for i in range(total_batch):
        chunk = df.iloc[i * batch_size:(i + 1) * batch_size, :]
        labels = chunk[label_col]
        # drop() returning a new frame instead of inplace on an .iloc slice:
        # the original raised SettingWithCopyWarning by mutating a copy.
        data = chunk.drop(columns=[label_col])
        yield data, labels


def save_result(ids, labels, submit_path, id_col=None):
    """Write a two-column (id, predict) submission CSV to *submit_path*.

    id_col defaults to the module-level ID column name.  Output is
    identical to the original, which first named the prediction column
    LABEL and then renamed it to 'predict'.
    """
    if id_col is None:
        id_col = ID
    df_test = pd.DataFrame()
    df_test[id_col] = ids
    df_test['predict'] = labels
    print('====shape df_test====', df_test.shape)
    df_test.to_csv(submit_path, index=False)


if __name__ == '__main__':
    df_train, df_test = base_data_process.eda(age2group=True, one_hot=True)
    # NOTE(review): data_prepare elsewhere in this project takes no
    # arguments -- confirm this call signature against its definition.
    data_prepare(df_train, df_test)
# NOTE(review): orphaned tail of a result-saving function whose `def`
# precedes this chunk; preserved verbatim for reference:
#     df['predict'] = index2label(y_pre)
#     df.to_csv('result{}.csv'.format(name), index=False)


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the top-40 features by mean importance across folds.

    feature_importance_df_ must carry 'feature' and 'importance' columns
    (one row per feature per fold).  The barplot is saved to
    'lgbm_importances01.png'.
    """
    # Rank features by their mean importance over folds and keep the top 40.
    cols = (feature_importance_df_[["feature", "importance"]]
            .groupby("feature")
            .mean()
            .sort_values(by="importance", ascending=False)[:40]
            .index)
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda()
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
# NOTE(review): orphaned tail of a plotting function (a display_importances
# variant saving to a caller-supplied *name*) whose `def` precedes this
# chunk; preserved verbatim for reference:
#     feature_importance_df_.feature.isin(cols)]
#     plt.figure(figsize=(8, 10))
#     sns.barplot(x="importance", y="feature",
#                 data=best_features.sort_values(by="importance",
#                                                ascending=False))
#     plt.title('LightGBM Features (avg over folds)')
#     plt.tight_layout()
#     plt.savefig(name)


if __name__ == '__main__':
    if not os.path.exists('origin_data_save'):
        os.mkdir('origin_data_save')
    with timer('data process'):
        df_train, df_test = eda(age2group=True, one_hot=False, scale=False)
        # Hand-picked last-layer embedding columns kept as extra features.
        # NOTE(review): the original nesting is ambiguous -- this loading is
        # assumed to sit inside the timer block; confirm against the source.
        cols_index_to_use = [
            59, 90, 98, 9, 33, 79, 63, 7, 44, 19, 47, 74, 38, 66
        ]
        cols_to_use = ['col_{}'.format(i) for i in cols_index_to_use]
        ll_df = pd.read_csv('../../origin_data/last_layer_100.csv',
                            index_col=False, header=0)
        ll_df_test = pd.read_csv('../../origin_data/last_layer_100_test.csv',
                                 index_col=False, header=0)
        ll_df = ll_df[cols_to_use]
        ll_df_test = ll_df_test[cols_to_use]
        # NOTE(review): the script appears to continue past the end of this
        # chunk; the remainder is not visible here.
# NOTE(review): orphaned tail of a hyperopt optimization function (search
# space + fmin run) whose `def` precedes this chunk; preserved verbatim for
# reference:
#     11, 'verbose': -1
#     }
#     trials = Trials()
#     with timer('optimization'):
#         # Run optimization
#         best = fmin(fn=objective, space=space, algo=tpe.suggest,
#                     trials=trials, max_evals=config_dict['max_evals'])
#     print('-' * 100)
#     log.warn(best)
#     with open('model_trials.pkl', mode='wb') as mt:
#         pickle.dump(trials, mt)


# Shared state read by the objective/optimization routines; 'train' is
# filled in __main__ before the search runs.
config_dict = {'train': pd.DataFrame(), 'max_evals': 1000}

if __name__ == '__main__':
    df_train, df_test = eda(True, False)
    # Hand the full training frame to the optimizer via module state.
    config_dict['train'] = df_train.iloc[:, :]
    label2index(df_train, 'current_service')
    optimization()