def main():
    """Run the ``train_xgb`` pipeline on the 'all' dataset and write tag '1006_2'.

    Steps, in order: ``read_all_data('all')`` -> ``data_preprocess`` ->
    ``train_xgb`` (with the hyper-parameters defined below) -> ``write``.
    """
    # Alternative dataset key seen in the original source: 'small'.
    raw_train, raw_test = read_all_data('all')
    features, labels, test_features = data_preprocess(raw_train, raw_test)

    hyper_params = {
        'num_leaves': 256,
        'min_child_samples': 79,
        'objective': 'binary',
        'max_depth': 13,
        'learning_rate': 0.03,
        "boosting_type": "gbdt",
        "subsample_freq": 3,
        "subsample": 0.9,
        # Disabled in the original source:
        #   "bagging_seed": 11, "eval_metric": 'auc', "verbosity": -1
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'colsample_bytree': 0.9,
        # Disabled in the original source: 'categorical_feature': cat_cols
    }

    # A train_lgb_cla(...) run written under tag '1006_1' was disabled in the
    # original source in favour of the call below.
    predictions = train_xgb(features, labels, test_features, hyper_params)
    write(predictions, '1006_2')
def main(): time_now = time.time() # df_train_glo, df_test_glo = read_all_data('small') df_train_glo, df_test_glo = read_all_data('all') global get_dummies_fea must_delete = delete_null_feature(0.4) best_loss = 10000000 best_ratio = 0.8 best_mask = 'delete_row' delete_cols = delete_both_feature(best_ratio) + must_delete df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols, best_mask) pred = train(df_train, df_test) write(pred, '1004_2') for ratio in [0.8, 0.5, 0.3, 0.2]: # for ratio in []: delete_cols = delete_both_feature(ratio) + must_delete for drop_mask in ['fillna_mode', 'delete_row']: print '\n\ndelete_null_ratio = ', ratio, 'drop_mask = ', drop_mask, '-' * 100 df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols, drop_mask) for model in [xgboost.XGBRegressor()]: loss, mse = kFold_cross(df_train, model) # 0.97 if best_loss > loss: best_loss = loss best_mask = drop_mask best_ratio = ratio print time.time() - time_now, '\n\n' delete_cols = delete_both_feature(best_ratio) + must_delete df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols, best_mask) pred = train(df_train, df_test) write(pred, '1004_3')
def plot1():
    """Scatter-plot column ``D15`` against ``TransactionDT`` for the training data.

    The dataset key is read from ``sys.argv[1]`` (the original comment lists
    'all' or 'small'). The second value returned by ``read_all_data`` is not
    used. The figure is shown with ``plt.show()``.
    """
    train_frame, _ = read_all_data(sys.argv[1])

    plt.figure(figsize=(15, 5))
    plt.scatter(train_frame.TransactionDT, train_frame.D15)
    plt.title('Original D15')
    plt.xlabel('Time')
    plt.ylabel('D15')
    plt.show()
def main(): data_size = sys.argv[1] # all or small df_train, df_test = read_all_data(data_size) print df_train.shape, df_test.shape row = 3 col = 5 for i in range(1, 16): plt.subplot(row, col, i) plt.scatter(df_train['TransactionDT'], df_train['D' + str(i)]) plt.title('D' + str(i)) plt.savefig('./picture/Transaction_D_index.jpg') plt.show()
def main(): time_now = time.time() # df_train_glo, df_test_glo = read_all_data('small') df_train_glo, df_test_glo = read_all_data('all') global get_dummies_fea must_delete = delete_null_feature(0.4) best_ratio = 0.8 best_mask = 'delete_row' delete_cols = delete_both_feature(best_ratio) + must_delete df_train, df_test = data_preprocess(df_train_glo, df_test_glo, delete_cols, best_mask) print df_train.columns.values print df_test.columns.values y = df_train['isFraud'] X = df_train.drop(['isFraud', 'TransactionDT'], axis = 1) X_test = df_test.drop(['TransactionDT'], axis = 1) y_pred = train_lgb(X, y, X_test) write(y_pred, '1008_1')
def main(): begin_time = time.time() df_train, df_test = read_all_data(int(sys.argv[1])) y_train = df_train['isFraud'].copy() print df_train.shape, df_test.shape for i in range(1, 10): print 'M' + str(i) + '.most_common = ', Counter( df_train['M1'].tolist()).most_common(10) df_train['M' + str(i)] = df_train['M' + str(i)].apply(lambda x: 1 if str( x) == 'T' else -1 if str(x) == 'F' else 0).astype(np.int) df_test['M' + str(i)] = df_test['M' + str(i)].apply(lambda x: 1 if str( x) == 'T' else -1 if str(x) == 'F' else 0).astype(np.int) print 'M' + str(i) + '.most_common = ', Counter( df_train['M' + str(i)].tolist()).most_common(10) df_train, df_test = data_normalize(df_train, df_test) df_train, df_test = add_datetime_feature(df_train, df_test) df_train, df_test = encode(df_train, df_test) cols = remove_cols(df_train) df_train, df_test = get_uid(df_train, df_test) df_train, df_test = encode2(df_train, df_test, cols) for col in ['ProductCD', 'card6', 'P_emaildomain', 'R_emaildomain', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_31', 'id_35', \ 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']: le = LabelEncoder() le.fit(df_train[col].tolist() + df_test[col].tolist()) df_train[col] = le.transform(df_train[col].tolist()) df_test[col] = le.transform(df_test[col].tolist()) print df_train[cols].info() print df_train[['TransactionAmt']].info() for col in cols: if col == 'isFraud': continue if col not in df_train.columns.values or col not in df_test.columns.values: print '\nerror', col, '\n' mean = np.mean(df_train[~df_train[col].isna()][col].tolist() + df_test[~df_test[col].isna()][col].tolist()) df_train[col] = df_train[col].fillna(mean) df_test[col] = df_test[col].fillna(mean) oof, preds = BUILD96(df_train, df_test, y_train, cols) BUILD96_output(preds) print 'spend time = ', time.time() - begin_time
# encoding:utf-8
# FileName: main
# Author: xiaoyi | 小一
# email: [email protected]
# Date: 2020/2/22 21:05
# Description: Analyse the epidemic data | Has the turning point arrived?
import os

from plot_data import plot_map, plot_line_chart
from preprocess import summary_data
from read_data import read_latest_data, read_all_data

if __name__ == '__main__':
    # Read the data.
    df_data = read_all_data('province')

    # Summarise the daily figures for each scope key, in the original call
    # order: 'all', 'excep_HB', 'HB'.
    df_result_all, df_result_excep_HB, df_result_HB = [
        summary_data(df_data, scope) for scope in ('all', 'excep_HB', 'HB')
    ]

    line_chart_title = [
        '累计确诊人数 (by:『知秋小梦』)',
        '新增确诊人数 (by:『知秋小梦』)',
        '累计治愈人数 (by:『知秋小梦』)',
        '累计死亡人数 (by:『知秋小梦』)',
        '治愈率 (by:『知秋小梦』)',
        '死亡率 (by:『知秋小梦』)',
    ]

    # Draw one set of line charts per summary.
    for chart_name, df_result in (
        ('全国数据', df_result_all),
        ('全国数据(除湖北省)', df_result_excep_HB),
        ('湖北省数据', df_result_HB),
    ):
        plot_line_chart(chart_name, line_chart_title, df_result)

    # Get the epidemic data for the latest date.
def json_data(selectedDate):
    """Return the rows of ``merged`` for one date, serialised via ``to_json()``.

    Args:
        selectedDate: Value that is converted with ``str()`` and compared
            against the ``"Date"`` column of the module-level ``merged``.

    Returns:
        Whatever ``to_json()`` returns for the filtered frame.
    """
    df_date = merged[merged["Date"] == str(selectedDate)]
    # Serialise once. The previous version additionally built a
    # json.loads/json.dumps round-trip that was never used (and that shadowed
    # this function's own name) before serialising a second time.
    return df_date.to_json()


def update_date(attr, old, new):
    """Slider callback: reload the map data and title for the slider's value.

    The ``attr``/``old``/``new`` arguments are accepted but not used; the
    current value is read from ``date_slider`` directly.
    """
    # NOTE(review): depending on the Bokeh version, DateSlider.value may be an
    # epoch-millisecond number rather than a date once the user moves the
    # slider; confirm that str(yr) matches the format of merged["Date"].
    yr = date_slider.value
    geosource.geojson = json_data(yr)
    p.title.text = 'covid 19 deaths, %s' % yr


def update_color(attr, old, new):
    """Callback: set the colour mapper's upper bound from ``color_slider``."""
    # NOTE(review): color_slider is not defined in the visible part of this
    # script; presumably it is created elsewhere.
    color_mapper.high = color_slider.value


merged = read_all_data()
geosource = GeoJSONDataSource(geojson=json_data(str(date(2020, 4, 25))))

# Reversed 8-colour 'YlGnBu' palette.
palette = brewer['YlGnBu'][8]
palette = palette[::-1]

# Upper bound of the colour scale: the maximum "Deaths" value (ignoring NaN)
# plus 500, rounded to the nearest thousand.
color_cap_deaths = round(np.nanmax(merged["Deaths"]) + 500, -3)
color_mapper = LinearColorMapper(
    palette=palette, low=0, high=color_cap_deaths, nan_color='#d9d9d9')

hover = HoverTool(tooltips=[('Country/region', '@country'), ('deaths', '@Deaths')])
color_bar = ColorBar(
    color_mapper=color_mapper, label_standoff=8, width=900, height=20,
    border_line_color=None, location=(0, 0), orientation='horizontal')

p = figure(
    title='Covid 19 deaths', plot_height=900, plot_width=1600,
    tools=[BoxZoomTool(), ResetTool(), hover])
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.patches(
    'xs', 'ys', source=geosource,
    fill_color={'field': 'Deaths', 'transform': color_mapper},
    line_color='black', line_width=0.25, fill_alpha=1)
p.add_layout(color_bar, 'below')

date_slider = DateSlider(
    title="Date Range: ", start=date(2020, 1, 31), end=date(2020, 4, 25),
    value=date(2020, 4, 25), step=1)
date_slider.on_change('value', update_date)
def main():
    """Run ``data_preprocess`` and ``train_xgb`` on the 'small' dataset; write tag '1006_3'."""
    # Alternative dataset key seen in the original source: 'all'.
    raw_train, raw_test = read_all_data('small')
    features, labels, test_features = data_preprocess(raw_train, raw_test)
    write(train_xgb(features, labels, test_features), '1006_3')
def main():
    """Load the 'small' dataset and run ``data_preprocess`` on it.

    The value returned by ``data_preprocess`` is discarded.
    """
    # Alternative dataset key seen in the original source: 'all'.
    raw_train, raw_test = read_all_data('small')
    data_preprocess(raw_train, raw_test)
import threading
import time
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import numpy as np
from collections import defaultdict
from read_data import read_all_data
from extract_word import extract_feature
from exture_feature_vec import extract_feature_of_a_seq

# `time` is imported above because time.ctime() is called below and no import
# of it was visible in this import block.

# Load the data for every family.
# NOTE(review): presumably a mapping of family name -> collection of samples;
# only .values() and len() of each value are relied on here.
all_family_data = read_all_data()

# Total number of items across all families.
sum_num = sum(len(data) for data in all_family_data.values())
print(time.ctime(), ":", "数据总量:", sum_num)

features = extract_feature(all_family_data)

# Histogram: feature length -> number of features with that length.
dict_feature = {}
for feature in features:
    feature_len = len(feature)
    dict_feature[feature_len] = dict_feature.get(feature_len, 0) + 1

# Print the histogram ordered by ascending length, then the features themselves.
print(sorted(dict_feature.items(), key=lambda d: d[0], reverse=False))
print(features, len(features))

family_names = {}
'TEST_VALIDATION', 'TOP_K' ] if all_para[2] == 'LCFN': para_name += ['FREQUENCY_USER', 'FREQUENCY_ITEM'] if all_para[2] == 'LightLCFN': para_name += [ 'FREQUENCY_USER', 'FREQUENCY_ITEM', 'FREQUENCY', 'KEEP_PORB', 'SAMPLE_RATE', 'GRAPH_CONV', 'PREDICTION', 'LOSS_FUNCTION', 'GENERALIZATION', 'OPTIMIZATION', 'IF_TRASFORMATION', 'ACTIVATION', 'POOLING' ] if all_para[2] == 'SGNN': para_name += ['PROP_DIM', 'PROP_EMB', 'IF_NORM'] # if testing the model, we need to read in test set if tuning_method == 'test': all_para[11] = para[11] = 'Test' ## read data data = read_all_data(all_para) para[10] = data[-1] ## tuning the model os.environ["CUDA_VISIBLE_DEVICES"] = all_para[0] if tuning_method == 'tuning': tuning(path_excel_dir, para_name, para, data, lr_coarse, lamda_coarse, min_num_coarse, max_num_coarse, min_num_fine, max_num_fine) if tuning_method == 'fine_tuning': fine_tuning(path_excel_dir, para_name, para, data, lr_fine, lamda_fine, min_num_fine, max_num_fine) if tuning_method == 'cross_tuning': cross_tuning(path_excel_dir, para_name, para, data, lr_fine, lamda_fine, min_num_fine, max_num_fine) if tuning_method == 'coarse_tuning': coarse_tuning(path_excel_dir, para_name, para, data, lr_coarse,