import utils
import pandas as pd

act_train = pd.read_csv('data/act_train.csv')
act_test = pd.read_csv('data/act_test.csv')
people = pd.read_csv('data/people.csv')
#%%
print(people.columns.values)
print(act_train.columns.values)
#%%
column_names = {}
column_names['category'] = [
    'char_1_p', 'group_1', 'char_2_p', 'char_3_p', 'char_4_p', 'char_5_p',
    'char_6_p', 'char_7_p', 'char_8_p', 'char_9_p', 'activity_category',
    'char_1_a', 'char_2_a', 'char_3_a', 'char_4_a', 'char_5_a', 'char_6_a',
    'char_7_a', 'char_8_a', 'char_9_a', 'char_10_a'
]
column_names['date'] = ['date_p', 'date_a']
column_names['ignore'] = ['people_id', 'activity_id']
column_names['y'] = 'outcome'
column_names['bool'] = [
    'char_10_p', 'char_11', 'char_12', 'char_13', 'char_14', 'char_15',
    'char_16', 'char_17', 'char_18', 'char_19', 'char_20', 'char_21',
    'char_22', 'char_23', 'char_24', 'char_25', 'char_26', 'char_27',
    'char_28', 'char_29', 'char_30', 'char_31', 'char_32', 'char_33',
    'char_34', 'char_35', 'char_36', 'char_37'
]
column_names['nu'] = 'char_38'
utils.save_variable('column_names', column_names)
for forest_idx in range(0, 10, 1):
    # stripe the training rows so each second-level forest sees a different
    # 10% slice of the vote matrix
    tr_range = range(forest_idx, tr_row_nu, 10)
    t_X, t_Y = tr_votes[tr_range, :], tr_Y[tr_range]
    t0 = time.time()
    print('forest', forest_idx, end='-->')
    forest_2nd = RandomForestClassifier(max_depth=max_depth_best,
                                        n_estimators=11,
                                        random_state=12)
    forest_2nd.fit(t_X, t_Y)
    y_pred = forest_2nd.predict(t_X)
    print('tr:', matthews_corrcoef(t_Y, y_pred), end=',')
    val_y_pred = forest_2nd.predict(val_votes)
    print('val:', matthews_corrcoef(val_Y, val_y_pred), end='')
    print(',cost', int(time.time() - t0))
    save_variable(forest_2nd, 'final/2nd_level_models/' + str(forest_idx))
#%%
'''
forest 0-->tr: 0.853781308398,val: 0.547872937046,cost 84
forest 1-->tr: 0.839596341509,val: 0.550393783491,cost 89
forest 2-->tr: 0.868869382821,val: 0.544121676669,cost 110
forest 3-->tr: 0.831067645144,val: 0.556898396227,cost 111
forest 4-->tr: 0.866979504134,val: 0.572140934661,cost 108
forest 5-->tr: 0.865444728731,val: 0.565962549302,cost 105
forest 6-->tr: 0.847071811478,val: 0.550088494652,cost 105
forest 7-->tr: 0.867795690777,val: 0.55928161695,cost 110
forest 8-->tr: 0.849556952293,val: 0.54308323331,cost 111
forest 9-->tr: 0.852024718779,val: 0.558287108709,cost 105
'''
#%%
StratifiedKFold is a variation of k-fold which returns stratified folds:
each fold contains approximately the same percentage of samples of each
target class as the complete set (a minimal sketch follows this cell).
'''
import os

for root, dirs, files in os.walk('final/tr_groups'):
    for chunk_idx in files:
        print('chunk', chunk_idx, end='...')
        chunk_path = os.path.join(root, chunk_idx)
        tr_chunk = read_variable(chunk_path)
        model_path = 'final/1L_tree/' + str(chunk_idx)
        if os.path.isfile(model_path):
            print('model exist')
        else:
            # claim the path first, so parallel runs skip this chunk
            save_variable({}, model_path)
            print('processing...')
            chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])
            # based on experiment, k=4 gives the smallest STD
            chunk_model = ForestChunkClassifierWithKFolds(k=4,
                                                          seeds=[13, 11, 193])
            chunk_model.fit(chunk_X, chunk_Y, test_X, test_Y)
            chunk_Y_pred = chunk_model.predict(chunk_X)
            chunk_mcc = matthews_corrcoef(chunk_Y, chunk_Y_pred)
            print('OVERALL tr:', chunk_mcc, end='')
            test_Y_pred = chunk_model.predict(test_X)
            test_mcc = matthews_corrcoef(test_Y, test_Y_pred)
            print(',test:', test_mcc, end='-->')
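#%%
# A minimal, self-contained sketch of the StratifiedKFold behavior described
# above, using sklearn directly. The toy X/y here are illustrative, not
# project data: each test fold keeps roughly the same 0/1 ratio as the
# complete set.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X_demo = np.arange(20).reshape(-1, 1)
y_demo = np.array([0] * 16 + [1] * 4)  # 80% / 20% class balance
skf = StratifiedKFold(n_splits=4)
for fold_idx, (tr_idx, te_idx) in enumerate(skf.split(X_demo, y_demo)):
    # every 5-sample test fold holds exactly one 1, matching the 20% overall
    print('fold', fold_idx, 'test 1s ratio:', y_demo[te_idx].mean())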
category_dummys_feature_filter = SelectKBest(chi2, k=X.shape[1])
category_dummys_feature_filter.fit(X, y)
#%%
category_dummys_pvalues_significance_level = 0.05
# the bigger the p-value, the more the feature distribution looks like y
# normally set pvalues_threshold to 0.01 or 0.05
# (see the toy chi2 sketch in the cell after this one)
selected_col_ids = []
for id, p in enumerate(category_dummys_feature_filter.pvalues_):
    if p > category_dummys_pvalues_significance_level:
        selected_col_ids.append(id)
print(len(selected_col_ids))
utils.save_variable(
    'outputs/people_act_train_category_selected_col_ids_pvalue_' +
    str(category_dummys_pvalues_significance_level), selected_col_ids)
del X, y, id, p
#%% filter on rows: with 16GB RAM it can only process ~2K rows in a reasonable time
Y = people_act_train[column_names['y']]
row_total_0 = 1000
row_total_1 = 1000
row_count_0 = 0
row_count_1 = 0
selected_row_ids = []
for i, y in enumerate(Y):
    if y == 0 and row_count_0 < row_total_0:
        selected_row_ids.append(i)
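#%%
# A minimal sketch of how SelectKBest(chi2) exposes the per-feature p-values
# filtered above (toy data, not project data): under the chi2 test a small
# p-value means the feature counts are strongly associated with y, while a
# large p-value means the association is weak.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

rng = np.random.RandomState(0)
y_demo = rng.randint(0, 2, 1000)
informative = y_demo + rng.randint(0, 2, 1000)  # tracks y closely
noise = rng.randint(0, 2, 1000)                 # independent of y
X_demo = np.column_stack([informative, noise])
flt = SelectKBest(chi2, k='all').fit(X_demo, y_demo)
print('p-values [informative, noise]:', flt.pvalues_)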
import utils
import pandas as pd
import progressbar

max_chunk_size = 1
chunks_num = pd.read_csv('data/train_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
bar = progressbar.ProgressBar()
for chunk_id in bar(range(1183747)):
    # chunks have to be read one by one, in sequence
    chunk_num_response = chunks_num.get_chunk()
    chunk_num = chunk_num_response.drop(['Response'], axis=1)
    chunk_y = chunk_num_response['Response']
    utils.save_variable(chunk_y, 'data/train_y_rows/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_num, 'data/train_numeric_rows/' + str(chunk_id) + '.pkl')
#%%
chunks_num = pd.read_csv('data/test_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
bar = progressbar.ProgressBar()
for chunk_id in bar(range(1183748)):
    # chunks have to be read one by one, in sequence
    chunk_num = chunks_num.get_chunk()
    utils.save_variable(chunk_num, 'data/test_numeric_rows/' + str(chunk_id) + '.pkl')
tr_X_1s = read_variable('model_stats/tr_pip_data_1s_1108.pkl')
len_1s = tr_X_1s.shape[0]
for set_id in range(0, 166, 1):
    # 6 chunks give about the same number of 0s as there are 1s
    # (the tiny amount of 1s in them is ignored)
    chunk_range = range(set_id, 1000, 166)
    t_X, t_Y = load_training_subset_1108(chunk_range)
    tr_X = np.concatenate([t_X, tr_X_1s])
    tr_Y = np.concatenate([t_Y, np.ones(len_1s)])

    X, Y = tr_X, tr_Y
    model = AdaBoostClassifier(n_estimators=100)
    t0 = time.time()
    model = model.fit(X, Y)
    y_pred = model.predict(X)
    print(set_id, 'boost:', ',tr:', matthews_corrcoef(Y, y_pred), end='')
    #print('tr 1s:real',sum(Y),',pred',sum(y_pred))
    #utils.save_variable(tree_votes_0,'models/tree_votes_0.pkl')
    print(',val:', end='')
    X, Y = val_X, val_Y
    y_pred = model.predict(X)
    print(matthews_corrcoef(Y, y_pred), end='')
    print(',cost:', int(time.time() - t0), 'sec')
    break  # NOTE: stops after the first set, so the save below never runs
    save_variable(model, '7/boost_' + str(set_id) + '.pkl')
    for chunk_id in bar(range(0, chunk_nu)):
        col = chunks.get_chunk()[col_name]
        ys = responses[chunk_id * max_chunk_size:
                       chunk_id * max_chunk_size + col.shape[0]]
        for i in range(0, col.shape[0], 1):
            value = col.iloc[i]
            y = ys[i]
            if value != value:  # NaN check: NaN is the only value != itself
                cnts[y]['nan'] += 1
            else:
                cnts[y]['nu'].append(value)
    cnts[0]['nu'] = np.asarray(cnts[0]['nu']).reshape(-1, 1)
    cnts[1]['nu'] = np.asarray(cnts[1]['nu']).reshape(-1, 1)
    print('cal kde for 0...')
    if cnts[0]['nu'].size > 0:
        cnts[0]['kde'] = KernelDensity(kernel='gaussian').fit(cnts[0]['nu'])
    print('cal kde for 1...')
    if cnts[1]['nu'].size > 0:
        cnts[1]['kde'] = KernelDensity(kernel='gaussian').fit(cnts[1]['nu'])
    utils.save_variable(cnts, file_path)
    break
except ValueError:
    print('got ValueError. Restarting.')
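#%%
# The two per-class KDEs fitted above can be turned into an estimate of
# P(response=0 | value) via Bayes' rule. This is only a sketch of that idea:
# the helper name and the equal-prior assumption are mine, not necessarily
# how the project's cal_0_proba_* helpers are implemented.
import numpy as np

def kde_proba_0(cnts, value):
    # score_samples returns log-density, so exponentiate before normalizing
    v = np.asarray([[value]])
    d0 = np.exp(cnts[0]['kde'].score_samples(v))[0]
    d1 = np.exp(cnts[1]['kde'].score_samples(v))[0]
    return d0 / (d0 + d1)  # assumes equal class priors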
#%% convert one date variable to three variables: year, month, and day
people_act_train_date_dummys = pd.DataFrame()
for name in column_names['date']:
    people_act_train_date_dummys[name + '_y'] = pd.to_datetime(
        people_act_train[name]).apply(lambda x: x.year)
    people_act_train_date_dummys[name + '_m'] = pd.to_datetime(
        people_act_train[name]).apply(lambda x: x.month)
    people_act_train_date_dummys[name + '_d'] = pd.to_datetime(
        people_act_train[name]).apply(lambda x: x.day)
del name
# (a vectorized .dt alternative is sketched in a cell after this one)
#%% convert category columns to integers
#---------------------------------------
people_act_train_category2int = pd.DataFrame()
for name in column_names['category']:
    people_act_train_category2int[name] = people_act_train[name].str.replace(
        r'((type)|(group))\s', '')
people_act_train_category2int = people_act_train_category2int.fillna(value=0)
del name
#%% convert integers to one-hot codes
one_hot_encoder = OneHotEncoder(n_values='auto', sparse=True)
one_hot_encoder.fit(people_act_train_category2int)
#%% this variable will not show up in the variable explorer, and calling
# toarray() on it would lead to a MemoryError
people_act_train_category_dummys = one_hot_encoder.transform(
    people_act_train_category2int)
print(people_act_train_category_dummys.shape)
print(type(people_act_train_category_dummys))
del people_act_train_category2int
#%%
utils.save_variable('outputs/people_act_train_date_dummys',
                    people_act_train_date_dummys)
utils.save_variable('outputs/people_act_train_category_dummys',
                    people_act_train_category_dummys)
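#%%
# Side note, a sketch under the same pandas setup: parsing each date column
# once and using the vectorized .dt accessor avoids the per-row lambdas in
# the date-conversion cell above, and is typically much faster on large
# frames. Same output columns, same names.
for name in column_names['date']:
    parsed = pd.to_datetime(people_act_train[name])
    people_act_train_date_dummys[name + '_y'] = parsed.dt.year
    people_act_train_date_dummys[name + '_m'] = parsed.dt.month
    people_act_train_date_dummys[name + '_d'] = parsed.dt.day
del name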
#%%
#X = val.drop(X_col_exl, axis=1)
X = val['char_38'].values.reshape(-1, 1)
Y = val['outcome']
f = model.predict(X)
print('------VALIDATION-------')
(val_f1, val_auc, val_confusion) = utils.validate_prediction(f, Y)
print('============================')
print('FINAL ==', val_auc)
print('============================')
model_char38 = model
utils.save_variable('models/model_char38', model_char38)
del model
del X, Y, f

print('###########################')
print('without Char 38, Tree Nu:', estimator_nu)
print('---------------------------')
X_col_exl = ['outcome', 'char_38']
X = tr.drop(X_col_exl, axis=1)
Y = tr['outcome']
# col char_38 excluded
model = RandomForestClassifier(n_estimators=estimator_nu, verbose=0, n_jobs=-1)
startTime = time.time()
remained_sample_ids = set(data.index.tolist())
test_ids = set([])
data_gp = data.groupby('timestamp')
test_ratio = 0.01
print('create test dataset', end='')
unique_timestamp = data["timestamp"].unique()
n = len(unique_timestamp)
test_start_i = int(n * (1 - test_ratio))
timesplit = unique_timestamp[test_start_i]
print('timesplit:', timesplit)
train_data = data[data.timestamp < timesplit]
test_data = data[data.timestamp >= timesplit]
utils.save_variable(test_data, 'E:/two-sigma/output/timeseries/test_data')
#%%
'''
generate timeseries chunks
'''
ts_groups = train_data.groupby('timestamp')
for key in ts_groups.groups.keys():
    print('Timestamp:', key, end=',')
    row_ids = ts_groups.groups[key]
    gp_data = train_data.loc[row_ids]
    utils.save_variable(gp_data, 'E:/two-sigma/output/timeseries/tr_chunks/' + str(key))
    print('SIZE:', gp_data.shape)
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
bar = progressbar.ProgressBar()
for chunk_id in bar(range(max_chunk_nu)):
    # chunks have to be read one by one, in sequence
    chunk_num_response = chunks_num.get_chunk()
    chunk_num = chunk_num_response.drop(['Response'], axis=1)
    chunk_y = chunk_num_response['Response']
    chunk_date = chunks_date.get_chunk()
    chunk_cate = chunks_cate.get_chunk()
    utils.save_variable(chunk_y,
                        'data/train_y_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_num,
                        'data/train_numeric_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(chunk_date,
                        'data/train_date_chunks/' + str(chunk_id) + '.pkl')
    utils.save_variable(
        chunk_cate, 'data/train_categorical_chunks/' + str(chunk_id) + '.pkl')
#%%
chunks_num = pd.read_csv('data/test_numeric.csv',
                         index_col='Id',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
chunks_date = pd.read_csv('data/test_date.csv',
#%%
from sklearn.ensemble import RandomForestClassifier

t_X, t_Y = votes[:90000, :], val_Y[:90000]
v_X, v_Y = votes[90000:, :], val_Y[90000:]
max_depth = 59
print(max_depth, end='-->')
forest_2nd = RandomForestClassifier(max_depth=max_depth,
                                    n_estimators=11,
                                    random_state=12)
forest_2nd.fit(t_X, t_Y)
y_pred = forest_2nd.predict(t_X)
print('tr:', matthews_corrcoef(t_Y, y_pred), end=',')
y_pred = forest_2nd.predict(v_X)
print('val:', matthews_corrcoef(v_Y, y_pred))
save_variable(forest_2nd, 'forest_2nd_14.pkl')
#%%
"""
produce testing result
"""
max_chunk_size = 1000
col_cate_nu = 2140
col_numeric_nu = 969
col_date_nu = 1157
pip = read_variable('model_stats/pip_1110.pkl')
for model_idx in bar(files):
    model_path = os.path.join(root, model_idx)
    model = read_variable(model_path)
    pack = {}
    pack['root'] = root
    pack['chunk_id'] = model_idx
    pack['model'] = model
    packs.append(pack)
print('model loaded:', len(packs))
#%%
from utils import load_pipped_tr_chunk, save_variable
from sklearn.metrics import matthews_corrcoef
'''
WARNING: 41 chunks cost about 20 hrs to finish
'''
for chunk_idx in range(41):
    print('chunk', chunk_idx, end='...')
    chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])
    for pack in packs:
        model = pack['model']
        chunk_Y_pred = model.predict(chunk_X)
        file_path = pack['root'] + '_tr_y_pred/' + str(
            pack['chunk_id']) + '_' + str(chunk_idx)
        save_variable(chunk_Y_pred, file_path)
        print(file_path, '-->', matthews_corrcoef(chunk_Y, chunk_Y_pred),
              ',1s:', str(sum(chunk_Y_pred)))
model_forest = []
bar = progressbar.ProgressBar()
for model_id in bar(range(0, 301, 1)):
    model = read_variable('final/good_models/' + str(model_id))
    model_forest.append(model)
#%%
for chunk_id in range(1184):
    path = 'final/test_votes/' + str(chunk_id) + '.pkl'
    print('checking chunk:', chunk_id, end='...')
    if os.path.isfile(path):
        print('exist')
    else:
        save_variable({}, path)
        print('processing', end='...')
        chunk_X = load_pipped_test_chunks([chunk_id])
        # predict: store each model's probability of class 0 as its vote
        votes = np.zeros([chunk_X.shape[0], len(model_forest)])
        bar = progressbar.ProgressBar()
        model_id = 0
        for model in bar(model_forest):
            t0 = time.time()
            pred_Y = model.predict_proba(chunk_X)
            pred_Y_0 = pred_Y[:, 0]
            votes[:, model_id] = pred_Y_0
            model_id += 1
        save_variable(votes, path)
        print('saved to', path)
    round_set += 1
    n_estimators += 10

X, Y = tr_X, tr_Y
y_pred = best_model.predict(X)
print('tree BEST:', set_id, ',tr:', matthews_corrcoef(Y, y_pred), end='')
print(',val:', end='')
X, Y = val_X, val_Y
y_pred = best_model.predict(X)
val_mcc = matthews_corrcoef(Y, y_pred)
print(val_mcc)
print('#####################################')
save_variable(model, '8/forest_' + str(set_id) + '.pkl')
#%%
from sklearn.ensemble import RandomForestClassifier

len_1s = tr_X_1s.shape[0]
set_id = 0
chunk_range = range(set_id, 1000, 166)
t_X, t_Y = load_training_subset_1110(chunk_range)
tr_X = np.concatenate([t_X, tr_X_1s])
tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
#%% tolil() is more efficient than csr_matrix when values need to be modified
people_act_train_category_pcs_dsc_lil = people_act_train_category_pcs_dsc.tolil()
startTime = time.time()
for i in range(2174877, 1160000, -1):
    people_act_train_category_pcs_dsc_lil[i] = pca.transform(
        people_act_train_category_dummys[i, selected_col_ids].toarray())
    print(i, '/', length)
print('PCA Translation took', int(time.time() - startTime), 'sec')
print(people_act_train_category_dummys.shape, '-PCA->',
      people_act_train_category_pcs_dsc_lil.shape)
#%%
utils.save_variable(
    'outputs/people_act_train_category_' + str(pc_number) + 'pcs_dsc_lil',
    people_act_train_category_pcs_dsc_lil)
#%% ASC####################
##########################
# transforming all the data in one pass leads to a memory leak, so it can
# NOT be done in one process
people_act_train_category_pcs = csr_matrix(
    (people_act_train_category_dummys.shape[0], pc_number))
people_act_train_category_pcs_asc = people_act_train_category_pcs
#%%
people_act_train_category_pcs_asc_lil = people_act_train_category_pcs_asc.tolil()
startTime = time.time()
for i in range(0, 1160001, 1):
    people_act_train_category_pcs_asc_lil[i] = pca.transform(
        people_act_train_category_dummys[i, selected_col_ids].toarray())
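#%%
# A tiny self-contained illustration of the tolil() remark above (toy
# matrix, not project data): CSR is built for fast arithmetic and slicing
# and complains about item assignment, while LIL is designed for exactly
# the kind of incremental row writes done in the loops above.
import numpy as np
from scipy.sparse import csr_matrix

demo = csr_matrix((3, 3))
demo_lil = demo.tolil()
demo_lil[1] = np.array([1.0, 2.0, 3.0])  # cheap row assignment on LIL
demo_csr = demo_lil.tocsr()              # convert back for fast math/slicing
print(demo_csr.toarray())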
        mcc = matthews_corrcoef(y_val, y_pred)
        print(mcc, end='')
        if mcc > best_tree_mcc:
            best_tree = tre
            best_tree_mcc = mcc
            print('(best)', end='')
        print()
    print('tree', tree_id, '-->best mcc:', best_tree_mcc)
    forest.append(best_tree)
    if id_end == x_tr.shape[0]:
        break
    tree_id += 1
    id_start = id_end
utils.save_variable(forest, 'model_stats/forest.pkl')
del x, y, id_start, id_end, tree_id, best_tree_mcc, best_tree
#%% validation based on three second-level trees
x = x_val_final
y = y_val_final
y_sum = np.zeros(y.shape[0])
threshold = 0
for tre in forest:
    y_sum += tre.predict(x)
y_pred = (y_sum > threshold).astype(int)
            print('(best)', end='')
        print()
        round_i += 1
    forest.append(best_model)
    tree_id += 1
    tr_chunk_start_index = tr_chunk_end_index
    tr_chunk_end_index = tr_chunk_start_index + tr_chunk_nu
    if tr_chunk_end_index > tr_chunk_end:
        tr_chunk_end_index = tr_chunk_end
    del tr_nu, tr_y, tr_x, best_model_mcc, best_model, x, y, votes_date_nu
    if tr_chunk_start_index == tr_chunk_end:
        del tr_chunk_start_index, tr_chunk_end
        break
utils.save_variable(forest, 'model_stats/forest_nu_date_cate.pkl')
del sgd_val_x, sgd_val_y
#%%
'''
validate the overall model MCC on the validation dataset
'''
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
import utils

forest = utils.read_variable('model_stats/forest_nu_date_cate.pkl')
val_chunk_ids = range(tr_chunk_end, chunk_nu, 1)
val_nu = 183747
    pack['model'] = model
    packs.append(pack)
print('model loaded:', len(packs))
#%%
test_X, test_Y = load_pipped_test_chunk()
real_test_Y_1s_count = str(sum(test_Y))
for pack in packs:
    model = pack['model']
    test_Y_pred = model.predict(test_X)
    file_path = pack['root'] + '_test_y_pred/' + str(pack['chunk_id'])
    save_variable(test_Y_pred, file_path)
    print(file_path, '-->', matthews_corrcoef(test_Y, test_Y_pred), ',1s:',
          str(sum(test_Y_pred)) + '/' + real_test_Y_1s_count)
#%%
model_folders = [
    'final/1L_tree', 'final/1L_ada', 'final/1L_gb', 'final/1L_mlp'
]
packs = []
for model_folder in model_folders:
    print('Processing model cluster:', model_folder)
    for root, dirs, files in os.walk(model_folder):
        bar = progressbar.ProgressBar()
        for model_idx in bar(files):
            model_path = os.path.join(root, model_idx)
feature_engine_pvalues = feature_engine.pvalues_
kbest_cols = []
# pvalue threshold: 0.05 or 0.1
pvalue_threshold = 0.05
for idx, pv in enumerate(feature_engine_pvalues):
    if pv > pvalue_threshold:
        kbest_cols.append(idx)
print('selected col:', len(kbest_cols))
pip['2_kbest_cols'] = kbest_cols
#%%
utils.save_variable(pip, 'model_stats/pip.pkl')
#%%
'''
modeling: search for the best max_depth
'''
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import tree

X = tr_X[:, kbest_cols]
Y = tr_Y
# use random_state to produce a repeatable result
model = tree.DecisionTreeClassifier(random_state=0)
model = model.fit(X, Y)
y_pred = model.predict(X)
print('tree depth:MAX,tr:', matthews_corrcoef(Y, y_pred), end='')
#utils.save_variable(tree_votes_0,'models/tree_votes_0.pkl')
remained_sample_ids = set(data.index.tolist())
sel_sample_ids = set([])
data_gp = data.groupby('timestamp')
test_ratio = 0.01
for name, group in data_gp:
    print('T' + str(name), end=',')
    gp_idx = set(data_gp.groups[name])
    idx_pool = gp_idx.intersection(remained_sample_ids)
    sel_ids_len = int(len(gp_idx) * test_ratio)
    sel_ids = set(random.sample(list(idx_pool), sel_ids_len))
    print('sel', len(sel_ids), 'samples from', len(gp_idx))
    sel_sample_ids = sel_sample_ids | sel_ids
    remained_sample_ids = remained_sample_ids - sel_ids
utils.save_variable(sel_sample_ids, 'output/test_ids')
print('testing samples:', len(sel_sample_ids))
del sel_sample_ids
#%%
test_ids = list(utils.read_variable('output/test_ids'))
'''
check whether the testing dataset is well distributed among different obj ids
'''
data_gp = data.groupby('id')
overall_stats_by_id = {}
for name, group in data_gp:
    print(name, 'has samples:', group.shape[0])
    overall_stats_by_id[name] = group.shape[0]
test_data = data.loc[test_ids]
data_gp = test_data.groupby('id')
cols_numeric = column_names['numeric']
col_len = cols_categorical.size + cols_date.size + cols_numeric.size
del column_names
#%%
'''
calculate probability of response 0 with categorical cols
'''
col_stats_cate = {}
print('import col statistic:', 'categorical columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_categorical):
    col_stats_cate[col_name] = utils.read_variable('model_stats/cate/' + col_name + '.pkl')
utils.save_variable(col_stats_cate, 'model_stats/col_stats_cate.pkl')
'''
calculate probability of response 0 with date cols
'''
col_stats_date = {}
print('import col statistic:', 'date columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_date):
    stat = utils.read_variable('model_stats/date/' + col_name + '.pkl')
    # remove the nu list to save memory
    del stat[0]['nu'], stat[1]['nu']
    col_stats_date[col_name] = stat
utils.save_variable(col_stats_date, 'model_stats/col_stats_date.pkl')
    nan_count = 0
    for chunk_id in range(0, chunk_nu):
        chunk = chunks.get_chunk()
        nan_count += np.sum(chunk.isnull())
    nan_counts[col_index] = nan_count
    col_nan_counts[col_name] = nan_count
    #print('\nnan values:', nan_counts[col_index] / train_rows_nu)
    col_index += 1
del file_name, cols, col_index, chunks, bar, nan_count, chunk_id
#%%
a = nan_counts / (max_chunk_size * chunk_nu)
#%%
nan_threshold_percent = 0.2
threshold = nan_threshold_percent * max_chunk_size * chunk_nu
col_selected = []
bar = progressbar.ProgressBar()
for key, value in bar(col_nan_counts.items()):
    if int(value) < threshold:
        col_selected.append(key)
print(len(col_selected), 'cols selected.')
file_path = 'model_stats/col_nu_selected_2percent_Nan.pkl'
utils.save_variable(col_selected, file_path)
print('result is saved to:', file_path)
        best_model = model
        print('-->BEST')
    else:
        print()
    print(',test:', end='')
    final_y_pred = model.predict(final_val_X)
    final_val_mcc = matthews_corrcoef(final_val_Y, final_y_pred)
    print(final_val_mcc)
    print('---------------------------------------')
print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^')
print('tree:', gp_idx, 'BEST,val:', best_val_mcc, end='-->')
if best_val_mcc > good_model_val_mcc:
    save_variable(model, file_path)
    print('SAVED' + '(' + file_path + ')')
else:
    print('DISCARD')
print('#####################################')
#%%
'''
TEST
'''
from sklearn.ensemble import RandomForestClassifier
import os

good_model_val_mcc = 0.2
            best_model = model
            best_tr_mcc = tr_mcc
            print('<---best')
        else:
            print()
        if best_tr_mcc > 0.9:
            break
    tr_Y_pred = best_model.predict(tr_X)
    best_tr_mcc = matthews_corrcoef(tr_Y, tr_Y_pred)
    print(set_id, 'best sgd:', ',tr:', best_tr_mcc, end='')
    print(',val:', end='')
    val_Y_pred = best_model.predict(val_X)
    print(matthews_corrcoef(val_Y, val_Y_pred))
    break  # NOTE: stops after the first set, so the save below never runs
    save_variable(best_model, '7/sgd_' + str(set_id) + '.pkl')
#%%
'''
No matter what config, SGD gives really bad predictions even on training.
0 / 1 sgd: ,tr: 0.0130409009077,val:-0.00894843344066 ( 3325 )
0 / 2 sgd: ,tr: -0.0126166788456,val:-0.00494400796572 ( 3414 )
0 / 3 sgd: ,tr: 0.0210647425353,val:-0.00608883260096 ( 2121 )
0 / 4 sgd: ,tr: 0.0409474491906,val:0.00673896161817 ( 1048 )
0 / 5 sgd: ,tr: 0.0178420591587,val:-0.00513487374047 ( 2921 )
0 / 6 sgd: ,tr: 0.0812392407412,val:0.0155743265752 ( 4745 )<---best
0 / 7 sgd: ,tr: 0.0667857466617,val:0.0115874549738 ( 1881 )
0 / 8 sgd: ,tr: 0.0161787956102,val:-0.00146447562307 ( 2706 )
0 / 9 sgd: ,tr: 0.0362361124868,val:-0.0127457101921 ( 2143 )
    people_act_train['activity_category'] == 'type 1')][0:1]
people_act_char_10_a_notnull = people_act_train.loc[
    people_act_train['char_10_a'].notnull()][0:1]
#%%
people_act_grouped = people_act_train.groupby('outcome')
#%% outcome 0 vs. outcome 1
people_act_outcome0 = people_act_grouped.get_group(0)
people_act_outcome1 = people_act_grouped.get_group(1)
plt.pie([people_act_outcome0.shape[0], people_act_outcome1.shape[0]],
        labels=['0', '1'],
        autopct='%1.4f%%')
plt.show()
#%% review category variables
category_column_names_stat = pd.DataFrame(index=column_names['category'],
                                          columns=['unique_types'])
for name in column_names['category']:
    shape = people_act_train[name].unique().shape
    category_column_names_stat.loc[name, 'unique_types'] = shape[0]
print(category_column_names_stat)
#%%
people_act_train['group_1'].unique()
people_act_train['char_10_a'].unique()
#%%
utils.save_variable('people_act_train', people_act_train)
utils.save_variable('people_act_test', people_act_test)
                           low_memory=False,
                           iterator=True)
chunk_id = 0
for chunk_id in range(0, chunk_nu, 1):
    print('processing chunk:', chunk_id)
    chunk = chunks_cate.get_chunk()
    file_path = 'model_stats/test_cate_proba/' + str(chunk_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exist.')
    else:
        ids = chunk.index
        result = np.zeros(chunk.shape)
        time1 = time.time()
        r = 0
        c = 0
        for col_name in cols_cate:
            col = chunk[col_name]
            time2 = time.time()
            r = 0
            for index, value in col.iteritems():
                proba = cal_0_proba_by_cate(col_name, value)
                result[r, c] = proba
                r += 1
            c += 1
        print('per chunk:', time.time() - time1)
        df = pd.DataFrame(data=result, index=ids.tolist(), columns=cols_cate)
        utils.save_variable(df, file_path)
tr_X = t_X
tr_Y = t_Y
'''
based on experiment, a smaller tol gives a better fit on the training dataset
tol : float, default: 1e-4
    Tolerance for stopping criteria.
'''
best_model = KNeighborsClassifier(n_jobs=3)
t0 = time.time()
best_model = best_model.fit(tr_X, tr_Y)
tr_Y_pred = best_model.predict(tr_X)
best_tr_mcc = matthews_corrcoef(tr_Y, tr_Y_pred)
print(set_id, '- i', '(', tol, ')', 'logic:', ',tr:', best_tr_mcc, end='')
# print(',val:', end='')
# val_Y_pred = best_model.predict(val_X)
# print(matthews_corrcoef(val_Y, val_Y_pred), end='')
# print(',cost:', int(time.time() - t0), 'sec')
# print('val 1s:real', sum(val_Y), ',pred', sum(val_Y_pred))
save_variable(best_model, file_path)
tr_X_1s = utils.read_variable('model_stats/tr_pip_data_1s.pkl')
len_1s = tr_X_1s.shape[0]
for set_id in range(6, 1000, 6):
    # 6 chunks give about the same number of 0s as there are 1s
    # (the tiny amount of 1s in them is ignored)
    file_path = '7/svc_' + str(set_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exist.', file_path)
    else:
        chunk_range = range(set_id - 6, set_id, 1)
        t_X, t_Y = utils.load_training_subset(chunk_range)
        tr_X = np.concatenate([t_X, tr_X_1s])
        tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
        model = SVC(kernel='rbf', C=1)
        t0 = time.time()
        model = model.fit(tr_X, tr_Y)
        y_pred = model.predict(tr_X)
        print(set_id, 'svc:', ',tr:', matthews_corrcoef(tr_Y, y_pred))
        # skip validation here: it takes too long
        # print(',val:', end='')
        # X, Y = val_X, val_Y
        # y_pred = model.predict(X)
        # print(matthews_corrcoef(Y, y_pred), end='')
        # print(',cost:', int(time.time() - t0), 'sec')
        utils.save_variable(model, file_path)
import os

row_total = 1183747
#%%
'''
get 1% of samples as the testing dataset for the final testing
NOTE: use a permutation so the test split draws from everywhere in the
training dataset, to avoid information bias between samples at different
sequential positions
'''
idx = np.random.permutation(row_total)
tr_test_split = int(1183747 * 0.01)
test_row_idx = idx[:tr_test_split]
tr_row_idx = idx[tr_test_split:]
save_variable(tr_row_idx, 'final/tr_row_idx')
save_variable(test_row_idx, 'final/test_row_idx')
# check the test dataset has the same distribution of outcomes as training
# (about 0.5% 1s)
count_1 = 0
bar = progressbar.ProgressBar()
for row_id in bar(test_row_idx):
    row_y = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    count_1 += row_y.values
print()
print('test 1s:', count_1, '(', count_1 / len(test_row_idx), ')')
#%%
'''
rebalance 1s and 0s in the rest of the dataset
'''
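#%%
# Hedged side note: np.random.permutation above is unseeded, so the split is
# reproducible only through the saved index files. A sketch of a seeded
# variant (the seed value 42 is an arbitrary choice of mine, not the
# project's):
rng = np.random.RandomState(42)
idx_seeded = rng.permutation(row_total)
# the same seed always regenerates the identical train/test index split
assert (idx_seeded == np.random.RandomState(42).permutation(row_total)).all()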