import numpy as np
import progressbar
from utils import read_variable


def get_votes(rows_range):
    votes = np.zeros([len(rows_range), 104])
    Y = np.zeros(len(rows_range))
    for i, row_id in enumerate(rows_range):
        votes[i, :] = read_variable('final/tr_votes_1L/' + str(row_id))
        Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    return votes, Y
def get_votes_large(rows_range):
    votes = np.zeros([len(rows_range), 104])
    Y = np.zeros(len(rows_range))
    bar = progressbar.ProgressBar()
    for i, row_id in enumerate(bar(rows_range)):
        votes[i, :] = read_variable('final/tr_votes_1L/' + str(row_id))
        Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
    return votes, Y
def load_tr_XY(group_id):
    # assemble one training group: all the 1s plus this group's share of 0s
    gp_0s_idx = tr_0s_groups[group_id]
    gp_idx = np.concatenate([tr_1s_idx, gp_0s_idx])
    gp_idx = np.random.permutation(gp_idx)
    gp_X = np.zeros([len(gp_idx), col_numeric_nu])
    gp_Y = np.zeros(len(gp_idx))
    bar = progressbar.ProgressBar()
    for i, row_id in enumerate(bar(gp_idx)):
        row_num = read_variable('data/train_numeric_rows/' + str(row_id) +
                                '.pkl')
        gp_Y[i] = read_variable('data/train_y_rows/' + str(row_id) + '.pkl')
        gp_X[i, :] = row_num
    return gp_X, gp_Y
import os
import matplotlib.pyplot as plt
import progressbar
from utils import read_variable

model_folders = [
    'final/1L_tree',
    'final/1L_ada',
    'final/1L_gb',
    'final/1L_mlp',
]
for model_folder in model_folders:
    print('Processing model cluster:', model_folder)
    model_ids = []
    model_stds = []
    model_means = []
    for root, dirs, files in os.walk(model_folder):
        bar = progressbar.ProgressBar()
        for model_idx in bar(files):
            model_path = os.path.join(root, model_idx)
            model = read_variable(model_path)
            #print(model_idx, 'std:', model.val_std, ',mean:', model.val_mean)
            model_stds.append(model.val_std)
            model_means.append(model.val_mean)
            model_ids.append(model_idx)
    #%
    plt.figure()
    plt.errorbar(model_ids, model_means, yerr=model_stds, fmt='o')
    plt.title(model_folder)
#%%
model_folders = [
    'final/1L_tree',
    'final/1L_ada',
    'final/1L_gb',
    'final/1L_mlp',
]
import utils
import pandas as pd
import numpy as np
import progressbar

max_chunk_size = 1000
train_rows_nu = 1183747
chunk_nu = 1184
#%
column_names = utils.read_variable('outputs/column_names.pkl')
cols_date = column_names['date']
#%%
'''
process date data (a KDE-fitting sketch follows in the next cell)
'''
from sklearn.neighbors import KernelDensity  # sklearn.neighbors.kde is the old path
import os.path

responses = utils.read_variable('model_stats/responses.pkl').astype(int)
while True:
    try:
        for col_name in cols_date:
            print('processing date col:', col_name)
            file_path = 'model_stats/date/' + col_name + '.pkl'
            if os.path.exists(file_path):
                print('already exists.')
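#%%
# Hedged sketch (not the project's exact code) of how a per-date-column KDE
# could be fit before being pickled to 'model_stats/date/<col>.pkl'. The
# column name 'L0_S0_D1' and the bandwidth are illustrative assumptions.
import pandas as pd
from sklearn.neighbors import KernelDensity

def fit_date_col_kde(file_name='data/train_date.csv', col_name='L0_S0_D1'):
    col = pd.read_csv(file_name, usecols=[col_name])[col_name]
    values = col.dropna().values.reshape(-1, 1)  # KDE expects a 2-D array
    kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(values)
    return kde  # kde.score_samples(x) gives the log-density at x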
import numpy as np
import pandas as pd
import utils
from sklearn.neural_network import MLPRegressor
#%%
chunk_i = 45
tr_chunk = utils.read_variable('output/tr_chunks/' + str(chunk_i))
test_chunk = utils.read_variable('output/test_data')
tr_Y = tr_chunk['y']
#%%
'''
result: training R scores stay at or below 0 regardless of adjusting the
following configs:
1. alpha
2. hidden_layer_sizes
3. activation
4. tol
5. solver
'''
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
import time
from sklearn.model_selection import KFold

Y = tr_chunk['y'].values
X = tr_chunk.drop(['id', 'y'], axis=1).values  # as_matrix() is deprecated
import utils
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import time
import sklearn.metrics as mx
#%%
people_act_train = utils.read_variable('outputs/people_act_train')
column_names = utils.read_variable('outputs/column_names')
people_act_train_category_dummys = utils.read_variable(
    'outputs/people_act_train_category_dummys')
pc_number = 70
pca = utils.read_variable('outputs/people_act_train_category_pca_' +
                          str(pc_number) + 'pc')
selected_col_ids = utils.read_variable(
    'outputs/people_act_train_category_selected_col_ids_pvalue_0.05')

from scipy.sparse import csr_matrix

# transforming all the data in one pass leads to a memory leak,
# so the transform is done in slices
length = people_act_train_category_dummys.shape[0]
#%%########################
# Run the ascending and descending PCA transforms in two separate IPython
# consoles as a poor man's multi-threading solution.
###########################
#%% DSC####################
###########################
people_act_train_category_pcs_dsc = csr_matrix(
import utils
import pandas as pd
import numpy as np
import progressbar
from collections import defaultdict

max_chunk_size = 1000
train_rows_nu = 1183747
chunk_nu = 10  # 1184
#%
column_names = utils.read_variable('outputs/column_names.pkl')
cols_numeric = column_names['numeric']
#%%
file_name = 'data/train_numeric.csv'
cols = cols_numeric
nan_counts = np.zeros(cols.size)
col_nan_counts = defaultdict(int)
col_index = 0
bar = progressbar.ProgressBar()
for col_name in bar(cols_numeric):
    #print('processing (', file_name, ') col:', col_name)
    chunks = pd.read_csv(file_name,
                         usecols=[col_name],
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
import utils
import numpy as np
import time
import pandas as pd
#%%
print('loading col_stats_cate...')
col_stats_cate = utils.read_variable('model_stats/col_stats_cate.pkl')
print('loading col_stats_date...')
col_stats_date = utils.read_variable('model_stats/col_stats_date.pkl')
print('loading col_stats_num...')
col_stats_num = utils.read_variable('model_stats/col_stats_num.pkl')
#%%
'''
calculate the probability of response 0 given a categorical column value
'''
def cal_0_proba_by_cate(col_name, value):
    stat_0 = col_stats_cate[col_name][0]  # value counts among response-0 rows
    stat_1 = col_stats_cate[col_name][1]  # value counts among response-1 rows
    response0_proba = 0
    if value != value:  # NaN never equals itself
        response0_proba = stat_0['nan'] / (stat_0['nan'] + stat_1['nan'])
    elif value in stat_0 and value not in stat_1:
        response0_proba = 1
    elif value not in stat_0 and value in stat_1:
        response0_proba = 0
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
import utils
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
#%%
kbest_i = [0, 88, 91, 99, 100]
for chunk_i in range(100):
    print(chunk_i, end='-->')
    chunk = utils.read_variable('output/tr_chunks/' + str(chunk_i))
    X = chunk.drop(['id', 'y'], axis=1).values  # as_matrix() is deprecated
    X = X[:, kbest_i]
    Y = chunk['y'].values
    X_imputed = Imputer(missing_values='NaN', strategy='median',
                        axis=0).fit_transform(X)
    X_norm = preprocessing.StandardScaler().fit_transform(X_imputed)
    kf_i = 0
    skf = KFold(n_splits=5, shuffle=True, random_state=13)
    for tr_idx, val_idx in skf.split(X_norm):
        print('kf', kf_i, end='-->')
        kf_i += 1
        tr_X, tr_Y = X_norm[tr_idx, :], Y[tr_idx]
        val_X, val_Y = X_norm[val_idx, :], Y[val_idx]
        model = LinearRegression()
        model.fit(tr_X, tr_Y)
        tr_Y_pred = model.predict(tr_X)
        tr_r = utils.cal_r(tr_Y, tr_Y_pred)
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import utils
import time
#%
column_names = utils.read_variable('outputs/column_names')
people_act_train = utils.read_variable('outputs/people_act_train')
#% convert category columns to integers
#-----------------------------------
people_act_train_category2int = pd.DataFrame()
for name in column_names['category']:
    people_act_train_category2int[name] = people_act_train[name].str.replace(
        r'((type)|(group))\s', '', regex=True)
people_act_train_category2int = people_act_train_category2int.fillna(value=0)
del name
#% convert integers to one-hot codes
one_hot_encoder = OneHotEncoder(n_values='auto', sparse=True)
one_hot_encoder.fit(people_act_train_category2int)
#% this sparse variable will not show up in the variable explorer, and
# calling toarray() on it leads to a MemoryError
people_act_train_category_dummys = one_hot_encoder.transform(
    people_act_train_category2int)
print(people_act_train_category_dummys.shape)
print(type(people_act_train_category_dummys))
del people_act_train_category2int
import utils
import progressbar

column_names = utils.read_variable('outputs/column_names.pkl')
cols_categorical = column_names['categorical']
cols_date = column_names['date']
cols_numeric = column_names['numeric']
col_len = cols_categorical.size + cols_date.size + cols_numeric.size
del column_names
#%%
'''
import the per-column statistics for categorical columns (used downstream to
estimate the probability of response 0) and bundle them into one pickle
'''
col_stats_cate = {}
print('importing col statistics:', 'categorical columns')
bar = progressbar.ProgressBar()
for col_name in bar(cols_categorical):
    col_stats_cate[col_name] = utils.read_variable('model_stats/cate/' +
                                                   col_name + '.pkl')
utils.save_variable(col_stats_cate, 'model_stats/col_stats_cate.pkl')
'''
same for date columns
'''
col_stats_date = {}
print('importing col statistics:', 'date columns')
import matplotlib.pyplot as plt
import utils
import pandas as pd
#%%
column_names = utils.read_variable('column_names')
#%%
#a_row = people[2:3]
#a_col = people['people_id']
#people_unique_ids = people.people_id.unique()
#a_cell = people[2:3]['people_id']
#%% join the people and act tables
# right join: use act_train keys only
people_act_train = pd.merge(people,
                            act_train,
                            how='right',
                            on='people_id',
                            suffixes=('_p', '_a'))
people_act_test = pd.merge(people,
                           act_test,
                           how='right',
                           on='people_id',
                           suffixes=('_p', '_a'))
#%%
people_act_activity_category_type1 = people_act_train.loc[(
    people_act_train['activity_category'] == 'type 1')][0:1]
people_act_char_10_a_notnull = people_act_train.loc[
    people_act_train['char_10_a'].notnull()][0:1]
x_cate_num = np.zeros(
    (x_cate_num_len, col_cate_nu + col_numeric_nu + col_date_nu))
y_cate_num = np.zeros(x_cate_num_len)
bar = progressbar.ProgressBar()
print('loading cate proba and raw num, and date...')
chunks_num = pd.read_csv('data/train_numeric.csv',
                         chunksize=max_chunk_size,
                         low_memory=False,
                         iterator=True)
chunks_date = pd.read_csv('data/train_date.csv',
                          chunksize=max_chunk_size,
                          low_memory=False,
                          iterator=True)
for chunk_id in bar(range(0, chunk_nu, 1)):
    chunk_cate = utils.read_variable('model_stats/train_cate_proba/' +
                                     str(chunk_id) + '.pkl')
    chunk_num = chunks_num.get_chunk()
    chunk_date = chunks_date.get_chunk()
    row_range = range(chunk_id * max_chunk_size,
                      chunk_id * max_chunk_size + chunk_cate.shape[0], 1)
    x_cate_num[row_range, :col_cate_nu] = chunk_cate
    x_cate_num[row_range, col_cate_nu:col_cate_nu +
               col_numeric_nu] = chunk_num.drop(['Response'], axis=1)
    x_cate_num[row_range, col_cate_nu + col_numeric_nu:] = chunk_date
    y_cate_num[row_range] = chunk_num['Response']
del chunk_id, bar, chunk_num, chunk_cate, row_range
#%%
'''
remove low density col
#%%
# used to find the best SGD model during training
sgd_val_chunk_nu = 10
sgd_val_chunk_ids = range(0, sgd_val_chunk_nu, 1)
sgd_val_nu = sgd_val_chunk_nu * max_chunk_size
# NaN-filled so any row that never gets written stands out; float dtype is
# required because NaN cannot be stored in an integer array
sgd_val_y = np.full(sgd_val_nu, np.nan)
sgd_val_x = np.full((sgd_val_nu, col_num_date + col_cate), np.nan)
print('loading model_val dataset.', 'chunk [', sgd_val_chunk_ids, ')')
i = 0
for chunk_id in sgd_val_chunk_ids:
    chunk = utils.read_variable(
        'chunk_tree_votes/models/train_y_votes_prob/chunk_' + str(chunk_id) +
        '.pkl')
    row_range = range(i * max_chunk_size,
                      i * max_chunk_size + chunk.shape[0], 1)
    sgd_val_y[row_range] = chunk['Response']
    # num and date
    votes_date_nu = chunk.drop(['Response'], axis=1)
    sgd_val_x[row_range, 0:col_num_date] = votes_date_nu
    # cate
    chunk = utils.read_variable('model_stats/train_cate_proba/' +
                                str(chunk_id) + '.pkl')
    sgd_val_x[row_range, col_num_date:] = chunk
    i += 1
del i, chunk_id, row_range, chunk, votes_date_nu
#%%
print('-----------------init first Model_2nd----------------')
SUPER_start_timestamp = 10
SUPER_window_size = SUPER_start_timestamp
SUPER_kf_k = 5
SUPER_seed = 13
SUPER_alpha = 1e-8
skf = KFold(n_splits=SUPER_kf_k, shuffle=True, random_state=SUPER_seed)
'''
load lag_models (FIFO)
'''
lag_models = []
for lag_model_timestamp in range(SUPER_start_timestamp - SUPER_window_size,
                                 SUPER_start_timestamp, 1):
    lag_model = utils.read_variable(
        'E:/two-sigma/output/timeseries/model_1L/' + str(lag_model_timestamp))
    lag_models.append(lag_model)
'''
Init Model_2L
'''
timestamp = SUPER_start_timestamp
print('processing', timestamp)
tr_chunk = utils.read_variable('E:/two-sigma/output/timeseries/tr_chunks/' +
                               str(timestamp))
'''
CV
'''
kf_i = 0
best_model_2L_r = 0
from utils import read_variable

all_chunk_idx = range(1184)
tr_chunk_idx = read_variable('final/tr_chunk_idx')
val_chunk_idx = read_variable('final/val_chunk_idx')
#%%
'''
check consistency between the chunk indices of the y, numeric, date and
categorical chunk files
'''
for chunk_id in all_chunk_idx:
    chunk_y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    chunk_num = read_variable('data/train_numeric_chunks/' + str(chunk_id) +
                              '.pkl')
    chunk_date = read_variable('data/train_date_chunks/' + str(chunk_id) +
                               '.pkl')
    chunk_cate = read_variable('data/train_categorical_chunks/' +
                               str(chunk_id) + '.pkl')
    print(chunk_id, end=',')
    diff = chunk_num.index.difference(chunk_date.index)
    if diff.size != 0:
        print('X', end=',')
    else:
        print('v', end=',')
    diff = chunk_num.index.difference(chunk_cate.index)
    if diff.size != 0:
        print('X', end=',')
    else:
        print('v', end=',')
    diff = chunk_num.index.difference(chunk_y.index)
import numpy as np
import progressbar
import utils
from sklearn.metrics import matthews_corrcoef
#%%
print('reading training num and date col votes matrix...')
# memory cannot hold this as a DataFrame, nor as two separate numpy
# variables for votes and Response, so both share one array: column 0 is
# the Response, columns 1: are the votes (footprint arithmetic in the next cell)
votes = np.zeros((1183748, 1185))
max_chunk_size = 1000
#responses = np.zeros((1183748, 1))
bar = progressbar.ProgressBar()
for chunk_id in bar(range(0, 1184, 1)):
    chunk = utils.read_variable(
        'chunk_tree_votes/models/train_y_votes_prob/chunk_' + str(chunk_id) +
        '.pkl')
    votes[chunk_id * max_chunk_size:chunk_id * max_chunk_size +
          chunk.shape[0], 1:] = chunk.drop(['Response'], axis=1)
    votes[chunk_id * max_chunk_size:chunk_id * max_chunk_size +
          chunk.shape[0], 0] = chunk['Response']
del chunk_id, bar, chunk
#%
x_tr = votes[:900000, 1:]
y_tr = votes[:900000, 0]
# used to find the best SGD during training
x_val = votes[900000:1000000, 1:]
y_val = votes[900000:1000000, 0]
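#%%
# Back-of-the-envelope check of why the single float64 array above is
# already at the memory limit: 1,183,748 rows x 1,185 cols x 8 bytes is
# roughly 10.4 GiB, and a DataFrame or a second copy of the votes would
# approximately double that.
rows, cols = 1183748, 1185
gib = rows * cols * 8 / 2**30
print('votes matrix footprint: %.1f GiB' % gib)  # -> ~10.4 GiB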
test_ratio = 0.01
for name, group in data_gp:
    print('T' + str(name), end=',')
    gp_idx = set(data_gp.groups[name])
    idx_pool = gp_idx.intersection(remained_sample_ids)
    sel_ids_len = int(len(gp_idx) * test_ratio)
    # random.sample needs a sequence, not a set, on modern Python
    sel_ids = set(random.sample(list(idx_pool), sel_ids_len))
    print('sel', len(sel_ids), 'samples from', len(gp_idx))
    sel_sample_ids = sel_sample_ids | sel_ids
    remained_sample_ids = remained_sample_ids - sel_ids
utils.save_variable(sel_sample_ids, 'output/test_ids')
print('testing samples:', len(sel_sample_ids))
del sel_sample_ids
#%%
test_ids = list(utils.read_variable('output/test_ids'))
'''
check whether the testing dataset is well distributed among the different
obj ids
'''
data_gp = data.groupby('id')
overall_stats_by_id = {}
for name, group in data_gp:
    print(name, 'has samples:', group.shape[0])
    overall_stats_by_id[name] = group.shape[0]
test_data = data.loc[test_ids]  # .ix is deprecated
data_gp = test_data.groupby('id')
test_ratio_per_obj_ids = []
for name, group in data_gp:
    print(name, 'testing samples:', group.shape[0], 'from',
          overall_stats_by_id[name])
    ratio = group.shape[0] / overall_stats_by_id[name]
#ids = range(400000)
#ids_shuffled = np.random.permutation(ids)
#tr_val_ids = []
#bar = progressbar.ProgressBar()
#for i in bar(ids_shuffled):
#    if i in all_test_ids:
#        test_rows_range.append(i)
#    else:
#        tr_val_ids.append(i)
#%% read all training dataset row ids
all_test_ids = []
tr_val_ids = []
bar = progressbar.ProgressBar()
for gp_idx in bar(range(119)):
    row_group = read_variable('final/row_groups/' + str(gp_idx))
    test_row_ids = row_group['test']
    all_test_ids.extend(test_row_ids)
    tr_row_ids = row_group['train']
    tr_val_ids.extend(tr_row_ids)
#%
tr_rows_range = []
val_rows_range = []
test_rows_range = all_test_ids
ids = range(len(tr_val_ids))
ids_shuffled = np.random.permutation(ids)
#%
tr_rows_range = tr_val_ids[:int(len(tr_val_ids) * 0.9)]
import numpy as np
from utils import read_variable

test_Y = np.zeros(1183748)
row_start = 0
row_end = 0
for chunk_id in range(1184):
    path = 'final/test_votes_1L/' + str(chunk_id)
    votes = read_variable(path)
    # a row is predicted 1 if at least one first-layer model votes 1
    chunk_Y_pred = (np.sum(votes, axis=1) >= 1).astype(int)  # np.int is deprecated
    print(chunk_id, '1s:', sum(chunk_Y_pred))
    row_end = row_start + len(chunk_Y_pred)
    test_Y[row_start:row_end] = chunk_Y_pred
    row_start = row_end
print('FINAL 1s:', sum(test_Y))
#%%
import pandas as pd

# saving to CSV
test_ids = read_variable('outputs/test_ids.pkl')
test_y_ids = pd.DataFrame(test_ids, columns=['Id'])
test_y_y = pd.DataFrame(test_Y, columns=['Response'])
test_y = pd.concat([test_y_ids, test_y_y], axis=1)
test_y = test_y.set_index('Id')
test_y.to_csv('submissions/submission_1130.csv', float_format='%.0f')
    print(max_depth, '-->', col_range, end='')
    print(',tr:', matthews_corrcoef(Y, y_pred), end='')
    #print('tr 1s:real', sum(Y), ',pred', sum(y_pred))
    #utils.save_variable(tree_votes_0, 'models/tree_votes_0.pkl')
    print(',val:', end='')
    X, Y = val_X[:, col_range], val_Y
    y_pred = model.predict(X)
    val_mcc = matthews_corrcoef(Y, y_pred)
    best_val_mcc = val_mcc
    print(val_mcc, end='')
    print(',1s:', sum(y_pred), '/', sum(Y))
#%%
from sklearn.tree import DecisionTreeClassifier

# penalize mistakes on 0s far more heavily (usage sketch in the next cell)
class_weight = {0: 1000, 1: 10}
#%%
col_trees = []
bar = ProgressBar()
for col_set_idx in bar(range(78)):
    model = read_variable('vert/tree_' + str(col_set_idx) + '.pkl')
    col_trees.append(model)
#%%
# each tree covers a vertical slice of 10 columns
for col_set_idx, model in enumerate(col_trees):
    col_range = range(10 * col_set_idx, 10 * col_set_idx + 10, 1)
    pred_y = model.predict(val_X[:, col_range])
    print(col_set_idx, matthews_corrcoef(val_Y, pred_y), sum(pred_y))
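#%%
# Hedged sketch of how the class_weight dict above would be passed to a
# per-column-slice tree; max_depth=5 and the first 10-column slice are
# illustrative assumptions, not confirmed settings from this run.
from sklearn.tree import DecisionTreeClassifier

sketch_tree = DecisionTreeClassifier(max_depth=5,
                                     class_weight={0: 1000, 1: 10})
# fit on one vertical slice of columns, e.g. the first 10:
# sketch_tree.fit(tr_X[:, 0:10], tr_Y)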
import time
from utils import load_training_subset_1110, read_variable, save_variable
import numpy as np
from sklearn.metrics import matthews_corrcoef
#%%
val_X, val_Y = load_training_subset_1110(range(1000, 1010, 1))
tr_X_1s = read_variable('model_stats/tr_pip_data_1s_1110.pkl')
#%%
'''
Model: SGD
'''
from sklearn.linear_model import SGDClassifier

len_1s = tr_X_1s.shape[0]
for set_id in range(0, 166, 1):
    # every 166th chunk, so each model sees a different slice of the 0s
    chunk_range = range(set_id, 1000, 166)
    t_X, t_Y = load_training_subset_1110(chunk_range)
    tr_X = np.concatenate([t_X, tr_X_1s])
    tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
    alpha = 1e-4  # default
    # penalty: 'none', 'l2', 'l1', or 'elasticnet'
    penalty = 'l1'
    model = SGDClassifier(alpha=alpha, shuffle=True, n_jobs=3,
                          penalty=penalty)
    t0 = time.time()
    model = model.fit(tr_X, tr_Y)
'''
Decision tree learners create biased trees if some classes dominate. It is
therefore recommended to balance the dataset prior to fitting with the
decision tree. ref: http://scikit-learn.org/stable/modules/tree.html
'''
'''
StratifiedKFold is a variation of k-fold which returns stratified folds:
each set contains approximately the same percentage of samples of each
target class as the complete set. (see the sketch after this cell)
'''
import os

for root, dirs, files in os.walk('final/tr_groups'):
    for chunk_idx in files:
        print('chunk', chunk_idx, end='...')
        chunk_path = os.path.join(root, chunk_idx)
        tr_chunk = read_variable(chunk_path)
        model_path = 'final/1L_tree/' + str(chunk_idx)
        if os.path.isfile(model_path):
            print('model exists')
        else:
            # save a placeholder first so parallel runs skip this chunk
            save_variable({}, model_path)
            print('processing...')
            chunk_X, chunk_Y = load_pipped_tr_chunk([chunk_idx])
            # based on experiments, k=4 gives the smallest STD
            chunk_model = ForestChunkClassifierWithKFolds(k=4,
                                                          seeds=[13, 11, 193])
            chunk_model.fit(chunk_X, chunk_Y, test_X, test_Y)
            chunk_Y_pred = chunk_model.predict(chunk_X)
            chunk_mcc = matthews_corrcoef(chunk_Y, chunk_Y_pred)
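#%%
# Minimal sketch of the StratifiedKFold behaviour quoted above; the toy
# 9:1 class ratio is an illustrative assumption, not the project's data.
# Every fold keeps roughly the same class ratio as the complete set.
import numpy as np
from sklearn.model_selection import StratifiedKFold

toy_y = np.array([0] * 90 + [1] * 10)
toy_X = np.zeros((100, 3))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
for tr_idx, val_idx in skf.split(toy_X, toy_y):
    print('fold 1s ratio:', toy_y[val_idx].mean())  # ~0.1 in every fold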
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
from utils import read_variable, save_variable
import time

tr_chunk_idx = read_variable('final/tr_chunk_idx')
tr_Y = np.zeros([0])
tr_votes = np.zeros([0, 301])
bar = progressbar.ProgressBar()
# (see the note after this cell on the cost of concatenating in a loop)
for chunk_id in bar(tr_chunk_idx):
    chunk_votes = read_variable('final/tr_votes/' + str(chunk_id) + '.pkl')
    tr_votes = np.concatenate([tr_votes, chunk_votes])
    chunk_Y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    tr_Y = np.concatenate([tr_Y, chunk_Y])

val_chunk_idx = read_variable('final/val_chunk_idx')
val_Y = np.zeros([0])
val_votes = np.zeros([0, 301])
bar = progressbar.ProgressBar()
for chunk_id in bar(val_chunk_idx):
    chunk_votes = read_variable('final/tr_votes/' + str(chunk_id) + '.pkl')
    val_votes = np.concatenate([val_votes, chunk_votes])
    chunk_Y = read_variable('data/train_y_chunks/' + str(chunk_id) + '.pkl')
    val_Y = np.concatenate([val_Y, chunk_Y])
#%%
from sklearn.ensemble import RandomForestClassifier

tr_row_nu = tr_Y.shape[0]
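#%%
# Note on the loops above: growing an array with np.concatenate on every
# iteration copies everything collected so far, so the loop is O(n^2) in
# rows. A hedged alternative sketch: collect the chunks in a list and
# concatenate once at the end.
import numpy as np
from utils import read_variable

def stack_chunks(chunk_idx, folder='final/tr_votes/'):
    chunks = [read_variable(folder + str(cid) + '.pkl') for cid in chunk_idx]
    return np.concatenate(chunks)
# tr_votes = stack_chunks(tr_chunk_idx)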
import progressbar
import numpy as np
from sklearn.metrics import matthews_corrcoef
import time, os
from utils import load_training_subset_1110, read_variable
#%
val_X, val_Y = load_training_subset_1110(range(1000, 1184, 1))
#%%
print('loading trees...')
model_forest = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 300, 1)):
    model = read_variable('9/forest_' + str(set_id) + '.pkl')
    model_forest.append(model)
#%%
print('loading logic...')
model_logic = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 166, 1)):
    model = read_variable('7/logic_' + str(set_id) + '.pkl')
    model_logic.append(model)
#%%
print('loading boost...')
model_boost = []
bar = progressbar.ProgressBar()
for set_id in bar(range(0, 166, 1)):
    model = read_variable('7/boost_' + str(set_id) + '.pkl')
    model_boost.append(model)
#import col_stats_utils
import utils
import pandas as pd
import time
import numpy as np
'''
=======================================================
'''
print('loading col_stats_cate...')
col_stats_cate = utils.read_variable('model_stats/col_stats_cate.pkl')
column_names = utils.read_variable('outputs/column_names.pkl')
cols_cate = column_names['categorical']
#%%
'''
calculate the probability of response 0 given a categorical column value
'''
def cal_0_proba_by_cate(col_name, value):
    stat_0 = col_stats_cate[col_name][0]  # value counts among response-0 rows
    stat_1 = col_stats_cate[col_name][1]  # value counts among response-1 rows
    response0_proba = 0
    if value != value:  # NaN never equals itself
        response0_proba = stat_0['nan'] / (stat_0['nan'] + stat_1['nan'])
    elif value in stat_0 and value not in stat_1:
        response0_proba = 1
    elif value not in stat_0 and value in stat_1:
        response0_proba = 0
    elif value in stat_0 and value in stat_1:
        response0_proba = stat_0[value] / (stat_0[value] + stat_1[value])
    return response0_proba
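#%%
# Hedged worked example of cal_0_proba_by_cate using made-up counts (the
# real stats live in model_stats/col_stats_cate.pkl): if value 'T1' occurs
# 30 times among response-0 rows and 10 times among response-1 rows, the
# estimate is P(response=0 | 'T1') = 30 / (30 + 10) = 0.75.
col_stats_cate['demo_col'] = {0: {'nan': 5, 'T1': 30},
                              1: {'nan': 5, 'T1': 10}}
print(cal_0_proba_by_cate('demo_col', 'T1'))           # 0.75
print(cal_0_proba_by_cate('demo_col', float('nan')))   # 0.5 (NaN branch)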
import utils
from sklearn.ensemble import RandomForestClassifier
import time
#%%
tr = utils.read_variable('outputs/tr')
val = utils.read_variable('outputs/val')
estimator_nu = 2
print('###########################')
print('Char 38, Tree Nu:', estimator_nu)
print('---------------------------')
X_col_exl = ['outcome', 'char_38']
#X = tr.drop(X_col_exl, axis=1)
X = tr['char_38'].values.reshape(-1, 1)  # Series.reshape is deprecated
Y = tr['outcome']
# col char_38
model = RandomForestClassifier(n_estimators=estimator_nu,
                               verbose=0,
                               n_jobs=-1)
startTime = time.time()
model = model.fit(X, Y)
print('Training took', int(time.time() - startTime), 'sec')
f = model.predict(X)
print('------TRAINING-------')
(tr_f1, tr_auc, tr_confusion) = utils.validate_prediction(f, Y)
del X, Y, f
#%
#%%
#val_X, val_Y = utils.load_training_subset(range(1000, 1184, 1))
#%%
'''
Model: SVC
WARNING: the implementation is based on libsvm. The fit time complexity is
more than quadratic in the number of samples, which makes it hard to scale
to datasets with more than a couple of 10000 samples.
HENCE, the training dataset is split into small balanced blocks.
'''
from sklearn.svm import SVC
import numpy as np
import os
import utils

tr_X_1s = utils.read_variable('model_stats/tr_pip_data_1s.pkl')
len_1s = tr_X_1s.shape[0]
for set_id in range(6, 1000, 6):
    # 6 chunks hold about as many 0s (the tiny amount of 1s in them is
    # ignored) as the 1s file (block arithmetic in the next cell)
    file_path = '7/svc_' + str(set_id) + '.pkl'
    if os.path.exists(file_path):
        print('already exists.', file_path)
    else:
        chunk_range = range(set_id - 6, set_id, 1)
        t_X, t_Y = utils.load_training_subset(chunk_range)
        tr_X = np.concatenate([t_X, tr_X_1s])
        tr_Y = np.concatenate([t_Y, np.ones(len_1s)])
        model = SVC(kernel='rbf', C=1)
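#%%
# Rough arithmetic behind the block size above (hedged; the exact positive
# count comes from the data): 6 chunks x 1000 rows give ~6000 0s per block,
# roughly matching the few thousand 1s, so each SVC trains on a
# near-balanced block well under libsvm's practical size limit. This also
# yields 166 blocks, matching the 166 SVC/SGD models loaded elsewhere.
chunk_size = 1000
blocks = [(set_id - 6, set_id) for set_id in range(6, 1000, 6)]
print(len(blocks), 'SVC blocks of ~', 6 * chunk_size, 'negative rows each')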
import time
from utils import load_pipped_tr_chunks, read_variable, save_variable
import numpy as np
from sklearn.metrics import matthews_corrcoef
#%%
tr_chunk_idx = read_variable('final/tr_chunk_idx')
#%%
'''
Model: Tree
'''
'''
Decision tree learners create biased trees if some classes dominate. It is
therefore recommended to balance the dataset prior to fitting with the
decision tree. ref: http://scikit-learn.org/stable/modules/tree.html
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
import os

good_model_val_mcc = 0.2
from sklearn.neighbors import KNeighborsClassifier

for model_idx in range(100):
    print(model_idx, end='...')
    file_path = 'final/good_models_onlynum_2/' + str(model_idx)
    if os.path.isfile(file_path):
        print('exist')
    else:
        print('processing...')