if prr >= 0.9: return 'good' elif prr <= 0.1: return 'bad' else: return 'interm.' @memory.cache def load_rutgers(): return list(get_traces()) features = ['rssi', 'rssi_std', 'rssi_avg'] cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True) pipe = pipeline.Pipeline([ ('scaler', preprocessing.StandardScaler()), ('resample', over_sampling.RandomOverSampler()), #('clf', tree.DecisionTreeClassifier(max_depth=3)), ('linear', linear_model.LogisticRegression(solver='ovr')), ]) @memory.cache def different_window_sizes(W_PRR, W_HISTORY): print(f'*** PRR={W_PRR}, HISTORY={W_HISTORY} ***') dataset = load_rutgers() print('Rutgers loaded ...')
# file directory root_dir = 'D:/Deep Learning/Projects/Melanoma skin cancer detection/' # not using # Folding the validation set using stratified cross validation . Stratified cross validation works well for skewed dataset if __name__ == "__main__": train_csv = pd.read_csv(root_dir + 'train.csv') train_csv['kfold'] = -1 # DataFrame.sample randomizes the rows of the data and frac determines the fraction of the data to be used for randomizing # DataFrame.reset_index creates new index for the randomized data and drop=True delete the previous index train_csv = train_csv.sample(frac=1).reset_index(drop=True) # listing target values based on which we will set up the stratified folds y = train_csv.target.values # initiating the k-fold class from model selection kf = model_selection.StratifiedKFold(n_splits=5) for count, (train_index, val_index) in enumerate(kf.split(X=train_csv, y=y)): train_csv.loc[val_index, 'kfold'] = count train_csv.to_csv(root_dir + "train_folds.csv") # data loading and converting into tensor # train data train_csv = pd.read_csv(root_dir + 'train.csv') images_id_train = train_csv.image_name.values.tolist() train_images = [ os.path.join(root_dir + 'train/', i + '.jpg') for i in images_id_train ] train_targets = train_csv.target.values
# split into train test sets using t_t_s # because we combined the datasets to apply uniform # one hot and label encoding, we set 'shuffle' parameter as false # we also know that there should be 15060 rows in the test sets test_set_size = test_dataset_nomissing.shape[0] print('\n test_set_size...') print(test_set_size) X_train, X_test, Y_train, Y_test = t_t_s(rescaledX, Y, test_size=test_set_size, random_state=seed, shuffle=False) # instantiate XGBC class using defaults model = XGBC() # evaluate the model against the training datset using stratified kfold print('\n evaluating xgb model via skfold...') kfold = m_s.StratifiedKFold(n_splits=10, random_state=seed, shuffle=True) cv_results = c_v_s(model, X_train, Y_train, cv=kfold) print("xgb SKFOLD training accuracy standardized: %.2f%% (%.2f%%)" % (cv_results.mean()*100, cv_results.std()*100)) # fit model to training datasets print('\n training d model...') model.fit(X_train, Y_train) # view trained model print('\n model...') print(model) # make predictions for test data print('\n making predictions...') y_pred = model.predict(X_test) predictions = [round(value) for value in y_pred]
p=2))) models.append( ('KNN-Manhattan', KNeighborsClassifier(n_neighbors=neighbhors, p=1))) models.append(("KNN-Manhattan-Weighted", KNeighborsClassifier(n_neighbors=neighbhors, weights='distance', p=1))) models.append(("Gaussian Bayes", GaussianNB())) models.append(("ID3", tree.DecisionTreeClassifier())) #evaluate each model in turn results = [] names = [] for name, model in models: kfold = model_selection.StratifiedKFold(n_splits=splits, shuffle=True, random_state=seed) cv_results = model_selection.cross_val_score(model, descriptiveFeats, targetFeats, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) print("-----------------------{}-----------------------".format(name)) y_predictions = cross_val_predict(model, descriptiveFeats, targetFeats, cv=kfold) print(accuracy_score(targetFeats, y_predictions)) print(confusion_matrix(targetFeats, y_predictions))
[f for f in features if 'context_' in f][0]) m = train_nn_def.BespokeNN(num_dense_features=num_dense_features, verbose=0, file_prefix=CACHE_DIR + '/holdout_bespoke_nn_') param_grid = { 'model__num_hidden_layers': [0, 1, 2, 4], 'model__hidden_layer_size': [2, 4, 8, 16, 32], 'model__dropout': [0, .25, .5], 'model__learning_rate': [.01, .001, .0001], 'model__dense_reg_strength': [.1, .01, .001], # Bespoke DNN specific parameters 'model__sparse_reg_strength': [.1, .01, .001], } xval = model_selection.StratifiedKFold(4, shuffle=True, random_state=RANDOM_SEED) pipe = pipeline.Pipeline([ ('normalize', preprocessing.MinMaxScaler()), # NN classifier (doesn't hurt trees) ('model', m), ]) gs = model_selection.GridSearchCV(pipe, param_grid, scoring=score_metric, verbose=2, cv=xval, refit=True) threshold = 1 / 3
def create(self, X, y):
    """Build a stratified k-fold splitter from this object's settings.

    ``X`` and ``y`` are accepted for interface compatibility but are not
    used; the splitter is configured only from ``self.n_splits``,
    ``self.shuffle`` and ``self.random_state``.
    """
    splitter_options = {
        'n_splits': self.n_splits,
        'shuffle': self.shuffle,
        'random_state': self.random_state,
    }
    splitter = model_selection.StratifiedKFold(**splitter_options)
    return splitter
def crossValidationLogReg():
    """Estimate the generalization error of L2 logistic regression.

    Runs two-level (nested) stratified cross-validation on the module-level
    arrays ``X`` (samples x features) and ``y`` (labels):

    * the inner loop scores ``model_count`` candidate values of ``C`` on the
      outer training set only;
    * the outer loop refits the best candidate on the whole outer training
      set and measures its error on the held-out outer fold.

    Prints progress, shows a box plot of the outer-fold errors and prints the
    size-weighted generalization error together with the most frequently
    selected ``C``. Returns ``None``.
    """
    K_o_splits = 10
    K_i_splits = 10
    model_count = 10
    # C is the inverse of the regularization strength: small C means strong
    # regularization.
    reg_terms = [0.01 + idx * 0.1 for idx in range(model_count)]

    eval_o = np.zeros(K_o_splits)          # error [%] per outer fold
    fold_sizes = np.zeros(K_o_splits)      # test-set size per outer fold
    optimal_lambda = np.zeros(K_o_splits)  # selected C per outer fold

    # StratifiedKFold keeps a reasonable share of each class in every split.
    CV1 = model_selection.StratifiedKFold(n_splits=K_o_splits, shuffle=True)
    CV2 = model_selection.StratifiedKFold(n_splits=K_i_splits, shuffle=True)

    # Outer k-fold split
    for outer_it, (train_index_o, test_index_o) in enumerate(CV1.split(X, y)):
        print('Outer CV1-fold {0} of {1}'.format(outer_it + 1, K_o_splits))
        X_train_o = X[train_index_o, :]
        y_train_o = y[train_index_o]
        X_test_o = X[test_index_o, :]
        y_test_o = y[test_index_o]

        # Misclassification counts per candidate; reset for every outer fold
        # so one fold's model selection does not depend on earlier folds.
        inner_errors = np.zeros(model_count)

        # Inner validation loop
        for inner_it, (train_index_i, test_index_i) in enumerate(
                CV2.split(X_train_o, y_train_o)):
            print('Inner CV2-fold {0} of {1}'.format(inner_it + 1, K_i_splits))
            # The inner indices are positions within the outer training set,
            # so they must index X_train_o / y_train_o, not X / y.
            X_train_i = X_train_o[train_index_i, :]
            y_train_i = y_train_o[train_index_i]
            X_test_i = X_train_o[test_index_i, :]
            y_test_i = y_train_o[test_index_i]

            for idx, reg_term in enumerate(reg_terms):
                model = lm.LogisticRegression(C=reg_term, penalty='l2')
                model = model.fit(X_train_i, y_train_i)
                y_logreg = model.predict(X_test_i)
                inner_errors[idx] += (y_logreg != y_test_i).sum()

        # Each outer-training sample is validated exactly once across the
        # inner folds, so this is the size-weighted mean inner error in %.
        eval_i = 100 * inner_errors / len(y_train_o)
        reg_term = reg_terms[np.argmin(eval_i)]

        model = lm.LogisticRegression(C=reg_term, penalty='l2')
        model = model.fit(X_train_o, y_train_o)
        y_logreg = model.predict(X_test_o)

        eval_o[outer_it] = 100 * np.mean(y_logreg != y_test_o)
        fold_sizes[outer_it] = len(y_test_o)
        optimal_lambda[outer_it] = reg_term

    # Most frequently selected C (np.unique sorts by value, so the mode has
    # to be looked up through the counts).
    unique_regs, reg_counts = np.unique(optimal_lambda, return_counts=True)
    mode_reg = unique_regs[np.argmax(reg_counts)]

    figure()
    boxplot(eval_o)
    xlabel('Logistic Regression')
    ylabel('Cross-validation error [%]')
    show()

    # Weight every outer fold by its own test-set size.
    e_gen = np.sum(eval_o * fold_sizes) / len(y)
    print("Logistic regression generalization error: %f with %s and %f" %
          (e_gen, 'l2-norm', mode_reg))
# NOTE(review): this first print appears to be the body of a loop whose
# header lies above the visible part of the file — confirm its indentation.
print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

print('{0:-^70}'.format('Shuffle Split'))
# Shuffles the data and then splits the dataset at random; similar to the
# first method.
ss = sm.ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
print('Shuffle Split class: ', ss)
print('splits of ss: ', ss.get_n_splits(X))  # simply the n_splits argument given to the constructor
for train_indices, test_indices in ss.split(X):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# The K-Fold, LOO, LPO and Shuffle methods above all assume that the samples
# are independent and identically distributed (i.i.d.).
# Only under the i.i.d. assumption do these sampling methods preserve the
# statistical characteristics of the data without distortion.
# Below are stratified or group-based sampling methods for data that is not
# evenly distributed.
print('{0:-^70}'.format('Stratified K-Fold'))
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
skf = sm.StratifiedKFold(n_splits=4)
print('Stratified K-Fold class: ', skf)
print('splits of skf: ', skf.get_n_splits(X, y))  # takes one more argument: stratify according to y
for train_indices, test_indices in skf.split(X, y):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Group K-Fold: besides X and y there is an extra argument giving the group
# each sample belongs to.
# The split must guarantee that the groups of the samples in the test set
# are completely disjoint from the groups of the samples in the training set.
print('{0:-^70}'.format('Group K-Fold'))
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = sm.GroupKFold(n_splits=3)  # the number of distinct groups must be at least n_splits (3 groups here)
print('X: \n', X)
print('y: ', y)
print('groups: ', groups)
# Candidate classifiers, appended to the existing `models` list as
# (short name, estimator) pairs.
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='scale')))
models.append(('AdaB', AdaBoostClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, n_jobs=-1)))
models.append(('GBM', GradientBoostingClassifier()))

# Results are written to this file as well as printed.
# NOTE(review): the handle is not closed within the visible code — confirm
# it is closed further down.
outFile = open("output.txt", "w")

#############################
# model evaluation
results = []
names = []
scoring = 'f1'
for name, model in models:
    # random_state only has an effect when shuffle=True; scikit-learn
    # rejects a random_state combined with shuffle=False, so shuffling is
    # enabled to get the seeded, reproducible folds the seed asks for.
    kfold = model_selection.StratifiedKFold(n_splits=10,
                                            shuffle=True,
                                            random_state=seed)
    # error_score=np.nan: a fold that fails to fit yields NaN instead of
    # aborting the whole comparison.
    cv_results = model_selection.cross_val_score(model,
                                                 X,
                                                 Y,
                                                 cv=kfold,
                                                 scoring=scoring,
                                                 error_score=np.nan,
                                                 n_jobs=-1)
    results.append(cv_results)
    names.append(name)
    # "<name>: <mean F1> (<std F1>) "
    msg = "%s: %f (%f) " % (name, cv_results.mean(), cv_results.std())
    outFile.write(msg)
    print(msg)
    # Elapsed time is measured from start_time, which is set outside this
    # block, so it is cumulative rather than per model.
    outFile.write("- Time: %s in seconds" % (time.time() - start_time))
    print("- Time: %s in seconds" % (time.time() - start_time))
    outFile.write("\n")
# Save TF-IDF vectors so I'm not recalculating them every time... #print('Save the TF-IDFed data.\n') #pickle.dump([x_train, y_train], open(os.path.join(data_path, 'meanembedding-train.pkl'), 'wb')) #pickle.dump([x_test, y_test], open(os.path.join(data_path, 'meanembedding-test.pkl'), 'wb')) # Load TF-IDFed train/test data with labels attached #print('Load TF-IDFed Train/Test data.') #[x_train, y_train] = pickle.load(open(os.path.join(data_path,'tfidfed-train.pkl'),'rb')) #[x_test, y_test] = pickle.load(open(os.path.join(data_path,'tfidfed-test.pkl'),'rb')) # Print data info #print('Number of Instances: {}'.format(data.shape[0])) print('\tTraining instances: {}'.format(x_train.shape)) print('\tTesting instances: {}\n'.format(x_test.shape)) strat_kfold = ms.StratifiedKFold(n_splits=5, shuffle=True) print('Cross-validation: {} folds\n'.format(strat_kfold.get_n_splits())) for estimator in classifiers: print('{} fitting - '.format(estimator.__class__.__name__), end='') # Fit model time_start = time.time() estimator.fit(x_train, y_train) time_stop = time.time() elapsed = time_stop - time_start print('{} minutes {} seconds'.format(elapsed // 60, elapsed % 60)) # Predict on the training dataset print('{} predict training - '.format(estimator.__class__.__name__), end='')
def split_data(self):
    """Build the train / test / validation index sets used for training.

    All results are stored on ``self`` as pandas index objects under the
    key ``'no_gradient'``:

    * ``train_splits_df[k]`` / ``test_splits_df[k]`` — outer split ``k``;
    * ``train_cv_splits_df[k][j]`` / ``validation_cv_splits_df[k][j]`` —
      cross-validation split ``j`` inside outer split ``k``.

    Split methods (``test_split_method`` / ``cv_split_method``):
    0 = shuffled KFold, 1 = shuffled StratifiedKFold on
    ``self.dependent_variable``, 2 = the user-supplied ``self.test_split`` /
    ``self.cv_split`` callable.

    Case 1 (``independent_testset == 0``): test and validation sets are both
    carved out of ``train_data_df``.
    Case 2 (``independent_testset == 1``): ``test_data_df`` is a separate
    set; validation comes either from it (``validation_from_testset``) or
    from ``train_data_df``.

    Raises:
        ValueError: if a split method other than 0, 1 or 2 is configured.
    """

    def kfold_pairs(data_df, method, n_folds, ratio):
        """Return positional (first, second) index pairs for method 0 or 1.

        With ``n_folds == 1`` a single pair is taken from an
        ``int(1 / ratio)``-fold split, so the second part holds roughly
        ``ratio`` of the rows.
        """
        if method == 0:
            splitter_cls = ms.KFold
            labels = None
        elif method == 1:
            splitter_cls = ms.StratifiedKFold
            # StratifiedKFold.split needs the labels to stratify on.
            labels = data_df[self.dependent_variable]
        else:
            raise ValueError('unsupported split method: {!r}'.format(method))
        single_split = n_folds == 1
        n_splits = int(1 / ratio) if single_split else n_folds
        splitter = splitter_cls(n_splits=n_splits, shuffle=True)
        pairs = list(splitter.split(data_df, labels))
        return pairs[:1] if single_split else pairs

    def empty_split_containers():
        """Allocate the four per-fold containers as lists of empty dicts."""
        self.train_splits_df = [{} for i in range(self.test_split_folds)]
        self.test_splits_df = [{} for i in range(self.test_split_folds)]
        self.train_cv_splits_df = [[{} for i in range(self.cv_split_folds)]
                                   for j in range(self.test_split_folds)]
        self.validation_cv_splits_df = [[
            {} for i in range(self.cv_split_folds)
        ] for j in range(self.test_split_folds)]

    def store_cv_splits(fold_id, cur_train_data_df, kf_list):
        """Translate positional CV pairs into index labels and store them."""
        for cv_split_fold_id, (train_index_split,
                               validation_index_split) in enumerate(kf_list):
            self.train_cv_splits_df[fold_id][cv_split_fold_id][
                'no_gradient'] = cur_train_data_df.index[train_index_split]
            self.validation_cv_splits_df[fold_id][cv_split_fold_id][
                'no_gradient'] = cur_train_data_df.index[
                    validation_index_split]

    # Whole-dataset bookkeeping.
    self.train_data_for_target_df = {}
    self.train_data_for_target_df['no_gradient'] = self.train_data_df.index
    self.target_data_for_target_df = {}
    self.target_data_for_target_df['no_gradient'] = self.target_data_df.index

    self.target_data_index_df = self.target_data_df
    self.train_data_index_df = self.train_data_df
    if self.independent_testset == 1:
        self.test_data_index_df = self.test_data_df
    else:
        self.test_data_index_df = self.train_data_df
    if self.validation_from_testset == 1:
        self.validation_data_index_df = self.test_data_df
    else:
        self.validation_data_index_df = self.train_data_df

    if (self.independent_testset == 1) & (self.validation_from_testset == 1):
        self.validation_data_for_target_df = {}
        self.validation_data_for_target_df[
            'no_gradient'] = self.test_data_df.index
    else:
        self.validation_data_for_target_df = None

    # case 1: test, validation and training all come from the original
    # training set
    if self.independent_testset == 0:
        empty_split_containers()

        # Split the original training set into test_split_folds folds
        # (training - test).
        if self.test_split_method == 2:
            # customized split
            kf_list = self.test_split(self.train_data_df)
        elif self.test_split_folds == 1 and self.test_split_ratio == 0:
            # No test part: every row, addressed by position, is training
            # data. Positions (not index labels) are used for both the plain
            # and the stratified method so the lookup below stays correct
            # for any index.
            kf_list = [(range(len(self.train_data_df.index)), None)]
        else:
            kf_list = kfold_pairs(self.train_data_df, self.test_split_method,
                                  self.test_split_folds,
                                  self.test_split_ratio)

        for test_split_fold_id, (train_index_split,
                                 test_index_split) in enumerate(kf_list):
            self.train_splits_df[test_split_fold_id][
                'no_gradient'] = self.train_data_df.index[train_index_split]
            if test_index_split is None:
                test_index = None
            else:
                test_index = self.train_data_df.index[test_index_split]
            self.test_splits_df[test_split_fold_id]['no_gradient'] = test_index

        # Split each training set into cv_split_folds folds
        # (training - validation).
        for i in range(self.test_split_folds):
            cur_train_data_df = self.train_data_df.loc[
                self.train_splits_df[i]['no_gradient'], :]
            if self.cv_split_method == 2:
                # customized split
                kf_list = self.cv_split(cur_train_data_df)
            else:
                kf_list = kfold_pairs(cur_train_data_df, self.cv_split_method,
                                      self.cv_split_folds,
                                      self.cv_split_ratio)
            store_cv_splits(i, cur_train_data_df, kf_list)

    # case 2: training from one set, validation and test from another set
    # (independent_testset and validation_from_testset parameters)
    if self.independent_testset == 1:
        empty_split_containers()

        if self.validation_from_testset:
            # Split the test set into test_split_folds folds
            # (validation - test).
            if self.validation_equal_testset:
                # Special case: the validation set is the same as the test
                # set.
                self.test_split_folds = 1
                kf_list = [(range(self.test_data_df.shape[0]),
                            range(self.test_data_df.shape[0]))]
            elif self.test_split_method == 2:
                # customized split
                kf_list = self.test_split(self.test_data_df.copy())
            else:
                kf_list = kfold_pairs(self.test_data_df,
                                      self.test_split_method,
                                      self.test_split_folds,
                                      self.test_split_ratio)

            for test_split_fold_id, (validation_index_split,
                                     test_index_split) in enumerate(kf_list):
                validation_index = self.test_data_df.index[
                    validation_index_split]
                test_index = self.test_data_df.index[test_index_split]
                # The whole training set is used for training in every fold.
                self.train_splits_df[test_split_fold_id][
                    'no_gradient'] = self.train_data_df.index
                self.test_splits_df[test_split_fold_id][
                    'no_gradient'] = test_index
                # The validation part is cut into cv_split_folds consecutive
                # chunks, one per CV fold.
                cv_validation_index = np.array_split(validation_index,
                                                     self.cv_split_folds)
                for j in range(self.cv_split_folds):
                    self.train_cv_splits_df[test_split_fold_id][j][
                        'no_gradient'] = self.train_data_df.index
                    self.validation_cv_splits_df[test_split_fold_id][j][
                        'no_gradient'] = cv_validation_index[j]
            print('done')
        else:
            # Only fold 0 is populated here, so this branch expects
            # test_split_folds == 1.
            self.train_splits_df[0]['no_gradient'] = self.train_data_df.index
            self.test_splits_df[0]['no_gradient'] = self.test_data_df.index

            # Split each training set into cv_split_folds folds
            # (training - validation).
            for i in range(self.test_split_folds):
                cur_train_data_df = self.train_data_df.loc[
                    self.train_splits_df[i]['no_gradient'], :]
                if self.cv_split_method == 2:
                    # customized split
                    # NOTE(review): case 1 calls cv_split with the data frame
                    # only; this call passes (name, folds, frame) — confirm
                    # which signature the callable really has.
                    kf_list = self.cv_split(self.name, self.cv_split_folds,
                                            cur_train_data_df)
                else:
                    # The stratified splitter is now given its labels; the
                    # single-fold case follows the same ratio rule as case 1.
                    kf_list = kfold_pairs(cur_train_data_df,
                                          self.cv_split_method,
                                          self.cv_split_folds,
                                          self.cv_split_ratio)
                store_cv_splits(i, cur_train_data_df, kf_list)
def crossValidationDT():
    """Estimate the generalization error of an entropy decision tree.

    Runs two-level (nested) stratified cross-validation on the module-level
    arrays ``X`` (samples x features) and ``y`` (labels):

    * the inner loop scores ``model_count`` candidate ``max_depth`` values
      (1, 2, ..., ``model_count``) on the outer training set only;
    * the outer loop refits the best depth on the whole outer training set
      and measures its error on the held-out outer fold.

    Prints progress, shows a box plot of the outer-fold errors and prints the
    size-weighted generalization error together with the most frequently
    selected depth. Returns ``None``.
    """
    K_o_splits = 10
    K_i_splits = 10
    model_count = 10
    # Candidate tree depths; a smaller depth is the stronger regularization.
    depths = [1 + idx for idx in range(model_count)]

    eval_o = np.zeros(K_o_splits)          # error [%] per outer fold
    fold_sizes = np.zeros(K_o_splits)      # test-set size per outer fold
    optimal_lambda = np.zeros(K_o_splits)  # selected depth per outer fold

    # StratifiedKFold keeps a reasonable share of each class in every split.
    CV1 = model_selection.StratifiedKFold(n_splits=K_o_splits, shuffle=True)
    CV2 = model_selection.StratifiedKFold(n_splits=K_i_splits, shuffle=True)

    # Outer k-fold split
    for outer_it, (train_index_o, test_index_o) in enumerate(CV1.split(X, y)):
        print('Outer CV1-fold {0} of {1}'.format(outer_it + 1, K_o_splits))
        X_train_o = X[train_index_o, :]
        y_train_o = y[train_index_o]
        X_test_o = X[test_index_o, :]
        y_test_o = y[test_index_o]

        # Misclassification counts per candidate; reset for every outer fold
        # so one fold's model selection does not depend on earlier folds.
        inner_errors = np.zeros(model_count)

        # Inner validation loop
        for inner_it, (train_index_i, test_index_i) in enumerate(
                CV2.split(X_train_o, y_train_o)):
            print('Inner CV2-fold {0} of {1}'.format(inner_it + 1, K_i_splits))
            # The inner indices are positions within the outer training set,
            # so they must index X_train_o / y_train_o, not X / y.
            X_train_i = X_train_o[train_index_i, :]
            y_train_i = y_train_o[train_index_i]
            X_test_i = X_train_o[test_index_i, :]
            y_test_i = y_train_o[test_index_i]

            for idx, depth in enumerate(depths):
                model2 = tree.DecisionTreeClassifier(max_depth=depth,
                                                     criterion="entropy")
                model2 = model2.fit(X_train_i, y_train_i)
                y_dectree = model2.predict(X_test_i)
                inner_errors[idx] += (y_dectree != y_test_i).sum()

        # Each outer-training sample is validated exactly once across the
        # inner folds, so this is the size-weighted mean inner error in %.
        eval_i = 100 * inner_errors / len(y_train_o)
        # Refit with exactly the depth that was evaluated for the winning
        # candidate (the same idx -> depth mapping as in the inner loop).
        reg_term = depths[np.argmin(eval_i)]

        model2 = tree.DecisionTreeClassifier(max_depth=reg_term,
                                             criterion="entropy")
        model2 = model2.fit(X_train_o, y_train_o)
        y_dectree = model2.predict(X_test_o)

        eval_o[outer_it] = 100 * np.mean(y_dectree != y_test_o)
        fold_sizes[outer_it] = len(y_test_o)
        optimal_lambda[outer_it] = reg_term

    # Most frequently selected depth (np.unique sorts by value, so the mode
    # has to be looked up through the counts).
    unique_depths, depth_counts = np.unique(optimal_lambda,
                                            return_counts=True)
    mode_reg = unique_depths[np.argmax(depth_counts)]

    figure()
    boxplot(eval_o)
    xlabel('Decision Tree')
    ylabel('Cross-validation error [%]')
    show()

    # Weight every outer fold by its own test-set size.
    e_gen = np.sum(eval_o * fold_sizes) / len(y)
    print("Decision Tree generalization error: %f with %s and %i" %
          (e_gen, 'max depth', mode_reg))
def svc_fit(train, proj_mask, epochs, folds=5, batch_size=32):
    """ Fit SVM using SVC on data set.

    The training samples are optionally augmented ``epochs`` times, turned
    into feature vectors, class-balanced and then used in a grid search over
    linear and RBF SVMs. The caller's ``train`` lists are not modified.

    Args:
        train (tuple of list): (X, y) train data.
        proj_mask (Namedtuple): Radar projections to use for training.
        epochs (int): Number of times to augment data.
        folds (int, optional): Number of folds for the Stratified K-Folds
            cross-validator. Default=5
        batch_size (int, optional): Augment batch size. Default=32.

    Returns:
        estimator: Estimator that was chosen by grid search.

    Raises:
        AssertionError: if the augmented samples no longer peak at 1.0.
    """

    def find_best_svm_estimator(X, y, cv, random_seed):
        """Exhaustive search over specified parameter values for svm.

        Returns:
            optimized svm estimator.

        Note:
            https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
        """
        print('\n Finding best svm estimator...')
        Cs = [0.01, 0.1, 1, 10, 100]
        gammas = [0.001, 0.01, 0.1, 1, 10]
        # gamma only applies to the RBF kernel, hence two separate grids.
        param_grid = [{
            'C': Cs,
            'kernel': ['linear']
        }, {
            'C': Cs,
            'gamma': gammas,
            'kernel': ['rbf']
        }]
        init_est = svm.SVC(probability=True,
                           class_weight='balanced',
                           random_state=random_seed,
                           cache_size=1000,
                           verbose=False)
        grid_search = model_selection.GridSearchCV(estimator=init_est,
                                                   param_grid=param_grid,
                                                   verbose=2,
                                                   n_jobs=4,
                                                   cv=cv)
        grid_search.fit(X, y)
        logger.info('\n Best estimator:')
        logger.info(grid_search.best_estimator_)
        logger.info('\n Best score for {}-fold search:'.format(folds))
        logger.info(grid_search.best_score_)
        logger.info('\n Best hyperparameters:')
        logger.info(grid_search.best_params_)
        return grid_search.best_estimator_

    X_train, y_train = train

    # Augment training set.
    if epochs:
        data_gen = DataGenerator(rotation_range=15.0,
                                 zoom_range=0.3,
                                 noise_sd=0.2)
        logger.info('Augmenting data set.')
        logger.info(f'Original number of training samples: {y_train.shape[0]}')

        # Faster to use a list in below ops.
        y_train = y_train.tolist()

        # Pristine copies fed to the generator on every epoch.
        xc = X_train.copy()
        yc = y_train.copy()
        # Grow a private list: extending X_train directly would append the
        # augmented samples to the list object owned by the caller.
        X_train = list(X_train)

        for e in range(epochs):
            logger.debug(f'epoch: {e}')
            batch = 0
            for X_batch, y_batch in data_gen.flow(xc,
                                                  yc,
                                                  batch_size=batch_size):
                logger.debug(f'batch: {batch}')
                X_train.extend(X_batch)
                y_train.extend(y_batch)
                batch += 1
                # The flow is stopped by hand once one pass over the
                # original samples has been produced.
                if batch >= len(xc) / batch_size:
                    break

        # Sanity check if augmentation introduced a scaling problem.
        max_val = np.amax([[np.concatenate(t, axis=None)] for t in X_train])
        assert abs(max_val - 1.0) < 1e-6, 'scale error'

        # Convert y_train back to np array.
        y_train = np.array(y_train, dtype=np.int8)
        logger.info(
            f'Augmented number of training samples: {y_train.shape[0]}')

    logger.info('Generating feature vectors from radar projections.')
    X_train = common.process_samples(X_train, proj_mask=proj_mask)
    logger.info(f'Feature vector length: {X_train.shape[1]}')

    # Balance classes.
    logger.info('Balancing classes.')
    y_train, X_train = balance_classes(y_train, X_train)

    skf = model_selection.StratifiedKFold(n_splits=folds)

    # Find best classifier.
    logger.info('Finding best classifier.')
    clf = find_best_svm_estimator(X_train, y_train,
                                  skf.split(X_train, y_train), RANDOM_SEED)

    return clf
def sgd_fit(train,
            test,
            proj_mask,
            online_learn,
            svm_model,
            epochs,
            folds=5,
            batch_size=32):
    """ Fit SVM using SGD on data set.

    Either grid-searches a fresh SGD classifier (``online_learn`` false) or
    loads a pickled classifier and continues training it with
    ``partial_fit``. When ``epochs`` is non-zero the classifier is then
    trained further on augmented batches.

    Args:
        train (tuple of list): (X, y) train data.
        test (tuple of list): (X, y) test data.
        proj_mask (Namedtuple): Radar projections to use for training.
        online_learn (bool): If True perform online learning with data.
        svm_model (str): Name of existing svm model for online learning.
        epochs (int): Number of times to augment data.
        folds (int, optional): Number of folds for the Stratified K-Folds
            cross-validator. Default=5
        batch_size (int, optional): Augment batch size. Default=32.

    Returns:
        estimator: Estimator that was chosen by grid search.
    """

    def iterations_for(n_samples):
        """Return the iteration budget as a plain int.

        About 10**6 sample updates in total, but never fewer than 1000
        passes. np.ceil returns a float, which neither ``range`` nor the
        estimator's integer ``max_iter`` accepts, hence the int().
        """
        return max(int(np.ceil(10**6 / n_samples)), 1000)

    def find_best_sgd_svm_estimator(X, y, cv, random_seed):
        """Exhaustive search over specified parameter values for svm using sgd.

        Returns:
            optimized svm estimator.
        """
        max_iter = iterations_for(len(X))
        small_alphas = [10.0e-08, 10.0e-09, 10.0e-10]
        alphas = [10.0e-04, 10.0e-05, 10.0e-06, 10.0e-07]
        l1_ratios = [0.075, 0.15, 0.30]
        # l1_ratio only applies to the elasticnet penalty; averaged SGD is
        # searched with the smaller alphas.
        param_grid = [{
            'alpha': alphas,
            'penalty': ['l1', 'l2'],
            'average': [False]
        }, {
            'alpha': alphas,
            'penalty': ['elasticnet'],
            'average': [False],
            'l1_ratio': l1_ratios
        }, {
            'alpha': small_alphas,
            'penalty': ['l1', 'l2'],
            'average': [True]
        }, {
            'alpha': small_alphas,
            'penalty': ['elasticnet'],
            'average': [True],
            'l1_ratio': l1_ratios
        }]
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss'; 'log' is kept for the version this project pins —
        # confirm before upgrading.
        init_est = linear_model.SGDClassifier(loss='log',
                                              max_iter=max_iter,
                                              random_state=random_seed,
                                              n_jobs=-1,
                                              warm_start=True)
        grid_search = model_selection.GridSearchCV(estimator=init_est,
                                                   param_grid=param_grid,
                                                   verbose=2,
                                                   n_jobs=-1,
                                                   cv=cv)
        grid_search.fit(X, y)
        logger.info('\n Best estimator:')
        logger.info(grid_search.best_estimator_)
        logger.info('\n Best score for {}-fold search:'.format(folds))
        logger.info(grid_search.best_score_)
        logger.info('\n Best hyperparameters:')
        logger.info(grid_search.best_params_)
        return grid_search.best_estimator_

    X_train, y_train = train
    X_test, y_test = test

    # Make a copy of train set for later use in augmentation.
    if epochs:
        xc = X_train.copy()
        yc = y_train.copy()

    # Generate feature vectors from radar projections.
    logger.info('Generating feature vectors.')
    X_train = common.process_samples(X_train, proj_mask=proj_mask)
    X_test = common.process_samples(X_test, proj_mask=proj_mask)
    logger.info(f'Feature vector length: {X_train.shape[1]}')

    # Balance classes.
    logger.info('Balancing classes.')
    y_train, X_train = balance_classes(y_train, X_train)

    if not online_learn:
        # Find best initial classifier.
        logger.info('Running best fit with new data.')
        skf = model_selection.StratifiedKFold(n_splits=folds)
        clf = find_best_sgd_svm_estimator(X_train, y_train,
                                          skf.split(X_train, y_train),
                                          RANDOM_SEED)
    else:
        # Fit existing classifier with new data.
        logger.info('Running partial fit with new data.')
        # NOTE(review): unpickling runs arbitrary code; svm_model must only
        # ever name a trusted, locally produced file.
        with open(os.path.join(common.PRJ_DIR, svm_model), 'rb') as fp:
            clf = pickle.load(fp)
        for _ in range(iterations_for(len(X_train))):
            clf.partial_fit(X_train, y_train)

    # Augment training set and use to run partial fits on classifier.
    if epochs:
        logger.info(
            f'Running partial fit with augmented data (epochs: {epochs}).')
        y_predicted = clf.predict(X_test)
        logger.debug(
            f'Un-augmented accuracy: {metrics.accuracy_score(y_test, y_predicted)}.'
        )
        data_gen = DataGenerator(rotation_range=5.0,
                                 zoom_range=0.2,
                                 noise_sd=0.1,
                                 balance=True)
        for e in range(epochs):
            logger.debug(f'Augment epoch: {e}.')
            batch = 0
            for X_batch, y_batch in data_gen.flow(xc,
                                                  yc,
                                                  batch_size=batch_size):
                logger.debug(f'Augment batch: {batch}.')
                X_batch = common.process_samples(X_batch, proj_mask=proj_mask)
                y_batch, X_batch = balance_classes(y_batch, X_batch)
                # classes is passed explicitly because a single batch may
                # not contain every label.
                clf.partial_fit(X_batch, y_batch, classes=np.unique(y_train))
                y_predicted = clf.predict(X_test)
                acc = metrics.accuracy_score(y_test, y_predicted)
                logger.debug(f'Augmented accuracy: {acc}.')
                batch += 1
                # The flow is stopped by hand once one pass over the
                # original samples has been produced.
                if batch >= len(xc) / batch_size:
                    break

    return clf
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. """ Titanic evaluation definition. This is one of the main _formal_ forml components (along with `source` and `evaluation`) that's being looked up by the forml loader. """ from sklearn import model_selection, metrics from forml.project import component from forml.lib.flow.operator.folding import evaluation # Typical method of providing component implementation using `component.setup()`. Choosing the `MergingScorer` operator # to implement classical crossvalidated metric scoring component.setup( evaluation.MergingScorer( crossvalidator=model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=42), metric=metrics.log_loss, ))
def PCBA_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Assign every sample of dataset ``DB`` to a cross-validation fold.

    The fold assignment strategy depends on ``data_type``:

    * ``'kernel'``   -- hierarchical clustering of the molecule kernel matrix
      (cached in ``data/<DB>/<DB>_K.npy``).
    * ``'features'`` -- k-means clustering of the molecule feature matrix
      (cached in ``data/<DB>/<DB>_X.npy``).
    * ``'standard'`` -- scikit-learn (Stratified)KFold with a fixed seed; in
      this mode ``list_ID`` and ``list_y`` are re-read from the pickled files
      under ``data/<DB>/`` and the arguments of the same name are ignored.

    For the clustering modes, any ``DB`` other than ``'PCBA'`` is clustered
    separately for label 0 and label 1.

    A per-fold label summary is written to ``data/<DB>/<DB>_folds.txt`` and
    echoed to stdout.

    :param DB: dataset name, used to build every file path.
    :param data_type: ``'kernel'``, ``'features'`` or ``'standard'``.
    :param list_ID: sample identifiers (only replaced, never read, here).
    :param list_y: label array; indexed with integer index arrays, and with a
        second (task) axis for the ``PCBA*`` datasets.
    :param list_SMILES: SMILES strings passed to the kernel/feature builders.
    :param dict_id2smile: unused by this function.
    :param n_folds: number of folds / clusters requested.
    :return: list of index arrays, one per fold, in order of first appearance
        of the fold id in the assignment vector.
    :raises ValueError: if ``data_type`` is not one of the supported values.
    """
    prefix = 'data/' + DB + '/' + DB
    multi_task_dbs = ['PCBA', 'PCBA10', 'PCBA100']

    if data_type == 'kernel':
        if not os.path.isfile(prefix + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save(prefix + '_K', K)
        else:
            K = np.load(prefix + '_K.npy')
        if DB == 'PCBA':
            # Agglomerative clustering; a k-medoid variant
            # (Kmedoid_cluster(K, n_folds)) was the alternative considered.
            list_assignment = Khierarchical_cluster(K, n_folds)
        else:
            list_assignment = np.zeros(K.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                # Restrict the kernel to the samples of this class only.
                K_local = K[indices, :][:, indices]
                list_assignment[indices] = Khierarchical_cluster(K_local, n_folds)
    elif data_type == 'features':
        if not os.path.isfile(prefix + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save(prefix + '_X', X)
        else:
            X = np.load(prefix + '_X.npy')
        if DB == 'PCBA':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                list_assignment[indices] = Xkmeans_cluster(X[indices, :], n_folds)
    elif data_type == 'standard':
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point this at trusted, locally generated data files.
        with open(prefix + '_list_ID.data', 'rb') as fh:
            list_ID = pickle.load(fh)
        with open(prefix + '_list_y.data', 'rb') as fh:
            list_y = np.array(pickle.load(fh))
        # The splitters only need the number of samples, so a dummy X is enough.
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        if DB not in multi_task_dbs:
            skf = model_selection.StratifiedKFold(n_folds, shuffle=True, random_state=92)
            splits = skf.split(X, list_y)
        else:
            # Multi-task labels cannot be stratified, so fall back to plain KFold.
            skf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
            splits = skf.split(X)
        for ifold, (_, test_index) in enumerate(splits):
            list_assignment[test_index] = ifold
    else:
        raise ValueError(
            "data_type must be 'kernel', 'features' or 'standard', got %r" % (data_type,))

    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    with open(prefix + '_folds.txt', 'w') as fo:
        # Report at most n_folds folds; tolerate a clustering that returned fewer.
        for ifold, fold in enumerate(folds[:n_folds]):
            fo.write("ifold" + str(ifold) + '\n')
            if DB in multi_task_dbs:
                for iclass in range(list_y.shape[1]):
                    summary = "iclass " + str(iclass) + ' ' + \
                        str(collections.Counter(list_y[fold, iclass]))
                    fo.write(summary + '\n')
                    print(summary)
            else:
                fold_counts = collections.Counter(list_y[fold])
                fo.write(str(fold_counts) + '\n')
                print(ifold, fold_counts)
            fo.write('\n')

    return folds
# k-nearest neighbours: kNN_reg is fitted on the full data set, kNN_test is an
# unfitted twin that is handed to cross-validation below.
kNN_reg = kNN.KNeighborsClassifier(n_neighbors=k)
kNN_reg.fit(X, Y)
kNN_test = kNN.KNeighborsClassifier(n_neighbors=k)

# Linear discriminant analysis
lda_reg = disc.LinearDiscriminantAnalysis()
lda_reg.fit(X, Y)
lda_test = disc.LinearDiscriminantAnalysis()

# Quadratic discriminant analysis
qda_reg = disc.QuadraticDiscriminantAnalysis()
qda_reg.fit(X, Y)
qda_test = disc.QuadraticDiscriminantAnalysis()

# Cross-validation
# NOTE(review): shuffle=True without a random_state means the folds are not
# reproducible between runs, and presumably differ between the four
# cross_validate calls below as well -- confirm whether that is intended.
cv = mod.StratifiedKFold(n_splits=5, shuffle=True)
log_result = mod.cross_validate(log_test, X, Y, cv=cv)
kNN_result = mod.cross_validate(kNN_test, X, Y, cv=cv)
lda_result = mod.cross_validate(lda_test, X, Y, cv=cv)
qda_result = mod.cross_validate(qda_test, X, Y, cv=cv)

# Mean cross-validated test score of each model, in the order
# logistic regression, kNN, LDA, QDA.
print(log_result['test_score'].mean())
print(kNN_result['test_score'].mean())
print(lda_result['test_score'].mean())
print(qda_result['test_score'].mean())

# 2x2 grid of axes, one panel per model listed in `titles`.
sns.set()
fig, sub = plt.subplots(2,2)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
# Mesh grid built from the first two feature columns of X.
xx, yy = make_meshgrid(X[:,0], X[:,1])
titles = ['Logistic', "kNN (k={})".format(k), 'LDA', 'QDA']
def main(argv):
    """Cross-validate an XGBoost baseline on every topic/language pair.

    For each combination of topic and language the blocked, translated,
    labelled pairs and the matching baseline feature table are read from disk,
    missing feature values are mean-imputed (all-NaN columns become 0), the
    features are standardised and an ``XGBClassifier`` is scored with 5-fold
    stratified cross-validation. One row per combination (mean and standard
    deviation of F1, recall and precision, in percent) is written to the
    result CSV.

    :param argv: command line arguments; not used.
    """
    topics = ["uni", "movie", "title"]
    langs = ["de", "es", "fr"]
    # Directory holding the data of each topic.
    topic_dirs = {
        "uni": "/home/oyku/datasets/University/",
        "movie": "/home/oyku/datasets/Movie/",
        "title": "/home/oyku/datasets/Article/",
    }
    result_path = "/home/oyku/datasets/newexperiments/experiment_0/mtsm_baseline.csv"
    # Identifier columns that must not be used as features.
    exclude = ["_id", "ltable_id", "rtable_id"]
    scoring = ['f1', 'recall', 'precision']

    df = pd.DataFrame(columns=['Topic', 'Lang', 'F1', 'Recall', 'Precision'])

    for topic in topics:
        path = topic_dirs[topic]
        for lang in langs:
            labeled = pd.read_csv(path + topic + "_" + lang + "_blocked_translated.csv")
            print(labeled.shape)
            print("Running translated test on " + topic + " dataset on language " + lang)

            fts_path = path + "features/" + topic + "_" + lang + "_baseline_features.csv"
            train_features = pd.read_csv(fts_path)
            print("Training features: " + str(len(list(train_features))))

            gold = pd.DataFrame(labeled["Label"])
            feature_cols = [col for col in list(train_features) if col not in exclude]
            train_features = train_features[feature_cols]

            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
            scale = StandardScaler()
            imp.fit(train_features)
            # A column that is entirely NaN has a NaN mean; impute it with 0.
            # pd.isna replaces pd.np.isnan, which is gone from pandas >= 2.0.
            imp.statistics_[pd.isna(imp.statistics_)] = 0
            features = scale.fit_transform(imp.transform(train_features))

            # Cross Validation
            model = XGBClassifier(random_state=7, n_estimators=350)
            # No random_state: the splitter does not shuffle, so a seed has no
            # effect and current scikit-learn rejects the combination.
            kfold = model_selection.StratifiedKFold(n_splits=5)
            scores = model_selection.cross_validate(model,
                                                    features,
                                                    gold.values.ravel(),
                                                    cv=kfold,
                                                    scoring=scoring)

            f1 = "%.3f (%.3f)" % (scores['test_f1'].mean() * 100,
                                  scores['test_f1'].std() * 100)
            recall = "%.3f (%.3f)" % (scores['test_recall'].mean() * 100,
                                      scores['test_recall'].std() * 100)
            precision = "%.3f (%.3f)" % (scores['test_precision'].mean() * 100,
                                         scores['test_precision'].std() * 100)

            print("Topic: %s --- Lang: %s --- F1: %s Recall: %s Precision: %s" %
                  (topic, lang, f1, recall, precision))

            df.loc[len(df)] = [topic, lang, f1, recall, precision]

    df.to_csv(result_path, index=False)
def run(train, y, test, v, z):
    """Tune and cross-validate an XGBoost binary classifier.

    Hyper-parameters are searched with hyperopt (TPE) until 15 trials exist;
    trials are persisted through ``state`` after every evaluation so that an
    interrupted search can be resumed. The best parameters are then trained
    with 9-fold stratified CV for three different seeds. Out-of-fold
    predictions are accumulated in ``v`` and fold-averaged test predictions in
    ``z`` under the columns ``p0``, ``p1``, ``p2``; ``z['y']`` holds the test
    prediction of the last seed.

    :param train: training frame; its ``id`` column is dropped in place.
    :param y: training labels, indexable with integer index arrays.
    :param test: test frame; its ``id`` column is dropped in place.
    :param v: frame receiving the out-of-fold predictions (modified in place).
    :param z: frame receiving the test predictions (modified in place).
    :return: ``(cv, None)`` where ``cv`` is the list of mean validation
        log-losses, one per seed.
    """
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)

    def step_xgb(params):
        """Hyperopt objective: last-round CV score of ``params``."""
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        # First column of the final boosting round (.ix is gone since pandas 1.0).
        score = cv.iloc[-1, 0]
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(max_depth=hp.choice('max_depth', range(2, 9)),
                     subsample=hp.quniform('subsample', 0.6, 1, 0.05),
                     colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1, 0.05),
                     learning_rate=hp.quniform('learning_rate', 0.005, 0.1, 0.005),
                     min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
                     gamma=hp.quniform('gamma', 0.5, 10, 0.05),
                     reg_alpha=hp.quniform('reg_alpha', 0, 1, 0.001),
                     objective='binary:logistic',
                     eval_metric='logloss',
                     seed=1,
                     silent=1)

    trs = state.load('xgb_trials')
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    # One evaluation per iteration so the trials can be saved after each step.
    while len(tr.trials) < 15:
        best = fmin(step_xgb,
                    space_xgb,
                    algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1,
                    trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 3
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            # The splitter yields positions, hence .iloc (replaces removed .ix).
            # NOTE(review): v.loc[ival, ...] below is label based, so this
            # assumes v carries a default 0..n-1 index -- confirm with caller.
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params,
                            dtrain,
                            10000,
                            watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(cname,
                  'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits),
                  score, state.now())
            scores.append(score)
        # Average the test predictions over the folds of this seed.
        z[cname2] /= N_splits
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (xgb_params['seed']), scores, np.mean(scores),
              np.std(scores))
        z['y'] = z[cname2]
    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
def __init__(self,
             ds: pd.DataFrame,
             n_ss_folds: int = 3,
             n_folds: int = 5,
             target_col: str = 'target',
             random_state: int or None = None,
             unlabeled_target_col: str = '5means_classes',
             test_ratio: float = 0.25,
             labeled_train_size_per_class: int = None,
             unlabeled_train_size_per_class: int = None,
             labeled_train_size: int = None,
             unlabeled_train_size: int = None,
             group_col: str or None = None,
             equal_target: bool = True,
             equal_unlabeled_target: bool = True,
             shuffle: bool = True):
    """Pre-compute the labeled/unlabeled train/test chunks of every CV fold.

    ``ds`` is first split once into an unlabeled and a labeled part (the
    train and test side of the first of ``n_ss_folds`` splits). Each part is
    then split into ``n_folds`` train/test folds and sub-sampled; the row
    positions and the resulting frames of every fold are stored on the
    instance.

    :param ds: full data set.
    :param n_ss_folds: number of folds of the master labeled/unlabeled split;
        one fold becomes the labeled part.
    :param n_folds: number of cross-validation folds.
    :param target_col: column holding the class label.
    :param random_state: seed used for the sub-sampling steps. The K-fold
        splitters do not shuffle and therefore take no seed.
    :param unlabeled_target_col: column used to balance the unlabeled samples.
    :param test_ratio: size of each test sample relative to the train sample.
    :param labeled_train_size_per_class: labeled samples per class; required
        when ``equal_target`` is True.
    :param unlabeled_train_size_per_class: unlabeled samples per class;
        required when ``equal_unlabeled_target`` is True.
    :param labeled_train_size: total labeled train size; required when
        ``equal_target`` is False.
    :param unlabeled_train_size: total unlabeled train size, capped at the
        size of the unlabeled part; ``None`` keeps every row.
    :param group_col: optional grouping column; when given, ``GroupKFold`` is
        used instead of ``StratifiedKFold``.
    :param equal_target: sample the same number of labeled rows per class.
    :param equal_unlabeled_target: sample the same number of unlabeled rows
        per ``unlabeled_target_col`` class.
    :param shuffle: forwarded to ``train_test_split`` for labeled sampling.
    :raises ValueError: if a size required by the chosen sampling mode is
        missing, or ``labeled_train_size`` exceeds the labeled part.
    """
    super().__init__()
    self._test_ratio = test_ratio

    if equal_target and labeled_train_size_per_class is None:
        raise ValueError("labeled_train_size_per_class must be determined when "
                         "equal_target is True, but found None")
    if not equal_target and labeled_train_size is None:
        raise ValueError("labeled_train_size must be determined when "
                         "equal_target is False, but found None")
    if equal_unlabeled_target and unlabeled_train_size_per_class is None:
        # Without this check the fold loop fails later on `None * test_ratio`.
        raise ValueError("unlabeled_train_size_per_class must be determined when "
                         "equal_unlabeled_target is True, but found None")

    def make_split_iter(frame, n_splits):
        """Return the (train, test) index iterator of ``frame``."""
        if group_col is None:
            # No shuffling takes place, so no random_state is passed: it would
            # have no effect and current scikit-learn rejects the combination.
            splitter = model_selection.StratifiedKFold(n_splits=n_splits)
            return splitter.split(frame, frame[target_col])
        splitter = model_selection.GroupKFold(n_splits=n_splits)
        return splitter.split(frame, frame[target_col], groups=frame[group_col])

    # Master split into Label/Unlabel
    unlabeled_idx, labeled_idx = next(make_split_iter(ds, n_ss_folds))
    unlabeled_ds = ds.iloc[unlabeled_idx]
    labeled_ds = ds.iloc[labeled_idx]

    if not equal_target and labeled_train_size > len(labeled_idx):
        raise ValueError(
            'Input labeled train size {} is larger than actual labeled train size {}'
            .format(labeled_train_size, len(labeled_idx)))

    # An oversized unlabeled request is silently capped rather than rejected.
    if unlabeled_train_size is not None and unlabeled_train_size > len(unlabeled_idx):
        unlabeled_train_size = len(unlabeled_idx)

    self.__cv_folds_idx = []
    self.__ds_chunks = []

    # split of train/val data
    unlabeled_spl_iter = make_split_iter(unlabeled_ds, n_folds)
    labeled_spl_iter = make_split_iter(labeled_ds, n_folds)

    for (u_train, u_test), (l_train, l_test) in zip(unlabeled_spl_iter, labeled_spl_iter):
        l_train_data = labeled_ds.iloc[l_train]
        l_train_target = l_train_data[target_col]
        l_test_data = labeled_ds.iloc[l_test]
        l_test_target = l_test_data[target_col]

        # Sample labeled_train_size of labeled data
        if equal_target:
            filtered_l_train_idx, chosen_l_train = self._sample_labeled_data(
                l_train_data, l_train_target, target_col,
                labeled_train_size_per_class, random_state)
            filtered_l_test_idx, chosen_l_test = self._sample_labeled_data(
                l_test_data, l_test_target, target_col,
                int(labeled_train_size_per_class * self._test_ratio), random_state)
        else:
            # labeled_train_size is guaranteed to be set by the checks above.
            chosen_l_train, _ = model_selection.train_test_split(
                l_train,
                train_size=labeled_train_size,
                random_state=random_state,
                shuffle=shuffle,
                stratify=l_train_target)
            # Stratify the test sample on the *test* targets; the train
            # targets have a different length and belong to other rows.
            chosen_l_test, _ = model_selection.train_test_split(
                l_test,
                train_size=int(labeled_train_size * self._test_ratio),
                random_state=random_state,
                shuffle=shuffle,
                stratify=l_test_target)
            filtered_l_train_idx = labeled_ds.iloc[chosen_l_train]
            filtered_l_test_idx = labeled_ds.iloc[chosen_l_test]

        # Sample unlabeled_train_size of unlabeled data
        if equal_unlabeled_target:
            u_train_target = unlabeled_ds.iloc[u_train][unlabeled_target_col]
            u_test_target = unlabeled_ds.iloc[u_test][unlabeled_target_col]
            filtered_u_train_idx, chosen_u_train = self._sample_unlabeled_data(
                unlabeled_ds, u_train, unlabeled_target_col, u_train_target,
                unlabeled_train_size_per_class, random_state)
            filtered_u_test_idx, chosen_u_test = self._sample_unlabeled_data(
                unlabeled_ds, u_test, unlabeled_target_col, u_test_target,
                int(unlabeled_train_size_per_class * self._test_ratio), random_state)
        else:
            if unlabeled_train_size is not None:
                # Sample with replacement only when more rows are requested
                # than the fold actually holds.
                is_replace = unlabeled_train_size > len(u_train)
                chosen_u_train = resample(u_train,
                                          n_samples=unlabeled_train_size,
                                          replace=is_replace,
                                          random_state=random_state)
                unlabeled_test_size = int(unlabeled_train_size * self._test_ratio)
                is_replace = unlabeled_test_size > len(u_test)
                chosen_u_test = resample(u_test,
                                         n_samples=unlabeled_test_size,
                                         replace=is_replace,
                                         random_state=random_state)
            else:
                chosen_u_train = u_train
                chosen_u_test = u_test
            filtered_u_train_idx = unlabeled_ds.iloc[chosen_u_train]
            filtered_u_test_idx = unlabeled_ds.iloc[chosen_u_test]

        self.__cv_folds_idx.append(
            (chosen_l_train, chosen_l_test, chosen_u_train, chosen_u_test))
        self.__ds_chunks.append((filtered_l_train_idx, filtered_l_test_idx,
                                 filtered_u_train_idx, filtered_u_test_idx))

    self.__folds_iter = iter(self.__ds_chunks)
bootstrap=bootstrap, min_samples_leaf=min_sample_leaf, min_samples_split=min_sample_split, n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, oob_score=oob_score, random_state=531, verbose=1, class_weight=class_weight, n_jobs=1) fileModel = fileModel.fit(xTrain_base.values, yTrain_base.values) if base_sampling is None: cv = model_selection.StratifiedKFold(n_splits=5, random_state=None) else: cv = 5 y_pred_score = model_selection.cross_val_predict( fileModel, Valid.drop([selected_label] + ['id_siniestro'], axis=1).values, Valid[[selected_label]].values, cv=cv, method='predict_proba') y_pred_score = np.delete(y_pred_score, 0, axis=1) y_hat_test = (y_pred_score > threshold_models).astype(int) y_hat_test = y_hat_test.tolist() y_hat_test = [item for sublist in y_hat_test for item in sublist] recall_base = metrics.recall_score(y_pred=y_hat_test,
solver='sag', class_weight={ 1: 0.46, 0: 1.32 }, verbose=0.8) #model = SVC(probability=True) #model = BernoulliNB() model.fit(X, y) logRegAccuracy = [] logRegLogLoss = [] logRegAUC = [] print('---------------------------------------------') stratifiedCV = model_selection.StratifiedKFold(n_splits=numCVSplits, random_state=2) for k, (trainInds, validInds) in enumerate(stratifiedCV.split(X, y)): break foldTrainingStartTime = time.time() X_train_cv = X[trainInds, :] X_valid_cv = X[validInds, :] y_train_cv = y[trainInds] y_valid_cv = y[validInds] model.fit(X_train_cv, y_train_cv) y_train_hat = model.predict_proba(X_train_cv)[:, 1] y_valid_hat = model.predict_proba(X_valid_cv)[:, 1]
def plotOptimalK(desc_train, targ_train):
    """Plot cross-validated accuracy of four KNN variants for k = 1..24.

    For every ``k`` the Euclidean and Manhattan KNN classifiers, each with
    uniform and distance weighting, are scored with 100-fold stratified
    cross-validation; the mean accuracy per variant is drawn against ``k``
    and the figure is shown.

    :param desc_train: descriptive features handed to ``cross_val_score``.
    :param targ_train: target values handed to ``cross_val_score``.
    """
    scoring = 'accuracy'
    splits = 100
    # (model name, legend label, extra KNeighborsClassifier arguments).
    # Only variants that are actually evaluated are listed: plotting the
    # never-filled Minkowski series against 24 k values made matplotlib raise.
    variants = [
        ('KNN-Euclid', 'Euclidian', dict(p=2)),
        ('KNN-Euclid-Weighted', 'Euclidian Weighted', dict(weights='distance', p=2)),
        ('KNN-Manhattan', 'Manhattan ', dict(p=1)),
        ('KNN-Manhattan-Weighted', 'Manhattan Weighted', dict(weights='distance', p=1)),
    ]
    mean_scores = {name: [] for name, _, _ in variants}
    i_array = []

    # The splitter does not shuffle, so it is the same for every model and
    # takes no random_state (current scikit-learn rejects that combination).
    kfold = model_selection.StratifiedKFold(n_splits=splits)

    for i in range(1, 25):
        for name, _, extra in variants:
            model = KNeighborsClassifier(n_neighbors=i, **extra)
            cv_results = model_selection.cross_val_score(model,
                                                         desc_train,
                                                         targ_train,
                                                         cv=kfold,
                                                         scoring=scoring)
            mean_scores[name].append(cv_results.mean())
        i_array.append(i)

    for name, label, _ in variants:
        plt.plot(i_array, mean_scores[name], label=label)
    plt.legend(loc='upper left')
    plt.title("Accuracy per (K)")
    plt.xlabel("K values")
    plt.ylabel("Accuracy")
    plt.show()
labelsDB = data['shot_made_flag'] # ## Build a model based on featuresDB table, and make sure it doesn't overfit # (i.e. the training error and the test error are the same) # #### Use an ExtraTreesClassifier for that # In[ ]: #%% build a simple model and make sure it doesnt overfit randomSeed = 1 numFolds = 4 stratifiedCV = model_selection.StratifiedKFold(n_splits=numFolds, shuffle=True, random_state=randomSeed) mainLearner = ensemble.ExtraTreesClassifier(n_estimators=500, max_depth=5, min_samples_leaf=120, max_features=120, criterion='entropy', bootstrap=False, n_jobs=-1, random_state=randomSeed) startTime = time.time() trainAccuracy = []; validAccuracy = []; trainLogLosses = []; validLogLosses = [] for trainInds, validInds in stratifiedCV.split(featuresDB, labelsDB): # split to train and valid sets X_train_CV = featuresDB.iloc[trainInds,:] y_train_CV = labelsDB.iloc[trainInds] X_valid_CV = featuresDB.iloc[validInds,:] y_valid_CV = labelsDB.iloc[validInds]
gamma=G_list[it]) gram_test = metrics.pairwise.rbf_kernel(X_test, X_train, gamma=G_list[it]) kernel_train_list.append(gram_train) kernel_test_list.append(gram_test) # weight_v = hsic_kernel_weights_norm(kernel_train_list, y_train, 1, 0.01, 0) # calculating weights using HSIC during experiment # combine kernels for i in range(n_kernels): gram_train += kernel_train_list[i] * weight_v[i] gram_test += kernel_test_list[i] * weight_v[i] # five-fold cross validation cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0) # cost parameter for SVM C = c_list[ds] # init SVM classifier with precomputed kernel clf = svm.SVC(C=C, kernel='precomputed', probability=True) scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef) scorerSP = metrics.make_scorer(specificity_score) scorerPR = metrics.make_scorer(metrics.precision_score) scorerSE = metrics.make_scorer(metrics.recall_score) scorer = { 'ACC': 'accuracy', 'recall': scorerSE,
def lr_roc_curve(self, C):
    """Plot ROC curves of a logistic regression under 10-fold cross-validation.

    A class-balanced logistic regression is fitted on every stratified fold of
    ``self.X``/``self.y``. The per-fold ROC curves, the mean curve with its
    +/- one standard deviation band and the mean F1 score are drawn and saved
    to ``<self.output>/Figure2.png``; the figure is closed afterwards so that
    later plots start from a clean canvas.

    :param C: The C parameter used for the logistic regression.
    """
    # sets model
    model = lm.LogisticRegression(class_weight='balanced', C=C)

    # seeds random state from time
    random_state = np.random.RandomState(int(time.time()))
    np.random.seed(int(time.time() / 100))

    # Uncomment if you want to seed random state from integer instead (to be able to repeat exact results)
    #random_state = np.random.RandomState(11235813)
    #np.random.seed(112358)

    # Sets up 10-fold cross validation set
    cv = ms.StratifiedKFold(n_splits=10, random_state=random_state, shuffle=True)

    tprs = []
    aucs = []
    f1s = []
    # Common false-positive-rate grid onto which every fold's curve is resampled.
    mean_fpr = np.linspace(0, 1, 100)

    # Calculates and plots the roc curve for each set in 10-fold cross validation
    for i, (train, test) in enumerate(cv.split(self.X, self.y)):
        model_i = model.fit(self.X[train], self.y[train])
        probas_ = model_i.predict_proba(self.X[test])
        pred = model_i.predict(self.X[test])

        f1 = met.f1_score(self.y[test], pred, average='binary')
        f1s.append(f1)

        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = met.roc_curve(self.y[test], probas_[:, 1])
        # np.interp replaces the deprecated scipy.interp alias of the same function.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = met.auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 alpha=0.3,
                 label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' % (i + 1, roc_auc, f1))

    # Plots the 50/50 line
    plt.plot([0, 1], [0, 1],
             linestyle='--',
             lw=2,
             color='r',
             label='Coin Flip',
             alpha=.8)

    # Finds and plots the mean roc curve and mean f1 score
    mean_tpr = np.mean(tprs, axis=0)
    mean_f1 = np.mean(f1s)
    mean_tpr[-1] = 1.0
    mean_auc = met.auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    # Label built by implicit concatenation so that no source indentation
    # leaks into the legend text.
    plt.plot(mean_fpr,
             mean_tpr,
             color='b',
             label=(u'Mean ROC (AUC = %0.4f \u00B1 %0.4f, \n '
                    u'Mean F1 = %0.4f)' % (mean_auc, std_auc, mean_f1)),
             lw=2,
             alpha=.8)

    # Finds and plots the +- standard deviation for roc curve
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr,
                     tprs_lower,
                     tprs_upper,
                     color='grey',
                     alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    # Sets legend, limits, labels, and saves plot
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    outloc = self.output + '/Figure2.png'
    plt.savefig(outloc)
    # Release the figure; otherwise the next plot drawn through pyplot would
    # inherit these curves.
    plt.close()
import pandas as pd
from sklearn import model_selection

if __name__ == '__main__':
    # Read the training table and add a fold column; -1 marks "not assigned yet".
    df = pd.read_csv('input/train.csv')
    df['kfold'] = -1
    print('train length:', df.shape)

    # Shuffle the rows once up front (unseeded, so every run yields different
    # folds) and renumber them 0..n-1 so that the positional indices returned
    # by the splitter can be used as labels in df.loc below.
    df = df.sample(frac=1).reset_index(drop=True)

    # The splitter itself does not shuffle; the rows were shuffled above.
    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

    # Each row is in the validation part of exactly one split; that split's
    # number becomes the row's fold id.
    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df, y=df.target.values)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, 'kfold'] = fold

    df.to_csv('input/train_folds.csv', index=False)
def classify_rf(self,
                max_depth=64,
                n_estimators=1000,
                max_features="sqrt",
                roc_flag=False,
                rand_flag=False,
                save="",
                compare_flag=True,
                group_classes=True):
    """Fit a class-balanced random forest and optionally evaluate it.

    The forest is always fitted on ``self.X``/``self.y``. Depending on the
    flags it is additionally saved to disk, applied to randomly generated
    pairs (cut-off plot) and/or evaluated with 10-fold cross-validation
    (ROC plot saved to ``<self.output>/Figure3.png``).

    :param max_depth: maximum tree depth of the forest.
    :param n_estimators: number of trees.
    :param max_features: features considered per split.
    :param roc_flag: if True, run the cross-validated ROC analysis.
    :param rand_flag: if True, score random pairs via ``self.rand_rate`` and
        plot their probabilities next to ``self.Xtp`` and ``self.Xtn``; also
        writes ``random_pairs_names.csv``.
    :param save: path to dump the fitted model to; empty string skips saving.
    :param compare_flag: not used by this method.
    :param group_classes: if True, the ROC analysis uses ``GroupKFold`` with
        ``self.class_list`` as groups instead of stratified folds.
    """
    # seeds random state from time
    random_state = np.random.RandomState(int(time.time()))
    np.random.seed(int(time.time() / 100))
    if group_classes:
        # Random permutation of the samples, applied before the grouped split.
        rng_idx = np.arange(len(self.class_list))
        np.random.shuffle(rng_idx)

    # Uncomment if you want to seed random state from integer instead (to be able to repeat exact results)
    #random_state = np.random.RandomState(11235813)
    #np.random.seed(112358)

    def build_forest():
        """Return an unfitted forest with this call's hyper-parameters."""
        return ensemble.RandomForestClassifier(class_weight='balanced',
                                               max_depth=max_depth,
                                               max_leaf_nodes=None,
                                               n_estimators=n_estimators,
                                               min_samples_leaf=1,
                                               min_samples_split=2,
                                               max_features=max_features,
                                               n_jobs=-1)

    # Sets and fits Random Forest model
    fitModel = build_forest().fit(self.X, self.y)

    # saves the model
    if len(save) > 0:
        joblib.dump(fitModel, save)

    if rand_flag:
        # Generate random drug-disease pairs
        rand_n = 10000
        self.rand_rate(rand_n, self.drugs_path, self.diseases_path)

        # Get random pairs cutoff rates
        probas_rand = fitModel.predict_proba(self.X2)
        self.data["treat_prob"] = [pr[1] for pr in probas_rand]
        rand_df_sort = self.data.sort_values(
            "treat_prob", ascending=False).reset_index(drop=True)
        rand_df_sort.to_csv(self.output + "random_pairs_names.csv", index=False)

        # Get true positive cutoff rates
        probas_tp = fitModel.predict_proba(self.Xtp)
        # Get true negative cutoff rates
        probas_tn = fitModel.predict_proba(self.Xtn)

        # Plot the cutoff rates together
        self.plot_cutoff([
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_rand]}),
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tp]}),
            pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tn]})
        ], ["Random Pairs", "True Positives", "True Negatives"])

    if roc_flag:
        model = build_forest()

        # Sets up 10-fold cross validation set
        if group_classes:
            cv = ms.GroupKFold(n_splits=10)
            X_cv = self.X[rng_idx]
            y_cv = self.y[rng_idx]
            splits = cv.split(X_cv, y_cv, groups=list(self.class_list[rng_idx]))
        else:
            cv = ms.StratifiedKFold(n_splits=10, random_state=random_state, shuffle=True)
            X_cv = self.X
            y_cv = self.y
            splits = cv.split(X_cv, y_cv)

        tprs = []
        aucs = []
        f1s = []
        # Common false-positive-rate grid onto which every fold's curve is resampled.
        mean_fpr = np.linspace(0, 1, 100)

        # Calculates and plots the roc curve for each set in 10-fold cross validation
        for i, (train, test) in enumerate(splits):
            # The fold indices refer to the arrays that were split (X_cv/y_cv).
            # Applying them to the unpermuted self.X/self.y would pair rows
            # with the wrong groups and leak groups between train and test.
            model_i = model.fit(X_cv[train], y_cv[train])
            probas_ = model_i.predict_proba(X_cv[test])
            pred = model_i.predict(X_cv[test])

            f1 = met.f1_score(y_cv[test], pred, average='binary')
            f1s.append(f1)

            # Compute ROC curve and area under the curve
            fpr, tpr, thresholds = met.roc_curve(y_cv[test], probas_[:, 1])
            # np.interp replaces the deprecated scipy.interp alias of the same function.
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = met.auc(fpr, tpr)
            aucs.append(roc_auc)
            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' % (i, roc_auc, f1))

        # Plots the 50/50 line
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='r',
                 label='Coin Flip',
                 alpha=.8)

        # Finds and plots the mean roc curve and mean f1 score
        mean_tpr = np.mean(tprs, axis=0)
        mean_f1 = np.mean(f1s)
        mean_tpr[-1] = 1.0
        mean_auc = met.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        # Label built by implicit concatenation so that no source indentation
        # leaks into the legend text.
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='b',
                 label=(u'Mean ROC (AUC = %0.4f \u00B1 %0.4f, \n '
                        u'Mean F1 = %0.4f)' % (mean_auc, std_auc, mean_f1)),
                 lw=2,
                 alpha=.8)

        # Finds and plots the +- standard deviation for roc curve
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')

        # Sets legend, limits, labels, and saves plot
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        outloc = self.output + '/Figure3.png'
        plt.savefig(outloc)
        #plt.show()
        plt.close()
1)) # a list for storing generalizaition error after each outer cv-fold h_optimal_list = [ ] # a list for storing optimal hidden units no after each outer cv-fold ANN_best_models = [ ] # a list for models for storing models after each outer cv-fold # Make figure for holding summaries (errors and learning curves) summaries, summaries_axes = plt.subplots(1, 2, figsize=(10, 5)) # Make a list for storing assigned color of learning curve for up to K=10 color_list = [ 'tab:orange', 'tab:green', 'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan', 'tab:red', 'tab:blue' ] for k, (train_index, test_index) in enumerate( model_selection.StratifiedKFold(K, shuffle=True, random_state=0).split(X, y)): print('\nCROSSVALIDATION OUTER FOLD: {0}/{1}'.format(k + 1, K)) #network_validate_classification does the inner cross-validation with cvf=10 opt_val_err, opt_n_h_units = network_validate_classification( X, y, h_interval) model = lambda: torch.nn.Sequential( torch.nn.Linear(M, opt_n_h_units), #M features to H hiden units torch.nn.Tanh(), # 1st transfer function, torch.nn.Linear(opt_n_h_units, 1), # H hidden units to 1 output neuron torch.nn.Sigmoid() # final tranfer function ) loss_fn = torch.nn.BCELoss() X_train = torch.Tensor(X[train_index, :])
def run(self, decision_tree_params, knn_params, nb_params, mlp_params,
        regression_params):
    """Tune five classifiers and evaluate them with 10-fold cross-validation.

    The hyper-parameters of every classifier are first selected by the
    matching ``*_tune_params`` method using a 3-fold stratified splitter.
    The tuned classifiers are then fitted and scored on each of 10 stratified
    folds of ``self.data``/``self.target``. For every fold the predictions are
    written into ``self.predicted_classes[<name>]`` and the per-class F1
    scores and the accuracy are appended to ``self.stats[<name>]``, where
    ``<name>`` is one of ``tree``, ``knn``, ``bayes``, ``regression``, ``mlp``.

    :param decision_tree_params: search space for the decision tree.
    :param knn_params: search space for k-nearest neighbours.
    :param nb_params: search space for Gaussian naive Bayes.
    :param mlp_params: search space for the multi-layer perceptron.
    :param regression_params: search space for logistic regression.
    """
    # training / evaluation folds
    inner_skf = model_selection.StratifiedKFold(n_splits=10,
                                                shuffle=True,
                                                random_state=None)
    # parameter tuning folds
    outer_skf = model_selection.StratifiedKFold(n_splits=3,
                                                shuffle=True,
                                                random_state=None)

    decision_tree = DecisionTreeClassifier(
        **self.decision_tree_tune_params(outer_skf, decision_tree_params))
    knn = KNeighborsClassifier(**self.knn_tune_params(outer_skf, knn_params))
    bayes = GaussianNB(**self.nb_tune_params(outer_skf, nb_params))
    regression = LogisticRegression(
        **self.logistic_regression_tune_params(outer_skf, regression_params))
    mlp = MLPClassifier(**self.mlp_tune_params(outer_skf, mlp_params))

    # Key under which each classifier's results are stored, in fitting order.
    classifiers = (
        ("tree", decision_tree),
        ("knn", knn),
        ("bayes", bayes),
        ("regression", regression),
        ("mlp", mlp),
    )

    for train, test in inner_skf.split(self.data, self.target):
        data_train, target_train = self.data[train], self.target[train]
        data_test, target_test = self.data[test], self.target[test]

        for name, classifier in classifiers:
            classifier.fit(data_train, target_train)
            predicted = classifier.predict(data_test)
            self.predicted_classes[name][test] = predicted

            stats = self.stats[name]
            # average=None yields one F1 value per class.
            stats["f1_score"].append(
                f1_score(target_test, predicted, average=None))
            # accuracy_score takes no `average` argument; passing one raises
            # TypeError, so it is called with the two label arrays only.
            stats["accuracy_score"].append(
                accuracy_score(target_test, predicted))