def train_dev_split_cv(df, k_folds=None, match_num=None):
    """Split train/dev sets using grouped cross-validation.

    TODO: integrate more CV methods, e.g. LOO, LPO, etc.

    :param df: data frame with a 'target' column and a 'PID' group column
    :param k_folds: number of folds for cross-validation (defaults to 5)
    :param match_num: currently unused
    :return: list of dicts, one per fold, with train/test data and targets
    """
    output = []
    X = df.iloc[:, df.columns != 'target']
    y = df.target.to_frame()
    if not k_folds:
        print('Applied 5-fold cross validation by default')
        k_folds = 5
    # Grouped K-fold CV: all rows sharing a PID stay in the same fold
    group_kfold = GroupKFold(n_splits=k_folds)
    group_kfold.get_n_splits(X, y, groups=X.PID)
    for train_index, test_index in group_kfold.split(X, y, groups=X.PID):
        df_subset = {}
        df_subset['data_train'], df_subset['data_test'] = \
            X.iloc[train_index], X.iloc[test_index]
        df_subset['target_train'], df_subset['target_test'] = \
            y.iloc[train_index], y.iloc[test_index]
        output.append(df_subset)
    return output
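# --- Usage sketch (not from the original source): the frame below is a toy
# example invented to show the contract of train_dev_split_cv; only the
# 'PID' and 'target' column names are required by the function.
import pandas as pd
from sklearn.model_selection import GroupKFold

toy = pd.DataFrame({
    'PID': [1, 1, 2, 2, 3, 3],
    'feat': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    'target': [0, 1, 0, 1, 0, 1],
})
for fold in train_dev_split_cv(toy, k_folds=3):
    # grouped CV guarantees no PID appears on both sides of a fold
    assert not set(fold['data_train'].PID) & set(fold['data_test'].PID)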
def group_k_fold(make_model, feature_vector):
    group_kfold = GroupKFold(n_splits=5)
    group_kfold.get_n_splits(feature_vector.features, feature_vector.target,
                             feature_vector.pdb_ids)
    predicted_proba = np.zeros_like(feature_vector.target, dtype=np.float32)
    predicted = np.zeros_like(feature_vector.target, dtype=np.float32)
    for train_index, test_index in group_kfold.split(
            feature_vector.features, feature_vector.target,
            feature_vector.pdb_ids):
        X_train, X_test = feature_vector.features[train_index], \
            feature_vector.features[test_index]
        y_train, y_test = feature_vector.target[train_index], \
            feature_vector.target[test_index]
        model = make_model()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predicted[test_index] = y_pred
        predicted_proba[test_index] = model.predict_proba(X_test)[:, 1]
        print(precision_recall_fscore_support(y_test, y_pred))
    return ClassificationResult(target=feature_vector.target,
                                predicted=predicted,
                                predicted_proba=predicted_proba)
def GroupKFold_Amir(input, n_splits):
    X = input
    y = X.landmarks_frame.KL[:]
    y = y.reset_index(drop=True)
    groups = X.landmarks_frame.ID[:]
    group_kfold = GroupKFold(n_splits)
    group_kfold.get_n_splits(X, y, groups)
    print(group_kfold)
    return group_kfold.split(X, y, groups)
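# --- Usage sketch (invented data): GroupKFold_Amir expects an object exposing
# a `landmarks_frame` DataFrame with KL labels and ID groups. The stand-in
# class below is hypothetical and exists only to make the call runnable; it
# defines __len__/__getitem__ so sklearn can count and index the samples.
import pandas as pd

class _DummyKneeDataset:
    def __init__(self):
        self.landmarks_frame = pd.DataFrame({
            'KL': [0, 1, 2, 0, 1, 2],        # labels
            'ID': [10, 10, 20, 20, 30, 30],  # subject IDs used as groups
        })
    def __len__(self):
        return len(self.landmarks_frame)
    def __getitem__(self, idx):
        return self.landmarks_frame.iloc[idx]

for train_idx, test_idx in GroupKFold_Amir(_DummyKneeDataset(), n_splits=3):
    print(train_idx, test_idx)  # indices never mix a subject ID across sides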
def group_test(X, y, model, groups):
    """Score the classifier named by `model` with 10-fold grouped CV;
    returns the mean fold accuracy."""
    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = [X.iloc[i] for i in train_index]
        X_test = [X.iloc[i] for i in test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]
        if model == 'svm':
            clf = SVC(gamma='auto').fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        elif model == 'rf_extra':
            clf = ExtraTreesClassifier(n_estimators=100).fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        elif model == 'rf':
            clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        elif model == 'nb':
            clf = GaussianNB().fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        elif model == 'lr':
            clf = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
            acc_arr.append(clf.score(X_test, y_test))
        elif model == 'nn':
            acc_arr.append(nn(X_train, X_test, y_train, y_test))
    return np.mean(acc_arr)
def get_grouped_k_fold_splits(confused_list, not_confused_list, num_folds):
    """
    Splits data ensuring no users have data in both training and eval sets.

    Args:
        confused_list (list): list of data item names labelled as confused
        not_confused_list (list): list of data item names labelled as not_confused
        num_folds (int): number of folds for cross validation.

    Returns: (in following order)
        train_confused_splits (list): each element is a list containing the
            file names of the data items for this partition of the dataset
        test_confused_splits (list): as above
        train_not_confused_splits (list): as above
        test_not_confused_splits (list): as above
    """
    train_confused_splits = []
    test_confused_splits = []

    # make lists where each index corresponds to the "group" (userID)
    confused_groups = [uid.split('_')[0][:-1] for uid in confused_list]
    not_confused_groups = [uid.split('_')[0][:-1] for uid in not_confused_list]

    # get train/test splits for the confused class
    dummy_y = [1 for i in range(len(confused_list))]
    gkf = GroupKFold(n_splits=num_folds)
    gkf.get_n_splits(X=confused_list, y=dummy_y, groups=confused_groups)
    for train, test in gkf.split(X=confused_list, y=dummy_y, groups=confused_groups):
        train_confused_splits.append([confused_list[i] for i in train])
        test_confused_splits.append([confused_list[i] for i in test])

    train_not_confused_splits = []
    test_not_confused_splits = []

    # get train/test splits for the not_confused class
    dummy_y = [1 for i in range(len(not_confused_list))]
    gkf = GroupKFold(n_splits=num_folds)
    gkf.get_n_splits(X=not_confused_list, y=dummy_y, groups=not_confused_groups)
    for train, test in gkf.split(X=not_confused_list, y=dummy_y, groups=not_confused_groups):
        train_not_confused_splits.append([not_confused_list[i] for i in train])
        test_not_confused_splits.append([not_confused_list[i] for i in test])

    return (train_confused_splits, test_confused_splits,
            train_not_confused_splits, test_not_confused_splits)
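# --- Usage sketch (fabricated file names): the naming convention assumed here
# is "<userID><condition-char>_<rest>", so uid.split('_')[0][:-1] recovers the
# user ID. Note the two classes are split independently, so a user may still
# appear in a train split of one class and a test split of the other;
# get_grouped_splits further below performs a joint split to avoid that.
from sklearn.model_selection import GroupKFold

confused = ['u01a_t1', 'u01b_t2', 'u02a_t1', 'u02b_t2', 'u03a_t1', 'u03b_t2']
not_confused = ['u01c_t3', 'u02c_t3', 'u03c_t3', 'u04a_t1', 'u04b_t2', 'u05a_t1']

tr_c, te_c, tr_nc, te_nc = get_grouped_k_fold_splits(confused, not_confused, num_folds=3)
for fold in range(3):
    train_users = {f.split('_')[0][:-1] for f in tr_c[fold]}
    test_users = {f.split('_')[0][:-1] for f in te_c[fold]}
    assert not train_users & test_users  # no user leaks within a class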
def group_test_3(pre_x, kmeans_labels, names, num_dic, groups, num_vars, meta_i):
    # NOTE: relies on a global `y`; pass the labels explicitly if refactoring
    chosen_vars = np.zeros(meta_i)
    # start at -1 so that features with small or negative scores can be chosen
    chosen_values = np.zeros(meta_i) - 1
    for j in range(meta_i):
        for i in range(num_vars):  # TODO: clean this routine up
            clust = kmeans_labels[i]
            if clust == j:
                new_val = num_dic[i]
                old_val = chosen_values[clust]
                if old_val < new_val:
                    chosen_vars[clust] = int(i)
                    chosen_values[clust] = new_val

    chosen_works = [int(qq) for qq in chosen_vars]
    chosen_names = [names[i] for i in chosen_works]
    X = pre_x[:, chosen_works]

    group_kfold = GroupKFold(n_splits=10)  # TODO: factor into its own function
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]
        clf = RandomForestClassifier().fit(X_train, y_train)
        acc_arr.append(clf.score(X_test, y_test))
    return np.mean(acc_arr), chosen_names
def split_data(input_file, output_dir, seed, n_folds):
    df = pd.read_csv(input_file, sep='\t', dtype=str)

    # shuffle rows of the dataframe several times
    for _ in range(5):
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # map each hearing_id to a numeric group index
    hearing_to_num = {}
    for idx, hearing_id in enumerate(df['hearing_id'].unique()):
        hearing_to_num[hearing_id] = idx
    df['hearing_num'] = df['hearing_id'].map(hearing_to_num)
    group_idxs = df['hearing_num'].values

    outer_cv = GroupKFold(n_splits=n_folds)

    # split indices into K partitions for the outer CV
    indices = df.index.values
    for i, (train_index, test_index) in enumerate(
            outer_cv.split(indices, indices, groups=group_idxs)):
        print('Fold: ', str(i), '/', str(outer_cv.get_n_splits() - 1))
        fold_dir = os.path.join(output_dir, 'fold' + str(i))
        Path(fold_dir).mkdir(parents=True, exist_ok=True)
        file_name_train = os.path.join(fold_dir, 'train.tsv')
        df.loc[train_index][COLUMN_NAMES].to_csv(file_name_train, sep='\t', index=False)
        file_name_test = os.path.join(fold_dir, 'test.tsv')
        df.loc[test_index][COLUMN_NAMES].to_csv(file_name_test, sep='\t', index=False)
def kfold_holdout(X, y, groups, splits=5):
    group_kfold = GroupKFold(n_splits=splits)
    group_kfold.get_n_splits(X, y, groups)
    d_obj = Data(splits=splits, holdout=False)
    for train_index, test_index in group_kfold.split(X, y, groups):
        # in-place shuffling of the index arrays
        shuffle(train_index)
        shuffle(test_index)
        d_obj.Xs_train.append(X[train_index])
        d_obj.Xs_val.append(X[test_index])
        d_obj.ys_train.append(y[train_index])
        d_obj.ys_val.append(y[test_index])
    return d_obj
def plot_roc_with_cv(classifier, X, y, groups, cv=6):
    """Plot the ROC curve with grouped k-fold cross-validation."""
    cv = GroupKFold(n_splits=cv)
    cv.get_n_splits(X, y, groups)
    plt.figure(figsize=(8, 7))

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    i = 0
    for train, test in cv.split(X, y, groups):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        # np.interp replaces the deprecated scipy.interp
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        i += 1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Luck', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.show()
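# --- Smoke test (synthetic data): everything below is a stand-in. The
# function itself additionally assumes numpy as np, matplotlib.pyplot as plt,
# and roc_curve/auc from sklearn.metrics are already imported in its module.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X = rng.randn(120, 5)
y = (X[:, 0] + 0.5 * rng.randn(120) > 0).astype(int)
groups = np.repeat(np.arange(12), 10)  # 12 subjects, 10 samples each

plot_roc_with_cv(LogisticRegression(), X, y, groups, cv=6)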
def get_grouped_splits(confused_items, not_confused_items, k):
    """
    Splits data ensuring no users have data in both training and eval sets.

    Args:
        confused_items (list): list of data item names labelled as confused
        not_confused_items (list): list of data item names labelled as not_confused
        k (int): number of folds for cross validation.

    Returns: (in following order)
        train_confused_splits (list): each element is a list containing the
            file names of the data items for this partition of the dataset
        test_confused_splits (list): as above
        train_not_confused_splits (list): as above
        test_not_confused_splits (list): as above
    """
    train_confused_splits = []
    test_confused_splits = []
    train_not_confused_splits = []
    test_not_confused_splits = []

    # make a list where each index corresponds to the "group" (userID)
    groups = [uid.split('_')[0][:-1] for uid in confused_items] + \
             [uid.split('_')[0][:-1] for uid in not_confused_items]

    # split both classes jointly; dummy_y doubles as the class indicator
    dummy_y = [0 for i in range(len(confused_items))] + \
              [1 for i in range(len(not_confused_items))]
    items = confused_items + not_confused_items
    gkf = GroupKFold(n_splits=k)
    gkf.get_n_splits(X=items, y=dummy_y, groups=groups)
    for train, test in gkf.split(X=items, y=dummy_y, groups=groups):
        train_confused_splits.append([items[i] for i in train if dummy_y[i] == 0])
        test_confused_splits.append([items[i] for i in test if dummy_y[i] == 0])
        train_not_confused_splits.append([items[i] for i in train if dummy_y[i] == 1])
        test_not_confused_splits.append([items[i] for i in test if dummy_y[i] == 1])

    return (train_confused_splits, test_confused_splits,
            train_not_confused_splits, test_not_confused_splits)
def kfold_holdout(X, y, groups, splits, holdout):
    group_kfold = GroupKFold(n_splits=splits)
    group_kfold.get_n_splits(X, y, groups)
    d_obj = Data(splits=splits, holdout=holdout)
    for train_index, test_index in group_kfold.split(X, y, groups):
        # in-place shuffling of the index arrays
        shuffle(train_index)
        shuffle(test_index)
        # generate folds
        if holdout:
            if d_obj.X_test_holdout is None:
                # the first fold is reserved as the hold-out set
                d_obj.X_train_holdout, d_obj.X_test_holdout = X[train_index], X[test_index]
                d_obj.y_train_holdout, d_obj.y_test_holdout = y[train_index], y[test_index]
                store_test_index = test_index
            else:
                # drop hold-out indices that re-occur in later train splits
                train_index = [x for x in train_index if x not in store_test_index]
                d_obj.Xs_train.append(X[train_index])
                d_obj.Xs_val.append(X[test_index])
                d_obj.ys_train.append(y[train_index])
                d_obj.ys_val.append(y[test_index])
        else:
            d_obj.Xs_train.append(X[train_index])
            d_obj.Xs_val.append(X[test_index])
            d_obj.ys_train.append(y[train_index])
            d_obj.ys_val.append(y[test_index])
    return d_obj
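# --- Usage sketch: the Data container below is a guess at the interface the
# two kfold_holdout variants expect (only the attributes they touch is
# defined); data and groups are synthetic.
import numpy as np
from random import shuffle
from sklearn.model_selection import GroupKFold

class Data:
    def __init__(self, splits, holdout):
        self.splits, self.holdout = splits, holdout
        self.Xs_train, self.Xs_val = [], []
        self.ys_train, self.ys_val = [], []
        self.X_train_holdout = self.X_test_holdout = None
        self.y_train_holdout = self.y_test_holdout = None

X = np.arange(40).reshape(20, 2)
y = np.arange(20) % 2
groups = np.repeat(np.arange(5), 4)  # 5 groups of 4 samples

d = kfold_holdout(X, y, groups, splits=5, holdout=True)
# the first fold is held out, the remaining 4 become CV folds
print(len(d.Xs_train), 'CV folds;', d.X_test_holdout.shape[0], 'held-out rows')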
def train_test_split_KFold(obj):
    from sklearn.model_selection import GroupKFold
    kf1 = GroupKFold(n_splits=5)
    kf1.get_n_splits(obj.X, obj.Y, obj.MRNs.astype(int))
    hold_out_count = 0
    for main_index, hold_out_index in kf1.split(obj.X, obj.Y, obj.MRNs.astype(int)):
        print(main_index)
        if hold_out_count == 4:  # keep the last of the 5 folds as the hold-out set
            obj.X_hold_out = obj.X[hold_out_index, :]
            obj.Y_hold_out = obj.Y[hold_out_index]
            obj.hold_out_MRNs = obj.MRNs[hold_out_index]
            obj.hold_out_entryDates = obj.entryDates[hold_out_index]
            obj.hold_out_indices = hold_out_index
            obj.X = obj.X[main_index, :]
            obj.Y = obj.Y[main_index]
            obj.MRNs = obj.MRNs[main_index]
            obj.entryDates = obj.entryDates[main_index]
            obj.cv_indices = main_index
        hold_out_count += 1
def create_training_files(data_path, num_folds, training_folder, full_train_path):
    """
    Loads an Excel file containing manually labeled tokens and creates
    tab-delimited files for training a Stanford CRF NER model.

    Blank rows are imported as NaN; they are intended to remain blank in the
    training file to separate "documents".

    :return: Saves a temporary folder in logs/ner_cv that will be used for
        cross-validation (TODO: clean it up in a later function)
    """
    df = pd.read_excel(data_path, sheet_name='Tokens')
    group_kfold = GroupKFold(n_splits=num_folds)
    group_kfold.get_n_splits()
    # group by OG_Text so all tokens from one source text share a fold
    for i, (train_index, test_index) in enumerate(
            group_kfold.split(df.Token, df.Label, df.OG_Text)):
        cv_folder = os.path.join(training_folder, f'fold_{i+1}')
        os.makedirs(cv_folder, exist_ok=True)
        df.iloc[train_index].to_csv(os.path.join(cv_folder, 'train.tsv'),
                                    columns=['Token', 'Label'],
                                    sep='\t', index=False, header=False)
        df.iloc[test_index].to_csv(os.path.join(cv_folder, 'test.tsv'),
                                   columns=['Token'],
                                   sep='\t', index=False, header=False)
        df.iloc[test_index].to_csv(os.path.join(cv_folder, 'labels.tsv'),
                                   columns=['Token', 'Label'],
                                   sep='\t', index=False, header=False)
    df.to_csv(full_train_path, columns=['Token', 'Label'],
              sep='\t', index=False, header=False)
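# --- Illustration (fabricated tokens) of why OG_Text is the right group key:
# every token from the same source text lands in the same fold, so the NER
# model is never evaluated on fragments of a document it trained on.
import pandas as pd
from sklearn.model_selection import GroupKFold

toy = pd.DataFrame({
    'Token': ['John', 'lives', 'in', 'Paris', 'Mary', 'works'],
    'Label': ['PER', 'O', 'O', 'LOC', 'PER', 'O'],
    'OG_Text': ['s1', 's1', 's1', 's1', 's2', 's2'],
})
gkf = GroupKFold(n_splits=2)
for train_idx, test_idx in gkf.split(toy.Token, toy.Label, toy.OG_Text):
    assert set(toy.iloc[train_idx].OG_Text).isdisjoint(toy.iloc[test_idx].OG_Text)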
d = d.drop(["speaker"], axis=1) X, y = d.iloc[:, 1:].values, d.iloc[:, 0].values # 1 Layer def flatten(mylist): return [item for sublist in mylist for item in sublist] cmodel1_tprs = [] cmodel1_aucs = [] cmodel1_resultsA = [] cmodel1_resultsB = [] mean_fpr = np.linspace(0, 1, 100) group_kfold = GroupKFold(n_splits=5) group_kfold.get_n_splits(X, y, speaker) print(group_kfold,) model1_cvscores = [] model1_history_main = [] # This will save the results from all cross-validations sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) # MODEL 1 MODEL_NO = "1" model1 = Sequential() model1.add(Dense(300, input_dim=24, activation='relu')) model1.add(Dense(300, activation='relu')) model1.add(Dense(1, kernel_initializer='uniform', activation='sigmoid')) model1.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
X, y, names, groups, _ = ml_data_parser('30_data.csv')
print('i,random_forest,svm,naive_bayes,logistic_reg,extra_random_forest')
cv_num = 10
for i in range(1, len(names)):
    new_x = X[:, :i]  # use only the first i features
    group_kfold = GroupKFold(n_splits=cv_num)
    group_kfold.get_n_splits(new_x, y, groups)
    xlen = len(names)
    rfacc = []
    svmacc = []
    nbacc = []
    lracc = []
    extrarfacc = []
    for train_index, test_index in group_kfold.split(new_x, y, groups):
        x_train = []
        x_test = []
        y_train = []
        y_test = []
        for id in train_index:
            x_train.append(new_x[id])
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit only to the training data
scaler.fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
y_train = labels_train.flatten()
y_test = labels_test.flatten()

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

# groups_train is assumed to hold one group label per training sample
group_kfold = GroupKFold(n_splits=4)
group_kfold.get_n_splits(X_train, y_train, groups_train)

tuned_parameters = [{'solver': ['sgd'], 'momentum': [0.3, 0.6, 0.9],
                     'learning_rate_init': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5],
                     'nesterovs_momentum': [False, True],
                     'learning_rate': ['constant', 'invscaling', 'adaptive']},
                    {'solver': ['sgd'], 'momentum': [0],
                     'learning_rate_init': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]},
                    {'solver': ['adam'],
                     'learning_rate_init': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]},
                    {'solver': ['lbfgs']}]

scores = ['precision', 'recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
toronto.X = toronto.df.iloc[:, 0:49].values
toronto.Y = toronto.df.iloc[:, 49].values
# `nd` is assumed to be numpy's element-wise string routines
# (numpy.core.defchararray / np.char); replace() maps 'F 4'->'4', etc.
toronto.Y = nd.replace(
    nd.replace(nd.replace(toronto.Y.astype(str), 'F 4', '4'), 'F 1', '0'),
    'F 0', '0').astype(int)
toronto.MRNs = toronto.df.iloc[:, 51]
toronto.entryDates = toronto.df.iloc[:, 52]
toronto.split = 'groupKFold'  # KFold | groupKFold
dft = toronto.df

from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold

kf = GroupKFold(n_splits=10)
normalKF = KFold(n_splits=10, shuffle=True, random_state=0)
kf.get_n_splits(toronto.X, toronto.Y, toronto.MRNs.astype(int))

svmObj.params = {
    'method': 'label',
    'threshold': 0.40,
    'C': 0.5,
    'gamma': 'auto',
    'kernel': 'rbf',
    'degree': 3,
    'coef0': 0.5,
    'shrinking': True,
    'tol': 0.001
}
rfcObj.params = {
    'n_estimators': 100,
    'criterion': 'entropy',
def main():
    parser = OptionParser()
    parser.add_option("-l", "--load_model", dest="loadModel", default=False)
    parser.add_option("-c", "--continue", dest="continueTraining",
                      action='store_true', default=False)
    parser.add_option("--vb", dest="verbose", action='store_true', default=False)
    parser.add_option("-e", "--epochs", dest="epochs", default=100)
    parser.add_option("--bs", "--batchSize", dest="batchSize", default=5)
    parser.add_option("--lr", "--learning_rate", dest="learningRate", default=0.001)
    parser.add_option("-p", "--patience", dest="patience", default=70)
    (options, args) = parser.parse_args()

    # database to use
    dbPath = '../dbHdf5/dataset1_2d_onlyTumor_cropped_x-75-425_y-75-425.hdf5'
    modelDir = '../models'        # where models are saved
    modelArch = 'maskNet002'      # model architecture to use from modelLib.py
    modelName = 'maskNet002_007'  # name to save model with

    epochs = int(options.epochs)
    batchSize = int(options.batchSize)

    modelFolder = os.path.join(modelDir, modelName)
    weightsFolder = os.path.join(modelFolder, "weights")
    ensureDir(weightsFolder)

    notes = ("Model trained on augmented data (hor flip, ver flip and elastic). "
             "Using Dice Coeff Loss")
    with open(os.path.join(modelFolder, "trainingData.txt"), "w") as df:
        df.write("Dataset\t%s\n" % dbPath)
        df.write("Architecture\t%s\n" % modelArch)
        df.write("Batch Size\t%s\n" % batchSize)
        df.write("Notes\t%s\n" % notes)

    db = h5py.File(dbPath, 'r')
    X = db['slice'][...]
    X = np.float32(X)
    X = np.expand_dims(X, -1)
    Y = db['mask'][...]
    Y = np.expand_dims(Y, -1)
    Y = np.float32(Y)
    cases = db['case'][...]
    db.close()

    # grouped CV: all slices from the same case stay in the same fold
    group_kfold = GroupKFold(n_splits=4)
    group_kfold.get_n_splits(X, Y, cases)

    kdx = 0
    for train_index, test_index in group_kfold.split(X, Y, cases):
        kdx += 1
        X_train = X[train_index]
        Y_train = Y[train_index]
        X_test = X[test_index]
        Y_test = Y[test_index]

        with open(os.path.join(modelFolder, "trainingData.txt"), "a") as df:
            df.write("\nTraining Cases for CV-%d (%d)\t" % (kdx, len(train_index)))
            df.write("\t".join(np.unique(cases[train_index])))
            df.write("\n")
            df.write("Test Cases for CV-%d (%d)\t" % (kdx, len(test_index)))
            df.write("\t".join(np.unique(cases[test_index])))
            df.write("\n")

        bestModelPath = os.path.join(weightsFolder, "best_fold_%02d.hdf5" % kdx)
        ensureDir(bestModelPath)

        # creating model
        model = makeModel(modelArch, verbose=options.verbose)
        model.save(os.path.join(modelFolder, modelName + '.h5'))
        adam = Adam(lr=float(options.learningRate), beta_1=0.9, beta_2=0.999,
                    epsilon=1e-06, decay=0.00001)
        model.compile(loss=[customLoss.dice_coef_loss], optimizer=adam)

        # loading model
        if options.loadModel:
            print("\n\nLoading Model Weights:\t %s" % modelName)
            model = load_model(bestModelPath)
            log = np.genfromtxt(os.path.join(modelFolder,
                                             modelName + '_trainingLog.csv'),
                                delimiter=',', dtype=str)[1:, 0]
            epochStart = len(log)
        else:
            epochStart = 0

        print("\nCross Validation Fold : %02d \n" % kdx)

        # callbacks
        check1 = ModelCheckpoint(
            os.path.join(weightsFolder,
                         modelName + "_fold_%02d" % kdx +
                         "_{epoch:02d}-loss-{val_loss:.3f}.hdf5"),
            monitor='val_loss', save_best_only=True, mode='auto')
        check2 = ModelCheckpoint(bestModelPath, monitor='val_loss',
                                 save_best_only=True, mode='auto')
        check3 = EarlyStopping(monitor='val_loss', min_delta=0.01,
                               patience=int(options.patience),
                               verbose=0, mode='auto')
        check4 = CSVLogger(os.path.join(modelFolder,
                                        modelName + '_trainingLog.csv'),
                           separator=',', append=True)
        check5 = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                   patience=int(options.patience),
                                   verbose=1, mode='auto', epsilon=0.0001,
                                   cooldown=0, min_lr=1e-10)

        print("\nInitiating Training:\n")
        model.fit(X_train, Y_train, validation_data=(X_test, Y_test),
                  batch_size=batchSize, epochs=epochs, initial_epoch=epochStart,
                  callbacks=[check1, check2, check3, check4, check5], verbose=1)

        del X_test
        del X_train
        del Y_test
        del Y_train
os.environ["CUDA_VISIBLE_DEVICES"] = '1' with open('/home/kamer/notebooks/data/G9_data/action_data.pkl', 'rb') as f: X, y, z = cPickle.load(f) X_transformer = QuantileTransformer(output_distribution='uniform') X = X_transformer.fit_transform(X.reshape(-1, 128)).reshape(-1, 8, 128) from sklearn.model_selection import GroupKFold group_kfold = GroupKFold(n_splits=5) group_kfold.get_n_splits(X, y, z) from sklearn.utils.class_weight import compute_class_weight epochs = 50 all_preds = [] all_targets = [] for train_index, test_index in group_kfold.split(X, y, z): model = Arch2(in_channels=8, out_channels=6, gap_size=128) model.to(torch.device('cuda')) optimizer = AdamW(params=model.parameters(), lr=1e-4) #2e-4 X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] cw = torch.Tensor(
    axis=1)
respList = np.array([list(data.iloc[0, 4:])])
for i in range(1, len(data)):
    respList = np.append(respList, [list(data.iloc[i, 4:])], axis=0)
print(respList.shape)

docs = list(utterances.values)
groups = data['ObsID']                # KFOLD ADDITION
group_kfold = GroupKFold(n_splits=5)  # KFOLD ADDITION

# vectorize bag of words
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(docs)
docs2 = vectorizer.transform(docs).toarray()

group_kfold.get_n_splits(docs2, respList, groups)  # KFOLD ADDITION
score_array = []  # KFOLD ADDITION
acc_array = []    # KFOLD ADDITION
roc_array = []    # KFOLD ADDITION
for train_index, test_index in group_kfold.split(docs2, respList, groups):  # KFOLD ADDITION
    print("TRAIN:", train_index, "TEST:", test_index)           # KFOLD ADDITION
    X_train, X_test = docs2[train_index], docs2[test_index]     # KFOLD ADDITION
    y_train, y_test = respList[train_index], respList[test_index]  # KFOLD ADDITION
    print(X_train, X_test, y_train, y_test)  # KFOLD ADDITION
    # replaces the prior split:
    # X_train, X_test, y_train, y_test = train_test_split(docs2, respList, test_size=0.2)
# NOTE: relies on X, y, groups, names and num_vars from the enclosing scope;
# MultiSURF is presumably the feature selector from the skrebate package.
fs = MultiSURF().fit(X, y)
ms_array = list(fs.feature_importances_)
feature_importance = {}
num_dic = {}
trans_x = np.transpose(X)
max_val = 0
for i in range(num_vars):
    feature_importance[names[i]] = ms_array[i]
    num_dic[i] = ms_array[i]
    if max_val < num_dic[i]:
        max_val = num_dic[i]
        best_feature = i

for a in range(10):
    x1 = X[:, best_feature].reshape(-1, 1)
    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(x1, y, groups)
    acc_arr = []
    # split on x1 (same rows and groups as X) so the indices match the
    # single-feature matrix actually used for training below
    for train_index, test_index in group_kfold.split(x1, y, groups):
        X_train = [x1[i] for i in train_index]
        X_test = [x1[i] for i in test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]
        clf = RandomForestClassifier().fit(X_train, y_train)
def main():
    n_estimators_default = ITER
    n_fold = N_FOLD

    for t in CTYPES:
        params = PARAMS[t]
        params['random_state'] = SEED
        params['num_threads'] = CPU

        # Train set
        X = pd.read_csv(DATA_PATH/'train'/f'{t}_full.csv', index_col=0)
        X = reduce_mem_usage(X)
        y_all = pd.read_csv(ORIGIN_PATH/'scalar_coupling_contributions.csv').drop('type', axis=1)
        y_all = reduce_mem_usage(y_all)
        X = X.merge(y_all, on=['molecule_name', 'atom_index_0', 'atom_index_1'], how='left')
        ys = {
            'sum': X['scalar_coupling_constant'],
            'fc': X['fc'],
            'sd': X['sd'],
            'pso': X['pso'],
            'dso': X['dso'],
        }
        X = X.drop(['scalar_coupling_constant', 'fc', 'sd', 'pso', 'dso'], axis=1)

        X_test = pd.read_csv(DATA_PATH/'test'/f'{t}_full.csv', index_col=0)
        X_test = reduce_mem_usage(X_test)

        X_all = pd.concat([X, X_test])
        cat_features = []
        for col in X_all.columns:
            if col[-5:] == '_atom' or col in ['atom_A', 'atom_B']:
                cat_features.append(col)
        print(cat_features)
        for col in cat_features:
            print(col)
            X_all[col] = label_encode(X_all[col])
        X = X_all.iloc[:len(X)]
        X_test = X_all.iloc[len(X):]
        del X_all
        gc.collect()

        index_train = X['id']
        groups = X['molecule_name']
        if t[:2] == '1J':
            X_t = X.drop(['atom_index_0', 'atom_index_1', 'id', 'type',
                          'molecule_name'], axis=1)
        elif t[:2] == '2J':
            X_t = X.drop(['atom_index_0', 'atom_index_1', 'atom_index_A', 'id',
                          'type', 'molecule_name'], axis=1)
        elif t[:2] == '3J':
            X_t = X.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                          'atom_index_B', 'id', 'type', 'molecule_name'], axis=1)

        index_test = X_test['id']
        if t[:2] == '1J':
            X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'id', 'type',
                                    'molecule_name'], axis=1)
        elif t[:2] == '2J':
            X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                                    'id', 'type', 'molecule_name'], axis=1)
        elif t[:2] == '3J':
            X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                                    'atom_index_B', 'id', 'type', 'molecule_name'],
                                   axis=1)

        params['categorical_feature'] = [X_t.columns.get_loc(x) for x in cat_features]

        for ytype, y_t in ys.items():
            res = []
            if opt.icm and ytype == 'sum':
                continue
            elif not opt.icm and ytype != 'sum':
                continue

            # Split data with grouped folds (a molecule never spans folds)
            folds = GroupKFold(n_splits=n_fold)
            folds.get_n_splits(X_t, y_t, groups)

            # Train!
            print(f'Starting {t} / {ytype}')
            print(f'Params:\n{params}')
            result_dict_lgb3 = train_model_regression(
                X=X_t, X_test=X_test_t, y=y_t, params=params, folds=folds,
                model_type='lgb', eval_metric='mae',
                plot_feature_importance=True, verbose=1000,
                early_stopping_rounds=200, n_estimators=n_estimators_default,
                groups=groups, feature_importance_path=f'results/{t}.png')

            if opt.icm:
                res.append((f'{t}_{ytype}', result_dict_lgb3))
                with open(RESULT_PATH/f'{t}_{ytype}.pkl', 'wb') as f:
                    pickle.dump(res, f)
            else:
                res.append((t, result_dict_lgb3))
                with open(RESULT_PATH/f'{t}.pkl', 'wb') as f:
                    pickle.dump(res, f)
def main():
    X = pd.read_csv(DATA_PATH / 'train' / f'{CTYPE}_full.csv', index_col=0)
    X = reduce_mem_usage(X)
    y = X['scalar_coupling_constant']
    X = X.drop(['scalar_coupling_constant'], axis=1)
    X_test = pd.read_csv(DATA_PATH / 'test' / f'{CTYPE}_full.csv', index_col=0)
    X_test = reduce_mem_usage(X_test)

    X = X.fillna(0)
    X_test = X_test.fillna(0)

    X_all = pd.concat([X, X_test])
    cat_features = []
    for col in X_all.columns:
        if col[-5:] == '_atom' or col in ['atom_A', 'atom_B']:
            cat_features.append(col)
    print(cat_features)
    for col in cat_features:
        print(col)
        X_all[col] = label_encode(X_all[col])
    print('dummie', X_all.shape)
    X_all = pd.get_dummies(X_all, columns=cat_features, drop_first=True, dummy_na=True)
    print('->', X_all.shape)
    X = X_all.iloc[:len(X)]
    X_test = X_all.iloc[len(X):]
    del X_all
    gc.collect()

    index_train = X['id']
    groups = X['molecule_name']
    if CTYPE[:2] == '1J':
        X_t = X.drop(['atom_index_0', 'atom_index_1', 'id', 'type',
                      'molecule_name'], axis=1)
    elif CTYPE[:2] == '2J':
        X_t = X.drop(['atom_index_0', 'atom_index_1', 'atom_index_A', 'id',
                      'type', 'molecule_name'], axis=1)
    elif CTYPE[:2] == '3J':
        X_t = X.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                      'atom_index_B', 'id', 'type', 'molecule_name'], axis=1)
    y_t = y

    index_test = X_test['id']
    if CTYPE[:2] == '1J':
        X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'id', 'type',
                                'molecule_name'], axis=1)
    elif CTYPE[:2] == '2J':
        X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                                'id', 'type', 'molecule_name'], axis=1)
    elif CTYPE[:2] == '3J':
        X_test_t = X_test.drop(['atom_index_0', 'atom_index_1', 'atom_index_A',
                                'atom_index_B', 'id', 'type', 'molecule_name'],
                               axis=1)

    sc = StandardScaler()
    X_t = sc.fit_transform(X_t)
    X_test_t = sc.transform(X_test_t)

    folds = GroupKFold(n_splits=N_FOLD)
    folds.get_n_splits(X_t, y_t, groups)
    fold_split = folds.split(X_t, y_t, groups)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # for test set prediction
    X_test_t = torch.tensor(X_test_t, dtype=torch.float).to(device)
    test_ds = torch.utils.data.TensorDataset(X_test_t)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    # out-of-fold prediction buffer, filled fold by fold below
    # (must be allocated here; it is indexed with oof[valid_idx] later)
    oof = np.zeros(len(X_t))
    prediction = np.zeros(len(X_test_t))
    avg_losses = []
    avg_val_losses = []

    for fold_i, (train_idx, valid_idx) in enumerate(fold_split):
        print(f'Fold {fold_i + 1} started at {time.ctime()}')

        # dataset
        X_train = torch.tensor(X_t[train_idx.astype(int)], dtype=torch.float).to(device)
        X_valid = torch.tensor(X_t[valid_idx.astype(int)], dtype=torch.float).to(device)
        y_train = torch.tensor(np.array(y_t)[train_idx.astype(int), np.newaxis],
                               dtype=torch.float).to(device)
        y_valid = torch.tensor(np.array(y_t)[valid_idx.astype(int), np.newaxis],
                               dtype=torch.float).to(device)
        train_ds = torch.utils.data.TensorDataset(X_train, y_train)
        valid_ds = torch.utils.data.TensorDataset(X_valid, y_valid)
        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False)

        # define a fresh model for each fold
        model = Simple_NN(X_train.shape[1], HIDDEN_DIM, activation=nn.LeakyReLU())
        model.to(device)

        # criterion = nn.L1Loss()
        criterion = nn.SmoothL1Loss()
        mae = nn.L1Loss()

        step_size = 5
        base_lr, max_lr = DEFAULT_LR, 5 * DEFAULT_LR
        # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=max_lr)
        optimizer = RAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=max_lr)
        scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                             step_size=step_size, mode='exp_range', gamma=0.99994)
        early_stopping = EarlyStopping(patience=EARLY_STOPPING_ROUNDS, verbose=True)
        best_weight = {'epoch': None, 'state_dict': None}

        if torch.cuda.device_count() > 1:
            print('{} gpus found.'.format(torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        for epoch in range(EPOCH):
            start_time = time.time()
            model.train()
            avg_loss = 0.
            avg_mae = 0.

            # train
            for batch_i, (x, y) in enumerate(train_loader):
                y_pred = model(x)
                if scheduler:
                    scheduler.batch_step()
                loss = criterion(y_pred, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            # valid
            model.eval()
            oof_fold = np.zeros(X_valid.size(0))
            prediction_fold = np.zeros(len(X_test_t))
            avg_val_loss = 0.
            for batch_i, (x, y) in enumerate(valid_loader):
                y_pred = model(x).detach()
                loss = criterion(y_pred, y)
                metric = mae(y_pred, y)
                avg_val_loss += loss.item() / len(valid_loader)
                avg_mae += metric.item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            if early_stopping(avg_val_loss, model):  # score updated
                print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t MAE={:.4f} \t time={:.2f}s'
                      .format(epoch + 1, EPOCH, avg_loss, avg_val_loss, avg_mae, elapsed_time))
                best_weight['epoch'] = epoch
                best_weight['state_dict'] = model.state_dict()
            if early_stopping.early_stop:
                print("Early stopping!")
                break

        avg_losses.append(avg_loss)
        avg_val_losses.append(avg_val_loss)

        # predict with the best weights found for this fold
        print('best epoch for fold {} is {}'.format(fold_i + 1, best_weight['epoch'] + 1))
        model.load_state_dict(best_weight['state_dict'])
        for batch_i, (x, _) in enumerate(valid_loader):
            y_pred = model(x).detach()
            oof_fold[batch_i * BATCH_SIZE:(batch_i + 1) * BATCH_SIZE] = y_pred.cpu().numpy()[:, 0]
        for batch_i, (x, ) in enumerate(test_loader):
            y_pred = model(x).detach()
            prediction_fold[batch_i * BATCH_SIZE:(batch_i + 1) * BATCH_SIZE] = y_pred.cpu().numpy()[:, 0]
        oof[valid_idx] = oof_fold
        prediction += prediction_fold / N_FOLD

    # results
    overall_mae = mean_absolute_error(oof, y_t.values)
    overall_logmae = np.log(overall_mae)
    print('Overall \t loss={:.4f} \t val_loss={:.4f} \t MAE={:.4f} \t logMAE={:.4f}'
          .format(np.average(avg_losses), np.average(avg_val_losses),
                  overall_mae, overall_logmae))

    res = []
    res_dict = {'oof': oof, 'prediction': prediction}
    res.append((CTYPE, res_dict))
    with open(f'{CTYPE}_DNN.pkl', 'wb') as f:
        pickle.dump(res, f)
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GroupKFold

a = np.ones([5, 20])
b = np.zeros([1, 20])
b[0, 10:] = 1
a = a.T
b = b.T

####################
# train_test_split
####################
x_train, x_test, y_train, y_test = train_test_split(a, b, test_size=0.2, shuffle=False)
print("y_train")
print(y_train)
print("y_test")
print(y_test)

####################
# KFold
####################
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(a):
    print("train_index:", train_index, ",test_index:", test_index)

####################
# GroupKFold
####################
# GroupKFold requires a groups array; without one, split() raises ValueError.
# Here each group is assumed to cover 4 consecutive samples (5 groups of 4).
groups = np.repeat(np.arange(5), 4)
kf2 = GroupKFold(n_splits=5)
res2 = kf2.get_n_splits(a, b, groups)
for train_index, test_index in kf2.split(a, b, groups):
    print("train_index:", train_index, ",test_index:", test_index)
# import these:
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
# StratifiedGroupKFold could be used in the future

# after data processing, add these:
groups = data['ObsID']                # selects the column to group by
group_kfold = GroupKFold(n_splits=5)  # set number of splits
# split where docs2 = utterance values, respList = 7 classifier columns
group_kfold.get_n_splits(docs2, respList, groups)

# loop through each of the 5 splits:
score_array = []
acc_array = []
roc_array = []
for train_index, test_index in group_kfold.split(docs2, respList, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = docs2[train_index], docs2[test_index]
    y_train, y_test = respList[train_index], respList[test_index]
    print(X_train, X_test, y_train, y_test)

    # Remove your prior split, i.e.
    # X_train, X_test, y_train, y_test = train_test_split(docs2, respList, test_size=0.2)

    # Run the prior training as before, then append the new values on each loop:
    score_array.append(score)                                           # KFOLD ADDITION
    acc_array.append(acc)                                               # KFOLD ADDITION
    roc_array.append(roc_auc_score(y_test, y_pred, multi_class='ovr'))  # KFOLD ADDITION
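# --- Self-contained sketch of the recipe above, with fabricated utterances;
# docs2/respList/ObsID mirror the names used in the guide, and the classifier
# step is left as a comment since it depends on the prior training code.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GroupKFold

data = pd.DataFrame({
    'utterance': ['hi there', 'how are you', 'fine thanks',
                  'bye now', 'see you', 'good morning'],
    'label': [0, 1, 1, 0, 0, 1],
    'ObsID': [1, 1, 2, 2, 3, 3],
})
docs2 = CountVectorizer(lowercase=False).fit_transform(data['utterance']).toarray()
respList = data['label'].values
groups = data['ObsID']

group_kfold = GroupKFold(n_splits=3)
for train_index, test_index in group_kfold.split(docs2, respList, groups):
    X_train, X_test = docs2[train_index], docs2[test_index]
    y_train, y_test = respList[train_index], respList[test_index]
    # ...train and score your classifier here, appending to the arrays above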
def group_test_2(pre_x, kmeans_labels, names, num_dic, groups, num_vars, meta_i):
    # NOTE: relies on a global `y`; pass the labels explicitly if refactoring
    chosen_vars = np.zeros(meta_i)
    # NOTE: zero-initialised, so features with negative scores are never
    # selected (group_test_3 starts at -1 to allow that)
    chosen_values = np.zeros(meta_i)
    for i in range(num_vars):  # TODO: clean this routine up
        new_val = num_dic[i]
        clust = kmeans_labels[i]
        old_val = chosen_values[clust]
        if old_val < new_val:
            chosen_vars[clust] = int(i)
            chosen_values[clust] = new_val

    chosen_works = [int(qq) for qq in chosen_vars]
    chosen_names = [names[i] for i in chosen_works]
    X = pre_x[:, chosen_works]

    group_kfold = GroupKFold(n_splits=3)
    group_kfold.get_n_splits(X, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = [X[i] for i in train_index]
        X_test = [X[i] for i in test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]
        clf = RandomForestClassifier().fit(X_train, y_train)
        acc_arr.append(clf.score(X_test, y_test))
    return np.mean(acc_arr), chosen_names
X_train = np.loadtxt("data/05_train_df/%s_x_train.csv" % my_dat, skiprows=1, delimiter=",") Y_train = np.loadtxt("data/05_train_df/%s_y_train.csv" % my_dat, skiprows=1, delimiter=",") # do a different split to try these X = X_train y = Y_train #X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify = y, random_state=28) fold_brk = pd.read_csv("data/05_train_df/%s_folds.csv" % my_dat) group_kfold = GroupKFold(n_splits=3) group_kfold.get_n_splits(X, y, grps) for train_index, test_index in group_kfold.split(X, y, grps): #print("TRAIN:", train_index, "TEST:", test_index) X_train, X_valid = X[train_index], X[test_index] y_train, y_valid = y[train_index], y[test_index] print(X_train.shape) print(X_valid.shape) rf_acc = run_acc(RandomForestClassifier(), "RandomForest") #print(X_train, X_test, y_train, y_test) for fold in range(1, 7): train_idx = [x != fold for x in fold_brk['partition'].tolist()] valid_idx = [x == fold for x in fold_brk['partition'].tolist()] X_train = X[train_idx, :] y_train = y[train_idx]
# And test those predictions
kappa = cohen_kappa_score(y, predictions)

# Print it up
print("model kappa using XGBClassifier: %.2f" % kappa)

####################################################################################################
# Question 7
####################################################################################################

from sklearn.model_selection import GroupKFold

# split our data
gkf = GroupKFold(n_splits=10)
gkf.get_n_splits()  # takes no arguments here; just reports the fold count

# Create a list of unique users and their indices
group_dict = {}
groups = np.array([])
for index, row in df_dummies.iterrows():
    student_id = row['STUDENTID']
    if student_id not in group_dict:
        group_dict[student_id] = index
    groups = np.append(groups, group_dict[student_id])

# train and test all the data
kappa_sum = 0
print("Decision Tree")
for i, data_folds in enumerate(gkf.split(x, y, groups=groups)):
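# --- Aside (equivalent construction): the per-row group labels built in the
# loop above can be produced in one call with pandas.factorize; the frame
# below is a stand-in for df_dummies.
import pandas as pd

df_dummies = pd.DataFrame({'STUDENTID': ['a', 'a', 'b', 'c', 'b']})
groups = pd.factorize(df_dummies['STUDENTID'])[0]
print(groups)  # [0 0 1 2 1] -- one integer per row, constant per student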
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit only to the training data
scaler.fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)
y_train = labels_train.flatten()
y_test = labels_test.flatten()

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

group_kfold = GroupKFold(n_splits=4)
group_kfold.get_n_splits(X_train, y_train, groups_train)

tuned_parameters = [{
    'hidden_layer_sizes': [[6, 6, 6]]
}, {
    'hidden_layer_sizes': [[3, 3, 3]]
}, {
    'hidden_layer_sizes': [[5, 5, 5]]
}, {
    'hidden_layer_sizes': [[6, 6]]
}, {
    'hidden_layer_sizes': [[5, 5]]
}, {
    'hidden_layer_sizes': [[3, 3]]
}, {
    'hidden_layer_sizes': [[6, 5, 3]]
# `length` is not a Python builtin; len() is intended here
y = np.hstack((y, i_class_label * np.ones((len(state_data), ), dtype='int')))

# Update class label
i_class_label += int(1)

# Transpose
x = x.transpose((1, 0))

#%% Plot 3D backscatter values
#labels_dict = None  # dict((['live', 'defo'], ['live', 'defo']))
#modalitypoints3d('reciprocity', x, y, labels_dict=labels_dict, title=dataset_use)

#%% Classify
group_kfold.get_n_splits(X=x, y=y, groups=groups)

# Cross-validate - kNN - All data
knn_all = KNeighborsClassifier(n_neighbors=knn_k)
knn_scores_all = cross_val_score(knn_all, x, y, groups=groups, cv=crossval_use)
knn_mean_acc[dataset_use] = np.mean(knn_scores_all)
knn_all_acc[dataset_use] = knn_scores_all

rf_all = RandomForestClassifier(n_estimators=rf_ntrees, random_state=0)
rf_scores_all = cross_val_score(rf_all,