# Imports shared by the training scripts excerpted below; repo-local helpers
# (set_args, data_label_split, use_nuclei_feature, use_nuclei_gran_feature,
# normalize_by_group, generate_data_set, dmso_taxol_ProfileBag, ...) are
# imported from elsewhere in the repository.
import datetime
import os

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.utils.data as D
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm


def main():
    args = set_args()
    # Timestamp the output directory so repeated runs do not overwrite each other.
    time = datetime.datetime.now()
    args.output_dir += time.strftime("%b%d/")
    os.makedirs(args.output_dir, exist_ok=True)

    drop_NA_data = pd.read_csv(args.data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    if args.use_nuclei:
        X = use_nuclei_feature(X)
    elif args.use_nuclei_gran:
        X = use_nuclei_gran_feature(X)

    # Normalize features plate by plate, then drop all-NaN (zero-variance) columns.
    X['Metadata_PlateID_Nuclei'] = drop_NA_data['Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()

    models = [
        KNeighborsClassifier(30),
        LogisticRegression(max_iter=1000, solver="saga", n_jobs=-1),
        RandomForestClassifier(min_samples_split=50, random_state=0),
        MLPClassifier(solver="adam", max_iter=100),
    ]
    # When run as a SLURM job array, each task trains one of the models above.
    envs = os.environ
    if "SLURM_ARRAY_TASK_ID" in envs:
        model = models[int(envs['SLURM_ARRAY_TASK_ID'])]
    else:
        model = models[1]
    print('using model %s, data %s' % (str(model).split("(")[0], args.data_path))
    train(args, X, model, 0)
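# data_label_split and normalize_by_group are repo-local helpers. Minimal
# sketches of the behaviour main() relies on; the column conventions
# ("compound" label, "Metadata_" prefix) are inferred from the calls above
# and may not match the repository's actual implementations exactly.
def data_label_split(data):
    """Split a profile DataFrame into features X and the 'compound' label y (sketch)."""
    y = data[["compound"]]
    meta_cols = [c for c in data.columns if c.startswith("Metadata_")]
    X = data.drop(columns=["compound"] + meta_cols)
    return X, y


def normalize_by_group(data, group_col):
    """Z-score every feature column within each group, e.g. per plate (sketch)."""
    grouped = data.groupby(group_col)
    # transform drops the grouping key, which is why main() re-attaches the
    # label column afterwards; features with zero within-group variance become
    # NaN and are removed by the dropna(axis='columns') call above.
    return grouped.transform(lambda col: (col - col.mean()) / col.std())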
def train(args, data, model, verbose, parallel=True, bag_perc=0.5):
    if parallel:
        results = multi_mini_noise_signal_cv(args, data, "taxol", "DMSO", model,
                                             verbose, bag_perc)
    else:
        results = mini_noise_signal_cv(args.bagsize, data, "taxol", "DMSO",
                                       model, args.splits, verbose, bag_perc)
    results = pd.DataFrame.from_dict(results, orient="index")
    results.columns = [
        "mean_accuracy",
        "std_accuracy",
        "mean_pred_score_control",
        "std_pred_score_control",
        "mean_pred_score_treatment",
        "std_pred_score_treatment",
    ]
    model_name = str(model).split("(")[0]
    feature_size = len(data_label_split(data)[0].columns)
    result_path = os.path.join(
        args.output_dir,
        "%s_sample%s_feature%s.csv" % (model_name, args.bagsize, feature_size))
    # Append when the result file already exists (e.g. across array tasks).
    if os.path.exists(result_path):
        results.to_csv(result_path, mode="a", header=False)
    else:
        results.to_csv(result_path)
def mini_noise_signal_cv(
    size: int,
    data: pd.DataFrame,
    treatment: str,
    control: str,
    model,
    cv: int,
    verbose: int,
    bag_perc: float = 0.5,
) -> dict:
    """Sweep the treatment vs. control mixing percentage and cross-validate at each step."""
    results = {}
    for i in tqdm(range(5, 96, 5)):
        mini_batch = generate_data_set(size, i / 100, data, treatment, control,
                                       bag_perc)
        X, y = data_label_split(mini_batch)
        # Labels stay as strings ("DMSO"/"taxol"); sklearn encodes them internally.
        y = y["compound"]
        mean_accuracy, pred_score_control, pred_score_treatment = kfold_train(
            cv, X, y, model, "DMSO", "taxol", verbose=verbose)
        # Key by percentage so DataFrame.from_dict(..., orient="index") in
        # train() yields one row per noise level with six columns.
        results[i / 100] = [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    return results
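# kfold_train and data_standardization are defined elsewhere in the repo.
# Minimal sketches of the behaviour mini_noise_signal_cv relies on, assuming
# stratified K-fold CV with per-fold standardization (mirroring
# multi_kfold_train below); the real implementations may differ.
def data_standardization(X):
    """Zero-mean, unit-variance scaling of each feature column (sketch)."""
    return (X - X.mean()) / X.std()


def kfold_train(cv, X, y, model, control, treatment, verbose=0):
    """Stratified K-fold training returning per-fold accuracies and class scores (sketch)."""
    skf = StratifiedKFold(n_splits=cv)
    accuracies, score_control, score_treatment = [], [], []
    for train_index, test_index in skf.split(X, y):
        X_train = data_standardization(X.iloc[train_index])
        X_test = data_standardization(X.iloc[test_index])
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        fitted = model.fit(X_train, y_train)
        # classes_ are sorted alphabetically, so column 0 is "DMSO" (control)
        # and column 1 is "taxol" (treatment).
        score_control.extend(fitted.predict_proba(X_test[y_test == control])[:, 0])
        score_treatment.extend(fitted.predict_proba(X_test[y_test == treatment])[:, 1])
        accuracies.append(fitted.score(X_test, y_test))
    return accuracies, score_control, score_treatment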
def main():
    args = set_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        print("GPU is ON! with GPU %s" % torch.cuda.current_device())

    time = datetime.datetime.now()
    args.output_dir += time.strftime("%b%d/")
    os.makedirs(args.output_dir, exist_ok=True)

    drop_NA_data = pd.read_csv(args.data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    if args.use_nuclei:
        X = use_nuclei_feature(X)
    elif args.use_nuclei_gran:
        X = use_nuclei_gran_feature(X)
    X['Metadata_PlateID_Nuclei'] = drop_NA_data['Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()
    data = X

    feature_size = len(data_label_split(data)[0].columns)
    # When run as a SLURM job array, each task evaluates one pooling strategy.
    pools = ['att', 'mean', 'max', 'min']
    if "SLURM_ARRAY_TASK_ID" in os.environ:
        args.pool = pools[int(os.environ['SLURM_ARRAY_TASK_ID'])]

    for i in range(args.start, args.end, 5):
        # Define a fresh model for each noise percentage.
        if args.pool == 'att':
            model = profile_AttSet(feature_size, args.thres)
        else:
            model = FullDeepSet(feature_size, args.pool, args.thres)
        if args.cuda:
            model.cuda()
        # mini_noise_signal_cv below re-creates its own optimizer per fold.
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(0.9, 0.999),
                               weight_decay=args.reg)

        results = mini_noise_signal_cv(i, i + 1, data, "taxol", "DMSO", model,
                                       args)
        results = pd.DataFrame.from_dict(results, orient="index")
        results.columns = [
            "mean_accuracy",
            "std_accuracy",
            "mean_control_accuracy",
            "std_control_accuracy",
            "mean_treat_accuracy",
            "std_treat_accuracy",
            "mean_pred_score_control",
            "std_pred_score_control",
            "mean_pred_score_treatment",
            "std_pred_score_treatment",
        ]
        res_path = os.path.join(
            args.output_dir,
            "%s_deepset_thres%.1f_bags%d*%d_bagsize%d_feature%d.csv" % (
                args.pool,
                args.thres,
                args.num_bags_train,
                args.batch_size,
                args.mean_bag_length,
                feature_size,
            ))
        if os.path.exists(res_path):
            results.to_csv(res_path, mode="a", header=False)
        else:
            results.to_csv(res_path)
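# profile_AttSet and FullDeepSet live elsewhere in the repository. A minimal
# sketch of a deep-set style bag classifier with configurable pooling, to show
# the interface assumed above and in mini_noise_signal_cv (input_feature, pool
# and thres attributes); the hidden sizes and real architecture are guesses.
import torch.nn as nn


class FullDeepSet(nn.Module):
    def __init__(self, input_feature, pool="mean", thres=0.5):
        super().__init__()
        self.input_feature = input_feature
        self.pool = pool
        self.thres = thres  # decision threshold on the bag probability
        # phi embeds each instance; rho maps the pooled embedding to a probability.
        self.phi = nn.Sequential(nn.Linear(input_feature, 128), nn.ReLU())
        self.rho = nn.Sequential(nn.Linear(128, 1), nn.Sigmoid())

    def forward(self, bag):
        # bag: (1, n_instances, input_feature) from a batch_size=1 DataLoader.
        h = self.phi(bag.squeeze(0))
        if self.pool == "max":
            z = h.max(dim=0).values
        elif self.pool == "min":
            z = h.min(dim=0).values
        else:  # "mean"
            z = h.mean(dim=0)
        return self.rho(z)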
def multi_kfold_train_bag(perc, args, data, model, control=0, treatment=1,
                          verbose=0):
    X, y = data_label_split(data)
    y = y["compound"]
    skf = StratifiedKFold(n_splits=args.splits)
    pred_score_control = np.array([])
    pred_score_treatment = np.array([])
    mean_accuracy = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        print('start perc %s, split %s' % (perc, i))
        # Standardize train and test folds separately.
        X_train, X_test = (
            data_standardization(X.iloc[train_index]),
            data_standardization(X.iloc[test_index]),
        )
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train on instance-level data sampled at the given noise percentage ...
        X_train = pd.concat([X_train, y_train], axis=1, sort=False)
        X_train, y_train = data_label_split(
            generate_data_set(args.bagsize, perc, X_train, treatment, control))
        y_train = y_train['compound']

        # ... but evaluate on bags drawn from the held-out fold.
        X_test = pd.concat([X_test, y_test], axis=1, sort=False)
        valida_dataset = dmso_taxol_ProfileBag(
            X_test,
            int(args.num_bags_train / args.splits),
            args.mean_bag_length,
            args.var_bag_length,
            perc,
            treatment,
            control,
            args.batch_size,
            0.5,
        )
        valida_loader = D.DataLoader(valida_dataset, batch_size=1, shuffle=True)

        lgs = model.fit(X_train, y_train)
        acc_control, acc_treat, pred_score_cont, pred_score_treat = test_bag_model(
            lgs, valida_loader)
        pred_score_control = np.append(pred_score_control, pred_score_cont)
        pred_score_treatment = np.append(pred_score_treatment, pred_score_treat)
        mean_accuracy.append(np.mean(acc_control + acc_treat))

    if args.save_score:
        # Use the bare model name so the file name stays free of parentheses.
        with open('%s_%f.txt' % (str(model).split("(")[0], perc), 'w') as f:
            f.write(','.join("%.4f" % s for s in pred_score_control.tolist()) + '\n')
            f.write(','.join("%.4f" % s for s in pred_score_treatment.tolist()))
    return {
        perc: [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    }
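# test_bag_model is defined elsewhere in the repo. A minimal sketch, assuming
# a bag is scored by averaging the fitted model's instance-level probabilities
# and that the loader yields (bag, label) pairs with label 1 for treatment
# bags; these assumptions are inferred from the call above, not confirmed.
def test_bag_model(fitted, loader):
    acc_control, acc_treat = [], []
    score_control, score_treat = [], []
    for bag, label in loader:
        instances = bag.squeeze(0).numpy()
        # Mean treatment probability over the bag's instances.
        p_treat = fitted.predict_proba(instances)[:, 1].mean()
        if int(label) == 1:  # treatment bag
            score_treat.append(p_treat)
            acc_treat.append(float(p_treat >= 0.5))
        else:  # control bag
            score_control.append(1 - p_treat)
            acc_control.append(float(p_treat < 0.5))
    return acc_control, acc_treat, score_control, score_treat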
def multi_kfold_train(perc, args, data, model, control=0, treatment=1,
                      verbose=0):
    mini_batch = generate_data_set(args.bagsize, perc, data, treatment, control)
    X, y = data_label_split(mini_batch)
    y = y["compound"]
    skf = StratifiedKFold(n_splits=args.splits)
    pred_score_control = np.array([])
    pred_score_treatment = np.array([])
    mean_accuracy = []
    if isinstance(X, np.ndarray):
        for i, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)),
                                                 desc="K fold CV"):
            if verbose != 0:
                print("Fold %d" % i, "TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            lgs = model.fit(X_train, y_train)
            # classes_ are sorted alphabetically, so column 0 is "DMSO" and
            # column 1 is "taxol".
            pred_score_control = np.append(
                pred_score_control,
                lgs.predict_proba(X_test[y_test == "DMSO"])[:, 0])
            pred_score_treatment = np.append(
                pred_score_treatment,
                lgs.predict_proba(X_test[y_test == "taxol"])[:, 1])
            mean_accuracy.append(lgs.score(X_test, y_test))
    elif isinstance(X, pd.DataFrame):
        for i, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)),
                                                 desc="K fold CV"):
            if verbose != 0:
                print("Fold %d" % i, "TRAIN:", train_index, "TEST:", test_index)
            # DataFrame input additionally gets per-fold standardization.
            X_train, X_test = (
                data_standardization(X.iloc[train_index]),
                data_standardization(X.iloc[test_index]),
            )
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            lgs = model.fit(X_train, y_train)
            pred_score_control = np.append(
                pred_score_control,
                lgs.predict_proba(X_test[y_test == "DMSO"])[:, 0])
            pred_score_treatment = np.append(
                pred_score_treatment,
                lgs.predict_proba(X_test[y_test == "taxol"])[:, 1])
            mean_accuracy.append(lgs.score(X_test, y_test))
    return {
        perc: [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    }
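# multi_mini_noise_signal_cv is called by train() above but not shown in this
# excerpt. A minimal sketch of a plausible implementation, assuming it fans
# multi_kfold_train out over the noise percentages with a process pool and
# merges the per-percentage dicts; note bag_perc is accepted for interface
# compatibility but unused here, since multi_kfold_train does not take it.
from functools import partial
from multiprocessing import Pool


def multi_mini_noise_signal_cv(args, data, treatment, control, model,
                               verbose, bag_perc=0.5):
    percs = [i / 100 for i in range(5, 96, 5)]
    worker = partial(multi_kfold_train, args=args, data=data, model=model,
                     control=control, treatment=treatment, verbose=verbose)
    # Each worker process gets its own pickled copy of the data and model.
    with Pool() as pool:
        partial_results = pool.map(worker, percs)
    results = {}
    for d in partial_results:
        results.update(d)
    return results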
def main():
    args = set_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        print("GPU is ON! with GPU %s" % torch.cuda.current_device())

    data_path = 'moa_data_drop_NA.csv'
    drop_NA_data = pd.read_csv(data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    X['Metadata_PlateID_Nuclei'] = drop_NA_data['Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()
    data = X
    feature_size = len(data_label_split(data)[0].columns)

    for i in range(args.start, args.end, 5):
        # Define the attention-pooled model.
        model = profile_AttSet(feature_size, "att", args.thres)
        if args.cuda:
            model.cuda()
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(0.9, 0.999),
                               weight_decay=args.reg)
        results = mini_noise_signal_cv(i, i + 1, data, "taxol", "DMSO", model,
                                       args)
        results = pd.DataFrame.from_dict(results, orient="index")
        results.columns = [
            "mean_accuracy",
            "std_accuracy",
            "mean_control_accuracy",
            "std_control_accuracy",
            "mean_treat_accuracy",
            "std_treat_accuracy",
            "mean_pred_score_control",
            "std_pred_score_control",
            "mean_pred_score_treatment",
            "std_pred_score_treatment",
        ]
        # Build the result path once instead of repeating the format string.
        res_path = "deepset_att%.1f_bags%d*%d_bagsize%d_feature%d.csv" % (
            args.thres,
            args.num_bags_train,
            args.batch_size,
            args.mean_bag_length,
            feature_size,
        )
        if os.path.exists(res_path):
            results.to_csv(res_path, mode="a", header=False)
        else:
            results.to_csv(res_path)
def mini_noise_signal_cv(start, end, data, treatment, control, model, args):
    dic = {}
    # Sweep the percentage of treatment vs. control instances per bag.
    for j in range(start, end, 5):
        X, y = data_label_split(data)
        y = y["compound"]
        acc_control_list = []
        acc_treat_list = []
        pred_score_control_list = []
        pred_score_treat_list = []
        # Stratified K-fold CV.
        skf = StratifiedKFold(n_splits=args.splits)
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = (
                data_standardization(X.iloc[train_index]),
                data_standardization(X.iloc[test_index]),
            )
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            X_train = pd.concat([X_train, y_train], axis=1, sort=False)
            X_test = pd.concat([X_test, y_test], axis=1, sort=False)

            # Redefine the dataloaders and retrain the model at each fold.
            train_dataset = dmso_taxol_ProfileBag(
                X_train,
                int(args.num_bags_train * (args.splits - 1) / args.splits),
                args.mean_bag_length,
                args.var_bag_length,
                j / 100,
                treatment,
                control,
                args.batch_size,
                0.5,
                True,
            )
            valida_dataset = dmso_taxol_ProfileBag(
                X_test,
                int(args.num_bags_train / args.splits),
                args.mean_bag_length,
                args.var_bag_length,
                j / 100,
                treatment,
                control,
                args.batch_size,
                0.5,
            )
            train_loader = D.DataLoader(train_dataset, batch_size=1, shuffle=True)
            valida_loader = D.DataLoader(valida_dataset, batch_size=1, shuffle=True)

            # Re-initialize the model weights so each fold starts from scratch.
            model.__init__(model.input_feature, model.pool, model.thres)
            if args.cuda:
                model.cuda()
            optimizer = optim.Adam(
                model.parameters(),
                lr=args.lr,
                betas=(0.9, 0.999),
                weight_decay=args.reg,
            )

            minimum_error = float("inf")
            best_result = None
            early_stop = []
            for epoch in range(args.epochs):
                epoch_result = []
                print("Train, Percent: %d, Fold: %d, " % (j, i), end="")
                train_loss, train_error = train(args, epoch, train_loader,
                                                model, optimizer, 1)
                epoch_result.append(train_loss)
                epoch_result.append(train_error)

                # Evaluate on the held-out fold and keep the best epoch.
                print("Test, Percent: %d, Fold: %d, " % (j, i), end="")
                acc_control, acc_treat, pred_score_control, pred_score_treat = test(
                    args, model, valida_loader)
                if 1 - np.mean(acc_control + acc_treat) < minimum_error:
                    minimum_error = 1 - np.mean(acc_control + acc_treat)
                    best_result = (
                        acc_control,
                        acc_treat,
                        pred_score_control,
                        pred_score_treat,
                    )
                epoch_result.append(1 - np.mean(acc_control))
                epoch_result.append(1 - np.mean(acc_treat))

                # Keep a sliding window of the last five epochs.
                early_stop.append(epoch_result)
                if len(early_stop) > 5:
                    early_stop.pop(0)
                # Stop if loss and training+testing error are close to 0 in
                # five consecutive epochs.
                if len(early_stop) == 5 and np.mean(early_stop) <= 1e-6:
                    break

            acc_control_list += best_result[0]
            acc_treat_list += best_result[1]
            pred_score_control_list += best_result[2]
            pred_score_treat_list += best_result[3]
            print(np.mean(best_result[0] + best_result[1]))

        dic[j / 100] = [
            np.mean(acc_control_list + acc_treat_list),
            np.std(acc_control_list + acc_treat_list),
            np.mean(acc_control_list),
            np.std(acc_control_list),
            np.mean(acc_treat_list),
            np.std(acc_treat_list),
            np.mean(pred_score_control_list),
            np.std(pred_score_control_list),
            np.mean(pred_score_treat_list),
            np.std(pred_score_treat_list),
        ]
    return dic
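# The per-bag test() helper used above is defined elsewhere in the repo. A
# minimal sketch of the evaluation loop it is assumed to perform: the loader
# yields (bag, label) pairs with label 1 for treatment bags, and the model
# returns a bag-level treatment probability thresholded at model.thres. All of
# these interface details are assumptions inferred from the calls above.
def test(args, model, loader):
    model.eval()
    acc_control, acc_treat = [], []
    pred_score_control, pred_score_treat = [], []
    with torch.no_grad():
        for bag, label in loader:
            if args.cuda:
                bag = bag.cuda()
            prob = model(bag).item()  # bag-level probability of treatment
            if int(label) == 1:  # treatment bag
                pred_score_treat.append(prob)
                acc_treat.append(float(prob >= model.thres))
            else:  # control bag
                pred_score_control.append(1 - prob)
                acc_control.append(float(prob < model.thres))
    return acc_control, acc_treat, pred_score_control, pred_score_treat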