def run(args):
    """Train and evaluate the selected model on each of the five CV folds.

    For every fold, the fold's train/validation CSVs and the shared test CSV
    are loaded, pre-processed with ``process_data`` and handed to ``dist_run``
    through ``mp.spawn``, using one process per CUDA device reported by
    ``torch.cuda.device_count()``.

    Args:
        args: Parsed options. Reads ``model_name``, ``data_path``,
            ``dataset``, ``split`` and ``is_mixed``.

    Raises:
        ValueError: If ``args.is_mixed`` is set and ``args.dataset`` is
            neither ``'DAVIS'`` nor ``'KIBA'`` (previously this surfaced as
            an unbound-variable error in the middle of the first fold).
    """
    print('Load data...')

    # load model
    # NOTE(review): eval() runs whatever expression is in args.model_name.
    # Acceptable only while that value comes from a trusted command line;
    # a name -> class lookup table would be safer.
    model = eval(args.model_name)()

    CVs = ['CV1', 'CV2', 'CV3', 'CV4', 'CV5']
    data_path = args.data_path + args.dataset + '/'

    # The mixed training set is taken from the *other* benchmark.
    mixed_sources = {'DAVIS': 'KIBA', 'KIBA': 'DAVIS'}
    if args.is_mixed and args.dataset not in mixed_sources:
        raise ValueError(
            "is_mixed requires dataset to be 'DAVIS' or 'KIBA', got {!r}".format(
                args.dataset))

    for CV in CVs:
        print('><<><><><><><><><><><><><><><><><><><><><><><><><<><><><><><>')
        print('start {}'.format(CV))

        # ------------------------- load the data -------------------------
        train_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'train' + '.csv'
        val_file = CV + '_' + args.dataset + '_' + args.split + '_' + 'val' + '.csv'
        test = 'test_' + args.dataset + '_' + args.split + '.csv'

        train_data = pd.read_csv(data_path + CV + '/' + train_file)
        val_data = pd.read_csv(data_path + CV + '/' + val_file)
        test_data = pd.read_csv(data_path + test)

        if args.is_mixed:
            # load the mixed data
            mixed_dataset = mixed_sources[args.dataset]
            mixed_data_file = mixed_dataset + '_mixed_train_unseenP_seenD.csv'
            mixed_data = pd.read_csv(data_path + mixed_data_file)

            # Drop every mixed row whose protein sequence also occurs in this
            # fold's validation set.
            val_t = val_data['Target Sequence'].unique()
            mixed_t = mixed_data['Target Sequence'].unique()
            shared_targets = list(set(val_t).intersection(set(mixed_t)))
            mixed_data = mixed_data[~mixed_data['Target Sequence'].isin(shared_targets)]
            mixed_set = process_data(mixed_data)
        else:
            mixed_set = None

        # pre-processing the data
        train_set = process_data(train_data)
        val_set = process_data(val_data)
        test_set = process_data(test_data)

        world_size = torch.cuda.device_count()
        print('Let\'s use', world_size, 'GPUs!')
        mp.spawn(dist_run,
                 args=(args, world_size, train_set, mixed_set, val_set,
                       test_set, model, CV),
                 nprocs=world_size,
                 join=True)
def main():
    """Ingest every CSV under ``../exampleco_data/`` into PostgreSQL.

    Opens a connection from ``config()``, makes sure the tables exist, then
    for each CSV file: reads it, runs ``process_data`` on it (passing the file
    name without extension), and inserts the result, printing the record
    counts before and after each insert.

    The connection is closed on every exit path, including when an
    unexpected exception propagates out of the ingestion.
    """
    # connect to the PostgreSQL server
    params_dic = config()
    conn = connect_to_db(params_dic)

    try:
        create_tables(conn)

        # map the data into database
        try:
            # read csv file from the path
            for csv_path in glob.glob('../exampleco_data/*.csv'):
                data = pd.read_csv(csv_path)

                # extract file name (text before the first '.')
                file_name = os.path.basename(csv_path).split('.')[0]
                processed_data = process_data(data, file_name)

                print('\n')
                print('Pre data ingestion:')
                count_records(conn)
                print('\n')

                insert_records(conn, processed_data)

                print('\n')
                print('Post data ingestion:')
                count_records(conn)
                print('\n')
        except IOError:
            # An unreadable file stops the ingestion; later files are skipped.
            print(f"File {csv_path} doesn't exist or isn't readable")
    finally:
        # close the connection to PostgreSQL server
        conn.close()
# Upload view (presumably a Flask route -- the decorator is not visible here).
# On POST: reads the uploaded "input_file" as UTF-8 text, passes it to
# process_data(), and returns the result as an attachment named result.csv.
# For any other method it returns a triple-quoted string whose content is cut
# off in this view, so the rest of the function is not documented.
def file_summer_page(): if request.method == "POST": input_file = request.files["input_file"] input_data = input_file.stream.read().decode("utf-8") output_data = process_data(input_data) response = make_response(output_data) response.headers["Content-Disposition"] = "attachment; filename=result.csv" return response return '''
# Upload view (presumably a Flask route -- the decorator is not visible here).
# On POST: reads the uploaded "submission" as UTF-8 text, passes it to
# processing.process_data(), and returns the result as an attachment named
# yourfile.csv. For any other method it returns a triple-quoted string whose
# content is cut off in this view, so the rest of the function is not
# documented.
def sumfile_page(): if request.method == "POST": submission = request.files["submission"] input_data = submission.stream.read().decode("utf-8") output_data = processing.process_data(input_data) response = make_response(output_data) response.headers[ "Content-Disposition"] = "attachment; filename=yourfile.csv" return response return '''
def main(am, t):
    """Run the per-jet pipeline and write the ``submit_TWN1`` submission.

    Args:
        am: When truthy, ``process_data('train.csv', 'test.csv')`` is run
            first.
        t: When truthy, ``grid_search_cross_validation`` is run with the
            degree/lambda lists below before learning.
    """
    if am:
        process_data('train.csv', 'test.csv')

    # Containers handed to load_data_sets (presumably filled in place, one
    # entry per jet group -- confirm against load_data_sets).
    y_train_jets, tx_train_jets, ids_train_jets = [], [], []
    y_test_jets, tx_test_jets, ids_test_jets = [], [], []
    load_data_sets(y_train_jets, tx_train_jets, ids_train_jets,
                   y_test_jets, tx_test_jets, ids_test_jets)

    # One polynomial degree and one lambda per jet group.
    degree_best_jets = [6, 6, 6, 6]
    lambda_best_jets = [3e-05, 0.0023, 4.6e-9, 5.7e-05]
    if t:
        grid_search_cross_validation(degree_best_jets, lambda_best_jets,
                                     y_train_jets, tx_train_jets)

    predictions, ids_predicted = [], []
    learn(predictions, ids_predicted, y_train_jets, tx_train_jets,
          tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets)
    combine_and_create_submission(predictions, ids_predicted, 'submit_TWN1')
def main():
    """Process, train on and report every raw dataset in turn.

    Datasets and their names come from ``processing.get_raw_data()`` and
    ``processing.get_raw_names()`` and are paired by position.
    """
    raw_datasets = processing.get_raw_data()
    dataset_names = processing.get_raw_names()

    for position, raw in enumerate(raw_datasets):
        # Index lookup (not zip) so a missing name still raises IndexError.
        name = dataset_names[position]
        X, y, target_names = processing.process_data(raw)
        model_dict = training.train_model(X, y, target_names, name)
        training.print_elapsed_time(model_dict)
        reporting.produce_report(model_dict)
def main(args):
    """Train a GraphDTA model on one DAVIS fold and test the best checkpoint.

    After every epoch the model is scored on the validation split. Whenever
    the average per-group CI improves, the weights are saved. After the last
    epoch the saved weights are reloaded and scored on the test split.
    Validation and test metrics are appended to a text file under
    ``bestResult/``.

    Args:
        args: Parsed options. Reads ``rounds`` (fold number used in the file
            names), ``cudanum``, ``model`` (index into
            ``[GINConvNet, GATNet, GAT_GCN, GCNNet]``), ``lr``, ``batchsize``
            and ``epochs``.
    """
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds
    # Predictions are scored in consecutive chunks of this many rows
    # (presumably the number of drugs per DAVIS target -- TODO confirm).
    group_size = 68

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam

    # Load data
    train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv")
    val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv")
    test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv")

    train_set = process_data(train_data, 'train')
    val_set = process_data(val_data, 'val')
    test_set = process_data(test_data, 'test')

    train_generator = TestbedDataset(root='dataset', dataset='DAVIS_train' + str(rounds),
                                     xd=train_set[0], xt=train_set[1],
                                     y=train_set[2], smile_graph=train_set[3])
    val_generator = TestbedDataset(root='dataset', dataset='DAVIS_val' + str(rounds),
                                   xd=val_set[0], xt=val_set[1],
                                   y=val_set[2], smile_graph=val_set[3])
    test_generator = TestbedDataset(root='dataset', dataset='DAVIS_test',
                                    xd=test_set[0], xt=test_set[1],
                                    y=test_set[2], smile_graph=test_set[3])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator, batch_size=args.batchsize, shuffle=True)
    val_loader = DataLoader(val_generator, batch_size=args.batchsize, shuffle=False)
    test_loader = DataLoader(test_generator, batch_size=args.batchsize, shuffle=False)

    # Output locations are fixed for the whole run, so build them once.
    result_path = "bestResult/GraphDTA_"+model_st+"_davis_result"+str(args.rounds)+".txt"
    model_name = "bestModel/GraphDTA_"+model_st+"_davis_"+str(rounds)+".model"

    # Open the result file once and close it deterministically; the previous
    # code re-opened it every epoch and never closed any of the handles.
    with open(result_path, 'a') as files:
        # Training...
        print("Training.....")
        for epoch in range(args.epochs):
            print("===============Go for Training===============")
            train_loss = train(model, device, train_loader, optimizer, epoch+1)

            # Validation...
            G, P = predicting(model, device, val_loader)
            val_ci = ci(G, P)

            # Calculate Weighted CI, Average CI of validation set.
            # All chunks have the same size, so the size-weighted mean equals
            # the plain mean; both labels report the same number.
            lens = int(len(G)/group_size)
            average_ci = np.mean([ci(G[x*group_size:(x+1)*group_size],
                                     P[x*group_size:(x+1)*group_size])
                                  for x in range(0, lens)])

            print("===============Go for Validation===============")
            print("Weighted CI:", average_ci)
            print("Average CI:", average_ci)
            print("Overall CI:", val_ci)

            files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(average_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n')
            # Keep per-epoch progress visible on disk during long runs.
            files.flush()

            # Save the best result
            if average_ci > best_ci:
                best_ci = average_ci
                best_epoch = epoch
                best_train_loss = train_loss
                # Save best model
                print("Saving the best model...")
                torch.save(model.state_dict(), model_name)

        print("===============Go for Testing===============")
        # Load the model
        # NOTE(review): if no epoch improved on best_ci in this run, this
        # loads whatever checkpoint already exists at model_name (or fails).
        model.load_state_dict(torch.load(model_name))

        # Testing...
        test_G, test_P = predicting(model, device, test_loader)
        test_CI, test_MSE = ci(test_G, test_P), mse(test_G, test_P)

        # Calculate Weighted CI, Average CI of testing set
        t_lens = int(len(test_G)/group_size)
        test_average_ci = np.mean([ci(test_G[x*group_size:(x+1)*group_size],
                                      test_P[x*group_size:(x+1)*group_size])
                                   for x in range(0, t_lens)])

        # Save the testing result
        files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
                    str(test_average_ci) + ", test_weightedCI:" + str(test_average_ci) +
                    ", test_overallCI:" + str(test_CI) + "\n")
        files.write("best_epoch:" + str(best_epoch + 1) +
                    ", best_train_loss:" + str(best_train_loss) + "\n")
def _grouped_ci(labels, preds, group_sizes, metric):
    """Score consecutive slices of ``labels``/``preds`` with ``metric``.

    Returns ``(weighted, plain)``: the per-group scores multiplied by the
    group size, and the raw per-group scores. Groups for which ``metric``
    raises are skipped (best effort), but their rows are still consumed.
    """
    weighted, plain = [], []
    start = 0
    for size in group_sizes:
        try:
            score = metric(labels[start:start + size], preds[start:start + size])
        except Exception:
            # Deliberate best effort: a group the metric cannot score is
            # left out of both lists.
            pass
        else:
            weighted.append(size * score)
            plain.append(score)
        start += size
    return weighted, plain


def main(args):
    """Train a GraphDTA model on one KIBA fold and test the best checkpoint.

    After every epoch the model is scored on the validation split. Whenever
    the average per-group CI improves, the weights are saved. After the last
    epoch the saved weights are reloaded and scored on the test split.
    Validation and test metrics are appended to a text file under
    ``bestResult/``.

    Args:
        args: Parsed options. Reads ``rounds`` (fold number used in the file
            names), ``cudanum``, ``model`` (index into
            ``[GINConvNet, GATNet, GAT_GCN, GCNNet]``), ``lr``, ``batchsize``
            and ``epochs``.
    """
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam

    # Load data
    train_data = pd.read_csv("../../Data/KIBA/CV" + str(rounds) + "/CV" +
                             str(rounds) + "_KIBA_unseenP_seenD_train.csv")
    val_data = pd.read_csv("../../Data/KIBA/CV" + str(rounds) + "/CV" +
                           str(rounds) + "_KIBA_unseenP_seenD_val.csv")
    test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")

    train_set = process_data(train_data, 'train')
    val_set = process_data(val_data, 'val')
    test_set = process_data(test_data, 'test')

    train_generator = TestbedDataset(root='dataset', dataset='KIBA_train' + str(rounds),
                                     xd=train_set[0], xt=train_set[1],
                                     y=train_set[2], smile_graph=train_set[3])
    val_generator = TestbedDataset(root='dataset', dataset='KIBA_val' + str(rounds),
                                   xd=val_set[0], xt=val_set[1],
                                   y=val_set[2], smile_graph=val_set[3])
    test_generator = TestbedDataset(root='dataset', dataset='KIBA_test',
                                    xd=test_set[0], xt=test_set[1],
                                    y=test_set[2], smile_graph=test_set[3])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator, batch_size=args.batchsize, shuffle=True)
    val_loader = DataLoader(val_generator, batch_size=args.batchsize, shuffle=False)
    test_loader = DataLoader(test_generator, batch_size=args.batchsize, shuffle=False)

    # Paths are fixed for the whole run, so build them once.
    val_path = "../../Data/KIBA/CV" + str(rounds) + "/CV" + str(rounds) + "_val.txt"
    test_path = "../../Data/KIBA/kiba_len.txt"
    result_path = ("bestResult/GraphDTA_" + model_st + "_kiba_result" +
                   str(args.rounds) + ".txt")
    model_name = "bestModel/GraphDTA_" + model_st + "_kiba_" + str(rounds) + ".model"

    # Open the result file once and close it deterministically; the previous
    # code re-opened it every epoch and never closed any of the handles.
    with open(result_path, 'a') as files:
        # Training...
        print("Training.....")
        for epoch in range(args.epochs):
            print("===============Go for Training===============")
            train_loss = train(model, device, train_loader, optimizer, epoch + 1)

            # Validation...
            G, P = predicting(model, device, val_loader)
            val_ci = ci(G, P)

            # Check if kiba len file exists
            if not path.exists(val_path):
                get_kiba_len()

            # Calculate Weighted CI, Average CI of validation set
            li, lens = cal_len(val_path)
            w_ci, a_ci = _grouped_ci(G, P, li, ci)
            weight_ci, average_ci = np.sum(w_ci) / np.sum(li), np.mean(a_ci)

            print("===============Go for Validation===============")
            print("Weighted CI:", weight_ci)
            print("Average CI:", average_ci)
            print("Overall CI:", val_ci)

            files.write("val_averageCI: " + str(average_ci) +
                        ", val_weightedCI: " + str(weight_ci) +
                        ", val_overallCI: " + str(val_ci) +
                        ", train_loss: " + str(train_loss) + '\n')
            # Keep per-epoch progress visible on disk during long runs.
            files.flush()

            # Save the best result
            if average_ci > best_ci:
                best_ci = average_ci
                best_epoch = epoch
                best_train_loss = train_loss
                # Save best model
                print("Saving the best model...")
                torch.save(model.state_dict(), model_name)

        print("===============Go for Testing===============")
        # Load the model
        # NOTE(review): if no epoch improved on best_ci in this run, this
        # loads whatever checkpoint already exists at model_name (or fails).
        model.load_state_dict(torch.load(model_name))

        # Testing...
        test_G, test_P = predicting(model, device, test_loader)
        test_CI, test_MSE = ci(test_G, test_P), mse(test_G, test_P)

        # Check if kiba len file exists
        if not path.exists(test_path):
            get_kiba_len()

        # Calculate Weighted CI, Average CI of testing set.
        # Fix: score the *test* predictions. The previous code sliced the
        # validation arrays G/P left over from the last epoch.
        t_li, t_lens = cal_len(test_path)
        w_ci, a_ci = _grouped_ci(test_G, test_P, t_li, concordance_index)
        # NOTE(review): the validation branch divides by np.sum(li) while
        # this one divides by cal_len's second return value -- confirm they
        # are the same quantity.
        test_weight_ci, test_average_ci = np.sum(w_ci) / t_lens, np.mean(a_ci)

        # Save the testing result
        files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
                    str(test_average_ci) + ", test_weightedCI:" +
                    str(test_weight_ci) + ", test_overallCI:" + str(test_CI) + "\n")
        files.write("best_epoch:" + str(best_epoch + 1) +
                    ", best_train_loss:" + str(best_train_loss) + "\n")
def main():
    """Build the skip-gram model and train it on generated batches.

    All sizes and hyper-parameters come from the module-level constants
    (``VOCAB_SIZE``, ``EMBED_SIZE``, ``BATCH_SIZE``, ``NUM_SAMPLED``,
    ``LEARNING_RATE``, ``SKIP_WINDOW``, ``NUM_TRAINING_STEP``).
    """
    skip_gram = SkipGramModel(
        VOCAB_SIZE,
        EMBED_SIZE,
        BATCH_SIZE,
        NUM_SAMPLED,
        LEARNING_RATE,
    )
    skip_gram.build_graph()

    # The graph is built before the batch generator is created, as before.
    batches = process_data(VOCAB_SIZE, SKIP_WINDOW, BATCH_SIZE)
    train_model(skip_gram, batches, NUM_TRAINING_STEP)
def _clean_column_names(columns):
    """Return the names with parentheses removed and spaces/slashes as '_'."""
    return [
        col.replace('(', '').replace(')', '').replace(' ', '_').replace('/', '_')
        for col in columns
    ]


def main(filename):
    """Run the full pipeline: load, split, featurise, grid-search, evaluate.

    For every train/test split returned by ``train_test_split`` the data is
    processed, round-tripped through the database, turned into features and
    labels, and used to fit every hyper-parameter combination of every
    algorithm below. Predictions are stored in the database and one summary
    row per fitted model is appended to the ``performance_metrics`` schema.

    Args:
        filename: Passed to ``load_data`` together with the database engine.
    """
    print('loading data')

    # Establish database connection
    with open(
            '/data/groups/schools1/mlpolicylab_fall20_schools1/pipeline/db_info.yaml',
            'r') as f:
        db_params = yaml.safe_load(f)['db']

    # 'postgresql' is the dialect name SQLAlchemy accepts on every version;
    # the old 'postgres' alias is rejected by SQLAlchemy >= 1.4.
    engine = create_engine('postgresql://:@{host}:{port}/{dbname}'.format(
        host=db_params['host'],
        port=db_params['port'],
        dbname=db_params['dbname'],
    ))

    # Load data from database to dataframe
    df = load_data(filename, engine)

    # Split dataframe into train and test data.
    splits, years_reference = train_test_split(df)

    # Algorithms and their search grids do not depend on the split, so they
    # are defined once instead of being rebuilt for every split.
    algos = ["Logistic", "SVM", "RandomForest", "DecisionTree"]
    gs_params = {
        "Logistic":
        ParameterGrid({
            'solver': ['lbfgs', 'liblinear', 'saga'],
            'C': [0.001, 0.01, 0.1, 1, 2, 5, 10]
        }),
        "SVM":
        ParameterGrid({
            'C': [0.01, 1, 2, 5, 10],
            'kernel': ['rbf', 'sigmoid']
        }),
        "RandomForest":
        ParameterGrid({
            'n_estimators': [30, 50, 100, 500, 1000, 10000],
            'max_depth': [5, 10, 20, 50],
            'min_samples_split': [5, 10, 15],
            'max_features': ['auto', 'log2', 'sqrt']
        }),
        "DecisionTree":
        ParameterGrid({
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 20, 50],
            'min_samples_split': [5, 10, 15]
        })
    }

    for i, (train_df, test_df) in enumerate(splits):
        print(f'processing split {i}')

        # Explore data for each of the cohort
        explore_data(train_df)

        # Process train and test data separately
        updated_df_train = process_data(train_df)
        updated_df_test = process_data(test_df)

        # Upload the test and train data to database for future reference
        # and easy retrieval; column names are cleaned up first.
        updated_df_train.columns = _clean_column_names(updated_df_train.columns)
        updated_df_test.columns = _clean_column_names(updated_df_test.columns)

        # `timestamp` is a module-level name defined outside this function.
        table_name = timestamp + '_' + str(years_reference[i][1]) + '_' + str(
            years_reference[i][0])

        df_to_db(table_name, 'processed_data', updated_df_train,
                 updated_df_test, engine)

        # Retrieve test and train data from database
        processed_train, processed_test = db_to_df(table_name,
                                                   'processed_data', engine)

        # Separate copies for feature and label creation.
        updated_df_train_f = processed_train.copy()
        updated_df_train_l = processed_train.copy()
        updated_df_test_f = processed_test.copy()
        updated_df_test_l = processed_test.copy()

        # Create features for test and train data
        features_train, train_student_ids = create_features(updated_df_train_f)
        features_test, test_student_ids = create_features(updated_df_test_f)

        # Create labels
        label_train = create_label(updated_df_train_l)
        label_test = create_label(updated_df_test_l)

        # Concatenating features and labels to save in the database
        train_concat = pd.concat([features_train, label_train],
                                 axis=1,
                                 sort=False)
        test_concat = pd.concat([features_test, label_test],
                                axis=1,
                                sort=False)

        # Calculating baseline rate using grade 9 gpa and base rate
        baseline_precision = baseline(test_concat, years_reference[i])
        base_rate = sum(train_concat.not_graduated) / len(train_concat)

        # Saving and reading from database
        df_to_db(table_name, 'model_data', train_concat, test_concat, engine)
        model_train, model_test = db_to_df(table_name, 'model_data', engine)

        # The label is the last column of the stored frame.
        features_train = model_train.iloc[:, :-1]
        label_train = model_train.iloc[:, -1]
        features_test = model_test.iloc[:, :-1]
        label_test = model_test.iloc[:, -1]

        # Build model
        print('performing model grid search')
        for model_name in algos:
            params = gs_params[model_name]
            for param in params:
                model = build_model(features_train, label_train, model_name,
                                    param)

                # Perform prediction
                pred_proba_train = prediction(features_train, model)
                pred_proba_test = prediction(features_test, model)

                # Convert prediction probabilities to dataframes for further
                # processing
                pred_train_df = pd.DataFrame(pred_proba_train,
                                             columns=['probability'])
                pred_test_df = pd.DataFrame(pred_proba_test,
                                            columns=['probability'])

                # Retrieve hyperparameters as one "key: value" string
                hyperparameters = ' '.join(
                    ["{}: {}".format(key, param[key]) for key in param.keys()])

                pred_train_df['model'] = model_name
                pred_train_df['params'] = hyperparameters
                pred_test_df['model'] = model_name
                pred_test_df['params'] = hyperparameters

                # Get the prediction scores for test and train data
                predictions_train = pd.concat(
                    [train_student_ids, pred_train_df], axis=1, sort=False)
                predictions_test = pd.concat([test_student_ids, pred_test_df],
                                             axis=1,
                                             sort=False)

                # Calculate the bias metrics
                TPR_gender, FDR_gender = bias_metrics(predictions_test,
                                                      processed_test, 'gender')
                TPR_disadvantagement, FDR_disadvantagement = bias_metrics(
                    predictions_test, processed_test, 'disadvantagement')

                # Load the prediction results to database for creating
                # visualizations
                df_to_db(table_name, 'predictions', predictions_train,
                         predictions_test, engine)

                # Evaluate model
                metric = evaluate_model(features_test,
                                        label_test,
                                        model,
                                        model_name,
                                        baseline_precision,
                                        hyperparameters,
                                        columns=model_train.columns[:-1])

                # saving results
                df_summary = pd.DataFrame({
                    'test_year': years_reference[i][1],
                    'train_since': years_reference[i][0],
                    'algorithm': model_name,
                    'hyperparameters': hyperparameters,
                    'baserate': base_rate,
                    'baseline': [baseline_precision],
                    'precision': metric,
                    'TPR_gender': TPR_gender,
                    'FDR_gender': FDR_gender,
                    'TPR_disadvantagement': TPR_disadvantagement,
                    'FDR_disadvantagement': FDR_disadvantagement
                })
                df_summary.to_sql(name=timestamp,
                                  schema='performance_metrics',
                                  con=engine,
                                  if_exists='append',
                                  index=False)