Code Example #1
def run(args):
    print('Load data...')

    ###### load model
    model = eval(args.model_name)()
 
    CVs = ['CV1','CV2','CV3','CV4','CV5']

    data_path = args.data_path + args.dataset + '/'

    for CV in CVs:
        print('><<><><><><><><><><><><><><><><><><><><><><><><><<><><><><><>')
        print('start {}'.format(CV))

        ##################### load the data ############################
        train_file = CV + '_' + args.dataset + '_' + args.split +'_' + 'train' + '.csv'
        val_file = CV + '_' + args.dataset + '_' +  args.split + '_' + 'val' + '.csv'
        test = 'test_' + args.dataset + '_' + args.split + '.csv'

        # load the data
        train_data = pd.read_csv(data_path + CV + '/' + train_file)
        val_data = pd.read_csv(data_path + CV + '/' + val_file)
        test_data = pd.read_csv(data_path + test)

        if args.is_mixed:
            # mix in training data from the complementary dataset
            if args.dataset == 'DAVIS':
                mixed_dataset = 'KIBA'
            elif args.dataset == 'KIBA':
                mixed_dataset = 'DAVIS'

            # load the mixed training data
            mixed_data_file = mixed_dataset + '_mixed_train_unseenP_seenD.csv'
            mixed_data = pd.read_csv(data_path + mixed_data_file)
            # remove the repeated protein sequence
            val_t = val_data['Target Sequence'].unique()
            mixed_t = mixed_data['Target Sequence'].unique()
            filter1 = list((set(val_t).intersection(set(mixed_t))))
            mixed_data = mixed_data[~mixed_data['Target Sequence'].isin(filter1)]
            mixed_set = process_data(mixed_data) 
        else:
            mixed_set = None
             
        # pre-processing the data
        train_set = process_data(train_data)        
        val_set = process_data(val_data)
        test_set = process_data(test_data)
       
        world_size = torch.cuda.device_count()
        print('Let\'s use', world_size, 'GPUs!')
        mp.spawn(dist_run, args=(args, world_size, train_set,mixed_set,val_set,test_set,model,CV), nprocs=world_size, join=True)
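The args namespace is built outside this excerpt. Below is a minimal sketch of an argparse setup that supplies the attributes run() reads; only the option names come from the code above, the default values are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_name', type=str, default='DeepDTA')      # hypothetical default; eval() resolves this class name
parser.add_argument('--data_path', type=str, default='../../Data/')   # hypothetical data root
parser.add_argument('--dataset', type=str, choices=['DAVIS', 'KIBA'], default='DAVIS')
parser.add_argument('--split', type=str, default='unseenP_seenD')
parser.add_argument('--is_mixed', action='store_true')                # mix in training data from the other dataset
args = parser.parse_args()
run(args)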
Code Example #2
def main():
    # connect to the PostgreSQL server
    params_dic = config()
    conn = connect_to_db(params_dic)
    create_tables(conn)
    
    # ingest the data into the database
    try:
        # read csv file from the path
        for file in glob.glob('../exampleco_data/*.csv'):
            data = pd.read_csv(file)
            # extract file name
            file_name = os.path.basename(file).split('.')[0]
            processed_data = process_data(data, file_name)
            print('\n')
            print('Pre data ingestion:')
            count_records(conn)
            print('\n')
            insert_records(conn, processed_data)
            print('\n')
            print('Post data ingestion:')
            count_records(conn)
            print('\n')
    except IOError:
        print(f"File {file} doesn't exist or isn't readable")
    
    # close the connection to PostgreSQL server
    conn.close()
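The config() and connect_to_db() helpers are not shown in this excerpt. A minimal sketch of how they could be written, assuming the connection parameters live in a database.ini file (the file name and section name are assumptions):

import psycopg2
from configparser import ConfigParser

def config(filename='database.ini', section='postgresql'):
    # Read the connection parameters from an ini file into a dict.
    parser = ConfigParser()
    parser.read(filename)
    return dict(parser.items(section))

def connect_to_db(params_dic):
    # psycopg2 accepts host, port, dbname, user and password as keyword arguments.
    return psycopg2.connect(**params_dic)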
Code Example #3
def file_summer_page():
    if request.method == "POST":
        input_file = request.files["input_file"]
        input_data = input_file.stream.read().decode("utf-8")
        output_data = process_data(input_data)
        response = make_response(output_data)
        response.headers["Content-Disposition"] = "attachment; filename=result.csv"
        return response

    return '''
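The route above is an excerpt (the HTML form returned for GET requests is truncated). A self-contained sketch of the same pattern, with a hypothetical process_data() stand-in and Flask's test client used to exercise the upload, could look like this; none of it is the original project's code:

import io
from flask import Flask, request, make_response

app = Flask(__name__)

def process_data(text):
    # Hypothetical stand-in: append the row sum to each comma-separated line.
    rows = [line.split(',') for line in text.strip().splitlines()]
    return '\n'.join(','.join(row + [str(sum(float(x) for x in row))]) for row in rows)

@app.route('/', methods=['GET', 'POST'])
def file_summer_page():
    if request.method == 'POST':
        input_data = request.files['input_file'].stream.read().decode('utf-8')
        response = make_response(process_data(input_data))
        response.headers['Content-Disposition'] = 'attachment; filename=result.csv'
        return response
    return ('<form method="post" enctype="multipart/form-data">'
            '<input type="file" name="input_file"><input type="submit"></form>')

if __name__ == '__main__':
    with app.test_client() as client:
        resp = client.post('/', data={'input_file': (io.BytesIO(b'1,2,3\n4,5,6'), 'in.csv')})
        print(resp.get_data(as_text=True))  # each line now ends with its sum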
Code Example #4
def sumfile_page():
    if request.method == "POST":
        submission = request.files["submission"]
        input_data = submission.stream.read().decode("utf-8")
        output_data = processing.process_data(input_data)
        response = make_response(output_data)
        response.headers[
            "Content-Disposition"] = "attachment; filename=yourfile.csv"
        return response

    return '''
Code Example #5
File: run.py  Project: yehchunhung/epfl-ml-project1
def main(am, t):
    if am:
        process_data('train.csv', 'test.csv')
    y_train_jets = []
    tx_train_jets = []
    ids_train_jets = []
    y_test_jets = []
    tx_test_jets = []
    ids_test_jets = []
    load_data_sets(y_train_jets, tx_train_jets, ids_train_jets, y_test_jets,
                   tx_test_jets, ids_test_jets)
    degree_best_jets = [6, 6, 6, 6]
    lambda_best_jets = [3e-05, 0.0023, 4.6e-9, 5.7e-05]
    if t:
        grid_search_cross_validation(degree_best_jets, lambda_best_jets,
                                     y_train_jets, tx_train_jets)
    predictions = []
    ids_predicted = []
    learn(predictions, ids_predicted, y_train_jets, tx_train_jets,
          tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets)
    combine_and_create_submission(predictions, ids_predicted, 'submit_TWN1')
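A hypothetical command-line wrapper for this main(); the flag names are assumptions chosen to match the two boolean parameters above (am re-runs the preprocessing, t re-runs the grid-search cross-validation):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run the epfl-ml-project1 pipeline')
    parser.add_argument('--preprocess', action='store_true',
                        help='re-run process_data on train.csv / test.csv')
    parser.add_argument('--tune', action='store_true',
                        help='re-run grid_search_cross_validation instead of using '
                             'the stored best degrees and lambdas')
    cli = parser.parse_args()
    main(cli.preprocess, cli.tune)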
Code Example #6
def main():
    raw_lst = processing.get_raw_data()
    names_lst = processing.get_raw_names()

    # Pair each raw dataset with its name instead of indexing by position
    for raw_data, dataset_name in zip(raw_lst, names_lst):

        X, y, target_names = processing.process_data(raw_data)

        model_dict = training.train_model(X, y, target_names, dataset_name)

        training.print_elapsed_time(model_dict)

        reporting.produce_report(model_dict)
Code Example #7
def main(args):
    """Main function."""
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)    # Adam

    # Load data
    train_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_train.csv")
    val_data = pd.read_csv("../../Data/DAVIS/CV"+str(rounds)+"/CV"+str(rounds)+"_DAVIS_unseenP_seenD_val.csv")
    test_data = pd.read_csv("../../Data/DAVIS/test_DAVIS_unseenP_seenD.csv")

    train_set = process_data(train_data, 'train')
    val_set = process_data(val_data, 'val')
    test_set = process_data(test_data, 'test')

    train_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_train' + str(rounds), xd = train_set[0],
                                xt = train_set[1], y = train_set[2], smile_graph = train_set[3])
    val_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_val' + str(rounds), xd = val_set[0],
                                xt = val_set[1], y = val_set[2], smile_graph = val_set[3])
    test_generator = TestbedDataset(root = 'dataset', dataset = 'DAVIS_test', xd = test_set[0],
                                xt = test_set[1], y = test_set[2], smile_graph = test_set[3])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator, batch_size = args.batchsize, shuffle = True)
    val_loader = DataLoader(val_generator, batch_size = args.batchsize, shuffle = False)
    test_loader = DataLoader(test_generator, batch_size = args.batchsize, shuffle = False)

    # Training...
    print("Training.....")
    for epoch in range(args.epochs):
        print("===============Go for Training===============")
        train_loss = train(model, device, train_loader, optimizer, epoch+1)

        # Validation...
        G, P = predicting(model, device, val_loader)
        val_ci = ci(G, P)

        # Calculate weighted CI and average CI of the validation set.
        # Every DAVIS protein is paired with the same 68 drugs, so all groups
        # have length 68 and the weighted CI equals the plain average CI.
        lens = int(len(G) / 68)
        average_ci = np.mean([ci(G[x*68:(x+1)*68], P[x*68:(x+1)*68]) for x in range(lens)])

        print("===============Go for Validation===============")
        print("Weighted CI:", average_ci)
        print("Average CI:", average_ci)
        print("Overall CI:", val_ci)
        
        files = open("bestResult/GraphDTA_"+model_st+"_davis_result"+str(args.rounds)+".txt",'a')
        files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(average_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n')
        model_name = "bestModel/GraphDTA_"+model_st+"_davis_"+str(rounds)+".model"

        # Save the best result
        if average_ci > best_ci:
            best_ci = average_ci
            best_epoch = epoch
            best_train_loss = train_loss
            # Save best model
            print("Saving the best model...")
            torch.save(model.state_dict(), model_name)

    print("===============Go for Testing===============")
    # Load the model
    model.load_state_dict(torch.load(model_name))

    # Testing...
    test_G, test_P = predicting(model, device, test_loader)
    test_CI, test_MSE = ci(test_G,test_P), mse(test_G,test_P)

    # Calculate Weighted CI, Average CI of testing set
    t_lens = int(len(test_G)/68)
    test_average_ci = np.mean([ci(test_G[x*68:(x+1)*68],test_P[x*68:(x+1)*68]) for x in range(0,t_lens)])

    # Save the testing result and close the log file
    files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
                str(test_average_ci) + ", test_weightedCI:" + str(test_average_ci) +
                ", test_overallCI:" + str(test_CI) + "\n")
    files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" + str(best_train_loss) + "\n")
    files.close()
Code Example #8
def main(args):
    """Main function."""
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam

    # Load data
    train_data = pd.read_csv("../../Data/KIBA/CV" + str(rounds) + "/CV" +
                             str(rounds) + "_KIBA_unseenP_seenD_train.csv")
    val_data = pd.read_csv("../../Data/KIBA/CV" + str(rounds) + "/CV" +
                           str(rounds) + "_KIBA_unseenP_seenD_val.csv")
    test_data = pd.read_csv("../../Data/KIBA/test_KIBA_unseenP_seenD.csv")

    train_set = process_data(train_data, 'train')
    val_set = process_data(val_data, 'val')
    test_set = process_data(test_data, 'test')

    train_generator = TestbedDataset(root='dataset',
                                     dataset='KIBA_train' + str(rounds),
                                     xd=train_set[0],
                                     xt=train_set[1],
                                     y=train_set[2],
                                     smile_graph=train_set[3])
    val_generator = TestbedDataset(root='dataset',
                                   dataset='KIBA_val' + str(rounds),
                                   xd=val_set[0],
                                   xt=val_set[1],
                                   y=val_set[2],
                                   smile_graph=val_set[3])
    test_generator = TestbedDataset(root='dataset',
                                    dataset='KIBA_test',
                                    xd=test_set[0],
                                    xt=test_set[1],
                                    y=test_set[2],
                                    smile_graph=test_set[3])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator,
                              batch_size=args.batchsize,
                              shuffle=True)
    val_loader = DataLoader(val_generator,
                            batch_size=args.batchsize,
                            shuffle=False)
    test_loader = DataLoader(test_generator,
                             batch_size=args.batchsize,
                             shuffle=False)

    # Training...
    print("Training.....")
    for epoch in range(args.epochs):
        print("===============Go for Training===============")
        train_loss = train(model, device, train_loader, optimizer, epoch + 1)

        # Validation...
        G, P = predicting(model, device, val_loader)
        val_ci = ci(G, P)

        val_path = "../../Data/KIBA/CV" + str(rounds) + "/CV" + str(
            rounds) + "_val.txt"
        # Check if kiba len file exists
        if not path.exists(val_path):
            get_kiba_len()

        # Calculate Weighted CI, Average CI of validation set
        li, lens = cal_len(val_path)
        s = 0
        w_ci, a_ci = [], []
        for l in li:
            try:
                w_ci.append(l * ci(G[s:s + l], P[s:s + l]))
                a_ci.append(ci(G[s:s + l], P[s:s + l]))
            except:
                pass
            s += l
        weight_ci, average_ci = np.sum(w_ci) / np.sum(li), np.mean(a_ci)

        print("===============Go for Validation===============")
        print("Weighted CI:", weight_ci)
        print("Average CI:", average_ci)
        print("Overall CI:", val_ci)

        files = open(
            "bestResult/GraphDTA_" + model_st + "_kiba_result" +
            str(args.rounds) + ".txt", 'a')
        files.write("val_averageCI: " + str(average_ci) +
                    ", val_weightedCI: " + str(weight_ci) +
                    ", val_overallCI: " + str(val_ci) + ", train_loss: " +
                    str(train_loss) + '\n')
        model_name = "bestModel/GraphDTA_" + model_st + "_kiba_" + str(
            rounds) + ".model"

        # Save the best result
        if average_ci > best_ci:
            best_ci = average_ci
            best_epoch = epoch
            best_train_loss = train_loss
            # Save best model
            print("Saving the best model...")
            torch.save(model.state_dict(), model_name)

    print("===============Go for Testing===============")
    # Load the model
    model.load_state_dict(torch.load(model_name))

    # Testing...
    test_G, test_P = predicting(model, device, test_loader)
    test_CI, test_MSE = ci(test_G, test_P), mse(test_G, test_P)

    test_path = "../../Data/KIBA/kiba_len.txt"
    # Check if kiba len file exists
    if not path.exists(test_path):
        get_kiba_len()
    # Calculate Weighted CI, Average CI of testing set
    t_li, t_lens = cal_len(test_path)
    s = 0
    w_ci, a_ci = [], []
    for l in t_li:
        try:
            # Score the held-out test predictions, not the validation arrays
            w_ci.append(l * concordance_index(test_G[s:s + l], test_P[s:s + l]))
            a_ci.append(concordance_index(test_G[s:s + l], test_P[s:s + l]))
        except:
            pass
        s += l
    test_weight_ci, test_average_ci = np.sum(w_ci) / t_lens, np.mean(a_ci)

    # Save the testing result and close the log file
    files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
                str(test_average_ci) + ", test_weightedCI:" +
                str(test_weight_ci) + ", test_overallCI:" + str(test_CI) +
                "\n")
    files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" +
                str(best_train_loss) + "\n")
    files.close()
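Both GraphDTA training functions above read the same attributes from args. A minimal sketch of an argparse setup that supplies them; the default values are assumptions, only the attribute names and the model index mapping come from the code above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--rounds', type=int, default=1)       # which CV fold to use (CV1..CV5)
parser.add_argument('--cudanum', type=int, default=0)      # GPU index, used as "cuda:<n>"
parser.add_argument('--model', type=int, default=0)        # 0: GINConvNet, 1: GATNet, 2: GAT_GCN, 3: GCNNet
parser.add_argument('--lr', type=float, default=0.0005)    # hypothetical learning rate
parser.add_argument('--batchsize', type=int, default=512)  # hypothetical batch size
parser.add_argument('--epochs', type=int, default=1000)    # hypothetical epoch count
args = parser.parse_args()
main(args)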
Code Example #9
def main():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED,
                          LEARNING_RATE)
    model.build_graph()
    batch_gen = process_data(VOCAB_SIZE, SKIP_WINDOW, BATCH_SIZE)
    train_model(model, batch_gen, NUM_TRAINING_STEP)
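The upper-case names are module-level hyperparameters defined elsewhere in the project; the values below are only illustrative placeholders to make the call signature concrete:

VOCAB_SIZE = 50000          # number of vocabulary words kept for training
EMBED_SIZE = 128            # dimensionality of the word embeddings
BATCH_SIZE = 128            # training examples per batch
SKIP_WINDOW = 1             # context words taken on each side of the centre word
NUM_SAMPLED = 64            # negative samples drawn per batch
LEARNING_RATE = 1.0         # SGD learning rate
NUM_TRAINING_STEP = 100000  # total number of training steps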
Code Example #10
def main(filename):
    print('loading data')
    # Establish database connection
    with open(
            '/data/groups/schools1/mlpolicylab_fall20_schools1/pipeline/db_info.yaml',
            'r') as f:
        db_params = yaml.safe_load(f)['db']

    engine = create_engine('postgres://:@{host}:{port}/{dbname}'.format(
        host=db_params['host'],
        port=db_params['port'],
        dbname=db_params['dbname'],
    ))
    # Load data from database to dataframe
    df = load_data(filename, engine)

    # Split dataframe into train and test data.
    splits, years_reference = train_test_split(df)

    for i, (train_df, test_df) in enumerate(splits):
        print(f'processing split {i}')

        # Explore the data for each cohort
        explore_data(train_df)

        # Process train and test data separately
        updated_df_train = process_data(train_df)
        updated_df_test = process_data(test_df)

        # Upload the test and train data to the database for future reference and easy retrieval
        # Sanitize column names so they are valid PostgreSQL identifiers
        def clean_column(col):
            return col.replace('(', '').replace(')', '').replace(' ', '_').replace('/', '_')

        updated_df_train.columns = [clean_column(col) for col in updated_df_train.columns]
        updated_df_test.columns = [clean_column(col) for col in updated_df_test.columns]

        table_name = timestamp + '_' + str(years_reference[i][1]) + '_' + str(
            years_reference[i][0])

        df_to_db(table_name, 'processed_data', updated_df_train,
                 updated_df_test, engine)

        # Retrieve test and train data from the database
        processed_train, processed_test = db_to_df(table_name,
                                                   'processed_data', engine)

        updated_df_train_f = processed_train.copy()
        updated_df_train_l = processed_train.copy()
        updated_df_test_f = processed_test.copy()
        updated_df_test_l = processed_test.copy()

        # Create features for test and train data
        features_train, train_student_ids = create_features(updated_df_train_f)
        features_test, test_student_ids = create_features(updated_df_test_f)

        # Create labels
        label_train = create_label(updated_df_train_l)
        label_test = create_label(updated_df_test_l)

        # Concatenating features and labels to save in the database
        train_concat = pd.concat([features_train, label_train],
                                 axis=1,
                                 sort=False)
        test_concat = pd.concat([features_test, label_test],
                                axis=1,
                                sort=False)

        # Calculating baseline rate using grade 9 gpa and base rate
        baseline_precision = baseline(test_concat, years_reference[i])
        base_rate = sum(train_concat.not_graduated) / len(train_concat)

        # Saving and reading from database
        df_to_db(table_name, 'model_data', train_concat, test_concat, engine)
        model_train, model_test = db_to_df(table_name, 'model_data', engine)

        features_train = model_train.iloc[:, :-1]
        label_train = model_train.iloc[:, -1]
        features_test = model_test.iloc[:, :-1]
        label_test = model_test.iloc[:, -1]

        # Build model
        algos = ["Logistic", "SVM", "RandomForest", "DecisionTree"]
        gs_params = {
            "Logistic":
            ParameterGrid({
                'solver': ['lbfgs', 'liblinear', 'saga'],
                'C': [0.001, 0.01, 0.1, 1, 2, 5, 10]
            }),
            "SVM":
            ParameterGrid({
                'C': [0.01, 1, 2, 5, 10],
                'kernel': ['rbf', 'sigmoid']
            }),
            "RandomForest":
            ParameterGrid({
                'n_estimators': [30, 50, 100, 500, 1000, 10000],
                'max_depth': [5, 10, 20, 50],
                'min_samples_split': [5, 10, 15],
                'max_features': ['auto', 'log2', 'sqrt']
            }),
            "DecisionTree":
            ParameterGrid({
                'criterion': ['gini', 'entropy'],
                'max_depth': [5, 10, 20, 50],
                'min_samples_split': [5, 10, 15]
            })
        }

        print('performing model grid search')
        for model_name in algos:
            params = gs_params[model_name]
            for param in params:
                model = build_model(features_train, label_train, model_name,
                                    param)

                # Perform prediction
                pred_proba_train = prediction(features_train, model)
                pred_proba_test = prediction(features_test, model)

                # Convert prediction probabilities to dataframes for further processing
                pred_train_df = pd.DataFrame(pred_proba_train,
                                             columns=['probability'])
                pred_test_df = pd.DataFrame(pred_proba_test,
                                            columns=['probability'])

                # Retrieve hyperparameters for processing
                hyperparameters = ' '.join(
                    ["{}: {}".format(key, param[key]) for key in param.keys()])

                pred_train_df['model'] = model_name
                pred_train_df['params'] = hyperparameters
                pred_test_df['model'] = model_name
                pred_test_df['params'] = hyperparameters

                # Get the prediction scores for test and train data
                predictions_train = pd.concat(
                    [train_student_ids, pred_train_df], axis=1, sort=False)
                predictions_test = pd.concat([test_student_ids, pred_test_df],
                                             axis=1,
                                             sort=False)

                # Calculate the bias metrics
                TPR_gender, FDR_gender = bias_metrics(predictions_test,
                                                      processed_test, 'gender')
                TPR_disadvantagement, FDR_disadvantagement = bias_metrics(
                    predictions_test, processed_test, 'disadvantagement')

                # Load the prediction results to database for creating visualizations
                df_to_db(table_name, 'predictions', predictions_train,
                         predictions_test, engine)

                # Evaluate model
                metric = evaluate_model(features_test,
                                        label_test,
                                        model,
                                        model_name,
                                        baseline_precision,
                                        hyperparameters,
                                        columns=model_train.columns[:-1])

                # saving results
                df_summary = pd.DataFrame({
                    'test_year': years_reference[i][1],
                    'train_since': years_reference[i][0],
                    'algorithm': model_name,
                    'hyperparameters': hyperparameters,
                    'baserate': base_rate,
                    'baseline': [baseline_precision],
                    'precision': metric,
                    'TPR_gender': TPR_gender,
                    'FDR_gender': FDR_gender,
                    'TPR_disadvantagement': TPR_disadvantagement,
                    'FDR_disadvantagement': FDR_disadvantagement
                })
                df_summary.to_sql(name=timestamp,
                                  schema='performance_metrics',
                                  con=engine,
                                  if_exists='append',
                                  index=False)
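The df_to_db() and db_to_df() helpers are project-specific and not shown here. A hypothetical sketch based only on how they are called above (one table name, a schema, a train and a test frame, an SQLAlchemy engine); the _train/_test suffix convention is an assumption:

import pandas as pd

def df_to_db(table_name, schema, train_df, test_df, engine):
    # Persist both splits under a shared base name so they can be reloaded later.
    train_df.to_sql(name=table_name + '_train', schema=schema, con=engine,
                    if_exists='replace', index=False)
    test_df.to_sql(name=table_name + '_test', schema=schema, con=engine,
                   if_exists='replace', index=False)

def db_to_df(table_name, schema, engine):
    # Read both splits back into dataframes.
    train_df = pd.read_sql_table(table_name + '_train', con=engine, schema=schema)
    test_df = pd.read_sql_table(table_name + '_test', con=engine, schema=schema)
    return train_df, test_df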