Exemple #1
0
def load_data(params, seed):
    """Resolve the P1B1 train/test files and load them with ``load_csv_data``.

    ``cancer_type`` is passed both as the label column and as the one-hot
    column.  When ``params['use_landmark_genes']`` is truthy, the feature
    columns are taken from the ``gdc`` column of ``lincs1000.tsv`` and no
    drop list is passed; otherwise ``case_id`` is dropped and no feature
    column list is passed.

    Args:
        params: Mapping read for the keys ``use_landmark_genes``,
            ``feature_subsample``, ``shuffle``, ``scaling``, ``datatype``
            and ``validation_split``.
        seed: Forwarded unchanged as the ``seed`` keyword.

    Returns:
        Whatever ``p1_common.load_csv_data`` returns; it is called with
        ``return_dataframe=False`` and ``return_header=True``.
    """
    # Kept as two distinct list objects, exactly like the original.
    encoded_cols = ['cancer_type']
    label_cols = ['cancer_type']

    if params['use_landmark_genes']:
        landmark_path = p1_common.get_p1_file(url_p1b1 + 'lincs1000.tsv')
        landmark_table = pd.read_csv(landmark_path, sep='\t')
        feature_cols, dropped_cols = landmark_table['gdc'].tolist(), None
    else:
        feature_cols, dropped_cols = None, ['case_id']

    train_csv = p1_common.get_p1_file(url_p1b1 + file_train)
    test_csv = p1_common.get_p1_file(url_p1b1 + file_test)

    loader_options = {
        'x_cols': feature_cols,
        'y_cols': label_cols,
        'drop_cols': dropped_cols,
        'onehot_cols': encoded_cols,
        'n_cols': params['feature_subsample'],
        'shuffle': params['shuffle'],
        'scaling': params['scaling'],
        'dtype': params['datatype'],
        'validation_split': params['validation_split'],
        'return_dataframe': False,
        'return_header': True,
        'seed': seed,
    }
    return p1_common.load_csv_data(train_csv, test_csv, **loader_options)
Exemple #2
0
def load_data_orig(params, seed):
    """Load the P1B1 feature matrices through ``p1_common.load_X_data``.

    With ``params['with_type']`` truthy, ``cancer_type`` is one-hot encoded
    and only ``case_id`` is dropped; otherwise both columns are dropped and
    nothing is one-hot encoded.  When ``params['use_landmark_genes']`` is
    truthy, the ``gdc`` column of ``lincs1000.tsv`` is passed as ``usecols``
    and the drop list is replaced by ``None``.

    Args:
        params: Mapping read for the keys ``with_type``,
            ``use_landmark_genes``, ``feature_subsample``, ``shuffle``,
            ``scaling``, ``validation_split`` and ``datatype``.
        seed: Forwarded unchanged as the ``seed`` keyword.

    Returns:
        Whatever ``p1_common.load_X_data`` returns for these arguments.
    """
    if not params['with_type']:
        encoded_cols, dropped_cols = None, ['case_id', 'cancer_type']
    else:
        encoded_cols, dropped_cols = ['cancer_type'], ['case_id']

    selected_cols = None
    if params['use_landmark_genes']:
        landmark_path = p1_common.get_p1_file(url_p1b1 + 'lincs1000.tsv')
        landmark_table = pd.read_csv(landmark_path, sep='\t')
        # Passed on as the pandas column object itself, not converted to a list.
        selected_cols = landmark_table['gdc']
        # The landmark selection overrides whichever drop list was chosen above.
        dropped_cols = None

    loader_options = {
        'drop_cols': dropped_cols,
        'onehot_cols': encoded_cols,
        'usecols': selected_cols,
        'n_cols': params['feature_subsample'],
        'shuffle': params['shuffle'],
        'scaling': params['scaling'],
        'validation_split': params['validation_split'],
        'dtype': params['datatype'],
        'seed': seed,
    }
    return p1_common.load_X_data(url_p1b1, file_train, file_test,
                                 **loader_options)
Exemple #3
0
def stage_data():
    """Resolve every P1B3 input file through ``p1_common.get_p1_file``.

    Returns:
        A 9-tuple of the values returned by ``p1_common.get_p1_file``, in
        this order: cell-line expressions, cell-line miRNA, cell-line
        proteome, cell-line kinome, drug descriptors, drug latent
        representation, dose response, test cell lines, test drugs.
    """
    base_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/'

    # Order matters: callers unpack the returned tuple positionally.
    file_names = (
        'P1B3_cellline_expressions.tsv',
        'P1B3_cellline_mirna.tsv',
        'P1B3_cellline_proteome.tsv',
        'P1B3_cellline_kinome.tsv',
        'P1B3_drug_descriptors.tsv',
        'P1B3_drug_latent.csv',
        'P1B3_dose_response.csv',
        'P1B3_test_celllines.txt',
        'P1B3_test_drugs.txt',
    )

    return tuple(p1_common.get_p1_file(base_url + file_name)
                 for file_name in file_names)
def get_file(url):
    """Delegate to ``p1_common.get_p1_file`` and return its result for *url*.

    Args:
        url: Passed through unchanged to ``p1_common.get_p1_file``.

    Returns:
        Whatever ``p1_common.get_p1_file`` returns.
    """
    resolved = p1_common.get_p1_file(url)
    return resolved