Example #1
def load_data(params, seed):
    drop_cols = ['case_id']
    onehot_cols = ['cancer_type']
    y_cols = ['cancer_type']

    if params['use_landmark_genes']:
        lincs_file = 'lincs1000.tsv'
        lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file,
                                       'Pilot1')
        df_l1000 = pd.read_csv(lincs_path, sep='\t')
        x_cols = df_l1000['gdc'].tolist()
        drop_cols = None
    else:
        x_cols = None

    train_path = candle.fetch_file(params['url_p1b1'] + params['file_train'],
                                   'Pilot1')
    test_path = candle.fetch_file(params['url_p1b1'] + params['file_test'],
                                  'Pilot1')

    return candle.load_csv_data(train_path,
                                test_path,
                                x_cols=x_cols,
                                y_cols=y_cols,
                                drop_cols=drop_cols,
                                onehot_cols=onehot_cols,
                                n_cols=params['feature_subsample'],
                                shuffle=params['shuffle'],
                                scaling=params['scaling'],
                                dtype=params['datatype'],
                                validation_split=params['validation_split'],
                                return_dataframe=False,
                                return_header=True,
                                seed=seed)
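For context, a hypothetical params dict that would exercise this loader (the keys mirror those read above; the URL and file names are placeholders, not real endpoints):

params = {
    'url_p1b1': 'http://example.org/p1b1/',  # placeholder base URL
    'file_train': 'P1B1.train.csv',          # placeholder file names
    'file_test': 'P1B1.test.csv',
    'use_landmark_genes': True,
    'feature_subsample': 0,
    'shuffle': True,
    'scaling': 'minmax',
    'datatype': 'float32',
    'validation_split': 0.1,
}
# candle.load_csv_data determines the exact return structure; with
# return_header=True it presumably includes the header alongside the splits.
data = load_data(params, seed=2017)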
Example #2
def stage_data():
    server = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/'

    cell_expr_path = candle.fetch_file(server +
                                       'P1B3_cellline_expressions.tsv',
                                       'Pilot1',
                                       untar=False)
    cell_mrna_path = candle.fetch_file(server + 'P1B3_cellline_mirna.tsv',
                                       'Pilot1',
                                       untar=False)
    cell_prot_path = candle.fetch_file(server + 'P1B3_cellline_proteome.tsv',
                                       'Pilot1',
                                       untar=False)
    cell_kino_path = candle.fetch_file(server + 'P1B3_cellline_kinome.tsv',
                                       'Pilot1',
                                       untar=False)
    drug_desc_path = candle.fetch_file(server + 'P1B3_drug_descriptors.tsv',
                                       'Pilot1',
                                       untar=False)
    drug_auen_path = candle.fetch_file(server + 'P1B3_drug_latent.csv',
                                       'Pilot1',
                                       untar=False)
    dose_resp_path = candle.fetch_file(server + 'P1B3_dose_response.csv',
                                       'Pilot1',
                                       untar=False)
    test_cell_path = candle.fetch_file(server + 'P1B3_test_celllines.txt',
                                       'Pilot1',
                                       untar=False)
    test_drug_path = candle.fetch_file(server + 'P1B3_test_drugs.txt',
                                       'Pilot1',
                                       untar=False)

    return (cell_expr_path, cell_mrna_path, cell_prot_path, cell_kino_path,
            drug_desc_path, drug_auen_path, dose_resp_path, test_cell_path,
            test_drug_path)
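A possible call site, purely illustrative, that unpacks the nine returned paths and reads one of the tab-separated files:

import pandas as pd

(cell_expr_path, cell_mrna_path, cell_prot_path, cell_kino_path,
 drug_desc_path, drug_auen_path, dose_resp_path, test_cell_path,
 test_drug_path) = stage_data()
df_expr = pd.read_csv(cell_expr_path, sep='\t')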
Example #3
def load_data(params):

    train_path = candle.fetch_file(params['data_url'] + params['train_data'], 'Pilot1')
    test_path = candle.fetch_file(params['data_url'] + params['test_data'], 'Pilot1')
    
    if params['feature_subsample'] > 0:
        usecols = list(range(params['feature_subsample']))
    else:
        usecols = None

    return candle.load_Xy_data_noheader(train_path,
                                        test_path,
                                        params['classes'],
                                        usecols,
                                        scaling='maxabs',
                                        dtype=params['datatype'])
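Note that usecols here is a list of integer positions, so a positive feature_subsample keeps only the first N columns, which candle presumably forwards to the underlying reader. A minimal standalone illustration of the same idiom with pandas (the file name is hypothetical):

import pandas as pd

# usecols with integer positions selects columns by index;
# this reads only the first three columns of the file
df = pd.read_csv('data.csv', usecols=list(range(3)))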
Example #4
def load_data_orig(params, seed):
    if params['with_type']:
        drop_cols = ['case_id']
        onehot_cols = ['cancer_type']
    else:
        drop_cols = ['case_id', 'cancer_type']
        onehot_cols = None

    if params['use_landmark_genes']:
        lincs_file = 'lincs1000.tsv'
        lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file)
        df_l1000 = pd.read_csv(lincs_path, sep='\t')
        usecols = df_l1000['gdc']
        drop_cols = None
    else:
        usecols = None

    return candle.load_X_data(params['url_p1b1'],
                              params['file_train'],
                              params['file_test'],
                              drop_cols=drop_cols,
                              onehot_cols=onehot_cols,
                              usecols=usecols,
                              n_cols=params['feature_subsample'],
                              shuffle=params['shuffle'],
                              scaling=params['scaling'],
                              validation_split=params['validation_split'],
                              dtype=params['datatype'],
                              seed=seed)
Example #5
def load_data_one_hot(params, seed):
    # fetch data
    file_train = candle.fetch_file(params['data_url'] + params['train_data'],
                                   subdir='Pilot1')
    file_test = candle.fetch_file(params['data_url'] + params['test_data'],
                                  subdir='Pilot1')

    return candle.load_Xy_one_hot_data2(file_train,
                                        file_test,
                                        class_col=['cancer_type'],
                                        drop_cols=['case_id', 'cancer_type'],
                                        n_cols=params['feature_subsample'],
                                        shuffle=params['shuffle'],
                                        scaling=params['scaling'],
                                        validation_split=params['val_split'],
                                        dtype=params['data_type'],
                                        seed=seed)
Example #6
def fetch_data(gParameters):
    """ Downloads and decompresses the data if not locally available.
        Since the training data depends on the model definition it is not loaded,
        instead the local path where the raw data resides is returned
    """

    path = gParameters['data_url']
    fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True)
    
    return fpath
Example #7
def fetch_data(gParameters):
    """ Download and untar data

    Args:
        gParameters: parameters from candle

    Returns:
        path to where the data is located
    """
    path = gParameters['data_url']
    fpath = candle.fetch_file(path + gParameters['train_data'],
                              'Pilot3',
                              untar=True)
    return fpath
Example #8
def get_list_of_data_files(GP):

    import pilot2_datasets as p2
    try:
        from importlib import reload  # Python 3: reload is no longer a builtin
    except ImportError:
        pass  # Python 2: reload is a builtin
    reload(p2)
    print('Reading Data...')
    ## Identify the data set selected
    data_set = p2.data_sets[GP['set_sel']][0]
    ## Get the MD5 hash for the proper data set
    data_hash = p2.data_sets[GP['set_sel']][1]
    print('Reading Data Files... %s->%s' % (GP['set_sel'], data_set))
    ## Check if the data files are in the data directory, otherwise fetch from FTP
    data_file = candle.fetch_file(
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/' +
        data_set + '.tar.gz',
        untar=True,
        subdir='Pilot2')
    data_dir = os.path.join(os.path.dirname(data_file), data_set)
    ## Make a list of all of the data files in the data set
    data_files = glob.glob('%s/*.npz' % data_dir)

    fields = p2.gen_data_set_dict()

    return (data_files, fields)
Example #9
def get_file(url):
    return candle.fetch_file(url, 'Pilot1')
Example #10
def run(gParameters):

    origin = gParameters['data_url']
    train_data = gParameters['train_data']
    data_loc = candle.fetch_file(origin + train_data,
                                 untar=True,
                                 md5_hash=None,
                                 subdir='Pilot3')

    print('Data downloaded and stored at: ' + data_loc)
    data_path = os.path.dirname(data_loc)
    print(data_path)

    kerasDefaults = candle.keras_default_config()

    rnn_size = gParameters['rnn_size']
    n_layers = gParameters['n_layers']
    learning_rate = gParameters['learning_rate']
    dropout = gParameters['dropout']
    recurrent_dropout = gParameters['recurrent_dropout']
    n_epochs = gParameters['epochs']
    data_train = data_path + '/data.pkl'
    verbose = gParameters['verbose']
    savedir = gParameters['output_dir']
    do_sample = gParameters['do_sample']
    temperature = gParameters['temperature']
    primetext = gParameters['primetext']
    length = gParameters['length']

    # load data from pickle
    f = open(data_train, 'rb')

    if (sys.version_info > (3, 0)):
        classes = pickle.load(f, encoding='latin1')
        chars = pickle.load(f, encoding='latin1')
        char_indices = pickle.load(f, encoding='latin1')
        indices_char = pickle.load(f, encoding='latin1')

        maxlen = pickle.load(f, encoding='latin1')
        step = pickle.load(f, encoding='latin1')

        X_ind = pickle.load(f, encoding='latin1')
        y_ind = pickle.load(f, encoding='latin1')
    else:
        classes = pickle.load(f)
        chars = pickle.load(f)
        char_indices = pickle.load(f)
        indices_char = pickle.load(f)

        maxlen = pickle.load(f)
        step = pickle.load(f)

        X_ind = pickle.load(f)
        y_ind = pickle.load(f)

    f.close()

    [s1, s2] = X_ind.shape
    print(X_ind.shape)
    print(y_ind.shape)
    print(maxlen)
    print(len(chars))

    X = np.zeros((s1, s2, len(chars)), dtype=bool)  # np.bool was removed in NumPy 1.24
    y = np.zeros((s1, len(chars)), dtype=bool)

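    # The pickled X_ind/y_ind hold integer character indices; the loops
    # below expand them into one-hot boolean tensors, where X[i, t, c]
    # marks character c at position t of sample i and y[i, c] marks the
    # target character of sample i.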
    for i in range(s1):
        for t in range(s2):
            X[i, t, X_ind[i, t]] = 1
        y[i, y_ind[i]] = 1

    # build the model: a single LSTM
    if verbose:
        print('Build model...')

    model = Sequential()

    # for rnn_size in rnn_sizes:
    for k in range(n_layers):
        if k < n_layers - 1:
            ret_seq = True
        else:
            ret_seq = False

        if k == 0:
            model.add(
                LSTM(rnn_size,
                     input_shape=(maxlen, len(chars)),
                     return_sequences=ret_seq,
                     dropout=dropout,
                     recurrent_dropout=recurrent_dropout))
        else:
            model.add(
                LSTM(rnn_size,
                     dropout=dropout,
                     recurrent_dropout=recurrent_dropout,
                     return_sequences=ret_seq))

    model.add(Dense(len(chars)))
    model.add(Activation(gParameters['activation']))

    optimizer = candle.build_optimizer(gParameters['optimizer'],
                                       gParameters['learning_rate'],
                                       kerasDefaults)

    model.compile(loss=gParameters['loss'], optimizer=optimizer)

    if verbose:
        model.summary()

    for iteration in range(1, n_epochs + 1):
        if verbose:
            print()
            print('-' * 50)
            print('Iteration', iteration)

        history = LossHistory()
        model.fit(X, y, batch_size=100, epochs=1, callbacks=[history])

        loss = history.losses[-1]
        if verbose:
            print(loss)

        dirname = savedir
        if len(dirname) > 0 and not dirname.endswith('/'):
            dirname = dirname + '/'

        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # serialize model to JSON
        model_json = model.to_json()
        with open(
                dirname + "/model_" + str(iteration) + "_" +
                "{:f}".format(loss) + ".json", "w") as json_file:
            json_file.write(model_json)

        # serialize weights to HDF5
        model.save_weights(dirname + "/model_" + str(iteration) + "_" +
                           "{:f}".format(loss) + ".h5")

        if verbose:
            print("Checkpoint saved.")

        if do_sample:
            outtext = open(dirname + "/example_" + str(iteration) + "_" +
                           "{:f}".format(loss) + ".txt",
                           "w",
                           encoding='utf-8')

            diversity = temperature

            outtext.write('----- diversity:' + str(diversity) + "\n")

            generated = ''
            seedstr = primetext

            outtext.write('----- Generating with seed: "' + seedstr + '"' +
                          "\n")

            sentence = " " * maxlen

            # class_index = 0
            generated += sentence
            outtext.write(generated)

            for c in seedstr:
                sentence = sentence[1:] + c
                x = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = model.predict(x, verbose=verbose)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += c

                outtext.write(c)

            for i in range(length):
                x = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = model.predict(x, verbose=verbose)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

            if (sys.version_info > (3, 0)):
                outtext.write(generated + '\n')
            else:
                outtext.write(generated.decode('utf-8').encode('utf-8') + '\n')

            outtext.close()
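Neither LossHistory nor sample is defined in this excerpt. Minimal sketches of what they presumably look like, following the standard Keras character-RNN recipe (these are assumptions, not the benchmark's actual definitions):

import numpy as np
from tensorflow.keras.callbacks import Callback  # or keras.callbacks, depending on the setup


class LossHistory(Callback):
    """Records the training loss after every batch (assumed definition)."""

    def on_train_begin(self, logs=None):
        self.losses = []

    def on_batch_end(self, batch, logs=None):
        self.losses.append(logs.get('loss'))


def sample(preds, temperature=1.0):
    """Draw the next character index from a softmax output,
    re-weighted by the sampling temperature (assumed definition)."""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature  # epsilon guards log(0)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)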