def load_data(params, seed):
    # Default column handling: drop the sample identifier and one-hot
    # encode the cancer type, which also serves as the label.
    drop_cols = ['case_id']
    onehot_cols = ['cancer_type']
    y_cols = ['cancer_type']

    if params['use_landmark_genes']:
        # Restrict the feature set to the LINCS L1000 landmark genes.
        lincs_file = 'lincs1000.tsv'
        lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file, 'Pilot1')
        df_l1000 = pd.read_csv(lincs_path, sep='\t')
        x_cols = df_l1000['gdc'].tolist()
        drop_cols = None
    else:
        x_cols = None

    train_path = candle.fetch_file(params['url_p1b1'] + params['file_train'], 'Pilot1')
    test_path = candle.fetch_file(params['url_p1b1'] + params['file_test'], 'Pilot1')

    return candle.load_csv_data(train_path, test_path,
                                x_cols=x_cols,
                                y_cols=y_cols,
                                drop_cols=drop_cols,
                                onehot_cols=onehot_cols,
                                n_cols=params['feature_subsample'],
                                shuffle=params['shuffle'],
                                scaling=params['scaling'],
                                dtype=params['datatype'],
                                validation_split=params['validation_split'],
                                return_dataframe=False,
                                return_header=True,
                                seed=seed)
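# A minimal usage sketch for load_data above. All values are hypothetical
# placeholders (real ones come from the CANDLE-parsed benchmark config);
# the URL and file names in particular are illustrative only.
def _example_load_data():
    params = {
        'use_landmark_genes': False,             # skip the LINCS L1000 filter
        'url_p1b1': 'http://example.org/P1B1/',  # placeholder mirror URL
        'file_train': 'train.csv',               # hypothetical file name
        'file_test': 'test.csv',                 # hypothetical file name
        'feature_subsample': 0,                  # 0 = keep all feature columns
        'shuffle': True,
        'scaling': 'minmax',
        'datatype': 'float32',
        'validation_split': 0.1,
    }
    return load_data(params, seed=2017)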
def stage_data():
    server = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B3/'

    cell_expr_path = candle.fetch_file(server + 'P1B3_cellline_expressions.tsv', 'Pilot1', untar=False)
    cell_mrna_path = candle.fetch_file(server + 'P1B3_cellline_mirna.tsv', 'Pilot1', untar=False)
    cell_prot_path = candle.fetch_file(server + 'P1B3_cellline_proteome.tsv', 'Pilot1', untar=False)
    cell_kino_path = candle.fetch_file(server + 'P1B3_cellline_kinome.tsv', 'Pilot1', untar=False)
    drug_desc_path = candle.fetch_file(server + 'P1B3_drug_descriptors.tsv', 'Pilot1', untar=False)
    drug_auen_path = candle.fetch_file(server + 'P1B3_drug_latent.csv', 'Pilot1', untar=False)
    dose_resp_path = candle.fetch_file(server + 'P1B3_dose_response.csv', 'Pilot1', untar=False)
    test_cell_path = candle.fetch_file(server + 'P1B3_test_celllines.txt', 'Pilot1', untar=False)
    test_drug_path = candle.fetch_file(server + 'P1B3_test_drugs.txt', 'Pilot1', untar=False)

    return (cell_expr_path, cell_mrna_path, cell_prot_path, cell_kino_path,
            drug_desc_path, drug_auen_path, dose_resp_path,
            test_cell_path, test_drug_path)
def load_data(params):
    train_path = candle.fetch_file(params['data_url'] + params['train_data'], 'Pilot1')
    test_path = candle.fetch_file(params['data_url'] + params['test_data'], 'Pilot1')

    if params['feature_subsample'] > 0:
        # Keep only the first feature_subsample columns.
        usecols = list(range(params['feature_subsample']))
    else:
        usecols = None

    return candle.load_Xy_data_noheader(train_path, test_path,
                                        params['classes'],
                                        usecols,
                                        scaling='maxabs',
                                        dtype=params['datatype'])
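# Usage sketch for the no-header loader above (hedged: the exact return
# structure is whatever candle.load_Xy_data_noheader yields; separate
# train/test X and Y arrays are assumed here):
#
#     X_train, Y_train, X_test, Y_test = load_data(params)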
def load_data_orig(params, seed):
    if params['with_type']:
        drop_cols = ['case_id']
        onehot_cols = ['cancer_type']
    else:
        drop_cols = ['case_id', 'cancer_type']
        onehot_cols = None

    if params['use_landmark_genes']:
        # Restrict the feature set to the LINCS L1000 landmark genes.
        # (The original referenced a bare `url_p1b1`, which is undefined in
        # this scope; the URL lives in params, as in the rest of the function.)
        lincs_file = 'lincs1000.tsv'
        lincs_path = candle.fetch_file(params['url_p1b1'] + lincs_file, 'Pilot1')
        df_l1000 = pd.read_csv(lincs_path, sep='\t')
        usecols = df_l1000['gdc']
        drop_cols = None
    else:
        usecols = None

    return candle.load_X_data(params['url_p1b1'],
                              params['file_train'],
                              params['file_test'],
                              drop_cols=drop_cols,
                              onehot_cols=onehot_cols,
                              usecols=usecols,
                              n_cols=params['feature_subsample'],
                              shuffle=params['shuffle'],
                              scaling=params['scaling'],
                              validation_split=params['validation_split'],
                              dtype=params['datatype'],
                              seed=seed)
def load_data_one_hot(params, seed):
    # fetch data
    file_train = candle.fetch_file(params['data_url'] + params['train_data'], subdir='Pilot1')
    file_test = candle.fetch_file(params['data_url'] + params['test_data'], subdir='Pilot1')

    return candle.load_Xy_one_hot_data2(file_train, file_test,
                                        class_col=['cancer_type'],
                                        drop_cols=['case_id', 'cancer_type'],
                                        n_cols=params['feature_subsample'],
                                        shuffle=params['shuffle'],
                                        scaling=params['scaling'],
                                        validation_split=params['val_split'],
                                        dtype=params['data_type'],
                                        seed=seed)
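# Usage sketch (hedged: the split structure is an assumption based on the
# loader's name; candle.load_Xy_one_hot_data2 is taken here to return
# train/validation/test pairs):
#
#     (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
#         load_data_one_hot(params, seed=2017)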
def fetch_data(gParameters):
    """Download and decompress the data if it is not available locally.

    Because the training data depends on the model definition, it is not
    loaded here; instead, the local path to the raw data is returned.
    """
    path = gParameters['data_url']
    fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True)
    return fpath
def fetch_data(gParameters):
    """Download and untar the data.

    Args:
        gParameters: parameter dictionary from CANDLE

    Returns:
        Path to where the data is located.
    """
    path = gParameters['data_url']
    fpath = candle.fetch_file(path + gParameters['train_data'], 'Pilot3', untar=True)
    return fpath
def get_list_of_data_files(GP):
    import pilot2_datasets as p2
    try:
        from importlib import reload  # Python 3: reload is no longer a builtin
    except ImportError:
        pass  # Python 2: the builtin reload is available
    reload(p2)
    print('Reading Data...')
    # Identify the selected data set and its MD5 hash (the hash is looked up
    # but not passed to fetch_file below).
    data_set = p2.data_sets[GP['set_sel']][0]
    data_hash = p2.data_sets[GP['set_sel']][1]
    print('Reading Data Files... %s->%s' % (GP['set_sel'], data_set))
    # Use the cached copy in the data directory if present; otherwise fetch
    # the archive from the FTP server and untar it.
    data_file = candle.fetch_file(
        'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/' + data_set + '.tar.gz',
        untar=True, subdir='Pilot2')
    data_dir = os.path.join(os.path.dirname(data_file), data_set)
    # Make a list of all of the .npz files in the data set.
    data_files = glob.glob('%s/*.npz' % data_dir)
    fields = p2.gen_data_set_dict()
    return (data_files, fields)
def get_file(url):
    return candle.fetch_file(url, 'Pilot1')
def run(gParameters):
    origin = gParameters['data_url']
    train_data = gParameters['train_data']
    data_loc = candle.fetch_file(origin + train_data, untar=True, md5_hash=None, subdir='Pilot3')

    print('Data downloaded and stored at: ' + data_loc)
    data_path = os.path.dirname(data_loc)
    print(data_path)

    kerasDefaults = candle.keras_default_config()

    rnn_size = gParameters['rnn_size']
    n_layers = gParameters['n_layers']
    learning_rate = gParameters['learning_rate']
    dropout = gParameters['dropout']
    recurrent_dropout = gParameters['recurrent_dropout']
    n_epochs = gParameters['epochs']
    data_train = data_path + '/data.pkl'
    verbose = gParameters['verbose']
    savedir = gParameters['output_dir']
    do_sample = gParameters['do_sample']
    temperature = gParameters['temperature']
    primetext = gParameters['primetext']
    length = gParameters['length']

    # Load the preprocessed corpus from the pickle. Python 3 needs an
    # explicit encoding to read pickles written under Python 2.
    f = open(data_train, 'rb')
    if sys.version_info > (3, 0):
        classes = pickle.load(f, encoding='latin1')
        chars = pickle.load(f, encoding='latin1')
        char_indices = pickle.load(f, encoding='latin1')
        indices_char = pickle.load(f, encoding='latin1')
        maxlen = pickle.load(f, encoding='latin1')
        step = pickle.load(f, encoding='latin1')
        X_ind = pickle.load(f, encoding='latin1')
        y_ind = pickle.load(f, encoding='latin1')
    else:
        classes = pickle.load(f)
        chars = pickle.load(f)
        char_indices = pickle.load(f)
        indices_char = pickle.load(f)
        maxlen = pickle.load(f)
        step = pickle.load(f)
        X_ind = pickle.load(f)
        y_ind = pickle.load(f)
    f.close()

    [s1, s2] = X_ind.shape
    print(X_ind.shape)
    print(y_ind.shape)
    print(maxlen)
    print(len(chars))

    # Expand the index arrays into one-hot tensors.
    # (np.bool was removed in NumPy 1.24; the builtin bool is equivalent here.)
    X = np.zeros((s1, s2, len(chars)), dtype=bool)
    y = np.zeros((s1, len(chars)), dtype=bool)

    for i in range(s1):
        for t in range(s2):
            X[i, t, X_ind[i, t]] = 1
        y[i, y_ind[i]] = 1

    # Build the model: a stack of LSTM layers; only the last one collapses
    # the sequence dimension.
    if verbose:
        print('Build model...')

    model = Sequential()
    for k in range(n_layers):
        ret_seq = k < n_layers - 1
        if k == 0:
            model.add(LSTM(rnn_size,
                           input_shape=(maxlen, len(chars)),
                           return_sequences=ret_seq,
                           dropout=dropout,
                           recurrent_dropout=recurrent_dropout))
        else:
            model.add(LSTM(rnn_size,
                           dropout=dropout,
                           recurrent_dropout=recurrent_dropout,
                           return_sequences=ret_seq))

    model.add(Dense(len(chars)))
    model.add(Activation(gParameters['activation']))

    optimizer = candle.build_optimizer(gParameters['optimizer'],
                                       gParameters['learning_rate'],
                                       kerasDefaults)

    model.compile(loss=gParameters['loss'], optimizer=optimizer)

    if verbose:
        model.summary()

    for iteration in range(1, n_epochs + 1):
        if verbose:
            print()
            print('-' * 50)
            print('Iteration', iteration)

        history = LossHistory()
        model.fit(X, y, batch_size=100, epochs=1, callbacks=[history])

        loss = history.losses[-1]
        if verbose:
            print(loss)

        dirname = savedir
        if len(dirname) > 0 and not dirname.endswith('/'):
            dirname = dirname + '/'
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        # Serialize the model to JSON and the weights to HDF5, with the
        # iteration number and loss in the file names.
        model_json = model.to_json()
        stem = dirname + 'model_' + str(iteration) + '_' + '{:f}'.format(loss)
        with open(stem + '.json', 'w') as json_file:
            json_file.write(model_json)
        model.save_weights(stem + '.h5')

        if verbose:
            print('Checkpoint saved.')

        if do_sample:
            outtext = open(dirname + 'example_' + str(iteration) + '_' +
                           '{:f}'.format(loss) + '.txt', 'w', encoding='utf-8')

            diversity = temperature
            outtext.write('----- diversity:' + str(diversity) + '\n')

            generated = ''
            seedstr = primetext
            outtext.write('----- Generating with seed: "' + seedstr + '"' + '\n')
sentence = " " * maxlen # class_index = 0 generated += sentence outtext.write(generated) for c in seedstr: sentence = sentence[1:] + c x = np.zeros((1, maxlen, len(chars))) for t, char in enumerate(sentence): x[0, t, char_indices[char]] = 1. preds = model.predict(x, verbose=verbose)[0] next_index = sample(preds, diversity) next_char = indices_char[next_index] generated += c outtext.write(c) for i in range(length): x = np.zeros((1, maxlen, len(chars))) for t, char in enumerate(sentence): x[0, t, char_indices[char]] = 1. preds = model.predict(x, verbose=verbose)[0] next_index = sample(preds, diversity) next_char = indices_char[next_index] generated += next_char sentence = sentence[1:] + next_char if (sys.version_info > (3, 0)): outtext.write(generated + '\n') else: outtext.write(generated.decode('utf-8').encode('utf-8') + '\n') outtext.close()