def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)
    # Seed additional random state for reproducible and deterministic results
    seed_random_state(args.rng_seed)

    # Check for a sufficient number of epochs to start validation
    if params['epochs'] < params['resp_val_start_epoch']:
        raise Exception(
            'Number of epochs is less than validation threshold (resp_val_start_epoch)')

    # Construct extension to save validation results
    now = datetime.datetime.now()
    ext = '%02d%02d_%02d%02d_pytorch' \
        % (now.month, now.day, now.hour, now.minute)

    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix + '.log'
    candle.set_up_logger(logfile, unoMT.logger, params['verbose'])
    unoMT.logger.info('Params: {}'.format(params))

    # Computation device config (cuda or cpu)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')

    modelUno = UnoMTModel(args, use_cuda, device)

    modelUno.pre_train_config()
    modelUno.train()
    modelUno.print_final_stats()
def set_up_logger(logfile, logger1, logger2, verbose):
    candle.verify_path(logfile)
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter("[%(asctime)s %(process)d] %(message)s",
                          datefmt="%Y-%m-%d %H:%M:%S"))
    fh.setLevel(logging.DEBUG)

    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter(''))
    sh.setLevel(logging.DEBUG if verbose else logging.INFO)

    for log in [logger1, logger2]:
        log.setLevel(logging.DEBUG)
        log.addHandler(fh)
        log.addHandler(sh)
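
# A minimal, stdlib-only sketch (not part of the benchmark) of the
# dual-handler pattern set_up_logger builds above: one timestamped
# DEBUG-level file handler plus one console handler whose level is gated
# by the verbose flag. The helper name is hypothetical.
def _demo_dual_handler_logging(logfile='demo.log', verbose=True):
    import logging
    log = logging.getLogger('demo')
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(logfile)  # file receives everything, timestamped
    fh.setFormatter(logging.Formatter('[%(asctime)s %(process)d] %(message)s',
                                      datefmt='%Y-%m-%d %H:%M:%S'))
    fh.setLevel(logging.DEBUG)
    sh = logging.StreamHandler()       # console level depends on verbosity
    sh.setLevel(logging.DEBUG if verbose else logging.INFO)
    log.addHandler(fh)
    log.addHandler(sh)
    log.debug('reaches the file; shown on console only when verbose')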
def run(gParameters):

    # Construct extension to save model
    ext = p1b2.extension_from_parameters(gParameters, '.keras')
    candle.verify_path(gParameters['save_path'])
    prefix = '{}{}'.format(gParameters['save_path'], ext)
    logfile = gParameters['logfile'] if gParameters['logfile'] else prefix + '.log'
    # candle.set_up_logger(logfile, p1b2.logger, gParameters['verbose'])
    # p1b2.logger.info('Params: {}'.format(gParameters))

    # Get default parameters for initialization and optimizer functions
    kerasDefaults = candle.keras_default_config()
    seed = gParameters['rng_seed']

    # Load dataset
    (X_train, y_train), (X_test, y_test) = p1b2.load_data2(gParameters, seed)

    print("Shape X_test: ", X_test.shape)
    print("Shape y_test: ", y_test.shape)
    print("Range X_test --> Min: ", np.min(X_test), ", max: ", np.max(X_test))
    print("Range y_test --> Min: ", np.min(y_test), ", max: ", np.max(y_test))

    # Define optimizer
    optimizer = candle.build_optimizer(gParameters['optimizer'],
                                       gParameters['learning_rate'],
                                       kerasDefaults)

    # Load the architecture from JSON and create the model
    # (renamed from loaded_model_json, which was reused for both the JSON
    # string and the model object)
    with open('p1b2.model.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)

    # Load weights into the new model
    loaded_model.load_weights('p1b2.model.h5')
    print("Loaded model from disk")

    # Evaluate loaded model on test data
    loaded_model.compile(loss=gParameters['loss'],
                         optimizer=optimizer,
                         metrics=['accuracy'])
    y_pred = loaded_model.predict(X_test)
    scores = p1b2.evaluate_accuracy_one_hot(y_pred, y_test)
    print('Evaluation on test data:', scores)
def run(GP):
    # Set the seed
    if GP['rng_seed']:
        np.random.seed(GP['rng_seed'])
    else:
        np.random.seed(np.random.randint(10000))

    # Set paths
    if not os.path.isdir(GP['home_dir']):
        print('Keras home directory not set')
        sys.exit(0)
    sys.path.append(GP['home_dir'])

    # Set up logging
    args = candle.ArgumentStruct(**GP)
    # set_seed(args.rng_seed)
    # ext = extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path  # + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    candle.set_up_logger(logfile, logger, False)  # args.verbose
    logger.info('Params: {}'.format(GP))

    import p2b1 as hf
    reload(hf)

    # import keras_model_utils as KEU
    # reload(KEU)
    # reload(p2ck)
    # reload(p2ck.optimizers)
    maps = hf.autoencoder_preprocess()

    from keras.optimizers import SGD, RMSprop, Adam
    from keras.datasets import mnist
    from keras.callbacks import LearningRateScheduler, ModelCheckpoint
    from keras import callbacks
    from keras.layers.advanced_activations import ELU
    from keras.preprocessing.image import ImageDataGenerator

    # GP = hf.ReadConfig(opts.config_file)
    batch_size = GP['batch_size']
    learning_rate = GP['learning_rate']
    kerasDefaults = candle.keras_default_config()

    # #### Read Data ########
    import helper
    (data_files, fields) = p2b1.get_list_of_data_files(GP)
    # Read from local directory
    # (data_files, fields) = helper.get_local_files('/p/gscratchr/brainusr/datasets/cancer/pilot2/3k_run16_10us.35fs-DPPC.20-DIPC.60-CHOL.20.dir/')
    # (data_files, fields) = helper.get_local_files('3k_run16', '/p/lscratchf/brainusr/datasets/cancer/pilot2/')

    # Define datagenerator
    datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

    # Get data dimension
    num_samples = 0
    for f in data_files:
        # Separate different arrays from the data
        (X, nbrs, resnums) = helper.get_data_arrays(f)
        num_samples += X.shape[0]

    (X, nbrs, resnums) = helper.get_data_arrays(data_files[0])
    print('\nData chunk shape: ', X.shape)

    molecular_hidden_layers = GP['molecular_num_hidden']
    if not molecular_hidden_layers:
        X_train = hf.get_data(X, case=GP['case'])
        input_dim = X_train.shape[1]
    else:
        # Compute input dimension for the outer (state) autoencoder
        input_dim = X.shape[1] * molecular_hidden_layers[-1]
    print('\nState AE input/output dimension: ', input_dim)

    # Get data dimensions for the molecular autoencoder
    molecular_nbrs = int(GP['molecular_nbrs'])  # np.int is deprecated; use int
    num_molecules = X.shape[1]
    num_beads = X.shape[2]

    if GP['nbr_type'] == 'relative':
        # Relative x, y, z positions
        num_loc_features = 3
        loc_feat_vect = ['rel_x', 'rel_y', 'rel_z']
    elif GP['nbr_type'] == 'invariant':
        # Relative distance and angle
        num_loc_features = 2
        loc_feat_vect = ['rel_dist', 'rel_angle']
    else:
        print('Invalid nbr_type!!')
        sys.exit(1)

    if not GP['type_bool']:
        # Only consider molecular location coordinates
        num_type_features = 0
        type_feat_vect = []
    else:
        num_type_features = 5
        type_feat_vect = list(fields.keys())[3:8]

    num_features = num_loc_features + num_type_features + num_beads
    dim = np.prod([num_beads, num_features, molecular_nbrs + 1])
    bead_kernel_size = num_features
    molecular_input_dim = dim
    mol_kernel_size = num_beads

    feature_vector = loc_feat_vect + type_feat_vect + list(fields.keys())[8:]

    print('\nMolecular AE input/output dimension: ', molecular_input_dim)
    print('\nData Format:\n[Frames (%s), Molecules (%s), Beads (%s), %s (%s)]'
          % (num_samples, num_molecules, num_beads, feature_vector, num_features))

    # ## Define Model, Solver and Compile ##########
    print('\nDefine the model and compile')
    opt = candle.build_optimizer(GP['optimizer'], learning_rate, kerasDefaults)
    model_type = 'mlp'
    memo = '%s_%s' % (GP['base_memo'], model_type)

    # ####### Define Molecular Model, Solver and Compile #########
    molecular_nonlinearity = GP['molecular_nonlinearity']
    len_molecular_hidden_layers = len(molecular_hidden_layers)
    conv_bool = GP['conv_bool']
    full_conv_bool = GP['full_conv_bool']
    if conv_bool:
        molecular_model, molecular_encoder = AE_models.conv_dense_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['dropout']))
    elif full_conv_bool:
        molecular_model, molecular_encoder = AE_models.full_conv_mol_auto(
            bead_k_size=bead_kernel_size,
            mol_k_size=mol_kernel_size,
            weights_path=None,
            input_shape=(1, molecular_input_dim, 1),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['dropout']))
    else:
        molecular_model, molecular_encoder = AE_models.dense_auto(
            weights_path=None,
            input_shape=(molecular_input_dim,),
            nonlinearity=molecular_nonlinearity,
            hidden_layers=molecular_hidden_layers,
            l2_reg=GP['l2_reg'],
            drop=float(GP['dropout']))

    if GP['loss'] == 'mse':
        loss_func = 'mse'
    elif GP['loss'] == 'custom':
        loss_func = helper.combined_loss

    molecular_model.compile(
        optimizer=opt,
        loss=loss_func,
        metrics=['mean_squared_error', 'mean_absolute_error'])
    print('\nModel Summary: \n')
    molecular_model.summary()

    # #### Set up callbacks and cooling for the molecular_model ##########
    drop = GP['dropout']
    mb_epochs = GP['epochs']
    initial_lrate = GP['learning_rate']
    epochs_drop = 1 + int(np.floor(mb_epochs / 3))

    def step_decay(epoch):
        # Closure over initial_lrate, drop and epochs_drop from run();
        # the original `global` declaration here was a bug, since these
        # names are locals of run(), not module globals.
        lrate = initial_lrate * np.power(drop,
                                         np.floor((1 + epoch) / epochs_drop))
        return lrate

    lr_scheduler = LearningRateScheduler(step_decay)
    history = callbacks.History()
    # callbacks = [history, lr_scheduler]

    history_logger = candle.LoggingCallback(logger.debug)
    candleRemoteMonitor = candle.CandleRemoteMonitor(params=GP)
    timeoutMonitor = candle.TerminateOnTimeOut(TIMEOUT)
    callbacks = [history, history_logger, candleRemoteMonitor, timeoutMonitor]
    loss = 0.

    # ### Save the Model to disk
    if GP['save_path'] is not None:
        save_path = GP['save_path']
        if not os.path.exists(save_path):
            os.makedirs(save_path)
    else:
        save_path = '.'

    model_json = molecular_model.to_json()
    with open(save_path + '/model.json', "w") as json_file:
        json_file.write(model_json)

    encoder_json = molecular_encoder.to_json()
    with open(save_path + '/encoder.json', "w") as json_file:
        json_file.write(encoder_json)

    print('Saved model to disk')

    # ### Train the Model
    if GP['train_bool']:
        ct = hf.Candle_Molecular_Train(
            molecular_model,
            molecular_encoder,
            data_files,
            mb_epochs,
            callbacks,
            batch_size=batch_size,
            nbr_type=GP['nbr_type'],
            save_path=GP['save_path'],
            len_molecular_hidden_layers=len_molecular_hidden_layers,
            molecular_nbrs=molecular_nbrs,
            conv_bool=conv_bool,
            full_conv_bool=full_conv_bool,
            type_bool=GP['type_bool'],
            sampling_density=GP['sampling_density'])
        frame_loss, frame_mse = ct.train_ac()
    else:
        frame_mse = []
        frame_loss = []

    return frame_loss, frame_mse
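
# Hypothetical helper (not in the benchmark) tracing the step-decay
# schedule that step_decay above implements:
# lr(epoch) = initial_lrate * drop ** floor((1 + epoch) / epochs_drop).
def _demo_step_decay(initial_lrate=0.01, drop=0.5, epochs_drop=4, epochs=12):
    import numpy as np
    for epoch in range(epochs):
        lrate = initial_lrate * np.power(drop, np.floor((1 + epoch) / epochs_drop))
        print(epoch, lrate)  # rate is multiplied by `drop` every epochs_drop epochs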
def run(params):
    args = candle.ArgumentStruct(**params)
    candle.set_seed(args.rng_seed)
    ext = uno.extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose)
    logger.info('Params: {}'.format(params))

    loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed)
    loader.load(cache=args.cache,
                ncols=args.feature_subsample,
                agg_dose=args.agg_dose,
                cell_features=args.cell_features,
                drug_features=args.drug_features,
                drug_median_response_min=args.drug_median_response_min,
                drug_median_response_max=args.drug_median_response_max,
                use_landmark_genes=args.use_landmark_genes,
                use_filtered_genes=args.use_filtered_genes,
                cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path,
                drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path,
                preprocess_rnaseq=args.preprocess_rnaseq,
                single=args.single,
                train_sources=args.train_sources,
                test_sources=args.test_sources,
                embed_feature_source=not args.no_feature_source,
                encode_response_source=not args.no_response_source,
                partition_by=args.partition_by)

    target = args.agg_dose or 'Growth'
    val_split = args.val_split
    train_split = 1 - val_split

    loader.partition_data(partition_by=args.partition_by,
                          cv_folds=args.cv,
                          train_split=train_split,
                          val_split=val_split,
                          cell_types=args.cell_types,
                          by_cell=args.by_cell,
                          by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path)

    print('partition_by: ', args.partition_by)
    if args.partition_by == 'drug_pair':
        fname_drugs = args.save_path + 'infer_drug_ids'
        pds = loader.get_drugs_in_val()
        with open(fname_drugs, 'w') as f:
            for item in pds:
                f.write('%s\n' % item)
        logger.info('Drug IDs in holdout set written in file: {}'.format(fname_drugs))
    elif args.partition_by == 'cell':
        fname_cells = args.save_path + 'infer_cell_ids'
        pcs = loader.get_cells_in_val()
        with open(fname_cells, 'w') as f:
            for item in pcs:
                f.write('%s\n' % item)
        logger.info('Cell IDs in holdout set written in file: {}'.format(fname_cells))
    else:
        # This assignment was commented out but is required below
        fname_index = args.save_path + 'infer_index_ids'
        pins = loader.get_index_in_val()
        with open(fname_index, 'w') as f:
            for item in pins:
                f.write('%s\n' % item)
        logger.info('Indices in holdout set written in file: {}'.format(fname_index))
def run(gParameters):

    # Construct extension to save model
    ext = p1b2.extension_from_parameters(gParameters, '.keras')
    candle.verify_path(gParameters['save_path'])
    prefix = '{}{}'.format(gParameters['save_path'], ext)
    logfile = gParameters['logfile'] if gParameters['logfile'] else prefix + '.log'
    candle.set_up_logger(logfile, p1b2.logger, gParameters['verbose'])
    p1b2.logger.info('Params: {}'.format(gParameters))

    # Get default parameters for initialization and optimizer functions
    kerasDefaults = candle.keras_default_config()
    seed = gParameters['rng_seed']

    # Load dataset
    # (X_train, y_train), (X_test, y_test) = p1b2.load_data(gParameters, seed)
    # (X_train, y_train), (X_val, y_val), (X_test, y_test) = p1b2.load_data_one_hot(gParameters, seed)
    (X_train, y_train), (X_test, y_test) = p1b2.load_data2(gParameters, seed)

    print("Shape X_train: ", X_train.shape)
    # print("Shape X_val: ", X_val.shape)
    print("Shape X_test: ", X_test.shape)
    print("Shape y_train: ", y_train.shape)
    # print("Shape y_val: ", y_val.shape)
    print("Shape y_test: ", y_test.shape)

    print("Range X_train --> Min: ", np.min(X_train), ", max: ", np.max(X_train))
    # print("Range X_val --> Min: ", np.min(X_val), ", max: ", np.max(X_val))
    print("Range X_test --> Min: ", np.min(X_test), ", max: ", np.max(X_test))
    print("Range y_train --> Min: ", np.min(y_train), ", max: ", np.max(y_train))
    # print("Range y_val --> Min: ", np.min(y_val), ", max: ", np.max(y_val))
    print("Range y_test --> Min: ", np.min(y_test), ", max: ", np.max(y_test))

    input_dim = X_train.shape[1]
    input_vector = Input(shape=(input_dim,))
    output_dim = y_train.shape[1]

    # Initialize weights and learning rule
    initializer_weights = candle.build_initializer(
        gParameters['initialization'], kerasDefaults, seed)
    initializer_bias = candle.build_initializer('constant', kerasDefaults, 0.)

    activation = gParameters['activation']

    # Define MLP architecture
    layers = gParameters['dense']

    if layers is not None:
        if not isinstance(layers, list):
            layers = list(layers)
        for i, l in enumerate(layers):
            if i == 0:
                x = Dense(l,
                          activation=activation,
                          kernel_initializer=initializer_weights,
                          bias_initializer=initializer_bias,
                          kernel_regularizer=l2(gParameters['reg_l2']),
                          activity_regularizer=l2(gParameters['reg_l2']))(input_vector)
            else:
                x = Dense(l,
                          activation=activation,
                          kernel_initializer=initializer_weights,
                          bias_initializer=initializer_bias,
                          kernel_regularizer=l2(gParameters['reg_l2']),
                          activity_regularizer=l2(gParameters['reg_l2']))(x)
            if gParameters['dropout']:
                x = Dropout(gParameters['dropout'])(x)
        output = Dense(output_dim,
                       activation=activation,
                       kernel_initializer=initializer_weights,
                       bias_initializer=initializer_bias)(x)
    else:
        output = Dense(output_dim,
                       activation=activation,
                       kernel_initializer=initializer_weights,
                       bias_initializer=initializer_bias)(input_vector)

    # Build MLP model
    mlp = Model(outputs=output, inputs=input_vector)
    p1b2.logger.debug('Model: {}'.format(mlp.to_json()))

    # Define optimizer
    optimizer = candle.build_optimizer(gParameters['optimizer'],
                                       gParameters['learning_rate'],
                                       kerasDefaults)

    # Compile and display model
    mlp.compile(loss=gParameters['loss'], optimizer=optimizer, metrics=['accuracy'])
    mlp.summary()

    # Seed random generator for training
    np.random.seed(seed)

    history = mlp.fit(X_train, y_train,
                      batch_size=gParameters['batch_size'],
                      epochs=gParameters['epochs'],
                      validation_split=gParameters['val_split']
                      # validation_data=(X_val, y_val)
                      )

    best_val_loss = "{:.5f}".format(min(history.history['val_loss']))
    best_val_acc = "{:.5f}".format(max(history.history['val_acc']))
    print('best_val_loss =', best_val_loss, 'best_val_acc =', best_val_acc)

    # Save the model
    # save_filepath = "model_mlp_W_" + ext
    # mlp.save_weights(save_filepath)
    model_json = mlp.to_json()
    with open(prefix + '.model.json', 'w') as f:
        print(model_json, file=f)
    mlp.save_weights(prefix + '.weights.h5')

    # Evaluate model on test set
    y_pred = mlp.predict(X_test)
    scores = p1b2.evaluate_accuracy_one_hot(y_pred, y_test)
    print('Evaluation on test data:', scores)
def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = extension_from_parameters(params, 'keras')
    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix + '.log'
    root_fname = 'Agg_attn_abs_bin'
    candle.set_up_logger(logfile, attn.logger, params['verbose'])
    attn.logger.info('Params: {}'.format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    ##
    X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed)

    # move this inside the load_data function
    Y_train = _Y_train['AUC']
    Y_test = _Y_test['AUC']
    Y_val = _Y_val['AUC']

    Y_train_neg, Y_train_pos = np.bincount(Y_train)
    Y_test_neg, Y_test_pos = np.bincount(Y_test)
    Y_val_neg, Y_val_pos = np.bincount(Y_val)

    Y_train_total = Y_train_neg + Y_train_pos
    Y_test_total = Y_test_neg + Y_test_pos
    Y_val_total = Y_val_neg + Y_val_pos

    total = Y_train_total + Y_test_total + Y_val_total
    neg = Y_train_neg + Y_test_neg + Y_val_neg
    pos = Y_train_pos + Y_test_pos + Y_val_pos

    print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total))

    nb_classes = params['dense'][-1]

    # Convert classes to categorical with an extra slot for the abstaining class
    Y_train, Y_test, Y_val = candle.modify_labels(nb_classes + 1, Y_train, Y_test, Y_val)

    # Disable class weight (for initial testing of the abstention classifier)
    # y_integers = np.argmax(Y_train, axis=1)
    # class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers)
    # d_class_weights = dict(enumerate(class_weights))

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    PS = X_train.shape[1]
    model = build_attention_model(params, PS)
    model = candle.add_model_output(model, mode='abstain', num_add=1, activation='sigmoid')
    print('Model after modifying layer for abstention')
    model.summary()

    # Configure abstention model
    mask_ = np.zeros(nb_classes + 1)
    mask_[-1] = 1
    # mu auto-tunes in the long run, but convergence may take many epochs
    # if mu0 starts far from the target
    mu0 = 0.5
    candle.abstention_variable_initialization(mu0, mask_, nb_classes)

    # parallel_model = multi_gpu_model(model, gpus=4)
    # parallel_model.compile(loss='mean_squared_error',
    #                        optimizer=SGD(lr=0.0001, momentum=0.9),
    #                        metrics=['mae', r2])

    kerasDefaults = candle.keras_default_config()
    if params['momentum']:
        kerasDefaults['momentum_sgd'] = params['momentum']

    optimizer = candle.build_optimizer(params['optimizer'],
                                       params['learning_rate'],
                                       kerasDefaults)

    # Compile model with abstention loss
    model.compile(loss=candle.abstention_loss,
                  optimizer=optimizer,
                  metrics=['acc', tf_auc, candle.abs_acc,
                           candle.acc_class1, candle.abs_acc_class1])

    # Set up a bunch of callbacks to do work during model training
    checkpointer = ModelCheckpoint(
        filepath=params['save_path'] + root_fname + '.autosave.model.h5',
        verbose=1, save_weights_only=False, save_best_only=True)
    csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname))
    reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20,
                                  patience=40, verbose=1, mode='auto',
                                  min_delta=0.0001, cooldown=3, min_lr=0.000000001)
    early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='auto')
    candle_monitor = candle.CandleRemoteMonitor(params=params)  # was duplicated
    timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))
    history_logger = candle.LoggingCallback(attn.logger.debug)
    abstention_cbk = candle.AbstentionAdapt_Callback(
        monitor='val_abs_acc_class1',
        scale_factor=params['abs_scale_factor'],
        target_acc=params['target_abs_acc'])

    callbacks = [candle_monitor, timeout_monitor, csv_logger,
                 history_logger, abstention_cbk]
    if params['reduce_lr']:
        callbacks.append(reduce_lr)
    if params['use_cp']:
        callbacks.append(checkpointer)
    if params['use_tb']:
        callbacks.append(tensorboard)
    if params['early_stop']:
        callbacks.append(early_stop)

    epochs = params['epochs']
    batch_size = params['batch_size']
    history = model.fit(X_train, Y_train,
                        # class_weight=d_class_weights,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(X_val, Y_val),
                        callbacks=callbacks)

    # Diagnostic plots
    if 'loss' in history.history.keys():
        candle.plot_history(params['save_path'] + root_fname, history, 'loss')
    if 'acc' in history.history.keys():
        candle.plot_history(params['save_path'] + root_fname, history, 'acc')
    if 'abs_acc' in history.history.keys():
        candle.plot_history(params['save_path'] + root_fname, history, 'abs_acc')

    # Plot mu evolution
    fname = params['save_path'] + root_fname + '.mu.png'
    xlabel = 'Epochs'
    ylabel = 'Abstention Weight mu'
    title = 'mu Evolution'
    attnviz.plot_array(abstention_cbk.muvalues, xlabel, ylabel, title, fname)

    # Evaluate model
    score = model.evaluate(X_test, Y_test, verbose=0)
    Y_predict = model.predict(X_test)
    evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test,
                        Y_predict, pos, total, score)
    save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test)

    attn.logger.handlers = []

    return history
def run(params):
    args = candle.ArgumentStruct(**params)
    candle.set_seed(args.rng_seed)
    ext = uno.extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path + 'uno' + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose)
    logger.info('Params: {}'.format(params))

    # Exclude drugs / cells for UQ
    if 'uq_exclude_drugs_file' in params.keys():
        args.exclude_drugs = uno.read_IDs_file(args.uq_exclude_drugs_file)
        logger.info('Drugs to exclude: {}'.format(args.exclude_drugs))
    else:
        args.exclude_drugs = []
    if 'uq_exclude_cells_file' in params.keys():
        args.exclude_cells = uno.read_IDs_file(args.uq_exclude_cells_file)
        logger.info('Cells to exclude: {}'.format(args.exclude_cells))
    else:
        args.exclude_cells = []
    if 'uq_exclude_indices_file' in params.keys():
        exclude_indices_ = uno.read_IDs_file(args.uq_exclude_indices_file)
        args.exclude_indices = [int(x) for x in exclude_indices_]
        logger.info('Indices to exclude: {}'.format(args.exclude_indices))
    else:
        args.exclude_indices = []

    if len(args.gpus) > 0:
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = ",".join(map(str, args.gpus))
        K.set_session(tf.Session(config=config))

    loader = uno_combined_data_loader.CombinedDataLoader(seed=args.rng_seed)
    loader.load(cache=args.cache,
                ncols=args.feature_subsample,
                agg_dose=args.agg_dose,
                cell_features=args.cell_features,
                drug_features=args.drug_features,
                drug_median_response_min=args.drug_median_response_min,
                drug_median_response_max=args.drug_median_response_max,
                use_landmark_genes=args.use_landmark_genes,
                use_filtered_genes=args.use_filtered_genes,
                cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path,
                drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path,
                preprocess_rnaseq=args.preprocess_rnaseq,
                single=args.single,
                train_sources=args.train_sources,
                test_sources=args.test_sources,
                embed_feature_source=not args.no_feature_source,
                encode_response_source=not args.no_response_source,
                )

    target = args.agg_dose or 'Growth'
    val_split = args.val_split
    train_split = 1 - val_split

    loader.partition_data(partition_by=args.partition_by,
                          cv_folds=args.cv,
                          train_split=train_split,
                          val_split=val_split,
                          cell_types=args.cell_types,
                          by_cell=args.by_cell,
                          by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path,
                          exclude_cells=args.exclude_cells,
                          exclude_drugs=args.exclude_drugs,
                          exclude_indices=args.exclude_indices
                          )

    model = uno_model_utils.build_model(loader, args, logger)
    logger.info('Combined model:')
    model.summary(print_fn=logger.info)
    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)

    if args.cp:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

    def warmup_scheduler(epoch):
        # base_lr and model are bound later in the fold loop; the closure
        # is only called once training starts
        lr = args.learning_rate or base_lr * args.batch_size / 100
        if epoch <= 5:
            K.set_value(model.optimizer.lr, (base_lr * (5 - epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

    for fold in range(cv):
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv))
            cv_ext = '.cv{}'.format(fold + 1)

        # model = uno_model_utils.build_model(loader, args, logger, silent=True)
        template_model = uno_model_utils.build_model(loader, args, logger, silent=True)
        if args.initial_weights:
            logger.info("Loading weights from {}".format(args.initial_weights))
            template_model.load_weights(args.initial_weights)

        if len(args.gpus) > 1:
            from keras.utils import multi_gpu_model
            gpu_count = len(args.gpus)
            logger.info("Multi GPU with {} gpus".format(gpu_count))
            model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count)
        else:
            model = template_model

        optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}})
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        if args.loss == 'heteroscedastic':
            logger.info('Training heteroscedastic model:')
            model.compile(loss=heteroscedastic_loss, optimizer=optimizer,
                          metrics=[uno_model_utils.mae_heteroscedastic,
                                   uno_model_utils.r2_heteroscedastic,
                                   uno_model_utils.meanS_heteroscesdastic])
        elif args.loss == 'quantile':
            logger.info('Training quantile model:')
            model.compile(loss=triple_quantile_loss, optimizer=optimizer,
                          metrics=[uno_model_utils.quantile50,
                                   uno_model_utils.quantile10,
                                   uno_model_utils.quantile90])
        else:
            logger.info('Training homoscedastic model:')
            model.compile(loss=args.loss, optimizer=optimizer,
                          metrics=[candle.mae, candle.r2])

        # Calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                      patience=5, min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        # checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True)
        checkpointer = candle.MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True)
        tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext))
        history_logger = candle.LoggingCallback(logger.debug)
        # model_recorder = uno_model_utils.ModelRecorder()

        callbacks = [candle_monitor, timeout_monitor, history_logger]  # , model_recorder]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)
        if args.save_weights:
            callbacks.append(uno_model_utils.SimpleWeightSaver(args.save_path + '/' + args.save_weights))

        train_gen = uno_combined_data_generator.CombinedDataGenerator(
            loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle)
        val_gen = uno_combined_data_generator.CombinedDataGenerator(
            loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle)

        df_val = val_gen.get_response(copy=True)
        y_val = df_val[target].values
        y_shuf = np.random.permutation(y_val)
        uno.log_evaluation(uno.evaluate_prediction(y_val, y_shuf), logger,
                           description='Between random pairs in y_val:')

        if args.no_gen:
            x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single)
            x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single)
            history = model.fit(x_train_list, y_train,
                                batch_size=args.batch_size,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))
        else:
            logger.info('Data points per epoch: train = %d, val = %d',
                        train_gen.size, val_gen.size)
            logger.info('Steps per epoch: train = %d, val = %d',
                        train_gen.steps, val_gen.steps)
            history = model.fit_generator(train_gen, train_gen.steps,
                                          epochs=args.epochs,
                                          callbacks=callbacks,
                                          validation_data=val_gen,
                                          validation_steps=val_gen.steps)

        # The reload-best-weights step stays disabled; the body of the old
        # `if args.cp:` block was entirely commented out:
        # if args.cp:
        #     model.load_weights(prefix + cv_ext + '.weights.h5')
        #     model = model_recorder.best_model

        if args.no_gen:
            y_val_pred = model.predict(x_val_list, batch_size=args.batch_size)
        else:
            val_gen.reset()
            y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1)
            y_val_pred = y_val_pred[:val_gen.size]

        if args.loss == 'heteroscedastic':
            y_val_pred_ = y_val_pred[:, 0]
            s_val_pred = y_val_pred[:, 1]
            y_val_pred = y_val_pred_.flatten()
            df_val['Predicted_' + target] = y_val_pred
            df_val[target + '_Error'] = y_val_pred - y_val
            df_val['Pred_S_' + target] = s_val_pred
        elif args.loss == 'quantile':
            y_val_pred_50q = y_val_pred[:, 0]
            y_val_pred_10q = y_val_pred[:, 1]
            y_val_pred_90q = y_val_pred[:, 2]
            y_val_pred = y_val_pred_50q.flatten()  # 50th quantile prediction
            df_val['Predicted_50q_' + target] = y_val_pred
            df_val[target + '_Error_50q'] = y_val_pred - y_val
            df_val['Predicted_10q_' + target] = y_val_pred_10q.flatten()
            df_val['Predicted_90q_' + target] = y_val_pred_90q.flatten()
        else:
            y_val_pred = y_val_pred.flatten()
            # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val)
            df_val['Predicted' + target] = y_val_pred
            df_val[target + 'Error'] = y_val_pred - y_val

        scores = uno.evaluate_prediction(y_val, y_val_pred)
        uno.log_evaluation(scores, logger)

        df_pred_list.append(df_val)

        # if args.cp:
        #     model_recorder.best_model.save(prefix+'.model.h5')

        # Plot histories; the original `hasattr(history, ...)` checks were
        # always false, since metrics live in the history.history dict
        if 'loss' in history.history:
            candle.plot_history(prefix, history, 'loss')
        if args.loss == 'heteroscedastic':
            if 'r2_heteroscedastic' in history.history:
                candle.plot_history(prefix, history, 'r2_heteroscedastic')
            # key matches the (misspelled) metric name compiled above
            if 'meanS_heteroscesdastic' in history.history:
                candle.plot_history(prefix, history, 'meanS_heteroscesdastic')
        elif args.loss == 'quantile':
            if 'quantile50' in history.history:
                candle.plot_history(prefix, history, 'quantile50')
            if 'quantile10' in history.history:
                candle.plot_history(prefix, history, 'quantile10')
            if 'quantile90' in history.history:
                candle.plot_history(prefix, history, 'quantile90')
        else:
            if 'r2' in history.history:
                candle.plot_history(prefix, history, 'r2')

    pred_fname = prefix + '.predicted.tsv'
    df_pred = pd.concat(df_pred_list)
    if args.agg_dose:
        if args.single:
            # df_pred.sort_values(['Source', 'Sample', 'Drug1', target], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', target], inplace=True)
        else:
            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True)
    else:
        if args.single:
            # df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True)
        else:
            # df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')
    logger.info('Testing predictions stored in file: {}'.format(pred_fname))

    if args.cp:
        logger.info('Model stored in file: {}'.format(prefix + '.model.h5'))
        # logger.info('Model weights stored in file: {}'.format(prefix+cv_ext+'.weights.h5'))
    if args.save_weights:
        logger.info('Model weights stored in file: {}'.format(args.save_path + '/' + args.save_weights))

    if args.cv > 1:
        scores = uno.evaluate_prediction(df_pred[target], df_pred['Predicted' + target])
        uno.log_evaluation(scores, logger, description='Combining cross validation folds:')

    for test_source in loader.test_sep_sources:
        test_gen = uno_combined_data_generator.CombinedDataGenerator(
            loader, partition='test', batch_size=args.batch_size, source=test_source)
        df_test = test_gen.get_response(copy=True)
        y_test = df_test[target].values
        n_test = len(y_test)
        if n_test == 0:
            continue
        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single)
            y_test_pred = model.predict(x_test_list, batch_size=args.batch_size)
            if args.loss == 'heteroscedastic':
                y_test_pred = y_test_pred[:, 0]
            elif args.loss == 'quantile':
                y_test_pred = y_test_pred[:, 0]  # 50th quantile prediction
        else:
            y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps)
            if args.loss == 'heteroscedastic':
                y_test_pred = y_test_pred[:test_gen.size, 0]
            elif args.loss == 'quantile':
                y_test_pred = y_test_pred[:test_gen.size, 0]  # 50th quantile prediction
            else:
                y_test_pred = y_test_pred[:test_gen.size]
        y_test_pred = y_test_pred.flatten()
        scores = uno.evaluate_prediction(y_test, y_test_pred)
        uno.log_evaluation(scores, logger,
                           description='Testing on data from {} ({})'.format(test_source, n_test))

    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []

    return history
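
# Hypothetical helper (not in the benchmark) tracing the 5-epoch linear
# warmup computed by warmup_scheduler above: the rate moves linearly from
# base_lr at epoch 0 to the target lr at epoch 5, then is left alone.
def _demo_warmup(base_lr=0.001, lr=0.01, epochs=8):
    current = base_lr
    for epoch in range(epochs):
        if epoch <= 5:
            current = (base_lr * (5 - epoch) + lr * epoch) / 5
        print(epoch, current)  # epoch 0 -> base_lr, epoch 5 -> lr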
def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = p1b1.extension_from_parameters(params, '.keras')
    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix + '.log'
    candle.set_up_logger(logfile, p1b1.logger, params['verbose'])
    p1b1.logger.info('Params: {}'.format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    # Load dataset
    x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels = p1b1.load_data(params, seed)

    # cache_file = 'data_l1000_cache.h5'
    # save_cache(cache_file, x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels)
    # x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels = load_cache(cache_file)

    p1b1.logger.info("Shape x_train: {}".format(x_train.shape))
    p1b1.logger.info("Shape x_val:   {}".format(x_val.shape))
    p1b1.logger.info("Shape x_test:  {}".format(x_test.shape))

    p1b1.logger.info("Range x_train: [{:.3g}, {:.3g}]".format(np.min(x_train), np.max(x_train)))
    p1b1.logger.info("Range x_val:   [{:.3g}, {:.3g}]".format(np.min(x_val), np.max(x_val)))
    p1b1.logger.info("Range x_test:  [{:.3g}, {:.3g}]".format(np.min(x_test), np.max(x_test)))

    p1b1.logger.debug('Class labels')
    for i, label in enumerate(y_labels):
        p1b1.logger.debug('  {}: {}'.format(i, label))

    # clf = build_type_classifier(x_train, y_train, x_val, y_val)

    n_classes = len(y_labels)
    cond_train = y_train
    cond_val = y_val
    cond_test = y_test

    input_dim = x_train.shape[1]
    cond_dim = cond_train.shape[1]
    latent_dim = params['latent_dim']

    activation = params['activation']
    dropout = params['dropout']
    dense_layers = params['dense']
    dropout_layer = AlphaDropout if params['alpha_dropout'] else Dropout

    # Initialize weights and learning rule
    initializer_weights = candle.build_initializer(params['initialization'], keras_defaults, seed)
    initializer_bias = candle.build_initializer('constant', keras_defaults, 0.)

    if dense_layers is not None:
        if not isinstance(dense_layers, list):
            dense_layers = list(dense_layers)
    else:
        dense_layers = []

    # Encoder Part
    x_input = Input(shape=(input_dim,))
    cond_input = Input(shape=(cond_dim,))
    h = x_input
    if params['model'] == 'cvae':
        h = keras.layers.concatenate([x_input, cond_input])

    for i, layer in enumerate(dense_layers):
        if layer > 0:
            x = h
            h = Dense(layer, activation=activation,
                      kernel_initializer=initializer_weights,
                      bias_initializer=initializer_bias)(h)
            if params['residual']:
                try:
                    h = keras.layers.add([h, x])
                except ValueError:
                    pass
            if params['batch_normalization']:
                h = BatchNormalization()(h)
            if dropout > 0:
                h = dropout_layer(dropout)(h)

    if params['model'] == 'ae':
        encoded = Dense(latent_dim, activation=activation,
                        kernel_initializer=initializer_weights,
                        bias_initializer=initializer_bias)(h)
    else:
        epsilon_std = params['epsilon_std']
        z_mean = Dense(latent_dim, name='z_mean')(h)
        z_log_var = Dense(latent_dim, name='z_log_var')(h)
        encoded = z_mean

        def vae_loss(x, x_decoded_mean):
            xent_loss = binary_crossentropy(x, x_decoded_mean)
            kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            return K.mean(xent_loss + kl_loss / input_dim)

        def sampling(sample_args):  # renamed from `params` to avoid shadowing run()'s params
            z_mean_, z_log_var_ = sample_args
            batch_size = K.shape(z_mean_)[0]
            epsilon = K.random_normal(shape=(batch_size, latent_dim),
                                      mean=0., stddev=epsilon_std)
            return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

        z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
        if params['model'] == 'cvae':
            z_cond = keras.layers.concatenate([z, cond_input])

    # Decoder Part
    decoder_input = Input(shape=(latent_dim,))
    h = decoder_input
    if params['model'] == 'cvae':
        h = keras.layers.concatenate([decoder_input, cond_input])

    for i, layer in reversed(list(enumerate(dense_layers))):
        if layer > 0:
            x = h
            h = Dense(layer, activation=activation,
                      kernel_initializer=initializer_weights,
                      bias_initializer=initializer_bias)(h)
            if params['residual']:
                try:
                    h = keras.layers.add([h, x])
                except ValueError:
                    pass
            if params['batch_normalization']:
                h = BatchNormalization()(h)
            if dropout > 0:
                h = dropout_layer(dropout)(h)

    decoded = Dense(input_dim, activation='sigmoid',
                    kernel_initializer=initializer_weights,
                    bias_initializer=initializer_bias)(h)

    # Build autoencoder model
    if params['model'] == 'cvae':
        encoder = Model([x_input, cond_input], encoded)
        decoder = Model([decoder_input, cond_input], decoded)
        model = Model([x_input, cond_input], decoder([z, cond_input]))
        loss = vae_loss
        metrics = [xent, corr, mse]
    elif params['model'] == 'vae':
        encoder = Model(x_input, encoded)
        decoder = Model(decoder_input, decoded)
        model = Model(x_input, decoder(z))
        loss = vae_loss
        metrics = [xent, corr, mse]
    else:
        encoder = Model(x_input, encoded)
        decoder = Model(decoder_input, decoded)
        model = Model(x_input, decoder(encoded))
        loss = params['loss']
        metrics = [xent, corr]

    model.summary()
    decoder.summary()

    if params['cp']:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

    # Define optimizer
    # optimizer = candle.build_optimizer(params['optimizer'],
    #                                    params['learning_rate'],
    #                                    keras_defaults)
    optimizer = optimizers.deserialize({'class_name': params['optimizer'], 'config': {}})
    base_lr = params['base_lr'] or K.get_value(optimizer.lr)
    if params['learning_rate']:
        K.set_value(optimizer.lr, params['learning_rate'])

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    # Calculate trainable and non-trainable params
    params.update(candle.compute_trainable_params(model))

    def warmup_scheduler(epoch):
        lr = params['learning_rate'] or base_lr * params['batch_size'] / 100
        if epoch <= 5:
            K.set_value(model.optimizer.lr, (base_lr * (5 - epoch) + lr * epoch) / 5)
        p1b1.logger.debug('Epoch {}: lr={}'.format(epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                                  patience=5, min_lr=0.00001)
    warmup_lr = LearningRateScheduler(warmup_scheduler)
    checkpointer = ModelCheckpoint(params['save_path'] + ext + '.weights.h5',
                                   save_best_only=True, save_weights_only=True)
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))
    candle_monitor = candle.CandleRemoteMonitor(params=params)
    timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
    history_logger = LoggingCallback(p1b1.logger.debug)

    callbacks = [candle_monitor, timeout_monitor, history_logger]
    if params['reduce_lr']:
        callbacks.append(reduce_lr)
    if params['warmup_lr']:
        callbacks.append(warmup_lr)
    if params['cp']:
        callbacks.append(checkpointer)
    if params['tb']:
        callbacks.append(tensorboard)

    x_val2 = np.copy(x_val)
    np.random.shuffle(x_val2)
    start_scores = p1b1.evaluate_autoencoder(x_val, x_val2)
    p1b1.logger.info('\nBetween random pairs of validation samples: {}'.format(start_scores))

    if params['model'] == 'cvae':
        inputs = [x_train, cond_train]
        val_inputs = [x_val, cond_val]
        test_inputs = [x_test, cond_test]
    else:
        inputs = x_train
        val_inputs = x_val
        test_inputs = x_test

    outputs = x_train
    val_outputs = x_val
    test_outputs = x_test

    history = model.fit(inputs, outputs,
                        verbose=2,
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        callbacks=callbacks,
                        validation_data=(val_inputs, val_outputs))

    if params['cp']:
        encoder.save(prefix + '.encoder.h5')
        decoder.save(prefix + '.decoder.h5')

    candle.plot_history(prefix, history, 'loss')
    candle.plot_history(prefix, history, 'corr', 'streaming pearson correlation')

    # Evaluate model on test set
    x_pred = model.predict(test_inputs)
    scores = p1b1.evaluate_autoencoder(x_pred, x_test)
    p1b1.logger.info('\nEvaluation on test data: {}'.format(scores))

    x_test_encoded = encoder.predict(test_inputs, batch_size=params['batch_size'])
    y_test_classes = np.argmax(y_test, axis=1)
    candle.plot_scatter(x_test_encoded, y_test_classes, prefix + '.latent')

    if params['tsne']:
        tsne = TSNE(n_components=2, random_state=seed)
        x_test_encoded_tsne = tsne.fit_transform(x_test_encoded)
        candle.plot_scatter(x_test_encoded_tsne, y_test_classes, prefix + '.latent.tsne')

    # diff = x_pred - x_test
    # plt.hist(diff.ravel(), bins='auto')
    # plt.title("Histogram of Errors with 'auto' bins")
    # plt.savefig('histogram_keras.png')

    # Generate synthetic data
    # epsilon_std = 1.0
    # for i in range(1000):
    #     z_sample = np.random.normal(size=(1, 2)) * epsilon_std
    #     x_decoded = decoder.predict(z_sample)

    p1b1.logger.handlers = []

    return history
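
# Standalone numpy sketch (illustrative, not used by the benchmark) of the
# reparameterization step in `sampling` above: z = mu + exp(log_var/2) * eps
# keeps the stochastic latent differentiable w.r.t. z_mean and z_log_var.
def _demo_reparameterize(batch=4, latent_dim=2, epsilon_std=1.0):
    import numpy as np
    z_mean = np.zeros((batch, latent_dim))      # placeholder encoder outputs
    z_log_var = np.zeros((batch, latent_dim))
    epsilon = np.random.normal(0.0, epsilon_std, size=(batch, latent_dim))
    return z_mean + np.exp(z_log_var / 2) * epsilon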
def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = attn.extension_from_parameters(params, 'keras')
    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix + '.log'
    root_fname = 'Agg_attn_bin'
    candle.set_up_logger(logfile, attn.logger, params['verbose'])
    attn.logger.info('Params: {}'.format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    ##
    X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed)

    # move this inside the load_data function
    Y_train = _Y_train['AUC']
    Y_test = _Y_test['AUC']
    Y_val = _Y_val['AUC']

    Y_train_neg, Y_train_pos = np.bincount(Y_train)
    Y_test_neg, Y_test_pos = np.bincount(Y_test)
    Y_val_neg, Y_val_pos = np.bincount(Y_val)

    Y_train_total = Y_train_neg + Y_train_pos
    Y_test_total = Y_test_neg + Y_test_pos
    Y_val_total = Y_val_neg + Y_val_pos

    total = Y_train_total + Y_test_total + Y_val_total
    neg = Y_train_neg + Y_test_neg + Y_val_neg
    pos = Y_train_pos + Y_test_pos + Y_val_pos

    print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total))

    nb_classes = params['dense'][-1]

    Y_train = np_utils.to_categorical(Y_train, nb_classes)
    Y_test = np_utils.to_categorical(Y_test, nb_classes)
    Y_val = np_utils.to_categorical(Y_val, nb_classes)

    y_integers = np.argmax(Y_train, axis=1)
    class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers)
    d_class_weights = dict(enumerate(class_weights))

    print('X_train shape:', X_train.shape)
    print('X_val shape:', X_val.shape)
    print('X_test shape:', X_test.shape)
    print('Y_train shape:', Y_train.shape)
    print('Y_val shape:', Y_val.shape)
    print('Y_test shape:', Y_test.shape)

    # Save the data
    # data_train = {"X": X_train, "y": Y_train}
    # data_val = {"X": X_val, "y": Y_val}
    # data_test = {"X": X_test, "y": Y_test}

    h5f = h5py.File('training_attn.h5', 'w')
    h5f.create_dataset('X_train', data=X_train)
    h5f.create_dataset('Y_train', data=Y_train)
    h5f.create_dataset('X_val', data=X_val)
    h5f.create_dataset('Y_val', data=Y_val)
    h5f.create_dataset('X_test', data=X_test)
    h5f.create_dataset('Y_test', data=Y_test)
    h5f.close()
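
# Hypothetical read-back sketch (not in the benchmark): the file name and
# dataset keys match the writer above, so the saved arrays can be reloaded
# in later runs without repeating attn.load_data.
def _demo_load_saved_data(fname='training_attn.h5'):
    import h5py
    with h5py.File(fname, 'r') as h5f:
        X_train = h5f['X_train'][:]   # [:] materializes the dataset as a numpy array
        Y_train = h5f['Y_train'][:]
    return X_train, Y_train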
def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = attn.extension_from_parameters(params, "keras")
    candle.verify_path(params["save_path"])
    prefix = "{}{}".format(params["save_path"], ext)
    logfile = params["logfile"] if params["logfile"] else prefix + ".log"
    root_fname = "Agg_attn_bin"
    candle.set_up_logger(logfile, attn.logger, params["verbose"])
    attn.logger.info("Params: {}".format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    ##
    X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed)

    # move this inside the load_data function
    Y_train = _Y_train["AUC"]
    Y_test = _Y_test["AUC"]
    Y_val = _Y_val["AUC"]

    Y_train_neg, Y_train_pos = np.bincount(Y_train)
    Y_test_neg, Y_test_pos = np.bincount(Y_test)
    Y_val_neg, Y_val_pos = np.bincount(Y_val)

    Y_train_total = Y_train_neg + Y_train_pos
    Y_test_total = Y_test_neg + Y_test_pos
    Y_val_total = Y_val_neg + Y_val_pos

    total = Y_train_total + Y_test_total + Y_val_total
    neg = Y_train_neg + Y_test_neg + Y_val_neg
    pos = Y_train_pos + Y_test_pos + Y_val_pos

    print("Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n".format(
        total, pos, 100 * pos / total))

    nb_classes = params["dense"][-1]

    Y_train = np_utils.to_categorical(Y_train, nb_classes)
    Y_test = np_utils.to_categorical(Y_test, nb_classes)
    Y_val = np_utils.to_categorical(Y_val, nb_classes)

    y_integers = np.argmax(Y_train, axis=1)
    class_weights = compute_class_weight("balanced", np.unique(y_integers), y_integers)
    d_class_weights = dict(enumerate(class_weights))

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("Y_train shape:", Y_train.shape)
    print("Y_test shape:", Y_test.shape)

    PS = X_train.shape[1]
    model = build_attention_model(params, PS)

    # parallel_model = multi_gpu_model(model, gpus=4)
    # parallel_model.compile(loss='mean_squared_error',
    #                        optimizer=SGD(lr=0.0001, momentum=0.9),
    #                        metrics=['mae', r2])

    kerasDefaults = candle.keras_default_config()
    if params["momentum"]:
        kerasDefaults["momentum_sgd"] = params["momentum"]

    optimizer = candle.build_optimizer(params["optimizer"],
                                       params["learning_rate"],
                                       kerasDefaults)

    model.compile(
        loss=params["loss"],
        optimizer=optimizer,
        # optimizer=SGD(lr=0.00001, momentum=0.9),
        # optimizer=Adam(lr=0.00001),
        # optimizer=RMSprop(lr=0.0001),
        # optimizer=Adadelta(),
        metrics=[
            "acc",
            tf.keras.metrics.AUC(name="auroc", curve="ROC"),
            tf.keras.metrics.AUC(name="aucpr", curve="PR"),
        ],
    )

    # Set up a bunch of callbacks to do work during model training
    checkpointer = ModelCheckpoint(
        filepath=params["save_path"] + root_fname + ".autosave.model.h5",
        verbose=1,
        save_weights_only=False,
        save_best_only=True,
    )
    csv_logger = CSVLogger("{}/{}.training.log".format(params["save_path"], root_fname))
    reduce_lr = ReduceLROnPlateau(
        monitor="val_auroc",
        factor=0.20,
        patience=40,
        verbose=1,
        mode="auto",
        min_delta=0.0001,
        cooldown=3,
        min_lr=0.000000001,
    )
    early_stop = EarlyStopping(monitor="val_auroc", patience=200, verbose=1, mode="auto")
    candle_monitor = candle.CandleRemoteMonitor(params=params)  # was duplicated
    timeout_monitor = candle.TerminateOnTimeOut(params["timeout"])
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))
    history_logger = LoggingCallback(attn.logger.debug)

    callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger]
    if params["reduce_lr"]:
        callbacks.append(reduce_lr)
    if params["use_cp"]:
        callbacks.append(checkpointer)
    if params["use_tb"]:
        callbacks.append(tensorboard)
    if params["early_stop"]:
        callbacks.append(early_stop)

    epochs = params["epochs"]
    batch_size = params["batch_size"]
    history = model.fit(
        X_train,
        Y_train,
        class_weight=d_class_weights,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_val, Y_val),
        callbacks=callbacks,
    )

    # Diagnostic plots
    if "loss" in history.history.keys():
        candle.plot_history(params["save_path"] + root_fname, history, "loss")
    if "acc" in history.history.keys():
        candle.plot_history(params["save_path"] + root_fname, history, "acc")
    if "auroc" in history.history.keys():
        candle.plot_history(params["save_path"] + root_fname, history, "auroc")
    # key is "aucpr" (the metric name set above); the original checked "auprc"
    if "aucpr" in history.history.keys():
        candle.plot_history(params["save_path"] + root_fname, history, "aucpr")

    # Evaluate model
    score = model.evaluate(X_test, Y_test, verbose=0)
    Y_predict = model.predict(X_test)
    evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score)
    save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test)

    attn.logger.handlers = []

    return history
def run(params):
    args = candle.ArgumentStruct(**params)
    candle.set_seed(args.rng_seed)
    logfile_def = 'uno_infer_from_' + args.uq_infer_file + '.log'
    logfile = args.logfile if args.logfile else logfile_def
    uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose)
    logger.info('Params: {}'.format(params))

    ext = uno.extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path + 'uno' + ext

    # Load trained model
    candle.register_permanent_dropout()
    model = keras.models.load_model(args.model_file, compile=False)
    model.load_weights(args.weights_file)
    logger.info('Loaded model:')
    model.summary(print_fn=logger.info)

    # Determine output to infer
    target = args.agg_dose or 'Growth'

    if (args.uq_infer_given_drugs or args.uq_infer_given_cells or args.uq_infer_given_indices):
        loader = uno_combined_data_loader.CombinedDataLoader(args.rng_seed)
        loader.load(cache=args.cache,
                    ncols=args.feature_subsample,
                    agg_dose=args.agg_dose,
                    cell_features=args.cell_features,
                    drug_features=args.drug_features,
                    drug_median_response_min=args.drug_median_response_min,
                    drug_median_response_max=args.drug_median_response_max,
                    use_landmark_genes=args.use_landmark_genes,
                    use_filtered_genes=args.use_filtered_genes,
                    cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path,
                    drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path,
                    preprocess_rnaseq=args.preprocess_rnaseq,
                    single=args.single,
                    train_sources=args.train_sources,
                    test_sources=args.test_sources,
                    embed_feature_source=not args.no_feature_source,
                    encode_response_source=not args.no_response_source,
                    )

        if args.uq_infer_given_drugs:
            test_gen = given_drugs(args, loader)
        elif args.uq_infer_given_cells:
            test_gen = given_cells(args, loader)
        else:
            test_gen = given_indices(args, loader)
    else:
        test_gen = from_file(args, model)

    df_test = test_gen.get_response(copy=True)
    y_test = df_test[target].values

    for i in range(args.n_pred):

        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single)
            y_test_pred = model.predict(x_test_list, batch_size=args.batch_size)
        else:
            test_gen.reset()
            y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps)
            y_test_pred = y_test_pred[:test_gen.size]

        if args.loss == 'heteroscedastic':
            y_test_pred_ = y_test_pred[:, 0]
            s_test_pred = y_test_pred[:, 1]
            y_test_pred = y_test_pred_.flatten()
            df_test['Predicted_' + target + '_' + str(i + 1)] = y_test_pred
            df_test['Pred_S_' + target + '_' + str(i + 1)] = s_test_pred
            pred_fname = prefix + '.predicted_INFER_HET.tsv'
        elif args.loss == 'quantile':
            y_test_pred_50q = y_test_pred[:, 0]
            y_test_pred_10q = y_test_pred[:, 1]
            y_test_pred_90q = y_test_pred[:, 2]
            y_test_pred = y_test_pred_50q.flatten()  # 50th quantile prediction
            df_test['Predicted_50q_' + target + '_' + str(i + 1)] = y_test_pred
            df_test['Predicted_10q_' + target + '_' + str(i + 1)] = y_test_pred_10q.flatten()
            df_test['Predicted_90q_' + target + '_' + str(i + 1)] = y_test_pred_90q.flatten()
            pred_fname = prefix + '.predicted_INFER_QTL.tsv'
        else:
            y_test_pred = y_test_pred.flatten()
            df_test['Predicted_' + target + '_' + str(i + 1)] = y_test_pred
            pred_fname = prefix + '.predicted_INFER.tsv'

        if args.n_pred < 21:
            scores = uno.evaluate_prediction(y_test, y_test_pred)
            uno.log_evaluation(scores, logger)

    df_pred = df_test
    if args.agg_dose:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', target], inplace=True)
        else:
            df_pred.sort_values(['Sample', 'Drug1', 'Drug2', target], inplace=True)
    else:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True)
        else:
            df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')
    logger.info('Predictions stored in file: {}'.format(pred_fname))

    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []
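
# Hypothetical post-processing sketch (not in the benchmark): the repeated
# per-run columns written above ('Predicted_<target>_<i>' in the
# homoscedastic case) can be collapsed into a mean and spread, a common
# summary for repeated UQ inference. File and target names are illustrative.
def _demo_summarize_uq(pred_fname, target='Growth'):
    import pandas as pd
    df = pd.read_csv(pred_fname, sep='\t')
    pred_cols = [c for c in df.columns if c.startswith('Predicted_' + target + '_')]
    df['Pred_mean_' + target] = df[pred_cols].mean(axis=1)
    df['Pred_std_' + target] = df[pred_cols].std(axis=1)
    return df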
def run(params):
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = adrp.extension_from_parameters(params, ".keras")
    candle.verify_path(params["save_path"])
    prefix = "{}{}".format(params["save_path"], ext)
    logfile = params["logfile"] if params["logfile"] else prefix + ".log"
    candle.set_up_logger(logfile, adrp.logger, params["verbose"])
    adrp.logger.info("Params: {}".format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    ##
    X_train, Y_train, X_test, Y_test, PS = adrp.load_data(params, seed)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("Y_train shape:", Y_train.shape)
    print("Y_test shape:", Y_test.shape)

    # Initialize weights and learning rule
    initializer_weights = candle.build_initializer(params["initialization"], keras_defaults, seed)
    initializer_bias = candle.build_initializer("constant", keras_defaults, 0.0)

    activation = params["activation"]

    # TODO: set output_dim
    output_dim = 1

    # TODO: Use dense_layers for creating inputs/outputs
    dense_layers = params["dense"]

    inputs = Input(shape=(PS,))

    if dense_layers is not None:
        if not isinstance(dense_layers, list):
            dense_layers = list(dense_layers)
        for i, l in enumerate(dense_layers):
            if i == 0:
                x = Dense(
                    l,
                    activation=activation,
                    kernel_initializer=initializer_weights,
                    bias_initializer=initializer_bias,
                )(inputs)
            else:
                x = Dense(
                    l,
                    activation=activation,
                    kernel_initializer=initializer_weights,
                    bias_initializer=initializer_bias,
                )(x)
            if params["dropout"]:
                x = Dropout(params["dropout"])(x)
        output = Dense(
            output_dim,
            activation=activation,
            kernel_initializer=initializer_weights,
            bias_initializer=initializer_bias,
        )(x)
    else:
        output = Dense(
            output_dim,
            activation=activation,
            kernel_initializer=initializer_weights,
            bias_initializer=initializer_bias,
        )(inputs)

    model = Model(inputs=inputs, outputs=output)

    model.summary()

    kerasDefaults = candle.keras_default_config()
    if params["momentum"]:
        kerasDefaults["momentum_sgd"] = params["momentum"]

    optimizer = candle.build_optimizer(params["optimizer"], params["learning_rate"], kerasDefaults)

    model.compile(
        loss=params["loss"],
        optimizer=optimizer,
        metrics=["mae", r2],
    )

    # Set up a bunch of callbacks to do work during model training
    checkpointer = ModelCheckpoint(
        filepath=params["save_path"] + "agg_adrp.autosave.model.h5",
        verbose=1,
        save_weights_only=False,
        save_best_only=True,
    )
    csv_logger = CSVLogger(params["save_path"] + "agg_adrp.training.log")
    reduce_lr = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.75,
        patience=20,
        mode="auto",
        epsilon=0.0001,  # older Keras spelling of min_delta
        cooldown=3,
        min_lr=0.000000001,
    )
    early_stop = EarlyStopping(monitor="val_loss", patience=100, verbose=1, mode="auto")

    # history = parallel_model.fit(X_train, Y_train,
    epochs = params["epochs"]
    batch_size = params["batch_size"]
    history = model.fit(
        X_train,
        Y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_test, Y_test),
        callbacks=[checkpointer, csv_logger, reduce_lr, early_stop],
    )

    score = model.evaluate(X_test, Y_test, verbose=0)
    print(score)
    print(history.history.keys())

    # See the big function below; creates plots etc.
    # TODO: Break post_process into multiple functions
    post_process(params, X_train, X_test, Y_test, score, history, model)

    adrp.logger.handlers = []

    return history
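
# Typical CANDLE-style entry point sketch; initialize_parameters() is
# assumed to exist in this file, as it does in the other baselines
# (an assumption, not confirmed by the excerpt above).
def main():
    params = initialize_parameters()
    run(params)


if __name__ == '__main__':
    main()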