def run(params):
    """Train and evaluate the combo model, optionally over several CV folds.

    Args:
        params: dict of run parameters. It is expanded into a ``Struct`` for
            attribute access, read directly for ``'timeout'``, and updated in
            place with the result of ``candle.compute_trainable_params``.

    Returns:
        The history object returned by the last ``fit``/``fit_generator``
        call.
    """
    args = Struct(**params)
    set_seed(args.rng_seed)
    ext = extension_from_parameters(args)
    verify_path(args.save_path)
    prefix = args.save_path + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    set_up_logger(logfile, args.verbose)
    logger.info('Params: {}'.format(params))

    loader = ComboDataLoader(seed=args.rng_seed,
                             val_split=args.validation_split,
                             cell_features=args.cell_features,
                             drug_features=args.drug_features,
                             use_mean_growth=args.use_mean_growth,
                             response_url=args.response_url,
                             use_landmark_genes=args.use_landmark_genes,
                             preprocess_rnaseq=args.preprocess_rnaseq,
                             exclude_cells=args.exclude_cells,
                             exclude_drugs=args.exclude_drugs,
                             use_combo_score=args.use_combo_score,
                             cv_partition=args.cv_partition,
                             cv=args.cv)

    train_gen = ComboDataGenerator(loader, batch_size=args.batch_size).flow()
    val_gen = ComboDataGenerator(loader,
                                 partition='val',
                                 batch_size=args.batch_size).flow()

    train_steps = int(loader.n_train / args.batch_size)
    val_steps = int(loader.n_val / args.batch_size)

    # This first model is only used for the summary and the JSON dump below;
    # every fold rebuilds its own model inside the loop.
    model = build_model(loader, args, verbose=True)
    model.summary()

    if args.cp:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

    def warmup_scheduler(epoch):
        # `model` and `base_lr` are resolved when the callback fires, so the
        # scheduler always sees the values bound by the current fold.
        lr = args.learning_rate or base_lr * args.batch_size / 100
        if epoch <= 5:
            # Linear ramp from base_lr (epoch 0) to lr (epoch 5).
            K.set_value(model.optimizer.lr,
                        (base_lr * (5 - epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={}'.format(epoch,
                                              K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

    # A while loop (not ``for``) because a fold whose validation loss is too
    # high is retrained without advancing the fold counter.
    fold = 0
    while fold < cv:
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv))
            cv_ext = '.cv{}'.format(fold + 1)

        weights_path = prefix + cv_ext + '.weights.h5'

        model = build_model(loader, args)

        optimizer = optimizers.deserialize({
            'class_name': args.optimizer,
            'config': {}
        })
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2])

        # calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=5,
                                      min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        checkpointer = ModelCheckpoint(weights_path,
                                       save_best_only=True,
                                       save_weights_only=True)
        tensorboard = TensorBoard(log_dir="tb/tb{}{}".format(ext, cv_ext))
        history_logger = LoggingCallback(logger.debug)
        model_recorder = ModelRecorder()

        callbacks = [
            candle_monitor, timeout_monitor, history_logger, model_recorder
        ]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)

        if args.gen:
            history = model.fit_generator(train_gen,
                                          train_steps,
                                          epochs=args.epochs,
                                          callbacks=callbacks,
                                          validation_data=val_gen,
                                          validation_steps=val_steps)
            fold += 1
        else:
            if args.cv > 1:
                (x_train_list, y_train, x_val_list, y_val, df_train,
                 df_val) = loader.load_data_cv(fold)
            else:
                (x_train_list, y_train, x_val_list, y_val, df_train,
                 df_val) = loader.load_data()

            # Baseline: score y_val against a shuffled copy of itself.
            y_shuf = np.random.permutation(y_val)
            log_evaluation(evaluate_prediction(y_val, y_shuf),
                           description='Between random pairs in y_val:')
            history = model.fit(x_train_list,
                                y_train,
                                batch_size=args.batch_size,
                                shuffle=args.shuffle,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))

        if args.cp:
            # Restore the best weights saved by the checkpoint callback.
            model.load_weights(weights_path)

        if not args.gen:
            y_val_pred = model.predict(x_val_list,
                                       batch_size=args.batch_size).flatten()
            scores = evaluate_prediction(y_val, y_val_pred)
            if args.cv > 1 and scores[args.loss] > args.max_val_loss:
                logger.warning(
                    'Best val_loss {} is greater than {}; retrain the model...'
                    .format(scores[args.loss], args.max_val_loss))
                # Release the rejected attempt's graph before rebuilding,
                # exactly as is done at the end of an accepted fold.
                if K.backend() == 'tensorflow':
                    K.clear_session()
                continue
            fold += 1
            log_evaluation(scores)
            # Work on an explicit copy so the new columns are not written
            # into a view of the loader's frame.
            df_val = df_val.copy()
            df_val['GROWTH_PRED'] = y_val_pred
            df_val['GROWTH_ERROR'] = y_val_pred - y_val
            df_pred_list.append(df_val)

        if args.cp:
            # NOTE(review): this path has no cv_ext, so with cv > 1 every
            # fold overwrites the same '.model.h5' file.
            model_recorder.best_model.save(prefix + '.model.h5')

        candle.plot_history(prefix, history, 'loss')
        candle.plot_history(prefix, history, 'r2')

        if K.backend() == 'tensorflow':
            K.clear_session()

    if not args.gen:
        if args.use_combo_score:
            pred_fname = prefix + '.predicted.score.tsv'
        elif args.use_mean_growth:
            pred_fname = prefix + '.predicted.mean.growth.tsv'
        else:
            pred_fname = prefix + '.predicted.growth.tsv'
        df_pred = pd.concat(df_pred_list)
        df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')

    # Drop the handlers installed by set_up_logger.
    logger.handlers = []

    return history
# Example #2 (0)
def run(params):
    """Train and evaluate the attention model with an abstention output.

    Args:
        params: dict of run parameters, read by key throughout (for example
            ``'save_path'``, ``'dense'``, ``'optimizer'``, ``'epochs'``).

    Returns:
        The history object returned by ``model.fit``.
    """
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = extension_from_parameters(params, 'keras')
    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix+'.log'
    root_fname = 'Agg_attn_abs_bin'
    candle.set_up_logger(logfile, attn.logger, params['verbose'])
    attn.logger.info('Params: {}'.format(params))

    X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(params, seed)

    # move this inside the load_data function
    Y_train = _Y_train['AUC']
    Y_test = _Y_test['AUC']
    Y_val = _Y_val['AUC']

    # The two-way unpacking requires each label vector to yield exactly two
    # bins from np.bincount.
    Y_train_neg, Y_train_pos = np.bincount(Y_train)
    Y_test_neg, Y_test_pos = np.bincount(Y_test)
    Y_val_neg, Y_val_pos = np.bincount(Y_val)

    Y_train_total = Y_train_neg + Y_train_pos
    Y_test_total = Y_test_neg + Y_test_pos
    Y_val_total = Y_val_neg + Y_val_pos

    total = Y_train_total + Y_test_total + Y_val_total
    pos = Y_train_pos + Y_test_pos + Y_val_pos

    print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
        total, pos, 100 * pos / total))

    nb_classes = params['dense'][-1]

    # Convert classes to categorical with an extra slot for the abstaining class
    Y_train, Y_test, Y_val = candle.modify_labels(nb_classes+1, Y_train, Y_test, Y_val)

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    PS = X_train.shape[1]
    model = build_attention_model(params, PS)
    model = candle.add_model_output(model, mode='abstain', num_add=1, activation='sigmoid')
    print('Model after modifying layer for abstention')
    model.summary()

    # Configure abstention model: the mask selects the last (abstention) slot.
    mask_ = np.zeros(nb_classes+1)
    mask_[-1] = 1
    # In the long term mu0 is not as important since mu auto tunes, however
    # it may require a large number of epochs to converge if set far away
    # from target.
    mu0 = 0.5

    candle.abstention_variable_initialization(mu0, mask_, nb_classes)

    # Default optimizer settings, with the SGD momentum overridden on request.
    keras_defaults = candle.keras_default_config()
    if params['momentum']:
        keras_defaults['momentum_sgd'] = params['momentum']

    optimizer = candle.build_optimizer(params['optimizer'], params['learning_rate'], keras_defaults)

    # compile model with abstention loss
    model.compile(loss=candle.abstention_loss, optimizer=optimizer, metrics=['acc',tf_auc,candle.abs_acc,candle.acc_class1,candle.abs_acc_class1])

    # set up a bunch of callbacks to do work during model training..
    checkpointer = ModelCheckpoint(filepath=params['save_path'] + root_fname + '.autosave.model.h5', verbose=1, save_weights_only=False, save_best_only=True)
    csv_logger = CSVLogger('{}/{}.training.log'.format(params['save_path'], root_fname))
    # AUC is a higher-is-better metric, so the direction is stated
    # explicitly: with mode='auto' Keras versions that only look for 'acc'
    # in the monitor name would treat 'val_tf_auc' as a quantity to minimise.
    reduce_lr = ReduceLROnPlateau(monitor='val_tf_auc', factor=0.20, patience=40, verbose=1, mode='max', min_delta=0.0001, cooldown=3, min_lr=0.000000001)
    early_stop = EarlyStopping(monitor='val_tf_auc', patience=200, verbose=1, mode='max')

    candle_monitor = candle.CandleRemoteMonitor(params=params)
    timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))

    history_logger = candle.LoggingCallback(attn.logger.debug)

    abstention_cbk = candle.AbstentionAdapt_Callback(monitor='val_abs_acc_class1', scale_factor=params['abs_scale_factor'], target_acc=params['target_abs_acc'])

    callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger, abstention_cbk]

    if params['reduce_lr']:
        callbacks.append(reduce_lr)

    if params['use_cp']:
        callbacks.append(checkpointer)
    if params['use_tb']:
        callbacks.append(tensorboard)
    if params['early_stop']:
        callbacks.append(early_stop)

    epochs = params['epochs']
    batch_size = params['batch_size']
    history = model.fit(X_train, Y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(X_val, Y_val),
                        callbacks=callbacks)

    # diagnostic plots, one per metric that was actually recorded
    plot_prefix = params['save_path'] + root_fname
    for metric in ('loss', 'acc', 'abs_acc'):
        if metric in history.history:
            candle.plot_history(plot_prefix, history, metric)

    # Plot mu evolution
    fname = params['save_path'] + root_fname + '.mu.png'
    xlabel = 'Epochs'
    ylabel = 'Abstention Weight mu'
    title = 'mu Evolution'
    attnviz.plot_array(abstention_cbk.muvalues, xlabel, ylabel, title, fname)

    # Evaluate model
    score = model.evaluate(X_test, Y_test, verbose=0)
    Y_predict = model.predict(X_test)
    evaluate_abstention(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict, pos, total, score)

    save_and_test_saved_model(params, model, root_fname, X_train, X_test, Y_test)

    # Drop the handlers installed by candle.set_up_logger.
    attn.logger.handlers = []

    return history
# Example #3 (0)
def run(params):
    """Build, train and evaluate an autoencoder ('ae', 'vae' or 'cvae').

    Args:
        params: dict of run parameters, read by key throughout (for example
            ``'model'``, ``'dense'``, ``'latent_dim'``, ``'epochs'``). It is
            updated in place with the result of
            ``candle.compute_trainable_params``.

    Returns:
        The history object returned by ``model.fit``.
    """
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = p1b1.extension_from_parameters(params, '.keras')
    candle.verify_path(params['save_path'])
    prefix = '{}{}'.format(params['save_path'], ext)
    logfile = params['logfile'] if params['logfile'] else prefix+'.log'
    candle.set_up_logger(logfile, p1b1.logger, params['verbose'])
    p1b1.logger.info('Params: {}'.format(params))

    # Get default parameters for initialization and optimizer functions
    keras_defaults = candle.keras_default_config()

    # Load dataset
    x_train, y_train, x_val, y_val, x_test, y_test, x_labels, y_labels = p1b1.load_data(params, seed)

    p1b1.logger.info("Shape x_train: {}".format(x_train.shape))
    p1b1.logger.info("Shape x_val:   {}".format(x_val.shape))
    p1b1.logger.info("Shape x_test:  {}".format(x_test.shape))

    p1b1.logger.info("Range x_train: [{:.3g}, {:.3g}]".format(np.min(x_train), np.max(x_train)))
    p1b1.logger.info("Range x_val:   [{:.3g}, {:.3g}]".format(np.min(x_val), np.max(x_val)))
    p1b1.logger.info("Range x_test:  [{:.3g}, {:.3g}]".format(np.min(x_test), np.max(x_test)))

    p1b1.logger.debug('Class labels')
    for i, label in enumerate(y_labels):
        p1b1.logger.debug('  {}: {}'.format(i, label))

    # The label arrays double as the conditioning input of the 'cvae' model.
    cond_train = y_train
    cond_val = y_val
    cond_test = y_test

    input_dim = x_train.shape[1]
    cond_dim = cond_train.shape[1]
    latent_dim = params['latent_dim']

    activation = params['activation']
    dropout = params['dropout']
    dense_layers = params['dense']
    dropout_layer = AlphaDropout if params['alpha_dropout'] else Dropout

    # Initialize weights and learning rule
    initializer_weights = candle.build_initializer(params['initialization'], keras_defaults, seed)
    initializer_bias = candle.build_initializer('constant', keras_defaults, 0.)

    # Normalise `dense` to a list: None -> no hidden layers, any iterable ->
    # list of its items, a single scalar -> one-element list (calling list()
    # on a scalar would raise TypeError).
    if dense_layers is None:
        dense_layers = []
    elif not isinstance(dense_layers, list):
        try:
            dense_layers = list(dense_layers)
        except TypeError:
            dense_layers = [dense_layers]

    def dense_stack(h, layer_sizes):
        """Apply one Dense block per positive entry of layer_sizes to h."""
        for units in layer_sizes:
            if units > 0:
                shortcut = h
                h = Dense(units, activation=activation,
                          kernel_initializer=initializer_weights,
                          bias_initializer=initializer_bias)(h)
                if params['residual']:
                    # Best effort: when the add raises ValueError the skip
                    # connection is simply left out for this layer.
                    try:
                        h = keras.layers.add([h, shortcut])
                    except ValueError:
                        pass
                if params['batch_normalization']:
                    h = BatchNormalization()(h)
                if dropout > 0:
                    h = dropout_layer(dropout)(h)
        return h

    # Encoder Part
    x_input = Input(shape=(input_dim,))
    cond_input = Input(shape=(cond_dim,))
    h = x_input
    if params['model'] == 'cvae':
        h = keras.layers.concatenate([x_input, cond_input])

    h = dense_stack(h, dense_layers)

    if params['model'] == 'ae':
        encoded = Dense(latent_dim, activation=activation,
                        kernel_initializer=initializer_weights,
                        bias_initializer=initializer_bias)(h)
    else:
        epsilon_std = params['epsilon_std']
        z_mean = Dense(latent_dim, name='z_mean')(h)
        z_log_var = Dense(latent_dim, name='z_log_var')(h)
        encoded = z_mean

        def vae_loss(x, x_decoded_mean):
            # Reconstruction cross-entropy plus the KL term scaled by
            # 1/input_dim, averaged over the batch.
            xent_loss = binary_crossentropy(x, x_decoded_mean)
            kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
            return K.mean(xent_loss + kl_loss/input_dim)

        def sampling(z_args):
            # z = mean + exp(log_var / 2) * eps, with eps drawn from a
            # normal distribution of stddev epsilon_std.
            z_mean_, z_log_var_ = z_args
            batch_size = K.shape(z_mean_)[0]
            epsilon = K.random_normal(shape=(batch_size, latent_dim),
                                      mean=0., stddev=epsilon_std)
            return z_mean_ + K.exp(z_log_var_ / 2) * epsilon

        z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
        if params['model'] == 'cvae':
            # NOTE(review): z_cond is never used below. It is kept because
            # removing the layer may shift auto-generated layer names;
            # confirm before deleting.
            z_cond = keras.layers.concatenate([z, cond_input])

    # Decoder Part
    decoder_input = Input(shape=(latent_dim,))
    h = decoder_input
    if params['model'] == 'cvae':
        h = keras.layers.concatenate([decoder_input, cond_input])

    # The decoder mirrors the encoder: same layer sizes in reverse order.
    h = dense_stack(h, reversed(dense_layers))

    decoded = Dense(input_dim, activation='sigmoid',
                    kernel_initializer=initializer_weights,
                    bias_initializer=initializer_bias)(h)

    # Build autoencoder model
    if params['model'] == 'cvae':
        encoder = Model([x_input, cond_input], encoded)
        decoder = Model([decoder_input, cond_input], decoded)
        model = Model([x_input, cond_input], decoder([z, cond_input]))
        loss = vae_loss
        metrics = [xent, corr, mse]
    elif params['model'] == 'vae':
        encoder = Model(x_input, encoded)
        decoder = Model(decoder_input, decoded)
        model = Model(x_input, decoder(z))
        loss = vae_loss
        metrics = [xent, corr, mse]
    else:
        encoder = Model(x_input, encoded)
        decoder = Model(decoder_input, decoded)
        model = Model(x_input, decoder(encoded))
        loss = params['loss']
        metrics = [xent, corr]

    model.summary()
    decoder.summary()

    if params['cp']:
        model_json = model.to_json()
        with open(prefix+'.model.json', 'w') as f:
            print(model_json, file=f)

    # Define optimizer with its default config; the learning rate is
    # overridden afterwards when one is given.
    optimizer = optimizers.deserialize({'class_name': params['optimizer'], 'config': {}})
    base_lr = params['base_lr'] or K.get_value(optimizer.lr)
    if params['learning_rate']:
        K.set_value(optimizer.lr, params['learning_rate'])

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    # calculate trainable and non-trainable params
    params.update(candle.compute_trainable_params(model))

    def warmup_scheduler(epoch):
        lr = params['learning_rate'] or base_lr * params['batch_size']/100
        if epoch <= 5:
            # Linear ramp from base_lr (epoch 0) to lr (epoch 5).
            K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5)
        p1b1.logger.debug('Epoch {}: lr={}'.format(epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)
    warmup_lr = LearningRateScheduler(warmup_scheduler)
    checkpointer = ModelCheckpoint(prefix+'.weights.h5', save_best_only=True, save_weights_only=True)
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))
    candle_monitor = candle.CandleRemoteMonitor(params=params)
    timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])
    history_logger = LoggingCallback(p1b1.logger.debug)

    callbacks = [candle_monitor, timeout_monitor, history_logger]
    if params['reduce_lr']:
        callbacks.append(reduce_lr)
    if params['warmup_lr']:
        callbacks.append(warmup_lr)
    if params['cp']:
        callbacks.append(checkpointer)
    if params['tb']:
        callbacks.append(tensorboard)

    # Baseline: score the validation set against a row-shuffled copy of itself.
    x_val2 = np.copy(x_val)
    np.random.shuffle(x_val2)
    start_scores = p1b1.evaluate_autoencoder(x_val, x_val2)
    p1b1.logger.info('\nBetween random pairs of validation samples: {}'.format(start_scores))

    if params['model'] == 'cvae':
        inputs = [x_train, cond_train]
        val_inputs = [x_val, cond_val]
        test_inputs = [x_test, cond_test]
    else:
        inputs = x_train
        val_inputs = x_val
        test_inputs = x_test

    # The autoencoder is trained to reproduce its own input.
    outputs = x_train
    val_outputs = x_val

    history = model.fit(inputs, outputs,
                        verbose=2,
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        callbacks=callbacks,
                        validation_data=(val_inputs, val_outputs))

    if params['cp']:
        encoder.save(prefix+'.encoder.h5')
        decoder.save(prefix+'.decoder.h5')

    candle.plot_history(prefix, history, 'loss')
    candle.plot_history(prefix, history, 'corr', 'streaming pearson correlation')

    # Evaluate model on test set
    x_pred = model.predict(test_inputs)
    scores = p1b1.evaluate_autoencoder(x_pred, x_test)
    p1b1.logger.info('\nEvaluation on test data: {}'.format(scores))

    x_test_encoded = encoder.predict(test_inputs, batch_size=params['batch_size'])
    y_test_classes = np.argmax(y_test, axis=1)
    candle.plot_scatter(x_test_encoded, y_test_classes, prefix+'.latent')

    if params['tsne']:
        tsne = TSNE(n_components=2, random_state=seed)
        x_test_encoded_tsne = tsne.fit_transform(x_test_encoded)
        candle.plot_scatter(x_test_encoded_tsne, y_test_classes, prefix+'.latent.tsne')

    # Drop the handlers installed by candle.set_up_logger.
    p1b1.logger.handlers = []

    return history
# Example #4 (0)
def run(params):
    args = candle.ArgumentStruct(**params)
    candle.set_seed(args.rng_seed)
    ext = uno.extension_from_parameters(args)
    candle.verify_path(args.save_path)
    prefix = args.save_path + 'uno' + ext
    logfile = args.logfile if args.logfile else prefix+'.log'
    uno.set_up_logger(logfile, logger, uno.loggerUno, args.verbose)
    logger.info('Params: {}'.format(params))

    # Exclude drugs / cells for UQ
    if 'uq_exclude_drugs_file' in params.keys():
        args.exclude_drugs = uno.read_IDs_file(args.uq_exclude_drugs_file)
        logger.info('Drugs to exclude: {}'.format(args.exclude_drugs))
    else:
        args.exclude_drugs = []
    if 'uq_exclude_cells_file' in params.keys():
        args.exclude_cells = uno.read_IDs_file(args.uq_exclude_cells_file)
        logger.info('Cells to exclude: {}'.format(args.exclude_cells))
    else:
        args.exclude_cells = []

    if 'uq_exclude_indices_file' in params.keys():
        exclude_indices_ = uno.read_IDs_file(args.uq_exclude_indices_file)
        args.exclude_indices = [int(x) for x in exclude_indices_]
        logger.info('Indices to exclude: {}'.format(args.exclude_indices))
    else:
        args.exclude_indices = []


    if (len(args.gpus) > 0):
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = ",".join(map(str, args.gpus))
        K.set_session(tf.Session(config=config))

    loader = uno_combined_data_loader.CombinedDataLoader(seed=args.rng_seed)
    loader.load(cache=args.cache,
                ncols=args.feature_subsample,
                agg_dose=args.agg_dose,
                cell_features=args.cell_features,
                drug_features=args.drug_features,
                drug_median_response_min=args.drug_median_response_min,
                drug_median_response_max=args.drug_median_response_max,
                use_landmark_genes=args.use_landmark_genes,
                use_filtered_genes=args.use_filtered_genes,
                cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path,
                drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path,
                preprocess_rnaseq=args.preprocess_rnaseq,
                single=args.single,
                train_sources=args.train_sources,
                test_sources=args.test_sources,
                embed_feature_source=not args.no_feature_source,
                encode_response_source=not args.no_response_source,
                )

    target = args.agg_dose or 'Growth'
    val_split = args.val_split
    train_split = 1 - val_split

    loader.partition_data(partition_by=args.partition_by,
                          cv_folds=args.cv, train_split=train_split, val_split=val_split,
                          cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path,
                          exclude_cells=args.exclude_cells,
                          exclude_drugs=args.exclude_drugs,
                          exclude_indices=args.exclude_indices
                          )

    model = uno_model_utils.build_model(loader, args, logger)
    logger.info('Combined model:')
    model.summary(print_fn=logger.info)
    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)

    if args.cp:
        model_json = model.to_json()
        with open(prefix+'.model.json', 'w') as f:
            print(model_json, file=f)

    def warmup_scheduler(epoch):
        lr = args.learning_rate or base_lr * args.batch_size/100
        if epoch <= 5:
            K.set_value(model.optimizer.lr, (base_lr * (5-epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

    for fold in range(cv):
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold+1, cv))
            cv_ext = '.cv{}'.format(fold+1)

#        model = uno_model_utils.build_model(loader, args, logger, silent=True)

        template_model = uno_model_utils.build_model(loader, args, logger, silent=True)
        if args.initial_weights:
            logger.info("Loading weights from {}".format(args.initial_weights))
            template_model.load_weights(args.initial_weights)

        if len(args.gpus) > 1:
            from keras.utils import multi_gpu_model
            gpu_count = len(args.gpus)
            logger.info("Multi GPU with {} gpus".format(gpu_count))
            model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count)
        else:
            model = template_model


        optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}})
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        if args.loss == 'heteroscedastic':
            logger.info('Training heteroscedastic model:')
            model.compile(loss=heteroscedastic_loss, optimizer=optimizer, metrics=[uno_model_utils.mae_heteroscedastic, uno_model_utils.r2_heteroscedastic, uno_model_utils.meanS_heteroscesdastic])
        elif args.loss == 'quantile':
            logger.info('Training quantile model:')
            model.compile(loss=triple_quantile_loss, optimizer=optimizer, metrics=[uno_model_utils.quantile50, uno_model_utils.quantile10, uno_model_utils.quantile90])
        else:
            logger.info('Training homoscedastic model:')
            model.compile(loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2])

        # calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        #checkpointer = ModelCheckpoint(prefix+cv_ext+'.weights.h5', save_best_only=True, save_weights_only=True)
        checkpointer = candle.MultiGPUCheckpoint(prefix + cv_ext + '.model.h5', save_best_only=True)
        tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext))
        history_logger = candle.LoggingCallback(logger.debug)
#        model_recorder = uno_model_utils.ModelRecorder()

        # callbacks = [history_logger, model_recorder]
        callbacks = [candle_monitor, timeout_monitor, history_logger]#, model_recorder]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)
        if args.save_weights:
            callbacks.append(uno_model_utils.SimpleWeightSaver(args.save_path + '/' + args.save_weights))


        train_gen = uno_combined_data_generator.CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle)
        val_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle)

        df_val = val_gen.get_response(copy=True)
        y_val = df_val[target].values
        y_shuf = np.random.permutation(y_val)
        uno.log_evaluation(uno.evaluate_prediction(y_val, y_shuf), logger,
                       description='Between random pairs in y_val:')

        if args.no_gen:
            x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single)
            x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single)
            history = model.fit(x_train_list, y_train,
                                batch_size=args.batch_size,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))
        else:
            logger.info('Data points per epoch: train = %d, val = %d',train_gen.size, val_gen.size)
            logger.info('Steps per epoch: train = %d, val = %d',train_gen.steps, val_gen.steps)
            history = model.fit_generator(train_gen, train_gen.steps,
                                          epochs=args.epochs,
                                          callbacks=callbacks,
                                          validation_data=val_gen,
                                          validation_steps=val_gen.steps)

#        if args.cp:
#            model.load_weights(prefix+cv_ext+'.weights.h5')
        # model = model_recorder.best_model

        if args.no_gen:
            y_val_pred = model.predict(x_val_list, batch_size=args.batch_size)
        else:
            val_gen.reset()
            y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1)
            y_val_pred = y_val_pred[:val_gen.size]

        if args.loss == 'heteroscedastic':
            y_val_pred_ = y_val_pred[:,0]
            s_val_pred = y_val_pred[:,1]

            y_val_pred = y_val_pred_.flatten()

            df_val['Predicted_'+target] = y_val_pred
            df_val[target+'_Error'] = y_val_pred-y_val
            df_val['Pred_S_'+target] = s_val_pred

        elif args.loss == 'quantile':
            y_val_pred_50q = y_val_pred[:,0]
            y_val_pred_10q = y_val_pred[:,1]
            y_val_pred_90q = y_val_pred[:,2]

            y_val_pred = y_val_pred_50q.flatten()   # 50th quantile prediction

            df_val['Predicted_50q_'+target] = y_val_pred
            df_val[target+'_Error_50q'] = y_val_pred-y_val
            df_val['Predicted_10q_'+target] = y_val_pred_10q.flatten()
            df_val['Predicted_90q_'+target] = y_val_pred_90q.flatten()

        else:
            y_val_pred = y_val_pred.flatten()

            # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred-y_val)
            df_val['Predicted'+target] = y_val_pred
            df_val[target+'Error'] = y_val_pred-y_val

        scores = uno.evaluate_prediction(y_val, y_val_pred)
        uno.log_evaluation(scores, logger)

        df_pred_list.append(df_val)

#        if args.cp:
#            model_recorder.best_model.save(prefix+'.model.h5')

        if hasattr(history, 'loss'):
            candle.plot_history(prefix, history, 'loss')
        if args.loss == 'heteroscedastic':
            if hasattr(history, 'r2_heteroscedastic'):
                candle.plot_history(prefix, history, 'r2_heteroscedastic')
            if hasattr(history, 'meanS_heteroscedastic'):
                candle.plot_history(prefix, history, 'meanS_heteroscesdastic')
        elif args.loss == 'quantile':
            if hasattr(history, 'quantile50'):
                candle.plot_history(prefix, history, 'quantile50')
            if hasattr(history, 'quantile10'):
                candle.plot_history(prefix, history, 'quantile10')
            if hasattr(history, 'quantile90'):
                candle.plot_history(prefix, history, 'quantile90')
        else:
            if hasattr(history, 'r2'):
                candle.plot_history(prefix, history, 'r2')

    pred_fname = prefix + '.predicted.tsv'
    df_pred = pd.concat(df_pred_list)
    if args.agg_dose:
        if args.single:
#            df_pred.sort_values(['Source', 'Sample', 'Drug1', target], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', target], inplace=True)
        else:
            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True)
    else:
        if args.single:
#            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True)
        else:
#            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True)
            df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')
    logger.info('Testing predictions stored in file: {}'.format(pred_fname))

    if args.cp:
        logger.info('Model stored in file: {}'.format(prefix+'.model.h5'))
#        logger.info('Model weights stored in file: {}'.format(prefix+cv_ext+'.weights.h5'))
        logger.info('Model weights stored in file: {}'.format(args.save_path + '/' + args.save_weights))

    if args.cv > 1:
        scores = uno.evaluate_prediction(df_pred[target], df_pred['Predicted'+target])
        uno.log_evaluation(scores, logger, description='Combining cross validation folds:')

    for test_source in loader.test_sep_sources:
        test_gen = uno_combined_data_generator.CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source)
        df_test = test_gen.get_response(copy=True)
        y_test = df_test[target].values
        n_test = len(y_test)
        if n_test == 0:
            continue
        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single)
            y_test_pred = model.predict(x_test_list, batch_size=args.batch_size)
            if args.loss == 'heteroscedastic':
                y_test_pred = y_test_pred[:,0]
            elif args.loss == 'quantile':
                y_test_pred = y_test_pred[:,0] # 50th quantile prediction
        else:
            y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps)
            if args.loss == 'heteroscedastic':
                y_test_pred = y_test_pred[:test_gen.size,0]
            elif args.loss == 'quantile':
                y_test_pred = y_test_pred[:test_gen.size,0] # 50th quantile prediction
            else:
                y_test_pred = y_test_pred[:test_gen.size]

        y_test_pred = y_test_pred.flatten()
        scores = uno.evaluate_prediction(y_test, y_test_pred)
        uno.log_evaluation(scores, logger, description='Testing on data from {} ({})'.format(test_source, n_test))

    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []

    return history
# Example #5
0
def run(params):
    """Train, evaluate and save the binary attention model configured by ``params``.

    The function seeds the RNGs, sets up logging, loads the train/val/test
    splits through ``attn.load_data``, one-hot encodes the ``"AUC"`` label
    column, builds and compiles the model, fits it with balanced class
    weights and the configured callbacks, plots the training curves and
    finally hands the test predictions to ``evaluate_model`` and
    ``save_and_test_saved_model``.

    Args:
        params: Dictionary of run parameters. Keys read directly in this
            function: ``save_path``, ``logfile``, ``verbose``, ``dense``,
            ``momentum``, ``optimizer``, ``learning_rate``, ``loss``,
            ``timeout``, ``reduce_lr``, ``use_cp``, ``use_tb``,
            ``early_stop``, ``epochs`` and ``batch_size``. The whole dict is
            also forwarded to the ``attn``/``candle`` helpers, and
            ``rng_seed`` is read through ``candle.ArgumentStruct``.

    Returns:
        The history object returned by ``model.fit``.
    """
    args = candle.ArgumentStruct(**params)
    seed = args.rng_seed
    candle.set_seed(seed)

    # Construct extension to save model
    ext = attn.extension_from_parameters(params, "keras")
    candle.verify_path(params["save_path"])
    prefix = "{}{}".format(params["save_path"], ext)
    logfile = params["logfile"] if params["logfile"] else prefix + ".log"
    root_fname = "Agg_attn_bin"
    candle.set_up_logger(logfile, attn.logger, params["verbose"])
    attn.logger.info("Params: {}".format(params))

    X_train, _Y_train, X_val, _Y_val, X_test, _Y_test = attn.load_data(
        params, seed)

    # move this inside the load_data function
    Y_train = _Y_train["AUC"]
    Y_test = _Y_test["AUC"]
    Y_val = _Y_val["AUC"]

    # minlength=2 keeps the two-way unpacking valid when a split happens to
    # contain no positive samples (bincount would otherwise return one bin).
    train_neg, train_pos = np.bincount(Y_train, minlength=2)
    test_neg, test_pos = np.bincount(Y_test, minlength=2)
    val_neg, val_pos = np.bincount(Y_val, minlength=2)

    pos = train_pos + test_pos + val_pos
    total = pos + train_neg + test_neg + val_neg

    print("Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n".
          format(total, pos, 100 * pos / total))

    # The width of the last dense layer doubles as the number of classes.
    nb_classes = params["dense"][-1]

    Y_train = np_utils.to_categorical(Y_train, nb_classes)
    Y_test = np_utils.to_categorical(Y_test, nb_classes)
    Y_val = np_utils.to_categorical(Y_val, nb_classes)

    # Balanced class weights computed from the training labels only.
    # ``classes`` and ``y`` are passed by keyword: recent scikit-learn
    # releases reject them as positional arguments, older ones accept both.
    y_integers = np.argmax(Y_train, axis=1)
    class_weights = compute_class_weight("balanced",
                                         classes=np.unique(y_integers),
                                         y=y_integers)
    d_class_weights = dict(enumerate(class_weights))

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("Y_train shape:", Y_train.shape)
    print("Y_test shape:", Y_test.shape)

    PS = X_train.shape[1]
    model = build_attention_model(params, PS)

    # Default initializer/optimizer settings, optionally overriding the SGD
    # momentum with the value supplied by the caller.
    keras_defaults = candle.keras_default_config()
    if params["momentum"]:
        keras_defaults["momentum_sgd"] = params["momentum"]

    optimizer = candle.build_optimizer(params["optimizer"],
                                       params["learning_rate"],
                                       keras_defaults)

    model.compile(
        loss=params["loss"],
        optimizer=optimizer,
        metrics=[
            "acc",
            tf.keras.metrics.AUC(name="auroc", curve="ROC"),
            tf.keras.metrics.AUC(name="aucpr", curve="PR"),
        ],
    )

    # set up a bunch of callbacks to do work during model training..

    checkpointer = ModelCheckpoint(
        filepath=params["save_path"] + root_fname + ".autosave.model.h5",
        verbose=1,
        save_weights_only=False,
        save_best_only=True,
    )
    csv_logger = CSVLogger("{}/{}.training.log".format(params["save_path"],
                                                       root_fname))
    # AUROC must be maximised. With mode="auto" Keras only infers "max" from
    # metric names it recognises (e.g. containing "acc"), so "val_auroc"
    # would be treated as a quantity to minimise. State the direction
    # explicitly for both plateau detection and early stopping.
    reduce_lr = ReduceLROnPlateau(
        monitor="val_auroc",
        factor=0.20,
        patience=40,
        verbose=1,
        mode="max",
        min_delta=0.0001,
        cooldown=3,
        min_lr=0.000000001,
    )
    early_stop = EarlyStopping(monitor="val_auroc",
                               patience=200,
                               verbose=1,
                               mode="max")
    candle_monitor = candle.CandleRemoteMonitor(params=params)
    timeout_monitor = candle.TerminateOnTimeOut(params["timeout"])
    tensorboard = TensorBoard(log_dir="tb/tb{}".format(ext))

    history_logger = LoggingCallback(attn.logger.debug)

    # Always-on callbacks first, then the ones enabled through ``params``.
    callbacks = [candle_monitor, timeout_monitor, csv_logger, history_logger]

    if params["reduce_lr"]:
        callbacks.append(reduce_lr)

    if params["use_cp"]:
        callbacks.append(checkpointer)
    if params["use_tb"]:
        callbacks.append(tensorboard)
    if params["early_stop"]:
        callbacks.append(early_stop)

    epochs = params["epochs"]
    batch_size = params["batch_size"]
    history = model.fit(
        X_train,
        Y_train,
        class_weight=d_class_weights,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_data=(X_val, Y_val),
        callbacks=callbacks,
    )

    # diagnostic plots: one per metric that was actually recorded. The names
    # match the metrics registered in model.compile() above.
    plot_prefix = params["save_path"] + root_fname
    for metric in ("loss", "acc", "auroc", "aucpr"):
        if metric in history.history:
            candle.plot_history(plot_prefix, history, metric)

    # Evaluate model
    score = model.evaluate(X_test, Y_test, verbose=0)
    Y_predict = model.predict(X_test)

    evaluate_model(params, root_fname, nb_classes, Y_test, _Y_test, Y_predict,
                   pos, total, score)

    save_and_test_saved_model(params, model, root_fname, X_train, X_test,
                              Y_test)

    # Drop the handlers installed by set_up_logger so that a later call in
    # the same process does not log every message more than once.
    attn.logger.handlers = []

    return history