Code example #1
def run(params):
    args = Struct(**params)
    set_seed(args.rng_seed)
    ext = extension_from_parameters(args)
    verify_path(args.save_path)
    prefix = args.save_path + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    set_up_logger(logfile, args.verbose)
    logger.info('Params: {}'.format(params))

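    # Optionally restrict TensorFlow to the requested GPUs and let it
    # allocate memory on demand instead of grabbing it all up front.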
    if args.gpus:
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = ",".join(map(str, args.gpus))
        K.set_session(tf.Session(config=config))

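    # Load the combined dose-response data with the selected cell and drug
    # feature sets; the per-type feature subset paths fall back to the
    # shared feature_subset_path.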
    loader = CombinedDataLoader(seed=args.rng_seed)
    loader.load(
        cache=args.cache,
        ncols=args.feature_subsample,
        agg_dose=args.agg_dose,
        cell_features=args.cell_features,
        drug_features=args.drug_features,
        drug_median_response_min=args.drug_median_response_min,
        drug_median_response_max=args.drug_median_response_max,
        use_landmark_genes=args.use_landmark_genes,
        use_filtered_genes=args.use_filtered_genes,
        cell_feature_subset_path=args.cell_feature_subset_path
        or args.feature_subset_path,
        drug_feature_subset_path=args.drug_feature_subset_path
        or args.feature_subset_path,
        preprocess_rnaseq=args.preprocess_rnaseq,
        single=args.single,
        train_sources=args.train_sources,
        test_sources=args.test_sources,
        embed_feature_source=not args.no_feature_source,
        encode_response_source=not args.no_response_source,
    )

    target = args.agg_dose or 'Growth'
    val_split = args.validation_split
    train_split = 1 - val_split

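    # Export mode 1: write the train + val partitions to a single flat TSV
    # and return without training.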
    if args.export_csv:
        fname = args.export_csv
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)

        x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                    dataframe=True,
                                                    single=args.single)
        x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                              dataframe=True,
                                              single=args.single)
        df_train = pd.concat([y_train] + x_train_list, axis=1)
        df_val = pd.concat([y_val] + x_val_list, axis=1)
        df = pd.concat([df_train, df_val]).reset_index(drop=True)
        if args.growth_bins > 1:
            df = uno_data.discretize(df, 'Growth', bins=args.growth_bins)
        df.to_csv(fname, sep='\t', index=False, float_format="%.3g")
        return

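    # Export mode 2: stream the train and val partitions, one batch at a
    # time, into a compressed HDF5 store and return without training.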
    if args.export_data:
        fname = args.export_data
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug,
                              cell_subset_path=args.cell_subset_path,
                              drug_subset_path=args.drug_subset_path)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)
        store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy')

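        # Reserve minimum string-column widths so later appends with longer
        # values do not fail.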
        config_min_itemsize = {'Sample': 30, 'Drug1': 10}
        if not args.single:
            config_min_itemsize['Drug2'] = 10

        for partition in ['train', 'val']:
            gen = train_gen if partition == 'train' else val_gen
            for i in range(gen.steps):
                x_list, y = gen.get_slice(size=args.batch_size,
                                          dataframe=True,
                                          single=args.single)

                for j, input_feature in enumerate(x_list):
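                    # Drop per-column names: the feature matrices can have
                    # thousands of columns and the names are not needed on
                    # reload.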
                    input_feature.columns = [''] * len(input_feature.columns)
                    store.append('x_{}_{}'.format(partition, j),
                                 input_feature.astype('float32'),
                                 format='table',
                                 data_columns=True)
                store.append('y_{}'.format(partition),
                             y.astype({target: 'float32'}),
                             format='table',
                             data_columns=True,
                             min_itemsize=config_min_itemsize)
                logger.info('Generating {} dataset. {} / {}'.format(
                    partition, i + 1, gen.steps))
        store.close()
        logger.info('Completed generating {}'.format(fname))
        return

    loader.partition_data(cv_folds=args.cv,
                          train_split=train_split,
                          val_split=val_split,
                          cell_types=args.cell_types,
                          by_cell=args.by_cell,
                          by_drug=args.by_drug,
                          cell_subset_path=args.cell_subset_path,
                          drug_subset_path=args.drug_subset_path)

    model = build_model(loader, args)
    logger.info('Combined model:')
    model.summary(print_fn=logger.info)
    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)

    if args.cp:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

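    # Linear learning-rate warmup over the first 5 epochs, ramping from
    # base_lr to the (possibly batch-size-scaled) target lr. base_lr is
    # bound later, inside the fold loop, before the scheduler first runs.
    # E.g. with base_lr=0.001 and lr=0.004 the schedule yields
    # 0.001, 0.0016, 0.0022, 0.0028, 0.0034, 0.004 for epochs 0..5.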
    def warmup_scheduler(epoch):
        lr = args.learning_rate or base_lr * args.batch_size / 100
        if epoch <= 5:
            K.set_value(model.optimizer.lr,
                        (base_lr * (5 - epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={:.5g}'.format(
            epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

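    # Train one model per cross-validation fold; per-fold validation
    # predictions are pooled into df_pred_list and written out below.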
    for fold in range(cv):
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv))
            cv_ext = '.cv{}'.format(fold + 1)

        template_model = build_model(loader, args, silent=True)
        if args.initial_weights:
            logger.info("Loading weights from {}".format(args.initial_weights))
            template_model.load_weights(args.initial_weights)

        if len(args.gpus) > 1:
            from keras.utils import multi_gpu_model
            gpu_count = len(args.gpus)
            logger.info("Multi GPU with {} gpus".format(gpu_count))
            model = multi_gpu_model(template_model,
                                    cpu_merge=False,
                                    gpus=gpu_count)
        else:
            model = template_model

        optimizer = optimizers.deserialize({
            'class_name': args.optimizer,
            'config': {}
        })
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2])

        # calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=5,
                                      min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        checkpointer = MultiGPUCheckpoint(prefix + cv_ext + '.model.h5',
                                          save_best_only=True)
        tensorboard = TensorBoard(
            log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext))
        history_logger = LoggingCallback(logger.debug)

        callbacks = [candle_monitor, timeout_monitor, history_logger]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)
        if args.save_weights:
            callbacks.append(
                SimpleWeightSaver(args.save_path + '/' + args.save_weights))

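        # Feed training either from a previously exported HDF5 file or from
        # batches generated on the fly by the loader.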
        if args.use_exported_data is not None:
            train_gen = DataFeeder(filename=args.use_exported_data,
                                   batch_size=args.batch_size,
                                   shuffle=args.shuffle,
                                   single=args.single,
                                   agg_dose=args.agg_dose)
            val_gen = DataFeeder(partition='val',
                                 filename=args.use_exported_data,
                                 batch_size=args.batch_size,
                                 shuffle=args.shuffle,
                                 single=args.single,
                                 agg_dose=args.agg_dose)
        else:
            train_gen = CombinedDataGenerator(loader,
                                              fold=fold,
                                              batch_size=args.batch_size,
                                              shuffle=args.shuffle,
                                              single=args.single)
            val_gen = CombinedDataGenerator(loader,
                                            partition='val',
                                            fold=fold,
                                            batch_size=args.batch_size,
                                            shuffle=args.shuffle,
                                            single=args.single)

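        # Baseline: score y_val against a random permutation of itself to
        # show what the metrics look like for uninformed predictions.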
        df_val = val_gen.get_response(copy=True)
        y_val = df_val[target].values
        y_shuf = np.random.permutation(y_val)
        log_evaluation(evaluate_prediction(y_val, y_shuf),
                       description='Between random pairs in y_val:')

        if args.no_gen:
            x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                        single=args.single)
            x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                                  single=args.single)
            history = model.fit(x_train_list,
                                y_train,
                                batch_size=args.batch_size,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))
        else:
            logger.info('Data points per epoch: train = %d, val = %d',
                        train_gen.size, val_gen.size)
            logger.info('Steps per epoch: train = %d, val = %d',
                        train_gen.steps, val_gen.steps)
            history = model.fit_generator(train_gen,
                                          train_gen.steps,
                                          epochs=args.epochs,
                                          callbacks=callbacks,
                                          validation_data=val_gen,
                                          validation_steps=val_gen.steps)

        if args.no_gen:
            y_val_pred = model.predict(x_val_list, batch_size=args.batch_size)
        else:
            val_gen.reset()
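            # Run one extra step to flush a final partial batch, then
            # truncate the predictions back to the true validation size.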
            y_val_pred = model.predict_generator(val_gen, val_gen.steps + 1)
            y_val_pred = y_val_pred[:val_gen.size]

        y_val_pred = y_val_pred.flatten()

        scores = evaluate_prediction(y_val, y_val_pred)
        log_evaluation(scores)

        # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred - y_val)
        df_val['Predicted' + target] = y_val_pred
        df_val[target + 'Error'] = y_val_pred - y_val
        df_pred_list.append(df_val)

        if 'loss' in history.history:
            plot_history(prefix, history, 'loss')
        if 'r2' in history.history:
            plot_history(prefix, history, 'r2')

    pred_fname = prefix + '.predicted.tsv'
    df_pred = pd.concat(df_pred_list)
    if args.agg_dose:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', target], inplace=True)
        else:
            df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target],
                                inplace=True)
    else:
        if args.single:
            df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'],
                                inplace=True)
        else:
            df_pred.sort_values(
                ['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'],
                inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')

    if args.cv > 1:
        scores = evaluate_prediction(df_pred[target],
                                     df_pred['Predicted' + target])
        log_evaluation(scores, description='Combining cross validation folds:')

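    # Evaluate the last trained model on each held-out test source
    # separately.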
    for test_source in loader.test_sep_sources:
        test_gen = CombinedDataGenerator(loader,
                                         partition='test',
                                         batch_size=args.batch_size,
                                         source=test_source)
        df_test = test_gen.get_response(copy=True)
        y_test = df_test[target].values
        n_test = len(y_test)
        if n_test == 0:
            continue
        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size,
                                                     single=args.single)
            y_test_pred = model.predict(x_test_list,
                                        batch_size=args.batch_size)
        else:
            y_test_pred = model.predict_generator(
                test_gen.flow(single=args.single), test_gen.steps)
            y_test_pred = y_test_pred[:test_gen.size]
        y_test_pred = y_test_pred.flatten()
        scores = evaluate_prediction(y_test, y_test_pred)
        log_evaluation(scores,
                       description='Testing on data from {} ({})'.format(
                           test_source, n_test))

    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []

    return history
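
A minimal driver sketch (not part of the original listing): `run(params)` expects a flat parameter dict whose keys cover every attribute read off `args` above. The keys and values below are illustrative assumptions only; the real benchmark assembles this dict elsewhere.

# Hypothetical driver for run(); keys inferred from the args.* accesses
# above, values are placeholders. The full dict must supply every
# attribute that run() reads, or Struct attribute lookups will fail.
if __name__ == '__main__':
    params = {
        'rng_seed': 2018,          # seed for set_seed()
        'save_path': 'save/uno',   # output prefix directory
        'logfile': None,           # None -> defaults to prefix + '.log'
        'verbose': False,
        'gpus': [],                # e.g. [0, 1] to pin specific GPUs
        'batch_size': 32,
        'epochs': 10,
        'cv': 1,                   # folds; 1 disables cross-validation
        'timeout': 3600,           # seconds, for TerminateOnTimeOut
        # ... plus the remaining keys accessed via args.* in run()
    }
    run(params)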
Code example #2
def run(params):
    args = Struct(**params)
    set_seed(args.rng_seed)
    ext = extension_from_parameters(args)
    verify_path(args.save)
    prefix = args.save + ext
    logfile = args.logfile if args.logfile else prefix + '.log'
    set_up_logger(logfile, args.verbose)
    logger.info('Params: {}'.format(params))

    loader = CombinedDataLoader(seed=args.rng_seed)
    loader.load(
        cache=args.cache,
        ncols=args.feature_subsample,
        cell_features=args.cell_features,
        drug_features=args.drug_features,
        drug_median_response_min=args.drug_median_response_min,
        drug_median_response_max=args.drug_median_response_max,
        use_landmark_genes=args.use_landmark_genes,
        use_filtered_genes=args.use_filtered_genes,
        preprocess_rnaseq=args.preprocess_rnaseq,
        single=args.single,
        train_sources=args.train_sources,
        test_sources=args.test_sources,
        embed_feature_source=not args.no_feature_source,
        encode_response_source=not args.no_response_source,
    )

    val_split = args.validation_split
    train_split = 1 - val_split

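    # Export mode: write the train + val partitions to a single flat TSV
    # and return without training.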
    if args.export_data:
        fname = args.export_data
        loader.partition_data(cv_folds=args.cv,
                              train_split=train_split,
                              val_split=val_split,
                              cell_types=args.cell_types,
                              by_cell=args.by_cell,
                              by_drug=args.by_drug)
        train_gen = CombinedDataGenerator(loader,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)
        x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                    dataframe=True,
                                                    single=args.single)
        x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                              dataframe=True,
                                              single=args.single)
        df_train = pd.concat([y_train] + x_train_list, axis=1)
        df_val = pd.concat([y_val] + x_val_list, axis=1)
        df = pd.concat([df_train, df_val]).reset_index(drop=True)
        if args.growth_bins > 1:
            df = uno_data.discretize(df, 'Growth', bins=args.growth_bins)
        df.to_csv(fname, sep='\t', index=False, float_format="%.3g")
        return

    loader.partition_data(cv_folds=args.cv,
                          train_split=train_split,
                          val_split=val_split,
                          cell_types=args.cell_types,
                          by_cell=args.by_cell,
                          by_drug=args.by_drug)

    model = build_model(loader, args)
    logger.info('Combined model:')
    # model.summary(print_fn=logger.info)
    # plot_model(model, to_file=prefix+'.model.png', show_shapes=True)

    if args.cp:
        model_json = model.to_json()
        with open(prefix + '.model.json', 'w') as f:
            print(model_json, file=f)

    def warmup_scheduler(epoch):
        lr = args.learning_rate or base_lr * args.batch_size / 100
        if epoch <= 5:
            K.set_value(model.optimizer.lr,
                        (base_lr * (5 - epoch) + lr * epoch) / 5)
        logger.debug('Epoch {}: lr={:.5g}'.format(
            epoch, K.get_value(model.optimizer.lr)))
        return K.get_value(model.optimizer.lr)

    df_pred_list = []

    cv_ext = ''
    cv = args.cv if args.cv > 1 else 1

    for fold in range(cv):
        if args.cv > 1:
            logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv))
            cv_ext = '.cv{}'.format(fold + 1)

        model = build_model(loader, args, silent=True)

        optimizer = optimizers.deserialize({
            'class_name': args.optimizer,
            'config': {}
        })
        base_lr = args.base_lr or K.get_value(optimizer.lr)
        if args.learning_rate:
            K.set_value(optimizer.lr, args.learning_rate)

        model.compile(loss=args.loss, optimizer=optimizer, metrics=[mae, r2])

        # calculate trainable and non-trainable params
        params.update(candle.compute_trainable_params(model))

        candle_monitor = candle.CandleRemoteMonitor(params=params)
        timeout_monitor = candle.TerminateOnTimeOut(params['timeout'])

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.5,
                                      patience=5,
                                      min_lr=0.00001)
        warmup_lr = LearningRateScheduler(warmup_scheduler)
        checkpointer = ModelCheckpoint(prefix + cv_ext + '.weights.h5',
                                       save_best_only=True,
                                       save_weights_only=True)
        tensorboard = TensorBoard(log_dir="tb/tb{}{}".format(ext, cv_ext))
        history_logger = LoggingCallback(logger.debug)
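        # ModelRecorder keeps a reference to the best model seen during
        # training (see the commented-out alternative to the weight reload
        # after fitting).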
        model_recorder = ModelRecorder()

        # callbacks = [history_logger, model_recorder]
        callbacks = [
            candle_monitor, timeout_monitor, history_logger, model_recorder
        ]
        if args.reduce_lr:
            callbacks.append(reduce_lr)
        if args.warmup_lr:
            callbacks.append(warmup_lr)
        if args.cp:
            callbacks.append(checkpointer)
        if args.tb:
            callbacks.append(tensorboard)

        train_gen = CombinedDataGenerator(loader,
                                          fold=fold,
                                          batch_size=args.batch_size,
                                          shuffle=args.shuffle)
        val_gen = CombinedDataGenerator(loader,
                                        partition='val',
                                        fold=fold,
                                        batch_size=args.batch_size,
                                        shuffle=args.shuffle)

        df_val = val_gen.get_response(copy=True)
        y_val = df_val['Growth'].values
        y_shuf = np.random.permutation(y_val)
        log_evaluation(evaluate_prediction(y_val, y_shuf),
                       description='Between random pairs in y_val:')

        if args.no_gen:
            x_train_list, y_train = train_gen.get_slice(size=train_gen.size,
                                                        single=args.single)
            x_val_list, y_val = val_gen.get_slice(size=val_gen.size,
                                                  single=args.single)
            history = model.fit(x_train_list,
                                y_train,
                                batch_size=args.batch_size,
                                epochs=args.epochs,
                                callbacks=callbacks,
                                validation_data=(x_val_list, y_val))
        else:
            logger.info('Data points per epoch: train = %d, val = %d',
                        train_gen.size, val_gen.size)
            logger.info('Steps per epoch: train = %d, val = %d',
                        train_gen.steps, val_gen.steps)
            history = model.fit_generator(
                train_gen.flow(single=args.single),
                train_gen.steps,
                epochs=args.epochs,
                callbacks=callbacks,
                validation_data=val_gen.flow(single=args.single),
                validation_steps=val_gen.steps)

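        # Reload the best weights saved by the checkpointer before
        # computing validation predictions.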
        if args.cp:
            model.load_weights(prefix + cv_ext + '.weights.h5')
        # model = model_recorder.best_model

        if args.no_gen:
            y_val_pred = model.predict(x_val_list, batch_size=args.batch_size)
        else:
            val_gen.reset()
            y_val_pred = model.predict_generator(
                val_gen.flow(single=args.single), val_gen.steps)
            y_val_pred = y_val_pred[:val_gen.size]

        y_val_pred = y_val_pred.flatten()

        scores = evaluate_prediction(y_val, y_val_pred)
        log_evaluation(scores)

        df_val = df_val.assign(PredictedGrowth=y_val_pred,
                               GrowthError=y_val_pred - y_val)
        df_pred_list.append(df_val)

        plot_history(prefix, history, 'loss')
        plot_history(prefix, history, 'r2')

    pred_fname = prefix + '.predicted.tsv'
    df_pred = pd.concat(df_pred_list)
    df_pred.sort_values(
        ['Source', 'Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'],
        inplace=True)
    df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g')

    if args.cv > 1:
        scores = evaluate_prediction(df_pred['Growth'],
                                     df_pred['PredictedGrowth'])
        log_evaluation(scores, description='Combining cross validation folds:')

    for test_source in loader.test_sep_sources:
        test_gen = CombinedDataGenerator(loader,
                                         partition='test',
                                         batch_size=args.batch_size,
                                         source=test_source)
        df_test = test_gen.get_response(copy=True)
        y_test = df_test['Growth'].values
        n_test = len(y_test)
        if n_test == 0:
            continue
        if args.no_gen:
            x_test_list, y_test = test_gen.get_slice(size=test_gen.size,
                                                     single=args.single)
            y_test_pred = model.predict(x_test_list,
                                        batch_size=args.batch_size)
        else:
            y_test_pred = model.predict_generator(
                test_gen.flow(single=args.single), test_gen.steps)
            y_test_pred = y_test_pred[:test_gen.size]
        y_test_pred = y_test_pred.flatten()
        scores = evaluate_prediction(y_test, y_test_pred)
        log_evaluation(scores,
                       description='Testing on data from {} ({})'.format(
                           test_source, n_test))

    if K.backend() == 'tensorflow':
        K.clear_session()

    logger.handlers = []

    return history
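
`Struct` is not defined in either listing; a minimal stand-in consistent with its use here (`args = Struct(**params)`, then attribute access) could look like the sketch below. This is an assumption, since the original helper is not shown.

# Assumed minimal Struct helper: exposes each params entry as an
# attribute, matching the args.<name> accesses in run().
class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)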