Example #1
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option(
        '--ai',
        dest='accuracy_indexes',
        help=
        'Comma-separated list of target indexes to make accuracy scatter plots.'
    )
    parser.add_option('--mc',
                      dest='mc_n',
                      default=0,
                      type='int',
                      help='Monte Carlo test iterations [Default: %default]')
    parser.add_option(
        '--peak',
        '--peaks',
        dest='peaks',
        default=False,
        action='store_true',
        help='Compute expensive peak accuracy [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='test_out',
        help='Output directory for test statistics [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help='Average the fwd and rc predictions [Default: %default]')
    parser.add_option(
        '--save',
        dest='save',
        default=False,
        action='store_true',
        help='Save targets and predictions numpy arrays [Default: %default]')
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default='test-*.tfr',
        help='TFR pattern string appended to data_dir [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters, model, and test data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # parse shifts to integers
    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    #######################################################
    # inputs

    # read targets
    if options.targets_file is None:
        options.targets_file = '%s/targets.txt' % data_dir
    targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # construct data ops
    tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
    eval_data = dataset.SeqDataset(tfr_pattern_path,
                                   seq_length=data_stats['seq_length'],
                                   target_length=data_stats['target_length'],
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL)

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file)
    seqnn_model.build_ensemble(options.rc, options.shifts)

    #######################################################
    # evaluate

    eval_loss = params_train.get('loss', 'poisson')

    # evaluate
    test_loss, test_pr, test_r2 = seqnn_model.evaluate(eval_data,
                                                       loss=eval_loss)
    print('')

    # print summary statistics
    print('Test Loss:         %7.5f' % test_loss)
    print('Test R2:           %7.5f' % test_r2.mean())
    print('Test PearsonR:     %7.5f' % test_pr.mean())

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
        'index': targets_df.index,
        'r2': test_r2,
        'pearsonr': test_pr,
        'identifier': targets_df.identifier,
        'description': targets_df.description
    })
    targets_acc_df.to_csv('%s/acc.txt' % options.out_dir,
                          sep='\t',
                          index=False,
                          float_format='%.5f')

    #######################################################
    # predict?

    if options.save or options.peaks or options.accuracy_indexes is not None:
        # compute predictions
        test_preds = seqnn_model.predict(eval_data).astype('float16')

        # read targets
        test_targets = eval_data.numpy(return_inputs=False)

    if options.save:
        preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
        preds_h5.create_dataset('preds', data=test_preds)
        preds_h5.close()
        targets_h5 = h5py.File('%s/targets.h5' % options.out_dir, 'w')
        targets_h5.create_dataset('targets', data=test_targets)
        targets_h5.close()
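        # the saved arrays can be read back later with h5py, e.g.
        #   preds = h5py.File('%s/preds.h5' % options.out_dir, 'r')['preds'][:]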

    #######################################################
    # peak call accuracy

    if options.peaks:
        peaks_out_file = '%s/peaks.txt' % options.out_dir
        test_peaks(test_preds, test_targets, peaks_out_file)

    #######################################################
    # accuracy plots

    if options.accuracy_indexes is not None:
        accuracy_indexes = [
            int(ti) for ti in options.accuracy_indexes.split(',')
        ]

        if not os.path.isdir('%s/scatter' % options.out_dir):
            os.mkdir('%s/scatter' % options.out_dir)

        if not os.path.isdir('%s/violin' % options.out_dir):
            os.mkdir('%s/violin' % options.out_dir)

        if not os.path.isdir('%s/roc' % options.out_dir):
            os.mkdir('%s/roc' % options.out_dir)

        if not os.path.isdir('%s/pr' % options.out_dir):
            os.mkdir('%s/pr' % options.out_dir)

        for ti in accuracy_indexes:
            test_targets_ti = test_targets[:, :, ti]

            ############################################
            # scatter

            # sample every few bins (adjust to plot the desired number of points)
            ds_indexes = np.arange(0, test_preds.shape[1], 8)

            # subset and flatten
            test_targets_ti_flat = test_targets_ti[:, ds_indexes].flatten(
            ).astype('float32')
            test_preds_ti_flat = test_preds[:, ds_indexes,
                                            ti].flatten().astype('float32')

            # take log2
            test_targets_ti_log = np.log2(test_targets_ti_flat + 1)
            test_preds_ti_log = np.log2(test_preds_ti_flat + 1)

            # plot log2
            sns.set(font_scale=1.2, style='ticks')
            out_pdf = '%s/scatter/t%d.pdf' % (options.out_dir, ti)
            plots.regplot(test_targets_ti_log,
                          test_preds_ti_log,
                          out_pdf,
                          poly_order=1,
                          alpha=0.3,
                          sample=500,
                          figsize=(6, 6),
                          x_label='log2 Experiment',
                          y_label='log2 Prediction',
                          table=True)

            ############################################
            # violin

            # call peaks
            test_targets_ti_lambda = np.mean(test_targets_ti_flat)
            test_targets_pvals = 1 - poisson.cdf(
                np.round(test_targets_ti_flat) - 1, mu=test_targets_ti_lambda)
            test_targets_qvals = np.array(ben_hoch(test_targets_pvals))
            test_targets_peaks = test_targets_qvals < 0.01
            test_targets_peaks_str = np.where(test_targets_peaks, 'Peak',
                                              'Background')

            # violin plot
            sns.set(font_scale=1.3, style='ticks')
            plt.figure()
            df = pd.DataFrame({
                'log2 Prediction':
                np.log2(test_preds_ti_flat + 1),
                'Experimental coverage status':
                test_targets_peaks_str
            })
            ax = sns.violinplot(x='Experimental coverage status',
                                y='log2 Prediction',
                                data=df)
            ax.grid(True, linestyle=':')
            plt.savefig('%s/violin/t%d.pdf' % (options.out_dir, ti))
            plt.close()

            # ROC
            plt.figure()
            fpr, tpr, _ = roc_curve(test_targets_peaks, test_preds_ti_flat)
            auroc = roc_auc_score(test_targets_peaks, test_preds_ti_flat)
            plt.plot([0, 1], [0, 1],
                     c='black',
                     linewidth=1,
                     linestyle='--',
                     alpha=0.7)
            plt.plot(fpr, tpr, c='black')
            ax = plt.gca()
            ax.set_xlabel('False positive rate')
            ax.set_ylabel('True positive rate')
            ax.text(0.99,
                    0.02,
                    'AUROC %.3f' % auroc,
                    horizontalalignment='right')  # , fontsize=14)
            ax.grid(True, linestyle=':')
            plt.savefig('%s/roc/t%d.pdf' % (options.out_dir, ti))
            plt.close()

            # PR
            plt.figure()
            prec, recall, _ = precision_recall_curve(test_targets_peaks,
                                                     test_preds_ti_flat)
            auprc = average_precision_score(test_targets_peaks,
                                            test_preds_ti_flat)
            plt.axhline(y=test_targets_peaks.mean(),
                        c='black',
                        linewidth=1,
                        linestyle='--',
                        alpha=0.7)
            plt.plot(recall, prec, c='black')
            ax = plt.gca()
            ax.set_xlabel('Recall')
            ax.set_ylabel('Precision')
            ax.text(0.99,
                    0.95,
                    'AUPRC %.3f' % auprc,
                    horizontalalignment='right')  # , fontsize=14)
            ax.grid(True, linestyle=':')
            plt.savefig('%s/pr/t%d.pdf' % (options.out_dir, ti))
            plt.close()
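
The ben_hoch call in the violin section above is not defined in this snippet; in the Basenji source it applies a Benjamini-Hochberg correction to the Poisson p-values. A minimal sketch of such a helper, assuming a 1-D array of p-values in and q-values returned in the same order:

import numpy as np

def ben_hoch(p_values):
    """Benjamini-Hochberg q-values for a 1-D array of p-values."""
    p_values = np.asarray(p_values, dtype='float64')
    m = len(p_values)
    order = np.argsort(p_values)

    # q_(i) = min_{j >= i} p_(j) * m / j, computed as a reverse running minimum
    q_values = np.empty(m)
    prev_q = 1.0
    for rank in range(m, 0, -1):
        idx = order[rank - 1]
        prev_q = min(prev_q, p_values[idx] * m / rank)
        q_values[idx] = prev_q

    return q_values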
Example #2
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='track_bed',
        help='BED file describing regions so we can output BigWig tracks')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/tutorials/data/human.hg19.genome' %
                      os.environ['BASENJIDIR'],
                      help='Chromosome length information [Default: %default]')
    parser.add_option('--mc',
                      dest='mc_n',
                      default=0,
                      type='int',
                      help='Monte Carlo test iterations [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='test_out',
        help='Output directory for test statistics [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help='Average the fwd and rc predictions [Default: %default]')
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default='test-*.tfr',
        help='TFR pattern string appended to data_dir [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters, model, and test data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # parse shifts to integers
    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    # read targets
    if options.targets_file is None:
        options.targets_file = '%s/targets.txt' % data_dir
        targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
        target_subset = None
    else:
        targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
        target_subset = targets_df.index

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # construct data ops
    tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
    eval_data = dataset.SeqDataset(tfr_pattern_path,
                                   seq_length=params_model['seq_length'],
                                   target_length=data_stats['target_length'],
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL)

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file)
    seqnn_model.build_ensemble(options.rc, options.shifts)

    # predict
    test_preds = seqnn_model.predict(eval_data, verbose=1).astype('float16')

    # save
    preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
    preds_h5.create_dataset('preds', data=test_preds)
    preds_h5.close()

    # print normalization factors
    target_means = test_preds.mean(axis=(0, 1), dtype='float64')
    target_means_median = np.median(target_means)
    # target_means /= target_means_median
    norm_out = open('%s/normalization.txt' % options.out_dir, 'w')
    # print('\n'.join([str(tu) for tu in target_means]), file=norm_out)
    for ti in range(len(target_means)):
        print(ti,
              target_means[ti],
              target_means_median / target_means[ti],
              file=norm_out)
    norm_out.close()

    #######################################################
    # BigWig tracks

    # print bigwig tracks for visualization
    if options.track_bed:
        if options.genome_file is None:
            parser.error(
                'Must provide genome file in order to print valid BigWigs.')

        if not os.path.isdir('%s/tracks' % options.out_dir):
            os.mkdir('%s/tracks' % options.out_dir)

        track_indexes = range(test_preds.shape[2])
        if options.track_indexes:
            track_indexes = [
                int(ti) for ti in options.track_indexes.split(',')
            ]

        # flank buffer the model does not predict; the TF2 SeqNN used here
        # has no batch buffer, so nothing is trimmed
        batch_buffer = 0

        for ti in track_indexes:
            # make predictions bigwig
            bw_file = '%s/tracks/t%d_preds.bw' % (options.out_dir, ti)
            bigwig_write(bw_file, test_preds[:, :, ti], options.track_bed,
                         options.genome_file, batch_buffer)
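
bigwig_write above is likewise defined elsewhere in the script. A minimal sketch of what such a helper could look like with pyBigWig, assuming one BED region per row of the signal array, coordinate-sorted regions, and bins spread evenly across each region after trimming the buffer from both ends:

import pyBigWig

def bigwig_write(bw_file, signal, track_bed, genome_file, buffer=0):
    """Write per-bin predictions to a BigWig, one BED region per signal row."""
    # chromosome lengths for the BigWig header
    chrom_lengths = []
    with open(genome_file) as genome_open:
        for line in genome_open:
            chrom, length = line.split()[:2]
            chrom_lengths.append((chrom, int(length)))

    bw = pyBigWig.open(bw_file, 'w')
    bw.addHeader(chrom_lengths)

    with open(track_bed) as bed_open:
        for si, line in enumerate(bed_open):
            chrom, start, end = line.split()[:3]
            start, end = int(start) + buffer, int(end) - buffer

            # spread this region's bins evenly across its trimmed span
            span = (end - start) // signal.shape[1]
            values = signal[si].astype('float64').tolist()
            bw.addEntries(chrom, start, values=values, span=span, step=span)

    bw.close()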
Example #3
def main():
    usage = 'usage: %prog [options] <params_file> <data1_dir> <data2_dir> ...'
    parser = OptionParser(usage)
    parser.add_option(
        '-o',
        dest='out_dir',
        default='train2_out',
        help='Output directory for test statistics [Default: %default]')
    parser.add_option(
        '--restore',
        dest='restore',
        help='Restore model and continue training [Default: %default]')
    parser.add_option('--trunk',
                      dest='trunk',
                      default=False,
                      action='store_true',
                      help='Restore only model trunk [Default: %default]')
    parser.add_option(
        '--tfr_train',
        dest='tfr_train_pattern',
        default=None,
        help=
        'Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    parser.add_option(
        '--tfr_eval',
        dest='tfr_eval_pattern',
        default=None,
        help=
        'Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide parameters and data directory.')
    else:
        params_file = args[0]
        data_dirs = args[1:]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)
    if params_file != '%s/params.json' % options.out_dir:
        shutil.copy(params_file, '%s/params.json' % options.out_dir)

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # read datasets
    train_data = []
    eval_data = []

    for data_dir in data_dirs:
        # load train data
        train_data.append(
            dataset.SeqDataset(data_dir,
                               split_label='train',
                               batch_size=params_train['batch_size'],
                               mode=tf.estimator.ModeKeys.TRAIN,
                               tfr_pattern=options.tfr_train_pattern))

        # load eval data
        eval_data.append(
            dataset.SeqDataset(data_dir,
                               split_label='valid',
                               batch_size=params_train['batch_size'],
                               mode=tf.estimator.ModeKeys.EVAL,
                               tfr_pattern=options.tfr_eval_pattern))

    if params_train.get('num_gpu', 1) == 1:
        ########################################
        # one GPU

        # initialize model
        seqnn_model = seqnn.SeqNN(params_model)

        # restore
        if options.restore:
            seqnn_model.restore(options.restore, options.trunk)

        # initialize trainer
        seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data,
                                        options.out_dir)

        # compile model
        seqnn_trainer.compile(seqnn_model)

        # train model
        seqnn_trainer.fit2(seqnn_model)

    else:
        ########################################
        # multiple GPUs

        strategy = tf.distribute.MirroredStrategy()
        with strategy.scope():

            # distribute data
            for di in range(len(data_dirs)):
                train_data[di].distribute(strategy)
                eval_data[di].distribute(strategy)

            # initialize model
            seqnn_model = seqnn.SeqNN(params_model)

            # restore
            if options.restore:
                seqnn_model.restore(options.restore, options.trunk)

            # initialize trainer
            seqnn_trainer = trainer.Trainer(params_train, train_data,
                                            eval_data, options.out_dir,
                                            strategy, params_train['num_gpu'])

            # compile model
            seqnn_trainer.compile(seqnn_model)

        # train model
        seqnn_trainer.fit2(seqnn_model)
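
All of these scripts read the same two top-level blocks from params.json. A minimal skeleton, assuming only the keys the snippets here actually access (real Basenji configs carry many more model fields):

import json

params = {
    'model': {
        'seq_length': 131072,    # input sequence length in bp (placeholder)
        'target_length': 1024,   # number of output bins (placeholder)
    },
    'train': {
        'batch_size': 4,
        'num_gpu': 1,            # >1 selects the MirroredStrategy branch
        'loss': 'poisson',       # default loss used by evaluate() in Example #1
    },
}

with open('params.json', 'w') as params_open:
    json.dump(params, params_open, indent=2)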
Example #4
def main():
  usage = 'usage: %prog [options] <params_file> <data1_dir> <data2_dir> ...'
  parser = OptionParser(usage)
  parser.add_option('-o', dest='out_dir',
      default='train2_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--restore', dest='restore',
      help='Restore model and continue training [Default: %default]')
  parser.add_option('--trunk', dest='trunk',
      default=False, action='store_true',
      help='Restore only model trunk [Default: %default]')
  parser.add_option('--tfr_train', dest='tfr_train_pattern',
      default='train-*.tfr',
      help='Training TFRecord pattern string appended to data_dir [Default: %default]')
  parser.add_option('--tfr_eval', dest='tfr_eval_pattern',
      default='valid-*.tfr',
      help='Evaluation TFRecord pattern string appended to data_dir [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) < 2:
    parser.error('Must provide parameters and data directory.')
  else:
    params_file = args[0]
    data_dirs = args[1:]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)
  if params_file != '%s/params.json' % options.out_dir:
    shutil.copy(params_file, '%s/params.json' % options.out_dir)

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # read datasets
  data_stats = []
  train_data = []
  eval_data = []

  for data_dir in data_dirs:
    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
      data_stats.append(json.load(data_stats_open))

    # load train data
    tfr_train_full = '%s/tfrecords/%s' % (data_dir, options.tfr_train_pattern)
    # data_stats[0] below assumes every genome shares the same sequence and
    # target lengths
    train_data.append(dataset.SeqDataset(tfr_train_full,
      seq_length=data_stats[0]['seq_length'],
      target_length=data_stats[0]['target_length'],
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.TRAIN))

    # load eval data
    tfr_eval_full = '%s/tfrecords/%s' % (data_dir, options.tfr_eval_pattern)
    eval_data.append(dataset.SeqDataset(tfr_eval_full,
      seq_length=data_stats[0]['seq_length'],
      target_length=data_stats[0]['target_length'],
      batch_size=params_train['batch_size'],
      mode=tf.estimator.ModeKeys.EVAL))

  if params_train.get('num_gpu', 1) == 1:
    ########################################
    # one GPU

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)

    # restore
    if options.restore:
      seqnn_model.restore(options.restore, options.trunk)

    # initialize trainer
    seqnn_trainer = trainer.Trainer(params_train, train_data, 
                                    eval_data, options.out_dir)

    # compile model
    seqnn_trainer.compile(seqnn_model)

    # train model
    seqnn_trainer.fit2(seqnn_model)

  else:
    ########################################
    # multiple GPUs

    print('Multiple GPUs untested for joint genome training.', file=sys.stderr)
    exit(1)
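
    # NOTE: everything below this exit() is unreachable; the MirroredStrategy
    # code is kept as a reference for a future multi-GPU update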

    mirrored_strategy = tf.distribute.MirroredStrategy()
    with mirrored_strategy.scope():

      # initialize model
      seqnn_model = seqnn.SeqNN(params_model)

      # restore
      if options.restore:
        seqnn_model.restore(options.restore, options.trunk)

      # initialize trainer
      seqnn_trainer = trainer.Trainer(params_train, train_data,
                                      eval_data, options.out_dir)

      # compile model
      seqnn_trainer.compile(seqnn_model)

    # train model
    seqnn_trainer.fit2(seqnn_model)
Example #5
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option('--head',
                      dest='head_i',
                      default=0,
                      type='int',
                      help='Parameters head to test [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='test_out',
        help='Output directory for test statistics [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help='Average the fwd and rc predictions [Default: %default]')
    parser.add_option(
        '--save',
        dest='save',
        default=False,
        action='store_true',
        help='Save targets and predictions numpy arrays [Default: %default]')
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--split',
        dest='split_label',
        default='test',
        help='Dataset split label, e.g. for the TFR pattern [Default: %default]')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default=None,
        help=
        'TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    parser.add_option(
        '-v',
        dest='high_var_pct',
        default=1.0,
        type='float',
        help='Highly variable site proportion to take [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters, model, and test data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # parse shifts to integers
    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    #######################################################
    # targets

    # read table
    if options.targets_file is None:
        options.targets_file = '%s/targets.txt' % data_dir
    targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')
    num_targets = targets_df.shape[0]

    # classify
    target_classes = []
    for ti in range(num_targets):
        description = targets_df.iloc[ti].description
        if description.find(':') == -1:
            tc = '*'
        else:
            desc_split = description.split(':')
            if desc_split[0] == 'CHIP':
                tc = '/'.join(desc_split[:2])
            else:
                tc = desc_split[0]
        target_classes.append(tc)
    targets_df['class'] = target_classes
    target_classes = sorted(set(target_classes))
    print(target_classes)
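    # example mappings produced by the loop above (descriptions illustrative):
    #   'CHIP:H3K27ac:keratinocyte' -> 'CHIP/H3K27ac'
    #   'DNASE:keratinocyte'        -> 'DNASE'
    #   'scATAC' (no colon)         -> '*'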

    #######################################################
    # model

    # read parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # construct eval data
    eval_data = dataset.SeqDataset(data_dir,
                                   split_label=options.split_label,
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL,
                                   tfr_pattern=options.tfr_pattern)

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file, options.head_i)
    seqnn_model.build_ensemble(options.rc, options.shifts)
    seqnn_model.downcast()

    #######################################################
    # targets/predictions

    # option to read from disk?

    # predict
    eval_preds = seqnn_model.predict(eval_data, verbose=1)
    print('')

    # targets
    eval_targets = eval_data.numpy(return_inputs=False, return_outputs=True)

    # flatten
    eval_preds = np.reshape(eval_preds, (-1, num_targets))
    eval_targets = np.reshape(eval_targets, (-1, num_targets))

    #######################################################
    # process classes

    targets_spec = np.zeros(num_targets)

    for tc in target_classes:
        class_mask = np.array(targets_df['class'] == tc)
        num_targets_class = class_mask.sum()

        if num_targets_class == 1:
            targets_spec[class_mask] = np.nan
        else:
            # slice class
            eval_preds_class = eval_preds[:, class_mask].astype('float32')
            eval_targets_class = eval_targets[:, class_mask].astype('float32')

            # highly variable filter
            if options.high_var_pct < 1:
                eval_targets_var = eval_targets_class.var(axis=1)
                high_var_t = np.percentile(eval_targets_var,
                                           100 * (1 - options.high_var_pct))
                high_var_mask = (eval_targets_var >= high_var_t)

                eval_preds_class = eval_preds_class[high_var_mask]
                eval_targets_class = eval_targets_class[high_var_mask]

            # quantile normalize
            eval_preds_norm = quantile_normalize(eval_preds_class)
            eval_targets_norm = quantile_normalize(eval_targets_class)

            # mean normalize
            eval_preds_norm = eval_preds_norm - eval_preds_norm.mean(
                axis=-1, keepdims=True)
            eval_targets_norm = eval_targets_norm - eval_targets_norm.mean(
                axis=-1, keepdims=True)

            # compute correlations
            pearsonr_class = np.zeros(num_targets_class)
            for ti in range(num_targets_class):
                pearsonr_class[ti] = pearsonr(eval_preds_norm[:, ti],
                                              eval_targets_norm[:, ti])[0]

            # save
            targets_spec[class_mask] = pearsonr_class

            # print the class summary
            print('%-15s  %4d  %.4f' %
                  (tc, num_targets_class, pearsonr_class.mean()))

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
        'index': targets_df.index,
        'pearsonr': targets_spec,
        'identifier': targets_df.identifier,
        'description': targets_df.description
    })
    targets_acc_df.to_csv('%s/acc.txt' % options.out_dir,
                          sep='\t',
                          index=False,
                          float_format='%.5f')
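
quantile_normalize above is another helper not shown here. A standard column-wise quantile normalization sketch (ties broken by sort order), assuming a 2-D positions-by-targets array:

import numpy as np

def quantile_normalize(m):
    """Map each column of m onto the mean distribution across columns."""
    # rank of each entry within its column
    ranks = np.argsort(np.argsort(m, axis=0), axis=0)

    # reference distribution: mean across columns at each sorted rank
    reference = np.sort(m, axis=0).mean(axis=1)

    return reference[ranks]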
Example #6
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='act_t',
        default=0.5,
        type='float',
        help=
        'Activation threshold (as proportion of max) to consider for PWM [Default: %default]'
    )
    parser.add_option(
        '-d',
        dest='plot_density',
        default=False,
        action='store_true',
        help='Plot filter activation density [Default: %default]')
    parser.add_option(
        '--heat',
        dest='plot_heats',
        default=False,
        action='store_true',
        help=
        'Plot heat maps describing filter activations in the test sequences [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='seq_length_crop',
        default=None,
        type='int',
        help='Crop sequences to shorter length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='basenji_motifs')
    parser.add_option('-m',
                      dest='meme_db',
                      default='%s/cisbp/Homo_sapiens.meme' %
                      os.environ['HG38'],
                      help='MEME database used to annotate motifs')
    parser.add_option(
        '-p',
        dest='parallel_threads',
        default=1,
        type='int',
        help='Generate weblogos in parallel threads [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='trim_filters',
        default=False,
        action='store_true',
        help=
        'Trim uninformative positions off the filter ends [Default: %default]')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default='test-*.tfr',
        help='TFR pattern string appended to data_dir [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide Basenji params and model files and data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #######################################################
    # inputs

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
    if options.seq_length_crop is not None:
        params_model['seq_length'] = options.seq_length_crop

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # construct data ops
    tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
    eval_data = dataset.SeqDataset(tfr_pattern_path,
                                   seq_length=data_stats['seq_length'],
                                   seq_length_crop=options.seq_length_crop,
                                   target_length=data_stats['target_length'],
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL)

    # obtain sequences
    eval_seqs_1hot = eval_data.numpy(return_inputs=True, return_outputs=False)
    eval_seqs_dna = dna_io.hot1_dna(eval_seqs_1hot)
    del eval_seqs_1hot

    #################################################################
    # model

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file)

    # first layer embedding
    seqnn_model.build_embed(0)
    _, preds_length, preds_depth = seqnn_model.embed.output.shape

    # get weights
    filter_weights = seqnn_model.get_conv_weights()
    print(filter_weights.shape)
    num_filters, _, filter_size = filter_weights.shape

    # compute filter activations
    filter_outs = seqnn_model.predict(eval_data)
    print(filter_outs.shape)

    #################################################################
    # individual filter plots

    # save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir,
                          eval_seqs_dna)

    # plot weblogo of high scoring outputs (in parallel)
    if options.parallel_threads > 1:
        pfl_args = []
        for f in range(num_filters):
            pfl_args.append(
                (filter_outs[:, :, f], filter_size, eval_seqs_dna,
                 '%s/filter%d_logo' % (options.out_dir, f), options.act_t))
        with multiprocessing.get_context('spawn').Pool(
                options.parallel_threads) as pool:
            pool.starmap(plot_filter_logo, pfl_args)

    for f in range(num_filters):
        print('Filter %d' % f)

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        if options.parallel_threads == 1:
            plot_filter_logo(filter_outs[:, :, f], filter_size, eval_seqs_dna,
                             '%s/filter%d_logo' % (options.out_dir, f),
                             options.act_t)

        # write possum motif file
        # filter_possum(filter_weights[f, :, :], 'filter%d' % f,
        #               '%s/filter%d_possum.txt' % (options.out_dir,
        #                                           f), options.trim_filters)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' %
                                             (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call(
        'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s'
        % (options.out_dir, options.out_dir, options.meme_db),
        shell=True)

    # read in annotations
    filter_names = name_filters(num_filters,
                                '%s/tomtom/tomtom.tsv' % options.out_dir,
                                options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later panda reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print('%3s  %19s  %10s  %5s  %6s  %6s' % header_cols, file=table_out)

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        f_scores = np.ravel(filter_outs[:, :, f])
        fmean, fstd = f_scores.mean(), f_scores.std()
        if options.plot_density:
            # plot density of filter output scores
            plot_score_density(f_scores,
                               '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print('%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols,
              file=table_out)

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################

    # these methods make less sense for longer sequences;
    # I should fragment the sequences first.

    if options.plot_heats:
        # plot filter-sequence heatmap
        plot_filter_seq_heat(filter_outs,
                             '%s/filter_seqs.pdf' % options.out_dir)

        # plot filter-segment heatmap
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs.pdf' % options.out_dir)
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs_raw.pdf' % options.out_dir,
                             whiten=False)

        # plot filter-target correlation heatmap
        # NOTE: seq_targets and target_names are undefined in this snippet;
        # load the targets (e.g. eval_data.numpy(return_outputs=True)) and
        # the target labels before running these calls
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_mean.pdf' % options.out_dir,
                         'mean')
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_max.pdf' % options.out_dir,
                         'max')
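
Helpers such as make_filter_pwm, info_content, and plot_filter_logo are defined elsewhere in the script. For the information content, the conventional DNA formula is 2 + sum_b p_b * log2(p_b) bits per position; a minimal sketch under that convention (the Basenji implementation may differ in details such as pseudocounts):

import numpy as np

def info_content(pwm, pseudocount=1e-9):
    """Total information content (bits) of a DNA PWM (positions x 4 bases)."""
    pwm = np.asarray(pwm, dtype='float64') + pseudocount
    pwm /= pwm.sum(axis=1, keepdims=True)

    # per-position IC relative to a uniform ACGT background
    position_ic = 2.0 + np.sum(pwm * np.log2(pwm), axis=1)
    return float(position_ic.sum())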
Example #7
def main(_):
    # I could write some additional code around this to check for common
    # problems, such as with num_targets.
    with open(FLAGS.params) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    if not params_train.get('use_gpu', 1):
        # hide GPUs before TensorFlow is imported below
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        print('training on CPU')

    import shutil
    if not os.path.isdir(FLAGS.log_dir):
        os.mkdir(FLAGS.log_dir)
    if not os.path.isfile(FLAGS.log_dir + '/params.json'):
        shutil.copy(FLAGS.params, FLAGS.log_dir + '/params.json')

    import tensorflow as tf
    if tf.__version__[0] == '1':
        tf.compat.v1.enable_eager_execution()
    print('tf version:', tf.__version__)

    from basenji import dataset
    from basenji import seqnn
    from basenji import trainer

    # load data
    diagonal_offset = params_model.get('diagonal_offset', 2)
    target_crop = params_model.get('target_crop', 0)
    target_length_crop = params_model[
        'target_length'] - diagonal_offset - 2 * target_crop
    tlen = target_length_crop * (target_length_crop + 1) // 2
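    # tlen: upper-triangle size n*(n+1)//2 for n = target_length_crop, i.e.
    # the length of the flattened contact-map targets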

    train_data = dataset.SeqDataset(FLAGS.train_data,
                                    params_train['batch_size'],
                                    params_model['seq_length'], tlen,
                                    tf.estimator.ModeKeys.TRAIN)

    eval_data = dataset.SeqDataset(FLAGS.eval_data, params_train['batch_size'],
                                   params_model['seq_length'], tlen,
                                   tf.estimator.ModeKeys.EVAL)

    if params_train.get('num_gpu', 1) == 1:
        ########################################
        # one GPU

        # initialize model
        seqnn_model = seqnn.SeqNN(params_model)

        # restore
        if FLAGS.restore:
            seqnn_model.restore(FLAGS.restore, FLAGS.trunk)
            print('restored weights')
            if FLAGS.freeze_trunk:
                seqnn_model.model_trunk.trainable = False
                print('frozen trunk')

        # initialize trainer
        seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data)

        # compile model
        seqnn_trainer.compile(seqnn_model.model)

        # train model
        seqnn_trainer.fit(seqnn_model.model)

    else:
        ########################################
        # multiple GPUs
        print('need to update multigpu')
Example #8
def main(_):
    # I could write some additional code around this to check for common
    # problems, such as with num_targets.
    with open(FLAGS.params) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    if not params_train.get('use_gpu', 1):
        # hide GPUs from TensorFlow
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        print('training on CPU')

    # load data
    train_data = dataset.SeqDataset(FLAGS.train_data,
                                    params_train['batch_size'],
                                    params_model['seq_length'],
                                    params_model['target_length'],
                                    tf.estimator.ModeKeys.TRAIN)
    eval_data = dataset.SeqDataset(FLAGS.eval_data, params_train['batch_size'],
                                   params_model['seq_length'],
                                   params_model['target_length'],
                                   tf.estimator.ModeKeys.EVAL)

    if params_train.get('num_gpu', 1) == 1:
        ########################################
        # one GPU

        # initialize model
        seqnn_model = seqnn.SeqNN(params_model)

        # restore
        if FLAGS.restore:
            seqnn_model.restore(FLAGS.restore, FLAGS.trunk)

        # initialize trainer
        seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data)

        # compile model
        seqnn_trainer.compile(seqnn_model.model)

        # train model
        seqnn_trainer.fit(seqnn_model.model)

    else:
        ########################################
        # multiple GPUs

        mirrored_strategy = tf.distribute.MirroredStrategy()
        with mirrored_strategy.scope():

            # initialize model
            seqnn_model = seqnn.SeqNN(params_model)

            # restore
            if FLAGS.restore:
                seqnn_model.restore(FLAGS.restore, FLAGS.trunk)

            # initialize trainer
            seqnn_trainer = trainer.Trainer(params_train, train_data,
                                            eval_data)

            # compile model
            seqnn_trainer.compile(seqnn_model.model, None)

        # train model
        seqnn_trainer.fit(seqnn_model.model)
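
Examples #7 and #8 receive main(_) and read a module-level FLAGS object, the usual absl pattern. A sketch of how such a script is typically wired up, assuming flag names that match the attributes accessed above (log_dir and freeze_trunk are only used by Example #7):

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('params', None, 'JSON parameters file')
flags.DEFINE_string('train_data', None, 'training TFRecords pattern')
flags.DEFINE_string('eval_data', None, 'evaluation TFRecords pattern')
flags.DEFINE_string('restore', None, 'model checkpoint to restore')
flags.DEFINE_boolean('trunk', False, 'restore only the model trunk')
flags.DEFINE_boolean('freeze_trunk', False, 'freeze the restored trunk')
flags.DEFINE_string('log_dir', 'train_out', 'output and log directory')

if __name__ == '__main__':
    app.run(main)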