def read_data(learning_file):

    # set up the training set
    train_data = dataset.SeqDataset(learning_file, state='Train', k=0)

    # set up the validation set
    valid_data = dataset.SeqDataset(learning_file, state='Valid', k=0)

    logger.info('num_of_trainData:' + str(len(train_data)))
    logger.info('num_of_validData:' + str(len(valid_data)))

    logger.info('train positive label sum:' +
                str(np.sum(np.array(train_data.labels))))
    logger.info('valid positive label sum:' +
                str(np.sum(np.array(valid_data.labels))))

    logger.info('Positive samples proportion:{:.5f}'.format(
        (np.sum(np.array(train_data.labels)) +
         np.sum(np.array(valid_data.labels))) /
        (len(train_data) + len(valid_data))))

    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True)
    valid_loader = Data.DataLoader(dataset=valid_data,
                                   # max(1, ...) guards against validation sets
                                   # smaller than 50, which would give batch_size == 0
                                   batch_size=max(1, len(valid_data) // 50),
                                   shuffle=True)

    return train_data, valid_data, train_loader, valid_loader
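
# Usage sketch (an assumption, not part of the original excerpt): this function
# expects module-level `dataset`, `logger`, `np` (numpy), `Data`
# (torch.utils.data), and a `BATCH_SIZE` constant; the file name below is a
# placeholder.
# train_data, valid_data, train_loader, valid_loader = read_data('learning.txt')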
Example #2
# (fragment: the enclosing function's signature and the definitions of
# `data_stats_file` and `data_dir` were not captured in this excerpt)
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)
    seq_length = data_stats['seq_length']
    target_length = data_stats['target_length']
    hic_diags = data_stats['diagonal_offset']
    target_crop = data_stats['crop_bp'] // data_stats['pool_width']
    target_length1 = data_stats['seq_length'] // data_stats['pool_width']

    ### load data ###
    sequences = pd.read_csv(data_dir + '/sequences.bed',
                            sep='\t',
                            names=['chr', 'start', 'stop', 'type'])
    sequences_test = sequences[sequences['type'] == 'test'].reset_index(drop=True)
    print("going to load test dataset")
    test_data = dataset.SeqDataset(data_dir, 'test', batch_size=8)

    # test_targets is a float array with shape
    # [#regions, #pixels, #target datasets],
    # representing log(obs/exp) data, where #pixels
    # corresponds to the number of entries in the flattened
    # upper-triangular representation of the contact matrix

    # test_inputs is a 1-hot encoded array with shape
    # [#regions, 2^20 bp, 4 nucleotides]

    test_inputs, test_targets = test_data.numpy(return_inputs=True,
                                                return_outputs=True)
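
    # Hedged consistency checks (assumptions, not from the original script):
    # the shape comments above imply the following, and the flattened
    # upper-triangular representation offset by hic_diags above the main
    # diagonal should account for every pixel.
    assert test_inputs.shape[1:] == (seq_length, 4)
    assert test_targets.shape[1] == target_length
    crop_len = target_length1 - 2 * target_crop
    assert len(np.triu_indices(crop_len, k=hic_diags)[0]) == target_length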

    # print(test_targets)
Example #3
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
  parser = OptionParser(usage)
  parser.add_option('--ai', dest='accuracy_indexes',
      help='Comma-separated list of target indexes to make accuracy scatter plots.')
  parser.add_option('--head', dest='head_i',
      default=0, type='int',
      help='Parameters head to test [Default: %default]')
  parser.add_option('--mc', dest='mc_n',
      default=0, type='int',
      help='Monte carlo test iterations [Default: %default]')
  parser.add_option('--peak','--peaks', dest='peaks',
      default=False, action='store_true',
      help='Compute expensive peak accuracy [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='test_out',
      help='Output directory for test statistics [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average the fwd and rc predictions [Default: %default]')
  parser.add_option('--save', dest='save',
      default=False, action='store_true',
      help='Save targets and predictions numpy arrays [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--split', dest='split_label',
      default='test',
      help='Dataset split label, e.g. for the TFR pattern [Default: %default]')
  parser.add_option('--tfr', dest='tfr_pattern',
      default=None,
      help='TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters, model, and test data HDF5')
  else:
    params_file = args[0]
    model_file = args[1]
    data_dir = args[2]

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # parse shifts to integers
  options.shifts = [int(shift) for shift in options.shifts.split(',')]

  #######################################################
  # inputs

  # read targets
  if options.targets_file is None:
    options.targets_file = '%s/targets.txt' % data_dir
  targets_df = pd.read_csv(options.targets_file, index_col=0, sep='\t')

  # read model parameters
  with open(params_file) as params_open:
    params = json.load(params_open)
  params_model = params['model']
  params_train = params['train']

  # construct eval data
  eval_data = dataset.SeqDataset(data_dir,
    split_label=options.split_label,
    batch_size=params_train['batch_size'],
    mode='eval',
    tfr_pattern=options.tfr_pattern)

  # initialize model
  seqnn_model = seqnn.SeqNN(params_model)
  seqnn_model.restore(model_file, options.head_i)
  seqnn_model.build_ensemble(options.rc, options.shifts)

  #######################################################
  # evaluate

  loss_label = params_train.get('loss', 'poisson').lower()
  spec_weight = params_train.get('spec_weight', 1)
  loss_fn = trainer.parse_loss(loss_label, spec_weight=spec_weight)

  # evaluate
  test_loss, test_metric1, test_metric2 = seqnn_model.evaluate(eval_data, loss=loss_fn)

  # print summary statistics
  print('\nTest Loss:         %7.5f' % test_loss)

  if loss_label == 'bce':
    print('Test AUROC:        %7.5f' % test_metric1.mean())
    print('Test AUPRC:        %7.5f' % test_metric2.mean())

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
      'index': targets_df.index,
      'auroc': test_metric1,
      'auprc': test_metric2,
      'identifier': targets_df.identifier,
      'description': targets_df.description
      })

  else:
    print('Test PearsonR:     %7.5f' % test_metric1.mean())
    print('Test R2:           %7.5f' % test_metric2.mean())

    # write target-level statistics
    targets_acc_df = pd.DataFrame({
      'index': targets_df.index,
      'pearsonr': test_metric1,
      'r2': test_metric2,
      'identifier': targets_df.identifier,
      'description': targets_df.description
      })

  targets_acc_df.to_csv('%s/acc.txt'%options.out_dir, sep='\t',
                        index=False, float_format='%.5f')

  #######################################################
  # predict?

  if options.save or options.peaks or options.accuracy_indexes is not None:
    # compute predictions
    test_preds = seqnn_model.predict(eval_data).astype('float16')

    # read targets
    test_targets = eval_data.numpy(return_inputs=False)

  if options.save:
    preds_h5 = h5py.File('%s/preds.h5' % options.out_dir, 'w')
    preds_h5.create_dataset('preds', data=test_preds)
    preds_h5.close()
    targets_h5 = h5py.File('%s/targets.h5' % options.out_dir, 'w')
    targets_h5.create_dataset('targets', data=test_targets)
    targets_h5.close()
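
    # Reading the saved arrays back (a sketch; same h5py API as above):
    # with h5py.File('%s/preds.h5' % options.out_dir, 'r') as preds_h5:
    #     test_preds = preds_h5['preds'][:]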


  #######################################################
  # peak call accuracy

  if options.peaks:
    peaks_out_file = '%s/peaks.txt' % options.out_dir
    test_peaks(test_preds, test_targets, peaks_out_file)


  #######################################################
  # accuracy plots

  if options.accuracy_indexes is not None:
    accuracy_indexes = [int(ti) for ti in options.accuracy_indexes.split(',')]

    if not os.path.isdir('%s/scatter' % options.out_dir):
      os.mkdir('%s/scatter' % options.out_dir)

    if not os.path.isdir('%s/violin' % options.out_dir):
      os.mkdir('%s/violin' % options.out_dir)

    if not os.path.isdir('%s/roc' % options.out_dir):
      os.mkdir('%s/roc' % options.out_dir)

    if not os.path.isdir('%s/pr' % options.out_dir):
      os.mkdir('%s/pr' % options.out_dir)

    for ti in accuracy_indexes:
      test_targets_ti = test_targets[:, :, ti]

      ############################################
      # scatter

      # sample every few bins (adjust to control how many points are plotted)
      ds_indexes = np.arange(0, test_preds.shape[1], 8)

      # subset and flatten
      test_targets_ti_flat = test_targets_ti[:, ds_indexes].flatten(
      ).astype('float32')
      test_preds_ti_flat = test_preds[:, ds_indexes, ti].flatten().astype(
          'float32')

      # take log2
      test_targets_ti_log = np.log2(test_targets_ti_flat + 1)
      test_preds_ti_log = np.log2(test_preds_ti_flat + 1)

      # plot log2
      sns.set(font_scale=1.2, style='ticks')
      out_pdf = '%s/scatter/t%d.pdf' % (options.out_dir, ti)
      plots.regplot(
          test_targets_ti_log,
          test_preds_ti_log,
          out_pdf,
          poly_order=1,
          alpha=0.3,
          sample=500,
          figsize=(6, 6),
          x_label='log2 Experiment',
          y_label='log2 Prediction',
          table=True)

      ############################################
      # violin

      # call peaks
      test_targets_ti_lambda = np.mean(test_targets_ti_flat)
      test_targets_pvals = 1 - poisson.cdf(
          np.round(test_targets_ti_flat) - 1, mu=test_targets_ti_lambda)
      test_targets_qvals = np.array(ben_hoch(test_targets_pvals))
      test_targets_peaks = test_targets_qvals < 0.01
      test_targets_peaks_str = np.where(test_targets_peaks, 'Peak',
                                        'Background')

      # violin plot
      sns.set(font_scale=1.3, style='ticks')
      plt.figure()
      df = pd.DataFrame({
          'log2 Prediction': np.log2(test_preds_ti_flat + 1),
          'Experimental coverage status': test_targets_peaks_str
      })
      ax = sns.violinplot(
          x='Experimental coverage status', y='log2 Prediction', data=df)
      ax.grid(True, linestyle=':')
      plt.savefig('%s/violin/t%d.pdf' % (options.out_dir, ti))
      plt.close()

      # ROC
      plt.figure()
      fpr, tpr, _ = roc_curve(test_targets_peaks, test_preds_ti_flat)
      auroc = roc_auc_score(test_targets_peaks, test_preds_ti_flat)
      plt.plot(
          [0, 1], [0, 1], c='black', linewidth=1, linestyle='--', alpha=0.7)
      plt.plot(fpr, tpr, c='black')
      ax = plt.gca()
      ax.set_xlabel('False positive rate')
      ax.set_ylabel('True positive rate')
      ax.text(
          0.99, 0.02, 'AUROC %.3f' % auroc,
          horizontalalignment='right')  # , fontsize=14)
      ax.grid(True, linestyle=':')
      plt.savefig('%s/roc/t%d.pdf' % (options.out_dir, ti))
      plt.close()

      # PR
      plt.figure()
      prec, recall, _ = precision_recall_curve(test_targets_peaks,
                                               test_preds_ti_flat)
      auprc = average_precision_score(test_targets_peaks, test_preds_ti_flat)
      plt.axhline(
          y=test_targets_peaks.mean(),
          c='black',
          linewidth=1,
          linestyle='--',
          alpha=0.7)
      plt.plot(recall, prec, c='black')
      ax = plt.gca()
      ax.set_xlabel('Recall')
      ax.set_ylabel('Precision')
      ax.text(
          0.99, 0.95, 'AUPRC %.3f' % auprc,
          horizontalalignment='right')  # , fontsize=14)
      ax.grid(True, linestyle=':')
      plt.savefig('%s/pr/t%d.pdf' % (options.out_dir, ti))
      plt.close()
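
# `ben_hoch`, used above for peak calling, is defined elsewhere in the original
# script. A minimal Benjamini-Hochberg sketch with the same call shape
# (p-values in, q-values out) might look like this:
def ben_hoch(p_values):
  """Convert p-values to q-values via the Benjamini-Hochberg procedure."""
  p_values = np.asarray(p_values)
  m = len(p_values)
  order = np.argsort(p_values)
  # q_i = p_(i) * m / rank_i, then enforce monotonicity from the largest rank down
  ranked_q = p_values[order] * m / (np.arange(m) + 1)
  ranked_q = np.minimum.accumulate(ranked_q[::-1])[::-1]
  q_values = np.empty(m)
  q_values[order] = np.clip(ranked_q, 0.0, 1.0)
  return q_values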
Example #4
def main():
    usage = 'usage: %prog [options] <params_file> <data1_dir> ...'
    parser = OptionParser(usage)
    parser.add_option('-k',
                      dest='keras_fit',
                      default=False,
                      action='store_true',
                      help='Train with Keras fit method [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='train_out',
        help='Output directory for test statistics [Default: %default]')
    parser.add_option(
        '--restore',
        dest='restore',
        help='Restore model and continue training [Default: %default]')
    parser.add_option('--trunk',
                      dest='trunk',
                      default=False,
                      action='store_true',
                      help='Restore only model trunk [Default: %default]')
    parser.add_option(
        '--tfr_train',
        dest='tfr_train_pattern',
        default=None,
        help=
        'Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    parser.add_option(
        '--tfr_eval',
        dest='tfr_eval_pattern',
        default=None,
        help=
        'Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide parameters and data directory.')
    else:
        params_file = args[0]
        data_dirs = args[1:]

    if options.keras_fit and len(data_dirs) > 1:
        print('Cannot use keras fit method with multi-genome training.')
        exit(1)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)
    if params_file != '%s/params.json' % options.out_dir:
        shutil.copy(params_file, '%s/params.json' % options.out_dir)

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # read datasets
    train_data = []
    eval_data = []

    for data_dir in data_dirs:
        # load train data
        train_data.append(
            dataset.SeqDataset(data_dir,
                               split_label='train',
                               batch_size=params_train['batch_size'],
                               mode='train',
                               tfr_pattern=options.tfr_train_pattern))

        # load eval data
        eval_data.append(
            dataset.SeqDataset(data_dir,
                               split_label='valid',
                               batch_size=params_train['batch_size'],
                               mode='eval',
                               tfr_pattern=options.tfr_eval_pattern))

    if params_train.get('num_gpu', 1) == 1:
        ########################################
        # one GPU

        # initialize model
        # print('INITIALIZE MODEL')
        seqnn_model = seqnn.SeqNN(params_model)
        # seqnn_model = model_zoo.basenji_model((131072,4), 3)

        # restore
        if options.restore:
            seqnn_model.restore(options.restore, trunk=options.trunk)

        # initialize trainer
        seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data,
                                        options.out_dir)

        # compile model
        seqnn_trainer.compile(seqnn_model)

    # else:
    ########################################
    # multi-GPU path (currently disabled)

    # strategy = tf.distribute.MirroredStrategy()
    #
    # with strategy.scope():
    #
    #   if not options.keras_fit:
    #     # distribute data
    #     for di in range(len(data_dirs)):
    #       train_data[di].distribute(strategy)
    #       eval_data[di].distribute(strategy)
    #
    #   # initialize model
    #   seqnn_model = seqnn.SeqNN(params_model)
    #
    #   # restore
    #   if options.restore:
    #     seqnn_model.restore(options.restore, options.trunk)
    #
    #   # initialize trainer
    #   seqnn_trainer = trainer.Trainer(params_train, train_data, eval_data, options.out_dir,
    #                                   strategy, params_train['num_gpu'], options.keras_fit)
    #
    #   # compile model
    #   seqnn_trainer.compile(seqnn_model)

    # train model
    if options.keras_fit:
        seqnn_trainer.fit_keras(seqnn_model)
    else:
        if len(data_dirs) == 1:
            seqnn_trainer.fit_tape(seqnn_model)
        else:
            seqnn_trainer.fit2(seqnn_model)
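
# Hypothetical invocation (script name and paths are placeholders):
#   python train.py -o train_out params.json data_human data_mouse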
Example #5
def main():
    usage = 'usage: %prog [options] <data_dir> <model_name> <output_dir> <params_file>...'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='batch_size',
        default=4,
        type='int',
        help='Batch size for model training [Default: %default]')
    parser.add_option('-p',
                      dest='patience',
                      default=8,
                      type='int',
                      help='Training patience [Default: %default]')
    parser.add_option('-l',
                      dest='learning_rate',
                      default=0.1,
                      type='float',
                      help='Learning rate [Default: %default]')
    parser.add_option('-m',
                      dest='momentum',
                      default=0.99,
                      type='float',
                      help='SGD momentum [Default: %default]')
    parser.add_option('-e',
                      dest='n_epochs',
                      default=8,
                      type='int',
                      help='Number of training epochs [Default: %default]')
    parser.add_option('--clip_norm',
                      dest='clip_norm',
                      default=1000000,
                      type='float',
                      help='Gradient clipping norm [Default: %default]')
    (options, args) = parser.parse_args()
    # TODO: add the remaining parameters
    if len(args) < 4:
        parser.error('Must provide data directory, model name, output directory, and parameters file.')
    else:
        data_dir = args[0]
        model_name = args[1]
        output_dir = args[2]
        params_file = args[3]

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    #### load data ####

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']

    # read datasets
    train_data = []
    eval_data = []

    # load train data
    train_data.append(
        dataset.SeqDataset(data_dir,
                           split_label='train',
                           batch_size=params_train['batch_size'],
                           mode='train'))

    # load eval data
    eval_data.append(
        dataset.SeqDataset(data_dir,
                           split_label='valid',
                           batch_size=params_train['batch_size'],
                           mode='eval'))
    ##########################################

    if model_name == 'basenji':
        model = model_zoo.basenji_model((131072, 4), 3)
    else:
        parser.error('Unknown model name: %s' % model_name)
    loss_fn = tf.keras.losses.Poisson(reduction=tf.keras.losses.Reduction.NONE)
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_pearsonr',  # alternatively: 'val_aupr'
        patience=options.patience,
        verbose=1,
        mode='max')
    save_best = tf.keras.callbacks.ModelCheckpoint(
        '{}/model_best.h5'.format(output_dir),
        save_best_only=True,
        mode='max',
        monitor='val_pearsonr',
        verbose=1)
    callbacks = [
        early_stop,
        tf.keras.callbacks.TensorBoard(output_dir),
        tf.keras.callbacks.ModelCheckpoint('%s/model_check.h5' % output_dir),
        save_best
    ]
    # fit model
    num_targets = model.output_shape[-1]
    print('num_targets ', num_targets)
    model_metrics = [metrics.PearsonR(num_targets), metrics.R2(num_targets)]

    optimizer = tf.keras.optimizers.SGD(learning_rate=options.learning_rate,
                                        momentum=options.momentum,
                                        clipnorm=options.clip_norm)

    model.compile(loss=loss_fn, optimizer=optimizer, metrics=model_metrics)
    # NOTE (assumption): basenji-style SeqDataset exposes its tf.data pipeline
    # as `.dataset`; the original called fit() with undefined `train`/`valid`.
    model.fit(train_data[0].dataset,
              epochs=options.n_epochs,
              callbacks=callbacks,
              validation_data=eval_data[0].dataset)
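
# Hypothetical invocation (script name and paths are placeholders):
#   python train_keras.py data_dir basenji train_out params.json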