Example #1
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)
    parser.add_option('-f',
                      dest='genome_fasta',
                      default=None,
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option(
        '-l',
        dest='mut_len',
        default=200,
        type='int',
        help='Length of center sequence to mutate [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sat_mut',
                      help='Output directory [Default: %default]')
    parser.add_option('--plots',
                      dest='plots',
                      default=False,
                      action='store_true',
                      help='Make heatmap plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        bed_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, 'rb')
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

    else:
        parser.error(
            'Must provide parameters and model files and BED file')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file)

    if options.targets_file is None:
        target_ids = ['t%d' % ti for ti in range(job['num_targets'])]
        target_labels = [''] * len(target_ids)
        target_subset = None

    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job['num_targets']:
            target_subset = None

    num_targets = len(target_ids)

    #################################################################
    # sequence dataset

    # read sequences from BED
    seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta,
                                     job['seq_length'])

    # filter for worker sequences
    if options.processes is not None:
        worker_bounds = np.linspace(0,
                                    len(seqs_dna),
                                    options.processes + 1,
                                    dtype='int')
        seqs_dna = seqs_dna[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        seqs_coords = seqs_coords[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_seqs = len(seqs_dna)

    # determine mutation region limits
    seq_mid = job['seq_length'] // 2
    mut_start = seq_mid - options.mut_len // 2
    mut_end = mut_start + options.mut_len

    # make data ops
    data_ops = satmut_data_ops(seqs_dna, mut_start, mut_end, job['batch_size'])

    #################################################################
    # setup model

    # build model
    model = seqnn.SeqNN()
    model.build_sad(job,
                    data_ops,
                    target_subset=target_subset,
                    ensemble_rc=options.rc,
                    ensemble_shifts=options.shifts)

    #################################################################
    # setup output

    scores_h5_file = '%s/scores.h5' % options.out_dir
    if os.path.isfile(scores_h5_file):
        os.remove(scores_h5_file)
    scores_h5 = h5py.File(scores_h5_file, 'w')
    scores_h5.create_dataset('scores',
                             dtype='float16',
                             shape=(num_seqs, options.mut_len, 4, num_targets))
    scores_h5.create_dataset('seqs',
                             dtype='bool',
                             shape=(num_seqs, options.mut_len, 4))

    # store mutagenesis sequence coordinates
    seqs_chr, seqs_start, _ = zip(*seqs_coords)
    seqs_chr = np.array(seqs_chr, dtype='S')
    seqs_start = np.array(seqs_start) + mut_start
    seqs_end = seqs_start + options.mut_len
    scores_h5.create_dataset('chrom', data=seqs_chr)
    scores_h5.create_dataset('start', data=seqs_start)
    scores_h5.create_dataset('end', data=seqs_end)

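    # each sequence needs one reference prediction plus three alternate-base predictions per mutated position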
    preds_per_seq = 1 + 3 * options.mut_len

    score_threads = []
    score_queue = Queue()
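    # start a single scoring thread; ScoreWorker is assumed to consume items from score_queue and write into scores_h5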
    for i in range(1):
        sw = ScoreWorker(score_queue, scores_h5)
        sw.start()
        score_threads.append(sw)

    #################################################################
    # predict scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # coordinator
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord)

        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        for si in range(num_seqs):
            print('Predicting %d' % si, flush=True)

            # collect sequence predictions
            seq_preds = []
            for spi in range(preds_per_seq):
                seq_preds.append(preds_stream[pi])
                pi += 1

            # wait for previous to finish
            score_queue.join()

            # queue sequence for scoring
            score_queue.put((seqs_dna[si], seq_preds, si))

            # queue sequence for plotting
            if options.plots:
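                # NOTE: plot_queue is not created in this excerpt; it is assumed to exist when --plots is used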
                plot_queue.put((seqs_dna[si], seq_preds, si))

    # finish queue
    print('Waiting for threads to finish.', flush=True)
    score_queue.join()

    # close output HDF5
    scores_h5.close()
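
Every example here shards its input the same way when run as a multi-worker job: np.linspace(0, n, processes + 1) produces integer bounds, and worker i keeps the slice between bounds i and i + 1. A minimal, self-contained sketch of that slicing on hypothetical toy data (the sequence list and worker count below are illustrative, not taken from the examples):

import numpy as np

seqs = ['seq%d' % i for i in range(10)]  # hypothetical input sequences
processes = 3  # hypothetical worker count
worker_bounds = np.linspace(0, len(seqs), processes + 1, dtype='int')
for worker_index in range(processes):
    chunk = seqs[worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
    print(worker_index, chunk)  # chunk sizes come out as 3, 3, and 4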
Example #2
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
  parser = OptionParser(usage)
  parser.add_option('-c', dest='center_pct',
      default=0.25, type='float',
      help='Require clustered SNPs lie in center region [Default: %default]')
  parser.add_option('-f', dest='genome_fasta',
      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('--flip', dest='flip_ref',
      default=False, action='store_true',
      help='Flip reference/alternate alleles when simple [Default: %default]')
  parser.add_option('--local', dest='local',
      default=1024, type='int',
      help='Local SAD score [Default: %default]')
  parser.add_option('-n', dest='norm_file',
      default=None,
      help='Normalize SAD scores')
  parser.add_option('-o', dest='out_dir',
      default='sad',
      help='Output directory for tables and plots [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('--pseudo', dest='log_pseudo',
      default=1, type='float',
      help='Log2 pseudocount [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0', type='str',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('--stats', dest='sad_stats',
      default='SAD',
      help='Comma-separated list of stats to save. [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')
  parser.add_option('--ti', dest='track_indexes',
      default=None, type='str',
      help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option('-u', dest='penultimate',
      default=False, action='store_true',
      help='Compute SED in the penultimate layer [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) == 3:
    # single worker
    params_file = args[0]
    model_file = args[1]
    vcf_file = args[2]

  elif len(args) == 5:
    # multi worker
    options_pkl_file = args[0]
    params_file = args[1]
    model_file = args[2]
    vcf_file = args[3]
    worker_index = int(args[4])

    # load options
    options_pkl = open(options_pkl_file, 'rb')
    options = pickle.load(options_pkl)
    options_pkl.close()

    # update output directory
    options.out_dir = '%s/job%d' % (options.out_dir, worker_index)

  else:
    parser.error('Must provide parameters and model files and QTL VCF file')

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.track_indexes is None:
    options.track_indexes = []
  else:
    options.track_indexes = [int(ti) for ti in options.track_indexes.split(',')]
    if not os.path.isdir('%s/tracks' % options.out_dir):
      os.mkdir('%s/tracks' % options.out_dir)

  options.shifts = [int(shift) for shift in options.shifts.split(',')]
  options.sad_stats = options.sad_stats.split(',')


  #################################################################
  # read parameters and collect target information

  job = params.read_job_params(params_file, require=['seq_length','num_targets'])

  if options.targets_file is None:
    target_ids = ['t%d' % ti for ti in range(job['num_targets'])]
    target_labels = ['']*len(target_ids)
    target_subset = None

  else:
    targets_df = pd.read_table(options.targets_file, index_col=0)
    target_ids = targets_df.identifier
    target_labels = targets_df.description
    target_subset = targets_df.index
    if len(target_subset) == job['num_targets']:
      target_subset = None


  #################################################################
  # load SNPs

  # read sorted SNPs from VCF
  snps = bvcf.vcf_snps(vcf_file, require_sorted=True, flip_ref=options.flip_ref,
                       validate_ref_fasta=options.genome_fasta)

  # filter for worker SNPs
  if options.processes is not None:
    worker_bounds = np.linspace(0, len(snps), options.processes+1, dtype='int')
    snps = snps[worker_bounds[worker_index]:worker_bounds[worker_index+1]]

  num_snps = len(snps)

  # cluster SNPs by position
  snp_clusters = cluster_snps(snps, job['seq_length'], options.center_pct)

  # delimit sequence boundaries
  for sc in snp_clusters:
    sc.delimit(job['seq_length'])

  # open genome FASTA
  genome_open = pysam.Fastafile(options.genome_fasta)

  # make SNP sequence generator
  def snp_gen():
    for sc in snp_clusters:
      snp_1hot_list = sc.get_1hots(genome_open)
      for snp_1hot in snp_1hot_list:
        yield {'sequence':snp_1hot}

  snp_types = {'sequence': tf.float32}
  snp_shapes = {'sequence': tf.TensorShape([tf.Dimension(job['seq_length']),
                                            tf.Dimension(4)])}

  dataset = tf.data.Dataset.from_generator(snp_gen,
                                          output_types=snp_types,
                                          output_shapes=snp_shapes)
  dataset = dataset.batch(job['batch_size'])
  dataset = dataset.prefetch(2*job['batch_size'])
  # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0'))

  iterator = dataset.make_one_shot_iterator()
  data_ops = iterator.get_next()


  #################################################################
  # setup model

  # build model
  t0 = time.time()
  model = seqnn.SeqNN()
  model.build_sad(job, data_ops,
                  ensemble_rc=options.rc, ensemble_shifts=options.shifts,
                  embed_penultimate=options.penultimate, target_subset=target_subset)
  print('Model building time %f' % (time.time() - t0), flush=True)

  if options.penultimate:
    # labels become inappropriate
    target_ids = ['']*model.hp.cnn_filters[-1]
    target_labels = target_ids

  # read target normalization factors
  target_norms = np.ones(len(target_labels))
  if options.norm_file is not None:
    ti = 0
    for line in open(options.norm_file):
      target_norms[ti] = float(line.strip())
      ti += 1

  num_targets = len(target_ids)

  #################################################################
  # setup output

  snp_flips = np.array([snp.flipped for snp in snps], dtype='bool')

  sad_out = initialize_output_h5(options.out_dir, options.sad_stats,
                                 snps, target_ids, target_labels)

  snp_threads = []

  snp_queue = Queue()
  for i in range(1):
    sw = SNPWorker(snp_queue, sad_out, options.sad_stats, options.log_pseudo)
    sw.start()
    snp_threads.append(sw)

  #################################################################
  # predict SNP scores, write output

  # initialize saver
  saver = tf.train.Saver()
  with tf.Session() as sess:
    # load variables into session
    saver.restore(sess, model_file)

    # initialize predictions stream
    preds_stream = PredStream(sess, model, 32)

    # predictions index
    pi = 0

    # SNP index
    si = 0

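    # predictions arrive in the order snp_gen emits them: each cluster's reference sequence first, then one alternate per SNP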
    for snp_cluster in snp_clusters:
      ref_preds = preds_stream[pi]
      pi += 1

      for snp in snp_cluster.snps:
        # print(snp, flush=True)

        alt_preds = preds_stream[pi]
        pi += 1

        # queue SNP
        if snp_flips[si]:
          snp_queue.put((alt_preds, ref_preds, si))
        else:
          snp_queue.put((ref_preds, alt_preds, si))

        # update SNP index
        si += 1

  # finish queue
  print('Waiting for threads to finish.', flush=True)
  snp_queue.join()

  # close genome
  genome_open.close()

  ###################################################
  # compute SAD distributions across variants

  # define percentiles
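  # use a finer percentile grid in the tails (below 0.1 and above 0.9) than in the middle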
  d_fine = 0.001
  d_coarse = 0.01
  percentiles_neg = np.arange(d_fine, 0.1, d_fine)
  percentiles_base = np.arange(0.1, 0.9, d_coarse)
  percentiles_pos = np.arange(0.9, 1, d_fine)

  percentiles = np.concatenate([percentiles_neg, percentiles_base, percentiles_pos])
  sad_out.create_dataset('percentiles', data=percentiles)
  pct_len = len(percentiles)

  for sad_stat in options.sad_stats:
    sad_stat_pct = '%s_pct' % sad_stat

    # compute
    sad_pct = np.percentile(sad_out[sad_stat], 100*percentiles, axis=0).T
    sad_pct = sad_pct.astype('float16')

    # save
    sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype='float16')

  sad_out.close()
Example #3
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='bigwig_indexes',
        default=None,
        help='Comma-separated list of target indexes to write BigWigs')
    parser.add_option('-e',
                      dest='embed_layer',
                      default=None,
                      type='int',
                      help='Embed sequences using the specified layer index.')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default=None,
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default=None,
                      help='Chromosome length information [Default: %default]')
    parser.add_option(
        '-l',
        dest='site_length',
        default=None,
        type='int',
        help='Prediction site length. [Default: params.seq_length]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='pred_out',
                      help='Output directory [Default: %default]')
    # parser.add_option('--plots', dest='plots',
    #     default=False, action='store_true',
    #     help='Make heatmap plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('-s',
                      dest='sum',
                      default=False,
                      action='store_true',
                      help='Sum site predictions [Default: %default]')
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    (options, args) = parser.parse_args()

    if len(args) == 3:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        bed_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, 'rb')
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = '%s/job%d' % (options.out_dir, worker_index)
    else:
        parser.error('Must provide parameter and model files and BED file')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    if options.bigwig_indexes is not None:
        options.bigwig_indexes = [
            int(bi) for bi in options.bigwig_indexes.split(',')
        ]
    else:
        options.bigwig_indexes = []

    if len(options.bigwig_indexes) > 0:
        bigwig_dir = '%s/bigwig' % options.out_dir
        if not os.path.isdir(bigwig_dir):
            os.mkdir(bigwig_dir)

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file,
                                 require=['num_targets', 'seq_length'])

    if job.get('batch_buffer', 0) > 0:
        print('Turn off batch_buffer.', file=sys.stderr)
        sys.exit(1)

    num_targets = np.sum(job['num_targets'])
    if options.targets_file is None:
        target_subset = None
    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_subset = targets_df.index
        if len(target_subset) == num_targets:
            target_subset = None
        else:
            num_targets = len(target_subset)

    if options.site_length is None:
        options.site_length = job['seq_length']

    #################################################################
    # sequence dataset

    # construct model sequences
    model_seqs_dna, model_seqs_coords = make_bed_data(bed_file,
                                                      options.genome_fasta,
                                                      job['seq_length'])

    # construct site coordinates
    site_seqs_coords = read_bed(bed_file, options.site_length)

    # filter for worker sequences
    if options.processes is not None:
        worker_bounds = np.linspace(0,
                                    len(model_seqs_dna),
                                    options.processes + 1,
                                    dtype='int')
        model_seqs_dna = model_seqs_dna[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        model_seqs_coords = model_seqs_coords[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        site_seqs_coords = site_seqs_coords[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_seqs = len(model_seqs_dna)

    # make data ops
    data_ops = seq_data_ops(model_seqs_dna, job['batch_size'])

    #################################################################
    # setup model

    # build model
    model = seqnn.SeqNN()
    model.build_sad(job,
                    data_ops,
                    ensemble_rc=options.rc,
                    ensemble_shifts=options.shifts,
                    embed_layer=options.embed_layer,
                    target_subset=target_subset)

    #################################################################
    # setup output

    # determine site boundaries in predictions space
    assert (job['seq_length'] % model.preds_length == 0)
    preds_window = job['seq_length'] // model.preds_length
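    # preds_window is the number of base pairs covered by each prediction bin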

    assert (model.preds_length % 2 == 0)
    preds_mid = model.preds_length // 2

    assert (options.site_length % preds_window == 0)
    site_preds_length = options.site_length // preds_window

    assert (site_preds_length % 2 == 0)
    site_preds_start = preds_mid - site_preds_length // 2
    site_preds_end = site_preds_start + site_preds_length

    # initialize HDF5
    out_h5_file = '%s/predict.h5' % options.out_dir
    if os.path.isfile(out_h5_file):
        os.remove(out_h5_file)
    out_h5 = h5py.File(out_h5_file, 'w')

    # create predictions
    if options.sum:
        out_h5.create_dataset('preds',
                              shape=(num_seqs, model.preds_depth),
                              dtype='float16')
    else:
        out_h5.create_dataset('preds',
                              shape=(num_seqs, site_preds_length,
                                     model.preds_depth),
                              dtype='float16')

    # store site coordinates
    site_seqs_chr, site_seqs_start, site_seqs_end = zip(*site_seqs_coords)
    site_seqs_chr = np.array(site_seqs_chr, dtype='S')
    site_seqs_start = np.array(site_seqs_start)
    site_seqs_end = np.array(site_seqs_end)
    out_h5.create_dataset('chrom', data=site_seqs_chr)
    out_h5.create_dataset('start', data=site_seqs_start)
    out_h5.create_dataset('end', data=site_seqs_end)

    #################################################################
    # predict scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 64)

        for si in range(num_seqs):
            print('Predicting %d' % si, flush=True)

            # predict
            preds_full = preds_stream[si]

            # slice site
            preds_site = preds_full[site_preds_start:site_preds_end, :]

            # write
            if options.sum:
                out_h5['preds'][si] = preds_site.sum(axis=0)
            else:
                out_h5['preds'][si] = preds_site

            # write bigwig
            for ti in options.bigwig_indexes:
                bw_file = '%s/s%d_t%d.bw' % (bigwig_dir, si, ti)
                bigwig_write(preds_full[:, ti], model_seqs_coords[si], bw_file,
                             options.genome_file, model.hp.batch_buffer)

    # close output HDF5
    out_h5.close()
Example #4
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='bigwig_indexes',
        default=None,
        help='Comma-separated list of target indexes to write BigWigs')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default=None,
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default=None,
                      help='Chromosome length information [Default: %default]')
    # parser.add_option('-l', dest='mid_len',
    #     default=256, type='int',
    #     help='Length of center sequence to sum predictions for [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='pred_out',
                      help='Output directory [Default: %default]')
    # parser.add_option('--plots', dest='plots',
    #     default=False, action='store_true',
    #     help='Make heatmap plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    (options, args) = parser.parse_args()

    if len(args) == 3:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        bed_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, 'rb')
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = '%s/job%d' % (options.out_dir, worker_index)
    else:
        parser.error('Must provide parameter and model files and BED file')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(',')]

    if options.bigwig_indexes is not None:
        options.bigwig_indexes = [
            int(bi) for bi in options.bigwig_indexes.split(',')
        ]
    else:
        options.bigwig_indexes = []

    if len(options.bigwig_indexes) > 0:
        bigwig_dir = '%s/bigwig' % options.out_dir
        if not os.path.isdir(bigwig_dir):
            os.mkdir(bigwig_dir)

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file,
                                 require=['num_targets', 'seq_length'])

    num_targets = np.sum(job['num_targets'])
    if options.targets_file is None:
        target_subset = None
    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_subset = targets_df.index
        if len(target_subset) == num_targets:
            target_subset = None
        else:
            num_targets = len(target_subset)

    #################################################################
    # sequence dataset

    # read sequences from BED
    seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta,
                                     job['seq_length'])

    # filter for worker sequences
    if options.processes is not None:
        worker_bounds = np.linspace(0,
                                    len(seqs_dna),
                                    options.processes + 1,
                                    dtype='int')
        seqs_dna = seqs_dna[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        seqs_coords = seqs_coords[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_seqs = len(seqs_dna)

    # make data ops
    data_ops = seq_data_ops(seqs_dna, job['batch_size'])

    #################################################################
    # setup model

    # build model
    model = seqnn.SeqNN()
    model.build_sad(job,
                    data_ops,
                    ensemble_rc=options.rc,
                    ensemble_shifts=options.shifts,
                    target_subset=target_subset)

    #################################################################
    # setup output

    out_h5_file = '%s/predict.h5' % options.out_dir
    if os.path.isfile(out_h5_file):
        os.remove(out_h5_file)
    out_h5 = h5py.File(out_h5_file, 'w')
    out_h5.create_dataset('preds',
                          shape=(num_seqs, num_targets),
                          dtype='float16')

    # store sequence coordinates
    seqs_chr, seqs_start, _ = zip(*seqs_coords)
    seqs_chr = np.array(seqs_chr, dtype='S')
    seqs_start = np.array(seqs_start)
    seqs_end = seqs_start + job['seq_length']
    out_h5.create_dataset('chrom', data=seqs_chr)
    out_h5.create_dataset('start', data=seqs_start)
    out_h5.create_dataset('end', data=seqs_end)

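    # summarize each sequence by its central prediction bin(s)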
    if model.preds_length % 2 == 0:
        # sum center two
        mid_start = model.preds_length // 2 - 1
        mid_end = mid_start + 2
    else:
        # take center one
        mid_start = model.preds_length // 2
        mid_end = mid_start + 1

    #################################################################
    # predict scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # coordinator
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord)

        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 64)

        for si in range(num_seqs):
            print('Predicting %d' % si, flush=True)

            # predict
            preds_full = preds_stream[si]

            # slice middle and summarize
            preds = preds_full[mid_start:mid_end, :].sum(axis=0)

            # write
            out_h5['preds'][si] = preds

            # write bigwig
            for ti in options.bigwig_indexes:
                bw_file = '%s/s%d_t%d.bw' % (bigwig_dir, si, ti)
                bigwig_write(preds_full[:, ti], seqs_coords[si], bw_file,
                             options.genome_file, model.hp.batch_buffer)

    # close output HDF5
    out_h5.close()
Example #5
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <vcf_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_pct",
        default=0.25,
        type="float",
        help="Require clustered SNPs lie in center region [Default: %default]",
    )
    parser.add_option(
        "-f",
        dest="genome_fasta",
        default="%s/data/hg19.fa" % os.environ["BASENJIDIR"],
        help="Genome FASTA for sequences [Default: %default]",
    )
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"],
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option(
        "--local",
        dest="local",
        default=1024,
        type="int",
        help="Local SAD score [Default: %default]",
    )
    parser.add_option("-n", dest="norm_file", default=None, help="Normalize SAD scores")
    parser.add_option(
        "-o",
        dest="out_dir",
        default="sad",
        help="Output directory for tables and plots [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number of processes, passed by multi script",
    )
    parser.add_option(
        "--pseudo",
        dest="log_pseudo",
        default=1,
        type="float",
        help="Log2 pseudocount [Default: %default]",
    )
    parser.add_option(
        "--rc",
        dest="rc",
        default=False,
        action="store_true",
        help="Average forward and reverse complement predictions [Default: %default]",
    )
    parser.add_option(
        "--shifts",
        dest="shifts",
        default="0",
        type="str",
        help="Ensemble prediction shifts [Default: %default]",
    )
    parser.add_option(
        "--stats",
        dest="sad_stats",
        default="SAD",
        help="Comma-separated list of stats to save. [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="targets_file",
        default=None,
        type="str",
        help="File specifying target indexes and labels in table format",
    )
    parser.add_option(
        "--ti",
        dest="track_indexes",
        default=None,
        type="str",
        help="Comma-separated list of target indexes to output BigWig tracks",
    )
    parser.add_option(
        "-u",
        dest="penultimate",
        default=False,
        action="store_true",
        help="Compute SED in the penultimate layer [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        vcf_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, "rb")
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = "%s/job%d" % (options.out_dir, worker_index)

    else:
        parser.error("Must provide parameters and model files and QTL VCF file")

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.track_indexes is None:
        options.track_indexes = []
    else:
        options.track_indexes = [int(ti) for ti in options.track_indexes.split(",")]
        if not os.path.isdir("%s/tracks" % options.out_dir):
            os.mkdir("%s/tracks" % options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(",")]
    options.sad_stats = options.sad_stats.split(",")

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file, require=["seq_length", "num_targets"])

    if options.targets_file is None:
        target_ids = ["t%d" % ti for ti in range(job["num_targets"])]
        target_labels = [""] * len(target_ids)
        target_subset = None

    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job["num_targets"]:
            target_subset = None

    #################################################################
    # load SNPs

    # read sorted SNPs from VCF
    snps = bvcf.vcf_snps(
        vcf_file,
        require_sorted=True,
        flip_ref=False,
        validate_ref_fasta=options.genome_fasta,
    )

    # filter for worker SNPs
    if options.processes is not None:
        worker_bounds = np.linspace(0, len(snps), options.processes + 1, dtype="int")
        snps = snps[worker_bounds[worker_index] : worker_bounds[worker_index + 1]]

    num_snps = len(snps)

    # cluster SNPs by position
    snp_clusters = cluster_snps(snps, job["seq_length"], options.center_pct)

    # delimit sequence boundaries
    [sc.delimit(job["seq_length"]) for sc in snp_clusters]

    # open genome FASTA
    genome_open = pysam.Fastafile(options.genome_fasta)

    # make SNP sequence generator
    def snp_gen():
        for sc in snp_clusters:
            snp_1hot_list = sc.get_1hots(genome_open)
            for snp_1hot in snp_1hot_list:
                yield {"sequence": snp_1hot}

    snp_types = {"sequence": tf.float32}
    snp_shapes = {
        "sequence": tf.TensorShape([tf.Dimension(job["seq_length"]), tf.Dimension(4)])
    }

    dataset = tf.data.Dataset.from_generator(
        snp_gen, output_types=snp_types, output_shapes=snp_shapes
    )
    dataset = dataset.batch(job["batch_size"])
    dataset = dataset.prefetch(2 * job["batch_size"])
    # dataset = dataset.apply(tf.contrib.data.prefetch_to_device('/device:GPU:0'))

    iterator = dataset.make_one_shot_iterator()
    data_ops = iterator.get_next()

    #################################################################
    # setup model

    # build model
    t0 = time.time()
    model = seqnn.SeqNN()
    model.build_sad(
        job,
        data_ops,
        ensemble_rc=options.rc,
        ensemble_shifts=options.shifts,
        embed_penultimate=options.penultimate,
        target_subset=target_subset,
    )
    print("Model building time %f" % (time.time() - t0), flush=True)

    if options.penultimate:
        # labels become inappropriate
        target_ids = [""] * model.hp.cnn_filters[-1]
        target_labels = target_ids

    # read target normalization factors
    target_norms = np.ones(len(target_labels))
    if options.norm_file is not None:
        ti = 0
        for line in open(options.norm_file):
            target_norms[ti] = float(line.strip())
            ti += 1

    num_targets = len(target_ids)

    #################################################################
    # setup output

    sad_out = initialize_output_h5(
        options.out_dir, options.sad_stats, snps, target_ids, target_labels
    )

    snp_threads = []

    snp_queue = Queue()
    for i in range(1):
        sw = SNPWorker(snp_queue, sad_out, options.sad_stats, options.log_pseudo)
        sw.start()
        snp_threads.append(sw)

    #################################################################
    # predict SNP scores, write output

    # initialize saver
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        # SNP index
        si = 0

        for snp_cluster in snp_clusters:
            ref_preds = preds_stream[pi]
            pi += 1

            for snp in snp_cluster.snps:
                # print(snp, flush=True)

                alt_preds = preds_stream[pi]
                pi += 1

                # queue SNP
                snp_queue.put((ref_preds, alt_preds, si))

                # update SNP index
                si += 1

    # finish queue
    print("Waiting for threads to finish.", flush=True)
    snp_queue.join()

    # close genome
    genome_open.close()

    ###################################################
    # compute SAD distributions across variants

    # define percentiles
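    # use a finer percentile grid in the tails (below 0.1 and above 0.9) than in the middle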
    d_fine = 0.001
    d_coarse = 0.01
    percentiles_neg = np.arange(d_fine, 0.1, d_fine)
    percentiles_base = np.arange(0.1, 0.9, d_coarse)
    percentiles_pos = np.arange(0.9, 1, d_fine)

    percentiles = np.concatenate([percentiles_neg, percentiles_base, percentiles_pos])
    sad_out.create_dataset("percentiles", data=percentiles)
    pct_len = len(percentiles)

    for sad_stat in options.sad_stats:
        sad_stat_pct = "%s_pct" % sad_stat

        # compute
        sad_pct = np.percentile(sad_out[sad_stat], 100 * percentiles, axis=0).T
        sad_pct = sad_pct.astype("float16")

        # save
        sad_out.create_dataset(sad_stat_pct, data=sad_pct, dtype="float16")

    sad_out.close()
Example #6
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <bed_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-f",
        dest="genome_fasta",
        default=None,
        help="Genome FASTA for sequences [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="mut_len",
        default=200,
        type="int",
        help="Length of center sequence to mutate [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="sat_mut",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "--plots",
        dest="plots",
        default=False,
        action="store_true",
        help="Make heatmap plots [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number of processes, passed by multi script",
    )
    parser.add_option(
        "--rc",
        dest="rc",
        default=False,
        action="store_true",
        help=
        "Ensemble forward and reverse complement predictions [Default: %default]",
    )
    parser.add_option(
        "--shifts",
        dest="shifts",
        default="0",
        help="Ensemble prediction shifts [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="targets_file",
        default=None,
        type="str",
        help="File specifying target indexes and labels in table format",
    )
    (options, args) = parser.parse_args()

    if len(args) == 3:
        # single worker
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    elif len(args) == 5:
        # multi worker
        options_pkl_file = args[0]
        params_file = args[1]
        model_file = args[2]
        bed_file = args[3]
        worker_index = int(args[4])

        # load options
        options_pkl = open(options_pkl_file, "rb")
        options = pickle.load(options_pkl)
        options_pkl.close()

        # update output directory
        options.out_dir = "%s/job%d" % (options.out_dir, worker_index)

    else:
        parser.error("Must provide parameter and model files and BED file")

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    options.shifts = [int(shift) for shift in options.shifts.split(",")]

    #################################################################
    # read parameters and collect target information

    job = params.read_job_params(params_file)

    if options.targets_file is None:
        target_ids = ["t%d" % ti for ti in range(job["num_targets"])]
        target_labels = [""] * len(target_ids)
        target_subset = None

    else:
        targets_df = pd.read_table(options.targets_file, index_col=0)
        target_ids = targets_df.identifier
        target_labels = targets_df.description
        target_subset = targets_df.index
        if len(target_subset) == job["num_targets"]:
            target_subset = None

    num_targets = len(target_ids)

    #################################################################
    # sequence dataset

    # read sequences from BED
    seqs_dna, seqs_coords = bed_seqs(bed_file, options.genome_fasta,
                                     job["seq_length"])

    # filter for worker sequences
    if options.processes is not None:
        worker_bounds = np.linspace(0,
                                    len(seqs_dna),
                                    options.processes + 1,
                                    dtype="int")
        seqs_dna = seqs_dna[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]
        seqs_coords = seqs_coords[
            worker_bounds[worker_index]:worker_bounds[worker_index + 1]]

    num_seqs = len(seqs_dna)

    # determine mutation region limits
    seq_mid = job["seq_length"] // 2
    mut_start = seq_mid - options.mut_len // 2
    mut_end = mut_start + options.mut_len

    # make data ops
    data_ops = satmut_data_ops(seqs_dna, mut_start, mut_end, job["batch_size"])

    #################################################################
    # setup model

    # build model
    model = seqnn.SeqNN()
    model.build_sad(
        job,
        data_ops,
        target_subset=target_subset,
        ensemble_rc=options.rc,
        ensemble_shifts=options.shifts,
    )

    #################################################################
    # setup output

    scores_h5_file = "%s/scores.h5" % options.out_dir
    if os.path.isfile(scores_h5_file):
        os.remove(scores_h5_file)
    scores_h5 = h5py.File(scores_h5_file, "w")
    scores_h5.create_dataset("scores",
                             dtype="float16",
                             shape=(num_seqs, options.mut_len, 4, num_targets))
    scores_h5.create_dataset("seqs",
                             dtype="bool",
                             shape=(num_seqs, options.mut_len, 4))

    # store mutagenesis sequence coordinates
    seqs_chr, seqs_start, _, seqs_strand = zip(*seqs_coords)
    seqs_chr = np.array(seqs_chr, dtype="S")
    seqs_start = np.array(seqs_start) + mut_start
    seqs_end = seqs_start + options.mut_len
    seqs_strand = np.array(seqs_strand, dtype="S")
    scores_h5.create_dataset("chrom", data=seqs_chr)
    scores_h5.create_dataset("start", data=seqs_start)
    scores_h5.create_dataset("end", data=seqs_end)
    scores_h5.create_dataset("strand", data=seqs_strand)

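    # each sequence needs one reference prediction plus three alternate-base predictions per mutated position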
    preds_per_seq = 1 + 3 * options.mut_len

    score_threads = []
    score_queue = Queue()
    for i in range(1):
        sw = ScoreWorker(score_queue, scores_h5)
        sw.start()
        score_threads.append(sw)

    #################################################################
    # predict scores, write output

    # initialize saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # coordinator
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord)

        # load variables into session
        saver.restore(sess, model_file)

        # initialize predictions stream
        preds_stream = PredStream(sess, model, 32)

        # predictions index
        pi = 0

        for si in range(num_seqs):
            print("Predicting %d" % si, flush=True)

            # collect sequence predictions
            seq_preds = []
            for spi in range(preds_per_seq):
                seq_preds.append(preds_stream[pi])
                pi += 1

            # wait for previous to finish
            score_queue.join()

            # queue sequence for scoring
            score_queue.put((seqs_dna[si], seq_preds, si))

            # queue sequence for plotting
            if options.plots:
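                # NOTE: plot_queue is not created in this excerpt; it is assumed to exist when --plots is used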
                plot_queue.put((seqs_dna[si], seq_preds, si))

    # finish queue
    print("Waiting for threads to finish.", flush=True)
    score_queue.join()

    # close output HDF5
    scores_h5.close()