コード例 #1
0
def main():
    """Write one-hot DNA sequences and coverage targets to a TFRecord file.

    Positional args: <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>.
    Per-target coverage is read from consecutively numbered HDF5 files
    (<seqs_cov_dir>/0.h5, 1.h5, ...), values at unmappable positions are
    optionally clipped to a per-sequence quantile, and (sequence, target
    [, umap]) examples are serialized with ZLIB compression.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te',
                      dest='target_extend',
                      default=None,
                      type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts',
        dest='target_start',
        default=0,
        type='int',
        # BUGFIX: help string was missing its closing bracket after %default.
        help='Write targets into vector starting at index [Default: %default]')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # default to processing through the final sequence
    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # collect consecutively numbered target files until the first gap
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        exit(1)

    # pooled sequence length, taken from the first coverage file
    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the discovered target count, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype='float16')

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open['targets'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_clip < 1:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_clip quantile of
            # this sequence's target distribution, per target
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_clip],
                                            axis=0)[0]

            # set unmappable positions to null
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    elif options.umap_npy is not None and options.umap_tfr:
        # mask is written into the TFRecords below rather than applied here
        unmap_mask = np.load(options.umap_npy)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)
            # seq_1hot = dna_1hot_index(seq_dna) # more efficient, but fighting inertia

            # hash to bytes
            features_dict = {
                'sequence': feature_bytes(seq_1hot),
                'target': feature_bytes(targets[si, :, :])
            }

            # add unmappability
            # NOTE(review): if --umap_tfr is set without -u, unmap_mask is
            # undefined here and this raises NameError — the options are
            # expected to be used together.
            if options.umap_tfr:
                features_dict['umap'] = feature_bytes(unmap_mask[msi, :])

            # write example
            example = tf.train.Example(features=tf.train.Features(
                feature=features_dict))
            writer.write(example.SerializeToString())

        fasta_open.close()
コード例 #2
0
ファイル: akita_data_read.py プロジェクト: yichao-cai/basenji
def main():
  """Read Hi-C contact maps for model sequences and write pooled targets.

  Positional args: <genome_hic_file> <seqs_bed_file> <seqs_hic_file>.
  For each BED sequence, fetches the balanced cooler matrix, masks blacklist
  regions, clips/smooths, optionally converts to (log) observed/expected,
  crops, and stores the upper-triangular values as float16 in an HDF5 file.
  """
  usage = 'usage: %prog [options] <genome_hic_file> <seqs_bed_file> <seqs_hic_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--clip', dest='clip',
      default=None, type='float',
      help='Clip values post-summary to a maximum [Default: %default]')
  parser.add_option('--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='diagonal_offset',
      default=2, type='int',
      help='Positions on the diagonal to ignore [Default: %default]')
  parser.add_option('-k', dest='kernel_stddev',
      default=0, type='int',
      help='Gaussian kernel stddev to smooth values [Default: %default]')
  # parser.add_option('-s', dest='scale',
  #     default=1., type='float',
  #     help='Scale values by [Default: %default]')
  parser.add_option('-w',dest='pool_width',
      default=1, type='int',
      help='Average pooling width [Default: %default]')
  parser.add_option('--as_obsexp',dest='as_obsexp',
      default=False,action="store_true",
      help='save targets as obsexp profiles')
  parser.add_option('--global_obsexp',dest='global_obsexp',
      default=False,action="store_true",
      help='use global obs/exp')
  parser.add_option('--no_log',dest='no_log',
      default=False,action="store_true",
      help='no not take log for obs/exp')

  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('')
  else:
    genome_hic_file = args[0]
    seqs_bed_file = args[1]
    seqs_hic_file = args[2]

  # read model sequences
  model_seqs = []
  for line in open(seqs_bed_file):
    a = line.split()
    model_seqs.append(ModelSeq(a[0],int(a[1]),int(a[2]),None))

  # read blacklist regions
  black_chr_trees = read_blacklist(options.blacklist_bed)

  # compute dimensions
  num_seqs = len(model_seqs)
  seq_len_nt = model_seqs[0].end - model_seqs[0].start
  seq_len_pool = seq_len_nt // options.pool_width

  if options.crop_bp == 0:
    seq_len_crop = seq_len_pool
  else:
    crop_start = options.crop_bp // options.pool_width
    crop_end = seq_len_pool - crop_start
    seq_len_crop = seq_len_pool - 2*crop_start

  # compute upper triangular indexes, skipping diagonal_offset diagonals
  triu_tup = np.triu_indices(seq_len_crop, options.diagonal_offset)
  seq_len_nodiag = seq_len_crop - options.diagonal_offset
  seq_len_hic = seq_len_nodiag*(seq_len_nodiag + 1) // 2

  # initialize sequences coverage file
  seqs_hic_open = h5py.File(seqs_hic_file, 'w')
  seqs_hic_open.create_dataset('targets', shape=(num_seqs, seq_len_hic), dtype='float16')

  if options.kernel_stddev > 0:
    # initialize Gaussian kernel
    kernel = Gaussian2DKernel(x_stddev=options.kernel_stddev)
  else:
    kernel = None

  # open genome coverage file
  genome_hic_cool = cooler.Cooler(genome_hic_file)

  if options.global_obsexp:
    try:
      print('loading by-chromosome expected')
      genome_hic_expected = pd.read_csv(genome_hic_file.replace('.cool','.expected'), sep='\t')
    # BUGFIX: narrowed bare except (which also swallowed KeyboardInterrupt).
    except Exception:
      print('not found: '+genome_hic_file.replace('cool','expected'))
      raise ValueError('invalid expected file')

  # check for "chr" prefix
  chr_pre = 'chr1' in genome_hic_cool.chromnames

  # assert that resolution matches
  assert(options.pool_width == genome_hic_cool.info['bin-size'])

  # for each model sequence
  for si in range(num_seqs):
    mseq = model_seqs[si]

    try:
      # pull hic values
      if chr_pre:
        mseq_str = '%s:%d-%d' % (mseq.chr, mseq.start, mseq.end)
      else:
        mseq_str = '%s:%d-%d' % (mseq.chr[3:], mseq.start, mseq.end)
      #print('mseq_str:', mseq_str)

      seq_hic_raw = genome_hic_cool.matrix(balance=True).fetch(mseq_str)
      seq_hic_nan = np.isnan(seq_hic_raw)
      num_filtered_bins = np.sum(np.sum(seq_hic_nan,axis=0) == len(seq_hic_nan))
      if num_filtered_bins > (.5*len(seq_hic_nan)):
        # BUGFIX: the literal % must be escaped as %% in a %-format string;
        # the unescaped form raised ValueError, which the except below
        # silently converted into an all-zeros sequence.
        print("WARNING: %s >50%% bins filtered, check:  %s. " % (genome_hic_file, mseq_str))

      # set blacklist to NaNs
      if mseq.chr in black_chr_trees:
        for black_interval in black_chr_trees[mseq.chr][mseq.start:mseq.end]:
          # adjust for sequence indexes
          black_seq_start = (black_interval.begin - mseq.start)// options.pool_width
          black_seq_end =   int(  np.ceil( (black_interval.end - mseq.start)/ options.pool_width ) )
          seq_hic_raw[:,black_seq_start:black_seq_end] = np.nan
          seq_hic_raw[black_seq_start:black_seq_end,:] = np.nan
        seq_hic_nan = np.isnan(seq_hic_raw)

      # clip first diagonals and high values
      clipval = np.nanmedian(np.diag(seq_hic_raw,options.diagonal_offset))
      for i in range(-options.diagonal_offset+1,options.diagonal_offset):
        set_diag(seq_hic_raw, clipval, i)
      seq_hic_raw = np.clip(seq_hic_raw, 0, clipval)
      seq_hic_raw[seq_hic_nan] = np.nan

      # adaptively coarsegrain based on raw counts
      seq_hic_smoothed = adaptive_coarsegrain(
                              seq_hic_raw,
                              genome_hic_cool.matrix(balance=False).fetch(mseq_str),
                              cutoff=2, max_levels=8)
      seq_hic_nan = np.isnan(seq_hic_smoothed)
      #todo: pass an option to add a certain pseudocount value, or the minimum nonzero value

      if options.as_obsexp:
        # compute obs/exp
        if options.global_obsexp: # compute global obs/exp
          exp_chr = genome_hic_expected.iloc[ genome_hic_expected['chrom'].values ==mseq.chr][0:seq_len_pool]
          if len(exp_chr) ==0:
              raise ValueError('no expected values found for chr:'+mseq.chr)
          exp_map= np.zeros((seq_len_pool,seq_len_pool))
          for i in range(seq_len_pool):
            set_diag(exp_map,exp_chr['balanced.avg'].values[i],i)
            set_diag(exp_map,exp_chr['balanced.avg'].values[i],-i)
          seq_hic_obsexp = seq_hic_smoothed / exp_map
          for i in range(-options.diagonal_offset+1,options.diagonal_offset): set_diag(seq_hic_obsexp,1.0,i)
          seq_hic_obsexp[seq_hic_nan] = np.nan

        else: # compute local obs/exp
          seq_hic_obsexp = observed_over_expected(seq_hic_smoothed, ~seq_hic_nan)[0]

        # log
        if options.no_log==False:
          seq_hic_obsexp = np.log(seq_hic_obsexp)
          if options.clip is not None:
            seq_hic_obsexp = np.clip(seq_hic_obsexp, -options.clip, options.clip)
          seq_hic_obsexp = interp_nan(seq_hic_obsexp)
          # neutral value on ignored diagonals is 0 in log space
          for i in range(-options.diagonal_offset+1, options.diagonal_offset): set_diag(seq_hic_obsexp, 0,i)
        else:
          if options.clip is not None:
            seq_hic_obsexp = np.clip(seq_hic_obsexp, 0, options.clip)
          seq_hic_obsexp = interp_nan(seq_hic_obsexp)
          # neutral value on ignored diagonals is 1 in linear space
          for i in range(-options.diagonal_offset+1, options.diagonal_offset): set_diag(seq_hic_obsexp, 1,i)

        # apply kernel
        if kernel is not None:
          seq_hic = convolve(seq_hic_obsexp, kernel)
        else:
          seq_hic = seq_hic_obsexp

      else:
        # interpolate all missing bins
        seq_hic_interpolated = interp_nan(seq_hic_smoothed)

        # rescale, reclip
        seq_hic = 100000*seq_hic_interpolated
        clipval = np.nanmedian(np.diag(seq_hic,options.diagonal_offset))
        for i in range(-options.diagonal_offset+1, options.diagonal_offset):
          set_diag(seq_hic,clipval,i)
        seq_hic = np.clip(seq_hic, 0, clipval)

        #extra smoothing. todo pass kernel specs
        if kernel is not None:
          seq_hic = convolve(seq_hic, kernel)

    except ValueError:
      print("WARNING: %s doesn't see %s. Setting to all zeros." % (genome_hic_file, mseq_str))
      seq_hic = np.zeros((seq_len_pool,seq_len_pool), dtype='float16')

    # crop
    if options.crop_bp > 0:
      seq_hic = seq_hic[crop_start:crop_end,:]
      seq_hic = seq_hic[:,crop_start:crop_end]

    # unroll upper triangular
    seq_hic = seq_hic[triu_tup]

    # write
    seqs_hic_open['targets'][si,:] = seq_hic.astype('float16')

  # close sequences coverage file
  seqs_hic_open.close()
コード例 #3
0
ファイル: basenji_data_read.py プロジェクト: tderrien/basenji
def main():
    """Read genome coverage for model sequences and write pooled targets.

    Positional args: <genome_cov_file> <seqs_bed_file> <seqs_cov_file>.
    For each BED sequence, reads nucleotide coverage, handles NaNs and
    blacklist regions via a 10th-percentile baseline, crops, pools into
    windows with a summary statistic, clips, scales, and stores float16
    rows in an HDF5 'targets' dataset.
    """
    usage = 'usage: %prog [options] <genome_cov_file> <seqs_bed_file> <seqs_cov_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip values post-summary to a maximum [Default: %default]')
    parser.add_option('--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-i',
                      dest='interp_nan',
                      default=False,
                      action='store_true',
                      help='Interpolate NaNs [Default: %default]')
    parser.add_option('-s',
                      dest='scale',
                      default=1.,
                      type='float',
                      help='Scale values by [Default: %default]')
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the execess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='sum_stat',
        default='sum',
        help='Summary statistic to compute in windows [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=1,
                      type='int',
                      help='Average pooling width [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('')
    else:
        genome_cov_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_file = args[2]

    assert (options.crop_bp >= 0)

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions (all sequences assumed the same length as the first)
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_nt -= 2 * options.crop_bp
    target_length = seq_len_nt // options.pool_width
    assert (target_length > 0)

    # initialize sequences coverage file
    seqs_cov_open = h5py.File(seqs_cov_file, 'w')
    seqs_cov_open.create_dataset('targets',
                                 shape=(num_seqs, target_length),
                                 dtype='float16')

    # open genome coverage file
    genome_cov_open = CovFace(genome_cov_file)

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        # read coverage
        seq_cov_nt = genome_cov_open.read(mseq.chr, mseq.start, mseq.end)

        # interpolate NaN
        if options.interp_nan:
            seq_cov_nt = interp_nan(seq_cov_nt)

        # determine baseline coverage: 10th percentile, NaN-safe
        baseline_cov = np.percentile(seq_cov_nt, 10)
        baseline_cov = np.nan_to_num(baseline_cov)

        # set blacklist to baseline
        if mseq.chr in black_chr_trees:
            for black_interval in black_chr_trees[
                    mseq.chr][mseq.start:mseq.end]:
                # adjust for sequence indexes
                black_seq_start = black_interval.begin - mseq.start
                black_seq_end = black_interval.end - mseq.start
                seq_cov_nt[black_seq_start:black_seq_end] = baseline_cov

        # set NaN's to baseline
        if not options.interp_nan:
            nan_mask = np.isnan(seq_cov_nt)
            seq_cov_nt[nan_mask] = baseline_cov

        # crop
        if options.crop_bp > 0:
            seq_cov_nt = seq_cov_nt[options.crop_bp:-options.crop_bp]

        # sum pool: reshape to (windows, pool_width) and summarize each row
        seq_cov = seq_cov_nt.reshape(target_length, options.pool_width)
        if options.sum_stat == 'sum':
            seq_cov = seq_cov.sum(axis=1, dtype='float32')
        elif options.sum_stat in ['mean', 'avg']:
            seq_cov = seq_cov.mean(axis=1, dtype='float32')
        elif options.sum_stat == 'median':
            # BUGFIX: ndarray has no .median method; the original
            # seq_cov.median(axis=1, dtype='float32') raised AttributeError.
            seq_cov = np.median(seq_cov, axis=1).astype('float32')
        elif options.sum_stat == 'max':
            seq_cov = seq_cov.max(axis=1, dtype='float32')
        else:
            print('ERROR: Unrecognized summary statistic "%s".' %
                  options.sum_stat,
                  file=sys.stderr)
            exit(1)

        # clip
        if options.clip is not None:
            if options.soft_clip:
                # soft clip: keep sqrt of the excess above the threshold
                clip_mask = (seq_cov > options.clip)
                seq_cov[clip_mask] = options.clip + np.sqrt(
                    seq_cov[clip_mask] - options.clip)
            else:
                seq_cov = np.clip(seq_cov, 0, options.clip)

        # scale
        seq_cov = options.scale * seq_cov

        # write
        seqs_cov_open['targets'][si, :] = seq_cov.astype('float16')

    # close genome coverage file
    genome_cov_open.close()

    # close sequences coverage file
    seqs_cov_open.close()
コード例 #4
0
def main():
    """Read genome coverage for model sequences and write pooled targets,
    with optional sliding-window pooling, soft clipping, and log normalization.

    Positional args: <genome_cov_file> <seqs_bed_file> <seqs_cov_file>.
    Rows are accumulated in memory and written once at the end, since the
    sliding-window branch can yield a different length than target_length.
    """
    usage = 'usage: %prog [options] <genome_cov_file> <seqs_bed_file> <seqs_cov_file>'
    parser = OptionParser(usage)
    # add option for thresholding
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip values post-summary to a maximum [Default: %default]')
    parser.add_option(
        '--clip_soft',
        dest='clip_soft',
        default=None,
        type='float',
        help=
        'Soft clip values, applying sqrt to the execess above the threshold [Default: %default]'
    )
    parser.add_option('--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-i',
                      dest='interp_nan',
                      default=False,
                      action='store_true',
                      help='Interpolate NaNs [Default: %default]')
    parser.add_option('-s',
                      dest='scale',
                      default=1.,
                      type='float',
                      help='Scale values by [Default: %default]')
    parser.add_option(
        '-u',
        dest='sum_stat',
        default='sum',
        help='Summary statistic to compute in windows [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=1,
                      type='int',
                      help='Average pooling width [Default: %default]')
    parser.add_option('--norm',
                      dest='norm',
                      default='',
                      type='str',
                      help='Normalize coverage values')
    # parser.add_option('--step_fr', dest='step_fr',
    #     default=1., type='float',
    #     help='Stride using fraction of bin size [Default: %default]')
    parser.add_option('--step_bp',
                      dest='step_bp',
                      default=0,
                      type='int',
                      help='Stride using bp step size [Default: %default]')
    parser.add_option('--padding',
                      dest='padding',
                      default='valid',
                      type='str',
                      help='Padding method for sliding window approach')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('')
    else:
        genome_cov_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_file = args[2]

    assert (options.crop_bp >= 0)

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions (all sequences assumed the same length as the first)
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_nt -= 2 * options.crop_bp
    target_length = seq_len_nt // options.pool_width
    assert (target_length > 0)

    # initialize sequences coverage file
    seqs_cov_open = h5py.File(seqs_cov_file, 'w')
    # seqs_cov_open.create_dataset('targets', shape=(num_seqs, target_length), dtype='float16')
    targets_list = []

    # open genome coverage file
    genome_cov_open = CovFace(genome_cov_file)

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        # read coverage
        seq_cov_nt = genome_cov_open.read(mseq.chr, mseq.start, mseq.end)

        # interpolate NaN
        if options.interp_nan:
            seq_cov_nt = interp_nan(seq_cov_nt)

        # determine baseline coverage; skip the percentile for very short
        # targets where it is not meaningful
        if target_length >= 8:
            baseline_cov = np.percentile(seq_cov_nt, 10)
            baseline_cov = np.nan_to_num(baseline_cov)
        else:
            baseline_cov = 0

        # set blacklist to baseline
        if mseq.chr in black_chr_trees:
            for black_interval in black_chr_trees[
                    mseq.chr][mseq.start:mseq.end]:
                # adjust for sequence indexes
                black_seq_start = black_interval.begin - mseq.start
                black_seq_end = black_interval.end - mseq.start
                seq_cov_nt[black_seq_start:black_seq_end] = baseline_cov

        # set NaN's to baseline
        if not options.interp_nan:
            nan_mask = np.isnan(seq_cov_nt)
            seq_cov_nt[nan_mask] = baseline_cov

        # crop
        if options.crop_bp > 0:
            seq_cov_nt = seq_cov_nt[options.crop_bp:-options.crop_bp]

        # sliding window
        if options.step_bp > 0:
            if options.padding == 'same':
                seq_cov_nt = np.pad(seq_cov_nt,
                                    (int(options.pool_width / 2 - 1),
                                     int(options.pool_width / 2)), 'edge')
            # NOTE(review): a trailing incomplete window is filled with None
            # by more_itertools.windowed, which would break the numeric
            # summaries below — assumes step/width divide evenly; confirm.
            seq_cov = np.array(
                list(
                    more_itertools.windowed(seq_cov_nt,
                                            n=options.pool_width,
                                            step=options.step_bp)))
        # sum pool
        else:
            seq_cov = seq_cov_nt.reshape(target_length, options.pool_width)
        if options.sum_stat == 'sum':
            seq_cov = seq_cov.sum(axis=1, dtype='float32')
        elif options.sum_stat in ['mean', 'avg']:
            seq_cov = seq_cov.mean(axis=1, dtype='float32')
        elif options.sum_stat == 'median':
            # BUGFIX: ndarray has no .median method; the original
            # seq_cov.median(axis=1) raised AttributeError.
            seq_cov = np.median(seq_cov, axis=1)
        elif options.sum_stat == 'max':
            seq_cov = seq_cov.max(axis=1)
        elif options.sum_stat == 'peak':
            seq_cov = seq_cov.mean(axis=1, dtype='float32')
            seq_cov = np.clip(np.sqrt(seq_cov * 4), 0, 1)
        else:
            print('ERROR: Unrecognized summary statistic "%s".' %
                  options.sum_stat,
                  file=sys.stderr)
            exit(1)

        # clip
        if options.clip_soft is not None:
            # soft clip: keep sqrt of the excess above the threshold
            clip_mask = (seq_cov > options.clip_soft)
            seq_cov[clip_mask] = options.clip_soft + np.sqrt(
                seq_cov[clip_mask] - options.clip_soft)
        if options.clip is not None:
            seq_cov = np.clip(seq_cov, 0, options.clip)
        # # threshold
        # if options.threshold > 0:
        #   print('Filtering using threshold {}'.format(options.threshold))
        #   print('~~~')
        #   print(seq_cov.shape)
        #   print('~~~')

        # scale
        seq_cov = options.scale * seq_cov
        if options.norm == 'log':
            seq_cov = np.log(seq_cov + 1)
            # print('LOG NORMALIZING THE DATA')
        # save
        targets_list.append(seq_cov.astype('float16'))
        # print('!!!!!')
        # print(np.array(targets_list, dtype='float16').shape)
        # exit()
        # write
        # seqs_cov_open['targets'][si,:] = seq_cov.astype('float16')

    # write all
    seqs_cov_open.create_dataset('targets',
                                 dtype='float16',
                                 data=np.array(targets_list, dtype='float16'))

    # close genome coverage file
    genome_cov_open.close()

    # close sequences coverage file
    seqs_cov_open.close()
コード例 #5
0
def main():
    """Write one-hot DNA sequences and coverage targets to a TFRecord file,
    tagging each example with a genome index for multi-genome training.

    Positional args: <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>.
    Coverage files are named <ti>.h5 or, with -g, <genome_index>-<ti>.h5,
    and their data lives in the 'seqs_cov' dataset (unlike the single-genome
    variant, which uses 'targets').
    """
    usage = (
        "usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>"
    )
    parser = OptionParser(usage)
    parser.add_option("-g",
                      dest="genome_index",
                      default=None,
                      type="int",
                      help="Genome index")
    parser.add_option(
        "-s",
        dest="start_i",
        default=0,
        type="int",
        help="Sequence start index [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="end_i",
        default=None,
        type="int",
        help="Sequence end index [Default: %default]",
    )
    parser.add_option(
        "--te",
        dest="target_extend",
        default=None,
        type="int",
        help="Extend targets vector [Default: %default]",
    )
    parser.add_option(
        "--ts",
        dest="target_start",
        default=0,
        type="int",
        # BUGFIX: help string was missing its closing bracket after %default.
        help="Write targets into vector starting at index [Default: %default]",
    )
    parser.add_option("-u",
                      dest="umap_npy",
                      help="Unmappable array numpy file")
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help=
        "Sequence distribution value to set unmappable positions to, eg 0.25.",
    )
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error("Must provide input arguments.")
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # default to processing through the final sequence
    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # collect consecutively numbered target files until the first gap;
    # with -g, files carry a "<genome_index>-" prefix
    seqs_cov_files = []
    ti = 0
    if options.genome_index is None:
        seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti)
    else:
        seqs_cov_file = "%s/%d-%d.h5" % (seqs_cov_dir, options.genome_index,
                                         ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        if options.genome_index is None:
            seqs_cov_file = "%s/%d.h5" % (seqs_cov_dir, ti)
        else:
            seqs_cov_file = "%s/%d-%d.h5" % (seqs_cov_dir,
                                             options.genome_index, ti)

    if len(seqs_cov_files) == 0:
        print(
            "Sequence coverage files not found, e.g. %s" % seqs_cov_file,
            file=sys.stderr,
        )
        exit(1)

    # pooled sequence length, taken from the first coverage file
    seq_pool_len = h5py.File(seqs_cov_files[0], "r")["seqs_cov"].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the discovered target count, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert options.target_extend >= num_targets_tfr
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype="float16")

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], "r")
        tii = options.target_start + ti
        targets[:, :, tii] = seqs_cov_open["seqs_cov"][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_set quantile of
            # this sequence's target distribution, per target
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # set unmappable positions to null
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    # NOTE(review): tf.python_io is the TF1 API; in TF2 this is tf.io.
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # a missing genome index is recorded as genome 0
            genome_i = 0 if options.genome_index is None else options.genome_index

            # BUGFIX: ndarray.tostring() was deprecated and removed in
            # NumPy 2.0; tobytes() returns identical bytes.
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "genome":
                    _int_feature(genome_i),
                    "sequence":
                    _bytes_feature(seq_1hot.flatten().tobytes()),
                    "target":
                    _bytes_feature(targets[si, :, :].flatten().tobytes()),
                }))

            writer.write(example.SerializeToString())

        fasta_open.close()
コード例 #6
0
def main():
    """Extract Hi-C contact maps for model sequences into an HDF5 file.

    For each BED sequence, fetches the balanced Hi-C matrix from a
    cooler file at its native resolution, masks blacklist regions,
    clips the near-diagonal and extreme values, adaptively smooths
    low-count regions, and writes either log-observed/expected maps or
    rescaled interpolated maps to <seqs_hic_file> as float16.

    Args (positional): <genome_hic_file> <seqs_bed_file> <seqs_hic_file>
    """
    usage = 'usage: %prog [options] <genome_hic_file> <seqs_bed_file> <seqs_hic_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip values post-summary to a maximum [Default: %default]')
    parser.add_option('-s',
                      dest='scale',
                      default=1.,
                      type='float',
                      help='Scale values by [Default: %default]')
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the execess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='sum_stat',
        default='sum',
        help='Summary statistic to compute in windows [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=1,
                      type='int',
                      help='Average pooling width [Default: %default]')
    parser.add_option('--as_obsexp',
                      dest='as_obsexp',
                      default=False,
                      action="store_true",
                      help='save targets as obsexp profiles')

    (options, args) = parser.parse_args()
    # NOTE(review): clip, scale, soft_clip, and sum_stat are parsed for
    # interface compatibility with the coverage reader but are unused here.

    if len(args) != 3:
        parser.error('')
    else:
        genome_hic_file = args[0]
        seqs_bed_file = args[1]
        seqs_hic_file = args[2]

    print('saving TFRs as obsexp:', options.as_obsexp)

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions (all sequences share the first sequence's length)
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_pool = seq_len_nt // options.pool_width

    # initialize sequences coverage file
    seqs_hic_open = h5py.File(seqs_hic_file, 'w')
    seqs_hic_open.create_dataset('seqs_hic',
                                 shape=(num_seqs, seq_len_pool, seq_len_pool),
                                 dtype='float16')

    # open genome coverage file
    genome_hic_cool = cooler.Cooler(genome_hic_file)

    # check for "chr" prefix
    chr_pre = 'chr1' in genome_hic_cool.chromnames

    # assert that resolution matches the pooling width: bin indices below
    # are used directly as pooled-sequence indices
    assert (options.pool_width == genome_hic_cool.info['bin-size'])

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        try:
            # pull hic values
            if chr_pre:
                mseq_str = '%s:%d-%d' % (mseq.chr, mseq.start, mseq.end)
            else:
                mseq_str = '%s:%d-%d' % (mseq.chr[3:], mseq.start, mseq.end)
            #print('mseq_str:', mseq_str)

            seq_hic_raw = genome_hic_cool.matrix(balance=True).fetch(mseq_str)
            seq_hic_nan = np.isnan(seq_hic_raw)
            # warn when the central 2x4 window is mostly NaN, which
            # suggests the unmappable-midpoint filtering was not applied
            if np.sum(seq_hic_nan[len(seq_hic_nan) // 2 -
                                  1:len(seq_hic_nan) // 2 + 1,
                                  len(seq_hic_nan) // 2 -
                                  2:len(seq_hic_nan) // 2 + 2]) > 4:
                print(
                    "WARNING: %s lots of zeros, check that umap_midpoint is correct %s. "
                    % (genome_hic_file, mseq_str))

            # set blacklist to NaNs (rows and columns of overlapping bins)
            if mseq.chr in black_chr_trees:
                for black_interval in black_chr_trees[
                        mseq.chr][mseq.start:mseq.end]:
                    # adjust for sequence indexes
                    black_seq_start = (black_interval.begin -
                                       mseq.start) // options.pool_width
                    black_seq_end = int(
                        np.ceil((black_interval.end - mseq.start) /
                                options.pool_width))
                    seq_hic_raw[:, black_seq_start:black_seq_end] = np.nan
                    seq_hic_raw[black_seq_start:black_seq_end, :] = np.nan
                seq_hic_nan = np.isnan(seq_hic_raw)

            # clip first diagonals and high values
            clipval = np.nanmedian(np.diag(seq_hic_raw, 2))
            for i in [-1, 0, 1]:
                set_diag(seq_hic_raw, clipval, i)
            # clip to [0, clipval]; the original np.clip(x, 0, x) used the
            # array itself as the upper bound, which is a no-op
            seq_hic_raw = np.clip(seq_hic_raw, 0, clipval)
            seq_hic_raw[seq_hic_nan] = np.nan

            # adaptively coarsegrain based on raw counts
            seq_hic_smoothed = adaptive_coarsegrain(
                seq_hic_raw,
                genome_hic_cool.matrix(balance=False).fetch(mseq_str),
                cutoff=2,
                max_levels=8)
            #todo: pass an option to add a certain pseudocount value, or the minimum nonzero value

            if options.as_obsexp == True:
                # interpolate single missing bins
                seq_hic_interpolated = interpolate_bad_singletons(
                    seq_hic_smoothed,
                    mask=(~seq_hic_nan),
                    fillDiagonal=True,
                    returnMask=False,
                    secondPass=True,
                    verbose=False)
                seq_hic_nan = np.isnan(seq_hic_interpolated)

                # compute observed/expected
                seq_hic_obsexp = observed_over_expected(
                    seq_hic_interpolated, ~seq_hic_nan)[0]
                # todo: allow passing a global expected rather than computing locally

                # log
                seq_hic_obsexp = np.log(seq_hic_obsexp)

                # set nan to 0
                seq_hic_obsexp = np.nan_to_num(seq_hic_obsexp)

                # todo: make obsexp_clip an option for obs/exp
                seq_hic = np.clip(seq_hic_obsexp, -2, 2)

            else:
                # interpolate all missing bins
                seq_hic_interpolated = interp_nan(seq_hic_smoothed)

                # rescale
                seq_hic = 100000 * seq_hic_interpolated

                # todo add extra smoothing

        except ValueError:
            # cooler raises ValueError for regions absent from the file
            print("WARNING: %s doesn't see %s. Setting to all zeros." %
                  (genome_hic_file, mseq_str))
            seq_hic = np.zeros((seq_len_pool, seq_len_pool), dtype='float16')

        # write
        seqs_hic_open['seqs_hic'][si, :, :] = seq_hic.astype('float16')

    # close sequences coverage file
    seqs_hic_open.close()
Code example #7
0
def main():
    """Write one TFRecord shard of (sequence, target) training examples.

    Reads model sequences [start_i, end_i) from a BED file, gathers
    per-target coverage from the numbered HDF5 files (0.h5, 1.h5, ...)
    in <seqs_cov_dir>, optionally nulls unmappable positions, one-hot
    encodes the DNA from <fasta_file>, and serializes ZLIB-compressed
    TFRecords to <tfr_file>.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        # pass an explicit None label: ModelSeq is constructed with
        # (chr, start, end, label) everywhere else in this file, so the
        # 3-argument call raised a TypeError
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # collect consecutively numbered HDF5 files until one is missing
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    seq_pool_len = h5py.File(seqs_cov_files[0], 'r')['seqs_cov'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets), dtype='float16')

    # read each target
    for ti in range(num_targets):
        seqs_cov_open = h5py.File(seqs_cov_files[ti], 'r')
        targets[:, :, ti] = seqs_cov_open['seqs_cov'][
            options.start_i:options.end_i, :]
        seqs_cov_open.close()

    ################################################################
    # modify unmappable

    if options.umap_npy is not None and options.umap_set is not None:
        unmap_mask = np.load(options.umap_npy)

        for si in range(num_seqs):
            msi = options.start_i + si

            # determine unmappable null value: the umap_set quantile of
            # each target's distribution within this sequence
            seq_target_null = np.percentile(targets[si],
                                            q=[100 * options.umap_set],
                                            axis=0)[0]

            # set unmappable positions to null (cap, never raise, values)
            targets[si, unmap_mask[msi, :], :] = np.minimum(
                targets[si, unmap_mask[msi, :], :], seq_target_null)

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.python_io.TFRecordOptions(
        tf.python_io.TFRecordCompressionType.ZLIB)

    with tf.python_io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # tobytes() replaces the tostring() alias removed in NumPy 2.0;
            # the produced bytes are identical
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'sequence':
                    _bytes_feature(seq_1hot.flatten().tobytes()),
                    'target':
                    _bytes_feature(targets[si, :, :].flatten().tobytes())
                }))

            writer.write(example.SerializeToString())

        fasta_open.close()
Code example #8
0
def main():
    """Read genome coverage for each model sequence into an HDF5 file.

    For each BED sequence, reads nucleotide-resolution coverage from
    <genome_cov_file>, resets blacklist and NaN positions to a baseline
    value (the sequence's 10th percentile), pools to pool_width
    resolution with the chosen summary statistic, optionally clips and
    scales, and writes float16 vectors to <seqs_cov_file>.
    """
    usage = "usage: %prog [options] <genome_cov_file> <seqs_bed_file> <seqs_cov_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-b",
        dest="blacklist_bed",
        help="Set blacklist nucleotides to a baseline value.",
    )
    parser.add_option(
        "-c",
        dest="clip",
        default=None,
        type="float",
        help="Clip values post-summary to a maximum [Default: %default]",
    )
    parser.add_option(
        "-s",
        dest="scale",
        default=1.0,
        type="float",
        help="Scale values by [Default: %default]",
    )
    parser.add_option(
        "--soft",
        dest="soft_clip",
        default=False,
        action="store_true",
        help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]",
    )
    parser.add_option(
        "-u",
        dest="sum_stat",
        default="sum",
        help="Summary statistic to compute in windows [Default: %default]",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        default=1,
        type="int",
        help="Average pooling width [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("")
    else:
        genome_cov_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_file = args[2]

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions (all sequences share the first sequence's length)
    # NOTE(review): the reshape below assumes seq_len_nt is an exact
    # multiple of pool_width and raises otherwise
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_pool = seq_len_nt // options.pool_width

    # initialize sequences coverage file
    seqs_cov_open = h5py.File(seqs_cov_file, "w")
    seqs_cov_open.create_dataset(
        "seqs_cov", shape=(num_seqs, seq_len_pool), dtype="float16"
    )

    # open genome coverage file
    genome_cov_open = CovFace(genome_cov_file)

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        # read coverage
        seq_cov_nt = genome_cov_open.read(mseq.chr, mseq.start, mseq.end)

        # determine baseline coverage; np.percentile yields NaN when any
        # NaNs are present, which nan_to_num maps to 0
        baseline_cov = np.percentile(seq_cov_nt, 10)
        baseline_cov = np.nan_to_num(baseline_cov)

        # set blacklist to baseline
        if mseq.chr in black_chr_trees:
            for black_interval in black_chr_trees[mseq.chr][mseq.start : mseq.end]:
                # adjust for sequence indexes
                black_seq_start = black_interval.begin - mseq.start
                black_seq_end = black_interval.end - mseq.start
                seq_cov_nt[black_seq_start:black_seq_end] = baseline_cov

        # set NaN's to baseline
        nan_mask = np.isnan(seq_cov_nt)
        seq_cov_nt[nan_mask] = baseline_cov

        # sum pool
        seq_cov = seq_cov_nt.reshape(seq_len_pool, options.pool_width)
        if options.sum_stat == "sum":
            seq_cov = seq_cov.sum(axis=1, dtype="float32")
        elif options.sum_stat in ["mean", "avg"]:
            seq_cov = seq_cov.mean(axis=1, dtype="float32")
        elif options.sum_stat == "median":
            # ndarray has no .median method; the original call raised
            # AttributeError. np.median returns float64, so cast explicitly.
            seq_cov = np.median(seq_cov, axis=1).astype("float32")
        elif options.sum_stat == "max":
            seq_cov = seq_cov.max(axis=1)
        else:
            print(
                'ERROR: Unrecognized summary statistic "%s".' % options.sum_stat,
                file=sys.stderr,
            )
            exit(1)

        # clip
        if options.clip is not None:
            if options.soft_clip:
                # soft clip: keep the threshold plus sqrt of the excess
                clip_mask = seq_cov > options.clip
                seq_cov[clip_mask] = options.clip + np.sqrt(
                    seq_cov[clip_mask] - options.clip
                )
            else:
                seq_cov = np.clip(seq_cov, 0, options.clip)

        # scale
        seq_cov = options.scale * seq_cov

        # write
        seqs_cov_open["seqs_cov"][si, :] = seq_cov.astype("float16")

    # close genome coverage file
    genome_cov_open.close()

    # close sequences coverage file
    seqs_cov_open.close()
Code example #9
0
def main():
  """Orchestrate Akita training-data generation end to end.

  Pipeline: load genome contigs (minus assembly gaps), divide them into
  train/valid/test sets or cross-validation folds, stride fixed-length
  model sequences across each set, annotate/filter unmappable regions,
  farm out per-target coverage reading and TFRecord writing (locally or
  via SLURM), and record dataset statistics as JSON.

  Args (positional): <fasta_file> <targets_file>
  """
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='blacklist_bed',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t',
      default=8388608, type='int',
      help='Break in half contigs above length [Default: %default]')
  parser.add_option('-c', '--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='diagonal_offset',
      default=2, type='int',
      help='Positions on the diagonal to ignore [Default: %default]')
  parser.add_option('-f', dest='folds',
      default=None, type='int',
      help='Generate cross fold split [Default: %default]')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-k', dest='kernel_stddev',
      default=0, type='int',
      help='Gaussian kernel stddev to smooth values [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--limit', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=128, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart',
      default=False, action='store_true',
      help='Continue progress from midpoint. [Default: %default]')
  # fixed a duplicated/garbled help string in the original
  parser.add_option('--sample', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('--st', '--split_test', dest='split_test',
      default=False, action='store_true',
      help='Exit after split. [Default: %default]')
  parser.add_option('-t', dest='test_pct_or_chr',
      default=0.05, type='str',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_bed',
      help='Unmappable regions in BED format')
  parser.add_option('--umap_midpoints', dest='umap_midpoints',
      help='Regions with midpoints to exclude in BED format. Used for 4C/HiC.')
  parser.add_option('--umap_t', dest='umap_t',
      default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('--umap_set', dest='umap_set',
      default=None, type='float',
      help='Set unmappable regions to this percentile in the sequences\' distribution of values')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct_or_chr',
      default=0.05, type='str',
      help='Proportion of the data for validation [Default: %default]')
  parser.add_option('--snap', dest='snap',
      default=None, type='int',
      help='snap stride to multiple for binned targets in bp, if not None seq_length must be a multiple of snap')
  parser.add_option('--as_obsexp', dest='as_obsexp',
      action="store_true", default=False,
      help='save targets as obsexp profiles')
  parser.add_option('--global_obsexp', dest='global_obsexp',
      action="store_true", default=False,
      help='use pre-calculated by-chromosome obs/exp')
  parser.add_option('--no_log', dest='no_log',
      action="store_true", default=False,
      help='do not take log for obs/exp')

  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  # seed both RNGs so sequence shuffling/sampling is reproducible
  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides (<= 1) to base pairs
  if options.stride_train <= 1:
    print('stride_train %.f'%options.stride_train, end='')
    options.stride_train = options.stride_train*options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    print('stride_test %.f'%options.stride_test, end='')
    options.stride_test = options.stride_test*options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # validate snap compatibility up front
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0:
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0:
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  # refuse to clobber an existing run unless restarting
  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  # dump options for provenance
  with open('%s/options.json' % options.out_dir, 'w') as options_json_out:
    json.dump(options.__dict__, options_json_out, sort_keys=True, indent=4)

  ################################################################
  # define genomic contigs
  ################################################################
  if not options.restart:
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
      chrom_contigs = genome.split_contigs(chrom_contigs,
                                           options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
      contigs += [Contig(chrom, ctg_start, ctg_end)
                   for ctg_start, ctg_end in chrom_contigs[chrom]]

    # limit to a BED file
    if options.limit_bed is not None:
      contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
      contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

  ################################################################
  # divide between train/valid/test
  ################################################################
  # label folds
  if options.folds is not None:
    fold_labels = ['fold%d' % fi for fi in range(options.folds)]
    num_folds = options.folds
  else:
    fold_labels = ['train', 'valid', 'test']
    num_folds = 3

  if not options.restart:
    if options.folds is not None:
      # divide by fold pct
      fold_contigs = divide_contigs_folds(contigs, options.folds)

    else:
      try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        # NOTE: these asserts are control flow — AssertionError is caught
        # below to fall through to the by-chromosome split. Beware -O.
        assert(0 <= valid_pct <= 1)
        assert(0 <= test_pct <= 1)

        # divide by pct
        fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

      except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        fold_contigs = divide_contigs_chr(contigs, test_chrs, valid_chrs)

    # rejoin broken contigs within set
    for fi in range(len(fold_contigs)):
      fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

    # write labeled contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    ctg_bed_out = open(ctg_bed_file, 'w')
    for fi in range(len(fold_contigs)):
      for ctg in fold_contigs[fi]:
        line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end, fold_labels[fi])
        print(line, file=ctg_bed_out)
    ctg_bed_out.close()

  if options.split_test:
    exit()

  ################################################################
  # define model sequences
  ################################################################
  if not options.restart:
    fold_mseqs = []
    for fi in range(num_folds):
      if fold_labels[fi] in ['valid','test']:
        stride_fold = options.stride_test
      else:
        stride_fold = options.stride_train

      # stride sequences across contig
      fold_mseqs_fi = contig_sequences(fold_contigs[fi], options.seq_length,
                                       stride_fold, options.snap, fold_labels[fi])
      fold_mseqs.append(fold_mseqs_fi)

      # shuffle
      random.shuffle(fold_mseqs[fi])

      # down-sample
      if options.sample_pct < 1.0:
        fold_mseqs[fi] = random.sample(fold_mseqs[fi], int(options.sample_pct*len(fold_mseqs[fi])))

    # merge into one list
    mseqs = [ms for fm in fold_mseqs for ms in fm]


  ################################################################
  # mappability
  ################################################################
  if not options.restart:
    if (options.umap_bed is not None) or (options.umap_midpoints is not None):
      if shutil.which('bedtools') is None:
        print('Install Bedtools to annotate unmappable sites', file=sys.stderr)
        exit(1)

    if options.umap_bed is not None:
      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                   options.seq_length, options.pool_width)

      # filter sequences whose unmappable-bin fraction exceeds umap_t
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

      # write to file
      unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
      np.save(unmap_npy, mseqs_unmap)

    if options.umap_midpoints is not None:
      # annotate unmappable midpoints for 4C/HiC
      mseqs_unmap = annotate_unmap(mseqs, options.umap_midpoints,
                                   options.seq_length, options.pool_width)

      # filter sequences whose two central bins overlap unmappable regions
      seqmid = mseqs_unmap.shape[1] // 2  # int(options.seq_length / options.pool_width / 2)
      mseqs_map_mask = (np.sum(mseqs_unmap[:,seqmid-1:seqmid+1],axis=1) == 0)

      mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

      # write to file
      unmap_npy = '%s/mseqs_unmap_midpoints.npy' % options.out_dir
      np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    print('writing sequences to BED')
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)
  else:
    # restart: recover sequences and fold assignments from the BED file
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    mseqs = []
    fold_mseqs = []
    for fi in range(num_folds):
      fold_mseqs.append([])
    for line in open(seqs_bed_file):
      a = line.split()
      msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
      mseqs.append(msg)
      if a[3] == 'train':
        fi = 0
      elif a[3] == 'valid':
        fi = 1
      elif a[3] == 'test':
        fi = 2
      else:
        fi = int(a[3].replace('fold',''))
      fold_mseqs[fi].append(msg)


  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets
  targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []

  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    clip_ti = None
    if 'clip' in targets_df.columns:
      clip_ti = targets_df['clip'].iloc[ti]

    # scale_ti = 1
    # if 'scale' in targets_df.columns:
    #   scale_ti = targets_df['scale'].iloc[ti]

    if options.restart and os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      # build one coverage-read job per target
      cmd = 'python ~/nn_anopheles/basenji/bin/akita_data_read.py'
      cmd += ' --crop %d' % options.crop_bp
      cmd += ' -d %s' % options.diagonal_offset
      cmd += ' -k %d' % options.kernel_stddev
      cmd += ' -w %d' % options.pool_width
      if clip_ti is not None:
        cmd += ' --clip %f' % clip_ti
      # cmd += ' -s %f' % scale_ti
      if options.blacklist_bed:
        cmd += ' -b %s' % options.blacklist_bed
      if options.as_obsexp:
        cmd += ' --as_obsexp'
        if options.global_obsexp:
          cmd += ' --global_obsexp'
        if options.no_log:
          cmd += ' --no_log'
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
            name='read_t%d' % ti,
            out_file='%s.out' % seqs_cov_stem,
            err_file='%s.err' % seqs_cov_stem,
            queue='standard', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)


  ################################################################
  # write TF Records
  ################################################################
  # copy targets file
  shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

  # initialize TF Records dir
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []

  for fold_set in fold_labels:
    # sequences were written fold-by-fold, so each fold occupies a
    # contiguous index range
    fold_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == fold_set]
    fold_set_start = fold_set_indexes[0]
    fold_set_end = fold_set_indexes[-1] + 1

    tfr_i = 0
    tfr_start = fold_set_start
    tfr_end = min(tfr_start+options.seqs_per_tfr, fold_set_end)

    # NOTE(review): '<=' schedules a final empty shard when tfr_start
    # lands exactly on fold_set_end — confirm downstream tolerates empty
    # TFRecords before tightening to '<'
    while tfr_start <= fold_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

      cmd = 'python ~/nn_anopheles/basenji/bin/basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end

      # do not use
      # if options.umap_bed is not None:
      #   cmd += ' -u %s' % unmap_npy
      # if options.umap_set is not None:
      #   cmd += ' --umap_set %f' % options.umap_set

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        # breaks on some OS
        # cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
              name='write_%s-%d' % (fold_set, tfr_i),
              out_file='%s.out' % tfr_stem,
              err_file='%s.err' % tfr_stem,
              queue='standard', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start+options.seqs_per_tfr, fold_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  stats_dict['num_targets'] = targets_df.shape[0]
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp
  stats_dict['diagonal_offset'] = options.diagonal_offset

  # upper-triangle length of the cropped, pooled, diagonal-trimmed map
  target1_length = options.seq_length - 2*options.crop_bp
  target1_length = target1_length // options.pool_width
  target1_length = target1_length - options.diagonal_offset
  target_length = target1_length*(target1_length+1) // 2
  stats_dict['target_length'] = target_length

  for fi in range(num_folds):
    stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
Code example #10
0
File: akita_data_write.py  Project: polyaB/basenji
def main():
    """Write one-hot DNA sequences and Hi-C target coverage to a TFRecord file.

    Reads model sequence coordinates from <seqs_bed_file>, per-target pooled
    coverage from HDF5 files in <seqs_cov_dir> (named "<ti>.h5", or
    "<genome>-<ti>.h5" in multi-genome mode), one-hot encodes each sequence
    from <fasta_file>, and serializes ZLIB-compressed tf.train.Examples to
    <tfr_file>.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file>'
    parser = OptionParser(usage)
    parser.add_option('-g',
                      dest='genome_index',
                      default=None,
                      type='int',
                      help='Genome index')
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te',
                      dest='target_extend',
                      default=None,
                      type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts',
        dest='target_start',
        default=0,
        type='int',
        help='Write targets into vector starting at index [Default: %default')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Sequence distribution value to set unmappable positions to, eg 0.25.')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]

    ################################################################
    # read model sequences

    model_seqs = []
    with open(seqs_bed_file) as seqs_bed_open:
        for line in seqs_bed_open:
            a = line.split()
            model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    def _seqs_cov_path(ti):
        # multi-genome runs prefix the genome index onto the target index
        if options.genome_index is None:
            return '%s/%d.h5' % (seqs_cov_dir, ti)
        else:
            return '%s/%d-%d.h5' % (seqs_cov_dir, options.genome_index, ti)

    # collect consecutive target files until the first missing index
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = _seqs_cov_path(ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = _seqs_cov_path(ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        sys.exit(1)

    # pooled target length (upper-triangle Hi-C vector length)
    with h5py.File(seqs_cov_files[0], 'r') as seqs_cov_open:
        seq_pool_len_hic = seqs_cov_open['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the locally available targets, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len_hic, num_targets_tfr),
                       dtype='float16')

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        tii = options.target_start + ti
        with h5py.File(seqs_cov_files[ti], 'r') as seqs_cov_open:
            targets[:, :, tii] = seqs_cov_open['targets'][
                options.start_i:options.end_i, :]

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    # NOTE: the keyword is compression_type; compression='ZLIB' raises TypeError.
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in range(num_seqs):
            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # single-genome runs are labeled genome 0
            genome_i = 0 if options.genome_index is None else options.genome_index

            # np.ndarray.tostring() was removed in numpy>=1.23;
            # tobytes() produces identical bytes.
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'genome':
                    _int_feature(genome_i),
                    'sequence':
                    _bytes_feature(seq_1hot.flatten().tobytes()),
                    'target':
                    _bytes_feature(targets[si, :, :].flatten().tobytes())
                }))

            writer.write(example.SerializeToString())

    fasta_open.close()
コード例 #11
0
def main():
    """Write activity-filtered TFRecords for one dataset fold.

    Reads model sequences from <seqs_bed_file> and pooled target coverage
    from "<ti>.h5" files in <seqs_cov_dir>, keeps only sequences whose
    targets exceed an activity threshold anywhere, records the kept-sequence
    count under <out_dir>/counts, and writes ZLIB-compressed TFRecords of
    (coordinate, sequence, target) examples to <tfr_file>.
    """
    usage = 'usage: %prog [options] <fasta_file> <seqs_bed_file> <seqs_cov_dir> <tfr_file> <fold_set>'
    parser = OptionParser(usage)
    parser.add_option('--threshold',
                      dest='threshold',
                      default=0,
                      type='float',
                      help='Set a minimum threshold for activity.')
    parser.add_option(
        '--test_threshold',
        dest='test_threshold',
        type='float',
        help='Set a minimum threshold for activity for test set.')
    parser.add_option('-s',
                      dest='start_i',
                      default=0,
                      type='int',
                      help='Sequence start index [Default: %default]')
    parser.add_option('-e',
                      dest='end_i',
                      default=None,
                      type='int',
                      help='Sequence end index [Default: %default]')
    parser.add_option('--te',
                      dest='target_extend',
                      default=None,
                      type='int',
                      help='Extend targets vector [Default: %default]')
    parser.add_option(
        '--ts',
        dest='target_start',
        default=0,
        type='int',
        help='Write targets into vector starting at index [Default: %default')
    parser.add_option('-u',
                      dest='umap_npy',
                      help='Unmappable array numpy file')
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 5:
        parser.error('Must provide input arguments.')
    else:
        fasta_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_dir = args[2]
        tfr_file = args[3]
        fold_set = args[4]

    # the test fold may use its own (typically stricter) activity threshold
    if fold_set == 'test':
        options.threshold = options.test_threshold

    ################################################################
    # read model sequences

    model_seqs = []
    with open(seqs_bed_file) as seqs_bed_open:
        for line in seqs_bed_open:
            a = line.split()
            model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    if options.end_i is None:
        options.end_i = len(model_seqs)

    num_seqs = options.end_i - options.start_i

    ################################################################
    # determine sequence coverage files

    # collect consecutive "<ti>.h5" target files until the first missing index
    seqs_cov_files = []
    ti = 0
    seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)
    while os.path.isfile(seqs_cov_file):
        seqs_cov_files.append(seqs_cov_file)
        ti += 1
        seqs_cov_file = '%s/%d.h5' % (seqs_cov_dir, ti)

    if len(seqs_cov_files) == 0:
        print('Sequence coverage files not found, e.g. %s' % seqs_cov_file,
              file=sys.stderr)
        sys.exit(1)

    with h5py.File(seqs_cov_files[0], 'r') as seqs_cov_open:
        seq_pool_len = seqs_cov_open['targets'].shape[1]
    num_targets = len(seqs_cov_files)

    ################################################################
    # read targets

    # extend targets vector beyond the locally available targets, if requested
    num_targets_tfr = num_targets
    if options.target_extend is not None:
        assert (options.target_extend >= num_targets_tfr)
        num_targets_tfr = options.target_extend

    # initialize targets
    targets = np.zeros((num_seqs, seq_pool_len, num_targets_tfr),
                       dtype='float16')

    # read each target into its (possibly offset) column
    for ti in range(num_targets):
        tii = options.target_start + ti
        with h5py.File(seqs_cov_files[ti], 'r') as seqs_cov_open:
            targets[:, :, tii] = seqs_cov_open['targets'][
                options.start_i:options.end_i, :]

    # keep sequences with any position/target exceeding the activity threshold
    mask_by_thr = np.any(np.any(targets > options.threshold, axis=1), axis=-1)
    idx_filt_seqs = np.argwhere(mask_by_thr).flatten()
    num_seqs_to_add = len(idx_filt_seqs)
    print('%s: keeping %d of %d sequences' %
          (fold_set, num_seqs_to_add, num_seqs))

    # record the kept-sequence count in a uniquely named file so parallel
    # workers can be summed later without contending over statistics.json
    count_dir = os.path.join(options.out_dir, 'counts')
    if not os.path.isdir(count_dir):
        os.mkdir(count_dir)
    file_id = fold_set + '_' + uuid.uuid4().hex
    file_path = os.path.join(count_dir, file_id)
    with open(file_path, 'w') as count_open:
        count_open.write(str(num_seqs_to_add))

    ################################################################
    # load unmappable positions

    # load here so --umap_tfr cannot hit an undefined unmap_mask below
    unmap_mask = None
    if options.umap_npy is not None:
        unmap_mask = np.load(options.umap_npy)
    if options.umap_tfr and unmap_mask is None:
        parser.error('--umap_tfr requires an unmappable array via -u')

    ################################################################
    # write TFRecords

    # open FASTA
    fasta_open = pysam.Fastafile(fasta_file)

    # define options
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')
    with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
        for si in idx_filt_seqs:

            msi = options.start_i + si
            mseq = model_seqs[msi]

            # read FASTA
            seq_dna = fasta_open.fetch(mseq.chr, mseq.start, mseq.end)

            # one hot code
            seq_1hot = dna_1hot(seq_dna)

            # serialize coordinate, sequence, and target
            features_dict = {
                'coordinate':
                feature_str('{}_{}_{}'.format(mseq.chr, mseq.start,
                                              mseq.end).encode()),
                'sequence':
                feature_bytes(seq_1hot),
                'target':
                feature_bytes(targets[si, :, :])
            }

            # add unmappability
            if options.umap_tfr:
                features_dict['umap'] = feature_bytes(unmap_mask[msi, :])

            # write example
            example = tf.train.Example(features=tf.train.Features(
                feature=features_dict))
            writer.write(example.SerializeToString())

    fasta_open.close()
コード例 #12
0
def main():
    """Pool genome coverage over model sequences into an HDF5 dataset.

    For each sequence in <seqs_bed_file>, reads nucleotide-resolution
    coverage from the chromosome datasets in <genome_cov_file>, replaces
    NaNs with zero, sum-pools into windows of -w nucleotides, and writes a
    (num_seqs, seq_len_pool) float16 'seqs_cov' dataset to <seqs_cov_file>.
    """
    usage = 'usage: %prog [options] <genome_cov_file> <seqs_bed_file> <seqs_cov_file>'
    parser = OptionParser(usage)
    parser.add_option('-w',
                      dest='pool_width',
                      default=1,
                      type='int',
                      help='Average pooling width [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('')
    else:
        genome_cov_file = args[0]
        seqs_bed_file = args[1]
        seqs_cov_file = args[2]

    # read model sequences
    model_seqs = []
    with open(seqs_bed_file) as seqs_bed_open:
        for line in seqs_bed_open:
            a = line.split()
            model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2])))

    # compute dimensions; all sequences share the first sequence's length
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_pool = seq_len_nt // options.pool_width

    # initialize sequences coverage file, and open genome coverage file
    with h5py.File(seqs_cov_file, 'w') as seqs_cov_open, \
         h5py.File(genome_cov_file, 'r') as genome_cov_open:
        seqs_cov_open.create_dataset('seqs_cov',
                                     shape=(num_seqs, seq_len_pool),
                                     dtype='float16')

        # for each model sequence
        for si in range(num_seqs):
            mseq = model_seqs[si]

            # read coverage
            if mseq.chr in genome_cov_open:
                seq_cov_nt = genome_cov_open[mseq.chr][mseq.start:mseq.end]
            else:
                # BUGFIX: was undefined name 'cov_file' (NameError)
                print("WARNING: %s doesn't see %s:%d-%d. Setting to all zeros."
                      % (genome_cov_file, mseq.chr, mseq.start, mseq.end))
                seq_cov_nt = np.zeros(mseq.end - mseq.start, dtype='float16')

            # set NaN's to zero
            seq_cov_nt = np.nan_to_num(seq_cov_nt)

            # sum pool in float32 to limit float16 accumulation error
            seq_cov = seq_cov_nt.reshape(seq_len_pool, options.pool_width)
            seq_cov = seq_cov.sum(axis=1, dtype='float32')

            # write
            seqs_cov_open['seqs_cov'][si, :] = seq_cov