Beispiel #1
0
def test_read_gen2():
  """Read gen: Read pos, cigar, v_list and seq (cpy 2)

  Builds the node list from vcf[0]['v'][0] and checks the
  (pos, cigar, v_list, seq) tuple produced for a handful of 10 bp reads.
  """
  reference, variants = load_data()
  nodes = rgen.create_node_list(reference, ref_start_pos=1, vl=variants[0]['v'][0])

  # Each case: (pos, length, begin node, end node) -> expected (pos, cigar, v_list, seq)
  cases = [
    ((1, 10, 0, 0), (1, '10=', [], 'ATGACGTATC')),
    ((2, 10, 0, 0), (2, '10=', [], 'TGACGTATCC')),
    ((4, 10, 0, 0), (4, '10=', [], 'ACGTATCCAA')),
    ((5, 10, 0, 1), (5, '9=1X', [0], 'CGTATCCAAT')),
    ((6, 10, 0, 2), (6, '8=1X1=', [0], 'GTATCCAATG')),
  ]
  for (pos, length, first_node, last_node), expected in cases:
    assert rgen.generate_read(pos, length, first_node, last_node, nodes) == expected
Beispiel #2
0
def test_get_begin_end_nodes():
  """Read gen: find start and stop nodes for reads

  Requests the begin/end node indexes for 10 bp reads starting at every
  position from 1 to 15 and compares both returned arrays with the expected
  values.
  """
  reference, variants = load_data()
  nodes = rgen.create_node_list(reference, ref_start_pos=1, vl=variants[0]['v'][1])

  read_positions = np.arange(1, 16, dtype=int)
  read_length = 10
  expected_begin = np.array([0, 0, 0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6])
  expected_end = np.array([3, 3, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8])

  node_bounds = rgen.get_begin_end_nodes(read_positions, read_length, nodes)
  # Element 0 holds the begin nodes, element 1 the end nodes
  assert_array_equal(node_bounds[0], expected_begin)
  assert_array_equal(node_bounds[1], expected_end)
Beispiel #3
0
def test_get_begin_end_nodes():
    """Read gen: find start and stop nodes for reads

    Requests the begin/end node indexes for 10 bp reads starting at every
    position from 1 to 15 and compares both returned arrays with the expected
    values.
    """
    reference, variants = load_data()
    nodes = rgen.create_node_list(reference,
                                  ref_start_pos=1,
                                  vl=variants[0]['v'][1])

    read_positions = np.arange(1, 16, dtype=int)
    read_length = 10
    expected_begin = np.array([0, 0, 0, 0, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6])
    expected_end = np.array([3, 3, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8])

    node_bounds = rgen.get_begin_end_nodes(read_positions, read_length, nodes)
    # Element 0 holds the begin nodes, element 1 the end nodes
    assert_array_equal(node_bounds[0], expected_begin)
    assert_array_equal(node_bounds[1], expected_end)
Beispiel #4
0
def test_read_gen_crowded2():
  """Read gen: SNP immediately followed by an INS

  Loads the tiny example reference and the 'test-snp-ins' variant file, builds
  the node list from vcf[0]['v'][1] and checks that a 10 bp read starting at
  position 1 reports the SNP and the insertion back to back in its CIGAR.
  """
  fasta_path = os.path.join(mitty.test.example_data_dir, 'tiny.fasta')
  # Use a context manager so the FASTA handle is closed deterministically.
  # The sequence is taken from the second line of the file (the first is skipped).
  with open(fasta_path) as fasta_fp:
    ref_seq = fasta_fp.readlines()[1]
  vcf = vio.load_variant_file(
    os.path.join(mitty.test.example_data_dir, 'test-snp-ins.vcf.gz'),
    'g0_s0',
    os.path.join(mitty.test.example_data_dir, 'tiny.whole.bed'))

  # ATGACGTATCCAAGGAGGCGTTACC
  # 12345678901234567890
  nodes = rgen.create_node_list(ref_seq, ref_start_pos=1, vl=vcf[0]['v'][1])
  assert rgen.generate_read(1, 10, 0, 3, nodes) == (1, '4=1X3I2=', [0, 3], 'ATGATTTTGT')
Beispiel #5
0
def test_read_gen_crowded1():
  """Read gen: DEL followed immediately by a SNP

  Loads the tiny example reference and the 'test-del-snp' variant file, builds
  the node list from vcf[0]['v'][1] and checks that a 10 bp read starting at
  position 1 reports the deletion and the SNP back to back in its CIGAR.
  """
  fasta_path = os.path.join(mitty.test.example_data_dir, 'tiny.fasta')
  # Use a context manager so the FASTA handle is closed deterministically.
  # The sequence is taken from the second line of the file (the first is skipped).
  with open(fasta_path) as fasta_fp:
    ref_seq = fasta_fp.readlines()[1]
  vcf = vio.load_variant_file(
    os.path.join(mitty.test.example_data_dir, 'test-del-snp.vcf.gz'),
    'g0_s0',
    os.path.join(mitty.test.example_data_dir, 'tiny.whole.bed'))

  # ATGACGTATCCAAGGAGGCGTTACC
  # 12345678901234567890
  nodes = rgen.create_node_list(ref_seq, ref_start_pos=1, vl=vcf[0]['v'][1])
  assert rgen.generate_read(1, 10, 0, 3, nodes) == (1, '5=2D1X4=', [-2, 0], 'ATGACTTCCA')
Beispiel #6
0
def test_read_gen_crowded2():
    """Read gen: SNP immediately followed by an INS

    Loads the tiny example reference and the 'test-snp-ins' variant file,
    builds the node list from vcf[0]['v'][1] and checks that a 10 bp read
    starting at position 1 reports the SNP and the insertion back to back in
    its CIGAR.
    """
    fasta_path = os.path.join(mitty.test.example_data_dir, 'tiny.fasta')
    # Use a context manager so the FASTA handle is closed deterministically.
    # The sequence is taken from the second line of the file (the first is skipped).
    with open(fasta_path) as fasta_fp:
        ref_seq = fasta_fp.readlines()[1]
    vcf = vio.load_variant_file(
        os.path.join(mitty.test.example_data_dir, 'test-snp-ins.vcf.gz'),
        'g0_s0', os.path.join(mitty.test.example_data_dir, 'tiny.whole.bed'))

    # ATGACGTATCCAAGGAGGCGTTACC
    # 12345678901234567890
    nodes = rgen.create_node_list(ref_seq, ref_start_pos=1, vl=vcf[0]['v'][1])
    assert rgen.generate_read(1, 10, 0, 3,
                              nodes) == (1, '4=1X3I2=', [0, 3], 'ATGATTTTGT')
Beispiel #7
0
def test_read_gen_crowded1():
    """Read gen: DEL followed immediately by a SNP

    Loads the tiny example reference and the 'test-del-snp' variant file,
    builds the node list from vcf[0]['v'][1] and checks that a 10 bp read
    starting at position 1 reports the deletion and the SNP back to back in
    its CIGAR.
    """
    fasta_path = os.path.join(mitty.test.example_data_dir, 'tiny.fasta')
    # Use a context manager so the FASTA handle is closed deterministically.
    # The sequence is taken from the second line of the file (the first is skipped).
    with open(fasta_path) as fasta_fp:
        ref_seq = fasta_fp.readlines()[1]
    vcf = vio.load_variant_file(
        os.path.join(mitty.test.example_data_dir, 'test-del-snp.vcf.gz'),
        'g0_s0', os.path.join(mitty.test.example_data_dir, 'tiny.whole.bed'))

    # ATGACGTATCCAAGGAGGCGTTACC
    # 12345678901234567890
    nodes = rgen.create_node_list(ref_seq, ref_start_pos=1, vl=vcf[0]['v'][1])
    assert rgen.generate_read(1, 10, 0, 3,
                              nodes) == (1, '5=2D1X4=', [-2, 0], 'ATGACTTCCA')
Beispiel #8
0
def test_read_gen2():
    """Read gen: Read pos, cigar, v_list and seq (cpy 2)

    Builds the node list from vcf[0]['v'][0] and checks the
    (pos, cigar, v_list, seq) tuple produced for a handful of 10 bp reads.
    """
    reference, variants = load_data()
    nodes = rgen.create_node_list(reference,
                                  ref_start_pos=1,
                                  vl=variants[0]['v'][0])

    # Each case: (pos, length, begin node, end node) -> expected (pos, cigar, v_list, seq)
    cases = [
        ((1, 10, 0, 0), (1, '10=', [], 'ATGACGTATC')),
        ((2, 10, 0, 0), (2, '10=', [], 'TGACGTATCC')),
        ((4, 10, 0, 0), (4, '10=', [], 'ACGTATCCAA')),
        ((5, 10, 0, 1), (5, '9=1X', [0], 'CGTATCCAAT')),
        ((6, 10, 0, 2), (6, '8=1X1=', [0], 'GTATCCAATG')),
    ]
    for (pos, length, first_node, last_node), expected in cases:
        observed = rgen.generate_read(pos, length, first_node, last_node,
                                      nodes)
        assert observed == expected
Beispiel #9
0
def test_expand_sequence():
    """Read gen: Sequence expansion

    Builds the node list from vcf[0]['v'][1] and checks every one of the nine
    resulting nodes field by field.
    """
    reference, variants = load_data()
    nodes = rgen.create_node_list(reference,
                                  ref_start_pos=1,
                                  vl=variants[0]['v'][1])

    assert len(nodes) == 9

    # Columns: ps, pr, op, l, seq, plus a sixth field
    # (presumably the v_list entry for the variant - TODO confirm)
    silent_checks = [
        (1, 1, '=', 4, 'ATGA', None),
        (5, 5, 'X', 1, 'T', 0),
        (6, 6, '=', 3, 'GTA', None),
        (9, 9, 'I', 3, 'TTT', 3),
        (12, 9, '=', 3, 'TCC', None),
        (14, 14, 'D', 2, '', -2),
        (15, 14, '=', 7, 'GGAGGCG', None),
    ]
    for index, wanted in enumerate(silent_checks):
        assert nodes[index] == wanted

    # The last two nodes dump the whole node list when they fail
    verbose_checks = [
        (7, (21, 25, 'D', 4, '', -4)),
        (8, (22, 25, '=', 1, 'C', None)),
    ]
    for index, wanted in verbose_checks:
        assert nodes[index] == wanted, nodes
Beispiel #10
0
def test_expand_sequence():
  """Read gen: Sequence expansion

  Builds the node list from vcf[0]['v'][1] and checks every one of the nine
  resulting nodes field by field.
  """
  reference, variants = load_data()
  nodes = rgen.create_node_list(reference, ref_start_pos=1, vl=variants[0]['v'][1])

  assert len(nodes) == 9

  # Columns: ps, pr, op, l, seq, plus a sixth field
  # (presumably the v_list entry for the variant - TODO confirm)
  silent_checks = [
    (1, 1, '=', 4, 'ATGA', None),
    (5, 5, 'X', 1, 'T', 0),
    (6, 6, '=', 3, 'GTA', None),
    (9, 9, 'I', 3, 'TTT', 3),
    (12, 9, '=', 3, 'TCC', None),
    (14, 14, 'D', 2, '', -2),
    (15, 14, '=', 7, 'GGAGGCG', None),
  ]
  for index, wanted in enumerate(silent_checks):
    assert nodes[index] == wanted

  # The last two nodes dump the whole node list when they fail
  verbose_checks = [
    (7, (21, 25, 'D', 4, '', -4)),
    (8, (22, 25, '=', 1, 'C', None)),
  ]
  for index, wanted in verbose_checks:
    assert nodes[index] == wanted, nodes
Beispiel #11
0
def test_read_gen1():
  """Read gen: Read pos, cigar, v_list and seq (cpy 1)

  Builds the node list from vcf[0]['v'][1] and slides a read across positions
  1 to 15, checking the (pos, cigar, v_list, seq) tuple returned at each step.
  """
  reference, variants = load_data()
  nodes = rgen.create_node_list(reference, ref_start_pos=1, vl=variants[0]['v'][1])

  # Each case: (pos, length, begin node, end node) -> expected (pos, cigar, v_list, seq)
  cases = [
    ((1, 10, 0, 3), (1, '4=1X3=2I', [0, 3], 'ATGATGTATT')),
    ((2, 10, 0, 3), (2, '3=1X3=3I', [0, 3], 'TGATGTATTT')),
    ((3, 10, 0, 4), (3, '2=1X3=3I1=', [0, 3], 'GATGTATTTT')),
    ((4, 10, 0, 4), (4, '1=1X3=3I2=', [0, 3], 'ATGTATTTTC')),
    ((5, 10, 1, 4), (5, '1X3=3I3=', [0, 3], 'TGTATTTTCC')),
    ((6, 10, 2, 6), (6, '3=3I3=2D1=', [3, -2], 'GTATTTTCCG')),
    ((7, 10, 2, 6), (7, '2=3I3=2D2=', [3, -2], 'TATTTTCCGG')),
    ((8, 10, 2, 6), (8, '1=3I3=2D3=', [3, -2], 'ATTTTCCGGA')),
    ((9, 10, 3, 6), (9, '3I3=2D4=', [3, -2], 'TTTTCCGGAG')),
    ((10, 10, 3, 6), (9, '2I3=2D5=', [3, -2], 'TTTCCGGAGG')),
    ((11, 10, 3, 6), (9, '1I3=2D6=', [3, -2], 'TTCCGGAGGC')),
    ((12, 10, 4, 6), (9, '3=2D7=', [-2], 'TCCGGAGGCG')),
    ((13, 10, 4, 8), (10, '2=2D7=4D1=', [-2, -4], 'CCGGAGGCGC')),
    ((14, 9, 4, 8), (11, '1=2D7=4D1=', [-2, -4], 'CGGAGGCGC')),
    ((15, 8, 6, 8), (14, '7=4D1=', [-4], 'GGAGGCGC')),
  ]
  for (pos, length, first_node, last_node), expected in cases:
    assert rgen.generate_read(pos, length, first_node, last_node, nodes) == expected
Beispiel #12
0
def test_read_gen1():
    """Read gen: Read pos, cigar, v_list and seq (cpy 1)

    Builds the node list from vcf[0]['v'][1] and slides a read across
    positions 1 to 15, checking the (pos, cigar, v_list, seq) tuple returned
    at each step.
    """
    reference, variants = load_data()
    nodes = rgen.create_node_list(reference,
                                  ref_start_pos=1,
                                  vl=variants[0]['v'][1])

    # Each case: (pos, length, begin node, end node) -> expected (pos, cigar, v_list, seq)
    cases = [
        ((1, 10, 0, 3), (1, '4=1X3=2I', [0, 3], 'ATGATGTATT')),
        ((2, 10, 0, 3), (2, '3=1X3=3I', [0, 3], 'TGATGTATTT')),
        ((3, 10, 0, 4), (3, '2=1X3=3I1=', [0, 3], 'GATGTATTTT')),
        ((4, 10, 0, 4), (4, '1=1X3=3I2=', [0, 3], 'ATGTATTTTC')),
        ((5, 10, 1, 4), (5, '1X3=3I3=', [0, 3], 'TGTATTTTCC')),
        ((6, 10, 2, 6), (6, '3=3I3=2D1=', [3, -2], 'GTATTTTCCG')),
        ((7, 10, 2, 6), (7, '2=3I3=2D2=', [3, -2], 'TATTTTCCGG')),
        ((8, 10, 2, 6), (8, '1=3I3=2D3=', [3, -2], 'ATTTTCCGGA')),
        ((9, 10, 3, 6), (9, '3I3=2D4=', [3, -2], 'TTTTCCGGAG')),
        ((10, 10, 3, 6), (9, '2I3=2D5=', [3, -2], 'TTTCCGGAGG')),
        ((11, 10, 3, 6), (9, '1I3=2D6=', [3, -2], 'TTCCGGAGGC')),
        ((12, 10, 4, 6), (9, '3=2D7=', [-2], 'TCCGGAGGCG')),
        ((13, 10, 4, 8), (10, '2=2D7=4D1=', [-2, -4], 'CCGGAGGCGC')),
        ((14, 9, 4, 8), (11, '1=2D7=4D1=', [-2, -4], 'CGGAGGCGC')),
        ((15, 8, 6, 8), (14, '7=4D1=', [-4], 'GGAGGCGC')),
    ]
    for (pos, length, first_node, last_node), expected in cases:
        observed = rgen.generate_read(pos, length, first_node, last_node,
                                      nodes)
        assert observed == expected
Beispiel #13
0
def read_generating_worker(worker_id, fasta_fname, sample_name, read_module, read_model, in_queue, out_queue):
  """Generate read templates for queued regions and push them onto out_queue.

  For every work item pulled from in_queue (until the stop code is received) the worker fetches the
  region's reference sequence, builds the node list for the requested copy, asks read_module for the read
  locations and finally generates the reads themselves. Templates in which any read contains more than
  two 'N's are dropped.

  :param worker_id: Serial identifying this worker (used in the log messages)
  :param fasta_fname: Path of the reference FASTA, opened with pysam.FastaFile
  :param sample_name: Passed through unchanged in every emitted template
  :param read_module: Object providing generate_reads(read_model, p_min, p_max, rng_seed)
  :param read_model: Model handed to read_module.generate_reads
  :param in_queue: Supplies work items as dicts with keys 'region_idx', 'region_cpy' and 'rng_seed'
  :param out_queue: Receives one tuple per template:
                    (None, sample_name, chrom, copy, [(strand, pos, cigar, v_list, MD, seq, qual), ...])
  :return: None
  """
  fasta = pysam.FastaFile(fasta_fname)
  total_cnt, t00 = 0, time.time()
  try:
    for wd in iter(in_queue.get, __process_stop_code__):
      r_idx, cpy, rng_seed = wd['region_idx'], wd['region_cpy'], wd['rng_seed']
      region = vcf_df[r_idx]['region']
      ref_seq = sanitize(fasta.fetch(reference=region[0], start=region[1], end=region[2]))

      # The structure needed to generate read sequences, pos and CIGAR strings
      # + 1 because BED is 0-indexed and our convention is 1-indexed as is what is displayed in genome browsers
      node_list = rpc.create_node_list(ref_seq, ref_start_pos=region[1] + 1, vl=vcf_df[r_idx]['v'][cpy])

      p_min, p_max = node_list[0].ps, node_list[-1].ps + node_list[-1].oplen  # We never end with a deletion
      r_info_l = read_module.generate_reads(read_model, p_min, p_max, rng_seed)

      t0 = time.time()
      this_cnt = 0
      # One iterator per read-in-template, each yielding (file_order, strand, pos, len, begin node, end node)
      per_read_rows = [
        zip(*([r_info[k] for k in ('file_order', 'strand', 'pos', 'len')] +
              rpc.get_begin_end_nodes(r_info['pos'], r_info['len'], node_list)))
        for r_info in r_info_l]
      for template in zip(*per_read_rows):
        reads = [None] * len(template)
        for fo, s, p, l, ns, ne in template:
          pos, cigar, v_list, seq = rpc.generate_read(p, l, ns, ne, node_list)
          if seq.count('N') > 2:
            # This break combined with the else clause skips those templates where
            # at least one read has too many 'N's
            break
          if s == 1:
            # Strand 1 reads are emitted as the reverse complement
            seq = seq.translate(DNA_complement)[::-1]
          # MD is left empty and every base gets the same quality character 'I'
          reads[fo] = (s, pos, cigar, v_list, '', seq, 'I' * len(seq))
        else:
          this_cnt += 1
          # (
          #   None,  - We want the writer to put a unique stamp on each template
          #   sample_name,
          #   chrom,
          #   copy,
          #   (
          #     (strand, pos, cigar, (v1,v2,...), MD, seq, qual)
          #     ...  [repeated as for as many reads in this template]
          #   )
          # )
          out_queue.put((None, sample_name, region[0], cpy, reads))

      elapsed = time.time() - t0
      # A region can finish within the timer's resolution; avoid dividing by zero in that case
      rate = this_cnt / elapsed if elapsed > 0 else 0.0
      logger.debug('Worker {} ({}): {} templates in {:0.2f}s ({:0.2f} t/s)'.format(
        worker_id, region, this_cnt, elapsed, rate))
      total_cnt += this_cnt
  finally:
    # Release the FASTA handle even when a region fails
    fasta.close()

  total_elapsed = time.time() - t00
  total_rate = total_cnt / total_elapsed if total_elapsed > 0 else 0.0
  logger.debug('Worker {} finished: {} templates in {:0.2f}s ({:0.2f} t/s)'.format(
    worker_id, total_cnt, total_elapsed, total_rate))
Beispiel #14
0
def test_read_gen_ins():
    """Read gen: Reads from inside insertion

    Builds the node list from vcf[0]['v'][1] and generates a 2 bp read whose
    begin and end node are both node 3, checking the returned
    (pos, cigar, v_list, seq) tuple.
    """
    reference, variants = load_data()
    nodes = rgen.create_node_list(reference,
                                  ref_start_pos=1,
                                  vl=variants[0]['v'][1])

    expected = (9, '>0+2I', [3], 'TT')
    observed = rgen.generate_read(9, 2, 3, 3, nodes)
    assert observed == expected
Beispiel #15
0
def read_generating_worker(worker_id, fasta_fname, sample_name, read_module,
                           read_model, in_queue, out_queue):
    """Generate read templates for queued regions and push them onto out_queue.

    For every work item pulled from in_queue (until the stop code is received)
    the worker fetches the region's reference sequence, builds the node list
    for the requested copy, asks read_module for the read locations and finally
    generates the reads themselves. Templates in which any read contains more
    than two 'N's are dropped.

    :param worker_id: Serial identifying this worker (used in the log messages)
    :param fasta_fname: Path of the reference FASTA, opened with pysam.FastaFile
    :param sample_name: Passed through unchanged in every emitted template
    :param read_module: Object providing generate_reads(read_model, p_min, p_max, rng_seed)
    :param read_model: Model handed to read_module.generate_reads
    :param in_queue: Supplies work items as dicts with keys 'region_idx', 'region_cpy' and 'rng_seed'
    :param out_queue: Receives one tuple per template:
                      (None, sample_name, chrom, copy, [(strand, pos, cigar, v_list, MD, seq, qual), ...])
    :return: None
    """
    fasta = pysam.FastaFile(fasta_fname)
    total_cnt, t00 = 0, time.time()
    try:
        for wd in iter(in_queue.get, __process_stop_code__):
            r_idx, cpy, rng_seed = (wd['region_idx'], wd['region_cpy'],
                                    wd['rng_seed'])
            region = vcf_df[r_idx]['region']
            ref_seq = sanitize(
                fasta.fetch(reference=region[0],
                            start=region[1],
                            end=region[2]))

            # The structure needed to generate read sequences, pos and CIGAR strings
            # + 1 because BED is 0-indexed and our convention is 1-indexed as is what is displayed in genome browsers
            node_list = rpc.create_node_list(ref_seq,
                                             ref_start_pos=region[1] + 1,
                                             vl=vcf_df[r_idx]['v'][cpy])

            # We never end with a deletion
            p_min = node_list[0].ps
            p_max = node_list[-1].ps + node_list[-1].oplen
            r_info_l = read_module.generate_reads(read_model, p_min, p_max,
                                                  rng_seed)

            t0 = time.time()
            this_cnt = 0
            # One iterator per read-in-template, each yielding
            # (file_order, strand, pos, len, begin node, end node)
            per_read_rows = [
                zip(*([
                    r_info[k] for k in ('file_order', 'strand', 'pos', 'len')
                ] + rpc.get_begin_end_nodes(r_info['pos'], r_info['len'],
                                            node_list)))
                for r_info in r_info_l
            ]
            for template in zip(*per_read_rows):
                reads = [None] * len(template)
                for fo, s, p, l, ns, ne in template:
                    pos, cigar, v_list, seq = rpc.generate_read(
                        p, l, ns, ne, node_list)
                    if seq.count('N') > 2:
                        # This break combined with the else clause skips those
                        # templates where at least one read has too many 'N's
                        break
                    if s == 1:
                        # Strand 1 reads are emitted as the reverse complement
                        seq = seq.translate(DNA_complement)[::-1]
                    # MD is left empty and every base gets the same quality character 'I'
                    reads[fo] = (s, pos, cigar, v_list, '', seq,
                                 'I' * len(seq))
                else:
                    this_cnt += 1
                    # (
                    #   None,  - We want the writer to put a unique stamp on each template
                    #   sample_name,
                    #   chrom,
                    #   copy,
                    #   (
                    #     (strand, pos, cigar, (v1,v2,...), MD, seq, qual)
                    #     ...  [repeated as for as many reads in this template]
                    #   )
                    # )
                    out_queue.put((None, sample_name, region[0], cpy, reads))

            elapsed = time.time() - t0
            # A region can finish within the timer's resolution; avoid
            # dividing by zero in that case
            rate = this_cnt / elapsed if elapsed > 0 else 0.0
            logger.debug(
                'Worker {} ({}): {} templates in {:0.2f}s ({:0.2f} t/s)'.format(
                    worker_id, region, this_cnt, elapsed, rate))
            total_cnt += this_cnt
    finally:
        # Release the FASTA handle even when a region fails
        fasta.close()

    total_elapsed = time.time() - t00
    total_rate = total_cnt / total_elapsed if total_elapsed > 0 else 0.0
    logger.debug(
        'Worker {} finished: {} templates in {:0.2f}s ({:0.2f} t/s)'.format(
            worker_id, total_cnt, total_elapsed, total_rate))
Beispiel #16
0
def test_read_gen_ins():
  """Read gen: Reads from inside insertion

  Builds the node list from vcf[0]['v'][1] and generates a 2 bp read whose
  begin and end node are both node 3, checking the returned
  (pos, cigar, v_list, seq) tuple.
  """
  reference, variants = load_data()
  nodes = rgen.create_node_list(reference, ref_start_pos=1, vl=variants[0]['v'][1])

  expected = (9, '>0+2I', [3], 'TT')
  observed = rgen.generate_read(9, 2, 3, 3, nodes)
  assert observed == expected