Beispiel #1
0
    def test_merge_with_depth(self):
        """Check that depth values are carried through a merge.

        Part one feeds two mocked single-SNV inputs straight into
        ``VarMerger._merge_vars``; part two merges ``format_def.h5`` with
        itself and inspects the DP calls of the first merged variation.
        """
        def mock_snv(depth):
            # Build a fresh dict (and fresh arrays) on every call so the two
            # inputs share no mutable state.
            return {'chrom': '1', 'pos': 1, 'ref': b'A', 'alt': [b'T'],
                    'gts': numpy.array([[0, 0], [1, 1]]),
                    'dp': numpy.array([depth, depth])}

        shallow_vars = MockList([mock_snv(1)])
        deep_vars = MockList([mock_snv(20)])
        shallow_vars.samples = ['a', 'b']
        deep_vars.samples = ['c', 'd']
        mock_merger = MockMerger(gt_shape=(4, 2))

        merged_var = VarMerger._merge_vars(mock_merger, shallow_vars[0],
                                           deep_vars[0])
        expected_var = {'gts': [[0, 0], [1, 1], [0, 0], [1, 1]], 'pos': 1,
                        'ref': b'A', 'chrom': '1', 'alt': [b'T'],
                        'dp': [1, 1, 20, 20]}
        self.var_is_equal(expected_var, merged_var)

        # Merge the same file with itself, keeping the depth calls.
        h5_fpath = join(TEST_DATA_DIR, 'format_def.h5')
        left_h5 = VariationsH5(h5_fpath, "r")
        right_h5 = VariationsH5(h5_fpath, "r")
        merger = VarMerger(left_h5, right_h5, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False, ignore_non_matching=True)
        merged_arrays = VariationsArrays(ignore_overflows=True,
                                         ignore_undefined_fields=True)

        # Depth of the first SNV: the three samples' values, once per input.
        expected_depth = numpy.array([1, 8, 5, 1, 8, 5], dtype=numpy.int16)
        first_variation = list(merger.variations)[0]
        dp_call = first_variation[8][1]
        assert dp_call[0] == b'DP'
        assert numpy.all(dp_call[1] == expected_depth)

        merged_arrays.put_vars(merger)
        assert '/calls/DP' in merged_arrays.keys()
        assert numpy.all(merged_arrays['/calls/DP'][0] == expected_depth)
Beispiel #2
0
 def test_sort_variations(self):
     """Load three SNPs from a TSV file and check sort_variations' output.

     The expected result lists both ``chrom1`` SNPs (positions 325, 346)
     before the ``chrom2`` SNP (position 345).
     """
     var_info = {
         b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
         b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
         b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325},
     }
     tsv_fpath = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
     # The context manager closes the file even when an assertion fails.
     with open(tsv_fpath, 'rb') as fhand:
         parser = CSVParser(fhand, var_info, first_sample_column=1,
                            sep=b'\t')
         variations = VariationsArrays(ignore_undefined_fields=True)
         variations.put_vars(parser)
         sorted_vars = VariationsArrays()
         sort_variations(variations, sorted_vars)
         exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
         exp_pos = [325, 346, 345]
         assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
         assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
Beispiel #3
0
    def test_parse_bam(self):
        """Parse ``example.rg.bam`` and check the variations that are stored.

        Four variations on chromosome ``ref`` are expected, at positions
        15, 16, 17 and 36, together with the AD and GT call fields.
        """
        field_lens = {'alt': 1, 'CALLS': {b'AD': 3}}
        field_str_lens = {'chrom': 20}
        bam_parser = BAMParser([join(TEST_DATA_DIR, 'example.rg.bam')],
                               kmer_size=4, ploidy=2, min_num_samples=2,
                               max_field_lens=field_lens,
                               max_field_str_lens=field_str_lens)

        snps = VariationsArrays(ignore_undefined_fields=True)
        snps.put_vars(bam_parser)

        # Global properties of the loaded data set.
        assert snps.ploidy
        assert list(snps.chroms) == ['ref']
        assert snps.num_variations == 4

        # One reference entry per variation, each of length four.
        assert len(snps[REF_FIELD]) == 4
        assert len(snps[REF_FIELD][0]) == 4

        assert list(snps[CHROM_FIELD]) == ['ref'] * 4
        assert list(snps[POS_FIELD]) == [15, 16, 17, 36]
        assert AD_FIELD in snps
        assert GT_FIELD in snps
Beispiel #4
0
 def test_delete_item_from_variationArray(self):
     """Deleting a field path from a VariationsArrays removes its key."""
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # The context manager closes the file even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
 def test_delete_item_from_variationArray(self):
     """Deleting a field path from a VariationsArrays removes its key.

     The VCF is parsed with ``pre_read_max_size=1000``.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # The context manager closes the file even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
Beispiel #6
0
    def test_ignore_non_matching(self):
        """Merge two HDF5 files with ``ignore_non_matching=True``.

        Loading the merger into a VariationsArrays is expected to yield
        exactly one variation.
        """
        first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger_options = {'max_field_lens': {'alt': 3},
                          'ignore_complex_overlaps': True,
                          'check_ref_matches': False,
                          'ignore_non_matching': True}
        merger = VarMerger(first_h5, second_h5, **merger_options)

        merged_vars = VariationsArrays(ignore_undefined_fields=True)
        merged_vars.put_vars(merger)
        assert merged_vars.num_variations == 1
Beispiel #7
0
 def test_put_vars_arrays_from_vcf(self):
     """Load ``format_def.vcf`` into a VariationsArrays and spot-check it.

     Checks the shape of the GT array, the genotypes of the second
     variation and the GQ values of the first one.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # The context manager closes the file even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         assert snps['/calls/GT'].shape == (5, 3, 2)
         assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
         expected = numpy.array([48, 48, 43], dtype=numpy.int16)
         assert numpy.all(snps['/calls/GQ'][0, :] == expected)
 def test_put_vars_arrays_from_vcf(self):
     """Load ``format_def.vcf`` into a VariationsArrays and spot-check it.

     The VCF is parsed with ``pre_read_max_size=1000``. Checks the shape
     of the GT array, the genotypes of the second variation and the GQ
     values of the first one.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # The context manager closes the file even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         assert snps['/calls/GT'].shape == (5, 3, 2)
         assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
         expected = numpy.array([48, 48, 43], dtype=numpy.int16)
         assert numpy.all(snps['/calls/GQ'][0, :] == expected)
Beispiel #9
0
    def test_ignore_non_matching(self):
        """Merge two HDF5 files with ``ignore_non_matching=True``.

        The merger is loaded into a VariationsArrays that ignores overflows
        and undefined fields; exactly one variation is expected.
        """
        first_h5 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        second_h5 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger_options = {'max_field_lens': {'alt': 3},
                          'ignore_complex_overlaps': True,
                          'check_ref_matches': False,
                          'ignore_non_matching': True}
        merger = VarMerger(first_h5, second_h5, **merger_options)

        merged_vars = VariationsArrays(ignore_overflows=True,
                                       ignore_undefined_fields=True)
        merged_vars.put_vars(merger)
        assert merged_vars.num_variations == 1
Beispiel #10
0
    def test_by_chunks(self):
        """Load the same VCF with default chunking and one variation per chunk.

        Both loads must end up storing the same number of variations.
        """
        vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')

        # Context managers close the files even if put_vars raises.
        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays()
            snps.put_vars(vcf_parser)

        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            chunked_snps = VariationsArrays(vars_in_chunk=1)
            chunked_snps.put_vars(vcf_parser)

        # The chunk size must not change how many variations are stored.
        assert chunked_snps.num_variations == snps.num_variations
Beispiel #11
0
    def test_merge_variations(self):
        """Merge two HDF5 files in both orders and compare with fixtures.

        Every field of the merged result is compared against
        ``expected_merged.h5`` (first order) and ``expected_merged2.h5``
        (inputs swapped). Float fields are compared after ``remove_nans``.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                assert numpy.all(remove_nans(expected_h5[field][:]) ==
                                 remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]
                try:
                    assert numpy.all(expected_h5[field][:] == result)
                except AssertionError:
                    # Show what differed, then let the failure propagate:
                    # swallowing it here would make this loop unable to fail.
                    print(field)
                    print(expected_h5[field][:])
                    print(result)
                    raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                assert numpy.all(remove_nans(expected_h5[field][:]) ==
                                 remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]
                assert numpy.all(expected_h5[field][:] == result)
Beispiel #12
0
 def test_vcf_detect_fields(self):
     """Check the metadata produced with kept_fields / ignored_fields.

     One parser is given ``kept_fields=['/variations/qual']``, the other
     ``ignored_fields=['/variations/qual']``; the resulting metadata keys
     are then inspected.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # Both handles are closed on exit, even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand, \
             open(vcf_fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata
         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata
         assert '/calls/HQ' in metadata.keys()
         assert '/variations/qual' not in metadata2.keys()
Beispiel #13
0
 def test_vcf_detect_fields(self):
     """Check the metadata produced with kept_fields / ignored_fields.

     Both parsers use ``pre_read_max_size=1000``. One is given
     ``kept_fields=['/variations/qual']``, the other
     ``ignored_fields=['/variations/qual']``; the resulting metadata keys
     are then inspected.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # Both handles are closed on exit, even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand, \
             open(vcf_fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, pre_read_max_size=1000,
                         kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, pre_read_max_size=1000,
                          ignored_fields=['/variations/qual'])
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata
         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata
         assert '/calls/HQ' in metadata.keys()
         assert '/variations/qual' not in metadata2.keys()
Beispiel #14
0
 def test_generated_vcf_feed_outputs_equal_vcfs(self):
     """Round-trip a generated VCF and require an identical second VCF.

     An HDF5 file is written out as VCF, that VCF is parsed back into a
     VariationsArrays and written out again; both VCF files are then
     compared line by line.
     """
     from itertools import zip_longest

     h5_vars = VariationsH5(
         join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.1stchunk.h5'), "r")
     with NamedTemporaryFile(mode='wb') as vcf_vars_from_h5:
         write_vcf(h5_vars, vcf_vars_from_h5)
         vcf_vars_from_h5.flush()

         vcf_vars_parsed = VariationsArrays()
         # Re-open by name for reading; closed as soon as it is loaded.
         with open(vcf_vars_from_h5.name, 'rb') as vcf_fhand:
             vcf_vars_parsed.put_vars(VCFParser(vcf_fhand))

         with NamedTemporaryFile(mode='wb') as vcf_vars_from_vcf:
             vcf_vars_parsed.write_vcf(vcf_vars_from_vcf)
             vcf_vars_from_vcf.flush()
             with open(vcf_vars_from_h5.name, 'rb') as vcf_from_h5_fhand, \
                     open(vcf_vars_from_vcf.name, 'rb') as vcf_from_vcf_fhand:
                 # zip_longest pads the shorter file with None, so a
                 # missing or extra line fails instead of being skipped.
                 for line_parsed_from_h5, line_parsed_from_vcf in zip_longest(
                         vcf_from_h5_fhand, vcf_from_vcf_fhand):
                     assert line_parsed_from_h5 == line_parsed_from_vcf, (
                         "when importing from a generated VCF and "
                         "exporting to a new VCF both files must be "
                         "the same")
Beispiel #15
0
 def test_sort_variations(self):
     """Load three SNPs from a TSV file and check sort_variations' output.

     The parser is given explicit field lengths and the target arrays
     ignore overflows. The expected result lists both ``chrom1`` SNPs
     (positions 325, 346) before the ``chrom2`` SNP (position 345).
     """
     var_info = {b'solcap_snp_sl_15058': {'chrom': b'chrom2', 'pos': 345},
                 b'solcap_snp_sl_60635': {'chrom': b'chrom1', 'pos': 346},
                 b'solcap_snp_sl_60604': {'chrom': b'chrom1', 'pos': 325}}
     tsv_fpath = join(TEST_DATA_DIR, 'csv', 'standard_ex.tsv')
     # The context manager closes the file even when an assertion fails.
     with open(tsv_fpath, 'rb') as fhand:
         parser = CSVParser(fhand, var_info, first_sample_column=1,
                            sep=b'\t', max_field_lens={'alt': 3},
                            max_field_str_lens={'chrom': 10, 'alt': 10})
         variations = VariationsArrays(ignore_overflows=True,
                                       ignore_undefined_fields=True)
         variations.put_vars(parser)
         sorted_vars = VariationsArrays()
         sort_variations(variations, sorted_vars)
         exp_chrom = [b'chrom1', b'chrom1', b'chrom2']
         exp_pos = [325, 346, 345]
         assert numpy.all(sorted_vars['/variations/chrom'] == exp_chrom)
         assert numpy.all(sorted_vars['/variations/pos'] == exp_pos)
Beispiel #16
0
 def test_write_meta_header(self):
     """Every written meta/header line must exist in the source VCF.

     For each fixture VCF, the lines starting with ``#`` are collected,
     the file is loaded into a VariationsArrays, and the output of
     ``_write_vcf_meta`` plus ``_write_vcf_header`` is checked line by
     line against the collected lines.
     """
     fnames = ['format_def_without_info.vcf',
               'format_def_without_filter.vcf',
               'format_without_flt_info_qual.vcf']
     for fname in fnames:
         vcf_fpath = join(TEST_DATA_DIR, fname)
         # First pass: only gather the '#' lines of the original file.
         with open(vcf_fpath, 'rb') as vcf_fhand:
             header_lines = [line for line in vcf_fhand
                             if line.startswith(b'#')]
         # Second pass: parse the file and write meta + header back out.
         with open(vcf_fpath, 'rb') as vcf_fhand:
             vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                             pre_read_max_size=10000)
             var_array = VariationsArrays(ignore_undefined_fields=True)
             var_array.put_vars(vcf)
             with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                 _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                 _write_vcf_header(var_array, tmp_fhand)
                 tmp_fhand.flush()
                 with open(tmp_fhand.name, 'rb') as retmp_fhand:
                     for line in retmp_fhand:
                         assert line in header_lines
Beispiel #17
0
 def test_write_meta_header(self):
     """Every written meta/header line must exist in the source VCF.

     For each fixture VCF, the lines starting with ``#`` are collected,
     the file is loaded into a VariationsArrays with a default
     ``VCFParser``, and the output of ``_write_vcf_meta`` plus
     ``_write_vcf_header`` is checked line by line against them.
     """
     fnames = [
         'format_def_without_info.vcf', 'format_def_without_filter.vcf',
         'format_without_flt_info_qual.vcf'
     ]
     for fname in fnames:
         vcf_fpath = join(TEST_DATA_DIR, fname)
         # First pass: only gather the '#' lines of the original file.
         with open(vcf_fpath, 'rb') as vcf_fhand:
             header_lines = [
                 line for line in vcf_fhand if line.startswith(b'#')
             ]
         # Second pass: parse the file and write meta + header back out.
         with open(vcf_fpath, 'rb') as vcf_fhand:
             vcf = VCFParser(vcf_fhand)
             var_array = VariationsArrays(ignore_undefined_fields=True)
             var_array.put_vars(vcf)
             with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                 _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                 _write_vcf_header(var_array, tmp_fhand)
                 tmp_fhand.flush()
                 with open(tmp_fhand.name, 'rb') as retmp_fhand:
                     for line in retmp_fhand:
                         assert line in header_lines
Beispiel #18
0
    def test_set_to_missing(self):
        """Check copy_setting_gts_to_missing on an HDF5 file and on arrays.

        Part one checks how many genotype values change at a 0.9 rate and
        that no value that was already missing is counted as changed. Part
        two seeds numpy's random generator and compares the result for a
        hand-written genotype array against a fixed expectation.
        """
        orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')

        noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                                 gt_rate_to_missing=0.9)

        orig_gts = orig_vars[GT_FIELD][...]
        noise_gts = noisy_vars[GT_FIELD]
        assert orig_gts.shape == noise_gts.shape
        mask_different_gts = orig_gts != noise_gts
        expected_num_gts_set_to_missing = int(
            round(numpy.sum(orig_gts != MISSING_INT) * 0.9))
        assert expected_num_gts_set_to_missing == mask_different_gts.sum()
        assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)

        # The context manager closes the file even if put_vars raises.
        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            snps = VariationsArrays(ignore_undefined_fields=True)
            snps.put_vars(vcf)

        # Fixed seed: the expected genotypes below depend on it.
        numpy.random.seed(1)
        gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                           [[0, 0], [0, 1], [-1, 0]],
                           [[-1, 2], [2, 1], [-1, 2]],
                           [[0, 0], [-1, 0], [1, 0]],
                           [[0, 1], [-1, 2], [1, 1]]])

        expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                    [[-1, -1], [0, 1], [-1, 0]],
                                    [[-1, 2], [2, 1], [-1, 2]],
                                    [[0, 0], [-1, 0], [-1, -1]],
                                    [[-1, -1], [-1, 2], [-1, -1]]])
        # Replace the genotypes parsed from the VCF with the ones above.
        del snps[GT_FIELD]
        snps[GT_FIELD] = gts
        noisy_vars = copy_setting_gts_to_missing(snps, gt_rate_to_missing=0.5)

        noise_gts = noisy_vars[GT_FIELD]
        assert numpy.all(noise_gts == expected_gts)
Beispiel #19
0
    def test_put_vars_from_csv(self):
        """Load CSV genotypes into HDF5 and into arrays and check both.

        ``iupac_ex3.txt`` is parsed with the IUPAC allele splitter into a
        VariationsH5; ``two_letter_coding_ex3.txt`` is parsed with the
        default splitter into a VariationsArrays. The same assertions are
        applied to both results.
        """
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}

        def check_loaded(snps):
            """Assert that *snps* holds the three expected SNPs."""
            exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
            assert list(snps['/variations/chrom'][:]) == exp
            alleles = list(zip(snps['/variations/ref'],
                               [alts[0] for alts in snps['/variations/alt']]))
            exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
            # Alleles are compared as sets, i.e. regardless of which one
            # ended up as ref and which as alt.
            for als, aexp in zip(alleles, exp):
                assert set(als) == set(aexp)
            assert list(snps['/variations/pos'][:]) == [331954, 681961,
                                                        1511764]
            # Two genotype codings are accepted: exp1 or exp2.
            exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [1, 0]]])
            exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[1, 1], [1, 1], [0, 1]]])
            for gts, exp_gts1, exp_gts2 in zip(snps['/calls/GT'][:],
                                               exp1, exp2):
                for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                    assert set(gt) == set(ex1) or set(gt) == set(ex2)

        # The context manager closes the input even when a check fails.
        with open(join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt'),
                  'rb') as fhand_ex:
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter(),
                               max_field_lens={'alt': 1},
                               max_field_str_lens={'alt': 1, 'chrom': 20,
                                                   'ref': 1})

            with NamedTemporaryFile(suffix='.h5') as fhand:
                # Only the unique name is wanted; the file itself is
                # removed before VariationsH5 opens that path in 'w' mode.
                os.remove(fhand.name)
                h5 = VariationsH5(fhand.name, mode='w', ignore_overflows=True,
                                  ignore_undefined_fields=True)
                h5.put_vars(parser)
                check_loaded(h5)

            if os.path.exists(fhand.name):
                os.remove(fhand.name)

        with open(join(TEST_DATA_DIR, 'csv', 'two_letter_coding_ex3.txt'),
                  'rb') as fhand_ex:
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               max_field_lens={'alt': 1},
                               max_field_str_lens={'alt': 1, 'chrom': 20,
                                                   'ref': 1})

            arrays = VariationsArrays(ignore_overflows=True,
                                      ignore_undefined_fields=True)
            arrays.put_vars(parser)
            check_loaded(arrays)
Beispiel #20
0
    def test_write_vcf(self):
        """Write VCFs from arrays and from HDF5 and report unexpected lines.

        Case one loads ``format_def_exp.vcf`` into a VariationsArrays and
        writes it back; case two loads ``format_def_without_info.vcf`` into
        a VariationsH5 and writes that. Written lines missing from the
        expected file are printed.

        NOTE(review): as before, a mismatching line is only printed and
        does not fail the test. Confirm whether that leniency is intended.
        """
        def print_unexpected_lines(written_fhand, exp_fpath, *prefix):
            """Print each written line that is absent from *exp_fpath*."""
            with open(exp_fpath, 'r') as exp_fhand:
                exp_lines = list(exp_fhand)
            # Seeking flushes the pending writes before the file is
            # re-opened by name for reading.
            written_fhand.seek(0)
            with open(written_fhand.name) as refhand:
                for line in refhand:
                    if line not in exp_lines:
                        print(*prefix, line)

        # With all fields available
        max_field_lens = {'CALLS': {b'GT': 1, b'HQ': 2, b'DP': 1, b'GQ': 1},
                          'FILTER': 1,
                          'INFO': {b'AA': 1, b'AF': 2, b'DP': 1,
                                   b'DB': 1, b'NS': 1, b'H2': 1}, 'alt': 2}
        max_field_str_lens = {'INFO': {b'AA': 1}, 'alt': 5, 'chrom': 2,
                              'ref': 4, 'id': 10, 'FILTER': 0}

        variations = VariationsArrays(ignore_undefined_fields=True)
        # The context manager closes the input even if put_vars raises.
        with open(join(TEST_DATA_DIR, 'format_def_exp.vcf'),
                  'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand, max_field_lens={'alt': 2},
                            pre_read_max_size=10000)
            variations.put_vars(vcf, max_field_lens=max_field_lens,
                                max_field_str_lens=max_field_str_lens)
        with NamedTemporaryFile(mode='wb') as out_fhand:
            write_vcf(variations, out_fhand, vcf_format='VCFv4.0')
            print_unexpected_lines(
                out_fhand, join(TEST_DATA_DIR, 'format_def_exp.vcf'), 'aa')

        # With missing info in variations
        # Only a unique path is needed; VariationsH5 creates the file.
        # NOTE(review): the HDF5 file at out_fpath is never removed.
        tmp_fhand = NamedTemporaryFile()
        out_fpath = tmp_fhand.name
        tmp_fhand.close()

        max_field_lens = {'INFO': {}, 'CALLS': {b'GQ': 1, b'GT': 1, b'HQ': 2,
                                                b'DP': 1},
                          'FILTER': 1, 'alt': 2}
        max_field_str_lens = {'ref': 4, 'INFO': {}, 'id': 10, 'FILTER': 0,
                              'alt': 5, 'chrom': 2}

        with open(join(TEST_DATA_DIR, 'format_def_without_info.vcf'),
                  'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            h5_without_info = VariationsH5(fpath=out_fpath, mode='w',
                                           ignore_undefined_fields=True)
            h5_without_info.put_vars(vcf, max_field_lens=max_field_lens,
                                     max_field_str_lens=max_field_str_lens)
        with NamedTemporaryFile(mode='wb') as out_fhand:
            write_vcf(h5_without_info, out_fhand, vcf_format='VCFv4.0')
            print_unexpected_lines(
                out_fhand,
                join(TEST_DATA_DIR, 'format_def_without_info_exp.vcf'))
Beispiel #21
0
    def test_merge_variations(self):
        """Merge two HDF5 files in both orders and compare with fixtures.

        Every field of the merged result is compared against
        ``expected_merged.h5`` (first order) and ``expected_merged2.h5``
        (inputs swapped). Float fields are compared after ``remove_nans``.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                assert numpy.allclose(remove_nans(expected_h5[field][:]),
                                      remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]

                try:
                    # Check shapes first so a mismatch is reported by field
                    # name rather than by the element-wise comparison.
                    if not expected_h5[field][:].shape == result.shape:
                        raise AssertionError(
                            'comparison failed for field: ' + field)
                    assert numpy.all(expected_h5[field][:] == result)
                except (AssertionError, ValueError, TypeError):
                    # Print what differed, then re-raise the failure.
                    print(field)
                    print(expected_h5[field][:])
                    print(result)
                    raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                assert numpy.all(remove_nans(expected_h5[field][:]) ==
                                 remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]
                assert numpy.all(expected_h5[field][:] == result)
Beispiel #22
0
    def test_put_vars_from_csv(self):
        """Load CSV genotypes into HDF5 and into arrays and check both.

        ``iupac_ex3.txt`` is parsed with the IUPAC allele splitter into a
        VariationsH5; ``two_letter_coding_ex3.txt`` is parsed with the
        default splitter into a VariationsArrays. No explicit field
        lengths are given. The same assertions are applied to both results.
        """
        var_info = {b'1': {'chrom': b'SL2.40ch02', 'pos': 331954},
                    b'2': {'chrom': b'SL2.40ch02', 'pos': 681961},
                    b'3': {'chrom': b'SL2.40ch02', 'pos': 1511764}}

        def check_loaded(snps):
            """Assert that *snps* holds the three expected SNPs."""
            exp = [b'SL2.40ch02', b'SL2.40ch02', b'SL2.40ch02']
            assert list(snps['/variations/chrom'][:]) == exp
            alleles = list(zip(snps['/variations/ref'],
                               [alts[0] for alts in snps['/variations/alt']]))
            exp = [(b'G', b'T'), (b'C', b''), (b'A', b'T')]
            # Alleles are compared as sets, i.e. regardless of which one
            # ended up as ref and which as alt.
            for als, aexp in zip(alleles, exp):
                assert set(als) == set(aexp)
            assert list(snps['/variations/pos'][:]) == [331954, 681961,
                                                        1511764]
            # Two genotype codings are accepted: exp1 or exp2.
            exp1 = numpy.array([[[1, 1], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[0, 0], [0, 0], [1, 0]]])
            exp2 = numpy.array([[[0, 0], [1, 1], [-1, -1]],
                                [[0, 0], [0, 0], [-1, -1]],
                                [[1, 1], [1, 1], [0, 1]]])
            for gts, exp_gts1, exp_gts2 in zip(snps['/calls/GT'][:],
                                               exp1, exp2):
                for gt, ex1, ex2 in zip(gts, exp_gts1, exp_gts2):
                    assert set(gt) == set(ex1) or set(gt) == set(ex2)

        # The context manager closes the input even when a check fails.
        with open(join(TEST_DATA_DIR, 'csv', 'iupac_ex3.txt'),
                  'rb') as fhand_ex:
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t',
                               gt_splitter=create_iupac_allele_splitter())

            with NamedTemporaryFile(suffix='.h5') as fhand:
                # Only the unique name is wanted; the file itself is
                # removed before VariationsH5 opens that path in 'w' mode.
                os.remove(fhand.name)
                h5 = VariationsH5(fhand.name, mode='w',
                                  ignore_undefined_fields=True)
                h5.put_vars(parser)
                check_loaded(h5)

            if os.path.exists(fhand.name):
                os.remove(fhand.name)

        with open(join(TEST_DATA_DIR, 'csv', 'two_letter_coding_ex3.txt'),
                  'rb') as fhand_ex:
            parser = CSVParser(fhand_ex, var_info, first_sample_column=3,
                               first_gt_column=3, sep=b'\t')

            arrays = VariationsArrays(ignore_undefined_fields=True)
            arrays.put_vars(parser)
            check_loaded(arrays)