Exemple #1
0
    def test_by_chunks(self):
        """Load format_def.vcf with the default chunk size and with 1-var chunks.

        There are no assertions: the test passes when both loads complete
        without raising.
        """
        vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')

        # Default chunk size.
        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays()
            snps.put_vars(vcf_parser)

        # One variation per chunk.
        with open(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, n_threads=None)
            snps = VariationsArrays(vars_in_chunk=1)
            snps.put_vars(vcf_parser)
Exemple #2
0
 def test_vcf_detect_fields(self):
     """Check the metadata produced with kept_fields and with ignored_fields.

     The same VCF is parsed twice, once keeping '/variations/qual' and once
     ignoring it, and the resulting metadata keys are inspected.
     """
     vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
     # The with block closes both handles even when an assertion fails.
     with open(vcf_fpath, 'rb') as vcf_fhand, \
             open(vcf_fpath, 'rb') as vcf_fhand2:
         vcf = VCFParser(vcf_fhand, kept_fields=['/variations/qual'])
         vcf2 = VCFParser(vcf_fhand2, ignored_fields=['/variations/qual'])

         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         metadata = snps.metadata

         snps2 = VariationsArrays(ignore_undefined_fields=True)
         snps2.put_vars(vcf2)
         metadata2 = snps2.metadata

         assert '/calls/HQ' in metadata.keys()
         assert '/variations/qual' not in metadata2.keys()
Exemple #3
0
 def test_delete_item_from_variationArray(self):
     """Deleting a field from a VariationsArrays removes it from its keys."""
     # The with block closes the VCF even when the assertion fails.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)
         del snps['/calls/GT']
         assert '/calls/GT' not in snps.keys()
Exemple #4
0
 def test_parser_vcf_filters(self):
     """Check item 6 of each parsed variation for two fixture VCFs.

     For format_def_without_info.vcf the expected values are lists of
     filter names; for format_def_without_filter.vcf every value is None.
     """
     fpath = join(TEST_DATA_DIR, 'format_def_without_info.vcf')
     with open(fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         filters = [var[6] for var in vcf.variations]
     assert filters == [[], [b'q10'], [], [], []]

     # No filters
     fpath = join(TEST_DATA_DIR, 'format_def_without_filter.vcf')
     with open(fpath, 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         filters = [var[6] for var in vcf.variations]
     assert filters == [None, None, None, None, None]
Exemple #5
0
 def test_put_vars_arrays_from_vcf(self):
     """Load format_def.vcf into VariationsArrays and check GT and GQ calls."""
     # The with block closes the VCF even when an assertion fails.
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         snps = VariationsArrays(ignore_undefined_fields=True)
         snps.put_vars(vcf)

         assert snps['/calls/GT'].shape == (5, 3, 2)
         assert numpy.all(snps['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])
         expected = numpy.array([48, 48, 43], dtype=numpy.int16)
         assert numpy.all(snps['/calls/GQ'][0, :] == expected)
Exemple #6
0
def _create_var_mat_objs_from_vcf(vcf_fpath, kwargs, kept_fields=None,
                                  ignored_fields=None):
    """Yield one variation matrix per class in VAR_MAT_CLASSES, filled from a VCF.

    The VCF is reopened and reparsed for every class. Paths ending in '.gz'
    are opened with gzip, anything else with the builtin open.

    Args:
        vcf_fpath: path to the VCF file.
        kwargs: dict of keyword arguments forwarded to VCFParser.
        kept_fields: accepted but not used by this function.
        ignored_fields: accepted but not used by this function.

    Yields:
        The object returned by _init_var_mat(klass) after put_vars.
    """
    # NOTE(review): kept_fields / ignored_fields are never forwarded to
    # VCFParser here; callers presumably pass them through kwargs -- confirm.
    opener = gzip.open if vcf_fpath.endswith('.gz') else open
    for klass in VAR_MAT_CLASSES:
        # The file is closed before yielding, even if parsing raises.
        with opener(vcf_fpath, 'rb') as fhand:
            vcf_parser = VCFParser(fhand=fhand, **kwargs)
            out_snps = _init_var_mat(klass)
            out_snps.put_vars(vcf_parser)
        yield out_snps
Exemple #7
0
def _parse_vcf(chrom, vcf_fpath, tmp_dir, kept_fields, ignored_fields):
    """Parse the VCF lines of one chromosome into a temporary HDF5 file.

    Args:
        chrom: chromosome name as bytes (it is decoded for the file prefix).
        vcf_fpath: path of the VCF, handed to get_vcf_lines_for_chrom.
        tmp_dir: directory in which the temporary file name is reserved.
        kept_fields: forwarded to both VariationsH5 and VCFParser.
        ignored_fields: forwarded to both VariationsH5 and VCFParser.

    Returns:
        The path of the HDF5 file that was written.
    """
    # Only the unique name is needed: leaving the with block closes the
    # placeholder (which removes it with the default delete=True), and
    # VariationsH5 then creates the real file at that path.
    with NamedTemporaryFile(prefix=chrom.decode() + '.',
                            suffix='.tmp.h5', dir=tmp_dir) as tmp_h5_fhand:
        tmp_h5_fpath = tmp_h5_fhand.name

    tmp_h5 = VariationsH5(tmp_h5_fpath, 'w', ignore_undefined_fields=True,
                          kept_fields=kept_fields,
                          ignored_fields=ignored_fields)
    try:
        vcf_parser = VCFParser(get_vcf_lines_for_chrom(chrom, vcf_fpath),
                               kept_fields=kept_fields,
                               ignored_fields=ignored_fields)
        tmp_h5.put_vars(vcf_parser)
    finally:
        # Close the HDF5 handle even when parsing fails.
        tmp_h5.close()
    return tmp_h5_fpath
Exemple #8
0
 def test_put_vars_hdf5_from_vcf(self):
     """Load format_def.vcf into a VariationsH5 and check alt, GT and GQ."""
     with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
         vcf = VCFParser(vcf_fhand)
         with NamedTemporaryFile(suffix='.hdf5') as fhand:
             # Remove the placeholder so VariationsH5 creates the file itself.
             os.remove(fhand.name)
             h5f = VariationsH5(fhand.name, 'w',
                                ignore_undefined_fields=True)
             try:
                 h5f.put_vars(vcf)
                 expected_alt = [[b'A', b''],
                                 [b'A', b''],
                                 [b'G', b'T'],
                                 [b'', b''],
                                 [b'G', b'GTACT']]
                 assert numpy.all(h5f['/variations/alt'][:] == expected_alt)
                 assert h5f['/calls/GT'].shape == (5, 3, 2)
                 assert numpy.all(h5f['/calls/GT'][1] ==
                                  [[0, 0], [0, 1], [0, 0]])
                 expected = numpy.array([48, 48, 43], dtype=numpy.int16)
                 assert numpy.all(h5f['/calls/GQ'][0, :] == expected)
             finally:
                 # Release the HDF5 handle even when an assertion fails.
                 h5f.close()
Exemple #9
0
 def test_generated_vcf_feed_outputs_equal_vcfs(self):
     """A VCF written from HDF5 must survive a parse-and-rewrite round trip.

     The HDF5 fixture is written to a VCF, that VCF is parsed into a
     VariationsArrays, written out again, and both VCFs are compared line
     by line.
     """
     fail_msg = ("when importing from a generated VCF and exporting to a "
                 "new VCF both files must be the same")
     h5_vars = VariationsH5(
         join(TEST_DATA_DIR, 'tomato.apeki_gbs.calmd.1stchunk.h5'), "r")
     try:
         with NamedTemporaryFile(mode='wb') as vcf_vars_from_h5:
             write_vcf(h5_vars, vcf_vars_from_h5)
             vcf_vars_from_h5.flush()

             with open(vcf_vars_from_h5.name, 'rb') as vcf_fhand:
                 vcf = VCFParser(vcf_fhand)
                 vcf_vars_parsed = VariationsArrays()
                 vcf_vars_parsed.put_vars(vcf)

             with NamedTemporaryFile(mode='wb') as vcf_vars_from_vcf:
                 vcf_vars_parsed.write_vcf(vcf_vars_from_vcf)
                 vcf_vars_from_vcf.flush()

                 with open(vcf_vars_from_h5.name, 'rb') as from_h5, \
                         open(vcf_vars_from_vcf.name, 'rb') as from_vcf:
                     # NOTE(review): zip stops at the shorter file, so
                     # extra trailing lines in one file go unnoticed.
                     for h5_line, vcf_line in zip(from_h5, from_vcf):
                         assert h5_line == vcf_line, fail_msg
     finally:
         # Release the HDF5 fixture even when the comparison fails.
         h5_vars.close()
Exemple #10
0
    def test_count_alleles(self):
        """Check allele_count for every class in VAR_MAT_CLASSES.

        First with genotype chunks read from ril.hdf5 (any non-zero count is
        enough), then with format_def.vcf against exact expected counts.
        """
        for klass in VAR_MAT_CLASSES:
            in_snps = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
            try:
                var_mat = _init_var_mat(klass)
                chunks = in_snps.iterate_chunks(kept_fields=['/calls/GT'])
                var_mat.put_chunks(chunks)
                assert numpy.any(var_mat.allele_count)
            finally:
                # Close the input even when loading or the assertion fails.
                in_snps.close()

        expected = [[3, 3, 0], [5, 1, 0], [0, 2, 4], [6, 0, 0], [2, 3, 1]]
        vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
        for klass in VAR_MAT_CLASSES:
            with open(vcf_fpath, 'rb') as fhand:
                vcf_parser = VCFParser(fhand=fhand)
                var_mat = _init_var_mat(klass)
                var_mat.put_vars(vcf_parser)
                assert numpy.all(var_mat.allele_count == expected)
Exemple #11
0
def main():
    """Command-line entry point: convert a VCF file into an HDF5 file.

    Reads the options through _setup_argparse / _parse_args, parses the
    input VCF (via read_gzip_file when the last dot-separated part of the
    path is 'gz') and stores the variations in a VariationsH5 opened at
    args['out_fpath'].
    """
    description = 'Transforms VCF file into HDF5 format'
    parser = _setup_argparse(description=description)
    args = _parse_args(parser)
    in_fpath = args['in_fpath']

    is_gzipped = in_fpath.split('.')[-1] == 'gz'
    if is_gzipped:
        fhand = read_gzip_file(in_fpath)
    else:
        fhand = open(in_fpath, 'rb')

    try:
        max_field_lens = {
            'CALLS': {
                b'AO': args['alt_gt_num']
            },
            'alt': args['alt_gt_num']
        }
        vcf_parser = VCFParser(fhand=fhand,
                               pre_read_max_size=args['pre_read_max_size'],
                               ignored_fields=args['ignored_fields'],
                               kept_fields=args['kept_fields'],
                               max_field_lens=max_field_lens)
        h5 = VariationsH5(args['out_fpath'], mode='w')
        try:
            h5.put_vars(vcf_parser)
        finally:
            # Close the output file even when the conversion fails.
            h5.close()
    finally:
        # Only close the handle opened here; what read_gzip_file returns is
        # opaque from this function, so it is left untouched.
        if not is_gzipped:
            fhand.close()
Exemple #12
0
    def test_samples(self):
        """The samples attribute can be set and read on both storage classes.

        The second part has no assertion: it passes when reading and
        reassigning samples on a VariationsH5 does not raise.
        """
        # Imported locally because the temp file is now cleaned up here.
        import os

        gts = numpy.array([[[0, 0], [0, 1], [2, 2], [-1, 3]],
                           [[0, 0], [0, 0], [1, 1], [2, 2]],
                           [[-1, -1], [-1, -1], [-1, -1], [-1, -1]]])
        varis = VariationsArrays()
        varis[GT_FIELD] = gts
        varis.samples = [1, 2, 3, 4]
        assert varis.samples == [1, 2, 3, 4]

        # With another file
        # Only the unique name is needed; VariationsH5 creates the file.
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()

        h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
        try:
            vcf_fpath = join(TEST_DATA_DIR, 'phylome.sample.vcf')
            with open(vcf_fpath, 'rb') as fhand:
                vcf_parser = VCFParser(fhand=fhand)
                h5.put_vars(vcf_parser)
            samples = h5.samples
            samples[0] = '0'
            h5.samples = samples
        finally:
            # Release the HDF5 handle and do not leave the temp file behind.
            h5.close()
            if os.path.exists(path):
                os.remove(path)
Exemple #13
0
 def test_write_meta_header(self):
     """Every meta/header line written must exist in the source VCF header.

     For each fixture the '#' lines are collected, the VCF is loaded into a
     VariationsArrays, and the output of _write_vcf_meta plus
     _write_vcf_header is checked line by line against them.
     """
     fnames = [
         'format_def_without_info.vcf', 'format_def_without_filter.vcf',
         'format_without_flt_info_qual.vcf'
     ]
     for fname in fnames:
         vcf_fpath = join(TEST_DATA_DIR, fname)
         with open(vcf_fpath, 'rb') as vcf_fhand:
             header_lines = [
                 line for line in vcf_fhand if line.startswith(b'#')
             ]
         with open(vcf_fpath, 'rb') as vcf_fhand:
             vcf = VCFParser(vcf_fhand)
             var_array = VariationsArrays(ignore_undefined_fields=True)
             var_array.put_vars(vcf)
             with NamedTemporaryFile(suffix='.h5') as tmp_fhand:
                 _write_vcf_meta(var_array, tmp_fhand, vcf_format='VCFv4.0')
                 _write_vcf_header(var_array, tmp_fhand)
                 # Flush so the lines are visible through the second handle.
                 tmp_fhand.flush()
                 with open(tmp_fhand.name, 'rb') as retmp_fhand:
                     for line in retmp_fhand:
                         assert line in header_lines
Exemple #14
0
    def test_set_to_missing(self):
        """Check copy_setting_gts_to_missing on an HDF5 file and on arrays.

        First part: with rate 0.9 on ril.hdf5, the number of changed
        genotypes must match the rounded expectation, and none of the
        changed positions may have been missing originally.
        Second part: with a fixed numpy seed and rate 0.5, the result must
        equal a known genotype array.
        """
        orig_vars = VariationsH5(join(TEST_DATA_DIR, 'ril.hdf5'), mode='r')
        try:
            noisy_vars = copy_setting_gts_to_missing(orig_vars,
                                                     gt_rate_to_missing=0.9)

            orig_gts = orig_vars[GT_FIELD][...]
            noise_gts = noisy_vars[GT_FIELD]
            assert orig_gts.shape == noise_gts.shape
            mask_different_gts = orig_gts != noise_gts
            expected_num_gts_set_to_missing = int(
                round(numpy.sum(orig_gts != MISSING_INT) * 0.9))
            assert expected_num_gts_set_to_missing == mask_different_gts.sum()
            assert not numpy.sum(orig_gts[mask_different_gts] == MISSING_INT)
        finally:
            # Release the HDF5 fixture even when an assertion fails.
            orig_vars.close()

        with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as vcf_fhand:
            vcf = VCFParser(vcf_fhand)
            snps = VariationsArrays(ignore_undefined_fields=True)
            snps.put_vars(vcf)

        # Fixed seed so the expected genotypes below are reproducible.
        numpy.random.seed(1)
        gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                           [[0, 0], [0, 1], [-1, 0]],
                           [[-1, 2], [2, 1], [-1, 2]],
                           [[0, 0], [-1, 0], [1, 0]],
                           [[0, 1], [-1, 2], [1, 1]]])

        expected_gts = numpy.array([[[0, 1], [1, 0], [-1, 1]],
                                    [[-1, -1], [0, 1], [-1, 0]],
                                    [[-1, 2], [2, 1], [-1, 2]],
                                    [[0, 0], [-1, 0], [-1, -1]],
                                    [[-1, -1], [-1, 2], [-1, -1]]])
        # Replace the genotypes parsed from the VCF with the fixture above.
        del snps[GT_FIELD]
        snps[GT_FIELD] = gts
        noisy_vars = copy_setting_gts_to_missing(snps, gt_rate_to_missing=0.5)

        noise_gts = noisy_vars[GT_FIELD]
        assert numpy.all(noise_gts == expected_gts)
Exemple #15
0
    def test_vcf_to_hdf5(self):
        """Convert two fixture VCFs to HDF5 and check the stored datasets.

        format_def.vcf is written with vars_in_chunk=2 and read back through
        VariationsH5; phylome.sample.vcf is written with the defaults and
        read back directly with h5py.
        """
        # Only the unique name is needed; VariationsH5 creates the file.
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()

        try:
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True,
                              vars_in_chunk=2)
            try:
                vcf_fpath = join(TEST_DATA_DIR, 'format_def.vcf')
                with open(vcf_fpath, 'rb') as fhand:
                    vcf_parser = VCFParser(fhand=fhand, n_threads=None)
                    h5.put_vars(vcf_parser)
            finally:
                # Close the writer before the file is reopened for reading.
                h5.close()

            h5 = VariationsH5(path, 'r')
            try:
                assert h5['/calls/GT'].shape == (5, 3, 2)
                assert numpy.all(h5['/calls/GT'][1] ==
                                 [[0, 0], [0, 1], [0, 0]])

                expected = numpy.array([[[51, 51], [51, 51], [-1, -1]],
                                        [[58, 50], [65, 3], [-1, -1]],
                                        [[23, 27], [18, 2], [-1, -1]],
                                        [[56, 60], [51, 51], [-1, -1]],
                                        [[-1, -1], [-1, -1], [-1, -1]]],
                                       dtype=numpy.int16)
                assert numpy.all(h5['/calls/HQ'][:] == expected)
                expected = numpy.array([48, 48, 43], dtype=numpy.int16)
                assert numpy.all(h5['/calls/GQ'][0, :] == expected)

                # Variations filters fields
                expected = numpy.array([1, 0, 1, 1, 1])
                assert numpy.all(h5['/variations/filter/q10'][:] == expected)
                expected = numpy.array([1, 1, 1, 1, 1])
                assert numpy.all(h5['/variations/filter/s50'][:] == expected)

                # Variations info fields
                expected = numpy.array([[0.5, numpy.nan],
                                        [0.01699829, numpy.nan],
                                        [0.33300781, 0.66699219],
                                        [numpy.nan, numpy.nan],
                                        [numpy.nan, numpy.nan]])
                af = h5['/variations/info/AF'][:]
                assert numpy.allclose(af, expected, equal_nan=True,
                                      atol=0.01)
                expected = numpy.array([3, 3, 2, 3, 3])
                assert numpy.all(h5['/variations/info/NS'][:] == expected)
                expected = numpy.array([14, 11, 10, 13, 9])
                assert numpy.all(h5['/variations/info/DP'][:] == expected)
                expected = numpy.array([True, False, True, False, False])
                assert numpy.all(h5['/variations/info/DB'][:] == expected)
                expected = numpy.array([True, False, False, False, False])
                assert numpy.all(h5['/variations/info/H2'][:] == expected)
            finally:
                h5.close()
        finally:
            # Remove the temp file even when an assertion fails.
            if os.path.exists(path):
                os.remove(path)

        # With another file
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()

        try:
            h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
            try:
                vcf_fpath = join(TEST_DATA_DIR, 'phylome.sample.vcf')
                with open(vcf_fpath, 'rb') as fhand:
                    vcf_parser = VCFParser(fhand=fhand)
                    h5.put_vars(vcf_parser)
            finally:
                h5.close()

            with h5py.File(path, 'r') as h5:
                assert h5['/calls/GT'].shape == (2, 42, 2)
                assert numpy.all(h5['/calls/GT'][1, 12] == [1, 1])
                assert numpy.all(h5['/calls/GL'][0, 0, 0] == 0)
        finally:
            if os.path.exists(path):
                os.remove(path)