Esempio n. 1
0
    def test_merge_variations(self):
        """Merge two HDF5 variation files in both orders and check the result.

        For each order the merger's ploidy and sample list are checked, the
        merged variations are loaded into a VariationsArrays and every field
        is compared against the matching expected HDF5 file.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                # NaN != NaN, so NaNs are stripped before comparing.
                assert numpy.all(remove_nans(expected_h5[field][:]) ==
                                 remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]
                try:
                    assert numpy.all(expected_h5[field][:] == result)
                except AssertionError:
                    # Dump the offending arrays to ease debugging, then let
                    # the failure propagate so the test actually fails.
                    print(field)
                    print(expected_h5[field][:])
                    print(result)
                    raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_overflows=True,
                                    ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            if 'float' in str(new_vars[field][:].dtype):
                assert numpy.all(remove_nans(expected_h5[field][:]) ==
                                 remove_nans(new_vars[field][:]))
            else:
                result = new_vars[field][:]
                assert numpy.all(expected_h5[field][:] == result)
Esempio n. 2
0
def _preprocess_format_calls_paths(variations, var_index, calls_paths):
    """Select the call fields that hold data for one variation.

    For every path in calls_paths the values stored at var_index are read
    and passed through remove_nans. A path is kept only when those values
    are not all equal to the MISSING_VALUES entry for their dtype and the
    first dimension is not empty.

    Returns a tuple (kept call paths, their last '/'-separated component).
    """
    kept_paths = []
    for path in calls_paths:
        data = remove_nans(variations[path][var_index])
        only_missing = np.all(data == MISSING_VALUES[data.dtype])
        if only_missing or data.shape[0] == 0:
            continue
        kept_paths.append(path)

    field_names = [path.split('/')[-1] for path in kept_paths]
    return kept_paths, field_names
Esempio n. 3
0
def _get_info_value(variations, index, info_paths, metadata, num_alt):
    info = []
    for key in info_paths:
        dtype = str(variations[key].dtype)
        value = variations[key][index]
        field_key = key.split('/')[-1]
        meta_number = metadata[key]['Number']

        if 'bool' in dtype:
            if value:
                info.append(field_key)
        elif meta_number == 'A':
            try:
                value = value[:num_alt]
            except IndexError:
                if num_alt == 1:
                    value = numpy.array(value)
                else:
                    raise
            if '|S' in dtype:
                value = [val.decode() for val in value]
            else:
                value = remove_nans(value)
                value = [str(val) for val in value]
            value = ','.join(value) if value else None
        elif meta_number == 1:

            if '|S' in dtype:
                value = value.decode()
            else:
                value = str(value)
        elif meta_number > 1:
            value = [str(val) for val in value if not numpy.isnan(val)]
            value = ','.join(value) if value else None
        elif not value:
            value = None
        else:
            raise(NotImplemented)

        if 'bool' not in dtype and value and value is not None:
            info.append('{}={}'.format(field_key, value))

    return ';'.join(info)
Esempio n. 4
0
def _get_info_value(variations, index, info_paths, metadata, num_alt):
    info = []
    for key in info_paths:
        dtype = str(variations[key].dtype)
        value = variations[key][index]
        field_key = key.split('/')[-1]
        meta_number = metadata[key]['Number']

        if 'bool' in dtype:
            if value:
                info.append(field_key)
        elif meta_number == 'A':
            try:
                value = value[:num_alt]
            except IndexError:
                if num_alt == 1:
                    value = np.array(value)
                else:
                    raise
            if '|S' in dtype:
                value = [val.decode() for val in value]
            else:
                value = remove_nans(value)
                value = [str(val) for val in value]
            value = ','.join(value) if value else None
        elif meta_number == 1:

            if '|S' in dtype:
                value = value.decode()
            else:
                value = str(value)
        elif meta_number > 1:
            value = [str(val) for val in value if not np.isnan(val)]
            value = ','.join(value) if value else None
        elif not value:
            value = None
        else:
            raise(NotImplemented)

        if 'bool' not in dtype and value and value is not None:
            info.append('{}={}'.format(field_key, value))

    return ';'.join(info)
Esempio n. 5
0
    def test_merge_variations(self):
        """Merge two HDF5 variation files in both orders and check the result.

        For each order the merger's ploidy and sample list are checked, the
        merged variations are loaded into a VariationsArrays and every field
        is compared against the matching expected HDF5 file.
        """
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_1, h5_2, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'TS-1', b'TS-11', b'TS-21', b'NA00001',
                                  b'NA00002', b'NA00003']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                # NaN != NaN, so NaNs are stripped before comparing.
                assert numpy.allclose(remove_nans(expected),
                                      remove_nans(result))
                continue

            try:
                # A shape mismatch is reported explicitly instead of relying
                # on how the element-wise comparison would react to it.
                if expected.shape != result.shape:
                    raise AssertionError('comparison failed for field: ' + field)
                assert numpy.all(expected == result)
            except (AssertionError, ValueError, TypeError):
                # Dump the offending arrays to ease debugging and re-raise.
                print(field)
                print(expected)
                print(result)
                raise

        # Change the order
        h5_1 = VariationsH5(join(TEST_DATA_DIR, 'csv', 'format.h5'), "r")
        h5_2 = VariationsH5(join(TEST_DATA_DIR, 'format_def.h5'), "r")
        merger = VarMerger(h5_2, h5_1, max_field_lens={'alt': 3},
                           ignore_complex_overlaps=True,
                           check_ref_matches=False)
        assert merger.ploidy == 2
        assert merger.samples == [b'NA00001', b'NA00002', b'NA00003',
                                  b'TS-1', b'TS-11', b'TS-21']
        expected_h5 = VariationsH5(join(TEST_DATA_DIR, 'expected_merged2.h5'),
                                   'r')
        new_vars = VariationsArrays(ignore_undefined_fields=True)
        new_vars.put_vars(merger)

        for field in new_vars.keys():
            result = new_vars[field][:]
            expected = expected_h5[field][:]
            if 'float' in str(result.dtype):
                assert numpy.all(remove_nans(expected) == remove_nans(result))
            else:
                assert numpy.all(expected == result)
Esempio n. 6
0
    def test_vcf_to_hdf5(self):
        """Load two VCF files into HDF5 and check the stored arrays.

        The first file (format_def.vcf) is checked for genotype, call,
        filter and info datasets; the second (phylome.sample.vcf) only for
        a few genotype and GL values. The temporary HDF5 files are removed
        even when an assertion fails.
        """
        # NamedTemporaryFile is only used to obtain a unique path; closing it
        # deletes the file so the HDF5 writer can create it afresh.
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()

        try:
            with open(join(TEST_DATA_DIR, 'format_def.vcf'), 'rb') as fhand:
                vcf_parser = VCFParser(fhand=fhand, max_field_lens={'alt': 5},
                                       n_threads=None, pre_read_max_size=1000)
                h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
                h5.put_vars(vcf_parser)

            h5 = VariationsH5(path, 'r')
            assert h5['/calls/GT'].shape == (5, 3, 2)
            assert numpy.all(h5['/calls/GT'][1] == [[0, 0], [0, 1], [0, 0]])

            expected = numpy.array([[[51, 51], [51, 51], [-1, -1]],
                                    [[58, 50], [65, 3], [-1, -1]],
                                    [[23, 27], [18, 2], [-1, -1]],
                                    [[56, 60], [51, 51], [-1, -1]],
                                    [[-1, -1], [-1, -1], [-1, -1]]],
                                   dtype=numpy.int16)
            assert numpy.all(h5['/calls/HQ'][:] == expected)
            expected = numpy.array([48, 48, 43], dtype=numpy.int16)
            assert numpy.all(h5['/calls/GQ'][0, :] == expected)

            # Variations filters fields
            expected = numpy.array([False, True, False, False, False])
            assert numpy.all(h5['/variations/filter/q10'][:] == expected)
            expected = numpy.array([False, False, False, False, False])
            assert numpy.all(h5['/variations/filter/s50'][:] == expected)
            expected = [True, False, True, True, True]
            assert numpy.all(h5['/variations/filter/PASS'][:] == expected)

            # Variations info fields
            expected = remove_nans(numpy.array([[0.5, numpy.nan],
                                                [0.01699829, numpy.nan],
                                                [0.33300781, 0.66699219],
                                                [numpy.nan, numpy.nan],
                                                [numpy.nan, numpy.nan]],
                                               dtype=numpy.float16))

            af = remove_nans(h5['/variations/info/AF'][:])
            assert numpy.all(af == expected)
            expected = numpy.array([3, 3, 2, 3, 3])
            assert numpy.all(h5['/variations/info/NS'][:] == expected)
            expected = numpy.array([14, 11, 10, 13, 9])
            assert numpy.all(h5['/variations/info/DP'][:] == expected)
            expected = numpy.array([True, False, True, False, False])
            assert numpy.all(h5['/variations/info/DB'][:] == expected)
            expected = numpy.array([True, False, False, False, False])
            assert numpy.all(h5['/variations/info/H2'][:] == expected)
        finally:
            # Clean up even if an assertion above failed.
            if os.path.exists(path):
                os.remove(path)

        # With another file
        tmp_fhand = NamedTemporaryFile()
        path = tmp_fhand.name
        tmp_fhand.close()

        try:
            with open(join(TEST_DATA_DIR, 'phylome.sample.vcf'), 'rb') as fhand:
                vcf_parser = VCFParser(fhand=fhand, pre_read_max_size=1000)
                h5 = VariationsH5(path, mode='w', ignore_undefined_fields=True)
                h5.put_vars(vcf_parser)

            # Read back with plain h5py; the context manager closes the file
            # before it is removed.
            with h5py.File(path, 'r') as h5:
                assert numpy.all(h5['/calls/GT'].shape == (2, 42, 2))
                assert numpy.all(h5['/calls/GT'][1, 12] == [1, 1])
                assert numpy.all(h5['/calls/GL'][0, 0, 0] == 0)
        finally:
            if os.path.exists(path):
                os.remove(path)