Ejemplo n.º 1
0
    def test_filter_alleles_hts(self):
        # Compares hl.filter_alleles_hts against reference VCFs, once per
        # (allele predicate, subset flag) combination listed below.
        # 1 variant: A:T,G
        ds = hl.import_vcf(resource('filter_alleles/input.vcf'))

        # (predicate selecting the alleles to keep, subset flag, expected VCF)
        cases = [
            (lambda a, i: a == 'T', True, 'filter_alleles/keep_allele1_subset.vcf'),
            (lambda a, i: a == 'G', True, 'filter_alleles/keep_allele2_subset.vcf'),
            (lambda a, i: a != 'G', False, 'filter_alleles/keep_allele1_downcode.vcf'),
            (lambda a, i: a == 'G', False, 'filter_alleles/keep_allele2_downcode.vcf'),
        ]

        for keep, subset, expected_path in cases:
            # The bookkeeping fields are not present in the reference VCFs, so
            # drop them before comparing.
            filtered = (hl.filter_alleles_hts(ds, keep, subset=subset)
                        .drop('old_alleles', 'old_locus', 'new_to_old', 'old_to_new'))
            expected = hl.import_vcf(resource(expected_path))
            self.assertTrue(filtered._same(expected),
                            'mismatch against {} (subset={})'.format(expected_path, subset))
Ejemplo n.º 2
0
    def test_import_vcf_skip_invalid_loci(self):
        # With skip_invalid_loci=True and GRCh37 the import must succeed and
        # yield exactly 3 rows.
        mt = hl.import_vcf(resource('skip_invalid_loci.vcf'), reference_genome='GRCh37',
                           skip_invalid_loci=True)
        # assertEqual (instead of assertTrue(a == b)) reports the actual count
        # when the check fails.
        self.assertEqual(mt._force_count_rows(), 3)

        # Without the flag, the same file must fail with an 'Invalid locus' error.
        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            hl.import_vcf(resource('skip_invalid_loci.vcf')).count()
Ejemplo n.º 3
0
    def test_union_cols_example(self):
        # union_cols of joinleft.vcf and joinright.vcf must equal joined.vcf.
        left_part = hl.import_vcf(resource('joinleft.vcf'))
        right_part = hl.import_vcf(resource('joinright.vcf'))
        expected = hl.import_vcf(resource('joined.vcf'))

        combined = left_part.union_cols(right_part)
        self.assertTrue(combined._same(expected))
Ejemplo n.º 4
0
 def test_not_identical_headers(self):
     # Importing two VCFs whose sample columns differ must raise a FatalError
     # mentioning 'invalid sample IDs'.
     subset_vcf = new_temp_file('vcf')
     full_vcf = resource('sample.vcf')

     # Export a copy of sample.vcf with two samples filtered out.
     mt = hl.import_vcf(full_vcf)
     keep_col = (mt.s != "C1048::HG02024") & (mt.s != "HG00255")
     hl.export_vcf(mt.filter_cols(keep_col), subset_vcf)

     with self.assertRaisesRegex(FatalError, 'invalid sample IDs'):
         mismatched = hl.import_vcf([resource('sample.vcf'), subset_vcf])
         mismatched._force_count_rows()
Ejemplo n.º 5
0
    def test_export_vcf(self):
        # Round-trips a dataset (and a zero-sample version of it) through
        # export_vcf / import_vcf and checks data and header metadata survive.
        source_vcf = resource('sample.vcf.bgz')
        dataset = hl.import_vcf(source_vcf)
        vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))

        exported_path = '/tmp/sample.vcf'
        hl.export_vcf(dataset, exported_path, metadata=vcf_metadata)
        self.assertTrue(dataset._same(hl.import_vcf(exported_path)))

        # Same round trip with every column filtered out and no entry fields.
        no_sample_path = '/tmp/no_sample.vcf'
        no_sample_dataset = dataset.filter_cols(False).select_entries()
        hl.export_vcf(no_sample_dataset, no_sample_path, metadata=vcf_metadata)
        self.assertTrue(no_sample_dataset._same(hl.import_vcf(no_sample_path)))

        # The metadata read back from the exported file must match the input's.
        self.assertDictEqual(vcf_metadata, hl.get_vcf_metadata(exported_path))
Ejemplo n.º 6
0
    def test_tdt(self):
        # Compares hl.transmission_disequilibrium_test output on tdt.vcf with the
        # expected values stored in tdt_results.tsv.
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        # Re-key the truth table by (locus, alleles) so it joins with the result.
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        # Count each table once; the values are reused in the failure message.
        n_result = tdt_tab.count()
        n_truth = truth.count()
        if n_result != n_truth:
            self.fail('Result has {} rows but should have {} rows'.format(n_result, n_truth))

        # Rows with a NaN p-value are excluded on both sides before joining.
        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        # Keep only rows where counts differ or statistics are off by >= 0.001.
        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            # Neither joined table defines a field named `v`; order by the key
            # fields so the diagnostic output is actually shown before failing.
            bad.order_by('locus', 'alleles').show()
            self.fail('Found rows in violation of the predicate (see show output)')
Ejemplo n.º 7
0
    def test_de_novo(self):
        # Compares hl.de_novo calls on denovo.vcf with the expected calls listed
        # in denovo.out (confidence must match, p_de_novo within 1e-4).
        key_fields = ('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        mt = hl.import_vcf(resource('denovo.vcf'))
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
        ped = hl.Pedigree.read(resource('denovo.fam'))

        called = hl.de_novo(mt, ped, mt.info.ESP)
        called = called.select(
            prior=called.prior,
            kid_id=called.proband.s,
            dad_id=called.father.s,
            mom_id=called.mother.s,
            p_de_novo=called.p_de_novo,
            confidence=called.confidence)
        called = called.key_by(*key_fields)

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            # Only the part of Validation_Likelihood before the first '_' is compared.
            confidence=truth['Validation_Likelihood'].split('_')[0])
        truth = truth.key_by(*key_fields)

        # Fields from `truth` that collide with `called` get a `_1` suffix.
        joined = called.join(truth, how='outer')
        same_confidence = joined.confidence == joined.confidence_1
        close_p = hl.abs(joined.p_de_novo - joined.p_de_novo_1) < 1e-4
        self.assertTrue(joined.all(same_confidence & close_p))
Ejemplo n.º 8
0
 def test_import_vcf_flags_are_defined(self):
     # issue 3277
     # Every row of sample.vcf must have a defined value for these four info fields.
     rows = hl.import_vcf(resource('sample.vcf')).rows()
     flags_defined = (hl.is_defined(rows.info.NEGATIVE_TRAIN_SITE) &
                      hl.is_defined(rows.info.POSITIVE_TRAIN_SITE) &
                      hl.is_defined(rows.info.DB) &
                      hl.is_defined(rows.info.DS))
     self.assertTrue(rows.all(flags_defined))
Ejemplo n.º 9
0
    def test_filter_intervals_compound_key(self):
        # filter_intervals with an interval whose endpoints are (locus, alleles)
        # structs must keep 3 rows of sample.vcf.
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
        ds = ds.annotate_rows(variant=hl.struct(locus=ds.locus, alleles=ds.alleles))
        ds = ds.key_rows_by('locus', 'alleles')

        lower = hl.Struct(locus=hl.Locus('20', 10639222), alleles=['A', 'T'])
        upper = hl.Struct(locus=hl.Locus('20', 10644700), alleles=['A', 'T'])
        intervals = [hl.Interval(lower, upper)]

        kept = hl.filter_intervals(ds, intervals)
        self.assertEqual(kept.count_rows(), 3)
Ejemplo n.º 10
0
 def test_fix3307_read_mt_wrong(self):
     # Writes a split matrix table and checks that reading it back, as a matrix
     # table and via its rows table on disk, gives the same data.
     mt_path = '/tmp/foo.mt'
     mt = hl.split_multi_hts(hl.import_vcf(resource('sample2.vcf')))
     mt.write(mt_path, overwrite=True)

     reread = hl.read_matrix_table(mt_path)
     rows_on_disk = hl.read_table(mt_path + '/rows')

     for candidate in (mt, reread):
         self.assertTrue(candidate.rows()._same(rows_on_disk))
     self.assertTrue(mt._same(reread))
Ejemplo n.º 11
0
    def test_import_vcf(self):
        # Imports sample2.vcf against GRCh38 while recoding contig "22" to
        # "chr22", then checks the contig names and the locus type.
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        vcf_table = vcf.rows()
        self.assertTrue(vcf_table.all(vcf_table.locus.contig == "chr22"))
        # The original `assertTrue(dtype, expected)` treated the second argument
        # as a failure message and could never fail. Compare for real, against
        # the reference genome the import above actually requested.
        self.assertEqual(vcf.locus.dtype, hl.tlocus('GRCh38'))
Ejemplo n.º 12
0
def get_1kg(output_dir, overwrite: bool = False):
    """Fetch a subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset together with its sample annotations.

    The VCF is downloaded, imported and written as ``1kg.mt`` under
    `output_dir`; the annotations are copied to ``1kg_annotations.txt`` in the
    same directory. Nothing is downloaded when both already exist and
    `overwrite` is false.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    annotations_path = os.path.join(output_dir, '1kg_annotations.txt')

    # Nothing to do when both outputs are present and no overwrite was requested.
    if (not overwrite
            and Env.jutils().dirExists(jhc, matrix_table_path)
            and Env.jutils().fileExists(jhc, annotations_path)):
        info('1KG files found')
        return

    init_temp_dir()

    # Download the VCF locally, copy it where the cluster can read it, import it.
    local_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
    vcf_source = resources['1kg_matrix_table']
    info(f'downloading 1KG VCF ...\n  Source: {vcf_source}')
    urlretrieve(vcf_source, local_vcf)
    cluster_readable_vcf = Env.jutils().copyToTmp(jhc, local_path_uri(local_vcf), 'vcf')
    info('importing VCF and writing to matrix table...')
    imported = hl.import_vcf(cluster_readable_vcf, min_partitions=16)
    imported.write(matrix_table_path, overwrite=True)

    # Download the sample annotations and copy them next to the matrix table.
    local_annotations = os.path.join(tmp_dir, '1kg_annotations.txt')
    annotations_source = resources['1kg_annotations']
    info(f'downloading 1KG annotations ...\n  Source: {annotations_source}')
    urlretrieve(annotations_source, local_annotations)
    hl.hadoop_copy(local_path_uri(local_annotations), annotations_path)
    info('Done!')
Ejemplo n.º 13
0
 def test_filter_alleles(self):
     # poor man's Gen
     # For each input: keeping no allele leaves 0 rows, keeping every allele
     # leaves the row count unchanged.
     paths = [resource(name) for name in ('sample.vcf',
                                          'multipleChromosomes.vcf',
                                          'sample2.vcf')]
     for path in paths:
         ds = hl.import_vcf(path)

         none_kept = hl.filter_alleles(ds, lambda a, i: False)
         self.assertEqual(none_kept.count_rows(), 0)

         all_kept = hl.filter_alleles(ds, lambda a, i: True)
         self.assertEqual(all_kept.count_rows(), ds.count_rows())
Ejemplo n.º 14
0
    def test_undeclared_info(self):
        # After importing undeclaredinfo.vcf, `info` must be defined on every
        # row and its type must contain 'InbreedingCoeff' but neither
        # 'undeclared' nor 'undeclaredFlag'.
        mt = hl.import_vcf(resource('undeclaredinfo.vcf'))

        rows = mt.rows()
        self.assertTrue(rows.all(hl.is_defined(rows.info)))

        info_type = mt.row.dtype['info']
        # assertIn / assertNotIn name the offending field and container on
        # failure, unlike assertTrue(x in y).
        self.assertIn('InbreedingCoeff', info_type)
        self.assertNotIn('undeclared', info_type)
        self.assertNotIn('undeclaredFlag', info_type)
Ejemplo n.º 15
0
    def test_reference_genome_liftover(self):
        # Registers chr20 chain files in both directions between GRCh37 and
        # GRCh38, checks hl.liftover on loci and locus intervals, and removes
        # the liftovers again.
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        # Precondition: no liftover is registered in either direction.
        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))
        try:
            grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

            # Lifting every locus of sample.vcf to GRCh38 and back must be the identity.
            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            self.assertTrue(t.all(t.locus == t.liftover))

            null_locus = hl.null(hl.tlocus('GRCh38'))

            # GRCh37 loci paired with the expected GRCh38 locus (missing when
            # the liftover is expected to yield no result).
            rows = [
                {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
                {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
                {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
                {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
                {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
                {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
                {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
            ]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            t = t.filter(hl.is_defined(t.l38))
            self.assertTrue(t.count() == 6)

            # Re-keying by the lifted locus must work and be reflected in the key.
            t = t.key_by('l38')
            t.count()
            self.assertTrue(list(t.key) == ['l38'])

            # Same check for locus intervals.
            null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [
                {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
                {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                 'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
            ]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Always unregister whatever was registered, even when an assertion
            # above failed; otherwise the precondition at the top of this test
            # fails on the next run in the same session.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
Ejemplo n.º 16
0
 def test_hw_func_and_agg_agree(self):
     # hl.agg.hardy_weinberg_test on GT must equal hl.hardy_weinberg_test
     # called with genotype counts derived from call_stats.
     mt = hl.import_vcf(resource('sample.vcf'))
     mt = mt.annotate_rows(
         stats=hl.agg.call_stats(mt.GT, mt.alleles),
         hw=hl.agg.hardy_weinberg_test(mt.GT))

     hom_counts = mt.stats.homozygote_count
     # Second count argument: AC[1] minus two per homozygote_count[1] sample.
     middle_count = mt.stats.AC[1] - 2 * hom_counts[1]
     mt = mt.annotate_rows(
         hw2=hl.hardy_weinberg_test(hom_counts[0], middle_count, hom_counts[1]))

     row_table = mt.rows()
     self.assertTrue(row_table.all(row_table.hw == row_table.hw2))
Ejemplo n.º 17
0
 def test_import_vcf_missing_info_field_elements(self):
     # With array_elements_required=False, the info arrays FOO and BAR of
     # missingInfoArray.vcf must import with None elements as listed below.
     mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37',
                        array_elements_required=False)
     mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)

     expected_rows = [
         {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'],
          'FOO': [1, None], 'BAR': [2, None, None]},
         {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'],
          'FOO': [None, 2, None], 'BAR': [None, 1.0, None]},
     ]
     row_schema = hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr),
                             FOO=hl.tarray(hl.tint), BAR=hl.tarray(hl.tfloat64))
     expected = hl.Table.parallelize(expected_rows, row_schema,
                                     key=['locus', 'alleles'])

     self.assertTrue(mt.rows()._same(expected))
Ejemplo n.º 18
0
Archivo: utils.py Proyecto: jigold/hail
def download_data():
    """Ensure the benchmark VCF and matrix table exist locally, then load them.

    The data directory is taken from the ``HAIL_BENCHMARK_DIR`` environment
    variable (default ``/tmp/hail_benchmark_data``) and created if needed.
    When either ``profile.vcf.bgz`` or ``profile.mt`` is missing from it, the
    VCF is downloaded and re-imported into ``profile.mt``.

    Sets the module globals ``_data_dir``, ``_initialized`` and ``_mt``.
    """
    global _initialized, _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
    mt_path = os.path.join(_data_dir, 'profile.mt')
    if not (os.path.exists(vcf) and os.path.exists(mt_path)):
        print('files not found - downloading...', end='', flush=True)
        # `vcf` already includes _data_dir; joining it with _data_dir again
        # duplicated the prefix whenever the directory was a relative path.
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
                    vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        # overwrite=True: this branch also runs when profile.mt exists but the
        # VCF is missing, and writing onto an existing path would fail otherwise.
        hl.import_vcf(vcf).write(mt_path, overwrite=True)
        print('done', flush=True)
    else:
        print('all files found.', flush=True)

    _initialized = True
    # NOTE(review): `resource` is defined outside this block; presumably it
    # resolves names relative to _data_dir - confirm.
    _mt = hl.read_matrix_table(resource('profile.mt'))
Ejemplo n.º 19
0
    def test_trio_matrix_null_keys(self):
        # When every column key is missing, trio_matrix with complete_trios=True
        # must produce zero columns.
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        fam_table = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=fam_table[mt.s].fam_id)

        # Make keys all null
        missing_key = hl.null(hl.tstr)
        mt = mt.key_cols_by(s=missing_key)

        trios = hl.trio_matrix(mt, ped, complete_trios=True)
        self.assertEqual(trios.count_cols(), 0)
Ejemplo n.º 20
0
    def test_joins(self):
        # Exercises table/matrix-table index joins: a chain of keyed lookups,
        # row joins between a matrix table and its rows table, a column join,
        # and a globals join.
        def unkeyed_single_row():
            # One-row table with its key removed and the 'idx' field dropped.
            return hl.utils.range_table(1).key_by().drop('idx')

        kt = unkeyed_single_row().annotate(a='foo')
        kt1 = unkeyed_single_row().annotate(a='foo', b='bar').key_by('a')
        kt2 = unkeyed_single_row().annotate(b='bar', c='baz').key_by('b')
        kt3 = unkeyed_single_row().annotate(c='baz', d='qux').key_by('c')
        kt4 = unkeyed_single_row().annotate(d='qux', e='quam').key_by('d')

        def chained_e():
            # Look up a -> b -> c -> d -> e through kt1..kt4, starting at kt.a.
            # Built anew for each use.
            return kt4[kt3[kt2[kt1[kt.a].b].c].d].e

        annotated = kt.annotate(e=chained_e())
        self.assertTrue(annotated.aggregate(agg.collect(annotated.e)) == ['quam'])

        selected = kt.select(e=chained_e())
        self.assertTrue(selected.aggregate(agg.collect(selected.e)) == ['quam'])

        self.assertEqual(kt.filter(chained_e() == 'quam').count(), 1)

        # Row joins: rows table <- matrix table, matrix table <- rows table,
        # and a matrix table joined with itself.
        mt = hl.import_vcf(resource('sample.vcf'))
        row_quals = mt.rows()
        row_quals = row_quals.select(row_quals.qual)
        row_quals = row_quals.annotate(qual2=mt.index_rows(row_quals.key).qual)
        self.assertTrue(row_quals.filter(row_quals.qual != row_quals.qual2).count() == 0)

        from_table = mt.annotate_rows(qual2=row_quals.index(mt.row_key).qual)
        self.assertTrue(from_table.filter_rows(from_table.qual != from_table.qual2).count_rows() == 0)

        from_self = mt.annotate_rows(qual2=mt.index_rows(mt.row_key).qual)
        self.assertTrue(from_self.filter_rows(from_self.qual != from_self.qual2).count_rows() == 0)

        # Column join on a non-unique derived field (first five characters of s).
        prefix_table = hl.utils.range_table(1).annotate(key='C1589').key_by('key')
        with_prefix = mt.annotate_cols(foo=mt.s[:5])
        with_prefix = with_prefix.annotate_cols(idx=prefix_table[with_prefix.foo].idx)
        n_C1589 = mt.filter_cols(mt.s[:5] == 'C1589').count_cols()
        self.assertTrue(n_C1589 > 1)
        self.assertEqual(with_prefix.filter_cols(hl.is_defined(with_prefix.idx)).count_cols(), n_C1589)

        # Globals join: copy a global field from one table into another.
        globals_source = hl.utils.range_table(1)
        globals_source = globals_source.annotate_globals(foo=5)
        self.assertEqual(hl.eval(globals_source.foo), 5)

        globals_target = hl.utils.range_table(1)

        globals_target = globals_target.annotate_globals(kt_foo=globals_source.index_globals().foo)
        self.assertEqual(hl.eval(globals_target.globals.kt_foo), 5)
Ejemplo n.º 21
0
    def test_call_fields(self):
        # Importing generic.vcf with call_fields=['GT', 'GTA', 'GTZ'] must give
        # the call values listed below.
        def entry(position, sample, gt, gta, gtz):
            # One expected entry row; field order matches the original structs.
            return hl.struct(locus=hl.locus("X", position), s=sample,
                             GT=gt, GTA=gta, GTZ=gtz)

        expected_rows = [
            entry(16050036, "C1046::HG02024",
                  hl.call(0, 0), hl.null(hl.tcall), hl.call(0, 1)),
            entry(16050036, "C1046::HG02025",
                  hl.call(1), hl.null(hl.tcall), hl.call(0)),
            entry(16061250, "C1046::HG02024",
                  hl.call(2, 2), hl.call(2, 1), hl.call(1, 1)),
            entry(16061250, "C1046::HG02025",
                  hl.call(2), hl.null(hl.tcall), hl.call(1)),
        ]
        expected = hl.Table.parallelize(expected_rows, key=['locus', 's'])

        mt = hl.import_vcf(resource('generic.vcf'), call_fields=['GT', 'GTA', 'GTZ'])
        entries = (mt.entries()
                   .key_by('locus', 's')
                   .select('GT', 'GTA', 'GTZ'))
        self.assertTrue(entries._same(expected))
Ejemplo n.º 22
0
    def test_import_plink_contig_recoding_w_reference(self):
        # Exports a GRCh38 dataset (contig "chr22") to PLINK, re-imports it
        # against GRCh37 with "chr22" recoded back to "22", and checks contig
        # names, row count and locus type.
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        # The original `assertTrue(dtype, expected)` treated the second argument
        # as a failure message and could never fail; compare the types for real.
        self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
Ejemplo n.º 23
0
    def test_import_vcf_missing_format_field_elements(self):
        # With array_elements_required=False, the AD and PL entry arrays of
        # missingFormatArray.vcf must import with None elements as listed below.
        mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37',
                           array_elements_required=False)
        mt = mt.select_rows().select_entries('AD', 'PL')

        expected_rows = [
            {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 's': 'C1046::HG02024',
             'AD': [None, None], 'PL': [0, None, 180]},
            {'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 's': 'C1046::HG02025',
             'AD': [None, 6], 'PL': [70, None]},
            {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02024',
             'AD': [0, 0, None], 'PL': [396, None, None, 33, None, 0]},
            {'locus': hl.Locus('X', 16061250), 'alleles': ['T', 'A', 'C'], 's': 'C1046::HG02025',
             'AD': [0, 0, 9], 'PL': [None, None, None]},
        ]
        entry_schema = hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), s=hl.tstr,
                                  AD=hl.tarray(hl.tint), PL=hl.tarray(hl.tint))
        expected = hl.Table.parallelize(expected_rows, entry_schema,
                                        key=['locus', 'alleles', 's'])

        self.assertTrue(mt.entries()._same(expected))
Ejemplo n.º 24
0
    def test_unions(self):
        dataset = hl.import_vcf(resource('sample2.vcf'))

        # test union_rows
        # Split the rows by position parity, then union them back two ways.
        odd_rows = dataset.filter_rows(dataset.locus.position % 2 == 1)
        even_rows = dataset.filter_rows(dataset.locus.position % 2 == 0)

        via_method = odd_rows.union_rows(even_rows)
        via_class = hl.MatrixTable.union_rows(odd_rows, even_rows)

        self.assertTrue(via_method._same(via_class))

        # test union_cols
        # After unioning the dataset with itself twice, each sample ID occurs 3 times.
        tripled = dataset.union_cols(dataset).union_cols(dataset)
        occurrences = tripled.aggregate_cols(agg.counter(tripled.s))
        for count in occurrences.values():
            self.assertEqual(count, 3)
Ejemplo n.º 25
0
    def test_haploid(self):
        # Importing haploid.vcf must give the GT / AD / GQ entries listed below,
        # including the single-allele calls.
        def entry(position, sample, gt, ad, gq):
            # One expected entry row; field order matches the original structs.
            return hl.struct(locus=hl.locus("X", position), s=sample,
                             GT=gt, AD=ad, GQ=gq)

        expected_rows = [
            entry(16050036, "C1046::HG02024", hl.call(0, 0), [10, 0], 44),
            entry(16050036, "C1046::HG02025", hl.call(1), [0, 6], 70),
            entry(16061250, "C1046::HG02024", hl.call(2, 2), [0, 0, 11], 33),
            entry(16061250, "C1046::HG02025", hl.call(2), [0, 0, 9], 24),
        ]
        expected = hl.Table.parallelize(expected_rows, key=['locus', 's'])

        mt = hl.import_vcf(resource('haploid.vcf'))
        entries = (mt.entries()
                   .key_by('locus', 's')
                   .select('GT', 'AD', 'GQ'))
        self.assertTrue(entries._same(expected))
Ejemplo n.º 26
0
    def test_export_plink(self):
        # Exports the same dataset through hl.export_plink and through the
        # external `plink` binary (from an exported VCF), then merges the two
        # file sets with plink's merge mode 6 and requires an empty diff.
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        # permute columns so not in alphabetical order!
        import random
        indices = list(range(mt.count_cols()))
        random.shuffle(indices)
        mt = mt.choose_cols(indices)

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command(["plink", "--vcf", split_vcf_file,
                     "--make-bed", "--out", plink_output,
                     "--const-fid", "--keep-allele-order"])

        # Rewrite column 2 of plink's .bim as columns 1, 4, 6 and 5 joined by ':'.
        bim_path = plink_output + ".bim"
        rewritten = []
        with open(uri_path(bim_path)) as bim_in:
            for line in bim_in:
                fields = line.strip().split()
                fields[1] = ":".join([fields[0], fields[3], fields[5], fields[4]])
                rewritten.append("\t".join(fields) + "\n")

        with open(bim_path, 'w') as bim_out:
            bim_out.writelines(rewritten)

        run_command(["plink", "--bfile", plink_output,
                     "--bmerge", hl_output, "--merge-mode",
                     "6", "--out", merge_output])

        # The diff must contain nothing but header lines.
        header = ["SNP", "FID", "IID", "NEW", "OLD"]
        with open(merge_output + ".diff") as diff_file:
            same = all(line.strip().split() == header for line in diff_file)

        self.assertTrue(same)
Ejemplo n.º 27
0
    def test_matrix_filter_intervals(self):
        # hl.filter_intervals must accept a Python list of interval expressions,
        # an hl.array of them, a mix of evaluated and unevaluated intervals, and
        # a list of evaluated intervals.
        ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)

        def rows_kept(intervals):
            # Number of rows of `ds` left after filtering to `intervals`.
            return hl.filter_intervals(ds, intervals).count_rows()

        self.assertEqual(
            rows_kept([hl.parse_locus_interval('20:10639222-10644705')]), 3)

        as_list = [hl.parse_locus_interval('20:10639222-10644700'),
                   hl.parse_locus_interval('20:10644700-10644705')]
        self.assertEqual(rows_kept(as_list), 3)

        as_array = hl.array([hl.parse_locus_interval('20:10639222-10644700'),
                             hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(rows_kept(as_array), 3)

        mixed_array = hl.array([hl.eval(hl.parse_locus_interval('20:10639222-10644700')),
                                hl.parse_locus_interval('20:10644700-10644705')])
        self.assertEqual(rows_kept(mixed_array), 3)

        evaluated = [hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
                     hl.eval(hl.parse_locus_interval('[20:17705793-17716416]'))]
        self.assertEqual(rows_kept(evaluated), 4)
Ejemplo n.º 28
0
    def test_hardy_weinberg_test(self):
        # Checks hl.agg.hardy_weinberg_test per row of HWE_test.vcf against
        # fixed values; position 6 is checked apart because its het_freq_hwe is NaN.
        mt = hl.import_vcf(resource('HWE_test.vcf'))
        mt = mt.select_rows(**hl.agg.hardy_weinberg_test(mt.GT))
        rt = mt.rows()

        # (position, alleles, het_freq_hwe, p_value)
        truth = [
            (1, ['A', 'G'], 0.0, 0.5),
            (2, ['A', 'G'], 0.25, 0.5),
            (3, ['T', 'C'], 0.5357142857142857, 0.21428571428571427),
            (4, ['T', 'A'], 0.5714285714285714, 0.6571428571428573),
            (5, ['G', 'A'], 0.3333333333333333, 0.5),
        ]
        expected_rows = [
            hl.struct(locus=hl.locus('20', pos), alleles=alleles,
                      het_freq_hwe=r, p_value=p)
            for (pos, alleles, r, p) in truth
        ]
        expected = hl.Table.parallelize(expected_rows, key=['locus', 'alleles'])
        self.assertTrue(rt.filter(rt.locus.position != 6)._same(expected))

        row_at_6 = rt.filter(rt.locus.position == 6).collect()[0]
        self.assertEqual(row_at_6['p_value'], 0.5)
        self.assertTrue(math.isnan(row_at_6['het_freq_hwe']))
Ejemplo n.º 29
0
 def test_trio_matrix_incomplete_trios(self):
     # Only checks that trio_matrix with complete_trios=False runs without
     # raising; the result is not inspected.
     pedigree = hl.Pedigree.read(resource('triomatrix.fam'))
     dataset = hl.import_vcf(resource('triomatrix.vcf'))
     hl.trio_matrix(dataset, pedigree, complete_trios=False)
Ejemplo n.º 30
0
    "--input-url",
    help="URL of ExAC sites VCF",
    default="gs://exac/170122_exacv1_bundle/ExAC.r1.sites.vep.vcf.gz")
# Remaining command-line options: a required output location and an optional
# genomic interval used to restrict the variants.
p.add_argument("--output-url",
               help="URL to write Hail table to",
               required=True)
p.add_argument("--subset",
               help="Filter variants to this chrom:start-end range")
args = p.parse_args()

hl.init(log="/tmp/hail.log")

print("\n=== Importing VCF ===")

# NOTE(review): force_bgz=True presumably because the default input is a .gz
# file that is really block-gzipped - confirm against the Hail import_vcf docs.
mt = hl.import_vcf(args.input_url,
                   force_bgz=True,
                   min_partitions=2000,
                   skip_invalid_loci=True)

# Drop entry values
mt = mt.drop("AD", "DP", "GQ", "GT", "MIN_DP", "PL", "SB")

# Optional: keep only the rows whose locus lies inside the requested interval.
if args.subset:
    print(f"\n=== Filtering to interval {args.subset} ===")
    subset_interval = hl.parse_locus_interval(args.subset)
    mt = mt.filter_rows(subset_interval.contains(mt.locus))

print("\n=== Splitting multiallelic variants ===")

# The entry fields were dropped above, so the generic split_multi is used here.
mt = hl.split_multi(mt)

# For multiallelic variants, these fields contain a value for each alt allele
Ejemplo n.º 31
0
def get_1kg(output_dir, overwrite: bool = False):
    """Fetch a subset of the `1000 Genomes <http://www.internationalgenome.org/>`__
    dataset together with sample and gene annotations.

    Four artifacts are produced under `output_dir`: the matrix table
    ``1kg.mt``, the source VCF ``1kg.vcf.bgz``, ``1kg_annotations.txt`` and
    ``ensembl_gene_annotations.txt``. Nothing is downloaded when all four
    already exist and `overwrite` is false.

    Notes
    -----
    The download is about 15M.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite any existing files/directories at `output_dir`.
    """
    fs = Env.fs()

    if not _dir_exists(fs, output_dir):
        fs.mkdir(output_dir)

    matrix_table_path = os.path.join(output_dir, '1kg.mt')
    vcf_path = os.path.join(output_dir, '1kg.vcf.bgz')
    sample_annotations_path = os.path.join(output_dir, '1kg_annotations.txt')
    gene_annotations_path = os.path.join(output_dir,
                                         'ensembl_gene_annotations.txt')

    # Nothing to do when every output is present and no overwrite was requested.
    if (not overwrite
            and _dir_exists(fs, matrix_table_path)
            and _file_exists(fs, sample_annotations_path)
            and _file_exists(fs, vcf_path)
            and _file_exists(fs, gene_annotations_path)):
        info('1KG files found')
        return

    init_temp_dir()

    # Download the VCF locally, copy it where the cluster can read it, import it.
    local_vcf = os.path.join(tmp_dir, '1kg.vcf.bgz')
    vcf_source = resources['1kg_matrix_table']
    info(f'downloading 1KG VCF ...\n  Source: {vcf_source}')
    sync_retry_transient_errors(urlretrieve, vcf_source, local_vcf)
    cluster_readable_vcf = _copy_to_tmp(fs,
                                        local_path_uri(local_vcf),
                                        extension='vcf.bgz')
    info('importing VCF and writing to matrix table...')
    imported = hl.import_vcf(cluster_readable_vcf, min_partitions=16)
    imported.write(matrix_table_path, overwrite=True)

    # Download both annotation files to the local temp directory.
    local_sample_annot = os.path.join(tmp_dir, '1kg_annotations.txt')
    sample_source = resources['1kg_annotations']
    info(f'downloading 1KG annotations ...\n  Source: {sample_source}')
    sync_retry_transient_errors(urlretrieve, sample_source, local_sample_annot)

    local_gene_annot = os.path.join(tmp_dir, 'ensembl_gene_annotations.txt')
    gene_source = resources['1kg_ensembl_gene_annotations']
    info(f'downloading Ensembl gene annotations ...\n  Source: {gene_source}')
    sync_retry_transient_errors(urlretrieve, gene_source, local_gene_annot)

    # Copy the annotations and the raw VCF into the output directory.
    hl.hadoop_copy(local_path_uri(local_sample_annot),
                   sample_annotations_path)
    hl.hadoop_copy(local_path_uri(local_gene_annot), gene_annotations_path)
    hl.hadoop_copy(local_path_uri(local_vcf), vcf_path)
    info('Done!')
Ejemplo n.º 32
0
 def get_dataset():
     # Returns the split sample.vcf dataset, importing it on first use and
     # keeping it on Tests._dataset for later calls.
     if Tests._dataset is not None:
         return Tests._dataset
     imported = hl.import_vcf(resource('sample.vcf'))
     Tests._dataset = hl.split_multi_hts(imported)
     return Tests._dataset
Ejemplo n.º 33
0
    def test_mendel_errors(self):
        # Checks the four tables returned by hl.mendel_errors on mendel.vcf:
        # their key/row types, row counts and selected contents. Also checks
        # that a pedigree with missing sex gives the same per-error rows for
        # sample 'Dtr1'.
        Struct = hl.utils.Struct

        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        locus_t = mt.locus.dtype
        alleles_t = hl.tarray(hl.tstr)

        # (expression whose dtype is checked, expected type), in the original order.
        type_checks = [
            (men.key, hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr)),
            (men.row, hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr,
                                 fam_id=hl.tstr, mendel_code=hl.tint)),
            (fam.key, hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr)),
            (fam.row, hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr, fam_id=hl.tstr,
                                 children=hl.tint, errors=hl.tint64,
                                 snp_errors=hl.tint64)),
            (ind.key, hl.tstruct(s=hl.tstr)),
            (ind.row, hl.tstruct(s=hl.tstr, fam_id=hl.tstr, errors=hl.tint64,
                                 snp_errors=hl.tint64)),
            (var.key, hl.tstruct(locus=locus_t, alleles=alleles_t)),
            (var.row, hl.tstruct(locus=locus_t, alleles=alleles_t, errors=hl.tint64)),
        ]
        for expr, expected_type in type_checks:
            self.assertEqual(expr.dtype, expected_type)

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        # Per-family totals.
        fam_rows = set(fam.select('children', 'errors', 'snp_errors').collect())
        self.assertEqual(fam_rows,
                         {
                             Struct(pat_id='Dad1', mat_id='Mom1', children=2,
                                    errors=41, snp_errors=39),
                             Struct(pat_id='Dad2', mat_id='Mom2', children=1,
                                    errors=0, snp_errors=0)
                         })

        # Per-individual totals.
        ind_rows = set(ind.select('errors', 'snp_errors').collect())
        self.assertEqual(ind_rows,
                         {
                             Struct(s='Son1', errors=23, snp_errors=22),
                             Struct(s='Dtr1', errors=18, snp_errors=17),
                             Struct(s='Dad1', errors=19, snp_errors=18),
                             Struct(s='Mom1', errors=22, snp_errors=21),
                             Struct(s='Dad2', errors=0, snp_errors=0),
                             Struct(s='Mom2', errors=0, snp_errors=0),
                             Struct(s='Son2', errors=0, snp_errors=0)
                         })

        # Per-variant totals for a hand-picked set of variants.
        to_keep = hl.set([
            (hl.Locus("1", 1), ['C', 'CT']),
            (hl.Locus("1", 2), ['C', 'T']),
            (hl.Locus("X", 1), ['C', 'T']),
            (hl.Locus("X", 3), ['C', 'T']),
            (hl.Locus("Y", 1), ['C', 'T']),
            (hl.Locus("Y", 3), ['C', 'T'])
        ])
        picked = (var.filter(to_keep.contains((var.locus, var.alleles)))
                  .order_by('locus')
                  .select('locus', 'alleles', 'errors')
                  .collect())
        self.assertEqual(picked,
                         [
                             Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                             Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                             Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                             Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                             Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                             Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                         ])

        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        dtr1_with_missing_sex = men2.filter(men2.s == 'Dtr1')
        dtr1_reference = men.filter(men.s == 'Dtr1')
        self.assertTrue(dtr1_with_missing_sex._same(dtr1_reference))
Ejemplo n.º 34
0
def get_dataset():
    """Return the shared split-multi dataset, building it on first use.

    The result is stored in the module-level ``_dataset`` so that later calls
    reuse the same cached object instead of re-importing ``sample.vcf``.
    """
    global _dataset
    if _dataset is not None:
        return _dataset
    imported = hl.import_vcf(resource('sample.vcf'))
    _dataset = hl.split_multi_hts(imported).cache()
    return _dataset
Ejemplo n.º 35
0
 def test_info_char(self):
     # infochar.vcf is expected to import as exactly one row.
     mt = hl.import_vcf(resource('infochar.vcf'))
     self.assertEqual(mt.count_rows(), 1)
Ejemplo n.º 36
0
 def test_import_vcf_no_reference_specified(self):
     # With reference_genome=None the locus field should be a plain
     # struct of (contig: str, position: int32).
     mt = hl.import_vcf(resource('sample2.vcf'), reference_genome=None)
     expected_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
     self.assertTrue(mt.locus.dtype == expected_locus_type)
     self.assertEqual(mt.count_rows(), 735)
Ejemplo n.º 37
0
 def test_import_vcf_bad_reference_allele(self):
     # invalid_base.vcf must still import and yield a single row.
     n_rows = hl.import_vcf(resource('invalid_base.vcf')).count_rows()
     self.assertEqual(n_rows, 1)
Ejemplo n.º 38
0
 def test_import_vcf_can_import_float_array_format(self):
     # Every entry's numeric_array must equal [1.5, 2.5].
     mt = hl.import_vcf(resource('floating_point_array.vcf'))
     every_entry_matches = hl.agg.all(mt.numeric_array == [1.5, 2.5])
     self.assertTrue(mt.aggregate_entries(every_entry_matches))
Ejemplo n.º 39
0
 def test_glob(self):
     # Importing the samplepart*.vcf glob must give the same dataset as
     # importing sample.vcf directly.
     whole = hl.import_vcf(resource('sample.vcf'))
     globbed = hl.import_vcf(resource('samplepart*.vcf'))
     self.assertTrue(globbed._same(whole))
Ejemplo n.º 40
0
# Original notes on the inputs:
#   chr21 0 ~ 1714
#   chr22 0 ~ 1669
# NOTE(review): the list below always names 7700 chr22 shards, which does not
# obviously match the ranges noted above -- confirm against the bucket.
filelist = [
    f'gs://rcstorage/genotype/gnarly_chr22.{i}.variant_filtered.vcf.gz'
    for i in range(7700)
]

# Output location (`chrom` must already be defined earlier in the script).
vds_splitmulti_file = '/'.join(
    ('gs://rcstorage/matrixtable', chrom, 'splitmulti.vds'))

# ----------------------------------------------------------------------------
# II. Import VCF
#     All VCF shards in `filelist` are imported together as one dataset.
# ----------------------------------------------------------------------------
print("importing vcf...")
vds = hl.import_vcf(filelist, force_bgz=True, reference_genome='GRCh38')

# ----------------------------------------------------------------------------
# III. Keep only rows whose `filters` set is empty
# ----------------------------------------------------------------------------
print("filtering variants without pass...")
has_no_filters = hl.len(vds.filters) == 0
vds = vds.filter_rows(has_no_filters)

# ----------------------------------------------------------------------------
# IV. Split multi-allelic sites, keeping only GT, AD, DP, GQ and PL entries
# ----------------------------------------------------------------------------
print("spliting multi...")
entries_subset = vds.select_entries(vds.GT, vds.AD, vds.DP, vds.GQ, vds.PL)
vds = hl.split_multi_hts(entries_subset)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Ejemplo n.º 41
0
def _annotate_info_or_missing(mt, field_names, first_element=False):
    """Copy optional INFO fields onto the rows, or fill them all with missing.

    Each name in ``field_names`` is read from ``mt.info`` (taking element 0
    when ``first_element`` is true) and stored as a row field of the same
    name.  If any of that fails -- typically because the field is absent from
    this VCF -- every named row field is set to ``hl_null('int')`` instead, so
    a group of fields is either fully populated or fully missing.
    """
    try:
        found = {}
        for name in field_names:
            value = getattr(mt.info, name)
            found[name] = value[0] if first_element else value
        return mt.annotate_rows(**found)
    except Exception:
        # Deliberate best-effort fallback; ``Exception`` (rather than a bare
        # ``except``) lets KeyboardInterrupt / SystemExit propagate.
        return mt.annotate_rows(
            **{name: hl_null('int') for name in field_names})


def process_gnomad_data(datapath,
                        chromosome,
                        transcript_list,
                        exomes=True,
                        synonymous=True):
    """
    Uses hail to process the gnomAD dataset into a pandas DataFrame.

    The VCF at ``datapath`` is imported, restricted to the given intervals
    and transcripts, reduced to rows with an empty ``filters`` set and a
    variant type of interest, and flattened to one row per
    (variant, transcript, variant type).

    Args:
        datapath: Path passed to ``import_vcf``.
        chromosome: Currently unused by this function; kept for interface
            compatibility.
        transcript_list: Non-empty iterable of ``(transcript, interval)``
            pairs.  Intervals are parsed with ``parse_locus_interval`` against
            GRCh38; transcripts are matched against element 6 of the
            pipe-delimited ``info.vep`` string.
        exomes: If true the ``source`` column is ``'exomes'``, otherwise
            ``'genomes'``.
        synonymous: If true, ``synonymous_variant`` rows are kept in addition
            to the fixed list of variant types.

    Returns:
        A pandas DataFrame with the selected annotations plus ``ref_aa``,
        ``alt_aa``, ``Variant`` and ``source`` columns.

    Raises:
        ValueError: If ``transcript_list`` is empty (it cannot be unzipped
            into transcripts and intervals).
    """
    basedir = dirname(__file__)
    logdir = path_join(basedir, 'hail.log')
    hl_init(log=logdir, append=True, default_reference='GRCh38')
    # Always shut Hail down again, even on failure, so that a later call can
    # re-initialise it.
    try:
        mt = import_vcf(datapath)
        # first filter down to the right number of transcripts
        transcripts, intervals = zip(*transcript_list)
        transcripts = hl_literal(list(transcripts))
        mt = filter_intervals(mt, [
            parse_locus_interval(x, reference_genome='GRCh38')
            for x in intervals
        ])
        mt = mt.filter_rows(mt.filters == hl_empty_set('str'))
        mt = mt.explode_rows(mt.info.vep)
        # get the right transcript; the pattern is a regex, so the pipe is
        # escaped (raw string avoids an invalid '\|' escape sequence)
        mt = mt.annotate_rows(vep=mt.info.vep.split(r'\|'))
        mt = mt.annotate_rows(gene=mt.vep[3], enst=mt.vep[6])
        mt = mt.filter_rows(transcripts.contains(mt.enst))
        # one row per '&'-separated variant type
        mt = mt.annotate_rows(vartype=mt.vep[1].split('&'))
        mt = mt.explode_rows(mt.vartype)
        vartype_list = hl_literal([
            'frameshift_variant', 'inframe_deletion', 'inframe_insertion',
            'missense_variant', 'start_lost', 'stop_gained'
        ])
        if synonymous:
            vartype_list = vartype_list.extend(['synonymous_variant'])
        mt = mt.filter_rows(vartype_list.contains(mt.vartype))
        mt = mt.annotate_rows(codon_num=mt.vep[14],
                              aa_change=mt.vep[15],
                              transcript_consequence=mt.vep[10],
                              protein_consequence=mt.vep[11],
                              AC=mt.info.AC[0])

        # Optional INFO fields: not every input VCF carries all of them.
        for first_element_fields in (
                ('non_neuro_AC', 'non_neuro_AN'),
                ('non_topmed_AC', 'non_topmed_AN'),
                ('non_cancer_AC', 'non_cancer_AN'),
                ('controls_AC', 'controls_AN'),
                ('pab_max',),
        ):
            mt = _annotate_info_or_missing(mt, first_element_fields,
                                           first_element=True)
        for whole_field in ('VQSLOD', 'DP', 'BaseQRankSum',
                            'ClippingRankSum', 'rf_tp_probability'):
            mt = _annotate_info_or_missing(mt, (whole_field,))

        ht = mt.select_rows(
            mt.qual, mt.filters, mt.vartype, mt.gene,
            mt.transcript_consequence, mt.protein_consequence,
            mt.codon_num, mt.aa_change, mt.info.FS,
            mt.info.MQRankSum, mt.info.InbreedingCoeff,
            mt.info.ReadPosRankSum, mt.VQSLOD, mt.info.QD, mt.DP,
            mt.BaseQRankSum, mt.info.MQ, mt.ClippingRankSum,
            mt.rf_tp_probability, mt.pab_max, mt.AC, mt.info.AN,
            mt.non_neuro_AC, mt.non_neuro_AN, mt.non_cancer_AC,
            mt.non_cancer_AN, mt.non_topmed_AC, mt.non_topmed_AN,
            mt.controls_AC, mt.controls_AN).make_table()

        ht = ht.annotate(chromosome=ht.locus.contig,
                         position=ht.locus.position)
        ht = ht.annotate(allele_ref=ht.alleles[0], allele_alt=ht.alleles[1])
        ht = ht.key_by(ht.chromosome, ht.position, ht.allele_ref,
                       ht.allele_alt)
        ht = ht.drop(ht.alleles, ht.locus)
        df = ht.to_pandas()
    finally:
        hl_stop()

    # move the last four columns to the front
    cols = df.columns.tolist()
    cols = cols[-4:] + cols[:-4]
    df = df[cols]
    df['filters'] = 'PASS'
    # Split "ref/alt" once.  Indexing the split lists (instead of unpacking
    # the ``.str`` accessor) works on current pandas and yields NaN for
    # alt_aa when a value has no '/'.
    aa_parts = df['aa_change'].str.split('/', n=1)
    df['ref_aa'] = aa_parts.str[0]
    df['alt_aa'] = aa_parts.str[1]
    df.loc[df.vartype == 'synonymous_variant', 'protein_consequence'] = None
    df['Variant'] = df.apply(Variant_name, axis=1)
    df = df.drop(['aa_change'], axis=1)
    # pull the last two columns in right after the first eleven
    cols = df.columns.tolist()
    cols = cols[:11] + cols[-2:] + cols[11:-2]
    df = df[cols]
    df['source'] = 'exomes' if exomes else 'genomes'
    return df
        if (args.chr_prefix is True) and (args.reference_genome == "GRCh37"):
            recode = {f"chr{i}": f"{i}" for i in (list(range(1, 23)) + ['X', 'Y'])}

        elif (args.chr_prefix is False) and (args.reference_genome == "GRCh38"):
            recode = {f"{i}": f"chr{i}" for i in (list(range(1, 23)) + ['X', 'Y'])}
        else:
            recode = None

        # If MT does not already exist, load in VCF and then write it to disk
        stat_cmd = ['gsutil', '-q', 'stat', mt_name + "/metadata.json.gz"]
        exists = subprocess.call(stat_cmd)

        if exists == 1:  # stat returns 1 if file/folder does not exist, 0 if it exists
            logging.info(f'Detected mt of input vcf {vcf} does not exist, importing vcf.')
            if recode is None:
                hl.import_vcf(vcf_name, force_bgz=args.force_bgz, call_fields=args.call_fields,
                              reference_genome=args.reference_genome).write(mt_name, overwrite=True)
            else:
                hl.import_vcf(vcf_name, force_bgz=args.force_bgz, call_fields=args.call_fields,
                              reference_genome=args.reference_genome, contig_recoding=recode
                              ).write(mt_name, overwrite=True)
        else:
            logging.info(f"Detected mt of input vcf {vcf} already exists, reading mt directly.")

        mt = hl.read_matrix_table(mt_name)

        if args.test:
            logging.info('Test flag given, filtering to on chrom 22.')
            if args.reference_genome == "GRCh38":
                chrom_code = "chr22"
            else:
                chrom_code = "22"