Exemple #1
0
 def test_kt_globals(self):
     """Globals added by expression and by value are both usable in ``filter``."""
     # Build the ten-row range table and attach both globals in one chain.
     table = (KeyTable.range(10)
              .annotate_global_expr('foo = [1,2,3]')
              .annotate_global('bar', [4, 5, 6], TArray(TInt32())))

     # Keep the rows whose idx appears in either global array.
     predicate = 'foo.exists(x => x == idx) || bar.exists(x => x == idx)'
     matching_rows = table.filter(predicate).count()

     self.assertEqual(matching_rows, 6)
Exemple #2
0
    def important_variants(self, n_limit=1000):
        """Return the ``n_limit`` most important loci as a table.

        :param int n_limit: maximum number of loci to return

        :return: A KeyTable with the variant in the first column and importance in the second.
        :rtype: :py:class:`KeyTable`
        """
        context = self.hc
        ranked = self._jia.variantImportance(n_limit)
        return KeyTable(context, ranked)
Exemple #3
0
    def test_trio_matrix(self):
        """Build a trio matrix from the triomatrix VCF and pedigree, then count its rows."""
        fam_path = 'src/test/resources/triomatrix.fam'
        pedigree = Pedigree.read(fam_path)

        from hail import KeyTable
        fam_kt = KeyTable.import_fam(fam_path).to_hail2()

        # Attach each sample's fam record as a column annotation.
        vcf = hc.import_vcf('src/test/resources/triomatrix.vcf')
        vcf = vcf.annotate_cols(fam=fam_kt[vcf.s])

        trios = methods.trio_matrix(vcf, pedigree, complete_trios=True)
        trios.count_rows()
Exemple #4
0
#!./bin/pyhail-0.1-latest.sh
import pyspark
import hail
from hail import KeyTable
from hail.representation import Interval

# Hail context with this step's log file and scratch directory.
hc = hail.HailContext(
    log='log/08b_plink_export_somaticfiltered.log',
    tmp_dir='tmp/hail')

# Input callset. The commented-out line below is an alternative input VDS.
#vds = hc.read('../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.vds')
vds_path = '../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.nocancer.over70.tgp.hrc.gnomad.dbsnp.clinvar.cato.eigen.vep.vds'
vds = hc.read(vds_path)

# Tier 1 region intervals, loaded from BED.
tier1_bed_path = '../../locus-annotations/source_data/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed'
tier1_bed = KeyTable.import_bed(tier1_bed_path)

# Extract good, probably-not-somatic markers for rare variant comparisons.
# Definition of 'good markers':
#   * In autosomes
#   * In tier 1 regions.
#
# Definition of probably not somatic:
#   DP > 10 AND
#   (
#     GT != het OR
#     (
#       binomTest(ad, dp, 0.5, "two.sided") >= alpha
#     )
#   )
Exemple #5
0
    def test_keytable(self):
        """Exercise the KeyTable API against the sample annotation resources.

        Covers import, schema accessors, export, filter/same, annotate, join,
        aggregate_by_key, forall/exists, rename/select/key_by, flatten,
        expand_types, DataFrame round-tripping, write/read and order_by.
        Calls whose result is discarded only check that the call does not
        raise.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_keytable(
            test_resources + '/sampleAnnotations.tsv',
            config=TextTableConfig(impute=True)).key_by('Sample')
        kt2 = hc.import_keytable(
            test_resources + '/sampleAnnotations2.tsv',
            config=TextTableConfig(impute=True)).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key_names[0], "Sample")
        self.assertEqual(kt.column_names[2], "qPhen")
        self.assertEqual(kt.count_rows(), 100)
        kt.schema

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same: keeping CASE rows must equal removing CTRL rows
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status').count_rows())

        # Join
        kt.join(kt2, 'left').count_rows()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status",
                             "Sum = qPhen.sum()").count_rows())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename accepts a mapping or a full list of new column names
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.column_names])

        kt.select(["Sample"])
        kt.select(["Sample", "Status"])

        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe()

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a flattened variants table through a DataFrame
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_keytable().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = hc.dataframe_to_keytable(
            sample_variants.to_dataframe(), ['v'])
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (not assertTrue(a == b)) so a failure reports the count
        self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

        # Write / read round trip
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        schema = TStruct(['a', 'b'], [TInt(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

        # The expected lists below place missing values last in both directions
        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])
Exemple #6
0
    def test_trio_matrix(self):
        """
        Compare trio_matrix output against tables built by explicit joins.

        The comparison relies on properties of the trio matrix VCF and its
        pedigree file.

        It is NOT valid for pedigrees containing quads: trio_matrix
        duplicates the parents as needed, but the genotypes_table and
        samples_table paths built here would need an additional
        duplication/explode step that has not been written.
        """
        fam_path = 'src/test/resources/triomatrix.fam'
        pedigree = Pedigree.read(fam_path)
        fam_table = KeyTable.import_fam(fam_path)

        dataset = hc.import_vcf('src/test/resources/triomatrix.vcf')
        dataset = dataset.annotate_samples_table(fam_table, root='sa.fam')

        # Lookup tables flagging every sample id that appears as a father / mother.
        fathers = fam_table.filter('isDefined(patID)')
        fathers = fathers.annotate('isDad = true')
        fathers = fathers.select(['patID', 'isDad']).key_by('patID')

        mothers = fam_table.filter('isDefined(matID)')
        mothers = mothers.annotate('isMom = true')
        mothers = mothers.select(['matID', 'isMom']).key_by('matID')

        # After the left joins, a missing flag means "not a dad" / "not a mom".
        role_flags = 'isDad = isDefined(isDad), isMom = isDefined(isMom)'

        # test genotypes
        joined_gt = dataset.genotypes_table().key_by('s')
        joined_gt = joined_gt.join(fathers, how='left').join(mothers, how='left')
        joined_gt = joined_gt.annotate(role_flags)
        joined_gt = joined_gt.aggregate_by_key(
            'v = v, fam = sa.fam.famID',
            'data = g.map(g => {role: if (isDad) 1 else if (isMom) 2 else 0, g: g}).collect()')
        joined_gt = joined_gt.filter('data.length() == 3').explode('data')
        joined_gt = joined_gt.select(['v', 'fam', 'data'])

        trio_gt = dataset.trio_matrix(pedigree, complete_trios=True).genotypes_table()
        trio_gt = trio_gt.annotate(
            'fam = sa.proband.annotations.fam.famID, data = [{role: 0, g: g.proband}, {role: 1, g: g.father}, {role: 2, g: g.mother}]')
        trio_gt = trio_gt.select(['v', 'fam', 'data']).explode('data')
        trio_gt = trio_gt.filter('isDefined(data.g)').key_by(['v', 'fam'])

        self.assertTrue(joined_gt.same(trio_gt))

        # test annotations
        joined_sa = dataset.samples_table()
        joined_sa = joined_sa.join(fathers, how='left').join(mothers, how='left')
        joined_sa = joined_sa.annotate(role_flags)
        joined_sa = joined_sa.aggregate_by_key(
            'fam = sa.fam.famID',
            'data = sa.map(sa => {role: if (isDad) 1 else if (isMom) 2 else 0, sa: sa}).collect()')
        joined_sa = joined_sa.filter('data.length() == 3').explode('data')
        joined_sa = joined_sa.select(['fam', 'data'])

        trio_sa_expr = (
            'fam = sa.proband.annotations.fam.famID, data = [{role: 0, sa: sa.proband.annotations}, '
            '{role: 1, sa: sa.father.annotations}, '
            '{role: 2, sa: sa.mother.annotations}]')
        trio_sa = dataset.trio_matrix(pedigree, complete_trios=True).samples_table()
        trio_sa = trio_sa.annotate(trio_sa_expr)
        trio_sa = trio_sa.select(['fam', 'data']).explode('data')
        trio_sa = trio_sa.filter('isDefined(data.sa)').key_by(['fam'])

        self.assertTrue(joined_sa.same(trio_sa))
Exemple #7
0
    def test_keytable(self):
        """Exercise the KeyTable API against the sample annotation resources.

        Covers import, schema accessors, export, filter/same, annotate, join,
        aggregate_by_key, forall/exists, rename/select/drop/key_by, flatten,
        expand_types, DataFrame round-tripping, show, write/read, order_by,
        union, take/head, range/indexed and group/ungroup.
        Calls whose result is discarded only check that the call does not
        raise.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                             impute=True).key_by('Sample')
        kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                              impute=True).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key[0], "Sample")
        self.assertEqual(kt.columns[2], "qPhen")
        self.assertEqual(kt.count(), 100)
        kt.schema

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same: keeping CASE rows must equal removing CTRL rows
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status').count())

        # Join
        kt.join(kt2, 'left').count()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename accepts a mapping or a full list of new column names
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.columns])

        kt.select("Sample")
        kt.select(["Sample", "Status"], qualified_name=True)

        kt.drop("Sample")
        kt.drop(["Sample", "Status"])

        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe().count()

        kt.show(10)
        kt.show(4, print_types=False, truncate_to=15)

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a flattened variants table through a DataFrame
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_table().annotate(
            'v = str(v), va.filters = va.filters.toArray()').flatten())

        sample_variants2 = KeyTable.from_dataframe(
            sample_variants.to_dataframe()).key_by('v')
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (not assertTrue(a == b)) so a failure reports the count
        self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

        # Write / read round trip
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        schema = TStruct(['a', 'b'], [TInt32(), TString()])
        rows = [{
            'a': 5
        }, {
            'a': 5,
            'b': 'quam'
        }, {
            'a': -1,
            'b': 'quam'
        }, {
            'b': 'foo'
        }, {
            'a': 7,
            'b': 'baz'
        }]
        kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

        # The expected lists below place missing values last in both directions
        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                                (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                    (-1, 'quam'), (None, 'foo')])

        KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

        # Union keeps duplicates: n copies of kt give n times the rows
        self.assertEqual(kt.union(kt).count(), kt.count() * 2)
        self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

        first3 = kt.take(3)
        self.assertEqual(first3[0].qPhen, 27704)
        self.assertEqual(first3[1].qPhen, 16636)
        self.assertEqual(first3[2].qPhen, 7256)
        self.assertEqual(first3[0].Sample, 'HG00096')
        self.assertEqual(first3[1].Sample, 'HG00097')
        self.assertEqual(first3[2].Sample, 'HG00099')
        self.assertTrue(all(x.Status == 'CASE' for x in first3))

        # Was assertTrue(count, 3): that treats 3 as the failure message and
        # passes for any non-zero count.
        self.assertEqual(kt.head(3).count(), 3)

        # list(...) so the comparison also holds where range() is not a list
        self.assertEqual(list(range(10)),
                         [x.idx for x in KeyTable.range(10).collect()])
        self.assertTrue(
            KeyTable.range(200).indexed('foo').forall('idx == foo'))

        # group/ungroup must be inverse operations on a struct column
        kt3 = KeyTable.parallelize([{
            'A': Struct(c1=5, c2=21)
        }], TStruct(['A'],
                    [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

        self.assertTrue(kt3.ungroup('A').group('A', 'c1', 'c2').same(kt3))
Exemple #8
0
    def test_dataset(self):
        """Smoke-test a wide range of dataset operations on the sample VCFs.

        The first half runs inside a loop over ``(vds, vds2)`` built from
        ``sample.vcf`` and ``sample2.vcf``; the second half repeats many of
        the same calls on freshly imported ``sample`` / ``sample2`` datasets
        and adds regression, SKAT, kinship/LD, Mendel error and pruning calls.
        Most calls only check that the operation runs; a few results are
        asserted. Several files are written under ``/tmp`` and some are read
        back later in the test, so statement order matters.
        """
        test_resources = 'src/test/resources'

        vds = hc.import_vcf(test_resources + '/sample.vcf')
        vds2 = hc.import_vcf(test_resources + '/sample2.vcf')

        # The loop currently has a single (dataset, dataset2) pair.
        for (dataset, dataset2) in [(vds, vds2)]:
            # Genotype-call expression substituted into the expr strings below.
            gt = 'g.GT'

            dataset = dataset.cache()
            dataset2 = dataset2.persist()

            dataset.write('/tmp/sample.vds', overwrite=True)

            dataset.count()

            self.assertEqual(dataset.head(3).count_variants(), 3)

            dataset.query_variants(['variants.count()'])
            dataset.query_samples(['samples.count()'])

            (dataset.annotate_samples_expr(
                'sa.nCalled = gs.filter(g => isDefined({0})).count()'.format(
                    gt)).samples_table().select(['s', 'nCalled = sa.nCalled'
                                                 ]).export('/tmp/sa.tsv'))

            dataset.annotate_global_expr('global.foo = 5')
            dataset.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

            # From here on `dataset` carries sa.isCase and sa.qPhen.
            dataset = dataset.annotate_samples_table(
                hc.import_table(test_resources +
                                '/sampleAnnotations.tsv').key_by('Sample'),
                expr=
                'sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

            (dataset.annotate_variants_expr(
                'va.nCalled = gs.filter(g => isDefined({0})).count()'.format(
                    gt)).count())

            loci_tb = (
                hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                    'locus = Locus(chr, pos.toInt32())').key_by('locus'))
            (dataset.annotate_variants_table(loci_tb,
                                             root='va.locus_annot').count())

            variants_tb = (hc.import_table(
                test_resources + '/variantAnnotations.tsv'
            ).annotate(
                'variant = Variant(Chromosome, Position.toInt32(), Ref, Alt)').
                           key_by('variant'))
            (dataset.annotate_variants_table(variants_tb,
                                             root='va.table').count())

            (dataset.annotate_variants_vds(
                dataset, expr='va.good = va.info.AF == vds.info.AF').count())

            # Export a ~10% variant sample as loci and as variants under /tmp.
            downsampled = dataset.sample_variants(0.10)
            downsampled.variants_table().select(
                ['chr = v.contig',
                 'pos = v.start']).export('/tmp/sample2_loci.tsv')
            downsampled.variants_table().select('v').export(
                '/tmp/sample2_variants.tsv')

            with open(test_resources + '/sample2.sample_list') as f:
                samples = [s.strip() for s in f]
            # NOTE(review): the comparison result below is discarded, so
            # nothing is asserted here. Presumably this was meant to be
            # self.assertEqual(..., 56) — confirm the expected count first.
            (dataset.filter_samples_list(samples).count()[0] == 56)

            # The next three annotate calls repeat the locus / variant / vds
            # annotations made earlier in this loop body.
            locus_tb = (
                hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                    'locus = Locus(chr, pos.toInt32())').key_by('locus'))

            (dataset.annotate_variants_table(locus_tb,
                                             root='va.locus_annot').count())

            tb = (hc.import_table(
                test_resources + '/variantAnnotations.tsv'
            ).annotate(
                'variant = Variant(Chromosome, Position.toInt32(), Ref, Alt)').
                  key_by('variant'))
            (dataset.annotate_variants_table(tb, root='va.table').count())

            (dataset.annotate_variants_vds(
                dataset, expr='va.good = va.info.AF == vds.info.AF').count())

            dataset.export_vcf('/tmp/sample2.vcf.bgz')

            # count() is indexed as [0] = samples, [1] = variants here.
            self.assertEqual(dataset.drop_samples().count()[0], 0)
            self.assertEqual(dataset.drop_variants().count()[1], 0)

            # Importing the same VCF twice and deduplicating leaves 735 variants.
            dataset_dedup = (hc.import_vcf([
                test_resources + '/sample2.vcf',
                test_resources + '/sample2.vcf'
            ]).deduplicate())
            self.assertEqual(dataset_dedup.count()[1], 735)

            (dataset.filter_samples_expr('pcoin(0.5)').samples_table().select(
                's').export('/tmp/sample2.sample_list'))

            (dataset.filter_variants_expr('pcoin(0.5)').variants_table().
             select('v').export('/tmp/sample2.variant_list'))

            (dataset.filter_variants_table(
                KeyTable.import_interval_list(
                    test_resources + '/annotinterall.interval_list')).count())

            dataset.filter_intervals(Interval.parse('1:100-end')).count()
            dataset.filter_intervals(map(Interval.parse,
                                         ['1:100-end', '3-22'])).count()

            (dataset.filter_variants_table(
                KeyTable.import_interval_list(
                    test_resources + '/annotinterall.interval_list')).count())

            self.assertEqual(
                dataset2.filter_variants_table(
                    hc.import_table(test_resources + '/sample2_variants.tsv',
                                    key='f0',
                                    impute=True,
                                    no_header=True)).count()[1], 21)

            # Old-id -> new-id mapping read from the two-column rename file.
            m2 = {
                r.f0: r.f1
                for r in hc.import_table(test_resources +
                                         '/sample2_rename.tsv',
                                         no_header=True).collect()
            }
            self.assertEqual(
                dataset2.join(dataset2.rename_samples(m2)).count()[0], 200)

            dataset._typecheck()

            # Export / re-import round trips for the variants and samples tables.
            dataset.variants_table().export('/tmp/variants.tsv')
            self.assertTrue(
                (dataset.variants_table().annotate('va = json(va)')).same(
                    hc.import_table('/tmp/variants.tsv',
                                    impute=True).key_by('v')))

            dataset.samples_table().export('/tmp/samples.tsv')
            self.assertTrue((
                dataset.samples_table().annotate('s = s, sa = json(sa)')).same(
                    hc.import_table('/tmp/samples.tsv',
                                    impute=True).key_by('s')))

            # Same two fields in assignment form (make_table) and in
            # struct-literal form (inside the map below).
            gt_string = 'gt = g.GT, gq = g.GQ'
            gt_string2 = 'gt: g.GT, gq: g.GQ'

            # Build one gt and one gq column per sample by hand, to compare
            # against make_table further down.
            cols = ['v = v', 'info = va.info']
            for s in dataset.sample_ids:
                cols.append('`{s}`.gt = va.G["{s}"].gt'.format(s=s))
                cols.append('`{s}`.gq = va.G["{s}"].gq'.format(s=s))

            dataset_table = (dataset.annotate_variants_expr(
                'va.G = index(gs.map(g => { s: s, %s }).collect(), s)' %
                gt_string2).variants_table().select(cols))

            dataset_table_typs = {
                fd.name: fd.typ
                for fd in dataset_table.schema.fields
            }
            dataset_table.export('/tmp/sample_kt.tsv')

            self.assertTrue((dataset.make_table(
                'v = v, info = va.info', gt_string, ['v'])).same(
                    hc.import_table('/tmp/sample_kt.tsv',
                                    types=dataset_table_typs).key_by('v')))

            dataset.annotate_variants_expr(
                "va.nHet = gs.filter(g => {0}.isHet()).count()".format(gt))

            dataset.make_table('v = v, info = va.info', 'gt = {0}'.format(gt),
                               ['v'])

            dataset.num_partitions()
            dataset.file_version()
            dataset.sample_ids[:5]
            dataset.variant_schema
            dataset.sample_schema

            self.assertEqual(dataset2.num_samples, 100)
            self.assertEqual(dataset2.count_variants(), 735)

            dataset.annotate_variants_table(dataset.variants_table(),
                                            root="va")

            # Table keyed by (v, v2) with v2 == v, joined via a two-part vds_key.
            kt = (dataset.variants_table().annotate("v2 = v").key_by(
                ["v", "v2"]))

            dataset.annotate_variants_table(kt,
                                            root='va.foo',
                                            vds_key=["v", "v"])

            self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

            dataset.genotypes_table()

            ## This is very slow!!!
            # `variants_py` is not read again in this method.
            variants_py = (dataset.annotate_variants_expr(
                'va.hets = gs.filter(g => {0}.isHet()).collect()'.format(
                    gt)).variants_table().filter('pcoin(0.1)').collect())

            expr = 'g.GT.isHet() && g.GQ > 20'

            (dataset.filter_genotypes(expr).genotypes_table().select([
                'v', 's', 'nNonRefAlleles = {0}.nNonRefAlleles()'.format(gt)
            ]).export('/tmp/sample2_genotypes.tsv'))

            self.assertTrue((dataset.repartition(16,
                                                 shuffle=False).same(dataset)))

            self.assertTrue(dataset.naive_coalesce(2).same(dataset))

            print(dataset.storage_level())

            dataset = dataset.unpersist()
            dataset2 = dataset2.unpersist()

            # NOTE(review): the shuffled ids come from `dataset`, but the
            # reorder is applied to `vds` (the original import) — confirm
            # this is intended.
            new_sample_order = dataset.sample_ids[:]
            random.shuffle(new_sample_order)
            self.assertEqual(
                vds.reorder_samples(new_sample_order).sample_ids,
                new_sample_order)

        # Second half: fresh imports, independent of the loop variables above.
        sample = hc.import_vcf(test_resources + '/sample.vcf').cache()

        sample.summarize().report()
        sample.drop_samples().summarize().report()

        sample_split = sample.split_multi_hts()

        sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
        sample2 = sample2.persist()

        sample2_split = sample2.split_multi_hts()

        sample.annotate_alleles_expr_hts(
            'va.gs = gs.map(g => g.GT).callStats(g => v)').count()
        sample.annotate_alleles_expr_hts(
            ['va.gs = gs.map(g => g.GT).callStats(g => v)',
             'va.foo = 5']).count()

        # Concordance of the split dataset with itself.
        glob, concordance1, concordance2 = (
            sample2_split.concordance(sample2_split))
        print(glob[1][4])
        print(glob[4][0])
        print(glob[:][3])
        # NOTE(review): both tables are written to the same path with
        # overwrite=True, so the second write presumably replaces the first.
        concordance1.write('/tmp/foo.kt', overwrite=True)
        concordance2.write('/tmp/foo.kt', overwrite=True)

        sample2_split.export_gen('/tmp/sample2.gen', 5)
        sample2_split.export_plink('/tmp/sample2')

        sample2.filter_variants_expr('v.isBiallelic').count()

        sample2.split_multi_hts().grm().export_gcta_grm_bin('/tmp/sample2.grm')

        sample2.hardcalls().count()

        sample2_split.ibd(min=0.2, max=0.6)

        sample2.split_multi_hts().impute_sex().variant_schema

        self.assertEqual(sample2.genotype_schema, Type.hts_schema())

        m2 = {
            r.f0: r.f1
            for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                     no_header=True,
                                     impute=True).collect()
        }
        self.assertEqual(
            sample2.join(sample2.rename_samples(m2)).count()[0], 200)

        # Covariate and phenotype tables for the regression calls below.
        cov = hc.import_table(test_resources + '/regressionLinear.cov',
                              types={
                                  'Cov1': TFloat64(),
                                  'Cov2': TFloat64()
                              }).key_by('Sample')

        phen1 = hc.import_table(test_resources + '/regressionLinear.pheno',
                                missing='0',
                                types={
                                    'Pheno': TFloat64()
                                }).key_by('Sample')
        phen2 = hc.import_table(test_resources +
                                '/regressionLogisticBoolean.pheno',
                                missing='0',
                                types={
                                    'isCase': TBoolean()
                                }).key_by('Sample')

        regression = (hc.import_vcf(
            test_resources +
            '/regressionLinear.vcf').split_multi_hts().annotate_samples_table(
                cov, root='sa.cov').annotate_samples_table(
                    phen1, root='sa.pheno.Pheno').annotate_samples_table(
                        phen2, root='sa.pheno.isCase'))

        # The second covariate is passed as an expression ('... + 1 - 1')
        # rather than a bare annotation path.
        (regression.linreg(['sa.pheno.Pheno'],
                           'g.GT.nNonRefAlleles()',
                           covariates=['sa.cov.Cov1',
                                       'sa.cov.Cov2 + 1 - 1']).count())

        (regression.logreg('wald',
                           'sa.pheno.isCase',
                           'g.GT.nNonRefAlleles()',
                           covariates=['sa.cov.Cov1',
                                       'sa.cov.Cov2 + 1 - 1']).count())

        # Derive sa.pheno.PhenoLMM from the genotype at variant 1:1:C:T and
        # the two covariates; used by lmmreg below.
        vds_assoc = (regression.annotate_samples_expr(
            'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.GT.gt).collect()[0]'
        ).annotate_samples_expr(
            'sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit'
        ))

        # Inputs for the SKAT calls.
        covariatesSkat = hc.import_table(test_resources + "/skat.cov",
                                         impute=True).key_by("Sample")

        phenotypesSkat = (hc.import_table(test_resources + "/skat.pheno",
                                          types={
                                              "Pheno": TFloat64()
                                          },
                                          missing="0").key_by("Sample"))

        intervalsSkat = KeyTable.import_interval_list(test_resources +
                                                      "/skat.interval_list")

        weightsSkat = (hc.import_table(test_resources + "/skat.weights",
                                       types={
                                           "locus": TLocus(),
                                           "weight": TFloat64()
                                       }).key_by("locus"))

        # The final annotate recodes sa.pheno: 1.0 -> false, 2.0 -> true,
        # anything else -> missing.
        skatVds = (vds2.split_multi_hts().annotate_variants_table(
            intervalsSkat, root="va.gene").annotate_variants_table(
                weightsSkat, root="va.weight").annotate_samples_table(
                    phenotypesSkat, root="sa.pheno").annotate_samples_table(
                        covariatesSkat, root="sa.cov").annotate_samples_expr(
                            "sa.pheno = if (sa.pheno == 1.0) false else " +
                            "if (sa.pheno == 2.0) true else NA: Boolean"))

        (skatVds.skat(key_expr='va.gene',
                      weight_expr='va.weight',
                      y='sa.pheno',
                      x='g.GT.nNonRefAlleles()',
                      covariates=['sa.cov.Cov1', 'sa.cov.Cov2'],
                      logistic=False).count())

        (skatVds.skat(key_expr='va.gene',
                      weight_expr='va.weight',
                      y='sa.pheno',
                      x='plDosage(g.PL)',
                      covariates=['sa.cov.Cov1', 'sa.cov.Cov2'],
                      logistic=True).count())

        vds_kinship = vds_assoc.filter_variants_expr('v.start < 4')

        km = vds_kinship.rrm(False, False)

        # Remove any existing LD matrix directory before writing, then read
        # the matrix back.
        ld_matrix_path = '/tmp/ldmatrix'
        ldMatrix = vds_kinship.ld_matrix()
        if os.path.isdir(ld_matrix_path):
            shutil.rmtree(ld_matrix_path)
        ldMatrix.write(ld_matrix_path)
        LDMatrix.read(ld_matrix_path).to_local_matrix()

        vds_assoc = vds_assoc.lmmreg(km, 'sa.pheno.PhenoLMM',
                                     'g.GT.nNonRefAlleles()',
                                     ['sa.cov.Cov1', 'sa.cov.Cov2'])

        vds_assoc.variants_table().select(['Variant = v', 'va.lmmreg.*'
                                           ]).export('/tmp/lmmreg.tsv')

        # mendel_errors returns four tables; the last two are keyed by
        # sample and by variant respectively.
        men, fam, ind, var = sample_split.mendel_errors(
            Pedigree.read(test_resources + '/sample.fam'))
        men.select(['fid', 's', 'code'])
        fam.select(['father', 'nChildren'])
        self.assertEqual(ind.key, ['s'])
        self.assertEqual(var.key, ['v'])
        sample_split.annotate_variants_table(var, root='va.mendel').count()

        sample_split.pca_of_normalized_genotypes()

        sample_split.tdt(Pedigree.read(test_resources + '/sample.fam'))

        sample2_split.variant_qc().variant_schema

        # Export / re-import round trips, this time on sample2.
        sample2.variants_table().export('/tmp/variants.tsv')
        self.assertTrue(
            (sample2.variants_table().annotate('va = json(va)')).same(
                hc.import_table('/tmp/variants.tsv', impute=True).key_by('v')))

        sample2.samples_table().export('/tmp/samples.tsv')
        self.assertTrue(
            (sample2.samples_table().annotate('s = s, sa = json(sa)')).same(
                hc.import_table('/tmp/samples.tsv', impute=True).key_by('s')))

        # NOTE(review): unlike the loop above, the sample id is not wrapped
        # in backticks in these column expressions — confirm that is intended.
        cols = ['v = v', 'info = va.info']
        for s in sample2.sample_ids:
            cols.append('{s}.gt = va.G["{s}"].gt'.format(s=s))
            cols.append('{s}.gq = va.G["{s}"].gq'.format(s=s))

        sample2_table = (sample2.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s, gt: g.GT, gq: g.GQ }).collect(), s)'
        ).variants_table().select(cols))

        sample2_table.export('/tmp/sample_kt.tsv')
        sample2_typs = {fd.name: fd.typ for fd in sample2_table.schema.fields}

        self.assertTrue((sample2.make_table(
            'v = v, info = va.info', 'gt = g.GT, gq = g.GQ', ['v'])).same(
                hc.import_table('/tmp/sample_kt.tsv',
                                types=sample2_typs).key_by('v')))

        sample_split.annotate_variants_expr(
            "va.nHet = gs.filter(g => g.GT.isHet()).count()")

        sample2.make_table('v = v, info = va.info', 'gt = g.GT', ['v'])

        sample.num_partitions()
        sample.file_version()
        sample.sample_ids[:5]

        sample2.filter_alleles_hts('pcoin(0.5)')

        sample_split.ld_prune(8).variants_table().select('v').export(
            "/tmp/testLDPrune.tsv")
        kt = (sample2.variants_table().annotate("v2 = v").key_by(["v", "v2"]))
        sample2.annotate_variants_table(kt, root="va.foo", vds_key=["v", "v"])

        self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

        # `variants_py` is not read again in this method.
        variants_py = (sample.annotate_variants_expr(
            'va.hets = gs.filter(g => g.GT.isHet).collect()').variants_table().
                       take(5))

        VariantDataset.from_table(sample.variants_table())
Exemple #9
0
    def test_dataset(self):
        """Exercise the dataset API on the sample VCFs.

        The first part runs the same sequence of calls on two pairs of
        datasets imported from ``sample.vcf`` and ``sample2.vcf``: once with
        the default import and once with ``generic=True``.  The second part
        runs further calls (concordance, exports, regressions, mendel errors,
        PCA, TDT, ...) on default imports only.

        Most calls are made only to check that they do not raise; the results
        that are actually verified go through ``self.assert*``.  Intermediate
        files are written under ``/tmp``.
        """
        test_resources = 'src/test/resources'

        vds = hc.import_vcf(test_resources + '/sample.vcf')
        vds2 = hc.import_vcf(test_resources + '/sample2.vcf')
        gds = hc.import_vcf(test_resources + '/sample.vcf', generic=True)
        gds2 = hc.import_vcf(test_resources + '/sample2.vcf', generic=True)

        for dataset, dataset2 in [(vds, vds2), (gds, gds2)]:

            # Sub-expression the genotype methods (isCalled, isHet, ...) are
            # invoked on inside the expression strings built below.
            if dataset._is_generic_genotype:
                gt = 'g.GT'
            else:
                gt = 'g'

            dataset.cache()
            dataset2.persist()

            dataset.write('/tmp/sample.vds', overwrite=True)

            dataset.count()

            dataset.query_variants(['variants.count()'])
            dataset.query_samples(['samples.count()'])

            (dataset.annotate_samples_expr(
                'sa.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(
                    gt)).export_samples('/tmp/sa.tsv',
                                        's = s, nCalled = sa.nCalled'))

            # annotate_global_expr is called with a single expression and
            # with a list of expressions.
            dataset.annotate_global_expr('global.foo = 5')
            dataset.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

            # From here on `dataset` refers to the sample-annotated dataset.
            dataset = dataset.annotate_samples_table(
                hc.import_table(test_resources +
                                '/sampleAnnotations.tsv').key_by('Sample'),
                expr=
                'sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

            (dataset.annotate_variants_expr(
                'va.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(
                    gt)).count())

            loci_tb = (
                hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                    'locus = Locus(chr, pos.toInt())').key_by('locus'))
            (dataset.annotate_variants_table(loci_tb,
                                             root='va.locus_annot').count())

            variants_tb = (hc.import_table(
                test_resources + '/variantAnnotations.tsv').annotate(
                    'variant = Variant(Chromosome, Position.toInt(), Ref, Alt)'
                ).key_by('variant'))
            (dataset.annotate_variants_table(variants_tb,
                                             root='va.table').count())

            (dataset.annotate_variants_vds(
                dataset, expr='va.good = va.info.AF == vds.info.AF').count())

            downsampled = dataset.sample_variants(0.10)
            downsampled.export_variants('/tmp/sample2_loci.tsv',
                                        'chr = v.contig, pos = v.start')
            downsampled.export_variants('/tmp/sample2_variants.tsv', 'v')

            with open(test_resources + '/sample2.sample_list') as f:
                samples = [s.strip() for s in f]
            # NOTE(review): the comparison result is discarded, so this line
            # only checks that filter_samples_list/count run.  Confirm whether
            # an assertion on 56 was intended before turning it into one.
            (dataset.filter_samples_list(samples).count()[0] == 56)

            # NOTE(review): the next three statements repeat the locus-table,
            # variant-table and VDS annotations already made earlier in this
            # iteration.
            locus_tb = (
                hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                    'locus = Locus(chr, pos.toInt())').key_by('locus'))

            (dataset.annotate_variants_table(locus_tb,
                                             root='va.locus_annot').count())

            tb = (hc.import_table(
                test_resources + '/variantAnnotations.tsv').annotate(
                    'variant = Variant(Chromosome, Position.toInt(), Ref, Alt)'
                ).key_by('variant'))
            (dataset.annotate_variants_table(tb, root='va.table').count())

            (dataset.annotate_variants_vds(
                dataset, expr='va.good = va.info.AF == vds.info.AF').count())

            dataset.export_vcf('/tmp/sample2.vcf.bgz')

            self.assertEqual(dataset.drop_samples().count()[0], 0)
            self.assertEqual(dataset.drop_variants().count()[1], 0)

            # The same VCF is imported twice; after deduplicate() the variant
            # count is expected to be 735.
            dataset_dedup = (hc.import_vcf([
                test_resources + '/sample2.vcf',
                test_resources + '/sample2.vcf'
            ]).deduplicate())
            self.assertEqual(dataset_dedup.count()[1], 735)

            (dataset.filter_samples_expr('pcoin(0.5)').export_samples(
                '/tmp/sample2.sample_list', 's'))

            (dataset.filter_variants_expr('pcoin(0.5)').export_variants(
                '/tmp/sample2.variant_list', 'v'))

            (dataset.filter_variants_table(
                KeyTable.import_interval_list(
                    test_resources + '/annotinterall.interval_list')).count())

            # filter_intervals is called with a single interval and with a
            # list of intervals.  The list is built eagerly so the argument is
            # a real list on Python 3 as well (map() is lazy there).
            dataset.filter_intervals(Interval.parse('1:100-end')).count()
            dataset.filter_intervals(
                [Interval.parse(i) for i in ['1:100-end', '3-22']]).count()

            (dataset.filter_variants_table(
                KeyTable.import_interval_list(
                    test_resources + '/annotinterall.interval_list')).count())

            self.assertEqual(
                dataset2.filter_variants_table(
                    hc.import_table(test_resources + '/sample2_variants.tsv',
                                    key='f0',
                                    impute=True,
                                    no_header=True)).count()[1], 21)

            # Old-name -> new-name mapping read from a two-column,
            # header-less table (columns f0 and f1).
            m2 = {
                r.f0: r.f1
                for r in hc.import_table(test_resources +
                                         '/sample2_rename.tsv',
                                         no_header=True).collect()
            }
            self.assertEqual(
                dataset2.join(dataset2.rename_samples(m2)).count()[0], 200)

            dataset._typecheck()

            # Export/import round trips: the exported TSV, re-imported and
            # keyed, must be the same as the in-memory table.
            dataset.export_variants('/tmp/variants.tsv', 'v = v, va = va')
            self.assertTrue(
                (dataset.variants_table().annotate('va = json(va)')).same(
                    hc.import_table('/tmp/variants.tsv',
                                    impute=True).key_by('v')))

            dataset.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
            self.assertTrue((
                dataset.samples_table().annotate('s = s, sa = json(sa)')).same(
                    hc.import_table('/tmp/samples.tsv',
                                    impute=True).key_by('s')))

            # Field names differ in case between the two representations.
            # gt_string is used as make_table genotype expressions,
            # gt_string2 as struct fields inside the annotation expression.
            if dataset._is_generic_genotype:
                gt_string = 'gt = g.GT, gq = g.GQ'
                gt_string2 = 'gt: g.GT, gq: g.GQ'
            else:
                gt_string = 'gt = g.gt, gq = g.gq'
                gt_string2 = 'gt: g.gt, gq: g.gq'

            # One "<sample>.gt" and "<sample>.gq" export column per sample.
            cols = ['v = v, info = va.info']
            for s in dataset.sample_ids:
                cols.append(
                    '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(
                        s=s))

            (dataset.annotate_variants_expr(
                'va.G = index(gs.map(g => { s: s, %s }).collect(), s)' %
                gt_string2).export_variants('/tmp/sample_kt.tsv',
                                            ','.join(cols)))

            # NOTE(review): the result of same() is not asserted here.
            ((dataset.make_table(
                'v = v, info = va.info', gt_string, ['v'])).same(
                    hc.import_table('/tmp/sample_kt.tsv').key_by('v')))

            dataset.annotate_variants_expr(
                "va.nHet = gs.filter(g => {0}.isHet()).count()".format(gt))

            # aggregate_by_key is called with plain strings and with lists.
            dataset.aggregate_by_key(
                "Variant = v",
                "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(
                    gt))
            dataset.aggregate_by_key(["Variant = v"], [
                "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(
                    gt)
            ])

            dataset.make_table('v = v, info = va.info', 'gt = {0}'.format(gt),
                               ['v'])

            dataset.num_partitions()
            dataset.file_version()
            dataset.sample_ids[:5]
            dataset.variant_schema
            dataset.sample_schema

            self.assertEqual(dataset2.num_samples, 100)
            self.assertEqual(dataset2.count_variants(), 735)

            dataset.annotate_variants_table(dataset.variants_table(),
                                            root="va")

            # Table keyed by two copies of the variant, joined back using the
            # dataset variant for both key columns.
            kt = (dataset.variants_table().annotate("v2 = v").key_by(
                ["v", "v2"]))

            dataset.annotate_variants_table(kt,
                                            root='va.foo',
                                            vds_key=["v", "v"])

            self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

            dataset.genotypes_table()

            ## This is very slow!!!
            # The collected rows are not inspected; the call is only run.
            (dataset.annotate_variants_expr(
                'va.hets = gs.filter(g => {0}.isHet()).collect()'.format(
                    gt)).variants_table().filter('pcoin(0.1)').collect())

            if dataset._is_generic_genotype:
                expr = 'g.GT.isHet() && g.GQ > 20'
            else:
                expr = 'g.isHet() && g.gq > 20'

            (dataset.filter_genotypes(expr).export_genotypes(
                '/tmp/sample2_genotypes.tsv',
                'v, s, {0}.nNonRefAlleles()'.format(gt)))

            self.assertTrue((dataset.repartition(16,
                                                 shuffle=False).same(dataset)))

            print(dataset.storage_level())
            dataset.unpersist()
            dataset2.unpersist()

        # ---- Calls exercised on default (non-generic) imports only ----
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample.cache()

        sample.summarize().report()

        sample_split = sample.split_multi()

        sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
        sample2.persist()

        sample2_split = sample2.split_multi()

        sample.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
        sample.annotate_alleles_expr(
            ['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()

        glob, concordance1, concordance2 = (
            sample2_split.concordance(sample2_split))
        print(glob[1][4])
        print(glob[4][0])
        print(glob[:][3])
        # Both tables go to the same path; with overwrite=True the second
        # write presumably replaces the first.
        concordance1.write('/tmp/foo.kt', overwrite=True)
        concordance2.write('/tmp/foo.kt', overwrite=True)

        sample2_split.export_gen('/tmp/sample2.gen', 5)
        sample2_split.export_plink('/tmp/sample2')

        sample2.filter_multi().count()

        sample2.split_multi().grm().export_gcta_grm_bin('/tmp/sample2.grm')

        sample2.hardcalls().count()

        sample2_split.ibd(min=0.2, max=0.6)

        sample2.split_multi().impute_sex().variant_schema

        self.assertIsInstance(sample2.genotype_schema, TGenotype)

        m2 = {
            r.f0: r.f1
            for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                     no_header=True,
                                     impute=True).collect()
        }
        self.assertEqual(
            sample2.join(sample2.rename_samples(m2)).count()[0], 200)

        # Covariate and phenotype tables for the regression calls below.
        cov = hc.import_table(test_resources + '/regressionLinear.cov',
                              types={
                                  'Cov1': TDouble(),
                                  'Cov2': TDouble()
                              }).key_by('Sample')

        phen1 = hc.import_table(test_resources + '/regressionLinear.pheno',
                                missing='0',
                                types={
                                    'Pheno': TDouble()
                                }).key_by('Sample')
        phen2 = hc.import_table(test_resources +
                                '/regressionLogisticBoolean.pheno',
                                missing='0',
                                types={
                                    'isCase': TBoolean()
                                }).key_by('Sample')

        regression = (hc.import_vcf(
            test_resources +
            '/regressionLinear.vcf').split_multi().annotate_samples_table(
                cov, root='sa.cov').annotate_samples_table(
                    phen1, root='sa.pheno.Pheno').annotate_samples_table(
                        phen2, root='sa.pheno.isCase'))

        # The second covariate is passed as an expression, not as a plain
        # annotation path.
        (regression.linreg('sa.pheno.Pheno',
                           covariates=['sa.cov.Cov1',
                                       'sa.cov.Cov2 + 1 - 1']).count())

        (regression.logreg('wald',
                           'sa.pheno.isCase',
                           covariates=['sa.cov.Cov1',
                                       'sa.cov.Cov2 + 1 - 1']).count())

        # Build the phenotype used by lmmreg from the genotype at one fixed
        # variant and the two covariates.
        vds_assoc = (regression.annotate_samples_expr(
            'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.gt).collect()[0]'
        ).annotate_samples_expr(
            'sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit'
        ))

        vds_kinship = vds_assoc.filter_variants_expr('v.start < 4')

        km = vds_kinship.rrm(False, False)
        # The LD matrix is computed only to check that the call runs.
        vds_kinship.ld_matrix()
        vds_assoc = vds_assoc.lmmreg(km, 'sa.pheno.PhenoLMM',
                                     ['sa.cov.Cov1', 'sa.cov.Cov2'])

        vds_assoc.export_variants('/tmp/lmmreg.tsv',
                                  'Variant = v, va.lmmreg.*')

        men, fam, ind, var = sample_split.mendel_errors(
            Pedigree.read(test_resources + '/sample.fam'))
        men.select(['fid', 's', 'code'])
        fam.select(['father', 'nChildren'])
        self.assertEqual(ind.key, ['s'])
        self.assertEqual(var.key, ['v'])
        sample_split.annotate_variants_table(var, root='va.mendel').count()

        sample_split.pca('sa.scores')

        sample_split.tdt(Pedigree.read(test_resources + '/sample.fam'))

        sample2_split.variant_qc().variant_schema

        # Export/import round trips, as in the loop above.
        sample2.export_variants('/tmp/variants.tsv', 'v = v, va = va')
        self.assertTrue(
            (sample2.variants_table().annotate('va = json(va)')).same(
                hc.import_table('/tmp/variants.tsv', impute=True).key_by('v')))

        sample2.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
        self.assertTrue(
            (sample2.samples_table().annotate('s = s, sa = json(sa)')).same(
                hc.import_table('/tmp/samples.tsv', impute=True).key_by('s')))

        cols = ['v = v, info = va.info']
        for s in sample2.sample_ids:
            cols.append(
                '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(s=s))

        (sample2.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s, gt: g.gt, gq: g.gq }).collect(), s)'
        ).export_variants('/tmp/sample_kt.tsv', ','.join(cols)))

        # NOTE(review): the result of same() is not asserted here.
        ((sample2.make_table(
            'v = v, info = va.info', 'gt = g.gt, gq = g.gq',
            ['v'])).same(hc.import_table('/tmp/sample_kt.tsv').key_by('v')))

        sample_split.annotate_variants_expr(
            "va.nHet = gs.filter(g => g.isHet()).count()")

        sample_split.aggregate_by_key(
            "Variant = v",
            "nHet = g.map(g => g.isHet().toInt()).sum().toLong()")
        sample_split.aggregate_by_key(
            ["Variant = v"],
            ["nHet = g.map(g => g.isHet().toInt()).sum().toLong()"])

        sample2.make_table('v = v, info = va.info', 'gt = g.gt', ['v'])

        sample.num_partitions()
        sample.file_version()
        sample.sample_ids[:5]

        self.assertFalse(sample2.was_split())

        self.assertTrue(sample_split.was_split())

        sample2.filter_alleles('pcoin(0.5)')

        # `gds` is the generic import of sample.vcf created at the top.
        gds.annotate_genotypes_expr('g = g.GT.toGenotype()').split_multi()

        sample_split.ld_prune().export_variants("/tmp/testLDPrune.tsv", "v")
        kt = (sample2.variants_table().annotate("v2 = v").key_by(["v", "v2"]))
        sample2.annotate_variants_table(kt, root="va.foo", vds_key=["v", "v"])

        self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

        # The collected rows are not inspected; the call is only run.
        (sample.annotate_variants_expr(
            'va.hets = gs.filter(g => g.isHet).collect()').variants_table().
         collect())

        VariantDataset.from_table(sample.variants_table())
Exemple #10
0
    def test_keytable(self):
        """Exercise the KeyTable API on the sample annotation tables.

        Imports two TSV files as key tables keyed by ``Sample`` and runs
        through export, filter, annotate, join, aggregate, forall/exists,
        rename, select, re-key, flatten, expand_types, explode, DataFrame
        round trip, write/read and ``order_by``.  Calls whose result is not
        passed to an assertion are made only to check that they do not raise.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_keytable(test_resources + '/sampleAnnotations.tsv',
                                config=TextTableConfig(impute=True)).key_by('Sample')
        kt2 = hc.import_keytable(test_resources + '/sampleAnnotations2.tsv',
                                 config=TextTableConfig(impute=True)).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key_names[0], "Sample")
        self.assertEqual(kt.column_names[2], "qPhen")
        self.assertEqual(kt.count_rows(), 100)
        kt.schema

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same
        # The two filters are expected to yield the same table; presumably
        # the second positional argument is the keep flag -- TODO confirm.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status')
         .count_rows())

        # Join
        kt.join(kt2, 'left').count_rows()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()")
         .count_rows())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: by mapping, by full list of names, by derived list
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.column_names])

        kt.select(["Sample"])
        kt.select(["Sample", "Status"])

        # Re-key with two key columns and with no key at all
        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe()

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # DataFrame round trip: key table -> DataFrame -> key table must give
        # back the same table.
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_keytable()
                           .annotate('v = str(v), va.filters = va.filters.toArray()')
                           .flatten())

        sample_variants2 = hc.dataframe_to_keytable(
            sample_variants.to_dataframe(), ['v'])
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (not assertTrue on ==) so a failure reports the count.
        self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

        # Write/read round trip
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Some rows omit 'a' or 'b'; the expected lists below show those
        # fields as None and place them last for both sort directions.
        schema = TStruct(['a', 'b'], [TInt(), TString()])
        rows = [{'a': 5},
                {'a': 5, 'b': 'quam'},
                {'a': -1, 'b': 'quam'},
                {'b': 'foo'},
                {'a': 7, 'b': 'baz'}]
        kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b')) for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])