コード例 #1
0
ファイル: tests.py プロジェクト: harlixxy/hail
    def test_keytable(self):
        """Smoke-test the KeyTable API using the sample annotation tables.

        Most calls are only executed (their results are discarded); a few
        invariants about the fixture table are asserted explicitly.
        """
        resource_dir = 'src/test/resources'

        # Import both annotation tables, passing 'Sample' as the key.
        # columns: Sample Status qPhen
        annotations_path = resource_dir + '/sampleAnnotations.tsv'
        table = hc.import_keytable(
            annotations_path, 'Sample', config=TextTableConfig(impute=True))
        annotations2_path = resource_dir + '/sampleAnnotations2.tsv'
        other_table = hc.import_keytable(
            annotations2_path, 'Sample', config=TextTableConfig(impute=True))

        # Basic metadata of the first table.
        self.assertEqual(table.num_columns, 3)
        key_names = table.key_names
        self.assertEqual(key_names[0], "Sample")
        column_names = table.column_names
        self.assertEqual(column_names[2], "qPhen")
        row_count = table.count_rows()
        self.assertEqual(row_count, 100)
        table.schema

        # Export
        table.export('/tmp/testExportKT.tsv')

        # Filter, Same: the two filter calls are expected to yield equal tables.
        filtered_case = table.filter('Status == "CASE"', True)
        filtered_ctrl = table.filter('Status == "CTRL"', False)
        self.assertTrue(filtered_case.same(filtered_ctrl))

        # Annotate
        annotated = table.annotate('X = Status')
        annotated.count_rows()

        # Join
        joined = table.join(other_table, 'left')
        joined.count_rows()

        # AggregateByKey
        aggregated = table.aggregate_by_key("Status = Status",
                                            "Sum = qPhen.sum()")
        aggregated.count_rows()

        # Forall, Exists
        every_row_is_case = table.forall('Status == "CASE"')
        self.assertFalse(every_row_is_case)
        some_row_is_case = table.exists('Status == "CASE"')
        self.assertTrue(some_row_is_case)

        # Rename: by mapping, by explicit list, and by derived list.
        table.rename({"Sample": "ID"})
        table.rename(["Field1", "Field2", "Field3"])
        suffixed_names = []
        for column in table.column_names:
            suffixed_names.append(column + "_a")
        table.rename(suffixed_names)

        # Select
        for selection in (["Sample"], ["Sample", "Status"]):
            table.select(selection)

        # Re-key, including the empty key.
        for key_columns in (['Sample', 'Status'], []):
            table.key_by(key_columns)

        table.flatten()
        table.expand_types()

        table.to_dataframe()

        with_array_field = table.annotate("newField = [0, 1, 2]")
        with_array_field.explode(["newField"])

        # Round-trip a flattened variants table through a dataframe and
        # check that it compares equal to the original.
        vcf = hc.import_vcf(resource_dir + '/sample.vcf')
        variant_table = vcf.variants_keytable()
        variant_table = variant_table.annotate(
            'v = str(v), va.filters = va.filters.toArray')
        flat_variants = variant_table.flatten()

        as_dataframe = flat_variants.to_dataframe()
        round_tripped = hc.dataframe_to_keytable(as_dataframe, ['v'])
        self.assertTrue(flat_variants.same(round_tripped))
コード例 #2
0
ファイル: tests.py プロジェクト: IsmailM/hail
    def test_dataset(self):
        """Smoke-test a broad set of dataset methods on the sample VCF fixtures.

        Most calls are executed only to check that they run; only a handful
        of results are asserted. Several steps write files under /tmp that
        later steps read back, so statement order matters.
        """
        test_resources = 'src/test/resources'

        # Load both VCF fixtures and keep a split_multi() version of each.
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample.cache()

        sample_split = sample.split_multi()

        sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
        sample2.persist()

        sample2_split = sample2.split_multi()

        # Aggregation and query entry points.
        sample2.aggregate_intervals(
            test_resources + '/annotinterall.interval_list',
            'N = variants.count()', '/tmp/annotinter.tsv')

        sample2.query_variants(['variants.count()'])

        sample2.query_samples(['samples.count()'])

        # Global annotations (results are accessed via .globals and discarded).
        (sample2.annotate_global_list(test_resources + '/global_list.txt',
                                      'global.genes',
                                      as_set=True).globals)

        (sample2.annotate_global_table(test_resources + '/global_table.tsv',
                                       'global.genes').globals)

        # Sample annotations.
        (sample2.annotate_samples_expr(
            'sa.nCalled = gs.filter(g => g.isCalled()).count()').
         export_samples('/tmp/sa.tsv', 's = s, nCalled = sa.nCalled'))

        sample2.annotate_samples_list(test_resources + '/sample2.sample_list',
                                      'sa.listed')

        # annotate_global_expr is called with a single string and with a list.
        sample2.annotate_global_expr('global.foo = 5')
        sample2.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

        sample2_annot = sample2.annotate_samples_table(
            test_resources + '/sampleAnnotations.tsv',
            'Sample',
            code='sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

        sample2.annotate_samples_vds(sample2_annot,
                                     code='sa.isCase = vds.isCase')

        # Variant annotations; each chain ends in count() to run it.
        (sample.annotate_variants_bed(
            test_resources + '/example1.bed',
            root='va.bed').filter_variants_expr('va.bed').count())

        sample.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
        sample.annotate_alleles_expr(
            ['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()

        (sample2.annotate_variants_expr(
            'va.nCalled = gs.filter(g => g.isCalled()).count()').count())

        (sample2.annotate_variants_intervals(test_resources +
                                             '/annotinterall.interval_list',
                                             'va.included',
                                             all=True).count())

        (sample2.annotate_variants_loci(test_resources + '/sample2_loci.tsv',
                                        'Locus(chr, pos.toInt())',
                                        'va.locus_annot').count())

        (sample.annotate_variants_table(
            test_resources + '/variantAnnotations.tsv',
            'Variant(Chromosome, Position.toInt(), Ref, Alt)',
            root='va.table').count())

        (sample2.annotate_variants_vds(
            sample2, code='va.good = va.info.AF == vds.info.AF').count())

        # Concordance of the split dataset with itself. Both result datasets
        # are written to the same path with overwrite=True, so the second
        # write presumably replaces the first.
        glob, concordance1, concordance2 = (
            sample2_split.concordance(sample2_split))
        print(glob[1][4])
        print(glob[4][0])
        print(glob[:][3])
        concordance1.write('/tmp/foo.vds', overwrite=True)
        concordance2.write('/tmp/foo.vds', overwrite=True)

        # Downsample and export the retained variants in two formats.
        downsampled = sample2.downsample_variants(20)
        downsampled.export_variants('/tmp/sample2_loci.tsv',
                                    'chr = v.contig, pos = v.start')
        downsampled.export_variants('/tmp/sample2_variants.tsv', 'v')

        # NOTE(review): the result of this `== 56` comparison is discarded, so
        # nothing is asserted here -- confirm whether an assertEqual was
        # intended.
        (sample2.filter_samples_list(
            test_resources + '/sample2.sample_list').count()['nSamples'] == 56)

        # Export formats.
        sample2_split.export_gen('/tmp/sample2.gen')

        (sample2.filter_genotypes('g.isHet() && g.gq > 20').export_genotypes(
            '/tmp/sample2_genotypes.tsv', 'v, s, g.nNonRefAlleles()'))

        sample2_split.export_plink('/tmp/sample2')

        sample2.export_vcf('/tmp/sample2.vcf.bgz')

        sample2.filter_multi().count()

        # Dropping all samples / variants must leave zero of each.
        self.assertEqual(sample2.drop_samples().count()['nSamples'], 0)

        self.assertEqual(sample2.drop_variants().count()['nVariants'], 0)

        # Importing the same VCF twice and deduplicating is expected to leave
        # 735 variants.
        sample2_dedup = (hc.import_vcf(
            [test_resources + '/sample2.vcf',
             test_resources + '/sample2.vcf']).deduplicate())
        self.assertEqual(sample2_dedup.count()['nVariants'], 735)

        # Random filters (pcoin) followed by exports.
        (sample2.filter_samples_expr('pcoin(0.5)').export_samples(
            '/tmp/sample2.sample_list', 's'))

        (sample2.filter_variants_expr('pcoin(0.5)').export_variants(
            '/tmp/sample2.variant_list', 'v'))

        # Interval filtering with the different ways of building the
        # interval argument: read from file, a single parsed Interval,
        # parse_all, and an IntervalTree built from a list.
        (sample2.filter_variants_intervals(
            IntervalTree.read(test_resources +
                              '/annotinterall.interval_list')).count())

        sample2.filter_variants_intervals(Interval.parse('1:100-end')).count()
        sample2.filter_variants_intervals(
            IntervalTree.parse_all(['1:100-end', '3-22'])).count()
        sample2.filter_variants_intervals(
            IntervalTree([Interval.parse('1:100-end')])).count()

        (sample2.filter_variants_intervals(
            IntervalTree.read(test_resources +
                              '/annotinterall.interval_list')).count())

        self.assertEqual(
            sample2.filter_variants_list(
                test_resources + '/sample2_variants.tsv').count()['nVariants'],
            21)

        # Assorted analysis methods, executed for their side effects only.
        sample2.split_multi().grm('/tmp/sample2.grm', 'gcta-grm-bin')

        sample2.hardcalls().count()

        sample2_split.ibd(min=0.2, max=0.6)

        sample2.split_multi().impute_sex().variant_schema

        self.assertTrue(isinstance(sample2.genotype_schema, TGenotype))

        # Build a mapping from the first to the second column of the
        # header-less rename table (row fields _0 and _1), then join the
        # dataset with its renamed copy and check the combined sample count.
        m2 = {
            r._0: r._1
            for r in hc.import_keytable(test_resources + '/sample2_rename.tsv',
                                        config=TextTableConfig(
                                            noheader=True)).collect()
        }
        self.assertEqual(
            sample2.join(sample2.rename_samples(m2)).count()['nSamples'], 200)

        # Regression fixture: annotate covariates and two phenotypes, then
        # run linear and logistic regression.
        linreg = (hc.import_vcf(
            test_resources +
            '/regressionLinear.vcf').split_multi().annotate_samples_table(
                test_resources + '/regressionLinear.cov',
                'Sample',
                root='sa.cov',
                config=TextTableConfig(types='Cov1: Double, Cov2: Double')
            ).annotate_samples_table(
                test_resources + '/regressionLinear.pheno',
                'Sample',
                code='sa.pheno.Pheno = table.Pheno',
                config=TextTableConfig(
                    types='Pheno: Double',
                    missing='0')).annotate_samples_table(
                        test_resources + '/regressionLogisticBoolean.pheno',
                        'Sample',
                        code='sa.pheno.isCase = table.isCase',
                        config=TextTableConfig(types='isCase: Boolean',
                                               missing='0')))

        (linreg.linreg('sa.pheno.Pheno',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

        (linreg.logreg('wald',
                       'sa.pheno.isCase',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

        # Mixed-model regression: build a phenotype from the genotype at one
        # variant plus random covariates, use the AF-filtered variants as the
        # kinship dataset, and export the lmmreg annotations.
        vds_assoc = (hc.import_vcf(test_resources + '/sample.vcf').split_multi(
        ).variant_qc().annotate_samples_expr(
            'sa.culprit = gs.filter(g => v == Variant("20", 13753124, "A", "C")).map(g => g.gt).collect()[0]'
        ).annotate_samples_expr('sa.pheno = rnorm(1,1) * sa.culprit').
                     annotate_samples_expr('sa.cov1 = rnorm(0,1)').
                     annotate_samples_expr('sa.cov2 = rnorm(0,1)'))

        vds_kinship = vds_assoc.filter_variants_expr('va.qc.AF > .05')

        vds_assoc = vds_assoc.lmmreg(vds_kinship, 'sa.pheno',
                                     ['sa.cov1', 'sa.cov2'])

        vds_assoc.export_variants('/tmp/lmmreg.tsv',
                                  'Variant = v, va.lmmreg.*')

        sample_split.mendel_errors('/tmp/sample.mendel',
                                   test_resources + '/sample.fam')

        sample_split.pca('sa.scores')

        # Repartitioning must not change the dataset contents.
        self.assertTrue((sample2.repartition(16, shuffle=False).same(sample2)))

        print(sample2.storage_level())

        sample_split.tdt(test_resources + '/sample.fam')

        sample2._typecheck()

        sample2_split.variant_qc().variant_schema

        # Export variants / samples to TSV and check that re-importing the
        # file gives the same key table as the in-memory one (with the
        # annotation struct rendered through json()).
        sample2.export_variants('/tmp/variants.tsv', 'v = v, va = va')
        self.assertTrue(
            (sample2.variants_keytable().annotate('va = json(va)')).same(
                hc.import_keytable('/tmp/variants.tsv', ['v'],
                                   config=TextTableConfig(impute=True))))

        sample2.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
        self.assertTrue(
            (sample2.samples_keytable().annotate('s = s.id, sa = json(sa)')
             ).same(
                 hc.import_keytable('/tmp/samples.tsv', ['s'],
                                    config=TextTableConfig(impute=True))))

        # Build one pair of export columns (gt, gq) per sample id, export
        # that wide table, and compare it against make_keytable().
        cols = ['v = v, info = va.info']
        for s in sample2.sample_ids:
            cols.append(
                '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(s=s))

        (sample2.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s.id, gt: g.gt, gq: g.gq }).collect(), s)'
        ).export_variants('/tmp/sample_kt.tsv', ','.join(cols)))

        # NOTE(review): the boolean returned by same() is discarded here, so
        # this comparison is not asserted -- confirm whether that is intended.
        ((sample2.make_keytable('v = v, info = va.info',
                                'gt = g.gt, gq = g.gq', ['v'])).same(
                                    hc.import_keytable('/tmp/sample_kt.tsv',
                                                       ['v'])))

        sample_split.annotate_variants_expr(
            "va.nHet = gs.filter(g => g.isHet()).count()")

        # aggregate_by_key is called with plain strings and with lists.
        sample_split.aggregate_by_key(
            "Variant = v",
            "nHet = g.map(g => g.isHet().toInt()).sum().toLong()")
        sample_split.aggregate_by_key(
            ["Variant = v"],
            ["nHet = g.map(g => g.isHet().toInt()).sum().toLong()"])

        sample2.make_keytable('v = v, info = va.info', 'gt = g.gt', ['v'])

        # Simple accessors, executed only to check that they run.
        sample.num_partitions()
        sample.file_version()
        sample.sample_ids[:5]

        self.assertFalse(sample.was_split())
        self.assertTrue(sample_split.was_split())

        self.assertFalse(sample.is_dosage())

        self.assertEqual(sample.num_samples, 100)
        self.assertEqual(sample.count_variants(), 346)

        sample2.filter_alleles('pcoin(0.5)')

        # Annotate variants from a key table, once with the default key and
        # once with an explicit two-element key list.
        sample2.annotate_variants_keytable(sample2.variants_keytable(),
                                           "va.foo = table.va")

        kt = (sample2.variants_keytable().annotate("v2 = v").key_by(
            ["v", "v2"]))
        sample2.annotate_variants_keytable(kt, "va.foo = table.va", ["v", "v"])

        # variants_py is not used afterwards; the chain is run for collect().
        # NOTE(review): `g.isHet` has no parentheses here, unlike the other
        # expressions in this method that call `g.isHet()` -- confirm.
        variants_py = (sample.annotate_variants_expr(
            'va.hets = gs.filter(g => g.isHet).collect()').variants_keytable().
                       collect())
コード例 #3
0
    def test_dataset(self):
        """Smoke-test dataset methods on regular and generic=True VCF imports.

        The first part runs the same sequence of calls for each
        (dataset, dataset2) pair; the second part exercises methods on the
        regular imports only. Most calls are executed just to check that
        they run. Several steps write files under /tmp that later steps
        read back, so statement order matters.
        """
        test_resources = 'src/test/resources'
        
        vds = hc.import_vcf(test_resources + '/sample.vcf')
        vds2 = hc.import_vcf(test_resources + '/sample2.vcf')
        gds = hc.import_vcf(test_resources + '/sample.vcf', generic=True)
        gds2 = hc.import_vcf(test_resources + '/sample2.vcf', generic=True)

        for (dataset, dataset2) in [(vds, vds2), (gds, gds2)]:

            # Expression used to reach the genotype call inside the
            # expression strings below; it differs for generic datasets.
            if dataset._is_generic_genotype:
                gt = 'g.GT'
            else:
                gt = 'g'

            dataset.cache()
            dataset2.persist()

            dataset.write('/tmp/sample.vds', overwrite=True)

            dataset.count(genotypes=True)

            # Aggregation and query entry points.
            dataset.aggregate_intervals(test_resources + '/annotinterall.interval_list',
                                        'N = variants.count()',
                                        '/tmp/annotinter.tsv')

            dataset.query_variants(['variants.count()'])
            dataset.query_samples(['samples.count()'])

            # Global annotations (results accessed via .globals and discarded).
            dataset.annotate_global_list(test_resources + '/global_list.txt', 'global.genes', as_set=True).globals

            dataset.annotate_global_table(test_resources + '/global_table.tsv', 'global.genes').globals

            # Sample annotations.
            (dataset.annotate_samples_expr('sa.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(gt))
             .export_samples('/tmp/sa.tsv', 's = s, nCalled = sa.nCalled'))

            dataset.annotate_samples_list(test_resources + '/sample2.sample_list',
                                          'sa.listed')

            # annotate_global_expr is called with a string and with a list.
            dataset.annotate_global_expr('global.foo = 5')
            dataset.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

            dataset_annot = dataset.annotate_samples_table(
                test_resources + '/sampleAnnotations.tsv',
                'Sample',
                code='sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

            dataset.annotate_samples_vds(dataset_annot,
                                         code='sa.isCase = vds.isCase')
    
            # Variant annotations; each chain ends in count() to run it.
            (dataset.annotate_variants_bed(test_resources + '/example1.bed',
                                          root='va.bed')
             .filter_variants_expr('va.bed')
             .count())

            (dataset.annotate_variants_expr('va.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(gt))
             .count())

            (dataset.annotate_variants_intervals(test_resources + '/annotinterall.interval_list',
                                                 'va.included',
                                                 all=True)
             .count())

            (dataset.annotate_variants_loci(test_resources + '/sample2_loci.tsv',
                                            'Locus(chr, pos.toInt())',
                                            'va.locus_annot')
             .count())

            (dataset.annotate_variants_table(test_resources + '/variantAnnotations.tsv',
                                            'Variant(Chromosome, Position.toInt(), Ref, Alt)',
                                            root='va.table')
             .count())

            (dataset.annotate_variants_vds(dataset, code='va.good = va.info.AF == vds.info.AF')
             .count())

            # Downsample and export the retained variants in two formats.
            downsampled = dataset.downsample_variants(20)
            downsampled.export_variants('/tmp/sample2_loci.tsv', 'chr = v.contig, pos = v.start')
            downsampled.export_variants('/tmp/sample2_variants.tsv', 'v')

            # Read the sample ids (one per line, whitespace-stripped) and
            # filter by that list.
            # NOTE(review): the result of the `== 56` comparison is discarded,
            # so nothing is asserted here -- confirm whether that is intended.
            with open(test_resources + '/sample2.sample_list') as f:
                samples = [s.strip() for s in f]
            (dataset.filter_samples_list(samples)
             .count()['nSamples'] == 56)

            dataset.export_vcf('/tmp/sample2.vcf.bgz')

            # Dropping all samples / variants must leave zero of each.
            self.assertEqual(dataset.drop_samples().count()['nSamples'], 0)
            self.assertEqual(dataset.drop_variants().count()['nVariants'], 0)

            # This import does not depend on the loop variables: sample2.vcf
            # is loaded twice and deduplicated on every iteration.
            dataset_dedup = (hc.import_vcf([test_resources + '/sample2.vcf',
                                        test_resources + '/sample2.vcf'])
                         .deduplicate())
            self.assertEqual(dataset_dedup.count()['nVariants'], 735)

            # Random filters (pcoin) followed by exports.
            (dataset.filter_samples_expr('pcoin(0.5)')
             .export_samples('/tmp/sample2.sample_list', 's'))

            (dataset.filter_variants_expr('pcoin(0.5)')
             .export_variants('/tmp/sample2.variant_list', 'v'))

            # Interval filtering with the different ways of building the
            # interval argument.
            (dataset.filter_variants_intervals(IntervalTree.read(test_resources + '/annotinterall.interval_list'))
             .count())

            dataset.filter_variants_intervals(Interval.parse('1:100-end')).count()
            dataset.filter_variants_intervals(IntervalTree.parse_all(['1:100-end', '3-22'])).count()
            dataset.filter_variants_intervals(IntervalTree([Interval.parse('1:100-end')])).count()

            (dataset.filter_variants_intervals(IntervalTree.read(test_resources + '/annotinterall.interval_list'))
             .count())

            # These two checks use dataset2 (the sample2.vcf import).
            self.assertEqual(dataset2.filter_variants_list(
                test_resources + '/sample2_variants.tsv')
                             .count()['nVariants'], 21)

            # Map the first column of the header-less rename table to the
            # second (row fields _0 and _1) and join with the renamed copy.
            m2 = {r._0: r._1 for r in hc.import_keytable(test_resources + '/sample2_rename.tsv',
                                                         config=TextTableConfig(noheader=True))
                .collect()}
            self.assertEqual(dataset2.join(dataset2.rename_samples(m2))
                             .count()['nSamples'], 200)

            dataset._typecheck()

            # Export variants / samples to TSV and check that re-importing
            # and re-keying the file gives the same key table.
            dataset.export_variants('/tmp/variants.tsv', 'v = v, va = va')
            self.assertTrue((dataset.variants_keytable()
                             .annotate('va = json(va)'))
                            .same(hc.import_keytable('/tmp/variants.tsv', config=TextTableConfig(impute=True)).key_by('v')))

            dataset.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
            self.assertTrue((dataset.samples_keytable()
                             .annotate('s = s, sa = json(sa)'))
                            .same(hc.import_keytable('/tmp/samples.tsv', config=TextTableConfig(impute=True)).key_by('s')))

            # Genotype field expressions in "name = expr" form (gt_string)
            # and "name: expr" form (gt_string2) for the two dataset kinds.
            if dataset._is_generic_genotype:
                gt_string = 'gt = g.GT, gq = g.GQ'
                gt_string2 = 'gt: g.GT, gq: g.GQ'
            else:
                gt_string = 'gt = g.gt, gq = g.gq'
                gt_string2 = 'gt: g.gt, gq: g.gq'

            # One pair of export columns (gt, gq) per sample id.
            cols = ['v = v, info = va.info']
            for s in dataset.sample_ids:
                cols.append('{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(s=s))

            (dataset
             .annotate_variants_expr('va.G = index(gs.map(g => { s: s, %s }).collect(), s)' % gt_string2)
             .export_variants('/tmp/sample_kt.tsv', ','.join(cols)))

            # NOTE(review): the boolean returned by same() is discarded here,
            # so this comparison is not asserted -- confirm whether intended.
            ((dataset
              .make_keytable('v = v, info = va.info', gt_string, ['v']))
             .same(hc.import_keytable('/tmp/sample_kt.tsv').key_by('v')))

            dataset.annotate_variants_expr("va.nHet = gs.filter(g => {0}.isHet()).count()".format(gt))

            # aggregate_by_key is called with plain strings and with lists.
            dataset.aggregate_by_key("Variant = v", "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(gt))
            dataset.aggregate_by_key(["Variant = v"], ["nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(gt)])

            dataset.make_keytable('v = v, info = va.info', 'gt = {0}'.format(gt), ['v'])

            # Simple accessors, executed only to check that they run.
            dataset.num_partitions()
            dataset.file_version()
            dataset.sample_ids[:5]
            dataset.variant_schema
            dataset.sample_schema

            self.assertFalse(dataset.is_dosage())

            self.assertEqual(dataset2.num_samples, 100)
            self.assertEqual(dataset2.count_variants(), 735)

            # Annotate variants from a key table, once with the default key
            # and once with an explicit two-element key list.
            dataset.annotate_variants_keytable(dataset.variants_keytable(), "va.foo = table.va")

            kt = (dataset.variants_keytable()
                  .annotate("v2 = v")
                  .key_by(["v", "v2"]))

            dataset.annotate_variants_keytable(kt, "va.foo = table.va", ["v", "v"])

            # v2 was annotated as a copy of v, so every row must satisfy
            # v == v2.
            self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

            ## This is very slow!!!
            # variants_py is not used afterwards; the chain is run for
            # collect() on a pcoin(0.1)-filtered table.
            variants_py = (dataset
                           .annotate_variants_expr('va.hets = gs.filter(g => {0}.isHet()).collect()'.format(gt))
                           .variants_keytable()
                           .filter('pcoin(0.1)')
                           .collect())

            # Genotype filter expression for the two dataset kinds.
            if dataset._is_generic_genotype:
                expr = 'g.GT.isHet() && g.GQ > 20'
            else:
                expr = 'g.isHet() && g.gq > 20'

            (dataset.filter_genotypes(expr)
             .export_genotypes('/tmp/sample2_genotypes.tsv', 'v, s, {0}.nNonRefAlleles()'.format(gt)))

            # Repartitioning must not change the dataset contents.
            self.assertTrue(
                (dataset.repartition(16, shuffle=False)
                 .same(dataset)))

            print(dataset.storage_level())

        # The remaining checks run outside the loop, on fresh regular
        # (non-generic) imports and their split_multi() versions.
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample.cache()

        sample_split = sample.split_multi()

        sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
        sample2.persist()

        sample2_split = sample2.split_multi()

        sample.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
        sample.annotate_alleles_expr(['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()

        # Concordance of the split dataset with itself. Both result datasets
        # are written to the same path with overwrite=True, so the second
        # write presumably replaces the first.
        glob, concordance1, concordance2 = (sample2_split.concordance(sample2_split))
        print(glob[1][4])
        print(glob[4][0])
        print(glob[:][3])
        concordance1.write('/tmp/foo.vds', overwrite=True)
        concordance2.write('/tmp/foo.vds', overwrite=True)

        sample2_split.export_gen('/tmp/sample2.gen')
        sample2_split.export_plink('/tmp/sample2')

        sample2.filter_multi().count()

        sample2.split_multi().grm('/tmp/sample2.grm', 'gcta-grm-bin')

        sample2.hardcalls().count()

        sample2_split.ibd(min=0.2, max=0.6)

        sample2.split_multi().impute_sex().variant_schema

        self.assertTrue(isinstance(sample2.genotype_schema, TGenotype))

        # Regression fixture: annotate covariates and two phenotypes, then
        # run linear and logistic regression.
        regression = (hc.import_vcf(test_resources + '/regressionLinear.vcf')
                  .split_multi()
                  .annotate_samples_table(test_resources + '/regressionLinear.cov',
                                          'Sample',
                                          root='sa.cov',
                                          config=TextTableConfig(types='Cov1: Double, Cov2: Double'))
                  .annotate_samples_table(test_resources + '/regressionLinear.pheno',
                                          'Sample',
                                          code='sa.pheno.Pheno = table.Pheno',
                                          config=TextTableConfig(types='Pheno: Double', missing='0'))
                  .annotate_samples_table(test_resources + '/regressionLogisticBoolean.pheno',
                                          'Sample',
                                          code='sa.pheno.isCase = table.isCase',
                                          config=TextTableConfig(types='isCase: Boolean', missing='0')))

        (regression.linreg('sa.pheno.Pheno', covariates=['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1'])
         .count())

        (regression.logreg('wald', 'sa.pheno.isCase', covariates=['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1'])
         .count())

        # Mixed-model regression on the same fixture: derive a phenotype from
        # the genotype at one variant and the covariates, compute a kinship
        # matrix with rrm() from the variants with v.start < 4, and pass it
        # to lmmreg().
        vds_assoc = (regression
                     .annotate_samples_expr(
            'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.gt).collect()[0]')
                     .annotate_samples_expr('sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit'))

        vds_kinship = vds_assoc.filter_variants_expr('v.start < 4')

        km = vds_kinship.rrm(False, False)
        vds_assoc = vds_assoc.lmmreg(km, 'sa.pheno.PhenoLMM', ['sa.cov.Cov1', 'sa.cov.Cov2'])

        vds_assoc.export_variants('/tmp/lmmreg.tsv', 'Variant = v, va.lmmreg.*')

        sample_split.mendel_errors('/tmp/sample.mendel', test_resources + '/sample.fam')

        sample_split.pca('sa.scores')

        sample_split.tdt(test_resources + '/sample.fam')

        sample2_split.variant_qc().variant_schema

        self.assertTrue(sample_split.was_split())

        sample2.filter_alleles('pcoin(0.5)')

        # Convert the generic dataset's genotype via g.GT.toGenotype() and
        # then split multi-allelics on the result.
        gds.annotate_genotypes_expr('g = g.GT.toGenotype()').split_multi()
コード例 #4
0
    def test_keytable(self):
        """Smoke-test the KeyTable API using the sample annotation tables.

        Covers import/export, filtering, annotation, joins, aggregation,
        renaming, selection, re-keying, dataframe round-trips, write/read
        round-trips and ``order_by``. Most calls are only executed (their
        results are discarded); a few invariants are asserted explicitly.
        """
        test_resources = 'src/test/resources'

        # Import
        # columns: Sample Status qPhen
        kt = hc.import_keytable(test_resources + '/sampleAnnotations.tsv',
                                config=TextTableConfig(impute=True)).key_by('Sample')
        kt2 = hc.import_keytable(test_resources + '/sampleAnnotations2.tsv',
                                 config=TextTableConfig(impute=True)).key_by('Sample')

        # Variables
        self.assertEqual(kt.num_columns, 3)
        self.assertEqual(kt.key_names[0], "Sample")
        self.assertEqual(kt.column_names[2], "qPhen")
        self.assertEqual(kt.count_rows(), 100)
        kt.schema

        # Export
        kt.export('/tmp/testExportKT.tsv')

        # Filter, Same: the two filter calls are expected to yield equal tables.
        ktcase = kt.filter('Status == "CASE"', True)
        ktcase2 = kt.filter('Status == "CTRL"', False)
        self.assertTrue(ktcase.same(ktcase2))

        # Annotate
        (kt.annotate('X = Status')
         .count_rows())

        # Join
        kt.join(kt2, 'left').count_rows()

        # AggregateByKey
        (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()")
         .count_rows())

        # Forall, Exists
        self.assertFalse(kt.forall('Status == "CASE"'))
        self.assertTrue(kt.exists('Status == "CASE"'))

        # Rename: by mapping, by explicit list, and by derived list.
        kt.rename({"Sample": "ID"})
        kt.rename(["Field1", "Field2", "Field3"])
        kt.rename([name + "_a" for name in kt.column_names])

        # Select
        kt.select(["Sample"])
        kt.select(["Sample", "Status"])

        # Re-key, including the empty key.
        kt.key_by(['Sample', 'Status'])
        kt.key_by([])

        kt.flatten()
        kt.expand_types()

        kt.to_dataframe()

        kt.annotate("newField = [0, 1, 2]").explode(["newField"])

        # Round-trip a flattened variants table through a dataframe and
        # check that it compares equal to the original.
        sample = hc.import_vcf(test_resources + '/sample.vcf')
        sample_variants = (sample.variants_keytable()
                           .annotate('v = str(v), va.filters = va.filters.toArray()')
                           .flatten())

        sample_variants2 = hc.dataframe_to_keytable(
            sample_variants.to_dataframe(), ['v'])
        self.assertTrue(sample_variants.same(sample_variants2))

        # cseed: calculated by hand using sort -n -k 3,3 and inspection
        # assertEqual (rather than assertTrue on an == expression) so that a
        # failure reports the actual row count.
        self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

        # Write/read round-trip must preserve the table.
        kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
        kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
        self.assertTrue(kt.same(kt3))

        # test order_by
        # Rows deliberately omit some fields; the expected results below
        # place rows with a missing sort field last, for both ascending and
        # desc() ordering.
        schema = TStruct(['a', 'b'], [TInt(), TString()])
        rows = [{'a': 5},
                {'a': 5, 'b': 'quam'},
                {'a': -1, 'b': 'quam'},
                {'b': 'foo'},
                {'a': 7, 'b': 'baz'}]
        kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

        bya = [r.get('a') for r in kt4.order_by('a').collect()]
        self.assertEqual(bya, [-1, 5, 5, 7, None])

        bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
        self.assertEqual(bydesca, [7, 5, 5, -1, None])

        byab = [(r.get('a'), r.get('b')) for r in kt4.order_by('a', 'b').collect()]
        self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

        bydescab = [(r.get('a'), r.get('b'))
                    for r in kt4.order_by(desc('a'), 'b').collect()]
        self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])