def test_kt_globals(self):
    """Check that global annotations can be referenced from a row filter.

    Two global arrays are attached to a ``KeyTable.range(10)`` table: ``foo``
    through an expression string and ``bar`` through a Python value with an
    explicit ``TArray(TInt32())`` type.  Filtering on ``idx`` being a member
    of either array is expected to keep 6 rows.
    """
    table = (KeyTable.range(10)
             .annotate_global_expr('foo = [1,2,3]')
             .annotate_global('bar', [4, 5, 6], TArray(TInt32())))

    in_either_global = table.filter(
        'foo.exists(x => x == idx) || bar.exists(x => x == idx)')

    self.assertEqual(in_either_global.count(), 6)
def important_variants(self, n_limit=1000):
    """
    Return the ``n_limit`` most important loci as a key table.

    :param int n_limit: upper bound on the number of loci returned

    :return: A KeyTable with the variant in the first column and importance in the second.
    :rtype: :py:class:`KeyTable`
    """
    context = self.hc
    # Delegate the ranking to the wrapped JVM object and wrap its result.
    jkt = self._jia.variantImportance(n_limit)
    return KeyTable(context, jkt)
def test_trio_matrix(self):
    """Build a trio matrix from the ``triomatrix`` resources and count its rows.

    The FAM file is read twice: once as a ``Pedigree`` for ``trio_matrix`` and
    once as a table that is attached to the dataset's columns under ``fam``.
    """
    pedigree = Pedigree.read('src/test/resources/triomatrix.fam')

    from hail import KeyTable
    fam_kt = KeyTable.import_fam('src/test/resources/triomatrix.fam')
    fam_table = fam_kt.to_hail2()

    dataset = hc.import_vcf('src/test/resources/triomatrix.vcf')
    dataset = dataset.annotate_cols(fam=fam_table[dataset.s])

    trios = methods.trio_matrix(dataset, pedigree, complete_trios=True)
    trios.count_rows()
#!./bin/pyhail-0.1-latest.sh import pyspark import hail from hail import KeyTable from hail.representation import Interval hc = hail.HailContext(log='log/08b_plink_export_somaticfiltered.log', tmp_dir='tmp/hail') #vds = hc.read('../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.vds') vds = hc.read( '../MGRB.phase2.SNPtier12.match.vqsr.minrep.locusannot.WGStier12.unrelated.nocancer.over70.tgp.hrc.gnomad.dbsnp.clinvar.cato.eigen.vep.vds' ) tier1_bed = KeyTable.import_bed( '../../locus-annotations/source_data/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel.bed' ) # Extract good probably not somatic markers for rare variant comparisons. # Definition of 'good markers': # * In autosomes # * In tier 1 regions. # # Definition of probably not somatic: # DP > 10 AND # ( # GT != het OR # ( # binomTest(ad, dp, 0.5, "two.sided") >= alpha # ) # )
def test_keytable(self):
    """Exercise the KeyTable API end to end on the sample annotation tables.

    Most calls are smoke checks (the result is discarded or only counted);
    the pinned expectations are the table shape, filter/same round trips,
    the write/read round trip and the ``order_by`` orderings, including
    where missing values sort.

    Side effects: writes ``/tmp/testExportKT.tsv`` and
    ``/tmp/sampleAnnotations.kt``.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_keytable(
        test_resources + '/sampleAnnotations.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')
    kt2 = hc.import_keytable(
        test_resources + '/sampleAnnotations2.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key_names[0], "Sample")
    self.assertEqual(kt.column_names[2], "qPhen")
    self.assertEqual(kt.count_rows(), 100)
    kt.schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same: keeping CASE rows must equal dropping CTRL rows.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count_rows()

    # Join
    kt.join(kt2, 'left').count_rows()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count_rows()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Smoke checks: each call only has to run without raising.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.column_names])

    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Round trip through a Spark DataFrame.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_keytable()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = hc.dataframe_to_keytable(
        sample_variants.to_dataframe(), ['v'])
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue on ==) reports the actual count
    # when the expectation is not met.
    self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

    # Round trip through the native on-disk format.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

    # Missing values are expected last, for ascending and descending alike.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])
def test_trio_matrix(self):
    """
    Compare ``trio_matrix`` output against an independently built equivalent.

    The check relies on specific properties of the trio matrix VCF and of the
    pedigree.  It is NOT valid for pedigrees that contain quads: trio_matrix
    duplicates the parents as needed, whereas the genotypes_table and
    samples_table paths used for the expected values would need an extra
    duplication/explode step that has not been written.
    """
    pedigree = Pedigree.read('src/test/resources/triomatrix.fam')
    fam_table = KeyTable.import_fam('src/test/resources/triomatrix.fam')

    raw = hc.import_vcf('src/test/resources/triomatrix.vcf')
    dataset = raw.annotate_samples_table(fam_table, root='sa.fam')

    # One flag table per parental role, keyed by the parent's sample id.
    fathers = fam_table.filter('isDefined(patID)')
    fathers = fathers.annotate('isDad = true')
    fathers = fathers.select(['patID', 'isDad'])
    fathers = fathers.key_by('patID')

    mothers = fam_table.filter('isDefined(matID)')
    mothers = mothers.annotate('isMom = true')
    mothers = mothers.select(['matID', 'isMom'])
    mothers = mothers.key_by('matID')

    # test genotypes
    by_sample = dataset.genotypes_table().key_by('s')
    by_sample = by_sample.join(fathers, how='left').join(mothers, how='left')
    by_sample = by_sample.annotate(
        'isDad = isDefined(isDad), isMom = isDefined(isMom)')
    expected_gt = by_sample.aggregate_by_key(
        'v = v, fam = sa.fam.famID',
        'data = g.map(g => {role: if (isDad) 1 else if (isMom) 2 else 0, g: g}).collect()')
    # Only families contributing all three roles are kept.
    expected_gt = expected_gt.filter('data.length() == 3').explode('data')
    expected_gt = expected_gt.select(['v', 'fam', 'data'])

    trio_gt = dataset.trio_matrix(pedigree, complete_trios=True).genotypes_table()
    trio_gt = trio_gt.annotate(
        'fam = sa.proband.annotations.fam.famID, data = [{role: 0, g: g.proband}, {role: 1, g: g.father}, {role: 2, g: g.mother}]')
    trio_gt = trio_gt.select(['v', 'fam', 'data']).explode('data')
    trio_gt = trio_gt.filter('isDefined(data.g)').key_by(['v', 'fam'])

    self.assertTrue(expected_gt.same(trio_gt))

    # test annotations
    sample_rows = dataset.samples_table()
    sample_rows = sample_rows.join(fathers, how='left').join(mothers, how='left')
    sample_rows = sample_rows.annotate(
        'isDad = isDefined(isDad), isMom = isDefined(isMom)')
    expected_sa = sample_rows.aggregate_by_key(
        'fam = sa.fam.famID',
        'data = sa.map(sa => {role: if (isDad) 1 else if (isMom) 2 else 0, sa: sa}).collect()')
    expected_sa = expected_sa.filter('data.length() == 3').explode('data')
    expected_sa = expected_sa.select(['fam', 'data'])

    trio_sa = dataset.trio_matrix(pedigree, complete_trios=True).samples_table()
    trio_sa = trio_sa.annotate(
        'fam = sa.proband.annotations.fam.famID, data = [{role: 0, sa: sa.proband.annotations}, '
        '{role: 1, sa: sa.father.annotations}, '
        '{role: 2, sa: sa.mother.annotations}]')
    trio_sa = trio_sa.select(['fam', 'data']).explode('data')
    trio_sa = trio_sa.filter('isDefined(data.sa)').key_by(['fam'])

    self.assertTrue(expected_sa.same(trio_sa))
def test_keytable(self):
    """Exercise the KeyTable API end to end on the sample annotation tables.

    Most calls are smoke checks (the result is discarded or only counted).
    Pinned expectations cover the table shape, filter/same round trips,
    DataFrame and on-disk round trips, ``order_by`` orderings (including
    where missing values sort), ``union``, ``take``/``head``, ``range`` /
    ``indexed`` and the ``ungroup``/``group`` round trip.

    Side effects: writes ``/tmp/testExportKT.tsv`` and
    ``/tmp/sampleAnnotations.kt``.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                         impute=True).key_by('Sample')
    kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                          impute=True).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key[0], "Sample")
    self.assertEqual(kt.columns[2], "qPhen")
    self.assertEqual(kt.count(), 100)
    kt.schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same: keeping CASE rows must equal dropping CTRL rows.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count()

    # Join
    kt.join(kt2, 'left').count()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Smoke checks: each call only has to run without raising.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.columns])

    kt.select("Sample")
    kt.select(["Sample", "Status"], qualified_name=True)

    kt.drop("Sample")
    kt.drop(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe().count()

    kt.show(10)
    kt.show(4, print_types=False, truncate_to=15)

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Round trip through a Spark DataFrame.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_table()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = KeyTable.from_dataframe(
        sample_variants.to_dataframe()).key_by('v')
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue on ==) reports the actual count
    # when the expectation is not met.
    self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

    # Round trip through the native on-disk format.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt32(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

    # Missing values are expected last, for ascending and descending alike.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])

    KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

    # union
    self.assertEqual(kt.union(kt).count(), kt.count() * 2)
    self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

    # take / head
    first3 = kt.take(3)
    self.assertEqual(first3[0].qPhen, 27704)
    self.assertEqual(first3[1].qPhen, 16636)
    self.assertEqual(first3[2].qPhen, 7256)
    self.assertEqual(first3[0].Sample, 'HG00096')
    self.assertEqual(first3[1].Sample, 'HG00097')
    self.assertEqual(first3[2].Sample, 'HG00099')
    self.assertTrue(all(x.Status == 'CASE' for x in first3))

    # Was assertTrue(count, 3): the 3 was taken as the failure message, so
    # any non-zero count passed.  Compare the count explicitly instead.
    self.assertEqual(kt.head(3).count(), 3)

    # range / indexed
    # list(...) keeps the comparison list-to-list on Python 3 as well, where
    # range() is no longer a list.
    self.assertEqual(list(range(10)),
                     [x.idx for x in KeyTable.range(10).collect()])
    self.assertTrue(
        KeyTable.range(200).indexed('foo').forall('idx == foo'))

    # ungroup followed by group must reproduce the nested struct column.
    nested = KeyTable.parallelize(
        [{'A': Struct(c1=5, c2=21)}],
        TStruct(['A'], [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

    self.assertTrue(nested.ungroup('A').group('A', 'c1', 'c2').same(nested))
def test_dataset(self):
    """Smoke-test a broad slice of the VariantDataset API on the sample VCFs.

    Most calls only have to run without raising (their result is discarded or
    merely counted); a smaller number are pinned with assertions.  Several
    files are written under ``/tmp`` and some of them are re-imported later
    in the test, and datasets are cached/persisted and later unpersisted, so
    the order of the statements matters.
    """
    test_resources = 'src/test/resources'

    vds = hc.import_vcf(test_resources + '/sample.vcf')
    vds2 = hc.import_vcf(test_resources + '/sample2.vcf')

    # Single-pair loop; ``gt`` is the expression used to reach the call
    # inside genotype expressions.
    for (dataset, dataset2) in [(vds, vds2)]:
        gt = 'g.GT'
        dataset = dataset.cache()
        dataset2 = dataset2.persist()

        dataset.write('/tmp/sample.vds', overwrite=True)

        dataset.count()

        self.assertEqual(dataset.head(3).count_variants(), 3)

        dataset.query_variants(['variants.count()'])
        dataset.query_samples(['samples.count()'])

        (dataset.annotate_samples_expr(
            'sa.nCalled = gs.filter(g => isDefined({0})).count()'.format(
                gt)).samples_table().select(['s', 'nCalled = sa.nCalled'
                                             ]).export('/tmp/sa.tsv'))

        # Both the single-expression and the list form are exercised.
        dataset.annotate_global_expr('global.foo = 5')
        dataset.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

        # From here on ``dataset`` carries sa.isCase / sa.qPhen.
        dataset = dataset.annotate_samples_table(
            hc.import_table(test_resources +
                            '/sampleAnnotations.tsv').key_by('Sample'),
            expr=
            'sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

        (dataset.annotate_variants_expr(
            'va.nCalled = gs.filter(g => isDefined({0})).count()'.format(
                gt)).count())

        # Annotate variants from a locus-keyed table ...
        loci_tb = (
            hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                'locus = Locus(chr, pos.toInt32())').key_by('locus'))
        (dataset.annotate_variants_table(loci_tb,
                                         root='va.locus_annot').count())

        # ... and from a variant-keyed table.
        variants_tb = (hc.import_table(
            test_resources + '/variantAnnotations.tsv'
        ).annotate(
            'variant = Variant(Chromosome, Position.toInt32(), Ref, Alt)').
                       key_by('variant'))
        (dataset.annotate_variants_table(variants_tb,
                                         root='va.table').count())

        (dataset.annotate_variants_vds(
            dataset, expr='va.good = va.info.AF == vds.info.AF').count())

        downsampled = dataset.sample_variants(0.10)
        downsampled.variants_table().select(
            ['chr = v.contig', 'pos = v.start']).export('/tmp/sample2_loci.tsv')
        # NOTE(review): this export goes to /tmp, while the import of
        # 'sample2_variants.tsv' further down reads from test_resources.
        downsampled.variants_table().select('v').export(
            '/tmp/sample2_variants.tsv')

        with open(test_resources + '/sample2.sample_list') as f:
            samples = [s.strip() for s in f]
        # NOTE(review): the result of this comparison is discarded, so the
        # line can never fail.  Presumably intended as an assertEqual --
        # confirm the expected count (56) before turning it into one.
        (dataset.filter_samples_list(samples).count()[0] == 56)

        # The next three calls repeat the table / vds annotations above under
        # different local names.
        locus_tb = (
            hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                'locus = Locus(chr, pos.toInt32())').key_by('locus'))

        (dataset.annotate_variants_table(locus_tb,
                                         root='va.locus_annot').count())

        tb = (hc.import_table(
            test_resources + '/variantAnnotations.tsv'
        ).annotate(
            'variant = Variant(Chromosome, Position.toInt32(), Ref, Alt)').
              key_by('variant'))
        (dataset.annotate_variants_table(tb, root='va.table').count())

        (dataset.annotate_variants_vds(
            dataset, expr='va.good = va.info.AF == vds.info.AF').count())

        dataset.export_vcf('/tmp/sample2.vcf.bgz')

        # count() is indexed as [0] for samples and [1] for variants here.
        self.assertEqual(dataset.drop_samples().count()[0], 0)
        self.assertEqual(dataset.drop_variants().count()[1], 0)

        # Importing the same VCF twice and deduplicating should leave 735
        # variants.
        dataset_dedup = (hc.import_vcf([
            test_resources + '/sample2.vcf', test_resources + '/sample2.vcf'
        ]).deduplicate())
        self.assertEqual(dataset_dedup.count()[1], 735)

        (dataset.filter_samples_expr('pcoin(0.5)').samples_table().select(
            's').export('/tmp/sample2.sample_list'))

        (dataset.filter_variants_expr('pcoin(0.5)').variants_table().
         select('v').export('/tmp/sample2.variant_list'))

        (dataset.filter_variants_table(
            KeyTable.import_interval_list(
                test_resources + '/annotinterall.interval_list')).count())

        # Single interval and an iterable of intervals.
        dataset.filter_intervals(Interval.parse('1:100-end')).count()
        dataset.filter_intervals(map(Interval.parse,
                                     ['1:100-end', '3-22'])).count()

        (dataset.filter_variants_table(
            KeyTable.import_interval_list(
                test_resources + '/annotinterall.interval_list')).count())

        self.assertEqual(
            dataset2.filter_variants_table(
                hc.import_table(test_resources + '/sample2_variants.tsv',
                                key='f0',
                                impute=True,
                                no_header=True)).count()[1], 21)

        # Old-id -> new-id mapping read from a two-column, header-less file.
        m2 = {
            r.f0: r.f1
            for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                     no_header=True).collect()
        }
        self.assertEqual(
            dataset2.join(dataset2.rename_samples(m2)).count()[0], 200)

        dataset._typecheck()

        # Export / re-import round trips; struct annotations are compared via
        # their JSON rendering.
        dataset.variants_table().export('/tmp/variants.tsv')
        self.assertTrue(
            (dataset.variants_table().annotate('va = json(va)')).same(
                hc.import_table('/tmp/variants.tsv',
                                impute=True).key_by('v')))

        dataset.samples_table().export('/tmp/samples.tsv')
        self.assertTrue((
            dataset.samples_table().annotate('s = s, sa = json(sa)')).same(
                hc.import_table('/tmp/samples.tsv',
                                impute=True).key_by('s')))

        # gt_string feeds make_table; gt_string2 is the same pair of fields
        # in struct-literal syntax.
        gt_string = 'gt = g.GT, gq = g.GQ'
        gt_string2 = 'gt: g.GT, gq: g.GQ'

        # Build by hand the per-sample columns that make_table is expected to
        # produce, then compare the two tables.
        cols = ['v = v', 'info = va.info']
        for s in dataset.sample_ids:
            cols.append('`{s}`.gt = va.G["{s}"].gt'.format(s=s))
            cols.append('`{s}`.gq = va.G["{s}"].gq'.format(s=s))

        dataset_table = (dataset.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s, %s }).collect(), s)' %
            gt_string2).variants_table().select(cols))

        # Column types are passed to the re-import instead of being imputed.
        dataset_table_typs = {
            fd.name: fd.typ
            for fd in dataset_table.schema.fields
        }
        dataset_table.export('/tmp/sample_kt.tsv')

        self.assertTrue((dataset.make_table(
            'v = v, info = va.info', gt_string, ['v'])).same(
                hc.import_table('/tmp/sample_kt.tsv',
                                types=dataset_table_typs).key_by('v')))

        dataset.annotate_variants_expr(
            "va.nHet = gs.filter(g => {0}.isHet()).count()".format(gt))

        dataset.make_table('v = v, info = va.info', 'gt = {0}'.format(gt),
                           ['v'])

        # Accessors: only have to be callable / readable.
        dataset.num_partitions()
        dataset.file_version()
        dataset.sample_ids[:5]
        dataset.variant_schema
        dataset.sample_schema

        self.assertEqual(dataset2.num_samples, 100)
        self.assertEqual(dataset2.count_variants(), 735)

        dataset.annotate_variants_table(dataset.variants_table(), root="va")

        # Table keyed by (v, v2) joined back with an explicit vds_key.
        kt = (dataset.variants_table().annotate("v2 = v").key_by(
            ["v", "v2"]))

        dataset.annotate_variants_table(kt, root='va.foo',
                                        vds_key=["v", "v"])

        self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

        dataset.genotypes_table()

        ## This is very slow!!!
        # variants_py is not read again inside this loop.
        variants_py = (dataset.annotate_variants_expr(
            'va.hets = gs.filter(g => {0}.isHet()).collect()'.format(
                gt)).variants_table().filter('pcoin(0.1)').collect())

        expr = 'g.GT.isHet() && g.GQ > 20'

        (dataset.filter_genotypes(expr).genotypes_table().select([
            'v', 's', 'nNonRefAlleles = {0}.nNonRefAlleles()'.format(gt)
        ]).export('/tmp/sample2_genotypes.tsv'))

        # Changing the partitioning must not change the contents.
        self.assertTrue((dataset.repartition(16,
                                             shuffle=False).same(dataset)))
        self.assertTrue(dataset.naive_coalesce(2).same(dataset))

        print(dataset.storage_level())

        dataset = dataset.unpersist()
        dataset2 = dataset2.unpersist()

        # Reordering samples must yield exactly the requested order.
        new_sample_order = dataset.sample_ids[:]
        random.shuffle(new_sample_order)
        self.assertEqual(
            vds.reorder_samples(new_sample_order).sample_ids,
            new_sample_order)

    # ---- second half: fresh imports of the same two VCFs ----
    sample = hc.import_vcf(test_resources + '/sample.vcf').cache()

    sample.summarize().report()
    sample.drop_samples().summarize().report()

    sample_split = sample.split_multi_hts()

    sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
    sample2 = sample2.persist()

    sample2_split = sample2.split_multi_hts()

    sample.annotate_alleles_expr_hts(
        'va.gs = gs.map(g => g.GT).callStats(g => v)').count()
    sample.annotate_alleles_expr_hts(
        ['va.gs = gs.map(g => g.GT).callStats(g => v)',
         'va.foo = 5']).count()

    # Concordance of a dataset with itself; ``glob`` is indexed as a nested
    # sequence below.
    glob, concordance1, concordance2 = (
        sample2_split.concordance(sample2_split))
    print(glob[1][4])
    print(glob[4][0])
    print(glob[:][3])
    # NOTE(review): both tables are written to the same path, so the second
    # write targets what the first one just produced.
    concordance1.write('/tmp/foo.kt', overwrite=True)
    concordance2.write('/tmp/foo.kt', overwrite=True)

    sample2_split.export_gen('/tmp/sample2.gen', 5)
    sample2_split.export_plink('/tmp/sample2')

    sample2.filter_variants_expr('v.isBiallelic').count()

    sample2.split_multi_hts().grm().export_gcta_grm_bin('/tmp/sample2.grm')

    sample2.hardcalls().count()

    sample2_split.ibd(min=0.2, max=0.6)

    sample2.split_multi_hts().impute_sex().variant_schema

    self.assertEqual(sample2.genotype_schema, Type.hts_schema())

    m2 = {
        r.f0: r.f1
        for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                 no_header=True,
                                 impute=True).collect()
    }
    self.assertEqual(
        sample2.join(sample2.rename_samples(m2)).count()[0], 200)

    # ---- regression inputs: covariates and two phenotypes keyed by Sample
    cov = hc.import_table(test_resources + '/regressionLinear.cov',
                          types={
                              'Cov1': TFloat64(),
                              'Cov2': TFloat64()
                          }).key_by('Sample')

    phen1 = hc.import_table(test_resources + '/regressionLinear.pheno',
                            missing='0',
                            types={
                                'Pheno': TFloat64()
                            }).key_by('Sample')
    phen2 = hc.import_table(test_resources +
                            '/regressionLogisticBoolean.pheno',
                            missing='0',
                            types={
                                'isCase': TBoolean()
                            }).key_by('Sample')

    regression = (hc.import_vcf(
        test_resources +
        '/regressionLinear.vcf').split_multi_hts().annotate_samples_table(
            cov, root='sa.cov').annotate_samples_table(
                phen1, root='sa.pheno.Pheno').annotate_samples_table(
                    phen2, root='sa.pheno.isCase'))

    # 'sa.cov.Cov2 + 1 - 1' passes a covariate given as an expression rather
    # than a bare annotation path.
    (regression.linreg(['sa.pheno.Pheno'],
                       'g.GT.nNonRefAlleles()',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

    (regression.logreg('wald',
                       'sa.pheno.isCase',
                       'g.GT.nNonRefAlleles()',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

    # Synthetic phenotype for lmmreg, derived from the genotype at 1:1:C:T
    # and the two covariates.
    vds_assoc = (regression.annotate_samples_expr(
        'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.GT.gt).collect()[0]'
    ).annotate_samples_expr(
        'sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit'
    ))

    # ---- SKAT inputs
    covariatesSkat = hc.import_table(test_resources + "/skat.cov",
                                     impute=True).key_by("Sample")

    phenotypesSkat = (hc.import_table(test_resources + "/skat.pheno",
                                      types={
                                          "Pheno": TFloat64()
                                      },
                                      missing="0").key_by("Sample"))

    intervalsSkat = KeyTable.import_interval_list(test_resources +
                                                  "/skat.interval_list")

    weightsSkat = (hc.import_table(test_resources + "/skat.weights",
                                   types={
                                       "locus": TLocus(),
                                       "weight": TFloat64()
                                   }).key_by("locus"))

    # The final annotate recodes sa.pheno: 1.0 -> false, 2.0 -> true,
    # anything else -> missing.
    skatVds = (vds2.split_multi_hts().annotate_variants_table(
        intervalsSkat, root="va.gene").annotate_variants_table(
            weightsSkat, root="va.weight").annotate_samples_table(
                phenotypesSkat, root="sa.pheno").annotate_samples_table(
                    covariatesSkat, root="sa.cov").annotate_samples_expr(
                        "sa.pheno = if (sa.pheno == 1.0) false else " +
                        "if (sa.pheno == 2.0) true else NA: Boolean"))

    # Linear SKAT on hard calls, then logistic SKAT on PL dosages.
    (skatVds.skat(key_expr='va.gene',
                  weight_expr='va.weight',
                  y='sa.pheno',
                  x='g.GT.nNonRefAlleles()',
                  covariates=['sa.cov.Cov1', 'sa.cov.Cov2'],
                  logistic=False).count())

    (skatVds.skat(key_expr='va.gene',
                  weight_expr='va.weight',
                  y='sa.pheno',
                  x='plDosage(g.PL)',
                  covariates=['sa.cov.Cov1', 'sa.cov.Cov2'],
                  logistic=True).count())

    # ---- kinship / LD matrix / lmmreg
    vds_kinship = vds_assoc.filter_variants_expr('v.start < 4')

    km = vds_kinship.rrm(False, False)

    ld_matrix_path = '/tmp/ldmatrix'
    ldMatrix = vds_kinship.ld_matrix()
    # Remove any directory left at the target path before writing.
    if os.path.isdir(ld_matrix_path):
        shutil.rmtree(ld_matrix_path)
    ldMatrix.write(ld_matrix_path)
    LDMatrix.read(ld_matrix_path).to_local_matrix()

    vds_assoc = vds_assoc.lmmreg(km, 'sa.pheno.PhenoLMM',
                                 'g.GT.nNonRefAlleles()',
                                 ['sa.cov.Cov1', 'sa.cov.Cov2'])

    vds_assoc.variants_table().select(['Variant = v', 'va.lmmreg.*'
                                       ]).export('/tmp/lmmreg.tsv')

    # ---- pedigree-based methods
    men, fam, ind, var = sample_split.mendel_errors(
        Pedigree.read(test_resources + '/sample.fam'))
    men.select(['fid', 's', 'code'])
    fam.select(['father', 'nChildren'])
    self.assertEqual(ind.key, ['s'])
    self.assertEqual(var.key, ['v'])
    sample_split.annotate_variants_table(var, root='va.mendel').count()

    sample_split.pca_of_normalized_genotypes()

    sample_split.tdt(Pedigree.read(test_resources + '/sample.fam'))

    sample2_split.variant_qc().variant_schema

    # ---- export / re-import round trips on sample2
    sample2.variants_table().export('/tmp/variants.tsv')
    self.assertTrue(
        (sample2.variants_table().annotate('va = json(va)')).same(
            hc.import_table('/tmp/variants.tsv', impute=True).key_by('v')))

    sample2.samples_table().export('/tmp/samples.tsv')
    self.assertTrue(
        (sample2.samples_table().annotate('s = s, sa = json(sa)')).same(
            hc.import_table('/tmp/samples.tsv', impute=True).key_by('s')))

    # Same make_table comparison as in the loop above.
    # NOTE(review): sample ids are not backtick-quoted here, unlike in the
    # loop -- confirm that is intended.
    cols = ['v = v', 'info = va.info']
    for s in sample2.sample_ids:
        cols.append('{s}.gt = va.G["{s}"].gt'.format(s=s))
        cols.append('{s}.gq = va.G["{s}"].gq'.format(s=s))

    sample2_table = (sample2.annotate_variants_expr(
        'va.G = index(gs.map(g => { s: s, gt: g.GT, gq: g.GQ }).collect(), s)'
    ).variants_table().select(cols))

    sample2_table.export('/tmp/sample_kt.tsv')
    sample2_typs = {fd.name: fd.typ for fd in sample2_table.schema.fields}

    self.assertTrue((sample2.make_table(
        'v = v, info = va.info', 'gt = g.GT, gq = g.GQ', ['v'])).same(
            hc.import_table('/tmp/sample_kt.tsv',
                            types=sample2_typs).key_by('v')))

    sample_split.annotate_variants_expr(
        "va.nHet = gs.filter(g => g.GT.isHet()).count()")

    sample2.make_table('v = v, info = va.info', 'gt = g.GT', ['v'])

    sample.num_partitions()
    sample.file_version()
    sample.sample_ids[:5]

    sample2.filter_alleles_hts('pcoin(0.5)')

    sample_split.ld_prune(8).variants_table().select('v').export(
        "/tmp/testLDPrune.tsv")
    kt = (sample2.variants_table().annotate("v2 = v").key_by(["v", "v2"]))
    sample2.annotate_variants_table(kt, root="va.foo", vds_key=["v", "v"])

    self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

    # NOTE(review): 'isHet' is written without parentheses here, unlike the
    # other expressions in this test; variants_py is not read afterwards.
    variants_py = (sample.annotate_variants_expr(
        'va.hets = gs.filter(g => g.GT.isHet).collect()').variants_table().
                   take(5))

    VariantDataset.from_table(sample.variants_table())
def test_dataset(self):
    """Smoke-test a broad slice of the VariantDataset API on the sample VCFs.

    The first half runs the same sequence of calls on two pairs of datasets:
    the default import and the ``generic=True`` import of the same files.
    Most calls only have to run without raising; a smaller number are pinned
    with assertions.  Several files are written under ``/tmp`` and some are
    re-imported later in the test, so the order of the statements matters.
    """
    test_resources = 'src/test/resources'

    vds = hc.import_vcf(test_resources + '/sample.vcf')
    vds2 = hc.import_vcf(test_resources + '/sample2.vcf')
    gds = hc.import_vcf(test_resources + '/sample.vcf', generic=True)
    gds2 = hc.import_vcf(test_resources + '/sample2.vcf', generic=True)

    for (dataset, dataset2) in [(vds, vds2), (gds, gds2)]:

        # Expression used to reach the call inside genotype expressions:
        # a field of g for generic datasets, g itself otherwise.
        if dataset._is_generic_genotype:
            gt = 'g.GT'
        else:
            gt = 'g'

        # NOTE(review): the return values of cache()/persist() are not
        # rebound here -- confirm these calls act on the dataset in place.
        dataset.cache()
        dataset2.persist()

        dataset.write('/tmp/sample.vds', overwrite=True)

        dataset.count()

        dataset.query_variants(['variants.count()'])
        dataset.query_samples(['samples.count()'])

        (dataset.annotate_samples_expr(
            'sa.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(
                gt)).export_samples('/tmp/sa.tsv',
                                    's = s, nCalled = sa.nCalled'))

        # Both the single-expression and the list form are exercised.
        dataset.annotate_global_expr('global.foo = 5')
        dataset.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])

        # From here on ``dataset`` carries sa.isCase / sa.qPhen.
        dataset = dataset.annotate_samples_table(
            hc.import_table(test_resources +
                            '/sampleAnnotations.tsv').key_by('Sample'),
            expr=
            'sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')

        (dataset.annotate_variants_expr(
            'va.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(
                gt)).count())

        # Annotate variants from a locus-keyed table ...
        loci_tb = (
            hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                'locus = Locus(chr, pos.toInt())').key_by('locus'))
        (dataset.annotate_variants_table(loci_tb,
                                         root='va.locus_annot').count())

        # ... and from a variant-keyed table.
        variants_tb = (hc.import_table(
            test_resources + '/variantAnnotations.tsv').annotate(
                'variant = Variant(Chromosome, Position.toInt(), Ref, Alt)'
            ).key_by('variant'))
        (dataset.annotate_variants_table(variants_tb,
                                         root='va.table').count())

        (dataset.annotate_variants_vds(
            dataset, expr='va.good = va.info.AF == vds.info.AF').count())

        downsampled = dataset.sample_variants(0.10)
        downsampled.export_variants('/tmp/sample2_loci.tsv',
                                    'chr = v.contig, pos = v.start')
        # NOTE(review): this export goes to /tmp, while the import of
        # 'sample2_variants.tsv' further down reads from test_resources.
        downsampled.export_variants('/tmp/sample2_variants.tsv', 'v')

        with open(test_resources + '/sample2.sample_list') as f:
            samples = [s.strip() for s in f]
        # NOTE(review): the result of this comparison is discarded, so the
        # line can never fail.  Presumably intended as an assertEqual --
        # confirm the expected count (56) before turning it into one.
        (dataset.filter_samples_list(samples).count()[0] == 56)

        # The next three calls repeat the table / vds annotations above under
        # different local names.
        locus_tb = (
            hc.import_table(test_resources + '/sample2_loci.tsv').annotate(
                'locus = Locus(chr, pos.toInt())').key_by('locus'))

        (dataset.annotate_variants_table(locus_tb,
                                         root='va.locus_annot').count())

        tb = (hc.import_table(
            test_resources + '/variantAnnotations.tsv').annotate(
                'variant = Variant(Chromosome, Position.toInt(), Ref, Alt)'
            ).key_by('variant'))
        (dataset.annotate_variants_table(tb, root='va.table').count())

        (dataset.annotate_variants_vds(
            dataset, expr='va.good = va.info.AF == vds.info.AF').count())

        dataset.export_vcf('/tmp/sample2.vcf.bgz')

        # count() is indexed as [0] for samples and [1] for variants here.
        self.assertEqual(dataset.drop_samples().count()[0], 0)
        self.assertEqual(dataset.drop_variants().count()[1], 0)

        # Importing the same VCF twice and deduplicating should leave 735
        # variants.
        dataset_dedup = (hc.import_vcf([
            test_resources + '/sample2.vcf', test_resources + '/sample2.vcf'
        ]).deduplicate())
        self.assertEqual(dataset_dedup.count()[1], 735)

        (dataset.filter_samples_expr('pcoin(0.5)').export_samples(
            '/tmp/sample2.sample_list', 's'))

        (dataset.filter_variants_expr('pcoin(0.5)').export_variants(
            '/tmp/sample2.variant_list', 'v'))

        (dataset.filter_variants_table(
            KeyTable.import_interval_list(
                test_resources + '/annotinterall.interval_list')).count())

        # Single interval and an iterable of intervals.
        dataset.filter_intervals(Interval.parse('1:100-end')).count()
        dataset.filter_intervals(map(Interval.parse,
                                     ['1:100-end', '3-22'])).count()

        (dataset.filter_variants_table(
            KeyTable.import_interval_list(
                test_resources + '/annotinterall.interval_list')).count())

        self.assertEqual(
            dataset2.filter_variants_table(
                hc.import_table(test_resources + '/sample2_variants.tsv',
                                key='f0',
                                impute=True,
                                no_header=True)).count()[1], 21)

        # Old-id -> new-id mapping read from a two-column, header-less file.
        m2 = {
            r.f0: r.f1
            for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                     no_header=True).collect()
        }
        self.assertEqual(
            dataset2.join(dataset2.rename_samples(m2)).count()[0], 200)

        dataset._typecheck()

        # Export / re-import round trips; struct annotations are compared via
        # their JSON rendering.
        dataset.export_variants('/tmp/variants.tsv', 'v = v, va = va')
        self.assertTrue(
            (dataset.variants_table().annotate('va = json(va)')).same(
                hc.import_table('/tmp/variants.tsv',
                                impute=True).key_by('v')))

        dataset.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
        self.assertTrue((
            dataset.samples_table().annotate('s = s, sa = json(sa)')).same(
                hc.import_table('/tmp/samples.tsv',
                                impute=True).key_by('s')))

        # gt_string feeds make_table; gt_string2 is the same pair of fields
        # in struct-literal syntax.  Field names differ by genotype kind.
        if dataset._is_generic_genotype:
            gt_string = 'gt = g.GT, gq = g.GQ'
            gt_string2 = 'gt: g.GT, gq: g.GQ'
        else:
            gt_string = 'gt = g.gt, gq = g.gq'
            gt_string2 = 'gt: g.gt, gq: g.gq'

        # Build by hand the per-sample export expression matching what
        # make_table is expected to produce.
        cols = ['v = v, info = va.info']
        for s in dataset.sample_ids:
            cols.append(
                '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(
                    s=s))

        (dataset.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s, %s }).collect(), s)' %
            gt_string2).export_variants('/tmp/sample_kt.tsv',
                                        ','.join(cols)))

        # NOTE(review): the result of same() is discarded, so this only
        # checks that the comparison runs.  The re-import passes no column
        # types, so wrapping it in an assertion may not hold as written --
        # verify before changing.
        ((dataset.make_table(
            'v = v, info = va.info', gt_string, ['v'])).same(
                hc.import_table('/tmp/sample_kt.tsv').key_by('v')))

        dataset.annotate_variants_expr(
            "va.nHet = gs.filter(g => {0}.isHet()).count()".format(gt))

        # String form and list form of aggregate_by_key.
        dataset.aggregate_by_key(
            "Variant = v",
            "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(
                gt))

        dataset.aggregate_by_key(["Variant = v"], [
            "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(
                gt)
        ])

        dataset.make_table('v = v, info = va.info', 'gt = {0}'.format(gt),
                           ['v'])

        # Accessors: only have to be callable / readable.
        dataset.num_partitions()
        dataset.file_version()
        dataset.sample_ids[:5]
        dataset.variant_schema
        dataset.sample_schema

        self.assertEqual(dataset2.num_samples, 100)
        self.assertEqual(dataset2.count_variants(), 735)

        dataset.annotate_variants_table(dataset.variants_table(), root="va")

        # Table keyed by (v, v2) joined back with an explicit vds_key.
        kt = (dataset.variants_table().annotate("v2 = v").key_by(
            ["v", "v2"]))

        dataset.annotate_variants_table(kt, root='va.foo',
                                        vds_key=["v", "v"])

        self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

        dataset.genotypes_table()

        ## This is very slow!!!
        # variants_py is not read again inside this loop.
        variants_py = (dataset.annotate_variants_expr(
            'va.hets = gs.filter(g => {0}.isHet()).collect()'.format(
                gt)).variants_table().filter('pcoin(0.1)').collect())

        if dataset._is_generic_genotype:
            expr = 'g.GT.isHet() && g.GQ > 20'
        else:
            expr = 'g.isHet() && g.gq > 20'

        (dataset.filter_genotypes(expr).export_genotypes(
            '/tmp/sample2_genotypes.tsv',
            'v, s, {0}.nNonRefAlleles()'.format(gt)))

        # Changing the partitioning must not change the contents.
        self.assertTrue((dataset.repartition(16,
                                             shuffle=False).same(dataset)))

        print(dataset.storage_level())

        dataset.unpersist()
        dataset2.unpersist()

    # ---- second half: fresh (non-generic) imports of the same two VCFs ----
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample.cache()

    sample.summarize().report()

    sample_split = sample.split_multi()

    sample2 = hc.import_vcf(test_resources + '/sample2.vcf')
    sample2.persist()

    sample2_split = sample2.split_multi()

    sample.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
    sample.annotate_alleles_expr(
        ['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()

    # Concordance of a dataset with itself; ``glob`` is indexed as a nested
    # sequence below.
    glob, concordance1, concordance2 = (
        sample2_split.concordance(sample2_split))
    print(glob[1][4])
    print(glob[4][0])
    print(glob[:][3])
    # NOTE(review): both tables are written to the same path, so the second
    # write targets what the first one just produced.
    concordance1.write('/tmp/foo.kt', overwrite=True)
    concordance2.write('/tmp/foo.kt', overwrite=True)

    sample2_split.export_gen('/tmp/sample2.gen', 5)
    sample2_split.export_plink('/tmp/sample2')

    sample2.filter_multi().count()

    sample2.split_multi().grm().export_gcta_grm_bin('/tmp/sample2.grm')

    sample2.hardcalls().count()

    sample2_split.ibd(min=0.2, max=0.6)

    sample2.split_multi().impute_sex().variant_schema

    self.assertTrue(isinstance(sample2.genotype_schema, TGenotype))

    m2 = {
        r.f0: r.f1
        for r in hc.import_table(test_resources + '/sample2_rename.tsv',
                                 no_header=True,
                                 impute=True).collect()
    }
    self.assertEqual(
        sample2.join(sample2.rename_samples(m2)).count()[0], 200)

    # ---- regression inputs: covariates and two phenotypes keyed by Sample
    cov = hc.import_table(test_resources + '/regressionLinear.cov',
                          types={
                              'Cov1': TDouble(),
                              'Cov2': TDouble()
                          }).key_by('Sample')

    phen1 = hc.import_table(test_resources + '/regressionLinear.pheno',
                            missing='0',
                            types={
                                'Pheno': TDouble()
                            }).key_by('Sample')
    phen2 = hc.import_table(test_resources +
                            '/regressionLogisticBoolean.pheno',
                            missing='0',
                            types={
                                'isCase': TBoolean()
                            }).key_by('Sample')

    regression = (hc.import_vcf(
        test_resources +
        '/regressionLinear.vcf').split_multi().annotate_samples_table(
            cov, root='sa.cov').annotate_samples_table(
                phen1, root='sa.pheno.Pheno').annotate_samples_table(
                    phen2, root='sa.pheno.isCase'))

    # 'sa.cov.Cov2 + 1 - 1' passes a covariate given as an expression rather
    # than a bare annotation path.
    (regression.linreg('sa.pheno.Pheno',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

    (regression.logreg('wald',
                       'sa.pheno.isCase',
                       covariates=['sa.cov.Cov1',
                                   'sa.cov.Cov2 + 1 - 1']).count())

    # Synthetic phenotype for lmmreg, derived from the genotype at 1:1:C:T
    # and the two covariates.
    vds_assoc = (regression.annotate_samples_expr(
        'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.gt).collect()[0]'
    ).annotate_samples_expr(
        'sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit'
    ))

    # ---- kinship / LD matrix / lmmreg
    vds_kinship = vds_assoc.filter_variants_expr('v.start < 4')

    km = vds_kinship.rrm(False, False)

    # ldMatrix is not read again in this test.
    ldMatrix = vds_kinship.ld_matrix()

    vds_assoc = vds_assoc.lmmreg(km, 'sa.pheno.PhenoLMM',
                                 ['sa.cov.Cov1', 'sa.cov.Cov2'])

    vds_assoc.export_variants('/tmp/lmmreg.tsv',
                              'Variant = v, va.lmmreg.*')

    # ---- pedigree-based methods
    men, fam, ind, var = sample_split.mendel_errors(
        Pedigree.read(test_resources + '/sample.fam'))
    men.select(['fid', 's', 'code'])
    fam.select(['father', 'nChildren'])
    self.assertEqual(ind.key, ['s'])
    self.assertEqual(var.key, ['v'])
    sample_split.annotate_variants_table(var, root='va.mendel').count()

    sample_split.pca('sa.scores')

    sample_split.tdt(Pedigree.read(test_resources + '/sample.fam'))

    sample2_split.variant_qc().variant_schema

    # ---- export / re-import round trips on sample2
    sample2.export_variants('/tmp/variants.tsv', 'v = v, va = va')
    self.assertTrue(
        (sample2.variants_table().annotate('va = json(va)')).same(
            hc.import_table('/tmp/variants.tsv', impute=True).key_by('v')))

    sample2.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
    self.assertTrue(
        (sample2.samples_table().annotate('s = s, sa = json(sa)')).same(
            hc.import_table('/tmp/samples.tsv', impute=True).key_by('s')))

    # Same make_table comparison as in the loop above.
    cols = ['v = v, info = va.info']
    for s in sample2.sample_ids:
        cols.append(
            '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'.format(s=s))

    (sample2.annotate_variants_expr(
        'va.G = index(gs.map(g => { s: s, gt: g.gt, gq: g.gq }).collect(), s)'
    ).export_variants('/tmp/sample_kt.tsv', ','.join(cols)))

    # NOTE(review): as in the loop, the result of same() is discarded here.
    ((sample2.make_table(
        'v = v, info = va.info', 'gt = g.gt, gq = g.gq',
        ['v'])).same(hc.import_table('/tmp/sample_kt.tsv').key_by('v')))

    sample_split.annotate_variants_expr(
        "va.nHet = gs.filter(g => g.isHet()).count()")

    sample_split.aggregate_by_key(
        "Variant = v",
        "nHet = g.map(g => g.isHet().toInt()).sum().toLong()")
    sample_split.aggregate_by_key(
        ["Variant = v"],
        ["nHet = g.map(g => g.isHet().toInt()).sum().toLong()"])

    sample2.make_table('v = v, info = va.info', 'gt = g.gt', ['v'])

    sample.num_partitions()
    sample.file_version()
    sample.sample_ids[:5]

    # was_split() must reflect whether split_multi() has been applied.
    self.assertFalse(sample2.was_split())

    self.assertTrue(sample_split.was_split())

    sample2.filter_alleles('pcoin(0.5)')

    # A generic dataset is converted to a genotype-typed one before
    # split_multi() is called on it.
    gds.annotate_genotypes_expr('g = g.GT.toGenotype()').split_multi()

    sample_split.ld_prune().export_variants("/tmp/testLDPrune.tsv", "v")
    kt = (sample2.variants_table().annotate("v2 = v").key_by(["v", "v2"]))
    sample2.annotate_variants_table(kt, root="va.foo", vds_key=["v", "v"])

    self.assertEqual(kt.query('v.fraction(x => x == v2)'), 1.0)

    # NOTE(review): 'isHet' is written without parentheses here, unlike the
    # other expressions in this test; variants_py is not read afterwards.
    variants_py = (sample.annotate_variants_expr(
        'va.hets = gs.filter(g => g.isHet).collect()').variants_table().
                   collect())

    VariantDataset.from_table(sample.variants_table())
def test_keytable(self):
    """Exercise the KeyTable API end to end on the sample annotation files.

    Covers import, basic accessors, export, filter/same, annotate, join,
    aggregate_by_key, forall/exists, rename/select/key_by, flatten,
    expand_types, a dataframe round trip, a write/read round trip, and
    order_by on a small in-memory table.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_keytable(
        test_resources + '/sampleAnnotations.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')
    kt2 = hc.import_keytable(
        test_resources + '/sampleAnnotations2.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key_names[0], "Sample")
    self.assertEqual(kt.column_names[2], "qPhen")
    self.assertEqual(kt.count_rows(), 100)
    # Accessed only for its side effect: the attribute must evaluate
    # without raising.
    kt.schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): the second positional argument is presumably the
    # "keep" flag, so keeping CASE rows is expected to equal dropping CTRL
    # rows -- confirm against KeyTable.filter.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count_rows()

    # Join
    kt.join(kt2, 'left').count_rows()

    # AggregateByKey
    (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()")
       .count_rows())

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # The results of the calls below are discarded; they are smoke checks
    # that each method runs on this table without raising.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.column_names])

    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Dataframe round trip: table -> dataframe -> table must compare equal.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (
        sample.variants_keytable()
        .annotate('v = str(v), va.filters = va.filters.toArray()')
        .flatten())

    sample_variants2 = hc.dataframe_to_keytable(
        sample_variants.to_dataframe(), ['v'])
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue(... == 23)) so that a failure
    # reports the row count that was actually observed.
    self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

    # Write/read round trip.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    # Some rows deliberately omit 'a' or 'b'; the expected lists below
    # place those missing values (None) last, for ascending and
    # descending sorts alike.
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(
        byab,
        [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(
        bydescab,
        [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])