def test_keytable(self):
    """Smoke-test the KeyTable API on the sample annotation tables.

    Imports two TSV files keyed by ``Sample``, checks column metadata and
    the row count, then calls the export, filter, annotate, join,
    aggregate, rename, select, re-key and conversion methods.  Most of
    those calls are only checked for not raising; their results are
    discarded.  The test ends with a keytable -> dataframe -> keytable
    round trip on the variants of ``sample.vcf``.
    """
    resource_dir = 'src/test/resources'

    # Import (columns: Sample, Status, qPhen)
    annotations_path = resource_dir + '/sampleAnnotations.tsv'
    annotations = hc.import_keytable(annotations_path, 'Sample',
                                     config=TextTableConfig(impute=True))
    other_path = resource_dir + '/sampleAnnotations2.tsv'
    other_annotations = hc.import_keytable(other_path, 'Sample',
                                           config=TextTableConfig(impute=True))

    # Variables
    self.assertEqual(annotations.num_columns, 3)
    self.assertEqual(annotations.key_names[0], "Sample")
    self.assertEqual(annotations.column_names[2], "qPhen")
    self.assertEqual(annotations.count_rows(), 100)
    annotations.schema  # attribute access only; the value is not checked

    # Export
    annotations.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): presumably the second argument is a keep/remove flag, so
    # keeping "CASE" and removing "CTRL" select the same rows -- confirm.
    is_case = 'Status == "CASE"'
    cases = annotations.filter(is_case, True)
    non_controls = annotations.filter('Status == "CTRL"', False)
    self.assertTrue(cases.same(non_controls))

    # Annotate
    with_copy = annotations.annotate('X = Status')
    with_copy.count_rows()

    # Join
    joined = annotations.join(other_annotations, 'left')
    joined.count_rows()

    # AggregateByKey
    by_status = annotations.aggregate_by_key("Status = Status",
                                             "Sum = qPhen.sum()")
    by_status.count_rows()

    # Forall, Exists
    self.assertFalse(annotations.forall(is_case))
    self.assertTrue(annotations.exists(is_case))

    # Rename: by mapping, by explicit list, and by derived list.
    annotations.rename({"Sample": "ID"})
    annotations.rename(["Field1", "Field2", "Field3"])
    suffixed = [column + "_a" for column in annotations.column_names]
    annotations.rename(suffixed)

    # Select and re-key.
    annotations.select(["Sample"])
    annotations.select(["Sample", "Status"])
    annotations.key_by(['Sample', 'Status'])
    annotations.key_by([])

    # Structural conversions.
    annotations.flatten()
    annotations.expand_types()
    annotations.to_dataframe()
    annotations.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Round trip: flattened variant keytable -> dataframe -> keytable.
    vds = hc.import_vcf(resource_dir + '/sample.vcf')
    flat_variants = vds.variants_keytable()
    flat_variants = flat_variants.annotate(
        'v = str(v), va.filters = va.filters.toArray')
    flat_variants = flat_variants.flatten()
    round_tripped = hc.dataframe_to_keytable(flat_variants.to_dataframe(),
                                             ['v'])
    self.assertTrue(flat_variants.same(round_tripped))
def test_dataset(self):
    """Exercise the dataset API on ``sample.vcf`` and ``sample2.vcf``.

    Both VCFs are imported (and split with ``split_multi``), then a long
    sequence of annotate / filter / export / statistics methods is called.
    Most calls are smoke checks whose results are discarded; a handful of
    counts and ``same`` comparisons are asserted.  Outputs are written
    under ``/tmp``.
    """
    resource_dir = 'src/test/resources'
    interval_list = resource_dir + '/annotinterall.interval_list'
    sample_list = resource_dir + '/sample2.sample_list'
    sample2_vcf = resource_dir + '/sample2.vcf'
    fam_file = resource_dir + '/sample.fam'

    vds = hc.import_vcf(resource_dir + '/sample.vcf')
    vds.cache()
    vds_split = vds.split_multi()

    vds2 = hc.import_vcf(sample2_vcf)
    vds2.persist()
    vds2_split = vds2.split_multi()

    # Aggregation and queries.
    vds2.aggregate_intervals(interval_list, 'N = variants.count()',
                             '/tmp/annotinter.tsv')
    vds2.query_variants(['variants.count()'])
    vds2.query_samples(['samples.count()'])

    # Global annotations (only the ``globals`` attribute is touched).
    with_gene_list = vds2.annotate_global_list(
        resource_dir + '/global_list.txt', 'global.genes', as_set=True)
    with_gene_list.globals
    with_gene_table = vds2.annotate_global_table(
        resource_dir + '/global_table.tsv', 'global.genes')
    with_gene_table.globals

    # Sample annotations.
    with_ncalled = vds2.annotate_samples_expr(
        'sa.nCalled = gs.filter(g => g.isCalled()).count()')
    with_ncalled.export_samples('/tmp/sa.tsv', 's = s, nCalled = sa.nCalled')
    vds2.annotate_samples_list(sample_list, 'sa.listed')
    vds2.annotate_global_expr('global.foo = 5')
    vds2.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])
    vds2_annot = vds2.annotate_samples_table(
        resource_dir + '/sampleAnnotations.tsv', 'Sample',
        code='sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')
    vds2.annotate_samples_vds(vds2_annot, code='sa.isCase = vds.isCase')

    # Variant and allele annotations.
    bed_annotated = vds.annotate_variants_bed(
        resource_dir + '/example1.bed', root='va.bed')
    bed_annotated.filter_variants_expr('va.bed').count()
    vds.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
    vds.annotate_alleles_expr(
        ['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()
    vds2.annotate_variants_expr(
        'va.nCalled = gs.filter(g => g.isCalled()).count()').count()
    vds2.annotate_variants_intervals(
        interval_list, 'va.included', all=True).count()
    vds2.annotate_variants_loci(
        resource_dir + '/sample2_loci.tsv',
        'Locus(chr, pos.toInt())',
        'va.locus_annot').count()
    vds.annotate_variants_table(
        resource_dir + '/variantAnnotations.tsv',
        'Variant(Chromosome, Position.toInt(), Ref, Alt)',
        root='va.table').count()
    vds2.annotate_variants_vds(
        vds2, code='va.good = va.info.AF == vds.info.AF').count()

    # Concordance of the split dataset with itself.
    conc_result = vds2_split.concordance(vds2_split)
    conc_table, conc_vds1, conc_vds2 = conc_result
    print(conc_table[1][4])
    print(conc_table[4][0])
    print(conc_table[:][3])
    # Both results go to the same path; the second write overwrites the first.
    conc_vds1.write('/tmp/foo.vds', overwrite=True)
    conc_vds2.write('/tmp/foo.vds', overwrite=True)

    # Downsampling and exports.
    subset = vds2.downsample_variants(20)
    subset.export_variants('/tmp/sample2_loci.tsv',
                           'chr = v.contig, pos = v.start')
    subset.export_variants('/tmp/sample2_variants.tsv', 'v')

    # NOTE(review): the comparison result is discarded, so this line can
    # never fail the test; presumably an assertion was intended -- confirm
    # the expected count before converting it.
    listed_counts = vds2.filter_samples_list(sample_list).count()
    listed_counts['nSamples'] == 56

    vds2_split.export_gen('/tmp/sample2.gen')
    het_confident = vds2.filter_genotypes('g.isHet() && g.gq > 20')
    het_confident.export_genotypes('/tmp/sample2_genotypes.tsv',
                                   'v, s, g.nNonRefAlleles()')
    vds2_split.export_plink('/tmp/sample2')
    vds2.export_vcf('/tmp/sample2.vcf.bgz')
    vds2.filter_multi().count()

    # Dropping everything leaves empty dimensions.
    self.assertEqual(vds2.drop_samples().count()['nSamples'], 0)
    self.assertEqual(vds2.drop_variants().count()['nVariants'], 0)

    # Importing the same file twice and deduplicating.
    deduplicated = hc.import_vcf([sample2_vcf, sample2_vcf]).deduplicate()
    self.assertEqual(deduplicated.count()['nVariants'], 735)

    # Random filters, exported under /tmp.
    vds2.filter_samples_expr('pcoin(0.5)').export_samples(
        '/tmp/sample2.sample_list', 's')
    vds2.filter_variants_expr('pcoin(0.5)').export_variants(
        '/tmp/sample2.variant_list', 'v')

    # Interval filters built in several ways.
    vds2.filter_variants_intervals(IntervalTree.read(interval_list)).count()
    vds2.filter_variants_intervals(Interval.parse('1:100-end')).count()
    vds2.filter_variants_intervals(
        IntervalTree.parse_all(['1:100-end', '3-22'])).count()
    vds2.filter_variants_intervals(
        IntervalTree([Interval.parse('1:100-end')])).count()
    vds2.filter_variants_intervals(IntervalTree.read(interval_list)).count()

    listed_variants = vds2.filter_variants_list(
        resource_dir + '/sample2_variants.tsv')
    self.assertEqual(listed_variants.count()['nVariants'], 21)

    # Relatedness and related statistics.
    vds2.split_multi().grm('/tmp/sample2.grm', 'gcta-grm-bin')
    vds2.hardcalls().count()
    vds2_split.ibd(min=0.2, max=0.6)
    vds2.split_multi().impute_sex().variant_schema
    self.assertTrue(isinstance(vds2.genotype_schema, TGenotype))

    # Rename samples from a two-column, header-less table, then join.
    rename_rows = hc.import_keytable(
        resource_dir + '/sample2_rename.tsv',
        config=TextTableConfig(noheader=True)).collect()
    renames = {row._0: row._1 for row in rename_rows}
    joined = vds2.join(vds2.rename_samples(renames))
    self.assertEqual(joined.count()['nSamples'], 200)

    # Regression dataset: covariates plus two phenotype tables.
    regression = hc.import_vcf(resource_dir + '/regressionLinear.vcf')
    regression = regression.split_multi()
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLinear.cov', 'Sample',
        root='sa.cov',
        config=TextTableConfig(types='Cov1: Double, Cov2: Double'))
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLinear.pheno', 'Sample',
        code='sa.pheno.Pheno = table.Pheno',
        config=TextTableConfig(types='Pheno: Double', missing='0'))
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLogisticBoolean.pheno', 'Sample',
        code='sa.pheno.isCase = table.isCase',
        config=TextTableConfig(types='isCase: Boolean', missing='0'))

    covariates = ['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1']
    regression.linreg('sa.pheno.Pheno', covariates=covariates).count()
    regression.logreg('wald', 'sa.pheno.isCase',
                      covariates=['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1']).count()

    # Linear mixed model on a phenotype derived from one variant.
    assoc = hc.import_vcf(resource_dir + '/sample.vcf').split_multi()
    assoc = assoc.variant_qc()
    assoc = assoc.annotate_samples_expr(
        'sa.culprit = gs.filter(g => v == Variant("20", 13753124, "A", "C")).map(g => g.gt).collect()[0]')
    assoc = assoc.annotate_samples_expr('sa.pheno = rnorm(1,1) * sa.culprit')
    assoc = assoc.annotate_samples_expr('sa.cov1 = rnorm(0,1)')
    assoc = assoc.annotate_samples_expr('sa.cov2 = rnorm(0,1)')
    kinship = assoc.filter_variants_expr('va.qc.AF > .05')
    assoc = assoc.lmmreg(kinship, 'sa.pheno', ['sa.cov1', 'sa.cov2'])
    assoc.export_variants('/tmp/lmmreg.tsv', 'Variant = v, va.lmmreg.*')

    vds_split.mendel_errors('/tmp/sample.mendel', fam_file)
    vds_split.pca('sa.scores')

    repartitioned = vds2.repartition(16, shuffle=False)
    self.assertTrue(repartitioned.same(vds2))

    print(vds2.storage_level())

    vds_split.tdt(fam_file)

    vds2._typecheck()

    vds2_split.variant_qc().variant_schema

    # Export variants / samples and compare with the re-imported tables.
    vds2.export_variants('/tmp/variants.tsv', 'v = v, va = va')
    variants_json = vds2.variants_keytable().annotate('va = json(va)')
    variants_back = hc.import_keytable('/tmp/variants.tsv', ['v'],
                                       config=TextTableConfig(impute=True))
    self.assertTrue(variants_json.same(variants_back))

    vds2.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
    samples_json = vds2.samples_keytable().annotate(
        's = s.id, sa = json(sa)')
    samples_back = hc.import_keytable('/tmp/samples.tsv', ['s'],
                                      config=TextTableConfig(impute=True))
    self.assertTrue(samples_json.same(samples_back))

    # One gt/gq column pair per sample, after the fixed leading columns.
    per_sample = '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'
    export_cols = ['v = v, info = va.info']
    export_cols.extend(per_sample.format(s=sample_id)
                       for sample_id in vds2.sample_ids)
    indexed = vds2.annotate_variants_expr(
        'va.G = index(gs.map(g => { s: s.id, gt: g.gt, gq: g.gq }).collect(), s)')
    indexed.export_variants('/tmp/sample_kt.tsv', ','.join(export_cols))

    # NOTE(review): the result of ``same`` is discarded here, so a mismatch
    # would not fail the test -- confirm whether an assertion was intended.
    wide_kt = vds2.make_keytable('v = v, info = va.info',
                                 'gt = g.gt, gq = g.gq', ['v'])
    wide_kt.same(hc.import_keytable('/tmp/sample_kt.tsv', ['v']))

    vds_split.annotate_variants_expr(
        "va.nHet = gs.filter(g => g.isHet()).count()")

    # aggregate_by_key is called with both string and list arguments.
    het_sum = "nHet = g.map(g => g.isHet().toInt()).sum().toLong()"
    vds_split.aggregate_by_key("Variant = v", het_sum)
    vds_split.aggregate_by_key(["Variant = v"], [het_sum])

    vds2.make_keytable('v = v, info = va.info', 'gt = g.gt', ['v'])

    # Metadata accessors.
    vds.num_partitions()
    vds.file_version()
    vds.sample_ids[:5]

    self.assertFalse(vds.was_split())
    self.assertTrue(vds_split.was_split())

    self.assertFalse(vds.is_dosage())

    self.assertEqual(vds.num_samples, 100)
    self.assertEqual(vds.count_variants(), 346)

    vds2.filter_alleles('pcoin(0.5)')

    # Annotate variants from keytables, with default and explicit keys.
    vds2.annotate_variants_keytable(vds2.variants_keytable(),
                                    "va.foo = table.va")
    variants_kt = vds2.variants_keytable().annotate("v2 = v")
    variants_kt = variants_kt.key_by(["v", "v2"])
    vds2.annotate_variants_keytable(variants_kt, "va.foo = table.va",
                                    ["v", "v"])

    # Collect annotated variants into Python objects; the rows are unused.
    het_annotated = vds.annotate_variants_expr(
        'va.hets = gs.filter(g => g.isHet).collect()')
    het_rows = het_annotated.variants_keytable().collect()
def test_dataset(self):
    """Exercise the dataset API for regular and generic-genotype imports.

    ``sample.vcf`` and ``sample2.vcf`` are each imported twice, once
    normally and once with ``generic=True``.  The first part loops over
    the two (dataset, dataset2) pairs and runs the same sequence of calls,
    switching genotype expressions on ``_is_generic_genotype``.  The second
    part re-imports both files and runs the remaining calls once
    (allele annotation, concordance, exports, statistics, regression).
    Most calls are smoke checks whose results are discarded.
    """
    resource_dir = 'src/test/resources'
    interval_list = resource_dir + '/annotinterall.interval_list'
    sample_list = resource_dir + '/sample2.sample_list'
    sample_vcf = resource_dir + '/sample.vcf'
    sample2_vcf = resource_dir + '/sample2.vcf'
    fam_file = resource_dir + '/sample.fam'

    plain_a = hc.import_vcf(sample_vcf)
    plain_b = hc.import_vcf(sample2_vcf)
    generic_a = hc.import_vcf(sample_vcf, generic=True)
    generic_b = hc.import_vcf(sample2_vcf, generic=True)

    for ds, ds2 in [(plain_a, plain_b), (generic_a, generic_b)]:
        # Expression used wherever a genotype call is needed.
        g_expr = 'g.GT' if ds._is_generic_genotype else 'g'

        ds.cache()
        ds2.persist()

        ds.write('/tmp/sample.vds', overwrite=True)

        ds.count(genotypes=True)

        # Aggregation and queries.
        ds.aggregate_intervals(interval_list, 'N = variants.count()',
                               '/tmp/annotinter.tsv')
        ds.query_variants(['variants.count()'])
        ds.query_samples(['samples.count()'])

        # Global annotations (only the ``globals`` attribute is touched).
        with_gene_list = ds.annotate_global_list(
            resource_dir + '/global_list.txt', 'global.genes', as_set=True)
        with_gene_list.globals
        with_gene_table = ds.annotate_global_table(
            resource_dir + '/global_table.tsv', 'global.genes')
        with_gene_table.globals

        # Sample annotations.
        ncalled_samples = ds.annotate_samples_expr(
            'sa.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(g_expr))
        ncalled_samples.export_samples('/tmp/sa.tsv',
                                       's = s, nCalled = sa.nCalled')
        ds.annotate_samples_list(sample_list, 'sa.listed')
        ds.annotate_global_expr('global.foo = 5')
        ds.annotate_global_expr(['global.foo = 5', 'global.bar = 6'])
        ds_annot = ds.annotate_samples_table(
            resource_dir + '/sampleAnnotations.tsv', 'Sample',
            code='sa.isCase = table.Status == "CASE", sa.qPhen = table.qPhen')
        ds.annotate_samples_vds(ds_annot, code='sa.isCase = vds.isCase')

        # Variant annotations.
        bed_annotated = ds.annotate_variants_bed(
            resource_dir + '/example1.bed', root='va.bed')
        bed_annotated.filter_variants_expr('va.bed').count()
        ds.annotate_variants_expr(
            'va.nCalled = gs.filter(g => {0}.isCalled()).count()'.format(g_expr)
        ).count()
        ds.annotate_variants_intervals(
            interval_list, 'va.included', all=True).count()
        ds.annotate_variants_loci(
            resource_dir + '/sample2_loci.tsv',
            'Locus(chr, pos.toInt())',
            'va.locus_annot').count()
        ds.annotate_variants_table(
            resource_dir + '/variantAnnotations.tsv',
            'Variant(Chromosome, Position.toInt(), Ref, Alt)',
            root='va.table').count()
        ds.annotate_variants_vds(
            ds, code='va.good = va.info.AF == vds.info.AF').count()

        # Downsampling and exports.
        subset = ds.downsample_variants(20)
        subset.export_variants('/tmp/sample2_loci.tsv',
                               'chr = v.contig, pos = v.start')
        subset.export_variants('/tmp/sample2_variants.tsv', 'v')

        with open(sample_list) as handle:
            listed_ids = [line.strip() for line in handle]
        # NOTE(review): the comparison result is discarded, so this line can
        # never fail the test; presumably an assertion was intended --
        # confirm the expected count before converting it.
        listed_counts = ds.filter_samples_list(listed_ids).count()
        listed_counts['nSamples'] == 56

        ds.export_vcf('/tmp/sample2.vcf.bgz')

        # Dropping everything leaves empty dimensions.
        self.assertEqual(ds.drop_samples().count()['nSamples'], 0)
        self.assertEqual(ds.drop_variants().count()['nVariants'], 0)

        # Importing the same file twice and deduplicating.
        deduplicated = hc.import_vcf([sample2_vcf, sample2_vcf]).deduplicate()
        self.assertEqual(deduplicated.count()['nVariants'], 735)

        # Random filters, exported under /tmp.
        ds.filter_samples_expr('pcoin(0.5)').export_samples(
            '/tmp/sample2.sample_list', 's')
        ds.filter_variants_expr('pcoin(0.5)').export_variants(
            '/tmp/sample2.variant_list', 'v')

        # Interval filters built in several ways.
        ds.filter_variants_intervals(IntervalTree.read(interval_list)).count()
        ds.filter_variants_intervals(Interval.parse('1:100-end')).count()
        ds.filter_variants_intervals(
            IntervalTree.parse_all(['1:100-end', '3-22'])).count()
        ds.filter_variants_intervals(
            IntervalTree([Interval.parse('1:100-end')])).count()
        ds.filter_variants_intervals(IntervalTree.read(interval_list)).count()

        listed_variants = ds2.filter_variants_list(
            resource_dir + '/sample2_variants.tsv')
        self.assertEqual(listed_variants.count()['nVariants'], 21)

        # Rename samples from a two-column, header-less table, then join.
        rename_rows = hc.import_keytable(
            resource_dir + '/sample2_rename.tsv',
            config=TextTableConfig(noheader=True)).collect()
        renames = {row._0: row._1 for row in rename_rows}
        joined = ds2.join(ds2.rename_samples(renames))
        self.assertEqual(joined.count()['nSamples'], 200)

        ds._typecheck()

        # Export variants / samples and compare with the re-imported tables.
        ds.export_variants('/tmp/variants.tsv', 'v = v, va = va')
        variants_json = ds.variants_keytable().annotate('va = json(va)')
        variants_back = hc.import_keytable(
            '/tmp/variants.tsv',
            config=TextTableConfig(impute=True)).key_by('v')
        self.assertTrue(variants_json.same(variants_back))

        ds.export_samples('/tmp/samples.tsv', 's = s, sa = sa')
        samples_json = ds.samples_keytable().annotate('s = s, sa = json(sa)')
        samples_back = hc.import_keytable(
            '/tmp/samples.tsv',
            config=TextTableConfig(impute=True)).key_by('s')
        self.assertTrue(samples_json.same(samples_back))

        # Field lists for make_keytable (``=``) and for the struct literal (``:``).
        kt_fields, struct_fields = (
            ('gt = g.GT, gq = g.GQ', 'gt: g.GT, gq: g.GQ')
            if ds._is_generic_genotype else
            ('gt = g.gt, gq = g.gq', 'gt: g.gt, gq: g.gq'))

        # One gt/gq column pair per sample, after the fixed leading columns.
        per_sample = '{s}.gt = va.G["{s}"].gt, {s}.gq = va.G["{s}"].gq'
        export_cols = ['v = v, info = va.info']
        export_cols.extend(per_sample.format(s=sample_id)
                           for sample_id in ds.sample_ids)

        indexed = ds.annotate_variants_expr(
            'va.G = index(gs.map(g => { s: s, %s }).collect(), s)' % struct_fields)
        indexed.export_variants('/tmp/sample_kt.tsv', ','.join(export_cols))

        # NOTE(review): the result of ``same`` is discarded here, so a
        # mismatch would not fail the test -- confirm whether an assertion
        # was intended.
        wide_kt = ds.make_keytable('v = v, info = va.info', kt_fields, ['v'])
        wide_kt.same(hc.import_keytable('/tmp/sample_kt.tsv').key_by('v'))

        ds.annotate_variants_expr(
            "va.nHet = gs.filter(g => {0}.isHet()).count()".format(g_expr))

        # aggregate_by_key is called with both string and list arguments.
        ds.aggregate_by_key(
            "Variant = v",
            "nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(g_expr))
        ds.aggregate_by_key(
            ["Variant = v"],
            ["nHet = g.map(g => {0}.isHet().toInt()).sum().toLong()".format(g_expr)])

        ds.make_keytable('v = v, info = va.info',
                         'gt = {0}'.format(g_expr), ['v'])

        # Metadata accessors.
        ds.num_partitions()
        ds.file_version()
        ds.sample_ids[:5]
        ds.variant_schema
        ds.sample_schema

        self.assertFalse(ds.is_dosage())

        self.assertEqual(ds2.num_samples, 100)
        self.assertEqual(ds2.count_variants(), 735)

        # Annotate variants from keytables, with default and explicit keys.
        ds.annotate_variants_keytable(ds.variants_keytable(),
                                      "va.foo = table.va")
        variants_kt = ds.variants_keytable().annotate("v2 = v")
        variants_kt = variants_kt.key_by(["v", "v2"])
        ds.annotate_variants_keytable(variants_kt, "va.foo = table.va",
                                      ["v", "v"])

        self.assertEqual(variants_kt.query('v.fraction(x => x == v2)'), 1.0)

        # This is very slow!!!  The collected rows are not used afterwards.
        het_annotated = ds.annotate_variants_expr(
            'va.hets = gs.filter(g => {0}.isHet()).collect()'.format(g_expr))
        het_rows = (het_annotated.variants_keytable()
                    .filter('pcoin(0.1)')
                    .collect())

        het_filter = ('g.GT.isHet() && g.GQ > 20'
                      if ds._is_generic_genotype
                      else 'g.isHet() && g.gq > 20')
        ds.filter_genotypes(het_filter).export_genotypes(
            '/tmp/sample2_genotypes.tsv',
            'v, s, {0}.nNonRefAlleles()'.format(g_expr))

        repartitioned = ds.repartition(16, shuffle=False)
        self.assertTrue(repartitioned.same(ds))

        print(ds.storage_level())

    # ---- Calls run once, on fresh non-generic imports ----
    sample_a = hc.import_vcf(sample_vcf)
    sample_a.cache()
    split_a = sample_a.split_multi()

    sample_b = hc.import_vcf(sample2_vcf)
    sample_b.persist()
    split_b = sample_b.split_multi()

    sample_a.annotate_alleles_expr('va.gs = gs.callStats(g => v)').count()
    sample_a.annotate_alleles_expr(
        ['va.gs = gs.callStats(g => v)', 'va.foo = 5']).count()

    # Concordance of the split dataset with itself.
    conc_result = split_b.concordance(split_b)
    conc_table, conc_vds1, conc_vds2 = conc_result
    print(conc_table[1][4])
    print(conc_table[4][0])
    print(conc_table[:][3])
    # Both results go to the same path; the second write overwrites the first.
    conc_vds1.write('/tmp/foo.vds', overwrite=True)
    conc_vds2.write('/tmp/foo.vds', overwrite=True)

    split_b.export_gen('/tmp/sample2.gen')
    split_b.export_plink('/tmp/sample2')

    sample_b.filter_multi().count()

    sample_b.split_multi().grm('/tmp/sample2.grm', 'gcta-grm-bin')

    sample_b.hardcalls().count()

    split_b.ibd(min=0.2, max=0.6)

    sample_b.split_multi().impute_sex().variant_schema

    self.assertTrue(isinstance(sample_b.genotype_schema, TGenotype))

    # Regression dataset: covariates plus two phenotype tables.
    regression = hc.import_vcf(resource_dir + '/regressionLinear.vcf')
    regression = regression.split_multi()
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLinear.cov', 'Sample',
        root='sa.cov',
        config=TextTableConfig(types='Cov1: Double, Cov2: Double'))
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLinear.pheno', 'Sample',
        code='sa.pheno.Pheno = table.Pheno',
        config=TextTableConfig(types='Pheno: Double', missing='0'))
    regression = regression.annotate_samples_table(
        resource_dir + '/regressionLogisticBoolean.pheno', 'Sample',
        code='sa.pheno.isCase = table.isCase',
        config=TextTableConfig(types='isCase: Boolean', missing='0'))

    regression.linreg(
        'sa.pheno.Pheno',
        covariates=['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1']).count()
    regression.logreg(
        'wald', 'sa.pheno.isCase',
        covariates=['sa.cov.Cov1', 'sa.cov.Cov2 + 1 - 1']).count()

    # Linear mixed model, with a kinship matrix from the variants
    # selected by 'v.start < 4'.
    assoc = regression.annotate_samples_expr(
        'sa.culprit = gs.filter(g => v == Variant("1", 1, "C", "T")).map(g => g.gt).collect()[0]')
    assoc = assoc.annotate_samples_expr(
        'sa.pheno.PhenoLMM = (1 + 0.1 * sa.cov.Cov1 * sa.cov.Cov2) * sa.culprit')
    kinship_vds = assoc.filter_variants_expr('v.start < 4')
    kinship = kinship_vds.rrm(False, False)
    assoc = assoc.lmmreg(kinship, 'sa.pheno.PhenoLMM',
                         ['sa.cov.Cov1', 'sa.cov.Cov2'])
    assoc.export_variants('/tmp/lmmreg.tsv', 'Variant = v, va.lmmreg.*')

    split_a.mendel_errors('/tmp/sample.mendel', fam_file)

    split_a.pca('sa.scores')

    split_a.tdt(fam_file)

    split_b.variant_qc().variant_schema

    self.assertTrue(split_a.was_split())

    sample_b.filter_alleles('pcoin(0.5)')

    # Convert the generic genotype back with toGenotype(), then split.
    generic_a.annotate_genotypes_expr('g = g.GT.toGenotype()').split_multi()
def test_keytable(self):
    """Smoke-test the KeyTable API, including write/read and ``order_by``.

    Imports two TSV files and keys them by ``Sample``, checks column
    metadata and the row count, then calls the export, filter, annotate,
    join, aggregate, rename, select, re-key and conversion methods (most
    results are discarded; the calls only need to not raise).  It then
    checks a keytable -> dataframe -> keytable round trip, a filtered row
    count, a write/read round trip under ``/tmp``, and the row order
    produced by ``order_by`` on a small table with missing values.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_keytable(test_resources + '/sampleAnnotations.tsv',
                            config=TextTableConfig(impute=True)).key_by('Sample')
    kt2 = hc.import_keytable(test_resources + '/sampleAnnotations2.tsv',
                             config=TextTableConfig(impute=True)).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key_names[0], "Sample")
    self.assertEqual(kt.column_names[2], "qPhen")
    self.assertEqual(kt.count_rows(), 100)
    kt.schema  # attribute access only; the value is not checked

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): presumably the second argument is a keep/remove flag, so
    # keeping "CASE" and removing "CTRL" select the same rows -- confirm.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    (kt.annotate('X = Status')
     .count_rows())

    # Join
    kt.join(kt2, 'left').count_rows()

    # AggregateByKey
    (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()")
     .count_rows())

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Rename: by mapping, by explicit list, and by derived list.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.column_names])

    # Select and re-key.
    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    # Structural conversions.
    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Round trip: flattened variant keytable -> dataframe -> keytable.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_keytable()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = hc.dataframe_to_keytable(
        sample_variants.to_dataframe(), ['v'])
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue on ``==``) so that a failure
    # reports the actual row count.
    self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

    # Write / read round trip.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    # Some rows omit 'a' or 'b'; the expected lists below place the
    # resulting None values last for both ascending and desc() order.
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])