def test_kt_globals(self):
    """Check that table globals can be referenced from a row filter.

    Two globals are attached to a 10-row range table: ``foo`` through an
    expression string and ``bar`` through a Python value with an explicit
    array-of-int32 type. The filter keeps rows whose ``idx`` occurs in either
    global, and exactly six rows are expected to survive.
    """
    table = (KeyTable.range(10)
             .annotate_global_expr('foo = [1,2,3]')
             .annotate_global('bar', [4, 5, 6], TArray(TInt32())))

    # A row matches when its idx is a member of either global array.
    predicate = 'foo.exists(x => x == idx) || bar.exists(x => x == idx)'
    matching_rows = table.filter(predicate).count()

    self.assertEqual(matching_rows, 6)
def test_keytable(self):
    """Broad smoke/regression test of the KeyTable API.

    Imports the sample-annotation fixtures, then exercises metadata
    accessors, export, filter/same, annotate, join, aggregate_by_key,
    forall/exists, rename/select/drop/key_by, flatten/expand_types,
    dataframe round-tripping, write/read, order_by (including missing
    values), import_fam, union, take/head, range/indexed and
    ungroup/group. Many calls are executed only to confirm they do not
    raise; their results are intentionally discarded.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_table(test_resources + '/sampleAnnotations.tsv',
                         impute=True).key_by('Sample')
    kt2 = hc.import_table(test_resources + '/sampleAnnotations2.tsv',
                          impute=True).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key[0], "Sample")
    self.assertEqual(kt.columns[2], "qPhen")
    self.assertEqual(kt.count(), 100)
    kt.schema  # attribute access only; verifies it can be evaluated

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): the second positional argument is presumably `keep`;
    # the assertion below expects "keep CASE" to equal "not-keep CTRL".
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count()

    # Join
    kt.join(kt2, 'left').count()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Column manipulation: executed for "does not raise" coverage only.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.columns])

    kt.select("Sample")
    kt.select(["Sample", "Status"], qualified_name=True)

    kt.drop("Sample")
    kt.drop(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe().count()

    kt.show(10)
    kt.show(4, print_types=False, truncate_to=15)

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # Round-trip a variants table through a dataframe and back.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_table()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = KeyTable.from_dataframe(
        sample_variants.to_dataframe()).key_by('v')
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (not assertTrue(x == 23)) so a failure shows the real count.
    self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

    # Write / read round trip.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt32(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

    # The expected lists below place missing values (None) last in both
    # ascending and descending order.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(byab, [(-1, 'quam'), (5, 'quam'), (5, None),
                            (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(bydescab, [(7, 'baz'), (5, 'quam'), (5, None),
                                (-1, 'quam'), (None, 'foo')])

    KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

    # Union
    self.assertEqual(kt.union(kt).count(), kt.count() * 2)
    self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

    # Take / head
    first3 = kt.take(3)
    self.assertEqual(first3[0].qPhen, 27704)
    self.assertEqual(first3[1].qPhen, 16636)
    self.assertEqual(first3[2].qPhen, 7256)
    self.assertEqual(first3[0].Sample, 'HG00096')
    self.assertEqual(first3[1].Sample, 'HG00097')
    self.assertEqual(first3[2].Sample, 'HG00099')
    self.assertTrue(all(x.Status == 'CASE' for x in first3))

    # Was assertTrue(count, 3): the 3 was taken as the failure message, so
    # the check passed for any non-zero count. Compare the values instead.
    self.assertEqual(kt.head(3).count(), 3)

    # list(...) keeps the comparison list-to-list even where range() does
    # not return a list.
    self.assertEqual(list(range(10)),
                     [x.idx for x in KeyTable.range(10).collect()])
    self.assertTrue(
        KeyTable.range(200).indexed('foo').forall('idx == foo'))

    # Ungroup followed by group should reproduce the original nested table.
    nested = KeyTable.parallelize(
        [{'A': Struct(c1=5, c2=21)}],
        TStruct(['A'], [TStruct(['c1', 'c2'], [TInt32(), TInt32()])]))

    self.assertTrue(nested.ungroup('A').group('A', 'c1', 'c2').same(nested))