def test_keytable(self):
    """Exercise the KeyTable API against the sample annotation fixtures.

    Covers import, basic properties, export, filter/same, annotate, join,
    aggregate_by_key, forall/exists, rename/select/drop/key_by,
    flatten/expand_types, a DataFrame round trip, write/read, order_by,
    import_fam, union, take, range and indexed.

    Calls whose result is discarded only check that the operation runs
    without raising. The expected counts and values are tied to the
    fixture files under ``src/test/resources``.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_table(
        test_resources + '/sampleAnnotations.tsv',
        impute=True).key_by('Sample')
    kt2 = hc.import_table(
        test_resources + '/sampleAnnotations2.tsv',
        impute=True).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key[0], "Sample")
    self.assertEqual(kt.columns[2], "qPhen")
    self.assertEqual(kt.count(), 100)
    kt.schema  # attribute access only; verifies the schema can be built

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): the second positional argument is presumably the
    # "keep" flag, so both tables should hold only CASE rows -- confirm.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count()

    # Join
    kt.join(kt2, 'left').count()

    # AggregateByKey
    kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()").count()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Rename accepts a mapping or a full list of new names.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.columns])

    # Select / drop accept a single name or a list of names.
    kt.select("Sample")
    kt.select(["Sample", "Status"])

    kt.drop("Sample")
    kt.drop(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # DataFrame round trip: converting to a DataFrame and back must
    # reproduce the same table.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (
        sample.variants_table()
        .annotate('v = str(v), va.filters = va.filters.toArray()')
        .flatten())

    sample_variants2 = KeyTable.from_dataframe(
        sample_variants.to_dataframe()).key_by('v')
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue(a == b)) reports the actual
    # count when the check fails.
    self.assertEqual(kt.filter('qPhen < 10000').count(), 23)

    # Write / read round trip.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_table('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt32(), TString()])
    rows = [
        {'a': 5},
        {'a': 5, 'b': 'quam'},
        {'a': -1, 'b': 'quam'},
        {'b': 'foo'},
        {'a': 7, 'b': 'baz'},
    ]
    kt4 = KeyTable.parallelize(rows, schema, num_partitions=3)

    # The expected lists below place missing values (None) last for both
    # ascending and descending sorts.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(
        byab,
        [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(
        bydescab,
        [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])

    KeyTable.import_fam(test_resources + '/sample.fam')._typecheck()

    # Union with one or several tables concatenates the rows.
    self.assertEqual(kt.union(kt).count(), kt.count() * 2)
    self.assertEqual(kt.union(kt, kt).count(), kt.count() * 3)

    first3 = kt.take(3)
    self.assertEqual(first3[0].qPhen, 27704)
    self.assertEqual(first3[1].qPhen, 16636)
    self.assertEqual(first3[2].qPhen, 7256)
    self.assertEqual(first3[0].Sample, 'HG00096')
    self.assertEqual(first3[1].Sample, 'HG00097')
    self.assertEqual(first3[2].Sample, 'HG00099')
    self.assertTrue(all(x.Status == 'CASE' for x in first3))

    # list(range(...)) keeps the comparison valid on Python 3, where
    # range() returns a lazy range object that never equals a list.
    self.assertEqual(
        list(range(10)),
        [x.index for x in KeyTable.range(10).collect()])
    self.assertTrue(
        KeyTable.range(200).indexed('foo').forall('index == foo'))
def test_keytable(self):
    """Exercise the KeyTable API against the sample annotation fixtures.

    Covers import, basic properties, export, filter/same, annotate, join,
    aggregate_by_key, forall/exists, rename/select/key_by,
    flatten/expand_types, a DataFrame round trip, write/read and order_by.

    Calls whose result is discarded only check that the operation runs
    without raising. The expected counts and values are tied to the
    fixture files under ``src/test/resources``.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_keytable(
        test_resources + '/sampleAnnotations.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')
    kt2 = hc.import_keytable(
        test_resources + '/sampleAnnotations2.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key_names[0], "Sample")
    self.assertEqual(kt.column_names[2], "qPhen")
    self.assertEqual(kt.count_rows(), 100)
    kt.schema  # attribute access only; verifies the schema can be built

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): the second positional argument is presumably the
    # "keep" flag, so both tables should hold only CASE rows -- confirm.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    kt.annotate('X = Status').count_rows()

    # Join
    kt.join(kt2, 'left').count_rows()

    # AggregateByKey
    kt.aggregate_by_key(
        "Status = Status", "Sum = qPhen.sum()").count_rows()

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Rename accepts a mapping or a full list of new names.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.column_names])

    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # DataFrame round trip: converting to a DataFrame and back must
    # reproduce the same table.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (
        sample.variants_keytable()
        .annotate('v = str(v), va.filters = va.filters.toArray()')
        .flatten())

    sample_variants2 = hc.dataframe_to_keytable(
        sample_variants.to_dataframe(), ['v'])
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # assertEqual (rather than assertTrue(a == b)) reports the actual
    # count when the check fails.
    self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

    # Write / read round trip.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [
        {'a': 5},
        {'a': 5, 'b': 'quam'},
        {'a': -1, 'b': 'quam'},
        {'b': 'foo'},
        {'a': 7, 'b': 'baz'},
    ]
    kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

    # The expected lists below place missing values (None) last for both
    # ascending and descending sorts.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(
        byab,
        [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(
        bydescab,
        [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])
def test_keytable(self):
    """Run the KeyTable operations over the sample annotation tables.

    Sections, in order: import, table properties, export, filter/same,
    annotate, join, aggregate_by_key, forall/exists, rename, select,
    key_by, flatten/expand_types, to_dataframe, explode, DataFrame round
    trip, write/read round trip, and order_by.

    Statements that discard their result are there only to confirm the
    call does not raise. Expected numbers come from the fixture files in
    ``src/test/resources``.
    """
    test_resources = 'src/test/resources'

    # Import
    # columns: Sample Status qPhen
    kt = hc.import_keytable(
        test_resources + '/sampleAnnotations.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')
    kt2 = hc.import_keytable(
        test_resources + '/sampleAnnotations2.tsv',
        config=TextTableConfig(impute=True)).key_by('Sample')

    # Variables
    self.assertEqual(kt.num_columns, 3)
    self.assertEqual(kt.key_names[0], "Sample")
    self.assertEqual(kt.column_names[2], "qPhen")
    self.assertEqual(kt.count_rows(), 100)
    kt.schema  # evaluated for its side effect of building the schema

    # Export
    kt.export('/tmp/testExportKT.tsv')

    # Filter, Same
    # NOTE(review): assumes the boolean argument selects keep (True) vs.
    # remove (False) -- confirm against KeyTable.filter.
    ktcase = kt.filter('Status == "CASE"', True)
    ktcase2 = kt.filter('Status == "CTRL"', False)
    self.assertTrue(ktcase.same(ktcase2))

    # Annotate
    (kt.annotate('X = Status')
     .count_rows())

    # Join
    kt.join(kt2, 'left').count_rows()

    # AggregateByKey
    (kt.aggregate_by_key("Status = Status", "Sum = qPhen.sum()")
     .count_rows())

    # Forall, Exists
    self.assertFalse(kt.forall('Status == "CASE"'))
    self.assertTrue(kt.exists('Status == "CASE"'))

    # Rename: by mapping, by explicit list, and by derived list.
    kt.rename({"Sample": "ID"})
    kt.rename(["Field1", "Field2", "Field3"])
    kt.rename([name + "_a" for name in kt.column_names])

    kt.select(["Sample"])
    kt.select(["Sample", "Status"])

    kt.key_by(['Sample', 'Status'])
    kt.key_by([])

    kt.flatten()
    kt.expand_types()

    kt.to_dataframe()

    kt.annotate("newField = [0, 1, 2]").explode(["newField"])

    # A table converted to a DataFrame and back must compare as the same.
    sample = hc.import_vcf(test_resources + '/sample.vcf')
    sample_variants = (sample.variants_keytable()
                       .annotate('v = str(v), va.filters = va.filters.toArray()')
                       .flatten())

    sample_variants2 = hc.dataframe_to_keytable(
        sample_variants.to_dataframe(), ['v'])
    self.assertTrue(sample_variants.same(sample_variants2))

    # cseed: calculated by hand using sort -n -k 3,3 and inspection
    # Use assertEqual so a mismatch shows the actual row count instead of
    # a bare "False is not true".
    self.assertEqual(kt.filter('qPhen < 10000').count_rows(), 23)

    # A table written to disk and read back must compare as the same.
    kt.write('/tmp/sampleAnnotations.kt', overwrite=True)
    kt3 = hc.read_keytable('/tmp/sampleAnnotations.kt')
    self.assertTrue(kt.same(kt3))

    # test order_by
    schema = TStruct(['a', 'b'], [TInt(), TString()])
    rows = [{'a': 5},
            {'a': 5, 'b': 'quam'},
            {'a': -1, 'b': 'quam'},
            {'b': 'foo'},
            {'a': 7, 'b': 'baz'}]
    kt4 = KeyTable.from_py(hc, rows, schema, npartitions=3)

    # In every expected ordering below, rows with a missing sort field
    # (None) come after rows with a value.
    bya = [r.get('a') for r in kt4.order_by('a').collect()]
    self.assertEqual(bya, [-1, 5, 5, 7, None])

    bydesca = [r.get('a') for r in kt4.order_by(desc('a')).collect()]
    self.assertEqual(bydesca, [7, 5, 5, -1, None])

    byab = [(r.get('a'), r.get('b'))
            for r in kt4.order_by('a', 'b').collect()]
    self.assertEqual(
        byab,
        [(-1, 'quam'), (5, 'quam'), (5, None), (7, 'baz'), (None, 'foo')])

    bydescab = [(r.get('a'), r.get('b'))
                for r in kt4.order_by(desc('a'), 'b').collect()]
    self.assertEqual(
        bydescab,
        [(7, 'baz'), (5, 'quam'), (5, None), (-1, 'quam'), (None, 'foo')])