Exemple #1
0
    def test(self):
        """Smoke-test the expression functions on a single-row table.

        Annotates a one-row KeyTable with one column per expression function
        and converts the first annotated row with ``convert_struct_to_dict``.
        Nothing is asserted about the computed values: the test passes as long
        as building and evaluating every expression raises no error.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
        field_types = [
            TInt32(),
            TInt32(),
            TInt32(),
            TInt32(),
            TString(),
            TArray(TInt32()),
            TArray(TStruct(['x', 'y', 'z'], [TInt32(), TInt32(), TString()])),
            TStruct(['a', 'b', 'c'], [TInt32(), TInt32(), TString()]),
            TBoolean(),
            TStruct(['x', 'y', 'z'], [TInt32(), TInt32(), TString()]),
        ]
        schema = TStruct(field_names, field_types)

        row = {
            'a': 4,
            'b': 1,
            'c': 3,
            'd': 5,
            'e': "hello",
            'f': [1, 2, 3],
            'g': [Struct({'x': 1, 'y': 5, 'z': "banana"})],
            'h': Struct({'a': 5, 'b': 3, 'c': "winter"}),
            'i': True,
            'j': Struct({'x': 3, 'y': 2, 'z': "summer"}),
        }

        kt = KeyTable.parallelize([row], schema)

        # One new column per function under test; each column is named after
        # the function that produces it.
        annotations = {
            'chisq': chisq(kt.a, kt.b, kt.c, kt.d),
            'combvar': combine_variants(Variant.parse("1:2:A:T"), Variant.parse("1:2:A:C")),
            'ctt': ctt(kt.a, kt.b, kt.c, kt.d, 5),
            'Dict': Dict([kt.a, kt.b], [kt.c, kt.d]),
            'dpois': dpois(4, kt.a),
            'drop': drop(kt.h, 'b', 'c'),
            'exp': exp(kt.c),
            'fet': fet(kt.a, kt.b, kt.c, kt.d),
            'gt_index': gt_index(kt.a, kt.b),
            'gtj': gtj(kt.a),
            'gtk': gtk(kt.b),
            'hwe': hwe(1, 2, 1),
            'index': index(kt.g, 'z'),
            'is_defined': is_defined(kt.i),
            'is_missing': is_missing(kt.i),
            'is_nan': is_nan(kt.a.to_float64()),
            'json': json(kt.g),
            'log': log(kt.a.to_float64(), kt.b.to_float64()),
            'log10': log10(kt.c.to_float64()),
            'merge': merge(kt.h, kt.j),
            'or_else': or_else(kt.a, 5),
            'or_missing': or_missing(kt.i, kt.j),
            'pchisqtail': pchisqtail(kt.a.to_float64(), kt.b.to_float64()),
            'pcoin': pcoin(0.5),
            'pnorm': pnorm(0.2),
            'pow': pow(2.0, kt.b),
            'ppois': ppois(kt.a.to_float64(), kt.b.to_float64()),
            'qchisqtail': qchisqtail(kt.a.to_float64(), kt.b.to_float64()),
            'range': range(0, 5, kt.b),
            'rnorm': rnorm(0.0, kt.b),
            'rpois': rpois(kt.a),
            'runif': runif(kt.b, kt.a),
            'select': select(kt.h, 'c', 'b'),
            'sqrt': sqrt(kt.a),
            'to_str': [to_str(5), to_str(kt.a), to_str(kt.g)],
            'where': where(kt.i, 5, 10),
        }

        annotated = kt.annotate(**annotations)
        first_row = annotated.to_hail1().take(1)[0]

        # Only successful evaluation and conversion is being checked here.
        convert_struct_to_dict(first_row)
Exemple #2
0
    def test_select(self):
        """Check the column names produced by ``KeyTable.select``.

        Covers columns passed positionally, columns unpacked from a list, a
        keyword-named derived expression, and unpacking the struct column
        ``g`` into its fields ``x`` and ``y``.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
        field_types = [
            TInt32(),
            TInt32(),
            TInt32(),
            TInt32(),
            TString(),
            TArray(TInt32()),
            TStruct(['x', 'y'], [TBoolean(), TInt32()]),
        ]
        schema = TStruct(field_names, field_types)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': {'x': True, 'y': 2}},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': [], 'g': {'x': True, 'y': 2}},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7], 'g': None},
        ]

        kt = KeyTable.parallelize(rows, schema)

        # Two plain columns, given directly.
        positional = kt.select(False, kt.a, kt.e)
        self.assertEqual(positional.columns, ['a', 'e'])

        # The same two columns, unpacked from a list.
        unpacked = kt.select(False, *[kt.a, kt.e])
        self.assertEqual(unpacked.columns, ['a', 'e'])

        # A keyword argument names the derived column.
        with_derived = kt.select(False, kt.a, foo=kt.a + kt.b - kt.c - kt.d)
        self.assertEqual(with_derived.columns, ['a', 'foo'])

        # Unpacking the struct column contributes one column per struct field.
        with_struct_fields = kt.select(False, kt.a, *kt.g, foo=kt.a + kt.b - kt.c - kt.d)
        self.assertEqual(with_struct_fields.columns, ['a', 'x', 'y', 'foo'])
Exemple #3
0
    def test_filter(self):
        """Check the row counts left by ``KeyTable.filter``.

        Exercises a single equality predicate, predicates combined with
        ``|`` and ``&``, and the constant predicate ``True``.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f']
        field_types = [TInt32(), TInt32(), TInt32(), TInt32(), TString(), TArray(TInt32())]
        schema = TStruct(field_names, field_types)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
        ]

        kt = KeyTable.parallelize(rows, schema)

        # Rows one and three have a == 4.
        a_is_four = kt.filter(kt.a == 4)
        self.assertEqual(a_is_four.count(), 2)

        # Each row satisfies exactly one of the three alternatives.
        any_of_three = kt.filter((kt.d == -1) | (kt.c == 20) | (kt.e == "hello"))
        self.assertEqual(any_of_three.count(), 3)

        # Only the first row has a == 4 without c == 20.
        both_conditions = kt.filter((kt.c != 20) & (kt.a == 4))
        self.assertEqual(both_conditions.count(), 1)

        # A constant True predicate keeps every row.
        everything = kt.filter(True)
        self.assertEqual(everything.count(), 3)
Exemple #4
0
    def test_numeric_conversion(self):
        """Check the element type of arrays that mix numeric types.

        Arrays are built from Python floats, ints and longs together with
        Float64, Int32 and Int64 columns. The annotated table's schema must
        match ``expected_schema`` exactly: any array containing a float is
        expected to be ``TArray(TFloat64())``, and the remaining arrays, which
        contain an Int64 column or a Python long, ``TArray(TInt64())``.
        """
        schema = TStruct(['a', 'b', 'c', 'd'], [TFloat64(), TFloat64(), TInt32(), TInt64()])
        rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': long(5)}]
        kt = KeyTable.parallelize(rows, schema)

        kt = kt.annotate(x1 = [1.0, kt.a, 1, long(1)],
                         x2 = [1, 1.0],
                         x3 = [kt.a, kt.c],
                         x4 = [kt.c, kt.d],
                         x5 = [1, kt.c, long(1)])

        expected_schema = {'a': TFloat64(), 'b': TFloat64(), 'c': TInt32(), 'd': TInt64(),
                           'x1': TArray(TFloat64()), 'x2': TArray(TFloat64()), 'x3': TArray(TFloat64()),
                           'x4': TArray(TInt64()), 'x5': TArray(TInt64())}

        # Compare the full name -> type mapping rather than assertTrue(all(...)):
        # a missing or unexpected column now fails the test, and the failure
        # message shows which field differs.
        actual_schema = {fd.name: fd.typ for fd in kt.schema.fields}
        self.assertDictEqual(actual_schema, expected_schema)
Exemple #5
0
    def test_array_column(self):
        """Check indexing and slicing of an array column.

        Uses a single row whose column ``a`` is ``[1, 2, 3]`` and compares the
        results of element access and several slices against the expected
        Python values.
        """
        schema = TStruct(['a'], [TArray(TInt32())])
        kt = KeyTable.parallelize([{'a': [1, 2, 3]}], schema)

        annotations = {
            'x1': kt.a[0],
            'x2': kt.a[2],
            'x3': kt.a[:],
            'x4': kt.a[1:2],
            'x5': kt.a[-1:2],
            'x6': kt.a[:2],
        }
        first_row = kt.annotate(**annotations).to_hail1().take(1)[0]
        result = convert_struct_to_dict(first_row)

        expected = {
            'a': [1, 2, 3],
            'x1': 1,
            'x2': 3,
            'x3': [1, 2, 3],
            'x4': [2],
            'x5': [],
            'x6': [1, 2],
        }

        self.assertDictEqual(result, expected)
Exemple #6
0
    def test_query(self):
        """Check table-wide aggregation queries.

        Queries the sum and count of column ``b``, collects column ``e``, and
        collects ``e`` again after filtering on the row's ``d`` and ``a``
        values.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f']
        field_types = [TInt32(), TInt32(), TInt32(), TInt32(), TString(), TArray(TInt32())]
        schema = TStruct(field_names, field_types)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
        ]

        kt = KeyTable.parallelize(rows, schema)
        agg = kt.aggregate()

        # Two aggregations requested in a single query come back together.
        b_sum, b_count = agg.query(agg.b.sum(), agg.b.count())
        all_e = agg.query(agg.e.collect())
        # The lambda's second argument gives access to the other columns.
        selected_e = agg.query(agg.e.filter(lambda x, _: (_.d >= 5) | (_.a == 0)).collect())

        self.assertEqual(b_sum, 8)
        self.assertEqual(b_count, 3)
        # Collected values are compared as sets, so their order is not checked.
        self.assertEqual(set(all_e), {"hello", "cat", "dog"})
        self.assertEqual(set(selected_e), {"hello", "cat"})
Exemple #7
0
    def test_aggregate(self):
        """Check per-key aggregations after grouping by ``status``.

        Groups three rows by ``status``, computes a collection of aggregations
        over ``qPheno`` and ``gt`` with ``aggregate_by_key``, and compares the
        first returned row with the values expected for the ``status == 0``
        group.
        """
        schema = TStruct(['status', 'gt', 'qPheno'],
                         [TInt32(), TGenotype(), TInt32()])

        rows = [
            {'status': 0, 'gt': Genotype(0), 'qPheno': 3},
            {'status': 0, 'gt': Genotype(1), 'qPheno': 13},
            {'status': 1, 'gt': Genotype(1), 'qPheno': 20},
        ]

        kt = KeyTable.parallelize(rows, schema)
        grouped = kt.group_by(status = kt.status)

        aggregations = {
            'x1': grouped.qPheno.map(lambda x, _: x * 2).collect(),
            'x2': grouped.qPheno.flat_map(lambda x, _: [x, x + 1]).collect(),
            'x3': grouped.qPheno.min(),
            'x4': grouped.qPheno.max(),
            'x5': grouped.qPheno.sum(),
            'x6': grouped.qPheno.map(lambda x, _: x.to_int64()).product(),
            'x7': grouped.qPheno.count(),
            'x8': grouped.qPheno.filter(lambda x, _: x == 3).count(),
            'x9': grouped.qPheno.fraction(lambda x, _: x == 1),
            'x10': grouped.qPheno.map(lambda x, _: x.to_float64()).stats(),
            'x11': grouped.gt.hardy_weinberg(),
            'x13': grouped.gt.inbreeding(lambda x, _: 0.1),
            'x14': grouped.gt.call_stats(lambda g, _: Variant("1", 10000, "A", "T")),
            'x15': grouped.gt.map(
                lambda g, _: Struct({'a': 5, 'b': "foo", 'c': Struct({'banana': 'apple'})})
            ).collect()[0],
            'x16': grouped.gt.map(
                lambda g, _: Struct({'a': 5, 'b': "foo", 'c': Struct({'banana': 'apple'})})
            ).map(lambda s, _: s.c.banana).collect()[0],
        }

        aggregated = grouped.aggregate_by_key(num_partitions=5, **aggregations)
        # NOTE(review): the expected values below are those of the status == 0
        # group, so this relies on that group being the first row returned by
        # take(1) -- confirm the ordering is guaranteed.
        first_row = aggregated.to_hail1().take(1)[0]
        result = convert_struct_to_dict(first_row)

        expected = {
            'status': 0,
            'x1': [6, 26],
            'x2': [3, 4, 13, 14],
            'x3': 3,
            'x4': 13,
            'x5': 16,
            'x6': 39,
            'x7': 2,
            'x8': 1,
            'x9': 0.0,
            'x10': {'mean': 8, 'stdev': 5, 'min': 3, 'max': 13, 'nNotMissing': 2, 'sum': 16},
            'x11': {'rExpectedHetFrequency': 1.0, 'pHWE': 0.5},
            'x13': {'nCalled': 1, 'expectedHoms': 0.82, 'Fstat': -4.5555555555555545,
                    'nTotal': 2, 'observedHoms': 0},
            'x14': {'AC': [1, 1], "AF": [0.5, 0.5], "GC": [0, 1, 0], "AN": 2},
            'x15': {'a': 5, 'b': 'foo', 'c': {'banana': 'apple'}},
            'x16': 'apple',
        }

        self.assertDictEqual(result, expected)
Exemple #8
0
    def test_constructors(self):
        """Check the column types produced by the genetic column constructors.

        Annotates a one-row table with columns built by the ``VariantColumn``,
        ``LocusColumn``, ``IntervalColumn`` and ``GenotypeColumn``
        constructors and verifies that the resulting schema matches
        ``expected_schema`` exactly.
        """
        schema = TStruct(['a', 'b', 'c', 'd'], [TFloat64(), TFloat64(), TInt32(), TInt64()])
        rows = [{'a': 2.0, 'b': 4.0, 'c': 1, 'd': long(5)}]
        kt = KeyTable.parallelize(rows, schema)

        kt = kt.annotate(v1 = VariantColumn.parse("1:500:A:T"),
                         v2 = VariantColumn.from_args("1", 23, "A", "T"),
                         v3 = VariantColumn.from_args("1", 23, "A", ["T", "G"]),
                         l1 = LocusColumn.parse("1:51"),
                         l2 = LocusColumn.from_args("1", 51),
                         i1 = IntervalColumn.parse("1:51-56"),
                         i2 = IntervalColumn.from_args("1", 51, 56),
                         i3 = IntervalColumn.from_loci(LocusColumn.from_args("1", 51), LocusColumn.from_args("1", 56)))

        # The genotype columns are added in a second step because g2 refers to
        # the v1 column created by the first annotate call.
        kt = kt.annotate(g1 = GenotypeColumn.from_call(CallColumn.from_int32(1)),
                         g2 = GenotypeColumn.pl_genotype(kt.v1, CallColumn.from_int32(1), [6, 7], 13, 20, [20, 0, 1000]))

        # Only columns that are actually created are listed; the former 'g3'
        # and 'g4' entries were never annotated and could not be checked.
        expected_schema = {'a': TFloat64(), 'b': TFloat64(), 'c': TInt32(), 'd': TInt64(), 'v1': TVariant(),
                           'v2': TVariant(), 'v3': TVariant(), 'l1': TLocus(), 'l2': TLocus(), 'i1': TInterval(),
                           'i2': TInterval(), 'i3': TInterval(), 'g1': TGenotype(), 'g2': TGenotype()}

        # Compare the full name -> type mapping rather than assertTrue(all(...)):
        # a missing or unexpected column now fails the test, and the failure
        # message shows which field differs.
        actual_schema = {fd.name: fd.typ for fd in kt.schema.fields}
        self.assertDictEqual(actual_schema, expected_schema)
Exemple #9
0
    def test_dict_column(self):
        """Check lookups and methods on a dict column.

        Adds a dict column ``a`` built by ``Dict(['cat', 'dog'], [3, 7])`` to
        a one-row table, then verifies key lookup, ``contains``, ``is_empty``,
        ``key_set``, ``keys``, ``values``, ``size`` and ``map_values``.
        """
        schema = TStruct(['x'], [TFloat64()])
        kt = KeyTable.parallelize([{'x': 2.0}], schema)

        # The dict column has to exist before the expressions below can use it.
        kt = kt.annotate(a = Dict(['cat', 'dog'], [3, 7]))

        annotations = {
            'x1': kt.a['cat'],
            'x2': kt.a['dog'],
            'x3': kt.a.contains('rabbit'),
            'x4': kt.a.is_empty(),
            'x5': kt.a.key_set(),
            'x6': kt.a.keys(),
            'x7': kt.a.values(),
            'x8': kt.a.size(),
            'x9': kt.a.map_values(lambda v: v.to_float64()),
        }
        first_row = kt.annotate(**annotations).to_hail1().take(1)[0]
        result = convert_struct_to_dict(first_row)

        expected = {
            'a': {'cat': 3, 'dog': 7},
            'x': 2.0,
            'x1': 3,
            'x2': 7,
            'x3': False,
            'x4': False,
            'x5': {'cat', 'dog'},
            'x6': ['cat', 'dog'],
            'x7': [3, 7],
            'x8': 2,
            'x9': {'cat': 3.0, 'dog': 7.0},
        }

        self.assertDictEqual(result, expected)
Exemple #10
0
    def test_annotate(self):
        """Check ``KeyTable.annotate`` on the first row of a three-row table.

        Three scenarios are covered: simple derived columns, dotted names that
        are expected to produce nested structs, and a set of array-column
        methods together with ``where`` and a Python set literal.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f']
        field_types = [TInt32(), TInt32(), TInt32(), TInt32(), TString(), TArray(TInt32())]
        schema = TStruct(field_names, field_types)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
        ]

        kt = KeyTable.parallelize(rows, schema)

        # Scenario 1: columns derived from an existing column.
        derived = kt.annotate(foo = kt.a + 1, foo2 = kt.a)
        result1 = convert_struct_to_dict(derived.to_hail1().take(1)[0])

        expected1 = {
            'a': 4,
            'b': 1,
            'c': 3,
            'd': 5,
            'e': "hello",
            'f': [1, 2, 3],
            'foo': 5,
            'foo2': 4,
        }
        self.assertDictEqual(result1, expected1)

        # Scenario 2: dotted names; 'a' and 'b' are expected to come back as
        # nested structs in place of the original integer columns.
        dotted = {
            'a.foo': 5,
            'b.x': "hello",
            'b.y': 23,
            'b.z': True,
            'b.q.hello': [1, 2, 3],
        }
        nested = kt.annotate(**dotted)
        result2 = convert_struct_to_dict(nested.to_hail1().take(1)[0])

        expected2 = {
            'a': {'foo': 5},
            'b': {'x': "hello", 'y': 23, 'z': True, 'q': {'hello': [1, 2, 3]}},
            'c': 3,
            'd': 5,
            'e': "hello",
            'f': [1, 2, 3],
        }
        self.assertDictEqual(result2, expected2)

        # Scenario 3: array-column methods, where(), and a literal set.
        array_annotations = {
            'x1': kt.f.map(lambda x: x * 2),
            'x2': kt.f.map(lambda x: [x, x + 1]).flat_map(lambda x: x),
            'x3': kt.f.min(),
            'x4': kt.f.max(),
            'x5': kt.f.sum(),
            'x6': kt.f.product(),
            'x7': kt.f.length(),
            'x8': kt.f.filter(lambda x: x == 3),
            'x9': kt.f.tail(),
            'x10': kt.f[:],
            'x11': kt.f[1:2],
            'x12': kt.f.map(lambda x: [x, x + 1]),
            'x13': kt.f.map(lambda x: [[x, x + 1], [x + 2]]).flat_map(lambda x: x),
            'x14': where(kt.a < kt.b, kt.c, Column.null(TInt32())),
            'x15': {1, 2, 3},
        }
        with_arrays = kt.annotate(**array_annotations)
        result3 = convert_struct_to_dict(with_arrays.to_hail1().take(1)[0])

        expected3 = {
            'a': 4,
            'b': 1,
            'c': 3,
            'd': 5,
            'e': "hello",
            'f': [1, 2, 3],
            'x1': [2, 4, 6],
            'x2': [1, 2, 2, 3, 3, 4],
            'x3': 1,
            'x4': 3,
            'x5': 6,
            'x6': 6,
            'x7': 3,
            'x8': [3],
            'x9': [2, 3],
            'x10': [1, 2, 3],
            'x11': [2],
            'x12': [[1, 2], [2, 3], [3, 4]],
            'x13': [[1, 2], [3], [2, 3], [4], [3, 4], [5]],
            # a < b is false for the first row, so the null branch is expected.
            'x14': None,
            'x15': {1, 2, 3},
        }
        self.assertDictEqual(result3, expected3)
Exemple #11
0
    def test_operators(self):
        """Check arithmetic, comparison and boolean operators on columns.

        Every operator is applied column-to-column, column-to-literal and
        literal-to-column where applicable, and the first annotated row
        (``a == 4``, ``b == 1``) is compared with the expected values.
        """
        field_names = ['a', 'b', 'c', 'd', 'e', 'f']
        field_types = [TInt32(), TInt32(), TInt32(), TInt32(), TString(), TArray(TInt32())]
        schema = TStruct(field_names, field_types)

        rows = [
            {'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3]},
            {'a': 0, 'b': 5, 'c': 13, 'd': -1, 'e': "cat", 'f': []},
            {'a': 4, 'b': 2, 'c': 20, 'd': 3, 'e': "dog", 'f': [5, 6, 7]},
        ]

        kt = KeyTable.parallelize(rows, schema)

        operator_columns = {
            # addition
            'x1': kt.a + 5,
            'x2': 5 + kt.a,
            'x3': kt.a + kt.b,
            # subtraction
            'x4': kt.a - 5,
            'x5': 5 - kt.a,
            'x6': kt.a - kt.b,
            # multiplication
            'x7': kt.a * 5,
            'x8': 5 * kt.a,
            'x9': kt.a * kt.b,
            # division
            'x10': kt.a / 5,
            'x11': 5 / kt.a,
            'x12': kt.a / kt.b,
            # unary minus and plus
            'x13': -kt.a,
            'x14': +kt.a,
            # equality
            'x15': kt.a == kt.b,
            'x16': kt.a == 5,
            'x17': 5 == kt.a,
            # inequality
            'x18': kt.a != kt.b,
            'x19': kt.a != 5,
            'x20': 5 != kt.a,
            # greater than
            'x21': kt.a > kt.b,
            'x22': kt.a > 5,
            'x23': 5 > kt.a,
            # greater than or equal
            'x24': kt.a >= kt.b,
            'x25': kt.a >= 5,
            'x26': 5 >= kt.a,
            # less than
            'x27': kt.a < kt.b,
            'x28': kt.a < 5,
            'x29': 5 < kt.a,
            # less than or equal
            'x30': kt.a <= kt.b,
            'x31': kt.a <= 5,
            'x32': 5 <= kt.a,
            # boolean and / or
            'x33': (kt.a == 0) & (kt.b == 5),
            'x34': (kt.a == 0) | (kt.b == 5),
            # plain Python booleans
            'x35': False,
            'x36': True,
        }
        first_row = kt.annotate(**operator_columns).to_hail1().take(1)[0]
        result = convert_struct_to_dict(first_row)

        expected = {
            'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3],
            'x1': 9, 'x2': 9, 'x3': 5,
            'x4': -1, 'x5': 1, 'x6': 3,
            'x7': 20, 'x8': 20, 'x9': 4,
            # Division of integer columns is expected to yield fractional results.
            'x10': 4.0 / 5, 'x11': 5.0 / 4, 'x12': 4,
            'x13': -4, 'x14': 4,
            'x15': False, 'x16': False, 'x17': False,
            'x18': True, 'x19': True, 'x20': True,
            'x21': True, 'x22': False, 'x23': True,
            'x24': True, 'x25': False, 'x26': True,
            'x27': False, 'x28': True, 'x29': False,
            'x30': False, 'x31': True, 'x32': False,
            'x33': False, 'x34': False,
            'x35': False, 'x36': True,
        }

        self.assertDictEqual(result, expected)