def test_agg_by_2(self):
        """Check ``agg_by`` with all aggregations combined and one at a time.

        Builds a table via ``empty_table(10)`` with ``grp_id``, ``var`` and
        ``weights`` columns, then asserts that aggregating by ``grp_id``
        yields at least one row, first with the full list of aggregations
        and then with each aggregation on its own.
        """
        test_table = empty_table(10)
        test_table = test_table.update(
            ["grp_id=(int)(i/5)", "var=(int)i", "weights=(double)1.0/(i+1)"])

        aggs = [
            group(["aggGroup=var"]),
            avg(["aggAvg=var"]),
            count_("aggCount"),
            first(["aggFirst=var"]),
            last(["aggLast=var"]),
            max_(["aggMax=var"]),
            median(["aggMed=var"]),
            min_(["aggMin=var"]),
            pct(0.20, ["aggPct=var"]),
            std(["aggStd=var"]),
            sum_(["aggSum=var"]),
            abs_sum(["aggAbsSum=var"]),
            var(["aggVar=var"]),
            weighted_avg("var", ["weights"]),
        ]

        # All aggregations applied in a single call, key given as a list.
        result_table = test_table.agg_by(aggs, ["grp_id"])
        self.assertGreaterEqual(result_table.size, 1)

        # Each aggregation applied alone, key given as a plain string.
        for agg in aggs:
            # One sub-test per aggregation: a failure is reported against the
            # offending aggregation and the remaining ones still run.
            with self.subTest(agg):
                result_table = test_table.agg_by(agg, "grp_id")
                self.assertGreaterEqual(result_table.size, 1)
    def test_agg_all_by(self):
        """Run ``agg_all_by`` with each aggregation, with and without a key.

        Covers, in order: aggregations built without column specs,
        aggregations built with column specs, the ``count_`` aggregation
        (expected to raise ``DHError``), and the no-key form, which is
        expected to produce exactly one row.
        """
        source = empty_table(10).update(
            ["grp_id=(int)(i/5)", "var=(int)i", "weights=(double)1.0/(i+1)"]
        )

        # Aggregations constructed without any column specification.
        bare_aggs = [
            group(), avg(), first(), last(),
            max_(), median(), min_(), pct(0.20),
            std(), sum_(), abs_sum(), var(),
            weighted_avg("var"),
        ]
        for aggregation in bare_aggs:
            with self.subTest(aggregation):
                self.assertGreaterEqual(
                    source.agg_all_by(aggregation, ["grp_id"]).size, 1)

        # column names in the Aggregation are ignored
        named_aggs = [
            group(["aggGroup=var"]),
            avg(["aggAvg=var"]),
            pct(0.20, ["aggPct=var"]),
            std(["aggStd=var"]),
            sum_(["aggSum=var"]),
            abs_sum(["aggAbsSum=var"]),
            var(["aggVar=var"]),
            weighted_avg("var", ["weights"]),
        ]
        for aggregation in named_aggs:
            with self.subTest(aggregation):
                self.assertGreaterEqual(
                    source.agg_all_by(aggregation, ["grp_id"]).size, 1)

        # The count aggregation must be rejected by agg_all_by.
        with self.assertRaises(DHError) as raised:
            source.agg_all_by(count_("aggCount"), "grp_id")
        self.assertIn("unsupported", raised.exception.root_cause)

        # With no key columns the result must be a single row.
        for aggregation in named_aggs:
            with self.subTest(aggregation):
                self.assertEqual(source.agg_all_by(aggregation).size, 1)
Exemple #3
0
    high_value_users,
    kafka_base_properties,
    topic='high_value_users_sink',
    key_spec=pk.avro_spec('high_value_users_sink_key',
                          publish_schema=True,
                          schema_namespace=schema_namespace,
                          include_only_columns=['user_id']),
    value_spec=pk.avro_spec('high_value_users_sink_value',
                            publish_schema=True,
                            schema_namespace=schema_namespace,
                            column_properties={
                                "lifetime_value.precision": "12",
                                "lifetime_value.scale": "4"
                            }),
    last_by_key_columns=True)

# Consume the 'high_value_users_sink' topic into an append-type table, with
# every partition seeking to its beginning and record keys ignored; values
# are decoded with the 'high_value_users_sink_value' Avro spec.
# NOTE(review): presumably a read-back check of what was published to that
# topic - confirm against the producing call.
hvu_test = ck.consume(
    consume_properties,
    topic='high_value_users_sink',
    offsets=ck.ALL_PARTITIONS_SEEK_TO_BEGINNING,
    key_spec=KeyValueSpec.IGNORE,
    value_spec=ck.avro_spec('high_value_users_sink_value'),
    table_type=TableType.Append,
)

# Summarise pageviews_stg into a row count ('total') and the maximum
# received_at, then derive dt_ms from the gap between now and that maximum,
# divided by 1,000,000.
# NOTE(review): the divisor suggests nanoseconds -> milliseconds - confirm.
pageviews_summary = (
    pageviews_stg
    .agg_by([
        agg.count_('total'),
        agg.max_(['max_received_at = received_at']),
    ])
    .update(['dt_ms = (DateTime.now() - max_received_at)/1_000_000.0'])
)