def test_agg_by_2(self):
    """Run ``agg_by`` with every aggregation, combined and one at a time.

    Builds a table from ``empty_table(10)`` with ``grp_id``, ``var`` and
    ``weights`` columns, then checks that aggregating by ``grp_id`` yields
    at least one row, both when all aggregations are passed together as a
    list and when each aggregation is passed on its own.
    """
    test_table = empty_table(10)
    test_table = test_table.update(
        ["grp_id=(int)(i/5)", "var=(int)i", "weights=(double)1.0/(i+1)"])

    aggs = [
        group(["aggGroup=var"]),
        avg(["aggAvg=var"]),
        count_("aggCount"),
        first(["aggFirst=var"]),
        last(["aggLast=var"]),
        max_(["aggMax=var"]),
        median(["aggMed=var"]),
        min_(["aggMin=var"]),
        pct(0.20, ["aggPct=var"]),
        std(["aggStd=var"]),
        sum_(["aggSum=var"]),
        abs_sum(["aggAbsSum=var"]),
        var(["aggVar=var"]),
        weighted_avg("var", ["weights"]),
    ]

    # All aggregations in a single call, key columns given as a list.
    result_table = test_table.agg_by(aggs, ["grp_id"])
    self.assertGreaterEqual(result_table.size, 1)

    # Each aggregation alone, key column given as a plain string.
    # subTest lets the loop continue past a failing aggregation and labels
    # each failure with the aggregation that caused it.
    for agg in aggs:
        with self.subTest(agg):
            result_table = test_table.agg_by(agg, "grp_id")
            self.assertGreaterEqual(result_table.size, 1)
def test_agg_all_by(self):
    """Check ``agg_all_by`` across aggregations, with and without key columns.

    Covers four cases against a table built from ``empty_table(10)``:
    aggregations created without column arguments, aggregations created
    with column arguments, ``count_`` (expected to raise ``DHError`` whose
    root cause mentions "unsupported"), and aggregating with no key columns
    (expected to produce exactly one row).
    """
    column_formulas = [
        "grp_id=(int)(i/5)",
        "var=(int)i",
        "weights=(double)1.0/(i+1)",
    ]
    source = empty_table(10).update(column_formulas)

    # Aggregations constructed without any column arguments.
    bare_aggs = [
        group(),
        avg(),
        first(),
        last(),
        max_(),
        median(),
        min_(),
        pct(0.20),
        std(),
        sum_(),
        abs_sum(),
        var(),
        weighted_avg("var"),
    ]
    for one_agg in bare_aggs:
        with self.subTest(one_agg):
            grouped = source.agg_all_by(one_agg, ["grp_id"])
            self.assertGreaterEqual(grouped.size, 1)

    # column names in the Aggregation are ignored
    named_aggs = [
        group(["aggGroup=var"]),
        avg(["aggAvg=var"]),
        pct(0.20, ["aggPct=var"]),
        std(["aggStd=var"]),
        sum_(["aggSum=var"]),
        abs_sum(["aggAbsSum=var"]),
        var(["aggVar=var"]),
        weighted_avg("var", ["weights"]),
    ]
    for one_agg in named_aggs:
        with self.subTest(one_agg):
            grouped = source.agg_all_by(one_agg, ["grp_id"])
            self.assertGreaterEqual(grouped.size, 1)

    # count_ is rejected by agg_all_by; inspect the error after the
    # context manager exits so the assertion actually runs.
    with self.assertRaises(DHError) as cm:
        source.agg_all_by(count_("aggCount"), "grp_id")
    self.assertIn("unsupported", cm.exception.root_cause)

    # With no key columns given, each result is expected to have one row.
    for one_agg in named_aggs:
        with self.subTest(one_agg):
            collapsed = source.agg_all_by(one_agg)
            self.assertEqual(collapsed.size, 1)
high_value_users, kafka_base_properties, topic='high_value_users_sink', key_spec=pk.avro_spec('high_value_users_sink_key', publish_schema=True, schema_namespace=schema_namespace, include_only_columns=['user_id']), value_spec=pk.avro_spec('high_value_users_sink_value', publish_schema=True, schema_namespace=schema_namespace, column_properties={ "lifetime_value.precision": "12", "lifetime_value.scale": "4" }), last_by_key_columns=True) hvu_test = ck.consume(consume_properties, topic='high_value_users_sink', offsets=ck.ALL_PARTITIONS_SEEK_TO_BEGINNING, key_spec=KeyValueSpec.IGNORE, value_spec=ck.avro_spec('high_value_users_sink_value'), table_type=TableType.Append) pageviews_summary = pageviews_stg \ .agg_by( [ agg.count_('total'), agg.max_(['max_received_at = received_at']) ]) \ .update(['dt_ms = (DateTime.now() - max_received_at)/1_000_000.0'])