def test_ordered_aggregation(self): imps = Table.from_tag(IMPS) resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') sum_millis = {} for ad_id, millis in resx: if ad_id not in sum_millis: sum_millis[ad_id] = [0, 0] sum_millis[ad_id][0] += millis sum_millis[ad_id][1] += 1 results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27', order_by=2, limit=3, nest=True) self.assertGreater(len(list(results)), 0) lowest = 0 for ad_id, millis, count in results: self.assertLessEqual(lowest, count) lowest = count ad_tup = sum_millis[ad_id] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1]) self.assertEqual(len(list(results)), min(len(sum_millis), 3))
def test_aggregate_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = list(select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13')) pix_sites = list(select(pix.site_id, pix.amount, where=pix.date < '2014-01-13')) join = {} for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: if imp_ad_id not in join: join[imp_ad_id] = [0, 0] join[imp_ad_id][0] += pix_amount join[imp_ad_id][1] += 1 res = select(imps.ad_id, h_sum(pix.amount), h_count(), where=(imps.date < '2014-01-13', pix.date < '2014-01-13'), join=(imps.site_id, pix.site_id)) results = list(res) self.assertEqual(len(results), len(join)) for (ad_id, amount, count) in results: ramount, rcount = join[ad_id] self.assertEqual(ramount, amount) self.assertEqual(rcount, count)
def test_column_fn_with_agg(self): ips = Table.from_tag(IPS) res = select(ips.exchange_id, h_max(ip_ntoa(ips.ip)), where=ips, order_by=(ips.exchange_id, )) results = list(res) res.purge() exchanges = [ex for ex, _ in results] ipss = [ip for _, ip in results] self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) self.assertListEqual(['192.168.1.1'] * 4, ipss) res = select(ips.exchange_id, h_min(ip_ntoa(ips.ip)), where=ips, order_by=(ips.exchange_id, )) results = list(res) res.purge() exchanges = [ex for ex, _ in results] ipss = [ip for _, ip in results] self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) self.assertListEqual(['127.0.0.1'] * 4, ipss)
def test_nested_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = list(select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13')) pix_sites = list(select(pix.site_id, pix.amount, where=((pix.date < '2014-01-13') & (pix.isActive == True)))) join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) sub_pix = select(pix.site_id, pix.amount, pix.date, where=((pix.date < '2014-01-15') & (pix.isActive == True)), nest=True) res = select(imps.ad_id, sub_pix.amount, where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'), join=(imps.site_id, sub_pix.site_id)) results = [tuple(c) for c in res] self.assertEqual(len(results), len(join)) for jtup in join: self.assertIn(jtup, results)
def test_simple_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = [(s, a) for (s, a), _ in result_iterator( select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))] pix_sites = [(s, a) for (s, a), _ in result_iterator( select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))] join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) res = select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix.date < '2014-01-13'), join=(imps.site_id, pix.site_id), order_by='amount') results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)] self.assertTrue(len(results), len(join)) for jtup in join: self.assertIn(jtup, results) lowest = 0 for ad_id, amount in results: self.assertLessEqual(lowest, amount) lowest = amount
def test_nested_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = [(s, a) for (s, a), _ in result_iterator( select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))] pix_sites = [(s, a) for (s, a), _ in result_iterator( select(pix.site_id, pix.amount, where=((pix.date < '2014-01-13') & (pix.isActive > 0))))] join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) sub_pix = select(pix.site_id, pix.amount, pix.date, where=((pix.date < '2014-01-15') & (pix.isActive > 0)), nest=True) res = select(imps.ad_id, sub_pix.amount, where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'), join=(imps.site_id, sub_pix.site_id)) results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)] self.assertTrue(len(results), len(join)) for jtup in join: self.assertIn(jtup, results)
def test_simple_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = [(s, a) for (s, a) in select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13')] pix_sites = [(s, a) for (s, a) in select(pix.site_id, pix.amount, where=pix.date < '2014-01-13')] join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) res = select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix.date < '2014-01-13'), join=(imps.site_id, pix.site_id), order_by='amount') results = list(res) self.assertEqual(len(results), len(join)) for jtup in join: self.assertIn(jtup, results) lowest = 0 for ad_id, amount in results: self.assertLessEqual(lowest, amount) lowest = amount
def test_aggregate_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = [(s, a) for (s, a), _ in result_iterator( select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))] pix_sites = [(s, a) for (s, a), _ in result_iterator( select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))] join = {} for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: if imp_ad_id not in join: join[imp_ad_id] = [0, 0] join[imp_ad_id][0] += pix_amount join[imp_ad_id][1] += 1 res = select(imps.ad_id, h_sum(pix.amount), h_count(), where=(imps.date < '2014-01-13', pix.date < '2014-01-13'), join=(imps.site_id, pix.site_id)) results = [(ad_id, amount, count) for (ad_id, amount, count), _ in result_iterator(res)] self.assertTrue(len(results), len(join)) for (ad_id, amount, count) in results: ramount, rcount = join[ad_id] self.assertEqual(ramount, amount) self.assertEqual(rcount, count)
def test_nested_agg(self): imps = Table.from_tag(IMPS) results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') sum_millis = {} for ad_id, dt, millis in results: key = str(ad_id) + dt if key not in sum_millis: sum_millis[key] = [0, 0] sum_millis[key][0] += millis sum_millis[key][1] += 1 newtab = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22', nest=True) results = select(*star(newtab), where=newtab) self.assertGreater(len(list(results)), 0) for ad_id, dt, millis, count in results: ad_tup = sum_millis[str(ad_id) + dt] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1])
def test_ordered_aggregation(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') resx = [c for c, _ in result_iterator(res)] sum_millis = {} for ad_id, millis in resx: if ad_id not in sum_millis: sum_millis[ad_id] = [0, 0] sum_millis[ad_id][0] += millis sum_millis[ad_id][1] += 1 res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27', order_by=2, limit=3) results = [c for c, _ in result_iterator(res)] lowest = 0 for ad_id, millis, count in results: self.assertLessEqual(lowest, count) lowest = count ad_tup = sum_millis[ad_id] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1]) self.assertTrue(len(results) == min(len(sum_millis), 3))
def test_cardinality_with_order_by(self): hll = Table.from_tag(PIXELS_HLL) tokens_by_date = defaultdict(set) with open("./fixtures/pixel.json") as f: for line in f: record = ujson.loads(line) tokens_by_date[record["date"]].add(record["token"]) result = [(date, len(tokens)) for date, tokens in tokens_by_date.items()] # Test order by date expects = sorted(result, key=itemgetter(0), reverse=True) res = select(hll.date, h_hll(hll.hll), where=hll, order_by=0, desc=True) estimates = list(res) for i, (date, expected_cardinality) in enumerate(expects): self.assertEqual(estimates[i][0], date) self.checkEstimate(estimates[i][1], expected_cardinality) res.purge() # Test order by hll res = select(hll.date, h_hll(hll.hll), where=hll, order_by=1, desc=True) l = list(res) for i in range(len(l) - 1): self.assertTrue(l[i][1] >= l[i + 1][1]) res.purge()
def test_nested_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))] pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount, where=((pix.date < '2014-01-13') & (pix.isActive > 0))))] join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) sub_pix = select(pix.site_id, pix.amount, pix.date, where=((pix.date < '2014-01-15') & (pix.isActive > 0)), nest=True) res = select(imps.ad_id, sub_pix.amount, where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'), join=(imps.site_id, sub_pix.site_id)) results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)] self.assertTrue(len(results), len(join)) for jtup in join: self.assertIn(jtup, results)
def test_ordered_aggregation(self): imps = Table.from_tag(IMPS) resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') sum_millis = {} for ad_id, millis in resx: if ad_id not in sum_millis: sum_millis[ad_id] = [0, 0] sum_millis[ad_id][0] += millis sum_millis[ad_id][1] += 1 results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27', order_by=2, limit=3, nest=True) self.assertGreater(len(list(results)), 0) lowest = 0 for ad_id, millis, count in results: self.assertLessEqual(lowest, count) lowest = count ad_tup = sum_millis[ad_id] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1]) self.assertEqual(len(list(results)), min(len(sum_millis), 3)) resx.purge()
def test_nested_join(self): imps = Table.from_tag(IMPS) pix = Table.from_tag(PIXELS) imp_sites = list( select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13')) pix_sites = list( select(pix.site_id, pix.amount, where=((pix.date < '2014-01-13') & (pix.isActive == True)))) join = [] for imp_site, imp_ad_id in imp_sites: for pix_site, pix_amount in pix_sites: if imp_site == pix_site: join.append((imp_ad_id, pix_amount)) sub_pix = select(pix.site_id, pix.amount, pix.date, where=((pix.date < '2014-01-15') & (pix.isActive == True)), nest=True) res = select(imps.ad_id, sub_pix.amount, where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'), join=(imps.site_id, sub_pix.site_id)) results = [tuple(c) for c in res] self.assertEqual(len(results), len(join)) for jtup in join: self.assertIn(jtup, results)
def test_column_fn_with_nest(self): ips = Table.from_tag(IPS) res = select(ip_ntoa(ips.ip), where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),), distinct=True, nest=True) ret = select(res.ip, where=res, order_by=(res.ip,)) results = list(ret) ret.purge() ipss = [ip[0] for ip in results] self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], ipss)
def test_column_fn_with_nest(self): ips = Table.from_tag(IPS) res = select(ip_ntoa(ips.ip), where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip), ), distinct=True, nest=True) ret = select(res.ip, where=res, order_by=(res.ip, )) results = list(ret) ret.purge() ipss = [ip[0] for ip in results] self.assertListEqual( ['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], ipss)
def test_nested_self_join(self): """ A self join is joining the table against itself. This requires the use of aliases. """ imps = Table.from_tag(IMPS) early_res = select(imps.ad_id, imps.cpm_millis, where=imps.date < '2014-01-20') early = list(early_res) late_res = select(imps.ad_id, imps.cpm_millis, where=imps.date >= '2014-01-20') late = list(late_res) join = {} for eid, ecpm in early: for lid, lcpm in late: if eid == lid: if eid not in join: join[eid] = [0, 0, 0] join[eid][0] += ecpm join[eid][1] += lcpm join[eid][2] += 1 early = select(imps.ad_id, imps.cpm_millis, where=imps.date < '2014-01-20', nest=True) late = select(imps.ad_id, imps.cpm_millis, where=imps.date >= '2014-01-20', nest=True) jimmy = select(early.ad_id.named('adididid'), h_sum(early.cpm_millis).named('emillis'), h_sum(late.cpm_millis).named('lmillis'), h_count(), where=(early, late), join='ad_id') james = list(jimmy) self.assertEqual(len(join), len(james)) for (ad_id, emillis, lmillis, cnt) in james: ecpm, lcpm, ocnt = join[ad_id] self.assertEqual(emillis, ecpm) self.assertEqual(lmillis, lcpm) self.assertEqual(cnt, ocnt) early_res.purge() late_res.purge() jimmy.purge()
def test_bool_values(self): pix = Table.from_tag(PIXELS) res = select(pix.isActive, where=pix.isActive == True) actives = 0 for (act, ) in res: actives += act self.assertEqual(actives, 234) res = select(pix.isActive, where=pix.isActive == 0) actives = 0 for (act, ) in res: actives += 1 self.assertEqual(actives, 266)
def test_overall(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4, order_by='ad_id', desc=True) results = [a for (a, d), _ in result_iterator(res)] self.assertEqual(len(results), 4) self.assertListEqual(results, [30019, 30018, 30017, 30015])
def test_single_int_order(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis) lowest = 0 for (a, d, c), _ in result_iterator(res): self.assertLessEqual(lowest, c) lowest = c
def test_combo_where_on_or_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010))) results = [c for c, _ in result_iterator(res)] self.assertEqual(len(results), 27) self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
def test_combo_where_on_and_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010))) results = list(res) self.assertEqual(len(results), 2) self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results))
def test_combo_where_on_or_partition_ex1(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010]))) results = list(res) self.assertEqual(len(results), 40) self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results))
def test_combo_where_on_mixed_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000)))) results = [c for c, _ in result_iterator(res)] self.assertEqual(len(results), 2) self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
def test_combo_where_no_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000)) results = list(res) print results self.assertEqual(len(results), 5) res.purge()
def test_combo_where_on_or_partition_ex2(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010]))) results = [c for c, _ in result_iterator(res)] self.assertEqual(len(results), 1) self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
def test_range_on_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27') results = list(res) self.assertEqual(len(results), 20) self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results)) res.purge()
def test_column_fn(self): ips = Table.from_tag(IPS) res = select(ips.exchange_id, ip_ntoa(ips.ip), where=ips.exchange_id == "Adx") results = list(res) self.assertEqual(len(results), 29) res.purge()
def test_distinct(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True) results = list(res) self.assertEqual(len(results), 8)
def test_equality_on_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27') results = [c for c, _ in result_iterator(res)] self.assertEqual(len(results), 10) found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506) self.assertIsNotNone(found) self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
def test_simple_aggregation(self): imps = Table.from_tag(IMPS) results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') sum_millis = {} for ad_id, millis in results: if ad_id not in sum_millis: sum_millis[ad_id] = [0, 0] sum_millis[ad_id][0] += millis sum_millis[ad_id][1] += 1 results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27') self.assertGreater(len(list(results)), 0) for ad_id, millis, count in results: ad_tup = sum_millis[ad_id] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1])
def test_simple_aggregation(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27') results = [c for c, _ in result_iterator(res)] sum_millis = {} for ad_id, millis in results: if ad_id not in sum_millis: sum_millis[ad_id] = [0, 0] sum_millis[ad_id][0] += millis sum_millis[ad_id][1] += 1 res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27') results = [c for c, _ in result_iterator(res)] for ad_id, millis, count in results: ad_tup = sum_millis[ad_id] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1])
def test_bit_values(self): pix = Table.from_tag(PIXELS) res = select(pix.isActive, where=pix.isActive == 1) actives = 0 for (act, ) in res: actives += act self.assertEqual(actives, 234) res.purge()
def test_aggregate(self): imps = Table.from_tag(IMPS) res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps) (clicks, conversions, impressions) = list(res)[0] self.assertEqual(clicks, 21) self.assertEqual(conversions, 5) self.assertEqual(impressions, 174)
def test_low_limit(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', limit=4) results = list(res) self.assertEqual(len(results), 4)
def test_combo_where_no_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000)) results = list(res) print results self.assertEqual(len(results), 5)
def test_multiple_group_bys(self): imps = Table.from_tag(IMPS) results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') sum_millis = {} for ad_id, dt, millis in results: key = str(ad_id) + dt if key not in sum_millis: sum_millis[key] = [0, 0] sum_millis[key][0] += millis sum_millis[key][1] += 1 results = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22') self.assertGreater(len(list(results)), 0) for ad_id, dt, millis, count in results: ad_tup = sum_millis[str(ad_id) + dt] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1])
def test_range_on_partition(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27') results = list(res) self.assertEqual(len(results), 20) self.assertTrue( all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
def test_overflow(self): from itertools import izip imps = Table.from_tag(IMPS) fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date) nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True) nest_results = select(*star(nest_tab), where=nest_tab, order_by=0) for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results): self.assertEqual(fdate, ndate) self.assertEqual(fimps, nimps)
def test_column_fn_with_agg(self): ips = Table.from_tag(IPS) res = select(ips.exchange_id, h_max(ip_ntoa(ips.ip)), where=ips, order_by=(ips.exchange_id,)) results = list(res) res.purge() exchanges = [ex for ex, _ in results] ipss = [ip for _, ip in results] self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) self.assertListEqual(['192.168.1.1'] * 4, ipss) res = select(ips.exchange_id, h_min(ip_ntoa(ips.ip)), where=ips, order_by=(ips.exchange_id,)) results = list(res) res.purge() exchanges = [ex for ex, _ in results] ipss = [ip for _, ip in results] self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges) self.assertListEqual(['127.0.0.1'] * 4, ipss)
def test_cardinality_all(self): hll = Table.from_tag(PIXELS_HLL) res = select(h_hll(hll.hll), where=hll) estimate = next(iter(res))[0] tokens = set([]) with open("./fixtures/pixel.json") as f: for line in f: record = ujson.loads(line) tokens.add(record["token"]) self.checkEstimate(estimate, len(tokens)) res.purge()
def test_combo_where_on_or_partition_ex2(self): imps = Table.from_tag(IMPS) res = select(imps.ad_id, imps.date, imps.cpm_millis, where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010]))) results = list(res) self.assertEqual(len(results), 1) self.assertTrue( all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
def test_cardinality_on_condition(self): hll = Table.from_tag(PIXELS_HLL) active_tokens = set([]) inactive_tokens = set([]) with open("./fixtures/pixel.json") as f: for line in f: record = ujson.loads(line) if record["isActive"]: active_tokens.add(record["token"]) else: inactive_tokens.add(record["token"]) res = select(h_hll(hll.hll), where=(hll.isActive == 1)) estimate = next(iter(res))[0] self.checkEstimate(estimate, len(active_tokens)) res.purge() res = select(h_hll(hll.hll), where=(hll.isActive == 0)) estimate = next(iter(res))[0] self.checkEstimate(estimate, len(inactive_tokens)) res.purge()
def test_project(self): imps = Table.from_tag(IMPS) res = select(imps.click, imps.conversion, imps.impression, where=imps) clicks = conversions = impressions = 0 for (click, conv, imp) in res: clicks += click conversions += conv impressions += imp self.assertEqual(clicks, 21) self.assertEqual(conversions, 5) self.assertEqual(impressions, 174)
def test_nested_agg(self): imps = Table.from_tag(IMPS) results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') sum_millis = {} for ad_id, dt, millis in results: key = str(ad_id) + dt if key not in sum_millis: sum_millis[key] = [0, 0] sum_millis[key][0] += millis sum_millis[key][1] += 1 results.purge() newtab = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22', nest=True) results = select(*star(newtab), where=newtab) self.assertGreater(len(list(results)), 0) for ad_id, dt, millis, count in results: ad_tup = sum_millis[str(ad_id) + dt] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1]) results.purge()
def test_multiple_group_bys(self): imps = Table.from_tag(IMPS) results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22') sum_millis = {} for ad_id, dt, millis in results: key = str(ad_id) + dt if key not in sum_millis: sum_millis[key] = [0, 0] sum_millis[key][0] += millis sum_millis[key][1] += 1 results.purge() results = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22') self.assertGreater(len(list(results)), 0) for ad_id, dt, millis, count in results: ad_tup = sum_millis[str(ad_id) + dt] self.assertEqual(millis, ad_tup[0]) self.assertEqual(count, ad_tup[1]) results.purge()