Ejemplo n.º 1
4
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))
Ejemplo n.º 2
1
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=pix.date < '2014-01-13'))

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id, h_sum(pix.amount), h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = list(res)
        self.assertEqual(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Ejemplo n.º 3
0
    def test_column_fn_with_agg(self):
        ips = Table.from_tag(IPS)
        res = select(ips.exchange_id,
                     h_max(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['192.168.1.1'] * 4, ipss)

        res = select(ips.exchange_id,
                     h_min(ip_ntoa(ips.ip)),
                     where=ips,
                     order_by=(ips.exchange_id, ))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'],
                             exchanges)
        self.assertListEqual(['127.0.0.1'] * 4, ipss)
Ejemplo n.º 4
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(select(imps.site_id, imps.ad_id,
                                where=imps.date < '2014-01-13'))
        pix_sites = list(select(pix.site_id, pix.amount,
                                where=((pix.date < '2014-01-13') &
                                       (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Ejemplo n.º 5
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id,
                     pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Ejemplo n.º 6
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount)
                   for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Ejemplo n.º 7
0
    def test_simple_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a) in select(imps.site_id, imps.ad_id,
                                                 where=imps.date < '2014-01-13')]
        pix_sites = [(s, a) for (s, a) in select(pix.site_id, pix.amount,
                                                 where=pix.date < '2014-01-13')]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        res = select(imps.ad_id, pix.amount,
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id),
                     order_by='amount')
        results = list(res)
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)

        lowest = 0
        for ad_id, amount in results:
            self.assertLessEqual(lowest, amount)
            lowest = amount
Ejemplo n.º 8
0
    def test_aggregate_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(
            select(pix.site_id, pix.amount, where=pix.date < '2014-01-13'))]

        join = {}
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    if imp_ad_id not in join:
                        join[imp_ad_id] = [0, 0]
                    join[imp_ad_id][0] += pix_amount
                    join[imp_ad_id][1] += 1

        res = select(imps.ad_id,
                     h_sum(pix.amount),
                     h_count(),
                     where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                     join=(imps.site_id, pix.site_id))
        results = [(ad_id, amount, count)
                   for (ad_id, amount, count), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for (ad_id, amount, count) in results:
            ramount, rcount = join[ad_id]
            self.assertEqual(ramount, amount)
            self.assertEqual(rcount, count)
Ejemplo n.º 9
0
    def test_nested_agg(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1

        newtab = select(imps.ad_id,
                        imps.date,
                        h_sum(imps.cpm_millis),
                        h_count(),
                        where=imps.date > '2014-01-22',
                        nest=True)
        results = select(*star(newtab), where=newtab)
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Ejemplo n.º 10
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        resx = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(),
                     where=imps.date == '2014-01-27',
                     order_by=2,
                     limit=3)
        results = [c for c, _ in result_iterator(res)]
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertTrue(len(results) == min(len(sum_millis), 3))
Ejemplo n.º 11
0
    def test_cardinality_with_order_by(self):
        hll = Table.from_tag(PIXELS_HLL)
        tokens_by_date = defaultdict(set)
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                tokens_by_date[record["date"]].add(record["token"])
        result = [(date, len(tokens))
                  for date, tokens in tokens_by_date.items()]

        # Test order by date
        expects = sorted(result, key=itemgetter(0), reverse=True)
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=0,
                     desc=True)
        estimates = list(res)
        for i, (date, expected_cardinality) in enumerate(expects):
            self.assertEqual(estimates[i][0], date)
            self.checkEstimate(estimates[i][1], expected_cardinality)
        res.purge()

        # Test order by hll
        res = select(hll.date,
                     h_hll(hll.hll),
                     where=hll,
                     order_by=1,
                     desc=True)
        l = list(res)
        for i in range(len(l) - 1):
            self.assertTrue(l[i][1] >= l[i + 1][1])
        res.purge()
Ejemplo n.º 12
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix  = Table.from_tag(PIXELS)

        imp_sites = [(s, a) for (s, a), _ in result_iterator(select(imps.site_id, imps.ad_id,
                                                                    where=imps.date < '2014-01-13'))]
        pix_sites = [(s, a) for (s, a), _ in result_iterator(select(pix.site_id, pix.amount,
                                                                    where=((pix.date < '2014-01-13') &
                                                                           (pix.isActive > 0))))]

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id, pix.amount, pix.date,
                         where=((pix.date < '2014-01-15') & (pix.isActive > 0)),
                         nest=True)

        res = select(imps.ad_id, sub_pix.amount,
                     where=(imps.date < '2014-01-13', sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [(ad_id, amount) for (ad_id, amount), _ in result_iterator(res)]
        self.assertTrue(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Ejemplo n.º 13
0
    def test_ordered_aggregation(self):
        imps = Table.from_tag(IMPS)
        resx = select(imps.ad_id,
                      imps.cpm_millis,
                      where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in resx:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date == '2014-01-27',
                         order_by=2,
                         limit=3,
                         nest=True)
        self.assertGreater(len(list(results)), 0)
        lowest = 0
        for ad_id, millis, count in results:
            self.assertLessEqual(lowest, count)
            lowest = count
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        self.assertEqual(len(list(results)), min(len(sum_millis), 3))

        resx.purge()
Ejemplo n.º 14
0
    def test_nested_join(self):
        imps = Table.from_tag(IMPS)
        pix = Table.from_tag(PIXELS)

        imp_sites = list(
            select(imps.site_id, imps.ad_id, where=imps.date < '2014-01-13'))
        pix_sites = list(
            select(pix.site_id,
                   pix.amount,
                   where=((pix.date < '2014-01-13') & (pix.isActive == True))))

        join = []
        for imp_site, imp_ad_id in imp_sites:
            for pix_site, pix_amount in pix_sites:
                if imp_site == pix_site:
                    join.append((imp_ad_id, pix_amount))

        sub_pix = select(pix.site_id,
                         pix.amount,
                         pix.date,
                         where=((pix.date < '2014-01-15') &
                                (pix.isActive == True)),
                         nest=True)

        res = select(imps.ad_id,
                     sub_pix.amount,
                     where=(imps.date < '2014-01-13',
                            sub_pix.date < '2014-01-13'),
                     join=(imps.site_id, sub_pix.site_id))
        results = [tuple(c) for c in res]
        self.assertEqual(len(results), len(join))

        for jtup in join:
            self.assertIn(jtup, results)
Ejemplo n.º 15
0
 def test_column_fn_with_nest(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx", order_by=(ip_ntoa(ips.ip),),
                  distinct=True, nest=True)
     ret = select(res.ip, where=res, order_by=(res.ip,))
     results = list(ret)
     ret.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'],
                          ipss)
Ejemplo n.º 16
0
 def test_column_fn_with_nest(self):
     ips = Table.from_tag(IPS)
     res = select(ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx",
                  order_by=(ip_ntoa(ips.ip), ),
                  distinct=True,
                  nest=True)
     ret = select(res.ip, where=res, order_by=(res.ip, ))
     results = list(ret)
     ret.purge()
     ipss = [ip[0] for ip in results]
     self.assertListEqual(
         ['127.0.0.1', '192.1.1.1', '192.1.1.2', '192.168.1.1'], ipss)
Ejemplo n.º 17
0
    def test_nested_self_join(self):
        """
        A self join is joining the table against itself.  This requires the use of aliases.
        """
        imps = Table.from_tag(IMPS)

        early_res = select(imps.ad_id,
                           imps.cpm_millis,
                           where=imps.date < '2014-01-20')
        early = list(early_res)
        late_res = select(imps.ad_id,
                          imps.cpm_millis,
                          where=imps.date >= '2014-01-20')
        late = list(late_res)

        join = {}
        for eid, ecpm in early:
            for lid, lcpm in late:
                if eid == lid:
                    if eid not in join:
                        join[eid] = [0, 0, 0]
                    join[eid][0] += ecpm
                    join[eid][1] += lcpm
                    join[eid][2] += 1

        early = select(imps.ad_id,
                       imps.cpm_millis,
                       where=imps.date < '2014-01-20',
                       nest=True)
        late = select(imps.ad_id,
                      imps.cpm_millis,
                      where=imps.date >= '2014-01-20',
                      nest=True)
        jimmy = select(early.ad_id.named('adididid'),
                       h_sum(early.cpm_millis).named('emillis'),
                       h_sum(late.cpm_millis).named('lmillis'),
                       h_count(),
                       where=(early, late),
                       join='ad_id')

        james = list(jimmy)
        self.assertEqual(len(join), len(james))

        for (ad_id, emillis, lmillis, cnt) in james:
            ecpm, lcpm, ocnt = join[ad_id]
            self.assertEqual(emillis, ecpm)
            self.assertEqual(lmillis, lcpm)
            self.assertEqual(cnt, ocnt)
        early_res.purge()
        late_res.purge()
        jimmy.purge()
Ejemplo n.º 18
0
    def test_bool_values(self):
        pix = Table.from_tag(PIXELS)
        res = select(pix.isActive, where=pix.isActive == True)
        actives = 0
        for (act, ) in res:
            actives += act

        self.assertEqual(actives, 234)

        res = select(pix.isActive, where=pix.isActive == 0)
        actives = 0
        for (act, ) in res:
            actives += 1

        self.assertEqual(actives, 266)
Ejemplo n.º 19
0
 def test_overall(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, where=imps.date == '2014-01-27', distinct=True, limit=4,
                  order_by='ad_id', desc=True)
     results = [a for (a, d), _ in result_iterator(res)]
     self.assertEqual(len(results), 4)
     self.assertListEqual(results, [30019, 30018, 30017, 30015])
Ejemplo n.º 20
0
 def test_single_int_order(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27', order_by=imps.cpm_millis)
     lowest = 0
     for (a, d, c), _ in result_iterator(res):
         self.assertLessEqual(lowest, c)
         lowest = c
Ejemplo n.º 21
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Ejemplo n.º 22
0
 def test_combo_where_on_and_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.ad_id == 30010)))
     results = list(res)
     self.assertEqual(len(results), 2)
     self.assertTrue(all(d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30010 for a, d, _ in results))
Ejemplo n.º 23
0
 def test_combo_where_on_or_partition_ex1(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) | (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 40)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 or a == 30003 for a, d, _ in results))
Ejemplo n.º 24
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Ejemplo n.º 25
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
     res.purge()
Ejemplo n.º 26
0
 def test_combo_where_on_or_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date == '2014-01-21') | (imps.date == '2014-01-25') | (imps.ad_id == 30010)))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 27)
     self.assertTrue(all(d == '2014-01-21' or d == '2014-01-25' or a == 30010 for a, d, _ in results))
Ejemplo n.º 27
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) & (imps.ad_id << [30003, 30010])))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 1)
     self.assertTrue(all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Ejemplo n.º 28
0
 def test_combo_where_on_mixed_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis,
                  where=(((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23') & (imps.time > 170000))))
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 2)
     self.assertTrue(all((d in ('2014-01-21', '2014-01-22', '2014-01-23') and a == 30003) for a, d, c in results))
Ejemplo n.º 29
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
     res.purge()
Ejemplo n.º 30
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id, ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Ejemplo n.º 31
0
 def test_distinct(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  where=imps.date == '2014-01-27',
                  distinct=True)
     results = list(res)
     self.assertEqual(len(results), 8)
Ejemplo n.º 32
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Ejemplo n.º 33
0
    def test_simple_aggregation(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')

        sum_millis = {}
        for ad_id, millis in results:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        results = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')
        self.assertGreater(len(list(results)), 0)
        for ad_id, millis, count in results:
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Ejemplo n.º 34
0
 def test_column_fn(self):
     ips = Table.from_tag(IPS)
     res = select(ips.exchange_id,
                  ip_ntoa(ips.ip),
                  where=ips.exchange_id == "Adx")
     results = list(res)
     self.assertEqual(len(results), 29)
     res.purge()
Ejemplo n.º 35
0
 def test_equality_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')
     results = [c for c, _ in result_iterator(res)]
     self.assertEqual(len(results), 10)
     found = next((a, d, c) for a, d, c in results if a == 30018 and d == '2014-01-27' and c == 4506)
     self.assertIsNotNone(found)
     self.assertTrue(all(d == '2014-01-27' for _, d, _ in results))
Ejemplo n.º 36
0
    def test_simple_aggregation(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.ad_id, imps.cpm_millis, where=imps.date == '2014-01-27')
        results = [c for c, _ in result_iterator(res)]

        sum_millis = {}
        for ad_id, millis in results:
            if ad_id not in sum_millis:
                sum_millis[ad_id] = [0, 0]
            sum_millis[ad_id][0] += millis
            sum_millis[ad_id][1] += 1

        res = select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')
        results = [c for c, _ in result_iterator(res)]
        for ad_id, millis, count in results:
            ad_tup = sum_millis[ad_id]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Ejemplo n.º 37
0
    def test_bit_values(self):
        pix = Table.from_tag(PIXELS)
        res = select(pix.isActive, where=pix.isActive == 1)
        actives = 0
        for (act, ) in res:
            actives += act

        self.assertEqual(actives, 234)
        res.purge()
Ejemplo n.º 38
0
    def test_bit_values(self):
        pix = Table.from_tag(PIXELS)
        res = select(pix.isActive, where=pix.isActive == 1)
        actives = 0
        for (act, ) in res:
            actives += act

        self.assertEqual(actives, 234)
        res.purge()
Ejemplo n.º 39
0
    def test_aggregate(self):
        imps = Table.from_tag(IMPS)
        res = select(h_sum(imps.click), h_sum(imps.conversion), h_sum(imps.impression), where=imps)

        (clicks, conversions, impressions) = list(res)[0]

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Ejemplo n.º 40
0
 def test_low_limit(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date == '2014-01-27',
                  limit=4)
     results = list(res)
     self.assertEqual(len(results), 4)
Ejemplo n.º 41
0
 def test_combo_where_no_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=(imps.time >= 180000))
     results = list(res)
     print results
     self.assertEqual(len(results), 5)
Ejemplo n.º 42
0
    def test_multiple_group_bys(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1

        results = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(), where=imps.date > '2014-01-22')
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
Ejemplo n.º 43
0
 def test_range_on_partition(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=imps.date > '2014-01-27')
     results = list(res)
     self.assertEqual(len(results), 20)
     self.assertTrue(
         all(d in ('2014-01-28', '2014-01-29') for _, d, _ in results))
Ejemplo n.º 44
0
    def test_overflow(self):
        from itertools import izip

        imps = Table.from_tag(IMPS)
        fly_results = select(imps.date,
                             h_sum(imps.impression),
                             where=imps,
                             order_by=imps.date)

        nest_tab = select(imps.date,
                          h_sum(imps.impression),
                          where=imps,
                          nest=True)
        nest_results = select(*star(nest_tab), where=nest_tab, order_by=0)

        for ((fdate, fimps), (ndate, nimps)) in izip(fly_results,
                                                     nest_results):
            self.assertEqual(fdate, ndate)
            self.assertEqual(fimps, nimps)
Ejemplo n.º 45
0
    def test_nested_self_join(self):
        """
        A self join is joining the table against itself.  This requires the use of aliases.
        """
        imps = Table.from_tag(IMPS)

        early_res = select(imps.ad_id, imps.cpm_millis,
                           where=imps.date < '2014-01-20')
        early = list(early_res)
        late_res = select(imps.ad_id, imps.cpm_millis,
                          where=imps.date >= '2014-01-20')
        late = list(late_res)

        join = {}
        for eid, ecpm in early:
            for lid, lcpm in late:
                if eid == lid:
                    if eid not in join:
                        join[eid] = [0, 0, 0]
                    join[eid][0] += ecpm
                    join[eid][1] += lcpm
                    join[eid][2] += 1

        early = select(imps.ad_id, imps.cpm_millis, where=imps.date < '2014-01-20', nest=True)
        late = select(imps.ad_id, imps.cpm_millis, where=imps.date >= '2014-01-20', nest=True)
        jimmy = select(early.ad_id.named('adididid'),
                       h_sum(early.cpm_millis).named('emillis'),
                       h_sum(late.cpm_millis).named('lmillis'),
                       h_count(),
                       where=(early, late),
                       join='ad_id')

        james = list(jimmy)
        self.assertEqual(len(join), len(james))

        for (ad_id, emillis, lmillis, cnt) in james:
            ecpm, lcpm, ocnt = join[ad_id]
            self.assertEqual(emillis, ecpm)
            self.assertEqual(lmillis, lcpm)
            self.assertEqual(cnt, ocnt)
        early_res.purge()
        late_res.purge()
        jimmy.purge()
Ejemplo n.º 46
0
    def test_column_fn_with_agg(self):
        ips = Table.from_tag(IPS)
        res = select(ips.exchange_id, h_max(ip_ntoa(ips.ip)),
                     where=ips, order_by=(ips.exchange_id,))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges)
        self.assertListEqual(['192.168.1.1'] * 4, ipss)

        res = select(ips.exchange_id, h_min(ip_ntoa(ips.ip)),
                     where=ips, order_by=(ips.exchange_id,))
        results = list(res)
        res.purge()
        exchanges = [ex for ex, _ in results]
        ipss = [ip for _, ip in results]
        self.assertListEqual(['Adx', 'Appnexus', 'OpenX', 'Rubycon'], exchanges)
        self.assertListEqual(['127.0.0.1'] * 4, ipss)
Ejemplo n.º 47
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Ejemplo n.º 48
0
 def test_cardinality_all(self):
     hll = Table.from_tag(PIXELS_HLL)
     res = select(h_hll(hll.hll), where=hll)
     estimate = next(iter(res))[0]
     tokens = set([])
     with open("./fixtures/pixel.json") as f:
         for line in f:
             record = ujson.loads(line)
             tokens.add(record["token"])
     self.checkEstimate(estimate, len(tokens))
     res.purge()
Ejemplo n.º 49
0
 def test_combo_where_on_or_partition_ex2(self):
     imps = Table.from_tag(IMPS)
     res = select(imps.ad_id,
                  imps.date,
                  imps.cpm_millis,
                  where=((imps.date << ['2014-01-21', '2014-01-25']) &
                         (imps.ad_id << [30003, 30010])))
     results = list(res)
     self.assertEqual(len(results), 1)
     self.assertTrue(
         all(d == '2014-01-21' and a == 30010 for a, d, _ in results))
Ejemplo n.º 50
0
    def test_cardinality_on_condition(self):
        hll = Table.from_tag(PIXELS_HLL)
        active_tokens = set([])
        inactive_tokens = set([])
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                if record["isActive"]:
                    active_tokens.add(record["token"])
                else:
                    inactive_tokens.add(record["token"])
        res = select(h_hll(hll.hll), where=(hll.isActive == 1))
        estimate = next(iter(res))[0]
        self.checkEstimate(estimate, len(active_tokens))
        res.purge()

        res = select(h_hll(hll.hll), where=(hll.isActive == 0))
        estimate = next(iter(res))[0]
        self.checkEstimate(estimate, len(inactive_tokens))
        res.purge()
Ejemplo n.º 51
0
    def test_cardinality_on_condition(self):
        hll = Table.from_tag(PIXELS_HLL)
        active_tokens = set([])
        inactive_tokens = set([])
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                if record["isActive"]:
                    active_tokens.add(record["token"])
                else:
                    inactive_tokens.add(record["token"])
        res = select(h_hll(hll.hll), where=(hll.isActive == 1))
        estimate = next(iter(res))[0]
        self.checkEstimate(estimate, len(active_tokens))
        res.purge()

        res = select(h_hll(hll.hll), where=(hll.isActive == 0))
        estimate = next(iter(res))[0]
        self.checkEstimate(estimate, len(inactive_tokens))
        res.purge()
Ejemplo n.º 52
0
    def test_project(self):
        imps = Table.from_tag(IMPS)
        res = select(imps.click, imps.conversion, imps.impression, where=imps)
        clicks = conversions = impressions = 0
        for (click, conv, imp) in res:
            clicks += click
            conversions += conv
            impressions += imp

        self.assertEqual(clicks, 21)
        self.assertEqual(conversions, 5)
        self.assertEqual(impressions, 174)
Ejemplo n.º 53
0
    def test_nested_agg(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1
        results.purge()

        newtab = select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), h_count(),
                        where=imps.date > '2014-01-22',
                        nest=True)
        results = select(*star(newtab), where=newtab)
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        results.purge()
Ejemplo n.º 54
0
    def test_cardinality_with_order_by(self):
        hll = Table.from_tag(PIXELS_HLL)
        tokens_by_date = defaultdict(set)
        with open("./fixtures/pixel.json") as f:
            for line in f:
                record = ujson.loads(line)
                tokens_by_date[record["date"]].add(record["token"])
        result = [(date, len(tokens)) for date, tokens in tokens_by_date.items()]

        # Test order by date
        expects = sorted(result, key=itemgetter(0), reverse=True)
        res = select(hll.date, h_hll(hll.hll), where=hll, order_by=0, desc=True)
        estimates = list(res)
        for i, (date, expected_cardinality) in enumerate(expects):
            self.assertEqual(estimates[i][0], date)
            self.checkEstimate(estimates[i][1], expected_cardinality)
        res.purge()

        # Test order by hll
        res = select(hll.date, h_hll(hll.hll), where=hll, order_by=1, desc=True)
        l = list(res)
        for i in range(len(l) - 1):
            self.assertTrue(l[i][1] >= l[i + 1][1])
        res.purge()
Ejemplo n.º 55
-1
    def test_multiple_group_bys(self):
        imps = Table.from_tag(IMPS)
        results = select(imps.ad_id,
                         imps.date,
                         imps.cpm_millis,
                         where=imps.date > '2014-01-22')

        sum_millis = {}
        for ad_id, dt, millis in results:
            key = str(ad_id) + dt
            if key not in sum_millis:
                sum_millis[key] = [0, 0]
            sum_millis[key][0] += millis
            sum_millis[key][1] += 1
        results.purge()

        results = select(imps.ad_id,
                         imps.date,
                         h_sum(imps.cpm_millis),
                         h_count(),
                         where=imps.date > '2014-01-22')
        self.assertGreater(len(list(results)), 0)
        for ad_id, dt, millis, count in results:
            ad_tup = sum_millis[str(ad_id) + dt]
            self.assertEqual(millis, ad_tup[0])
            self.assertEqual(count, ad_tup[1])
        results.purge()
Ejemplo n.º 56
-1
    def test_overflow(self):
        from itertools import izip

        imps = Table.from_tag(IMPS)
        fly_results = select(imps.date, h_sum(imps.impression), where=imps, order_by=imps.date)

        nest_tab = select(imps.date, h_sum(imps.impression), where=imps, nest=True)
        nest_results = select(*star(nest_tab), where=nest_tab, order_by=0)

        for ((fdate, fimps), (ndate, nimps)) in izip(fly_results, nest_results):
            self.assertEqual(fdate, ndate)
            self.assertEqual(fimps, nimps)