def test_simple_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on simple aggs
  """
  q = """
  SELECT x::integer %% 10 AS k,
  avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_simple_aggs', q)
  pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_simple_aggs_t', desc, rows)

  table_result = list(pipeline.execute('SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t'))
  cv_result = list(pipeline.execute('SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.00001
    assert abs(tr[1] - cr[1]) < 0.00001
    assert abs(tr[2] - cr[2]) < 0.00001
def test_null_groups(pipeline, clean_db):
  """
  Verify that null group columns are considered equal
  """
  pipeline.create_stream("s", x="int", y="int", z="int")
  q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
  GROUP BY x, y, z;
  """
  desc = ("x", "y", "z")
  pipeline.create_cv("test_null_groups", q)
  pipeline.create_table("test_null_groups_t", x="integer", y="integer", z="integer")

  rows = []
  for n in range(10000):
    vals = list(random.randint(0, 10) for n in range(3))
    vals = map(lambda n: random.random() > 0.1 and n or None, vals)
    rows.append(tuple(vals))

  pipeline.insert("s", desc, rows)
  pipeline.insert("test_null_groups_t", desc, rows)

  table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
  expected = list(pipeline.execute(table_q))
  result = list(pipeline.execute("SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z"))

  for r, e in zip(result, expected):
    assert r == e
def test_incremental_join(pipeline, clean_db):
  """
  Verify that join results increase appropriately as we incrementally
  add stream events to the input
  """
  num_cols = 4
  join_cols = [0, 1]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('inc', **t_cols)
  pipeline.create_stream('stream', **t_cols)

  q = """
  SELECT s.col0::integer FROM inc JOIN stream s
  ON inc.col0 = s.col0 AND inc.col1 = s.col1::integer
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'inc', t, 0.1)

  pipeline.create_cv('test_join', q)

  s = []
  for n in range(2):
    row = _generate_row(num_cols)
    _insert(pipeline, 'stream', [row])
    s.append(row)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join').first()

  assert result['count'] == len(expected)
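# The join tests in this file rely on helpers (_generate_row, _generate_rows,
# _insert, _join) defined elsewhere in the suite. The following is only a
# minimal sketch of plausible implementations, assuming rows are tuples of
# small random integers and columns follow the col0, col1, ... convention;
# the real helpers may differ. `random` and `time` are imported at the top
# of the module, since the tests above and below already use them.
def _generate_row_sketch(num_cols):
  # One row of small random integers (assumed value range).
  return tuple(random.randint(0, 32) for _ in range(num_cols))

def _generate_rows_sketch(num_cols, num_rows):
  return [_generate_row_sketch(num_cols) for _ in range(num_rows)]

def _insert_sketch(pipeline, target, rows, sleep=0):
  # Assumes col0, col1, ... column names; optionally waits so that
  # table writes are visible before the CV is created.
  desc = ['col%d' % c for c in range(len(rows[0]))]
  pipeline.insert(target, desc, rows)
  if sleep:
    time.sleep(sleep)

def _join_sketch(left, right, cols):
  # Naive nested-loop inner join on the given column offsets; each output
  # row is the left row concatenated with the matching right row, which is
  # consistent with how the tests index into the expected rows.
  result = []
  for l in left:
    for r in right:
      if all(l[c] == r[c] for c in cols):
        result.append(l + r)
  return result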
def test_hll_distinct(pipeline, clean_db):
  """
  Verify that combines work on HLL COUNT DISTINCT queries
  """
  q = """
  SELECT x::integer %% 10 AS k, COUNT(DISTINCT x) AS count FROM stream GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_cv('test_hll_distinct', q)
  pipeline.create_table('test_hll_distinct_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_hll_distinct_t', desc, rows)

  # Note that the CQ will use the HLL variant of COUNT DISTINCT,
  # so use hll_count_distinct on the table too
  tq = """
  SELECT hll_count_distinct(x) FROM test_hll_distinct_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(count) FROM test_hll_distinct
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert tr == cr
def test_indexed(pipeline, clean_db):
  """
  Verify that stream-table joins involving indexed tables work
  """
  pipeline.create_stream('stream', x='int', y='int')
  q = """
  SELECT stream.x::integer, count(*) FROM stream
  JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
  """
  pipeline.create_table('test_indexed_t', x='integer', y='integer')
  pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

  t = _generate_rows(2, 1000)
  s = _generate_rows(2, 1000)

  pipeline.insert('test_indexed_t', ('x', 'y'), t)
  time.sleep(0.1)

  pipeline.create_cv('test_indexed', q)
  pipeline.insert('stream', ('x', 'y'), s)

  expected = _join(s, t, [0])
  result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

  assert result['sum'] == len(expected)
def test_join_multiple_tables(pipeline, clean_db):
  """
  Verify that stream-table joins involving multiple tables work
  """
  num_cols = 8
  join_cols = [0]
  t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('t0', **t0_cols)
  pipeline.create_table('t1', **t1_cols)
  pipeline.create_stream('stream', **t0_cols)

  q = """
  SELECT s.col0::integer FROM t0
  JOIN t1 ON t0.col0 = t1.col0
  JOIN stream s ON t1.col0 = s.col0
  """
  t0 = _generate_rows(num_cols, 64)
  t1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 't1', t1, 0.1)
  _insert(pipeline, 't0', t0, 0.1)

  pipeline.create_cv('test_join_multi', q)
  _insert(pipeline, 'stream', s)

  expected = _join(t0, _join(s, t1, join_cols), join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

  assert result['count'] == len(expected)
def test_join_with_where(pipeline, clean_db):
  """
  Verify that stream-table joins using a WHERE clause work properly
  """
  num_cols = 4
  q = """
  SELECT s.col0::integer FROM stream s, wt
  WHERE s.col0 = 1 AND wt.col0 = 1
  """
  wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('wt', **wt_cols)
  pipeline.create_table('wt_s', **wt_cols)

  wt = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'wt', wt, 0.1)
  _insert(pipeline, 'wt_s', s, 0.1)

  pipeline.create_stream('stream', **wt_cols)
  pipeline.create_cv('test_join_where', q)
  _insert(pipeline, 'stream', s)

  expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1').first()
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_where').first()

  assert result['count'] == expected['count']
def test_stats_aggs(pipeline, clean_db):
  """
  Verify that combines work on stats aggs
  """
  q = """
  SELECT x::integer %% 10 AS k,
  regr_sxx(x, y::float8), stddev(x) FROM stream GROUP BY k;
  """
  desc = ('x', 'y')
  pipeline.create_cv('test_stats_aggs', q)
  pipeline.create_table('test_stats_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_stats_aggs_t', desc, rows)

  tq = """
  SELECT regr_sxx(x, y::float8), stddev(x) FROM test_stats_aggs_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(regr_sxx), combine(stddev) FROM test_stats_aggs
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.00001
    assert abs(tr[1] - cr[1]) < 0.00001
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the
  file's input columns are a subset of stream0's columns
  """
  pipeline.create_stream("stream0", x="int", y="float8", z="numeric", m="int")
  q = "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0"
  pipeline.create_cv("test_copy_subset", q)
  pipeline.create_table("test_copy_subset_t", x="integer", y="float8", z="numeric")

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, "test_copy.csv"))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=("x", "y", "z"))

  pipeline.execute("COPY test_copy_subset_t (x, y, z) FROM '%s' HEADER CSV" % path)
  pipeline.execute("COPY stream0 (x, y, z) FROM '%s' HEADER CSV" % path)

  expected = pipeline.execute(
      "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t"
  ).first()
  result = list(pipeline.execute("SELECT s0, s1, avg FROM test_copy_subset"))

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def _test_agg(pipeline, agg, check_fn=None):
  name = agg[:agg.find('(')]
  q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
  cv_name = 'test_%s' % name
  table_name = 'test_%s_t' % name
  desc = ('ts', 'g', 'x', 'y', 'z')

  pipeline.create_cv(cv_name, q % (agg, 'stream'))
  pipeline.create_table(table_name, ts='timestamp', x='integer', y='integer', z='integer', g='integer')

  rows = []
  for i, n in enumerate(range(1000)):
    ts = str(datetime.utcnow() + timedelta(seconds=i))
    row = ts, n % 10, random.randint(1, 256), random.randint(1, 256), random.randint(1, 256)
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert(table_name, desc, rows)

  if check_fn:
    return check_fn(pipeline)

  expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
  result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

  assert len(expected) == len(result)

  for e, r in zip(expected, result):
    assert e == r

  pipeline.drop_cv(cv_name)
  pipeline.drop_table(table_name)
def test_join_across_batches(pipeline, clean_db):
  """
  Verify that stream-table joins are properly built when they
  span across multiple input batches
  """
  num_cols = 4
  join_cols = [0]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('batch', **t_cols)
  pipeline.create_stream('stream', **t_cols)

  q = """
  SELECT s.col0::integer FROM batch JOIN stream s ON batch.col0 = s.col0
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'batch', t, 0.1)

  s = _generate_rows(num_cols, 64)
  pipeline.create_cv('test_batched_join', q)
  _insert(pipeline, 'stream', s)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join').first()

  assert result['count'] == len(expected)
def test_copy_to_typed_stream(pipeline, clean_db):
  """
  Verify that copying data from a file into a typed stream works.
  """
  pipeline.create_stream('stream', x='integer', y='float8', z='numeric')
  q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
  pipeline.create_cv('test_copy_to_typed_stream', q)
  pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute('COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute('SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the
  file's input columns are a subset of the stream's columns
  """
  q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream'
  pipeline.create_cv('test_copy_subset', q)
  pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute('COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute('SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_null_groups(pipeline, clean_db):
  """
  Verify that null group columns are considered equal
  """
  pipeline.create_stream('stream', x='int', y='int', z='int')
  q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM stream
  GROUP BY x, y, z;
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_null_groups', q)
  pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

  rows = []
  for n in range(10000):
    vals = list(random.randint(0, 10) for n in range(3))
    vals = map(lambda n: random.random() > 0.1 and n or None, vals)
    rows.append(tuple(vals))

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_null_groups_t', desc, rows)

  table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
  expected = list(pipeline.execute(table_q))
  result = list(pipeline.execute('SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

  for r, e in zip(result, expected):
    assert r == e
def test_join_ordering(pipeline, clean_db):
  """
  Verify that the correct plan is generated regardless of the ordering of
  streams and tables.
  """
  num_cols = 8
  join_cols = [0]
  ordering0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  ordering1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('ordering0', **ordering0_cols)
  pipeline.create_table('ordering1', **ordering1_cols)

  ordering0 = _generate_rows(num_cols, 64)
  ordering1 = _generate_rows(num_cols, 64)

  _insert(pipeline, 'ordering0', ordering0, 0.1)
  _insert(pipeline, 'ordering1', ordering1, 0.1)

  pipeline.create_stream('stream', **ordering0_cols)

  # stream, table, table
  q0 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM stream s
  JOIN ordering0 ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering0', q0)

  # table, stream, table
  q1 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM ordering0
  JOIN stream s ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering1', q1)

  # table, table, stream
  q2 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM ordering0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  JOIN stream s ON s.col0 = ordering0.col0
  """
  pipeline.create_cv('test_ordering2', q2)

  s = _generate_rows(num_cols, 64)
  _insert(pipeline, 'stream', s)

  expected = _join(ordering0, _join(ordering1, s, join_cols), join_cols)

  result0 = pipeline.execute('SELECT COUNT(*) FROM test_ordering0').first()
  result1 = pipeline.execute('SELECT COUNT(*) FROM test_ordering1').first()
  result2 = pipeline.execute('SELECT COUNT(*) FROM test_ordering2').first()

  assert result0['count'] == len(expected)
  assert result1['count'] == len(expected)
  assert result2['count'] == len(expected)
def test_bloom_type(pipeline, clean_db):
  pipeline.create_table("test_bloom_type", x="int", y="bloom")
  pipeline.execute("INSERT INTO test_bloom_type (x, y) VALUES "
                   "(1, bloom_empty()), (2, bloom_empty())")

  for i in xrange(1000):
    pipeline.execute("UPDATE test_bloom_type SET y = bloom_add(y, %d / x)" % i)

  result = list(pipeline.execute("SELECT bloom_cardinality(y) "
                                 "FROM test_bloom_type ORDER BY x"))
  assert result[0][0] == 986
  assert result[1][0] == 495
def test_hll_type(pipeline, clean_db):
  pipeline.create_table('test_hll_type', x='int', y='hll')
  pipeline.execute('INSERT INTO test_hll_type (x, y) VALUES '
                   '(1, hll_empty()), (2, hll_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_hll_type SET y = hll_add(y, %d / x)' % i)

  result = list(pipeline.execute('SELECT hll_cardinality(y) '
                                 'FROM test_hll_type ORDER BY x'))
  assert result[0][0] == 995
  assert result[1][0] == 497
def test_cmsketch_type(pipeline, clean_db):
  pipeline.create_table('test_cmsketch_type', x='int', y='cmsketch')
  pipeline.execute('INSERT INTO test_cmsketch_type (x, y) VALUES '
                   '(1, cmsketch_empty()), (2, cmsketch_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_cmsketch_type '
                     'SET y = cmsketch_add(y, {} %% x)'.format(i))

  result = list(pipeline.execute('SELECT cmsketch_frequency(y, 0), '
                                 'cmsketch_frequency(y, 1) '
                                 'FROM test_cmsketch_type ORDER BY x'))
  assert result[0] == (1000, 0)
  assert result[1] == (500, 500)
def test_cmsketch_type(pipeline, clean_db):
  pipeline.create_table("test_cmsketch_type", x="int", y="cmsketch")
  pipeline.execute("INSERT INTO test_cmsketch_type (x, y) VALUES "
                   "(1, cmsketch_empty()), (2, cmsketch_empty())")

  for i in xrange(1000):
    pipeline.execute("UPDATE test_cmsketch_type "
                     "SET y = cmsketch_add(y, {} %% x)".format(i))

  result = list(
      pipeline.execute(
          "SELECT cmsketch_frequency(y, 0), "
          "cmsketch_frequency(y, 1) "
          "FROM test_cmsketch_type ORDER BY x"
      )
  )
  assert result[0] == (1000, 0)
  assert result[1] == (500, 500)
def test_tdigest_type(pipeline, clean_db):
  pipeline.create_table('test_tdigest_type', x='int', y='tdigest')
  pipeline.execute('INSERT INTO test_tdigest_type (x, y) VALUES '
                   '(1, tdigest_empty()), (2, tdigest_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_tdigest_type '
                     'SET y = tdigest_add(y, {} %% (x * 500))'.format(i))

  result = list(pipeline.execute('SELECT tdigest_cdf(y, 400), '
                                 'tdigest_quantile(y, 0.9) '
                                 'FROM test_tdigest_type ORDER BY x'))
  assert map(lambda x: round(x, 1), result[0]) == [0.8, 449.5]
  assert map(lambda x: round(x, 1), result[1]) == [0.4, 899.5]
def test_join_with_aggs(pipeline, clean_db):
  """
  Verify that joins involving aggregates referencing columns from
  multiple tables work
  """
  num_cols = 4
  join_cols = [1]
  q = """
  SELECT
  sum(s.col0::integer) AS s0,
  sum(a0.col0::integer) AS s1,
  sum(a1.col0::integer) AS s2
  FROM a1
  JOIN a0 ON a1.col1 = a0.col1
  JOIN stream s ON s.col1::integer = a0.col1
  """
  a0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  a1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('a0', **a0_cols)
  pipeline.create_table('a1', **a1_cols)

  a0 = _generate_rows(num_cols, 64)
  a1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'a0', a0, 0.1)
  _insert(pipeline, 'a1', a1, 0.1)

  pipeline.create_stream('stream', **a0_cols)
  pipeline.create_cv('test_agg_join', q)
  _insert(pipeline, 'stream', s)

  expected = _join(a1, _join(a0, s, join_cols), join_cols)
  result = pipeline.execute('SELECT * FROM test_agg_join').first()

  # sum of col0 from stream
  s0_expected = sum([r[num_cols * 2] for r in expected])
  # sum of col0 from a0
  s1_expected = sum([r[num_cols * 1] for r in expected])
  # sum of col0 from a1
  s2_expected = sum([r[num_cols * 0] for r in expected])

  assert s0_expected == result['s0']
  assert s1_expected == result['s1']
  assert s2_expected == result['s2']
def test_percentile_cont_agg(pipeline, clean_db):
  range_top = 100000
  q = [0.0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1.0]

  batches = []
  min_seen = range_top
  max_seen = 0
  for _ in xrange(10):
    b = [(random.randint(0, range_top),) for _ in xrange(5000)]
    min_seen = min(min_seen, min(b)[0])
    max_seen = max(max_seen, max(b)[0])
    batches.append(b)

  pipeline.create_stream('test_stream', x='int')
  query = '''SELECT percentile_cont(ARRAY[%s])
  WITHIN GROUP (ORDER BY x::integer) FROM %s
  ''' % (', '.join(map(lambda f: str(f), q)), '%s')

  pipeline.create_cv('test_cq_percentile_cont', query % 'test_stream')
  pipeline.create_table('test_percentile_cont', x='integer')

  for b in batches:
    pipeline.insert('test_stream', ('x',), b)
    pipeline.insert('test_percentile_cont', ('x',), b)

  actual = pipeline.execute(query % 'test_percentile_cont')
  result = pipeline.execute('SELECT * FROM test_cq_percentile_cont')

  actual = actual.first()['percentile_cont']
  result = result.first()['percentile_cont']

  assert len(actual) == len(result)
  assert result == sorted(result)

  diff = [abs(actual[i] - result[i]) for i in xrange(len(actual))]

  # 0th and 100th percentile should be accurate.
  assert result[0] == min_seen
  assert result[-1] == max_seen
  # 1st and 99th percentile should be within 0.1%.
  assert diff[1] <= 0.001 * range_top
  assert diff[-2] <= 0.001 * range_top
  # All percentiles should be within 0.5%.
  assert all(x <= 0.005 * range_top for x in diff)
def test_join_with_aggs(pipeline, clean_db):
  """
  Verify that joins involving aggregates referencing columns from
  multiple tables work
  """
  num_cols = 4
  join_cols = [1]
  q = """
  SELECT
  sum(s.col0::integer) AS s0,
  sum(a0.col0::integer) AS s1,
  sum(a1.col0::integer) AS s2
  FROM a1
  JOIN a0 ON a1.col1 = a0.col1
  JOIN stream s ON s.col1::integer = a0.col1
  """
  a0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  a1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('a0', **a0_cols)
  pipeline.create_table('a1', **a1_cols)

  a0 = _generate_rows(num_cols, 64)
  a1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'a0', a0, 0.1)
  _insert(pipeline, 'a1', a1, 0.1)

  pipeline.create_cv('test_agg_join', q)
  _insert(pipeline, 'stream', s)

  expected = _join(a1, _join(a0, s, join_cols), join_cols)
  result = pipeline.execute('SELECT * FROM test_agg_join').first()

  # sum of col0 from stream
  s0_expected = sum([r[num_cols * 2] for r in expected])
  # sum of col0 from a0
  s1_expected = sum([r[num_cols * 1] for r in expected])
  # sum of col0 from a1
  s2_expected = sum([r[num_cols * 0] for r in expected])

  assert s0_expected == result['s0']
  assert s1_expected == result['s1']
  assert s2_expected == result['s2']
def test_cont_transforms(pipeline, clean_db):
  pipeline.execute('CREATE FOREIGN TABLE cv_stream (x int, y text) SERVER pipelinedb')
  pipeline.execute('CREATE FOREIGN TABLE ct_stream (x int, y text) SERVER pipelinedb')
  pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
  pipeline.create_ct('test_ct1', 'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
                     "pipelinedb.insert_into_stream('cv_stream', 'cv_stream')")
  pipeline.create_table('test_t', x='int', y='text')
  pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
    INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
  pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream', 'test_tg')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv')[0]['count'] == 2

  _dump(pipeline, 'test_cont_transform.sql')

  pipeline.drop_all()
  pipeline.drop_table('test_t')
  pipeline.execute('DROP FUNCTION test_tg()')

  _restore(pipeline, 'test_cont_transform.sql')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv')[0]['count'] == 4
  ntups = 0
  for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
    assert row['count'] == 2
    assert row['x'] in (1, 2)
    ntups += 1

  assert ntups == 2
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the
  file's input columns are a subset of stream0's columns
  """
  pipeline.create_stream('stream0', x='int', y='float8', z='numeric', m='int')
  q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0'
  pipeline.create_cv('test_copy_subset', q)
  pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute(
      'COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream0 (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute(
      'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t'
  ).first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_cont_transforms(pipeline, clean_db):
  pipeline.execute('CREATE STREAM cv_stream (x int, y text)')
  pipeline.execute('CREATE STREAM ct_stream (x int, y text)')
  pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
  pipeline.create_ct('test_ct1', 'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
                     "pipeline_stream_insert('cv_stream', 'cv_stream')")
  pipeline.create_table('test_t', x='int', y='text')
  pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
    INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
  pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream', 'test_tg()')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  _dump(pipeline, 'test_cont_transform.sql')

  pipeline.drop_all()
  pipeline.drop_table('test_t')
  pipeline.execute('DROP FUNCTION test_tg()')

  _restore(pipeline, 'test_cont_transform.sql')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv').first()['count'] == 4
  ntups = 0
  for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
    assert row['count'] == 2
    assert row['x'] in (1, 2)
    ntups += 1

  assert ntups == 2
def test_distinct(pipeline, clean_db):
  """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
  pipeline.create_stream('stream0', x='int', y='int', z='int')
  pipeline.create_table('table0', x='int', y='int', z='int')
  q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
  pipeline.create_cv('test_distinct', q)

  uniques = defaultdict(set)
  values = []
  for _ in xrange(2000):
    x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(0, 20)
    values.append((x, y, z))
    uniques[(x, y - z)].add(y)

  pipeline.insert('stream0', ['x', 'y', 'z'], values)
  pipeline.insert('table0', ['x', 'y', 'z'], values)

  q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
  expected = pipeline.execute(q)
  expected = len(expected)

  assert expected < 2000

  result = pipeline.execute('SELECT COUNT(*) FROM test_distinct')[0]

  assert expected == result['count']

  # Check if the first row was selected for uniques
  result = pipeline.execute('SELECT * FROM test_distinct')
  reverse_uniques = defaultdict(set)

  for (x, _), ys in uniques.iteritems():
    for y in ys:
      reverse_uniques[y].add(x)

  for row in result:
    assert row['x'] in reverse_uniques[row['y']]
def test_tdigest_type(pipeline, clean_db):
  pipeline.create_table('test_tdigest_type', x='int', y='tdigest')
  pipeline.execute('INSERT INTO test_tdigest_type (x, y) VALUES '
                   '(1, tdigest_empty()), (2, tdigest_empty())')

  for i in xrange(1000):
    pipeline.execute(
        'UPDATE test_tdigest_type SET y = dist_add(y, %d %% (x * 500))' % i)

  result = list(
      pipeline.execute('SELECT dist_cdf(y, 400), '
                       'dist_quantile(y, 0.9) '
                       'FROM test_tdigest_type ORDER BY x'))
  assert map(
      lambda x: round(x, 1),
      (result[0]['dist_cdf'], result[0]['dist_quantile'])) == [0.8, 449.5]
  assert map(
      lambda x: round(x, 1),
      (result[1]['dist_cdf'], result[1]['dist_quantile'])) == [0.4, 899.5]
def test_copy_to_typed_stream(pipeline, clean_db):
  """
  Verify that copying data from a file into a typed stream works.
  """
  pipeline.create_stream('stream', x='integer', y='float8', z='numeric')
  q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
  pipeline.create_cv('test_copy_to_typed_stream', q)
  pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8',
                        z='numeric')
  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute(
      'COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute(
      'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t'
  ).first()
  result = list(
      pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_nested_expressions(pipeline, clean_db):
  """
  Verify that combines work properly on arbitrarily nested expressions
  """
  q = """
  SELECT x::integer %% 10 AS k,
  (rank(256) WITHIN GROUP (ORDER BY x) + dense_rank(256) WITHIN GROUP (ORDER BY x)) *
  (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
  FROM stream GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_cv('test_nested', q)
  pipeline.create_table('test_nested_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_nested_t', desc, rows)

  # Note that the CQ will use the HLL variant of dense_rank,
  # so use hll_dense_rank on the table too
  tq = """
  SELECT
  (rank(256) WITHIN GROUP (ORDER BY x) + hll_dense_rank(256) WITHIN GROUP (ORDER BY x)) *
  (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
  FROM test_nested_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(whoa) FROM test_nested
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.0001
def test_hypothetical_set_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on HS aggs
  """
  q = """
  SELECT x::integer % 10 AS k,
  rank(256) WITHIN GROUP (ORDER BY x),
  dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM stream0 GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_hs_aggs', q)
  pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_hs_aggs_t', desc, rows)

  # Note that the CQ will use the combinable variant of dense_rank,
  # so use that on the table too
  tq = """
  SELECT rank(256) WITHIN GROUP (ORDER BY x), combinable_dense_rank(256, x)
  FROM test_hs_aggs_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert tr[0] == cr[0]
    assert tr[1] == cr[1]
def test_distinct(pipeline, clean_db):
  """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
  pipeline.create_stream('stream0', x='int', y='int', z='int')
  pipeline.create_table('table0', x='int', y='int', z='int')
  q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
  pipeline.create_cv('test_distinct', q)

  uniques = defaultdict(set)
  values = []
  for _ in xrange(2000):
    x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(0, 20)
    values.append((x, y, z))
    uniques[(x, y - z)].add(y)

  pipeline.insert('stream0', ['x', 'y', 'z'], values)
  pipeline.insert('table0', ['x', 'y', 'z'], values)

  q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
  expected = list(pipeline.execute(q))
  expected = len(expected)

  assert expected < 2000

  result = pipeline.execute('SELECT COUNT(*) FROM test_distinct').first()

  assert expected == result['count']

  # Check if the first row was selected for uniques
  result = pipeline.execute('SELECT * FROM test_distinct')
  reverse_uniques = defaultdict(set)

  for (x, _), ys in uniques.iteritems():
    for y in ys:
      reverse_uniques[y].add(x)

  for row in result:
    assert row['x'] in reverse_uniques[row['y']]
def test_object_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on object aggs
  """
  q = """
  SELECT x::integer % 10 AS k,
  json_agg(x), json_object_agg(x, y::float8),
  string_agg(s::text, ' :: ') FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y', 's')
  pipeline.create_stream('stream0', x='int', y='float8', s='text')
  pipeline.create_cv('test_object_aggs', q)
  pipeline.create_table('test_object_aggs_t', x='integer', y='float8', s='text')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random(), str(n) * random.randint(1, 8))
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_object_aggs_t', desc, rows)

  tq = """
  SELECT json_agg(x), json_object_agg(x, y::float8),
  string_agg(s::text, ' :: ') FROM test_object_aggs_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(json_agg), combine(json_object_agg), combine(string_agg)
  FROM test_object_aggs
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert sorted(tr[0]) == sorted(cr[0])
    assert sorted(tr[1]) == sorted(cr[1])
    assert sorted(tr[2]) == sorted(cr[2])
def test_hypothetical_set_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on HS aggs
  """
  q = """
  SELECT x::integer %% 10 AS k,
  rank(256) WITHIN GROUP (ORDER BY x),
  dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM stream0 GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_hs_aggs', q)
  pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_hs_aggs_t', desc, rows)

  # Note that the CQ will use the HLL variant of dense_rank,
  # so use hll_dense_rank on the table too
  tq = """
  SELECT rank(256) WITHIN GROUP (ORDER BY x),
  hll_dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM test_hs_aggs_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert tr == cr
def test_produce(pipeline, kafka, clean_db):
  """
  Tests pipeline_kafka.emit_tuple and pipeline_kafka.produce_message
  """
  pipeline.create_stream('stream0', payload='json')
  pipeline.create_cv('cv', 'SELECT payload FROM stream0')
  pipeline.create_table('t', x='integer', y='integer')
  pipeline.execute("""CREATE TRIGGER tg AFTER INSERT ON t
    FOR EACH ROW EXECUTE PROCEDURE pipeline_kafka.emit_tuple('topic')
    """)

  kafka.create_topic('topic', partitions=4)
  pipeline.consume_begin('topic', 'stream0')

  for i in range(100):
    pipeline.insert('t', ('x', 'y'), [(i, 2 * i)])

  def messages():
    rows = pipeline.execute('SELECT * FROM cv')
    assert len(rows) == 100

  assert eventually(messages)
def _test_agg(pipeline, agg, check_fn=None):
  name = agg[:agg.find('(')]
  q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
  cv_name = 'test_%s' % name
  table_name = 'test_%s_t' % name
  desc = ('ts', 'g', 'x', 'y', 'z')

  pipeline.create_cv(cv_name, q % (agg, 'stream'))
  pipeline.create_table(table_name, ts='timestamp', x='integer', y='integer',
                        z='integer', g='integer')

  rows = []
  for i, n in enumerate(range(1000)):
    ts = str(datetime.utcnow() + timedelta(seconds=i))
    row = ts, n % 10, random.randint(1, 256), random.randint(1, 256), random.randint(1, 256)
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert(table_name, desc, rows)

  if check_fn:
    return check_fn(pipeline)

  expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
  result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

  assert len(expected) == len(result)

  for e, r in zip(expected, result):
    assert e == r

  pipeline.drop_cv(cv_name)
  pipeline.drop_table(table_name)
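# _test_agg is a parameterized helper: individual windowed-aggregate tests
# presumably call it with an aggregate expression over the stream's columns
# and, optionally, a custom check function. A hedged usage sketch; the test
# names and aggregate strings below are illustrative assumptions, not the
# suite's actual cases.
def test_windowed_sum_example(pipeline, clean_db):
  # Builds test_sum / test_sum_t internally and compares CV vs. table output.
  _test_agg(pipeline, 'sum(x::integer)')

def test_windowed_avg_example(pipeline, clean_db):
  _test_agg(pipeline, 'avg(x::integer + y::integer)')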
def test_windowed_combine(pipeline, clean_db):
  """
  Verify that windowed queries with combines work
  """
  q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_windowed_combine', q)
  pipeline.create_table('test_windowed_combine_t', x='integer', y='integer')

  rows = []
  for n in range(10000):
    row = (n, n)
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_windowed_combine_t', desc, rows)

  table = """
  SELECT first_value(x) OVER w, avg(y) OVER w
  FROM test_windowed_combine_t
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
  expected = pipeline.execute(table)
  combine = """
  SELECT first_value(x) OVER w, avg(avg) OVER w
  FROM test_windowed_combine
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
  actual = pipeline.execute(combine)

  for e, a in zip(expected, actual):
    assert e[0] == a[0]
    assert e[1] == a[1]
def test_object_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on object aggs
  """
  q = """
  SELECT x::integer %% 10 AS k,
  json_agg(x), json_object_agg(x, y::float8),
  string_agg(s::text, ' :: ') FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y', 's')
  pipeline.create_stream('stream0', x='int', y='float8', s='text')
  pipeline.create_cv('test_object_aggs', q)
  pipeline.create_table('test_object_aggs_t', x='integer', y='float8', s='text')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random(), str(n) * random.randint(1, 8))
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_object_aggs_t', desc, rows)

  tq = """
  SELECT json_agg(x), json_object_agg(x, y::float8),
  string_agg(s::text, ' :: ') FROM test_object_aggs_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(json_agg), combine(json_object_agg), combine(string_agg)
  FROM test_object_aggs
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert sorted(tr[0]) == sorted(cr[0])
    assert sorted(tr[1]) == sorted(cr[1])
    assert sorted(tr[2]) == sorted(cr[2])
def test_windowed_combine(pipeline, clean_db):
  """
  Verify that windowed queries with combines work
  """
  q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_windowed_combine', q)
  pipeline.create_table('test_windowed_combine_t', x='integer', y='integer')

  rows = []
  for n in range(10000):
    row = (n, n)
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_windowed_combine_t', desc, rows)

  table = """
  SELECT first_value(x) OVER w, avg(y) OVER w
  FROM test_windowed_combine_t
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
  expected = list(pipeline.execute(table))
  combine = """
  SELECT first_value(x) OVER w, avg(avg) OVER w
  FROM test_windowed_combine
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
  actual = list(pipeline.execute(combine))

  for e, a in zip(expected, actual):
    assert e == a
def test_null_groups(pipeline, clean_db):
  """
  Verify that null group columns are considered equal
  """
  pipeline.create_stream('s', x='int', y='int', z='int')
  q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
  GROUP BY x, y, z;
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_null_groups', q)
  pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

  rows = []
  for n in range(10000):
    vals = list(random.randint(0, 10) for n in range(3))
    vals = map(lambda n: random.random() > 0.1 and n or None, vals)
    rows.append(tuple(vals))

  pipeline.insert('s', desc, rows)
  pipeline.insert('test_null_groups_t', desc, rows)

  table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
  expected = list(pipeline.execute(table_q))
  result = list(
      pipeline.execute(
          'SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

  for r, e in zip(result, expected):
    assert r == e
def test_indexed(pipeline, clean_db):
  """
  Verify that stream-table joins involving indexed tables work
  """
  q = """
  SELECT stream.x::integer, count(*) FROM stream
  JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
  """
  pipeline.create_table('test_indexed_t', x='integer', y='integer')
  pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

  t = _generate_rows(2, 1000)
  s = _generate_rows(2, 1000)

  pipeline.insert('test_indexed_t', ('x', 'y'), t)
  time.sleep(0.1)

  pipeline.create_cv('test_indexed', q)
  pipeline.insert('stream', ('x', 'y'), s)

  expected = _join(s, t, [0])
  result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

  assert result['sum'] == len(expected)
def test_hll_distinct(pipeline, clean_db):
  """
  Verify that combines work on HLL COUNT DISTINCT queries
  """
  q = """
  SELECT x::integer % 10 AS k, COUNT(DISTINCT x) AS count FROM stream0 GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_hll_distinct', q)
  pipeline.create_table('test_hll_distinct_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_hll_distinct_t', desc, rows)

  # Note that the CQ will use the HLL variant of COUNT DISTINCT,
  # so use hll_count_distinct on the table too
  tq = """
  SELECT hll_count_distinct(x) FROM test_hll_distinct_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(count) FROM test_hll_distinct
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert tr[0] == cr[0]
def test_stats_aggs(pipeline, clean_db):
  """
  Verify that combines work on stats aggs
  """
  q = """
  SELECT x::integer % 10 AS k,
  regr_sxx(x, y::float8), stddev(x) FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_stats_aggs', q)
  pipeline.create_table('test_stats_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_stats_aggs_t', desc, rows)

  tq = """
  SELECT regr_sxx(x, y::float8), stddev(x) FROM test_stats_aggs_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(regr_sxx), combine(stddev) FROM test_stats_aggs
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.00001
    assert abs(tr[1] - cr[1]) < 0.00001
def test_binary_upgrade(pipeline, clean_db):
  """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
  # Create some regular tables with data, and create an index on half of them
  for n in range(16):
    name = 't_%d' % n
    pipeline.create_table(name, x='integer', y='text', z='text')
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert(name, ('x', 'y', 'z'), rows)
    if n >= 8:
      pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

  # Create some streams
  for n in range(8):
    name = 's_%d' % n
    pipeline.create_stream(name, x='integer', y='text')

  # Now create some CVs with data, some with indices
  for n in range(32):
    name = 'cv_%d' % n
    pipeline.create_stream('stream_%d' % n, x='int', y='text', z='text')
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 16:
      pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

  # Now create some in another namespace
  pipeline.execute('CREATE SCHEMA namespace')
  for n in range(8):
    name = 'namespace.cv_%d' % n
    pipeline.create_stream('namespace.stream_%d' % n, x='int', y='text', z='text')
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 4:
      pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' % (n, name))

  create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
  pipeline.execute(create_fn)

  pipeline.create_stream('stream0', z='text')

  # Create some transforms
  for n in range(8):
    name = 'ct_%d' % n
    pipeline.create_ct(name, 'SELECT z::text FROM stream0', 'tg_fn()')

  time.sleep(10)

  old_bin_dir = new_bin_dir = pipeline.bin_dir
  old_data_dir = pipeline.data_dir
  new_data_dir = os.path.abspath('test_binary_upgrade_data_dir')

  pipeline.stop()

  p = subprocess.Popen(
      [os.path.join(pipeline.bin_dir, 'pipeline-init'), '-D', new_data_dir])
  stdout, stderr = p.communicate()

  result = subprocess.check_call([
      os.path.join(pipeline.bin_dir, 'pipeline-upgrade'),
      '-b', old_bin_dir, '-B', new_bin_dir,
      '-d', old_data_dir, '-D', new_data_dir
  ])
  assert result == 0

  # The cleanup path expects this to be running, but we're done with it
  pipeline.run()

  # pipeline-upgrade returned successfully and has already done sanity checks
  # but let's manually verify that all objects were migrated to the new data directory
  upgraded = PipelineDB(data_dir=new_data_dir)
  upgraded.run()

  # Tables
  for n in range(16):
    name = 't_%d' % n
    q = 'SELECT x, y, z FROM %s ORDER BY x' % name
    rows = upgraded.execute(q)
    for i, row in enumerate(rows):
      x, y, z = row
      assert x == i
      assert y == name
      assert z == name

  # Streams
  for n in range(8):
    name = 's_%d' % n
    rows = list(
        upgraded.execute(
            "SELECT oid FROM pg_class WHERE relkind = '$' AND relname = '%s'" % name))
    assert len(rows) == 1

  # CVs
  for n in range(32):
    name = 'cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # CVs in separate schema
  for n in range(8):
    name = 'namespace.cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # Transforms
  for n in range(8):
    name = 'ct_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipeline_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  upgraded.stop()
  shutil.rmtree(new_data_dir)
def test_binary_upgrade(pipeline, clean_db):
  """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
  if pipeline.version_num == 110000:
    pytest.skip('skipping until PG11 supports dump/restore WITH OIDS')

  # Create some regular tables with data, and create an index on half of them
  for n in range(16):
    name = 't_%d' % n
    pipeline.create_table(name, x='integer', y='text', z='text')
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert(name, ('x', 'y', 'z'), rows)
    if n >= 8:
      pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

  # Create some streams
  for n in range(8):
    name = 's_%d' % n
    pipeline.create_stream(name, x='integer', y='text')

  # Now create some CVs with data, some with indices
  for n in range(32):
    name = 'cv_%d' % n
    pipeline.create_stream('stream_%d' % n, x='int', y='text', z='text')
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z' % n)
    if n >= 16:
      pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

  # Create some STJs
  for n in range(8):
    pipeline.create_cv(
        'stj_%d' % n,
        'SELECT t.x, count(*) FROM stream_%d s JOIN t_%d t ON s.x = t.x GROUP BY t.x' % (n, n))

  # Create some SW CVs
  for n in range(8):
    pipeline.create_cv('sw_%d' % n, 'SELECT count(*) FROM stream_%d' % n,
                       sw='%d days' % (n + 1), step_factor=n + 1)

  # Create some CVs/CTs/streams that we'll rename
  for n in range(4):
    pipeline.create_stream('to_rename_s_%d' % n, x='int')
    pipeline.create_cv(
        'to_rename_cv_%d' % n,
        'SELECT x, count(*) FROM to_rename_s_%d GROUP BY x' % n)
    pipeline.create_ct('to_rename_ct_%d' % n, 'SELECT x FROM to_rename_s_%d' % n)
    pipeline.create_cv(
        'to_rename_ct_reader_%d' % n,
        "SELECT count(*) FROM output_of('to_rename_ct_%d')" % n)
    rows = [(x,) for x in range(1000)]
    pipeline.insert('to_rename_s_%d' % n, ('x',), rows)

  # Now rename them
  for n in range(4):
    pipeline.execute(
        'ALTER FOREIGN TABLE to_rename_s_%d RENAME TO renamed_s_%d' % (n, n))
    pipeline.execute('ALTER VIEW to_rename_cv_%d RENAME TO renamed_cv_%d' % (n, n))
    pipeline.execute('ALTER VIEW to_rename_ct_%d RENAME TO renamed_ct_%d' % (n, n))
    pipeline.execute(
        'ALTER VIEW to_rename_ct_reader_%d RENAME TO renamed_ct_reader_%d' % (n, n))

    # And write some data using the new stream names
    rows = [(x,) for x in range(1000)]
    pipeline.insert('renamed_s_%d' % n, ('x',), rows)

  # Create a CV chain that combines output streams
  q = """
  SELECT (new).z, combine((delta).count) AS count,
  combine((delta).distinct_count) AS distinct_count
  FROM output_of('cv_0') GROUP BY (new).z
  """
  pipeline.create_cv('combine_cv_0', q)

  q = """
  SELECT combine((delta).count) AS count,
  combine((delta).distinct_count) AS distinct_count
  FROM output_of('combine_cv_0')
  """
  pipeline.create_cv('combine_cv_1', q)

  for n in range(32):
    name = 'cv_%d' % n
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

  # Create a CV with a TTL to verify TTL info is restored properly
  pipeline.create_cv(
      'ttlcv',
      'SELECT second(arrival_timestamp), count(*) FROM stream_0 GROUP BY second',
      ttl='1 hour', ttl_column='second')

  # Now create some in another namespace
  pipeline.execute('CREATE SCHEMA namespace')
  for n in range(8):
    name = 'namespace.cv_%d' % n
    pipeline.create_stream('namespace.stream_%d' % n, x='int', y='text', z='text')
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 4:
      pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' % (n, name))

  create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
  pipeline.execute(create_fn)

  pipeline.create_stream('stream0', z='text')

  # Create some transforms with trigger functions
  for n in range(8):
    name = 'ct_%d' % n
    pipeline.create_ct(name, 'SELECT z::text FROM stream0', 'tg_fn')

  # Create some transforms without trigger functions
  for n in range(8):
    name = 'ct_no_trig_%d' % n
    pipeline.create_ct(name, 'SELECT z::text FROM stream0')

  time.sleep(10)

  old_bin_dir = new_bin_dir = pipeline.bin_dir
  old_data_dir = pipeline.data_dir
  new_data_dir0 = os.path.abspath('test_binary_upgrade_data_dir0')
  if os.path.exists(new_data_dir0):
    shutil.rmtree(new_data_dir0)

  pipeline.stop()

  p = subprocess.Popen(
      [os.path.join(pipeline.bin_dir, 'initdb'), '-D', new_data_dir0])
  stdout, stderr = p.communicate()

  with open(os.path.join(new_data_dir0, 'postgresql.conf'), 'a') as f:
    f.write('shared_preload_libraries=pipelinedb\n')
    f.write('max_worker_processes=128\n')
    f.write('pipelinedb.stream_insert_level=sync_commit\n')

  result = subprocess.check_call([
      os.path.join(pipeline.bin_dir, 'pg_upgrade'),
      '-b', old_bin_dir, '-B', new_bin_dir,
      '-d', old_data_dir, '-D', new_data_dir0
  ])
  assert result == 0

  # The cleanup path expects this to be running, but we're done with it
  pipeline.run()

  # pg_upgrade returned successfully and has already done sanity checks
  # but let's manually verify that all objects were migrated to the new data directory
  upgraded = PipelineDB(data_dir=new_data_dir0)
  upgraded.run()

  # Tables
  for n in range(16):
    name = 't_%d' % n
    q = 'SELECT x, y, z FROM %s ORDER BY x' % name
    rows = upgraded.execute(q)
    for i, row in enumerate(rows):
      assert row['x'] == i
      assert row['y'] == name
      assert row['z'] == name

  # Streams
  for n in range(8):
    name = 's_%d' % n
    rows = list(
        upgraded.execute(
            "SELECT oid FROM pg_class WHERE relkind = 'f' AND relname = '%s'" % name))
    assert len(rows) == 1

  # CVs
  for n in range(32):
    name = 'cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # CV with TTL
  row = list(
      upgraded.execute(
          "SELECT ttl, ttl_attno FROM pg_class c JOIN pipelinedb.cont_query pq on c.oid = pq.relid WHERE c.relname = 'ttlcv'"
      ))[0]
  assert row[0] == 3600
  assert row[1] == 1

  # CVs in separate schema
  for n in range(8):
    name = 'namespace.cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # Transforms with trigger functions
  for n in range(8):
    name = 'ct_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  # Transforms without trigger functions
  for n in range(8):
    name = 'ct_no_trig_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  # Verify SW CVs
  for n in range(8):
    name = 'sw_%d' % n
    row = upgraded.execute(
        "SELECT ttl, step_factor FROM pipelinedb.cont_query cq JOIN pg_class c ON cq.relid = c.oid WHERE relname = '%s'" % name)[0]
    assert row['ttl'] == (n + 1) * 3600 * 24
    assert row['step_factor'] == n + 1

    row = upgraded.execute('SELECT count FROM %s' % name)[0]
    assert row['count'] == 1000

  # Verify renamed CVs/CTs/streams
  for n in range(4):
    row = upgraded.execute('SELECT combine(count) FROM renamed_cv_%d' % n)[0]
    assert row['combine'] == 2000
    row = upgraded.execute(
        'SELECT combine(count) FROM renamed_ct_reader_%d' % n)[0]
    assert row['combine'] == 2000

  # Verify chained CVs
  row = upgraded.execute(
      'SELECT z, count, distinct_count FROM combine_cv_0')[0]
  assert row['z'] == 'cv_0'
  assert row['count'] == 1000
  assert row['distinct_count'] == 1

  row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
  assert row['count'] == 1000
  assert row['distinct_count'] == 1

  # Now insert some new data and verify CVs are still updating properly
  for n in range(32):
    name = 'cv_%d' % n
    rows = [(x, name, name) for x in range(1000)]
    upgraded.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

  for n in range(32):
    name = 'cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 2000

  row = upgraded.execute(
      'SELECT z, count, distinct_count FROM combine_cv_0')[0]
  assert row['z'] == 'cv_0'
  assert row['count'] == 2000
  assert row['distinct_count'] == 1

  row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
  assert row['count'] == 2000
  assert row['distinct_count'] == 1

  # Verify STJs
  for n in range(8):
    cv = 'stj_%d' % n
    row = upgraded.execute('SELECT sum(count) FROM %s' % cv)[0]
    assert row['sum'] == 2000

  # Rename objects again before the second upgrade
  for n in range(4):
    upgraded.execute(
        'ALTER FOREIGN TABLE renamed_s_%d RENAME TO renamed_again_s_%d' % (n, n))
    upgraded.execute(
        'ALTER VIEW renamed_cv_%d RENAME TO renamed_again_cv_%d' % (n, n))
    upgraded.execute(
        'ALTER VIEW renamed_ct_%d RENAME TO renamed_again_ct_%d' % (n, n))
    upgraded.execute(
        'ALTER VIEW renamed_ct_reader_%d RENAME TO renamed_again_ct_reader_%d' % (n, n))

    # And write some data using the new stream names
    rows = [(x,) for x in range(1000)]
    upgraded.insert('renamed_again_s_%d' % n, ('x',), rows)

  upgraded.stop()

  new_data_dir1 = os.path.abspath('test_binary_upgrade_data_dir1')
  if os.path.exists(new_data_dir1):
    shutil.rmtree(new_data_dir1)

  p = subprocess.Popen(
      [os.path.join(pipeline.bin_dir, 'initdb'), '-D', new_data_dir1])
  stdout, stderr = p.communicate()

  with open(os.path.join(new_data_dir1, 'postgresql.conf'), 'a') as f:
    f.write('shared_preload_libraries=pipelinedb\n')
    f.write('max_worker_processes=128\n')
    f.write('pipelinedb.stream_insert_level=sync_commit\n')

  # Now upgrade the upgraded DB to verify that restored DBs can be updated properly
  result = subprocess.check_call([
      os.path.join(pipeline.bin_dir, 'pg_upgrade'),
      '-b', old_bin_dir, '-B', new_bin_dir,
      '-d', new_data_dir0, '-D', new_data_dir1
  ])
  assert result == 0

  # but let's manually verify that all objects were migrated to the new data directory
  upgraded = PipelineDB(data_dir=new_data_dir1)
  upgraded.run()

  # Tables
  for n in range(16):
    name = 't_%d' % n
    q = 'SELECT x, y, z FROM %s ORDER BY x' % name
    rows = upgraded.execute(q)
    for i, row in enumerate(rows):
      assert row['x'] == i
      assert row['y'] == name
      assert row['z'] == name

  # Streams
  for n in range(8):
    name = 's_%d' % n
    rows = list(
        upgraded.execute(
            "SELECT oid FROM pg_class WHERE relkind = 'f' AND relname = '%s'" % name))
    assert len(rows) == 1

  # CVs
  for n in range(32):
    name = 'cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 2000

  # CV with TTL
  row = list(
      upgraded.execute(
          "SELECT ttl, ttl_attno FROM pg_class c JOIN pipelinedb.cont_query pq on c.oid = pq.relid WHERE c.relname = 'ttlcv'"
      ))[0]
  assert row[0] == 3600
  assert row[1] == 1

  # CVs in separate schema
  for n in range(8):
    name = 'namespace.cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # Transforms with trigger functions
  for n in range(8):
    name = 'ct_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  # Transforms without trigger functions
  for n in range(8):
    name = 'ct_no_trig_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  # Verify SW CVs
  for n in range(8):
    name = 'sw_%d' % n
    step_factor = n + 1
    row = upgraded.execute(
        "SELECT ttl, step_factor FROM pipelinedb.cont_query cq JOIN pg_class c ON cq.relid = c.oid WHERE relname = '%s'" % name)[0]
    assert row['ttl'] == (n + 1) * 3600 * 24
    assert row['step_factor'] == n + 1

    row = upgraded.execute('SELECT count FROM %s' % name)[0]
    assert row['count'] == 2000

  # Verify renamed CVs/CTs/streams
  for n in range(4):
    row = upgraded.execute(
        'SELECT combine(count) FROM renamed_again_cv_%d' % n)[0]
    assert row['combine'] == 3000
    row = upgraded.execute(
        'SELECT combine(count) FROM renamed_again_ct_reader_%d' % n)[0]
    assert row['combine'] == 3000

  # Verify chained CV
  row = upgraded.execute(
      'SELECT z, count, distinct_count FROM combine_cv_0')[0]
  assert row['z'] == 'cv_0'
  assert row['count'] == 2000
  assert row['distinct_count'] == 1

  row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
  assert row['count'] == 2000
  assert row['distinct_count'] == 1

  # Now insert some new data and verify CVs are still updating properly
  for n in range(32):
    name = 'cv_%d' % n
    rows = [(x, name, name) for x in range(1000)]
    upgraded.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

  for n in range(32):
    name = 'cv_%d' % n
    rows = list(
        upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 3000

  row = upgraded.execute(
      'SELECT z, count, distinct_count FROM combine_cv_0')[0]
  assert row['z'] == 'cv_0'
  assert row['count'] == 3000
  assert row['distinct_count'] == 1

  row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
  assert row['count'] == 3000
  assert row['distinct_count'] == 1

  # Verify STJs
  for n in range(8):
    cv = 'stj_%d' % n
    row = upgraded.execute('SELECT sum(count) FROM %s' % cv)[0]
    assert row['sum'] == 3000

  upgraded.stop()

  pipeline.execute('DROP VIEW combine_cv_0 CASCADE')

  shutil.rmtree(new_data_dir0)
  shutil.rmtree(new_data_dir1)
def test_binary_upgrade(pipeline, clean_db):
  """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
  # Create some regular tables with data, and create an index on half of them
  for n in range(16):
    name = 't_%d' % n
    pipeline.create_table(name, x='integer', y='text', z='text')
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert(name, ('x', 'y', 'z'), rows)
    if n >= 8:
      pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

  # Create some streams
  for n in range(8):
    name = 's_%d' % n
    pipeline.create_stream(name, x='integer', y='text')

  # Now create some CVs with data, some with indices
  for n in range(32):
    name = 'cv_%d' % n
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 16:
      pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

  # Now create some in another namespace
  pipeline.execute('CREATE SCHEMA namespace')
  for n in range(8):
    name = 'namespace.cv_%d' % n
    pipeline.create_cv(
        name,
        'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 4:
      pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' % (n, name))

  create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
  pipeline.execute(create_fn)

  # Create some transforms
  for n in range(8):
    name = 'ct_%d' % n
    pipeline.create_ct(name, 'SELECT z::text FROM stream', 'tg_fn()')

  time.sleep(10)

  old_bin_dir = new_bin_dir = pipeline.bin_dir
  old_data_dir = pipeline.data_dir
  new_data_dir = os.path.abspath('test_binary_upgrade_data_dir')

  pipeline.stop()

  p = subprocess.Popen([
      os.path.join(pipeline.bin_dir, 'pipeline-init'), '-D', new_data_dir])
  stdout, stderr = p.communicate()

  result = subprocess.check_call([
      os.path.join(pipeline.bin_dir, 'pipeline-upgrade'),
      '-b', old_bin_dir, '-B', new_bin_dir,
      '-d', old_data_dir, '-D', new_data_dir])
  assert result == 0

  # The cleanup path expects this to be running, but we're done with it
  pipeline.run()

  # pipeline-upgrade returned successfully and has already done sanity checks
  # but let's manually verify that all objects were migrated to the new data directory
  upgraded = PipelineDB(data_dir=new_data_dir)
  upgraded.run()

  # Tables
  for n in range(16):
    name = 't_%d' % n
    q = 'SELECT x, y, z FROM %s ORDER BY x' % name
    rows = upgraded.execute(q)
    for i, row in enumerate(rows):
      x, y, z = row
      assert x == i
      assert y == name
      assert z == name

  # Streams
  for n in range(8):
    name = 's_%d' % n
    rows = list(upgraded.execute(
        "SELECT oid FROM pg_class WHERE relkind = '$' AND relname = '%s'" % name))
    assert len(rows) == 1

  # CVs
  for n in range(32):
    name = 'cv_%d' % n
    rows = list(upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # CVs in separate schema
  for n in range(8):
    name = 'namespace.cv_%d' % n
    rows = list(upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1
    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # Transforms
  for n in range(8):
    name = 'ct_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipeline_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  upgraded.stop()
  shutil.rmtree(new_data_dir)