import getpass
import os
import random
import subprocess
import threading
import time

import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Shared helpers such as _dump, _restore, _generate_csv, _generate_rows,
# _insert, _join, kill_worker, kill_combiner and get_geometric_dist, as well
# as the `pipeline` and `clean_db` fixtures, are assumed to be provided by the
# test suite's common modules.


def test_null_groups(pipeline, clean_db):
  """
  Verify that null group columns are considered equal
  """
  pipeline.create_stream('s', x='int', y='int', z='int')
  q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
  GROUP BY x, y, z;
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_null_groups', q)
  pipeline.create_table('test_null_groups_t', x='integer', y='integer',
                        z='integer')

  rows = []
  for n in range(10000):
    vals = list(random.randint(0, 10) for n in range(3))
    # Null out each column with 10% probability; a conditional expression is
    # used so that legitimate zero values are preserved.
    vals = map(lambda n: n if random.random() > 0.1 else None, vals)
    rows.append(tuple(vals))

  pipeline.insert('s', desc, rows)
  pipeline.insert('test_null_groups_t', desc, rows)

  table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
  expected = pipeline.execute(table_q)
  result = pipeline.execute(
    'SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z')

  for r, e in zip(result, expected):
    assert r == e

def test_combine_in_view(pipeline, clean_db):
  """
  Verify that combines in views on top of continuous views work
  """
  q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
  desc = ('x', 'y')

  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_combine_view', q)
  pipeline.execute(
    'CREATE VIEW v AS SELECT combine(avg) FROM test_combine_view')

  rows = []
  for n in range(10000):
    rows.append((random.randint(1, 256), random.randint(1, 1024)))

  pipeline.insert('stream0', desc, rows)

  view = pipeline.execute('SELECT * FROM v')
  assert len(view) == 1

  expected = sum(r[1] for r in rows) / float(len(rows))
  assert abs(float(view[0][0]) - expected) < 0.00001

  pipeline.execute('DROP VIEW v')

def test_filter_clause(pipeline, clean_db):
  """
  Verify that FILTER clauses work on aggregates and sliding window aggregates
  """
  pipeline.create_stream("test_filter_stream", x="int")
  q = """
  SELECT SUM(x::int) FILTER (WHERE mod(x, 2) = 0) AS sum2,
         SUM(x::int) FILTER (WHERE mod(x, 3) = 0) AS sum3
  FROM test_filter_stream
  """
  sw = """
  WHERE arrival_timestamp > clock_timestamp() - interval '30 second'
  """
  pipeline.create_cv("test_filter", q)
  pipeline.create_cv("test_filter_sw", "%s %s" % (q, sw))

  desc = ("x",)
  rows = []
  for n in range(1000):
    rows.append((n,))

  pipeline.insert("test_filter_stream", desc, rows)

  sum2 = sum(filter(lambda x: x % 2 == 0, map(lambda x: x[0], rows)))
  sum3 = sum(filter(lambda x: x % 3 == 0, map(lambda x: x[0], rows)))

  result1 = pipeline.execute("SELECT * FROM test_filter").first()
  result2 = pipeline.execute("SELECT * FROM test_filter_sw").first()

  assert result1["sum2"] == result2["sum2"] == sum2
  assert result1["sum3"] == result2["sum3"] == sum3

def test_single_continuous_view(pipeline, clean_db):
  """
  Verify that specific continuous views can be dropped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('test_single0', 'SELECT COUNT(*) FROM stream0')
  pipeline.create_cv('test_single1', 'SELECT COUNT(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  result = pipeline.execute('SELECT count FROM test_single1').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_single.sql',
        tables=['test_single0', 'stream0', 'test_single0_mrel'])

  pipeline.drop_all()
  _restore(pipeline, 'test_single.sql')

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  # We didn't dump this one
  result = list(
    pipeline.execute(
      'SELECT * FROM pg_class WHERE relname LIKE \'%%test_single1%%\''))
  assert not result

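# _dump and _restore are not defined in this file; they come from the suite's
# shared helpers. The sketch below only illustrates their assumed behavior
# (shelling out to pg_dump/psql against the test server's port); the suite's
# actual implementation may differ and should be preferred when available.
def _dump(pipeline, path, tables=None):
  """Hypothetical: dump the test database (optionally only `tables`) to `path`."""
  cmd = ['pg_dump', '-h', 'localhost', '-p', str(pipeline.port), '-f', path]
  for table in tables or []:
    cmd.extend(['-t', table])
  cmd.append('postgres')  # assumed name of the test database
  subprocess.check_call(cmd)


def _restore(pipeline, path):
  """Hypothetical: replay a SQL dump file against the test database."""
  subprocess.check_call(
    ['psql', '-h', 'localhost', '-p', str(pipeline.port), '-f', path, 'postgres'])
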
def test_online_add_column(pipeline, clean_db):
  """
  Verify that we can add columns to a stream while not affecting running CQs
  """
  pipeline.create_stream("stream0", c0="integer")
  pipeline.create_cv("cv0", "SELECT c0 FROM stream0")
  pipeline.insert("stream0", ("c0",), [(n,) for n in range(0, 1000)])

  result = list(pipeline.execute("SELECT * FROM cv0"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c1 integer")
  pipeline.create_cv("cv1", "SELECT c0, c1 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1"), [(n, n) for n in range(1000, 2000)])

  result = list(pipeline.execute("SELECT * FROM cv1 WHERE c1 >= 1000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c2 integer")
  pipeline.create_cv("cv2", "SELECT c0, c1, c2 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2"),
                  [(n, n, n) for n in range(2000, 3000)])

  result = list(pipeline.execute("SELECT * FROM cv2 WHERE c2 >= 2000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c3 integer")
  pipeline.create_cv("cv3", "SELECT c0, c1, c2, c3 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2", "c3"),
                  [(n, n, n, n) for n in range(3000, 4000)])

  result = list(pipeline.execute("SELECT * FROM cv3 WHERE c3 >= 3000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c4 integer")
  pipeline.create_cv("cv4", "SELECT c0, c1, c2, c3, c4 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2", "c3", "c4"),
                  [(n, n, n, n, n) for n in range(4000, 5000)])

  result = list(pipeline.execute("SELECT * FROM cv4 WHERE c4 >= 4000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

def test_prepared_extended(pipeline, clean_db):
  """
  Verify that we can write to streams using the extended protocol. This test
  shells out to a binary because psycopg2 doesn't use the extended protocol.
  """
  pipeline.create_stream('extended_stream', x='int', y='int', z='int')
  q = """
  SELECT COUNT(x::integer) AS x, COUNT(y::integer) AS y, COUNT(z::integer) AS z
  FROM extended_stream
  """
  pipeline.create_cv('test_prepared_extended', q)

  # This will insert 1000 rows via a parameterized insert, and 1000 via an
  # unparameterized insert
  cmd = ['./extended', 'pipeline', str(pipeline.port), 'extended_stream', '1000']

  stdout, stderr = subprocess.Popen(cmd).communicate()
  assert stdout is None
  assert stderr is None

  rows = list(pipeline.execute('SELECT x, y, z FROM test_prepared_extended'))
  assert len(rows) == 1

  result = rows[0]
  assert result['x'] == 2000
  assert result['y'] == 2000
  assert result['z'] == 2000

def test_columns_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the file's
  input columns are a subset of stream0's columns
  """
  pipeline.create_stream("stream0", x="int", y="float8", z="numeric", m="int")
  q = "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0"
  pipeline.create_cv("test_copy_subset", q)
  pipeline.create_table("test_copy_subset_t", x="integer", y="float8", z="numeric")

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, "test_copy.csv"))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=("x", "y", "z"))

  pipeline.execute("COPY test_copy_subset_t (x, y, z) FROM '%s' HEADER CSV" % path)
  pipeline.execute("COPY stream0 (x, y, z) FROM '%s' HEADER CSV" % path)

  expected = pipeline.execute(
    "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t"
  ).first()
  result = list(pipeline.execute("SELECT s0, s1, avg FROM test_copy_subset"))

  assert len(result) == 1
  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]

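# _generate_csv is another helper assumed to come from the suite's shared
# modules. A minimal sketch of its assumed behavior: write a header line
# (the COPY statements above use the HEADER option) followed by the rows.
def _generate_csv(path, rows, desc=()):
  """Hypothetical: write `rows` to `path` as CSV, preceded by a `desc` header."""
  with open(path, 'w') as f:
    f.write(','.join(desc) + '\n')
    for row in rows:
      f.write(','.join(str(v) for v in row) + '\n')
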
def test_sliding_windows(pipeline, clean_db):
  """
  Verify that sliding window queries are properly dumped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.execute(
    'CREATE CONTINUOUS VIEW sw_v WITH (sw = \'20 seconds\') AS SELECT count(*) FROM stream0'
  )
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_sw.sql')

  pipeline.drop_all()
  _restore(pipeline, 'test_sw.sql')

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 10

  # We should still drop back to 0 within 20 seconds
  result = pipeline.execute('SELECT count FROM sw_v').first()
  while result['count'] > 0:
    time.sleep(1)
    result = pipeline.execute('SELECT count FROM sw_v').first()

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 0

def test_freq_agg(pipeline, clean_db):
  """
  Test freq_agg, freq_merge_agg
  """
  pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

  q = """
  SELECT k::integer, freq_agg(x::int) AS c FROM test_cmsketch_stream
  GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_cmsketch_agg', q)

  rows = []
  for n in range(1000):
    rows.append((0, n % 20))
    rows.append((1, n % 50))

  pipeline.insert('test_cmsketch_stream', desc, rows)

  result = pipeline.execute(
    'SELECT freq(c, 10) AS x, freq(c, 40) AS y, freq(c, 60) FROM test_cmsketch_agg ORDER BY k'
  )
  assert len(result) == 2
  assert (result[0][0], result[0][1], result[0][2]) == (50, 0, 0)
  assert (result[1][0], result[1][1], result[1][2]) == (20, 20, 0)

  result = pipeline.execute(
    'SELECT freq(combine(c), 10) AS x, freq(combine(c), 40) AS y, freq(combine(c), 60) FROM test_cmsketch_agg'
  )
  assert len(result) == 1
  assert (result[0][0], result[0][1], result[0][2]) == (70, 20, 0)

def test_simple_crash(pipeline, clean_db):
  """
  Test simple worker and combiner crashes.
  """
  pipeline.create_stream('stream0', x='int')
  q = 'SELECT COUNT(*) FROM stream0'
  pipeline.create_cv('test_simple_crash', q)

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 2

  # This batch can potentially get lost.
  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  assert kill_worker()

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] in [4, 6]

  # This batch can potentially get lost.
  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  assert kill_combiner()

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] in [6, 8, 10]

  # To ensure that all remaining events in ZMQ queues have been consumed
  time.sleep(2)

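# kill_worker and kill_combiner also live in the suite's shared helpers. The
# assumed behavior is to SIGKILL one of the background continuous-query
# processes and report whether anything was killed. A rough sketch, assuming
# the worker/combiner process titles contain those words (hypothetical):
def _kill(process_pattern):
  """Hypothetical: SIGKILL processes matching `process_pattern`, True if any matched."""
  return subprocess.call(['pkill', '-9', '-f', process_pattern]) == 0


def kill_worker():
  return _kill('worker')


def kill_combiner():
  return _kill('combiner')
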
def test_restart_recovery(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  q = 'SELECT COUNT(*) FROM stream0'
  pipeline.create_cv('test_restart_recovery', q)

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 2

  # Need to sleep here, otherwise on restart the materialization table is
  # empty. Not sure why.
  time.sleep(0.1)

  # Restart.
  pipeline.stop()
  pipeline.run()

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 2

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 4

def test_simple_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on simple aggs
  """
  q = """
  SELECT x::integer % 10 AS k, avg(x), sum(y::float8), count(*) FROM stream0
  GROUP BY k;
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_simple_aggs', q)
  pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_simple_aggs_t', desc, rows)

  table_result = pipeline.execute(
    'SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t')
  cv_result = pipeline.execute(
    'SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs')

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.00001
    assert abs(tr[1] - cr[1]) < 0.00001
    assert abs(tr[2] - cr[2]) < 0.00001

def test_fss_agg(pipeline, clean_db):
  pipeline.create_stream('test_fss_stream', x='int', k='text')
  q = """
  SELECT k::text, fss_agg(x::int, 5) FROM test_fss_stream
  GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_fss_agg', q)

  items = range(14)
  random.shuffle(items)
  a_items = items
  b_items = list(reversed(items))

  values = map(lambda i: ('a', i), get_geometric_dist(a_items))
  values.extend(map(lambda i: ('b', i), get_geometric_dist(b_items)))
  random.shuffle(values)

  pipeline.insert('test_fss_stream', desc, values)

  result = list(
    pipeline.execute(
      'SELECT k, fss_topk_values(fss_agg) FROM test_fss_agg ORDER BY k'))

  topk = map(int, result[0][1].rstrip('}').lstrip('{').split(','))
  assert sorted(topk) == sorted(a_items[-5:])

  topk = map(int, result[1][1].rstrip('}').lstrip('{').split(','))
  assert sorted(topk) == sorted(b_items[-5:])

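# get_geometric_dist is assumed to expand a list of items into a geometric
# frequency distribution in which later items appear more often; that is what
# makes the last five items of each input the expected top-k above. A sketch
# of that assumed behavior (the exact growth factor is a guess):
def get_geometric_dist(items):
  """Hypothetical: the i-th item of `items` appears 2**i times in the output."""
  values = []
  for i, item in enumerate(items):
    values.extend([item] * (2 ** i))
  return values
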
def assert_result_changes(func, args):
  """
  Verifies that the result of the given function changes with time
  """
  pipeline.create_stream('stream', x='int', y='text', z='int')
  name = 'assert_%s_decreases' % func
  pipeline.create_cv(name, "SELECT %s(%s) FROM stream WHERE arrival_timestamp > clock_timestamp() - interval '2 seconds'" % (func, args))

  rows = [(n, str(n), n + 1) for n in range(1000)]
  pipeline.insert('stream', ('x', 'y', 'z'), rows)

  current = 1
  results = []
  while current:
    row = pipeline.execute('SELECT * FROM %s' % name).first()
    current = row[func]
    if current is None:
      break
    results.append(current)

  # Verify that we actually read something
  assert results

  pipeline.drop_cv(name)

def test_cmsketch_agg(pipeline, clean_db):
  """
  Test cmsketch_agg, cmsketch_merge_agg, cmsketch_cdf, cmsketch_quantile
  """
  pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

  q = """
  SELECT k::integer, cmsketch_agg(x::int) AS c FROM test_cmsketch_stream
  GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_cmsketch_agg', q)

  rows = []
  for n in range(1000):
    rows.append((0, n % 20))
    rows.append((1, n % 50))

  pipeline.insert('test_cmsketch_stream', desc, rows)

  result = list(pipeline.execute(
    'SELECT cmsketch_frequency(c, 10) AS x, cmsketch_frequency(c, 40) AS y, '
    'cmsketch_frequency(c, 60) FROM test_cmsketch_agg ORDER BY k').fetchall())
  assert len(result) == 2
  assert tuple(result[0]) == (50, 0, 0)
  assert tuple(result[1]) == (20, 20, 0)

  result = list(pipeline.execute(
    'SELECT cmsketch_frequency(combine(c), 10) AS x, '
    'cmsketch_frequency(combine(c), 40) AS y, cmsketch_frequency(combine(c), 60) '
    'FROM test_cmsketch_agg').fetchall())
  assert len(result) == 1
  assert tuple(result[0]) == (70, 20, 0)

def test_concurrent_vacuum_full(pipeline, clean_db):
  pipeline.create_stream('test_vacuum_stream', x='int')
  pipeline.create_cv(
    'test_vacuum_full',
    'SELECT x::int, COUNT(*) FROM test_vacuum_stream GROUP BY x')
  stop = False

  def insert():
    while not stop:
      values = [(random.randint(0, 1000000),) for _ in xrange(1000)]
      pipeline.insert('test_vacuum_stream', ('x',), values)
      time.sleep(0.01)

  threads = [threading.Thread(target=insert) for _ in range(4)]
  map(lambda t: t.start(), threads)

  # Insert data for a little bit so we have enough work to do while
  # vacuuming.
  time.sleep(20)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('VACUUM FULL test_vacuum_full')
  conn.close()

  # Now kill the insert threads.
  stop = True
  map(lambda t: t.join(), threads)

def test_stream_stats(pipeline, clean_db):
  """
  Verify that stream-level statistics collection works
  """
  # create a few streams
  for n in range(8):
    sname = 's%d' % n
    pipeline.create_stream(sname, x='int')
    cvname = 'cv%d' % n
    pipeline.create_cv(cvname, 'SELECT count(*) FROM %s' % sname)

  for n in range(8):
    sname = 's%d' % n
    x = n + 1
    values = [(v,) for v in range(1000 * x)]
    pipeline.insert(sname, ('x',), values)

  time.sleep(2)

  for n in range(8):
    sname = 's%d' % n
    row = pipeline.execute(
      "SELECT stream, input_rows, input_batches, input_bytes FROM pipelinedb.stream_stats WHERE stream = '%s'" % sname)[0]
    x = n + 1
    assert row['input_rows'] == 1000 * x

def test_bloom_intersection(pipeline, clean_db):
  """
  Verify that bloom_intersection works
  """
  pipeline.create_stream('test_bloom_stream', x='int', k='int')
  q = """
  SELECT k::int, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_bloom_intersection', q)

  rows = []
  for i in range(10000):
    rows.append((0, 2 * i))
    rows.append((1, i))

  pipeline.insert('test_bloom_stream', desc, rows)

  cvq = """
  SELECT bloom_cardinality(bloom_intersection_agg(bloom_agg))
  FROM test_bloom_intersection
  """
  result = list(pipeline.execute(cvq))

  assert len(result) == 1
  result = result[0]

  assert result[0] == 5530

def test_join_with_where(pipeline, clean_db):
  """
  Verify that stream-table joins using a WHERE clause work properly
  """
  num_cols = 4
  q = """
  SELECT s.col0::integer FROM stream0 s, wt WHERE s.col0 = 1 AND wt.col0 = 1
  """
  wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('wt', **wt_cols)
  pipeline.create_table('wt_s', **wt_cols)

  wt = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'wt', wt, 0.1)
  _insert(pipeline, 'wt_s', s, 0.1)

  pipeline.create_stream('stream0', **wt_cols)
  pipeline.create_cv('test_join_where', q)
  _insert(pipeline, 'stream0', s)

  expected = pipeline.execute(
    'SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1')[0]
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_where')[0]

  assert result['count'] == expected['count']

def test_incremental_join(pipeline, clean_db):
  """
  Verify that join results increase appropriately as we incrementally
  add stream events to the input
  """
  num_cols = 4
  join_cols = [0, 1]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('inc', **t_cols)
  pipeline.create_stream('stream0', **t_cols)

  q = """
  SELECT s.col0::integer FROM inc JOIN stream0 s ON inc.col0 = s.col0
  AND inc.col1 = s.col1::integer
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'inc', t, 0.1)

  pipeline.create_cv('test_join', q)

  s = []
  for n in range(2):
    row = _generate_row(num_cols)
    _insert(pipeline, 'stream0', [row])
    s.append(row)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join')[0]

  assert result['count'] == len(expected)

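# The stream-table join tests rely on four more shared helpers that are not
# defined in this file: _generate_row/_generate_rows, _insert and _join.
# Minimal sketches of their assumed behavior follow (random integer rows, an
# insert wrapper that can pause afterwards, and a reference nested-loop
# equi-join used to compute expected result sizes); the suite's own versions
# may differ.
def _generate_row(num_cols):
  """Hypothetical: a single row of `num_cols` small random integers."""
  return tuple(random.randint(0, 10) for _ in range(num_cols))


def _generate_rows(num_cols, num_rows):
  """Hypothetical: `num_rows` rows of `num_cols` small random integers."""
  return [_generate_row(num_cols) for _ in range(num_rows)]


def _insert(pipeline, target, rows, sleep=0):
  """Hypothetical: insert `rows` into a table or stream, then optionally sleep."""
  desc = tuple('col%d' % c for c in range(len(rows[0])))
  pipeline.insert(target, desc, rows)
  if sleep:
    time.sleep(sleep)


def _join(left, right, cols):
  """Hypothetical reference join: rows from `left` x `right` matching on `cols`."""
  matched = []
  for l in left:
    for r in right:
      if all(l[c] == r[c] for c in cols):
        matched.append(l + r)
  return matched
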
def test_indexed(pipeline, clean_db):
  """
  Verify that stream-table joins involving indexed tables work
  """
  pipeline.create_stream('stream0', x='int', y='int')
  q = """
  SELECT stream0.x::integer, count(*) FROM stream0
  JOIN test_indexed_t t ON stream0.x = t.x GROUP BY stream0.x
  """
  pipeline.create_table('test_indexed_t', x='integer', y='integer')
  pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

  t = _generate_rows(2, 1000)
  s = _generate_rows(2, 1000)

  pipeline.insert('test_indexed_t', ('x', 'y'), t)
  time.sleep(0.1)

  pipeline.create_cv('test_indexed', q)
  pipeline.insert('stream0', ('x', 'y'), s)

  expected = _join(s, t, [0])
  result = pipeline.execute('SELECT sum(count) FROM test_indexed')[0]

  assert result['sum'] == len(expected)

def test_hll_agg_hashing(pipeline, clean_db):
  """
  Verify that hll_agg correctly hashes different input types
  """
  pipeline.create_stream('test_hll_stream', x='int', y='text', z='float8')
  q = """
  SELECT hll_agg(x::integer) AS i,
         hll_agg(y::text) AS t,
         hll_agg(z::float8) AS f
  FROM test_hll_stream
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_hll_hashing', q)

  rows = []
  for n in range(10000):
    rows.append((n, '%d' % n, float(n)))
    rows.append((n, '%05d' % n, float(n)))

  pipeline.insert('test_hll_stream', desc, rows)

  cvq = """
  SELECT hll_cardinality(i), hll_cardinality(t), hll_cardinality(f)
  FROM test_hll_hashing
  """
  result = list(pipeline.execute(cvq))

  assert len(result) == 1
  result = result[0]

  assert result[0] == 9976
  assert result[1] == 19951
  assert result[2] == 10062

def test_bloom_contains(pipeline, clean_db):
  """
  Verify that bloom_contains works
  """
  pipeline.create_stream("test_bloom_stream", x="int")
  q = """
  SELECT bloom_agg(x::integer) FROM test_bloom_stream
  """
  desc = ("x",)
  pipeline.create_cv("test_bloom_contains", q)

  rows = []
  for i in range(10000):
    rows.append((2 * i,))

  pipeline.insert("test_bloom_stream", desc, rows)

  cvq = """
  SELECT
    bloom_contains(bloom_agg, 0),
    bloom_contains(bloom_agg, 5000),
    bloom_contains(bloom_agg, 1),
    bloom_contains(bloom_agg, 5001)
  FROM test_bloom_contains
  """
  result = list(pipeline.execute(cvq))

  assert len(result) == 1
  result = result[0]

  assert result[0] == True
  assert result[1] == True
  assert result[2] == False
  assert result[3] == False

def test_sliding_windows(pipeline, clean_db):
  """
  Verify that sliding window queries are properly dumped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.execute(
    'CREATE VIEW sw_v WITH (sw = \'20 seconds\') AS SELECT count(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  assert result['count'] == 10

  _dump(pipeline, 'test_sw.sql')

  pipeline.drop_all()
  _restore(pipeline, 'test_sw.sql')

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  assert result['count'] == 10

  # We should still drop back to 0 within 20 seconds
  result = pipeline.execute('SELECT count FROM sw_v')[0]
  while result['count'] > 0:
    time.sleep(1)
    result = pipeline.execute('SELECT count FROM sw_v')[0]

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  # Disabled until #157 (currently combine doesn't return 0 on NULL input for
  # this aggregate)
  # assert result == 0
  assert result['count'] is None

def test_join_multiple_tables(pipeline, clean_db):
  """
  Verify that stream-table joins involving multiple tables work
  """
  num_cols = 8
  join_cols = [0]
  t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('t0', **t0_cols)
  pipeline.create_table('t1', **t1_cols)
  pipeline.create_stream('stream', **t0_cols)

  q = """
  SELECT s.col0::integer FROM t0
  JOIN t1 ON t0.col0 = t1.col0
  JOIN stream s ON t1.col0 = s.col0
  """
  t0 = _generate_rows(num_cols, 64)
  t1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 't1', t1, 0.1)
  _insert(pipeline, 't0', t0, 0.1)

  pipeline.create_cv('test_join_multi', q)
  _insert(pipeline, 'stream', s)

  expected = _join(t0, _join(s, t1, join_cols), join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

  assert result['count'] == len(expected)

def test_user_low_and_high_card(pipeline, clean_db):
  """
  Verify that HLLs with low and high cardinalities are correctly combined
  """
  pipeline.create_stream('test_hll_stream', x='int', k='integer')
  q = """
  SELECT k::integer, hll_agg(x::integer) FROM test_hll_stream GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_hll_agg', q)

  # Low cardinalities
  rows = []
  for n in range(1000):
    rows.append((0, random.choice((-1, -2))))
    rows.append((1, random.choice((-3, -4))))

  # High cardinalities
  for n in range(10000):
    rows.append((2, n))
    rows.append((3, n))

  pipeline.insert('test_hll_stream', desc, rows)

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg WHERE k in (0, 1)').first()
  assert result[0] == 4

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg WHERE k in (2, 3)').first()
  assert result[0] == 9976

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg').first()
  assert result[0] == 9983

def test_copy_to_typed_stream(pipeline, clean_db):
  """
  Verify that copying data from a file into a typed stream works.
  """
  pipeline.create_stream('stream', x='integer', y='float8', z='numeric')
  q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
  pipeline.create_cv('test_copy_to_typed_stream', q)
  pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8',
                        z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute(
    'COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute(
    'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

  assert len(result) == 1
  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]

def test_join_across_batches(pipeline, clean_db):
  """
  Verify that stream-table joins are properly built when they
  span across multiple input batches
  """
  num_cols = 4
  join_cols = [0]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('batch', **t_cols)
  pipeline.create_stream('stream0', **t_cols)

  q = """
  SELECT s.col0::integer FROM batch JOIN stream0 s ON batch.col0 = s.col0
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'batch', t, 0.1)

  s = _generate_rows(num_cols, 64)
  pipeline.create_cv('test_batched_join', q)
  _insert(pipeline, 'stream0', s)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join')[0]

  assert result['count'] == len(expected)

def test_bloom_agg_hashing(pipeline, clean_db):
  """
  Verify that bloom_agg correctly hashes different input types
  """
  pipeline.create_stream("test_bloom_stream", x="int", y="text", z="float8")
  q = """
  SELECT bloom_agg(x::integer) AS i,
         bloom_agg(y::text) AS t,
         bloom_agg(z::float8) AS f
  FROM test_bloom_stream
  """
  desc = ("x", "y", "z")
  pipeline.create_cv("test_bloom_hashing", q)

  rows = []
  for n in range(10000):
    rows.append((n, "%d" % n, float(n)))
    rows.append((n, "%05d" % n, float(n)))

  pipeline.insert("test_bloom_stream", desc, rows)

  cvq = """
  SELECT bloom_cardinality(i), bloom_cardinality(t), bloom_cardinality(f)
  FROM test_bloom_hashing
  """
  result = list(pipeline.execute(cvq))

  assert len(result) == 1
  result = result[0]

  assert result[0] == 8879
  assert result[1] == 15614
  assert result[2] == 8855

def test_regression(pipeline, clean_db):
  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))
  _generate_csv(
    path,
    [['2015-06-01 00:00:00', 'De', 'Adam_Babareka', '1', '37433']],
    desc=('day', 'project', 'title', 'count', 'size'))

  pipeline.create_stream('copy_regression_stream', count='int',
                         day='timestamp', project='text', title='text',
                         size='int')
  pipeline.create_cv('test_copy_regression',
                     'SELECT sum(count) FROM copy_regression_stream')

  pipeline.execute(
    "COPY copy_regression_stream (day, project, title, count, size) FROM '%s' CSV HEADER" % path)

def test_online_drop_column(pipeline, clean_db):
  pipeline.create_stream('stream1', c0='integer')

  # Dropping a column from a stream is not supported, so this must fail.
  failed = False
  try:
    pipeline.execute('ALTER STREAM stream1 DROP c0')
  except Exception:
    failed = True

  assert failed

def test_cq_stats(pipeline, clean_db):
  """
  Verify that CQ statistics collection works
  """
  num_combiners = int(pipeline.execute('SHOW continuous_query_num_combiners').first()['continuous_query_num_combiners'])
  num_workers = int(pipeline.execute('SHOW continuous_query_num_workers').first()['continuous_query_num_workers'])

  pipeline.create_stream('stream0', x='int')

  # 10 rows
  q = 'SELECT x::integer %% 10 AS g, COUNT(*) FROM stream0 GROUP BY g'
  pipeline.create_cv('test_10_groups', q)

  # 1 row
  q = 'SELECT COUNT(*) FROM stream0'
  pipeline.create_cv('test_1_group', q)

  values = [(random.randint(1, 1024),) for n in range(1000)]

  pipeline.insert('stream0', ('x',), values)
  pipeline.insert('stream0', ('x',), values)
  # Sleep a little so that the next time we insert, we force the stats collector.
  # Must be >= 1s since that's the force interval.
  time.sleep(1)
  pipeline.insert('stream0', ('x',), values)
  pipeline.insert('stream0', ('x',), values)

  # Sleep a little so the stats collector flushes all the stats.
  time.sleep(1)

  proc_result = list(pipeline.execute('SELECT * FROM pipeline_proc_stats'))
  cq_result = list(pipeline.execute('SELECT * FROM pipeline_query_stats'))

  proc_rows = len(proc_result)
  cq_rows = len(cq_result)

  # We are guaranteed to send data to all combiners but only at least 1 worker
  # since we randomly select which worker to send the data to.
  assert proc_rows >= num_combiners + 1
  assert proc_rows <= num_combiners + num_workers
  assert cq_rows == 4

  # We get 2000 in case the first two microbatches go to the same worker
  # and the second two go to a different one. In this case, both will flush
  # the first microbatch they see, so 1000 + 1000.
  result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'worker'").first()
  assert result['input_rows'] in [2000, 3000, 4000]

  result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'combiner'").first()
  assert result['output_rows'] == 10

  result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'worker'").first()
  assert result['input_rows'] in [2000, 3000, 4000]

  result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'combiner'").first()
  assert result['output_rows'] == 1

def test_join_ordering(pipeline, clean_db):
  """
  Verify that the correct plan is generated regardless of the ordering of
  streams and tables.
  """
  num_cols = 8
  join_cols = [0]
  ordering0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  ordering1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('ordering0', **ordering0_cols)
  pipeline.create_table('ordering1', **ordering1_cols)

  ordering0 = _generate_rows(num_cols, 64)
  ordering1 = _generate_rows(num_cols, 64)

  _insert(pipeline, 'ordering0', ordering0, 0.1)
  _insert(pipeline, 'ordering1', ordering1, 0.1)

  pipeline.create_stream('stream', **ordering0_cols)

  # stream, table, table
  q0 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM stream s
  JOIN ordering0 ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering0', q0)

  # table, stream, table
  q1 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM ordering0
  JOIN stream s ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering1', q1)

  # table, table, stream
  q2 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM ordering0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  JOIN stream s ON s.col0 = ordering0.col0
  """
  pipeline.create_cv('test_ordering2', q2)

  s = _generate_rows(num_cols, 64)
  _insert(pipeline, 'stream', s)

  expected = _join(ordering0, _join(ordering1, s, join_cols), join_cols)

  result0 = pipeline.execute('SELECT COUNT(*) FROM test_ordering0').first()
  result1 = pipeline.execute('SELECT COUNT(*) FROM test_ordering1').first()
  result2 = pipeline.execute('SELECT COUNT(*) FROM test_ordering2').first()

  assert result0['count'] == len(expected)
  assert result1['count'] == len(expected)
  assert result2['count'] == len(expected)

def test_multiple_databases(pipeline, clean_db):
  conn = psycopg2.connect('dbname=postgres user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('CREATE DATABASE tmp_pipeline')
  cur.close()

  q = 'SELECT x::int FROM dbstream'
  pipeline.create_stream('dbstream', x='int')
  pipeline.create_cv('test_multiple_databases', q)

  # Insert data in first database.
  pipeline.insert('dbstream', ['x'], map(lambda x: (x,), range(0, 10, 2)))
  result = pipeline.execute('SELECT * FROM test_multiple_databases')
  assert sorted(row['x'] for row in result) == range(0, 10, 2)

  # Create the same CV in the other database, make sure it's created, and
  # write different data to it.
  tmp_conn = psycopg2.connect('dbname=tmp_pipeline user=%s host=localhost port=%s' %
                              (getpass.getuser(), pipeline.port))
  cur = tmp_conn.cursor()
  cur.execute('CREATE EXTENSION pipelinedb')
  cur.execute('CREATE FOREIGN TABLE dbstream (x int) SERVER pipelinedb')
  cur.execute('CREATE VIEW test_multiple_databases AS %s' % q)
  tmp_conn.commit()

  cur.execute('INSERT INTO dbstream (x) VALUES %s' %
              ', '.join(map(lambda x: '(%d)' % x, range(1, 11, 2))))
  cur.execute('SELECT * FROM test_multiple_databases')
  tmp_conn.commit()
  assert sorted(row[0] for row in cur) == range(1, 11, 2)

  # Ensure that the data written to the other database isn't seen by the
  # first database.
  result = pipeline.execute('SELECT * FROM test_multiple_databases')
  assert sorted(row['x'] for row in result) == range(0, 10, 2)

  # Insert new data to both databases.
  pipeline.insert('dbstream', ['x'], map(lambda x: (x,), range(10, 20, 2)))
  cur.execute('INSERT INTO dbstream (x) VALUES %s' %
              ', '.join(map(lambda x: '(%d)' % x, range(11, 21, 2))))

  # Ensure both databases still saw the data written out to them.
  result = pipeline.execute('SELECT * FROM test_multiple_databases')
  assert sorted(row['x'] for row in result) == range(0, 20, 2)

  cur.execute('SELECT * FROM test_multiple_databases')
  tmp_conn.commit()
  assert sorted(row[0] for row in cur) == range(1, 21, 2)

  cur.close()
  tmp_conn.close()

  cur = conn.cursor()
  cur.execute('DROP DATABASE tmp_pipeline')
  cur.close()
  conn.close()
