def test_online_add_column(pipeline, clean_db):
  """
  Verify that we can add columns to a stream while not affecting running CQs
  """
  pipeline.create_stream("stream0", c0="integer")
  pipeline.create_cv("cv0", "SELECT c0 FROM stream0")
  pipeline.insert("stream0", ("c0",), [(n,) for n in range(0, 1000)])

  result = list(pipeline.execute("SELECT * FROM cv0"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c1 integer")
  pipeline.create_cv("cv1", "SELECT c0, c1 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1"), [(n, n) for n in range(1000, 2000)])

  result = list(pipeline.execute("SELECT * FROM cv1 WHERE c1 >= 1000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c2 integer")
  pipeline.create_cv("cv2", "SELECT c0, c1, c2 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2"),
                  [(n, n, n) for n in range(2000, 3000)])

  result = list(pipeline.execute("SELECT * FROM cv2 WHERE c2 >= 2000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c3 integer")
  pipeline.create_cv("cv3", "SELECT c0, c1, c2, c3 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2", "c3"),
                  [(n, n, n, n) for n in range(3000, 4000)])

  result = list(pipeline.execute("SELECT * FROM cv3 WHERE c3 >= 3000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None

  pipeline.execute("ALTER STREAM stream0 ADD c4 integer")
  pipeline.create_cv("cv4", "SELECT c0, c1, c2, c3, c4 FROM stream0")
  pipeline.insert("stream0", ("c0", "c1", "c2", "c3", "c4"),
                  [(n, n, n, n, n) for n in range(4000, 5000)])

  result = list(pipeline.execute("SELECT * FROM cv4 WHERE c4 >= 4000"))
  assert len(result) == 1000
  for row in result:
    for col in row:
      assert col is not None
def test_prepared_extended(pipeline, clean_db):
  """
  Verify that we can write to streams using the extended protocol. This
  test shells out to a binary because psycopg2 doesn't use the extended
  protocol.
  """
  q = """
  SELECT COUNT(x::integer) AS x, COUNT(y::integer) AS y, COUNT(z::integer) AS z
  FROM extended_stream
  """
  pipeline.create_cv('test_prepared_extended', q)

  # This will insert 1000 rows via a parameterized insert, and 1000 via an
  # unparameterized insert
  cmd = ['./extended', 'pipeline', str(pipeline.port), 'extended_stream', '1000']

  stdout, stderr = subprocess.Popen(cmd).communicate()
  assert stdout is None
  assert stderr is None

  rows = list(pipeline.execute('SELECT x, y, z FROM test_prepared_extended'))
  assert len(rows) == 1

  result = rows[0]
  assert result['x'] == 2000
  assert result['y'] == 2000
  assert result['z'] == 2000
def test_postmaster_worker_recovery(pipeline, clean_db):
  """
  Verify that the Postmaster only restarts crashed worker processes, and
  does not attempt to start them when the continuous query scheduler should.
  """
  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'worker\'').first()
  expected_workers = result['count']

  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'combiner\'').first()
  expected_combiners = result['count']

  q = 'SELECT COUNT(*) FROM stream'
  pipeline.create_cv('test_pm_recovery', q)
  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  def backend():
    try:
      # Just keep a long-running backend connection open
      client = pipeline.engine.connect()
      client.execute('SELECT pg_sleep(10000)')
    except:
      pass

  t = threading.Thread(target=backend)
  t.start()

  attempts = 0
  result = None
  backend_pid = 0

  while not result and attempts < 10:
    result = pipeline.execute("""SELECT pid, query FROM pg_stat_activity WHERE lower(query) LIKE '%%pg_sleep%%'""").first()
    time.sleep(1)
    attempts += 1

  assert result

  backend_pid = result['pid']
  os.kill(backend_pid, signal.SIGKILL)

  attempts = 0
  pipeline.conn = None

  while attempts < 15:
    try:
      pipeline.conn = pipeline.engine.connect()
      break
    except:
      time.sleep(1)
    attempts += 1

  assert pipeline.conn

  # Now verify that we have the correct number of CQ worker procs
  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'worker\'').first()
  assert result['count'] == expected_workers

  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'combiner\'').first()
  assert result['count'] == expected_combiners
def _test_agg(pipeline, agg, check_fn=None):
  name = agg[:agg.find('(')]
  q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
  cv_name = 'test_%s' % name
  table_name = 'test_%s_t' % name
  desc = ('ts', 'g', 'x', 'y', 'z')

  pipeline.create_cv(cv_name, q % (agg, 'stream'))
  pipeline.create_table(table_name, ts='timestamp', x='integer', y='integer',
                        z='integer', g='integer')

  rows = []
  for i, n in enumerate(range(1000)):
    ts = str(datetime.utcnow() + timedelta(seconds=i))
    row = ts, n % 10, random.randint(1, 256), random.randint(1, 256), random.randint(1, 256)
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert(table_name, desc, rows)

  if check_fn:
    return check_fn(pipeline)

  expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
  result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

  assert len(expected) == len(result)
  for e, r in zip(expected, result):
    assert e == r

  pipeline.drop_cv(cv_name)
  pipeline.drop_table(table_name)
def test_bloom_contains(pipeline, clean_db):
  """
  Verify that bloom_contains works
  """
  pipeline.create_stream("test_bloom_stream", x="int")

  q = """
  SELECT bloom_agg(x::integer) FROM test_bloom_stream
  """
  desc = "x"
  pipeline.create_cv("test_bloom_contains", q)

  rows = []
  for i in range(10000):
    rows.append((2 * i,))

  pipeline.insert("test_bloom_stream", desc, rows)

  cvq = """
  SELECT
    bloom_contains(bloom_agg, 0), bloom_contains(bloom_agg, 5000),
    bloom_contains(bloom_agg, 1), bloom_contains(bloom_agg, 5001)
  FROM test_bloom_contains
  """
  result = list(pipeline.execute(cvq))
  assert len(result) == 1

  result = result[0]
  assert result[0] == True
  assert result[1] == True
  assert result[2] == False
  assert result[3] == False
def test_concurrent_vacuum_full(pipeline, clean_db):
  pipeline.create_cv(
    'test_vacuum_full',
    'SELECT x::int, COUNT(*) FROM test_vacuum_stream GROUP BY x')

  stop = False

  def insert():
    while not stop:
      values = [(random.randint(0, 1000000), ) for _ in xrange(1000)]
      pipeline.insert('test_vacuum_stream', ('x', ), values)
      time.sleep(0.01)

  threads = [threading.Thread(target=insert) for _ in range(4)]
  map(lambda t: t.start(), threads)

  # Insert data for a little bit so we have enough work to do while
  # vacuuming.
  time.sleep(20)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('VACUUM FULL test_vacuum_full')
  conn.close()

  # Now kill the insert threads.
  stop = True
  map(lambda t: t.join(), threads)
def test_bloom_agg_hashing(pipeline, clean_db):
  """
  Verify that bloom_agg correctly hashes different input types
  """
  pipeline.create_stream("test_bloom_stream", x="int", y="text", z="float8")

  q = """
  SELECT bloom_agg(x::integer) AS i,
         bloom_agg(y::text) AS t,
         bloom_agg(z::float8) AS f
  FROM test_bloom_stream
  """
  desc = ("x", "y", "z")
  pipeline.create_cv("test_bloom_hashing", q)

  rows = []
  for n in range(10000):
    rows.append((n, "%d" % n, float(n)))
    rows.append((n, "%05d" % n, float(n)))

  pipeline.insert("test_bloom_stream", desc, rows)

  cvq = """
  SELECT bloom_cardinality(i), bloom_cardinality(t), bloom_cardinality(f)
  FROM test_bloom_hashing
  """
  result = list(pipeline.execute(cvq))
  assert len(result) == 1

  result = result[0]
  assert result[0] == 8879
  assert result[1] == 15614
  assert result[2] == 8855
def test_simple_crash(pipeline, clean_db):
  """
  Test simple worker and combiner crashes.
  """
  q = 'SELECT COUNT(*) FROM stream'
  pipeline.create_cv('test_simple_crash', q)

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 2

  # We can potentially lose one batch for a worker or combiner crash.
  # In our case each batch adds a count 2 and since we're adding 3 batches
  # we should either see an increment from the previous count of 4 or 6.
  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  assert kill_worker()

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 6

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  assert kill_combiner()

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 10
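# test_simple_crash above relies on kill_worker() and kill_combiner() helpers that
# are defined elsewhere in the test harness and not shown in this file. The sketch
# below is a hypothetical stand-in (_kill_cq_proc) for illustration only: it assumes
# pipeline_proc_stats exposes a 'pid' column, as the recovery test above suggests.
# The real helpers may locate and kill the processes differently.
import os
import signal
import time

def _kill_cq_proc(pipeline, proc_type):
  """Send SIGKILL to one continuous query process of the given type
  ('worker' or 'combiner') and report whether a process was found."""
  row = pipeline.execute(
    "SELECT pid FROM pipeline_proc_stats WHERE type = '%s' LIMIT 1" % proc_type).first()
  if not row:
    return False
  os.kill(row['pid'], signal.SIGKILL)
  # Give the scheduler a moment to notice the crash and restart the process
  time.sleep(1)
  return True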
def test_simple_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on simple aggs
  """
  q = """
  SELECT x::integer %% 10 AS k,
  avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_simple_aggs', q)
  pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_simple_aggs_t', desc, rows)

  table_result = list(pipeline.execute('SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t'))
  cv_result = list(pipeline.execute('SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.00001
    assert abs(tr[1] - cr[1]) < 0.00001
    assert abs(tr[2] - cr[2]) < 0.00001
def test_join_across_batches(pipeline, clean_db):
  """
  Verify that stream-table joins are properly built when they span
  across multiple input batches
  """
  num_cols = 4
  join_cols = [0]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('batch', **t_cols)
  pipeline.create_stream('stream', **t_cols)

  q = """
  SELECT s.col0::integer FROM batch JOIN stream s ON batch.col0 = s.col0
  """

  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'batch', t, 0.1)

  s = _generate_rows(num_cols, 64)
  pipeline.create_cv('test_batched_join', q)
  _insert(pipeline, 'stream', s)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join').first()

  assert result['count'] == len(expected)
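# The join tests use _generate_rows, _generate_row, _insert and _join helpers that
# are not defined in this file. The sketch below is one plausible implementation,
# assuming rows are tuples of small random integers over columns named col0..colN,
# that _insert optionally sleeps between single-row inserts to spread arrivals over
# several microbatches, and that _join is a nested-loop equijoin on the given column
# indexes. The real helpers may differ.
import random
import time

def _generate_row(num_cols):
  return tuple(random.randint(0, 32) for _ in range(num_cols))

def _generate_rows(num_cols, num_rows):
  return [_generate_row(num_cols) for _ in range(num_rows)]

def _insert(pipeline, target, rows, sleep=0):
  # Insert one row at a time so the rows can span multiple input batches
  cols = ['col%d' % c for c in range(len(rows[0]))]
  for row in rows:
    pipeline.insert(target, cols, [row])
    if sleep:
      time.sleep(sleep)

def _join(left, right, cols):
  # Keep every (left, right) pair that matches on all join column indexes
  result = []
  for l in left:
    for r in right:
      if all(l[c] == r[c] for c in cols):
        result.append(l + r)
  return result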
def test_combine_in_view(pipeline, clean_db):
  """
  Verify that combines in views on top of continuous views work
  """
  q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_combine_view', q)
  pipeline.execute('CREATE VIEW v AS SELECT combine(avg) FROM test_combine_view')

  rows = []
  for n in range(10000):
    rows.append((random.randint(1, 256), random.randint(1, 1024)))

  pipeline.insert('stream0', desc, rows)

  view = list(pipeline.execute('SELECT * FROM v'))
  assert len(view) == 1

  expected = sum(r[1] for r in rows) / float(len(rows))
  assert abs(float(view[0][0]) - expected) < 0.00001

  pipeline.execute('DROP VIEW v')
def test_null_groups(pipeline, clean_db):
  """
  Verify that null group columns are considered equal
  """
  pipeline.create_stream('stream', x='int', y='int', z='int')
  q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM stream
  GROUP BY x, y, z;
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_null_groups', q)
  pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

  rows = []
  for n in range(10000):
    vals = list(random.randint(0, 10) for n in range(3))
    vals = map(lambda n: random.random() > 0.1 and n or None, vals)
    rows.append(tuple(vals))

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_null_groups_t', desc, rows)

  table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
  expected = list(pipeline.execute(table_q))
  result = list(pipeline.execute('SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

  for r, e in zip(result, expected):
    assert r == e
def assert_result_changes(func, args):
  """
  Verifies that the result of the given function changes with time
  """
  pipeline.create_stream('stream', x='int', y='text', z='int')
  name = 'assert_%s_decreases' % func
  pipeline.create_cv(name,
                     "SELECT %s(%s) FROM stream WHERE arrival_timestamp > "
                     "clock_timestamp() - interval '2 seconds'" % (func, args))

  rows = [(n, str(n), n + 1) for n in range(1000)]
  pipeline.insert('stream', ('x', 'y', 'z'), rows)

  current = 1
  results = []
  while current:
    row = pipeline.execute('SELECT * FROM %s' % name).first()
    current = row[func]
    if current is None:
      break
    results.append(current)

  # Verify that we actually read something
  assert results

  pipeline.drop_cv(name)
def test_incremental_join(pipeline, clean_db):
  """
  Verify that join results increase appropriately as we incrementally
  add stream events to the input
  """
  num_cols = 4
  join_cols = [0, 1]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('inc', **t_cols)
  pipeline.create_stream('stream', **t_cols)

  q = """
  SELECT s.col0::integer FROM inc JOIN stream s ON inc.col0 = s.col0
  AND inc.col1 = s.col1::integer
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'inc', t, 0.1)

  pipeline.create_cv('test_join', q)

  s = []
  for n in range(2):
    row = _generate_row(num_cols)
    _insert(pipeline, 'stream', [row])
    s.append(row)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join').first()

  assert result['count'] == len(expected)
def test_user_low_and_high_card(pipeline, clean_db):
  """
  Verify that Bloom filters with low and high cardinalities are correctly unioned
  """
  q = """
  SELECT k::integer, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_bloom_agg', q)

  # Low cardinalities
  rows = []
  for n in range(1000):
    rows.append((0, random.choice((-1, -2))))
    rows.append((1, random.choice((-3, -4))))

  # High cardinalities
  for n in range(10000):
    rows.append((2, n))
    rows.append((3, n))

  pipeline.insert('test_bloom_stream', desc, rows)

  result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                            'FROM test_bloom_agg WHERE k in (0, 1)').first()
  assert result[0] == 4

  result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                            'FROM test_bloom_agg WHERE k in (2, 3)').first()
  assert result[0] == 8879

  result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                            'FROM test_bloom_agg').first()
  assert result[0] == 8881
def test_bloom_intersection(pipeline, clean_db):
  """
  Verify that bloom_intersection works
  """
  pipeline.create_stream("test_bloom_stream", x="int", k="int")

  q = """
  SELECT k::int, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """
  desc = ("k", "x")
  pipeline.create_cv("test_bloom_intersection", q)

  rows = []
  for i in range(10000):
    rows.append((0, 2 * i))
    rows.append((1, i))

  pipeline.insert("test_bloom_stream", desc, rows)

  cvq = """
  SELECT bloom_cardinality(bloom_intersection_agg(bloom_agg))
  FROM test_bloom_intersection
  """
  result = list(pipeline.execute(cvq))
  assert len(result) == 1

  result = result[0]
  assert result[0] == 5530
def test_hll_agg_hashing(pipeline, clean_db):
  """
  Verify that hll_agg correctly hashes different input types
  """
  pipeline.create_stream('test_hll_stream', x='int', y='text', z='float8')

  q = """
  SELECT hll_agg(x::integer) AS i,
         hll_agg(y::text) AS t,
         hll_agg(z::float8) AS f
  FROM test_hll_stream
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_hll_hashing', q)

  rows = []
  for n in range(10000):
    rows.append((n, '%d' % n, float(n)))
    rows.append((n, '%05d' % n, float(n)))

  pipeline.insert('test_hll_stream', desc, rows)

  cvq = """
  SELECT hll_cardinality(i), hll_cardinality(t), hll_cardinality(f)
  FROM test_hll_hashing
  """
  result = list(pipeline.execute(cvq))
  assert len(result) == 1

  result = result[0]
  assert result[0] == 9976
  assert result[1] == 19951
  assert result[2] == 10062
def test_single_continuous_view(pipeline, clean_db):
  """
  Verify that specific continuous views can be dropped and restored
  """
  pipeline.create_cv("test_single0", "SELECT COUNT(*) FROM stream")
  pipeline.create_cv("test_single1", "SELECT COUNT(*) FROM stream")
  pipeline.insert("stream", ("x",), [(x,) for x in range(10)])

  result = pipeline.execute("SELECT count FROM test_single0").first()
  assert result["count"] == 10

  result = pipeline.execute("SELECT count FROM test_single1").first()
  assert result["count"] == 10

  _dump(pipeline, "test_single.sql", cv_name="test_single0")

  pipeline.drop_all_views()
  _restore(pipeline, "test_single.sql")

  result = pipeline.execute("SELECT count FROM test_single0").first()
  assert result["count"] == 10

  # We didn't dump this one
  result = list(pipeline.execute("SELECT * FROM pg_class WHERE relname LIKE '%%test_single1%%'"))
  assert not result
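# The dump/restore tests call _dump and _restore helpers that are not shown in this
# file. The sketch below is a hypothetical minimal version for illustration: it
# assumes the standard pg_dump/psql binaries from the server's bin directory are
# usable, that the database is named 'pipeline', and that keyword arguments map
# directly onto pg_dump flags (--schema-only, --data-only, -t). The real helpers
# may use different binaries, flags, or connection settings.
import os
import subprocess

def _dump(pipeline, out_file, tables=None, schema_only=False, data_only=False, cv_name=None):
  path = os.path.abspath(os.path.join(pipeline.tmp_dir, out_file))
  cmd = [os.path.join(pipeline.get_bin_dir(), 'pg_dump'),
         '-p', str(pipeline.port), '-f', path]
  if schema_only:
    cmd.append('--schema-only')
  if data_only:
    cmd.append('--data-only')
  # Dump only the requested relations, if any were given
  for t in (tables or ([cv_name] if cv_name else [])):
    cmd.extend(['-t', t])
  cmd.append('pipeline')
  subprocess.check_call(cmd)

def _restore(pipeline, in_file):
  path = os.path.abspath(os.path.join(pipeline.tmp_dir, in_file))
  cmd = [os.path.join(pipeline.get_bin_dir(), 'psql'),
         '-p', str(pipeline.port), '-d', 'pipeline', '-f', path]
  subprocess.check_call(cmd)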
def test_copy_to_typed_stream(pipeline, clean_db):
  """
  Verify that copying data from a file into a typed stream works.
  """
  pipeline.create_stream('stream', x='integer', y='float8', z='numeric')
  q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
  pipeline.create_cv('test_copy_to_typed_stream', q)
  pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute('COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute('SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

  assert len(result) == 1

  result = result[0]
  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
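# _generate_csv is referenced by the COPY tests but not defined in this file. A
# minimal sketch, assuming it writes a header row (the COPY statements above use
# the HEADER option) followed by one CSV line per tuple; the real helper may differ.
import csv

def _generate_csv(path, rows, desc=()):
  with open(path, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(desc)  # header line, skipped by COPY ... HEADER CSV
    for row in rows:
      writer.writerow(row)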
def test_cq_stats(pipeline, clean_db):
  """
  Verify that CQ statistics collection works
  """
  num_combiners = int(
    pipeline.execute("SHOW continuous_query_num_combiners").first()["continuous_query_num_combiners"])
  num_workers = int(
    pipeline.execute("SHOW continuous_query_num_workers").first()["continuous_query_num_workers"])

  # 10 rows
  q = "SELECT x::integer %% 10 AS g, COUNT(*) FROM stream GROUP BY g"
  pipeline.create_cv("test_10_groups", q)

  # 1 row
  q = "SELECT COUNT(*) FROM stream"
  pipeline.create_cv("test_1_group", q)

  values = [(random.randint(1, 1024),) for n in range(1000)]

  pipeline.insert("stream", ("x",), values)
  pipeline.insert("stream", ("x",), values)
  # Sleep a little so that the next time we insert, we force the stats collector.
  time.sleep(0.5)

  pipeline.insert("stream", ("x",), values)
  pipeline.insert("stream", ("x",), values)
  # Sleep a little so the stats collector flushes all the stats.
  time.sleep(0.5)

  proc_result = pipeline.execute("SELECT * FROM pipeline_proc_stats")
  cq_result = pipeline.execute("SELECT * FROM pipeline_query_stats")

  proc_rows = len(list(proc_result))
  cq_rows = len(list(cq_result))

  assert proc_rows == 1 + num_combiners + num_workers
  assert cq_rows == 4

  # When sleeping, we only force the stats collection for the first CQ, so we're
  # not guaranteed to have seen all stats flushed for the 10 group view. The stats
  # flushed should be anywhere between the two inserts.
  result = pipeline.execute(
    "SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'worker'").first()
  assert result["input_rows"] >= 2000
  assert result["input_rows"] <= 4000

  result = pipeline.execute(
    "SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'combiner'").first()
  assert result["output_rows"] == 10

  result = pipeline.execute(
    "SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'worker'").first()
  assert result["input_rows"] == 4000

  result = pipeline.execute(
    "SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'combiner'").first()
  assert result["output_rows"] == 1
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the file's
  input columns are a subset of the stream's columns
  """
  q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream'
  pipeline.create_cv('test_copy_subset', q)
  pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute('COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute('SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

  assert len(result) == 1

  result = result[0]
  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_join_with_where(pipeline, clean_db):
  """
  Verify that stream-table joins using a WHERE clause work properly
  """
  num_cols = 4
  q = """
  SELECT s.col0::integer FROM stream s, wt WHERE s.col0 = 1 AND wt.col0 = 1
  """
  wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('wt', **wt_cols)
  pipeline.create_table('wt_s', **wt_cols)

  wt = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'wt', wt, 0.1)
  _insert(pipeline, 'wt_s', s, 0.1)

  pipeline.create_stream('stream', **wt_cols)
  pipeline.create_cv('test_join_where', q)
  _insert(pipeline, 'stream', s)

  expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1').first()
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_where').first()

  assert result['count'] == expected['count']
def test_bloom_agg_hashing(pipeline, clean_db):
  """
  Verify that bloom_agg correctly hashes different input types
  """
  q = """
  SELECT bloom_agg(x::integer) AS i,
         bloom_agg(y::text) AS t,
         bloom_agg(z::float8) AS f
  FROM test_bloom_stream
  """
  desc = ('x', 'y', 'z')
  pipeline.create_cv('test_bloom_hashing', q)

  rows = []
  for n in range(10000):
    rows.append((n, '%d' % n, float(n)))
    rows.append((n, '%05d' % n, float(n)))

  pipeline.insert('test_bloom_stream', desc, rows)

  cvq = """
  SELECT bloom_cardinality(i), bloom_cardinality(t), bloom_cardinality(f)
  FROM test_bloom_hashing
  """
  result = list(pipeline.execute(cvq))
  assert len(result) == 1

  result = result[0]
  assert result[0] == 8879
  assert result[1] == 15614
  assert result[2] == 8855
def test_activate_deactivate(pipeline, clean_db):
  pipeline.create_cv('v', 'SELECT count(*) FROM stream')
  pipeline.insert('stream', ('x', ), [(1, )])

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

  cur = conn.cursor()
  cur.execute('DEACTIVATE')
  cur.close()

  try:
    pipeline.insert('stream', ('x', ), [(1, )])
    assert False
  except:
    pass

  cur = conn.cursor()
  cur.execute('ACTIVATE')
  cur.close()
  conn.close()

  pipeline.insert('stream', ('x', ), [(1, )])

  count = pipeline.execute('SELECT * FROM v').first()['count']
  assert count == 2
def test_single_continuous_view(pipeline, clean_db):
  """
  Verify that specific continuous views can be dropped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('test_single0', 'SELECT COUNT(*) FROM stream0')
  pipeline.create_cv('test_single1', 'SELECT COUNT(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  result = pipeline.execute('SELECT count FROM test_single1').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_single.sql', tables=['test_single0', 'stream0', 'test_single0_mrel'])

  pipeline.drop_all()
  _restore(pipeline, 'test_single.sql')

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  # We didn't dump this one
  result = list(pipeline.execute('SELECT * FROM pg_class WHERE relname LIKE \'%%test_single1%%\''))
  assert not result
def test_user_low_and_high_card(pipeline, clean_db):
  """
  Verify that HLLs with low and high cardinalities are correctly combined
  """
  pipeline.create_stream('test_hll_stream', x='int', k='integer')
  q = """
  SELECT k::integer, hll_agg(x::integer) FROM test_hll_stream GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_hll_agg', q)

  # Low cardinalities
  rows = []
  for n in range(1000):
    rows.append((0, random.choice((-1, -2))))
    rows.append((1, random.choice((-3, -4))))

  # High cardinalities
  for n in range(10000):
    rows.append((2, n))
    rows.append((3, n))

  pipeline.insert('test_hll_stream', desc, rows)

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg WHERE k in (0, 1)').first()
  assert result[0] == 4

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg WHERE k in (2, 3)').first()
  assert result[0] == 9976

  result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                            'FROM test_hll_agg').first()
  assert result[0] == 9983
def test_join_multiple_tables(pipeline, clean_db):
  """
  Verify that stream-table joins involving multiple tables work
  """
  num_cols = 8
  join_cols = [0]
  t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('t0', **t0_cols)
  pipeline.create_table('t1', **t1_cols)
  pipeline.create_stream('stream', **t0_cols)

  q = """
  SELECT s.col0::integer FROM t0 JOIN t1 ON t0.col0 = t1.col0
  JOIN stream s ON t1.col0 = s.col0
  """
  t0 = _generate_rows(num_cols, 64)
  t1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 't1', t1, 0.1)
  _insert(pipeline, 't0', t0, 0.1)

  pipeline.create_cv('test_join_multi', q)
  _insert(pipeline, 'stream', s)

  expected = _join(t0, _join(s, t1, join_cols), join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

  assert result['count'] == len(expected)
def test_indexed(pipeline, clean_db):
  """
  Verify that stream-table joins involving indexed tables work
  """
  pipeline.create_stream('stream', x='int', y='int')
  q = """
  SELECT stream.x::integer, count(*) FROM stream
  JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
  """
  pipeline.create_table('test_indexed_t', x='integer', y='integer')
  pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

  t = _generate_rows(2, 1000)
  s = _generate_rows(2, 1000)

  pipeline.insert('test_indexed_t', ('x', 'y'), t)
  time.sleep(0.1)

  pipeline.create_cv('test_indexed', q)
  pipeline.insert('stream', ('x', 'y'), s)

  expected = _join(s, t, [0])
  result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

  assert result['sum'] == len(expected)
def assert_result_changes(func, args):
  """
  Verifies that the result of the given function changes with time
  """
  name = "assert_%s_decreases" % func
  pipeline.create_cv(
    name,
    "SELECT %s(%s) FROM stream WHERE arrival_timestamp > "
    "clock_timestamp() - interval '2 seconds'" % (func, args))

  rows = [(n, str(n), n + 1) for n in range(1000)]
  pipeline.insert("stream", ("x", "y", "z"), rows)

  current = 1
  results = []
  while current:
    row = pipeline.execute("SELECT * FROM %s" % name).first()
    current = row[func]
    if current is None:
      break
    results.append(current)

  # Verify that we actually read something
  assert results

  pipeline.drop_cv(name)
def test_multi_client(pipeline, clean_db):
  """
  Regression test for multi client.
  """
  TRIGGER_OUTPUT_LOGFILE = '/tmp/.pipelinedb_pipeline_test.log'

  pipeline.create_cv('cv0', 'SELECT x::integer,count(*) FROM stream group by x')
  conn_str = pipeline.get_conn_string()

  pipeline.create_cv_trigger('t0', 'cv0', 'true', 'pipeline_send_alert_new_row')

  # recv_alerts client needs pipeline on its path
  client_env = os.environ.copy()
  client_env["PATH"] = client_env["PATH"] + ":" + pipeline.get_bin_dir()

  cmd = [pipeline.get_recv_alerts(), '-d', conn_str, '-a', 'cv0.t0']
  time.sleep(2)

  outfile = open(TRIGGER_OUTPUT_LOGFILE, 'w')

  client1 = subprocess.Popen(cmd, env=client_env)
  client2 = subprocess.Popen(cmd, env=client_env)

  time.sleep(4)

  client1.terminate()
  client2.terminate()
def test_simple_insert(pipeline, clean_db):
  """
  Verify that we can insert some rows and count some stuff
  """
  pipeline.create_stream('stream0', key='int')
  pipeline.create_cv('cv', 'SELECT key::integer, COUNT(*) FROM stream0 GROUP BY key')

  rows = [(n % 10, ) for n in range(1000)]
  pipeline.insert('stream0', ('key', ), rows)

  result = list(pipeline.execute('SELECT key, count FROM cv ORDER BY key'))

  assert len(result) == 10
  for i, row in enumerate(result):
    assert row['key'] == i
    assert row['count'] == 100
def test_static_streams(pipeline, clean_db):
  """
  Verify that static stream definitions are dumped and restored
  """
  pipeline.create_stream('static', x='int', y='float8')

  _dump(pipeline, 'test_static.sql')

  pipeline.drop_stream('static')
  _restore(pipeline, 'test_static.sql')

  # Force the requirement of a static stream
  pipeline.create_cv('static_cv', 'SELECT x, y FROM static')
  pipeline.insert('static', ('x', 'y'), [(0, 1)])

  result = pipeline.execute('SELECT x, y FROM static_cv').first()
  assert result['x'] == 0
  assert result['y'] == 1
def test_schema_only(pipeline, clean_db):
  """
  Verify that it is possible to only dump continuous view schemas and not data
  """
  pipeline.create_cv('test_schema', 'SELECT COUNT(*) FROM stream')
  pipeline.insert('stream', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_schema').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_schema.sql', schema_only=True)

  pipeline.drop_all_views()
  _restore(pipeline, 'test_schema.sql')

  # No data loaded
  result = list(pipeline.execute('SELECT count FROM test_schema'))
  assert not result
def test_cksketch_frequency(pipeline, clean_db):
  pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

  q = """
  SELECT k::integer, freq_agg(x::int) AS c FROM test_cmsketch_stream GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_cv('test_cmsketch_frequency', q)

  rows = [(n, None) for n in range(100)]
  pipeline.insert('test_cmsketch_stream', desc, rows)

  result = pipeline.execute(
    'SELECT freq(c, null) AS x FROM test_cmsketch_frequency ORDER BY k')
  assert len(result) == 100

  for row in result:
    assert row[0] == 0
def test_nested_transforms(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  pipeline.create_stream('stream2', x='int')
  pipeline.create_stream('stream4', x='int')

  pipeline.create_cv('cv0', 'SELECT count(*) FROM stream4')
  pipeline.create_cv('cv1', 'SELECT count(*) FROM stream2')

  pipeline.create_ct('ct0', 'SELECT x::int FROM stream2 WHERE mod(x, 4) = 0',
                     "pipelinedb.insert_into_stream('stream4')")
  pipeline.create_ct('ct1', 'SELECT x::int FROM stream0 WHERE mod(x, 2) = 0',
                     "pipelinedb.insert_into_stream('stream2')")

  pipeline.insert('stream0', ('x',), [(n,) for n in range(1000)])

  count = pipeline.execute('SELECT count FROM cv0')[0]['count']
  assert count == 250

  count = pipeline.execute('SELECT count FROM cv1')[0]['count']
  assert count == 500
def test_deadlock_regress(pipeline, clean_db):
  nitems = 2000000
  tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.json')
  query = 'SELECT generate_series(1, %d) AS n' % nitems
  pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

  pipeline.create_stream('s1', n='int')
  pipeline.create_stream('s2', n='int')
  pipeline.create_ct('ct', 'SELECT n FROM s1 WHERE n IS NOT NULL',
                     "pipeline_stream_insert('s2')")
  pipeline.create_cv('cv', 'SELECT count(*) FROM s2')

  for copy in [True, False]:
    for nworkers in [1, 4]:
      for sync in ['off', 'on']:
        pipeline.stop()
        pipeline.run({
          'continuous_query_num_workers': nworkers,
          'synchronous_stream_insert': sync
        })

        pipeline.execute('TRUNCATE CONTINUOUS VIEW cv')
        pipeline.execute('COMMIT')

        if copy:
          pipeline.execute("COPY s1 (n) FROM '%s'" % tmp_file)
        else:
          pipeline.execute('INSERT INTO s1 (n) %s' % query)

        count = dict(pipeline.execute('SELECT count FROM cv').first() or {})
        ntries = 5
        while count.get('count') != nitems and ntries > 0:
          assert sync == 'off'
          time.sleep(1)
          count = dict(pipeline.execute('SELECT count FROM cv').first() or {})
          ntries -= 1

        assert count and count['count'] == nitems

  os.remove(tmp_file)

  pipeline.stop()
  pipeline.run()
def test_consume_text(pipeline, kafka, clean_db):
  """
  Interpret consumed messages as text
  """
  pipeline.create_stream('comma_stream', x='integer', y='integer', z='integer')
  pipeline.create_cv('comma_cv', 'SELECT x, y, z FROM comma_stream')
  kafka.create_topic('test_consume_text_comma')

  pipeline.create_stream('tab_stream', x='integer', y='integer', z='integer')
  pipeline.create_cv('tab_cv', 'SELECT x, y, z FROM tab_stream')
  kafka.create_topic('test_consume_text_tab')

  pipeline.consume_begin('test_consume_text_comma', 'comma_stream', delimiter=',')
  pipeline.consume_begin('test_consume_text_tab', 'tab_stream', delimiter='\t')

  producer = kafka.get_producer('test_consume_text_comma')
  for n in range(100):
    message = ','.join(map(str, [n, n, n]))
    producer.produce(message)

  producer = kafka.get_producer('test_consume_text_tab')
  for n in range(100):
    message = '\t'.join(map(str, [n, n, n]))
    producer.produce(message)

  def delimited_messages():
    rows = pipeline.execute('SELECT * from comma_cv ORDER BY x')
    assert len(rows) == 100
    for i, row in enumerate(rows):
      assert tuple([row[0], row[1], row[2]]) == (i, i, i)

    rows = pipeline.execute('SELECT * from tab_cv ORDER BY x')
    assert len(rows) == 100
    for i, row in enumerate(rows):
      assert tuple([row[0], row[1], row[2]]) == (i, i, i)

  assert eventually(delimited_messages)
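# eventually() is used above to poll for asynchronously consumed Kafka messages but
# is defined elsewhere in the test harness. A plausible sketch, assuming it simply
# retries the assertion callback until it stops raising or a timeout expires; the
# real helper and its timeout values may differ.
import time

def eventually(fn, timeout=30, interval=1):
  start = time.time()
  while True:
    try:
      fn()
      return True
    except AssertionError:
      # Keep retrying until the deadline, then surface the last failure
      if time.time() - start > timeout:
        raise
      time.sleep(interval)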
def test_sw_trigger_sync(pipeline, clean_db):
  """
  Sets up a sliding window query, and inserts data into it before any
  triggers are added. A trigger is added, and then some more data is
  inserted. Verify that counts are equal to the pre-creation plus
  post-creation amounts.
  """
  pipeline.create_cv(
    'cv0',
    'SELECT x::integer,count(*) FROM stream where (arrival_timestamp > clock_timestamp() - interval \'10 seconds\') group by x;',
    step_factor='10')

  rows = [(n % 10, ) for n in range(1000)]
  pipeline.insert('stream', ('x', ), rows)

  # sleep to make sure that new inserts are in a new arrival_ts group
  time.sleep(4)

  pipeline.create_cv_trigger('t0', 'cv0', 'true', 'pipeline_test_alert_new_row')
  time.sleep(1)

  rows = [(n % 10, ) for n in range(10)]
  pipeline.insert('stream', ('x', ), rows)
  time.sleep(1)

  lines = pipeline.read_trigger_output()

  d = {}
  for l in lines:
    assert len(l) == 2
    k = int(l[0])
    v = int(l[1])
    assert k >= 0 and k <= 9
    d[k] = v

  assert len(d) == 10

  for x in d:
    assert d[x] == 101
def test_tablespace(pipeline, clean_db):
  """
  Verify that CVs can be created within tablespaces
  """
  path = os.path.abspath('test_tablespace')
  if os.path.exists(path):
    shutil.rmtree(path)
  os.mkdir(path)

  pipeline.execute("CREATE TABLESPACE test_tablespace LOCATION '%s'" % path)

  pipeline.create_stream('test_tablespace_s', x='int')
  q = 'SELECT x % 10 AS g, count(DISTINCT x) FROM test_tablespace_s GROUP BY g'

  pipeline.create_cv('test_tablespace0', q)
  pipeline.create_cv('test_tablespace1', q, tablespace='test_tablespace')

  pipeline.insert('test_tablespace_s', ('x', ), [(x, ) for x in range(10000)])

  result0 = pipeline.execute('SELECT count(*) FROM test_tablespace0')
  result1 = pipeline.execute('SELECT count(*) FROM test_tablespace1')

  assert len(result0) == 1
  assert len(result1) == 1
  assert result0[0]['count'] == result1[0]['count']

  result0 = pipeline.execute('SELECT combine(count) FROM test_tablespace0')
  result1 = pipeline.execute('SELECT combine(count) FROM test_tablespace1')

  assert len(result0) == 1
  assert len(result1) == 1
  assert result0[0]['combine'] == result1[0]['combine']

  # Now verify that test_tablespace1 is physically in the tablespace
  row = pipeline.execute("SELECT oid FROM pg_class WHERE relname = 'test_tablespace1_mrel'")
  oid = row[0]['oid']

  found = glob.glob('test_tablespace/*/*/%d' % oid)
  assert len(found) == 1

  pipeline.drop_all()
  pipeline.execute('DROP TABLESPACE test_tablespace')
  shutil.rmtree(path)
def test_schema_inference(pipeline, clean_db):
  """
  Verify that types are properly inferred
  """
  pipeline.create_cv(
    'test_infer0',
    'SELECT x::int8, y::bigint, COUNT(*) FROM infer_stream GROUP BY x, y')
  pipeline.create_cv(
    'test_infer1',
    'SELECT x::int4, y::real, COUNT(*) FROM infer_stream GROUP BY x, y')
  pipeline.create_cv(
    'test_infer2',
    'SELECT x::int2, y::integer, COUNT(*) FROM infer_stream GROUP BY x, y')
  pipeline.create_cv(
    'test_infer3',
    'SELECT x::numeric, y::float8, COUNT(*) FROM infer_stream GROUP BY x, y')
  pipeline.create_cv(
    'test_infer4',
    'SELECT x::int8, y::bigint, COUNT(*) FROM infer_stream GROUP BY x, y')

  desc = ('x', 'y')
  rows = []
  for n in range(10000):
    rows.append((random.random() + 1, random.random() * random.randint(0, 128)))

  pipeline.insert('infer_stream', desc, rows)

  result = pipeline.execute('SELECT * FROM test_infer0 ORDER BY x')
  for row in result:
    assert row['count']

  result = pipeline.execute('SELECT * FROM test_infer1 ORDER BY x')
  for row in result:
    assert row['count']

  result = pipeline.execute('SELECT * FROM test_infer2 ORDER BY x')
  for row in result:
    assert row['count']

  result = pipeline.execute('SELECT * FROM test_infer3 ORDER BY x')
  for row in result:
    assert row['count']
def test_tdigest_agg(pipeline, clean_db):
  """
  Test tdigest_agg, tdigest_merge_agg, tdigest_cdf, tdigest_quantile
  """
  q = """
  SELECT k::integer, tdigest_agg(x::int) AS t FROM test_tdigest_stream
  GROUP BY k
  """
  desc = ('k', 'x')
  pipeline.create_stream('test_tdigest_stream', k='int', x='int')
  pipeline.create_cv('test_tdigest_agg', q)

  rows = []
  for _ in range(10):
    for n in range(1000):
      rows.append((0, n))
      rows.append((1, n + 500))

  pipeline.insert('test_tdigest_stream', desc, rows)

  result = list(pipeline.execute(
    'SELECT tdigest_quantile(t, 0.1) FROM test_tdigest_agg ORDER BY k').fetchall())
  assert len(result) == 2
  assert abs(int(result[0]['tdigest_quantile']) - 99) <= 1
  assert abs(int(result[1]['tdigest_quantile']) - 599) <= 1

  result = list(pipeline.execute(
    'SELECT tdigest_quantile(combine(t), 0.1) FROM test_tdigest_agg').fetchall())
  assert len(result) == 1
  assert abs(int(result[0]['tdigest_quantile']) - 200) <= 4

  result = list(pipeline.execute(
    'SELECT tdigest_cdf(t, 600) FROM test_tdigest_agg ORDER BY k').fetchall())
  assert len(result) == 2
  assert round(result[0]['tdigest_cdf'], 2) == 0.6
  assert round(result[1]['tdigest_cdf'], 2) == 0.1

  result = list(pipeline.execute(
    'SELECT tdigest_cdf(combine(t), 600) FROM test_tdigest_agg').fetchall())
  assert len(result) == 1
  assert round(result[0]['tdigest_cdf'], 2) == 0.35
def test_cont_transforms(pipeline, clean_db):
  pipeline.execute('CREATE STREAM cv_stream (x int, y text)')
  pipeline.execute('CREATE STREAM ct_stream (x int, y text)')

  pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
  pipeline.create_ct('test_ct1', 'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
                     "pipeline_stream_insert('cv_stream', 'cv_stream')")
  pipeline.create_table('test_t', x='int', y='text')
  pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
    INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
    RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
  pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream', 'test_tg()')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  _dump(pipeline, 'test_cont_transform.sql')

  pipeline.drop_all()
  pipeline.drop_table('test_t')
  pipeline.execute('DROP FUNCTION test_tg()')

  _restore(pipeline, 'test_cont_transform.sql')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv').first()['count'] == 4

  ntups = 0
  for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
    assert row['count'] == 2
    assert row['x'] in (1, 2)
    ntups += 1

  assert ntups == 2
def test_adhoc_against_identical_cv(pipeline, clean_db):
  """
  Verify that an adhoc query produces the same output as an identical
  continuous view
  """
  q = """
  SELECT x::integer + 1 AS g, sum(y::integer), avg(z::integer), count(*)
  FROM test_adhoc_stream GROUP BY g;
  """
  pipeline.create_cv('test_adhoc_cv', q)

  rows = [(x % 10, random.randint(1, 1000), random.randint(1, 1000))
          for x in range(1000)]

  path = os.path.abspath(
    os.path.join(pipeline.tmp_dir, 'test_adhoc_against_identical_cv.sql'))
  tmp_file = open(path, 'w')

  for row in rows:
    v = gen_insert('test_adhoc_stream', ('x', 'y', 'z'), [row])
    tmp_file.write(v)

  tmp_file.close()

  psql = os.path.abspath(os.path.join(pipeline.tmp_dir, 'bin/psql'))
  cmd = ['./run_adhoc.expect', psql, str(pipeline.port), 'pipeline', q, path]

  output = subprocess.Popen(cmd, stdout=PIPE).communicate()[0]

  lines = output.split('\n')
  lines = filter(lambda x: not re.match(r'^\s*$', x), lines)
  lines = [l.split('\t')[1:] for l in lines]
  lines = lines[-10:]

  adhoc_results = {}
  for line in lines:
    g, s, a, c = line
    adhoc_results[int(g)] = int(g), int(s), float(a), int(c)

  cv_result = pipeline.execute('SELECT * FROM test_adhoc_cv')

  for row in cv_result:
    adhoc_row = adhoc_results[row[0]]
    assert adhoc_row[1] == row[1]
    assert abs(adhoc_row[2] - float(row[2])) < 0.001
    assert adhoc_row[3] == row[3]
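# gen_insert() builds the INSERT statements that the adhoc test above writes to a
# SQL file, but it is not defined in this file. A minimal sketch for illustration,
# assuming repr() is adequate quoting for the integer values used in that test; the
# real helper may escape string values properly.
def gen_insert(target, desc, rows):
  vals = ', '.join('(%s)' % ', '.join(repr(v) for v in row) for row in rows)
  return 'INSERT INTO %s (%s) VALUES %s;\n' % (target, ', '.join(desc), vals)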
def test_multiple(pipeline, clean_db):
  """
  Verify that multiple continuous views work together properly
  """
  pipeline.create_cv('cv0', 'SELECT n::numeric FROM stream WHERE n > 10.00001')
  pipeline.create_cv('cv1', 'SELECT s::text FROM stream WHERE s LIKE \'%%this%%\'')

  rows = [(float(n + 10), 'this', 100) for n in range(1000)]
  for n in range(10):
    rows.append((float(n), 'not a match', -n))

  pipeline.insert('stream', ('n', 's', 'unused'), rows)

  result = list(pipeline.execute('SELECT * FROM cv0'))
  assert len(result) == 999

  result = list(pipeline.execute('SELECT * FROM cv1'))
  assert len(result) == 1000
def test_dump_data_only(pipeline, clean_db):
  """
  Verify that it is possible to only dump continuous view data and not schemas
  """
  pipeline.create_cv('test_data', 'SELECT COUNT(*) FROM stream')
  pipeline.insert('stream', ('x', ), [(x, ) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_data').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_data.sql', data_only=True)

  pipeline.drop_all_queries()

  pipeline.create_cv('test_data', 'SELECT COUNT(*) FROM stream')
  _restore(pipeline, 'test_data.sql')

  result = pipeline.execute('SELECT count FROM test_data').first()
  assert result['count'] == 10
def test_null_offsets(pipeline, kafka, clean_db):
  """
  Verify that offsets are stored as NULL if a consumer hasn't consumed any
  messages yet
  """
  kafka.create_topic('null_topic', partitions=4)
  pipeline.create_stream('null_stream', x='integer')
  pipeline.create_cv('null0', 'SELECT count(*) FROM null_stream')
  pipeline.consume_begin('null_topic', 'null_stream', group_id='null_offsets')

  # Write to a single partition so that only one partition's offsets are updated
  producer = kafka.get_producer('null_topic')
  producer.produce('1', partition_key='key')

  time.sleep(10)
  pipeline.consume_end()

  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets WHERE "offset" IS NULL')
  assert len(rows) == 3
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the file's
  input columns are a subset of stream0's columns
  """
  pipeline.create_stream('stream0', x='int', y='float8', z='numeric', m='int')
  q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0'
  pipeline.create_cv('test_copy_subset', q)
  pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute(
    'COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream0 (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute(
    'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

  assert len(result) == 1

  result = result[0]
  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_hll_count_distinct(pipeline, clean_db):
  """
  Verify that streaming COUNT(DISTINCT) works
  """
  q = 'SELECT COUNT(DISTINCT x::integer) FROM stream'
  pipeline.create_cv('test_count_distinct', q)

  desc = ('x', )
  values = [(random.randint(1, 1024), ) for n in range(1000)]

  pipeline.insert('stream', desc, values)

  expected = len(set(values))
  result = pipeline.execute('SELECT count FROM test_count_distinct').first()

  # Error rate should be well below 2%
  delta = abs(expected - result['count'])
  assert delta / float(expected) <= 0.02
def test_distinct(pipeline, clean_db):
  """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
  pipeline.create_stream('stream0', x='int', y='int', z='int')
  pipeline.create_table('table0', x='int', y='int', z='int')
  q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
  pipeline.create_cv('test_distinct', q)

  uniques = defaultdict(set)
  values = []
  for _ in xrange(2000):
    x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(0, 20)
    values.append((x, y, z))
    uniques[(x, y - z)].add(y)

  pipeline.insert('stream0', ['x', 'y', 'z'], values)
  pipeline.insert('table0', ['x', 'y', 'z'], values)

  q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
  expected = pipeline.execute(q)
  expected = len(expected)

  assert expected < 2000

  result = pipeline.execute('SELECT COUNT(*) FROM test_distinct')[0]
  assert expected == result['count']

  # Check if the first row was selected for uniques
  result = pipeline.execute('SELECT * FROM test_distinct')
  reverse_uniques = defaultdict(set)

  for (x, _), ys in uniques.iteritems():
    for y in ys:
      reverse_uniques[y].add(x)

  for row in result:
    assert row['x'] in reverse_uniques[row['y']]
def test_deadlock_regress(pipeline, clean_db):
  nitems = 2000000
  tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.json')
  query = 'SELECT generate_series(1, %d) AS n' % nitems
  pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

  pipeline.create_stream('s1', n='int')
  pipeline.create_stream('s2', n='int')
  pipeline.create_ct('ct', 'SELECT n FROM s1 WHERE n IS NOT NULL',
                     "pipelinedb.insert_into_stream('s2')")
  pipeline.create_cv('cv', 'SELECT count(*) FROM s2')

  for copy in [True, False]:
    for nworkers in [1, 4]:
      for sync in ['receive', 'commit']:
        pipeline.stop()
        pipeline.run({
          'pipelinedb.num_workers': nworkers,
          'pipelinedb.stream_insert_level': 'sync_%s' % sync
        })

        pipeline.execute("SELECT pipelinedb.truncate_continuous_view('cv')")
        pipeline.execute('COMMIT')

        if copy:
          pipeline.execute("COPY s1 (n) FROM '%s'" % tmp_file)
        else:
          pipeline.execute('INSERT INTO s1 (n) %s' % query)

        count = dict(pipeline.execute('SELECT count FROM cv')[0] or {})
        ntries = 5
        while count.get('count') != nitems and ntries > 0:
          assert sync == 'receive'
          time.sleep(1)
          count = dict(pipeline.execute('SELECT count FROM cv')[0] or {})
          ntries -= 1

        assert count and count['count'] == nitems

  os.remove(tmp_file)

  pipeline.stop()
  pipeline.run()
def test_combine_table(pipeline, clean_db):
  pipeline.create_stream('s', x='int')
  pipeline.create_cv('combine_table', 'SELECT x::int, COUNT(*) FROM s GROUP BY x')

  values = [(i,) for i in xrange(1000)]
  pipeline.insert('s', ('x',), values)

  pipeline.execute('SELECT * INTO tmprel FROM combine_table_mrel')

  stop = False
  ninserts = [0]

  def insert():
    while not stop:
      pipeline.insert('s', ('x',), values)
      ninserts[0] += 1
      time.sleep(0.01)

  t = threading.Thread(target=insert)
  t.start()

  time.sleep(2)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  cur = conn.cursor()
  cur.execute("SELECT pipeline_combine_table('combine_table', 'tmprel')")
  conn.commit()
  conn.close()

  stop = True
  t.join()

  assert ninserts[0] > 0

  rows = list(pipeline.execute('SELECT count FROM combine_table'))
  assert len(rows) == 1000
  for row in rows:
    assert row[0] == ninserts[0] + 2

  pipeline.execute('DROP TABLE tmprel')
def test_concurrent_copy(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('concurrent_copy0', 'SELECT x::int, count(*) FROM stream0 GROUP BY x')
  pipeline.create_cv('concurrent_copy1', 'SELECT count(*) FROM stream0')

  tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.copy')
  query = 'SELECT generate_series(1, 2000) AS x'
  pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

  num_threads = 4
  stop = False
  inserted = [0] * num_threads

  def insert(i):
    conn = psycopg2.connect('dbname=postgres user=%s host=localhost port=%s' %
                            (getpass.getuser(), pipeline.port))
    cur = conn.cursor()
    while not stop:
      cur.execute("COPY stream0 (x) FROM '%s'" % tmp_file)
      conn.commit()
      inserted[i] += 2000
    conn.close()

  threads = [threading.Thread(target=insert, args=(i, ))
             for i in range(num_threads)]
  map(lambda t: t.start(), threads)

  time.sleep(60)

  stop = True
  map(lambda t: t.join(), threads)

  time.sleep(5)

  total = pipeline.execute('SELECT sum(count) FROM concurrent_copy0')[0][0]
  assert total == sum(inserted)

  total = pipeline.execute('SELECT count FROM concurrent_copy1')[0][0]
  assert total == sum(inserted)
def test_transforms(pipeline, clean_db):
  """
  Verify that continuous transforms work properly on output streams
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('sw', 'SELECT x::integer, COUNT(*) FROM stream0 GROUP BY x',
                     sw='5 seconds')

  # Write a row to a stream each time a row goes out of window
  q = 'SELECT (old).x FROM sw_osrel WHERE old IS NOT NULL AND new IS NULL'
  pipeline.create_stream('oow_stream', x='integer')
  pipeline.create_ct('ct', q, "pipeline_stream_insert('oow_stream')")
  pipeline.create_cv('ct_recv', 'SELECT x FROM oow_stream')

  pipeline.insert('stream0', ('x', ), [(x % 100, ) for x in range(10000)])
  time.sleep(7)

  rows = list(pipeline.execute('SELECT * FROM ct_recv'))
  assert len(rows) == 100
def test_dump(pipeline, clean_db):
  """
  Verify that we can dump and restore CVs using INSERT statements
  """
  pipeline.create_stream('stream0', x='int')
  q = """
  SELECT x::integer %% 100 AS g, avg(x) + 1 AS avg, count(*), count(distinct x) AS distincts FROM stream0
  GROUP BY g
  """
  pipeline.create_cv('test_dump', q)

  rows = [(x, ) for x in range(1000)]
  pipeline.insert('stream0', ('x', ), rows)

  def _verify():
    result = pipeline.execute('SELECT count(*) FROM test_dump').first()
    assert result['count'] == 100

    result = pipeline.execute('SELECT sum(avg) FROM test_dump').first()
    assert result['sum'] == 50050

    result = pipeline.execute('SELECT sum(distincts) FROM test_dump').first()
    assert result['sum'] == 1000

  _verify()
  _dump(pipeline, 'test_dump.sql')
  pipeline.drop_all()
  _restore(pipeline, 'test_dump.sql')
  _verify()

  # Now verify that we can successfully add more data to the restored CV
  rows = [(x, ) for x in range(2000)]
  pipeline.insert('stream0', ('x', ), rows)

  result = pipeline.execute('SELECT sum(count) FROM test_dump').first()
  assert result['sum'] == 3000

  result = pipeline.execute('SELECT sum(distincts) FROM test_dump').first()
  assert result['sum'] == 2000
def test_copy_to_typed_stream(pipeline, clean_db):
  """
  Verify that copying data from a file into a typed stream works.
  """
  pipeline.create_stream('stream', x='integer', y='float8', z='numeric')
  q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
  pipeline.create_cv('test_copy_to_typed_stream', q)
  pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute(
    'COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)
  pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute(
    'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
  result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

  assert len(result) == 1

  result = result[0]
  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
def test_object_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on object aggs
  """
  q = """
  SELECT x::integer % 10 AS k, json_agg(x), json_object_agg(x, y::float8),
  string_agg(s::text, ' :: ') FROM stream0 GROUP BY k;
  """
  desc = ('x', 'y', 's')
  pipeline.create_stream('stream0', x='int', y='float8', s='text')
  pipeline.create_cv('test_object_aggs', q)
  pipeline.create_table('test_object_aggs_t', x='integer', y='float8', s='text')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random(), str(n) * random.randint(1, 8))
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_object_aggs_t', desc, rows)

  tq = """
  SELECT json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, ' :: ')
  FROM test_object_aggs_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(json_agg), combine(json_object_agg), combine(string_agg)
  FROM test_object_aggs
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert sorted(tr[0]) == sorted(cr[0])
    assert sorted(tr[1]) == sorted(cr[1])
    assert sorted(tr[2]) == sorted(cr[2])
def test_concurrent_inserts(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('concurrent_inserts0', 'SELECT x::int, count(*) FROM stream0 GROUP BY x')
  pipeline.create_cv('concurrent_inserts1', 'SELECT count(*) FROM stream0')

  num_threads = 4
  stop = False
  inserted = [0] * num_threads

  def insert(i):
    conn = psycopg2.connect('dbname=postgres user=%s host=localhost port=%s' %
                            (getpass.getuser(), pipeline.port))
    cur = conn.cursor()
    while not stop:
      cur.execute('INSERT INTO stream0 (x) '
                  'SELECT x % 100 FROM generate_series(1, 2000) AS x')
      conn.commit()
      inserted[i] += 2000
    conn.close()

  threads = [threading.Thread(target=insert, args=(i, ))
             for i in range(num_threads)]
  map(lambda t: t.start(), threads)

  time.sleep(60)

  stop = True
  map(lambda t: t.join(), threads)

  time.sleep(5)

  total = pipeline.execute('SELECT sum(count) FROM concurrent_inserts0')[0]['sum']
  assert total == sum(inserted)

  total = pipeline.execute('SELECT count FROM concurrent_inserts1')[0]['count']
  assert total == sum(inserted)
def test_nested_expressions(pipeline, clean_db):
  """
  Verify that combines work properly on arbitrarily nested expressions
  """
  q = """
  SELECT x::integer %% 10 AS k,
  (rank(256) WITHIN GROUP (ORDER BY x) + dense_rank(256) WITHIN GROUP (ORDER BY x)) *
  (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
  FROM stream GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_cv('test_nested', q)
  pipeline.create_table('test_nested_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream', desc, rows)
  pipeline.insert('test_nested_t', desc, rows)

  # Note that the CQ will use the HLL variant of dense_rank,
  # so use hll_dense_rank on the table too
  tq = """
  SELECT
  (rank(256) WITHIN GROUP (ORDER BY x) + hll_dense_rank(256) WITHIN GROUP (ORDER BY x)) *
  (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
  FROM test_nested_t
  """
  table_result = list(pipeline.execute(tq))

  cq = """
  SELECT combine(whoa) FROM test_nested
  """
  cv_result = list(pipeline.execute(cq))

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert abs(tr[0] - cr[0]) < 0.0001
def test_hypothetical_set_aggs(pipeline, clean_db):
  """
  Verify that combines work properly on HS aggs
  """
  q = """
  SELECT x::integer % 10 AS k,
  rank(256) WITHIN GROUP (ORDER BY x), dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM stream0 GROUP BY k
  """
  desc = ('x', 'y')
  pipeline.create_stream('stream0', x='int', y='float8')
  pipeline.create_cv('test_hs_aggs', q)
  pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

  rows = []
  for n in range(10000):
    row = (random.randint(0, 1000), random.random())
    rows.append(row)

  pipeline.insert('stream0', desc, rows)
  pipeline.insert('test_hs_aggs_t', desc, rows)

  # Note that the CQ will use the combinable variant of dense_rank,
  # so use that on the table too
  tq = """
  SELECT rank(256) WITHIN GROUP (ORDER BY x), combinable_dense_rank(256, x)
  FROM test_hs_aggs_t
  """
  table_result = pipeline.execute(tq)

  cq = """
  SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
  """
  cv_result = pipeline.execute(cq)

  assert len(table_result) == len(cv_result)

  for tr, cr in zip(table_result, cv_result):
    assert tr[0] == cr[0]
    assert tr[1] == cr[1]
def test_combine(pipeline, clean_db):
  """
  Verify that partial tuples are combined with on-disk tuples
  """
  pipeline.create_cv('combine', 'SELECT key::text, COUNT(*) FROM stream GROUP BY key')

  rows = []
  for n in range(100):
    for m in range(100):
      key = '%d%d' % (n % 10, m)
      rows.append((key, 0))

  pipeline.insert('stream', ('key', 'unused'), rows)

  total = 0
  result = pipeline.execute('SELECT * FROM combine')
  for row in result:
    total += row['count']

  assert total == 10000