def test_online_add_column(pipeline, clean_db):
    """
    Verify that we can add columns to a stream without affecting running CQs
    """
    pipeline.create_stream("stream0", c0="integer")

    cols = ["c0"]
    for i in range(5):
        if i:
            # Add the next column online, then stand up a CV that reads it
            pipeline.execute("ALTER STREAM stream0 ADD c%d integer" % i)
            cols.append("c%d" % i)

        pipeline.create_cv("cv%d" % i, "SELECT %s FROM stream0" % ", ".join(cols))

        lo, hi = i * 1000, (i + 1) * 1000
        pipeline.insert("stream0", tuple(cols), [(n,) * len(cols) for n in range(lo, hi)])

        where = " WHERE c%d >= %d" % (i, lo) if i else ""
        result = list(pipeline.execute("SELECT * FROM cv%d%s" % (i, where)))

        assert len(result) == 1000

        for row in result:
            for col in row:
                assert col is not None
def test_prepared_extended(pipeline, clean_db):
  """
  Verify that we can write to streams using the extended protocol. This test
  shells out to a binary because psycopg2 doesn't use the extended protocol.
  """
  q = """
  SELECT COUNT(x::integer) AS x, COUNT(y::integer) AS y, COUNT(z::integer) AS z FROM extended_stream
  """
  pipeline.create_cv('test_prepared_extended', q)

  # This inserts 1000 rows via a parameterized INSERT and 1000 via an unparameterized INSERT
  cmd = ['./extended', 'pipeline', str(pipeline.port), 'extended_stream', '1000']

  stdout, stderr = subprocess.Popen(cmd).communicate()

  assert stdout is None
  assert stderr is None

  rows = list(pipeline.execute('SELECT x, y, z FROM test_prepared_extended'))
  assert len(rows) == 1

  result = rows[0]

  assert result['x'] == 2000
  assert result['y'] == 2000
  assert result['z'] == 2000
def test_postmaster_worker_recovery(pipeline, clean_db):
  """
  Verify that the postmaster only restarts crashed worker processes, and does
  not try to start processes that the continuous query scheduler is responsible for.
  """
  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'worker\'').first()
  expected_workers = result['count']

  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'combiner\'').first()
  expected_combiners = result['count']

  q = 'SELECT COUNT(*) FROM stream'
  pipeline.create_cv('test_pm_recovery', q)
  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  def backend():
    try:
      # Just keep a long-running backend connection open
      client = pipeline.engine.connect()
      client.execute('SELECT pg_sleep(10000)')
    except:
      pass

  t = threading.Thread(target=backend)
  t.start()

  attempts = 0
  result = None
  backend_pid = 0

  while not result and attempts < 10:
    result = pipeline.execute("""SELECT pid, query FROM pg_stat_activity WHERE lower(query) LIKE '%%pg_sleep%%'""").first()
    time.sleep(1)
    attempts += 1

  assert result

  backend_pid = result['pid']
  os.kill(backend_pid, signal.SIGKILL)

  attempts = 0
  pipeline.conn = None

  while attempts < 15:
    try:
      pipeline.conn = pipeline.engine.connect()
      break
    except Exception:
      time.sleep(1)
    attempts += 1

  assert pipeline.conn

  # Now verify that we have the correct number of CQ worker procs
  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'worker\'').first()
  assert result['count'] == expected_workers

  result = pipeline.execute('SELECT COUNT(*) FROM pipeline_proc_stats WHERE type = \'combiner\'').first()
  assert result['count'] == expected_combiners
def _test_agg(pipeline, agg, check_fn=None):
    name = agg[:agg.find('(')]
    q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
    cv_name = 'test_%s' % name
    table_name = 'test_%s_t' % name
    desc = ('ts', 'g', 'x', 'y', 'z')

    pipeline.create_cv(cv_name, q % (agg, 'stream'))
    pipeline.create_table(table_name, ts='timestamp', x='integer', y='integer', z='integer', g='integer')

    rows = []
    for n in range(1000):
        ts = str(datetime.utcnow() + timedelta(seconds=n))
        row = ts, n % 10, random.randint(1, 256), random.randint(1, 256), random.randint(1, 256)
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert(table_name, desc, rows)

    if check_fn:
        return check_fn(pipeline)

    expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
    result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

    assert len(expected) == len(result)

    for e, r in zip(expected, result):
        assert e == r

    pipeline.drop_cv(cv_name)
    pipeline.drop_table(table_name)
def test_bloom_contains(pipeline, clean_db):
    """
  Verify that bloom_contains works
  """
    pipeline.create_stream("test_bloom_stream", x="int")

    q = """
  SELECT bloom_agg(x::integer) FROM test_bloom_stream
  """

    desc = "x"
    pipeline.create_cv("test_bloom_contains", q)

    rows = []
    for i in range(10000):
        rows.append((2 * i,))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_contains(bloom_agg, 0), bloom_contains(bloom_agg, 5000),
  bloom_contains(bloom_agg, 1), bloom_contains(bloom_agg, 5001)
  FROM test_bloom_contains
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1
    result = result[0]
    assert result[0] == True
    assert result[1] == True
    assert result[2] == False
    assert result[3] == False
def test_concurrent_vacuum_full(pipeline, clean_db):
  pipeline.create_cv(
    'test_vacuum_full',
    'SELECT x::int, COUNT(*) FROM test_vacuum_stream GROUP BY x')
  stop = False

  def insert():
    while not stop:
      values = [(random.randint(0, 1000000), ) for _ in range(1000)]
      pipeline.insert('test_vacuum_stream', ('x', ), values)
      time.sleep(0.01)

  threads = [threading.Thread(target=insert) for _ in range(4)]
  for t in threads:
    t.start()

  # Insert data for a little bit so we have enough work to do while
  # vacuuming.
  time.sleep(20)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('VACUUM FULL test_vacuum_full')
  conn.close()

  # Now kill the insert threads.
  stop = True
  for t in threads:
    t.join()
def test_bloom_agg_hashing(pipeline, clean_db):
    """
  Verify that bloom_agg correctly hashes different input types
  """
    pipeline.create_stream("test_bloom_stream", x="int", y="text", z="float8")

    q = """
  SELECT bloom_agg(x::integer) AS i,
  bloom_agg(y::text) AS t,
  bloom_agg(z::float8) AS f FROM test_bloom_stream
  """
    desc = ("x", "y", "z")
    pipeline.create_cv("test_bloom_hashing", q)

    rows = []
    for n in range(10000):
        rows.append((n, "%d" % n, float(n)))
        rows.append((n, "%05d" % n, float(n)))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_cardinality(i),
  bloom_cardinality(t), bloom_cardinality(f) FROM test_bloom_hashing
  """
    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 8879
    assert result[1] == 15614
    assert result[2] == 8855
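
# Note on the asserted values: they're deterministic Bloom-filter estimates,
# not exact counts. With m bits, k hash functions and X bits set, the standard
# cardinality estimate is n ~= -(m/k) * ln(1 - X/m), which undershoots as the
# filter saturates -- hence ~8879 for 10000 distinct integers and ~15614 for
# 20000 distinct strings under PipelineDB's default m and k.
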
def test_simple_crash(pipeline, clean_db):
  """
  Test simple worker and combiner crashes.
  """
  q = 'SELECT COUNT(*) FROM stream'
  pipeline.create_cv('test_simple_crash', q)

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 2

  # We can potentially lose one batch on a worker or combiner crash.
  # Each batch adds a count of 2, and since we're adding 3 batches we should
  # see an increment of either 4 or 6 from the previous count.
  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  assert kill_worker()

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 6

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  assert kill_combiner()

  pipeline.insert('stream', ['x'], [(1, ), (1, )])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 10
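
# kill_worker/kill_combiner come from the crash-testing harness and aren't
# shown here. A sketch of the assumed behavior -- SIGKILL one continuous query
# process of the given type -- noting that the real helpers take no arguments
# and presumably close over a module-level pipeline handle,
# e.g. kill_worker() ~ _kill_cq_proc(pipeline, 'worker'):
import os
import signal

def _kill_cq_proc(pipeline, proc_type):
    row = pipeline.execute(
        "SELECT pid FROM pipeline_proc_stats WHERE type = '%s'" % proc_type).first()
    if not row:
        return False
    os.kill(row['pid'], signal.SIGKILL)
    return True
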
def test_simple_aggs(pipeline, clean_db):
    """
    Verify that combines work properly on simple aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_simple_aggs', q)
    pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_simple_aggs_t', desc, rows)

    table_result = list(pipeline.execute('SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t'))
    cv_result = list(pipeline.execute('SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
        assert abs(tr[2] - cr[2]) < 0.00001
def test_join_across_batches(pipeline, clean_db):
    """
    Verify that stream-table joins are properly built when they
    span across multiple input batches
    """
    num_cols = 4
    join_cols = [0]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('batch', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM batch JOIN stream s ON batch.col0 = s.col0
    """

    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'batch', t, 0.1)

    s = _generate_rows(num_cols, 64)
    pipeline.create_cv('test_batched_join', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join').first()

    assert result['count'] == len(expected)
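
# These join tests rely on module-level helpers that aren't shown on this page.
# A minimal sketch of plausible implementations, inferred from the call sites
# (value ranges and signatures here are assumptions, not the originals):
import random
import time

def _generate_row(num_cols):
    return tuple(random.randint(0, 16) for _ in range(num_cols))

def _generate_rows(num_cols, num_rows):
    return [_generate_row(num_cols) for _ in range(num_rows)]

def _insert(pipeline, target, rows, sleep=0):
    # Columns are always named col0..colN in these tests
    desc = ['col%d' % n for n in range(len(rows[0]))]
    pipeline.insert(target, desc, rows)
    if sleep:
        time.sleep(sleep)

def _join(left, right, cols):
    # Nested-loop equijoin on the given column indexes; used to compute the
    # expected result set that the continuous view should converge to
    out = []
    for l in left:
        for r in right:
            if all(l[c] == r[c] for c in cols):
                out.append(l + r)
    return out
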
def test_combine_in_view(pipeline, clean_db):
    """
    Verify that combines in views on top of continuous views work
    """
    q = """
    SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_combine_view', q)
    pipeline.execute('CREATE VIEW v AS SELECT combine(avg) FROM test_combine_view')

    rows = []
    for n in range(10000):
        rows.append((random.randint(1, 256), random.randint(1, 1024)))

    pipeline.insert('stream0', desc, rows)

    view = list(pipeline.execute('SELECT * FROM v'))

    assert len(view) == 1

    expected = sum(r[1] for r in rows) / float(len(rows))

    assert abs(float(view[0][0]) - expected) < 0.00001

    pipeline.execute('DROP VIEW v')
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream('stream', x='int', y='int', z='int')
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM stream
    GROUP BY x, y, z;
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_null_groups', q)
    pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

    rows = []
    for n in range(10000):
        vals = [random.randint(0, 10) for _ in range(3)]
        # Null out each value with 10% probability
        vals = [n if random.random() > 0.1 else None for n in vals]
        rows.append(tuple(vals))

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_null_groups_t', desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(pipeline.execute('SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

    for r, e in zip(result, expected):
        assert r == e
def assert_result_changes(func, args):
    """
    Verifies that the result of the given function changes with time
    """
    pipeline.create_stream('stream', x='int', y='text', z='int')
    name = 'assert_%s_decreases' % func
    pipeline.create_cv(name,
                       "SELECT %s(%s) FROM stream WHERE arrival_timestamp > clock_timestamp() - interval '2 seconds'" % (func, args))

    rows = [(n, str(n), n + 1) for n in range(1000)]
    pipeline.insert('stream', ('x', 'y', 'z'), rows)

    current = 1

    results = []
    while current:
        row = pipeline.execute('SELECT * FROM %s' % name).first()
        current = row[func]
        if current is None:
            break
        results.append(current)

    # Verify that we actually read something
    assert results

    pipeline.drop_cv(name)
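
# A sliding-window test would then exercise the helper along these lines
# (hypothetical caller; as written above, the helper closes over the
# module-level pipeline fixture):
def test_count_expires(pipeline, clean_db):
    assert_result_changes('count', '*')
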
def test_incremental_join(pipeline, clean_db):
    """
    Verify that join results increase appropriately as we incrementally
    add stream events to the input
    """
    num_cols = 4
    join_cols = [0, 1]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('inc', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM inc JOIN stream s ON inc.col0 = s.col0
    AND inc.col1 = s.col1::integer
    """
    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'inc', t, 0.1)

    pipeline.create_cv('test_join', q)
    s = []
    for n in range(2):
        row = _generate_row(num_cols)
        _insert(pipeline, 'stream', [row])
        s.append(row)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join').first()

    assert result['count'] == len(expected)
def test_user_low_and_high_card(pipeline, clean_db):
    """
    Verify that Bloom filters with low and high cardinalities are correctly
    unioned
    """
    q = """
    SELECT k::integer, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_cv('test_bloom_agg', q)

    # Low cardinalities
    rows = []
    for n in range(1000):
        rows.append((0, random.choice((-1, -2))))
        rows.append((1, random.choice((-3, -4))))

    # High cardinalities
    for n in range(10000):
        rows.append((2, n))
        rows.append((3, n))

    pipeline.insert('test_bloom_stream', desc, rows)

    result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                              'FROM test_bloom_agg WHERE k in (0, 1)').first()
    assert result[0] == 4

    result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                              'FROM test_bloom_agg WHERE k in (2, 3)').first()
    assert result[0] == 8879

    result = pipeline.execute('SELECT bloom_cardinality(combine(bloom_agg)) '
                              'FROM test_bloom_agg').first()
    assert result[0] == 8881
def test_bloom_intersection(pipeline, clean_db):
    """
  Verify that bloom_intersection works
  """
    pipeline.create_stream("test_bloom_stream", x="int", k="int")

    q = """
  SELECT k::int, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """

    desc = ("k", "x")
    pipeline.create_cv("test_bloom_intersection", q)

    rows = []
    for i in range(10000):
        rows.append((0, 2 * i))
        rows.append((1, i))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_cardinality(bloom_intersection_agg(bloom_agg))
  FROM test_bloom_intersection
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 5530
def test_hll_agg_hashing(pipeline, clean_db):
    """
    Verify that hll_agg correctly hashes different input types
    """
    pipeline.create_stream('test_hll_stream', x='int', y='text', z='float8')
    q = """
    SELECT hll_agg(x::integer) AS i,
    hll_agg(y::text) AS t,
    hll_agg(z::float8) AS f FROM test_hll_stream
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_hll_hashing', q)

    rows = []
    for n in range(10000):
        rows.append((n, '%d' % n, float(n)))
        rows.append((n, '%05d' % n, float(n)))

    pipeline.insert('test_hll_stream', desc, rows)

    cvq = """
    SELECT hll_cardinality(i),
    hll_cardinality(t), hll_cardinality(f) FROM test_hll_hashing
    """
    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 9976
    assert result[1] == 19951
    assert result[2] == 10062
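
# As with the Bloom tests, these asserted values are deterministic estimator
# outputs rather than exact counts: HyperLogLog's typical relative error is
# about 1.04/sqrt(m) for m registers, which is consistent with ~10000 distinct
# values estimating to 9976/10062 and ~20000 to 19951.
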
def test_single_continuous_view(pipeline, clean_db):
    """
  Verify that specific continuous views can be dropped and restored
  """
    pipeline.create_cv("test_single0", "SELECT COUNT(*) FROM stream")
    pipeline.create_cv("test_single1", "SELECT COUNT(*) FROM stream")
    pipeline.insert("stream", ("x",), [(x,) for x in range(10)])

    result = pipeline.execute("SELECT count FROM test_single0").first()
    assert result["count"] == 10

    result = pipeline.execute("SELECT count FROM test_single1").first()
    assert result["count"] == 10

    _dump(pipeline, "test_single.sql", cv_name="test_single0")

    pipeline.drop_all_views()
    _restore(pipeline, "test_single.sql")

    result = pipeline.execute("SELECT count FROM test_single0").first()
    assert result["count"] == 10

    # We didn't dump this one
    result = list(pipeline.execute("SELECT * FROM pg_class WHERE relname LIKE '%%test_single1%%'"))
    assert not result
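
# The dump/restore tests on this page use two helpers that aren't shown.
# A minimal sketch, assuming they shell out to the stock pg_dump/psql binaries
# (the real helpers may use PipelineDB-specific tooling; the flags below are
# standard pg_dump options and the 'pipeline' database name matches the
# connection strings used elsewhere on this page):
import os
import subprocess

def _dump(pipeline, path, schema_only=False, data_only=False, tables=(), cv_name=None):
    cmd = ['pg_dump', '-p', str(pipeline.port), '-d', 'pipeline', '-f', path]
    if schema_only:
        cmd.append('--schema-only')
    if data_only:
        cmd.append('--data-only')
    for t in tables:
        cmd.extend(['-t', t])
    if cv_name:
        # Dumping a CV means dumping its backing matrel too
        cmd.extend(['-t', cv_name, '-t', '%s_mrel' % cv_name])
    subprocess.check_call(cmd)

def _restore(pipeline, path):
    subprocess.check_call(['psql', '-p', str(pipeline.port), '-d', 'pipeline', '-f', path])
    os.remove(path)
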
def test_copy_to_typed_stream(pipeline, clean_db):
    """
    Verify that copying data from a file into a typed stream works.
    """
    pipeline.create_stream('stream', x='integer', y='float8', z='numeric')

    q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
    pipeline.create_cv('test_copy_to_typed_stream', q)
    pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute('COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute('SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
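
# _generate_csv isn't defined on this page; a plausible minimal version that
# matches the COPY ... HEADER CSV statements used in these tests:
import csv

def _generate_csv(path, rows, desc=None):
    with open(path, 'w') as f:
        writer = csv.writer(f)
        if desc:
            writer.writerow(desc)  # header line consumed by COPY ... HEADER
        writer.writerows(rows)
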
def test_cq_stats(pipeline, clean_db):
    """
    Verify that CQ statistics collection works
    """
    num_combiners = int(
        pipeline.execute("SHOW continuous_query_num_combiners").first()["continuous_query_num_combiners"]
    )
    num_workers = int(pipeline.execute("SHOW continuous_query_num_workers").first()["continuous_query_num_workers"])

    # 10 rows
    q = "SELECT x::integer %% 10 AS g, COUNT(*) FROM stream GROUP BY g"
    pipeline.create_cv("test_10_groups", q)

    # 1 row
    q = "SELECT COUNT(*) FROM stream"
    pipeline.create_cv("test_1_group", q)

    values = [(random.randint(1, 1024),) for n in range(1000)]

    pipeline.insert("stream", ("x",), values)
    pipeline.insert("stream", ("x",), values)
    # Sleep a little so that the next time we insert, we force the stats collector.
    time.sleep(0.5)
    pipeline.insert("stream", ("x",), values)
    pipeline.insert("stream", ("x",), values)

    # Sleep a little so the stats collector flushes all the stats.
    time.sleep(0.5)

    proc_result = pipeline.execute("SELECT * FROM pipeline_proc_stats")
    cq_result = pipeline.execute("SELECT * FROM pipeline_query_stats")

    proc_rows = len(list(proc_result))
    cq_rows = len(list(cq_result))
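
    # pipeline_proc_stats has one row per process (presumably the scheduler
    # plus each combiner and worker), and pipeline_query_stats has one row per
    # (CQ, proc type) pair: 2 CQs x {worker, combiner} = 4 rows.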

    assert proc_rows == 1 + num_combiners + num_workers
    assert cq_rows == 4

    # Sleeping only forces stats collection for the first CQ, so we're not guaranteed to have
    # seen all stats flushed for the 10-group view. The flushed input-row count can land
    # anywhere between the first two inserts and all four.
    result = pipeline.execute(
        "SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'worker'"
    ).first()
    assert result["input_rows"] >= 2000
    assert result["input_rows"] <= 4000

    result = pipeline.execute(
        "SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'combiner'"
    ).first()
    assert result["output_rows"] == 10

    result = pipeline.execute(
        "SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'worker'"
    ).first()
    assert result["input_rows"] == 4000

    result = pipeline.execute(
        "SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'combiner'"
    ).first()
    assert result["output_rows"] == 1
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of the stream's columns
    """
    q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream'
    pipeline.create_cv('test_copy_subset', q)
    pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute('COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute('SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t').first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_join_with_where(pipeline, clean_db):
    """
    Verify that stream-table joins using a WHERE clause work properly
    """
    num_cols = 4
    q = """
    SELECT s.col0::integer FROM stream s, wt WHERE s.col0 = 1 AND wt.col0 = 1
    """
    wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('wt', **wt_cols)
    pipeline.create_table('wt_s', **wt_cols)

    wt = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 'wt', wt, 0.1)
    _insert(pipeline, 'wt_s', s, 0.1)

    pipeline.create_stream('stream', **wt_cols)
    pipeline.create_cv('test_join_where', q)
    _insert(pipeline, 'stream', s)

    expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1').first()
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_where').first()

    assert result['count'] == expected['count']
def test_activate_deactivate(pipeline, clean_db):
  pipeline.create_cv('v', 'SELECT count(*) FROM stream')
  pipeline.insert('stream', ('x', ), [(1, )])

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' % (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

  cur = conn.cursor()
  cur.execute('DEACTIVATE')
  cur.close()

  try:
    pipeline.insert('stream', ('x', ), [(1, )])
    assert False
  except:
    pass

  cur = conn.cursor()
  cur.execute('ACTIVATE')
  cur.close()
  conn.close()

  pipeline.insert('stream', ('x', ), [(1, )])
  count = pipeline.execute('SELECT * FROM v').first()['count']
  assert count == 2
def test_single_continuous_view(pipeline, clean_db):
  """
  Verify that specific continuous views can be dropped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('test_single0', 'SELECT COUNT(*) FROM stream0')
  pipeline.create_cv('test_single1', 'SELECT COUNT(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  result = pipeline.execute('SELECT count FROM test_single1').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_single.sql', tables=['test_single0', 'stream0', 'test_single0_mrel'])

  pipeline.drop_all()
  _restore(pipeline, 'test_single.sql')

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  # We didn't dump this one
  result = list(pipeline.execute('SELECT * FROM pg_class WHERE relname LIKE \'%%test_single1%%\''))
  assert not result
def test_user_low_and_high_card(pipeline, clean_db):
    """
    Verify that HLLs with low and high cardinalities are correctly combined
    """
    pipeline.create_stream('test_hll_stream', x='int', k='integer')
    q = """
    SELECT k::integer, hll_agg(x::integer) FROM test_hll_stream GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_cv('test_hll_agg', q)

    # Low cardinalities
    rows = []
    for n in range(1000):
        rows.append((0, random.choice((-1, -2))))
        rows.append((1, random.choice((-3, -4))))

    # High cardinalities
    for n in range(10000):
        rows.append((2, n))
        rows.append((3, n))

    pipeline.insert('test_hll_stream', desc, rows)

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (0, 1)').first()
    assert result[0] == 4

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (2, 3)').first()
    assert result[0] == 9976

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg').first()
    assert result[0] == 9983
def test_join_multiple_tables(pipeline, clean_db):
    """
    Verify that stream-table joins involving multiple tables work
    """
    num_cols = 8
    join_cols = [0]
    t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('t0', **t0_cols)
    pipeline.create_table('t1', **t1_cols)
    pipeline.create_stream('stream', **t0_cols)
    q = """
    SELECT s.col0::integer FROM t0 JOIN t1 ON t0.col0 = t1.col0
    JOIN stream s ON t1.col0 = s.col0
    """

    t0 = _generate_rows(num_cols, 64)
    t1 = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 't1', t1, 0.1)
    _insert(pipeline, 't0', t0, 0.1)

    pipeline.create_cv('test_join_multi', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t0, _join(s, t1, join_cols), join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

    assert result['count'] == len(expected)
def test_indexed(pipeline, clean_db):
    """
    Verify that stream-table joins involving indexed tables work
    """
    pipeline.create_stream('stream', x='int', y='int')
    q = """
    SELECT stream.x::integer, count(*) FROM stream
    JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
    """
    pipeline.create_table('test_indexed_t', x='integer', y='integer')
    pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

    t = _generate_rows(2, 1000)
    s = _generate_rows(2, 1000)

    pipeline.insert('test_indexed_t', ('x', 'y'), t)
    time.sleep(0.1)

    pipeline.create_cv('test_indexed', q)
    pipeline.insert('stream', ('x', 'y'), s)

    expected = _join(s, t, [0])
    result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

    assert result['sum'] == len(expected)
def test_multi_client(pipeline, clean_db):
  """
  Regression test for multiple concurrent alert clients.
  """

  TRIGGER_OUTPUT_LOGFILE = '/tmp/.pipelinedb_pipeline_test.log'
  pipeline.create_cv('cv0', 'SELECT x::integer,count(*) FROM stream group by x')

  conn_str = pipeline.get_conn_string()

  pipeline.create_cv_trigger('t0', 'cv0', 'true', 'pipeline_send_alert_new_row')

  # recv_alerts client needs pipeline on its path

  client_env = os.environ.copy()
  client_env["PATH"] = client_env["PATH"] + ":" + pipeline.get_bin_dir()

  cmd = [pipeline.get_recv_alerts(), '-d', conn_str, '-a', 'cv0.t0']
  time.sleep(2)

  # Capture the clients' alert output in the trigger logfile
  outfile = open(TRIGGER_OUTPUT_LOGFILE, 'w')
  client1 = subprocess.Popen(cmd, env=client_env, stdout=outfile)
  client2 = subprocess.Popen(cmd, env=client_env, stdout=outfile)

  time.sleep(4)

  client1.terminate()
  client2.terminate()
  outfile.close()
def test_simple_insert(pipeline, clean_db):
    """
  Verify that we can insert some rows and count some stuff
  """
    pipeline.create_stream('stream0', key='int')
    pipeline.create_cv(
        'cv', 'SELECT key::integer, COUNT(*) FROM stream0 GROUP BY key')

    rows = [(n % 10, ) for n in range(1000)]

    pipeline.insert('stream0', ('key', ), rows)

    result = list(pipeline.execute('SELECT key, count FROM cv ORDER BY key'))

    assert len(result) == 10
    for i, row in enumerate(result):
        assert row['key'] == i
        assert row['count'] == 100
def test_static_streams(pipeline, clean_db):
    """
  Verify that static stream definitions are dumped and restored
  """
    pipeline.create_stream('static', x='int', y='float8')

    _dump(pipeline, 'test_static.sql')

    pipeline.drop_stream('static')
    _restore(pipeline, 'test_static.sql')

    # Force the requirement of a static stream
    pipeline.create_cv('static_cv', 'SELECT x, y FROM static')
    pipeline.insert('static', ('x', 'y'), [(0, 1)])

    result = pipeline.execute('SELECT x, y FROM static_cv').first()
    assert result['x'] == 0
    assert result['y'] == 1
def test_schema_only(pipeline, clean_db):
  """
  Verify that it is possible to only dump continuous view schemas and not data
  """
  pipeline.create_cv('test_schema', 'SELECT COUNT(*) FROM stream')
  pipeline.insert('stream', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_schema').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_schema.sql', schema_only=True)

  pipeline.drop_all_views()
  _restore(pipeline, 'test_schema.sql')

  # No data loaded
  result = list(pipeline.execute('SELECT count FROM test_schema'))
  assert not result
def test_cksketch_frequency(pipeline, clean_db):
    pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

    q = """
  SELECT k::integer, freq_agg(x::int) AS c FROM test_cmsketch_stream
  GROUP BY k
  """
    desc = ('k', 'x')
    pipeline.create_cv('test_cmsketch_frequency', q)

    rows = [(n, None) for n in range(100)]
    pipeline.insert('test_cmsketch_stream', desc, rows)
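    # Aggregates skip NULL inputs, so each group's count-min sketch stays
    # empty and every frequency lookup below should report 0.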

    result = pipeline.execute(
        'SELECT freq(c, null) AS x FROM test_cmsketch_frequency ORDER BY k')
    assert len(result) == 100
    for row in result:
        assert row[0] == 0
def test_nested_transforms(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  pipeline.create_stream('stream2', x='int')
  pipeline.create_stream('stream4', x='int')

  pipeline.create_cv('cv0', 'SELECT count(*) FROM stream4')
  pipeline.create_cv('cv1', 'SELECT count(*) FROM stream2')
  pipeline.create_ct('ct0', 'SELECT x::int FROM stream2 WHERE mod(x, 4) = 0',
             "pipelinedb.insert_into_stream('stream4')")
  pipeline.create_ct('ct1', 'SELECT x::int FROM stream0 WHERE mod(x, 2) = 0',
             "pipelinedb.insert_into_stream('stream2')")

  pipeline.insert('stream0', ('x',), [(n,) for n in range(1000)])

  count = pipeline.execute('SELECT count FROM cv0')[0]['count']
  assert count == 250
  count = pipeline.execute('SELECT count FROM cv1')[0]['count']
  assert count == 500
def test_deadlock_regress(pipeline, clean_db):
    nitems = 2000000
    tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.json')
    query = 'SELECT generate_series(1, %d) AS n' % nitems
    pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

    pipeline.create_stream('s1', n='int')
    pipeline.create_stream('s2', n='int')
    pipeline.create_ct('ct', 'SELECT n FROM s1 WHERE n IS NOT NULL',
                       "pipeline_stream_insert('s2')")
    pipeline.create_cv('cv', 'SELECT count(*) FROM s2')

    for copy in [True, False]:
        for nworkers in [1, 4]:
            for sync in ['off', 'on']:
                pipeline.stop()
                pipeline.run({
                    'continuous_query_num_workers': nworkers,
                    'synchronous_stream_insert': sync
                })

                pipeline.execute('TRUNCATE CONTINUOUS VIEW cv')
                pipeline.execute('COMMIT')

                if copy:
                    pipeline.execute("COPY s1 (n) FROM '%s'" % tmp_file)
                else:
                    pipeline.execute('INSERT INTO s1 (n) %s' % query)

                count = dict(
                    pipeline.execute('SELECT count FROM cv').first() or {})
                ntries = 5
                while count.get('count') != nitems and ntries > 0:
                    assert sync == 'off'
                    time.sleep(1)
                    count = dict(
                        pipeline.execute('SELECT count FROM cv').first() or {})
                    ntries -= 1
                assert count and count['count'] == nitems

    os.remove(tmp_file)

    pipeline.stop()
    pipeline.run()
def test_consume_text(pipeline, kafka, clean_db):
    """
  Interpret consumed messages as text
  """
    pipeline.create_stream('comma_stream',
                           x='integer',
                           y='integer',
                           z='integer')
    pipeline.create_cv('comma_cv', 'SELECT x, y, z FROM comma_stream')
    kafka.create_topic('test_consume_text_comma')

    pipeline.create_stream('tab_stream', x='integer', y='integer', z='integer')
    pipeline.create_cv('tab_cv', 'SELECT x, y, z FROM tab_stream')
    kafka.create_topic('test_consume_text_tab')

    pipeline.consume_begin('test_consume_text_comma',
                           'comma_stream',
                           delimiter=',')
    pipeline.consume_begin('test_consume_text_tab',
                           'tab_stream',
                           delimiter='\t')

    producer = kafka.get_producer('test_consume_text_comma')
    for n in range(100):
        message = ','.join(map(str, [n, n, n]))
        producer.produce(message)

    producer = kafka.get_producer('test_consume_text_tab')
    for n in range(100):
        message = '\t'.join(map(str, [n, n, n]))
        producer.produce(message)

    def delimited_messages():
        rows = pipeline.execute('SELECT * from comma_cv ORDER BY x')
        assert len(rows) == 100
        for i, row in enumerate(rows):
            assert tuple([row[0], row[1], row[2]]) == (i, i, i)

        rows = pipeline.execute('SELECT * from tab_cv ORDER BY x')
        assert len(rows) == 100
        for i, row in enumerate(rows):
            assert tuple([row[0], row[1], row[2]]) == (i, i, i)

    assert eventually(delimited_messages)
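
# `eventually` is a retry helper from the test harness that isn't shown on
# this page. A minimal sketch of the assumed semantics -- re-run the assertion
# function until it stops raising, returning truthily on success:
import time

def eventually(fn, attempts=30, delay=1):
    for _ in range(attempts):
        try:
            fn()
            return True
        except AssertionError:
            time.sleep(delay)
    return False
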
def test_sw_trigger_sync(pipeline, clean_db):
    """
  Set up a sliding-window query and insert data into it before any triggers
  are added. Then add a trigger and insert some more data.

  Verify that the counts equal the pre-creation plus post-creation amounts
  """
    pipeline.create_cv(
        'cv0',
        'SELECT x::integer,count(*) FROM stream where (arrival_timestamp > clock_timestamp() - interval \'10 seconds\') group by x;',
        step_factor='10')

    rows = [(n % 10, ) for n in range(1000)]
    pipeline.insert('stream', ('x', ), rows)

    # sleep to make sure that new inserts are in a new arrival_ts group

    time.sleep(4)

    pipeline.create_cv_trigger('t0', 'cv0', 'true',
                               'pipeline_test_alert_new_row')
    time.sleep(1)

    rows = [(n % 10, ) for n in range(10)]
    pipeline.insert('stream', ('x', ), rows)

    time.sleep(1)
    lines = pipeline.read_trigger_output()
    d = {}

    for l in lines:
        assert (len(l) == 2)

        k = int(l[0])
        v = int(l[1])

        assert (k >= 0 and k <= 9)
        d[k] = v

    assert (len(d) == 10)

    for x in d:
        assert (d[x] == 101)
def test_tablespace(pipeline, clean_db):
    """
  Verify that CVs can be created within tablespaces
  """
    path = os.path.abspath('test_tablespace')
    if os.path.exists(path):
        shutil.rmtree(path)

    os.mkdir(path)
    pipeline.execute("CREATE TABLESPACE test_tablespace LOCATION '%s'" % path)

    pipeline.create_stream('test_tablespace_s', x='int')

    q = 'SELECT x % 10 AS g, count(DISTINCT x) FROM test_tablespace_s GROUP BY g'
    pipeline.create_cv('test_tablespace0', q)
    pipeline.create_cv('test_tablespace1', q, tablespace='test_tablespace')
    pipeline.insert('test_tablespace_s', ('x', ),
                    [(x, ) for x in range(10000)])

    result0 = pipeline.execute('SELECT count(*) FROM test_tablespace0')
    result1 = pipeline.execute('SELECT count(*) FROM test_tablespace1')

    assert len(result0) == 1
    assert len(result1) == 1
    assert result0[0]['count'] == result1[0]['count']

    result0 = pipeline.execute('SELECT combine(count) FROM test_tablespace0')
    result1 = pipeline.execute('SELECT combine(count) FROM test_tablespace1')

    assert len(result0) == 1
    assert len(result1) == 1
    assert result0[0]['combine'] == result1[0]['combine']

    # Now verify that test_tablespace1 is physically in the tablespace
    row = pipeline.execute(
        "SELECT oid FROM pg_class WHERE relname = 'test_tablespace1_mrel'")
    oid = row[0]['oid']

    found = glob.glob('test_tablespace/*/*/%d' % oid)
    assert len(found) == 1

    pipeline.drop_all()
    pipeline.execute('DROP TABLESPACE test_tablespace')
    shutil.rmtree(path)
def test_schema_inference(pipeline, clean_db):
    """
  Verify that types are properly inferred
  """
    pipeline.create_cv(
        'test_infer0',
        'SELECT x::int8, y::bigint, COUNT(*) FROM infer_stream GROUP BY x, y')
    pipeline.create_cv(
        'test_infer1',
        'SELECT x::int4, y::real, COUNT(*) FROM infer_stream GROUP BY x, y')
    pipeline.create_cv(
        'test_infer2',
        'SELECT x::int2, y::integer, COUNT(*) FROM infer_stream GROUP BY x, y')
    pipeline.create_cv(
        'test_infer3',
        'SELECT x::numeric, y::float8, COUNT(*) FROM infer_stream GROUP BY x, y'
    )
    pipeline.create_cv(
        'test_infer4',
        'SELECT x::int8, y::bigint, COUNT(*) FROM infer_stream GROUP BY x, y')
    desc = ('x', 'y')
    rows = []

    for n in range(10000):
        rows.append(
            (random.random() + 1, random.random() * random.randint(0, 128)))

    pipeline.insert('infer_stream', desc, rows)

    result = pipeline.execute('SELECT * FROM test_infer0 ORDER BY x')
    for row in result:
        assert row['count']

    result = pipeline.execute('SELECT * FROM test_infer1 ORDER BY x')
    for row in result:
        assert row['count']

    result = pipeline.execute('SELECT * FROM test_infer2 ORDER BY x')
    for row in result:
        assert row['count']

    result = pipeline.execute('SELECT * FROM test_infer3 ORDER BY x')
    for row in result:
        assert row['count']
def test_tdigest_agg(pipeline, clean_db):
    """
    Test tdigest_agg, tdigest_merge_agg, tdigest_cdf, tdigest_quantile
    """
    q = """
    SELECT k::integer, tdigest_agg(x::int) AS t FROM test_tdigest_stream
    GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_stream('test_tdigest_stream', k='int', x='int')
    pipeline.create_cv('test_tdigest_agg', q)

    rows = []
    for _ in range(10):
      for n in range(1000):
        rows.append((0, n))
        rows.append((1, n + 500))

    pipeline.insert('test_tdigest_stream', desc, rows)

    result = list(pipeline.execute(
      'SELECT tdigest_quantile(t, 0.1) FROM test_tdigest_agg ORDER BY k')
                  .fetchall())
    assert len(result) == 2
    assert abs(int(result[0]['tdigest_quantile']) - 99) <= 1
    assert abs(int(result[1]['tdigest_quantile']) - 599) <= 1

    result = list(pipeline.execute(
      'SELECT tdigest_quantile(combine(t), 0.1) FROM test_tdigest_agg')
                  .fetchall())
    assert len(result) == 1
    assert abs(int(result[0]['tdigest_quantile']) - 200) <= 4

    result = list(pipeline.execute(
      'SELECT tdigest_cdf(t, 600) FROM test_tdigest_agg ORDER BY k')
                  .fetchall())
    assert len(result) == 2
    assert round(result[0]['tdigest_cdf'], 2) == 0.6
    assert round(result[1]['tdigest_cdf'], 2) == 0.1

    result = list(pipeline.execute(
      'SELECT tdigest_cdf(combine(t), 600) FROM test_tdigest_agg').fetchall())
    assert len(result) == 1
    assert round(result[0]['tdigest_cdf'], 2) == 0.35
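
# Sanity check for the combined quantile above: group 0 holds ten copies of
# 0..999 and group 1 ten copies of 500..1499, i.e. 20,000 values in total.
# The bottom 10% (2,000 values) is exactly the ten copies of 0..199, so the
# combined 0.1 quantile should land near 200, which is what's asserted.
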
def test_cont_transforms(pipeline, clean_db):
    pipeline.execute('CREATE STREAM cv_stream (x int, y text)')
    pipeline.execute('CREATE STREAM ct_stream (x int, y text)')
    pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
    pipeline.create_ct(
        'test_ct1',
        'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
        "pipeline_stream_insert('cv_stream', 'cv_stream')")
    pipeline.create_table('test_t', x='int', y='text')
    pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
   INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
    pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream',
                       'test_tg()')

    pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
    time.sleep(1)

    _dump(pipeline, 'test_cont_transform.sql')

    pipeline.drop_all()
    pipeline.drop_table('test_t')
    pipeline.execute('DROP FUNCTION test_tg()')

    _restore(pipeline, 'test_cont_transform.sql')

    pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
    time.sleep(1)

    assert pipeline.execute('SELECT count FROM test_cv').first()['count'] == 4
    ntups = 0
    for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
        assert row['count'] == 2
        assert row['x'] in (1, 2)
        ntups += 1
    assert ntups == 2
def test_adhoc_against_identical_cv(pipeline, clean_db):
    """
  Verify that an adhoc query produces the same output as an identical
  continuous view
  """
    q = """
  SELECT x::integer + 1 AS g, sum(y::integer), avg(z::integer), count(*)
  FROM test_adhoc_stream GROUP BY g;
  """
    pipeline.create_cv('test_adhoc_cv', q)

    rows = [(x % 10, random.randint(1, 1000), random.randint(1, 1000))
            for x in range(1000)]

    path = os.path.abspath(
        os.path.join(pipeline.tmp_dir, 'test_adhoc_against_identical_cv.sql'))
    tmp_file = open(path, 'w')

    for row in rows:
        v = gen_insert('test_adhoc_stream', ('x', 'y', 'z'), [row])
        tmp_file.write(v)

    tmp_file.close()

    psql = os.path.abspath(os.path.join(pipeline.tmp_dir, 'bin/psql'))
    cmd = ['./run_adhoc.expect', psql, str(pipeline.port), 'pipeline', q, path]
    output = subprocess.Popen(cmd, stdout=PIPE).communicate()[0].decode()

    lines = output.split('\n')
    lines = filter(lambda x: not re.match(r'^\s*$', x), lines)
    lines = [l.split('\t')[1:] for l in lines]
    lines = lines[-10:]

    adhoc_results = {}
    for line in lines:
        g, s, a, c = line
        adhoc_results[int(g)] = int(g), int(s), float(a), int(c)

    cv_result = pipeline.execute('SELECT * FROM test_adhoc_cv')
    for row in cv_result:
        adhoc_row = adhoc_results[row[0]]
        assert adhoc_row[1] == row[1]
        assert abs(adhoc_row[2] - float(row[2])) < 0.001
        assert adhoc_row[3] == row[3]
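
# gen_insert comes from the adhoc-query test harness and isn't shown on this
# page. A rough sketch of what it presumably renders (hypothetical; the real
# helper may quote values differently):
def gen_insert(target, desc, rows):
    values = ', '.join('(%s)' % ', '.join(repr(v) for v in row) for row in rows)
    return 'INSERT INTO %s (%s) VALUES %s;\n' % (target, ', '.join(desc), values)
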
def test_multiple(pipeline, clean_db):
  """
  Verify that multiple continuous views work together properly
  """
  pipeline.create_cv('cv0', 'SELECT n::numeric FROM stream WHERE n > 10.00001')
  pipeline.create_cv('cv1',
                     'SELECT s::text FROM stream WHERE s LIKE \'%%this%%\'')

  rows = [(float(n + 10), 'this', 100) for n in range(1000)]
  for n in range(10):
    rows.append((float(n), 'not a match', -n))

  pipeline.insert('stream', ('n', 's', 'unused'), rows)

  result = list(pipeline.execute('SELECT * FROM cv0'))
  assert len(result) == 999

  result = list(pipeline.execute('SELECT * FROM cv1'))
  assert len(result) == 1000
def test_dump_data_only(pipeline, clean_db):
    """
  Verify that it is possible to only dump continuous view data and not schemas
  """
    pipeline.create_cv('test_data', 'SELECT COUNT(*) FROM stream')
    pipeline.insert('stream', ('x', ), [(x, ) for x in range(10)])

    result = pipeline.execute('SELECT count FROM test_data').first()
    assert result['count'] == 10

    _dump(pipeline, 'test_data.sql', data_only=True)

    pipeline.drop_all_queries()

    pipeline.create_cv('test_data', 'SELECT COUNT(*) FROM stream')
    _restore(pipeline, 'test_data.sql')

    result = pipeline.execute('SELECT count FROM test_data').first()
    assert result['count'] == 10
def test_null_offsets(pipeline, kafka, clean_db):
  """
  Verify that offsets are stored as NULL if a consumer hasn't consumed any messages yet
  """
  kafka.create_topic('null_topic', partitions=4)
  pipeline.create_stream('null_stream', x='integer')
  pipeline.create_cv('null0', 'SELECT count(*) FROM null_stream')

  pipeline.consume_begin('null_topic', 'null_stream', group_id='null_offsets')

  # Write to a single partition so that only one partition's offsets are updated
  producer = kafka.get_producer('null_topic')
  producer.produce('1', partition_key='key')

  time.sleep(10)
  pipeline.consume_end()

  rows = pipeline.execute('SELECT * FROM pipeline_kafka.offsets WHERE "offset" IS NULL')
  assert len(rows) == 3
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of stream0's columns
    """
    pipeline.create_stream('stream0',
                           x='int',
                           y='float8',
                           z='numeric',
                           m='int')
    q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0'
    pipeline.create_cv('test_copy_subset', q)
    pipeline.create_table('test_copy_subset_t',
                          x='integer',
                          y='float8',
                          z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute(
        'COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream0 (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute(
        'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t'
    ).first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_hll_count_distinct(pipeline, clean_db):
    """
    Verify that streaming COUNT(DISTINCT) works
    """
    q = 'SELECT COUNT(DISTINCT x::integer) FROM stream'
    pipeline.create_cv('test_count_distinct', q)

    desc = ('x', )
    values = [(random.randint(1, 1024), ) for n in range(1000)]

    pipeline.insert('stream', desc, values)

    expected = len(set(values))
    result = pipeline.execute('SELECT count FROM test_count_distinct').first()

    # Error rate should be well below 2%
    delta = abs(expected - result['count'])

    assert delta / float(expected) <= 0.02
def test_distinct(pipeline, clean_db):
    """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
    pipeline.create_stream('stream0', x='int', y='int', z='int')
    pipeline.create_table('table0', x='int', y='int', z='int')
    q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
    pipeline.create_cv('test_distinct', q)

    uniques = defaultdict(set)
    values = []
    for _ in range(2000):
        x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(
            0, 20)
        values.append((x, y, z))
        uniques[(x, y - z)].add(y)

    pipeline.insert('stream0', ['x', 'y', 'z'], values)
    pipeline.insert('table0', ['x', 'y', 'z'], values)

    q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
    expected = pipeline.execute(q)
    expected = len(expected)

    assert expected < 2000

    result = pipeline.execute('SELECT COUNT(*) FROM test_distinct')[0]

    assert expected == result['count']

    # Check if the first row was selected for uniques
    result = pipeline.execute('SELECT * FROM test_distinct')
    reverse_uniques = defaultdict(set)

    for (x, _), ys in uniques.items():
        for y in ys:
            reverse_uniques[y].add(x)

    for row in result:
        assert row['x'] in reverse_uniques[row['y']]
def test_deadlock_regress(pipeline, clean_db):
  nitems = 2000000
  tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.json')
  query = 'SELECT generate_series(1, %d) AS n' % nitems
  pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

  pipeline.create_stream('s1', n='int')
  pipeline.create_stream('s2', n='int')
  pipeline.create_ct('ct', 'SELECT n FROM s1 WHERE n IS NOT NULL',
             "pipelinedb.insert_into_stream('s2')")
  pipeline.create_cv('cv', 'SELECT count(*) FROM s2')

  for copy in [True, False]:
    for nworkers in [1, 4]:
      for sync in ['receive', 'commit']:
        pipeline.stop()
        pipeline.run({
          'pipelinedb.num_workers': nworkers,
          'pipelinedb.stream_insert_level': 'sync_%s' % sync
          })

        pipeline.execute("SELECT pipelinedb.truncate_continuous_view('cv')")
        pipeline.execute('COMMIT')

        if copy:
          pipeline.execute("COPY s1 (n) FROM '%s'" % tmp_file)
        else:
          pipeline.execute('INSERT INTO s1 (n) %s' % query)

        count = dict(pipeline.execute('SELECT count FROM cv')[0] or {})
        ntries = 5
        while count.get('count') != nitems and ntries > 0:
          assert sync == 'receive'
          time.sleep(1)
          count = dict(pipeline.execute('SELECT count FROM cv')[0] or {})
          ntries -= 1
        assert count and count['count'] == nitems

  os.remove(tmp_file)

  pipeline.stop()
  pipeline.run()
def test_combine_table(pipeline, clean_db):
  pipeline.create_stream('s', x='int')
  pipeline.create_cv('combine_table',
                     'SELECT x::int, COUNT(*) FROM s GROUP BY x')

  values = [(i,) for i in range(1000)]
  pipeline.insert('s', ('x',), values)

  pipeline.execute('SELECT * INTO tmprel FROM combine_table_mrel')

  stop = False
  ninserts = [0]

  def insert():
    while not stop:
      pipeline.insert('s', ('x',), values)
      ninserts[0] += 1
      time.sleep(0.01)

  t = threading.Thread(target=insert)
  t.start()

  time.sleep(2)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  cur = conn.cursor()
  cur.execute("SELECT pipeline_combine_table('combine_table', 'tmprel')")
  conn.commit()
  conn.close()

  stop = True
  t.join()

  assert ninserts[0] > 0

  rows = list(pipeline.execute('SELECT count FROM combine_table'))
  assert len(rows) == 1000
  for row in rows:
    assert row[0] == ninserts[0] + 2

  pipeline.execute('DROP TABLE tmprel')
def test_concurrent_copy(pipeline, clean_db):
    pipeline.create_stream('stream0', x='int')
    pipeline.create_cv('concurrent_copy0',
                       'SELECT x::int, count(*) FROM stream0 GROUP BY x')
    pipeline.create_cv('concurrent_copy1', 'SELECT count(*) FROM stream0')

    tmp_file = os.path.join(tempfile.gettempdir(), 'tmp.copy')
    query = 'SELECT generate_series(1, 2000) AS x'
    pipeline.execute("COPY (%s) TO '%s'" % (query, tmp_file))

    num_threads = 4
    stop = False
    inserted = [0] * num_threads

    def insert(i):
        conn = psycopg2.connect(
            'dbname=postgres user=%s host=localhost port=%s' %
            (getpass.getuser(), pipeline.port))
        cur = conn.cursor()
        while not stop:
            cur.execute("COPY stream0 (x) FROM '%s'" % tmp_file)
            conn.commit()
            inserted[i] += 2000
        conn.close()

    threads = [
        threading.Thread(target=insert, args=(i, )) for i in range(num_threads)
    ]
    for t in threads:
        t.start()

    time.sleep(60)

    stop = True
    for t in threads:
        t.join()

    time.sleep(5)

    total = pipeline.execute('SELECT sum(count) FROM concurrent_copy0')[0][0]
    assert total == sum(inserted)

    total = pipeline.execute('SELECT count FROM concurrent_copy1')[0][0]
    assert total == sum(inserted)
def test_transforms(pipeline, clean_db):
    """
  Verify that continuous transforms work properly on output streams
  """
    pipeline.create_stream('stream0', x='int')
    pipeline.create_cv('sw',
                       'SELECT x::integer, COUNT(*) FROM stream0 GROUP BY x',
                       sw='5 seconds')

    # Write a row to a stream each time a row goes out of window
    q = 'SELECT (old).x FROM sw_osrel WHERE old IS NOT NULL AND new IS NULL'
    pipeline.create_stream('oow_stream', x='integer')
    pipeline.create_ct('ct', q, "pipeline_stream_insert('oow_stream')")
    pipeline.create_cv('ct_recv', 'SELECT x FROM oow_stream')

    pipeline.insert('stream0', ('x', ), [(x % 100, ) for x in range(10000)])
    time.sleep(7)

    # 10000 rows over 100 distinct keys means each of the 100 groups falls out
    # of the 5-second window exactly once, emitting one row into oow_stream.
    rows = list(pipeline.execute('SELECT * FROM ct_recv'))
    assert len(rows) == 100
def test_dump(pipeline, clean_db):
    """
  Verify that we can dump and restore CVs using INSERT statements
  """
    pipeline.create_stream('stream0', x='int')
    q = """
  SELECT x::integer %% 100 AS g, avg(x) + 1 AS avg, count(*), count(distinct x) AS distincts FROM stream0
  GROUP BY g
  """
    pipeline.create_cv('test_dump', q)

    rows = [(x, ) for x in range(1000)]
    pipeline.insert('stream0', ('x', ), rows)

    def _verify():
        result = pipeline.execute('SELECT count(*) FROM test_dump').first()
        assert result['count'] == 100

        result = pipeline.execute('SELECT sum(avg) FROM test_dump').first()
        assert result['sum'] == 50050

        result = pipeline.execute(
            'SELECT sum(distincts) FROM test_dump').first()
        assert result['sum'] == 1000

    _verify()
    _dump(pipeline, 'test_dump.sql')

    pipeline.drop_all()
    _restore(pipeline, 'test_dump.sql')
    _verify()

    # Now verify that we can successfully add more data to the restored CV
    rows = [(x, ) for x in range(2000)]
    pipeline.insert('stream0', ('x', ), rows)

    result = pipeline.execute('SELECT sum(count) FROM test_dump').first()
    assert result['sum'] == 3000

    result = pipeline.execute('SELECT sum(distincts) FROM test_dump').first()
    assert result['sum'] == 2000
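# test_dump relies on _dump and _restore helpers defined elsewhere in the
# suite. A plausible minimal version shells out to pg_dump and psql; this is
# a sketch that assumes a database named 'pipeline' listening on
# pipeline.port, which may differ from the real helpers:
def _dump(pipeline, path):
    # Dump everything with data as INSERT statements so a plain SQL
    # connection can replay it.
    cmd = ['pg_dump', '--inserts', '-p', str(pipeline.port), '-f', path,
           'pipeline']
    assert subprocess.call(cmd) == 0

def _restore(pipeline, path):
    # Replay the dumped SQL, then clean up the dump file.
    cmd = ['psql', '-p', str(pipeline.port), '-f', path, 'pipeline']
    assert subprocess.call(cmd) == 0
    os.remove(path)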
def test_copy_to_typed_stream(pipeline, clean_db):
    """
    Verify that copying data from a file into a typed stream works.
    """
    pipeline.create_stream('stream', x='integer', y='float8', z='numeric')

    q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
    pipeline.create_cv('test_copy_to_typed_stream', q)
    pipeline.create_table('test_copy_to_typed_stream_t',
                          x='integer',
                          y='float8',
                          z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute(
        'COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' %
        path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute(
        'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t'
    ).first()
    result = list(
        pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
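# _generate_csv is another helper the snippet assumes. Writing a header row
# followed by the data is all the HEADER CSV COPY above needs (a minimal
# sketch; the helper's real signature may differ):
def _generate_csv(path, rows, desc=None):
    # Write rows as CSV with an optional header line taken from `desc`.
    import csv
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        if desc:
            writer.writerow(desc)
        writer.writerows(rows)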
def test_object_aggs(pipeline, clean_db):
    """
  Verify that combines work properly on object aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \')FROM stream0 GROUP BY k;
  """
    desc = ('x', 'y', 's')
    pipeline.create_stream('stream0', x='int', y='float8', s='text')
    pipeline.create_cv('test_object_aggs', q)
    pipeline.create_table('test_object_aggs_t',
                          x='integer',
                          y='float8',
                          s='text')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random(),
               str(n) * random.randint(1, 8))
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_object_aggs_t', desc, rows)

    tq = """
  SELECT json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \') FROM test_object_aggs_t
  """
    table_result = pipeline.execute(tq)

    cq = """
  SELECT combine(json_agg), combine(json_object_agg), combine(string_agg) FROM test_object_aggs
  """
    cv_result = pipeline.execute(cq)

    assert len(table_result) == len(cv_result)

    # combine() makes no ordering guarantees within the aggregated objects,
    # so compare sorted contents rather than raw values.
    for tr, cr in zip(table_result, cv_result):
        assert sorted(tr[0]) == sorted(cr[0])
        assert sorted(tr[1]) == sorted(cr[1])
        assert sorted(tr[2]) == sorted(cr[2])
def test_concurrent_inserts(pipeline, clean_db):
    pipeline.create_stream('stream0', x='int')
    pipeline.create_cv('concurrent_inserts0',
                       'SELECT x::int, count(*) FROM stream0 GROUP BY x')
    pipeline.create_cv('concurrent_inserts1', 'SELECT count(*) FROM stream0')

    num_threads = 4
    stop = False
    inserted = [0] * num_threads

    def insert(i):
        conn = psycopg2.connect(
            'dbname=postgres user=%s host=localhost port=%s' %
            (getpass.getuser(), pipeline.port))
        cur = conn.cursor()
        while not stop:
            cur.execute('INSERT INTO stream0 (x) '
                        'SELECT x % 100 FROM generate_series(1, 2000) AS x')
            conn.commit()
            inserted[i] += 2000
        conn.close()

    threads = [
        threading.Thread(target=insert, args=(i, )) for i in range(num_threads)
    ]
    for t in threads:
        t.start()

    time.sleep(60)

    stop = True
    for t in threads:
        t.join()

    time.sleep(5)

    total = pipeline.execute(
        'SELECT sum(count) FROM concurrent_inserts0')[0]['sum']
    assert total == sum(inserted)

    total = pipeline.execute(
        'SELECT count FROM concurrent_inserts1')[0]['count']
    assert total == sum(inserted)
def test_nested_expressions(pipeline, clean_db):
    """
    Verify that combines work properly on arbitrarily nested expressions
    """
    q = """
    SELECT x::integer %% 10 AS k,
    (rank(256) WITHIN GROUP (ORDER BY x) + dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM stream GROUP BY k
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream', x='int', y='float8')
    pipeline.create_cv('test_nested', q)
    pipeline.create_table('test_nested_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_nested_t', desc, rows)

    # Note that the CQ will use the HLL variant of dense_rank,
    # so use hll_dense_rank on the table too
    tq = """
    SELECT
    (rank(256) WITHIN GROUP (ORDER BY x) + hll_dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM test_nested_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(whoa) FROM test_nested
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.0001
def test_hypothetical_set_aggs(pipeline, clean_db):
    """
  Verify that combines work properly on HS aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  rank(256) WITHIN GROUP (ORDER BY x),
  dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM stream0 GROUP BY k
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_hs_aggs', q)
    pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_hs_aggs_t', desc, rows)

    # Note that the CQ will use the combinable variant of dense_rank,
    # so use that on the table too
    tq = """
  SELECT rank(256) WITHIN GROUP (ORDER BY x), combinable_dense_rank(256, x)
  FROM test_hs_aggs_t
  """
    table_result = list(pipeline.execute(tq))

    cq = """
  SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
  """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert tr[0] == cr[0]
        assert tr[1] == cr[1]
def test_combine(pipeline, clean_db):
    """
    Verify that partial tuples are combined with on-disk tuples
    """
    pipeline.create_stream('stream', key='text', unused='int')
    pipeline.create_cv('combine',
                       'SELECT key::text, COUNT(*) FROM stream GROUP BY key')

    rows = []
    for n in range(100):
        for m in range(100):
            key = '%d%d' % (n % 10, m)
            rows.append((key, 0))

    pipeline.insert('stream', ('key', 'unused'), rows)

    total = 0
    result = pipeline.execute('SELECT * FROM combine')
    for row in result:
        total += row['count']

    assert total == 10000
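# The explicit Python summation above could equally be pushed into SQL with
# the combine() aggregate exercised by the earlier examples; illustratively:
#
#   SELECT combine(count) AS total FROM combine;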