Example 1
def test_null_groups(pipeline, clean_db):
    """
  Verify that null group columns are considered equal
  """
    pipeline.create_stream('s', x='int', y='int', z='int')
    q = """
  SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
  GROUP BY x, y, z;
  """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_null_groups', q)
    pipeline.create_table('test_null_groups_t',
                          x='integer',
                          y='integer',
                          z='integer')

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = [v if random.random() > 0.1 else None for v in vals]
        rows.append(tuple(vals))

    pipeline.insert('s', desc, rows)
    pipeline.insert('test_null_groups_t', desc, rows)

    table_q = """
  SELECT x, y, z, COUNT(*) FROM test_null_groups_t
  GROUP BY x, y, z ORDER BY x, y, z;
  """
    expected = pipeline.execute(table_q)
    result = pipeline.execute(
        'SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z')

    for r, e in zip(result, expected):
        assert r == e
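
All of these examples rely on a pipeline fixture (and a clean_db fixture that resets state between tests) supplied by the PipelineDB test harness, which is not shown here. The following is a minimal, hypothetical sketch of the interface the tests assume (create_stream, create_cv, create_table, insert, execute), built on psycopg2; the DDL strings, connection details, and method behavior are assumptions inferred from the tests, not the actual harness code, and the stream/continuous-view syntax differs between PipelineDB versions.

# Hypothetical sketch of the test fixture interface assumed by these examples.
# Only the method names and rough behavior are inferred from the tests above;
# the real harness lives in the PipelineDB test suite.
import psycopg2
import psycopg2.extras


class Pipeline(object):
    def __init__(self, dsn, port=5432):
        self.port = port
        self.conn = psycopg2.connect(dsn)
        self.conn.autocommit = True

    def execute(self, sql):
        # Return rows as dict-like records so tests can do row['count'].
        cur = self.conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cur.execute(sql)
        return cur.fetchall() if cur.description else []

    def create_stream(self, name, **cols):
        defs = ', '.join('%s %s' % (k, v) for k, v in cols.items())
        # PipelineDB 1.x syntax; older versions used CREATE STREAM instead.
        self.execute('CREATE FOREIGN TABLE %s (%s) SERVER pipelinedb' % (name, defs))

    def create_table(self, name, **cols):
        defs = ', '.join('%s %s' % (k, v) for k, v in cols.items())
        self.execute('CREATE TABLE %s (%s)' % (name, defs))

    def create_cv(self, name, query):
        # PipelineDB 1.x syntax; older versions used CREATE CONTINUOUS VIEW.
        self.execute('CREATE VIEW %s WITH (action=materialize) AS %s' % (name, query))

    def insert(self, target, desc, rows):
        cur = self.conn.cursor()
        sql = 'INSERT INTO %s (%s) VALUES %%s' % (target, ', '.join(desc))
        psycopg2.extras.execute_values(cur, sql, rows)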
Example 2
def test_combine_in_view(pipeline, clean_db):
    """
  Verify that combines in views on top of continuous views work
  """
    q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_combine_view', q)
    pipeline.execute(
        'CREATE VIEW v AS SELECT combine(avg) FROM test_combine_view')

    rows = []
    for n in range(10000):
        rows.append((random.randint(1, 256), random.randint(1, 1024)))

    pipeline.insert('stream0', desc, rows)

    view = pipeline.execute('SELECT * FROM v')

    assert len(view) == 1

    expected = sum(r[1] for r in rows) / float(len(rows))

    assert abs(float(view[0][0]) - expected) < 0.00001

    pipeline.execute('DROP VIEW v')
Example 3
def test_filter_clause(pipeline, clean_db):
    """
    Verify that FILTER clauses work on aggregates and sliding window aggregates
    """
    pipeline.create_stream("test_filter_stream", x="int")
    q = """
    SELECT SUM(x::int) FILTER (WHERE mod(x, 2) = 0) AS sum2, SUM(x::int) FILTER (WHERE mod(x, 3) = 0) AS sum3 FROM test_filter_stream
    """
    sw = """
    WHERE arrival_timestamp > clock_timestamp() - interval '30 second'
    """
    pipeline.create_cv("test_filter", q)
    pipeline.create_cv("test_filter_sw", "%s %s" % (q, sw))

    desc = ("x",)
    rows = []
    for n in range(1000):
        rows.append((n,))

    pipeline.insert("test_filter_stream", desc, rows)

    sum2 = sum(filter(lambda x: x % 2 == 0, map(lambda x: x[0], rows)))
    sum3 = sum(filter(lambda x: x % 3 == 0, map(lambda x: x[0], rows)))

    result1 = pipeline.execute("SELECT * FROM test_filter").first()
    result2 = pipeline.execute("SELECT * FROM test_filter_sw").first()

    assert result1["sum2"] == result2["sum2"] == sum2
    assert result1["sum3"] == result2["sum3"] == sum3
Example 4
def test_single_continuous_view(pipeline, clean_db):
    """
  Verify that specific continuous views can be dropped and restored
  """
    pipeline.create_stream('stream0', x='int')
    pipeline.create_cv('test_single0', 'SELECT COUNT(*) FROM stream0')
    pipeline.create_cv('test_single1', 'SELECT COUNT(*) FROM stream0')
    pipeline.insert('stream0', ('x', ), [(x, ) for x in range(10)])

    result = pipeline.execute('SELECT count FROM test_single0').first()
    assert result['count'] == 10

    result = pipeline.execute('SELECT count FROM test_single1').first()
    assert result['count'] == 10

    _dump(pipeline,
          'test_single.sql',
          tables=['test_single0', 'stream0', 'test_single0_mrel'])

    pipeline.drop_all()
    _restore(pipeline, 'test_single.sql')

    result = pipeline.execute('SELECT count FROM test_single0').first()
    assert result['count'] == 10

    # We didn't dump this one
    result = list(
        pipeline.execute(
            'SELECT * FROM pg_class WHERE relname LIKE \'%%test_single1%%\''))
    assert not result
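
_dump and _restore are helpers from the dump/restore test module and are not shown in these examples. A plausible sketch follows, assuming they simply shell out to pg_dump and psql against the test server; the database name and flags below are assumptions.

# Plausible sketch of the _dump/_restore helpers used above, assuming they
# wrap pg_dump/psql; the real helpers live in the PipelineDB test suite.
import subprocess


def _dump(pipeline, path, tables=None):
    cmd = ['pg_dump', '-p', str(pipeline.port), '-f', path]
    for t in tables or []:
        cmd.extend(['-t', t])  # restrict the dump to the listed relations
    cmd.append('postgres')     # database name is an assumption
    subprocess.check_call(cmd)


def _restore(pipeline, path):
    subprocess.check_call(['psql', '-p', str(pipeline.port), '-f', path, 'postgres'])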
def test_online_add_column(pipeline, clean_db):
    """
    Verify that we can add columns to a stream while not affecting running CQs
    """
    pipeline.create_stream("stream0", c0="integer")

    pipeline.create_cv("cv0", "SELECT c0 FROM stream0")
    pipeline.insert("stream0", ("c0",), [(n,) for n in range(0, 1000)])
    result = list(pipeline.execute("SELECT * FROM cv0"))

    assert len(result) == 1000

    for row in result:
        for col in row:
            assert col is not None

    pipeline.execute("ALTER STREAM stream0 ADD c1 integer")

    pipeline.create_cv("cv1", "SELECT c0, c1 FROM stream0")
    pipeline.insert("stream0", ("c0", "c1"), [(n, n) for n in range(1000, 2000)])
    result = list(pipeline.execute("SELECT * FROM cv1 WHERE c1 >= 1000"))

    assert len(result) == 1000

    for row in result:
        for col in row:
            assert col is not None

    pipeline.execute("ALTER STREAM stream0 ADD c2 integer")
    pipeline.create_cv("cv2", "SELECT c0, c1, c2 FROM stream0")
    pipeline.insert("stream0", ("c0", "c1", "c2"), [(n, n, n) for n in range(2000, 3000)])
    result = list(pipeline.execute("SELECT * FROM cv2 WHERE c2 >= 2000"))

    assert len(result) == 1000

    for row in result:
        for col in row:
            assert col is not None

    pipeline.execute("ALTER STREAM stream0 ADD c3 integer")
    pipeline.create_cv("cv3", "SELECT c0, c1, c2, c3 FROM stream0")
    pipeline.insert("stream0", ("c0", "c1", "c2", "c3"), [(n, n, n, n) for n in range(3000, 4000)])
    result = list(pipeline.execute("SELECT * FROM cv3 WHERE c3 >= 3000"))

    assert len(result) == 1000

    for row in result:
        for col in row:
            assert col is not None

    pipeline.execute("ALTER STREAM stream0 ADD c4 integer")
    pipeline.create_cv("cv4", "SELECT c0, c1, c2, c3, c4 FROM stream0")
    pipeline.insert("stream0", ("c0", "c1", "c2", "c3", "c4"), [(n, n, n, n, n) for n in range(4000, 5000)])
    result = list(pipeline.execute("SELECT * FROM cv4 WHERE c4 >= 4000"))

    assert len(result) == 1000

    for row in result:
        for col in row:
            assert col is not None
def test_prepared_extended(pipeline, clean_db):
  """
  Verify that we can write to streams using the extended protocol. This test
  shells out to a binary because psycopg2 doesn't use the extended protocol.
  """
  pipeline.create_stream('extended_stream', x='int', y='int', z='int')
  q = """
  SELECT COUNT(x::integer) AS x, COUNT(y::integer) AS y, COUNT(z::integer) AS z FROM extended_stream
  """
  pipeline.create_cv('test_prepared_extended', q)

  # This will insert 1000 rows via a parameterized insert, and 1000 via an unparameterized insert
  cmd = ['./extended', 'pipeline', str(pipeline.port), 'extended_stream', '1000']

  stdout, stderr = subprocess.Popen(cmd).communicate()

  assert stdout is None
  assert stderr is None

  rows = list(pipeline.execute('SELECT x, y, z FROM test_prepared_extended'))
  assert len(rows) == 1

  result = rows[0]

  assert result['x'] == 2000
  assert result['y'] == 2000
  assert result['z'] == 2000
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of stream0's columns
    """
    pipeline.create_stream("stream0", x="int", y="float8", z="numeric", m="int")
    q = "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0"
    pipeline.create_cv("test_copy_subset", q)
    pipeline.create_table("test_copy_subset_t", x="integer", y="float8", z="numeric")

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, "test_copy.csv"))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=("x", "y", "z"))

    pipeline.execute("COPY test_copy_subset_t (x, y, z) FROM '%s' HEADER CSV" % path)

    pipeline.execute("COPY stream0 (x, y, z) FROM '%s' HEADER CSV" % path)

    expected = pipeline.execute(
        "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t"
    ).first()
    result = list(pipeline.execute("SELECT s0, s1, avg FROM test_copy_subset"))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_prepared_extended(pipeline, clean_db):
    """
  Verify that we can write to streams using the extended protocol. This test
  shells out to a binary because psycopg2 doesn't use the extended protocol.
  """
    pipeline.create_stream('extended_stream', x='int', y='int', z='int')
    q = """
  SELECT COUNT(x::integer) AS x, COUNT(y::integer) AS y, COUNT(z::integer) AS z FROM extended_stream
  """
    pipeline.create_cv('test_prepared_extended', q)

    # This will insert 1000 rows via a parameterized insert, and 1000 via an unparameterized insert
    cmd = [
        './extended', 'postgres',
        str(pipeline.port), 'extended_stream', '1000'
    ]

    stdout, stderr = subprocess.Popen(cmd).communicate()

    assert stdout is None
    assert stderr is None

    rows = pipeline.execute('SELECT x, y, z FROM test_prepared_extended')
    assert len(rows) == 1

    result = rows[0]

    assert result['x'] == 2000
    assert result['y'] == 2000
    assert result['z'] == 2000
Example 9
def test_sliding_windows(pipeline, clean_db):
    """
  Verify that sliding window queries are properly dumped and restored
  """
    pipeline.create_stream('stream0', x='int')
    pipeline.execute(
        'CREATE CONTINUOUS VIEW sw_v WITH (sw = \'20 seconds\') AS SELECT count(*) FROM stream0'
    )
    pipeline.insert('stream0', ('x', ), [(x, ) for x in range(10)])

    result = pipeline.execute('SELECT count FROM sw_v').first()
    assert result['count'] == 10

    _dump(pipeline, 'test_sw.sql')

    pipeline.drop_all()
    _restore(pipeline, 'test_sw.sql')

    result = pipeline.execute('SELECT count FROM sw_v').first()
    assert result['count'] == 10

    # We should still drop back to 0 within 20 seconds
    result = pipeline.execute('SELECT count FROM sw_v').first()
    while result['count'] > 0:
        time.sleep(1)
        result = pipeline.execute('SELECT count FROM sw_v').first()

    result = pipeline.execute('SELECT count FROM sw_v').first()
    assert result['count'] == 0
Example 10
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream("s", x="int", y="int", z="int")
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
    GROUP BY x, y, z;
    """
    desc = ("x", "y", "z")
    pipeline.create_cv("test_null_groups", q)
    pipeline.create_table("test_null_groups_t", x="integer", y="integer", z="integer")

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = [v if random.random() > 0.1 else None for v in vals]
        rows.append(tuple(vals))

    pipeline.insert("s", desc, rows)
    pipeline.insert("test_null_groups_t", desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(pipeline.execute("SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z"))

    for r, e in zip(result, expected):
        assert r == e
Example 11
def test_freq_agg(pipeline, clean_db):
    """
  Test freq_agg, freq_merge_agg
  """
    pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

    q = """
  SELECT k::integer, freq_agg(x::int) AS c FROM test_cmsketch_stream
  GROUP BY k
  """
    desc = ('k', 'x')
    pipeline.create_cv('test_cmsketch_agg', q)

    rows = []
    for n in range(1000):
        rows.append((0, n % 20))
        rows.append((1, n % 50))

    pipeline.insert('test_cmsketch_stream', desc, rows)

    result = pipeline.execute(
        'SELECT freq(c, 10) AS x, freq(c, 40) AS y, freq(c, 60) FROM test_cmsketch_agg ORDER BY k'
    )
    assert len(result) == 2
    assert (result[0][0], result[0][1], result[0][2]) == (50, 0, 0)
    assert (result[1][0], result[1][1], result[1][2]) == (20, 20, 0)

    result = pipeline.execute(
        'SELECT freq(combine(c), 10) AS x, freq(combine(c), 40) AS y, freq(combine(c), 60) FROM test_cmsketch_agg'
    )
    assert len(result) == 1
    assert (result[0][0], result[0][1], result[0][2]) == (70, 20, 0)
Example 12
def test_simple_crash(pipeline, clean_db):
  """
  Test simple worker and combiner crashes.
  """
  pipeline.create_stream('stream0', x='int')
  q = 'SELECT COUNT(*) FROM stream0'
  pipeline.create_cv('test_simple_crash', q)

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] == 2

  # This batch can potentially get lost.
  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  assert kill_worker()

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] in [4, 6]

  # This batch can potentially get lost.
  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  assert kill_combiner()

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_simple_crash').first()
  assert result['count'] in [6, 8, 10]

  # To ensure that all remaining events in ZMQ queues have been consumed
  time.sleep(2)
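
kill_worker and kill_combiner are crash-injection helpers from the test suite. A hypothetical sketch is below, assuming the background worker and combiner processes can be located by a pattern in their command line; the process-title patterns and the signal used are assumptions.

# Hypothetical sketch of kill_worker/kill_combiner; the process-title patterns
# below are assumptions, not the actual PipelineDB process names.
import os
import signal
import subprocess


def _kill_procs(pattern):
    try:
        out = subprocess.check_output(['pgrep', '-f', pattern])
    except subprocess.CalledProcessError:
        return False               # no matching process found
    for pid in out.split():
        os.kill(int(pid), signal.SIGKILL)  # simulate a hard crash
    return True


def kill_worker():
    return _kill_procs('worker')   # pattern is an assumption


def kill_combiner():
    return _kill_procs('combiner') # pattern is an assumption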
Example 13
def test_restart_recovery(pipeline, clean_db):
  pipeline.create_stream('stream0', x='int')
  q = 'SELECT COUNT(*) FROM stream0'
  pipeline.create_cv('test_restart_recovery', q)

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 2

  # Need to sleep here, otherwise on restart the materialization table is
  # empty. Not sure why.
  time.sleep(0.1)

  # Restart.
  pipeline.stop()
  pipeline.run()

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 2

  pipeline.insert('stream0', ['x'], [(1,), (1,)])

  result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
  assert result['count'] == 4
Example 14
def test_simple_aggs(pipeline, clean_db):
    """
  Verify that combines work properly on simple aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_simple_aggs', q)
    pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_simple_aggs_t', desc, rows)

    table_result = pipeline.execute(
        'SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t')
    cv_result = pipeline.execute(
        'SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'
    )

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
        assert abs(tr[2] - cr[2]) < 0.00001
Example 15
def test_fss_agg(pipeline, clean_db):
    pipeline.create_stream('test_fss_stream', x='int', k='text')
    q = """
  SELECT k::text, fss_agg(x::int, 5) FROM test_fss_stream
  GROUP BY k
  """
    desc = ('k', 'x')
    pipeline.create_cv('test_fss_agg', q)

    items = list(range(14))
    random.shuffle(items)
    a_items = items
    b_items = list(reversed(items))

    values = [('a', i) for i in get_geometric_dist(a_items)]
    values.extend(('b', i) for i in get_geometric_dist(b_items))
    random.shuffle(values)

    pipeline.insert('test_fss_stream', desc, values)
    result = list(
        pipeline.execute(
            'SELECT k, fss_topk_values(fss_agg) FROM test_fss_agg ORDER BY k'))
    topk = map(int, result[0][1].rstrip('}').lstrip('{').split(','))
    assert sorted(topk) == sorted(a_items[-5:])
    topk = map(int, result[1][1].rstrip('}').lstrip('{').split(','))
    assert sorted(topk) == sorted(b_items[-5:])
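
get_geometric_dist is a helper from the test suite. A plausible sketch follows, assuming it repeats each input item a geometrically growing number of times so that the last items in the list dominate, which is why the expected top-5 above is items[-5:].

# Plausible sketch of get_geometric_dist; the exact growth factor is an
# assumption, but the last items in the input must end up most frequent.
def get_geometric_dist(items):
    out = []
    for n, item in enumerate(items):
        out.extend([item] * (2 ** n))  # item n appears 2^n times
    return out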
Example 16
def assert_result_changes(func, args):
    """
    Verifies that the result of the given function changes with time
    """
    pipeline.create_stream('stream', x='int', y='text', z='int')
    name = 'assert_%s_decreases' % func
    pipeline.create_cv(name,
                       "SELECT %s(%s) FROM stream WHERE arrival_timestamp > clock_timestamp() - interval '2 seconds'" % (func, args))

    rows = [(n, str(n), n + 1) for n in range(1000)]
    pipeline.insert('stream', ('x', 'y', 'z'), rows)

    current = 1

    results = []
    while current:
        row = pipeline.execute('SELECT * FROM %s' % name).first()
        current = row[func]
        if current is None:
            break
        results.append(current)

    # Verify that we actually read something
    assert results

    pipeline.drop_cv(name)
Example 17
def test_cmsketch_agg(pipeline, clean_db):
    """
    Test cmsketch_agg, cmsketch_merge_agg, cmsketch_cdf, cmsketch_quantile
    """
    pipeline.create_stream('test_cmsketch_stream', k='int', x='int')

    q = """
    SELECT k::integer, cmsketch_agg(x::int) AS c FROM test_cmsketch_stream
    GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_cv('test_cmsketch_agg', q)

    rows = []
    for n in range(1000):
        rows.append((0, n % 20))
        rows.append((1, n % 50))

    pipeline.insert('test_cmsketch_stream', desc, rows)

    result = list(pipeline.execute(
      'SELECT cmsketch_frequency(c, 10) AS x, cmsketch_frequency(c, 40) AS y, '
      'cmsketch_frequency(c, 60) FROM test_cmsketch_agg ORDER BY k').fetchall())
    assert len(result) == 2
    assert tuple(result[0]) == (50, 0, 0)
    assert tuple(result[1]) == (20, 20, 0)

    result = list(pipeline.execute(
      'SELECT cmsketch_frequency(combine(c), 10) AS x, '
      'cmsketch_frequency(combine(c), 40) AS y, cmsketch_frequency(combine(c), 60) '
      'FROM test_cmsketch_agg').fetchall())
    assert len(result) == 1
    assert tuple(result[0]) == (70, 20, 0)
Example 18
def test_concurrent_vacuum_full(pipeline, clean_db):
  pipeline.create_stream('test_vacuum_stream', x='int')
  pipeline.create_cv(
    'test_vacuum_full',
    'SELECT x::int, COUNT(*) FROM test_vacuum_stream GROUP BY x')
  stop = False

  def insert():
    while not stop:
      values = [(random.randint(0, 1000000),) for _ in range(1000)]
      pipeline.insert('test_vacuum_stream', ('x',), values)
      time.sleep(0.01)

  threads = [threading.Thread(target=insert) for _ in range(4)]
  for t in threads:
    t.start()

  # Insert data for a little bit so we have enough work to do while
  # vacuuming.
  time.sleep(20)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('VACUUM FULL test_vacuum_full')
  conn.close()

  # Now kill the insert threads.
  stop = True
  for t in threads:
    t.join()
def test_stream_stats(pipeline, clean_db):
    """
  Verify that stream-level statistics collection works
  """
    # create a few streams
    for n in range(8):
        sname = 's%d' % n
        pipeline.create_stream(sname, x='int')
        cvname = 'cv%d' % n
        pipeline.create_cv(cvname, 'SELECT count(*) FROM %s' % sname)

    for n in range(8):
        sname = 's%d' % n
        x = n + 1
        values = [(v, ) for v in range(1000 * x)]
        pipeline.insert(sname, ('x', ), values)

    time.sleep(2)

    for n in range(8):
        sname = 's%d' % n
        row = pipeline.execute(
            "SELECT stream, input_rows, input_batches, input_bytes FROM pipelinedb.stream_stats WHERE stream = '%s'"
            % sname)[0]
        x = n + 1
        assert row['input_rows'] == 1000 * x
Example 20
def assert_result_changes(func, args):
    """
    Verifies that the result of the given function changes with time
    """
    pipeline.create_stream('stream0', x='int', y='text', z='int')
    name = 'assert_%s_decreases' % func
    pipeline.create_cv(
        name,
        "SELECT %s(%s) FROM stream0 WHERE arrival_timestamp > clock_timestamp() - interval '2 seconds'"
        % (func, args))

    rows = [(n, str(n), n + 1) for n in range(1000)]
    pipeline.insert('stream0', ('x', 'y', 'z'), rows)

    current = 1

    results = []
    while current:
        row = pipeline.execute('SELECT * FROM %s' % name).first()
        current = row[func]
        if current is None:
            break
        results.append(current)

    # Verify that we actually read something
    assert results

    pipeline.drop_cv(name)
Example 21
def test_bloom_intersection(pipeline, clean_db):
    """
  Verify that bloom_intersection works
  """
    pipeline.create_stream('test_bloom_stream', x='int', k='int')

    q = """
  SELECT k::int, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """

    desc = ('k', 'x')
    pipeline.create_cv('test_bloom_intersection', q)

    rows = []
    for i in range(10000):
        rows.append((0, 2 * i))
        rows.append((1, i))

    pipeline.insert('test_bloom_stream', desc, rows)

    cvq = """
  SELECT bloom_cardinality(bloom_intersection_agg(bloom_agg))
  FROM test_bloom_intersection
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 5530
def test_join_with_where(pipeline, clean_db):
  """
  Verify that stream-table joins using a WHERE clause work properly
  """
  num_cols = 4
  q = """
  SELECT s.col0::integer FROM stream0 s, wt WHERE s.col0 = 1 AND wt.col0 = 1
  """
  wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('wt', **wt_cols)
  pipeline.create_table('wt_s', **wt_cols)

  wt = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 'wt', wt, 0.1)
  _insert(pipeline, 'wt_s', s, 0.1)

  pipeline.create_stream('stream0', **wt_cols)
  pipeline.create_cv('test_join_where', q)
  _insert(pipeline, 'stream0', s)

  expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1')[0]
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_where')[0]

  assert result['count'] == expected['count']
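
The join tests above and below use several helpers (_generate_row, _generate_rows, _insert, _join) that are defined elsewhere in the test suite. A plausible sketch follows; the value range, the batching delay, and the join output format are inferred from how the helpers are used, not taken from the actual implementation.

# Plausible sketches of the join-test helpers; details are assumptions
# inferred from how the tests above use them.
import random
import time


def _generate_row(num_cols):
    # Small value range so that 64-row inputs produce plenty of join matches.
    return tuple(random.randint(0, 32) for _ in range(num_cols))


def _generate_rows(num_cols, num_rows):
    return [_generate_row(num_cols) for _ in range(num_rows)]


def _insert(pipeline, target, rows, sleep=0):
    desc = ['col%d' % c for c in range(len(rows[0]))]
    pipeline.insert(target, desc, rows)
    if sleep:
        time.sleep(sleep)  # give continuous queries a moment to catch up


def _join(left, right, cols):
    # Nested-loop equijoin on the given column indexes; one output row per
    # matching (left, right) pair, mirroring what the continuous views count.
    out = []
    for l in left:
        for r in right:
            if all(l[c] == r[c] for c in cols):
                out.append(l + r)
    return out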
Example 23
def test_concurrent_vacuum_full(pipeline, clean_db):
  pipeline.create_stream('test_vacuum_stream', x='int')
  pipeline.create_cv(
    'test_vacuum_full',
    'SELECT x::int, COUNT(*) FROM test_vacuum_stream GROUP BY x')
  stop = False

  def insert():
    while not stop:
      values = [(random.randint(0, 1000000), ) for _ in range(1000)]
      pipeline.insert('test_vacuum_stream', ('x', ), values)
      time.sleep(0.01)

  threads = [threading.Thread(target=insert) for _ in range(4)]
  for t in threads:
    t.start()

  # Insert data for a little bit so we have enough work to do while
  # vacuuming.
  time.sleep(20)

  conn = psycopg2.connect('dbname=pipeline user=%s host=localhost port=%s' %
                          (getpass.getuser(), pipeline.port))
  conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
  cur = conn.cursor()
  cur.execute('VACUUM FULL test_vacuum_full')
  conn.close()

  # Now kill the insert threads.
  stop = True
  for t in threads:
    t.join()
def test_incremental_join(pipeline, clean_db):
  """
  Verify that join results increase appropriately as we incrementally
  add stream events to the input
  """
  num_cols = 4
  join_cols = [0, 1]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('inc', **t_cols)
  pipeline.create_stream('stream0', **t_cols)

  q = """
  SELECT s.col0::integer FROM inc JOIN stream0 s ON inc.col0 = s.col0
  AND inc.col1 = s.col1::integer
  """
  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'inc', t, 0.1)

  pipeline.create_cv('test_join', q)
  s = []
  for n in range(2):
    row = _generate_row(num_cols)
    _insert(pipeline, 'stream0', [row])
    s.append(row)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join')[0]

  assert result['count'] == len(expected)
def test_indexed(pipeline, clean_db):
  """
  Verify that stream-table joins involving indexed tables work
  """
  pipeline.create_stream('stream0', x='int', y='int')
  q = """
  SELECT stream0.x::integer, count(*) FROM stream0
  JOIN test_indexed_t t ON stream0.x = t.x GROUP BY stream0.x
  """
  pipeline.create_table('test_indexed_t', x='integer', y='integer')
  pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

  t = _generate_rows(2, 1000)
  s = _generate_rows(2, 1000)

  pipeline.insert('test_indexed_t', ('x', 'y'), t)
  time.sleep(0.1)

  pipeline.create_cv('test_indexed', q)
  pipeline.insert('stream0', ('x', 'y'), s)

  expected = _join(s, t, [0])
  result = pipeline.execute('SELECT sum(count) FROM test_indexed')[0]

  assert result['sum'] == len(expected)
Example 26
def test_hll_agg_hashing(pipeline, clean_db):
    """
    Verify that hll_agg correctly hashes different input types
    """
    pipeline.create_stream('test_hll_stream', x='int', y='text', z='float8')
    q = """
    SELECT hll_agg(x::integer) AS i,
    hll_agg(y::text) AS t,
    hll_agg(z::float8) AS f FROM test_hll_stream
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_hll_hashing', q)

    rows = []
    for n in range(10000):
        rows.append((n, '%d' % n, float(n)))
        rows.append((n, '%05d' % n, float(n)))

    pipeline.insert('test_hll_stream', desc, rows)

    cvq = """
    SELECT hll_cardinality(i),
    hll_cardinality(t), hll_cardinality(f) FROM test_hll_hashing
    """
    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

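    # There are 10,000 distinct ints and floats but 20,000 distinct text values
    # (the '%d' and '%05d' encodings never collide here); the expected numbers
    # are HLL estimates of those cardinalities.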
    assert result[0] == 9976
    assert result[1] == 19951
    assert result[2] == 10062
Example 27
def test_combine_in_view(pipeline, clean_db):
    """
    Verify that combines in views on top of continuous views work
    """
    q = """
    SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_combine_view', q)
    pipeline.execute('CREATE VIEW v AS SELECT combine(avg) FROM test_combine_view')

    rows = []
    for n in range(10000):
        rows.append((random.randint(1, 256), random.randint(1, 1024)))

    pipeline.insert('stream0', desc, rows)

    view = list(pipeline.execute('SELECT * FROM v'))

    assert len(view) == 1

    expected = sum(r[1] for r in rows) / float(len(rows))

    assert abs(float(view[0][0]) - expected) < 0.00001

    pipeline.execute('DROP VIEW v')
def test_join_with_where(pipeline, clean_db):
    """
    Verify that stream-table joins using a WHERE clause work properly
    """
    num_cols = 4
    q = """
    SELECT s.col0::integer FROM stream s, wt WHERE s.col0 = 1 AND wt.col0 = 1
    """
    wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('wt', **wt_cols)
    pipeline.create_table('wt_s', **wt_cols)

    wt = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 'wt', wt, 0.1)
    _insert(pipeline, 'wt_s', s, 0.1)

    pipeline.create_stream('stream', **wt_cols)
    pipeline.create_cv('test_join_where', q)
    _insert(pipeline, 'stream', s)

    expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1').first()
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_where').first()

    assert result['count'] == expected['count']
Example 29
def test_hll_agg_hashing(pipeline, clean_db):
    """
    Verify that hll_agg correctly hashes different input types
    """
    pipeline.create_stream('test_hll_stream', x='int', y='text', z='float8')
    q = """
    SELECT hll_agg(x::integer) AS i,
    hll_agg(y::text) AS t,
    hll_agg(z::float8) AS f FROM test_hll_stream
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_hll_hashing', q)

    rows = []
    for n in range(10000):
        rows.append((n, '%d' % n, float(n)))
        rows.append((n, '%05d' % n, float(n)))

    pipeline.insert('test_hll_stream', desc, rows)

    cvq = """
    SELECT hll_cardinality(i),
    hll_cardinality(t), hll_cardinality(f) FROM test_hll_hashing
    """
    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 9976
    assert result[1] == 19951
    assert result[2] == 10062
Example 30
def test_bloom_contains(pipeline, clean_db):
    """
  Verify that bloom_contains works
  """
    pipeline.create_stream("test_bloom_stream", x="int")

    q = """
  SELECT bloom_agg(x::integer) FROM test_bloom_stream
  """

    desc = "x"
    pipeline.create_cv("test_bloom_contains", q)

    rows = []
    for i in range(10000):
        rows.append((2 * i,))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_contains(bloom_agg, 0), bloom_contains(bloom_agg, 5000),
  bloom_contains(bloom_agg, 1), bloom_contains(bloom_agg, 5001)
  FROM test_bloom_contains
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1
    result = result[0]
    assert result[0] == True
    assert result[1] == True
    assert result[2] == False
    assert result[3] == False
def test_colums_subset(pipeline, clean_db):
  """
  Verify that copying data from a file into a stream works when the file's input
  columns are a subset of stream0's columns
  """
  pipeline.create_stream('stream0', x='int', y='float8', z='numeric', m='int')
  q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0'
  pipeline.create_cv('test_copy_subset', q)
  pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

  path = os.path.abspath(os.path.join(pipeline.data_dir, 'test_copy.csv'))

  rows = []
  for n in range(10000):
    row = random.randint(1, 1024), random.randint(1, 1024), random.random()
    rows.append(row)

  _generate_csv(path, rows, desc=('x', 'y', 'z'))

  pipeline.execute('COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

  pipeline.execute('COPY stream0 (x, y, z) FROM \'%s\' HEADER CSV' % path)

  expected = pipeline.execute('SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t')[0]
  result = pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset')

  assert len(result) == 1

  result = result[0]

  assert result[0] == expected[0]
  assert result[1] == expected[1]
  assert result[2] == expected[2]
Example 32
def test_restart_recovery(pipeline, clean_db):
    pipeline.create_stream('stream0', x='int')
    q = 'SELECT COUNT(*) FROM stream0'
    pipeline.create_cv('test_restart_recovery', q)

    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
    assert result['count'] == 2

    # Need to sleep here, otherwise on restart the materialization table is
    # empty. Not sure why.
    time.sleep(0.1)

    # Restart.
    pipeline.stop()
    pipeline.run()

    result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
    assert result['count'] == 2

    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    result = pipeline.execute('SELECT * FROM test_restart_recovery').first()
    assert result['count'] == 4
Example 33
def test_simple_crash(pipeline, clean_db):
    """
  Test simple worker and combiner crashes.
  """
    pipeline.create_stream('stream0', x='int')
    q = 'SELECT COUNT(*) FROM stream0'
    pipeline.create_cv('test_simple_crash', q)

    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    result = pipeline.execute('SELECT * FROM test_simple_crash').first()
    assert result['count'] == 2

    # This batch can potentially get lost.
    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    assert kill_worker()

    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    result = pipeline.execute('SELECT * FROM test_simple_crash').first()
    assert result['count'] in [4, 6]

    # This batch can potentially get lost.
    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    assert kill_combiner()

    pipeline.insert('stream0', ['x'], [(1, ), (1, )])

    result = pipeline.execute('SELECT * FROM test_simple_crash').first()
    assert result['count'] in [6, 8, 10]

    # To ensure that all remaining events in ZMQ queues have been consumed
    time.sleep(2)
Example 34
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream('stream', x='int', y='int', z='int')
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM stream
    GROUP BY x, y, z;
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_null_groups', q)
    pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = [v if random.random() > 0.1 else None for v in vals]
        rows.append(tuple(vals))

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_null_groups_t', desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(pipeline.execute('SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

    for r, e in zip(result, expected):
        assert r == e
Example 35
def test_bloom_contains(pipeline, clean_db):
    """
  Verify that bloom_contains works
  """
    pipeline.create_stream('test_bloom_stream', x='int')

    q = """
  SELECT bloom_agg(x::integer) FROM test_bloom_stream
  """

    desc = ('x', )
    pipeline.create_cv('test_bloom_contains', q)

    rows = []
    for i in range(10000):
        rows.append((2 * i, ))

    pipeline.insert('test_bloom_stream', desc, rows)

    cvq = """
  SELECT bloom_contains(bloom_agg, 0), bloom_contains(bloom_agg, 5000),
  bloom_contains(bloom_agg, 1), bloom_contains(bloom_agg, 5001)
  FROM test_bloom_contains
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1
    result = result[0]
    assert result[0] == True
    assert result[1] == True
    assert result[2] == False
    assert result[3] == False
def test_sliding_windows(pipeline, clean_db):
  """
  Verify that sliding window queries are properly dumped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.execute('CREATE VIEW sw_v WITH (sw = \'20 seconds\') AS SELECT count(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  assert result['count'] == 10

  _dump(pipeline, 'test_sw.sql')

  pipeline.drop_all()
  _restore(pipeline, 'test_sw.sql')

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  assert result['count'] == 10

  # We should still drop back to 0 within 20 seconds
  result = pipeline.execute('SELECT count FROM sw_v')[0]
  while result['count'] > 0:
    time.sleep(1)
    result = pipeline.execute('SELECT count FROM sw_v')[0]

  result = pipeline.execute('SELECT count FROM sw_v')[0]
  # Disabled until #157 (currently combine doesn't return 0 on NULL input for this aggregate)
  # assert result == 0
  assert result['count'] is None
def test_join_multiple_tables(pipeline, clean_db):
    """
    Verify that stream-table joins involving multiple tables work
    """
    num_cols = 8
    join_cols = [0]
    t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('t0', **t0_cols)
    pipeline.create_table('t1', **t1_cols)
    pipeline.create_stream('stream', **t0_cols)
    q = """
    SELECT s.col0::integer FROM t0 JOIN t1 ON t0.col0 = t1.col0
    JOIN stream s ON t1.col0 = s.col0
    """

    t0 = _generate_rows(num_cols, 64)
    t1 = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 't1', t1, 0.1)
    _insert(pipeline, 't0', t0, 0.1)

    pipeline.create_cv('test_join_multi', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t0, _join(s, t1, join_cols), join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

    assert result['count'] == len(expected)
Example 38
def test_bloom_intersection(pipeline, clean_db):
    """
  Verify that bloom_intersection works
  """
    pipeline.create_stream("test_bloom_stream", x="int", k="int")

    q = """
  SELECT k::int, bloom_agg(x::integer) FROM test_bloom_stream GROUP BY k
  """

    desc = ("k", "x")
    pipeline.create_cv("test_bloom_intersection", q)

    rows = []
    for i in range(10000):
        rows.append((0, 2 * i))
        rows.append((1, i))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_cardinality(bloom_intersection_agg(bloom_agg))
  FROM test_bloom_intersection
  """

    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 5530
def test_indexed(pipeline, clean_db):
    """
    Verify that stream-table joins involving indexed tables work
    """
    pipeline.create_stream('stream', x='int', y='int')
    q = """
    SELECT stream.x::integer, count(*) FROM stream
    JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
    """
    pipeline.create_table('test_indexed_t', x='integer', y='integer')
    pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

    t = _generate_rows(2, 1000)
    s = _generate_rows(2, 1000)

    pipeline.insert('test_indexed_t', ('x', 'y'), t)
    time.sleep(0.1)

    pipeline.create_cv('test_indexed', q)
    pipeline.insert('stream', ('x', 'y'), s)

    expected = _join(s, t, [0])
    result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

    assert result['sum'] == len(expected)
Example 40
def test_filter_clause(pipeline, clean_db):
    """
    Verify that FILTER clauses work on aggregates and sliding window aggregates
    """
    pipeline.create_stream('test_filter_stream', x='int')
    q = """
    SELECT SUM(x::int) FILTER (WHERE mod(x, 2) = 0) AS sum2, SUM(x::int) FILTER (WHERE mod(x, 3) = 0) AS sum3 FROM test_filter_stream
    """
    sw = """
    WHERE arrival_timestamp > clock_timestamp() - interval '30 second'
    """
    pipeline.create_cv('test_filter', q)
    pipeline.create_cv('test_filter_sw', '%s %s' % (q, sw))

    desc = ('x', )
    rows = []
    for n in range(1000):
        rows.append((n, ))

    pipeline.insert('test_filter_stream', desc, rows)

    sum2 = sum(filter(lambda x: x % 2 == 0, map(lambda x: x[0], rows)))
    sum3 = sum(filter(lambda x: x % 3 == 0, map(lambda x: x[0], rows)))

    result1 = pipeline.execute('SELECT * FROM test_filter').first()
    result2 = pipeline.execute('SELECT * FROM test_filter_sw').first()

    assert result1['sum2'] == result2['sum2'] == sum2
    assert result1['sum3'] == result2['sum3'] == sum3
Example 41
def test_user_low_and_high_card(pipeline, clean_db):
    """
    Verify that HLLs with low and high cardinalities are correctly combined
    """
    pipeline.create_stream('test_hll_stream', x='int', k='integer')
    q = """
    SELECT k::integer, hll_agg(x::integer) FROM test_hll_stream GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_cv('test_hll_agg', q)

    # Low cardinalities
    rows = []
    for n in range(1000):
        rows.append((0, random.choice((-1, -2))))
        rows.append((1, random.choice((-3, -4))))

    # High cardinalities
    for n in range(10000):
        rows.append((2, n))
        rows.append((3, n))

    pipeline.insert('test_hll_stream', desc, rows)

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (0, 1)').first()
    assert result[0] == 4

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (2, 3)').first()
    assert result[0] == 9976

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg').first()
    assert result[0] == 9983
Example 42
def test_sliding_windows(pipeline, clean_db):
  """
  Verify that sliding window queries are properly dumped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.execute('CREATE CONTINUOUS VIEW sw_v WITH (sw = \'20 seconds\') AS SELECT count(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_sw.sql')

  pipeline.drop_all()
  _restore(pipeline, 'test_sw.sql')

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 10

  # We should still drop back to 0 within 20 seconds
  result = pipeline.execute('SELECT count FROM sw_v').first()
  while result['count'] > 0:
    time.sleep(1)
    result = pipeline.execute('SELECT count FROM sw_v').first()

  result = pipeline.execute('SELECT count FROM sw_v').first()
  assert result['count'] == 0
Example 43
def test_user_low_and_high_card(pipeline, clean_db):
    """
    Verify that HLLs with low and high cardinalities are correctly combined
    """
    pipeline.create_stream('test_hll_stream', x='int', k='integer')
    q = """
    SELECT k::integer, hll_agg(x::integer) FROM test_hll_stream GROUP BY k
    """
    desc = ('k', 'x')
    pipeline.create_cv('test_hll_agg', q)

    # Low cardinalities
    rows = []
    for n in range(1000):
        rows.append((0, random.choice((-1, -2))))
        rows.append((1, random.choice((-3, -4))))

    # High cardinalities
    for n in range(10000):
        rows.append((2, n))
        rows.append((3, n))

    pipeline.insert('test_hll_stream', desc, rows)

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (0, 1)').first()
    assert result[0] == 4

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg WHERE k in (2, 3)').first()
    assert result[0] == 9976

    result = pipeline.execute('SELECT hll_cardinality(combine(hll_agg)) '
                              'FROM test_hll_agg').first()
    assert result[0] == 9983
Example 44
def test_single_continuous_view(pipeline, clean_db):
  """
  Verify that specific continuous views can be dropped and restored
  """
  pipeline.create_stream('stream0', x='int')
  pipeline.create_cv('test_single0', 'SELECT COUNT(*) FROM stream0')
  pipeline.create_cv('test_single1', 'SELECT COUNT(*) FROM stream0')
  pipeline.insert('stream0', ('x',), [(x,) for x in range(10)])

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  result = pipeline.execute('SELECT count FROM test_single1').first()
  assert result['count'] == 10

  _dump(pipeline, 'test_single.sql', tables=['test_single0', 'stream0', 'test_single0_mrel'])

  pipeline.drop_all()
  _restore(pipeline, 'test_single.sql')

  result = pipeline.execute('SELECT count FROM test_single0').first()
  assert result['count'] == 10

  # We didn't dump this one
  result = list(pipeline.execute('SELECT * FROM pg_class WHERE relname LIKE \'%%test_single1%%\''))
  assert not result
Example 45
def test_copy_to_typed_stream(pipeline, clean_db):
    """
    Verify that copying data from a file into a typed stream works.
    """
    pipeline.create_stream('stream', x='integer', y='float8', z='numeric')

    q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
    pipeline.create_cv('test_copy_to_typed_stream', q)
    pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute('COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute('SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
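
_generate_csv is another shared helper. A minimal sketch, assuming it writes a header row (the COPY commands above use the HEADER option) followed by the data rows:

# Minimal sketch of _generate_csv, assuming a header row plus the data rows.
import csv


def _generate_csv(path, rows, desc=None):
    with open(path, 'w') as f:
        writer = csv.writer(f)
        if desc:
            writer.writerow(desc)  # header line consumed by COPY ... HEADER CSV
        writer.writerows(rows)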
Example 46
def test_simple_aggs(pipeline, clean_db):
    """
    Verify that combines work properly on simple aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_simple_aggs', q)
    pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_simple_aggs_t', desc, rows)

    table_result = list(pipeline.execute('SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t'))
    cv_result = list(pipeline.execute('SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
        assert abs(tr[2] - cr[2]) < 0.00001
def test_join_across_batches(pipeline, clean_db):
  """
  Verify that stream-table joins are properly built when they
  span across multiple input batches
  """
  num_cols = 4
  join_cols = [0]
  t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  pipeline.create_table('batch', **t_cols)
  pipeline.create_stream('stream0', **t_cols)

  q = """
  SELECT s.col0::integer FROM batch JOIN stream0 s ON batch.col0 = s.col0
  """

  t = _generate_rows(num_cols, 64)
  _insert(pipeline, 'batch', t, 0.1)

  s = _generate_rows(num_cols, 64)
  pipeline.create_cv('test_batched_join', q)
  _insert(pipeline, 'stream0', s)

  expected = _join(t, s, join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join')[0]

  assert result['count'] == len(expected)
def test_join_across_batches(pipeline, clean_db):
    """
    Verify that stream-table joins are properly built when they
    span across multiple input batches
    """
    num_cols = 4
    join_cols = [0]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('batch', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM batch JOIN stream s ON batch.col0 = s.col0
    """

    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'batch', t, 0.1)

    s = _generate_rows(num_cols, 64)
    pipeline.create_cv('test_batched_join', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join').first()

    assert result['count'] == len(expected)
def test_join_multiple_tables(pipeline, clean_db):
  """
  Verify that stream-table joins involving multiple tables work
  """
  num_cols = 8
  join_cols = [0]
  t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('t0', **t0_cols)
  pipeline.create_table('t1', **t1_cols)
  pipeline.create_stream('stream0', **t0_cols)
  q = """
  SELECT s.col0::integer FROM t0 JOIN t1 ON t0.col0 = t1.col0
  JOIN stream0 s ON t1.col0 = s.col0
  """

  t0 = _generate_rows(num_cols, 64)
  t1 = _generate_rows(num_cols, 64)
  s = _generate_rows(num_cols, 64)

  _insert(pipeline, 't1', t1, 0.1)
  _insert(pipeline, 't0', t0, 0.1)

  pipeline.create_cv('test_join_multi', q)
  _insert(pipeline, 'stream0', s)

  expected = _join(t0, _join(s, t1, join_cols), join_cols)
  result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi')[0]

  assert result['count'] == len(expected)
def test_incremental_join(pipeline, clean_db):
    """
    Verify that join results increase appropriately as we incrementally
    add stream events to the input
    """
    num_cols = 4
    join_cols = [0, 1]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('inc', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM inc JOIN stream s ON inc.col0 = s.col0
    AND inc.col1 = s.col1::integer
    """
    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'inc', t, 0.1)

    pipeline.create_cv('test_join', q)
    s = []
    for n in range(2):
        row = _generate_row(num_cols)
        _insert(pipeline, 'stream', [row])
        s.append(row)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join').first()

    assert result['count'] == len(expected)
Example 51
def test_bloom_agg_hashing(pipeline, clean_db):
    """
  Verify that bloom_agg correctly hashes different input types
  """
    pipeline.create_stream("test_bloom_stream", x="int", y="text", z="float8")

    q = """
  SELECT bloom_agg(x::integer) AS i,
  bloom_agg(y::text) AS t,
  bloom_agg(z::float8) AS f FROM test_bloom_stream
  """
    desc = ("x", "y", "z")
    pipeline.create_cv("test_bloom_hashing", q)

    rows = []
    for n in range(10000):
        rows.append((n, "%d" % n, float(n)))
        rows.append((n, "%05d" % n, float(n)))

    pipeline.insert("test_bloom_stream", desc, rows)

    cvq = """
  SELECT bloom_cardinality(i),
  bloom_cardinality(t), bloom_cardinality(f) FROM test_bloom_hashing
  """
    result = list(pipeline.execute(cvq))

    assert len(result) == 1

    result = result[0]

    assert result[0] == 8879
    assert result[1] == 15614
    assert result[2] == 8855
Example 52
def test_regression(pipeline, clean_db):
  path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))
  _generate_csv(path, [['2015-06-01 00:00:00','De','Adam_Babareka','1','37433']], desc=('day', 'project', 'title', 'count', 'size'))

  pipeline.create_stream('copy_regression_stream', count='int', day='timestamp', project='text', title='text', size='int')
  pipeline.create_cv('test_copy_regression', 'SELECT sum(count) FROM copy_regression_stream')

  pipeline.execute("COPY copy_regression_stream (day, project, title, count, size) FROM '%s' CSV HEADER" % path)
Example 53
def test_online_drop_column(pipeline, clean_db):
  pipeline.create_stream('stream1', c0='integer')

  try:
    pipeline.execute('ALTER STREAM stream1 DROP c0')
    assert False
  except:
    pass
Example 54
def test_online_drop_column(pipeline, clean_db):
    pipeline.create_stream("stream1", c0="integer")

    try:
        pipeline.execute("ALTER STREAM stream1 DROP c0")
        assert False
    except:
        pass
def test_regression(pipeline, clean_db):
  path = os.path.abspath(os.path.join(pipeline.data_dir, 'test_copy.csv'))
  _generate_csv(path, [['2015-06-01 00:00:00', 'De', 'Adam_Babareka', '1', '37433']], desc=('day', 'project', 'title', 'count', 'size'))

  pipeline.create_stream('copy_regression_stream', count='int', day='timestamp', project='text', title='text', size='int')
  pipeline.create_cv('test_copy_regression', 'SELECT sum(count) FROM copy_regression_stream')

  pipeline.execute("COPY copy_regression_stream (day, project, title, count, size) FROM '%s' CSV HEADER" % path)
Example 56
def test_cq_stats(pipeline, clean_db):
    """
    Verify that CQ statistics collection works
    """
    num_combiners = int(pipeline.execute('SHOW continuous_query_num_combiners').first()['continuous_query_num_combiners'])
    num_workers = int(pipeline.execute('SHOW continuous_query_num_workers').first()['continuous_query_num_workers'])

    pipeline.create_stream('stream0', x='int')

    # 10 rows
    q = 'SELECT x::integer %% 10 AS g, COUNT(*) FROM stream0 GROUP BY g'
    pipeline.create_cv('test_10_groups', q)

    # 1 row
    q = 'SELECT COUNT(*) FROM stream0'
    pipeline.create_cv('test_1_group', q)

    values = [(random.randint(1, 1024),) for n in range(1000)]

    pipeline.insert('stream0', ('x',), values)
    pipeline.insert('stream0', ('x',), values)
    # Sleep a little so that the next time we insert, we force the stats collector.
    # Must be >= 1s since that's the force interval.
    time.sleep(1)
    pipeline.insert('stream0', ('x',), values)
    pipeline.insert('stream0', ('x',), values)

    # Sleep a little so the stats collector flushes all the stats.
    time.sleep(1)

    proc_result = list(pipeline.execute('SELECT * FROM pipeline_proc_stats'))
    cq_result = list(pipeline.execute('SELECT * FROM pipeline_query_stats'))

    proc_rows = len(proc_result)
    cq_rows = len(cq_result)

    # We are guaranteed to send data to all combiners, but only to at least one
    # worker, since we randomly select which worker to send the data to.
    assert proc_rows >= num_combiners + 1
    assert proc_rows <= num_combiners + num_workers
    assert cq_rows == 4

    # We get 2000 in case the first two microbatches go to the same worker
    # and the second two go to a different one. In this case, both will flush
    # the first microbatch they see, so 1000 + 1000.
    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'worker'").first()
    assert result['input_rows'] in [2000, 3000, 4000]

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'combiner'").first()
    assert result['output_rows'] == 10

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'worker'").first()
    assert result['input_rows'] in [2000, 3000, 4000]

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'combiner'").first()
    assert result['output_rows'] == 1
Example 57
def test_cq_stats(pipeline, clean_db):
    """
    Verify that CQ statistics collection works
    """
    num_combiners = int(pipeline.execute('SHOW continuous_query_num_combiners').first()['continuous_query_num_combiners'])
    num_workers = int(pipeline.execute('SHOW continuous_query_num_workers').first()['continuous_query_num_workers'])

    pipeline.create_stream('stream', x='int')

    # 10 rows
    q = 'SELECT x::integer %% 10 AS g, COUNT(*) FROM stream GROUP BY g'
    pipeline.create_cv('test_10_groups', q)

    # 1 row
    q = 'SELECT COUNT(*) FROM stream'
    pipeline.create_cv('test_1_group', q)

    values = [(random.randint(1, 1024),) for n in range(1000)]

    pipeline.insert('stream', ('x',), values)
    pipeline.insert('stream', ('x',), values)
    # Sleep a little so that the next time we insert, we force the stats collector.
    # Must be >= 1s since that's the force interval.
    time.sleep(1)
    pipeline.insert('stream', ('x',), values)
    pipeline.insert('stream', ('x',), values)

    # Sleep a little so the stats collector flushes all the stats.
    time.sleep(1)

    proc_result = list(pipeline.execute('SELECT * FROM pipeline_proc_stats'))
    cq_result = list(pipeline.execute('SELECT * FROM pipeline_query_stats'))

    proc_rows = len(proc_result)
    cq_rows = len(cq_result)

    # We are guaranteed to send data to all combiners, but only to at least one
    # worker, since we randomly select which worker to send the data to.
    assert proc_rows >= num_combiners + 1
    assert proc_rows <= num_combiners + num_workers
    assert cq_rows == 4

    # We get 2000 in case the first two microbatches go to the same worker
    # and the second two go to a different one. In this case, both will flush
    # the first microbatch they see, so 1000 + 1000.
    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'worker'").first()
    assert result['input_rows'] in [2000, 3000, 4000]

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_10_groups' AND type = 'combiner'").first()
    assert result['output_rows'] == 10

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'worker'").first()
    assert result['input_rows'] in [2000, 3000, 4000]

    result = pipeline.execute("SELECT * FROM pipeline_query_stats WHERE name = 'test_1_group' AND type = 'combiner'").first()
    assert result['output_rows'] == 1
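The fixed one-second sleeps above depend on the stats collector's force interval. As an alternative, a polling helper could wait for pipeline_proc_stats to be populated; the sketch below is hypothetical (wait_for_stats is not part of the test harness) and only assumes the pipeline.execute API already used in these tests.
import time

def wait_for_stats(pipeline, min_proc_rows, timeout=10, interval=0.25):
    # Hypothetical helper: poll pipeline_proc_stats until at least
    # min_proc_rows rows are visible, or fail after the timeout.
    deadline = time.time() + timeout
    while time.time() < deadline:
        rows = list(pipeline.execute('SELECT * FROM pipeline_proc_stats'))
        if len(rows) >= min_proc_rows:
            return rows
        time.sleep(interval)
    raise AssertionError('pipeline_proc_stats not populated within %ss' % timeout)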
Esempio n. 58
0
def test_join_ordering(pipeline, clean_db):
    """
    Verify that the correct plan is generated regardless of the ordering of
    streams and tables.
    """
    num_cols = 8
    join_cols = [0]
    ordering0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    ordering1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('ordering0', **ordering0_cols)
    pipeline.create_table('ordering1', **ordering1_cols)

    ordering0 = _generate_rows(num_cols, 64)
    ordering1 = _generate_rows(num_cols, 64)
    _insert(pipeline, 'ordering0', ordering0, 0.1)
    _insert(pipeline, 'ordering1', ordering1, 0.1)

    pipeline.create_stream('stream', **ordering0_cols)

    # stream, table, table
    q0 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    stream s JOIN ordering0 ON s.col0 = ordering0.col0
    JOIN ordering1 ON ordering0.col0 = ordering1.col0
    """
    pipeline.create_cv('test_ordering0', q0)

    # table, stream, table
    q1 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    ordering0 JOIN stream s ON s.col0 = ordering0.col0
    JOIN ordering1 ON ordering0.col0 = ordering1.col0
    """
    pipeline.create_cv('test_ordering1', q1)

    # table, table, stream
    q2 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    ordering0 JOIN ordering1 ON ordering0.col0 = ordering1.col0
    JOIN stream s ON s.col0 = ordering0.col0
    """
    pipeline.create_cv('test_ordering2', q2)

    s = _generate_rows(num_cols, 64)
    _insert(pipeline, 'stream', s)

    expected = _join(ordering0, _join(ordering1, s, join_cols), join_cols)

    result0 = pipeline.execute('SELECT COUNT(*) FROM test_ordering0').first()
    result1 = pipeline.execute('SELECT COUNT(*) FROM test_ordering1').first()
    result2 = pipeline.execute('SELECT COUNT(*) FROM test_ordering2').first()

    assert result0['count'] == len(expected)
    assert result1['count'] == len(expected)
    assert result2['count'] == len(expected)
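The _generate_rows and _insert helpers used above are defined elsewhere in the suite. A minimal sketch consistent with how this test calls them (rows of small integers under columns col0..colN, plus an optional settling sleep) might look like the following; the value range is an assumption chosen so that join keys collide.
import random
import time

def _generate_rows(num_cols, num_rows):
    # Sketch: tuples of small integers so equi-join keys are likely to match.
    return [tuple(random.randint(0, 9) for _ in range(num_cols))
            for _ in range(num_rows)]

def _insert(pipeline, target, rows, sleep=0):
    # Sketch: insert rows under col0..colN, then optionally pause so tables
    # are fully populated before the stream is written to.
    cols = ['col%d' % c for c in range(len(rows[0]))]
    pipeline.insert(target, cols, rows)
    if sleep:
        time.sleep(sleep)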
Esempio n. 59
0
def test_join_ordering(pipeline, clean_db):
  """
  Verify that the correct plan is generated regardless of the ordering of
  streams and tables.
  """
  num_cols = 8
  join_cols = [0]
  ordering0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
  ordering1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

  pipeline.create_table('ordering0', **ordering0_cols)
  pipeline.create_table('ordering1', **ordering1_cols)

  ordering0 = _generate_rows(num_cols, 64)
  ordering1 = _generate_rows(num_cols, 64)
  _insert(pipeline, 'ordering0', ordering0, 0.1)
  _insert(pipeline, 'ordering1', ordering1, 0.1)

  pipeline.create_stream('stream0', **ordering0_cols)

  # stream, table, table
  q0 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
  stream0 s JOIN ordering0 ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering0', q0)

  # table, stream, table
  q1 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
  ordering0 JOIN stream0 s ON s.col0 = ordering0.col0
  JOIN ordering1 ON ordering0.col0 = ordering1.col0
  """
  pipeline.create_cv('test_ordering1', q1)

  # table, table, stream
  q2 = """
  SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
  ordering0 JOIN ordering1 ON ordering0.col0 = ordering1.col0
  JOIN stream0 s ON s.col0 = ordering0.col0
  """
  pipeline.create_cv('test_ordering2', q2)

  s = _generate_rows(num_cols, 64)
  _insert(pipeline, 'stream0', s)

  expected = _join(ordering0, _join(ordering1, s, join_cols), join_cols)

  result0 = pipeline.execute('SELECT COUNT(*) FROM test_ordering0')[0]
  result1 = pipeline.execute('SELECT COUNT(*) FROM test_ordering1')[0]
  result2 = pipeline.execute('SELECT COUNT(*) FROM test_ordering2')[0]

  assert result0['count'] == len(expected)
  assert result1['count'] == len(expected)
  assert result2['count'] == len(expected)
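_join is likewise defined elsewhere; a naive nested-loop sketch that matches how the expected count is computed above (equality on the listed column indexes, concatenating matching rows) could be:
def _join(left, right, cols):
    # Sketch: nested-loop equi-join on the given column indexes.
    joined = []
    for l in left:
        for r in right:
            if all(l[c] == r[c] for c in cols):
                joined.append(l + r)
    return joined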
Esempio n. 60
0
def test_multiple_databases(pipeline, clean_db):
    conn = psycopg2.connect('dbname=postgres user=%s host=localhost port=%s' %
                            (getpass.getuser(), pipeline.port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)

    cur = conn.cursor()
    cur.execute('CREATE DATABASE tmp_pipeline')
    cur.close()

    q = 'SELECT x::int FROM dbstream'
    pipeline.create_stream('dbstream', x='int')
    pipeline.create_cv('test_multiple_databases', q)

    # Insert data in first database.
    pipeline.insert('dbstream', ['x'], map(lambda x: (x, ), range(0, 10, 2)))
    result = pipeline.execute('SELECT * FROM test_multiple_databases')
    assert sorted(row['x'] for row in result) == range(0, 10, 2)

    # Create the same CV in the other database, make sure it's created, and write different data to it.
    tmp_conn = psycopg2.connect(
        'dbname=tmp_pipeline user=%s host=localhost port=%s' %
        (getpass.getuser(), pipeline.port))
    cur = tmp_conn.cursor()
    cur.execute('CREATE EXTENSION pipelinedb')
    cur.execute('CREATE FOREIGN TABLE dbstream (x int) SERVER pipelinedb')
    cur.execute('CREATE VIEW test_multiple_databases AS %s' % q)
    tmp_conn.commit()
    cur.execute('INSERT INTO dbstream (x) VALUES %s' %
                ', '.join(map(lambda x: '(%d)' % x, range(1, 11, 2))))
    cur.execute('SELECT * FROM test_multiple_databases')
    tmp_conn.commit()
    assert sorted(row[0] for row in cur) == range(1, 11, 2)

    # Ensure that the data written to the other database isn't seen by the first database.
    result = pipeline.execute('SELECT * FROM test_multiple_databases')
    assert sorted(row['x'] for row in result) == range(0, 10, 2)

    # Insert new data to both databases.
    pipeline.insert('dbstream', ['x'], map(lambda x: (x, ), range(10, 20, 2)))
    cur.execute('INSERT INTO dbstream (x) VALUES %s' %
                ', '.join(map(lambda x: '(%d)' % x, range(11, 21, 2))))

    # Ensure both databases still saw the data written out to them.
    result = pipeline.execute('SELECT * FROM test_multiple_databases')
    assert sorted(row['x'] for row in result) == range(0, 20, 2)
    cur.execute('SELECT * FROM test_multiple_databases')
    tmp_conn.commit()
    assert sorted(row[0] for row in cur) == range(1, 21, 2)

    cur.close()
    tmp_conn.close()
    cur = conn.cursor()
    cur.execute('DROP DATABASE tmp_pipeline')
    cur.close()
    conn.close()
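The teardown above only runs if every assertion passes. A hypothetical cleanup helper (drop_scratch_database is illustrative, not part of the suite) could be called from a try/finally block so the scratch database is always removed:
import getpass
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

def drop_scratch_database(pipeline, name='tmp_pipeline'):
    # Hypothetical helper: DROP DATABASE cannot run inside a transaction
    # block, so use an autocommit connection to the postgres database.
    conn = psycopg2.connect('dbname=postgres user=%s host=localhost port=%s' %
                            (getpass.getuser(), pipeline.port))
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = conn.cursor()
    cur.execute('DROP DATABASE IF EXISTS %s' % name)
    cur.close()
    conn.close()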