def test_simple_aggs(pipeline, clean_db):
    """
    Verify that combines work properly on simple aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    avg(x), sum(y::float8), count(*) FROM stream0 GROUP BY k;
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_simple_aggs', q)
    pipeline.create_table('test_simple_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_simple_aggs_t', desc, rows)

    table_result = list(pipeline.execute('SELECT avg(x), sum(y::float8), count(*) FROM test_simple_aggs_t'))
    cv_result = list(pipeline.execute('SELECT combine(avg), combine(sum), combine(count) FROM test_simple_aggs'))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
        assert abs(tr[2] - cr[2]) < 0.00001
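
# Illustration (not part of the original tests): in PipelineDB, combine()
# merges the partial aggregate states that the continuous view keeps per
# group, so combining the ten per-group states above should equal the plain
# aggregate over every row in the table. A pure-Python sketch of that
# identity for avg, where each partial state is a (count, sum) pair:
def _combine_avg_sketch():
    rows = [3, 5, 7, 11, 13]
    groups = {0: [3, 7, 13], 1: [5, 11]}  # any grouping of the same rows
    states = [(len(v), sum(v)) for v in groups.values()]
    combined = sum(s for _, s in states) / float(sum(c for c, _ in states))
    assert abs(combined - sum(rows) / float(len(rows))) < 1e-9
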
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream("s", x="int", y="int", z="int")
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
    GROUP BY x, y, z;
    """
    desc = ("x", "y", "z")
    pipeline.create_cv("test_null_groups", q)
    pipeline.create_table("test_null_groups_t", x="integer", y="integer", z="integer")

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = map(lambda n: random.random() > 0.1 and n or None, vals)
        rows.append(tuple(vals))

    pipeline.insert("s", desc, rows)
    pipeline.insert("test_null_groups_t", desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(pipeline.execute("SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z"))

    for r, e in zip(result, expected):
        assert r == e
def test_incremental_join(pipeline, clean_db):
    """
    Verify that join results increase appropriately as we incrementally
    add stream events to the input
    """
    num_cols = 4
    join_cols = [0, 1]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('inc', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM inc JOIN stream s ON inc.col0 = s.col0
    AND inc.col1 = s.col1::integer
    """
    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'inc', t, 0.1)

    pipeline.create_cv('test_join', q)
    s = []
    for n in range(2):
        row = _generate_row(num_cols)
        _insert(pipeline, 'stream', [row])
        s.append(row)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join').first()

    assert result['count'] == len(expected)
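
# The _generate_row/_generate_rows/_insert/_join helpers used above (and in
# several examples below) are not part of this listing. A plausible minimal
# sketch, assuming small random integer columns and a nested-loop equi-join
# on the given column indexes; this is an assumption, not necessarily the
# original implementation (random and time are assumed to be imported at
# module level, as the tests above already rely on them):
def _generate_row(num_cols):
    # keep the value range small so join matches are likely
    return tuple(random.randint(0, 32) for _ in range(num_cols))

def _generate_rows(num_cols, count):
    return [_generate_row(num_cols) for _ in range(count)]

def _insert(pipeline, target, rows, sleep=0):
    # column names follow the col0..colN convention used by these tests
    desc = ['col%d' % i for i in range(len(rows[0]))]
    pipeline.insert(target, desc, rows)
    if sleep:
        time.sleep(sleep)

def _join(left, right, cols):
    # nested-loop equi-join; each match is the concatenation of the left and
    # right rows, so len() of the result is the expected join cardinality
    out = []
    for l in left:
        for r in right:
            if all(l[c] == r[c] for c in cols):
                out.append(l + r)
    return out
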
def test_hll_distinct(pipeline, clean_db):
    """
    Verify that combines work on HLL COUNT DISTINCT queries
    """
    q = """
    SELECT x::integer %% 10 AS k, COUNT(DISTINCT x) AS count FROM stream GROUP BY k
    """
    desc = ('x', 'y')
    pipeline.create_cv('test_hll_distinct', q)
    pipeline.create_table('test_hll_distinct_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_hll_distinct_t', desc, rows)

    # Note that the CQ will use the HLL variant of COUNT DISTINCT,
    # so use hll_count_distinct on the table too
    tq = """
    SELECT hll_count_distinct(x) FROM test_hll_distinct_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(count) FROM test_hll_distinct
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert tr == cr
def test_indexed(pipeline, clean_db):
    """
    Verify that stream-table joins involving indexed tables work
    """
    pipeline.create_stream('stream', x='int', y='int')
    q = """
    SELECT stream.x::integer, count(*) FROM stream
    JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
    """
    pipeline.create_table('test_indexed_t', x='integer', y='integer')
    pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

    t = _generate_rows(2, 1000)
    s = _generate_rows(2, 1000)

    pipeline.insert('test_indexed_t', ('x', 'y'), t)
    time.sleep(0.1)

    pipeline.create_cv('test_indexed', q)
    pipeline.insert('stream', ('x', 'y'), s)

    expected = _join(s, t, [0])
    result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

    assert result['sum'] == len(expected)
def test_join_multiple_tables(pipeline, clean_db):
    """
    Verify that stream-table joins involving multiple tables work
    """
    num_cols = 8
    join_cols = [0]
    t0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    t1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('t0', **t0_cols)
    pipeline.create_table('t1', **t1_cols)
    pipeline.create_stream('stream', **t0_cols)
    q = """
    SELECT s.col0::integer FROM t0 JOIN t1 ON t0.col0 = t1.col0
    JOIN stream s ON t1.col0 = s.col0
    """

    t0 = _generate_rows(num_cols, 64)
    t1 = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 't1', t1, 0.1)
    _insert(pipeline, 't0', t0, 0.1)

    pipeline.create_cv('test_join_multi', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t0, _join(s, t1, join_cols), join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_multi').first()

    assert result['count'] == len(expected)
def test_join_with_where(pipeline, clean_db):
    """
    Verify that stream-table joins using a WHERE clause work properly
    """
    num_cols = 4
    q = """
    SELECT s.col0::integer FROM stream s, wt WHERE s.col0 = 1 AND wt.col0 = 1
    """
    wt_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('wt', **wt_cols)
    pipeline.create_table('wt_s', **wt_cols)

    wt = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 'wt', wt, 0.1)
    _insert(pipeline, 'wt_s', s, 0.1)

    pipeline.create_stream('stream', **wt_cols)
    pipeline.create_cv('test_join_where', q)
    _insert(pipeline, 'stream', s)

    expected = pipeline.execute('SELECT COUNT(*) FROM wt_s s, wt WHERE s.col0 = 1 AND wt.col0 = 1').first()
    result = pipeline.execute('SELECT COUNT(*) FROM test_join_where').first()

    assert result['count'] == expected['count']
def test_stats_aggs(pipeline, clean_db):
    """
    Verify that combines work on stats aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    regr_sxx(x, y::float8), stddev(x) FROM stream GROUP BY k;
    """
    desc = ('x', 'y')
    pipeline.create_cv('test_stats_aggs', q)
    pipeline.create_table('test_stats_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_stats_aggs_t', desc, rows)

    tq = """
    SELECT regr_sxx(x, y::float8), stddev(x) FROM test_stats_aggs_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(regr_sxx), combine(stddev) FROM test_stats_aggs
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of stream0's columns
    """
    pipeline.create_stream("stream0", x="int", y="float8", z="numeric", m="int")
    q = "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0"
    pipeline.create_cv("test_copy_subset", q)
    pipeline.create_table("test_copy_subset_t", x="integer", y="float8", z="numeric")

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, "test_copy.csv"))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=("x", "y", "z"))

    pipeline.execute("COPY test_copy_subset_t (x, y, z) FROM '%s' HEADER CSV" % path)

    pipeline.execute("COPY stream0 (x, y, z) FROM '%s' HEADER CSV" % path)

    expected = pipeline.execute(
        "SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t"
    ).first()
    result = list(pipeline.execute("SELECT s0, s1, avg FROM test_copy_subset"))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
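
# _generate_csv is also not shown in this listing. A minimal sketch under the
# assumption that it writes a header row (the COPY ... HEADER CSV statements
# above expect one) followed by the data rows:
def _generate_csv(path, rows, desc=()):
    import csv
    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(desc)
        for row in rows:
            writer.writerow(row)
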
def _test_agg(pipeline, agg, check_fn=None):
    name = agg[:agg.find('(')]
    q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
    cv_name = 'test_%s' % name
    table_name = 'test_%s_t' % name
    desc = ('ts', 'g', 'x', 'y', 'z')

    pipeline.create_cv(cv_name, q % (agg, 'stream'))
    pipeline.create_table(table_name, ts='timestamp', x='integer', y='integer', z='integer', g='integer')

    rows = []
    for i, n in enumerate(range(1000)):
        ts = str(datetime.utcnow() + timedelta(seconds=i))
        row = ts, n % 10, random.randint(1, 256), random.randint(1, 256), random.randint(1, 256)
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert(table_name, desc, rows)

    if check_fn:
        return check_fn(pipeline)

    expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
    result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

    assert len(expected) == len(result)

    for e, r in zip(expected, result):
        assert e == r

    pipeline.drop_cv(cv_name)
    pipeline.drop_table(table_name)
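
# Hypothetical callers of the _test_agg helper above, only to show how it is
# meant to be parameterized; the aggregate names here are examples and not
# necessarily the ones exercised in the original module:
def test_sum(pipeline, clean_db):
    _test_agg(pipeline, 'sum(x::integer)')

def test_avg(pipeline, clean_db):
    _test_agg(pipeline, 'avg(x::integer)')
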
def test_join_across_batches(pipeline, clean_db):
    """
    Verify that stream-table joins are properly built when they
    span multiple input batches
    """
    num_cols = 4
    join_cols = [0]
    t_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    pipeline.create_table('batch', **t_cols)
    pipeline.create_stream('stream', **t_cols)

    q = """
    SELECT s.col0::integer FROM batch JOIN stream s ON batch.col0 = s.col0
    """

    t = _generate_rows(num_cols, 64)
    _insert(pipeline, 'batch', t, 0.1)

    s = _generate_rows(num_cols, 64)
    pipeline.create_cv('test_batched_join', q)
    _insert(pipeline, 'stream', s)

    expected = _join(t, s, join_cols)
    result = pipeline.execute('SELECT COUNT(*) FROM test_batched_join').first()

    assert result['count'] == len(expected)
def test_copy_to_typed_stream(pipeline, clean_db):
    """
    Verify that copying data from a file into a typed stream works.
    """
    pipeline.create_stream('stream', x='integer', y='float8', z='numeric')

    q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
    pipeline.create_cv('test_copy_to_typed_stream', q)
    pipeline.create_table('test_copy_to_typed_stream_t', x='integer', y='float8', z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute('COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute('SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t').first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of the stream's columns
    """
    q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream'
    pipeline.create_cv('test_copy_subset', q)
    pipeline.create_table('test_copy_subset_t', x='integer', y='float8', z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute('COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute('SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t').first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream('stream', x='int', y='int', z='int')
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM stream
    GROUP BY x, y, z;
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_null_groups', q)
    pipeline.create_table('test_null_groups_t', x='integer', y='integer', z='integer')

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = map(lambda n: random.random() > 0.1 and n or None, vals)
        rows.append(tuple(vals))

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_null_groups_t', desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(pipeline.execute('SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

    for r, e in zip(result, expected):
        assert r == e
def test_join_ordering(pipeline, clean_db):
    """
    Verify that the correct plan is generated regardless of the ordering of
    streams and tables.
    """
    num_cols = 8
    join_cols = [0]
    ordering0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    ordering1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('ordering0', **ordering0_cols)
    pipeline.create_table('ordering1', **ordering1_cols)

    ordering0 = _generate_rows(num_cols, 64)
    ordering1 = _generate_rows(num_cols, 64)
    _insert(pipeline, 'ordering0', ordering0, 0.1)
    _insert(pipeline, 'ordering1', ordering1, 0.1)

    pipeline.create_stream('stream', **ordering0_cols)

    # stream, table, table
    q0 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    stream s JOIN ordering0 ON s.col0 = ordering0.col0
    JOIN ordering1 ON ordering0.col0 = ordering1.col0
    """
    pipeline.create_cv('test_ordering0', q0)

    # table, stream, table
    q1 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    ordering0 JOIN stream s ON s.col0 = ordering0.col0
    JOIN ordering1 ON ordering0.col0 = ordering1.col0
    """
    pipeline.create_cv('test_ordering1', q1)

    # table, table, stream
    q2 = """
    SELECT s.col0::integer, ordering0.col3, ordering1.col4 FROM
    ordering0 JOIN ordering1 ON ordering0.col0 = ordering1.col0
    JOIN stream s ON s.col0 = ordering0.col0
    """
    pipeline.create_cv('test_ordering2', q2)

    s = _generate_rows(num_cols, 64)
    _insert(pipeline, 'stream', s)

    expected = _join(ordering0, _join(ordering1, s, join_cols), join_cols)

    result0 = pipeline.execute('SELECT COUNT(*) FROM test_ordering0').first()
    result1 = pipeline.execute('SELECT COUNT(*) FROM test_ordering1').first()
    result2 = pipeline.execute('SELECT COUNT(*) FROM test_ordering2').first()

    assert result0['count'] == len(expected)
    assert result1['count'] == len(expected)
    assert result2['count'] == len(expected)
def test_bloom_type(pipeline, clean_db):
    pipeline.create_table("test_bloom_type", x="int", y="bloom")
    pipeline.execute("INSERT INTO test_bloom_type (x, y) VALUES " "(1, bloom_empty()), (2, bloom_empty())")

    for i in xrange(1000):
        pipeline.execute("UPDATE test_bloom_type SET y = bloom_add(y, %d / x)" % i)

    result = list(pipeline.execute("SELECT bloom_cardinality(y) " "FROM test_bloom_type ORDER BY x"))
    assert result[0][0] == 986
    assert result[1][0] == 495
def test_hll_type(pipeline, clean_db):
  pipeline.create_table('test_hll_type', x='int', y='hll')
  pipeline.execute('INSERT INTO test_hll_type (x, y) VALUES '
                   '(1, hll_empty()), (2, hll_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_hll_type SET y = hll_add(y, %d / x)' % i)

  result = list(pipeline.execute('SELECT hll_cardinality(y) '
                                 'FROM test_hll_type ORDER BY x'))
  assert result[0][0] == 995
  assert result[1][0] == 497
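
# As with the Bloom filter test above, the asserted cardinalities are
# approximations of exact distinct counts: i / 1 takes 1000 distinct values
# and i / 2 (integer division) takes 500, and the HLL estimates (995, 497)
# land within roughly 1% of those. A quick check of the exact counts:
def _hll_expected_sketch():
    assert len(set(i // 1 for i in range(1000))) == 1000
    assert len(set(i // 2 for i in range(1000))) == 500
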
def test_cmsketch_type(pipeline, clean_db):
  pipeline.create_table('test_cmsketch_type', x='int', y='cmsketch')
  pipeline.execute('INSERT INTO test_cmsketch_type (x, y) VALUES '
                   '(1, cmsketch_empty()), (2, cmsketch_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_cmsketch_type '
                     'SET y = cmsketch_add(y, {} %% x)'.format(i))

  result = list(pipeline.execute('SELECT cmsketch_frequency(y, 0), '
                                 'cmsketch_frequency(y, 1) '
                                 'FROM test_cmsketch_type ORDER BY x'))
  assert result[0] == (1000, 0)
  assert result[1] == (500, 500)
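
# The expected frequencies follow directly from the update loop: for the row
# with x = 1, i % 1 is always 0, so the value 0 is added 1000 times and 1 is
# never added; for x = 2, i % 2 alternates, giving 500 of each. A quick check
# of that arithmetic:
def _cmsketch_expected_sketch():
    x1 = [i % 1 for i in range(1000)]
    x2 = [i % 2 for i in range(1000)]
    assert (x1.count(0), x1.count(1)) == (1000, 0)
    assert (x2.count(0), x2.count(1)) == (500, 500)
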
def test_cmsketch_type(pipeline, clean_db):
    pipeline.create_table("test_cmsketch_type", x="int", y="cmsketch")
    pipeline.execute("INSERT INTO test_cmsketch_type (x, y) VALUES " "(1, cmsketch_empty()), (2, cmsketch_empty())")

    for i in xrange(1000):
        pipeline.execute("UPDATE test_cmsketch_type " "SET y = cmsketch_add(y, {} %% x)".format(i))

    result = list(
        pipeline.execute(
            "SELECT cmsketch_frequency(y, 0), " "cmsketch_frequency(y, 1) " "FROM test_cmsketch_type ORDER BY x"
        )
    )
    assert result[0] == (1000, 0)
    assert result[1] == (500, 500)
def test_tdigest_type(pipeline, clean_db):
  pipeline.create_table('test_tdigest_type', x='int', y='tdigest')
  pipeline.execute('INSERT INTO test_tdigest_type (x, y) VALUES '
                   '(1, tdigest_empty()), (2, tdigest_empty())')

  for i in xrange(1000):
    pipeline.execute('UPDATE test_tdigest_type '
                     'SET y = tdigest_add(y, {} %% (x * 500))'.format(i))

  result = list(pipeline.execute('SELECT tdigest_cdf(y, 400), '
                                 'tdigest_quantile(y, 0.9) '
                                 'FROM test_tdigest_type ORDER BY x'))
  assert map(lambda x: round(x, 1), result[0]) == [0.8, 449.5]
  assert map(lambda x: round(x, 1), result[1]) == [0.4, 899.5]
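
# The t-digest assertions can be reasoned about the same way: the row with
# x = 1 receives i % 500, i.e. each value in 0..499 twice, so cdf(400) is
# about 0.8 and the 0.9 quantile is near 449.5; the row with x = 2 receives
# i % 1000, i.e. 0..999 once each, giving cdf(400) of about 0.4 and a 0.9
# quantile near 899.5. The exact fractions the sketch approximates:
def _tdigest_expected_sketch():
    vals_x1 = [i % 500 for i in range(1000)]
    vals_x2 = [i % 1000 for i in range(1000)]
    assert sum(v < 400 for v in vals_x1) / float(len(vals_x1)) == 0.8
    assert sum(v < 400 for v in vals_x2) / float(len(vals_x2)) == 0.4
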
def test_join_with_aggs(pipeline, clean_db):
    """
    Verify that joins involving aggregates referencing columns from
    multiple tables work
    """
    num_cols = 4
    join_cols = [1]
    q = """
    SELECT
    sum(s.col0::integer) AS s0,
    sum(a0.col0::integer) AS s1,
    sum(a1.col0::integer) AS s2
    FROM a1 JOIN a0 ON a1.col1 = a0.col1
    JOIN stream s ON s.col1::integer = a0.col1
    """
    a0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    a1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('a0', **a0_cols)
    pipeline.create_table('a1', **a1_cols)

    a0 = _generate_rows(num_cols, 64)
    a1 = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 'a0', a0, 0.1)
    _insert(pipeline, 'a1', a1, 0.1)

    pipeline.create_stream('stream', **a0_cols)
    pipeline.create_cv('test_agg_join', q)
    _insert(pipeline, 'stream', s)

    expected = _join(a1, _join(a0, s, join_cols), join_cols)
    result = pipeline.execute('SELECT * FROM test_agg_join').first()

    # sum of col0 from stream
    s0_expected = sum([r[num_cols * 2] for r in expected])

    # sum of col0 from a0
    s1_expected = sum([r[num_cols * 1] for r in expected])

    # sum of col0 from a1
    s2_expected = sum([r[num_cols * 0] for r in expected])

    assert s0_expected == result['s0']
    assert s1_expected == result['s1']
    assert s2_expected == result['s2']
def test_percentile_cont_agg(pipeline, clean_db):
  range_top = 100000
  q = [0.0, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 1.0]

  batches = []
  min_seen = range_top
  max_seen = 0
  for _ in xrange(10):
    b = [(random.randint(0, range_top),) for _ in xrange(5000)]
    min_seen = min(min_seen, min(b)[0])
    max_seen = max(max_seen, max(b)[0])
    batches.append(b)

  pipeline.create_stream('test_stream', x='int')
  query = '''SELECT
  percentile_cont(ARRAY[%s])
  WITHIN GROUP (ORDER BY x::integer) FROM %s
  ''' % (', '.join(map(lambda f: str(f), q)), '%s')

  pipeline.create_cv('test_cq_percentile_cont', query % 'test_stream')
  pipeline.create_table('test_percentile_cont', x='integer')

  for b in batches:
    pipeline.insert('test_stream', ('x',), b)
    pipeline.insert('test_percentile_cont', ('x',), b)

  actual = pipeline.execute(query % 'test_percentile_cont')
  result = pipeline.execute('SELECT * FROM test_cq_percentile_cont')

  actual = actual.first()['percentile_cont']
  result = result.first()['percentile_cont']

  assert len(actual) == len(result)
  assert result == sorted(result)
  diff = [abs(actual[i] - result[i]) for i in xrange(len(actual))]

  # 0th and 100th percentile should be accurate.
  assert result[0] == min_seen
  assert result[-1] == max_seen

  # 1st and 99th percentile should be within 0.1%.
  assert diff[1] <= 0.001 * range_top
  assert diff[-2] <= 0.001 * range_top

  # All percentiles should be within 0.5%.
  assert all(x <= 0.005 * range_top for x in diff)
def test_join_with_aggs(pipeline, clean_db):
    """
    Verify that joins involving aggregates referencing columns from
    multiple tables work
    """
    num_cols = 4
    join_cols = [1]
    q = """
    SELECT
    sum(s.col0::integer) AS s0,
    sum(a0.col0::integer) AS s1,
    sum(a1.col0::integer) AS s2
    FROM a1 JOIN a0 ON a1.col1 = a0.col1
    JOIN stream s ON s.col1::integer = a0.col1
    """
    a0_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])
    a1_cols = dict([('col%d' % n, 'integer') for n in range(num_cols)])

    pipeline.create_table('a0', **a0_cols)
    pipeline.create_table('a1', **a1_cols)

    a0 = _generate_rows(num_cols, 64)
    a1 = _generate_rows(num_cols, 64)
    s = _generate_rows(num_cols, 64)

    _insert(pipeline, 'a0', a0, 0.1)
    _insert(pipeline, 'a1', a1, 0.1)

    pipeline.create_cv('test_agg_join', q)
    _insert(pipeline, 'stream', s)

    expected = _join(a1, _join(a0, s, join_cols), join_cols)
    result = pipeline.execute('SELECT * FROM test_agg_join').first()

    # sum of col0 from stream
    s0_expected = sum([r[num_cols * 2] for r in expected])

    # sum of col0 from a0
    s1_expected = sum([r[num_cols * 1] for r in expected])

    # sum of col0 from a1
    s2_expected = sum([r[num_cols * 0] for r in expected])

    assert s0_expected == result['s0']
    assert s1_expected == result['s1']
    assert s2_expected == result['s2']
def test_cont_transforms(pipeline, clean_db):
  pipeline.execute('CREATE FOREIGN TABLE cv_stream (x int, y text) SERVER pipelinedb')
  pipeline.execute('CREATE FOREIGN TABLE ct_stream (x int, y text) SERVER pipelinedb')
  pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
  pipeline.create_ct('test_ct1', 'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
                     "pipelinedb.insert_into_stream('cv_stream', 'cv_stream')")
  pipeline.create_table('test_t', x='int', y='text')
  pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
   INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
  pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream',
                     'test_tg')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv')[0]['count'] == 2

  _dump(pipeline, 'test_cont_transform.sql')

  pipeline.drop_all()
  pipeline.drop_table('test_t')
  pipeline.execute('DROP FUNCTION test_tg()')

  _restore(pipeline, 'test_cont_transform.sql')
  
  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv')[0]['count'] == 4
  ntups = 0
  
  for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
    assert row['count'] == 2
    assert row['x'] in (1, 2)
    ntups += 1
  assert ntups == 2
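
# _dump and _restore are not shown in this listing either. A rough sketch of
# the assumed behaviour: dump the test database to a SQL script with pg_dump
# and replay it with psql. The port attribute and the database name are
# assumptions about the pipeline fixture, not confirmed by these examples
# (bin_dir does appear in test_binary_upgrade below):
def _dump(pipeline, path):
    with open(path, 'w') as out:
        subprocess.check_call(
            [os.path.join(pipeline.bin_dir, 'pg_dump'),
             '-p', str(pipeline.port), 'postgres'], stdout=out)

def _restore(pipeline, path):
    subprocess.check_call(
        [os.path.join(pipeline.bin_dir, 'psql'),
         '-p', str(pipeline.port), '-f', path, 'postgres'])
    os.remove(path)
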
def test_colums_subset(pipeline, clean_db):
    """
    Verify that copying data from a file into a stream works when the file's input
    columns are a subset of stream0's columns
    """
    pipeline.create_stream('stream0',
                           x='int',
                           y='float8',
                           z='numeric',
                           m='int')
    q = 'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric), max(m::integer) FROM stream0'
    pipeline.create_cv('test_copy_subset', q)
    pipeline.create_table('test_copy_subset_t',
                          x='integer',
                          y='float8',
                          z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute(
        'COPY test_copy_subset_t (x, y, z) FROM \'%s\' HEADER CSV' % path)

    pipeline.execute('COPY stream0 (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute(
        'SELECT sum(x::integer) AS s0, sum(y::float8) AS s1, avg(z::numeric) FROM test_copy_subset_t'
    ).first()
    result = list(pipeline.execute('SELECT s0, s1, avg FROM test_copy_subset'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_cont_transforms(pipeline, clean_db):
  pipeline.execute('CREATE STREAM cv_stream (x int, y text)')
  pipeline.execute('CREATE STREAM ct_stream (x int, y text)')
  pipeline.create_cv('test_cv', 'SELECT count(*) FROM cv_stream')
  pipeline.create_ct('test_ct1', 'SELECT x::int, y::text FROM ct_stream WHERE mod(x, 2) = 0',
                     "pipeline_stream_insert('cv_stream', 'cv_stream')")
  pipeline.create_table('test_t', x='int', y='text')
  pipeline.execute('''
  CREATE OR REPLACE FUNCTION test_tg()
  RETURNS trigger AS
  $$
  BEGIN
   INSERT INTO test_t (x, y) VALUES (NEW.x, NEW.y);
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  ''')
  pipeline.create_ct('test_ct2', 'SELECT x::int, y::text FROM ct_stream',
                     'test_tg()')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  _dump(pipeline, 'test_cont_transform.sql')

  pipeline.drop_all()
  pipeline.drop_table('test_t')
  pipeline.execute('DROP FUNCTION test_tg()')

  _restore(pipeline, 'test_cont_transform.sql')

  pipeline.insert('ct_stream', ('x', 'y'), [(1, 'hello'), (2, 'world')])
  time.sleep(1)

  assert pipeline.execute('SELECT count FROM test_cv').first()['count'] == 4
  ntups = 0
  for row in pipeline.execute('SELECT x, count(*) FROM test_t GROUP BY x'):
    assert row['count'] == 2
    assert row['x'] in (1, 2)
    ntups += 1
  assert ntups == 2
def test_distinct(pipeline, clean_db):
    """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
    pipeline.create_stream('stream0', x='int', y='int', z='int')
    pipeline.create_table('table0', x='int', y='int', z='int')
    q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
    pipeline.create_cv('test_distinct', q)

    uniques = defaultdict(set)
    values = []
    for _ in xrange(2000):
        x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(
            0, 20)
        values.append((x, y, z))
        uniques[(x, y - z)].add(y)

    pipeline.insert('stream0', ['x', 'y', 'z'], values)
    pipeline.insert('table0', ['x', 'y', 'z'], values)

    q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
    expected = pipeline.execute(q)
    expected = len(expected)

    assert expected < 2000

    result = pipeline.execute('SELECT COUNT(*) FROM test_distinct')[0]

    assert expected == result['count']

    # Check if the first row was selected for uniques
    result = pipeline.execute('SELECT * FROM test_distinct')
    reverse_uniques = defaultdict(set)

    for (x, _), ys in uniques.iteritems():
        for y in ys:
            reverse_uniques[y].add(x)

    for row in result:
        assert row['x'] in reverse_uniques[row['y']]
def test_tdigest_type(pipeline, clean_db):
    pipeline.create_table('test_tdigest_type', x='int', y='tdigest')
    pipeline.execute('INSERT INTO test_tdigest_type (x, y) VALUES '
                     '(1, tdigest_empty()), (2, tdigest_empty())')

    for i in xrange(1000):
        pipeline.execute(
            'UPDATE test_tdigest_type SET y = dist_add(y, %d %% (x * 500))' %
            i)

    result = list(
        pipeline.execute('SELECT dist_cdf(y, 400), '
                         'dist_quantile(y, 0.9) '
                         'FROM test_tdigest_type ORDER BY x'))
    assert map(
        lambda x: round(x, 1),
        (result[0]['dist_cdf'], result[0]['dist_quantile'])) == [0.8, 449.5]
    assert map(
        lambda x: round(x, 1),
        (result[1]['dist_cdf'], result[1]['dist_quantile'])) == [0.4, 899.5]
def test_copy_to_typed_stream(pipeline, clean_db):
    """
    Verify that copying data from a file into a typed stream works.
    """
    pipeline.create_stream('stream', x='integer', y='float8', z='numeric')

    q = 'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM stream'
    pipeline.create_cv('test_copy_to_typed_stream', q)
    pipeline.create_table('test_copy_to_typed_stream_t',
                          x='integer',
                          y='float8',
                          z='numeric')

    path = os.path.abspath(os.path.join(pipeline.tmp_dir, 'test_copy.csv'))

    rows = []
    for n in range(10000):
        row = random.randint(1, 1024), random.randint(1, 1024), random.random()
        rows.append(row)

    _generate_csv(path, rows, desc=('x', 'y', 'z'))

    pipeline.execute(
        'COPY test_copy_to_typed_stream_t (x, y, z) FROM \'%s\' HEADER CSV' %
        path)

    pipeline.execute('COPY stream (x, y, z) FROM \'%s\' HEADER CSV' % path)

    expected = pipeline.execute(
        'SELECT sum(x) AS s0, sum(y) AS s1, avg(z) FROM test_copy_to_typed_stream_t'
    ).first()
    result = list(
        pipeline.execute('SELECT s0, s1, avg FROM test_copy_to_typed_stream'))

    assert len(result) == 1

    result = result[0]

    assert result[0] == expected[0]
    assert result[1] == expected[1]
    assert result[2] == expected[2]
def test_nested_expressions(pipeline, clean_db):
    """
    Verify that combines work properly on arbitrarily nested expressions
    """
    q = """
    SELECT x::integer %% 10 AS k,
    (rank(256) WITHIN GROUP (ORDER BY x) + dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM stream GROUP BY k
    """
    desc = ('x', 'y')
    pipeline.create_cv('test_nested', q)
    pipeline.create_table('test_nested_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_nested_t', desc, rows)

    # Note that the CQ will use the HLL variant of dense_rank,
    # so use hll_dense_rank on the table too
    tq = """
    SELECT
    (rank(256) WITHIN GROUP (ORDER BY x) + hll_dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM test_nested_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(whoa) FROM test_nested
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.0001
def test_hypothetical_set_aggs(pipeline, clean_db):
    """
  Verify that combines work properly on HS aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  rank(256) WITHIN GROUP (ORDER BY x),
  dense_rank(256) WITHIN GROUP (ORDER BY x)
  FROM stream0 GROUP BY k
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_hs_aggs', q)
    pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_hs_aggs_t', desc, rows)

    # Note that the CQ will use the combinable variant of dense_rank,
    # so use that on the table too
    tq = """
  SELECT rank(256) WITHIN GROUP (ORDER BY x), combinable_dense_rank(256, x)
  FROM test_hs_aggs_t
  """
    table_result = pipeline.execute(tq)

    cq = """
  SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
  """
    cv_result = pipeline.execute(cq)

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert tr[0] == cr[0]
        assert tr[1] == cr[1]
def test_distinct(pipeline, clean_db):
  """
  Verify that streaming SELECT DISTINCT ON (...) works
  """
  pipeline.create_stream('stream0', x='int', y='int', z='int')
  pipeline.create_table('table0', x='int', y='int', z='int')
  q = 'SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM stream0'
  pipeline.create_cv('test_distinct', q)

  uniques = defaultdict(set)
  values = []
  for _ in xrange(2000):
    x, y, z = random.randint(0, 20), random.randint(0, 20), random.randint(0, 20)
    values.append((x, y, z))
    uniques[(x, y - z)].add(y)

  pipeline.insert('stream0', ['x', 'y', 'z'], values)
  pipeline.insert('table0', ['x', 'y', 'z'], values)

  q = """
  SELECT DISTINCT ON (x::int, y::int - z::int) x::int, y::int FROM table0
  """
  expected = list(pipeline.execute(q))
  expected = len(expected)

  assert expected < 2000

  result = pipeline.execute('SELECT COUNT(*) FROM test_distinct').first()

  assert expected == result['count']

  # Check if the first row was selected for uniques
  result = pipeline.execute('SELECT * FROM test_distinct')
  reverse_uniques = defaultdict(set)

  for (x, _), ys in uniques.iteritems():
    for y in ys:
      reverse_uniques[y].add(x)

  for row in result:
    assert row['x'] in reverse_uniques[row['y']]
def test_nested_expressions(pipeline, clean_db):
    """
    Verify that combines work properly on arbitrarily nested expressions
    """
    q = """
    SELECT x::integer %% 10 AS k,
    (rank(256) WITHIN GROUP (ORDER BY x) + dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM stream GROUP BY k
    """
    desc = ('x', 'y')
    pipeline.create_cv('test_nested', q)
    pipeline.create_table('test_nested_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert('test_nested_t', desc, rows)

    # Note that the CQ will use the HLL variant of dense_rank,
    # so use hll_dense_rank on the table too
    tq = """
    SELECT
    (rank(256) WITHIN GROUP (ORDER BY x) + hll_dense_rank(256) WITHIN GROUP (ORDER BY x)) *
        (avg(x + y::float8) - (sum(x) * avg(y))) AS whoa
    FROM test_nested_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(whoa) FROM test_nested
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.0001
def test_object_aggs(pipeline, clean_db):
    """
  Verify that combines work properly on object aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \') FROM stream0 GROUP BY k;
  """
    desc = ('x', 'y', 's')
    pipeline.create_stream('stream0', x='int', y='float8', s='text')
    pipeline.create_cv('test_object_aggs', q)
    pipeline.create_table('test_object_aggs_t',
                          x='integer',
                          y='float8',
                          s='text')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random(),
               str(n) * random.randint(1, 8))
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_object_aggs_t', desc, rows)

    tq = """
  SELECT json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \') FROM test_object_aggs_t
  """
    table_result = pipeline.execute(tq)

    cq = """
  SELECT combine(json_agg), combine(json_object_agg), combine(string_agg) FROM test_object_aggs
  """
    cv_result = pipeline.execute(cq)

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert sorted(tr[0]) == sorted(cr[0])
        assert sorted(tr[1]) == sorted(cr[1])
        assert sorted(tr[2]) == sorted(cr[2])
def test_hypothetical_set_aggs(pipeline, clean_db):
    """
    Verify that combines work properly on HS aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    rank(256) WITHIN GROUP (ORDER BY x),
    dense_rank(256) WITHIN GROUP (ORDER BY x)
    FROM stream0 GROUP BY k
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_hs_aggs', q)
    pipeline.create_table('test_hs_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_hs_aggs_t', desc, rows)

    # Note that the CQ will use the HLL variant of dense_rank,
    # so use hll_dense_rank on the table too
    tq = """
    SELECT rank(256) WITHIN GROUP (ORDER BY x), hll_dense_rank(256) WITHIN GROUP (ORDER BY x)
    FROM test_hs_aggs_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(rank), combine(dense_rank) FROM test_hs_aggs
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert tr == cr
def test_produce(pipeline, kafka, clean_db):
  """
  Tests pipeline_kafka.emit_tuple and pipeline_kafka.produce_message
  """
  pipeline.create_stream('stream0', payload='json')
  pipeline.create_cv('cv', 'SELECT payload FROM stream0')
  pipeline.create_table('t', x='integer', y='integer')
  pipeline.execute("""CREATE TRIGGER tg AFTER INSERT ON t
    FOR EACH ROW EXECUTE PROCEDURE pipeline_kafka.emit_tuple('topic')
    """)

  kafka.create_topic('topic', partitions=4)
  pipeline.consume_begin('topic', 'stream0')

  for i in range(100):
    pipeline.insert('t', ('x', 'y'), [(i, 2 * i)])

  def messages():
    rows = pipeline.execute('SELECT * FROM cv')
    assert len(rows) == 100

  assert eventually(messages)
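
# eventually() is assumed to poll an assertion-style callable until it stops
# raising, since Kafka consumption is asynchronous. A minimal sketch of that
# assumption (time is assumed to be imported at module level):
def eventually(fn, attempts=30, interval=1.0):
    for _ in range(attempts):
        try:
            fn()
            return True
        except AssertionError:
            time.sleep(interval)
    return False
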
def test_produce(pipeline, kafka, clean_db):
    """
  Tests pipeline_kafka.emit_tuple and pipeline_kafka.produce_message
  """
    pipeline.create_stream('stream0', payload='json')
    pipeline.create_cv('cv', 'SELECT payload FROM stream0')
    pipeline.create_table('t', x='integer', y='integer')
    pipeline.execute("""CREATE TRIGGER tg AFTER INSERT ON t
    FOR EACH ROW EXECUTE PROCEDURE pipeline_kafka.emit_tuple('topic')
    """)

    kafka.create_topic('topic', partitions=4)
    pipeline.consume_begin('topic', 'stream0')

    for i in range(100):
        pipeline.insert('t', ('x', 'y'), [(i, 2 * i)])

    def messages():
        rows = pipeline.execute('SELECT * FROM cv')
        assert len(rows) == 100

    assert eventually(messages)
def _test_agg(pipeline, agg, check_fn=None):
    name = agg[:agg.find('(')]
    q = 'SELECT g::integer, %s OVER (PARTITION BY g ORDER BY ts::timestamp) FROM %s'
    cv_name = 'test_%s' % name
    table_name = 'test_%s_t' % name
    desc = ('ts', 'g', 'x', 'y', 'z')

    pipeline.create_cv(cv_name, q % (agg, 'stream'))
    pipeline.create_table(table_name,
                          ts='timestamp',
                          x='integer',
                          y='integer',
                          z='integer',
                          g='integer')

    rows = []
    for i, n in enumerate(range(1000)):
        ts = str(datetime.utcnow() + timedelta(seconds=i))
        row = ts, n % 10, random.randint(1, 256), random.randint(
            1, 256), random.randint(1, 256)
        rows.append(row)

    pipeline.insert('stream', desc, rows)
    pipeline.insert(table_name, desc, rows)

    if check_fn:
        return check_fn(pipeline)

    expected = list(pipeline.execute(q % (agg, table_name) + ' ORDER BY g'))
    result = list(pipeline.execute('SELECT * FROM %s ORDER BY g' % cv_name))

    assert len(expected) == len(result)

    for e, r in zip(expected, result):
        assert e == r

    pipeline.drop_cv(cv_name)
    pipeline.drop_table(table_name)
def test_windowed_combine(pipeline, clean_db):
    """
  Verify that windowed queries with combines work
  """
    q = """
  SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_windowed_combine', q)
    pipeline.create_table('test_windowed_combine_t', x='integer', y='integer')

    rows = []
    for n in range(10000):
        row = (n, n)
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_windowed_combine_t', desc, rows)

    table = """
  SELECT first_value(x) OVER w, avg(y) OVER w
  FROM test_windowed_combine_t
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
    expected = pipeline.execute(table)
    combine = """
  SELECT first_value(x) OVER w, avg(avg) OVER w
  FROM test_windowed_combine
  WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
  ORDER BY first_value
  """
    actual = pipeline.execute(combine)

    for e, a in zip(expected, actual):
        assert e[0] == a[0]
        assert e[1] == a[1]
def test_object_aggs(pipeline, clean_db):
    """
    Verify that combines work properly on object aggs
    """
    q = """
    SELECT x::integer %% 10 AS k,
    json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \') FROM stream0 GROUP BY k;
    """
    desc = ('x', 'y', 's')
    pipeline.create_stream('stream0', x='int', y='float8', s='text')
    pipeline.create_cv('test_object_aggs', q)
    pipeline.create_table('test_object_aggs_t', x='integer', y='float8', s='text')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random(), str(n) * random.randint(1, 8))
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_object_aggs_t', desc, rows)

    tq = """
    SELECT json_agg(x), json_object_agg(x, y::float8), string_agg(s::text, \' :: \') FROM test_object_aggs_t
    """
    table_result = list(pipeline.execute(tq))

    cq = """
    SELECT combine(json_agg), combine(json_object_agg), combine(string_agg) FROM test_object_aggs
    """
    cv_result = list(pipeline.execute(cq))

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert sorted(tr[0]) == sorted(cr[0])
        assert sorted(tr[1]) == sorted(cr[1])
        assert sorted(tr[2]) == sorted(cr[2])
def test_windowed_combine(pipeline, clean_db):
    """
    Verify that windowed queries with combines work
    """
    q = """
    SELECT x::integer, avg(y::integer) FROM stream0 GROUP BY x
    """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_windowed_combine', q)
    pipeline.create_table('test_windowed_combine_t', x='integer', y='integer')

    rows = []
    for n in range(10000):
        row = (n, n)
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_windowed_combine_t', desc, rows)

    table = """
    SELECT first_value(x) OVER w, avg(y) OVER w
    FROM test_windowed_combine_t
    WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
    ORDER BY first_value
    """
    expected = list(pipeline.execute(table))
    combine = """
    SELECT first_value(x) OVER w, avg(avg) OVER w
    FROM test_windowed_combine
    WINDOW w AS (ORDER BY x ROWS BETWEEN CURRENT ROW AND 5 FOLLOWING)
    ORDER BY first_value
    """
    actual = list(pipeline.execute(combine))

    for e, a in zip(expected, actual):
        assert e == a
def test_null_groups(pipeline, clean_db):
    """
    Verify that null group columns are considered equal
    """
    pipeline.create_stream('s', x='int', y='int', z='int')
    q = """
    SELECT x::integer, y::integer, z::integer, COUNT(*) FROM s
    GROUP BY x, y, z;
    """
    desc = ('x', 'y', 'z')
    pipeline.create_cv('test_null_groups', q)
    pipeline.create_table('test_null_groups_t',
                          x='integer',
                          y='integer',
                          z='integer')

    rows = []
    for n in range(10000):
        vals = list(random.randint(0, 10) for n in range(3))
        vals = map(lambda n: random.random() > 0.1 and n or None, vals)
        rows.append(tuple(vals))

    pipeline.insert('s', desc, rows)
    pipeline.insert('test_null_groups_t', desc, rows)

    table_q = """
    SELECT x, y, z, COUNT(*) FROM test_null_groups_t
    GROUP BY x, y, z ORDER BY x, y, z;
    """
    expected = list(pipeline.execute(table_q))
    result = list(
        pipeline.execute(
            'SELECT x, y, z, count FROM test_null_groups ORDER BY x, y, z'))

    for r, e in zip(result, expected):
        assert r == e
def test_indexed(pipeline, clean_db):
    """
    Verify that stream-table joins involving indexed tables work
    """
    q = """
    SELECT stream.x::integer, count(*) FROM stream
    JOIN test_indexed_t t ON stream.x = t.x GROUP BY stream.x
    """
    pipeline.create_table('test_indexed_t', x='integer', y='integer')
    pipeline.execute('CREATE INDEX idx ON test_indexed_t(x)')

    t = _generate_rows(2, 1000)
    s = _generate_rows(2, 1000)

    pipeline.insert('test_indexed_t', ('x', 'y'), t)
    time.sleep(0.1)

    pipeline.create_cv('test_indexed', q)
    pipeline.insert('stream', ('x', 'y'), s)

    expected = _join(s, t, [0])
    result = pipeline.execute('SELECT sum(count) FROM test_indexed').first()

    assert result['sum'] == len(expected)
def test_hll_distinct(pipeline, clean_db):
    """
  Verify that combines work on HLL COUNT DISTINCT queries
  """
    q = """
  SELECT x::integer % 10 AS k, COUNT(DISTINCT x) AS count FROM stream0 GROUP BY k
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_hll_distinct', q)
    pipeline.create_table('test_hll_distinct_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_hll_distinct_t', desc, rows)

    # Note that the CQ will use the HLL variant of COUNT DISTINCT,
    # so use hll_count_distinct on the table too
    tq = """
  SELECT hll_count_distinct(x) FROM test_hll_distinct_t
  """
    table_result = pipeline.execute(tq)

    cq = """
  SELECT combine(count) FROM test_hll_distinct
  """
    cv_result = pipeline.execute(cq)

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert tr[0] == cr[0]
def test_stats_aggs(pipeline, clean_db):
    """
  Verify that combines work on stats aggs
  """
    q = """
  SELECT x::integer % 10 AS k,
  regr_sxx(x, y::float8), stddev(x) FROM stream0 GROUP BY k;
  """
    desc = ('x', 'y')
    pipeline.create_stream('stream0', x='int', y='float8')
    pipeline.create_cv('test_stats_aggs', q)
    pipeline.create_table('test_stats_aggs_t', x='integer', y='float8')

    rows = []
    for n in range(10000):
        row = (random.randint(0, 1000), random.random())
        rows.append(row)

    pipeline.insert('stream0', desc, rows)
    pipeline.insert('test_stats_aggs_t', desc, rows)

    tq = """
  SELECT regr_sxx(x, y::float8), stddev(x) FROM test_stats_aggs_t
  """
    table_result = pipeline.execute(tq)

    cq = """
  SELECT combine(regr_sxx), combine(stddev) FROM test_stats_aggs
  """
    cv_result = pipeline.execute(cq)

    assert len(table_result) == len(cv_result)

    for tr, cr in zip(table_result, cv_result):
        assert abs(tr[0] - cr[0]) < 0.00001
        assert abs(tr[1] - cr[1]) < 0.00001
def test_binary_upgrade(pipeline, clean_db):
    """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
    # Create some regular tables with data, and create an index on half of them
    for n in range(16):
        name = 't_%d' % n
        pipeline.create_table(name, x='integer', y='text', z='text')
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert(name, ('x', 'y', 'z'), rows)
        if n >= 8:
            pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

    # Create some streams
    for n in range(8):
        name = 's_%d' % n
        pipeline.create_stream(name, x='integer', y='text')

    # Now create some CVs with data, some with indices
    for n in range(32):
        name = 'cv_%d' % n
        pipeline.create_stream('stream_%d' % n, x='int', y='text', z='text')
        pipeline.create_cv(
            name,
            'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z'
            % n)
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)
        if n >= 16:
            pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

    # Now create some in another namespace
    pipeline.execute('CREATE SCHEMA namespace')
    for n in range(8):
        name = 'namespace.cv_%d' % n
        pipeline.create_stream('namespace.stream_%d' % n,
                               x='int',
                               y='text',
                               z='text')
        pipeline.create_cv(
            name,
            'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z'
            % n)
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
        if n >= 4:
            pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' %
                             (n, name))

    create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
    pipeline.execute(create_fn)

    pipeline.create_stream('stream0', z='text')

    # Create some transforms
    for n in range(8):
        name = 'ct_%d' % n
        pipeline.create_ct(name, 'SELECT z::text FROM stream0', 'tg_fn()')

    time.sleep(10)

    old_bin_dir = new_bin_dir = pipeline.bin_dir
    old_data_dir = pipeline.data_dir
    new_data_dir = os.path.abspath('test_binary_upgrade_data_dir')

    pipeline.stop()

    p = subprocess.Popen(
        [os.path.join(pipeline.bin_dir, 'pipeline-init'), '-D', new_data_dir])
    stdout, stderr = p.communicate()

    result = subprocess.check_call([
        os.path.join(pipeline.bin_dir, 'pipeline-upgrade'), '-b', old_bin_dir,
        '-B', new_bin_dir, '-d', old_data_dir, '-D', new_data_dir
    ])

    assert result == 0

    # The cleanup path expects this to be running, but we're done with it
    pipeline.run()

    # pipeline-upgrade returned successfully and has already done sanity checks
    # but let's manually verify that all objects were migrated to the new data directory
    upgraded = PipelineDB(data_dir=new_data_dir)
    upgraded.run()

    # Tables
    for n in range(16):
        name = 't_%d' % n
        q = 'SELECT x, y, z FROM %s ORDER BY x' % name
        rows = upgraded.execute(q)
        for i, row in enumerate(rows):
            x, y, z = row
            assert x == i
            assert y == name
            assert z == name

    # Streams
    for n in range(8):
        name = 's_%d' % n
        rows = list(
            upgraded.execute(
                "SELECT oid FROM pg_class WHERE relkind = '$' AND relname = '%s'"
                % name))
        assert len(rows) == 1

    # CVs
    for n in range(32):
        name = 'cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 1000

    # CVs in separate schema
    for n in range(8):
        name = 'namespace.cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 1000

    # Transforms
    for n in range(8):
        name = 'ct_%d' % n
        q = """
    SELECT c.relname FROM pg_class c JOIN pipeline_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
        rows = list(upgraded.execute(q))
        assert len(rows) == 1

    upgraded.stop()
    shutil.rmtree(new_data_dir)
def test_binary_upgrade(pipeline, clean_db):
    """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
    if pipeline.version_num == 110000:
        pytest.skip('skipping until PG11 supports dump/restore WITH OIDS')

    # Create some regular tables with data, and create an index on half of them
    for n in range(16):
        name = 't_%d' % n
        pipeline.create_table(name, x='integer', y='text', z='text')
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert(name, ('x', 'y', 'z'), rows)
        if n >= 8:
            pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

    # Create some streams
    for n in range(8):
        name = 's_%d' % n
        pipeline.create_stream(name, x='integer', y='text')

    # Now create some CVs with data, some with indices
    for n in range(32):
        name = 'cv_%d' % n
        pipeline.create_stream('stream_%d' % n, x='int', y='text', z='text')
        pipeline.create_cv(
            name,
            'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z'
            % n)
        if n >= 16:
            pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

    # Create some STJs
    for n in range(8):
        pipeline.create_cv(
            'stj_%d' % n,
            'SELECT t.x, count(*) FROM stream_%d s JOIN t_%d t ON s.x = t.x GROUP BY t.x'
            % (n, n))

    # Create some SW CVs
    for n in range(8):
        pipeline.create_cv('sw_%d' % n,
                           'SELECT count(*) FROM stream_%d' % n,
                           sw='%d days' % (n + 1),
                           step_factor=n + 1)
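    # Note on the options above: sw and step_factor are persisted in the
    # pipelinedb.cont_query catalog, and the post-upgrade assertions below
    # expect sw='%d days' to appear there as a ttl of (n + 1) * 24 * 3600
    # seconds with step_factor preserved unchanged.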

    # Create some CVs/CTs/streams that we'll rename
    for n in range(4):
        pipeline.create_stream('to_rename_s_%d' % n, x='int')
        pipeline.create_cv(
            'to_rename_cv_%d' % n,
            'SELECT x, count(*) FROM to_rename_s_%d GROUP BY x' % n)
        pipeline.create_ct('to_rename_ct_%d' % n,
                           'SELECT x FROM to_rename_s_%d' % n)
        pipeline.create_cv(
            'to_rename_ct_reader_%d' % n,
            "SELECT count(*) FROM output_of('to_rename_ct_%d')" % n)

        rows = [(x, ) for x in range(1000)]
        pipeline.insert('to_rename_s_%d' % n, ('x', ), rows)
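    # Each to_rename_ct_reader_%d CV consumes its transform's output stream via
    # output_of(), so it should accumulate the same totals as the matching
    # to_rename_cv_%d; the post-upgrade checks expect combine(count) == 2000
    # for both once data has been written under the old and new stream names.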

    # Now rename them
    for n in range(4):
        pipeline.execute(
            'ALTER FOREIGN TABLE to_rename_s_%d RENAME TO renamed_s_%d' %
            (n, n))
        pipeline.execute('ALTER VIEW to_rename_cv_%d RENAME TO renamed_cv_%d' %
                         (n, n))
        pipeline.execute('ALTER VIEW to_rename_ct_%d RENAME TO renamed_ct_%d' %
                         (n, n))
        pipeline.execute(
            'ALTER VIEW to_rename_ct_reader_%d RENAME TO renamed_ct_reader_%d'
            % (n, n))

        # And write some data using the new stream names
        rows = [(x, ) for x in range(1000)]
        pipeline.insert('renamed_s_%d' % n, ('x', ), rows)

    # Create a CV chain that combines output streams
    q = """
  SELECT (new).z, combine((delta).count) AS count, combine((delta).distinct_count) AS distinct_count FROM output_of('cv_0') GROUP BY (new).z
  """
    pipeline.create_cv('combine_cv_0', q)
    q = """
  SELECT combine((delta).count) AS count, combine((delta).distinct_count) AS distinct_count FROM output_of('combine_cv_0')
  """
    pipeline.create_cv('combine_cv_1', q)
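    # Output streams read via output_of() carry (new) and (delta) tuples (among
    # other fields); combining the (delta) aggregates lets combine_cv_0 and
    # combine_cv_1 roll up cv_0's counts incrementally without double counting,
    # which is what the chained-CV assertions below rely on.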

    for n in range(32):
        name = 'cv_%d' % n
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

    # Create a CV with a TTL to verify TTL info is restored properly
    pipeline.create_cv(
        'ttlcv',
        'SELECT second(arrival_timestamp), count(*) FROM stream_0 GROUP BY second',
        ttl='1 hour',
        ttl_column='second')
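    # ttl='1 hour' and ttl_column='second' should be stored in
    # pipelinedb.cont_query as ttl=3600 seconds and ttl_attno=1 (the CV's first
    # column); the post-upgrade TTL check reads them back from that catalog.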

    # Now create some in another namespace
    pipeline.execute('CREATE SCHEMA namespace')
    for n in range(8):
        name = 'namespace.cv_%d' % n
        pipeline.create_stream('namespace.stream_%d' % n,
                               x='int',
                               y='text',
                               z='text')
        pipeline.create_cv(
            name,
            'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z'
            % n)
        rows = [(x, name, name) for x in range(1000)]
        pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
        if n >= 4:
            pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' %
                             (n, name))

    create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
    pipeline.execute(create_fn)

    pipeline.create_stream('stream0', z='text')

    # Create some transforms with trigger functions
    for n in range(8):
        name = 'ct_%d' % n
        pipeline.create_ct(name, 'SELECT z::text FROM stream0', 'tg_fn')

    # Create some transforms without trigger functions
    for n in range(8):
        name = 'ct_no_trig_%d' % n
        pipeline.create_ct(name, 'SELECT z::text FROM stream0')

    time.sleep(10)
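    # (The sleep above gives the workers time to consume and commit all stream
    # events before the old cluster is shut down for the upgrade.)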

    old_bin_dir = new_bin_dir = pipeline.bin_dir
    old_data_dir = pipeline.data_dir
    new_data_dir0 = os.path.abspath('test_binary_upgrade_data_dir0')

    if os.path.exists(new_data_dir0):
        shutil.rmtree(new_data_dir0)

    pipeline.stop()

    p = subprocess.Popen(
        [os.path.join(pipeline.bin_dir, 'initdb'), '-D', new_data_dir0])
    stdout, stderr = p.communicate()

    with open(os.path.join(new_data_dir0, 'postgresql.conf'), 'a') as f:
        f.write('shared_preload_libraries=pipelinedb\n')
        f.write('max_worker_processes=128\n')
        f.write('pipelinedb.stream_insert_level=sync_commit\n')
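    # shared_preload_libraries must include pipelinedb for the upgraded cluster
    # to run continuous queries at all, and stream_insert_level=sync_commit
    # makes post-upgrade stream inserts block until results are committed,
    # presumably so the count assertions below don't race the workers.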

    result = subprocess.check_call([
        os.path.join(pipeline.bin_dir, 'pg_upgrade'), '-b', old_bin_dir, '-B',
        new_bin_dir, '-d', old_data_dir, '-D', new_data_dir0
    ])

    assert result == 0

    # The cleanup path expects this to be running, but we're done with it
    pipeline.run()

    # pg_upgrade returned successfully and has already done sanity checks
    # but let's manually verify that all objects were migrated to the new data directory
    upgraded = PipelineDB(data_dir=new_data_dir0)
    upgraded.run()

    # Tables
    for n in range(16):
        name = 't_%d' % n
        q = 'SELECT x, y, z FROM %s ORDER BY x' % name
        rows = upgraded.execute(q)
        for i, row in enumerate(rows):
            assert row['x'] == i
            assert row['y'] == name
            assert row['z'] == name

    # Streams
    for n in range(8):
        name = 's_%d' % n
        rows = list(
            upgraded.execute(
                "SELECT oid FROM pg_class WHERE relkind = 'f' AND relname = '%s'"
                % name))
        assert len(rows) == 1

    # CVs
    for n in range(32):
        name = 'cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 1000

    # CV with TTL
    row = list(
        upgraded.execute(
            "SELECT ttl, ttl_attno FROM pg_class c JOIN pipelinedb.cont_query pq on c.oid = pq.relid WHERE c.relname = 'ttlcv'"
        ))[0]
    assert row[0] == 3600
    assert row[1] == 1

    # CVs in separate schema
    for n in range(8):
        name = 'namespace.cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 1000

    # Transforms with trigger functions
    for n in range(8):
        name = 'ct_%d' % n
        q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
        rows = list(upgraded.execute(q))
        assert len(rows) == 1

    # Transforms without trigger functions
    for n in range(8):
        name = 'ct_no_trig_%d' % n
        q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
        rows = list(upgraded.execute(q))
        assert len(rows) == 1

    # Verify SW CVs
    for n in range(8):
        name = 'sw_%d' % n
        row = upgraded.execute(
            "SELECT ttl, step_factor FROM pipelinedb.cont_query cq JOIN pg_class c ON cq.relid = c.oid WHERE relname = '%s'"
            % name)[0]
        assert row['ttl'] == (n + 1) * 3600 * 24
        assert row['step_factor'] == n + 1

        row = upgraded.execute('SELECT count FROM %s' % name)[0]
        assert row['count'] == 1000

    # Verify renamed CVs/CTs/streams
    for n in range(4):
        row = upgraded.execute('SELECT combine(count) FROM renamed_cv_%d' %
                               n)[0]
        assert row['combine'] == 2000
        row = upgraded.execute(
            'SELECT combine(count) FROM renamed_ct_reader_%d' % n)[0]
        assert row['combine'] == 2000

    # Verify chained CVs
    row = upgraded.execute(
        'SELECT z, count, distinct_count FROM combine_cv_0')[0]
    assert row['z'] == 'cv_0'
    assert row['count'] == 1000
    assert row['distinct_count'] == 1

    row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
    assert row['count'] == 1000
    assert row['distinct_count'] == 1

    # Now insert some new data and verify CVs are still updating properly
    for n in range(32):
        name = 'cv_%d' % n
        rows = [(x, name, name) for x in range(1000)]
        upgraded.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

    for n in range(32):
        name = 'cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 2000

    row = upgraded.execute(
        'SELECT z, count, distinct_count FROM combine_cv_0')[0]
    assert row['z'] == 'cv_0'
    assert row['count'] == 2000
    assert row['distinct_count'] == 1

    row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
    assert row['count'] == 2000
    assert row['distinct_count'] == 1

    # Verify STJs
    for n in range(8):
        cv = 'stj_%d' % n
        row = upgraded.execute('SELECT sum(count) FROM %s' % cv)[0]
        assert row['sum'] == 2000
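    # (Each stj_%d joins stream_%d to t_%d on x; t_%d holds x in 0..999 and the
    # stream has received two batches of the same 1000 x values so far, so
    # sum(count) comes to 2000.)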

    # Rename objects again before the second upgrade
    for n in range(4):
        upgraded.execute(
            'ALTER FOREIGN TABLE renamed_s_%d RENAME TO renamed_again_s_%d' %
            (n, n))
        upgraded.execute(
            'ALTER VIEW renamed_cv_%d RENAME TO renamed_again_cv_%d' % (n, n))
        upgraded.execute(
            'ALTER VIEW renamed_ct_%d RENAME TO renamed_again_ct_%d' % (n, n))
        upgraded.execute(
            'ALTER VIEW renamed_ct_reader_%d RENAME TO renamed_again_ct_reader_%d'
            % (n, n))

        # And write some data using the new stream names
        rows = [(x, ) for x in range(1000)]
        upgraded.insert('renamed_again_s_%d' % n, ('x', ), rows)

    upgraded.stop()

    new_data_dir1 = os.path.abspath('test_binary_upgrade_data_dir1')
    if os.path.exists(new_data_dir1):
        shutil.rmtree(new_data_dir1)

    p = subprocess.Popen(
        [os.path.join(pipeline.bin_dir, 'initdb'), '-D', new_data_dir1])
    stdout, stderr = p.communicate()

    with open(os.path.join(new_data_dir1, 'postgresql.conf'), 'a') as f:
        f.write('shared_preload_libraries=pipelinedb\n')
        f.write('max_worker_processes=128\n')
        f.write('pipelinedb.stream_insert_level=sync_commit\n')

    # Now upgrade the upgraded DB to verify that restored DBs can be updated properly
    result = subprocess.check_call([
        os.path.join(pipeline.bin_dir, 'pg_upgrade'), '-b', old_bin_dir, '-B',
        new_bin_dir, '-d', new_data_dir0, '-D', new_data_dir1
    ])

    assert result == 0

    # pg_upgrade returned successfully and has already done sanity checks,
    # but let's manually verify that all objects were migrated to the new data directory
    upgraded = PipelineDB(data_dir=new_data_dir1)
    upgraded.run()

    # Tables
    for n in range(16):
        name = 't_%d' % n
        q = 'SELECT x, y, z FROM %s ORDER BY x' % name
        rows = upgraded.execute(q)
        for i, row in enumerate(rows):
            assert row['x'] == i
            assert row['y'] == name
            assert row['z'] == name

    # Streams
    for n in range(8):
        name = 's_%d' % n
        rows = list(
            upgraded.execute(
                "SELECT oid FROM pg_class WHERE relkind = 'f' AND relname = '%s'"
                % name))
        assert len(rows) == 1

    # CVs
    for n in range(32):
        name = 'cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 2000

    # CV with TTL
    row = list(
        upgraded.execute(
            "SELECT ttl, ttl_attno FROM pg_class c JOIN pipelinedb.cont_query pq on c.oid = pq.relid WHERE c.relname = 'ttlcv'"
        ))[0]
    assert row[0] == 3600
    assert row[1] == 1

    # CVs in separate schema
    for n in range(8):
        name = 'namespace.cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 1000

    # Transforms with trigger functions
    for n in range(8):
        name = 'ct_%d' % n
        q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
        rows = list(upgraded.execute(q))
        assert len(rows) == 1

    # Transforms without trigger functions
    for n in range(8):
        name = 'ct_no_trig_%d' % n
        q = """
    SELECT c.relname FROM pg_class c JOIN pipelinedb.cont_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
        rows = list(upgraded.execute(q))
        assert len(rows) == 1

    # Verify SW CVs
    for n in range(8):
        name = 'sw_%d' % n
        row = upgraded.execute(
            "SELECT ttl, step_factor FROM pipelinedb.cont_query cq JOIN pg_class c ON cq.relid = c.oid WHERE relname = '%s'"
            % name)[0]
        assert row['ttl'] == (n + 1) * 3600 * 24
        assert row['step_factor'] == n + 1

        row = upgraded.execute('SELECT count FROM %s' % name)[0]
        assert row['count'] == 2000

    # Verify renamed CVs/CTs/streams
    for n in range(4):
        row = upgraded.execute(
            'SELECT combine(count) FROM renamed_again_cv_%d' % n)[0]
        assert row['combine'] == 3000
        row = upgraded.execute(
            'SELECT combine(count) FROM renamed_again_ct_reader_%d' % n)[0]
        assert row['combine'] == 3000

    # Verify chained CV
    row = upgraded.execute(
        'SELECT z, count, distinct_count FROM combine_cv_0')[0]
    assert row['z'] == 'cv_0'
    assert row['count'] == 2000
    assert row['distinct_count'] == 1

    row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
    assert row['count'] == 2000
    assert row['distinct_count'] == 1

    # Now insert some new data and verify CVs are still updating properly
    for n in range(32):
        name = 'cv_%d' % n
        rows = [(x, name, name) for x in range(1000)]
        upgraded.insert('stream_%d' % n, ('x', 'y', 'z'), rows)

    for n in range(32):
        name = 'cv_%d' % n
        rows = list(
            upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
        assert len(rows) == 1

        assert rows[0][0] == name
        assert rows[0][1] == 1
        assert rows[0][2] == 3000

    row = upgraded.execute(
        'SELECT z, count, distinct_count FROM combine_cv_0')[0]
    assert row['z'] == 'cv_0'
    assert row['count'] == 3000
    assert row['distinct_count'] == 1

    row = upgraded.execute('SELECT count, distinct_count FROM combine_cv_1')[0]
    assert row['count'] == 3000
    assert row['distinct_count'] == 1

    # Verify STJs
    for n in range(8):
        cv = 'stj_%d' % n
        row = upgraded.execute('SELECT sum(count) FROM %s' % cv)[0]
        assert row['sum'] == 3000

    upgraded.stop()

    pipeline.execute('DROP VIEW combine_cv_0 CASCADE')
    shutil.rmtree(new_data_dir0)
    shutil.rmtree(new_data_dir1)
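

# A condensed, standalone sketch of the initdb + pg_upgrade sequence the test
# above performs twice. It is illustrative only: the helper name is not part of
# the test harness, bin_dir/old_data_dir/new_data_dir are supplied by the
# caller, and the flags and settings used are just the ones exercised above.
import os
import shutil
import subprocess


def upgrade_cluster(bin_dir, old_data_dir, new_data_dir):
    """initdb a fresh cluster, enable pipelinedb, then pg_upgrade into it.

    Assumes the source cluster at old_data_dir is already stopped.
    """
    # Start from a clean target data directory
    if os.path.exists(new_data_dir):
        shutil.rmtree(new_data_dir)
    subprocess.check_call([os.path.join(bin_dir, 'initdb'), '-D', new_data_dir])

    # The upgraded cluster needs pipelinedb preloaded before it starts
    with open(os.path.join(new_data_dir, 'postgresql.conf'), 'a') as f:
        f.write('shared_preload_libraries=pipelinedb\n')
        f.write('max_worker_processes=128\n')

    # Both clusters run the same binaries here, hence -b and -B are identical
    subprocess.check_call([
        os.path.join(bin_dir, 'pg_upgrade'), '-b', bin_dir, '-B', bin_dir,
        '-d', old_data_dir, '-D', new_data_dir
    ])

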
def test_binary_upgrade(pipeline, clean_db):
  """
  Verify that binary upgrades properly transfer all objects and data
  into the new installation
  """
  # Create some regular tables with data, and create an index on half of them
  for n in range(16):
    name = 't_%d' % n
    pipeline.create_table(name, x='integer', y='text', z='text')
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert(name, ('x', 'y', 'z'), rows)
    if n >= 8:
      pipeline.execute('CREATE INDEX idx_%s ON %s(y)' % (name, name))

  # Create some streams
  for n in range(8):
    name = 's_%d' % n
    pipeline.create_stream(name, x='integer', y='text')

  # Now create some CVs with data, some with indices
  for n in range(32):
    name = 'cv_%d' % n
    pipeline.create_cv(name, 'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 16:
      pipeline.execute('CREATE INDEX idx_%s ON %s(z)' % (name, name))

  # Now create some in another namespace
  pipeline.execute('CREATE SCHEMA namespace')
  for n in range(8):
    name = 'namespace.cv_%d' % n
    pipeline.create_cv(name, 'SELECT z::text, COUNT(DISTINCT z) AS distinct_count, COUNT(*) FROM namespace.stream_%d GROUP BY z' % n)
    rows = [(x, name, name) for x in range(1000)]
    pipeline.insert('namespace.stream_%d' % n, ('x', 'y', 'z'), rows)
    if n >= 4:
      pipeline.execute('CREATE INDEX namespace_idx_%d ON %s(z)' % (n, name))

  create_fn = """
  CREATE OR REPLACE FUNCTION tg_fn()
  RETURNS trigger AS
  $$
  BEGIN
   RETURN NEW;
  END;
  $$
  LANGUAGE plpgsql;
  """
  pipeline.execute(create_fn)

  # Create some transforms
  for n in range(8):
    name = 'ct_%d' % n
    pipeline.create_ct(name, 'SELECT z::text FROM stream', 'tg_fn()')

  time.sleep(10)

  old_bin_dir = new_bin_dir = pipeline.bin_dir
  old_data_dir = pipeline.data_dir
  new_data_dir = os.path.abspath('test_binary_upgrade_data_dir')

  pipeline.stop()

  p = subprocess.Popen([
    os.path.join(pipeline.bin_dir, 'pipeline-init'), '-D', new_data_dir])
  stdout, stderr = p.communicate()

  result = subprocess.check_call([
    os.path.join(pipeline.bin_dir, 'pipeline-upgrade'),
    '-b', old_bin_dir, '-B', new_bin_dir,
    '-d', old_data_dir, '-D', new_data_dir])

  assert result == 0

  # The cleanup path expects this to be running, but we're done with it
  pipeline.run()

  # pipeline-upgrade returned successfully and has already done sanity checks
  # but let's manually verify that all objects were migrated to the new data directory
  upgraded = PipelineDB(data_dir=new_data_dir)
  upgraded.run()

  # Tables
  for n in range(16):
    name = 't_%d' % n
    q = 'SELECT x, y, z FROM %s ORDER BY x' % name
    rows = upgraded.execute(q)
    for i, row in enumerate(rows):
      x, y, z = row
      assert x == i
      assert y == name
      assert z == name

  # Streams
  for n in range(8):
    name = 's_%d' % n
    rows = list(upgraded.execute("SELECT oid FROM pg_class WHERE relkind = 'f' AND relname = '%s'" % name))
    assert len(rows) == 1

  # CVs
  for n in range(32):
    name = 'cv_%d' % n
    rows = list(upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1

    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # CVs in separate schema
  for n in range(8):
    name = 'namespace.cv_%d' % n
    rows = list(upgraded.execute('SELECT z, distinct_count, count FROM %s' % name))
    assert len(rows) == 1

    assert rows[0][0] == name
    assert rows[0][1] == 1
    assert rows[0][2] == 1000

  # Transforms
  for n in range(8):
    name = 'ct_%d' % n
    q = """
    SELECT c.relname FROM pg_class c JOIN pipeline_query pq
    ON c.oid = pq.relid WHERE pq.type = 't' AND c.relname = '%s'
    """ % name
    rows = list(upgraded.execute(q))
    assert len(rows) == 1

  upgraded.stop()
  shutil.rmtree(new_data_dir)