def test_unaligned_multibucket_queries(table_maker, querier):
    records = []

    # Create 5 records spread across 3 buckets; the query below should
    # return only the middle 3.
    bucket = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    for numerator in (1, 3, 6, 9, 11):
        t = YEAR_2010 + bucket * numerator // 4
        records += create_test_records(start=t, end=t + 1, what='foo')

    table_maker(records)
    start = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 3 // 4
    end = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 9 // 4
    results = get_page(querier.query_by_time, [start, end, 'foo'])
    evaluate_time_based_results(results, 3)
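

# The definitions below are not part of the original tests; they are a
# minimal sketch of module-level names this file assumes. YEAR_2010 is
# 2010-01-01T00:00:00Z in milliseconds, _ONE_DAY_MS is one day in
# milliseconds, and MAX_RESULTS is an assumed page size. get_page() fetches
# a single page, assuming the querier's methods accept an optional `cursor`
# keyword; evaluate_time_based_results() assumes each result is a metadata
# dict with an 'id' key. The real helpers may differ.
YEAR_2010 = 1262304000000
_ONE_DAY_MS = 24 * 60 * 60 * 1000
MAX_RESULTS = 100  # assumed page size, not necessarily the library's value


def get_page(query_function, query_args, cursor=None):
    # cursor=None asks for the first page.
    return query_function(*query_args, cursor=cursor)


def evaluate_time_based_results(results, expected_count):
    # Each file should appear exactly once, even when its record was
    # written to several time buckets.
    ids = [r['id'] for r in results]
    assert len(ids) == expected_count
    assert len(set(ids)) == expected_count
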
def test_latest_happened_yesterday(table_maker, querier):
    yesterday = int(time.time() * 1000) - _ONE_DAY_MS
    records = create_test_records(start=yesterday, end=None, what='tower',
                                  where='pisa')
    table_maker(records)
    result = querier.query_latest('tower', 'pisa')
    _validate_latest_result(result, what='tower', where='pisa')
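

# Not part of the original tests: a plausible sketch of
# _validate_latest_result(), assuming query_latest() returns a single
# metadata dict (or None when nothing matches) whose keys include the
# values passed to create_test_records().
def _validate_latest_result(result, **expected):
    assert result is not None
    for key, value in expected.items():
        assert result[key] == value
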
def test_deduplicating_work_id_records(table_maker, querier):
    start = YEAR_2010
    end = YEAR_2010 + 2 * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    records = create_test_records(start=start, end=end, what='foo',
                                  work_id='job0')
    table_maker(records)
    results = querier.query_by_work_id('job0', 'foo')
    assert len(results) == 1


def test_paginate_work_id_records(table_maker, querier):
    records = []
    for i in range(150):
        records += create_test_records(what='foo', work_id='job0',
                                       start=1456833600000,
                                       end=1456837200000)
    table_maker(records)
    results = get_multiple_pages(querier.query_by_work_id, ['job0', 'foo'])
    assert len(results) == 150
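

# Not part of the original tests: a sketch of the pagination loop behind
# get_multiple_pages(). It assumes the query methods take an optional
# `cursor` keyword and that each returned page is list-like with a `cursor`
# attribute that becomes None once the last page has been served; both
# names are assumptions.
def get_multiple_pages(query_function, query_args):
    results = []
    cursor = None
    while True:
        page = query_function(*query_args, cursor=cursor)
        results.extend(page)
        cursor = getattr(page, 'cursor', None)
        if cursor is None:
            break
    return results
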
def test_query_by_time(table_maker, querier):
    records = []
    for start in range(YEAR_2010, YEAR_2010+100, 10):
        end = start + 9
        records += create_test_records(start=start, end=end, what='foo')
    table_maker(records)
    results = querier.query_by_time(YEAR_2010, YEAR_2010+9, 'foo')
    assert len(results) == 1
    assert all_results_between(results, YEAR_2010, YEAR_2010+9)
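

# Not part of the original tests: a sketch of all_results_between(),
# assuming each result is a metadata dict with integer 'start' and 'end'
# values in milliseconds ('end' may be None for instantaneous files). A
# record qualifies when its interval overlaps the queried window.
def all_results_between(results, start, end):
    def overlaps(r):
        r_end = r['end'] if r.get('end') is not None else r['start']
        return r['start'] <= end and r_end >= start
    return all(overlaps(r) for r in results)
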
def test_query_by_work_id(table_maker, querier):
    records = []
    for i in range(2):
        work_id = 'work{}'.format(i)
        records += create_test_records(work_id=work_id, what='foo')
    table_maker(records)
    results = querier.query_by_work_id('work0', 'foo')
    assert len(results) == 1
    assert all_results(results, work_id='work0')
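

# Not part of the original tests: a sketch of all_results(), which checks
# that every returned record carries the expected metadata values (again
# treating results as plain metadata dicts).
def all_results(results, **expected):
    return all(all(r.get(k) == v for k, v in expected.items())
               for r in results)
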
def test_deduplicating_time_records(table_maker, querier):
    # Create a record that definitively spans two time buckets, and make sure
    # that we only get one record back when we query for it.
    start = YEAR_2010
    two_buckets = 2 * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    end = YEAR_2010 + two_buckets
    records = create_test_records(start=start, end=end, what='foo')
    table_maker(records)
    results = querier.query_by_time(start, end+two_buckets, 'foo')
    assert len(results) == 1


def test_paginate_many_records_single_time_bucket(table_maker, querier):
    records = []
    interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS // 150
    very_end = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    for start in range(YEAR_2010, very_end, interval):
        end = start + interval
        records += create_test_records(start=start, end=end, what='foo')
    table_maker(records)
    results = get_multiple_pages(
        querier.query_by_time, [YEAR_2010, very_end, 'foo'])
    evaluate_time_based_results(results, 150)


def test_paginate_few_records_single_bucket_no_empty_page(table_maker,
                                                          querier):
    records = []
    # Fill one bucket with 2x MAX_RESULTS records, but query only the
    # last one.
    interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS // MAX_RESULTS // 2
    very_end = YEAR_2010 + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    for start in range(YEAR_2010, very_end, interval):
        end = start + interval
        records += create_test_records(start=start, end=end, what='foo')
    table_maker(records)
    results = get_page(querier.query_by_time, [very_end - interval + 1,
                       very_end, 'foo'])
    evaluate_time_based_results(results, 1)


def test_query_by_time_with_where(table_maker, querier):
    records = []
    for i in range(4):
        where = 'worker{}'.format(i)
        records += create_test_records(start=YEAR_2010, end=YEAR_2010+10,
                                       what='foo', where=where)

    table_maker(records)
    results = querier.query_by_time(YEAR_2010, YEAR_2010+10, 'foo',
                                    where='worker2')
    assert len(results) == 1
    assert all_results(results, start=YEAR_2010, end=YEAR_2010+10,
                       where='worker2')
    assert all_results_between(results, YEAR_2010, YEAR_2010+10)


def test_latest_many_records_single_time_bucket(table_maker, querier):
    now = int(time.time() * 1000)
    records = []
    bucket = now // DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    start = bucket * DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    interval = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS // 150
    very_end = start + DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    last_start = very_end - interval
    for t in range(start, very_end, interval):
        end = t + interval
        records += create_test_records(start=t, end=end,
                                       what='meow', where='tree')
    table_maker(records)
    result = querier.query_latest('meow', 'tree')
    _validate_latest_result(result, what='meow', where='tree',
                            start=last_start)


def maker(content, metadata):
    # Write the file body to the test bucket and create the corresponding
    # table records for its metadata.
    path = metadata['id'] + '/data'
    s3_file_maker('datalake-test', path, content, metadata)
    records = create_test_records(**metadata)
    table_maker(records)


def test_latest_happened_today(table_maker, querier):
    now = int(time.time() * 1000)
    records = create_test_records(start=now, end=None, what='foo',
                                  where='boo')
    table_maker(records)
    result = querier.query_latest('foo', 'boo')
    _validate_latest_result(result, what='foo', where='boo')
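

# Not part of the original tests: a self-contained stand-in for
# create_test_records(). A file whose [start, end] interval crosses
# time-bucket boundaries yields one record per bucket, which is why the
# deduplication and pagination tests above can see the same file several
# times in the table. The one-day bucket size, the default start time, and
# the record layout are assumptions, not the datalake library's real values.
import uuid

ASSUMED_BUCKET_MS = 24 * 60 * 60 * 1000


def create_test_records(start=1262304000000, end=None, **metadata):
    metadata.setdefault('id', uuid.uuid4().hex)
    metadata.update(start=start, end=end)
    last = end if end is not None else start
    first_bucket = start // ASSUMED_BUCKET_MS
    last_bucket = last // ASSUMED_BUCKET_MS
    return [dict(metadata, time_bucket=bucket)
            for bucket in range(first_bucket, last_bucket + 1)]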