Example #1
def test_timespan_too_big(random_metadata):
    url = 's3://foo/blapp'
    random_metadata['start'] = 0
    random_metadata['end'] = (DatalakeRecord.MAXIMUM_BUCKET_SPAN + 1) * \
        DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    with pytest.raises(UnsupportedTimeRange):
        DatalakeRecord.list_from_metadata(url, random_metadata)
Example #2
def test_timespan_too_big(s3_file_from_metadata, random_metadata):
    url = 's3://foo/blapp'
    s3_file_from_metadata(url, random_metadata)
    random_metadata['start'] = 0
    random_metadata['end'] = (DatalakeRecord.MAXIMUM_BUCKET_SPAN + 1) * \
        DatalakeRecord.TIME_BUCKET_SIZE_IN_MS
    with pytest.raises(UnsupportedTimeRange):
        DatalakeRecord.list_from_metadata(url, random_metadata)
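These examples assume pytest plus the datalake record class and its exceptions are importable. A plausible import block follows; the exact module paths are an assumption and vary across datalake versions:

import time

import pytest
# assumed module paths; adjust to your datalake version
from datalake_common import DatalakeRecord
from datalake_common.errors import (InsufficientConfiguration,
                                    InvalidDatalakeMetadata,
                                    NoSuchDatalakeFile,
                                    UnsupportedTimeRange)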
Example #3
def maker(**kwargs):
    # build a unique S3 key from the kwargs, upload a file with the
    # corresponding metadata, and return its DatalakeRecords
    m = random_metadata()
    m.update(**kwargs)
    key = '/'.join([str(v) for v in kwargs.values()])
    url = 's3://datalake-test/' + key
    s3_file_from_metadata(url, m)
    return DatalakeRecord.list_from_metadata(url, m)
Example #4
def test_get_time_buckets_misaligned():
    # Regression test: querying across x buckets with a timeframe
    # (end - start) spanning fewer than x full buckets (i.e. end of B0 to
    # start of B2). Floor division keeps the timestamps integral on Python 3.
    start = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 4 // 5
    end = DatalakeRecord.TIME_BUCKET_SIZE_IN_MS * 11 // 5
    buckets = DatalakeRecord.get_time_buckets(start, end)
    assert buckets == [0, 1, 2]
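The regression guarded against here is deriving the bucket count from the span (end - start) rather than from the bucket indices of the endpoints. A minimal sketch of the expected behavior, assuming millisecond timestamps (the library's actual get_time_buckets may differ):

def get_time_buckets_sketch(start, end,
                            bucket_ms=DatalakeRecord.TIME_BUCKET_SIZE_IN_MS):
    # Floor-divide each endpoint into its bucket index and return the
    # inclusive range. A window from late in bucket 0 to early in bucket 2
    # spans less than two full buckets but still touches [0, 1, 2].
    return list(range(start // bucket_ms, end // bucket_ms + 1))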
Example #5
def test_list_from_s3_url(s3_file_from_metadata, random_metadata):
    url = 's3://foo/bar'
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_url(url)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
Example #6
def test_list_from_metadata(s3_file_from_metadata, random_metadata):
    url = 's3://foo/baz'
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
Example #7
def test_no_end(random_metadata):
    url = 's3://foo/baz'
    del random_metadata['end']
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
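A later revision of this test (Example #10 below) creates the backing S3 file first and expects the missing 'end' to come back normalized to None.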
Example #8
def test_no_end_exclusion(table_maker, querier):
    m = random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
    assert len(results) == 0
Example #9
def test_no_end_exclusion(table_maker, querier, s3_file_from_metadata):
    m = random_metadata()
    del m['end']
    url = 's3://datalake-test/' + m['id']
    s3_file_from_metadata(url, m)
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(m['start'] + 1, m['start'] + 2, m['what'])
    assert len(results) == 0
Example #10
def test_no_end(random_metadata, s3_file_from_metadata):
    url = 's3://foo/baz'
    del random_metadata['end']
    expected_metadata = random_metadata.copy()
    expected_metadata['end'] = None
    s3_file_from_metadata(url, random_metadata)
    records = DatalakeRecord.list_from_metadata(url, random_metadata)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == expected_metadata
Example #11
    def query_by_time(self, start, end, what, where=None, cursor=None):
        results = []
        buckets = DatalakeRecord.get_time_buckets(start, end)

        if cursor:
            current_bucket = cursor['current_time_bucket']
            i = buckets.index(current_bucket)
            buckets = buckets[i:]

        for b in buckets:
            cursor = self._query_time_bucket(b, results, start, end, what,
                                             where, cursor)

        return QueryResults(results, cursor)
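A later revision of this method (Example #14 below) additionally clears the cursor when it points past the last bucket of interest.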
Example #12
def test_record_size_and_create_time(s3_file_maker, random_metadata):
    url = 's3://foo/bar'
    now = int(time.time() * 1000.0)

    # s3 create times have a 1s resolution. So we just tolerate 2x that to
    # ensure the test passes reasonably.
    max_tolerable_delta = 2000

    # the payload is actually 25 bytes long, despite what its content claims
    s3_file_maker('foo', 'bar', 'thissongisjust23byteslong', random_metadata)
    records = DatalakeRecord.list_from_url(url)
    assert len(records) >= 1
    for r in records:
        assert r['metadata'] == random_metadata
        assert abs(r['create_time'] - now) <= max_tolerable_delta
        assert r['size'] == 25
Example #13
def test_null_end(table_maker, querier):
    m = {
        "start": 1461023640000,
        "what": "file",
        "version": 0,
        "end": None,
        "work_id": None,
        "path": "/home/foo/file",
        "where": "somehost",
        "id": "fedcba09876543210",
        "hash": "0123456789abcdef"
    }
    url = 's3://datalake-test/' + m['id']
    records = DatalakeRecord.list_from_metadata(url, m)
    table_maker(records)
    results = querier.query_by_time(1461023630000, 1461023650000, 'file')
    assert len(results) == 1
Example #14
    def query_by_time(self, start, end, what, where=None, cursor=None):
        results = []
        buckets = DatalakeRecord.get_time_buckets(start, end)

        if cursor:
            current_bucket = cursor['current_time_bucket']
            i = buckets.index(current_bucket)
            buckets = buckets[i:]

        for b in buckets:
            cursor = self._query_time_bucket(b, results, start, end, what,
                                             where, cursor)

        if cursor and \
           cursor.current_time_bucket and \
           cursor.current_time_bucket > buckets[-1]:
            # this is a corner case. It means that the next query would take us
            # into the next bucket, but the next bucket is beyond the time of
            # interest. Just clear the cursor in this case.
            cursor = None

        return QueryResults(results, cursor)
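Callers would presumably page through large result sets by feeding the returned cursor back in, along these lines (the attribute names on QueryResults are assumptions based on its constructor call above):

# hypothetical pagination loop over query_by_time
results = querier.query_by_time(start, end, 'file')
while results.cursor is not None:
    # resume scanning from the bucket recorded in the cursor
    results = querier.query_by_time(start, end, 'file',
                                    cursor=results.cursor)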
Example #15
    def ingest(self, url):
        '''ingest the metadata associated with the given url'''
        records = DatalakeRecord.list_from_url(url)
        for r in records:
            self.storage.store(r)
Example #16
    def datalake_records(self):
        if self['eventName'] not in self.EVENTS_WITH_RECORDS:
            return []
        return [dlr for dlr in DatalakeRecord.list_from_url(self.s3_url)]
Example #17
def test_no_such_datalake_file_in_bucket(s3_bucket_maker):
    s3_bucket_maker('test-bucket')
    url = 's3://test-bucket/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)
Example #18
def test_no_such_bucket(s3_connection):
    url = 's3://no/such/file'
    with pytest.raises(NoSuchDatalakeFile):
        DatalakeRecord.list_from_url(url)
Example #19
def test_from_url_fails_without_boto():
    with pytest.raises(InsufficientConfiguration):
        DatalakeRecord.list_from_url('s3://foo/bar')
Example #20
def test_no_metadata(s3_file_maker):
    url = 's3://foo/bar'
    s3_file_maker('foo', 'bar', 'the content', None)
    with pytest.raises(InvalidDatalakeMetadata):
        DatalakeRecord.list_from_url(url)
Example #21
def maker(content, metadata):
    path = metadata['id'] + '/data'
    s3_file_maker('datalake-test', path, content, metadata)
    url = 's3://datalake-test/' + path
    records = DatalakeRecord.list_from_metadata(url, metadata)
    table_maker(records)
Example #22
def create_test_records(bucket='datalake-test', **kwargs):
    m = random_metadata()
    m.update(**kwargs)
    url = 's3://' + bucket + '/' + '/'.join([str(v) for v in kwargs.values()])
    return DatalakeRecord.list_from_metadata(url, m)
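For example, a hypothetical call that lands records under a what/where key:

# hypothetical usage: records stored under s3://datalake-test/syslog/host1
records = create_test_records(what='syslog', where='host1')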