def test_absolute_windows_path_single_slash(basic_metadata):
    # Some cygwin environments seem to have a single slash after the
    # drive. Shrug.
    win_path = r'Z:\foo\bar.txt'
    basic_metadata['path'] = win_path
    record = Metadata(basic_metadata)
    assert record['path'] == win_path
def list_from_metadata(cls, url, metadata):
    '''return a list of DatalakeRecords for the url and metadata'''
    key = cls._get_key(url)
    validated = Metadata(**metadata)
    create_time = cls._get_create_time(key)
    records = []
    for bucket in cls.get_time_buckets_from_metadata(validated):
        records.append(cls(url, validated, bucket, create_time, key.size))
    return records
def _get_metadata_from_key(cls, key):
    # Read and parse the datalake metadata attached to an s3 key;
    # a key without metadata is not a valid datalake file.
    raw = key.get_metadata('datalake')
    if raw:
        return Metadata.from_json(raw)
    msg = 'No datalake metadata for s3://{}{}'
    raise InvalidDatalakeMetadata(msg.format(key.bucket.name, key.name))
def list(self, what, start=None, end=None, where=None, work_id=None):
    '''list metadata records for specified files

    Args:
        what: what kind of file to list (e.g., syslog, nginx)

        start: List only files after this time. This argument is
        polymorphic. datetimes are accepted. Strings will be converted to
        datetimes, so inputs like `2015-12-21` and
        `2015-12-21T09:11:14.08Z` are acceptable. Floats will be
        interpreted as UTC seconds since the epoch. Integers will be
        interpreted as milliseconds since the epoch.

        end: List only files before this time. Same semantics as start.

        where: List only files from this host.

        work_id: Show only files with this work id.

    returns a generator that lists records of the form:

        {
            'url': <url>,
            'metadata': <metadata>,
        }
    '''
    url = self.http_url + '/v0/archive/files/'
    params = {
        'what': what,
        'start': Metadata.normalize_date(start) if start is not None else None,
        'end': Metadata.normalize_date(end) if end is not None else None,
        'where': where,
        'work_id': work_id,
    }
    response = self._requests_get(url, params=params)
    # Follow the server-provided pagination links until exhausted.
    while True:
        self._check_http_response(response)
        page = response.json()
        for record in page['records']:
            yield record
        next_url = page['next']
        if not next_url:
            break
        response = self._requests_get(next_url)
def tester(start, end):
    random_metadata['start'] = Metadata.normalize_date(start)
    random_metadata['end'] = Metadata.normalize_date(end)
    record = {
        'url': 's3://bucket/file',
        'metadata': random_metadata,
    }
    prepare_response({'records': [record], 'next': None},
                     what=random_metadata['what'],
                     start=random_metadata['start'],
                     end=random_metadata['end'])
    results = list(archive.list(random_metadata['what'], start=start, end=end))
    assert len(results) == 1
    assert results[0]['url'] == 's3://bucket/file'
    assert results[0]['metadata'] == random_metadata
def _get_metadata(cls, url):
    # Locate the s3 key for the url, then parse its datalake metadata.
    parsed = urlparse(url)
    key = cls._get_bucket(parsed.netloc).get_key(parsed.path)
    if key is None:
        msg = '{} does not appear to be in the datalake'.format(url)
        raise NoSuchDatalakeFile(msg)
    raw = key.get_metadata('datalake')
    if not raw:
        raise InvalidDatalakeMetadata('No datalake metadata for ' + url)
    return Metadata.from_json(raw)
def __init__(self, fd, **metadata_fields):
    '''Create a File

    Args:

        fd: file-like object from which the file data can be read.

        metadata_fields: known metadata fields that go with this
        file. Missing fields will be added if they can be
        determined. Otherwise, InvalidDatalakeMetadata will be raised.
    '''
    self._fd = fd
    # wire read/seek/etc. through to the underlying file object
    self._initialize_methods_from_fd()
    # fill in whatever fields can be derived before validation
    self._infer_metadata_fields(metadata_fields)
    self.metadata = Metadata(metadata_fields)
def __init__(self, stream, **metadata_fields):
    '''Create a StreamingFile

    A StreamingFile is never loaded as a whole into memory.

    Args:

        stream: a generator from which the file data can be read.

        metadata_fields: known metadata fields that go with this
        file. Missing fields will be added if they can be
        determined. Otherwise, InvalidDatalakeMetadata will be raised.
    '''
    self._stream = stream
    # holds bytes pulled from the stream but not yet consumed
    self._buffer = b''
    self._content_gen = False
    self.metadata = Metadata(metadata_fields)
def test_normalize_date_with_datetime(basic_metadata):
    # a datetime input normalizes to epoch milliseconds
    parsed = dateparse('2015-03-20T00:00:00Z')
    assert Metadata.normalize_date(parsed) == 1426809600000
def _get_metadata_from_key(self, key):
    # Parse the datalake metadata stored on the given key.
    return Metadata.from_json(key.get_metadata(METADATA_NAME))
def test_unallowed_spaces(basic_metadata):
    # spaces are not allowed in `where`
    basic_metadata.update(where='SAN FRANCISCO')
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_unallowed_characters(basic_metadata):
    # punctuation is not allowed in `what`
    basic_metadata.update(what='123#$')
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_id_not_overwritten(basic_metadata):
    # a caller-supplied id must survive validation untouched
    basic_metadata['id'] = '123'
    record = Metadata(basic_metadata)
    assert 'id' in record
    assert record['id'] == '123'
def test_none_for_required_field(basic_metadata):
    # required fields may not be None
    basic_metadata.update(where=None)
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_invalid_date(basic_metadata):
    # an unparseable date string must be rejected
    basic_metadata.update(end='bxfl230')
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_unsupported_version(basic_metadata):
    # a version we do not know must be rejected explicitly
    basic_metadata.update(version='100')
    with pytest.raises(UnsupportedDatalakeMetadataVersion):
        Metadata(basic_metadata)
def test_normalize_garbage(basic_metadata):
    # nonsense input to normalize_date must raise, not return junk
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.normalize_date('bleeblaaablooo')
def test_version_default(basic_metadata):
    # when version is absent it defaults to 0
    basic_metadata.pop('version')
    defaulted = Metadata(basic_metadata)
    assert 'version' in defaulted
    assert defaulted['version'] == 0
def test_from_to_json(basic_metadata):
    # round-trip: parse the canonical JSON, serialize it again, and
    # compare the two strings order-insensitively
    parsed = Metadata.from_json(basic_json)
    serialized = parsed.json
    assert sorted(serialized) == sorted(basic_json)
def test_normalize_date(basic_metadata):
    # a plain date string normalizes to epoch milliseconds
    basic_metadata['start'] = '2015-03-20'
    assert Metadata(basic_metadata)['start'] == 1426809600000
def test_from_invalid_json():
    # malformed JSON must be rejected with our own exception type
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.from_json('{flee floo')
def test_id_gets_assigned(basic_metadata):
    # an id is generated when the caller does not supply one
    record = Metadata(basic_metadata)
    assert 'id' in record
    assert record['id'] is not None
def test_from_none_json():
    # None is not valid JSON input
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.from_json(None)
def test_work_id_gets_assigned(basic_metadata):
    # work_id is always present, defaulting to None
    record = Metadata(basic_metadata)
    assert 'work_id' in record
    assert record['work_id'] is None
def test_end_before_start(basic_metadata):
    # swap start and end so the interval is inverted; must be rejected
    basic_metadata['start'], basic_metadata['end'] = \
        basic_metadata['end'], basic_metadata['start']
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_no_end_allowed(basic_metadata):
    # end is optional, and no default end is invented
    basic_metadata.pop('end')
    assert 'end' not in Metadata(basic_metadata)
def test_random_metadata(random_metadata):
    # Others rely on datalake-common's random_metadata to be valid. So make
    # sure it doesn't throw any errors.
    Metadata(random_metadata)
def test_unallowed_capitals(basic_metadata):
    # upper-case letters are not allowed in `what`
    basic_metadata.update(what='MYFILE')
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_normalize_float_date(basic_metadata):
    # a float string (seconds since epoch) normalizes to milliseconds
    basic_metadata['start'] = '1426809600.123'
    assert Metadata(basic_metadata)['start'] == 1426809600123
def test_unallowed_dots(basic_metadata):
    # dotted hostnames are not allowed in `where`
    basic_metadata.update(where='this.that.com')
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_normalize_int_date(basic_metadata):
    # an integer string (epoch milliseconds) passes through unchanged
    basic_metadata['end'] = '1426809600123'
    assert Metadata(basic_metadata)['end'] == 1426809600123