Example #1
def test_absolute_windows_path_single_slash(basic_metadata):
    # Some Cygwin environments seem to have a single slash after the
    # drive. Shrug.
    path = r'Z:\foo\bar.txt'
    basic_metadata['path'] = path
    m = Metadata(basic_metadata)
    assert m['path'] == path
Example #2
 def list_from_metadata(cls, url, metadata):
     '''return a list of DatalakeRecords for the url and metadata'''
     key = cls._get_key(url)
     metadata = Metadata(**metadata)
     ct = cls._get_create_time(key)
     time_buckets = cls.get_time_buckets_from_metadata(metadata)
     return [cls(url, metadata, t, ct, key.size) for t in time_buckets]
Example #3
 def _get_metadata_from_key(cls, key):
     metadata = key.get_metadata('datalake')
     if not metadata:
         msg = 'No datalake metadata for s3://{}{}'
         msg = msg.format(key.bucket.name, key.name)
         raise InvalidDatalakeMetadata(msg)
     return Metadata.from_json(metadata)
Example #4
    def list(self, what, start=None, end=None, where=None, work_id=None):
        '''list metadata records for specified files

        Args:
          what: what kind of file to list (e.g., syslog, nginx)

          start: List only files after this time. This argument is
          polymorphic. datetimes are accepted. Strings will be converted to
          datetimes, so inputs like `2015-12-21` and `2015-12-21T09:11:14.08Z`
          are acceptable. Floats will be interpreted as UTC seconds since the
          epoch. Integers will be interpreted as milliseconds since the epoch.

          end: List only files before this time. Same semantics as start.

          where: List only files from this host.

          work_id: Show only files with this work id.

        returns a generator that lists records of the form:
            {
                'url': <url>,
                'metadata': <metadata>,
            }
        '''
        url = self.http_url + '/v0/archive/files/'
        params = dict(
            what=what,
            start=None if start is None else Metadata.normalize_date(start),
            end=None if end is None else Metadata.normalize_date(end),
            where=where,
            work_id=work_id,
        )
        response = self._requests_get(url, params=params)

        while True:
            self._check_http_response(response)
            response = response.json()
            for record in response['records']:
                yield record
            if response['next']:
                response = self._requests_get(response['next'])
            else:
                break
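The docstring above makes start and end polymorphic: date strings, datetime objects, floats (UTC seconds), and ints (milliseconds) are all accepted. A minimal usage sketch, assuming `archive` is an instance of the client class this method belongs to (the same name the test below uses):

from datetime import datetime

# Date strings are normalized before being sent as query parameters.
for record in archive.list('syslog', start='2015-12-21', end='2015-12-22'):
    print(record['url'], record['metadata'])

# datetime objects, floats (seconds), and ints (milliseconds) work too,
# and `where` restricts results to a single host.
records = list(archive.list('nginx',
                            start=datetime(2015, 12, 21),
                            end=1450742400000,   # ms since the epoch (2015-12-22T00:00:00Z)
                            where='webserver01'))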
Example #5
    def tester(start, end):
        random_metadata['start'] = Metadata.normalize_date(start)
        random_metadata['end'] = Metadata.normalize_date(end)
        r = {
            'records': [
                {
                    'url': 's3://bucket/file',
                    'metadata': random_metadata,
                }
            ],
            'next': None,
        }

        prepare_response(r, what=random_metadata['what'],
                         start=random_metadata['start'],
                         end=random_metadata['end'])
        l = list(archive.list(random_metadata['what'], start=start, end=end))
        assert len(l) == 1
        assert l[0]['url'] == 's3://bucket/file'
        assert l[0]['metadata'] == random_metadata
Example #6
 def _get_metadata(cls, url):
     parsed_url = urlparse(url)
     bucket = cls._get_bucket(parsed_url.netloc)
     key = bucket.get_key(parsed_url.path)
     if key is None:
         msg = '{} does not appear to be in the datalake'
         msg = msg.format(url)
         raise NoSuchDatalakeFile(msg)
     metadata = key.get_metadata('datalake')
     if not metadata:
         raise InvalidDatalakeMetadata('No datalake metadata for ' + url)
     return Metadata.from_json(metadata)
Example #7
 def _get_metadata(cls, url):
     parsed_url = urlparse(url)
     bucket = cls._get_bucket(parsed_url.netloc)
     key = bucket.get_key(parsed_url.path)
     if key is None:
         msg = '{} does not appear to be in the datalake'
         msg = msg.format(url)
         raise NoSuchDatalakeFile(msg)
     metadata = key.get_metadata('datalake')
     if not metadata:
         raise InvalidDatalakeMetadata('No datalake metadata for ' + url)
     return Metadata.from_json(metadata)
Example #8
    def __init__(self, fd, **metadata_fields):
        '''Create a File

        Args:

            fd: file-like object from which the file data can be read.

            metadata_fields: known metadata fields that go with this
            file. Missing fields will be added if they can be
            determined. Otherwise, InvalidDatalakeMetadata will be raised.

        '''
        self._fd = fd
        self._initialize_methods_from_fd()
        self._infer_metadata_fields(metadata_fields)
        self.metadata = Metadata(metadata_fields)
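A minimal usage sketch for this constructor, assuming a `File` class exposing this `__init__` and the metadata field names exercised by the tests further down (`what`, `where`, `start`, `end`); per the docstring, fields that can be inferred may be omitted:

# Hypothetical usage; the file path and field values are illustrative only.
with open('/var/log/syslog.1', 'rb') as fd:
    f = File(fd,
             what='syslog',
             where='webserver01',
             start=1426809600000,   # ms since the epoch
             end=1426813200000)
    print(f.metadata['id'])         # an id is assigned if one was not provided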
Example #9
    def __init__(self, stream, **metadata_fields):
        '''Create a StreamingFile
        A StreamingFile is never loaded as a whole into memory.

        Args:

            stream: a generator from which the file data can be read.

            metadata_fields: known metadata fields that go with this
            file. Missing fields will be added if they can be
            determined. Otherwise, InvalidDatalakeMetadata will be raised.

        '''
        self._stream = stream
        self._buffer = b''
        self._content_gen = False
        self.metadata = Metadata(metadata_fields)
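And a similar sketch for StreamingFile, assuming the same field names; here the data comes from a generator, so the file is never held in memory as a whole. Since nothing can be inferred from a generator, the metadata (including `path`) is passed explicitly:

def chunks():
    # hypothetical generator yielding the file contents in pieces
    yield b'first line\n'
    yield b'second line\n'

sf = StreamingFile(chunks(),
                   what='syslog',
                   where='webserver01',
                   path='/var/log/syslog',
                   start=1426809600000,
                   end=1426813200000)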
Example #10
def test_normalize_date_with_datetime(basic_metadata):
    date = dateparse('2015-03-20T00:00:00Z')
    ms = Metadata.normalize_date(date)
    assert ms == 1426809600000
Example #11
 def _get_metadata_from_key(self, key):
     m = key.get_metadata(METADATA_NAME)
     return Metadata.from_json(m)
def test_unallowed_spaces(basic_metadata):
    basic_metadata['where'] = 'SAN FRANCISCO'
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_unallowed_characters(basic_metadata):
    basic_metadata['what'] = '123#$'
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_id_not_overwritten(basic_metadata):
    basic_metadata['id'] = '123'
    m = Metadata(basic_metadata)
    assert 'id' in m
    assert m['id'] == '123'
def test_none_for_required_field(basic_metadata):
    basic_metadata['where'] = None
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_invalid_date(basic_metadata):
    basic_metadata['end'] = 'bxfl230'
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_unsupported_version(basic_metadata):
    basic_metadata['version'] = '100'
    with pytest.raises(UnsupportedDatalakeMetadataVersion):
        Metadata(basic_metadata)
def test_normalize_garbage(basic_metadata):
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.normalize_date('bleeblaaablooo')
def test_version_default(basic_metadata):
    del basic_metadata['version']
    m = Metadata(basic_metadata)
    assert 'version' in m
    assert m['version'] == 0
def test_from_to_json(basic_json):
    m1 = Metadata.from_json(basic_json)
    m2 = m1.json
    assert sorted(m2) == sorted(basic_json)
def test_normalize_date(basic_metadata):
    basic_metadata['start'] = '2015-03-20'
    m = Metadata(basic_metadata)
    assert m['start'] == 1426809600000
def test_from_invalid_json():
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.from_json('{flee floo')
def test_id_gets_assigned(basic_metadata):
    m = Metadata(basic_metadata)
    assert 'id' in m
    assert m['id'] is not None
def test_from_none_json():
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata.from_json(None)
def test_work_id_gets_assigned(basic_metadata):
    m = Metadata(basic_metadata)
    assert 'work_id' in m
    assert m['work_id'] is None
def test_end_before_start(basic_metadata):
    end = basic_metadata['end']
    basic_metadata['end'] = basic_metadata['start']
    basic_metadata['start'] = end
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_no_end_allowed(basic_metadata):
    del basic_metadata['end']
    m = Metadata(basic_metadata)
    assert 'end' not in m
def test_random_metadata(random_metadata):
    # Other packages rely on datalake-common's random_metadata being valid,
    # so make sure it does not raise.
    Metadata(random_metadata)
def test_unallowed_capitals(basic_metadata):
    basic_metadata['what'] = 'MYFILE'
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_normalize_float_date(basic_metadata):
    basic_metadata['start'] = '1426809600.123'
    m = Metadata(basic_metadata)
    assert m['start'] == 1426809600123
def test_unallowed_dots(basic_metadata):
    basic_metadata['where'] = 'this.that.com'
    with pytest.raises(InvalidDatalakeMetadata):
        Metadata(basic_metadata)
def test_normalize_int_date(basic_metadata):
    basic_metadata['end'] = '1426809600123'
    m = Metadata(basic_metadata)
    assert m['end'] == 1426809600123
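Taken together, these tests pin down Metadata.normalize_date: every supported input is normalized to integer milliseconds since the UTC epoch. A small sketch of the accepted forms (the import paths are assumptions, not shown in the snippets above):

from dateutil.parser import parse as dateparse  # assumed source of the tests' dateparse helper
from datalake_common import Metadata            # assumed import path for the Metadata class

Metadata.normalize_date('2015-03-20')                       # date string   -> 1426809600000
Metadata.normalize_date(dateparse('2015-03-20T00:00:00Z'))  # datetime      -> 1426809600000
Metadata.normalize_date('1426809600.123')                   # seconds       -> 1426809600123
Metadata.normalize_date('1426809600123')                    # already in ms -> 1426809600123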