def enqueue(self, filename, compress=False, **metadata_fields):
    '''enqueue a file with the specified metadata to be pushed

    Args:
        filename: the file to enqueue

        compress: whether or not to compress the file before enqueueing

    Returns the File with complete metadata that will be pushed.
    '''
    log.info('Enqueueing ' + filename)
    if compress:
        try:
            f = File.from_filename_compressed(filename, **metadata_fields)
        except OverflowError:
            # fall back to an uncompressed bundle if compression fails
            log.warning('Compression failed. Falling back to uncompressed '
                        'uploads')
            f = File.from_filename(filename, **metadata_fields)
    else:
        f = File.from_filename(filename, **metadata_fields)
    fname = f.metadata['id'] + '.tar'
    dest = os.path.join(self.queue_dir, fname)
    f.to_bundle(dest)
    return f
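# A minimal usage sketch of enqueue(), assuming it is a method of a
# queue-managing class (called Enqueuer here purely for illustration) that is
# constructed with the queue_dir used above. The constructor and the metadata
# values are assumptions, not part of this code.
enqueuer = Enqueuer(queue_dir='/var/spool/datalake')  # hypothetical class/args
f = enqueuer.enqueue('/var/log/syslog.1',
                     compress=True,
                     what='syslog',
                     where='webserver01')
# the returned File's metadata['id'] names the .tar bundle written to queue_dir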
def test_bundle_with_invalid_metadata(bundle_maker, random_metadata):
    del random_metadata['what']
    m = json.dumps(random_metadata).encode('utf-8')
    b = bundle_maker(content='1234'.encode('utf-8'),
                     metadata=m,
                     version='0'.encode('utf-8'))
    with pytest.raises(InvalidDatalakeMetadata):
        File.from_bundle(b)
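# A hedged sketch of the bundle_maker fixture these tests rely on: it writes a
# tar file containing whichever members it is given, so omitting a member
# produces an intentionally invalid bundle. The member names used here
# ('content', 'datalake-metadata.json', 'version') are assumptions, not taken
# from this section.
import os
import tarfile
from io import BytesIO

import pytest


@pytest.fixture
def bundle_maker(tmpdir):

    def maker(content=None, metadata=None, version=None):
        path = os.path.join(str(tmpdir), 'bundle.tar')
        members = [
            ('content', content),
            ('datalake-metadata.json', metadata),
            ('version', version),
        ]
        with tarfile.open(path, 'w') as t:
            for name, data in members:
                if data is None:
                    continue  # omitted members make the bundle invalid on purpose
                info = tarfile.TarInfo(name=name)
                info.size = len(data)
                t.addfile(info, BytesIO(data))
        return path

    return maker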
def random_file(tmpdir, metadata=None):
    name = random_word(10)
    content = random_word(256)
    f = tmpdir.join(name)
    f.write(content)
    if metadata is None:
        metadata = random_metadata()
    return File.from_filename(f.strpath, **metadata)
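# A hedged sketch of the random_word and random_metadata helpers that
# random_file relies on. The implementations are assumptions; only the
# metadata field names (what, where, start, end, version) and the
# millisecond-epoch timestamps are taken from the surrounding tests.
import random
import string
import time


def random_word(length):
    # a lowercase ASCII string of the requested length
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))


def random_metadata():
    now = int(time.time() * 1000)  # millisecond epoch, as in the legacy bundle test
    return {
        'version': 0,
        'start': now - random.randint(0, 1000),
        'end': now,
        'what': random_word(10),
        'where': random_word(10),
    }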
def _fetch_s3_url(self, url, stream=False):
    k = self._get_key_from_url(url)
    m = self._get_metadata_from_key(k)
    if stream:
        return StreamingFile(k, **m)
    fd = BytesIO()
    k.get_contents_to_file(fd)
    fd.seek(0)
    return File(fd, **m)
def _fetch_http_url(self, url, stream=False):
    m = self._get_metadata_from_http_url(url)
    k = self._stream_http_url(url)
    if stream:
        return StreamingHTTPFile(k, **m)
    fd = BytesIO()
    for block in k.iter_content(1024):
        fd.write(block)
    fd.seek(0)
    return File(fd, **m)
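# A self-contained sketch of the buffering pattern used in _fetch_http_url:
# stream the HTTP body in 1 KiB blocks into a BytesIO, then rewind it so the
# caller can read() it like a local file. This uses requests directly; the URL
# is a placeholder, not one from this codebase.
from io import BytesIO

import requests

response = requests.get('https://example.com/some-bundle', stream=True)
fd = BytesIO()
for block in response.iter_content(1024):
    fd.write(block)
fd.seek(0)
data = fd.read()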
def test_valid_bundle(tmpdir, random_metadata):
    p = os.path.join(str(tmpdir), 'foo.tar')
    f1 = random_file(tmpdir, metadata=random_metadata)
    f1.to_bundle(p)
    f2 = File.from_bundle(p)
    assert f1.metadata == f2.metadata
    content1 = f1.read()
    content2 = f2.read()
    assert content1
    assert content1 == content2
def _synchronous_push(self, filename):
    try:
        f = File.from_bundle(filename)
    except InvalidDatalakeBundle as e:
        msg = '{}. Skipping upload.'.format(e.args[0])
        log.exception(msg)
        return
    url = self._archive.push(f)
    msg = 'Pushed {}({}) to {}'.format(filename, f.metadata['path'], url)
    log.info(msg)
    os.unlink(filename)
    if self._callback is not None:
        self._callback(filename)
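# A hedged sketch of the callback contract _synchronous_push assumes: the
# callback is invoked with the bundle's filename, and only after a successful
# push and after the local bundle has been unlinked. How the callback is
# handed to the pushing object is an assumption and not shown here.
pushed_bundles = []


def record_push(filename):
    # e.g. keep a record of pushed bundles for reporting or cleanup bookkeeping
    pushed_bundles.append(filename)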
def prepare_metadata_and_push(self, filename, **metadata_fields):
    '''push a file to the archive with the specified metadata

    Args:
        filename: path of the file to push

        metadata_fields: metadata fields for file. Missing fields will be
            added if they can be determined. Otherwise,
            InvalidDatalakeMetadata will be raised.

    returns the url to which the file was pushed.
    '''
    f = File.from_filename(filename, **metadata_fields)
    return self.push(f)
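# A minimal usage sketch, assuming prepare_metadata_and_push() is a method of
# an archive-style object exposing the push() used in its body. The archive
# variable and the metadata values are stand-ins; the start/end values reuse
# the millisecond timestamps from the legacy bundle test below.
url = archive.prepare_metadata_and_push('/var/log/syslog.1',
                                        what='syslog',
                                        where='webserver01',
                                        start=1474308548000,
                                        end=1474308636507)
# url is the archive location to which the file was pushed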
def test_pre_python_3_bundle():
    # prior to python 3 support, we relied on python to choose the most
    # suitable encoding for files. Now we do it explicitly. Make sure legacy
    # bundles work.
    eyedee = '7c72f3ab092445a08aa6983c864c087c'
    expected_content = b'Wake up.\nEat. Mmm.\nHappy hour.\nSleep.\n'
    expected_metadata = {
        'end': 1474308636507,
        'hash': '70373dec2de49d566fc1e34bacca7561',
        'id': eyedee,
        'path': '/home/brian/src/datalake/chicken.log',
        'start': 1474308548000,
        'version': 0,
        'what': 'chicken',
        'where': 'nomad',
        'work_id': None,
    }
    b = os.path.join(legacy_bundles, eyedee + '.tar')
    f = File.from_bundle(b)
    assert f.metadata == expected_metadata
    assert f.read() == expected_content
def test_bundle_not_tar(tmpfile):
    f = tmpfile('foobar')
    with pytest.raises(InvalidDatalakeBundle):
        File.from_bundle(f)
def test_bundle_without_content(bundle_maker, random_metadata):
    m = json.dumps(random_metadata).encode('utf-8')
    b = bundle_maker(metadata=m, version='0'.encode('utf-8'))
    with pytest.raises(InvalidDatalakeBundle):
        File.from_bundle(b)
def test_non_existent_file():
    with pytest.raises(IOError):
        File.from_filename('surelythisfiledoesnotexist.txt')
def test_bundle_with_non_json_metadata(bundle_maker):
    b = bundle_maker(content='1234'.encode('utf-8'),
                     metadata='not:a%json#'.encode('utf-8'),
                     version='0'.encode('utf-8'))
    with pytest.raises(InvalidDatalakeBundle):
        File.from_bundle(b)
def test_bundle_without_version(bundle_maker, random_metadata):
    m = json.dumps(random_metadata).encode('utf-8')
    b = bundle_maker(content='1234'.encode('utf-8'), metadata=m)
    with pytest.raises(InvalidDatalakeBundle):
        File.from_bundle(b)