def test_fileobj_tar_gz_should_not_simplify_archives(self):
    """Upload a single-file .tar.gz fileobj and verify the file is listed
    and its contents survive the round trip."""
    source_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(source_dir)
    self.write_string_to_file('testing', os.path.join(source_dir, 'filename'))
    self.do_upload(('source.tar.gz', tar_gzip_directory(source_dir)))
    self.assertEqual(['filename'], self.listdir())
    self.check_file_equals_string('filename', 'testing')
def test_single_fileobj_tar_gz_no_simplify_archives(self):
    """Upload a single-file .tar.gz fileobj with simplify_archives=False and
    verify the archive's single file ends up in the bundle location."""
    source_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(source_dir)
    self.write_string_to_file('testing', os.path.join(source_dir, 'filename'))
    self.do_upload(
        [('source.tar.gz', tar_gzip_directory(source_dir))],
        simplify_archives=False,
    )
    self.assertEqual(['filename'], os.listdir(self.bundle_location))
    self.check_file_contains_string(
        os.path.join(self.bundle_location, 'filename'), 'testing')
def stream_tarred_gzipped_directory(self, target):
    """
    Returns a file-like object containing a tarred and gzipped archive
    of the given directory.
    """
    # Raises NotFoundException if the uuid is invalid.
    bundle_state = self._bundle_model.get_bundle_state(target.bundle_uuid)
    if bundle_state == State.PREPARING:
        raise NotFoundError(
            "Bundle {} hasn't started running yet, files not available".
            format(target.bundle_uuid))
    if bundle_state != State.RUNNING:
        # Finished (or otherwise non-running) bundles are served straight
        # from local storage.
        directory_path = self._get_target_path(target)
        return file_util.tar_gzip_directory(directory_path)
    # For running bundles, stream_tarred_gzipped_directory calls are sent to
    # the worker even on a shared filesystem since
    # 1) due to NFS caching the worker has more up to date information on
    #    directory contents
    # 2) the logic of hiding the dependency paths doesn't need to be
    #    re-implemented here.
    worker = self._bundle_model.get_bundle_worker(target.bundle_uuid)
    response_socket_id = self._worker_model.allocate_socket(
        worker['user_id'], worker['worker_id'])
    try:
        read_args = {'type': 'stream_directory'}
        self._send_read_message(worker, response_socket_id, target, read_args)
        fileobj = self._get_read_response_stream(response_socket_id)
        return Deallocating(fileobj, self._worker_model, response_socket_id)
    except Exception:
        # Release the socket on any failure before propagating.
        self._worker_model.deallocate_socket(response_socket_id)
        raise
def test_url_tar_gz_should_not_simplify_archives(self):
    """Upload a single-file .tar.gz from a (mocked) URL and verify the
    file's contents survive the round trip."""
    source_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(source_dir)
    self.write_string_to_file('testing', os.path.join(source_dir, 'filename'))
    archive_bytes = BytesIO(tar_gzip_directory(source_dir).read())
    self.do_upload(self.mock_url_source(archive_bytes, ext=".tar.gz"))
    self.check_file_equals_string('filename', 'testing')
def write_git_repo(self, source: str, bundle_path: str):
    """Clone the git repository at `source` and write its contents to
    `bundle_path` as an unpacked archive."""
    with tempfile.TemporaryDirectory() as clone_dir:
        file_util.git_clone(source, clone_dir)
        # Upload a fileobj with the repo's tarred and gzipped contents.
        self.write_fileobj(
            ".tar.gz", tar_gzip_directory(clone_dir), bundle_path,
            unpack_archive=True)
def test_url_tar_gz(self):
    """Upload a two-file .tar.gz from a (mocked) URL and verify the
    archive's files are listed after upload."""
    source_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(source_dir)
    for name in ('file1', 'file2'):
        self.write_string_to_file('testing', os.path.join(source_dir, name))
    archive_bytes = BytesIO(tar_gzip_directory(source_dir).read())
    self.do_upload(self.mock_url_source(archive_bytes, ext=".tar.gz"))
    self.assertIn('file2', self.listdir())
def test_tar_empty(self):
    """Tar-gzipping an empty directory and unpacking it should produce an
    empty output directory."""
    # Renamed from `dir`, which shadowed the builtin of the same name.
    source_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(source_dir))
    temp_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(temp_dir))
    output_dir = os.path.join(temp_dir, 'output')
    un_tar_directory(tar_gzip_directory(source_dir), output_dir, 'gz')
    self.assertEqual(os.listdir(output_dir), [])
def test_fileobj_tar_gz_with_dsstore_should_not_simplify_archive(self):
    """If the user included two files, README and .DS_Store, in the archive,
    the archive should not be simplified because we have more than one file
    in the archive.
    """
    source_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(source_dir)
    for name in ('README', '.DS_Store'):
        self.write_string_to_file('testing', os.path.join(source_dir, name))
    self.do_upload(('source.tar.gz', tar_gzip_directory(source_dir)))
    self.assertEqual(['.DS_Store', 'README'], sorted(self.listdir()))
def test_single_local_tar_gz_path_simplify_archives(self):
    """Upload a local .tar.gz path with simplify_archives=True; the source
    archive must remain on disk and the simplified bundle must hold the
    file's contents."""
    packed_dir = os.path.join(self.temp_dir, 'source_dir')
    os.mkdir(packed_dir)
    self.write_string_to_file('testing', os.path.join(packed_dir, 'filename'))
    archive_path = os.path.join(self.temp_dir, 'source.tar.gz')
    with open(archive_path, 'wb') as out:
        out.write(tar_gzip_directory(packed_dir).read())
    self.do_upload([archive_path], simplify_archives=True)
    self.assertTrue(os.path.exists(archive_path))
    self.check_file_contains_string(self.bundle_location, 'testing')
def test_tar_always_ignore(self):
    """Entries that are always ignored (AppleDouble files, __MACOSX) must
    not appear anywhere in the unpacked archive."""
    temp_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(temp_dir))
    output_dir = os.path.join(temp_dir, 'output')
    un_tar_directory(tar_gzip_directory(IGNORE_TEST_DIR), output_dir, 'gz')
    entries = os.listdir(output_dir)
    self.assertNotIn('._ignored', entries)
    self.assertIn('dir', entries)
    self.assertNotIn('__MACOSX', entries)
    # Nested ignored entries must be stripped too.
    for nested in ('__MACOSX', '._ignored2'):
        self.assertFalse(
            os.path.exists(os.path.join(output_dir, 'dir', nested)))
def test_tar_exclude_ignore(self):
    """Entries matched by the .tarignore file must be excluded from the
    archive, at the top level and inside subdirectories."""
    temp_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(temp_dir))
    output_dir = os.path.join(temp_dir, 'output')
    archive = tar_gzip_directory(IGNORE_TEST_DIR, ignore_file='.tarignore')
    un_tar_directory(archive, output_dir, 'gz')
    entries = os.listdir(output_dir)
    self.assertIn('not_ignored.txt', entries)
    self.assertIn('dir', entries)
    self.assertNotIn('ignored.txt', entries)
    self.assertNotIn('ignored_dir', entries)
    self.assertTrue(
        os.path.exists(os.path.join(output_dir, 'dir', 'not_ignored2.txt')))
    self.assertFalse(
        os.path.exists(os.path.join(output_dir, 'dir', 'ignored2.txt')))
def test_tar_has_files(self):
    """Exercise tar_gzip_directory's exclude/include arguments and verify
    which entries survive unpacking (symlinks preserved)."""
    temp_dir = tempfile.mkdtemp()
    self.addCleanup(lambda: remove_path(temp_dir))
    output_dir = os.path.join(temp_dir, 'output')
    un_tar_directory(
        tar_gzip_directory(FILES_DIR, False, ['f2'], ['f1', 'b.txt']),
        output_dir, 'gz')
    entries = os.listdir(output_dir)
    self.assertIn('dir1', entries)
    self.assertIn('a.txt', entries)
    self.assertNotIn('b.txt', entries)
    self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir1', 'f1')))
    self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir1', 'f2')))
    self.assertTrue(os.path.islink(os.path.join(output_dir, 'a-symlink.txt')))
def upload_folder(self, bundle, contents):
    """Materialize (path, bytes) pairs from `contents` into a temporary
    directory and upload that directory to the bundle store as
    contents.tar.gz.

    :param bundle: bundle to upload into
    :param contents: iterable of (relative path, file contents as bytes)
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # Unpack each pair into a distinct name: the original code rebound
        # `contents` (the parameter being iterated) inside the loop, which
        # only worked because the iterator had already been created.
        for path, file_contents in contents:
            file_path = os.path.join(tmpdir, path)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb+") as f:
                f.write(file_contents)
            os.chmod(file_path, self.DEFAULT_PERM_FILE)
            os.chmod(os.path.dirname(file_path), self.DEFAULT_PERM_DIR)
        self.upload_manager.upload_to_bundle_store(
            bundle,
            source=["contents.tar.gz", tar_gzip_directory(tmpdir)],
            git=False,
            unpack=True,
            use_azure_blob_beta=self.use_azure_blob_beta,
        )
def stream_tarred_gzipped_directory(self, uuid, path):
    """
    Returns a file-like object containing a tarred and gzipped archive
    of the given directory.
    """
    if self._is_available_locally(uuid):
        # Serve straight from the local filesystem.
        directory_path = self._get_target_path(uuid, path)
        return file_util.tar_gzip_directory(directory_path)
    # Otherwise ask the worker running the bundle to stream the directory.
    worker = self._worker_model.get_bundle_worker(uuid)
    response_socket_id = self._worker_model.allocate_socket(
        worker['user_id'], worker['worker_id'])
    try:
        read_args = {'type': 'stream_directory'}
        self._send_read_message(
            worker, response_socket_id, uuid, path, read_args)
        fileobj = self._get_read_response_stream(response_socket_id)
        return Deallocating(fileobj, self._worker_model, response_socket_id)
    except Exception:
        # Release the socket on any failure before propagating.
        self._worker_model.deallocate_socket(response_socket_id)
        raise
def stream_thread(final_path):
    # Stream a tar-gzipped archive of final_path through reply_fn,
    # ensuring the archive fileobj is closed afterwards.
    archive_fileobj = tar_gzip_directory(
        final_path, exclude_names=exclude_names)
    with closing(archive_fileobj):
        reply_fn(None, {}, archive_fileobj)
def archive(self, *args, **kwargs):
    """Create an archive by delegating directly to tar_gzip_directory."""
    return tar_gzip_directory(*args, **kwargs)
def pack_files_for_upload(sources, should_unpack, follow_symlinks,
                          exclude_patterns=None, force_compression=False):
    """
    Create a single flat tarfile containing all the sources.
    Caller is responsible for closing the returned fileobj.

    Note: It may be possible to achieve additional speed gains on certain
    cases if we disable compression when tar-ing directories. But for now,
    force_compression only affects the case of single, uncompressed files.

    :param sources: list of paths to files to pack
    :param should_unpack: will unpack archives iff True
    :param follow_symlinks: will follow symlinks if True else behavior undefined
    :param exclude_patterns: list of glob patterns for files to ignore, or
                             None to include all files
    :param force_compression: True to always use compression
    :return: dict with {
        'fileobj': <file object of archive>,
        'filename': <name of archive file>,
        'filesize': <size of archive in bytes, or None if unknown>,
        'should_unpack': <True iff archive should be unpacked at server>,
        'should_simplify': <True iff directory should be 'simplified' at server>
        }
    """
    exclude_patterns = exclude_patterns or []

    def resolve_source(source):
        # Resolve symlink if desired
        resolved_source = source
        if follow_symlinks:
            resolved_source = os.path.realpath(source)
            if not os.path.exists(resolved_source):
                raise UsageError('Broken symlink')
        elif os.path.islink(source):
            raise UsageError('Not following symlinks.')
        return resolved_source

    sources = list(map(resolve_source, sources))

    # For efficiency, return single files and directories directly
    if len(sources) == 1:
        source = sources[0]
        filename = os.path.basename(source)
        if os.path.isdir(source):
            archived = tar_gzip_directory(
                source, follow_symlinks=follow_symlinks,
                exclude_patterns=exclude_patterns)
            return {
                'fileobj': archived,
                'filename': filename + '.tar.gz',
                'filesize': None,
                'should_unpack': True,
                'should_simplify': False,
            }
        elif path_is_archive(source):
            return {
                'fileobj': open(source, mode='rb'),
                'filename': filename,
                'filesize': os.path.getsize(source),
                'should_unpack': should_unpack,
                'should_simplify': True,
            }
        elif force_compression:
            return {
                'fileobj': gzip_file(source),
                'filename': filename + '.gz',
                'filesize': None,
                'should_unpack': True,
                'should_simplify': False,
            }
        else:
            return {
                'fileobj': open(source, mode='rb'),
                'filename': filename,
                'filesize': os.path.getsize(source),
                'should_unpack': False,
                'should_simplify': False,
            }

    # Build archive file incrementally from all sources
    # TODO: For further optimization, could either uses a temporary named pipe
    # or a wrapper around a TemporaryFile to concurrently write to the tarfile
    # while the REST client reads and sends it to the server. At the moment,
    # we wait for the tarfile to be created until we rewind and pass the file
    # to the client to be sent to the server.
    scratch_dir = tempfile.mkdtemp()
    archive_fileobj = tempfile.SpooledTemporaryFile()
    archive = tarfile.open(name='we', mode='w:gz', fileobj=archive_fileobj)

    def should_exclude(fn):
        basefn = os.path.basename(fn)
        return any(fnmatch(basefn, p) for p in exclude_patterns)

    def exclusion_filter(tarinfo):
        # BUGFIX: TarFile.add's `exclude` parameter was removed in
        # Python 3.7; the `filter` callback is the supported replacement.
        # Returning None omits the member from the archive.
        return None if should_exclude(tarinfo.name) else tarinfo

    try:
        for source in sources:
            if should_unpack and path_is_archive(source):
                # Unpack archive into scratch space
                dest_basename = strip_archive_ext(os.path.basename(source))
                dest_path = os.path.join(scratch_dir, dest_basename)
                unpack(get_archive_ext(source), source, dest_path)
                # Add file or directory to archive
                archive.add(dest_path, arcname=dest_basename, recursive=True)
            else:
                # Add file to archive, or add files recursively if directory
                archive.add(source, arcname=os.path.basename(source),
                            recursive=True, filter=exclusion_filter)
        archive.close()
    finally:
        # Always remove the scratch space, even if archiving fails.
        shutil.rmtree(scratch_dir)

    # Rewind archive file and return it
    filesize = archive_fileobj.tell()
    archive_fileobj.seek(0)
    return {
        'fileobj': archive_fileobj,
        'filename': 'contents.tar.gz',
        'filesize': filesize,
        'should_unpack': True,
        'should_simplify': False,
    }
def tar_bz2_directory(*args, **kwargs):
    """Method used for creating a .tar.bz2 archive from a directory.

    This is just used for tests; it's not performance optimized and should
    not be used in production code."""
    # Build a .tar.gz first, then transcode gzip -> bz2 in memory.
    gzipped = tar_gzip_directory(*args, **kwargs)
    raw_tar = gzip.decompress(gzipped.read())
    return BytesIO(bz2.compress(raw_tar))