def upload_bundle(self, source_file, bundle_type, worksheet_uuid): """ Upload |source_file| (a stream) to |worksheet_uuid|. """ # Construct info for creating the bundle. bundle_subclass = get_bundle_subclass(bundle_type) # program or data metadata = metadata_util.fill_missing_metadata(bundle_subclass, {}, initial_metadata={'name': source_file.filename, 'description': 'Upload ' + source_file.filename}) info = {'bundle_type': bundle_type, 'metadata': metadata} # Upload it by creating a file handle and copying source_file to it (see RemoteBundleClient.upload_bundle in the CLI). remote_file_uuid = self.client.open_temp_file(metadata['name']) try: with closing(RPCFileHandle(remote_file_uuid, self.client.proxy)) as dest: file_util.copy(source_file.file, dest, autoflush=False, print_status='Uploading %s' % metadata['name']) pack = False # For now, always unpack (note: do this after set remote_file_uuid, which needs the extension) if not pack and zip_util.path_is_archive(metadata['name']): metadata['name'] = zip_util.strip_archive_ext(metadata['name']) # Then tell the client that the uploaded file handle is there. new_bundle_uuid = self.client.finish_upload_bundle( [remote_file_uuid], not pack, # unpack info, worksheet_uuid, True) # add_to_worksheet except: self.client.finalize_file(remote_file_uuid) raise return new_bundle_uuid
def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet):
    """
    See local_bundle_client.py for documentation on the usage.

    Strategy:
    1) We copy the |sources| to a temporary directory on the server
       (streaming either a tar or tar.gz depending on whether compression is needed).
    2) We politely ask the server to finish_upload_bundle
       (performs a LocalBundleClient.upload_bundle from the temporary directory).
    """
    # URLs can be directly passed to the local client.
    if all(path_util.path_is_url(source) for source in sources):
        return self.upload_bundle_url(sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet)

    # 1) Copy sources up to the server (temporary remote zip file)
    remote_file_uuids = []
    try:
        for source in sources:
            remote_file_uuid = self.open_temp_file(zip_util.add_packed_suffix(os.path.basename(source)))
            remote_file_uuids.append(remote_file_uuid)
            if zip_util.path_is_archive(source):
                # Already an archive: stream it as-is.
                source_handle = open(source)
            else:
                source_handle = zip_util.open_packed_path(source, follow_symlinks, exclude_patterns)
                unpack = True  # We packed it, so we have to unpack it
            status = 'Uploading %s%s to %s' % (source, ' ('+info['uuid']+')' if 'uuid' in info else '', self.address)
            # FileServer does not expose an API for forcibly flushing writes, so
            # we rely on closing the file to flush it.  closing() ensures the
            # remote handle is closed even if the copy raises.
            with closing(RPCFileHandle(remote_file_uuid, self.proxy)) as dest_handle:
                file_util.copy(source_handle, dest_handle, autoflush=False, print_status=status)
        # 2) Install upload (this call will be in charge of deleting the temporary file).
        return self.finish_upload_bundle(remote_file_uuids, unpack, info, worksheet_uuid, add_to_worksheet)
    except:
        # Don't leak server-side temporary files: release every handle we
        # opened before re-raising (mirrors the sibling upload_bundle variant).
        for remote_file_uuid in remote_file_uuids:
            self.finalize_file(remote_file_uuid)
        raise
def get_blob(uuid, path=''):
    """
    API to download the contents of a bundle or a subpath within a bundle.

    For directories this method always returns a tarred and gzipped archive of
    the directory.

    For files, if the request has an Accept-Encoding header containing gzip,
    then the returned file is gzipped.
    """
    # Raises if the requesting user cannot read the bundle.
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    target_info = local.download_manager.get_target_info(uuid, path, 0)
    if target_info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Figure out the file name: the bundle's own name for its root, otherwise
    # the name of the requested sub-path.
    if not path and bundle.metadata.name:
        filename = bundle.metadata.name
    else:
        filename = target_info['name']

    if target_info['type'] == 'directory':
        # Always tar and gzip directories.
        filename = filename + '.tar.gz'
        fileobj = local.download_manager.stream_tarred_gzipped_directory(
            uuid, path)
    elif target_info['type'] == 'file':
        if not zip_util.path_is_archive(
                filename) and request_accepts_gzip_encoding():
            # Let's gzip to save bandwidth. The browser will transparently decode
            # the file.
            filename = filename + '.gz'
            fileobj = local.download_manager.stream_file(uuid, path, gzipped=True)
        else:
            fileobj = local.download_manager.stream_file(uuid, path, gzipped=False)
    else:
        # Symlinks.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')

    # Set headers.  Note: a name ending in '.gz' here is either a pre-existing
    # .gz archive or the on-the-fly compression applied above; in both cases
    # the client gets Content-Encoding: gzip and the bare filename.
    mimetype, _ = mimetypes.guess_type(filename, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(
            filename) == '.gz' and request_accepts_gzip_encoding():
        filename = zip_util.strip_archive_ext(filename)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % filename)

    return fileobj
def get_blob(uuid, path=''):
    """
    Download the contents of a bundle, or of a sub-path within a bundle.

    Directories are always returned as a gzipped tarball.  Plain files are
    gzipped on the fly when the request's Accept-Encoding allows it.
    """
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    info = local.download_manager.get_target_info(uuid, path, 0)
    if info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Download name: the bundle's own name at the root, else the target's name.
    use_bundle_name = not path and bundle.metadata.name
    name = bundle.metadata.name if use_bundle_name else info['name']

    kind = info['type']
    if kind == 'directory':
        # Directories always stream as .tar.gz.
        name = name + '.tar.gz'
        blob = local.download_manager.stream_tarred_gzipped_directory(uuid, path)
    elif kind == 'file':
        # Compress non-archives when the client accepts gzip, to save bandwidth;
        # the browser decodes transparently.
        compress = not zip_util.path_is_archive(name) and request_accepts_gzip_encoding()
        if compress:
            name = name + '.gz'
        blob = local.download_manager.stream_file(uuid, path, gzipped=compress)
    else:
        # Anything else (symlinks) is not downloadable.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')

    # Response headers.
    mimetype, _ = mimetypes.guess_type(name, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(name) == '.gz' and request_accepts_gzip_encoding():
        name = zip_util.strip_archive_ext(name)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % name)

    return blob
def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet):
    """
    See local_bundle_client.py for documentation on the usage.

    Strategy:
    1) We copy the |sources| to a temporary directory on the server
       (streaming either a tar or tar.gz depending on whether compression is needed).
    2) We politely ask the server to finish_upload_bundle
       (performs a LocalBundleClient.upload_bundle from the temporary directory).
    """
    # URLs can be directly passed to the local client.
    if all(path_util.path_is_url(source) for source in sources):
        return self.upload_bundle_url(sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet)

    remote_file_uuids = []
    try:
        # 1) Copy sources up to the server (temporary remote zip file)
        for source in sources:
            if zip_util.path_is_archive(source):
                # Already an archive: stream as-is, keep the original name.
                source_handle = open(source)
                temp_file_name = os.path.basename(source)
            elif os.path.isdir(source):
                # Pack directories into a streamed tar.gz.
                source_handle = tar_gzip_directory(source, follow_symlinks, exclude_patterns)
                temp_file_name = os.path.basename(source) + '.tar.gz'
                unpack = True  # We packed it, so we have to unpack it
            else:
                # Plain file (possibly a symlink): gzip it for transfer.
                resolved_source = source
                if follow_symlinks:
                    resolved_source = os.path.realpath(source)
                    if not os.path.exists(resolved_source):
                        raise UsageError('Broken symlink')
                elif os.path.islink(source):
                    raise UsageError('Not following symlinks.')
                source_handle = gzip_file(resolved_source)
                temp_file_name = os.path.basename(source) + '.gz'
                unpack = True  # We packed it, so we have to unpack it
            remote_file_uuid = self.open_temp_file(temp_file_name)
            remote_file_uuids.append(remote_file_uuid)
            # closing() flushes/closes the remote handle even if copy fails.
            with closing(RPCFileHandle(remote_file_uuid, self.proxy)) as dest_handle:
                status = 'Uploading %s%s to %s' % (source, ' ('+info['uuid']+')' if 'uuid' in info else '', self.address)
                file_util.copy(source_handle, dest_handle, autoflush=False, print_status=status)
        # 2) Install upload (this call will be in charge of deleting the temporary file).
        return self.finish_upload_bundle(remote_file_uuids, unpack, info, worksheet_uuid, add_to_worksheet)
    except:
        # On any failure, release every server-side temporary file created so far.
        for remote_file_uuid in remote_file_uuids:
            self.finalize_file(remote_file_uuid)
        raise
def upload_to_bundle_store(self, bundle: Bundle, source: Source, git: bool, unpack: bool):
    """Uploads the given source to the bundle store.

    Given arguments are the same as UploadManager.upload_to_bundle_store().
    Used when uploading from rest server.

    |source| is either a URL string or a (filename, file-object) tuple.
    |git|: for a URL source, clone it as a git repository.
    |unpack|: unpack the source if it is an archive.

    Raises UsageError on invalid input; any partially-written bundle
    contents are removed before re-raising.
    """
    # Track the bundle location so the error path only cleans up what was
    # actually created.  Previously this was left unassigned, so a
    # UsageError raised before the first assignment (e.g. from
    # _interpret_source or urlopen_with_retry) triggered an
    # UnboundLocalError inside the except handler.
    bundle_path = None
    try:
        is_url, is_fileobj, filename = self._interpret_source(source)
        if is_url:
            assert isinstance(source, str)
            if git:
                bundle_path = self._update_and_get_bundle_location(bundle, is_directory=True)
                self.write_git_repo(source, bundle_path)
            else:
                # If downloading from a URL, convert the source to a file object.
                is_fileobj = True
                source = (filename, urlopen_with_retry(source))
        if is_fileobj:
            source_filename, source_fileobj = cast(Tuple[str, IO[bytes]], source)
            source_ext = zip_util.get_archive_ext(source_filename)
            if unpack and zip_util.path_is_archive(filename):
                # Unpacked archives may expand into a directory.
                bundle_path = self._update_and_get_bundle_location(
                    bundle, is_directory=source_ext in ARCHIVE_EXTS_DIR)
                self.write_fileobj(source_ext, source_fileobj, bundle_path, unpack_archive=True)
            else:
                bundle_path = self._update_and_get_bundle_location(bundle, is_directory=False)
                self.write_fileobj(source_ext, source_fileobj, bundle_path, unpack_archive=False)
    except UsageError:
        # Clean up partially-written contents, but only if a location was
        # ever assigned.
        if bundle_path is not None and FileSystems.exists(bundle_path):
            path_util.remove(bundle_path)
        raise
def upload_bundle(self, source_file, bundle_type, worksheet_uuid):
    """ Upload |source_file| (a stream) to |worksheet_uuid|. """
    # Describe the new bundle: pick the subclass for this type and fill in
    # its metadata schema, seeded with the uploaded file's name.
    bundle_subclass = get_bundle_subclass(bundle_type)  # program or data
    seed_metadata = {
        'name': source_file.filename,
        'description': 'Upload ' + source_file.filename,
    }
    metadata = metadata_util.fill_missing_metadata(bundle_subclass, {}, initial_metadata=seed_metadata)
    info = {'bundle_type': bundle_type, 'metadata': metadata}

    # Stream the file into a server-side temporary handle (see
    # RemoteBundleClient.upload_bundle in the CLI).
    remote_file_uuid = self.client.open_temp_file(metadata['name'])
    try:
        dest = RPCFileHandle(remote_file_uuid, self.client.proxy)
        with closing(dest):
            file_util.copy(source_file.file, dest, autoflush=False, print_status='Uploading %s' % metadata['name'])

        # For now, always unpack.  (Do this only after open_temp_file, which
        # needed the full name including its extension.)
        pack = False
        should_strip = not pack and zip_util.path_is_archive(metadata['name'])
        if should_strip:
            metadata['name'] = zip_util.strip_archive_ext(metadata['name'])

        # Hand the uploaded temp file off to the server to build the bundle.
        new_bundle_uuid = self.client.finish_upload_bundle(
            [remote_file_uuid], not pack, info, worksheet_uuid, True)  # unpack; add_to_worksheet
    except:
        # Release the server-side temporary file on any failure.
        self.client.finalize_file(remote_file_uuid)
        raise
    return new_bundle_uuid
def _can_unpack_file(self, path):
    """Return whether |path| is an existing regular file with an archive extension."""
    if not os.path.isfile(path):
        return False
    return zip_util.path_is_archive(path)
def upload_to_bundle_store(self, bundle, sources, follow_symlinks, exclude_patterns, remove_sources, git, unpack, simplify_archives):
    """
    Uploads contents for the given bundle to the bundle store.

    |sources|: specifies the locations of the contents to upload. Each element is
               either a URL, a local path or a tuple (filename, file-like object).
    |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks,
                       but only if remove_sources is False.
    |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o),
                        but only if remove_sources is False.
    |remove_sources|: for local path(s), whether |sources| should be removed
    |git|: for URLs, whether |source| is a git repo to clone.
    |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
    |simplify_archives|: whether to simplify unpacked archives so that if they
                         contain a single file, the final path is just that file,
                         not a directory containing that file.

    If |sources| contains one source, then the bundle contents will be that source.
    Otherwise, the bundle contents will be a directory with each of the sources.
    Exceptions:
    - If |git|, then each source is replaced with the result of running 'git clone |source|'
    - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.
    """
    bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
    try:
        path_util.make_directory(bundle_path)
        # Note that for uploads with a single source, the directory
        # structure is simplified at the end.
        for source in sources:
            # Classify the source; exactly one of the three flags is set.
            is_url, is_local_path, is_fileobj, filename = self._interpret_source(source)
            source_output_path = os.path.join(bundle_path, filename)
            if is_url:
                if git:
                    # Clone into a path without the trailing '.git'.
                    source_output_path = file_util.strip_git_ext(source_output_path)
                    file_util.git_clone(source, source_output_path)
                else:
                    file_util.download_url(source, source_output_path)
                    # Downloaded archives are unpacked in place; the original
                    # download is always removed afterwards.
                    if unpack and self._can_unpack_file(source_output_path):
                        self._unpack_file(
                            source_output_path, zip_util.strip_archive_ext(source_output_path),
                            remove_source=True, simplify_archive=simplify_archives)
            elif is_local_path:
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')
                if unpack and self._can_unpack_file(source_path):
                    # Unpack straight from the source into the bundle dir;
                    # remove_sources decides whether the original is kept.
                    self._unpack_file(
                        source_path, zip_util.strip_archive_ext(source_output_path),
                        remove_source=remove_sources, simplify_archive=simplify_archives)
                elif remove_sources:
                    # Move rather than copy when the caller relinquishes the source.
                    path_util.rename(source_path, source_output_path)
                else:
                    path_util.copy(source_path, source_output_path, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
            elif is_fileobj:
                if unpack and zip_util.path_is_archive(filename):
                    # source is a (filename, fileobj) tuple here.
                    self._unpack_fileobj(
                        source[0], source[1],
                        zip_util.strip_archive_ext(source_output_path),
                        simplify_archive=simplify_archives)
                else:
                    with open(source_output_path, 'wb') as out:
                        shutil.copyfileobj(source[1], out)

        if len(sources) == 1:
            self._simplify_directory(bundle_path)
    except:
        # Remove any partially-written bundle contents before re-raising.
        if os.path.exists(bundle_path):
            path_util.remove(bundle_path)
        raise
def upload_to_bundle_store(
    self,
    bundle,
    sources,
    follow_symlinks,
    exclude_patterns,
    remove_sources,
    git,
    unpack,
    simplify_archives,
):
    """
    Uploads contents for the given bundle to the bundle store.

    |sources|: specifies the locations of the contents to upload. Each element is
               either a URL, a local path or a tuple (filename, binary file-like object).
    |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks,
                       but only if remove_sources is False.
    |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o),
                        but only if remove_sources is False.
    |remove_sources|: for local path(s), whether |sources| should be removed
    |git|: for URLs, whether |source| is a git repo to clone.
    |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
    |simplify_archives|: whether to simplify unpacked archives so that if they
                         contain a single file, the final path is just that file,
                         not a directory containing that file.

    If |sources| contains one source, then the bundle contents will be that source.
    Otherwise, the bundle contents will be a directory with each of the sources.
    Exceptions:
    - If |git|, then each source is replaced with the result of running 'git clone |source|'
    - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.
    """
    # Always apply the store-wide default excludes, plus any caller-supplied ones.
    exclude_patterns = (self._default_exclude_patterns + exclude_patterns
                        if exclude_patterns else self._default_exclude_patterns)
    bundle_link_url = getattr(bundle.metadata, "link_url", None)
    if bundle_link_url:
        # Don't do anything for linked bundles.
        return
    bundle_path = self._bundle_store.get_bundle_location(bundle.uuid)
    try:
        path_util.make_directory(bundle_path)
        # Note that for uploads with a single source, the directory
        # structure is simplified at the end.
        for source in sources:
            # Classify the source; exactly one of the three flags is set.
            is_url, is_local_path, is_fileobj, filename = self._interpret_source(
                source)
            source_output_path = os.path.join(bundle_path, filename)
            if is_url:
                if git:
                    # Clone into a path without the trailing '.git'.
                    source_output_path = file_util.strip_git_ext(
                        source_output_path)
                    file_util.git_clone(source, source_output_path)
                else:
                    file_util.download_url(source, source_output_path)
                    # Downloaded archives are unpacked in place; the original
                    # download is always removed afterwards.
                    if unpack and self._can_unpack_file(
                            source_output_path):
                        self._unpack_file(
                            source_output_path,
                            zip_util.strip_archive_ext(source_output_path),
                            remove_source=True,
                            simplify_archive=simplify_archives,
                        )
            elif is_local_path:
                source_path = path_util.normalize(source)
                path_util.check_isvalid(source_path, 'upload')
                if unpack and self._can_unpack_file(source_path):
                    # Unpack straight from the source; remove_sources decides
                    # whether the original is kept.
                    self._unpack_file(
                        source_path,
                        zip_util.strip_archive_ext(source_output_path),
                        remove_source=remove_sources,
                        simplify_archive=simplify_archives,
                    )
                elif remove_sources:
                    # Move rather than copy when the caller relinquishes the source.
                    path_util.rename(source_path, source_output_path)
                else:
                    path_util.copy(
                        source_path,
                        source_output_path,
                        follow_symlinks=follow_symlinks,
                        exclude_patterns=exclude_patterns,
                    )
            elif is_fileobj:
                if unpack and zip_util.path_is_archive(filename):
                    # source is a (filename, fileobj) tuple here.
                    self._unpack_fileobj(
                        source[0],
                        source[1],
                        zip_util.strip_archive_ext(source_output_path),
                        simplify_archive=simplify_archives,
                    )
                else:
                    with open(source_output_path, 'wb') as out:
                        shutil.copyfileobj(source[1], out)
        if len(sources) == 1:
            self._simplify_directory(bundle_path)
    except:
        # Remove any partially-written bundle contents before re-raising.
        if os.path.exists(bundle_path):
            path_util.remove(bundle_path)
        raise
def test_local_file_archive(self):
    """A local .tar.gz path is recognized as an archive and split correctly."""
    local_path = "/tmp/file.tar.gz"
    self.assertEqual(path_is_archive(local_path), True)
    self.assertEqual(get_archive_ext(local_path), ".tar.gz")
    self.assertEqual(strip_archive_ext(local_path), "/tmp/file")
def upload_to_bundle_store(
    self,
    bundle: Dict,
    packed_source: Dict,
    use_azure_blob_beta: bool,
    destination_bundle_store=None,
):
    """
    Bypass server upload. Upload from client directly to blob storage
    (Azure, GCS, Disk storage) when possible.

    Bypass-server uploading is used in the following situations:
    1. The server set CODALAB_DEFAULT_BUNDLE_STORE_NAME.
    2. The user specified `--store` and the blob storage is on Azure.

    |bundle|: dict describing the bundle; only 'id' is read here.
    |packed_source|: dict with keys 'filename', 'fileobj', 'filesize',
        'should_unpack' describing the already-packed upload stream.
    |destination_bundle_store|: optional bundle-store name from --store.
    """
    need_bypass = True
    bundle_store_uuid = None
    # 1) Read destination store from --store if user has specified it
    if destination_bundle_store is not None and destination_bundle_store != '':
        storage_info = self._client.fetch_one(
            'bundle_stores',
            params={
                'name': destination_bundle_store,
                'include': ['uuid', 'storage_type', 'url'],
            },
        )
        bundle_store_uuid = storage_info['uuid']
        if storage_info['storage_type'] in (
                StorageType.DISK_STORAGE.value,
        ):
            need_bypass = False  # The user specified --store to upload to disk storage
    # 2) Pack the files to be uploaded: decide whether the server should treat
    # the upload as a directory and whether it must be unpacked.
    source_ext = zip_util.get_archive_ext(packed_source['filename'])
    if packed_source['should_unpack'] and zip_util.path_is_archive(
            packed_source['filename']):
        unpack_before_upload = True
        is_dir = source_ext in zip_util.ARCHIVE_EXTS_DIR
    else:
        unpack_before_upload = False
        is_dir = False
    # 3) Create a bundle location for the bundle
    params = {'need_bypass': need_bypass, 'is_dir': is_dir}
    data = self._client.add_bundle_location(bundle['id'], bundle_store_uuid, params)[0].get('attributes')
    # 4) If bundle location has bundle_conn_str, we should bypass the server when uploading.
    if data.get('bundle_conn_str', None) is not None:
        # Mimic the rest server behavior: decide the bundle type
        # (file/directory) and whether it needs to be unpacked.
        bundle_conn_str = data.get('bundle_conn_str')
        index_conn_str = data.get('index_conn_str')
        bundle_url = data.get('bundle_url')
        bundle_read_str = data.get('bundle_read_url', bundle_url)
        try:
            progress = FileTransferProgress('Sent ', f=self.stderr)
            with closing(packed_source['fileobj']), progress:
                self.upload_Azure_blob_storage(
                    fileobj=packed_source['fileobj'],
                    bundle_url=bundle_url,
                    bundle_conn_str=bundle_conn_str,
                    bundle_read_str=bundle_read_str,
                    index_conn_str=index_conn_str,
                    source_ext=source_ext,
                    should_unpack=unpack_before_upload,
                    progress_callback=progress.update,
                )
        except Exception as err:
            # Report the failure to the server so the bundle is not left
            # in an uploading state, then propagate.
            self._client.update_bundle_state(
                bundle['id'],
                params={
                    'success': False,
                    'error_msg': f'Bypass server upload error. {err}',
                },
            )
            raise err
        else:
            self._client.update_bundle_state(bundle['id'], params={'success': True})
    else:
        # 5) Otherwise, upload the bundle directly through the server.
        progress = FileTransferProgress('Sent ', packed_source['filesize'], f=self.stderr)
        with closing(packed_source['fileobj']), progress:
            self._client.upload_contents_blob(
                bundle['id'],
                fileobj=packed_source['fileobj'],
                params={
                    'filename': packed_source['filename'],
                    'unpack': packed_source['should_unpack'],
                    'state_on_success': State.READY,
                    'finalize_on_success': True,
                    'use_azure_blob_beta': use_azure_blob_beta,
                    'store': destination_bundle_store or '',
                },
                progress_callback=progress.update,
            )
def upload_bundle(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet):
    """
    See local_bundle_client.py for documentation on the usage.

    Strategy:
    1) We copy the |sources| to a temporary directory on the server
       (streaming either a tar or tar.gz depending on whether compression is needed).
    2) We politely ask the server to finish_upload_bundle
       (performs a LocalBundleClient.upload_bundle from the temporary directory).
    """
    # URLs can be directly passed to the local client.
    if all(path_util.path_is_url(source) for source in sources):
        return self.upload_bundle_url(sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, info, worksheet_uuid, add_to_worksheet)

    remote_file_uuids = []
    try:
        # 1) Copy sources up to the server (temporary remote zip file)
        for source in sources:
            if zip_util.path_is_archive(source):
                # Already an archive: stream as-is, keep the original name.
                source_handle = open(source)
                temp_file_name = os.path.basename(source)
            elif os.path.isdir(source):
                # Pack directories into a streamed tar.gz.
                source_handle = tar_gzip_directory(source, follow_symlinks, exclude_patterns)
                temp_file_name = os.path.basename(source) + '.tar.gz'
                unpack = True  # We packed it, so we have to unpack it
            else:
                # Plain file (possibly a symlink): gzip it for transfer.
                resolved_source = source
                if follow_symlinks:
                    resolved_source = os.path.realpath(source)
                    if not os.path.exists(resolved_source):
                        raise UsageError('Broken symlink')
                elif os.path.islink(source):
                    raise UsageError('Not following symlinks.')
                source_handle = gzip_file(resolved_source)
                temp_file_name = os.path.basename(source) + '.gz'
                unpack = True  # We packed it, so we have to unpack it
            remote_file_uuid = self.open_temp_file(temp_file_name)
            remote_file_uuids.append(remote_file_uuid)
            # closing() flushes/closes the remote handle even if copy fails.
            with closing(RPCFileHandle(remote_file_uuid, self.proxy)) as dest_handle:
                status = 'Uploading %s%s to %s' % (
                    source, ' (' + info['uuid'] + ')' if 'uuid' in info else '', self.address)
                file_util.copy(source_handle, dest_handle, autoflush=False, print_status=status)
        # 2) Install upload (this call will be in charge of deleting the temporary file).
        return self.finish_upload_bundle(remote_file_uuids, unpack, info, worksheet_uuid, add_to_worksheet)
    except:
        # On any failure, release every server-side temporary file created so far.
        for remote_file_uuid in remote_file_uuids:
            self.finalize_file(remote_file_uuid)
        raise
def _fetch_bundle_contents_blob(uuid, path=''):
    """
    API to download the contents of a bundle or a subpath within a bundle.

    For directories this method always returns a tarred and gzipped archive of
    the directory.

    For files, if the request has an Accept-Encoding header containing gzip,
    then the returned file is gzipped.

    Query parameters: 'head'/'tail' request line summaries, a Range header
    requests a byte range (files only), 'max_line_length' bounds summary lines.
    """
    byte_range = get_request_range()
    head_lines = query_get_type(int, 'head', default=0)
    tail_lines = query_get_type(int, 'tail', default=0)
    max_line_length = query_get_type(int, 'max_line_length', default=128)
    # Raises if the requesting user cannot read the bundle.
    check_bundles_have_read_permission(local.model, request.user, [uuid])
    bundle = local.model.get_bundle(uuid)

    target_info = local.download_manager.get_target_info(uuid, path, 0)
    if target_info is None:
        abort(httplib.NOT_FOUND, 'Not found.')

    # Figure out the file name: the bundle's own name for its root, otherwise
    # the name of the requested sub-path.
    if not path and bundle.metadata.name:
        filename = bundle.metadata.name
    else:
        filename = target_info['name']

    if target_info['type'] == 'directory':
        # Range/head requests only make sense for single files.
        if byte_range:
            abort(httplib.BAD_REQUEST, 'Range not supported for directory blobs.')
        if head_lines:
            abort(httplib.BAD_REQUEST, 'Head not supported for directory blobs.')
        # Always tar and gzip directories.
        filename = filename + '.tar.gz'
        fileobj = local.download_manager.stream_tarred_gzipped_directory(
            uuid, path)
    elif target_info['type'] == 'file':
        gzipped = False
        if not zip_util.path_is_archive(
                filename) and request_accepts_gzip_encoding():
            # Let's gzip to save bandwidth. The browser will transparently decode
            # the file.
            filename = filename + '.gz'
            gzipped = True

        if byte_range and (head_lines or tail_lines):
            abort(httplib.BAD_REQUEST, 'Head and range not supported on the same request.')
        elif byte_range:
            # Byte ranges are inclusive; convert to a length for the reader.
            start, end = byte_range
            fileobj = local.download_manager.read_file_section(
                uuid, path, start, end - start + 1, gzipped)
        elif head_lines or tail_lines:
            fileobj = local.download_manager.summarize_file(
                uuid, path, head_lines, tail_lines, max_line_length, None, gzipped)
        else:
            fileobj = local.download_manager.stream_file(uuid, path, gzipped)
    else:
        # Symlinks.
        abort(httplib.FORBIDDEN, 'Cannot download files of this type.')

    # Set headers.  A '.gz' name here is either a pre-existing .gz archive or
    # the on-the-fly compression applied above; both are served with
    # Content-Encoding: gzip and the bare filename.
    mimetype, _ = mimetypes.guess_type(filename, strict=False)
    response.set_header('Content-Type', mimetype or 'text/plain')
    if zip_util.get_archive_ext(
            filename) == '.gz' and request_accepts_gzip_encoding():
        filename = zip_util.strip_archive_ext(filename)
        response.set_header('Content-Encoding', 'gzip')
    else:
        response.set_header('Content-Encoding', 'identity')
    response.set_header('Content-Disposition', 'filename="%s"' % filename)

    return fileobj
def test_url_archive(self):
    """A URL ending in .tar.gz is recognized as an archive and split correctly."""
    url = "https://codalab.org/file.tar.gz"
    self.assertEqual(path_is_archive(url), True)
    self.assertEqual(get_archive_ext(url), ".tar.gz")
    self.assertEqual(strip_archive_ext(url), "https://codalab.org/file")
def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources):
    '''
    |sources|: specifies the locations of the contents to upload.  Each element is
               either a URL or a local path.
    |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks
    |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o)
    |git|: for URL, whether |source| is a git repo to clone.
    |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
    |remove_sources|: remove |sources|.

    If |sources| contains one source, then the bundle contents will be that source.
    Otherwise, the bundle contents will be a directory with each of the sources.
    Exceptions:
    - If |git|, then each source is replaced with the result of running 'git clone |source|'
    - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.

    Install the contents of the directory at |source| into
    DATA_SUBDIRECTORY in a subdirectory named by a hash of the contents.

    Return a (data_hash, metadata) pair, where the metadata is a dict mapping
    keys to precomputed statistics about the new data directory.
    '''
    to_delete = []

    # Create temporary directory as a staging area and put everything there.
    temp_path = tempfile.mkdtemp('-bundle_store_upload')
    temp_subpaths = []
    for source in sources:
        # Where to save |source| to (might change this value if we unpack).
        temp_subpath = os.path.join(temp_path, os.path.basename(source))
        if remove_sources:
            to_delete.append(source)
        source_unpack = unpack and zip_util.path_is_archive(source)

        if path_util.path_is_url(source):
            # Download the URL.
            print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, temp_path))
            if git:
                file_util.git_clone(source, temp_subpath)
            else:
                file_util.download_url(source, temp_subpath, print_status=True)
                # Unpack the download in place; the packed copy is removed.
                if source_unpack:
                    zip_util.unpack(temp_subpath, zip_util.strip_archive_ext(temp_subpath))
                    path_util.remove(temp_subpath)
                    temp_subpath = zip_util.strip_archive_ext(temp_subpath)
            print_util.clear_line()
        else:
            # Copy the local path.
            source_path = path_util.normalize(source)
            path_util.check_isvalid(source_path, 'upload')

            # Recursively copy the directory into a new BundleStore temp directory.
            print_util.open_line('BundleStore.upload: %s => %s' % (source_path, temp_subpath))
            if source_unpack:
                zip_util.unpack(source_path, zip_util.strip_archive_ext(temp_subpath))
                temp_subpath = zip_util.strip_archive_ext(temp_subpath)
            else:
                if remove_sources:
                    # Move rather than copy when the caller relinquishes the source.
                    path_util.rename(source_path, temp_subpath)
                else:
                    path_util.copy(source_path, temp_subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
            print_util.clear_line()
        temp_subpaths.append(temp_subpath)

    # If exactly one source, then upload that directly.
    if len(temp_subpaths) == 1:
        to_delete.append(temp_path)
        temp_path = temp_subpaths[0]

    # Multiplex between uploading a directory and uploading a file here.
    # All other path_util calls will use these lists of directories and files.
    if os.path.isdir(temp_path):
        dirs_and_files = path_util.recursive_ls(temp_path)
    else:
        dirs_and_files = ([], [temp_path])

    # Hash the contents of the temporary directory, and then if there is no
    # data with this hash value, move this directory into the data directory.
    print_util.open_line('BundleStore.upload: hashing %s' % temp_path)
    data_hash = '0x%s' % (path_util.hash_directory(temp_path, dirs_and_files),)
    print_util.clear_line()
    print_util.open_line('BundleStore.upload: computing size of %s' % temp_path)
    data_size = path_util.get_size(temp_path, dirs_and_files)
    print_util.clear_line()
    final_path = os.path.join(self.data, data_hash)
    if os.path.exists(final_path):
        # Already exists, just delete it
        path_util.remove(temp_path)
    else:
        print >>sys.stderr, 'BundleStore.upload: moving %s to %s' % (temp_path, final_path)
        path_util.rename(temp_path, final_path)

    # Delete paths.
    for path in to_delete:
        if os.path.exists(path):
            path_util.remove(path)

    # After this operation there should always be a directory at the final path.
    assert(os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
    return (data_hash, {'data_size': data_size})
def upload(self, sources, follow_symlinks, exclude_patterns, git, unpack, remove_sources, uuid):
    """
    |sources|: specifies the locations of the contents to upload.  Each element is
               either a URL or a local path.
    |follow_symlinks|: for local path(s), whether to follow (resolve) symlinks
    |exclude_patterns|: for local path(s), don't upload these patterns (e.g., *.o)
    |git|: for URL, whether |source| is a git repo to clone.
    |unpack|: for each source in |sources|, whether to unpack it if it's an archive.
    |remove_sources|: remove |sources|.

    If |sources| contains one source, then the bundle contents will be that source.
    Otherwise, the bundle contents will be a directory with each of the sources.
    Exceptions:
    - If |git|, then each source is replaced with the result of running 'git clone |source|'
    - If |unpack| is True or a source is an archive (zip, tar.gz, etc.), then unpack the source.

    Install the contents of the directory at |source| into
    DATA_SUBDIRECTORY in a subdirectory named by a hash of the contents.

    Return a (data_hash, metadata) pair, where the metadata is a dict mapping
    keys to precomputed statistics about the new data directory.
    """
    to_delete = []

    # If just a single file, set the final path to be equal to that file
    single_path = len(sources) == 1

    # Determine which disk this will go on: consistent-hash the uuid onto a
    # partition node.
    disk_choice = self.ring.get_node(uuid)

    final_path = os.path.join(self.partitions, disk_choice, self.DATA_SUBDIRECTORY, uuid)
    if os.path.exists(final_path):
        raise UsageError('Path %s already present in bundle store' % final_path)
    # Only make if not there
    elif not single_path:
        path_util.make_directory(final_path)

    # Paths to resources
    subpaths = []

    for source in sources:
        # Where to save |source| to (might change this value if we unpack).
        if not single_path:
            subpath = os.path.join(final_path, os.path.basename(source))
        else:
            subpath = final_path

        if remove_sources:
            to_delete.append(source)
        source_unpack = unpack and zip_util.path_is_archive(source)

        if source_unpack and single_path:
            # Load the file into the bundle store under the given path
            subpath += zip_util.get_archive_ext(source)

        if path_util.path_is_url(source):
            # Download the URL.
            print_util.open_line('BundleStore.upload: downloading %s to %s' % (source, subpath))
            if git:
                file_util.git_clone(source, subpath)
            else:
                file_util.download_url(source, subpath, print_status=True)
                # Unpack the download in place; the packed copy is removed.
                if source_unpack:
                    zip_util.unpack(subpath, zip_util.strip_archive_ext(subpath))
                    path_util.remove(subpath)
                    subpath = zip_util.strip_archive_ext(subpath)
            print_util.clear_line()
        else:
            # Copy the local path.
            source_path = path_util.normalize(source)
            path_util.check_isvalid(source_path, 'upload')

            # Recursively copy the directory into the BundleStore
            print_util.open_line('BundleStore.upload: %s => %s' % (source_path, subpath))
            if source_unpack:
                zip_util.unpack(source_path, zip_util.strip_archive_ext(subpath))
                subpath = zip_util.strip_archive_ext(subpath)
            else:
                if remove_sources:
                    # Move rather than copy when the caller relinquishes the source.
                    path_util.rename(source_path, subpath)
                else:
                    path_util.copy(source_path, subpath, follow_symlinks=follow_symlinks, exclude_patterns=exclude_patterns)
            print_util.clear_line()
        subpaths.append(subpath)

    dirs_and_files = None
    if os.path.isdir(final_path):
        dirs_and_files = path_util.recursive_ls(final_path)
    else:
        dirs_and_files = [], [final_path]

    # Hash the contents of the bundle directory. Update the data_hash attribute
    # for the bundle
    print_util.open_line('BundleStore.upload: hashing %s' % final_path)
    data_hash = '0x%s' % (path_util.hash_directory(final_path, dirs_and_files))
    print_util.clear_line()
    print_util.open_line('BundleStore.upload: computing size of %s' % final_path)
    data_size = path_util.get_size(final_path, dirs_and_files)
    print_util.clear_line()

    # Delete paths.
    for path in to_delete:
        if os.path.exists(path):
            path_util.remove(path)

    # After this operation there should always be a directory at the final path.
    assert (os.path.lexists(final_path)), 'Uploaded to %s failed!' % (final_path,)
    return (data_hash, {'data_size': data_size})