def get_filesystem(path):
  """Get the correct filesystem for the specified path."""
  try:
    # Resolve the filesystem for the path's scheme (e.g. 'gs://' vs. local
    # paths) through Beam's filesystem lookup.
    from apache_beam.io import filesystems_util  # pylint: disable=g-import-not-at-top
    return filesystems_util.get_filesystem(path)
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError('Unable to get the Filesystem', {path: e})
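# Illustrative usage sketch (assumptions: logging is imported as in the other
# snippets, and the bucket path is hypothetical). The helper above is the
# entry point used by the path-level wrappers later in this section
# (exists, open, rename, glob, ...).
try:
  fs = get_filesystem('gs://example-bucket/output/')  # hypothetical path
except BeamIOError as e:
  logging.warning('Could not resolve a filesystem: %s', e)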
def _copy_model_dir(trained_model, dest, extract_model_fn):
  """Copy a folder.

  Args:
    trained_model: Folder containing a model.
    dest: Folder to copy trained_model to.
    extract_model_fn: A function to extract the model path from a file
      hierarchy like the one generated from tf.learn's
      `Estimator.export_savedmodel()`.

  Returns:
    dest

  Raises:
    ValueError: If the model directory doesn't match the tf.learn structure.
  """
  if extract_model_fn:
    trained_model = extract_model_fn(trained_model)

  def append_trailing_slash(path):
    return path if path.endswith('/') else path + '/'

  # TODO(user): Remove this guard after new Beam release
  try:
    from apache_beam.io import filesystems_util  # pylint: disable=g-import-not-at-top
    file_system = filesystems_util.get_filesystem(
        append_trailing_slash(trained_model))
    file_system.copy([append_trailing_slash(trained_model)],
                     [append_trailing_slash(dest)])
  except ImportError:
    fileio.ChannelFactory.copytree(
        append_trailing_slash(trained_model), append_trailing_slash(dest))
  return dest
def _rename_batch(batch):
  """_rename_batch executes batch rename operations."""
  source_files, destination_files = batch
  exceptions = []
  if self._file_system is None:
    self._file_system = get_filesystem(file_path_prefix)
  try:
    self._file_system.rename(source_files, destination_files)
    return exceptions
  except BeamIOError as exp:
    if exp.exception_details is None:
      raise
    for (src, dest), exception in exp.exception_details.iteritems():
      if exception:
        logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                        exception)
        should_report = True
        if isinstance(exception, IOError):
          # May have already been copied.
          try:
            if self._file_system.exists(dest):
              should_report = False
          except Exception as exists_e:  # pylint: disable=broad-except
            logging.warning('Exception when checking if file %s exists: '
                            '%s', dest, exists_e)
        if should_report:
          logging.warning(('Exception in _rename_batch. src: %s, '
                           'dest: %s, err: %s'), src, dest, exception)
          exceptions.append(exception)
      else:
        logging.debug('Rename successful: %s -> %s', src, dest)
    return exceptions
def extract_model_fn(trained_model_dir):
  """Extract Model."""
  # gcsio.glob() will return all the files under
  # <trained_model_dir>/export/<export_name>/*; for this reason we search for
  # one specific file (saved_model.pb) according to the tf.learn directory
  # hierarchy, where the * corresponds to the model timestamp.
  # TODO(user): Remove this guard after new Beam release
  try:
    from apache_beam.io import filesystems_util  # pylint: disable=g-import-not-at-top
    file_system = filesystems_util.get_filesystem(trained_model_dir)
    match_result = file_system.match([
        os.path.join(trained_model_dir, 'export', export_name, '*',
                     'saved_model.pb')
    ])[0]
    paths = [f.path for f in match_result.metadata_list]
  except ImportError:
    paths = fileio.ChannelFactory.glob(
        os.path.join(trained_model_dir, 'export', export_name, '*',
                     'saved_model.pb'))
  # We still validate that there is only one model under the given path.
  if len(paths) == 1:
    return paths[0].replace('saved_model.pb', '')
  else:
    raise ValueError(
        'The model on %s was not exported by tf.learn, '
        'or there is more than one matching model path: '
        '%s' % (trained_model_dir, paths))
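# Illustrative sketch of how the two model-copy helpers above are assumed to
# fit together: extract_model_fn closes over an `export_name` from its
# enclosing scope and is passed to _copy_model_dir as the extraction callback.
# The export name and the GCS directories below are hypothetical.
export_name = 'Servo'  # hypothetical export name captured by the closure
_copy_model_dir('gs://example-bucket/training-output',   # hypothetical source
                'gs://example-bucket/deployed-model/',   # hypothetical dest
                extract_model_fn)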
def initialize_write(self):
  file_path_prefix = self.file_path_prefix.get()
  file_name_suffix = self.file_name_suffix.get()
  tmp_dir = file_path_prefix + file_name_suffix + time.strftime(
      '-temp-%Y-%m-%d_%H-%M-%S')
  if self._file_system is None:
    self._file_system = get_filesystem(file_path_prefix)
  self._file_system.mkdirs(tmp_dir)
  return tmp_dir
def rename_batch(src_dest_pairs):
  sources = [s for s, _ in src_dest_pairs]
  destinations = [d for _, d in src_dest_pairs]
  if not sources:
    return []
  # All paths in a batch are assumed to live on the same filesystem, so the
  # first source path is used to resolve it.
  bfs = get_filesystem(sources[0])
  try:
    bfs.rename(sources, destinations)
    return []
  except BeamIOError as exp:
    return [(s, d, e) for (s, d), e in exp.exception_details.iteritems()]
def open(path, mode, mime_type='application/octet-stream',
         compression_type=CompressionTypes.AUTO):
  bfs = get_filesystem(path)
  if mode == 'rb':
    return bfs.open(path, mime_type, compression_type)
  elif mode == 'wb':
    return bfs.create(path, mime_type, compression_type)
  else:
    raise ValueError('Unsupported mode: %r' % mode)
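# Illustrative sketch for the module-level open() wrapper above; the path is
# hypothetical and the file is assumed to be small enough to read in one call.
sample_path = 'gs://example-bucket/data.txt'  # hypothetical path
reader = open(sample_path, 'rb')
header = reader.read(64)
reader.close()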
def open(self, temp_path):
  """Opens ``temp_path``, returning an opaque file handle object.

  The returned file handle is passed to ``write_[encoded_]record`` and
  ``close``.
  """
  if self._file_system is None:
    self._file_system = get_filesystem(self.file_path_prefix.get())
  return self._file_system.create(temp_path, self.mime_type,
                                  self.compression_type)
def file_copy(from_path, to_path):
  if not from_path.endswith(names.PICKLED_MAIN_SESSION_FILE):
    self.assertEqual(expected_from_path, from_path)
    filesystem = get_filesystem(expected_to_dir)
    self.assertEqual(
        filesystem.join(expected_to_dir, names.DATAFLOW_SDK_TARBALL_FILE),
        to_path)
  if from_path.startswith('gs://') or to_path.startswith('gs://'):
    logging.info('Faking file_copy(%s, %s)', from_path, to_path)
  else:
    shutil.copyfile(from_path, to_path)
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes ``FileBasedSource``.

  Args:
    file_pattern: the file glob to read.
    min_bundle_size: minimum size of bundles that should be generated when
      performing initial splitting on this source.
    compression_type: compression type to use
    splittable: whether FileBasedSource should try to logically split a single
      file into data ranges so that different parts of the same file can be
      read in parallel. If set to False, FileBasedSource will prevent both
      initial and dynamic splitting of sources for single files. File patterns
      that represent multiple files may still get split into sources for
      individual files. Even if set to True by the user, FileBasedSource may
      choose to not split the file, for example, for compressed files where
      currently it is not possible to efficiently read a data range without
      decompressing the whole file.
    validate: Boolean flag to verify that the files exist during the pipeline
      creation time.

  Raises:
    TypeError: when compression_type is not valid or if file_pattern is not a
      string.
    ValueError: when compression and splittable files are specified.
    IOError: when the file pattern specified yields an empty result.
  """
  if not isinstance(file_pattern, basestring):
    raise TypeError('%s: file_pattern must be a string; got %r instead' %
                    (self.__class__.__name__, file_pattern))

  self._pattern = file_pattern
  self._file_system = get_filesystem(file_pattern)
  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  if compression_type in (CompressionTypes.UNCOMPRESSED,
                          CompressionTypes.AUTO):
    self._splittable = splittable
  else:
    # We can't split compressed files efficiently so turn off splitting.
    self._splittable = False
  if validate:
    self._validate()
def _validate(self):
  """Validate if there are actual files in the specified glob pattern
  """
  pattern = self._pattern.get()
  if self._file_system is None:
    self._file_system = get_filesystem(pattern)

  # Limit the responses as we only want to check if something exists
  match_result = self._file_system.match([pattern], limits=[1])[0]
  if len(match_result.metadata_list) <= 0:
    raise IOError('No files found based on the file pattern %s' % pattern)
def size_of_files_in_glob(path, file_names=None):
  bfs = get_filesystem(path)
  match_result = bfs.match([path])[0]
  part_files = {f.path: f.size_in_bytes for f in match_result.metadata_list}
  if file_names is not None:
    specific_files = {}
    match_results = bfs.match(file_names)
    for match_result in match_results:
      for metadata in match_result.metadata_list:
        specific_files[metadata.path] = metadata.size_in_bytes
    # dict.update() returns None, so merge in place and return the dict.
    part_files.update(specific_files)
  return part_files
def __init__(self, options):
  self.options = options
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  if not self.google_cloud_options.job_name:
    self.google_cloud_options.job_name = self.default_job_name(
        self.google_cloud_options.job_name)

  required_google_cloud_options = ['project', 'job_name', 'temp_location']
  missing = [
      option for option in required_google_cloud_options
      if not getattr(self.google_cloud_options, option)
  ]
  if missing:
    raise ValueError(
        'Missing required configuration parameters: %s' % missing)

  if not self.google_cloud_options.staging_location:
    logging.info('Defaulting to the temp_location as staging_location: %s',
                 self.google_cloud_options.temp_location)
    self.google_cloud_options.staging_location = (
        self.google_cloud_options.temp_location)

  # Make the staging and temp locations job name and time specific. This is
  # needed to avoid clashes between job submissions using the same staging
  # area or team members using the same job names. This method is not
  # entirely foolproof since two job submissions with the same name can
  # happen at exactly the same time. However the window is extremely small
  # given that time.time() has at least microseconds granularity. We add the
  # suffix only for GCS staging locations where the potential for such
  # clashes is high.
  if self.google_cloud_options.staging_location.startswith('gs://'):
    path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
    filesystem = get_filesystem(self.google_cloud_options.staging_location)
    self.google_cloud_options.staging_location = filesystem.join(
        self.google_cloud_options.staging_location, path_suffix)
    self.google_cloud_options.temp_location = filesystem.join(
        self.google_cloud_options.temp_location, path_suffix)

  self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
  if self.options.view_as(StandardOptions).streaming:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
  else:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH
  self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
  self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
def __init__(self,
             file_path_prefix,
             coder,
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
  """
  Raises:
    TypeError: if file path parameters are not a string or ValueProvider, or
      if compression_type is not a member of CompressionTypes.
    ValueError: if shard_name_template is not of expected format.
  """
  if not (isinstance(file_path_prefix, basestring) or
          isinstance(file_path_prefix, ValueProvider)):
    raise TypeError('file_path_prefix must be a string or ValueProvider; '
                    'got %r instead' % file_path_prefix)
  if not (isinstance(file_name_suffix, basestring) or
          isinstance(file_name_suffix, ValueProvider)):
    raise TypeError('file_name_suffix must be a string or ValueProvider; '
                    'got %r instead' % file_name_suffix)
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))

  if shard_name_template is None:
    shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
  elif shard_name_template == '':
    num_shards = 1
  if isinstance(file_path_prefix, basestring):
    file_path_prefix = StaticValueProvider(str, file_path_prefix)
  if isinstance(file_name_suffix, basestring):
    file_name_suffix = StaticValueProvider(str, file_name_suffix)
  self.file_path_prefix = file_path_prefix
  self.file_name_suffix = file_name_suffix
  self.num_shards = num_shards
  self.coder = coder
  self.shard_name_format = self._template_to_format(shard_name_template)
  self.compression_type = compression_type
  self.mime_type = mime_type
  if file_path_prefix.is_accessible():
    self._file_system = get_filesystem(file_path_prefix.get())
  else:
    self._file_system = None
def _get_concat_source(self):
  if self._concat_source is None:
    pattern = self._pattern.get()

    single_file_sources = []
    if self._file_system is None:
      self._file_system = get_filesystem(pattern)
    match_result = self._file_system.match([pattern])[0]
    files_metadata = match_result.metadata_list

    # We create a reference for FileBasedSource that will be serialized along
    # with each _SingleFileSource. To prevent this FileBasedSource from having
    # a reference to ConcatSource (resulting in quadratic space complexity)
    # we clone it here.
    file_based_source_ref = pickler.loads(pickler.dumps(self))

    for file_metadata in files_metadata:
      file_name = file_metadata.path
      file_size = file_metadata.size_in_bytes
      if file_size == 0:
        continue  # Ignoring empty file.

      # We determine splittability of this specific file.
      splittable = self.splittable
      if (splittable and
          self._compression_type == CompressionTypes.AUTO):
        compression_type = CompressionTypes.detect_compression_type(file_name)
        if compression_type != CompressionTypes.UNCOMPRESSED:
          splittable = False

      single_file_source = _SingleFileSource(
          file_based_source_ref, file_name,
          0,
          file_size,
          min_bundle_size=self._min_bundle_size,
          splittable=splittable)
      single_file_sources.append(single_file_source)
    self._concat_source = concat_source.ConcatSource(single_file_sources)
  return self._concat_source
def __init__(self, file_path, expected_checksum, sleep_secs=None):
  """Initialize a FileChecksumMatcher object.

  Args:
    file_path: A string that is the full path of the output file. This path
      can contain globs.
    expected_checksum: A hash string that is computed from the expected
      result.
    sleep_secs: Number of seconds to wait before verification starts. Extra
      time is given to make sure output files are ready on FS.
  """
  if sleep_secs is not None:
    if isinstance(sleep_secs, int):
      self.sleep_secs = sleep_secs
    else:
      raise ValueError('Sleep seconds, if received, must be int. '
                       'But received: %r, %s' % (sleep_secs,
                                                 type(sleep_secs)))
  else:
    self.sleep_secs = None

  self.file_path = file_path
  self.file_system = get_filesystem(self.file_path)
  self.expected_checksum = expected_checksum
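# Illustrative sketch: FileChecksumMatcher is assumed to be a PyHamcrest-style
# verifier that a test hands to the runner (or to assert_that) once the
# pipeline has written its output. The glob, checksum and sleep value below
# are hypothetical placeholders.
matcher = FileChecksumMatcher('gs://example-bucket/results/output*',
                              expected_checksum='0123456789abcdef',
                              sleep_secs=20)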
def exists(path):
  bfs = get_filesystem(path)
  return bfs.exists(path)
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes ``FileBasedSource``.

  Args:
    file_pattern: the file glob to read, a string or a ValueProvider
      (placeholder to inject a runtime value).
    min_bundle_size: minimum size of bundles that should be generated when
      performing initial splitting on this source.
    compression_type: compression type to use
    splittable: whether FileBasedSource should try to logically split a single
      file into data ranges so that different parts of the same file can be
      read in parallel. If set to False, FileBasedSource will prevent both
      initial and dynamic splitting of sources for single files. File patterns
      that represent multiple files may still get split into sources for
      individual files. Even if set to True by the user, FileBasedSource may
      choose to not split the file, for example, for compressed files where
      currently it is not possible to efficiently read a data range without
      decompressing the whole file.
    validate: Boolean flag to verify that the files exist during the pipeline
      creation time.

  Raises:
    TypeError: when compression_type is not valid or if file_pattern is not a
      string or a ValueProvider.
    ValueError: when compression and splittable files are specified.
    IOError: when the file pattern specified yields an empty result.
  """
  if (not (isinstance(file_pattern, basestring) or
           isinstance(file_pattern, ValueProvider))):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead' %
                    (self.__class__.__name__, file_pattern))

  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern
  if file_pattern.is_accessible():
    self._file_system = get_filesystem(file_pattern.get())
  else:
    self._file_system = None

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  if compression_type in (CompressionTypes.UNCOMPRESSED,
                          CompressionTypes.AUTO):
    self._splittable = splittable
  else:
    # We can't split compressed files efficiently so turn off splitting.
    self._splittable = False
  if validate and file_pattern.is_accessible():
    self._validate()
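# Illustrative sketch of the deferred-validation behaviour above, using a
# hypothetical FileBasedSource subclass and a hypothetical local pattern.
# With a plain string the pattern is accessible at construction time, so the
# filesystem is resolved and _validate() runs immediately; a deferred
# ValueProvider (e.g. a templated pipeline option) leaves self._file_system
# as None and postpones validation until execution time.
eager_source = _MyFileBasedSource('/tmp/input-*.txt')  # hypothetical subclass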
def estimate_size(self):
  pattern = self._pattern.get()
  if self._file_system is None:
    self._file_system = get_filesystem(pattern)
  match_result = self._file_system.match([pattern])[0]
  return sum([f.size_in_bytes for f in match_result.metadata_list])
def stage_job_resources(
    options, file_copy=_dependency_file_copy, build_setup_args=None,
    temp_dir=None, populate_requirements_cache=_populate_requirements_cache):
  """Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will expect
      staging_location, requirements_file, setup_file, and save_main_session
      options to be present.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a setup
      package. Used only if options.setup_file is not None. Used only for
      testing.
    temp_dir: Temporary folder where the resource building can happen. If None
      then a unique temp directory will be created. Used only for testing.
    populate_requirements_cache: Callable for populating the requirements
      cache. Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered while
      trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  google_cloud_options = options.view_as(GoogleCloudOptions)
  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified. There are a few that
  # have defaults to support local running scenarios.
  if google_cloud_options.staging_location is None:
    raise RuntimeError('The --staging_location option must be specified.')
  if google_cloud_options.temp_location is None:
    raise RuntimeError('The --temp_location option must be specified.')

  filesystem = get_filesystem(google_cloud_options.staging_location)

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError('The file %s cannot be found. It was specified in '
                         'the --requirements_file command line option.' %
                         setup_options.requirements_file)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  REQUIREMENTS_FILE)
    file_copy(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None
        else setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    populate_requirements_cache(
        setup_options.requirements_file, requirements_cache_path)
    for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
      file_copy(pkg, filesystem.join(google_cloud_options.staging_location,
                                     os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a GCS path and the file cannot
  # be created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError('The file %s cannot be found. It was specified in '
                         'the --setup_file command line option.' %
                         setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                        build_setup_args)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  WORKFLOW_TARBALL_FILE)
    file_copy(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        _stage_extra_packages(setup_options.extra_packages,
                              google_cloud_options.staging_location,
                              temp_dir=temp_dir, file_copy=file_copy))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a GCS path and the file
  # cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  names.PICKLED_MAIN_SESSION_FILE)
    file_copy(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location'):
    if setup_options.sdk_location == 'default':
      stage_tarball_from_remote_location = True
    elif (setup_options.sdk_location.startswith('gs://') or
          setup_options.sdk_location.startswith('http://') or
          setup_options.sdk_location.startswith('https://')):
      stage_tarball_from_remote_location = True
    else:
      stage_tarball_from_remote_location = False

    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
    if stage_tarball_from_remote_location:
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is made and the exact URL or path
      # is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage a tarball.
      if setup_options.sdk_location == 'default':
        sdk_remote_location = 'pypi'
      else:
        sdk_remote_location = setup_options.sdk_location
      _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
      resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
    else:
      # Check if we have a local Beam SDK tarball present. This branch is
      # used by tests running with the SDK built at head.
      if setup_options.sdk_location == 'default':
        module_path = os.path.abspath(__file__)
        sdk_path = os.path.join(os.path.dirname(module_path), '..', '..',
                                '..', names.DATAFLOW_SDK_TARBALL_FILE)
      elif os.path.isdir(setup_options.sdk_location):
        sdk_path = os.path.join(setup_options.sdk_location,
                                names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
        file_copy(sdk_path, staged_path)
        resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError('Cannot find default Beam SDK tar file "%s"' %
                             sdk_path)
        elif not setup_options.sdk_location:
          logging.info('Beam SDK will not be staged since --sdk_location '
                       'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' % sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  return resources
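# Illustrative sketch of how stage_job_resources() might be invoked, assuming
# PipelineOptions is importable from the SDK's options module; the project,
# job name and bucket values below are hypothetical.
options = PipelineOptions([
    '--project=example-project',
    '--job_name=example-job',
    '--temp_location=gs://example-bucket/tmp',
    '--staging_location=gs://example-bucket/staging',
])
staged_resources = stage_job_resources(options)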
def open_file(self, file_name):
  return get_filesystem(file_name).open(
      file_name, 'application/octet-stream',
      compression_type=self._compression_type)
def size_in_bytes(path):
  bfs = get_filesystem(path)
  match_result = bfs.match([path])[0]
  return [f.size_in_bytes for f in match_result.metadata_list][0]
def mkdir(path):
  bfs = get_filesystem(path)
  return bfs.mkdirs(path)
def _stage_extra_packages(extra_packages, staging_location, temp_dir,
                          file_copy=_dependency_file_copy):
  """Stages a list of local extra packages.

  Args:
    extra_packages: Ordered list of local paths to extra packages to be
      staged.
    staging_location: Staging location for the packages.
    temp_dir: Temporary folder where the resource building can happen. Caller
      is responsible for cleaning up this folder after this function returns.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in staging_location.

  Raises:
    RuntimeError: If files specified are not found or do not have expected
      name patterns.
  """
  resources = []
  staging_filesystem = get_filesystem(staging_location)
  staging_temp_dir = None
  local_packages = []
  for package in extra_packages:
    if not (os.path.basename(package).endswith('.tar') or
            os.path.basename(package).endswith('.tar.gz') or
            os.path.basename(package).endswith('.whl')):
      raise RuntimeError(
          'The --extra_package option expects a full path ending with '
          '".tar" or ".tar.gz" instead of %s' % package)
    if os.path.basename(package).endswith('.whl'):
      logging.warning(
          'The .whl package "%s" is provided in --extra_package. '
          'This functionality is not officially supported. Since wheel '
          'packages are binary distributions, this package must be '
          'binary-compatible with the worker environment (e.g. Python 2.7 '
          'running on an x64 Linux host).', package)

    if not os.path.isfile(package):
      if package.startswith('gs://'):
        if not staging_temp_dir:
          staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
        logging.info('Downloading extra package: %s locally before staging',
                     package)
        _dependency_file_copy(package, staging_temp_dir)
      else:
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % package)
    else:
      local_packages.append(package)

  if staging_temp_dir:
    temp_fs = get_filesystem(staging_temp_dir)
    local_packages.extend(
        [temp_fs.join(staging_temp_dir, f)
         for f in os.listdir(staging_temp_dir)])

  for package in local_packages:
    basename = os.path.basename(package)
    staged_path = staging_filesystem.join(staging_location, basename)
    file_copy(package, staged_path)
    resources.append(basename)

  # Create a file containing the list of extra packages and stage it.
  # The file is important so that in the worker the packages are installed
  # exactly in the order specified. This approach will avoid extra PyPI
  # requests. For example if package A depends on package B and package A
  # is installed first then the installer will try to satisfy the
  # dependency on B by downloading the package from PyPI. If package B is
  # installed first this is avoided.
  with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f:
    for package in local_packages:
      f.write('%s\n' % os.path.basename(package))
  staged_path = staging_filesystem.join(staging_location, EXTRA_PACKAGES_FILE)
  # Note that the caller of this function is responsible for deleting the
  # temporary folder where all temp files are created, including this one.
  file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path)
  resources.append(EXTRA_PACKAGES_FILE)

  return resources
def rename(src, dest):
  bfs = get_filesystem(src)
  bfs.rename([src], [dest])
def rmdir(path):
  bfs = get_filesystem(path)
  bfs.delete([path])
def copytree(src, dest):
  bfs = get_filesystem(src)
  return bfs.copy([src], [dest])
def rename(src, dest):
  bfs = get_filesystem(src)
  return bfs.rename([src], [dest])
def exists(path):
  bfs = get_filesystem(path)
  return bfs.exists(path)
def rm(path):
  bfs = get_filesystem(path)
  return bfs.delete([path])
def glob(path, limit=None):
  bfs = get_filesystem(path)
  match_result = bfs.match([path], [limit])[0]
  return [f.path for f in match_result.metadata_list]
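# Illustrative sketch tying the small path helpers in this section together
# (mkdir/glob/size_in_bytes/rmdir); the directory is hypothetical and logging
# is assumed to be imported as in the earlier snippets.
out_dir = '/tmp/example-output'  # hypothetical directory
mkdir(out_dir)
for matched_path in glob(out_dir + '/part-*'):
  logging.info('%s is %d bytes', matched_path, size_in_bytes(matched_path))
rmdir(out_dir)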