Example #1
 def get_filesystem(path):
     """Get the correct filesystem for the specified path."""
     try:
         # Delegate to Beam's scheme-based filesystem lookup.
         from apache_beam.io import filesystems_util
         return filesystems_util.get_filesystem(path)
     except Exception as e:
         raise BeamIOError('Unable to get the Filesystem', {path: e})
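A minimal usage sketch of this helper may be useful before the longer examples below; it assumes a pre-FileSystems Beam release that still ships apache_beam.io.filesystems_util, and the bucket and object names are placeholders.

 # Minimal usage sketch; the gs:// bucket and object names are placeholders.
 from apache_beam.io import filesystems_util

 fs = filesystems_util.get_filesystem('gs://my-bucket/output/')
 if fs.exists('gs://my-bucket/output/part-00000'):
     # match() takes a list of patterns and returns one MatchResult per pattern.
     match_result = fs.match(['gs://my-bucket/output/part-*'])[0]
     total_bytes = sum(m.size_in_bytes for m in match_result.metadata_list)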
Example #2
    def _copy_model_dir(trained_model, dest, extract_model_fn):
        """Copy a folder.

    Args:
      trained_model: Folder containing a model.
      dest: Folder to copy trained_model to.
      extract_model_fn: A function to extract the model path from a file
        hierarchy like the one generated from tf.learn's
        `Estimator.export_savedmodel()`.

    Returns:
      dest
    Raises:
      ValueError: If the model directory doesn't match the tf.learn structure.
    """
        if extract_model_fn:
            trained_model = extract_model_fn(trained_model)

        def append_trailing_slash(path):
            return path if path.endswith('/') else path + '/'

        # TODO(user): Remove this guard after the new Beam release.
        try:
            from apache_beam.io import filesystems_util  # pylint: disable=g-import-not-at-top
            file_system = filesystems_util.get_filesystem(
                append_trailing_slash(trained_model))
            file_system.copy([append_trailing_slash(trained_model)],
                             [append_trailing_slash(dest)])
        except ImportError:
            fileio.ChannelFactory.copytree(
                append_trailing_slash(trained_model),
                append_trailing_slash(dest))
        return dest
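As a point of reference, a hedged sketch of how this helper might be called together with the extract_model_fn from Example #5; the directory names are placeholders.

    # Hypothetical call sketch; trained_model_dir and dest_dir are placeholders,
    # and extract_model_fn is the helper shown in Example #5.
    trained_model_dir = 'gs://my-bucket/training-output'
    dest_dir = 'gs://my-bucket/deployed-model/'
    copied_to = _copy_model_dir(trained_model_dir, dest_dir, extract_model_fn)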
Example #3
 def get_filesystem(path):
   """Get the correct filesystem for the specified path."""
   try:
     # Delegate to Beam's scheme-based filesystem lookup.
     from apache_beam.io import filesystems_util
     return filesystems_util.get_filesystem(path)
   except Exception as e:
     raise BeamIOError('Unable to get the Filesystem', {path: e})
Example #4
 def _rename_batch(batch):
     """_rename_batch executes batch rename operations."""
     source_files, destination_files = batch
     exceptions = []
     if self._file_system is None:
         self._file_system = get_filesystem(file_path_prefix)
     try:
         self._file_system.rename(source_files, destination_files)
         return exceptions
     except BeamIOError as exp:
         if exp.exception_details is None:
             raise
         for (src,
              dest), exception in exp.exception_details.iteritems():
             if exception:
                 logging.warning('Rename not successful: %s -> %s, %s',
                                 src, dest, exception)
                 should_report = True
                 if isinstance(exception, IOError):
                     # May have already been copied.
                     try:
                         if self._file_system.exists(dest):
                             should_report = False
                     except Exception as exists_e:  # pylint: disable=broad-except
                         logging.warning(
                             'Exception when checking if file %s exists: '
                             '%s', dest, exists_e)
                 if should_report:
                     logging.warning(
                         ('Exception in _rename_batch. src: %s, '
                          'dest: %s, err: %s'), src, dest, exception)
                     exceptions.append(exception)
             else:
                 logging.debug('Rename successful: %s -> %s', src, dest)
         return exceptions
Example #5
 def extract_model_fn(trained_model_dir):
     """Extract Model."""
     # gcsio.glob() will return all the files under
     # <trained_model_dir>/export/<export_name>/*, so we search for one
     # specific file (saved_model.pb) according to the tf.learn directory
     # hierarchy, where the * corresponds to the model timestamp.
     # TODO(user): Remove this guard after the new Beam release.
     try:
         from apache_beam.io import filesystems_util  # pylint: disable=g-import-not-at-top
         file_system = filesystems_util.get_filesystem(
             trained_model_dir)
         match_result = file_system.match([
             os.path.join(trained_model_dir, 'export', export_name, '*',
                          'saved_model.pb')
         ])[0]
         paths = [f.path for f in match_result.metadata_list]
     except ImportError:
         paths = fileio.ChannelFactory.glob(
             os.path.join(trained_model_dir, 'export', export_name, '*',
                          'saved_model.pb'))
     # We still validate that there is only one model under the given path.
     if len(paths) == 1:
         return paths[0].replace('saved_model.pb', '')
     else:
         raise ValueError(
             'The model in %s was not exported by tf.learn, '
             'or there is more than one matching model path: '
             '%s' % (trained_model_dir, paths))
Example #6
 def initialize_write(self):
     file_path_prefix = self.file_path_prefix.get()
     file_name_suffix = self.file_name_suffix.get()
     tmp_dir = file_path_prefix + file_name_suffix + time.strftime(
         '-temp-%Y-%m-%d_%H-%M-%S')
     if self._file_system is None:
         self._file_system = get_filesystem(file_path_prefix)
     self._file_system.mkdirs(tmp_dir)
     return tmp_dir
Example #7
 def rename_batch(src_dest_pairs):
   sources = [s for s, _ in src_dest_pairs]
   destinations = [d for _, d in src_dest_pairs]
    # Resolve the filesystem from the first source path.
    bfs = get_filesystem(sources[0])
   try:
     bfs.rename(sources, destinations)
     return []
   except BeamIOError as exp:
     return [(s, d, e) for (s, d), e in exp.exception_details.iteritems()]
Example #8
 def open(path,
          mode,
          mime_type='application/octet-stream',
          compression_type=CompressionTypes.AUTO):
     bfs = get_filesystem(path)
     if mode == 'rb':
         return bfs.open(path, mime_type, compression_type)
     elif mode == 'wb':
         return bfs.create(path, mime_type, compression_type)
     else:
         # Reject unsupported modes explicitly instead of returning None.
         raise ValueError('Invalid file open mode: %r.' % mode)
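A small round-trip sketch through the wrapper above; the local path is a placeholder, and the handles returned by the underlying filesystem are assumed to be file-like (write/read/close).

 # Round-trip sketch; '/tmp/example.txt' is a placeholder path.
 # Note: this calls the wrapper above, which shadows the builtin open().
 writer = open('/tmp/example.txt', 'wb')
 writer.write(b'hello filesystems\n')
 writer.close()

 reader = open('/tmp/example.txt', 'rb')
 contents = reader.read()
 reader.close()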
Example #9
    def open(self, temp_path):
        """Opens ``temp_path``, returning an opaque file handle object.

    The returned file handle is passed to ``write_[encoded_]record`` and
    ``close``.
    """
        if self._file_system is None:
            self._file_system = get_filesystem(self.file_path_prefix.get())
        return self._file_system.create(temp_path, self.mime_type,
                                        self.compression_type)
Example #10
 def file_copy(from_path, to_path):
     if not from_path.endswith(names.PICKLED_MAIN_SESSION_FILE):
         self.assertEqual(expected_from_path, from_path)
         filesystem = get_filesystem(expected_to_dir)
         self.assertEqual(
             filesystem.join(expected_to_dir,
                             names.DATAFLOW_SDK_TARBALL_FILE), to_path)
     if from_path.startswith('gs://') or to_path.startswith('gs://'):
         logging.info('Faking file_copy(%s, %s)', from_path, to_path)
     else:
         shutil.copyfile(from_path, to_path)
Example #11
    def __init__(self,
                 file_pattern,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 splittable=True,
                 validate=True):
        """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read.
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.
    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """
        if not isinstance(file_pattern, basestring):
            raise TypeError(
                '%s: file_pattern must be a string;  got %r instead' %
                (self.__class__.__name__, file_pattern))

        self._pattern = file_pattern
        self._file_system = get_filesystem(file_pattern)
        self._concat_source = None
        self._min_bundle_size = min_bundle_size
        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        self._compression_type = compression_type
        if compression_type in (CompressionTypes.UNCOMPRESSED,
                                CompressionTypes.AUTO):
            self._splittable = splittable
        else:
            # We can't split compressed files efficiently so turn off splitting.
            self._splittable = False
        if validate:
            self._validate()
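FileBasedSource is normally used through a subclass, so a minimal, hedged subclass sketch follows. read_records is the extension point subclasses implement; the naive reader below loads whole files and ignores range_tracker, so it is illustrative only and should be constructed with splittable=False.

 # Minimal subclass sketch (assumption: subclasses override read_records()).
 # The naive reader ignores range_tracker-based dynamic splitting, so pass
 # splittable=False when constructing it.
 class SimpleLineSource(FileBasedSource):

     def read_records(self, file_name, range_tracker):
         if self._file_system is None:
             self._file_system = get_filesystem(file_name)
         file_handle = self._file_system.open(file_name)
         try:
             for line in file_handle.read().splitlines():
                 yield line
         finally:
             file_handle.close()

 # e.g. SimpleLineSource('gs://my-bucket/input/*.txt', splittable=False)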
Example #12
    def _validate(self):
        """Validate if there are actual files in the specified glob pattern
    """
        pattern = self._pattern.get()
        if self._file_system is None:
            self._file_system = get_filesystem(pattern)

        # Limit the responses as we only want to check if something exists
        match_result = self._file_system.match([pattern], limits=[1])[0]
        if len(match_result.metadata_list) <= 0:
            raise IOError('No files found based on the file pattern %s' %
                          pattern)
Example #13
  def _validate(self):
    """Validate if there are actual files in the specified glob pattern
    """
    pattern = self._pattern.get()
    if self._file_system is None:
      self._file_system = get_filesystem(pattern)

    # Limit the responses as we only want to check if something exists
    match_result = self._file_system.match([pattern], limits=[1])[0]
    if len(match_result.metadata_list) <= 0:
      raise IOError(
          'No files found based on the file pattern %s' % pattern)
Example #14
  def size_of_files_in_glob(path, file_names=None):
    bfs = get_filesystem(path)
    match_result = bfs.match([path])[0]
    part_files = {f.path: f.size_in_bytes for f in match_result.metadata_list}

    if file_names is not None:
      specific_files = {}
      match_results = bfs.match(file_names)
      for match_result in match_results:
        for metadata in match_result.metadata_list:
          specific_files[metadata.path] = metadata.size_in_bytes
      # Merge in place; dict.update() returns None, so return the dict itself.
      part_files.update(specific_files)

    return part_files
Example #15
    def __init__(self, options):
        self.options = options
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        if not self.google_cloud_options.job_name:
            self.google_cloud_options.job_name = self.default_job_name(
                self.google_cloud_options.job_name)

        required_google_cloud_options = [
            'project', 'job_name', 'temp_location'
        ]
        missing = [
            option for option in required_google_cloud_options
            if not getattr(self.google_cloud_options, option)
        ]
        if missing:
            raise ValueError('Missing required configuration parameters: %s' %
                             missing)

        if not self.google_cloud_options.staging_location:
            logging.info(
                'Defaulting to the temp_location as staging_location: %s',
                self.google_cloud_options.temp_location)
            (self.google_cloud_options.staging_location
             ) = self.google_cloud_options.temp_location

        # Make the staging and temp locations job-name and time specific. This is
        # needed to avoid clashes between job submissions using the same staging
        # area or team members using the same job names. This method is not
        # entirely foolproof since two job submissions with the same name can
        # happen at exactly the same time. However, the window is extremely small
        # given that time.time() has at least microsecond granularity. We add the
        # suffix only for GCS staging locations where the potential for such
        # clashes is high.
        if self.google_cloud_options.staging_location.startswith('gs://'):
            path_suffix = '%s.%f' % (self.google_cloud_options.job_name,
                                     time.time())
            filesystem = get_filesystem(
                self.google_cloud_options.staging_location)
            self.google_cloud_options.staging_location = filesystem.join(
                self.google_cloud_options.staging_location, path_suffix)
            self.google_cloud_options.temp_location = filesystem.join(
                self.google_cloud_options.temp_location, path_suffix)

        self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
        if self.options.view_as(StandardOptions).streaming:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
        else:
            self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH
        self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
        self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
Example #16
    def __init__(self,
                 file_path_prefix,
                 coder,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 mime_type='application/octet-stream',
                 compression_type=CompressionTypes.AUTO):
        """
     Raises:
      TypeError: if file path parameters are not a string or ValueProvider,
                 or if compression_type is not member of CompressionTypes.
      ValueError: if shard_name_template is not of expected format.
    """
        if not (isinstance(file_path_prefix, basestring)
                or isinstance(file_path_prefix, ValueProvider)):
            raise TypeError(
                'file_path_prefix must be a string or ValueProvider; '
                'got %r instead' % file_path_prefix)
        if not (isinstance(file_name_suffix, basestring)
                or isinstance(file_name_suffix, ValueProvider)):
            raise TypeError(
                'file_name_suffix must be a string or ValueProvider; '
                'got %r instead' % file_name_suffix)

        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        if shard_name_template is None:
            shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
        elif shard_name_template == '':
            num_shards = 1
        if isinstance(file_path_prefix, basestring):
            file_path_prefix = StaticValueProvider(str, file_path_prefix)
        if isinstance(file_name_suffix, basestring):
            file_name_suffix = StaticValueProvider(str, file_name_suffix)
        self.file_path_prefix = file_path_prefix
        self.file_name_suffix = file_name_suffix
        self.num_shards = num_shards
        self.coder = coder
        self.shard_name_format = self._template_to_format(shard_name_template)
        self.compression_type = compression_type
        self.mime_type = mime_type
        if file_path_prefix.is_accessible():
            self._file_system = get_filesystem(file_path_prefix.get())
        else:
            self._file_system = None
Example #17
    def _get_concat_source(self):
        if self._concat_source is None:
            pattern = self._pattern.get()

            single_file_sources = []
            if self._file_system is None:
                self._file_system = get_filesystem(pattern)
            match_result = self._file_system.match([pattern])[0]
            files_metadata = match_result.metadata_list

            # We create a reference for FileBasedSource that will be serialized along
            # with each _SingleFileSource. To prevent this FileBasedSource from having
            # a reference to ConcatSource (resulting in quadratic space complexity)
            # we clone it here.
            file_based_source_ref = pickler.loads(pickler.dumps(self))

            for file_metadata in files_metadata:
                file_name = file_metadata.path
                file_size = file_metadata.size_in_bytes
                if file_size == 0:
                    continue  # Ignoring empty file.

                # We determine splittability of this specific file.
                splittable = self.splittable
                if (splittable
                        and self._compression_type == CompressionTypes.AUTO):
                    compression_type = CompressionTypes.detect_compression_type(
                        file_name)
                    if compression_type != CompressionTypes.UNCOMPRESSED:
                        splittable = False

                single_file_source = _SingleFileSource(
                    file_based_source_ref,
                    file_name,
                    0,
                    file_size,
                    min_bundle_size=self._min_bundle_size,
                    splittable=splittable)
                single_file_sources.append(single_file_source)
            self._concat_source = concat_source.ConcatSource(
                single_file_sources)
        return self._concat_source
Example #18
  def _get_concat_source(self):
    if self._concat_source is None:
      pattern = self._pattern.get()

      single_file_sources = []
      if self._file_system is None:
        self._file_system = get_filesystem(pattern)
      match_result = self._file_system.match([pattern])[0]
      files_metadata = match_result.metadata_list

      # We create a reference for FileBasedSource that will be serialized along
      # with each _SingleFileSource. To prevent this FileBasedSource from having
      # a reference to ConcatSource (resulting in quadratic space complexity)
      # we clone it here.
      file_based_source_ref = pickler.loads(pickler.dumps(self))

      for file_metadata in files_metadata:
        file_name = file_metadata.path
        file_size = file_metadata.size_in_bytes
        if file_size == 0:
          continue  # Ignoring empty file.

        # We determine splittability of this specific file.
        splittable = self.splittable
        if (splittable and
            self._compression_type == CompressionTypes.AUTO):
          compression_type = CompressionTypes.detect_compression_type(
              file_name)
          if compression_type != CompressionTypes.UNCOMPRESSED:
            splittable = False

        single_file_source = _SingleFileSource(
            file_based_source_ref, file_name,
            0,
            file_size,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable)
        single_file_sources.append(single_file_source)
      self._concat_source = concat_source.ConcatSource(single_file_sources)
    return self._concat_source
Example #19
  def __init__(self, file_path, expected_checksum, sleep_secs=None):
    """Initialize a FileChecksumMatcher object

    Args:
      file_path: A string that is the full path of the output file. This path
        can contain globs.
      expected_checksum: A hash string computed from the expected result.
      sleep_secs: Number of seconds to wait before verification starts.
        Extra time is given to make sure output files are ready on the
        filesystem.
    """
    if sleep_secs is not None:
      if isinstance(sleep_secs, int):
        self.sleep_secs = sleep_secs
      else:
        raise ValueError('Sleep seconds, if received, must be int. '
                         'But received: %r, %s' % (sleep_secs,
                                                   type(sleep_secs)))
    else:
      self.sleep_secs = None

    self.file_path = file_path
    self.file_system = get_filesystem(self.file_path)
    self.expected_checksum = expected_checksum
Example #20
    def __init__(self, file_path, expected_checksum, sleep_secs=None):
        """Initialize a FileChecksumMatcher object

    Args:
      file_path: A string that is the full path of the output file. This path
        can contain globs.
      expected_checksum: A hash string computed from the expected result.
      sleep_secs: Number of seconds to wait before verification starts.
        Extra time is given to make sure output files are ready on the
        filesystem.
    """
        if sleep_secs is not None:
            if isinstance(sleep_secs, int):
                self.sleep_secs = sleep_secs
            else:
                raise ValueError('Sleep seconds, if received, must be int. '
                                 'But received: %r, %s' %
                                 (sleep_secs, type(sleep_secs)))
        else:
            self.sleep_secs = None

        self.file_path = file_path
        self.file_system = get_filesystem(self.file_path)
        self.expected_checksum = expected_checksum
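A construction sketch for the matcher defined above; the output glob and checksum are placeholders, and the matcher is typically handed to a test pipeline as its success matcher.

        # Construction sketch; the output glob and checksum are placeholders.
        matcher = FileChecksumMatcher('gs://my-bucket/output/part-*',
                                      expected_checksum='<expected-checksum>',
                                      sleep_secs=30)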
Example #21
 def exists(path):
     bfs = get_filesystem(path)
     return bfs.exists(path)
Example #22
  def __init__(self,
               file_pattern,
               min_bundle_size=0,
               compression_type=CompressionTypes.AUTO,
               splittable=True,
               validate=True):
    """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read; a string or a ValueProvider
                    (placeholder to inject a runtime value).
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.
    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string or a ValueProvider.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """

    if (not (isinstance(file_pattern, basestring)
             or isinstance(file_pattern, ValueProvider))):
      raise TypeError('%s: file_pattern must be of type string'
                      ' or ValueProvider; got %r instead'
                      % (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, basestring):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern
    if file_pattern.is_accessible():
      self._file_system = get_filesystem(file_pattern.get())
    else:
      self._file_system = None

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    self._compression_type = compression_type
    if compression_type in (CompressionTypes.UNCOMPRESSED,
                            CompressionTypes.AUTO):
      self._splittable = splittable
    else:
      # We can't split compressed files efficiently so turn off splitting.
      self._splittable = False
    if validate and file_pattern.is_accessible():
      self._validate()
Example #23
 def estimate_size(self):
   pattern = self._pattern.get()
   if self._file_system is None:
     self._file_system = get_filesystem(pattern)
   match_result = self._file_system.match([pattern])[0]
   return sum([f.size_in_bytes for f in match_result.metadata_list])
Example #24
def stage_job_resources(
    options, file_copy=_dependency_file_copy, build_setup_args=None,
    temp_dir=None, populate_requirements_cache=_populate_requirements_cache):
  """Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will expect
      staging_location, requirements_file, setup_file, and save_main_session
      options to be present.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a setup
      package. Used only if options.setup_file is not None. Used only for
      testing.
    temp_dir: Temporary folder where the resource building can happen. If None
      then a unique temp directory will be created. Used only for testing.
    populate_requirements_cache: Callable for populating the requirements cache.
      Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered while
      trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  google_cloud_options = options.view_as(GoogleCloudOptions)
  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified. There are a few that have
  # defaults to support local running scenarios.
  if google_cloud_options.staging_location is None:
    raise RuntimeError(
        'The --staging_location option must be specified.')
  if google_cloud_options.temp_location is None:
    raise RuntimeError(
        'The --temp_location option must be specified.')

  filesystem = get_filesystem(google_cloud_options.staging_location)

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--requirements_file command line option.' %
                         setup_options.requirements_file)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  REQUIREMENTS_FILE)
    file_copy(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None
        else setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    populate_requirements_cache(
        setup_options.requirements_file, requirements_cache_path)
    for pkg in  glob.glob(os.path.join(requirements_cache_path, '*')):
      file_copy(pkg, filesystem.join(google_cloud_options.staging_location,
                                     os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a GCS path and the file cannot be
  # created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError('The file %s cannot be found. It was specified in the '
                         '--setup_file command line option.' %
                         setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                        build_setup_args)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  WORKFLOW_TARBALL_FILE)
    file_copy(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        _stage_extra_packages(setup_options.extra_packages,
                              google_cloud_options.staging_location,
                              temp_dir=temp_dir, file_copy=file_copy))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a GCS path and the file
  # cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  names.PICKLED_MAIN_SESSION_FILE)
    file_copy(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location'):
    if setup_options.sdk_location == 'default':
      stage_tarball_from_remote_location = True
    elif (setup_options.sdk_location.startswith('gs://') or
          setup_options.sdk_location.startswith('http://') or
          setup_options.sdk_location.startswith('https://')):
      stage_tarball_from_remote_location = True
    else:
      stage_tarball_from_remote_location = False

    staged_path = filesystem.join(google_cloud_options.staging_location,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
    if stage_tarball_from_remote_location:
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is made and the exact URL or path
      # is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage a tarball.
      if setup_options.sdk_location == 'default':
        sdk_remote_location = 'pypi'
      else:
        sdk_remote_location = setup_options.sdk_location
      _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
      resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
    else:
      # Check if we have a local Beam SDK tarball present. This branch is
      # used by tests running with the SDK built at head.
      if setup_options.sdk_location == 'default':
        module_path = os.path.abspath(__file__)
        sdk_path = os.path.join(
            os.path.dirname(module_path), '..', '..', '..',
            names.DATAFLOW_SDK_TARBALL_FILE)
      elif os.path.isdir(setup_options.sdk_location):
        sdk_path = os.path.join(
            setup_options.sdk_location, names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
        file_copy(sdk_path, staged_path)
        resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError('Cannot find default Beam SDK tar file "%s"' %
                             sdk_path)
        elif not setup_options.sdk_location:
          logging.info('Beam SDK will not be staged since --sdk_location '
                       'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' %
              sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  return resources
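For tests, the file_copy callable can be swapped for a fake, which is what Example #10 does; below is a hedged sketch of such a call, where options is assumed to be an already-populated PipelineOptions instance.

# Hedged test-style sketch: inject a fake copy function instead of the
# gsutil-based default. `options` must already carry staging_location and
# temp_location (see the checks at the top of stage_job_resources).
import logging
import tempfile

def fake_file_copy(from_path, to_path):
    logging.info('Faking file_copy(%s, %s)', from_path, to_path)

staged = stage_job_resources(options,
                             file_copy=fake_file_copy,
                             temp_dir=tempfile.mkdtemp())
# `staged` lists the file names (no paths) placed under staging_location.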
Example #25
 def open_file(self, file_name):
   return get_filesystem(file_name).open(
       file_name, 'application/octet-stream',
       compression_type=self._compression_type)
Example #26
 def size_in_bytes(path):
     bfs = get_filesystem(path)
     match_result = bfs.match([path])[0]
     return [f.size_in_bytes for f in match_result.metadata_list][0]
Example #27
 def mkdir(path):
     bfs = get_filesystem(path)
     return bfs.mkdirs(path)
Example #28
 def open_file(self, file_name):
     return get_filesystem(file_name).open(
         file_name,
         'application/octet-stream',
         compression_type=self._compression_type)
Example #29
def _stage_extra_packages(extra_packages, staging_location, temp_dir,
                          file_copy=_dependency_file_copy):
  """Stages a list of local extra packages.

  Args:
    extra_packages: Ordered list of local paths to extra packages to be staged.
    staging_location: Staging location for the packages.
    temp_dir: Temporary folder where the resource building can happen. Caller
      is responsible for cleaning up this folder after this function returns.
    file_copy: Callable for copying files. The default version will copy from
      a local file to a GCS location using the gsutil tool available in the
      Google Cloud SDK package.

  Returns:
    A list of file names (no paths) for the resources staged. All the files
    are assumed to be staged in staging_location.

  Raises:
    RuntimeError: If files specified are not found or do not have expected
      name patterns.
  """
  resources = []
  staging_filesystem = get_filesystem(staging_location)
  staging_temp_dir = None
  local_packages = []
  for package in extra_packages:
    if not (os.path.basename(package).endswith('.tar') or
            os.path.basename(package).endswith('.tar.gz') or
            os.path.basename(package).endswith('.whl')):
      raise RuntimeError(
          'The --extra_package option expects a full path ending with '
          '".tar", ".tar.gz" or ".whl" instead of %s' % package)
    if os.path.basename(package).endswith('.whl'):
      logging.warning(
          'The .whl package "%s" is provided in --extra_package. '
          'This functionality is not officially supported. Since wheel '
          'packages are binary distributions, this package must be '
          'binary-compatible with the worker environment (e.g. Python 2.7 '
          'running on an x64 Linux host).', package)

    if not os.path.isfile(package):
      if package.startswith('gs://'):
        if not staging_temp_dir:
          staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
        logging.info('Downloading extra package: %s locally before staging',
                     package)
        _dependency_file_copy(package, staging_temp_dir)
      else:
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % package)
    else:
      local_packages.append(package)

  if staging_temp_dir:
    temp_fs = get_filesystem(staging_temp_dir)
    local_packages.extend(
        [temp_fs.join(staging_temp_dir, f) for f in os.listdir(
            staging_temp_dir)])

  for package in local_packages:
    basename = os.path.basename(package)
    staged_path = staging_filesystem.join(staging_location, basename)
    file_copy(package, staged_path)
    resources.append(basename)
  # Create a file containing the list of extra packages and stage it.
  # The file is important so that in the worker the packages are installed
  # exactly in the order specified. This approach will avoid extra PyPI
  # requests. For example if package A depends on package B and package A
  # is installed first then the installer will try to satisfy the
  # dependency on B by downloading the package from PyPI. If package B is
  # installed first this is avoided.
  with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f:
    for package in local_packages:
      f.write('%s\n' % os.path.basename(package))
  staged_path = staging_filesystem.join(staging_location, EXTRA_PACKAGES_FILE)
  # Note that the caller of this function is responsible for deleting the
  # temporary folder where all temp files are created, including this one.
  file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path)
  resources.append(EXTRA_PACKAGES_FILE)

  return resources
Example #30
 def rename(src, dest):
   bfs = get_filesystem(src)
   bfs.rename([src], [dest])
Example #31
 def rmdir(path):
   bfs = get_filesystem(path)
   bfs.delete([path])
Example #32
 def copytree(src, dest):
     bfs = get_filesystem(src)
     return bfs.copy([src], [dest])
Example #33
 def rename(src, dest):
     bfs = get_filesystem(src)
     return bfs.rename([src], [dest])
Example #34
 def exists(path):
   bfs = get_filesystem(path)
   return bfs.exists(path)
Example #35
 def rm(path):
     bfs = get_filesystem(path)
     return bfs.delete([path])
Example #36
 def glob(path, limit=None):
     bfs = get_filesystem(path)
     match_result = bfs.match([path], [limit])[0]
     return [f.path for f in match_result.metadata_list]
Example #37
 def estimate_size(self):
     pattern = self._pattern.get()
     if self._file_system is None:
         self._file_system = get_filesystem(pattern)
     match_result = self._file_system.match([pattern])[0]
     return sum([f.size_in_bytes for f in match_result.metadata_list])