Example #1
    def _next_file(self):
        """Finds the next filename.

        Entries in self._filenames that end with the delimiter are treated
        as prefixes and expanded via listbucket.

        Returns:
          None if no files are left. The next filename otherwise.
        """
        while True:
            # Drain the current listbucket iterator, if any.
            if self._bucket_iter:
                try:
                    return next(self._bucket_iter).filename
                except StopIteration:
                    self._bucket_iter = None
                    self._bucket = None
            if self._index >= len(self._filenames):
                return
            filename = self._filenames[self._index]
            self._index += 1
            # A plain filename is returned as-is; a name ending with the
            # delimiter is a prefix that still needs expansion.
            if self._delimiter is None or not filename.endswith(
                    self._delimiter):
                return filename
            self._bucket = cloudstorage.listbucket(filename,
                                                   delimiter=self._delimiter)
            self._bucket_iter = iter(self._bucket)
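
The lazy expansion above is easy to demonstrate without the cloudstorage
dependency. Below is a minimal sketch of the same pattern; iter_files and
list_prefix are hypothetical names (list_prefix plays the role of
cloudstorage.listbucket), not part of the original reader.

def iter_files(filenames, delimiter, list_prefix):
    # Yield concrete filenames; any entry ending with the delimiter is a
    # prefix that gets expanded through the listing callback.
    for name in filenames:
        if delimiter is None or not name.endswith(delimiter):
            yield name  # already a concrete file
        else:
            for expanded in list_prefix(name):
                yield expanded

# Usage with an in-memory stand-in for a bucket listing:
fake_bucket = {"/b/dir/": ["/b/dir/a.txt", "/b/dir/b.txt"]}
files = iter_files(["/b/x.txt", "/b/dir/"], "/", fake_bucket.get)
assert list(files) == ["/b/x.txt", "/b/dir/a.txt", "/b/dir/b.txt"]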
Example #2
  def _try_to_clean_garbage(self, writer_spec):
    """Tries to remove temporary files created by this shard.

    Builds the shard's temporary-file prefix from _TMPFILE_PREFIX and
    removes every matching file found in the bucket.

    Args:
      writer_spec: writer parameters; used to look up the account id.
    """
    tmpl = string.Template(self._TMPFILE_PREFIX)
    prefix = tmpl.substitute(
        id=self.status.mapreduce_id, shard=self.status.shard)
    bucket = self.status.writer_spec[self.BUCKET_NAME_PARAM]
    account_id = writer_spec.get(self._ACCOUNT_ID_PARAM, None)
    # Remove every file in the bucket that carries this shard's temporary
    # prefix.
    for f in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                     _account_id=account_id):
      self._remove_file(f.filename, self.status.writer_spec)
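
The prefix that scopes the cleanup is built with plain string.Template
substitution. A minimal sketch, assuming a hypothetical template value
(the real _TMPFILE_PREFIX is a constant defined elsewhere in the writer
and may differ):

import string

# Hypothetical template; placeholders match the substitute() call above.
TMPFILE_PREFIX = "$id-tmp-$shard-"

tmpl = string.Template(TMPFILE_PREFIX)
prefix = tmpl.substitute(id="job123", shard=7)
print(prefix)  # job123-tmp-7-
# Listing "/<bucket>/" + prefix then yields only the temporary files this
# shard created, each of which can be removed.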
Example #3
    @classmethod
    def split_input(cls, job_config):
        """Returns a list of input readers.

        An equal number of input files is assigned to each shard (+/- 1).
        If there are fewer files than shards, fewer than the requested
        number of shards will be used. Input files are currently never
        split (although for some formats they could be, and they may be
        split in a future implementation).

        Args:
          job_config: a map_job.JobConfig.

        Returns:
          A list of InputReaders; empty when no input data can be found.
        """
        reader_params = job_config.input_reader_params
        bucket = reader_params[cls.BUCKET_NAME_PARAM]
        filenames = reader_params[cls.OBJECT_NAMES_PARAM]
        delimiter = reader_params.get(cls.DELIMITER_PARAM)
        account_id = reader_params.get(cls._ACCOUNT_ID_PARAM)
        buffer_size = reader_params.get(cls.BUFFER_SIZE_PARAM)
        path_filter = reader_params.get(cls.PATH_FILTER_PARAM)

        all_filenames = []
        for filename in filenames:
            # A trailing "*" marks a prefix; strip it and expand it via
            # listbucket.
            if filename.endswith("*"):
                all_filenames.extend([
                    file_stat.filename for file_stat in
                    cloudstorage.listbucket("/" + bucket + "/" + filename[:-1],
                                            delimiter=delimiter,
                                            _account_id=account_id)
                ])
            else:
                all_filenames.append("/%s/%s" % (bucket, filename))

        readers = []
        for shard in range(job_config.shard_count):
            # Extended slicing stripes filenames round-robin across shards.
            shard_filenames = all_filenames[shard::job_config.shard_count]
            if shard_filenames:
                readers.append(
                    cls(shard_filenames,
                        buffer_size=buffer_size,
                        _account_id=account_id,
                        delimiter=delimiter,
                        path_filter=path_filter))
        return readers
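
The sharding step relies on extended slicing: all_filenames[shard::shard_count]
takes every shard_count-th file starting at offset shard, which stripes the
files round-robin and gives each shard an equal count (+/- 1). A
self-contained illustration; assign_files is a hypothetical helper, not part
of the reader:

def assign_files(all_filenames, shard_count):
    # Shards whose stripe would be empty are simply not created, which is
    # why fewer than the requested number of shards may be used.
    return [all_filenames[shard::shard_count]
            for shard in range(shard_count)
            if all_filenames[shard::shard_count]]

files = ["f0", "f1", "f2", "f3", "f4"]
print(assign_files(files, 3))  # [['f0', 'f3'], ['f1', 'f4'], ['f2']]
print(assign_files(files, 8))  # five single-file shards, not eight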