Exemple #1
0
    def _next_file(self):
        """Find next filename.

    self._filenames may need to be expanded via listbucket.

    Returns:
      None if no more file is left. Filename otherwise.
    """
        while True:
            if self._bucket_iter:
                try:
                    return self._bucket_iter.next().filename
                except StopIteration:
                    self._bucket_iter = None
                    self._bucket = None
            if self._index >= len(self._filenames):
                return
            filename = self._filenames[self._index]
            self._index += 1
            if self._delimiter is None or not filename.endswith(
                    self._delimiter):
                return filename
            self._bucket = cloudstorage.listbucket(filename,
                                                   delimiter=self._delimiter)
            self._bucket_iter = iter(self._bucket)
Exemple #2
0
    def split_input(cls, job_config):
        """Build one input reader per non-empty shard.

        Object names ending in "*" are treated as prefixes and expanded with
        cloudstorage.listbucket; every other name is used as-is under the
        configured bucket. The resulting filenames are dealt out round-robin,
        so each shard receives an equal number of files (+/- 1). Shards that
        would receive no files get no reader, so fewer readers than
        job_config.shard_count may be returned. Files are never split.

        Args:
          job_config: map_job.JobConfig

        Returns:
          A list of InputReaders; the list is empty when no input files
          are found.
        """
        params = job_config.input_reader_params
        bucket = params[cls.BUCKET_NAME_PARAM]
        object_names = params[cls.OBJECT_NAMES_PARAM]
        delimiter = params.get(cls.DELIMITER_PARAM)
        account_id = params.get(cls._ACCOUNT_ID_PARAM)
        buffer_size = params.get(cls.BUFFER_SIZE_PARAM)
        path_filter = params.get(cls.PATH_FILTER_PARAM)

        # Resolve every requested object name to full "/bucket/object" paths.
        expanded = []
        for name in object_names:
            if not name.endswith("*"):
                expanded.append("/%s/%s" % (bucket, name))
                continue
            # Trailing "*" marks a prefix: list everything underneath it.
            listing = cloudstorage.listbucket("/" + bucket + "/" + name[:-1],
                                              delimiter=delimiter,
                                              _account_id=account_id)
            for stat in listing:
                expanded.append(stat.filename)

        # Options shared by every reader constructed below.
        reader_kwargs = dict(buffer_size=buffer_size,
                             _account_id=account_id,
                             delimiter=delimiter,
                             path_filter=path_filter)

        readers = []
        for offset in range(job_config.shard_count):
            # Strided slice: shard k takes files k, k + count, k + 2*count...
            assigned = expanded[offset::job_config.shard_count]
            if not assigned:
                continue
            readers.append(cls(assigned, **reader_kwargs))
        return readers
Exemple #3
0
    def _try_to_clean_garbage(self, writer_spec, exclude_list=()):
        """Remove this shard's temporary files that are not in exclude_list.

        The temporary-file prefix is built from self._TMPFILE_PREFIX using the
        mapreduce id and shard taken from self.status. Every object listed
        under that prefix in the temporary bucket is passed to
        self._remove_tmpfile unless its filename appears in exclude_list.

        Args:
          writer_spec: writer_spec for the MR; used to look up the temporary
            bucket and account id.
          exclude_list: A list of filenames (strings) that should not be
            removed.
        """
        tmp_prefix = string.Template(self._TMPFILE_PREFIX).substitute(
            id=self.status.mapreduce_id,
            shard=self.status.shard)
        tmp_bucket = self._get_tmp_gcs_bucket(writer_spec)
        tmp_account = self._get_tmp_account_id(writer_spec)

        listing = cloudstorage.listbucket("/%s/%s" % (tmp_bucket, tmp_prefix),
                                          _account_id=tmp_account)
        for stat in listing:
            if stat.filename in exclude_list:
                continue
            # NOTE(review): removal uses self.status.writer_spec rather than
            # the writer_spec argument used for the lookup above -- confirm
            # that the two are always the same.
            self._remove_tmpfile(stat.filename, self.status.writer_spec)