def _next_file(self): """Find next filename. self._filenames may need to be expanded via listbucket. Returns: None if no more file is left. Filename otherwise. """ while True: if self._bucket_iter: try: return self._bucket_iter.next().filename except StopIteration: self._bucket_iter = None self._bucket = None if self._index >= len(self._filenames): return filename = self._filenames[self._index] self._index += 1 if self._delimiter is None or not filename.endswith( self._delimiter): return filename self._bucket = cloudstorage.listbucket(filename, delimiter=self._delimiter) self._bucket_iter = iter(self._bucket)
def _try_to_clean_garbage(self, writer_spec):
  """Delete leftover temporary files belonging to this shard.

  Expands the temp-file name template with this shard's mapreduce id and
  shard number, then removes every object under that prefix in the output
  bucket.

  Args:
    writer_spec: writer parameters dict; consulted for the account id.
  """
  # Fill in the temp-file template with this shard's identifiers to get
  # the GCS object-name prefix used for its temporary files.
  prefix = string.Template(self._TMPFILE_PREFIX).substitute(
      id=self.status.mapreduce_id, shard=self.status.shard)
  bucket = self.status.writer_spec[self.BUCKET_NAME_PARAM]
  account_id = writer_spec.get(self._ACCOUNT_ID_PARAM, None)
  # Remove every matching object; listbucket yields file-stat entries.
  for stat in cloudstorage.listbucket("/%s/%s" % (bucket, prefix),
                                      _account_id=account_id):
    self._remove_file(stat.filename, self.status.writer_spec)
def split_input(cls, job_config):
  """Returns a list of input readers.

  An equal number of input files are assigned to each shard (+/- 1). If
  there are fewer files than shards, fewer than the requested number of
  shards will be used. Input files are currently never split (although for
  some formats could be and may be split in a future implementation).

  Args:
    job_config: map_job.JobConfig

  Returns:
    A list of InputReaders; empty when no input data can be found.
  """
  params = job_config.input_reader_params
  bucket = params[cls.BUCKET_NAME_PARAM]
  object_names = params[cls.OBJECT_NAMES_PARAM]
  delimiter = params.get(cls.DELIMITER_PARAM)
  account_id = params.get(cls._ACCOUNT_ID_PARAM)
  buffer_size = params.get(cls.BUFFER_SIZE_PARAM)
  path_filter = params.get(cls.PATH_FILTER_PARAM)

  # Expand trailing-wildcard names via listbucket; pass the rest through
  # as fully qualified "/bucket/object" paths.
  expanded = []
  for name in object_names:
    if name.endswith("*"):
      matches = cloudstorage.listbucket("/" + bucket + "/" + name[:-1],
                                        delimiter=delimiter,
                                        _account_id=account_id)
      expanded.extend(stat.filename for stat in matches)
    else:
      expanded.append("/%s/%s" % (bucket, name))

  # Deal the files round-robin so every shard gets an equal share (+/- 1);
  # shards that would receive no files get no reader at all.
  readers = []
  for shard_number in range(job_config.shard_count):
    files_for_shard = expanded[shard_number::job_config.shard_count]
    if files_for_shard:
      readers.append(cls(files_for_shard,
                         buffer_size=buffer_size,
                         _account_id=account_id,
                         delimiter=delimiter,
                         path_filter=path_filter))
  return readers
def split_input(cls, job_config):
  """Returns a list of input readers.

  An equal number of input files are assigned to each shard (+/- 1). If
  there are fewer files than shards, fewer than the requested number of
  shards will be used. Input files are currently never split (although for
  some formats could be and may be split in a future implementation).

  Args:
    job_config: map_job.JobConfig

  Returns:
    A list of InputReaders; empty when no input data can be found.
  """
  conf = job_config.input_reader_params
  bucket_name = conf[cls.BUCKET_NAME_PARAM]
  names = conf[cls.OBJECT_NAMES_PARAM]
  delim = conf.get(cls.DELIMITER_PARAM)
  acct = conf.get(cls._ACCOUNT_ID_PARAM)
  buf_size = conf.get(cls.BUFFER_SIZE_PARAM)
  filt = conf.get(cls.PATH_FILTER_PARAM)

  # Build the full list of "/bucket/object" paths, expanding any name that
  # ends with "*" through a listbucket prefix query.
  filenames = []
  for obj in names:
    if not obj.endswith("*"):
      filenames.append("/%s/%s" % (bucket_name, obj))
      continue
    filenames.extend(
        stat.filename
        for stat in cloudstorage.listbucket(
            "/" + bucket_name + "/" + obj[:-1],
            delimiter=delim,
            _account_id=acct))

  # Stride-slice the file list so shard i takes files i, i+N, i+2N, ...;
  # shards whose slice is empty produce no reader.
  shard_count = job_config.shard_count
  return [cls(filenames[i::shard_count],
              buffer_size=buf_size,
              _account_id=acct,
              delimiter=delim,
              path_filter=filt)
          for i in range(shard_count)
          if filenames[i::shard_count]]
def _next_file(self): """Find next filename. self._filenames may need to be expanded via listbucket. Returns: None if no more file is left. Filename otherwise. """ while True: if self._bucket_iter: try: return self._bucket_iter.next().filename except StopIteration: self._bucket_iter = None self._bucket = None if self._index >= len(self._filenames): return filename = self._filenames[self._index] self._index += 1 if self._delimiter is None or not filename.endswith(self._delimiter): return filename self._bucket = cloudstorage.listbucket(filename, delimiter=self._delimiter) self._bucket_iter = iter(self._bucket)