def next(self):
  """Returns a handle to the next file.

  Nonexistent files will be logged and skipped. The file might have been
  removed after input splitting.

  Returns:
    The next input from this input reader, in the form of a cloudstorage
    ReadBuffer that supports a file-like interface (read, readline, seek,
    tell, and close). An error may be raised if the file cannot be opened.

  Raises:
    StopIteration: The list of files has been exhausted.
  """
  options = {}
  if self._buffer_size:
    options["read_buffer_size"] = self._buffer_size
  if self._account_id:
    options["_account_id"] = self._account_id
  while True:
    filename = self._next_file()
    if filename is None:
      raise StopIteration()
    if (self._path_filter and
        not self._path_filter.accept(self._slice_ctx, filename)):
      continue
    try:
      start_time = time.time()
      handle = cloudstorage.open(filename, **options)
      self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                           int((time.time() - start_time) * 1000))
      self._slice_ctx.incr(self.COUNTER_FILE_READ)
      return handle
    except cloudstorage.NotFoundError:
      logging.warning("File %s may have been removed. Skipping file.",
                      filename)
      self._slice_ctx.incr(self.COUNTER_FILE_MISSING)
def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
  """Inherit docs."""
  mapper_spec = mr_spec.mapper
  params = output_writers._get_params(mapper_spec)
  bucket_name = params.get(cls.BUCKET_NAME_PARAM)
  shards = mapper_spec.shard_count

  filehandles = []
  filename = (mr_spec.name + "/" + mr_spec.mapreduce_id +
              "/shard-" + str(shard_number) + "-bucket-")
  for i in range(shards):
    full_filename = "/%s/%s%d" % (bucket_name, filename, i)
    filehandles.append(cloudstorage.open(full_filename, mode="w"))
  return cls(filehandles)
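# Illustrative sketch, not library code: with assumed values for the MapReduce
# name, id, bucket, and shard layout, create() above opens one GCS object per
# destination shard using the "shard-<n>-bucket-<i>" naming scheme.
def _example_shard_filenames(bucket_name="example-bucket", mr_name="wordcount",
                             mr_id="12345", shard_number=3, shards=4):
  """Returns the object names create() would open for the assumed inputs."""
  prefix = mr_name + "/" + mr_id + "/shard-" + str(shard_number) + "-bucket-"
  return ["/%s/%s%d" % (bucket_name, prefix, i) for i in range(shards)]

# _example_shard_filenames()[0] == "/example-bucket/wordcount/12345/shard-3-bucket-0"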
def _open_file(cls, writer_spec, filename_suffix, use_tmp_bucket=False):
  """Opens a new GCS file for writing."""
  if use_tmp_bucket:
    bucket = cls._get_tmp_gcs_bucket(writer_spec)
    account_id = cls._get_tmp_account_id(writer_spec)
  else:
    bucket = cls._get_gcs_bucket(writer_spec)
    account_id = cls._get_account_id(writer_spec)

  filename = "/%s/%s" % (bucket, filename_suffix)

  content_type = writer_spec.get(cls.CONTENT_TYPE_PARAM, None)

  options = {}
  if cls.ACL_PARAM in writer_spec:
    options["x-goog-acl"] = writer_spec.get(cls.ACL_PARAM)

  return cloudstorage.open(filename, mode="w", content_type=content_type,
                           options=options, _account_id=account_id)
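# Illustrative sketch of the writer_spec mapping _open_file() consumes. The key
# strings below are assumptions standing in for the class constants
# (BUCKET_NAME_PARAM, CONTENT_TYPE_PARAM, ACL_PARAM); only the shape of the
# mapping and how the values are used above is the point.
_example_writer_spec = {
    "bucket_name": "example-output-bucket",  # assumed bucket name
    "content_type": "text/plain",            # forwarded to cloudstorage.open
    "acl": "project-private",                # becomes the x-goog-acl option
}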
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into a new GCS file. Creates a _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into a new GCS file. Creates a _OutputFile entity to record the resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = kv_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  # _compare_keys is a cmp-style comparator; Python 3's list.sort has no cmp
  # argument, so it is adapted with functools.cmp_to_key.
  key_records.sort(key=functools.cmp_to_key(_compare_keys))

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
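# _compare_keys is defined elsewhere in the library. A minimal sketch with the
# same calling convention (ordering the (key, serialized_record) tuples built
# above by their key) might look like this; it is an illustration, not the
# library's implementation.
def _example_compare_keys(key_record1, key_record2):
  """Returns a negative, zero, or positive number ordering two tuples by key."""
  if key_record1[0] < key_record2[0]:
    return -1
  if key_record1[0] > key_record2[0]:
    return 1
  return 0

# Usage mirroring the sort above:
#   key_records.sort(key=functools.cmp_to_key(_example_compare_keys))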
def _next_seg(self):
  """Get next seg."""
  if self._seg:
    self._seg.close()
  self._seg_index += 1
  if self._seg_index > self._last_seg_index:
    self._seg = None
    return

  filename = self._seg_prefix + str(self._seg_index)
  stat = cloudstorage.stat(filename)
  writer = output_writers._GoogleCloudStorageOutputWriter
  if writer._VALID_LENGTH not in stat.metadata:
    raise ValueError("Expect %s in metadata for file %s." %
                     (writer._VALID_LENGTH, filename))
  self._seg_valid_length = int(stat.metadata[writer._VALID_LENGTH])
  if self._seg_valid_length > stat.st_size:
    raise ValueError(
        "Valid length %s is too big for file %s of length %s" %
        (self._seg_valid_length, filename, stat.st_size))
  self._seg = cloudstorage.open(filename)
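# Illustrative sketch under an assumed helper name (not from the library): a
# read that honors the self._seg_valid_length set above would stop at that byte
# offset instead of the physical end of the object, since bytes past the valid
# length may not belong to the finalized output.
def _example_read_within_valid_length(self, n):
  """Reads at most n bytes from the current seg without passing its valid length."""
  remaining = self._seg_valid_length - self._seg.tell()
  if remaining <= 0:
    return b""
  return self._seg.read(min(n, remaining))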
def __next__(self):
  """Returns a handle to the next file.

  Nonexistent files will be logged and skipped. The file might have been
  removed after input splitting.

  Returns:
    The next input from this input reader, in the form of a cloudstorage
    ReadBuffer that supports a file-like interface (read, readline, seek,
    tell, and close). An error may be raised if the file cannot be opened.

  Raises:
    StopIteration: The list of files has been exhausted.
  """
  options = {}
  if self._buffer_size:
    options["read_buffer_size"] = self._buffer_size
  if self._account_id:
    options["_account_id"] = self._account_id
  while True:
    filename = self._next_file()
    if filename is None:
      raise StopIteration()
    if (self._path_filter and
        not self._path_filter.accept(self._slice_ctx, filename)):
      continue
    try:
      start_time = time.time()
      handle = cloudstorage.open(filename, **options)
      self._slice_ctx.incr(self.COUNTER_IO_READ_MSEC,
                           int((time.time() - start_time) * 1000))
      self._slice_ctx.incr(self.COUNTER_FILE_READ)
      return handle
    except cloudstorage.NotFoundError:
      logging.warning("File %s may have been removed. Skipping file.",
                      filename)
      self._slice_ctx.incr(self.COUNTER_FILE_MISSING)
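# Illustrative usage sketch (reader is a hypothetical, already constructed
# instance of the input reader above): every call to next() yields an open
# cloudstorage ReadBuffer, which per the docstring supports read, readline,
# seek, tell, and close.
def _example_consume_reader(reader):
  """Reads each remaining file line by line and closes every handle."""
  while True:
    try:
      handle = next(reader)
    except StopIteration:
      return
    try:
      line = handle.readline()
      while line:
        # Process one line of the GCS object here.
        line = handle.readline()
    finally:
      handle.close()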
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iteration
  doesn't skip records and doesn't read the same record twice.

  Raises:
    Exception: when the files list and offsets do not match.

  Yields:
    The result.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, index, reader) tuples. Entries start with a None key
  # so that the first record of every reader is read before merging begins.
  readers = []
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(
        cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Merge records from all readers, grouping values that share a key.
  current_result = None
  current_count = 0
  current_size = 0
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          # A new key was encountered.
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Maximum number of values accumulated.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          # Maximum size of values accumulated.
          current_result[2] = True
          should_yield = True

      if should_yield:
        yield current_result
      if not current_result or should_yield:
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    # Read the next record from this reader and restore the heap invariant.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  # Yield the final group.
  if current_result:
    yield current_result
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iteration
  doesn't skip records and doesn't read the same record twice.

  Raises:
    Exception: when the files list and offsets do not match.

  Yields:
    The result.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, index, reader) tuples. Entries start with a None key
  # so that the first record of every reader is read before merging begins.
  readers = []
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(
        cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Merge records from all readers, grouping values that share a key.
  current_result = None
  current_count = 0
  current_size = 0
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          # A new key was encountered.
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Maximum number of values accumulated.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          # Maximum size of values accumulated.
          current_result[2] = True
          should_yield = True

      if should_yield:
        yield current_result
      if not current_result or should_yield:
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    # Read the next record from this reader and restore the heap invariant.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = kv_pb.KeyValue()
      proto.ParseFromString(binary_record)
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  # Yield the final group.
  if current_result:
    yield current_result
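# Self-contained illustration (not library code) of the merge strategy used in
# __iter__ above: keep one entry per sorted stream in a heap keyed by its
# current record, repeatedly take the smallest key, and group consecutive equal
# keys together. Plain in-memory lists stand in for the GCS-backed
# RecordsReader streams, and the max-count/max-size limits are omitted.
import heapq

def _example_merge_sorted_streams(streams):
  """Yields (key, [values]) pairs from several streams of sorted (key, value) pairs."""
  iterators = [iter(stream) for stream in streams]
  heap = []
  for index, iterator in enumerate(iterators):
    try:
      key, value = next(iterator)
      heap.append((key, index, value))
    except StopIteration:
      pass
  heapq.heapify(heap)

  current_key = None
  current_values = []
  while heap:
    key, index, value = heap[0]
    if current_values and key != current_key:
      yield current_key, current_values
      current_values = []
    current_key = key
    current_values.append(value)
    try:
      next_key, next_value = next(iterators[index])
      heapq.heapreplace(heap, (next_key, index, next_value))
    except StopIteration:
      heapq.heappop(heap)
  if current_values:
    yield current_key, current_values

# list(_example_merge_sorted_streams([[("a", 1), ("c", 3)], [("a", 2), ("b", 4)]]))
# == [("a", [1, 2]), ("b", [4]), ("c", [3])]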