def testSortFile(self):
    """Check that _SortChunksPipeline returns the input records in key order.

    Writes 100 KeyValue records to a blobstore file, runs the sort pipeline
    over that single file and compares the records read back from the
    pipeline output with the sorted input.
    """
    pairs = [(str(n), "_" + str(n)) for n in range(100)]

    blob_path = files.blobstore.create()
    with files.open(blob_path, "a") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())
    files.finalize(blob_path)
    blob_key = files.blobstore.get_blob_key(blob_path)
    readable_path = files.blobstore.get_file_name(blob_key)

    pipeline = shuffler._SortChunksPipeline("testjob", [readable_path])
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = shuffler._SortChunksPipeline.from_id(pipeline.pipeline_id)

    expected = sorted(pairs)
    actual = []
    for sorted_path in pipeline.outputs.default.value[0]:
        with files.open(sorted_path, "r") as handle:
            for raw in records.RecordsReader(handle):
                kv = file_service_pb.KeyValue()
                kv.ParseFromString(raw)
                actual.append((kv.key(), kv.value()))

    self.assertEquals(expected, actual)
def testSortFile(self):
    """Check that _SortChunksPipeline sorts a single GCS file by key.

    Writes 100 KeyValue records to a cloud storage file, runs the sort
    pipeline over it and compares the output records with the sorted input.
    Also expects exactly one entry in self.emails afterwards.
    """
    bucket_name = "testbucket"
    full_filename = "/%s/%s" % (bucket_name, "testfile")
    pairs = [(str(n), "_" + str(n)) for n in range(100)]

    with cloudstorage.open(full_filename, mode="w") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())

    pipeline = shuffler._SortChunksPipeline(
        "testjob", bucket_name, [[full_filename]])
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = shuffler._SortChunksPipeline.from_id(pipeline.pipeline_id)

    expected = sorted(pairs)
    actual = []
    for sorted_path in pipeline.outputs.default.value[0]:
        with cloudstorage.open(sorted_path) as handle:
            for raw in records.RecordsReader(handle):
                kv = file_service_pb.KeyValue()
                kv.ParseFromString(raw)
                actual.append((kv.key(), kv.value()))

    self.assertEquals(expected, actual)
    self.assertEquals(1, len(self.emails))
def testMergeFiles(self):
    """Check that merging three copies of one file groups values per key.

    The same sorted blobstore file is passed to TestMergePipeline three
    times, so every key is expected to come out once with its value
    repeated three times and the trailing flag set to False.
    """
    pairs = sorted((str(n), "_" + str(n)) for n in range(100))

    blob_path = files.blobstore.create()
    with files.open(blob_path, "a") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())
    files.finalize(blob_path)
    blob_key = files.blobstore.get_blob_key(blob_path)
    readable_path = files.blobstore.get_file_name(blob_key)

    pipeline = TestMergePipeline([readable_path] * 3)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = TestMergePipeline.from_id(pipeline.pipeline_id)

    merged_path = pipeline.outputs.default.value[0]
    with files.open(merged_path, "r") as handle:
        merged = list(records.RecordsReader(handle))

    expected = [str((key, [value] * 3, False)) for key, value in pairs]
    self.assertEquals(expected, merged)
def write(self, data):
    """Write a key/value pair to the output file selected by the key's hash.

    Both elements of data are converted with str(), packed into a KeyValue
    proto and appended to the records pool of the chosen file. The pool is
    registered on the current context the first time a file is used.

    Args:
      data: actual data yielded from handler. Expected to be an indexable
        pair (key, value). A length other than 2 is logged but the first
        two elements are still used.

    Raises:
      TypeError: if key and value cannot be extracted from data (for
        example because it does not support indexing).
    """
    if len(data) != 2:
        logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                      len(data), data)

    try:
        key = str(data[0])
        value = str(data[1])
    except TypeError:
        logging.error("Expecting a tuple, but got %s: %s",
                      data.__class__.__name__, data)
        # There is no key to route on; propagate the real error instead of
        # failing later with an unrelated UnboundLocalError on `key`.
        raise

    ctx = context.get()
    file_index = hash(key) % len(self._filenames)
    pool_name = "kv_pool%d" % file_index
    filename = self._filenames[file_index]

    if ctx.get_pool(pool_name) is None:
        ctx.register_pool(
            pool_name,
            output_writers.RecordsPool(filename=filename, ctx=ctx))

    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    ctx.get_pool(pool_name).append(proto.Encode())
def write(self, data):
    """Write a key/value pair to the file handle selected by the key's hash.

    Both elements of data are converted with str(), packed into a KeyValue
    proto and appended to the records pool of the chosen file handle. Pools
    are created lazily and cached in self._pools.

    Args:
      data: actual data yielded from handler. Expected to be an indexable
        pair (key, value). A length other than 2 is logged but the first
        two elements are still used.

    Raises:
      TypeError: if key and value cannot be extracted from data (for
        example because it does not support indexing).
    """
    if len(data) != 2:
        logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                      len(data), data)

    try:
        key = str(data[0])
        value = str(data[1])
    except TypeError:
        logging.error("Expecting a tuple, but got %s: %s",
                      data.__class__.__name__, data)
        # There is no key to route on; propagate the real error instead of
        # failing later with an unrelated UnboundLocalError on `key`.
        raise

    ctx = context.get()
    file_index = hash(key) % len(self._filehandles)

    # Work-around: Since we don't have access to the context in the to_json()
    # function, but we need to flush each pool before we serialize the
    # filehandle, we rely on a member variable instead of using context for
    # pool management.
    pool = self._pools[file_index]
    if pool is None:
        filehandle = self._filehandles[file_index]
        pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
        self._pools[file_index] = pool

    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    pool.append(proto.Encode())
def testMergeFiles(self):
    """Check that merging three copies of one GCS file groups values per key.

    The same sorted cloud storage file is passed to TestMergePipeline three
    times, so every key is expected to come out once with its value
    repeated three times and the trailing flag set to False. Also expects
    exactly one entry in self.emails afterwards.
    """
    pairs = sorted((str(n), "_" + str(n)) for n in range(100))

    bucket_name = "testbucket"
    full_filename = "/%s/%s" % (bucket_name, "testfile")
    with cloudstorage.open(full_filename, mode="w") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())

    pipeline = TestMergePipeline(bucket_name, [full_filename] * 3)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = TestMergePipeline.from_id(pipeline.pipeline_id)

    merged_path = pipeline.outputs.default.value[0]
    with cloudstorage.open(merged_path) as handle:
        merged = list(records.RecordsReader(handle))

    expected = [str((key, [value] * 3, False)) for key, value in pairs]
    self.assertEquals(expected, merged)
    self.assertEquals(1, len(self.emails))
def _hashing_map(binary_record):
    """Map function for the hash phase.

    Args:
      binary_record: a serialized KeyValue proto.

    Yields:
      A single (key, value) tuple taken from the decoded proto.
    """
    kv = file_service_pb.KeyValue()
    kv.ParseFromString(binary_record)
    key, value = kv.key(), kv.value()
    yield (key, value)
def __iter__(self):
    """Merge this shard's input files, yielding (key, values) per key run.

    Every file is read through its own records reader, positioned at the
    offset stored in self._offsets. A heap ordered by the most recently
    read record decides which record is consumed next, and consecutive
    records that share a key are collected into one (key, [values]) tuple.

    self._offsets is refreshed right before each read, so stopping the
    iteration neither skips a record nor reads the same record twice.

    Raises:
      Exception: if the number of files differs from the number of offsets.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    shard_files = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(shard_files) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap entries are (key, value, file position, reader). The None
    # placeholders get swapped for real records on the first pass.
    heap = []
    for position, path in enumerate(shard_files):
        start = self._offsets[position]
        source = records.RecordsReader(files.BufferedFile(path))
        source.seek(start)
        heap.append((None, None, position, source))

    pending = None
    while heap:
        key, value, position, source = heap[0]

        if key is not None:
            if pending is None:
                pending = (key, [])
            elif key != pending[0]:
                # Key changed: hand out the finished group, start a new one.
                yield pending
                pending = (key, [])
            pending[1].append(value)

        try:
            # Remember where the next record starts before consuming it.
            self._offsets[position] = source.tell()
            started = time.time()
            raw = source.read()
            if context.get():
                count_bytes = operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES, len(raw))
                count_bytes(context.get())
                elapsed_msec = int((time.time() - started) * 1000)
                count_msec = operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC, elapsed_msec)
                count_msec(context.get())
            kv = file_service_pb.KeyValue()
            kv.ParseFromString(raw)
            heapq.heapreplace(heap, (kv.key(), kv.value(), position, source))
        except EOFError:
            heapq.heappop(heap)

    if pending is not None:
        yield pending
def testHashingMultipleFiles(self):
    """Check that _HashPipeline keeps every record of every input file.

    The same 100-record file is passed in three times, so the combined
    output must hold 300 records: each input pair exactly three times.
    Also expects exactly one entry in self.emails afterwards.
    """
    pairs = sorted((str(n), str(n)) for n in range(100))

    bucket_name = "testbucket"
    full_filename = "/%s/%s" % (bucket_name, "testfile")
    with cloudstorage.open(full_filename, mode="w") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())

    pipeline = shuffler._HashPipeline(
        "testjob", bucket_name, [full_filename] * 3)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = shuffler._HashPipeline.from_id(pipeline.pipeline_id)

    hashed = []
    for file_group in pipeline.outputs.default.value:
        for hashed_path in file_group:
            with cloudstorage.open(hashed_path) as handle:
                for raw in records.RecordsReader(handle):
                    kv = file_service_pb.KeyValue()
                    kv.ParseFromString(raw)
                    hashed.append((kv.key(), kv.value()))
    hashed.sort()

    self.assertEquals(300, len(hashed))
    # After sorting, the three copies of each pair sit next to each other.
    for index, pair in enumerate(pairs):
        for copy in range(3):
            self.assertEquals(pair, hashed[(3 * index) + copy])
    self.assertEquals(1, len(self.emails))
def testPartialRecords(self):
    """Check that merging splits a key into partial records at the limit.

    _MergePipeline._MAX_VALUES_COUNT is temporarily set to 1 and the same
    GCS file is merged three times. Each key is then expected as three
    single-value records, the first two flagged True and the last False.
    The original limit is restored afterwards.
    """
    try:
        self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
        # Force the max values count to an extremely low value.
        shuffler._MergePipeline._MAX_VALUES_COUNT = 1

        pairs = sorted([("1", "a"), ("2", "b"), ("3", "c")])

        bucket_name = "testbucket"
        full_filename = "/%s/%s" % (bucket_name, "testfile")
        with cloudstorage.open(full_filename, mode="w") as handle:
            with records.RecordsWriter(handle) as writer:
                for key, value in pairs:
                    kv = file_service_pb.KeyValue()
                    kv.set_key(key)
                    kv.set_value(value)
                    writer.write(kv.Encode())

        pipeline = TestMergePipeline(bucket_name, [full_filename] * 3)
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        pipeline = TestMergePipeline.from_id(pipeline.pipeline_id)

        merged_path = pipeline.outputs.default.value[0]
        with cloudstorage.open(merged_path) as handle:
            merged = list(records.RecordsReader(handle))

        expected = []
        for key, value in pairs:
            expected.append(str((key, [value], True)))
            expected.append(str((key, [value], True)))
            expected.append(str((key, [value], False)))
        self.assertEquals(expected, merged)
    finally:
        shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
    self.assertEquals(1, len(self.emails))
def testPartialRecords(self):
    """Check that merging splits a key into partial records at the limit.

    _MergePipeline._MAX_VALUES_COUNT is temporarily set to 1 and the same
    blobstore file is merged three times. Each key is then expected as
    three single-value records, the first two flagged True and the last
    False. The original limit is restored afterwards.
    """
    try:
        self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
        # Force the max values count to an extremely low value.
        shuffler._MergePipeline._MAX_VALUES_COUNT = 1

        pairs = sorted([('1', 'a'), ('2', 'b'), ('3', 'c')])

        blob_path = files.blobstore.create()
        with files.open(blob_path, "a") as handle:
            with records.RecordsWriter(handle) as writer:
                for key, value in pairs:
                    kv = file_service_pb.KeyValue()
                    kv.set_key(key)
                    kv.set_value(value)
                    writer.write(kv.Encode())
        files.finalize(blob_path)
        blob_key = files.blobstore.get_blob_key(blob_path)
        readable_path = files.blobstore.get_file_name(blob_key)

        pipeline = TestMergePipeline([readable_path] * 3)
        pipeline.start()
        test_support.execute_until_empty(self.taskqueue)
        pipeline = TestMergePipeline.from_id(pipeline.pipeline_id)

        merged_path = pipeline.outputs.default.value[0]
        with files.open(merged_path, "r") as handle:
            merged = list(records.RecordsReader(handle))

        expected = []
        for key, value in pairs:
            expected.append(str((key, [value], True)))
            expected.append(str((key, [value], True)))
            expected.append(str((key, [value], False)))
        self.assertEquals(expected, merged)
    finally:
        shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def write(self, data, ctx):
    """Encode a key/value pair as a KeyValue proto and write it out.

    Both elements of data are converted with str(), packed into a KeyValue
    proto, and the encoded proto is passed to FileRecordsOutputWriter.write.

    Args:
      data: expected to be an indexable pair (key, value). A length other
        than 2 is logged but the first two elements are still used.
      ctx: context object, forwarded unchanged to
        FileRecordsOutputWriter.write.

    Raises:
      TypeError: if key and value cannot be extracted from data (for
        example because it does not support indexing).
    """
    if len(data) != 2:
        logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                      len(data), data)

    try:
        key = str(data[0])
        value = str(data[1])
    except TypeError:
        logging.error("Expecting a tuple, but got %s: %s",
                      data.__class__.__name__, data)
        # There is nothing to encode; propagate the real error instead of
        # failing later with an unrelated UnboundLocalError on `key`.
        raise

    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    FileRecordsOutputWriter.write(self, proto.Encode(), ctx)
def _sort_records_map(records):
    """Map function that sorts a batch of records and stores them in GCS.

    Each record is parsed as a KeyValue proto to obtain its key, the batch
    is ordered with _compare_keys, and the original serialized records are
    written in that order to a new GCS file. An _OutputFile entity keyed by
    the file name records the result.

    Args:
      records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()

    logging.debug("Parsing")
    key_records = []
    for serialized in records:
        kv = file_service_pb.KeyValue()
        kv.ParseFromString(serialized)
        # Keep the original bytes so nothing has to be re-encoded later.
        key_records.append((kv.key(), serialized))

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    params = input_readers._get_params(ctx.mapreduce_spec.mapper)
    bucket_name = params.get("bucket_name")
    object_name = "".join([
        ctx.mapreduce_spec.name, "/", ctx.mapreduce_id,
        "/output-", ctx.shard_id, "-", str(int(time.time()))])
    full_filename = "/%s/%s" % (bucket_name, object_name)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for _, serialized in key_records:
            pool.append(serialized)

    logging.debug("Finalizing")
    filehandle.close()

    root_key = _OutputFile.get_root_key(ctx.mapreduce_id)
    _OutputFile(key_name=full_filename, parent=root_key).put()
def _sort_records(records):
    """Map function that sorts a batch of records into a blobstore file.

    Each record is parsed into a KeyValue proto, the protos are ordered
    with _compare_keys and re-encoded in that order into a new blobstore
    file. An _OutputFile entity keyed by the finalized file name records
    the result.

    Args:
      records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()

    logging.debug("Parsing")
    parsed = []
    for serialized in records:
        kv = file_service_pb.KeyValue()
        kv.ParseFromString(serialized)
        parsed.append(kv)

    logging.debug("Sorting")
    parsed.sort(cmp=_compare_keys)

    logging.debug("Writing")
    upload_name = ctx.mapreduce_spec.name + "-" + ctx.mapreduce_id + "-output"
    output_path = files.blobstore.create(
        _blobinfo_uploaded_filename=upload_name)
    with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
        for kv in parsed:
            pool.append(kv.Encode())

    logging.debug("Finalizing")
    files.finalize(output_path)
    time.sleep(1)
    blob_key = files.blobstore.get_blob_key(output_path)
    output_path = files.blobstore.get_file_name(blob_key)

    root_key = _OutputFile.get_root_key(ctx.mapreduce_id)
    _OutputFile(key_name=output_path, parent=root_key).put()
def testShuffleFiles(self):
    """Check that ShufflePipeline groups the values of every key.

    The same 100-record GCS file is shuffled three times over, so each key
    is expected once in the output with its value listed three times.
    Also expects exactly one entry in self.emails afterwards.
    """
    pairs = sorted((str(n), str(n)) for n in range(100))

    bucket_name = "testbucket"
    full_filename = "/%s/%s" % (bucket_name, "testfile")
    with cloudstorage.open(full_filename, mode="w") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())

    pipeline = shuffler.ShufflePipeline(
        "testjob", {"bucket_name": bucket_name}, [full_filename] * 3)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = shuffler.ShufflePipeline.from_id(pipeline.pipeline_id)

    shuffled = []
    for shuffled_path in pipeline.outputs.default.value:
        with cloudstorage.open(shuffled_path) as handle:
            for raw in records.RecordsReader(handle):
                grouped = file_service_pb.KeyValues()
                grouped.ParseFromString(raw)
                shuffled.append((grouped.key(), grouped.value_list()))
    shuffled.sort()

    expected = sorted(
        [(str(key), [str(value)] * 3) for key, value in pairs])
    self.assertEquals(expected, shuffled)
    self.assertEquals(1, len(self.emails))
def testShuffleFiles(self):
    """Check that ShufflePipeline groups the values of every key.

    The same 100-record blobstore file is shuffled three times over, so
    each key is expected once in the output with its value listed three
    times.
    """
    pairs = sorted((str(n), str(n)) for n in range(100))

    blob_path = files.blobstore.create()
    with files.open(blob_path, "a") as handle:
        with records.RecordsWriter(handle) as writer:
            for key, value in pairs:
                kv = file_service_pb.KeyValue()
                kv.set_key(key)
                kv.set_value(value)
                writer.write(kv.Encode())
    files.finalize(blob_path)
    blob_key = files.blobstore.get_blob_key(blob_path)
    readable_path = files.blobstore.get_file_name(blob_key)

    pipeline = shuffler.ShufflePipeline("testjob", [readable_path] * 3)
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    pipeline = shuffler.ShufflePipeline.from_id(pipeline.pipeline_id)

    shuffled = []
    for shuffled_path in pipeline.outputs.default.value:
        with files.open(shuffled_path, "r") as handle:
            for raw in records.RecordsReader(handle):
                grouped = file_service_pb.KeyValues()
                grouped.ParseFromString(raw)
                shuffled.append((grouped.key(), grouped.value_list()))
    shuffled.sort()

    expected = sorted(
        [(str(key), [str(value)] * 3) for key, value in pairs])
    self.assertEquals(expected, shuffled)
def __iter__(self):
    """Iterate over records in input files.

    Merges the files assigned to the current shard and groups consecutive
    records that share a key. A group is cut short and flagged as partial
    when it reaches self._max_values_count values or self._max_values_size
    total value length; a limit of -1 disables the corresponding check.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when the files list and self._offsets differ in length.

    Yields:
      Lists of the form [key, values, is_partial]. is_partial is True when
      the group was emitted because a limit was hit rather than because the
      key changed or the input ended.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap with (Key, Value, Index, reader) pairs.
    readers = []

    # Initialize heap. Entries start with key None as a placeholder; the
    # first pass of the loop below replaces each with a real record.
    for (i, filename) in enumerate(filenames):
        offset = self._offsets[i]
        reader = records.RecordsReader(files.BufferedFile(filename))
        reader.seek(offset)
        readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.
    # current_result is yielded and consumed by _merge_map.
    # current_result = [key, values, is_partial]
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
        (key, value, index, reader) = readers[0]

        if key is not None:
            current_count += 1
            current_size += len(value)

            should_yield = False
            if current_result:
                if key != current_result[0]:
                    # New key encountered
                    should_yield = True
                elif (self._max_values_count != -1 and
                      current_count >= self._max_values_count):
                    # Maximum number of values encountered.
                    current_result[2] = True
                    should_yield = True
                elif (self._max_values_size != -1 and
                      current_size >= self._max_values_size):
                    # Maximum size of values encountered
                    current_result[2] = True
                    should_yield = True

            if should_yield:
                # New key encountered or maximum count hit. Yield current key.
                yield current_result
            if not current_result or should_yield:
                # Start a fresh group. The counters restart at zero even
                # though the current value is appended to it just below.
                current_result = [key, [], False]
                current_count = 0
                current_size = 0
            current_result[1].append(value)

        # Read next key/value from reader.
        try:
            # Record the position before reading so that resuming from
            # self._offsets re-reads exactly this record.
            self._offsets[index] = reader.tell()
            start_time = time.time()
            binary_record = reader.read()
            # update counters
            if context.get():
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES,
                    len(binary_record))(context.get())
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC,
                    int((time.time() - start_time) * 1000))(context.get())
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            # Put read data back into heap.
            heapq.heapreplace(readers,
                              (proto.key(), proto.value(), index, reader))
        except EOFError:
            # EOFError is treated as "this reader is exhausted": drop its
            # entry from the heap.
            heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
        yield current_result
def __iter__(self):
    """Iterate over records in input files.

    Merges the GCS files assigned to the current shard and groups
    consecutive records that share a key. A group is cut short and flagged
    as partial when it reaches self._max_values_count values or
    self._max_values_size total value length; a limit of -1 disables the
    corresponding check.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      Lists of the form [key, values, is_partial]. is_partial is True when
      the group was emitted because a limit was hit rather than because the
      key changed or the input ended.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples. Entries start with key
    # None as a placeholder; the first pass of the loop below replaces each
    # with a real record.
    readers = []

    for (i, filename) in enumerate(filenames):
        offset = self._offsets[i]
        reader = records.RecordsReader(
            cloudstorage.open(filename,
                              read_buffer_size=self.GCS_BUFFER_SIZE))
        reader.seek(offset)
        readers.append((None, None, i, reader))

    # Group currently being built, as [key, values, is_partial], plus the
    # running value count and total value length used for the limits.
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
        (key, value, index, reader) = readers[0]

        if key is not None:
            current_count += 1
            current_size += len(value)

            should_yield = False
            if current_result:
                if key != current_result[0]:
                    # Key changed: the current group is complete.
                    should_yield = True
                elif (self._max_values_count != -1 and
                      current_count >= self._max_values_count):
                    # Value-count limit reached: emit as partial.
                    current_result[2] = True
                    should_yield = True
                elif (self._max_values_size != -1 and
                      current_size >= self._max_values_size):
                    # Value-size limit reached: emit as partial.
                    current_result[2] = True
                    should_yield = True

            if should_yield:
                yield current_result
            if not current_result or should_yield:
                # Start a fresh group. The counters restart at zero even
                # though the current value is appended to it just below.
                current_result = [key, [], False]
                current_count = 0
                current_size = 0
            current_result[1].append(value)

        # Read the next record from the reader at the top of the heap.
        try:
            # Record the position before reading so that resuming from
            # self._offsets re-reads exactly this record.
            self._offsets[index] = reader.tell()
            start_time = time.time()
            binary_record = reader.read()
            # Update the I/O counters when a context is available.
            if context.get():
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES,
                    len(binary_record))(context.get())
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC,
                    int((time.time() - start_time) * 1000))(context.get())
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            # Replace the top heap entry with the record just read.
            heapq.heapreplace(readers,
                              (proto.key(), proto.value(), index, reader))
        except EOFError:
            # EOFError is treated as "this reader is exhausted": drop its
            # entry from the heap.
            heapq.heappop(readers)

    # Emit whatever group is still pending once all readers are drained.
    if current_result:
        yield current_result