def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file,
          # the parameter can't be compressed and wouldn't fit into
          # taskqueue payload
          "huge_parameter": random_string(900000)
      },
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)
  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
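# testHugeTaskUseDatastore above and testRecordsReader below rely on helpers
# defined elsewhere in the test module. The definitions here are a minimal
# sketch of what those helpers are assumed to look like, based only on how the
# tests use them (TestHandler collects every record it receives; random_string
# produces an uncompressible payload of the requested length); they are not
# the actual implementations.
import random
import string


class TestHandler(object):
  """Map handler that records every value passed to it."""

  processed_entites = []  # name kept exactly as the assertions spell it

  def __call__(self, entity):
    TestHandler.processed_entites.append(entity)


def random_string(length):
  """Return `length` random letters (deliberately hard to compress)."""
  return "".join(random.choice(string.ascii_letters) for _ in xrange(length))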
def flush(self):
  """Flush pool contents."""
  # Write data to in-memory buffer first.
  buf = _StringWriter()
  with records.RecordsWriter(buf) as w:
    for record in self._buffer:
      w.write(record)

  str_buf = buf.to_string()
  if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
    # Shouldn't really happen because of flush size.
    raise errors.Error(
        "Buffer too big. Can't write more than %s bytes in one request: "
        "risk of writes interleaving. Got: %s" %
        (_FILES_API_MAX_SIZE, len(str_buf)))

  # Write data to file.
  start_time = time.time()
  with files.open(self._filename, "a", exclusive_lock=self._exclusive) as f:
    f.write(str_buf)
    if self._ctx:
      operation.counters.Increment(
          COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
  if self._ctx:
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC,
        int((time.time() - start_time) * 1000))(self._ctx)

  # reset buffer
  self._buffer = []
  self._size = 0
  gc.collect()
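# flush() above stages all records in a `_StringWriter` so the Files API is
# hit with a single append call. A minimal sketch of such a writer, assuming
# it only needs the write()/to_string() methods used above (the real class in
# the mapreduce library may differ):
class _StringWriter(object):
  """Collects written chunks in memory and joins them on demand."""

  def __init__(self):
    self._chunks = []

  def write(self, data):
    self._chunks.append(data)

  def to_string(self):
    return "".join(self._chunks)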
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
    input_data.sort()

    input_file = files.blobstore.create()

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ('1', ['a'], True),
        ('1', ['a'], True),
        ('1', ['a'], False),
        ('2', ['b'], True),
        ('2', ['b'], True),
        ('2', ['b'], False),
        ('3', ['c'], True),
        ('3', ['c'], True),
        ('3', ['c'], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def createMockData(self, data):
  """Create mock data for FetchContentPipeline"""
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      key = str(data[0])
      value = str(data[1])
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(value)
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))
  return input_file
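# Hypothetical usage of the helper above. The FetchContentPipeline signature
# shown here is an assumption (mirroring the other pipelines in this module),
# not taken from the source:
#   input_file = self.createMockData(("http://example.com", "fake content"))
#   p = FetchContentPipeline("testjob", [input_file])
#   p.start()
#   test_support.execute_until_empty(self.taskqueue)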
def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {"file": input_file},
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)
  self.assertEquals(100, len(TestHandler.processed_entites))
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline(
      "testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
def testReadPartial(self):
  """Test reading partial key-values records with the reducer reader."""
  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      # First record is full
      proto = file_service_pb.KeyValues()
      proto.set_key("key1")
      proto.value_list().extend(["a", "b"])
      w.write(proto.Encode())
      # Second record is partial
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())

  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  self.assertEquals(
      [("key1", ["a", "b"]),
       input_readers.ALLOW_CHECKPOINT,
       ("key2", ["a", "b", "c", "d"])],
      list(reader))

  # now test state serialization
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  self.assertEquals(
      {"position": 0,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key1", ["a", "b"]), i.next())
  self.assertEquals(
      {"position": 19,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  self.assertEquals(
      {"position": 40,
       "current_values": ["a", "b"],
       "current_key": "key2",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  self.assertEquals(
      {"position": 59,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

  # now do test deserialization at every moment.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key1", ["a", "b"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass