def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()
  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
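# Several of the tests above and below repeat the same setup: encode
# (key, value) pairs as file_service_pb.KeyValue records into a finalized
# blobstore file and return its file name. Below is a minimal sketch of how
# that setup could be factored into a shared helper; the helper name
# (_create_input_file) is hypothetical and not part of the original tests.
def _create_input_file(input_data):
  """Writes (key, value) pairs as KeyValue records; returns the file name."""
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  return files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))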
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline("testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()
  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
    input_data.sort()

    input_file = files.blobstore.create()
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ('1', ['a'], True),
        ('1', ['a'], True),
        ('1', ['a'], False),
        ('2', ['b'], True),
        ('2', ['b'], True),
        ('2', ['b'], False),
        ('3', ['c'], True),
        ('3', ['c'], True),
        ('3', ['c'], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def testNoCombiner(self):
  """Test running with low values count but without combiner."""
  # Even though this test doesn't have combiner specified, it's still
  # interesting to run. It forces MergePipeline to produce partial
  # key values and we verify that they are combined correctly in reader.

  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_combiner_map",
      __name__ + ".test_combiner_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
      mapper_params={"entity_kind": __name__ + ".TestEntity"},
      shards=4,
      )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEquals(1, len(p.outputs.default.value))
  output_file = p.outputs.default.value[0]

  file_content = []
  with files.open(output_file, "r") as f:
    file_content = sorted(f.read(10000000).strip().split("\n"))

  self.assertEquals(
      ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
      file_content)
def testLotsOfValuesForSingleKey(self):
  TestEntity(data=str(1)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".map_yield_lots_of_values",
      __name__ + ".reduce_length",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = ["('1', 50000)"]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)
def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file,
          # the parameter can't be compressed and wouldn't fit into
          # taskqueue payload
          "huge_parameter": random_string(900000)
      },
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)
  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file
      },
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)
  self.assertEquals(100, len(TestHandler.processed_entites))
def flush(self):
  """Flush pool contents."""
  # Write data to in-memory buffer first.
  buf = _StringWriter()
  with records.RecordsWriter(buf) as w:
    for record in self._buffer:
      w.write(record)

  str_buf = buf.to_string()
  if not self._exclusive and len(str_buf) > _FILES_API_MAX_SIZE:
    # Shouldn't really happen because of flush size.
    raise errors.Error(
        "Buffer too big. Can't write more than %s bytes in one request: "
        "risk of writes interleaving. Got: %s" %
        (_FILES_API_MAX_SIZE, len(str_buf)))

  # Write data to file.
  start_time = time.time()
  with files.open(self._filename, "a", exclusive_lock=self._exclusive) as f:
    f.write(str_buf)
    if self._ctx:
      operation.counters.Increment(
          COUNTER_IO_WRITE_BYTES, len(str_buf))(self._ctx)
  if self._ctx:
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC,
        int((time.time() - start_time) * 1000))(self._ctx)

  # reset buffer
  self._buffer = []
  self._size = 0
  gc.collect()
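# The flush() above serializes the buffered records through a _StringWriter
# before a single files.open() append. The class itself is not shown in this
# section; here is a minimal sketch assuming it only needs write() and
# to_string():
class _StringWriter(object):
  """Collects written chunks in memory and returns them as one string."""

  def __init__(self):
    self._chunks = []

  def write(self, data):
    self._chunks.append(data)

  def to_string(self):
    return "".join(self._chunks)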
def testLotsOfValuesForSingleKey(self):
  TestEntity(data=str(1)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".map_yield_lots_of_values",
      __name__ + ".reduce_length",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = ["('1', 50000)"]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)
def testSingleShard(self):
  entity_count = 1000

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)

  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(1, len(filenames))
  blob_name = filenames[0]
  self.assertTrue(blob_name.startswith("/blobstore/"))
  self.assertFalse(blob_name.startswith("/blobstore/writable:"))

  with files.open(blob_name, "r") as f:
    data = f.read(10000000)
    self.assertEquals(1000, len(data.strip().split("\n")))
def testDedicatedParams(self):
  entity_count = 1000

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
          "output_writer": {
              "filesystem": "gs",
              "gs_bucket_name": "bucket",
          },
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=FILE_WRITER_NAME)

  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
  self.assertEqual(1, len(filenames))
  self.assertTrue(filenames[0].startswith("/gs/bucket/"))

  with files.open(filenames[0], "r") as f:
    data = f.read(10000000)
    self.assertEquals(1000, len(data.strip().split("\n")))
def testAppendAndFlush(self):
  self.pool.append("a")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.append("b")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.flush()
  self.assertEquals(
      ["a", "b"],
      list(records.RecordsReader(files.open("tempfile", "r"))))
def testMultipleRequests(self):
  """Tests restoring the reader state across multiple requests."""
  input_file = files.blobstore.create()

  # Create a file with two records.
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  # Now read the records in two attempts, serializing and recreating the
  # input reader as if it's a separate request.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  it = iter(reader)
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, it.next())

  reader_state = reader.to_json()
  other_reader = mapreduce_pipeline._ReducerReader.from_json(reader_state)
  # Iterate the restored reader, not the original one, so the deserialized
  # state is actually exercised.
  it = iter(other_reader)
  self.assertEquals(("key2", ["a", "b", "c", "d"]), it.next())
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [("1", "a"), ("2", "b"), ("3", "c")]
    input_data.sort()

    input_file = files.blobstore.create()
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ("1", ["a"], True),
        ("1", ["a"], True),
        ("1", ["a"], False),
        ("2", ["b"], True),
        ("2", ["b"], True),
        ("2", ["b"], False),
        ("3", ["c"], True),
        ("3", ["c"], True),
        ("3", ["c"], False),
        ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def flush(self):
  """Flush pool contents."""
  for filename, data in self._append_buffer.iteritems():
    with files.open(filename, 'a') as f:
      if len(data) > self._max_size:
        # Raising a bare string is invalid; raise a real exception instead.
        raise errors.Error("Bad data: %s" % len(data))
      f.write(data)
  self._append_buffer = {}
  self._size = 0
def createMockDataLine(self, data):
  file_name = "myblob_01"
  file_path = files.blobstore.create("text/plain", file_name)
  with files.open(file_path, 'a') as fp:
    fp.write(data)
  files.finalize(file_path)
  blob_key = files.blobstore.get_blob_key(file_path)
  file_name = files.blobstore.get_file_name(blob_key)
  return file_name
def testAppendAndFlush(self):
  self.pool.append("a")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.append("b")
  self.assertEquals("", self.file_service.get_content("tempfile"))
  self.pool.flush()
  self.assertEquals(
      ["a", "b"],
      list(records.RecordsReader(files.open("tempfile", "r"))))
def testCleanup_ListOfLists(self):
  """Tests cleaning up a list of file lists."""
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run map
  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".KeyValueBlobstoreOutputWriter"),
      params={
          "entity_kind": __name__ + ".TestEntity",
      },
      )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

  # Can open files
  file_list = finished_map.outputs.default.value
  self.assertTrue(len(file_list) > 0)
  for name in file_list:
    files.open(name, "r").read(0)

  grouped_list = [file_list]

  # Cleanup
  cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
  cleanup.start()
  test_support.execute_until_empty(self.taskqueue)

  # Cannot open files
  for name in file_list:
    self.assertRaises(files.Error, files.open, name, "r")
def testCleanup_ListOfLists(self):
  """Tests cleaning up a list of file lists."""
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run map
  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".KeyValueBlobstoreOutputWriter"),
      params={
          "entity_kind": __name__ + ".TestEntity",
      },
      )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

  # Can open files
  file_list = finished_map.outputs.default.value
  self.assertTrue(len(file_list) > 0)
  for name in file_list:
    files.open(name, "r").read(0)

  grouped_list = [file_list]

  # Cleanup
  cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
  cleanup.start()
  test_support.execute_until_empty(self.taskqueue)

  # Cannot open files
  for name in file_list:
    self.assertRaises(files.Error, files.open, name, "r")
def testMapReduce(self):
  # Prepare test data
  word_count = 2
  file_path = "/gs/foo/bar"
  self.createGSData(file_path, "foo bar foo bar foo bar foo")

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_map",
      "googlestorage.shuffler.ShufflePipeline",
      __name__ + ".test_reduce",
      input_reader_spec=(
          "googlestorage.input_readers.GoogleStorageLineInputReader"),
      output_writer_spec=(
          "googlestorage.output_writers.GoogleStorageOutputWriter"),
      mapper_params={"file_paths": file_path,
                     "gs_bucket_name": "temp_test",
                     "gs_acl": "public-read"},
      shuffler_params={"gs_bucket_name": "temp_test",
                       "mime_type": "text/plain",
                       "gs_acl": "public-read"},
      reducer_params={"gs_bucket_name": "output_test",
                      "mime_type": "text/plain",
                      "gs_acl": "public-read"},
      shards=2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as fp:
      buf = fp.read(1000000)
      filestream = StringIO(buf)
      output_data = filestream.read()

  outputList = output_data[:-1].split('\n')
  expected_data = ["foo: 4", "bar: 3"]
  expected_data.sort()
  outputList.sort()
  self.assertEquals(expected_data, outputList)

  # Verify that mapreduce doesn't leave intermediate files behind.
  blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
  for blobinfo in blobInfos:
    # Pass the match result first and the message second so the assertion
    # actually checks the filename.
    self.assertTrue(
        re.match("test-reduce-.*-output-\d+", blobinfo.filename),
        "Bad filename: %s" % blobinfo.filename)
def createInvalidMockData(self):
  blob_keys = []
  url = "invalidScheme://test_url.com"
  file_path = files.blobstore.create("text/plain", url)
  with files.open(file_path, 'a') as fp:
    fp.write(url)
  files.finalize(file_path)
  blob_key = files.blobstore.get_blob_key(file_path)
  file_name = files.blobstore.get_file_name(blob_key)
  blob_keys.append(str(file_name))
  return blob_keys
def createGSData(self, file_count, data):
  file_paths = []
  for file_number in range(file_count):
    file_path = "/gs/foo/bar%d" % file_number
    write_path = files.gs.create(file_path, mime_type='text/plain',
                                 acl='public-read')
    with files.open(write_path, 'a') as fp:
      fp.write(data)
    files.finalize(write_path)
    file_paths.append(file_path)
  return file_paths
def createMockData(self, url_count, shard):
  blob_keys = []
  for num in range(shard):
    file_name = "myblob_%d" % num
    urls = "\n".join(["http://test_url_%d.com" % i
                      for i in range(url_count)])
    file_path = files.blobstore.create("text/plain", file_name)
    with files.open(file_path, 'a') as fp:
      fp.write(urls)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))
  return blob_keys
def createMockData(self, url_count, shard):
  blob_keys = []
  for num in range(shard):
    file_name = "myblob_%d" % num
    urls = "\n".join(["http://test_url_%d.com" % i
                      for i in range(url_count)])
    file_path = files.blobstore.create("text/plain", file_name)
    with files.open(file_path, 'a') as fp:
      fp.write(urls)
    files.finalize(file_path)
    blob_key = files.blobstore.get_blob_key(file_path)
    file_name = files.blobstore.get_file_name(blob_key)
    blob_keys.append(str(file_name))
  return blob_keys
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline("testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
def testMapReduce(self):
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_map",
      __name__ + ".test_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(
          output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = [
      str((str(d), ["", ""])) for d in range(entity_count)]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)

  # Verify that mapreduce doesn't leave intermediate files behind.
  blobInfos = blobstore.BlobInfo.all().fetch(limit=1000)
  for blobinfo in blobInfos:
    # Pass the match result first and the message second so the assertion
    # actually checks the filename.
    self.assertTrue(
        re.match("test-reduce-.*-output-\d+", blobinfo.filename),
        "Bad filename: %s" % blobinfo.filename)
def flush(self):
  """Flush pool contents."""
  start_time = time.time()
  for filename, data in self._append_buffer.iteritems():
    with files.open(filename, "a") as f:
      if len(data) > self._flush_size:
        raise errors.Error("Bad data: %s" % len(data))
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(data))(self._ctx)
      f.write(data)
  if self._ctx:
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC,
        int((time.time() - start_time) * 1000))(self._ctx)
  self._append_buffer = {}
  self._size = 0
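# The flush() above drains self._append_buffer. A minimal sketch of the
# append() side it implies is shown here, assuming the pool flushes once the
# buffered size crosses self._flush_size; the exact trigger in the real pool
# may differ.
def append(self, filename, data):
  """Buffers data for filename and flushes when the pool grows too large."""
  self._append_buffer[filename] = self._append_buffer.get(filename, "") + data
  self._size += len(data)
  if self._size > self._flush_size:
    self.flush()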
def createMockData(self, data):
  """Create mock data for FetchContentPipeline"""
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      key = str(data[0])
      value = str(data[1])
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(value)
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))
  return input_file
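# A small, hypothetical read-back check for the file produced by
# createMockData above. It mirrors the reader loops already used in the tests
# in this section and is only an illustration, not part of the original code.
def readMockData(input_file):
  """Decodes the single KeyValue record written by createMockData."""
  with files.open(input_file, "r") as f:
    for binary_record in records.RecordsReader(f):
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      return (proto.key(), proto.value())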
def flush(self):
  """Flush pool contents."""
  start_time = time.time()
  for filename, data in self._append_buffer.iteritems():
    with files.open(filename, "a") as f:
      if len(data) > self._flush_size:
        # Raising a bare string is invalid; raise a real exception instead.
        raise errors.Error("Bad data: %s" % len(data))
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(data))(self._ctx)
      f.write(data)
  if self._ctx:
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC,
        int((time.time() - start_time) * 1000))(self._ctx)
  self._append_buffer = {}
  self._size = 0
def flush(self):
  """Flush pool contents."""
  start_time = time.time()
  for filename, data in self._append_buffer.iteritems():
    with files.open(filename, "a") as f:
      if len(data) > _FILES_API_MAX_SIZE:
        raise errors.Error("Bad data of length: %s" % len(data))
      if self._ctx:
        operation.counters.Increment(
            COUNTER_IO_WRITE_BYTES, len(data))(self._ctx)
      f.write(data)
  if self._ctx:
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC,
        int((time.time() - start_time) * 1000))(self._ctx)
  self._append_buffer = {}
  self._size = 0
def testCombiner(self):
  """Test running with low values count but with combiner."""
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_combiner_map",
      __name__ + ".test_combiner_reduce",
      combiner_spec=__name__ + ".TestCombiner",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
      mapper_params={
          "entity_kind": __name__ + ".TestEntity",
      },
      shards=4)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEquals(1, len(p.outputs.default.value))
  output_file = p.outputs.default.value[0]

  file_content = []
  with files.open(output_file, "r") as f:
    file_content = sorted(f.read(10000000).strip().split("\n"))

  self.assertEquals(
      ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
      file_content)

  self.assertTrue(TestCombiner.invocations)
  for invocation in TestCombiner.invocations:
    key = invocation[0]
    values = invocation[1]
    combiner_values = invocation[2]
    self.assertTrue(key)
    self.assertTrue(values)
    self.assertEquals(1, len(values))
    self.assertTrue(int(values[0]) % 4 == int(key))
def testCombiner(self):
  """Test running with low values count but with combiner."""
  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_combiner_map",
      __name__ + ".test_combiner_reduce",
      combiner_spec=__name__ + ".TestCombiner",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
      mapper_params={"entity_kind": __name__ + ".TestEntity"},
      shards=4,
      )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEquals(1, len(p.outputs.default.value))
  output_file = p.outputs.default.value[0]

  file_content = []
  with files.open(output_file, "r") as f:
    file_content = sorted(f.read(10000000).strip().split("\n"))

  self.assertEquals(
      ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
      file_content)

  self.assertTrue(TestCombiner.invocations)
  for invocation in TestCombiner.invocations:
    key = invocation[0]
    values = invocation[1]
    combiner_values = invocation[2]
    self.assertTrue(key)
    self.assertTrue(values)
    self.assertEquals(1, len(values))
    self.assertTrue(int(values[0]) % 4 == int(key))
def requestCrawlerOutput(self):
  logging.debug("Request Crawler Output")

  file_content = 'DEFAULT_CONTENT'
  url = "http://web.ist.utl.pt/ist163512/crawler.txt"
  result = urllib2.urlopen(url)
  file_content = result.read()

  file_name = files.blobstore.create(mime_type='application/octet-stream')
  self.crawler_file_name = file_name
  with files.open(file_name, 'a') as f:
    f.write("%s" % file_content)
  files.finalize(file_name)
  logging.debug("File saved successfully")

  key = files.blobstore.get_blob_key(file_name)
  return key
def testMultipleShards(self):
  entity_count = 1000

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_sharding": "input",
          "filesystem": "gs",
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)

  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(4, len(filenames))

  file_lengths = []
  for filename in filenames:
    self.assertTrue(filename.startswith("/blobstore/"))
    self.assertFalse(filename.startswith("/blobstore/writable:"))

    with files.open(filename, "r") as f:
      data = f.read(10000000)
      file_lengths.append(len(data.strip().split("\n")))

  # These numbers depend on the sharding, which is deterministic,
  # so the expected lengths are stable across runs.
  expected_lengths = [199, 210, 275, 316]
  self.assertEqual(1000, sum(expected_lengths))
  self.assertEquals(expected_lengths, file_lengths)
def testMultipleShards(self):
  entity_count = 1000

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_sharding": "input",
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)

  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(4, len(filenames))

  file_lengths = []
  for filename in filenames:
    self.assertTrue(filename.startswith("/blobstore/"))
    self.assertFalse(filename.startswith("/blobstore/writable:"))

    with files.open(filename, "r") as f:
      data = f.read(10000000)
      file_lengths.append(len(data.strip().split("\n")))

  # These numbers depend on the sharding, which is deterministic,
  # so the expected lengths are stable across runs.
  expected_lengths = [199, 210, 275, 316]
  self.assertEqual(1000, sum(expected_lengths))
  self.assertEquals(expected_lengths, file_lengths)
def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {"file": input_file},
      shard_count=4,
      base_path="/mapreduce_base_path")

  test_support.execute_until_empty(self.taskqueue)
  self.assertEquals(100, len(TestHandler.processed_entites))
def get(self):
  filekey = self.request.get("filekey")
  #key = ManageCrawlerOutput.requestCrawlerOutput(self)
  # str_key = str(key)
  self.response.headers['Content-Type'] = 'text/plain'

  query = db.GqlQuery("SELECT * FROM WebSiteInfo")

  file_name = files.blobstore.create(mime_type='application/octet-stream')
  with files.open(file_name, 'a') as f:
    for q in query:
      w = WebSiteInfo()
      w = q
      title = str(w.title)
      title = re.sub(r" ", "", title)  #title.replace(" ","-")
      self.response.out.write("parsed title %s\n" % title)
      f.write("%s %s\n" % (title, w.siteLinks))
  files.finalize(file_name)
  logging.debug("File saved successfully")

  key = files.blobstore.get_blob_key(file_name)
  # info = blobstore.get(key)
  # reader = info.open()
  # file_content = reader.read(501900)
  # self.response.out.write("\n\n")
  # self.response.out.write("%s" % file_content)

  pipeline = PageRankPipeline(filekey, str(key))
  pipeline.start()

  self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)
def testNoCombiner(self):
  """Test running with low values count but without combiner."""
  # Even though this test doesn't have combiner specified, it's still
  # interesting to run. It forces MergePipeline to produce partial
  # key values and we verify that they are combined correctly in reader.

  # Prepare test data
  entity_count = 200

  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_combiner_map",
      __name__ + ".test_combiner_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
      mapper_params={
          "entity_kind": __name__ + ".TestEntity",
      },
      shards=4)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEquals(1, len(p.outputs.default.value))
  output_file = p.outputs.default.value[0]

  file_content = []
  with files.open(output_file, "r") as f:
    file_content = sorted(f.read(10000000).strip().split("\n"))

  self.assertEquals(
      ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
      file_content)
def next(self):
  """Returns the next input as an (offset, line) tuple."""
  self._has_iterated = True

  if not self._filestream:
    with files.open(self._file_path, 'r') as fp:
      value = fp.read()
    self._filestream = StringIO(value)
    if self._start_position:
      self._filestream.seek(self._start_position)
      self._filestream.readline()

  start_position = self._filestream.tell()
  if start_position > self._end_position:
    self.stopIteration()

  line = self._filestream.readline()
  if not line:
    self.stopIteration()

  return start_position, line.rstrip("\n")
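# The next() above delegates end-of-range handling to self.stopIteration(),
# which is not shown in this section. A hedged sketch of such a helper,
# assuming it only needs to drop the cached stream and signal the iterator
# protocol; the original implementation may do more.
def stopIteration(self):
  """Clears the cached stream and ends iteration."""
  self._filestream = None
  raise StopIteration()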
def testSingleRequest(self):
  """Tests when a key can be handled during a single request."""
  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      # First record is full
      proto = file_service_pb.KeyValues()
      proto.set_key("key1")
      proto.value_list().extend(["a", "b"])
      w.write(proto.Encode())

      # Second record is partial
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  self.assertEquals(
      [("key1", ["a", "b"]),
       input_readers.ALLOW_CHECKPOINT,
       ("key2", ["a", "b", "c", "d"])],
      list(reader))

  # now test state serialization
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  self.assertEquals(
      {"position": 0,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key1", ["a", "b"]), i.next())
  self.assertEquals(
      {"position": 19,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  self.assertEquals(
      {"position": 40,
       "current_values": "KGxwMApTJ2EnCnAxCmFTJ2InCnAyCmEu",
       "current_key": "UydrZXkyJwpwMAou",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  self.assertEquals(
      {"position": 59,
       "current_values": "Ti4=",
       "current_key": "Ti4=",
       "filenames": [input_file]},
      reader.to_json())

  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

  # now do test deserialization at every moment.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key1", ["a", "b"]), i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass
def testReadPartial(self):
  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      # First record is full
      proto = file_service_pb.KeyValues()
      proto.set_key("key1")
      proto.value_list().extend(["a", "b"])
      w.write(proto.Encode())

      # Second record is partial
      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["a", "b"])
      proto.set_partial(True)
      w.write(proto.Encode())

      proto = file_service_pb.KeyValues()
      proto.set_key("key2")
      proto.value_list().extend(["c", "d"])
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  self.assertEquals(
      [("key1", ["a", "b"]),
       input_readers.ALLOW_CHECKPOINT,
       ("key2", ["a", "b", "c", "d"])],
      list(reader))

  # now test state serialization
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  self.assertEquals(
      {"position": 0,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key1", ["a", "b"]), i.next())
  self.assertEquals(
      {"position": 19,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())
  self.assertEquals(
      {"position": 40,
       "current_values": ["a", "b"],
       "current_key": "key2",
       "filenames": [input_file]},
      reader.to_json())

  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())
  self.assertEquals(
      {"position": 59,
       "current_values": None,
       "current_key": None,
       "filenames": [input_file]},
      reader.to_json())

  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass

  # now do test deserialization at every moment.
  reader = mapreduce_pipeline._ReducerReader([input_file], 0)
  i = reader.__iter__()
  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key1", ["a", "b"]), i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(input_readers.ALLOW_CHECKPOINT, i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  self.assertEquals(("key2", ["a", "b", "c", "d"]), i.next())

  reader = mapreduce_pipeline._ReducerReader.from_json(reader.to_json())
  try:
    i.next()
    self.fail("Exception expected")
  except StopIteration:
    # expected
    pass
def createGSData(self, file_path, data):
  write_path = files.gs.create(file_path, mime_type='text/plain',
                               acl='public-read')
  with files.open(write_path, 'a') as fp:
    fp.write(data)
  files.finalize(write_path)