def testSuccessfulRun(self):
  createMockCrawlDbDatum(2, 6, False)
  p = pipelines._ExactDomainMapreducePipeline(
      "ExactDomainMapreducePipeline",
      params={
          "entity_kind": "lakshmi.datum.CrawlDbDatum",
      },
      shard_count=3)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))

  for file_path in file_paths:
    blob_key = files.blobstore.get_blob_key(file_path)
    reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
    u = 0
    for content in reader:
      self.assertTrue(content[1] is not None)
      u += 1
    self.assertEqual(2, u)

  query = CrawlDbDatum.query(
      CrawlDbDatum.extract_domain_url == "http://hoge_0.com")
  entities = query.fetch()
  for entity in entities:
    self.assertEquals("http://hoge_0.com", entity.extract_domain_url)

def testSuccessfulRun(self):
  file_name1 = self.createMockData(
      ("http://hoge_0.com",
       "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"))
  file_name2 = self.createMockData(
      ("http://hoge_1.com",
       "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"))
  createMockCrawlDbDatum(2, 6, True)
  p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
                                         [file_name1, file_name2])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchSetsBufferPipeline.from_id(p.pipeline_id)

  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))

  reader = input_readers.RecordsReader(file_paths, 0)
  for binary_record in reader:
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    key = proto.key()
    value = proto.value()
    self.assertTrue(key is not None)
    self.assertTrue(value is not None)

def testSuccessfulRun(self):
  createMockCrawlDbDatum(2, 2, True)
  file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
  file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
  static_content = "<html><body>TestContent</body></html>"
  self.setReturnValue(
      content=static_content,
      headers={"Content-Length": len(static_content),
               "Content-Type": "text/html"})
  p = pipelines._FetchPagePipeline("FetchPipeline",
                                   [file_name1, file_name2], 2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))

  entities = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
  entity = entities[0]
  fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
  self.assertTrue(fetched_datum is not None)

def testEmptyMapper(self):
  """Test empty mapper over empty dataset."""
  p = mapper_pipeline.MapperPipeline(
      "empty_map",
      handler_spec=__name__ + ".test_empty_handler",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      params={
          "input_reader": {
              "entity_kind": __name__ + ".TestEntity",
              # Test datetime can be json serialized.
              "filters": [("dt", "=", datetime.datetime(2000, 1, 1))],
          },
      },
  )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
  self.assertTrue(p.outputs.job_id.value)

  counters = p.outputs.counters.value
  self.assertTrue(counters)
  self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)

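# The mapper tests in this section reference a TestEntity model and a no-op
# handler defined elsewhere at module level. A minimal sketch of what those
# helpers could look like; the exact definitions (and the "dt" property used
# by the filter above) are assumptions, not the original module's code:
class TestEntity(db.Model):
  """Simple datastore entity used as mapper input."""
  data = db.TextProperty()
  dt = db.DateTimeProperty()


def test_empty_handler(entity):
  """Mapper handler that ignores its input and emits nothing."""
  pass
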
def testFetchError(self):
  blob_keys = self.createInvalidMockData()
  static_content = "User-agent: *\nDisallow: /search\nDisallow: /sdch\nDisallow: /groups"
  self.setReturnValue(
      content=static_content,
      headers={"Content-Length": len(static_content),
               "Content-Type": "text/html"})
  p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

  # Can open files
  file_list = finished_map.outputs.default.value
  self.assertTrue(len(file_list) > 0)

  reader = input_readers.RecordsReader(file_list, 0)
  for binary_record in reader:
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    key = proto.key()
    value = proto.value()
    self.assertEquals("invalidScheme://test_url.com", key)
    self.assertEquals("User-agent: *\nDisallow: /", value)

def testSingleShard(self):
  entity_count = 1000
  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(1, len(filenames))
  blob_name = filenames[0]
  self.assertTrue(blob_name.startswith("/blobstore/"))
  self.assertFalse(blob_name.startswith("/blobstore/writable:"))

  with files.open(blob_name, "r") as f:
    data = f.read(10000000)
    self.assertEquals(1000, len(data.strip().split("\n")))

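# Several tests below map with test_handler_yield_key_str and then count
# output lines by splitting on newlines, so the handler must emit one
# newline-terminated line per entity. A plausible sketch (the real
# definition lives elsewhere in the module and is assumed here):
def test_handler_yield_key_str(entity):
  """Mapper handler yielding the entity key as a newline-terminated string."""
  yield str(entity.key()) + "\n"
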
def testFetchEndToEnd(self):
  """End-to-end test of the fetcher job."""
  createMockCrawlDbDatum("http://foo.com/bar.txt")
  static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
  self.setReturnValue(
      url="http://foo.com/robots.txt",
      content=static_robots,
      headers={"Content-Length": len(static_robots),
               "content-type": "text/plain"})
  static_content = "test"
  static_content_length = len(static_content)
  self.setReturnValue(
      url="http://foo.com/bar.txt",
      content=static_content,
      headers={"Content-Length": static_content_length,
               "Content-Type": "text/plain"})
  p = pipelines.FetcherPipeline(
      "FetcherPipeline",
      params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
      parser_params={"text/plain": __name__ + "._parserNotOutlinks"},
      shards=2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()
  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)

def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)

def _run_test(self, num_shards, num_files, multi_slices=False):
  bucket_name = "testing"
  object_prefix = "file-"
  job_name = "test_map"
  expected_content = self.create_test_content(bucket_name,
                                              object_prefix,
                                              num_files)

  job = map_job.Job.submit(map_job.JobConfig(
      job_name=job_name,
      mapper=_InputReaderMemoryMapper,
      input_reader_cls=input_reader.GCSInputReader,
      input_reader_params={"bucket_name": bucket_name,
                           "objects": [object_prefix + "*"],
                           "path_filter": _MyPathFilter()},
      shard_count=num_shards))
  test_support.execute_until_empty(self.taskqueue)

  # list.sort() returns None, so compare sorted copies instead.
  self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
  self.assertEqual(job.SUCCESS, job.get_status())
  self.assertEqual(
      num_files - 1,
      job.get_counter(input_reader.GCSInputReader.COUNTER_FILE_READ))
  if multi_slices:
    ss = model.ShardState.find_all_by_mapreduce_state(job._state)
    for s in ss:
      self.assertTrue(s.slice_id > 0)

def testFailedMapReduce(self):
  # Add some random data.
  entity_count = 200
  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_failed_map",
      __name__ + ".test_mapreduce_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(output_writers.__name__ +
                          ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_FAILED,
                   p.outputs.result_status.value)
  self.assertEqual(0, len(p.outputs.default.value))

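# test_failed_map is assumed to fail the job deliberately; in
# appengine-mapreduce a mapper can do that by raising FailJobError, which
# produces the RESULT_FAILED status asserted above. A minimal sketch
# (an assumption, not the module's actual handler):
from mapreduce import errors


def test_failed_map(entity):
  """Mapper that fails the whole job on its first call."""
  raise errors.FailJobError("failed on purpose")
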
def testNoCombiner(self):
  """Test running with low values count but without combiner."""
  # Even though this test doesn't have combiner specified, it's still
  # interesting to run. It forces MergePipeline to produce partial
  # key values and we verify that they are combined correctly in reader.

  # Prepare test data
  entity_count = 200
  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".test_combiner_map",
      __name__ + ".test_combiner_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
      mapper_params={"entity_kind": __name__ + ".TestEntity"},
      shards=4,
  )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEquals(1, len(p.outputs.default.value))
  output_file = p.outputs.default.value[0]

  file_content = []
  with files.open(output_file, "r") as f:
    file_content = sorted(f.read(10000000).strip().split("\n"))

  self.assertEquals(["('0', 9800)", "('1', 9900)",
                     "('2', 10000)", "('3', 10100)"],
                    file_content)

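# The expected sums can be reproduced if the combiner-test mapper buckets
# each entity's integer data by i % 4 and the reducer adds the values: key
# '0' collects 0, 4, ..., 196 from each of the two copies, i.e.
# 2 * 4900 = 9800. A sketch of handlers consistent with those numbers
# (assumed, not the module's actual definitions):
def test_combiner_map(entity):
  """Mapper bucketing integer entity data into four keys."""
  value = int(entity.data)
  yield (value % 4, value)


def test_combiner_reduce(key, values):
  """Reducer summing the (possibly partially combined) values for a key."""
  yield str((key, sum(int(v) for v in values))) + "\n"
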
def testProcessEntites(self):
  """Test empty mapper over non-empty dataset."""
  for _ in range(100):
    TestEntity().put()

  p = mapper_pipeline.MapperPipeline(
      "empty_map",
      handler_spec=__name__ + ".test_empty_handler",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      params={
          "entity_kind": __name__ + ".TestEntity",
      },
  )
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
  self.assertTrue(p.outputs.job_id.value)

  counters = p.outputs.counters.value
  self.assertTrue(counters)
  self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
  self.assertEquals(100, counters[context.COUNTER_MAPPER_CALLS])

def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_data = [str(i) for i in range(100)]
  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)

  control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
      {
          "input_reader": {
              "bucket_name": bucket_name,
              "objects": [test_filename],
              # the parameter can't be compressed and wouldn't fit into
              # taskqueue payload
              "huge_parameter": random_string(900000)
          }
      },
      shard_count=4,
      base_path="/mapreduce_base_path")
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], model._HugeTaskPayload.all().fetch(100))

def testSortFile(self):
  """Test sorting a file."""
  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = shuffler._SortChunksPipeline("testjob", bucket_name,
                                   [[full_filename]])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with cloudstorage.open(output_file) as f:
      for binary_record in records.RecordsReader(f):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
  self.assertEquals(1, len(self.emails))

def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_data = [str(i) for i in range(100)]
  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)

  control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
      {
          "input_reader": {
              "bucket_name": bucket_name,
              "objects": [test_filename]
          }
      },
      shard_count=4,
      base_path="/mapreduce_base_path")
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))

def _run_test(self, num_shards, num_files):
  bucket_name = "testing"
  object_prefix = "file-"
  job_name = "test_map"
  input_class = (input_readers.__name__ + "." +
                 input_readers._GoogleCloudStorageInputReader.__name__)
  expected_content = self.create_test_content(bucket_name,
                                              object_prefix,
                                              num_files)

  control.start_map(
      job_name,
      __name__ + "." + "_input_reader_memory_mapper",
      input_class,
      {
          "input_reader": {
              "bucket_name": bucket_name,
              "objects": [object_prefix + "*"]
          },
      },
      shard_count=num_shards)
  test_support.execute_until_empty(self.taskqueue)

  # list.sort() returns None, so compare sorted copies instead.
  self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))

def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  bucket_name = "testbucket"
  test_filename = "testfile"
  full_filename = "/%s/%s" % (bucket_name, test_filename)

  with cloudstorage.open(full_filename, mode="w") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = kv_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())

  p = TestMergePipeline(bucket_name,
                        [full_filename, full_filename, full_filename])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with cloudstorage.open(output_file) as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
  self.assertEquals(1, len(self.emails))

def _runTest(self, num_shards):
  entity_count = 1000
  bucket_name = "bucket"
  job_name = "test_map"

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      job_name,
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_writer": {
              "bucket_name": bucket_name,
          },
      },
      shard_count=num_shards,
      output_writer_spec=self.WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = self.WRITER_CLS.get_filenames(mapreduce_state)
  self.assertEqual(num_shards, len(set(filenames)))

  total_entries = 0
  for shard in range(num_shards):
    self.assertTrue(filenames[shard].startswith(
        "/%s/%s" % (bucket_name, job_name)))
    data = cloudstorage.open(filenames[shard]).read()
    # strip() is used to remove the last newline of each file so that
    # split() does not return extraneous empty entries.
    total_entries += len(data.strip().split("\n"))
  self.assertEqual(entity_count, total_entries)

def testDedicatedParams(self):
  entity_count = 1000
  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
          "output_writer": {
              "filesystem": "gs",
              "gs_bucket_name": "bucket",
          },
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=FILE_WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
  self.assertEqual(1, len(filenames))
  self.assertTrue(filenames[0].startswith("/gs/bucket/"))

  with files.open(filenames[0], "r") as f:
    data = f.read(10000000)
    self.assertEquals(1000, len(data.strip().split("\n")))

def testMultipleShards(self):
  entity_count = 1000
  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_sharding": "input",
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(4, len(set(filenames)))

  file_lengths = []
  for filename in filenames:
    self.assertTrue(filename.startswith("/blobstore/"))
    self.assertFalse(filename.startswith("/blobstore/writable:"))
    with files.open(filename, "r") as f:
      data = f.read(10000000)
      file_lengths.append(len(data.strip().split("\n")))
  self.assertEqual(1000, sum(file_lengths))

def testShardRetryTooMany(self):
  entity_count = 200
  db.delete(TestOutputEntity.all())
  db.delete(RetryCount.all())

  for i in range(entity_count):
    TestEntity(data=str(i)).put()

  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_shard_retry_too_many_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      params={
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
      },
      shards=5)
  p.max_attempts = 1
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  state = model.MapreduceState.all().get()
  self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline aborted:"))

def testShardRetry(self):
  entity_count = 200
  db.delete(TestOutputEntity.all())
  db.delete(RetryCount.all())

  for i in range(entity_count):
    TestEntity(data=str(i)).put()

  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_shard_retry_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      params={
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
      },
      shards=5)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
  outputs = []
  for output in TestOutputEntity.all():
    outputs.append(int(output.data))
  outputs.sort()

  expected_outputs = [i for i in range(entity_count)]
  expected_outputs.sort()
  self.assertEquals(expected_outputs, outputs)

def testSmoke(self):
  """Test that all handlers still work.

  This test doesn't care about the integrity of the job outputs,
  just that things work under the webapp2 framework.
  """
  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".TestMapreduceMap",
      __name__ + ".TestMapreduceReduce",
      input_reader_spec=input_readers.__name__ + ".RandomStringInputReader",
      output_writer_spec=(output_writers.__name__ +
                          "._GoogleCloudStorageRecordOutputWriter"),
      mapper_params={
          "input_reader": {
              "count": 100
          },
      },
      reducer_params={
          "output_writer": {
              "bucket_name": "test"
          },
      },
      shards=3)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  # Verify output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                   p.outputs.result_status.value)

def testSuccessfulRun(self):
  p = shuffler._ShuffleServicePipeline("testjob", ["file1", "file2"])
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  request = self.file_service.shuffle_request
  self.assertTrue(request)
  self.assertTrue(request.shuffle_name().startswith("testjob-"))
  self.assertEquals(2, len(request.input_list()))
  self.assertEquals(1, request.input(0).format())
  self.assertEquals("file1", request.input(0).path())
  self.assertEquals(1, request.input(1).format())
  self.assertEquals("file2", request.input(1).path())
  self.assertEquals(2, len(request.output().path_list()))

  callback = request.callback()
  self.assertTrue(callback.url().startswith(
      "/mapreduce/pipeline/callback?pipeline_id="))
  self.assertEquals(self.version_id, callback.app_version_id())
  self.assertEquals("GET", callback.method())
  self.assertEquals("default", callback.queue())

  callback_task = {
      "url": callback.url(),
      "method": callback.method(),
  }
  test_support.execute_task(callback_task)
  test_support.execute_until_empty(self.taskqueue)

  p = shuffler._ShuffleServicePipeline.from_id(p.pipeline_id)
  self.assertTrue(p.has_finalized)
  output_files = p.outputs.default.value
  self.assertEquals(2, len(output_files))
  self.assertTrue(output_files[0].startswith("/blobstore/"))
  self.assertTrue(output_files[1].startswith("/blobstore/"))

def testLotsOfValuesForSingleKey(self):
  TestEntity(data=str(1)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      "test",
      __name__ + ".map_yield_lots_of_values",
      __name__ + ".reduce_length",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(output_writers.__name__ +
                          ".BlobstoreRecordsOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  output_data = []
  for output_file in p.outputs.default.value:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = ["('1', 50000)"]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)

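# The expected output "('1', 50000)" implies the mapper emits 50000 values
# for the single entity (whose data is "1") and the reducer emits the value
# count. A hedged sketch of handlers consistent with that (assumed, not the
# module's actual definitions):
def map_yield_lots_of_values(entity):
  """Mapper yielding many (key, value) pairs for one entity."""
  for _ in range(50000):
    yield (entity.data, "")


def reduce_length(key, values):
  """Reducer emitting the number of values seen for a key."""
  yield str((key, len(values)))
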
def testOrgsForAnotherProgram(self):
  """Tests that status of organizations for another program is untouched."""
  # seed another program
  program = seeder_logic.seed(program_model.Program)

  # seed a few pre-accepted and pre-rejected organizations
  pre_accepted_orgs = []
  for i in range(2):
    org = org_utils.seedOrganization(
        program.key(), org_id='pre_accepted_org_id_%s' % i,
        status=org_model.Status.PRE_ACCEPTED)
    pre_accepted_orgs.append(org.key)

  pre_rejected_orgs = []
  for i in range(3):
    org = org_utils.seedOrganization(
        program.key(), org_id='pre_rejected_org_id_%s' % i,
        status=org_model.Status.PRE_REJECTED)
    pre_rejected_orgs.append(org.key)

  mapreduce_control.start_map(
      'ApplyOrgAdmissionDecisions', params=self.params)
  test_support.execute_until_empty(self.get_task_queue_stub())

  # check that pre-accepted organizations are still pre-accepted
  for org_key in pre_accepted_orgs:
    org = org_key.get()
    self.assertEqual(org.status, org_model.Status.PRE_ACCEPTED)

  # check that pre-rejected organizations are still pre-rejected
  for org_key in pre_rejected_orgs:
    org = org_key.get()
    self.assertEqual(org.status, org_model.Status.PRE_REJECTED)

def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline(
      "testjob", [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted(
      [(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)

def _runTest(self, num_shards):
  entity_count = 1000
  bucket_name = "bucket"
  job_name = "test_map"

  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      job_name,
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_writer": {
              "bucket_name": bucket_name,
          },
      },
      shard_count=num_shards,
      output_writer_spec=self.WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = self.WRITER_CLS.get_filenames(mapreduce_state)
  self.assertEqual(num_shards, len(filenames))

  total_entries = 0
  for shard in range(num_shards):
    self.assertTrue(filenames[shard].startswith(
        "/%s/%s" % (bucket_name, job_name)))
    data = cloudstorage.open(filenames[shard]).read()
    # strip() is used to remove the last newline of each file so that
    # split() does not return extraneous empty entries.
    total_entries += len(data.strip().split("\n"))
  self.assertEqual(entity_count, total_entries)

def testFailedMapReduce(self):
  bucket_name = "testbucket"

  max_attempts_before = pipeline.pipeline._DEFAULT_MAX_ATTEMPTS
  try:
    pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

    # Add some random data.
    entity_count = 200
    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_failed_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(output_writers.__name__ +
                            "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={"entity_kind": __name__ + "." + TestEntity.__name__},
        reducer_params={"output_writer": {"bucket_name": bucket_name}},
        shards=3,
    )
    p.max_attempts = 1
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertTrue(p.was_aborted)
  finally:
    pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = max_attempts_before

def testDedicatedParams(self):
  entity_count = 1000
  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      "mapreduce.input_readers.DatastoreInputReader",
      {
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
          "output_writer": {
              "filesystem": "gs",
              "gs_bucket_name": "bucket",
          },
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=FILE_WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
  self.assertEqual(1, len(filenames))
  self.assertTrue(filenames[0].startswith("/gs/bucket/"))

  with files.open(filenames[0], "r") as f:
    data = f.read(10000000)
    self.assertEquals(1000, len(data.strip().split("\n")))

def testFailedMap(self):
  for i in range(1):
    TestEntity(data=str(i)).put()

  pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

  p = mapper_pipeline.MapperPipeline(
      "test",
      handler_spec=__name__ + ".test_fail_map",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      params={
          "input_reader": {
              "entity_kind": __name__ + "." + TestEntity.__name__,
          },
      },
      shards=5)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
  self.assertTrue(p.was_aborted)

  self.assertTrue(p.outputs.job_id.filled)
  state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
  self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)
  self.assertFalse(p.outputs.result_status.filled)
  self.assertFalse(p.outputs.default.filled)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline aborted:"))

def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file,
          # the parameter can't be compressed and wouldn't fit into
          # taskqueue payload
          "huge_parameter": random_string(900000)
      },
      shard_count=4,
      base_path="/mapreduce_base_path")
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], model._HugeTaskPayload.all().fetch(100))

def testRecordsReader(self):
  """End-to-end test for records reader."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file
      },
      shard_count=4,
      base_path="/mapreduce_base_path")
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))

def testDecisionsAreApplied(self):
  """Tests that status of organizations is changed after the job."""
  mapreduce_control.start_map(
      'ApplyOrgAdmissionDecisions', params=self.params)
  test_support.execute_until_empty(self.get_task_queue_stub())

  # check that pre-accepted organizations are accepted now
  for org_key in self.pre_accepted_orgs:
    org = org_key.get()
    self.assertEqual(org.status, org_model.Status.ACCEPTED)

  # check that pre-rejected organizations are rejected now
  for org_key in self.pre_rejected_orgs:
    org = org_key.get()
    self.assertEqual(org.status, org_model.Status.REJECTED)

  # check that nothing has changed regarding applying organizations
  for org_key in self.applying_orgs:
    org = org_key.get()
    self.assertEqual(org.status, org_model.Status.APPLYING)

  for org_key in self.pre_accepted_orgs:
    org = org_key.get()
    subject = notifications.DEF_ACCEPTED_ORG % {'org': org.name}
    self.assertEmailSent(cc=org.contact.email, subject=subject)

  for org_key in self.pre_rejected_orgs:
    org = org_key.get()
    subject = notifications.DEF_REJECTED_ORG % {'org': org.name}
    self.assertEmailSent(cc=org.contact.email, subject=subject)

def testMultipleShards(self):
  entity_count = 1000
  for _ in range(entity_count):
    TestEntity().put()

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".test_handler_yield_key_str",
      DATASTORE_READER_NAME,
      {
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "output_sharding": "input",
      },
      shard_count=4,
      base_path="/mapreduce_base_path",
      output_writer_spec=BLOBSTORE_WRITER_NAME)
  test_support.execute_until_empty(self.taskqueue)

  mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
  filenames = output_writers.BlobstoreOutputWriter.get_filenames(
      mapreduce_state)
  self.assertEqual(4, len(filenames))

  file_lengths = []
  for filename in filenames:
    self.assertTrue(filename.startswith("/blobstore/"))
    self.assertFalse(filename.startswith("/blobstore/writable:"))
    with files.open(filename, "r") as f:
      data = f.read(10000000)
      file_lengths.append(len(data.strip().split("\n")))
  self.assertEqual(1000, sum(file_lengths))

def testHugeTaskUseDatastore(self):
  """Test map job with huge parameter values."""
  input_file = files.blobstore.create()
  input_data = [str(i) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for record in input_data:
        w.write(record)
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  mapreduce_id = control.start_map(
      "test_map",
      __name__ + ".TestHandler",
      "mapreduce.input_readers.RecordsReader",
      {
          "file": input_file,
          # the parameter can't be compressed and wouldn't fit into
          # taskqueue payload
          "huge_parameter": random_string(900000)
      },
      shard_count=4,
      base_path="/mapreduce_base_path")
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(100, len(TestHandler.processed_entites))
  self.assertEquals([], util._HugeTaskPayload.all().fetch(100))

def testShuffleNoFile(self):
  p = shuffler.ShufflePipeline("testjob", [])
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
  for filename in p.outputs.default.value:
    self.assertEqual(0, files.stat(filename).st_size)

def testMapReduce(self):
  # Prepare test data
  bucket_name = "testbucket"
  job_name = "test_job"
  entity_count = 200
  for i in range(entity_count):
    TestEntity(data=str(i)).put()
    TestEntity(data=str(i)).put()

  # Run Mapreduce
  p = mapreduce_pipeline.MapreducePipeline(
      job_name,
      __name__ + ".test_mapreduce_map",
      __name__ + ".test_mapreduce_reduce",
      input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
      output_writer_spec=(output_writers.__name__ +
                          "._GoogleCloudStorageRecordOutputWriter"),
      mapper_params={
          "entity_kind": __name__ + "." + TestEntity.__name__,
          "bucket_name": bucket_name
      },
      reducer_params={
          "output_writer": {
              "bucket_name": bucket_name
          },
      },
      shards=16)
  p.start()
  test_support.execute_until_empty(self.taskqueue)

  self.assertEquals(1, len(self.emails))
  self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

  # Verify reduce output.
  p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
  self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                   p.outputs.result_status.value)
  output_data = []
  for output_file in p.outputs.default.value:
    with cloudstorage.open(output_file) as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

  expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
  expected_data.sort()
  output_data.sort()
  self.assertEquals(expected_data, output_data)

  # Verify that mapreduce doesn't leave intermediate files behind.
  temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
  for stat in temp_file_stats:
    if stat.filename:
      self.assertFalse(
          stat.filename.startswith(
              "/%s/%s-shuffle-" % (bucket_name, job_name)))

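# The expected records str((str(d), ["", ""])) imply each entity's data is
# mapped to an empty value and the reducer emits the key together with the
# list of values it collected (two copies of each entity were stored). A
# hedged sketch of handlers consistent with that output (assumed, not the
# module's actual definitions):
def test_mapreduce_map(entity):
  """Mapper yielding (entity.data, "") pairs."""
  yield (entity.data, "")


def test_mapreduce_reduce(key, values):
  """Reducer emitting the key with the list of values it received."""
  yield str((key, values))
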
def testAbort(self):
  job = map_job.Job.submit(self.config)
  self.assertEqual(map_job.Job.RUNNING, job.get_status())
  job.abort()
  # The status doesn't change until the abort is processed by the taskqueue.
  self.assertEqual(map_job.Job.RUNNING, job.get_status())

  # Execute all tasks.
  test_support.execute_until_empty(self.taskqueue)

  job = map_job.Job.get_job_by_id(job_id=self.config.job_id)
  self.assertEqual(map_job.Job.ABORTED, job.get_status())