Example #1
    def testSuccessfulRun(self):
        createMockCrawlDbDatum(2, 6, False)

        p = pipelines._ExactDomainMapreducePipeline(
            "ExactDomainMapreducePipeline",
            params={
                "entity_kind": "lakshmi.datum.CrawlDbDatum",
            },
            shard_count=3)
        p.start()
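        # Run every queued task until the task queue drains so the pipeline completes.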
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        for file_path in file_paths:
            blob_key = files.blobstore.get_blob_key(file_path)
            reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
            u = 0
            for content in reader:
                self.assertTrue(content[1] is not None)
                u += 1

        self.assertEqual(2, u)

        query = CrawlDbDatum.query(
            CrawlDbDatum.extract_domain_url == "http://hoge_0.com")
        entities = query.fetch()
        for entity in entities:
            self.assertEquals("http://hoge_0.com", entity.extract_domain_url)
Example #2
    def testSuccessfulRun(self):
        file_name1 = self.createMockData((
            "http://hoge_0.com",
            "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        ))
        file_name2 = self.createMockData((
            "http://hoge_1.com",
            "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"
        ))
        createMockCrawlDbDatum(2, 6, True)
        p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
                                               [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchSetsBufferPipeline.from_id(
            p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertTrue(key is not None)
            self.assertTrue(value is not None)
Example #3
    def testSuccessfulRun(self):
        createMockCrawlDbDatum(2, 2, True)
        file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
        file_name2 = self.createMockData(
            ("http://hoge_1.com/content_0", False))
        static_content = "<html><body>TestContent</body></html>"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._FetchPagePipeline("FetchPipeline",
                                         [file_name1, file_name2], 2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        entities = CrawlDbDatum.query(
            CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
        entity = entities[0]
        fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
        self.assertTrue(fetched_datum is not None)
  def testEmptyMapper(self):
    """Test empty mapper over empty dataset."""
    p = mapper_pipeline.MapperPipeline(
        "empty_map",
        handler_spec=__name__ + ".test_empty_handler",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + ".TestEntity",
                # Test datetime can be json serialized.
                "filters": [("dt", "=", datetime.datetime(2000, 1, 1))],
                },
            },
        )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.outputs.job_id.value)

    counters = p.outputs.counters.value
    self.assertTrue(counters)
    self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
Example #5
    def testFetchError(self):
        blob_keys = self.createInvalidMockData()
        static_content = "User-agent: *\nDisallow: /search\nDisallow: /sdch\nDisallow: /groups"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
        p.start()

        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

        # Can open files
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        reader = input_readers.RecordsReader(file_list, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertEquals("invalidScheme://test_url.com", key)
            self.assertEquals("User-agent: *\nDisallow: /", value)
  def testSingleShard(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    blob_name = filenames[0]
    self.assertTrue(blob_name.startswith("/blobstore/"))
    self.assertFalse(blob_name.startswith("/blobstore/writable:"))

    with files.open(blob_name, "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
    def testFetchEndToEnd(self):
        """Test for through of fetcher job"""
        createMockCrawlDbDatum("http://foo.com/bar.txt")
        static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        self.setReturnValue(url="http://foo.com/robots.txt",
                            content=static_robots,
                            headers={
                                "Content-Length": len(static_robots),
                                "content-type": "text/plain"
                            })

        static_content = "test"
        static_content_length = len(static_content)
        self.setReturnValue(url="http://foo.com/bar.txt",
                            content=static_content,
                            headers={
                                "Content-Length": static_content_length,
                                "Content-Type": "text/plain"
                            })
        p = pipelines.FetcherPipeline(
            "FetcherPipeline",
            params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
            parser_params={"text/plain": __name__ + "._parserNotOutlinks"},
            shards=2)
        p.start()
        test_support.execute_until_empty(self.taskqueue)
Example #8
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
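            # Encode each (key, value) pair as a KeyValue proto record.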
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
Example #9
  def testSuccessfulRun(self):
    createMockCrawlDbDatum(2, 6, False)
    
    p = pipelines._ExactDomainMapreducePipeline("ExactDomainMapreducePipeline",
                                                 params={
                                                         "entity_kind": "lakshmi.datum.CrawlDbDatum",
                                                         },
                                                 shard_count=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)
    
    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))
    
    for file_path in file_paths:
      blob_key = files.blobstore.get_blob_key(file_path)
      reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
      u = 0
      for content in reader:
        self.assertTrue(content[1] is not None)
        u += 1
    
    self.assertEqual(2, u)

    query = CrawlDbDatum.query(CrawlDbDatum.extract_domain_url=="http://hoge_0.com")
    entities = query.fetch()
    for entity in entities:
      self.assertEquals("http://hoge_0.com", entity.extract_domain_url)
Example #10
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
    def _run_test(self, num_shards, num_files, multi_slices=False):
        bucket_name = "testing"
        object_prefix = "file-"
        job_name = "test_map"
        expected_content = self.create_test_content(bucket_name, object_prefix,
                                                    num_files)
        job = map_job.Job.submit(
            map_job.JobConfig(job_name=job_name,
                              mapper=_InputReaderMemoryMapper,
                              input_reader_cls=input_reader.GCSInputReader,
                              input_reader_params={
                                  "bucket_name": bucket_name,
                                  "objects": [object_prefix + "*"],
                                  "path_filter": _MyPathFilter()
                              },
                              shard_count=num_shards))

        test_support.execute_until_empty(self.taskqueue)
        self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
        self.assertEqual(job.SUCCESS, job.get_status())
        self.assertEqual(
            num_files - 1,
            job.get_counter(input_reader.GCSInputReader.COUNTER_FILE_READ))
        if multi_slices:
            ss = model.ShardState.find_all_by_mapreduce_state(job._state)
            for s in ss:
                self.assertTrue(s.slice_id > 0)
Example #12
    def testFailedMapReduce(self):
        # Add some random data.
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_failed_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(output_writers.__name__ +
                                ".BlobstoreRecordsOutputWriter"),
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_FAILED,
                         p.outputs.result_status.value)
        self.assertEqual(0, len(p.outputs.default.value))
Example #13
    def testNoCombiner(self):
        """Test running with low values count but without combiner."""
        # Even though this test doesn't have combiner specified, it's still
        # interesting to run. It forces MergePipeline to produce partial
        # key values and we verify that they are combined correctly in reader.

        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ + ".BlobstoreOutputWriter",
            mapper_params={"entity_kind": __name__ + ".TestEntity"},
            shards=4,
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(1, len(p.outputs.default.value))
        output_file = p.outputs.default.value[0]

        file_content = []
        with files.open(output_file, "r") as f:
            file_content = sorted(f.read(10000000).strip().split("\n"))

        self.assertEquals(["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"], file_content)
    def testProcessEntites(self):
        """Test empty mapper over non-empty dataset."""
        for _ in range(100):
            TestEntity().put()

        p = mapper_pipeline.MapperPipeline(
            "empty_map",
            handler_spec=__name__ + ".test_empty_handler",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            params={
                "entity_kind": __name__ + ".TestEntity",
            },
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
        self.assertTrue(p.outputs.job_id.value)

        counters = p.outputs.counters.value
        self.assertTrue(counters)
        self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
        self.assertEquals(100, counters[context.COUNTER_MAPPER_CALLS])
 def testFetchEndToEnd(self):
   """Test for through of fetcher job"""
   createMockCrawlDbDatum("http://foo.com/bar.txt")
   static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
   self.setReturnValue(url="http://foo.com/robots.txt",
       content=static_robots,
       headers={"Content-Length": len(static_robots),
         "content-type": "text/plain"})
   
   static_content = "test"
   static_content_length = len(static_content)
   self.setReturnValue(url="http://foo.com/bar.txt",
       content=static_content,
       headers={"Content-Length": static_content_length,
           "Content-Type": "text/plain"})
   p = pipelines.FetcherPipeline("FetcherPipeline",
       params={
         "entity_kind": "lakshmi.datum.CrawlDbDatum"
       },
       parser_params={
         "text/plain": __name__ + "._parserNotOutlinks"
       },
       shards=2)
   p.start()
   test_support.execute_until_empty(self.taskqueue)
Example #16
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_data = [str(i) for i in range(100)]

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
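            # Seed the GCS object with 100 string records for the map job to read.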
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)

        control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            input_readers.__name__ + ".GoogleCloudStorageRecordInputReader",
            {
                "input_reader": {
                    "bucket_name": bucket_name,
                    "objects": [test_filename],
                    # the parameter can't be compressed and wouldn't fit into
                    # taskqueue payload
                    "huge_parameter": random_string(900000)
                }
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], model._HugeTaskPayload.all().fetch(100))
  def testFailedMapReduce(self):
    # Add some random data.
    entity_count = 200

    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".test_failed_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(
            output_writers.__name__ + ".BlobstoreRecordsOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_FAILED,
                     p.outputs.result_status.value)
    self.assertEqual(0, len(p.outputs.default.value))
    def testSortFile(self):
        """Test sorting a file."""
        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._SortChunksPipeline("testjob", bucket_name, [[full_filename]])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for binary_record in records.RecordsReader(f):
                    proto = kv_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #19
    def testRecordsReader(self):
        """End-to-end test for records reader."""
        input_data = [str(i) for i in range(100)]

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)

        control.start_map("test_map",
                          __name__ + ".TestHandler",
                          input_readers.__name__ +
                          ".GoogleCloudStorageRecordInputReader", {
                              "input_reader": {
                                  "bucket_name": bucket_name,
                                  "objects": [test_filename]
                              }
                          },
                          shard_count=4,
                          base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
  def _run_test(self, num_shards, num_files):
    bucket_name = "testing"
    object_prefix = "file-"
    job_name = "test_map"
    input_class = (input_readers.__name__ + "." +
                   input_readers._GoogleCloudStorageInputReader.__name__)

    expected_content = self.create_test_content(bucket_name,
                                                object_prefix,
                                                num_files)

    control.start_map(
        job_name,
        __name__ + "." + "_input_reader_memory_mapper",
        input_class,
        {
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": [object_prefix + "*"]
            },
        },
        shard_count=num_shards)

    test_support.execute_until_empty(self.taskqueue)
    self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = TestMergePipeline(bucket_name, [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
    def _runTest(self, num_shards):
        entity_count = 1000
        bucket_name = "bucket"
        job_name = "test_map"

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            job_name,
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "output_writer": {
                    "bucket_name": bucket_name,
                },
            },
            shard_count=num_shards,
            output_writer_spec=self.WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)
        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

        self.assertEqual(num_shards, len(set(filenames)))
        total_entries = 0
        for shard in range(num_shards):
            self.assertTrue(filenames[shard].startswith(
                "/%s/%s" % (bucket_name, job_name)))
            data = cloudstorage.open(filenames[shard]).read()
            # strip() is used to remove the last newline of each file so that split()
            # does not return extraneous empty entries.
            total_entries += len(data.strip().split("\n"))
        self.assertEqual(entity_count, total_entries)
    def testDedicatedParams(self):
        entity_count = 1000

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "input_reader": {
                    "entity_kind": __name__ + "." + TestEntity.__name__,
                },
                "output_writer": {
                    "filesystem": "gs",
                    "gs_bucket_name": "bucket",
                },
            },
            shard_count=4,
            base_path="/mapreduce_base_path",
            output_writer_spec=FILE_WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)

        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = output_writers.FileOutputWriter.get_filenames(
            mapreduce_state)
        self.assertEqual(1, len(filenames))
        self.assertTrue(filenames[0].startswith("/gs/bucket/"))

        with files.open(filenames[0], "r") as f:
            data = f.read(10000000)
            self.assertEquals(1000, len(data.strip().split("\n")))
    def testMultipleShards(self):
        entity_count = 1000

        for _ in range(entity_count):
            TestEntity().put()

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".test_handler_yield_key_str",
            DATASTORE_READER_NAME, {
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "output_sharding": "input",
            },
            shard_count=4,
            base_path="/mapreduce_base_path",
            output_writer_spec=BLOBSTORE_WRITER_NAME)

        test_support.execute_until_empty(self.taskqueue)

        mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
        filenames = output_writers.BlobstoreOutputWriter.get_filenames(
            mapreduce_state)
        self.assertEqual(4, len(set(filenames)))

        file_lengths = []
        for filename in filenames:
            self.assertTrue(filename.startswith("/blobstore/"))
            self.assertFalse(filename.startswith("/blobstore/writable:"))

            with files.open(filename, "r") as f:
                data = f.read(10000000)
                file_lengths.append(len(data.strip().split("\n")))

        self.assertEqual(1000, sum(file_lengths))
  def testShardRetryTooMany(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
      TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_too_many_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
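    # Limit the pipeline to a single attempt so the shard-retry failure is final.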
    p.max_attempts = 1
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    state = model.MapreduceState.all().get()
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline aborted:"))
  def testShardRetry(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
      TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    outputs = []
    for output in TestOutputEntity.all():
      outputs.append(int(output.data))
    outputs.sort()

    expected_outputs = [i for i in range(entity_count)]
    expected_outputs.sort()
    self.assertEquals(expected_outputs, outputs)
  def testSmoke(self):
    """Test all handlers still works.

    This test doesn't care about the integrity of the job outputs.
    Just that things works under webapp2 framework.
    """
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".TestMapreduceMap",
        __name__ + ".TestMapreduceReduce",
        input_reader_spec=input_readers.__name__ + ".RandomStringInputReader",
        output_writer_spec=(
            output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "input_reader": {
                "count": 100
            },
        },
        reducer_params={
            "output_writer": {
                "bucket_name": "test"
            },
        },
        shards=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    # Verify output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    def testSuccessfulRun(self):
        p = shuffler._ShuffleServicePipeline("testjob", ["file1", "file2"])
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        request = self.file_service.shuffle_request
        self.assertTrue(request)
        self.assertTrue(request.shuffle_name().startswith("testjob-"))
        self.assertEquals(2, len(request.input_list()))
        self.assertEquals(1, request.input(0).format())
        self.assertEquals("file1", request.input(0).path())
        self.assertEquals(1, request.input(1).format())
        self.assertEquals("file2", request.input(1).path())
        self.assertEquals(2, len(request.output().path_list()))

        callback = request.callback()
        self.assertTrue(callback.url().startswith(
            "/mapreduce/pipeline/callback?pipeline_id="))
        self.assertEquals(self.version_id, callback.app_version_id())
        self.assertEquals("GET", callback.method())
        self.assertEquals("default", callback.queue())

        callback_task = {
            "url": callback.url(),
            "method": callback.method(),
        }
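        # Deliver the shuffle-service completion callback, then drain any follow-up tasks.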
        test_support.execute_task(callback_task)
        test_support.execute_until_empty(self.taskqueue)

        p = shuffler._ShuffleServicePipeline.from_id(p.pipeline_id)
        self.assertTrue(p.has_finalized)
        output_files = p.outputs.default.value
        self.assertEquals(2, len(output_files))
        self.assertTrue(output_files[0].startswith("/blobstore/"))
        self.assertTrue(output_files[1].startswith("/blobstore/"))
Example #29
  def testLotsOfValuesForSingleKey(self):
    TestEntity(data=str(1)).put()
    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        "test",
        __name__ + ".map_yield_lots_of_values",
        __name__ + ".reduce_length",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=
            output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
        mapper_params= {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    output_data = []
    for output_file in p.outputs.default.value:
      with files.open(output_file, "r") as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

    expected_data = ["('1', 50000)"]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)
  def testSuccessfulRun(self):
    p = shuffler._ShuffleServicePipeline("testjob", ["file1", "file2"])
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    request = self.file_service.shuffle_request
    self.assertTrue(request)
    self.assertTrue(request.shuffle_name().startswith("testjob-"))
    self.assertEquals(2, len(request.input_list()))
    self.assertEquals(1, request.input(0).format())
    self.assertEquals("file1", request.input(0).path())
    self.assertEquals(1, request.input(1).format())
    self.assertEquals("file2", request.input(1).path())
    self.assertEquals(2, len(request.output().path_list()))

    callback = request.callback()
    self.assertTrue(callback.url().startswith(
        "/mapreduce/pipeline/callback?pipeline_id="))
    self.assertEquals(self.version_id, callback.app_version_id())
    self.assertEquals("GET", callback.method())
    self.assertEquals("default", callback.queue())

    callback_task = {
        "url": callback.url(),
        "method": callback.method(),
        }
    test_support.execute_task(callback_task)
    test_support.execute_until_empty(self.taskqueue)

    p = shuffler._ShuffleServicePipeline.from_id(p.pipeline_id)
    self.assertTrue(p.has_finalized)
    output_files = p.outputs.default.value
    self.assertEquals(2, len(output_files))
    self.assertTrue(output_files[0].startswith("/blobstore/"))
    self.assertTrue(output_files[1].startswith("/blobstore/"))
  def testOrgsForAnotherProgram(self):
    """Tests that status of organizations for another program is untouched."""
    # seed another program
    program = seeder_logic.seed(program_model.Program)

    # seed a few pre-accepted and pre-rejected organizations
    pre_accepted_orgs = []
    for i in range(2):
      org = org_utils.seedOrganization(
          program.key(), org_id='pre_accepted_org_id_%s' % i,
          status=org_model.Status.PRE_ACCEPTED)
      pre_accepted_orgs.append(org.key)

    pre_rejected_orgs = []
    for i in range(3):
      org = org_utils.seedOrganization(
          program.key(), org_id='pre_rejrected_org_id_%s' % i,
          status=org_model.Status.PRE_REJECTED)
      pre_rejected_orgs.append(org.key)

    mapreduce_control.start_map(
        'ApplyOrgAdmissionDecisions', params=self.params)
    test_support.execute_until_empty(self.get_task_queue_stub())

    # check that pre-accepted organizations are still pre-accepted
    for org_key in pre_accepted_orgs:
      org = org_key.get()
      self.assertEqual(org.status, org_model.Status.PRE_ACCEPTED)

    # check that pre-rejected organizations are still pre-rejected
    for org_key in pre_rejected_orgs:
      org = org_key.get()
      self.assertEqual(org.status, org_model.Status.PRE_REJECTED)
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob", [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
  def _runTest(self, num_shards):
    entity_count = 1000
    bucket_name = "bucket"
    job_name = "test_map"

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        job_name,
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_writer": {
                "bucket_name": bucket_name,
            },
        },
        shard_count=num_shards,
        output_writer_spec=self.WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)
    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = self.WRITER_CLS.get_filenames(mapreduce_state)

    self.assertEqual(num_shards, len(filenames))
    total_entries = 0
    for shard in range(num_shards):
      self.assertTrue(filenames[shard].startswith("/%s/%s" % (bucket_name,
                                                              job_name)))
      data = cloudstorage.open(filenames[shard]).read()
      # strip() is used to remove the last newline of each file so that split()
      # does not return extraneous empty entries.
      total_entries += len(data.strip().split("\n"))
    self.assertEqual(entity_count, total_entries)
    def testFailedMapReduce(self):
        bucket_name = "testbucket"
        max_attempts_before = pipeline.pipeline._DEFAULT_MAX_ATTEMPTS
        try:
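            # Disable pipeline retries so the induced map failure aborts the job immediately.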
            pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

            # Add some random data.
            entity_count = 200

            for i in range(entity_count):
                TestEntity(data=str(i)).put()
                TestEntity(data=str(i)).put()

            p = mapreduce_pipeline.MapreducePipeline(
                "test",
                __name__ + ".test_failed_map",
                __name__ + ".test_mapreduce_reduce",
                input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
                output_writer_spec=(output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
                mapper_params={"entity_kind": __name__ + "." + TestEntity.__name__},
                reducer_params={"output_writer": {"bucket_name": bucket_name}},
                shards=3,
            )
            p.max_attempts = 1
            p.start()
            test_support.execute_until_empty(self.taskqueue)

            p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
            self.assertTrue(p.was_aborted)
        finally:
            pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = max_attempts_before
  def testDedicatedParams(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            "output_writer": {
                "filesystem": "gs",
                "gs_bucket_name": "bucket",
            },
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=FILE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.FileOutputWriter.get_filenames(mapreduce_state)
    self.assertEqual(1, len(filenames))
    self.assertTrue(filenames[0].startswith("/gs/bucket/"))

    with files.open(filenames[0], "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
  def testFailedMap(self):
    for i in range(1):
      TestEntity(data=str(i)).put()

    pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_fail_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.was_aborted)

    self.assertTrue(p.outputs.job_id.filled)
    state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)
    self.assertFalse(p.outputs.result_status.filled)
    self.assertFalse(p.outputs.default.filled)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline aborted:"))
    def testSortFile(self):
        """Test sorting a file."""
        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler._SortChunksPipeline("testjob", bucket_name,
                                         [[full_filename]])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for binary_record in records.RecordsReader(f):
                    proto = kv_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
        self.assertEquals(1, len(self.emails))
  def testHugeTaskUseDatastore(self):
    """Test map job with huge parameter values."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
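      # Write 100 small records for the mapper to process.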
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file,
            # the parameter can't be compressed and wouldn't fit into
            # taskqueue payload
            "huge_parameter": random_string(900000)
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
    self.assertEquals([], model._HugeTaskPayload.all().fetch(100))
  def _run_test(self, num_shards, num_files, multi_slices=False):
    bucket_name = "testing"
    object_prefix = "file-"
    job_name = "test_map"
    expected_content = self.create_test_content(bucket_name,
                                                object_prefix,
                                                num_files)
    job = map_job.Job.submit(map_job.JobConfig(
        job_name=job_name,
        mapper=_InputReaderMemoryMapper,
        input_reader_cls=input_reader.GCSInputReader,
        input_reader_params={"bucket_name": bucket_name,
                             "objects": [object_prefix + "*"],
                             "path_filter": _MyPathFilter()},
        shard_count=num_shards))

    test_support.execute_until_empty(self.taskqueue)
    self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
    self.assertEqual(job.SUCCESS, job.get_status())
    self.assertEqual(
        num_files - 1,
        job.get_counter(input_reader.GCSInputReader.COUNTER_FILE_READ))
    if multi_slices:
      ss = model.ShardState.find_all_by_mapreduce_state(job._state)
      for s in ss:
        self.assertTrue(s.slice_id > 0)
  def testRecordsReader(self):
    """End-to-end test for records reader."""
    input_file = files.blobstore.create()
    input_data = [str(i) for i in range(100)]

    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for record in input_data:
          w.write(record)
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".TestHandler",
        "mapreduce.input_readers.RecordsReader",
        {
            "file": input_file
        },
        shard_count=4,
        base_path="/mapreduce_base_path")

    test_support.execute_until_empty(self.taskqueue)
    self.assertEquals(100, len(TestHandler.processed_entites))
  def testProcessEntites(self):
    """Test empty mapper over non-empty dataset."""
    for _ in range(100):
      TestEntity().put()

    p = mapper_pipeline.MapperPipeline(
        "empty_map",
        handler_spec=__name__ + ".test_empty_handler",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "entity_kind": __name__ + ".TestEntity",
            },
        )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.outputs.job_id.value)

    counters = p.outputs.counters.value
    self.assertTrue(counters)
    self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
    self.assertEquals(100, counters[context.COUNTER_MAPPER_CALLS])
Example #42
    def testLotsOfValuesForSingleKey(self):
        TestEntity(data=str(1)).put()
        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".map_yield_lots_of_values",
            __name__ + ".reduce_length",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreRecordsOutputWriter",
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        output_data = []
        for output_file in p.outputs.default.value:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = ["('1', 50000)"]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)
  def testDecisionsAreApplied(self):
    """Tests that status of organizations is changed after the job."""
    mapreduce_control.start_map(
        'ApplyOrgAdmissionDecisions', params=self.params)
    test_support.execute_until_empty(self.get_task_queue_stub())

    # check that pre-accepted organizations are accepted now
    for org_key in self.pre_accepted_orgs:
      org = org_key.get()
      self.assertEqual(org.status, org_model.Status.ACCEPTED)

    # check that pre-rejected organizations are rejected now
    for org_key in self.pre_rejected_orgs:
      org = org_key.get()
      self.assertEqual(org.status, org_model.Status.REJECTED)

    # check that nothing has changed regarding applying organizations
    for org_key in self.applying_orgs:
      org = org_key.get()
      self.assertEqual(org.status, org_model.Status.APPLYING)

    for org_key in self.pre_accepted_orgs:
      org = org_key.get()
      subject = notifications.DEF_ACCEPTED_ORG % {
          'org': org.name,
          }
      self.assertEmailSent(cc=org.contact.email, subject=subject)

    for org_key in self.pre_rejected_orgs:
      org = org_key.get()
      subject = notifications.DEF_REJECTED_ORG % {
          'org': org.name,
          }
      self.assertEmailSent(cc=org.contact.email, subject=subject)
Example #44
    def testSmoke(self):
        """Test all handlers still works.

    This test doesn't care about the integrity of the job outputs.
    Just that things works under webapp2 framework.
    """
        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".TestMapreduceMap",
            __name__ + ".TestMapreduceReduce",
            input_reader_spec=input_readers.__name__ +
            ".RandomStringInputReader",
            output_writer_spec=(output_writers.__name__ +
                                "._GoogleCloudStorageRecordOutputWriter"),
            mapper_params={
                "input_reader": {
                    "count": 100
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": "test"
                },
            },
            shards=3)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        # Verify output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = TestMergePipeline(bucket_name,
                              [full_filename, full_filename, full_filename])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with cloudstorage.open(output_file) as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
        self.assertEquals(1, len(self.emails))
Example #46
    def testOrgsForAnotherProgram(self):
        """Tests that status of organizations for another program is untouched."""
        # seed another program
        program = seeder_logic.seed(program_model.Program)

        # seed a few pre-accepted and pre-rejected organizations
        pre_accepted_orgs = []
        for i in range(2):
            org = org_utils.seedOrganization(
                program.key(),
                org_id='pre_accepted_org_id_%s' % i,
                status=org_model.Status.PRE_ACCEPTED)
            pre_accepted_orgs.append(org.key)

        pre_rejected_orgs = []
        for i in range(3):
            org = org_utils.seedOrganization(
                program.key(),
                org_id='pre_rejrected_org_id_%s' % i,
                status=org_model.Status.PRE_REJECTED)
            pre_rejected_orgs.append(org.key)

        mapreduce_control.start_map('ApplyOrgAdmissionDecisions',
                                    params=self.params)
        test_support.execute_until_empty(self.get_task_queue_stub())

        # check that pre-accepted organizations are still pre-accepted
        for org_key in pre_accepted_orgs:
            org = org_key.get()
            self.assertEqual(org.status, org_model.Status.PRE_ACCEPTED)

        # check that pre-rejected organizations are still pre-rejected
        for org_key in pre_rejected_orgs:
            org = org_key.get()
            self.assertEqual(org.status, org_model.Status.PRE_REJECTED)
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
Example #48
    def testDecisionsAreApplied(self):
        """Tests that status of organizations is changed after the job."""
        mapreduce_control.start_map('ApplyOrgAdmissionDecisions',
                                    params=self.params)
        test_support.execute_until_empty(self.get_task_queue_stub())

        # check that pre-accepted organizations are accepted now
        for org_key in self.pre_accepted_orgs:
            org = org_key.get()
            self.assertEqual(org.status, org_model.Status.ACCEPTED)

        # check that pre-rejected organizations are rejected now
        for org_key in self.pre_rejected_orgs:
            org = org_key.get()
            self.assertEqual(org.status, org_model.Status.REJECTED)

        # check that organizations that are still applying are unchanged
        for org_key in self.applying_orgs:
            org = org_key.get()
            self.assertEqual(org.status, org_model.Status.APPLYING)

        for org_key in self.pre_accepted_orgs:
            org = org_key.get()
            subject = notifications.DEF_ACCEPTED_ORG % {
                'org': org.name,
            }
            self.assertEmailSent(cc=org.contact.email, subject=subject)

        for org_key in self.pre_rejected_orgs:
            org = org_key.get()
            subject = notifications.DEF_REJECTED_ORG % {
                'org': org.name,
            }
            self.assertEmailSent(cc=org.contact.email, subject=subject)
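(The ApplyOrgAdmissionDecisions job is registered elsewhere. The sketch below is a hypothetical per-entity map function consistent with the assertions above; apply_admission_decision and send_mail are assumed names, while org_model and notifications are the modules the test itself imports.)

def apply_admission_decision(org):
    """Hypothetical mapper: finalizes one organization's admission decision."""
    # Organizations belonging to other programs never reach this mapper,
    # because the job is started with program-scoped params.
    if org.status == org_model.Status.PRE_ACCEPTED:
        org.status = org_model.Status.ACCEPTED
        org.put()
        # notification subject matches the assertion in the test above
        send_mail(cc=org.contact.email,
                  subject=notifications.DEF_ACCEPTED_ORG % {'org': org.name})
    elif org.status == org_model.Status.PRE_REJECTED:
        org.status = org_model.Status.REJECTED
        org.put()
        send_mail(cc=org.contact.email,
                  subject=notifications.DEF_REJECTED_ORG % {'org': org.name})
    # any other status (e.g. APPLYING) is left untouched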
  def testSingleShard(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        "mapreduce.input_readers.DatastoreInputReader",
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(1, len(filenames))
    blob_name = filenames[0]
    self.assertTrue(blob_name.startswith("/blobstore/"))
    self.assertFalse(blob_name.startswith("/blobstore/writable:"))

    with files.open(blob_name, "r") as f:
      data = f.read(10000000)
      self.assertEquals(1000, len(data.strip().split("\n")))
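(test_handler_yield_key_str is defined in the same test module. Here is a minimal sketch consistent with the one-line-per-entity output asserted above; the real definition may differ.)

def test_handler_yield_key_str(entity):
    """Hypothetical map handler: emits one text line per datastore entity key."""
    # BlobstoreOutputWriter writes yielded strings verbatim, so the trailing
    # newline is what makes the blob splittable by line in the assertion above.
    yield str(entity.key()) + "\n"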
  def testMultipleShards(self):
    entity_count = 1000

    for _ in range(entity_count):
      TestEntity().put()

    mapreduce_id = control.start_map(
        "test_map",
        __name__ + ".test_handler_yield_key_str",
        DATASTORE_READER_NAME,
        {
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "output_sharding": "input",
        },
        shard_count=4,
        base_path="/mapreduce_base_path",
        output_writer_spec=BLOBSTORE_WRITER_NAME)

    test_support.execute_until_empty(self.taskqueue)

    mapreduce_state = model.MapreduceState.get_by_job_id(mapreduce_id)
    filenames = output_writers.BlobstoreOutputWriter.get_filenames(
        mapreduce_state)
    self.assertEqual(4, len(filenames))

    file_lengths = []
    for filename in filenames:
      self.assertTrue(filename.startswith("/blobstore/"))
      self.assertFalse(filename.startswith("/blobstore/writable:"))

      with files.open(filename, "r") as f:
        data = f.read(10000000)
        file_lengths.append(len(data.strip().split("\n")))

    self.assertEqual(1000, sum(file_lengths))
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
    def testHugeTaskUseDatastore(self):
        """Test map job with huge parameter values."""
        input_file = files.blobstore.create()
        input_data = [str(i) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for record in input_data:
                    w.write(record)
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        mapreduce_id = control.start_map(
            "test_map",
            __name__ + ".TestHandler",
            "mapreduce.input_readers.RecordsReader",
            {
                "file": input_file,
                # this parameter is incompressible and would not fit into the
                # task queue payload, so it has to be stored in the datastore
                "huge_parameter": random_string(900000)
            },
            shard_count=4,
            base_path="/mapreduce_base_path")

        test_support.execute_until_empty(self.taskqueue)
        self.assertEquals(100, len(TestHandler.processed_entites))
        self.assertEquals([], util._HugeTaskPayload.all().fetch(100))
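(TestHandler and random_string are defined elsewhere in the test module. The sketch below shows what they might look like, consistent with the assertions above; the real definitions may differ, and the processed_entites attribute name is copied exactly as the test spells it.)

import random
import string


def random_string(length):
    """Hypothetical helper: builds a random, effectively incompressible string."""
    return "".join(
        random.choice(string.ascii_letters + string.digits)
        for _ in range(length))


def TestHandler(record):
    """Hypothetical map handler: remembers every record it was called with."""
    TestHandler.processed_entites.append(record)

# module-level store read by the assertion above; the attribute name is an
# assumption taken from the test, spelling included
TestHandler.processed_entites = []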
Example #53
0
    def testShuffleNoFile(self):
        p = shuffler.ShufflePipeline("testjob", [])
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
        for filename in p.outputs.default.value:
            self.assertEqual(0, files.stat(filename).st_size)
  def testMapReduce(self):
    # Prepare test data
    bucket_name = "testbucket"
    job_name = "test_job"
    entity_count = 200

    for i in range(entity_count):
      TestEntity(data=str(i)).put()
      TestEntity(data=str(i)).put()

    # Run Mapreduce
    p = mapreduce_pipeline.MapreducePipeline(
        job_name,
        __name__ + ".test_mapreduce_map",
        __name__ + ".test_mapreduce_reduce",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        output_writer_spec=(
            output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter"),
        mapper_params={
            "entity_kind": __name__ + "." + TestEntity.__name__,
            "bucket_name": bucket_name
        },
        reducer_params={
            "output_writer": {
                "bucket_name": bucket_name
            },
        },
        shards=16)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    # Verify reduce output.
    p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)
    output_data = []
    for output_file in p.outputs.default.value:
      with cloudstorage.open(output_file) as f:
        for record in records.RecordsReader(f):
          output_data.append(record)

    expected_data = [
        str((str(d), ["", ""])) for d in range(entity_count)]
    expected_data.sort()
    output_data.sort()
    self.assertEquals(expected_data, output_data)

    # Verify that mapreduce doesn't leave intermediate files behind.
    temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
    for stat in temp_file_stats:
      if stat.filename:
        self.assertFalse(
            stat.filename.startswith("/%s/%s-shuffle-" %
                                     (bucket_name, job_name)))
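(test_mapreduce_map and test_mapreduce_reduce are defined in the test module. A minimal sketch inferred from the expected str((key, ["", ""])) records; the real definitions may differ.)

def test_mapreduce_map(entity):
    """Hypothetical map function: keys each entity by its data, empty value."""
    yield (entity.data, "")


def test_mapreduce_reduce(key, values):
    """Hypothetical reduce function: emits one stringified (key, values) record."""
    # with two entities per value of i, values is ["", ""] for every key
    yield str((key, values))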
Example #57
0
  def testAbort(self):
    job = map_job.Job.submit(self.config)
    self.assertEqual(map_job.Job.RUNNING, job.get_status())
    job.abort()
    self.assertEqual(map_job.Job.RUNNING, job.get_status())

    # Execute all tasks.
    test_support.execute_until_empty(self.taskqueue)
    job = map_job.Job.get_job_by_id(job_id=self.config.job_id)
    self.assertEqual(map_job.Job.ABORTED, job.get_status())