Esempio n. 1
0
    def testValidate_BadBatchSize(self):
        """Test validate function rejects bad batch sizes."""
        # Non-numeric, zero, and negative batch sizes are all invalid.
        for bad_batch_size in ("xxx", "0", "-1"):
            params = {"entity_kind": testutil.ENTITY_KIND,
                      "batch_size": bad_batch_size}
            conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                     mapper=map_job.Mapper,
                                     input_reader_cls=self.reader_cls,
                                     input_reader_params=params,
                                     shard_count=1)
            self.assertRaises(errors.BadReaderParamsError,
                              self.reader_cls.validate, conf)
    def testSplitInput_withNsAndDefaultNs(self):
        """Split a query spanning the default and many other namespaces."""
        shard_count = 2
        # 10 entities in the default namespace.
        default_ns_keys = [str(i) for i in range(10)]
        self._create_entities(default_ns_keys,
                              {k: 1 for k in default_ns_keys}, None)
        # 10 entities for each of N different non-default namespaces. The
        # number of namespaces, N, is twice the cutoff for switching to
        # sharding by namespace instead of keys.
        other_ns_keys = []
        for ns_num in range(self.reader_cls.MAX_NAMESPACES_FOR_KEY_SHARD * 2):
            keys = ["n-%02d-k-%02d" % (ns_num, i) for i in range(10)]
            other_ns_keys.extend(keys)
            self._create_entities(keys, {k: 1 for k in keys},
                                  "%02d" % ns_num)

        # A query over all namespaces must yield the requested shard count
        # and, across all shards, cover every created key.
        params = {"entity_kind": self.entity_kind, "namespace": None}
        conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                 mapper=map_job.Mapper,
                                 input_reader_cls=self.reader_cls,
                                 input_reader_params=params,
                                 shard_count=shard_count)
        results = self.reader_cls.split_input(conf)
        self.assertEqual(shard_count, len(results))
        all_keys = default_ns_keys + other_ns_keys
        self._assertEqualsForAllShards_splitInput(all_keys, len(all_keys),
                                                  *results)
    def _run_test(self, num_shards, num_files, multi_slices=False):
        """Run a GCS-reading map job and verify its output and counters.

        Args:
          num_shards: number of shards to request for the job.
          num_files: number of test input files to create.
          multi_slices: if True, additionally assert that every shard ran
            more than one slice.
        """
        bucket_name = "testing"
        object_prefix = "file-"
        job_name = "test_map"
        expected_content = self.create_test_content(bucket_name, object_prefix,
                                                    num_files)
        job = map_job.Job.submit(
            map_job.JobConfig(job_name=job_name,
                              mapper=_InputReaderMemoryMapper,
                              input_reader_cls=input_reader.GCSInputReader,
                              input_reader_params={
                                  "bucket_name": bucket_name,
                                  "objects": [object_prefix + "*"],
                                  "path_filter": _MyPathFilter()
                              },
                              shard_count=num_shards))

        test_support.execute_until_empty(self.taskqueue)
        # Bug fix: list.sort() returns None, so comparing the results of two
        # .sort() calls always compared None == None and could never fail.
        # Compare sorted copies instead.
        self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
        self.assertEqual(job.SUCCESS, job.get_status())
        # NOTE(review): expects one fewer file read than files created,
        # presumably because _MyPathFilter drops one file — confirm.
        self.assertEqual(
            num_files - 1,
            job.get_counter(input_reader.GCSInputReader.COUNTER_FILE_READ))
        if multi_slices:
            ss = model.ShardState.find_all_by_mapreduce_state(job._state)
            for s in ss:
                self.assertTrue(s.slice_id > 0)
Esempio n. 4
0
    def testSplitInput_shardByFilters_lotsOfNS(self):
        """Lots means more than 2 in test cases."""
        entities = self._create_entities(range(12), {}, "f")
        self._set_vals(entities, list(range(6)), list(range(2)))
        entities = self._create_entities(range(12, 24), {}, "g")
        self._set_vals(entities, list(range(6)), list(range(2)))
        entities = self._create_entities(range(24, 36), {}, "h")
        self._set_vals(entities, list(range(6)), list(range(2)))
        entities = self._create_entities(range(36, 48), {}, "h")
        self._set_vals(entities, [0] * 6, list(range(2)))

        params = {
            "entity_kind": self.entity_kind,
            "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)]
        }
        conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                 mapper=map_job.Mapper,
                                 input_reader_cls=self.reader_cls,
                                 input_reader_params=params,
                                 shard_count=100)
        results = conf.input_reader_cls.split_input(conf)
        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(3, len(results))
        self._assertEquals_splitInput(results[0], ["3", "5", "7"])
        self._assertEquals_splitInput(results[1], ["15", "17", "19"])
        self._assertEquals_splitInput(results[2], ["27", "29", "31"])
Esempio n. 5
0
 def testUserProvidesJobID(self):
   """A caller-supplied job_id is kept verbatim on the config."""
   custom_id = "id"
   conf = map_job.JobConfig(
       job_name="foo",
       job_id=custom_id,
       mapper=map_job.Mapper,
       input_reader_cls=sample_input_reader.SampleInputReader,
       input_reader_params={"foo": 1})
   self.assertEqual(custom_id, conf.job_id)
Esempio n. 6
0
 def setUp(self):
   """Build the single-shard JobConfig shared by the status tests."""
   super(MapJobStatusTest, self).setUp()
   reader_params = {"count": TEST_SAMPLE_INPUT_READER_COUNT}
   self.config = map_job.JobConfig(
       job_name="test_map",
       shard_count=1,
       mapper=map_job.Mapper,
       input_reader_cls=sample_input_reader.SampleInputReader,
       input_reader_params=reader_params)
Esempio n. 7
0
 def testValidate_WrongTypeNamespace(self):
     """Tests validate function rejects namespace of incorrect type."""
     params = {"entity_kind": testutil.ENTITY_KIND, "namespace": 5}
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=1)
     # A non-string namespace must be rejected.
     with self.assertRaises(errors.BadReaderParamsError):
         self.reader_cls.validate(conf)
Esempio n. 8
0
    def testValidate(self):
        """validate rejects missing or malformed input reader params."""
        bad_param_sets = [
            # Some input reader params are required.
            {},
            # count must be an integer, not a string.
            {"count": "1000"},
            # count must be a positive integer.
            {"count": -1},
            # string_length must be an integer.
            {"count": 10, "string_length": 1.5},
        ]
        for reader_params in bad_param_sets:
            conf = map_job.JobConfig(
                job_name="test_handler",
                mapper=map_job.Mapper,
                input_reader_cls=sample_input_reader.SampleInputReader,
                input_reader_params=reader_params,
                shard_count=99)
            self.assertRaises(errors.BadReaderParamsError,
                              sample_input_reader.SampleInputReader.validate,
                              conf)
Esempio n. 9
0
 def testValidate_NoEntityFails(self):
     """Test validate function raises exception with no entity parameter."""
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params={},
                              shard_count=1)
     # Without "entity_kind" the reader cannot build a query.
     with self.assertRaises(errors.BadReaderParamsError):
         self.reader_cls.validate(conf)
Esempio n. 10
0
 def testValidate_Passes(self):
     """Test validate function accepts valid parameters."""
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params={
                                  "entity_kind": testutil.ENTITY_KIND,
                              },
                              shard_count=1)
     # Must not raise.
     self.reader_cls.validate(conf)
Esempio n. 11
0
 def testValidate_EntityKindWithNoModel(self):
     """Test validate function with bad entity kind."""
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params={"entity_kind": "foo"},
                              shard_count=1)
     # This reader accepts a kind with no matching model class, so
     # validate must not raise here.
     self.reader_cls.validate(conf)
 def testSplitInput_noEntity(self):
     """split_input returns None when there are no entities to shard."""
     params = {
         "entity_kind": self.entity_kind,
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=1)
     results = self.reader_cls.split_input(conf)
     # assertEquals is a deprecated alias; assertIsNone also gives a
     # clearer failure message for a None check.
     self.assertIsNone(results)
Esempio n. 13
0
 def testValidate_EntityKindWithNoModel(self):
     """Test validate function with no model."""
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params={"entity_kind": "foo"},
                              shard_count=1)
     # This reader requires a resolvable model class for the kind.
     with self.assertRaises(errors.BadReaderParamsError):
         conf.input_reader_cls.validate(conf)
Esempio n. 14
0
    def testValidate_Filters(self):
        """Tests validating filters parameter."""
        params = {
            "entity_kind": self.entity_kind,
            "filters": [("a", "=", 1), ("b", "=", 2)],
        }
        new = datetime.datetime.now()
        old = new.replace(year=new.year - 1)
        conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                 mapper=map_job.Mapper,
                                 input_reader_cls=self.reader_cls,
                                 input_reader_params=params,
                                 shard_count=1)
        conf.input_reader_cls.validate(conf)

        conf.input_reader_params["filters"] = [["a", ">", 1], ["a", "<", 2]]
        conf.input_reader_cls.validate(conf)

        conf.input_reader_params["filters"] = [["datetime_property", ">", old],
                                               [
                                                   "datetime_property", "<=",
                                                   new
                                               ], ["a", "=", 1]]
        conf.input_reader_cls.validate(conf)

        conf.input_reader_params["filters"] = [["a", "=", 1]]
        conf.input_reader_cls.validate(conf)

        # Invalid field c
        conf.input_reader_params["filters"] = [("c", "=", 1)]
        self.assertRaises(errors.BadReaderParamsError,
                          conf.input_reader_cls.validate, conf)

        # Expect a range.
        conf.input_reader_params["filters"] = [("a", "<=", 1)]
        self.assertRaises(errors.BadReaderParamsError,
                          conf.input_reader_cls.validate, conf)

        # Value should be a datetime.
        conf.input_reader_params["filters"] = [["datetime_property", ">", 1],
                                               [
                                                   "datetime_property", "<=",
                                                   datetime.datetime.now()
                                               ]]
        self.assertRaises(errors.BadReaderParamsError,
                          conf.input_reader_cls.validate, conf)

        # Expect a closed range.
        # Bug fix: mutate conf.input_reader_params like every other case
        # above; mutating the local ``params`` dict would be a no-op if
        # JobConfig copied the params, re-testing the previous filters.
        conf.input_reader_params["filters"] = [["datetime_property", ">", new],
                                               ["datetime_property", "<=", old]]
        self.assertRaises(errors.BadReaderParamsError,
                          conf.input_reader_cls.validate, conf)
Esempio n. 15
0
 def testSmoke(self):
   """Explicit config values are preserved and defaults are filled in."""
   conf = map_job.JobConfig(
       job_name="foo",
       mapper=map_job.Mapper,
       input_reader_cls=sample_input_reader.SampleInputReader,
       input_reader_params={"foo": 1})
   # Explicitly supplied values come back unchanged.
   self.assertEqual("foo", conf.job_name)
   self.assertEqual(map_job.Mapper, conf.mapper)
   self.assertEqual(sample_input_reader.SampleInputReader,
                    conf.input_reader_cls)
   self.assertEqual({"foo": 1}, conf.input_reader_params)
   # Unspecified values get defaults: a generated id and the global
   # shard count.
   self.assertTrue(conf.job_id)
   self.assertEqual(parameters.config.SHARD_COUNT, conf.shard_count)
Esempio n. 16
0
 def testEndToEnd(self):
     """All readers together yield exactly the configured record count."""
     conf = map_job.JobConfig(
         job_name="test_handler",
         mapper=map_job.Mapper,
         input_reader_cls=sample_input_reader.SampleInputReader,
         input_reader_params={"count": 1000},
         shard_count=99)
     readers = sample_input_reader.SampleInputReader.split_input(conf)
     # Count every record produced across all shards.
     total = sum(1 for reader in readers for _ in reader)
     # assertEquals is a deprecated alias; use assertEqual.
     self.assertEqual(1000, total)
Esempio n. 17
0
  def testSmoke(self):
    """Submit a job end to end and verify it finishes successfully."""
    # Force handler to serialize on every call.
    parameters.config._SLICE_DURATION_SEC = 0

    job_config = map_job.JobConfig(
        job_name="test_map",
        mapper=MyMapper,
        input_reader_cls=sample_input_reader.SampleInputReader,
        input_reader_params={"count": TEST_SAMPLE_INPUT_READER_COUNT},
        user_params={"foo": 1, "bar": 2})
    # Let the mapper compare the config it receives to the original.
    MyMapper.original_conf = job_config
    map_job.Job.submit(job_config)
    test_support.execute_until_empty(self.taskqueue)
    finished_job = map_job.Job.get_job_by_id(job_config.job_id)
    self.assertEqual(map_job.Job.SUCCESS, finished_job.get_status())
 def testSplitInput_moreThanOneUnevenNS(self):
     """Split over two namespaces with uneven entity counts."""
     self._create_entities(range(5), {"1": 1, "3": 3}, "1")
     self._create_entities(range(10, 13), {"11": 11}, "2")
     params = {
         "entity_kind": self.entity_kind,
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=4)
     results = self.reader_cls.split_input(conf)
     # assertGreaterEqual reports both operands on failure, unlike
     # assertTrue(len(...) >= 3).
     self.assertGreaterEqual(len(results), 3)
     self._assertEqualsForAllShards_splitInput(
         ["0", "1", "2", "3", "4", "10", "11", "12"], None, *results)
 def testSplitInput_withNs_moreShardThanScatter(self):
     """Requesting more shards than entities still covers every key."""
     self._create_entities(range(3), {"1": 1}, "f")
     params = {
         "entity_kind": self.entity_kind,
         "namespace": "f",
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=4)
     results = self.reader_cls.split_input(conf)
     # assertGreaterEqual reports both operands on failure, unlike
     # assertTrue(len(...) >= 2).
     self.assertGreaterEqual(len(results), 2)
     self._assertEqualsForAllShards_splitInput(["0", "1", "2"], None,
                                               *results)
Esempio n. 20
0
 def setUp(self):
   """Reset hook bookkeeping and build the job config under test."""
   super(MapJobStartTest, self).setUp()
   TestHooks.enqueue_kickoff_task_calls = []
   reader_params = {"count": TEST_SAMPLE_INPUT_READER_COUNT}
   self.config = map_job.JobConfig(
       job_name="test_map",
       shard_count=1,
       mapper=map_job.Mapper,
       input_reader_cls=sample_input_reader.SampleInputReader,
       input_reader_params=reader_params,
       queue_name="crazy-queue",
       _base_path="/mr_base",
       _force_writes=True,
       shard_max_attempts=5,
       _task_max_attempts=6,
       done_callback_url="www.google.com",
       _hooks_cls=TestHooks)
Esempio n. 21
0
 def testSplitInput_shardByFilters_noEntity(self):
     """A filtered split over an empty namespace yields empty shards."""
     params = {
         "entity_kind": self.entity_kind,
         "namespace": "f",
         "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)]
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=100)
     results = conf.input_reader_cls.split_input(conf)
     # assertEquals is a deprecated alias; use assertEqual.
     self.assertEqual(3, len(results))
     # Every shard must be empty.
     for shard in results:
         self._assertEquals_splitInput(shard, [])
 def testSplitInput_lotsOfNS(self):
     """Split input spread across three namespaces."""
     self._create_entities(range(3), {"1": 1}, "9")
     self._create_entities(range(3, 6), {"4": 4}, "_")
     self._create_entities(range(6, 9), {"7": 7}, "a")
     params = {
         "entity_kind": self.entity_kind,
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=3)
     results = self.reader_cls.split_input(conf)
     # assertEquals is a deprecated alias; use assertEqual.
     self.assertEqual(3, len(results))
     self._assertEquals_splitInput(results[0], ["0", "1", "2"])
     self._assertEquals_splitInput(results[1], ["3", "4", "5"])
     self._assertEquals_splitInput(results[2], ["6", "7", "8"])
Esempio n. 23
0
 def testSplitInput_shardByFilters_withNs(self):
     """Filtered split within a single explicit namespace."""
     entities = self._create_entities(range(12), {}, "f")
     self._set_vals(entities, list(range(6)), list(range(2)))
     params = {
         "entity_kind": self.entity_kind,
         "namespace": "f",
         "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)],
     }
     conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                              mapper=map_job.Mapper,
                              input_reader_cls=self.reader_cls,
                              input_reader_params=params,
                              shard_count=2)
     results = conf.input_reader_cls.split_input(conf)
     # assertEquals is a deprecated alias; use assertEqual.
     self.assertEqual(2, len(results))
     self._assertEquals_splitInput(results[0], ["3", "5"])
     self._assertEquals_splitInput(results[1], ["7"])
    def testEntityKindWithDot(self):
        """Entity kinds containing a dot must still split correctly."""
        self._create_entities(range(3), {"1": 1}, "",
                              testutil.TestEntityWithDot)

        params = {
            "entity_kind": testutil.TestEntityWithDot.kind(),
            "namespace": "",
        }
        conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                 mapper=map_job.Mapper,
                                 input_reader_cls=self.reader_cls,
                                 input_reader_params=params,
                                 shard_count=2)
        results = conf.input_reader_cls.split_input(conf)
        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(2, len(results))
        self._assertEqualsForAllShards_splitInput(["0", "1", "2"], None,
                                                  *results)
Esempio n. 25
0
  def create_job_config(self, num_shards=None, input_params=None):
    """Create a JobConfig using GCSInputReader.

    Args:
      num_shards: optionally specify the number of shards.
      input_params: parameters for the input reader.

    Returns:
      a JobConfig with default settings and specified input_params.
    """
    return map_job.JobConfig(
        job_name="TestJob",
        mapper=map_job.Mapper,
        input_reader_cls=self.READER_CLS,
        input_reader_params=input_params,
        shard_count=num_shards)
    def testSmoke(self):
        """End-to-end map job: verify mapper calls, counters, and outputs."""
        entity_count = 10

        # Force handler to serialize on every call.
        parameters.config._SLICE_DURATION_SEC = 0

        job = map_job.Job.submit(
            map_job.JobConfig(
                job_name="test_map",
                mapper=MyMapper,
                input_reader_cls=sample_input_reader.SampleInputReader,
                input_reader_params={"count": entity_count},
                output_writer_cls=output_writers.
                _GoogleCloudStorageOutputWriter,
                output_writer_params={"bucket_name": "bucket"}))
        test_support.execute_until_empty(self.taskqueue)
        # Every input record must have been processed across the mappers.
        total = sum(m.processed for m in MyMapper.mappers.values())
        self.assertEqual(entity_count, total)

        # Verify counters.
        counters = dict(job.get_counters())
        self.assertEqual(counters["FOO_COUNTER"], 2 * entity_count)
        self.assertEqual(counters["BAR_COUNTER"], -1 * entity_count)
        self.assertEqual(counters["SLICES"], MyMapper.slices)

        # Verify outputs.
        files = output_writers._GoogleCloudStorageOutputWriter.get_filenames(
            job._state)
        # defaultdict(int) is the idiomatic zero-initialized counter
        # (equivalent to defaultdict(lambda: 0)).
        outputs = collections.defaultdict(int)
        expected = {
            "foo\n": entity_count,
            "bar\n": entity_count,
            "end_slice\n": MyMapper.slices,
            "begin_slice\n": MyMapper.slices
        }
        for fn in files:
            f = cloudstorage.open(fn)
            for line in f:
                outputs[line] += 1
        self.assertEqual(expected, outputs)
    def testValidate_Filters(self):
        """Tests validating filters parameter."""
        params = {
            "entity_kind": self.entity_kind,
            "filters": [("a", "=", 1), ("b", "=", 2)],
        }
        new = datetime.datetime.now()
        old = new.replace(year=new.year - 1)
        conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                 mapper=map_job.Mapper,
                                 input_reader_cls=self.reader_cls,
                                 input_reader_params=params,
                                 shard_count=1)
        conf.input_reader_cls.validate(conf)

        # Only equality filters supported.
        # Mutate conf.input_reader_params (not the local ``params``) so the
        # new filters are seen even if JobConfig copied the params dict.
        conf.input_reader_params["filters"] = [
            ["datetime_property", ">", old],
            ["datetime_property", "<=", new], ["a", "=", 1]]
        self.assertRaises(errors.BadReaderParamsError,
                          conf.input_reader_cls.validate, conf)
    def testRawEntityTypeFromOtherApp(self):
        """Test reading from other app."""
        OTHER_KIND = "bar"
        OTHER_APP = "foo"
        ds_stub = apiproxy_stub_map.apiproxy.GetStub("datastore_v3")
        ds_stub.SetTrusted(True)
        try:
            expected_keys = [str(i) for i in range(10)]
            for k in expected_keys:
                datastore.Put(
                    datastore.Entity(OTHER_KIND, name=k, _app=OTHER_APP))

            params = {
                "entity_kind": OTHER_KIND,
                "_app": OTHER_APP,
            }
            conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                                     mapper=map_job.Mapper,
                                     input_reader_cls=self.reader_cls,
                                     input_reader_params=params,
                                     shard_count=1)
            itr = conf.input_reader_cls.split_input(conf)[0]
            self._assertEquals_splitInput(itr, expected_keys)
        finally:
            # Restore the untrusted default even if an assertion fails, so
            # later tests are not left with a trusted datastore stub.
            ds_stub.SetTrusted(False)