def testValidate_BadBatchSize(self):
  """Test validate function rejects bad batch sizes.

  Covers a non-numeric string, zero, and a negative value; each must
  make the reader's validate() raise BadReaderParamsError.
  """
  # The original repeated the same config-building block three times;
  # loop over the invalid values instead.
  for bad_batch_size in ("xxx", "0", "-1"):
    params = {"entity_kind": testutil.ENTITY_KIND,
              "batch_size": bad_batch_size}
    conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params=params,
                             shard_count=1)
    self.assertRaises(errors.BadReaderParamsError,
                      self.reader_cls.validate,
                      conf)
def testSplitInput_withNsAndDefaultNs(self):
  """Split input spanning the default namespace and many non-default ones."""
  shards = 2
  # 10 entities in the default namespace.
  empty_ns_keys = [str(k) for k in range(10)]
  self._create_entities(empty_ns_keys,
                        {k: 1 for k in empty_ns_keys},
                        None)
  # 10 entities for each of N different non-default namespaces. The number
  # of namespaces, N, is set to be twice the cutoff for switching to
  # sharding by namespace instead of keys.
  non_empty_ns_keys = []
  for ns_num in range(self.reader_cls.MAX_NAMESPACES_FOR_KEY_SHARD * 2):
    ns_keys = ["n-%02d-k-%02d" % (ns_num, k) for k in range(10)]
    non_empty_ns_keys.extend(ns_keys)
    # dict comprehension replaces dict([(k, 1) for ...]) (flake8 C404).
    self._create_entities(ns_keys,
                          {k: 1 for k in ns_keys},
                          "%02d" % ns_num)
  # Test a query over all namespaces.
  params = {"entity_kind": self.entity_kind,
            "namespace": None}
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=shards)
  results = self.reader_cls.split_input(conf)
  self.assertEqual(shards, len(results))
  all_keys = empty_ns_keys + non_empty_ns_keys
  self._assertEqualsForAllShards_splitInput(all_keys, len(all_keys), *results)
def _run_test(self, num_shards, num_files, multi_slices=False):
  """Run a GCS-input map job end to end and verify its results.

  Args:
    num_shards: number of shards to run the job with.
    num_files: number of test input files to create.
    multi_slices: if True, additionally assert every shard ran more
      than one slice.
  """
  bucket_name = "testing"
  object_prefix = "file-"
  job_name = "test_map"
  expected_content = self.create_test_content(bucket_name,
                                              object_prefix,
                                              num_files)
  job = map_job.Job.submit(
      map_job.JobConfig(job_name=job_name,
                        mapper=_InputReaderMemoryMapper,
                        input_reader_cls=input_reader.GCSInputReader,
                        input_reader_params={
                            "bucket_name": bucket_name,
                            "objects": [object_prefix + "*"],
                            "path_filter": _MyPathFilter()
                        },
                        shard_count=num_shards))
  test_support.execute_until_empty(self.taskqueue)
  # BUG FIX: list.sort() returns None, so the original
  # assertEqual(expected_content.sort(), _memory_mapper_data.sort())
  # compared None to None and could never fail. Compare sorted copies.
  self.assertEqual(sorted(expected_content), sorted(_memory_mapper_data))
  self.assertEqual(job.SUCCESS, job.get_status())
  self.assertEqual(
      num_files - 1,
      job.get_counter(input_reader.GCSInputReader.COUNTER_FILE_READ))
  if multi_slices:
    ss = model.ShardState.find_all_by_mapreduce_state(job._state)
    for s in ss:
      self.assertTrue(s.slice_id > 0)
def testSplitInput_shardByFilters_lotsOfNS(self):
  """Lots means more than 2 in test cases."""
  entities = self._create_entities(range(12), {}, "f")
  self._set_vals(entities, list(range(6)), list(range(2)))
  entities = self._create_entities(range(12, 24), {}, "g")
  self._set_vals(entities, list(range(6)), list(range(2)))
  entities = self._create_entities(range(24, 36), {}, "h")
  self._set_vals(entities, list(range(6)), list(range(2)))
  # These entities all have a == 0, so the "a > 0" filter below must
  # exclude them from every shard.
  entities = self._create_entities(range(36, 48), {}, "h")
  self._set_vals(entities, [0] * 6, list(range(2)))
  params = {
      "entity_kind": self.entity_kind,
      "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)]
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=100)
  results = conf.input_reader_cls.split_input(conf)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(3, len(results))
  self._assertEquals_splitInput(results[0], ["3", "5", "7"])
  self._assertEquals_splitInput(results[1], ["15", "17", "19"])
  self._assertEquals_splitInput(results[2], ["27", "29", "31"])
def testUserProvidesJobID(self):
  """A caller-supplied job_id is kept verbatim on the config."""
  reader_params = {"foo": 1}
  config = map_job.JobConfig(
      job_name="foo",
      job_id="id",
      mapper=map_job.Mapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params=reader_params)
  self.assertEqual("id", config.job_id)
def setUp(self):
  """Build the shared job config used by the status tests."""
  super(MapJobStatusTest, self).setUp()
  reader_params = {"count": TEST_SAMPLE_INPUT_READER_COUNT}
  self.config = map_job.JobConfig(
      job_name="test_map",
      shard_count=1,
      mapper=map_job.Mapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params=reader_params)
def testValidate_WrongTypeNamespace(self):
  """Tests validate function rejects namespace of incorrect type."""
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params={
                                 "entity_kind": testutil.ENTITY_KIND,
                                 # An int is not a valid namespace value.
                                 "namespace": 5,
                             },
                             shard_count=1)
  self.assertRaises(errors.BadReaderParamsError,
                    self.reader_cls.validate,
                    config)
def testValidate(self):
  """SampleInputReader.validate rejects malformed reader params.

  The original repeated the same config-building block four times;
  iterate over (invalid params, reason) pairs instead.
  """
  bad_params = [
      # Some input reader params are required.
      ({}, "count is required"),
      # count must be an integer, not a string.
      ({"count": "1000"}, "count must be an integer"),
      # count must be a positive integer.
      ({"count": -1}, "count must be positive"),
      # string_length must be an integer, not a float.
      ({"count": 10, "string_length": 1.5}, "string_length must be an int"),
  ]
  for params, _reason in bad_params:
    conf = map_job.JobConfig(
        job_name="test_handler",
        mapper=map_job.Mapper,
        input_reader_cls=sample_input_reader.SampleInputReader,
        input_reader_params=params,
        shard_count=99)
    self.assertRaises(errors.BadReaderParamsError,
                      sample_input_reader.SampleInputReader.validate,
                      conf)
def testValidate_NoEntityFails(self):
  """Test validate function raises exception with no entity parameter."""
  # An empty parameter dict is missing the mandatory entity kind.
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params={},
                             shard_count=1)
  self.assertRaises(errors.BadReaderParamsError,
                    self.reader_cls.validate,
                    config)
def testValidate_Passes(self):
  """Test validate function accepts valid parameters."""
  config = map_job.JobConfig(
      job_name=self.TEST_JOB_NAME,
      mapper=map_job.Mapper,
      input_reader_cls=self.reader_cls,
      input_reader_params={"entity_kind": testutil.ENTITY_KIND},
      shard_count=1)
  # Must not raise.
  self.reader_cls.validate(config)
def testValidate_EntityKindWithNoModel(self):
  """Test validate function with bad entity kind.

  This reader accepts a kind with no model class, so validate() must
  succeed rather than raise.
  """
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params={"entity_kind": "foo"},
                             shard_count=1)
  self.reader_cls.validate(config)
def testSplitInput_noEntity(self):
  """split_input returns None when the kind has no entities."""
  params = {
      "entity_kind": self.entity_kind,
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=1)
  results = self.reader_cls.split_input(conf)
  # assertEquals(None, x) used a deprecated alias; assertIsNone states
  # the intent directly.
  self.assertIsNone(results)
def testValidate_EntityKindWithNoModel(self):
  """Test validate function with no model."""
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params={"entity_kind": "foo"},
                             shard_count=1)
  # A kind that cannot be resolved to a model class must be rejected.
  self.assertRaises(errors.BadReaderParamsError,
                    config.input_reader_cls.validate,
                    config)
def testValidate_Filters(self):
  """Tests validating filters parameter."""
  params = {
      "entity_kind": self.entity_kind,
      "filters": [("a", "=", 1), ("b", "=", 2)],
  }
  new = datetime.datetime.now()
  old = new.replace(year=new.year - 1)
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=1)
  # Valid: equality filters.
  conf.input_reader_cls.validate(conf)
  # Valid: a range on a single property.
  conf.input_reader_params["filters"] = [["a", ">", 1], ["a", "<", 2]]
  conf.input_reader_cls.validate(conf)
  # Valid: a closed datetime range plus an equality filter.
  conf.input_reader_params["filters"] = [["datetime_property", ">", old],
                                         ["datetime_property", "<=", new],
                                         ["a", "=", 1]]
  conf.input_reader_cls.validate(conf)
  # Valid: a single equality filter.
  conf.input_reader_params["filters"] = [["a", "=", 1]]
  conf.input_reader_cls.validate(conf)
  # Invalid field c.
  conf.input_reader_params["filters"] = [("c", "=", 1)]
  self.assertRaises(errors.BadReaderParamsError,
                    conf.input_reader_cls.validate, conf)
  # Expect a range.
  conf.input_reader_params["filters"] = [("a", "<=", 1)]
  self.assertRaises(errors.BadReaderParamsError,
                    conf.input_reader_cls.validate, conf)
  # Value should be a datetime.
  conf.input_reader_params["filters"] = [
      ["datetime_property", ">", 1],
      ["datetime_property", "<=", datetime.datetime.now()]]
  self.assertRaises(errors.BadReaderParamsError,
                    conf.input_reader_cls.validate, conf)
  # Expect a closed range. FIX: the original assigned to the local
  # `params` dict here instead of conf.input_reader_params like every
  # other case above; if JobConfig copies its params the old code
  # silently re-tested the previous case. Mutate the config's own dict.
  conf.input_reader_params["filters"] = [["datetime_property", ">", new],
                                         ["datetime_property", "<=", old]]
  self.assertRaises(errors.BadReaderParamsError,
                    conf.input_reader_cls.validate, conf)
def testSmoke(self):
  """A minimal JobConfig exposes every field it was constructed with."""
  reader_params = {"foo": 1}
  config = map_job.JobConfig(
      job_name="foo",
      mapper=map_job.Mapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params=reader_params)
  self.assertEqual("foo", config.job_name)
  # A job id is auto-generated when the caller does not supply one.
  self.assertTrue(config.job_id)
  self.assertEqual(map_job.Mapper, config.mapper)
  self.assertEqual(sample_input_reader.SampleInputReader,
                   config.input_reader_cls)
  self.assertEqual({"foo": 1}, config.input_reader_params)
  # shard_count falls back to the library-wide default.
  self.assertEqual(parameters.config.SHARD_COUNT, config.shard_count)
def testEndToEnd(self):
  """Readers produced by split_input together yield all requested items."""
  conf = map_job.JobConfig(
      job_name="test_handler",
      mapper=map_job.Mapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params={"count": 1000},
      shard_count=99)
  readers = sample_input_reader.SampleInputReader.split_input(conf)
  # Count every record emitted across all shard readers; sum() replaces
  # the manual accumulator loop.
  total = sum(1 for reader in readers for _ in reader)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(1000, total)
def testSmoke(self):
  """Run a simple map job to completion and check it succeeds."""
  # Force handler to serialize on every call.
  parameters.config._SLICE_DURATION_SEC = 0
  conf = map_job.JobConfig(
      job_name="test_map",
      mapper=MyMapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params={"count": TEST_SAMPLE_INPUT_READER_COUNT},
      user_params={"foo": 1, "bar": 2})
  # Let the mapper compare the config it receives against this one.
  MyMapper.original_conf = conf
  map_job.Job.submit(conf)
  test_support.execute_until_empty(self.taskqueue)
  job = map_job.Job.get_job_by_id(conf.job_id)
  self.assertEqual(map_job.Job.SUCCESS, job.get_status())
def testSplitInput_moreThanOneUnevenNS(self):
  """Split entities spread unevenly across two namespaces."""
  self._create_entities(range(5), {"1": 1, "3": 3}, "1")
  self._create_entities(range(10, 13), {"11": 11}, "2")
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params={
                                 "entity_kind": self.entity_kind,
                             },
                             shard_count=4)
  shards = self.reader_cls.split_input(config)
  self.assertTrue(len(shards) >= 3)
  # Every created entity must appear in exactly one shard.
  expected_keys = ["0", "1", "2", "3", "4", "10", "11", "12"]
  self._assertEqualsForAllShards_splitInput(expected_keys, None, *shards)
def testSplitInput_withNs_moreShardThanScatter(self):
  """Requesting more shards than entities still covers every entity."""
  self._create_entities(range(3), {"1": 1}, "f")
  reader_params = {
      "entity_kind": self.entity_kind,
      "namespace": "f",
  }
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params=reader_params,
                             shard_count=4)
  shards = self.reader_cls.split_input(config)
  # Fewer shards than requested may come back, but at least two must.
  self.assertTrue(len(shards) >= 2)
  self._assertEqualsForAllShards_splitInput(["0", "1", "2"], None, *shards)
def setUp(self):
  """Build a job config exercising every customizable JobConfig knob."""
  super(MapJobStartTest, self).setUp()
  # Reset the hook-call log so each test observes only its own calls.
  TestHooks.enqueue_kickoff_task_calls = []
  self.config = map_job.JobConfig(
      # Job identity and input.
      job_name="test_map",
      shard_count=1,
      mapper=map_job.Mapper,
      input_reader_cls=sample_input_reader.SampleInputReader,
      input_reader_params={"count": TEST_SAMPLE_INPUT_READER_COUNT},
      # Non-default execution settings, to verify they are honored.
      queue_name="crazy-queue",
      _base_path="/mr_base",
      _force_writes=True,
      shard_max_attempts=5,
      _task_max_attempts=6,
      done_callback_url="www.google.com",
      _hooks_cls=TestHooks)
def testSplitInput_shardByFilters_noEntity(self):
  """Filter-based sharding with no entities still yields (empty) readers."""
  params = {
      "entity_kind": self.entity_kind,
      "namespace": "f",
      "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)]
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=100)
  results = conf.input_reader_cls.split_input(conf)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(3, len(results))
  # Every shard must be empty; loop instead of three identical asserts.
  for itr in results:
    self._assertEquals_splitInput(itr, [])
def testSplitInput_lotsOfNS(self):
  """With several namespaces, each shard gets one whole namespace."""
  self._create_entities(range(3), {"1": 1}, "9")
  self._create_entities(range(3, 6), {"4": 4}, "_")
  self._create_entities(range(6, 9), {"7": 7}, "a")
  params = {
      "entity_kind": self.entity_kind,
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=3)
  results = self.reader_cls.split_input(conf)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(3, len(results))
  self._assertEquals_splitInput(results[0], ["0", "1", "2"])
  self._assertEquals_splitInput(results[1], ["3", "4", "5"])
  self._assertEquals_splitInput(results[2], ["6", "7", "8"])
def testSplitInput_shardByFilters_withNs(self):
  """Filtered split within a single namespace."""
  entities = self._create_entities(range(12), {}, "f")
  self._set_vals(entities, list(range(6)), list(range(2)))
  params = {
      "entity_kind": self.entity_kind,
      "namespace": "f",
      "filters": [("a", ">", 0), ("a", "<=", 3), ("b", "=", 1)],
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=2)
  results = conf.input_reader_cls.split_input(conf)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(2, len(results))
  self._assertEquals_splitInput(results[0], ["3", "5"])
  self._assertEquals_splitInput(results[1], ["7"])
def testEntityKindWithDot(self):
  """Entity kinds containing a dot in the name split correctly."""
  self._create_entities(range(3), {"1": 1}, "", testutil.TestEntityWithDot)
  params = {
      "entity_kind": testutil.TestEntityWithDot.kind(),
      "namespace": "",
  }
  conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                           mapper=map_job.Mapper,
                           input_reader_cls=self.reader_cls,
                           input_reader_params=params,
                           shard_count=2)
  results = conf.input_reader_cls.split_input(conf)
  # assertEquals is a deprecated alias of assertEqual.
  self.assertEqual(2, len(results))
  self._assertEqualsForAllShards_splitInput(["0", "1", "2"], None, *results)
def create_job_config(self, num_shards=None, input_params=None):
  """Create a JobConfig using GCSInputReader.

  Args:
    num_shards: optionally specify the number of shards.
    input_params: parameters for the input reader.

  Returns:
    a JobConfig with default settings and specified input_params.
  """
  # Return the config directly instead of going through a local.
  return map_job.JobConfig(job_name="TestJob",
                           mapper=map_job.Mapper,
                           input_reader_cls=self.READER_CLS,
                           input_reader_params=input_params,
                           shard_count=num_shards)
def testSmoke(self):
  """End-to-end job with a GCS output writer; checks counters and output."""
  entity_count = 10
  # Force handler to serialize on every call.
  parameters.config._SLICE_DURATION_SEC = 0
  job = map_job.Job.submit(
      map_job.JobConfig(
          job_name="test_map",
          mapper=MyMapper,
          input_reader_cls=sample_input_reader.SampleInputReader,
          input_reader_params={"count": entity_count},
          output_writer_cls=output_writers.
          _GoogleCloudStorageOutputWriter,
          output_writer_params={"bucket_name": "bucket"}))
  test_support.execute_until_empty(self.taskqueue)
  # sum() replaces the manual accumulation loop.
  total = sum(m.processed for m in MyMapper.mappers.values())
  self.assertEqual(entity_count, total)
  # Verify counters.
  counters = dict(job.get_counters())
  self.assertEqual(counters["FOO_COUNTER"], 2 * entity_count)
  self.assertEqual(counters["BAR_COUNTER"], -1 * entity_count)
  self.assertEqual(counters["SLICES"], MyMapper.slices)
  # Verify outputs.
  files = output_writers._GoogleCloudStorageOutputWriter.get_filenames(
      job._state)
  # defaultdict(int) is the idiomatic zero-initialized counter
  # (replaces defaultdict(lambda: 0)).
  outputs = collections.defaultdict(int)
  expected = {
      "foo\n": entity_count,
      "bar\n": entity_count,
      "end_slice\n": MyMapper.slices,
      "begin_slice\n": MyMapper.slices
  }
  for fn in files:
    f = cloudstorage.open(fn)
    for line in f:
      outputs[line] += 1
  self.assertEqual(expected, outputs)
def testValidate_Filters(self):
  """Tests validating filters parameter."""
  now = datetime.datetime.now()
  a_year_ago = now.replace(year=now.year - 1)
  params = {
      "entity_kind": self.entity_kind,
      "filters": [("a", "=", 1), ("b", "=", 2)],
  }
  config = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params=params,
                             shard_count=1)
  # Equality-only filters are accepted.
  config.input_reader_cls.validate(config)
  # Only equality filters supported: a range must be rejected.
  params["filters"] = [["datetime_property", ">", a_year_ago],
                       ["datetime_property", "<=", now],
                       ["a", "=", 1]]
  self.assertRaises(errors.BadReaderParamsError,
                    config.input_reader_cls.validate, config)
def testRawEntityTypeFromOtherApp(self):
  """Test reading from other app."""
  OTHER_KIND = "bar"
  OTHER_APP = "foo"
  stub = apiproxy_stub_map.apiproxy.GetStub("datastore_v3")
  stub.SetTrusted(True)
  # FIX: restore the trusted flag in a finally block so a failing
  # assertion cannot leak elevated trust into later tests.
  try:
    expected_keys = [str(i) for i in range(10)]
    for k in expected_keys:
      datastore.Put(datastore.Entity(OTHER_KIND, name=k, _app=OTHER_APP))
    params = {
        "entity_kind": OTHER_KIND,
        "_app": OTHER_APP,
    }
    conf = map_job.JobConfig(job_name=self.TEST_JOB_NAME,
                             mapper=map_job.Mapper,
                             input_reader_cls=self.reader_cls,
                             input_reader_params=params,
                             shard_count=1)
    itr = conf.input_reader_cls.split_input(conf)[0]
    self._assertEquals_splitInput(itr, expected_keys)
  finally:
    stub.SetTrusted(False)