def test_split_input(self):
    """split_input must size its shard count from batch_size and dataSet.

    Three scenarios are checked: a small batch, a batch equal to the
    bigger half of the dataSet, and a batch larger than the whole
    dataSet.  The expected-count comments imply len(self.dataSet) == 6;
    TODO confirm against the fixture's setUp.
    """
    SHARD_COUNT = 10
    BATCH_SIZE = 2
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        {
            "input_reader": {
                "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                "batch_size": BATCH_SIZE,
            }
        },
        SHARD_COUNT)

    def num_expected():
        # Closes over BATCH_SIZE, so it tracks the reassignments below.
        batch_size = min(len(self.dataSet), BATCH_SIZE)
        # Explicit floor division.  The original wrote
        # abs(len(...) / batch_size), where abs() was a no-op on
        # non-negative operands and `/` truncated only under Python 2;
        # `//` is equivalent here and portable.
        free_division = len(self.dataSet) // batch_size
        return min(free_division, SHARD_COUNT)

    ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
    self.assertEqual(3, num_expected())  # 1-3, 3-5, 5-None
    self.assertEqual(3, len(ds_input_readers))

    # batch_size = dataSet bigger half
    BATCH_SIZE = int(math.ceil(len(self.dataSet) / 2.0))
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        {
            "input_reader": {
                "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                "batch_size": BATCH_SIZE,
            }
        },
        SHARD_COUNT)
    ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
    self.assertEqual(2, num_expected())  # 1-4, 4-None
    self.assertEqual(2, len(ds_input_readers))

    # batch_size > dataSet itself
    BATCH_SIZE = len(self.dataSet) * 2
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        {
            "input_reader": {
                "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
                "batch_size": BATCH_SIZE,
            }
        },
        SHARD_COUNT)
    ds_input_readers = DatastoreQueryInputReader.split_input(mapper_spec)
    self.assertEqual(1, num_expected())  # 1-None
    self.assertEqual(1, len(ds_input_readers))
def test_with_filter_factory(self):
    """Readers built with a parametrized filter factory yield only
    the entities the factory's filter matches (here: type == "B")."""
    SHARD_COUNT = 10
    FF_PATH = (
        "test_mapreduce_utils.DatastoreQueryInputReaderTest."
        "simple_parametrized_filter_factory")
    params = {
        "input_reader": {
            "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
            "filter_factory_spec": {
                "name": FF_PATH,
                "args": ["B"]
            }
        }
    }
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        params,
        SHARD_COUNT)

    # Drain every reader into one flat list of entities.
    got = []
    for reader in DatastoreQueryInputReader.split_input(mapper_spec):
        got.extend(reader)
    self.assertEqual(2, len(got))

    # The two dataSet rows of type "B" must match the read entities,
    # compared in name order.
    expected = [row for row in self.dataSet if row['type'] == "B"]
    got.sort(key=operator.attrgetter('name'))
    for row, entity in zip(expected, got):
        self.assertDictEqual(row, db.to_dict(entity))
def test_world(self):
    """With no batch/filter params, the split readers together must
    yield every entity in the dataSet exactly once."""
    SHARD_COUNT = 10
    params = {
        "input_reader": {
            "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
        }
    }
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        params,
        SHARD_COUNT)

    # Flatten all readers and compare total entity count.
    got = []
    for reader in DatastoreQueryInputReader.split_input(mapper_spec):
        got.extend(reader)
    self.assertEqual(len(self.dataSet), len(got))
def test_with_query_filters(self):
    """A plain datastore equality filter ("type" == "C") restricts the
    readers' output to the matching entities."""
    SHARD_COUNT = 10
    params = {
        "input_reader": {
            "entity_kind": self.TEST_ENTITY_IMPORT_PATH,
            "filters": [("type", "=", "C")],
        }
    }
    mapper_spec = model.MapperSpec(
        "FooHandler",
        "mapreduce_utils.DatastoreQueryInputReader",
        params,
        SHARD_COUNT)

    # Drain every reader into one flat list of entities.
    got = []
    for reader in DatastoreQueryInputReader.split_input(mapper_spec):
        got.extend(reader)
    self.assertEqual(3, len(got))

    # The three dataSet rows of type "C" must match the read entities,
    # compared in name order.
    expected = [row for row in self.dataSet if row['type'] == "C"]
    got.sort(key=operator.attrgetter('name'))
    for row, entity in zip(expected, got):
        self.assertDictEqual(row, db.to_dict(entity))