Example #1
    def run(self,
            job_name,
            reducer_spec,
            output_writer_spec,
            params,
            bucket_name,
            filenames,
            combiner_spec=None,
            shards=None):
        filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                       filenames))
        new_params = dict(params or {})
        new_params.update({
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": filenames_only,
            }
        })
        if combiner_spec:
            new_params.update({
                "combiner_spec": combiner_spec,
            })

        # TODO(user): Test this
        if shards is None:
            shards = len(filenames)

        yield mapper_pipeline.MapperPipeline(job_name + "-reduce",
                                             reducer_spec,
                                             __name__ + "._ReducerReader",
                                             output_writer_spec,
                                             new_params,
                                             shards=shards)
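
A minimal sketch of the reduce side, assuming reducer_spec above names a standard appengine-mapreduce reduce function: it is called once per key with the values grouped by the shuffle stage, and whatever it yields goes to the configured output writer. The name and summing logic here are hypothetical.

def sample_sum_reduce(key, values):
    # Called once per key with its grouped values; yielded strings are
    # handed to the output writer named by output_writer_spec.
    total = sum(int(v) for v in values)
    yield "%s: %d\n" % (key, total)
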
Example #2
  def testShardRetry(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
      TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    outputs = []
    for output in TestOutputEntity.all():
      outputs.append(int(output.data))
    outputs.sort()

    expected_outputs = [i for i in range(entity_count)]
    expected_outputs.sort()
    self.assertEquals(expected_outputs, outputs)
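
test_shard_retry_map is defined elsewhere in the test module. For the final assertions to hold it only has to record every input entity; a minimal sketch of that part (the shard-retry bookkeeping the real handler does via RetryCount is omitted here):

def test_shard_retry_map(entity):
    # Record each mapped entity so the test can compare the collected
    # TestOutputEntity data against expected_outputs.
    TestOutputEntity(data=entity.data).put()
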
Example #3
    def testProcessEntites(self):
        """Test empty mapper over non-empty dataset."""
        for _ in range(100):
            TestEntity().put()

        p = mapper_pipeline.MapperPipeline(
            "empty_map",
            handler_spec=__name__ + ".test_empty_handler",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            params={
                "entity_kind": __name__ + ".TestEntity",
            },
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
        self.assertTrue(p.outputs.job_id.value)

        counters = p.outputs.counters.value
        self.assertTrue(counters)
        self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
        self.assertEquals(100, counters[context.COUNTER_MAPPER_CALLS])
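
An appengine-mapreduce map handler is simply a function that receives one input item per call, so the empty handler referenced above can be as small as:

def test_empty_handler(entity):
    # Deliberately does nothing; the test only inspects the job counters.
    pass
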
Example #4
  def testShardRetryTooMany(self):
    entity_count = 200
    db.delete(TestOutputEntity.all())
    db.delete(RetryCount.all())

    for i in range(entity_count):
      TestEntity(data=str(i)).put()

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_shard_retry_too_many_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.max_attempts = 1
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    state = model.MapreduceState.all().get()
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline aborted:"))
Example #5
 def run(self):
   yield mapper_pipeline.MapperPipeline(
       'clean_up_old_exports',
       pipelines.FullName(CleanUpOldExportsMap),
       'mapreduce.input_readers.DatastoreInputReader',
       params={'entity_kind': pipelines.FullName(ExportRatingsResult)},
       shards=pipelines.DEFAULT_SHARDS)
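
CleanUpOldExportsMap is application code not shown here; a hypothetical sketch of such a cleanup handler using the mapreduce mutation-pool operation API:

from mapreduce import operation as op

def CleanUpOldExportsMap(entity):
    # Hypothetical: delete each ExportRatingsResult handed to the mapper;
    # op.db.Delete batches the delete through the mapreduce mutation pool.
    yield op.db.Delete(entity)
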
Example #6
  def testFailedMap(self):
    for i in range(1):
      TestEntity(data=str(i)).put()

    pipeline.pipeline._DEFAULT_MAX_ATTEMPTS = 1

    p = mapper_pipeline.MapperPipeline(
        "test",
        handler_spec=__name__ + ".test_fail_map",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
        },
        shards=5)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    self.assertTrue(p.was_aborted)

    self.assertTrue(p.outputs.job_id.filled)
    state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
    self.assertEqual(model.MapreduceState.RESULT_FAILED, state.result_status)
    self.assertFalse(p.outputs.result_status.filled)
    self.assertFalse(p.outputs.default.filled)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline aborted:"))
Example #7
 def run(self, fbl_json, filters):
     bucket_name = 'dancedeets-hrd.appspot.com'
     params = {
         'entity_kind': 'events.eventdata.DBEvent',
         'filters': filters,
         'handle_batch_size': 20,
         'output_writer': {
             'bucket_name': bucket_name,
             'content_type': 'text/plain',
         }
     }
     params.update(fbl_json)
     # This should use cache, so we can go faster
     find_events_needing_access_tokens = (
         yield mapper_pipeline.MapperPipeline(
             'Find valid events needing access_tokens',
             'events.find_access_tokens.map_events_needing_access_tokens',
             'mapreduce.input_readers.DatastoreInputReader',
             'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
             params=params,
             shards=10,
         ))
     # This will be a single shard
     single_file_of_fb_events = yield CombinerPipeline(
         "Combine event lists into single file", bucket_name,
         find_events_needing_access_tokens)
     # This will use more shards
     yield PassFileToAccessTokenFinder(single_file_of_fb_events)
Example #8
    def run(self,
            job_name,
            reducer_spec,
            output_writer_spec,
            params,
            filenames,
            combiner_spec=None,
            shards=None):
        new_params = dict(params or {})
        new_params.update({"files": filenames})
        if combiner_spec:
            new_params.update({
                "combiner_spec": combiner_spec,
            })

        # TODO(user): Test this
        if shards is None:
            shards = len(filenames)

        yield mapper_pipeline.MapperPipeline(job_name + "-reduce",
                                             reducer_spec,
                                             __name__ + "._ReducerReader",
                                             output_writer_spec,
                                             new_params,
                                             shards=shards)
Example #9
 def run(self, job_name, reducer_spec, output_writer_spec, params,
         filenames):
     new_params = dict(params or {})
     new_params.update({"files": filenames})
     yield mapper_pipeline.MapperPipeline(
         job_name + "-reduce", reducer_spec,
         shuffler.__name__ + "._MergingReader", output_writer_spec,
         new_params)
Example #10
 def run(self, job_name, filenames):
   yield mapper_pipeline.MapperPipeline(
           job_name + "-shuffle-hash",
           __name__ + "._hashing_map",
           input_readers.__name__ + ".RecordsReader",
           output_writer_spec= __name__ + "._HashingBlobstoreOutputWriter",
           params={'files': filenames},
           shards=len(filenames))
Example #11
 def run(self, job_name, filenames):
   yield mapper_pipeline.MapperPipeline(
           job_name + "-shuffle-merge",
           __name__ + "._merge_map",
           __name__ + "._MergingReader",
           output_writer_spec=
               output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
           params={'files': filenames},
           shards=len(filenames))
Example #12
 def run(self, namespace, job_name, sequence_num, cleanup_params,
         job_runner_args):
     self._started(namespace, job_name, sequence_num)
     yield mapper_pipeline.MapperPipeline(
         job_name=job_name,
         handler_spec=
         'modules.analytics.filters.PreCleanMapReduceJobPipeline.map',
         input_reader_spec='mapreduce.input_readers.DatastoreInputReader',
         params=cleanup_params)
     yield jobs.MapReduceJobRunner(**job_runner_args)
Example #13
 def run(self, name, entity_type, map_fn):
     yield mapper_pipeline.MapperPipeline(
         name,
         map_fn,
         'mapreduce.input_readers.DatastoreInputReader',
         params={
             'entity_kind': entity_type,
             'start_datetime': SerializeDatetime(datetime.now()),
         },
         shards=DEFAULT_SHARDS)
Example #14
def CreateCleanupPipeline(model_class, start_datetime):
    return mapper_pipeline.MapperPipeline(
        'cleanup',
        FullName(CleanupMap),
        'mapreduce.input_readers.DatastoreInputReader',
        params={
            'entity_kind': FullName(model_class),
            'start_datetime': SerializeDatetime(start_datetime)
        },
        shards=DEFAULT_SHARDS)
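
The two examples above rely on small project-level helpers. Hypothetical implementations, inferred from how they are called (mapper params must be JSON-serializable, and handlers and entity kinds are referenced by fully qualified dotted names):

DEFAULT_SHARDS = 8  # assumed value; the real constant lives in the callers' module

def SerializeDatetime(dt):
    # Mapper params are JSON-serialized, so datetimes are passed as strings.
    return dt.strftime("%Y-%m-%d %H:%M:%S.%f") if dt else None

def FullName(obj):
    # Build the dotted "module.Name" reference expected by the mapreduce library.
    return "%s.%s" % (obj.__module__, obj.__name__)
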
Example #15
 def run(self, blob_keys, blob_sizes, shards):
     yield mapper_pipeline.MapperPipeline(
         "import_data_mapper",
         "pipeline.insert_data",
         "mapreduce.input_readers.BlobstoreLineInputReader",
         params={
             "blob_keys": blob_keys,
             "blob_sizes": blob_sizes,
         },
         shards=shards)
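
BlobstoreLineInputReader calls the handler with (byte_offset, line) tuples, one per line of the uploaded blobs. A hedged sketch of what the "pipeline.insert_data" handler referenced above might look like (ImportedRow is a hypothetical model):

def insert_data(line_data):
    _offset, line = line_data
    fields = line.rstrip("\n").split(",")
    # Hypothetical: store one datastore entity per parsed CSV line.
    ImportedRow(values=fields).put()
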
Example #16
 def run(self, job_name, filenames):
   yield mapper_pipeline.MapperPipeline(
       job_name + "-shuffle-merge",
       __name__ + "._merge_map",
       __name__ + "._MergingReader",
       output_writer_spec=
       output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
       params={
         _MergingReader.FILES_PARAM: filenames,
         _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT,
         _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE,
         },
       shards=len(filenames))
Example #17
 def run(self, filenames):
     mapper = yield mapper_pipeline.MapperPipeline(
         "sort",
         __name__ + "._sort_records",
         __name__ + "._BatchRecordsReader",
         None, {
             "files": filenames,
             "processing_rate": 1000000,
         },
         shards=1)
     # TODO(user): delete _OutputFile entities after collect
     with pipeline.After(mapper):
         yield _CollectOutputFiles(mapper.job_id)
Example #18
 def run(self, job_name, bucket_name, filenames):
     yield mapper_pipeline.MapperPipeline(
         job_name + "-shuffle-merge",
         __name__ + "._merge_map",
         __name__ + "._MergingReader",
         output_writer_spec=output_writers.__name__ +
         "._GoogleCloudStorageRecordOutputWriter",
         params={
             _MergingReader.FILES_PARAM: filenames,
             _MergingReader.MAX_VALUES_COUNT_PARAM: self._MAX_VALUES_COUNT,
             _MergingReader.MAX_VALUES_SIZE_PARAM: self._MAX_VALUES_SIZE,
             "output_writer": {
                 "bucket_name": bucket_name,
             },
         },
         shards=len(filenames))
Example #19
def DjangoModelMap(
        model,
        mapper_func,
        keys_only=False,
        output_writer="mapreduce.output_writers.BlobstoreOutputWriter",
        params=None):
    """
    A simple wrapper function for running a mapper function over Django model instances.

    Args:
        model:  A Django model class
        mapper: A top-level function that takes a single argument,
            and yields zero or many two-tuples strings
        keys_only: Selects which input reader to use
            if True, then we use 'mapreduce.input_readers.DatastoreKeyInputReader',
            if False, then 'djangoappengine.mapreduce.input_readers.DjangoModelInputReader',
            defaults to False
        params: An optional dictionary of values to pass to the Mapper
    """

    if keys_only:
        input_reader_spec = "mapreduce.input_readers.DatastoreKeyInputReader"
        mapper_params = {
            "entity_kind": model._meta.db_table,
            "mime_type": "text/plain"
        }
    else:
        input_reader_spec = "djangoappengine.mapreduce.input_readers.DjangoModelInputReader"
        mapper_params = {
            "entity_kind": _convert_model_to_string(model),
            "mime_type": "text/plain"
        }

    if params:
        mapper_params.update(params)

    mapper_spec = _convert_func_to_string(mapper_func)

    return mapper_pipeline.MapperPipeline(
        "%s-%s-mapper" % (model._meta.object_name, mapper_spec),
        mapper_spec,
        input_reader_spec,
        output_writer_spec=output_writer,
        params=mapper_params)
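
A hedged usage sketch: because DjangoModelMap returns a MapperPipeline, it is yielded from a parent pipeline like any other child stage. MyModel and word_count_map are hypothetical placeholders.

from mapreduce import base_handler

class MyModelMapJob(base_handler.PipelineBase):
    def run(self):
        yield DjangoModelMap(MyModel, word_count_map,
                             params={"mime_type": "text/plain"})
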
Example #20
    def testCleanup_ListOfLists(self):
        """Tests cleaning up a list of file lists."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run map
        p = mapper_pipeline.MapperPipeline(
            "test",
            handler_spec=__name__ + ".test_map",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".KeyValueBlobstoreOutputWriter",
            params={
                "input_reader": {
                    "entity_kind": __name__ + ".TestEntity",
                },
            },
        )
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        finished_map = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)

        # Can open files
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        for name in file_list:
            files.open(name, "r").read(0)

        grouped_list = [file_list]

        # Cleanup
        cleanup = mapper_pipeline._CleanupPipeline(grouped_list)
        cleanup.start()
        test_support.execute_until_empty(self.taskqueue)

        # Cannot open files
        for name in file_list:
            self.assertRaises(files.Error, files.open, name, "r")
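
A plausible sketch of the test_map handler used above: with a key-value output writer the mapper yields (key, value) pairs, one per input entity.

def test_map(entity):
    # The entity's payload becomes the key; the value is left empty.
    yield (entity.data, "")
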
Example #21
 def run(self, job_name, bucket_name, filenames):
     filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                    filenames))
     params = {
         "output_writer": {
             "bucket_name": bucket_name,
             "content_type": "text/plain",
         },
         "input_reader": {
             "bucket_name": bucket_name,
             "objects": filenames_only,
         }
     }
     yield mapper_pipeline.MapperPipeline(
         job_name + "-combine",
         'events.find_access_tokens.file_identity',
         'mapreduce.input_readers.GoogleCloudStorageInputReader',
         'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
         params,
         shards=1)
Example #22
 def run(self, job_name, bucket_name, filenames, shards=None):
     filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                    filenames))
     if shards is None:
         shards = len(filenames)
     yield mapper_pipeline.MapperPipeline(
         job_name + "-shuffle-hash",
         __name__ + "._hashing_map",
         input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
         output_writer_spec=__name__ + "._HashingGCSOutputWriter",
         params={
             "input_reader": {
                 "bucket_name": bucket_name,
                 "objects": filenames_only,
             },
             "output_writer": {
                 "bucket_name": bucket_name,
             },
         },
         shards=shards)
Example #23
 def run(self, job_name, filenames):
     sort_mappers = []
     for i in range(len(filenames)):
         filename = filenames[i]
         sort_mapper = yield mapper_pipeline.MapperPipeline(
             "%s-shuffle-sort-%s" % (job_name, str(i)),
             __name__ + "._sort_records_map",
             __name__ + "._BatchRecordsReader",
             None, {
                 "files": [filename],
                 "processing_rate": 1000000,
             },
             shards=1)
         sort_mappers.append(sort_mapper)
     with pipeline.After(*sort_mappers):
         job_ids = yield pipeline_common.Append(
             *[mapper.job_id for mapper in sort_mappers])
         result = yield _CollectOutputFiles(job_ids)
         with pipeline.After(result):
             yield _CleanupOutputFiles(job_ids)
         yield pipeline_common.Return(result)
Example #24
  def run(self,
          job_name,
          reducer_spec,
          output_writer_spec,
          params,
          filenames,
          combiner_spec=None):
    new_params = dict(params or {})
    new_params.update({
        "files": filenames
        })
    if combiner_spec:
      new_params.update({
          "combiner_spec": combiner_spec,
          })

    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        new_params)
Example #25
 def run(self, job_name, bucket_name, filenames):
     sort_mappers = []
     for i in range(len(filenames)):
         filenames_only = util.strip_prefix_from_items(
             "/%s/" % bucket_name, filenames[i])
         sort_mapper = yield mapper_pipeline.MapperPipeline(
             "%s-shuffle-sort-%s" % (job_name, str(i)),
             __name__ + "._sort_records_map",
             __name__ + "._BatchGCSRecordsReader",
             None, {
                 "input_reader": {
                     "bucket_name": bucket_name,
                     "objects": filenames_only,
                 },
             },
             shards=1)
         sort_mappers.append(sort_mapper)
     with pipeline.After(*sort_mappers):
         job_ids = yield pipeline_common.Append(
             *[mapper.job_id for mapper in sort_mappers])
         result = yield _CollectOutputFiles(job_ids)
         with pipeline.After(result):
             yield _CleanupOutputFiles(job_ids)
         yield pipeline_common.Return(result)
Example #26
  def testEmptyMapper(self):
    """Test empty mapper over empty dataset."""
    p = mapper_pipeline.MapperPipeline(
        "empty_map",
        handler_spec=__name__ + ".test_empty_handler",
        input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
        params={
            "input_reader": {
                "entity_kind": __name__ + ".TestEntity",
                # Test datetime can be json serialized.
                "filters": [("dt", "=", datetime.datetime(2000, 1, 1))],
                },
            },
        )
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    self.assertEquals(1, len(self.emails))
    self.assertTrue(self.emails[0][1].startswith(
        "Pipeline successful:"))

    p = mapper_pipeline.MapperPipeline.from_id(p.pipeline_id)
    # Verify outputs.
    # Counter output
    counters = p.outputs.counters.value
    self.assertTrue(counters)
    self.assertTrue(context.COUNTER_MAPPER_WALLTIME_MS in counters)
    # Default output.
    self.assertEqual([], p.outputs.default.value)
    # Job id output.
    self.assertTrue(p.outputs.job_id.filled)
    state = model.MapreduceState.get_by_job_id(p.outputs.job_id.value)
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS, state.result_status)
    # Result status output.
    self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                     p.outputs.result_status.value)