def run(self, bucket_name, now, input_files, version_id, parent_params=None):
    logging.info('======= Starting Metrics Pipeline')

    mapper_params = {
        'input_reader': {
            GCSInputReader.BUCKET_NAME_PARAM: bucket_name,
            GCSInputReader.OBJECT_NAMES_PARAM: input_files
        }
    }

    if parent_params:
      mapper_params.update(parent_params)

    num_shards = mapper_params[_NUM_SHARDS]
    # Chain together three map reduces; see module comments
    blob_key_1 = (yield mapreduce_pipeline.MapreducePipeline(
        'Process Input CSV',
        mapper_spec='offline.metrics_pipeline.map_csv_to_participant_and_date_metric',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=mapper_params,
        reducer_spec='offline.metrics_pipeline.reduce_participant_data_to_hpo_metric_date_deltas',
        reducer_params={
            'now': now,
            'output_writer': {
                'bucket_name': bucket_name,
                'content_type': 'text/plain'
            }
        },
        shards=num_shards))

    blob_key_2 = (yield mapreduce_pipeline.MapreducePipeline(
        'Calculate Counts',
        mapper_spec='offline.metrics_pipeline.map_hpo_metric_date_deltas_to_hpo_metric_key',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=(yield BlobKeys(bucket_name, blob_key_1, now, version_id)),
        combiner_spec='offline.metrics_pipeline.combine_hpo_metric_date_deltas',
        reducer_spec='offline.metrics_pipeline.reduce_hpo_metric_date_deltas_to_all_date_counts',
        reducer_params={
            'now': now,
            'output_writer': {
                'bucket_name': bucket_name,
                'content_type': 'text/plain',
            }
        },
        shards=num_shards))
    # TODO(danrodney):
    # We need to find a way to delete data written above (DA-167)
    yield mapreduce_pipeline.MapreducePipeline(
        'Write Metrics',
        mapper_spec='offline.metrics_pipeline.map_hpo_metric_date_counts_to_hpo_date_key',
        input_reader_spec='mapreduce.input_readers.GoogleCloudStorageInputReader',
        mapper_params=(yield BlobKeys(bucket_name, blob_key_2, now, version_id)),
        reducer_spec='offline.metrics_pipeline.reduce_hpo_date_metric_counts_to_database_buckets',
        reducer_params={
            'version_id': version_id
        },
        shards=num_shards)
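The three jobs above are chained through a BlobKeys helper pipeline that turns the GCS output filenames of one job into the mapper_params of the next. Its implementation is not shown here; the sketch below is only an assumed shape for such a helper (the real offline.metrics_pipeline version presumably also threads now and version_id into the params):

# Hypothetical sketch of the BlobKeys helper referenced above; the actual
# implementation in offline.metrics_pipeline may differ.
from mapreduce import base_handler


class BlobKeys(base_handler.PipelineBase):
    """Converts one job's GCS output paths into mapper_params for the next job."""

    def run(self, bucket_name, blob_keys, now, version_id):
        # Output writers report paths of the form '/<bucket>/<object>'; the GCS
        # input reader wants bare object names plus the bucket name.
        prefix = '/%s/' % bucket_name
        objects = [key[len(prefix):] if key.startswith(prefix) else key
                   for key in blob_keys]
        return {
            'input_reader': {
                'bucket_name': bucket_name,
                'objects': objects,
            },
            # 'now' and 'version_id' would be folded into the params here in
            # the real pipeline.
        }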
Example #2
    def run(self, filekey, blobkey):
        bucket_name = app_identity.get_default_gcs_bucket_name()
        combine_purchase_key = yield mapreduce_pipeline.MapreducePipeline(
                "combine_purchase",
                "main.combine_purchase_map",
                "main.combine_purchase_reduce",
                "mapreduce.input_readers.BlobstoreZipInputReader",
                "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
                mapper_params={
                        "blob_key": blobkey,
                },
                reducer_params={
                        "output_writer": {
                                "bucket_name": bucket_name,
                                "content_type": "text/plain",
                        }
                },
                shards=16)

        song_pairs = yield mapreduce_pipeline.MapreducePipeline(
                "common_purchase",
                "main.common_purchase_map",
                "main.common_purchase_reduce",
                "mapreduce.input_readers.GoogleCloudStorageInputReader",
                "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
                # Pass output from first job as input to second job
                mapper_params= (yield GCSMapperParams(combine_purchase_key)),
                reducer_params={
                        "output_writer": {
                                "bucket_name": bucket_name,
                                "content_type": "text/plain",
                        }
                },
                shards=16)

        most_purchased = yield mapreduce_pipeline.MapreducePipeline(
                "find_most_common",
                "main.find_most_common_map",
                "main.find_most_common_reduce",
                "mapreduce.input_readers.GoogleCloudStorageInputReader",
                "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
                # Pass output from second job as input to third job
                mapper_params= (yield GCSMapperParams(song_pairs)),
                reducer_params={
                        "output_writer": {
                                "bucket_name": bucket_name,
                                "content_type": "text/plain",
                        }
                },

                shards=16)

        yield StoreOutput("find_most_common", filekey, most_purchased)
Example #3
    def testFailedMapReduce(self):
        # Add some random data.
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_failed_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(output_writers.__name__ +
                                ".BlobstoreRecordsOutputWriter"),
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_FAILED,
                         p.outputs.result_status.value)
        self.assertEqual(0, len(p.outputs.default.value))
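test_failed_map, referenced by the mapper_spec above, is what drives the job to RESULT_FAILED. It is defined elsewhere in the test module; a sketch consistent with that outcome (an assumption, not the verbatim test code) is a mapper that raises the library's FailJobError, which fails the whole job instead of retrying the slice:

# Assumed shape of the failing mapper used by testFailedMapReduce.
from mapreduce import errors


def test_failed_map(entity):
    # Raising FailJobError aborts the job, so result_status becomes
    # RESULT_FAILED and no reducer output is produced.
    raise errors.FailJobError()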
Example #4
    def run(self, region, include_conditional_violations):
        """
        Yields a recidivism calculation MapReduce pipeline to be started.
        :param region: a specific region to calculate recidivism for. Calculates for
        all regions if this is None.
        :param include_conditional_violations: whether or not to include violations
        of conditional release in recidivism calculations (split out into separate
        metrics)
        :return: yields up a MapReduce pipeline to start
        """
        mapper_params = {
            "entity_kind": "models.inmate.Inmate",
            "include_conditional_violations": include_conditional_violations
        }

        if region:
            mapper_params["filters"] = [("region", "=", region)]

        yield mapreduce_pipeline.MapreducePipeline(
            "Calculate recidivism across various dimensions",
            input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
            mapper_spec="calculator.pipeline.map_inmate",
            mapper_params=mapper_params,
            reducer_spec="calculator.pipeline.reduce_recidivism_events",
            shards=64)
Example #5
    def run(self, filename, blobkey, ds_key):
        params = "filename %s \tblobkey %s\tds_key %s" % (filename, blobkey,
                                                          ds_key)
        logging.info(params)

        dataset = ndb.Key(urlsafe=ds_key).get()
        rows = dataset.rows
        hashes = rows * dataset.bands
        if len(dataset.random_seeds) != hashes:
            dataset.random_seeds = [
                random.getrandbits(max_bits) for _ in xrange(hashes)
            ]
            logging.warning('Recalculated %d random seeds', hashes)
            dataset.put()

        dataset.buckets = []
        dataset.put()
        output = yield mapreduce_pipeline.MapreducePipeline(
            "locality_sensitive_hashing",
            "blobs.lsh_map",
            "blobs.lsh_bucket",
            'mapreduce.input_readers.BlobstoreZipLineInputReader',
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params={
                "blob_keys": blobkey,
            },
            reducer_params={
                "mime_type": "text/plain",
            },
            shards=16)
        yield StoreLshResults('OpenLSH', blobkey, ds_key, output)
Example #6
    def run(self, job_id, job_class_str, kwargs):
        # Disabling 4 space indentation checker for this docstring because this
        # "Yields:" section yields 2 objects and the Yields/Returns are
        # generally supposed to only yield 1 object which messes up the
        # indentation checking. This is the only case of this happening.
        """Returns a coroutine which runs the job pipeline and stores results.

        Args:
            job_id: str. The ID of the job to run.
            job_class_str: str. Should uniquely identify each type of job.
            kwargs: dict(str : object). Extra arguments used to build the
                MapreducePipeline.

        Yields:
            MapreducePipeline. Ready to start processing. Expects the output of
            that pipeline to be sent back.
            StoreMapReduceResults. Will be constructed with whatever output the
            caller sends back to the coroutine.
        """

        job_class = mapreduce_util.for_name(job_class_str)
        job_class.register_start(job_id, metadata={
            job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id  # pylint: disable=protected-access
        })

        # TODO(sll): Need try/except/mark-as-canceled here?
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_id, job_class_str, output)
Example #7
def mapreduce_scrape_sources_and_process_events(fbl, min_potential_events,
                                                queue):
    mapper_params = {
        'entity_kind': 'event_scraper.thing_db.Source',
        'min_potential_events': min_potential_events,
        'handle_batch_size': 20,
    }
    reducer_params = {
        'output_writer': {
            'bucket_name': 'dancedeets-hrd.appspot.com',
            'content_type': 'text/plain',
        }
    }
    fb_params = fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True)
    mapper_params.update(fb_params)
    reducer_params.update(fb_params)

    # output = yield ...
    pipeline = mapreduce_pipeline.MapreducePipeline(
        'Scrape sources, then load and classify the events',
        'event_scraper.thing_scraper2.scrape_sources_for_events',
        'event_scraper.thing_scraper2.process_events',
        'mapreduce.input_readers.DatastoreInputReader',
        'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
        mapper_params=mapper_params,
        reducer_params=reducer_params,
        shards=16,
    )

    pipeline.start(queue_name=queue)
Example #8
    def testSmoke(self):
        """Test all handlers still works.

    This test doesn't care about the integrity of the job outputs.
    Just that things works under webapp2 framework.
    """
        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".TestMapreduceMap",
            __name__ + ".TestMapreduceReduce",
            input_reader_spec=input_readers.__name__ +
            ".RandomStringInputReader",
            output_writer_spec=(output_writers.__name__ +
                                "._GoogleCloudStorageRecordOutputWriter"),
            mapper_params={
                "input_reader": {
                    "count": 100
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": "test"
                },
            },
            shards=3)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        # Verify output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
Example #9
def launch_job(job_id):
    """Launches a job given its key from MAPREDUCE_JOBS dict."""
    assert job_id in MAPREDUCE_JOBS, 'Unknown mapreduce job id %s' % job_id
    job_def = MAPREDUCE_JOBS[job_id].copy()
    # 256 shards helps get things done faster, but it is very easy to burn
    # thousands of dollars within a few hours. Don't forget to update queue.yaml
    # accordingly.
    job_def.setdefault('shards', 128)
    job_def.setdefault('input_reader_spec',
                       'mapreduce.input_readers.DatastoreInputReader')
    job_def['mapper_params'] = job_def['mapper_params'].copy()
    job_def['mapper_params'].setdefault(
        'bucket_name', app_identity.get_default_gcs_bucket_name())

    if 'reducer_spec' in job_def:
        logging.info('Starting mapreduce job')
        pipeline = mapreduce_pipeline.MapreducePipeline(**job_def)
    else:
        logging.info('Starting mapper-only job')
        job_def['params'] = job_def.pop('mapper_params')
        pipeline = mapreduce_pipeline.MapPipeline(**job_def)

    pipeline.start(base_path=MAPREDUCE_PIPELINE_BASE_PATH,
                   queue_name=MAPREDUCE_TASK_QUEUE)
    logging.info('Pipeline ID: %s', pipeline.pipeline_id)
    return pipeline.pipeline_id
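launch_job treats each MAPREDUCE_JOBS entry as a dict of MapreducePipeline keyword arguments: shards, input_reader_spec, and the GCS bucket_name receive defaults, and the presence of reducer_spec selects a full MapReduce versus a mapper-only MapPipeline. A hypothetical entry might look like this (the key, specs, and entity kind are illustrative, not taken from the project):

# Hypothetical MAPREDUCE_JOBS entry; every name below is illustrative only.
MAPREDUCE_JOBS = {
    'example_backfill': {
        'job_name': 'Example backfill',
        'mapper_spec': 'mapreduce_jobs.example_backfill_map',
        # No 'reducer_spec' here, so launch_job() builds a mapper-only
        # MapPipeline and renames 'mapper_params' to 'params'.
        'mapper_params': {
            'entity_kind': 'models.MyEntity',
        },
    },
}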
Example #10
  def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd):
    bucket = get_bucket_name()
    shards = os.environ['MAPREDUCE_SHARDS']

    # In the first pipeline, generate the raw coverage data.
    raw_coverage_data = yield mapreduce_pipeline.MapreducePipeline(
      "generate_coverage",
      "pipeline.generate_coverage_map",
      "pipeline.generate_coverage_reduce",
      "input_reader.GenomicsAPIInputReader",
      "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
      mapper_params={
        "input_reader": {
          "readsetId": readsetId,
          "sequenceName": sequenceName,
          "sequenceStart": sequenceStart,
          "sequenceEnd": sequenceEnd,
        },
      },
      reducer_params={
        "output_writer": {
          "bucket_name": bucket,
          "content_type": "text/plain",
        },
      },
      shards=shards)

    # Since running the MR to consolidate the output takes a very long time,
    # for now just return the individual results.
    yield PipelineReturnIndividualResults(readsetId, sequenceName,
                                          sequenceStart, sequenceEnd,
                                          raw_coverage_data)
Example #11
    def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd,
            useMockData):
        logging.debug("Running Pipeline for readsetId %s" % readsetId)
        bucket = os.environ['BUCKET']

        # In the first pipeline, generate the raw coverage data.
        raw_coverage_data = yield mapreduce_pipeline.MapreducePipeline(
            "generate_coverage",
            "pipeline.generate_coverage_map",
            "pipeline.generate_coverage_reduce",
            "input_reader.GenomicsAPIInputReader",
            "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
            mapper_params={
                "input_reader": {
                    "readsetId": readsetId,
                    "sequenceName": sequenceName,
                    "sequenceStart": sequenceStart,
                    "sequenceEnd": sequenceEnd,
                    "useMockData": useMockData,
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket,
                    "content_type": "text/plain",
                },
            },
            shards=16)

        # Pass the results on to the output consolidator.
        yield PipelineConsolidateOutput(raw_coverage_data)
Example #12
    def run(self, event_ids):
        # Can't do != comparators in our appengine mapreduce queries
        # filters = [('expired_oauth_token', '!=', True)]
        # Unfortunately, many users have a value equal to None, so can't filter on this
        # filters = [('expired_oauth_token', '=', False)]
        # So for now, let's just process all of them, and skip them inside test_user_on_events
        filters = []
        # output = yield ...
        yield mapreduce_pipeline.MapreducePipeline(
            'Find valid access_tokens for events',
            'events.find_access_tokens.test_user_on_events',
            'events.find_access_tokens.save_valid_users_to_event',
            'mapreduce.input_readers.DatastoreInputReader',
            'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
            mapper_params={
                'entity_kind': 'users.users.User',
                'filters': filters,
                'event_ids': ','.join(event_ids),
            },
            reducer_params={
                'output_writer': {
                    'bucket_name': 'dancedeets-hrd.appspot.com',
                    'content_type': 'text/plain',
                }
            },
            shards=2,
        )
Example #13
    def run(self, raw_coverage_data):
        bucket = os.environ['BUCKET']
        logging.debug("Got %d raw coverage data output files to consolidate." %
                      len(raw_coverage_data))

        # Remove bucket from filenames. (Would be nice if you didn't have to do
        # this.)
        paths = []
        for file in raw_coverage_data:
            paths.append(str.replace(str(file), "/" + bucket + "/", ""))

        # Create another pipeline to combine the raw coverage data into a single
        # file.
        output = yield mapreduce_pipeline.MapreducePipeline(
            "consolidate_output",
            "pipeline.consolidate_output_map",
            "pipeline.consolidate_output_reduce",
            "mapreduce.input_readers._GoogleCloudStorageInputReader",
            "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
            mapper_params={
                "input_reader": {
                    "bucket_name": bucket,
                    "objects": paths,
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket,
                    "content_type": "text/plain",
                },
            },
            shards=1)

        # Return back the final output results.
        yield PipelineReturnResults(output)
Example #14
    def testLotsOfValuesForSingleKey(self):
        TestEntity(data=str(1)).put()
        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".map_yield_lots_of_values",
            __name__ + ".reduce_length",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".BlobstoreRecordsOutputWriter",
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        output_data = []
        for output_file in p.outputs.default.value:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = ["('1', 50000)"]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)
Example #15
    def run(self, mapper_key, reducer_key, file_name, language):
        """ run """
        logging.debug("filename is %s" % file_name)

        bucket_name = app_identity.get_default_gcs_bucket_name()
        mapper_params = {
            "entity_kind": "src.model.Data",
            "mapper": mapper_key,
            "reducer": reducer_key
        }

        output = yield mapreduce_pipeline.MapreducePipeline(
            file_name,
            mapper_spec="src.mapreduce.interpreter." + language +
            "_mapper_interpreter",
            reducer_spec="src.mapreduce.interpreter." + language +
            "_reducer_interpreter",
            input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
            output_writer_spec=
            "mapreduce.output_writers.GoogleCloudStorageOutputWriter",
            mapper_params=mapper_params,
            reducer_params={
                "output_writer": {
                    "reducer": reducer_key,
                    "bucket_name": bucket_name,
                    "content_type": "text/plain",
                }
            },
            shards=64)

        # @TODO test and improve store output
        yield StoreOutput(output)
Example #16
    def run(self, records_file_blobkey):
        job_name = "schedulrMapReduce"
        logging.info(
            "***map ***reduce ***library ***cool****** about 2 running: %s" %
            records_file_blobkey)
        # Run Mapreduce
        output = yield mapreduce_pipeline.MapreducePipeline(
            job_name,
            __name__ + ".schedulr_map",
            __name__ + ".schedulr_reduce",
            input_reader_spec=input_readers.__name__ +
            ".BlobstoreLineInputReader",
            output_writer_spec=(output_writers.__name__ + ".FileOutputWriter"),
            mapper_params={
                "input_reader": {
                    "blob_keys": [records_file_blobkey]
                }
            },
            reducer_params={
                "output_writer": {
                    "mime_type": "text/plain",
                    "output_sharding":
                        output_writers.FileOutputWriterBase.OUTPUT_SHARDING_NONE,
                    "filesystem": "blobstore"
                },
            },
            shards=N_SHARDS)
Example #17
  def run(self, readsetId, sequenceName, sequenceStart, sequenceEnd,
          raw_coverage_data):
    bucket = get_bucket_name()

    # Remove bucket from filenames. (Would be nice if you didn't have to do
    # this.)
    paths = []
    for file in raw_coverage_data:
      paths.append(str.replace(str(file), "/" + bucket + "/", ""))

    # Create another pipeline to combine the raw coverage data into a single
    # file.
    output = yield mapreduce_pipeline.MapreducePipeline(
      "consolidate_output",
      "pipeline.consolidate_output_map",
      "pipeline.consolidate_output_reduce",
      "mapreduce.input_readers._GoogleCloudStorageInputReader",
      "mapreduce.output_writers._GoogleCloudStorageOutputWriter",
      mapper_params={
        "input_reader": {
           "bucket_name": bucket,
           "objects": paths,
        },
      },
      reducer_params={
        "output_writer": {
          "bucket_name": bucket,
          "content_type": "text/plain",
        },
      },
      shards=1)

    # Return back the final output results.
    yield PipelineReturnConsolidatedResults(readsetId, sequenceName,
                                            sequenceStart, sequenceEnd, output)
Example #18
    def run(self, job_name, sequence_num, kwargs, namespace, complete_fn):
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._start_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, []))
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_name, sequence_num, namespace, output,
                                    complete_fn, kwargs)
Example #19
File: jobs.py Project: oulan/oppia
    def run(self, job_id, job_class_str, kwargs):
        job_class = mapreduce_util.for_name(job_class_str)
        job_class.register_start(job_id, metadata={
            job_class._OUTPUT_KEY_ROOT_PIPELINE_ID: self.root_pipeline_id
        })

        # TODO(sll): Need try/except/mark-as-canceled here?
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_id, job_class_str, output)
Example #20
    def run(self, job_name, kwargs, namespace):
        time_started = time.time()

        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._start_job, job_name,
                MapReduceJob.build_output(self.root_pipeline_id, []))
        output = yield mapreduce_pipeline.MapreducePipeline(**kwargs)
        yield StoreMapReduceResults(job_name, time_started, namespace, output)
Example #21
    def testMapReduce(self):
        # Prepare test data
        bucket_name = "testbucket"
        job_name = "test_job"
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            job_name,
            __name__ + ".test_mapreduce_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(output_writers.__name__ +
                                "._GoogleCloudStorageRecordOutputWriter"),
            mapper_params={
                "entity_kind": __name__ + "." + TestEntity.__name__,
                "bucket_name": bucket_name
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket_name
                },
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
        output_data = []
        for output_file in p.outputs.default.value:
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)

        # Verify that mapreduce doesn't leave intermediate files behind.
        temp_file_stats = cloudstorage.listbucket("/" + bucket_name)
        for stat in temp_file_stats:
            if stat.filename:
                self.assertFalse(
                    stat.filename.startswith("/%s/%s-shuffle-" %
                                             (bucket_name, job_name)))
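The map and reduce callables referenced above via __name__ are defined elsewhere in the test module. A minimal sketch consistent with the expected_data assertion (an assumption, not the verbatim test code): the mapper emits (entity.data, "") for each TestEntity, and the reducer emits the string form of (key, values), which for the two entities stored per value produces records like "('7', ['', ''])".

# Assumed shapes of the test's map/reduce functions, inferred from the
# expected_data assertion; the real test module may differ slightly.
def test_mapreduce_map(entity):
    # Two TestEntity rows exist per data value, so each key collects two "" values.
    yield (entity.data, "")


def test_mapreduce_reduce(key, values):
    yield str((key, values))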
Example #22
    def run(self):
        yield mapreduce_pipeline.MapreducePipeline(
            "items_job",
            "dataflow_pipeline.mapper",
            "dataflow_pipeline.reducer",
            "mapreduce.input_readers.DatastoreInputReader",
            mapper_params={
                "input_reader": {
                    "entity_kind": "models.Transaction"
                }
            },
            shards=1)
Example #23
    def testMapReduceWithShardRetry(self):
        # Prepare test data
        bucket_name = "testbucket"
        entity_count = 200
        db.delete(RetryCount.all())

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        # Run Mapreduce
        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_mapreduce_map",
            __name__ + ".test_mapreduce_reduce",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=(__name__ + ".TestFileRecordsOutputWriter"),
            mapper_params={
                "input_reader": {
                    "entity_kind": __name__ + "." + TestEntity.__name__,
                },
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": bucket_name
                },
            },
            shards=16)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        self.assertEquals(1, len(self.emails))
        self.assertTrue(self.emails[0][1].startswith("Pipeline successful:"))

        # Verify reduce output.
        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEqual(model.MapreduceState.RESULT_SUCCESS,
                         p.outputs.result_status.value)
        output_data = []
        retries = 0
        for output_file in p.outputs.default.value:
            # Get the number of shard retries by parsing filename.
            retries += (int(output_file[-1]) - 1)
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

        # Assert file names also suggest the right number of retries.
        self.assertEquals(44, retries)
        expected_data = [str((str(d), ["", ""])) for d in range(entity_count)]
        expected_data.sort()
        output_data.sort()
        self.assertEquals(expected_data, output_data)
Example #24
    def run(self, job_name, params, shard_count):
        yield mapreduce_pipeline.MapreducePipeline(
            job_name,
            __name__ + "._extact_domain_map",
            __name__ + "._grouped_domain_reduce",
            "mapreduce.input_readers.DatastoreInputReader",
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params=params,
            reducer_params={
                "mime_type": "text/plain",
            },
            shards=shard_count)
Example #25
def CreatePopularPagesPipeline(start_datetime):
    return mapreduce_pipeline.MapreducePipeline(
        'popular-pages',
        FullName(recommendations.PopularPagesMap),
        FullName(recommendations.PopularPagesReduce),
        'mapreduce.input_readers.DatastoreInputReader',
        mapper_params={
            'entity_kind': FullName(models.PageRating),
            'start_datetime': SerializeDatetime(start_datetime)
        },
        reducer_params={'start_datetime': SerializeDatetime(start_datetime)},
        shards=DEFAULT_SHARDS)
Example #26
    def run(self):
        yield mapreduce_pipeline.MapreducePipeline(
            'IndexingMapReduce',
            'index_mapreduce.index.index_map',
            'index_mapreduce.index.index_reduce',
            'mapreduce.input_readers.DatastoreInputReader',
            'mapreduce.output_writers.BlobstoreOutputWriter',
            mapper_params={
                'entity_kind': 'models.Feed',
            },
            reducer_params={'mime_type': 'text/plain'},
            shards=4)
Example #27
    def testCombiner(self):
        """Test running with low values count but with combiner."""
        # Prepare test data
        entity_count = 200

        for i in range(entity_count):
            TestEntity(data=str(i)).put()
            TestEntity(data=str(i)).put()

        p = mapreduce_pipeline.MapreducePipeline(
            "test",
            __name__ + ".test_combiner_map",
            __name__ + ".test_combiner_reduce",
            combiner_spec=__name__ + ".TestCombiner",
            input_reader_spec=input_readers.__name__ + ".DatastoreInputReader",
            output_writer_spec=output_writers.__name__ +
            ".GoogleCloudStorageOutputWriter",
            mapper_params={
                "entity_kind": __name__ + ".TestEntity",
            },
            reducer_params={
                "output_writer": {
                    "bucket_name": "testbucket"
                },
            },
            shards=4)
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = mapreduce_pipeline.MapreducePipeline.from_id(p.pipeline_id)
        self.assertEquals(4, len(p.outputs.default.value))
        file_content = []
        for input_file in p.outputs.default.value:
            with cloudstorage.open(input_file) as infile:
                for line in infile:
                    file_content.append(line.strip())

        file_content = sorted(file_content)

        self.assertEquals(
            ["('0', 9800)", "('1', 9900)", "('2', 10000)", "('3', 10100)"],
            file_content)

        self.assertTrue(TestCombiner.invocations)

        for invocation in TestCombiner.invocations:
            key = invocation[0]
            values = invocation[1]
            self.assertTrue(key)
            self.assertTrue(values)
            self.assertEquals(1, len(values))
            self.assertTrue(int(values[0]) % 4 == int(key))
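combiner_spec (also used in Example #1) names a function with a different signature from the reducer: it receives the key, a batch of newly mapped values, and the values it yielded on earlier calls for the same key, and yields partially combined values that eventually reach the reducer. A sketch consistent with the totals checked above (an assumption; the real TestCombiner is a class that also records its invocations):

# Assumed combiner shape: (key, new values, previously combined values) in,
# partially combined values out. Summing per key is consistent with the
# "('0', 9800)"-style totals asserted above.
def test_combiner(key, values, previously_combined_values):
    total = sum(int(value) for value in values)
    total += sum(int(value) for value in previously_combined_values)
    yield total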
Example #28
    def run(self, list_id, entity_kind, query_pickle):
        yield mapreduce_pipeline.MapreducePipeline(
            'cache_list_items',
            'soc.mapreduce.cache_list_items.mapProcess',
            'soc.mapreduce.cache_list_items.reduceProcess',
            'mapreduce.input_readers.DatastoreInputReader',
            mapper_params={
                'list_id': list_id,
                'entity_kind': entity_kind,
                'query_pickle': query_pickle
            },
            reducer_params={'list_id': list_id},
            shards=_NO_OF_SHARDS)
Example #29
def start_count_subscriptions():
    """Kicks off the MapReduce for determining and saving subscription counts."""
    job = mapreduce_pipeline.MapreducePipeline(
        'Count subscriptions',
        'offline_jobs.count_subscriptions_for_topic',
        'offline_jobs.save_subscription_counts_for_topic',
        'mapreduce.input_readers.DatastoreInputReader',
        mapper_params=dict(entity_kind='main.Subscription'),
        shards=4)
    # TODO(bslatkin): Pass through the queue name to run the job on. This is
    # a limitation in the mapper library.
    job.start()
    return job.pipeline_id
Example #30
def DjangoModelMapreduce(
        model,
        mapper,
        reducer,
        keys_only=False,
        output_writer="mapreduce.output_writers.BlobstoreOutputWriter",
        extra_mapper_params=None,
        extra_reducer_params=None,
        shards=None):
    """
    A simple wrapper function for creating mapreduce jobs over a Django model.

    Args:
        model: A Django model class
        mapper: A top-level function that takes a single argument
            and yields zero or more two-tuples of strings
        reducer: A top-level function that takes two arguments
            and yields zero or more values
        keys_only: If True, map over datastore keys only rather than full entities
        output_writer: An optional OutputWriter subclass name,
            defaults to 'mapreduce.output_writers.BlobstoreOutputWriter'
        extra_mapper_params: An optional dictionary of values to pass to the Mapper
        extra_reducer_params: An optional dictionary of values to pass to the Reducer
        shards: An optional number of shards to use for the job
    """

    if keys_only:
        input_reader_spec = "mapreduce.input_readers.DatastoreKeyInputReader"
        mapper_params = {"entity_kind": model._meta.db_table}
    else:
        input_reader_spec = "djangoappengine.mapreduce.input_readers.DjangoModelInputReader"
        mapper_params = {"entity_kind": _convert_model_to_string(model)}

    if extra_mapper_params:
        mapper_params.update(extra_mapper_params)

    reducer_params = {"mime_type": "text/plain"}
    if extra_reducer_params:
        reducer_params.update(extra_reducer_params)

    mapper_spec = _convert_func_to_string(mapper)
    reducer_spec = _convert_func_to_string(reducer)

    return mapreduce_pipeline.MapreducePipeline(
        "%s-%s-%s-mapreduce" %
        (model._meta.object_name, mapper_spec, reducer_spec),
        mapper_spec,
        reducer_spec,
        input_reader_spec,
        output_writer,
        mapper_params=mapper_params,
        reducer_params=reducer_params,
        shards=shards)
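A hypothetical call site for this wrapper might look like the following (the model and the top-level map/reduce functions are made up for illustration); the wrapper only builds the pipeline, so the caller still starts it:

# Hypothetical usage of DjangoModelMapreduce; MyModel and the two functions
# below are illustrative names, not part of the wrapper's module.
from myapp.models import MyModel  # hypothetical Django model with a 'status' field


def count_by_status_map(instance):
    # One (key, value) pair per model instance.
    yield (instance.status, "1")


def count_by_status_reduce(key, values):
    yield "%s: %d\n" % (key, len(values))


job = DjangoModelMapreduce(
    MyModel,
    count_by_status_map,
    count_by_status_reduce,
    shards=8)
job.start()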