def testExtend(self):
   """Checks that common.Extend concatenates its arguments into one list.

   Each positional argument is a list or tuple; the expected pipeline output
   is a single list holding the items of every argument, in argument order.
   """
   def extend(*args):
      # Run Extend through the pipeline harness and unwrap its default slot.
      return self.run_pipeline(common.Extend(*args)).default.value

   # Lists and tuples may be mixed freely.
   self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8],
                    extend([1, 2, 3], (4, 5, 6), [7], (8,)))
   # All-empty arguments produce an empty list.
   self.assertEqual([], extend([], (), [], ()))
   # Empty arguments interleaved with non-empty ones contribute nothing.
   self.assertEqual([1, 2, 3, 4, 5, 6, 7, 8],
                    extend([1, 2, 3], [], (4, 5, 6), (), [7], (8,)))
   # Only one level is flattened: a single list-of-lists argument is
   # returned unchanged.
   self.assertEqual([[1, 2, 3], [4, 5, 6], [7], [8]],
                    extend([[1, 2, 3], [4, 5, 6], [7], [8]]))
   # No arguments at all also yields an empty list.
   self.assertEqual([], extend())
    def run(self, bench_name, description):  # pylint: disable=invalid-name
        """Root pipeline that starts the simulation tasks and generates a report.

        Splits the ids of all TestBench entities into fixed-size chunks, yields
        one child pipeline per chunk, and then runs the report-generation
        pipeline on the aggregated results.

        Args:
          bench_name: A string bench name.
          description: A string description of this bench job.

        Yields:
          Pipeline instance.
        """
        # Number of taskqueue tasks we want each child pipeline to spawn.
        ids_per_pipeline = 1000

        bench_ids = [
            key.integer_id()
            for key in TestBench.query().fetch(keys_only=True)
        ]

        chunk_futures = []
        for start in xrange(0, len(bench_ids), ids_per_pipeline):
            stop = start + ids_per_pipeline
            chunk_future = yield RunExperimentalChunkPipeline(
                bench_name, bench_ids[start:stop])
            chunk_futures.append(chunk_future)

        # Join the per-chunk outputs into a single list for the report stage.
        merged_results = yield pipeline_common.Extend(*chunk_futures)
        yield GenerateComparisonReportPipeline(
            bench_name, description, merged_results)
Example #3
0
    def run(self, job_name, mapper_params, filenames, shards=None):
        """Chains the hash, sort-chunks and merge stages, then cleans up.

        Args:
          job_name: job name forwarded to every stage pipeline.
          mapper_params: mapping that must contain a "bucket_name" entry; the
            bucket name is forwarded to every stage pipeline.
          filenames: input handed to the hash stage.
          shards: optional value forwarded to the hash stage.

        Yields:
          Pipeline instances; the last one returns the merge stage's output.
        """
        bucket = mapper_params["bucket_name"]

        hash_output = yield _HashPipeline(
            job_name, bucket, filenames, shards=shards)
        sort_output = yield _SortChunksPipeline(job_name, bucket, hash_output)
        merge_output = yield _MergePipeline(job_name, bucket, sort_output)

        # The hash and sort outputs are intermediate; they are gathered and
        # handed to the cleanup pipeline inside the After(merge) context.
        with pipeline.After(merge_output):
            intermediates = yield pipeline_common.Extend(
                hash_output, sort_output)
            yield _GCSCleanupPipeline(intermediates)

        yield pipeline_common.Return(merge_output)
Example #4
0
class MapreducePipeline(pipeline_base._OutputSlotsMixin,
                        pipeline_base.PipelineBase):
    """Pipeline to execute MapReduce jobs.

    The Shuffle stage uses Google Cloud Storage (GCS). For newly created
    projects, GCS is activated automatically. To activate GCS follow these
    instructions:
    https://cloud.google.com/storage/docs/signup#activate

    Args:
      job_name: job name as string.
      mapper_spec: specification of mapper to use.
      reducer_spec: specification of reducer to use.
      input_reader_spec: specification of input reader to read data from.
      output_writer_spec: specification of output writer to save reduce output
        to.
      mapper_params: parameters to use for mapper phase. If it has no
        "bucket_name" entry (or is omitted), the application's default GCS
        bucket name is filled in.
      reducer_params: parameters to use for reduce phase.
      shards: number of shards to use as int.
      combiner_spec: Optional. Specification of a combine function. If not
        supplied, no combine step will take place. The combine function takes a
        key, list of values and list of previously combined results. It yields
        combined values that might be processed by another combiner call, but
        will eventually end up in reducer. The combiner output key is assumed
        to be the same as the input key.

    Returns:
      result_status: one of model.MapreduceState._RESULTS. Check this to see
        if the job is successful.
      default: a list of filenames if the mapreduce was successful and
        was outputting files. An empty list otherwise.
    """

    def run(self,
            job_name,
            mapper_spec,
            reducer_spec,
            input_reader_spec,
            output_writer_spec=None,
            mapper_params=None,
            reducer_params=None,
            shards=None,
            combiner_spec=None):
        """Yields the map, shuffle, reduce and cleanup stages in order.

        Raises:
          errors.Error: if no bucket name was supplied and the default GCS
            bucket name cannot be obtained or is None.
        """
        # The signature lets callers omit mapper_params; treat that as "no
        # parameters" instead of failing on None.get() below.
        if mapper_params is None:
            mapper_params = {}

        # Check that you have a bucket_name set in the mapper_params and set it
        # to the default if not.
        if mapper_params.get("bucket_name") is None:
            try:
                mapper_params["bucket_name"] = (
                    app_identity.get_default_gcs_bucket_name())
            except Exception as e:
                # Any failure here is reported uniformly as a GCS setup
                # problem, with the underlying message appended.
                raise errors.Error(
                    "Unable to get the GCS default bucket name. "
                    "Check to see that GCS is properly activated. " + str(e))
        if mapper_params["bucket_name"] is None:
            raise errors.Error("There is no GCS default bucket name. "
                               "Check to see that GCS is properly activated.")
        # TODO(user): Check that the bucket is indeed writable.

        map_pipeline = yield MapPipeline(job_name,
                                         mapper_spec,
                                         input_reader_spec,
                                         params=mapper_params,
                                         shards=shards)
        shuffler_pipeline = yield ShufflePipeline(job_name, mapper_params,
                                                  map_pipeline)
        reducer_pipeline = yield ReducePipeline(job_name,
                                                reducer_spec,
                                                output_writer_spec,
                                                reducer_params,
                                                mapper_params["bucket_name"],
                                                shuffler_pipeline,
                                                combiner_spec=combiner_spec)
        # Map and shuffle outputs are intermediate files; they are gathered
        # and handed to the cleanup pipeline inside the After(reducer) context.
        with pipeline.After(reducer_pipeline):
            all_temp_files = yield pipeline_common.Extend(
                map_pipeline, shuffler_pipeline)
            yield CleanupPipeline(all_temp_files)

        yield _ReturnPipeline(map_pipeline.result_status,
                              reducer_pipeline.result_status, reducer_pipeline)