Example #1
    def run(self,
            job_name,
            reducer_spec,
            output_writer_spec,
            params,
            bucket_name,
            filenames,
            combiner_spec=None,
            shards=None):
        filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                       filenames))
        new_params = dict(params or {})
        new_params.update({
            "input_reader": {
                "bucket_name": bucket_name,
                "objects": filenames_only,
            }
        })
        if combiner_spec:
            new_params.update({
                "combiner_spec": combiner_spec,
            })

        # TODO(user): Test this
        if shards is None:
            shards = len(filenames)

        yield mapper_pipeline.MapperPipeline(job_name + "-reduce",
                                             reducer_spec,
                                             __name__ + "._ReducerReader",
                                             output_writer_spec,
                                             new_params,
                                             shards=shards)
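
For illustration, this is the shape of the new_params dict the code above builds, using hypothetical values (bucket and object names invented for this sketch) once the bucket prefix has been stripped:

# Hypothetical inputs, for illustration only.
bucket_name = "my-bucket"
filenames = ["/my-bucket/part-0", "/my-bucket/part-1"]

# After util.strip_prefix_from_items("/my-bucket/", filenames) the bucket
# prefix is gone, so the input reader is handed bucket-relative object names:
new_params = {
    "input_reader": {
        "bucket_name": "my-bucket",
        "objects": ["part-0", "part-1"],
    },
}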
Example #2
 def run(self, job_name, bucket_name, filenames):
   sort_mappers = []
   for i in range(len(filenames)):
     filenames_only = util.strip_prefix_from_items("/%s/" % bucket_name,
                                                   filenames[i])
     sort_mapper = yield mapper_pipeline.MapperPipeline(
         "%s-shuffle-sort-%s" % (job_name, str(i)),
         __name__ + "._sort_records_map",
         __name__ + "._BatchGCSRecordsReader",
         None,
         {
             "input_reader": {
                 "bucket_name": bucket_name,
                 "objects": filenames_only,
             },
         },
         shards=1)
     sort_mappers.append(sort_mapper)
   with pipeline.After(*sort_mappers):
     job_ids = yield pipeline_common.Append(*[mapper.job_id for mapper in
                                              sort_mappers])
     result = yield _CollectOutputFiles(job_ids)
     with pipeline.After(result):
       yield _CleanupOutputFiles(job_ids)
     yield pipeline_common.Return(result)
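
The fan-out/fan-in shape of the code above is worth calling out: each input file gets its own single-shard sort mapper, and pipeline.After(*sort_mappers) holds back the collect/cleanup children until every sort has finished. A stripped-down sketch of that pattern follows; the DoWork child and FanOutFanIn class are hypothetical, and the import paths vary between MapReduce library versions, so treat them as assumptions.

import pipeline
from pipeline import common as pipeline_common


class DoWork(pipeline.Pipeline):
    """Hypothetical worker; stands in for MapperPipeline in the example above."""

    def run(self, item):
        return item


class FanOutFanIn(pipeline.Pipeline):
    """Hypothetical pipeline showing the ordering pattern used above."""

    def run(self, inputs):
        workers = []
        for item in inputs:
            # Fan out: each yielded child pipeline hands back a future.
            worker = yield DoWork(item)
            workers.append(worker)
        with pipeline.After(*workers):
            # Fan in: this child only runs after every worker future is filled.
            yield pipeline_common.Return(len(workers))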
Example #3
    def testBasic(self):
        """Basic test of the function."""
        items = ["/foo/bar", "/foos/bar2", "/bar3"]
        prefix = "/foo/"

        self.assertEqual(["bar", "/foos/bar2", "/bar3"],
                         util.strip_prefix_from_items(prefix, items))
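
The test pins down the contract of util.strip_prefix_from_items: the prefix is removed only from items that actually start with it, and everything else passes through unchanged. A minimal pure-Python sketch consistent with that behavior (not the library's actual implementation) would be:

def strip_prefix_from_items(prefix, items):
    # Drop the prefix from items that start with it; leave the rest untouched.
    return [item[len(prefix):] if item.startswith(prefix) else item
            for item in items]

# strip_prefix_from_items("/foo/", ["/foo/bar", "/foos/bar2", "/bar3"])
# returns ["bar", "/foos/bar2", "/bar3"], matching the assertion above.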
Example #4
  def run(self,
          job_name,
          reducer_spec,
          output_writer_spec,
          params,
          bucket_name,
          filenames,
          combiner_spec=None,
          shards=None):
    filenames_only = (
        util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
    new_params = dict(params or {})
    new_params.update({
        "input_reader": {
            "bucket_name": bucket_name,
            "objects": filenames_only,
        }})
    if combiner_spec:
      new_params.update({
          "combiner_spec": combiner_spec,
          })

    # TODO(user): Test this
    if shards is None:
      shards = len(filenames)

    yield mapper_pipeline.MapperPipeline(
        job_name + "-reduce",
        reducer_spec,
        __name__ + "._ReducerReader",
        output_writer_spec,
        new_params,
        shards=shards)
Example #5
  def testBasic(self):
    """Basic test of the function."""
    items = ["/foo/bar", "/foos/bar2", "/bar3"]
    prefix = "/foo/"

    self.assertEqual(["bar", "/foos/bar2", "/bar3"],
                     util.strip_prefix_from_items(prefix, items))
Example #6
 def run(self, job_name, bucket_name, filenames, shards=None):
     filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                    filenames))
     if shards is None:
         shards = len(filenames)
     yield mapper_pipeline.MapperPipeline(
         job_name + "-shuffle-hash",
         __name__ + "._hashing_map",
         input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
         output_writer_spec=__name__ + "._HashingGCSOutputWriter",
         params={
             "input_reader": {
                 "bucket_name": bucket_name,
                 "objects": filenames_only,
             },
             "output_writer": {
                 "bucket_name": bucket_name,
             },
         },
         shards=shards)
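
Note the shard-count default in this example and the next: when shards is not passed, the pipeline runs one mapper shard per input file. A tiny illustration with hypothetical filenames:

# Hypothetical inputs, for illustration only.
filenames = ["/my-bucket/a", "/my-bucket/b", "/my-bucket/c"]
shards = None
if shards is None:
    shards = len(filenames)  # 3 files -> 3 shards, one per input file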
Example #7
 def run(self, job_name, bucket_name, filenames, shards=None):
   filenames_only = (
       util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
   if shards is None:
     shards = len(filenames)
   yield mapper_pipeline.MapperPipeline(
       job_name + "-shuffle-hash",
       __name__ + "._hashing_map",
       input_readers.__name__ + "._GoogleCloudStorageRecordInputReader",
       output_writer_spec=__name__ + "._HashingGCSOutputWriter",
       params={
           "input_reader": {
               "bucket_name": bucket_name,
               "objects": filenames_only,
           },
           "output_writer": {
               "bucket_name": bucket_name,
           },
       },
       shards=shards)
Example #8
 def run(self, job_name, bucket_name, filenames):
     filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name,
                                                    filenames))
     params = {
         "output_writer": {
             "bucket_name": bucket_name,
             "content_type": "text/plain",
         },
         "input_reader": {
             "bucket_name": bucket_name,
             "objects": filenames_only,
         }
     }
     yield mapper_pipeline.MapperPipeline(
         job_name + "-combine",
         'events.find_access_tokens.file_identity',
         'mapreduce.input_readers.GoogleCloudStorageInputReader',
         'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
         params,
         shards=1)
Example #9
 def run(self, job_name, bucket_name, filenames):
     filenames_only = (util.strip_prefix_from_items("/%s/" % bucket_name, filenames))
     params = {
         "output_writer": {
             "bucket_name": bucket_name,
             "content_type": "text/plain",
         },
         "input_reader": {
             "bucket_name": bucket_name,
             "objects": filenames_only,
         }
     }
     yield mapper_pipeline.MapperPipeline(
         job_name + "-combine",
         'dancedeets.events.find_access_tokens.file_identity',
         'mapreduce.input_readers.GoogleCloudStorageInputReader',
         'mapreduce.output_writers.GoogleCloudStorageOutputWriter',
         params,
         shards=1
     )
Example #10
 def run(self, job_name, bucket_name, filenames):
     sort_mappers = []
     for i in range(len(filenames)):
         filenames_only = util.strip_prefix_from_items(
             "/%s/" % bucket_name, filenames[i])
         sort_mapper = yield mapper_pipeline.MapperPipeline(
             "%s-shuffle-sort-%s" % (job_name, str(i)),
             __name__ + "._sort_records_map",
             __name__ + "._BatchGCSRecordsReader",
             None, {
                 "input_reader": {
                     "bucket_name": bucket_name,
                     "objects": filenames_only,
                 },
             },
             shards=1)
         sort_mappers.append(sort_mapper)
     with pipeline.After(*sort_mappers):
         job_ids = yield pipeline_common.Append(
             *[mapper.job_id for mapper in sort_mappers])
         result = yield _CollectOutputFiles(job_ids)
         with pipeline.After(result):
             yield _CleanupOutputFiles(job_ids)
         yield pipeline_common.Return(result)