def run(self,
        job_name,
        mapper_spec,
        reducer_spec,
        input_reader_spec,
        output_writer_spec=None,
        mapper_params=None,
        reducer_params=None,
        shards=None):
    map_pipeline = yield MapPipeline(job_name,
                                     mapper_spec,
                                     input_reader_spec,
                                     params=mapper_params,
                                     shards=shards)
    shuffler_pipeline = yield ShufflePipeline(job_name, map_pipeline)
    reducer_pipeline = yield ReducePipeline(job_name,
                                            reducer_spec,
                                            output_writer_spec,
                                            reducer_params,
                                            shuffler_pipeline)
    with pipeline.After(reducer_pipeline):
        all_temp_files = yield pipeline_common.Extend(
            map_pipeline, shuffler_pipeline)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)
    yield pipeline_common.Return(reducer_pipeline)
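This run method chains the map, shuffle, and reduce stages, gates cleanup of the intermediate files on the reducer's completion, and returns the reducer's output. A minimal usage sketch, assuming the appengine-mapreduce module layout and hypothetical word_count_map / word_count_reduce handlers in a module named main:

from mapreduce import mapreduce_pipeline

job = mapreduce_pipeline.MapreducePipeline(
    "word_count",
    "main.word_count_map",       # hypothetical mapper handler
    "main.word_count_reduce",    # hypothetical reducer handler
    "mapreduce.input_readers.BlobstoreZipInputReader",
    "mapreduce.output_writers.BlobstoreOutputWriter",
    mapper_params={"blob_key": blob_key},  # blob_key: key of an uploaded zip
    reducer_params={"mime_type": "text/plain"},
    shards=16)
job.start()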
Example #2
def run(self, job_name, filenames):
    hashed_files = yield _HashPipeline(job_name, filenames)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    merged_files = yield _MergePipeline(job_name, sorted_files)
    with pipeline.After(merged_files):
        all_temp_files = yield pipeline_common.Extend(
            hashed_files, sorted_files)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)
    yield pipeline_common.Return(merged_files)
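The same cleanup idiom guards a user-space shuffle here: inputs are hashed into buckets, each bucket is sorted, and the sorted chunks are merged. Only the merged files outlive the pipeline; the hashed and sorted intermediates are deleted once the merge completes.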
Example #3
def run(self, job):
    jobs = service.jobs()
    status = jobs.get(projectId=config.project_id, jobId=job).execute()
    job_state = status['status']['state']
    if job_state in ('PENDING', 'RUNNING'):
        delay = yield pipeline.common.Delay(seconds=1)
        with pipeline.After(delay):
            yield BqCheck(job)
    else:
        yield pipeline.common.Return(status)
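A pipeline's run method cannot block, so the poll loop is built from a Delay pipeline and a recursive yield gated by pipeline.After: each unfinished check schedules a one-second delay and then another BqCheck. A minimal sketch of the same pattern, assuming a generic, hypothetical poll() helper:

import pipeline
import pipeline.common

class PollUntilDone(pipeline.Pipeline):
    def run(self, resource_id):
        state = poll(resource_id)  # hypothetical status lookup
        if state in ('PENDING', 'RUNNING'):
            # Not done yet: wait one second, then check again.
            delay = yield pipeline.common.Delay(seconds=1)
            with pipeline.After(delay):
                yield PollUntilDone(resource_id)
        else:
            yield pipeline.common.Return(state)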
Example #4
def run(self, job_name, filenames):
    sort_mappers = []
    for i, filename in enumerate(filenames):
        sort_mapper = yield mapper_pipeline.MapperPipeline(
            "%s-shuffle-sort-%s" % (job_name, i),
            __name__ + "._sort_records_map",
            __name__ + "._BatchRecordsReader",
            None, {
                "files": [filename],
                "processing_rate": 1000000,
            },
            shards=1)
        sort_mappers.append(sort_mapper)
    with pipeline.After(*sort_mappers):
        job_ids = yield pipeline_common.Append(
            *[mapper.job_id for mapper in sort_mappers])
        result = yield _CollectOutputFiles(job_ids)
        with pipeline.After(result):
            yield _CleanupOutputFiles(job_ids)
        yield pipeline_common.Return(result)
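pipeline.After accepts any number of futures, so After(*sort_mappers) acts as a barrier that joins the per-file fan-out back into one thread of control before the output files are collected, and the nested After(result) defers cleanup until collection has finished.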
Example #5
def run(self, filenames):
    mapper = yield mapper_pipeline.MapperPipeline(
        "sort",
        __name__ + "._sort_records",
        __name__ + "._BatchRecordsReader",
        None, {
            "files": filenames,
            "processing_rate": 1000000,
        },
        shards=1)
    # TODO(user): delete _OutputFile entities after collect
    with pipeline.After(mapper):
        yield _CollectOutputFiles(mapper.job_id)
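MapperPipeline is an ordinary pipeline, so it can also be started directly rather than yielded from another run method. A minimal sketch, assuming a hypothetical my_module.process_record handler and the library's RecordsReader input reader:

from mapreduce import mapper_pipeline

job = mapper_pipeline.MapperPipeline(
    "process-records",
    "my_module.process_record",               # hypothetical handler
    "mapreduce.input_readers.RecordsReader",
    params={"files": filenames},  # filenames: list of input file paths
    shards=4)
job.start()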
Example #6
def run(self, job_name, filenames, shards=None):
    if files.shuffler.available():
        yield _ShuffleServicePipeline(job_name, filenames)
    else:
        hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
        sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
        temp_files = [hashed_files, sorted_files]

        merged_files = yield _MergePipeline(job_name, sorted_files)

        with pipeline.After(merged_files):
            all_temp_files = yield pipeline_common.Extend(*temp_files)
            yield mapper_pipeline._CleanupPipeline(all_temp_files)

        yield pipeline_common.Return(merged_files)
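Probing files.shuffler.available() at run time lets this version use the App Engine shuffler service where the runtime provides it and fall back to the hash/sort/merge implementation everywhere else. Only the fallback branch creates temporary files, so only that branch schedules a cleanup.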
Example #7
def run(self, job_name, params, parser_params, shards=8):
    extract_domain_files = yield _ExactDomainMapreducePipeline(
        job_name, params=params, shard_count=shards)
    robots_files = yield _RobotsFetchPipeline(job_name,
                                              extract_domain_files, shards)
    fetch_set_buffer_files = yield _FetchSetsBufferPipeline(
        job_name, robots_files)
    fetch_files = yield _FetchPagePipeline(job_name,
                                           fetch_set_buffer_files, shards)
    outlinks_files = yield _ExtractOutlinksPipeline(
        job_name, fetch_files, parser_params, shards)
    results_files = yield _FetchContentPipeline(job_name, outlinks_files,
                                                shards)
    temp_files = [
        extract_domain_files, robots_files, fetch_set_buffer_files,
        fetch_files
    ]
    with pipeline.After(results_files):
        all_temp_files = yield pipeline_common.Extend(*temp_files)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)
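This crawler variant chains six stages, threading each stage's output files into the next, and accumulates the early intermediates in temp_files so that a single _CleanupPipeline, gated on the final fetch, deletes them all. Only the first four stages' outputs are listed as temporary, so the outlink and content files survive the cleanup.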
Example #8
def run(self, job):
    jobs = service.jobs()
    status = jobs.get(projectId=bqproject, jobId=job).execute()
    job_state = status['status']['state']

    if job_state in ('PENDING', 'RUNNING'):
        message(
            self.root_pipeline_id,
            '<span class="label label-warning">{{ status }}</span> bq://jobs/{{ job }}',
            job=job,
            status=job_state.lower())
        delay = yield pipeline.common.Delay(seconds=1)
        with pipeline.After(delay):
            yield BqCheck(job)
    else:
        message(
            self.root_pipeline_id,
            '<span class="label label-success">{{ status }}</span> bq://jobs/{{ job }} <a href="{{ base_path }}/status?root={{ root_pipeline_id }}#pipeline-{{ pipeline_id }}">pipeline</a>',
            job=job,
            status=job_state.lower(),
            base_path=self.base_path,
            pipeline_id=self.pipeline_id)
        yield pipeline.common.Return(status)
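This BqCheck variant layers progress reporting on the same poll loop: while the job is pending or running it emits a warning-styled status message before each retry, and on completion it emits a success message linking back to the pipeline's status page and returns the full job status.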