  def run(self,
          job_name,
          mapper_spec,
          reducer_spec,
          input_reader_spec,
          output_writer_spec=None,
          mapper_params=None,
          reducer_params=None,
          shards=None):
    # Map stage: run the mapper over the input reader's data and write
    # intermediate files.
    map_pipeline = yield MapPipeline(job_name,
                                     mapper_spec,
                                     input_reader_spec,
                                     params=mapper_params,
                                     shards=shards)
    # Shuffle stage: group the intermediate map output by key.
    shuffler_pipeline = yield ShufflePipeline(job_name, map_pipeline)
    # Reduce stage: run the reducer over the shuffled data and write the
    # final output through the optional output writer.
    reducer_pipeline = yield ReducePipeline(job_name,
                                            reducer_spec,
                                            output_writer_spec,
                                            reducer_params,
                                            shuffler_pipeline)
    # After the reducer finishes, delete the temporary files produced by the
    # map and shuffle stages, then return the reducer's output.
    with pipeline.After(reducer_pipeline):
      all_temp_files = yield pipeline_common.Extend(
          map_pipeline, shuffler_pipeline)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)
    yield pipeline_common.Return(reducer_pipeline)
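
The method above chains a map stage, a shuffle stage, and a reduce stage, then cleans up the intermediate files and returns the reducer's output. As a rough usage sketch, assuming the method belongs to the App Engine MapReduce library's MapreducePipeline and using placeholder handler names, input reader parameters, and shard count:

from mapreduce import mapreduce_pipeline

# Hypothetical job configuration; every dotted name and parameter below is a
# placeholder, not a value taken from the snippet above.
job = mapreduce_pipeline.MapreducePipeline(
    "word_count",
    mapper_spec="main.word_count_map",
    reducer_spec="main.word_count_reduce",
    input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
    output_writer_spec="mapreduce.output_writers.BlobstoreOutputWriter",
    mapper_params={"entity_kind": "main.Document"},
    reducer_params={"mime_type": "text/plain"},
    shards=16)
job.start()  # enqueues the root pipeline; progress shows up in the status UI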
Example #2
  def run(self, job_name, filenames):
    # Partition the input records by key hash, then sort and merge the
    # resulting chunks so that all values for a key end up together.
    hashed_files = yield _HashPipeline(job_name, filenames)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    merged_files = yield _MergePipeline(job_name, sorted_files)
    # Delete the intermediate hashed and sorted files once the merge is
    # done, then return the merged output.
    with pipeline.After(merged_files):
      all_temp_files = yield pipeline_common.Extend(
          hashed_files, sorted_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)
    yield pipeline_common.Return(merged_files)
Example #3
    def run(self, folderId, credentialsAsJson):
        credentials = OAuth2Credentials.from_json(credentialsAsJson)
        http = credentials.authorize(httplib2.Http())

        folderQuery = ('mimeType = "application/vnd.google-apps.folder" and '
                       'trashed = false')
        folderListPageToken = None
        deeperDocumentIdsFutures = []
        while True:
            # TODO(michaelcupino): Do this in a try except statement.
            request = config.getService().children().list(
                folderId=folderId,
                pageToken=folderListPageToken,
                maxResults=1000,
                q=folderQuery)
            folderList = request.execute(http=http)

            # Recursively fetch document ids for folders inside the current folder.
            for folder in folderList.get('items'):
                deeperFolderId = folder.get('id')
                deeperDocumentIdsFuture = yield FolderFetcherPipeline(
                    deeperFolderId, credentialsAsJson)
                deeperDocumentIdsFutures.append(deeperDocumentIdsFuture)

            folderListPageToken = folderList.get('nextPageToken')
            if not folderListPageToken:
                break

        documentQuery = (
            'mimeType = "application/vnd.google-apps.document" and '
            'trashed = false')
        docListPageToken = None
        documentIds = []
        while True:
            # TODO(michaelcupino): Do this in a try except statement.
            request = config.getService().children().list(
                folderId=folderId,
                pageToken=docListPageToken,
                maxResults=1000,
                q=documentQuery)
            docList = request.execute(http=http)

            for document in docList.get('items'):
                documentIds.append(document.get('id'))

            docListPageToken = docList.get('nextPageToken')
            if not docListPageToken:
                break

        yield common.Extend(documentIds, *deeperDocumentIdsFutures)
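
The pipeline above pages through children().list for one Drive folder, yields a child FolderFetcherPipeline for every sub-folder it finds, and finally combines its own document ids with the ids collected by the child pipelines via common.Extend. A minimal sketch of starting it as a root pipeline, assuming the credentials are an oauth2client OAuth2Credentials object loaded elsewhere (the folder id and the lookup helper are placeholders):

# Hypothetical invocation; load_saved_credentials and the folder id are
# illustrative placeholders, not part of the code above.
credentials = load_saved_credentials()  # assumed to return OAuth2Credentials
stage = FolderFetcherPipeline('0BExampleFolderId', credentials.to_json())
stage.start()  # sub-folder pipelines are fanned out asynchronously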
Example #4
  def run(self, job_name, filenames, shards=None):
    # Use the shuffle service when it is available; otherwise fall back to
    # the hash/sort/merge pipelines below.
    if files.shuffler.available():
      yield _ShuffleServicePipeline(job_name, filenames)
    else:
      hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
      sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
      temp_files = [hashed_files, sorted_files]

      merged_files = yield _MergePipeline(job_name, sorted_files)

      with pipeline.After(merged_files):
        all_temp_files = yield pipeline_common.Extend(*temp_files)
        yield mapper_pipeline._CleanupPipeline(all_temp_files)

      yield pipeline_common.Return(merged_files)
Example #5
  def run(self, job_name, params, parser_params, shards=8):
    # Crawl stages: extract domains from the input, fetch robots.txt for
    # each domain, buffer the fetch sets, fetch the pages, extract outlinks,
    # and finally fetch the linked content.
    extract_domain_files = yield _ExactDomainMapreducePipeline(
        job_name, params=params, shard_count=shards)
    robots_files = yield _RobotsFetchPipeline(job_name,
                                              extract_domain_files, shards)
    fetch_set_buffer_files = yield _FetchSetsBufferPipeline(
        job_name, robots_files)
    fetch_files = yield _FetchPagePipeline(job_name,
                                           fetch_set_buffer_files, shards)
    outlinks_files = yield _ExtractOutlinksPipeline(
        job_name, fetch_files, parser_params, shards)
    results_files = yield _FetchContentPipeline(job_name, outlinks_files,
                                                shards)
    temp_files = [
        extract_domain_files, robots_files, fetch_set_buffer_files,
        fetch_files
    ]
    # Once the final content fetch has completed, delete the intermediate
    # files listed above.
    with pipeline.After(results_files):
      all_temp_files = yield pipeline_common.Extend(*temp_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)