def run(self,
        job_name,
        mapper_spec,
        reducer_spec,
        input_reader_spec,
        output_writer_spec=None,
        mapper_params=None,
        reducer_params=None,
        shards=None):
  # Map, shuffle, then reduce; each stage consumes the previous stage's
  # output files.
  map_pipeline = yield MapPipeline(job_name,
                                   mapper_spec,
                                   input_reader_spec,
                                   params=mapper_params,
                                   shards=shards)
  shuffler_pipeline = yield ShufflePipeline(job_name, map_pipeline)
  reducer_pipeline = yield ReducePipeline(job_name,
                                          reducer_spec,
                                          output_writer_spec,
                                          reducer_params,
                                          shuffler_pipeline)
  # Delete the intermediate map and shuffle files, but only after the
  # reducer has finished consuming them.
  with pipeline.After(reducer_pipeline):
    all_temp_files = yield pipeline_common.Extend(
        map_pipeline, shuffler_pipeline)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)

  yield pipeline_common.Return(reducer_pipeline)
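
# Usage sketch (hedged): this assumes the method above belongs to a
# MapreducePipeline class. The handler paths, reader/writer class names, and
# parameter values below are illustrative assumptions in the style of the
# appengine-mapreduce word-count demo, not part of the code above.
job = MapreducePipeline(
    "word_count",
    "main.word_count_map",          # assumed mapper handler path
    "main.word_count_reduce",       # assumed reducer handler path
    "mapreduce.input_readers.BlobstoreZipInputReader",
    "mapreduce.output_writers.BlobstoreOutputWriter",
    mapper_params={"blob_key": blob_key},  # blob_key: assumed earlier upload
    reducer_params={"mime_type": "text/plain"},
    shards=16)
job.start()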

def run(self, job_name, filenames):
  hashed_files = yield _HashPipeline(job_name, filenames)
  sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
  merged_files = yield _MergePipeline(job_name, sorted_files)

  with pipeline.After(merged_files):
    all_temp_files = yield pipeline_common.Extend(
        hashed_files, sorted_files)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)

  yield pipeline_common.Return(merged_files)
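
# For context, a minimal sketch of the Extend combiner used above, assuming
# the appengine-pipeline library's common.Extend semantics: it flattens its
# list arguments into one list, so _CleanupPipeline receives every temporary
# filename from the hash and sort stages in a single argument.
class Extend(pipeline.Pipeline):
  """Combines several lists into one list."""

  def run(self, *args):
    combined = []
    for value in args:
      combined.extend(value)
    return combined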

def run(self, folderId, credentialsAsJson):
  credentials = OAuth2Credentials.from_json(credentialsAsJson)
  http = credentials.authorize(httplib2.Http())

  folderQuery = ('mimeType = "application/vnd.google-apps.folder" and '
                 'trashed = false')
  folderListPageToken = None
  deeperDocumentIdsFutures = []
  while True:
    # TODO(michaelcupino): Do this in a try except statement.
    request = config.getService().children().list(
        folderId=folderId, pageToken=folderListPageToken, maxResults=1000,
        q=folderQuery)
    folderList = request.execute(http=http)
    # Recursively fetch document ids for folders inside the current folder.
    # Default to an empty list in case the response has no 'items' key.
    for folder in folderList.get('items', []):
      deeperFolderId = folder.get('id')
      deeperDocumentIdsFuture = yield FolderFetcherPipeline(
          deeperFolderId, credentialsAsJson)
      deeperDocumentIdsFutures.append(deeperDocumentIdsFuture)
    folderListPageToken = folderList.get('nextPageToken')
    if not folderListPageToken:
      break

  documentQuery = ('mimeType = "application/vnd.google-apps.document" and '
                   'trashed = false')
  docListPageToken = None
  documentIds = []
  while True:
    # TODO(michaelcupino): Do this in a try except statement.
    request = config.getService().children().list(
        folderId=folderId, pageToken=docListPageToken, maxResults=1000,
        q=documentQuery)
    docList = request.execute(http=http)
    for document in docList.get('items', []):
      documentIds.append(document.get('id'))
    docListPageToken = docList.get('nextPageToken')
    if not docListPageToken:
      break

  # Combine this folder's document ids with those found in subfolders.
  yield common.Extend(documentIds, *deeperDocumentIdsFutures)
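
# Hedged launch sketch: rootFolderId is a placeholder, and `credentials` is
# assumed to be an oauth2client OAuth2Credentials object obtained during an
# earlier OAuth flow; neither appears in the original module.
stage = FolderFetcherPipeline(rootFolderId, credentials.to_json())
stage.start()
# After completion, stage.outputs.default.value holds the combined list of
# document ids for the folder tree.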

def run(self, job_name, filenames, shards=None):
  if files.shuffler.available():
    yield _ShuffleServicePipeline(job_name, filenames)
  else:
    hashed_files = yield _HashPipeline(job_name, filenames, shards=shards)
    sorted_files = yield _SortChunksPipeline(job_name, hashed_files)
    temp_files = [hashed_files, sorted_files]

    merged_files = yield _MergePipeline(job_name, sorted_files)

    with pipeline.After(merged_files):
      all_temp_files = yield pipeline_common.Extend(*temp_files)
      yield mapper_pipeline._CleanupPipeline(all_temp_files)

    yield pipeline_common.Return(merged_files)
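
# The shuffle-service branch above yields _ShuffleServicePipeline without an
# explicit Return. A small sketch (assumed names) of the pipeline-library
# behavior that branch appears to rely on: when a generator never yields
# common.Return, the default output of the last child it yielded becomes the
# generator's own default output.
class _Double(pipeline.Pipeline):
  def run(self, n):
    return n * 2

class Quadruple(pipeline.Pipeline):
  def run(self, n):
    doubled = yield _Double(n)
    yield _Double(doubled)  # last child's output becomes Quadruple's output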

def run(self, job_name, params, parser_params, shards=8):
  extract_domain_files = yield _ExactDomainMapreducePipeline(
      job_name, params=params, shard_count=shards)
  robots_files = yield _RobotsFetchPipeline(job_name, extract_domain_files,
                                            shards)
  fetch_set_buffer_files = yield _FetchSetsBufferPipeline(job_name,
                                                          robots_files)
  fetch_files = yield _FetchPagePipeline(job_name, fetch_set_buffer_files,
                                         shards)
  outlinks_files = yield _ExtractOutlinksPipeline(job_name, fetch_files,
                                                  parser_params, shards)
  results_files = yield _FetchContentPipeline(job_name, outlinks_files,
                                              shards)
  temp_files = [extract_domain_files,
                robots_files,
                fetch_set_buffer_files,
                fetch_files]

  with pipeline.After(results_files):
    all_temp_files = yield pipeline_common.Extend(*temp_files)
    yield mapper_pipeline._CleanupPipeline(all_temp_files)
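
# Hypothetical launch sketch: the FetcherPipeline class name and the contents
# of params and parser_params are assumptions inferred from the stage names
# above, not taken from the original project.
job = FetcherPipeline(
    "crawl_job",
    params={"entity_kind": "models.CrawlDbDatum"},       # assumed input kind
    parser_params={"text/html": "parsers.extract_outlinks"},  # assumed parser
    shards=8)
job.start()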