def map_reduce_queryset(queryset, map_func, reduce_func, output_writer, *args, **kwargs):
    """
        Does a complete map-shuffle-reduce over the queryset

        output_writer should be a mapreduce OutputWriter subclass

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs)
    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name", "Map reduce task over {}".format(queryset.model))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(DjangoInputReader),
        output_writer,
        mapper_params={
            "input_reader": DjangoInputReader.params_from_queryset(queryset),
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)

    pipeline.start(queue_name=_queue_name)
    return pipeline

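# Hedged usage sketch for map_reduce_queryset above. Everything here is
# illustrative and not taken from the codebase: `Post` is a hypothetical
# Django model, and the GCS output writer plus its `bucket_name` kwarg are
# assumed to match the installed appengine-mapreduce version.
from mapreduce import output_writers


def count_posts_by_author(post):
    # Mapper: emit (key, value) pairs for the shuffle phase.
    yield (str(post.author_id), "1")


def sum_post_counts(author_id, counts):
    # Reducer: receives the key and an iterator of its shuffled values.
    yield "%s,%d\n" % (author_id, sum(int(c) for c in counts))


mr_pipeline = map_reduce_queryset(
    Post.objects.all(),
    count_posts_by_author,
    sum_post_counts,
    output_writers.GoogleCloudStorageOutputWriter,
    _shards=4,
    _queue_name="mapreduce-queue",
    _output_writer_kwargs={"bucket_name": "my-bucket"},  # assumed writer kwargs
)
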
def run_pipeline(self, pipeline, *args, **kwargs):
    """Runs the pipeline and returns outputs."""
    require_slots_filled = kwargs.pop('_require_slots_filled', True)
    task_retry = kwargs.pop('_task_retry', True)

    pipeline.task_retry = task_retry
    pipeline.start(*args, **kwargs)
    while True:
        task_list = self.get_tasks()
        if not task_list:
            break
        for task in task_list:
            self.run_task(task)
            delete_tasks([task])

    if require_slots_filled:
        for slot_record in _SlotRecord.all():
            self.assertEquals(_SlotRecord.FILLED, slot_record.status,
                              '_SlotRecord = %r' % slot_record.key())
        for barrier_record in _BarrierRecord.all():
            self.assertEquals(_BarrierRecord.FIRED, barrier_record.status,
                              '_BarrierRecord = %r' % barrier_record.key())
        for pipeline_record in _PipelineRecord.all():
            self.assertEquals(_PipelineRecord.DONE, pipeline_record.status,
                              '_PipelineRecord = %r' % pipeline_record.key())

    return pipeline.__class__.from_id(pipeline.pipeline_id).outputs

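# Hedged sketch of how the run_pipeline test helper above might be used.
# `EchoPipeline` and the `TaskRunningTestBase` base class (which is assumed to
# provide get_tasks/run_task and the unittest assertions) are illustrative
# assumptions; the App Engine pipeline package is assumed importable as
# `pipeline`.
import pipeline


class EchoPipeline(pipeline.Pipeline):
    def run(self, value):
        # Returning a value fills the pipeline's default output slot.
        return value


class EchoPipelineTest(TaskRunningTestBase):  # hypothetical test base class
    def test_echo(self):
        outputs = self.run_pipeline(EchoPipeline('hello'),
                                    _require_slots_filled=True)
        self.assertEquals('hello', outputs.default.value)
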
def runner(type, input):
    # Note: `type` and `input` shadow built-ins; the names are kept to
    # preserve the original signature.
    f = str(input.read())
    separator = ',' if type == 'csv' else ' '
    if type in ('csv', 'txt'):
        # Delimited mode: each line with more than two fields is passed
        # positionally to pipeline.start().
        for data in f.split('\n'):
            data = data.split(separator)
            if len(data) > 2:
                pipeline.start(*data)
    else:
        # Otherwise the file content is evaluated as a Python literal
        # (a list of dicts); note that eval() trusts the input completely.
        for data in eval(f):
            pipeline.start(data['email'], data['max_posts'], data['max_likes'])
    print('>>> your config has been sent to the worker and will be done soon.')

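# Hedged examples of inputs the runner above accepts. The field meanings
# (email, max_posts, max_likes) are inferred from the non-csv branch and the
# concrete values are assumptions.
import io

# csv/txt mode: lines with more than two separator-delimited fields are
# forwarded positionally to pipeline.start().
runner('csv', io.StringIO(u'alice@example.com,10,25\nbob@example.com,5,50'))

# any other mode: the file must contain a Python literal (it is eval'd),
# e.g. a list of dicts with 'email', 'max_posts' and 'max_likes' keys.
runner('dict', io.StringIO(
    u"[{'email': 'alice@example.com', 'max_posts': 10, 'max_likes': 25}]"))
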
def migrate_blob(blob_info, _mapper_params=None):
    """Starts a mapper pipeline to migrate single blob to cloud storage object.

    Args:
      blob_info: The blob to migrate.
      _mapper_params: Allows injection of mapper parameters for testing.

    Yields:
      Various MapReduce counter operations.
    """
    params = _mapper_params or context.get().mapreduce_spec.mapper.params
    bucket_name = params['bucket_name']

    yield counters.Increment('BlobInfo_considered_for_migration')

    blob_key_str = _get_blob_key_str(blob_info)

    # dev_appserver's stubs store the GCS blobs in the same place as blobstore
    # blobs. We'll skip these so our testing is cleaner.
    if (appengine_config.IS_DEVSERVER and
            blob_key_str.startswith('encoded_gs_file:')):
        yield counters.Increment(
            'BlobInfo_is_really_GCS_file_on_dev_appserver__skipping')
        raise StopIteration()

    # look up the blob_key in the migration table; if already migrated, skip it
    already_mapped = models.BlobKeyMapping.build_key(blob_key_str).get()
    if already_mapped:
        yield counters.Increment('BlobInfo_previously_migrated')
        raise StopIteration()  # no work to do for this blob

    # if the blob is "small", migrate it in-line
    if blob_info.size <= config.config.DIRECT_MIGRATION_MAX_SIZE:
        migrate_single_blob_inline(blob_info, bucket_name)
        yield counters.Increment('BlobInfo_migrated_within_mapper')
    # else start a full-scale pipeline to handle the blob migration
    else:
        pipeline = MigrateSingleBlobPipeline(blob_key_str,
                                             blob_info.filename,
                                             blob_info.content_type,
                                             bucket_name)
        pipeline.start(queue_name=config.config.QUEUE_NAME)
        yield counters.Increment('BlobInfo_migrated_via_secondary_pipeline')

    yield counters.Increment('BlobInfo_migrated')
    raise StopIteration()

def map_reduce_entities(kind_name, namespace, map_func, reduce_func, output_writer, *args, **kwargs):
    """
        Does a complete map-shuffle-reduce over the entities

        output_writer should be a mapreduce OutputWriter subclass

        _filters is an optional kwarg which will be passed directly to the input reader

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs, additional={"_filters"})
    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name", "Map reduce task over {}".format(kind_name))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(RawDatastoreInputReader),
        output_writer,
        mapper_params={
            'input_reader': {
                RawDatastoreInputReader.ENTITY_KIND_PARAM: kind_name,
                RawDatastoreInputReader.NAMESPACE_PARAM: namespace,
                RawDatastoreInputReader.FILTERS_PARAM: options.pop("_filters", [])
            },
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)

    pipeline.start(queue_name=_queue_name)
    return pipeline

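# Hedged usage sketch for map_reduce_entities above. The kind name, filter
# tuple and counting map/reduce functions are illustrative assumptions; the
# output writer is the same assumed GCS writer as in the earlier sketch.
from mapreduce import output_writers


def emit_city(entity):
    # RawDatastoreInputReader hands raw datastore entities (dict-like) to the mapper.
    yield (entity.get('city', 'unknown'), "1")


def count_city(city, counts):
    yield "%s,%d\n" % (city, len(list(counts)))


mr_pipeline = map_reduce_entities(
    "UserProfile",                     # datastore kind (assumed)
    "",                                # default namespace
    emit_city,
    count_city,
    output_writers.GoogleCloudStorageOutputWriter,
    _filters=[("active", "=", True)],  # passed straight to the input reader
    _shards=8,
    _output_writer_kwargs={"bucket_name": "my-bucket"},
)
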
def map_reduce_entities(kind_name, map_func, reduce_func, output_writer, *args, **kwargs):
    """
        Does a complete map-shuffle-reduce over the entities

        output_writer should be a mapreduce OutputWriter subclass

        Returns the pipeline
    """
    map_func = qualname(map_func)
    reduce_func = qualname(reduce_func)
    output_writer = qualname(output_writer)

    options = extract_options(kwargs)
    _shards = options.pop("_shards", None)
    _job_name = options.pop("_job_name", "Map reduce task over {}".format(kind_name))
    _queue_name = options.pop("_queue_name", 'default')

    pipeline = MapreducePipeline(
        _job_name,
        map_func,
        reduce_func,
        qualname(RawDatastoreInputReader),
        output_writer,
        mapper_params={
            'input_reader': {
                RawDatastoreInputReader.ENTITY_KIND_PARAM: kind_name
            },
        },
        reducer_params={
            'output_writer': options.pop("_output_writer_kwargs", {}) or {}
        },
        shards=_shards)

    pipeline.start(queue_name=_queue_name)
    return pipeline

def run_job():
    pipeline = TouchPipeline()
    pipeline.start()
    return 'Job started'

""" To start Pipeline, run this script from maya's script editor or a shelf button Remember to alter the path below! """ import sys path_to_pipeline = '/Users/liorbenhorin/Documents/Projects/2016/GitHub/pipeline2' if not path_to_pipeline in sys.path: sys.path.append(path_to_pipeline) import pipeline # reload(pipeline) pipeline.start()