Example #1
def run(self):
     yield mapreduce_pipeline.MapperPipeline(
         "reset",
         handler_spec="restaurantfinder.etl.delete_restaurant",
         input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
         params={"entity_kind": "restaurantfinder.models.Restaurant"},
         shards=50)
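The mapper it names is not part of this snippet; a minimal sketch of what restaurantfinder.etl.delete_restaurant might look like, assuming the standard mapreduce operation module, is:

from mapreduce import operation as op

def delete_restaurant(entity):
    # Hypothetical sketch: DatastoreInputReader passes one Restaurant
    # entity per call; yield a delete operation so the framework removes
    # it through its mutation pool.
    yield op.db.Delete(entity)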
Example #2
 def run(self,
         job_name,
         gs_bucket_name,
         shards=8):
   file_names = []
   try:
     with files.open("/gs/"+gs_bucket_name+TARGETS_FILE_PATH, 'r') as fp:
       targets = fp.read()
       file_names = targets.splitlines()
   except Exception:
     logging.warning("Can't find file: " + TARGETS_FILE_PATH)

   if file_names:
     yield mapreduce_pipeline.MapperPipeline(
       job_name,
       __name__ + "._to_datastore_map",
       "lakshmi.cooperate.input_readers.GoogleStorageLineInputReader",
       output_writer_spec="mapreduce.output_writers.BlobstoreOutputWriter",
       params={
         "input_reader":{
           "file_paths": file_names,
           "gs_bucket_name": gs_bucket_name
         },
         "output_writer":{
           "mime_type": "text/plain"
         }
       },
       shards=shards)
Example #3
 def run(self, start_time, end_time, version_ids):
     message(
         self.root_pipeline_id,
         '<span class="label label-info">started</span> ae://logs <i class="icon-arrow-right"></i> gs://{{ bucket }} <a href="{{ base_path }}/status?root={{ root_pipeline_id }}#pipeline-{{ pipeline_id }}">pipeline</a>',
         bucket=gsbucketname,
         base_path=self.base_path,
         pipeline_id=self.pipeline_id)
     yield mapreduce_pipeline.MapperPipeline(
         "log2bq",
         "main.log2csv",
         "mapreduce.input_readers.LogInputReader",
         output_writer_spec="mapreduce.output_writers.FileOutputWriter",
         params={
             "input_reader": {
                 "start_time": start_time,
                 "end_time": end_time,
                 "version_ids": version_ids,
             },
             "output_writer": {
                 "filesystem": "gs",
                 "gs_bucket_name": gsbucketname,
             },
             "root_pipeline_id": self.root_pipeline_id,
         },
         shards=16)
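The handler named by "main.log2csv" is not shown; a hedged sketch, assuming it receives logservice.RequestLog records from LogInputReader and emits CSV lines for FileOutputWriter, could be:

def log2csv(request_log):
    # Hypothetical sketch: turn one RequestLog record into a CSV line
    # that FileOutputWriter appends to the Cloud Storage file.
    yield "%f,%s,%s,%d\n" % (request_log.start_time,
                             request_log.method,
                             request_log.resource,
                             request_log.status)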
Example #4
 def run(self):
     params = {'urls': services.config.FIRST_AID_TOPIC_URLS}
     results = yield mapreduce_pipeline.MapperPipeline(
         "Update First Aid topic list",
         handler_spec=__name__ + ".topic_handler",
         input_reader_spec=__name__ + ".UrlInputReader",
         params=params,
         shards=2)
     yield SaveCounters(results.counters)
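SaveCounters persists whatever counters the mapper incremented; the referenced topic_handler is not included here, but a hedged sketch that feeds such counters (the handler body and counter name are illustrative) might be:

from mapreduce import operation as op

def topic_handler(url):
    # Hypothetical sketch: UrlInputReader is assumed to yield one topic
    # URL per call; after processing it, bump a counter for SaveCounters.
    # ... fetch and parse the topic page here ...
    yield op.counters.Increment("topics_processed")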
Example #5
 def run(self, *args, **kwargs):
     params = {
         'entity_kind': 'todo.models.user.User',
     }
     yield mapreduce_pipeline.MapperPipeline(
         'sync',
         'todo.pipelines.SyncPipeline.map',
         'mapreduce.input_readers.DatastoreInputReader',
         params=params)
Example #6
 def run(self, job_name, params, clean_all=False, shards=8):
     memcache.set(key=CLEAN_ALL_KEY, value=clean_all)
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._clean_map",
         "mapreduce.input_readers.DatastoreInputReader",
         output_writer_spec=output_writers.__name__ +
         ".BlobstoreOutputWriter",
         params=params,
         shards=shards)
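The clean_all flag is stashed in memcache so every shard can read it; a hedged sketch of how the referenced _clean_map might use it (the real handler is not shown):

from google.appengine.api import memcache
from mapreduce import operation as op

def _clean_map(entity):
    # Hypothetical sketch: each shard reads the shared flag that run()
    # stored under the module constant CLEAN_ALL_KEY; when it is set,
    # delete the entity and emit a line for the Blobstore output file
    # recording what was removed.
    if memcache.get(CLEAN_ALL_KEY):
        yield op.db.Delete(entity)
        yield "%s\n" % entity.key()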
Example #7
 def run(self, job_name, blob_keys, shards):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._robots_fetch_map",
         __name__ + "._RobotsLineInputReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "blob_keys": blob_keys,
         },
         shards=shards)
Example #8
 def run(self, blob_key, year, skip_header=True, **kwargs):
     mapper_params = {"blob_keys": blob_key, "student_year": year}
     result = yield mapreduce_pipeline.MapperPipeline(
         "Process student csv file (year %s)" % year,
         handler_spec=__name__ + ".row_handler",
         input_reader_spec=(
             "mapreduce.input_readers.BlobstoreLineInputReader"),
         params=mapper_params,
         shards=4)
     yield ResetMemcache(result)
     yield SaveCounters(result.counters)
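row_handler itself is not part of the snippet; a hedged sketch, assuming the usual BlobstoreLineInputReader contract of (byte_offset, line) tuples and an illustrative CSV layout:

import csv

from mapreduce import operation as op

def row_handler(data):
    # Hypothetical sketch: BlobstoreLineInputReader yields
    # (byte_offset, line) tuples; skip the header row at offset 0, parse
    # the rest as CSV, and count processed rows for SaveCounters.
    byte_offset, line = data
    if byte_offset == 0:
        return
    row = csv.reader([line]).next()
    # ... create or update the Student record from `row` here ...
    yield op.counters.Increment("rows_processed")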
Example #9
 def run(self, *args, **kwargs):
     """ run """
     mapper_params = {
         "entity_kind": "app.User",
     }
     yield mapreduce_pipeline.MapperPipeline(
         "Print all usernames",
         handler_spec="app.appengine_mapper",
         input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
         params=mapper_params,
     )
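A minimal sketch of the kind of mapper "app.appengine_mapper" points at (the real one is not shown, and the username attribute is assumed):

import logging

def appengine_mapper(user):
    # Hypothetical sketch: DatastoreInputReader hands the mapper one
    # app.User entity at a time; just log its (assumed) username field.
    logging.info("username=%s", user.username)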
Example #10
 def run(self, job_name, file_names):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._makeFetchSetBufferMap",
         "mapreduce.input_readers.RecordsReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "files": file_names,
         },
         shards=len(file_names))
Example #11
 def run(self, job_name, file_names, shards=8):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._fetchContentMap",
         "mapreduce.input_readers.RecordsReader",
         output_writer_spec=output_writers.__name__ +
         ".BlobstoreOutputWriter",
         params={
             "files": file_names,
         },
         shards=shards)
Example #12
    def run(self):
        email_params = {
            "entity_kind": "education.core.models.Student",
            "output_sharding": "input"
        }
        email_blob = yield mapreduce_pipeline.MapperPipeline(
            "Query student secondary emails",
            handler_spec=__name__ + ".student_handler",
            input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
            output_writer_spec=__name__ + ".EmailWriter",
            params=email_params,
            shards=2)

        stats_blob = yield mapreduce_pipeline.MapperPipeline(
            "Query Rosh Review for stats",
            handler_spec=__name__ + ".emails_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            output_writer_spec="education.tasks.utils.ValueWriter",
            params=(yield BlobKeys(email_blob)),
            shards=2)

        filtered_stats_blob = yield mapreduce_pipeline.MapperPipeline(
            "Validate Rosh Review for stats",
            handler_spec=__name__ + ".stats_validation_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            output_writer_spec="education.tasks.utils.ValueWriter",
            params=(yield BlobKeys(stats_blob)),
            shards=2)

        results = yield mapreduce_pipeline.MapperPipeline(
            "Process Rosh Review stats",
            handler_spec=__name__ + ".stats_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            params=(yield BlobKeys(filtered_stats_blob)),
            shards=2)

        yield SaveCounters(email_blob.counters, stats_blob.counters,
                           filtered_stats_blob.counters, results.counters)
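The chaining above relies on a BlobKeys helper pipeline that turns the previous writer's output file names into BlobstoreLineInputReader parameters. Its implementation is not shown; in the App Engine MapReduce samples this helper is commonly written roughly as follows (a sketch, not necessarily the exact class used here):

from mapreduce import base_handler

class BlobKeys(base_handler.PipelineBase):
    """Turns output file paths like '/blobstore/<key>' into mapper params."""

    def run(self, keys):
        # Keep only the trailing blob key from each output file path.
        return {
            "blob_keys": [k.split("/")[-1] for k in keys],
        }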
Example #13
 def run(self, job_name, file_names, parser_params, shard_count=8):
     _set_parser_param(_PARSER_PARAM_KEY, parser_params)
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._extract_content_urls_map",
         __name__ + "._RobotsLineInputReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "blob_keys": file_names,
         },
         shards=shard_count)
Example #14
 def run(self, filenames):
     yield mapreduce_pipeline.MapperPipeline(
         "sort",
         __name__ + ".test_handler_yield_str",
         shuffler.__name__ + "._MergingReader",
         output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
         params={
             shuffler._MergingReader.FILES_PARAM: [filenames],
             shuffler._MergingReader.MAX_VALUES_COUNT_PARAM:
             shuffler._MergePipeline._MAX_VALUES_COUNT,
             shuffler._MergingReader.MAX_VALUES_SIZE_PARAM:
             shuffler._MergePipeline._MAX_VALUES_SIZE,
         },
     )
Example #15
 def run(self, *args, **kwargs):
     params = {
         'entity_kind': 'todo.models.user.User',
         'output_writer': {
             'bucket_name': app_identity.get_default_gcs_bucket_name(),
             'content_type': 'text/plain',
         },
     }
     yield mapreduce_pipeline.MapperPipeline(
         'export',
         'todo.pipelines.ExportPipeline.map',
         'mapreduce.input_readers.DatastoreInputReader',
         'mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter',
         params=params)
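The map function for the export is not shown; a hedged sketch, assuming an ndb User model and noting that the real handler is a method on ExportPipeline rather than a module-level function:

def map(user):
    # Hypothetical sketch of todo.pipelines.ExportPipeline.map: serialize
    # one User entity (assumed ndb) as one line of the Cloud Storage
    # export file written by GoogleCloudStorageConsistentOutputWriter.
    yield "%s\n" % user.key.urlsafe()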
Example #16
    def run(self, str_start_date, str_end_date):
        start_date = self.str_to_date(str_start_date)
        end_date = self.str_to_date(str_end_date)

        url_pattern = services.config.FIRST_AID_REPORT_URL_PATTERN

        with InOrder():
            query_params = {
                'urls': list(self._urls(url_pattern, start_date, end_date))
            }
            query_results = yield mapreduce_pipeline.MapperPipeline(
                ("Fetch First Aid stats (%s to %s)" % (
                    str_start_date,
                    str_end_date,
                )),
                handler_spec=__name__ + ".query_handler",
                input_reader_spec=__name__ + ".UrlInputReader",
                params=query_params,
                shards=2)

            stats_mapper_params = {
                'entity_kind': 'education.core.models.Student',
                'filters': [(
                    'is_setup',
                    '=',
                    True,
                )]
            }
            build_result = yield mapreduce_pipeline.MapperPipeline(
                "Rebuild stats",
                handler_spec=__name__ + ".student_handler",
                params=stats_mapper_params,
                input_reader_spec=(
                    "mapreduce.input_readers.DatastoreInputReader"),
                shards=4)

        yield SaveCounters(query_results.counters, build_result.counters)
Example #17
 def run(self, bucket_name, filenames):
     yield mapreduce_pipeline.MapperPipeline(
         "sort",
         __name__ + ".test_handler_yield_str",
         shuffler.__name__ + "._MergingReader",
         output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter",
         params={
             shuffler._MergingReader.FILES_PARAM: [filenames],
             shuffler._MergingReader.MAX_VALUES_COUNT_PARAM:
             shuffler._MergePipeline._MAX_VALUES_COUNT,
             shuffler._MergingReader.MAX_VALUES_SIZE_PARAM:
             shuffler._MergePipeline._MAX_VALUES_SIZE,
             "output_writer": {
                 "bucket_name": bucket_name,
             },
         },
     )
Example #18
 def run(self, entity_type):
     output = yield mapreduce_pipeline.MapperPipeline(
         "Datastore Mapper %s" % entity_type,
         "main.datastore_map",
         "mapreduce.input_readers.DatastoreInputReader",
         output_writer_spec="mapreduce.output_writers.FileOutputWriter",
         params={
             "input_reader": {
                 "entity_kind": entity_type,
             },
             "output_writer": {
                 "filesystem": "gs",
                 "gs_bucket_name": GS_BUCKET,
                 "output_sharding": "none",
             }
         },
         shards=12)
     yield CloudStorageToBigQuery(output)
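The "main.datastore_map" handler is not included; a hedged sketch that would fit the CloudStorageToBigQuery step, assuming an ndb entity whose to_dict() values are JSON-serializable:

import json

def datastore_map(entity):
    # Hypothetical sketch: emit one newline-delimited JSON record per
    # entity so the resulting Cloud Storage file can be loaded into
    # BigQuery.
    yield json.dumps(entity.to_dict()) + "\n"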
Example #19
 def run(self, start_time, end_time, version_ids):
   # Create a MapperPipeline w/ `LogInputReader`, `FileOutputWriter`
   yield mapreduce_pipeline.MapperPipeline(
       "log2bq",
       "mlabns.handlers.log2bq.log2csv",
       "mapreduce.input_readers.LogInputReader",
       "mapreduce.output_writers.FileOutputWriter",
       params={
           "input_reader" : {
               "start_time": start_time,
               "end_time": end_time,
               "minimum_log_level": logservice.LOG_LEVEL_INFO,
               "version_ids": version_ids,
               },
           "output_writer" : {
               "filesystem": "gs",
               "gs_bucket_name": config.gs_bucket_name,
               }
           },
       shards=16)
Example #20
    def run(self, bucket_name):
        """Copies all blobs.

        Args:
          bucket_name: the bucket to copy the blobs into.

        Yields:
          A MapperPipeline for the MapReduce job to copy the blobs.
        """
        if not bucket_name:
            raise ValueError('bucket_name is required.')
        params = {
            'entity_kind': 'google.appengine.ext.blobstore.blobstore.BlobInfo',
            'bucket_name': bucket_name,
        }
        yield mapreduce_pipeline.MapperPipeline(
            'iterate_blobs',
            'app.migrator.migrate_blob',
            'app.migrator.BlobstoreDatastoreInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
Example #21
    def run(self, blob_key_str, filename, content_type, bucket_name):
        """Copies a single blob.

        Args:
          blob_key_str: The BlobKey's encrypted string.
          filename: An optional filename from the blob being copied.
          content_type: The content-type for the blob.
          bucket_name: The bucket to copy the blob into.

        Yields:
          Pipelines to copy the blob and store the mapping results in Datastore.
        """
        output_writer_params = {
            'bucket_name': bucket_name,
            'content_type': content_type,
            'naming_format': build_gcs_filename(blob_key_str,
                                                filename=filename),
        }
        if filename:
            output_writer_params['content_disposition'] = (
                build_content_disposition(filename.encode('utf8')))

        params = {
            'blob_key': blob_key_str,
            'blob_keys': blob_key_str,
            'bucket_name': bucket_name,
            'output_writer': output_writer_params,
        }

        output = yield mapreduce_pipeline.MapperPipeline(
            'copy_blob_to_gcs',
            'app.migrator.yield_data',
            'app.migrator.BlobstoreInputReader',
            output_writer_spec=
            'mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter',
            params=params,
            shards=1)  # must be 1 because no reducer in MapperPipeline

        yield StoreMappingEntity(blob_key_str, output)
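The copy works by streaming the blob's bytes straight into the GCS writer; a hedged sketch of what 'app.migrator.yield_data' might be (the custom BlobstoreInputReader's exact contract is not shown):

def yield_data(data):
    # Hypothetical sketch: pass whatever the custom reader produces
    # through to GoogleCloudStorageConsistentOutputWriter unchanged, so
    # the new GCS object is a byte-for-byte copy of the blob.
    yield data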
Example #22
 def run(self,
         job_name,
         input_entity_kind,
         gs_bucket_name,
         shards=8):
   file_names = yield mapreduce_pipeline.MapperPipeline(
       job_name,
       __name__ + "._to_csv_map",
       "mapreduce.input_readers.DatastoreInputReader",
       output_writer_spec="mapreduce.output_writers.FileOutputWriter",
       params={
         "input_reader":{
             "entity_kind": input_entity_kind,
             },
         "output_writer":{
             "filesystem": "gs",
             "gs_bucket_name": gs_bucket_name,
             "output_sharding":"none",
             }
       },
       shards=shards)
   yield SaveResultFileNames(file_names, gs_bucket_name)
Example #23
    def run(self):
        """Deletes the blobstore blobs.

        Be extremely careful with this pipeline. This pipeline is used
        to delete the blobstore blobs that have been migrated.

        You must ensure that the blobs have been correctly migrated before
        invoking this pipeline.

        THERE IS NO TURNING BACK!

        Yields:
          A MapperPipeline for the MapReduce job to delete the source blobs.
        """
        params = {
            'entity_kind': models.BlobKeyMapping._get_kind(),
        }
        yield mapreduce_pipeline.MapperPipeline(
            'delete_blobstore_blobs',
            'app.scrubber.delete_blobstore_blob',
            'mapreduce.input_readers.DatastoreKeyInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
Example #24
    def run(self):
        """Deletes the mapping entiies created in Datastore.

    Be extremely careful with this pipeline. This pipeline is provided
    for convenience in case you need to fully run another blob migration,
    e.g., because you migrated to the wrong bucket the first time.

    If you run this pipeline after deleting the source blobs, you have
    no way to map from old blob keys to new GCS files and it may be
    extremely difficult to use the new GCS files.

    Yields:
      A MapperPipeline for the MapReduce job to delete the mapping entities.
    """
        params = {
            'entity_kind': models.BlobKeyMapping._get_kind(),
        }
        yield mapreduce_pipeline.MapperPipeline(
            'delete_mapping_entities',
            'app.scrubber.delete_mapping_entity',
            'mapreduce.input_readers.DatastoreKeyInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
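The delete handlers referenced in the last two examples are not shown; a hedged sketch of a mapper written for DatastoreKeyInputReader, which hands the mapper raw keys rather than entities:

from google.appengine.ext import db

def delete_mapping_entity(key):
    # Hypothetical sketch: DatastoreKeyInputReader yields datastore keys;
    # delete each mapping entity directly by key.
    db.delete(key)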