Code example #1
 def run(self):
     yield mapreduce_pipeline.MapperPipeline(
         "reset",
         handler_spec="restaurantfinder.etl.delete_restaurant",
         input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
         params={"entity_kind": "restaurantfinder.models.Restaurant"},
         shards=50)
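For reference, the `delete_restaurant` handler named by `handler_spec` is not part of this snippet. A minimal sketch of such a handler, assuming the standard `mapreduce.operation` module and that the mapper simply deletes each entity the `DatastoreInputReader` hands it (the function body below is illustrative, not the project's actual code):

from mapreduce import operation as op

def delete_restaurant(entity):
    # Hypothetical mapper: receive one Restaurant entity per call and yield a
    # delete operation, which the framework applies through its mutation pool.
    yield op.db.Delete(entity)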
Code example #2
 def run(self,
         job_name,
         gs_bucket_name,
         shards=8):
   file_names = []
   try:
     with files.open("/gs/"+gs_bucket_name+TARGETS_FILE_PATH, 'r') as fp:
       targets = fp.read()
       file_names = targets.splitlines()
   except Exception:
     logging.warning("Can't find file:" + TARGETS_FILE_PATH)
   
   if len(file_names)>0:
     yield mapreduce_pipeline.MapperPipeline(
       job_name,
       __name__ + "._to_datastore_map",
       "lakshmi.cooperate.input_readers.GoogleStorageLineInputReader",
       output_writer_spec="mapreduce.output_writers.BlobstoreOutputWriter",
       params={
         "input_reader":{
           "file_paths": file_names,
           "gs_bucket_name": gs_bucket_name
         },
         "output_writer":{
           "mime_type": "text/plain"
         }
       },
       shards=shards)
Code example #3
File: main.py Project: jeremi/log2bq
 def run(self, start_time, end_time, version_ids):
     message(
         self.root_pipeline_id,
         '<span class="label label-info">started</span> ae://logs <i class="icon-arrow-right"></i> gs://{{ bucket }} <a href="{{ base_path }}/status?root={{ root_pipeline_id }}#pipeline-{{ pipeline_id }}">pipeline</a>',
         bucket=gsbucketname,
         base_path=self.base_path,
         pipeline_id=self.pipeline_id)
     yield mapreduce_pipeline.MapperPipeline(
         "log2bq",
         "main.log2csv",
         "mapreduce.input_readers.LogInputReader",
         output_writer_spec="mapreduce.output_writers.FileOutputWriter",
         params={
             "input_reader": {
                 "start_time": start_time,
                 "end_time": end_time,
                 "version_ids": version_ids,
             },
             "output_writer": {
                 "filesystem": "gs",
                 "gs_bucket_name": gsbucketname,
             },
             "root_pipeline_id": self.root_pipeline_id,
         },
         shards=16)
Code example #4
 def run(self):
     params = {'urls': services.config.FIRST_AID_TOPIC_URLS}
     results = yield mapreduce_pipeline.MapperPipeline(
         "Update First Aid topic list",
         handler_spec=__name__ + ".topic_handler",
         input_reader_spec=__name__ + ".UrlInputReader",
         params=params,
         shards=2)
     yield SaveCounters(results.counters)
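`SaveCounters` is an application-specific pipeline that is not shown here. A minimal sketch of what such a pipeline could look like, assuming it only merges and returns the counter dictionaries produced by the finished mapper (any persistence step is left out; the class body is illustrative):

from mapreduce import base_handler

class SaveCounters(base_handler.PipelineBase):
    """Hypothetical pipeline that records the counters of finished mapper jobs."""

    def run(self, *counter_dicts):
        # Each argument is the `counters` output of a completed MapperPipeline.
        merged = {}
        for counters in counter_dicts:
            for name, value in counters.items():
                merged[name] = merged.get(name, 0) + value
        return merged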
Code example #5
 def run(self, *args, **kwargs):
     params = {
         'entity_kind': 'todo.models.user.User',
     }
     yield mapreduce_pipeline.MapperPipeline(
         'sync',
         'todo.pipelines.SyncPipeline.map',
         'mapreduce.input_readers.DatastoreInputReader',
         params=params)
Code example #6
File: pipelines.py Project: cloudysunny14/lakshmi
 def run(self, job_name, params, clean_all=False, shards=8):
     memcache.set(key=CLEAN_ALL_KEY, value=clean_all)
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._clean_map",
         "mapreduce.input_readers.DatastoreInputReader",
         output_writer_spec=output_writers.__name__ +
         ".BlobstoreOutputWriter",
         params=params,
         shards=shards)
Code example #7
File: pipelines.py Project: cloudysunny14/lakshmi
 def run(self, job_name, blob_keys, shards):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._robots_fetch_map",
         __name__ + "._RobotsLineInputReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "blob_keys": blob_keys,
         },
         shards=shards)
Code example #8
File: students.py Project: gayancliyanage/education
 def run(self, blob_key, year, skip_header=True, **kwargs):
     mapper_params = {"blob_keys": blob_key, "student_year": year}
     result = yield mapreduce_pipeline.MapperPipeline(
         "Process student csv file (year %s)" % year,
         handler_spec=__name__ + ".row_handler",
         input_reader_spec=(
             "mapreduce.input_readers.BlobstoreLineInputReader"),
         params=mapper_params,
         shards=4)
     yield ResetMemcache(result)
     yield SaveCounters(result.counters)
Code example #9
 def run(self, *args, **kwargs):
     """ run """
     mapper_params = {
         "entity_kind": "app.User",
     }
     yield mapreduce_pipeline.MapperPipeline(
         "Print all usernames",
         handler_spec="app.appengine_mapper",
         input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
         params=mapper_params,
     )
Code example #10
File: pipelines.py Project: cloudysunny14/lakshmi
 def run(self, job_name, file_names):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._makeFetchSetBufferMap",
         "mapreduce.input_readers.RecordsReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "files": file_names,
         },
         shards=len(file_names))
Code example #11
File: pipelines.py Project: cloudysunny14/lakshmi
 def run(self, job_name, file_names, shards=8):
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._fetchContentMap",
         "mapreduce.input_readers.RecordsReader",
         output_writer_spec=output_writers.__name__ +
         ".BlobstoreOutputWriter",
         params={
             "files": file_names,
         },
         shards=shards)
Code example #12
    def run(self):
        email_params = {
            "entity_kind": "education.core.models.Student",
            "output_sharding": "input"
        }
        email_blob = yield mapreduce_pipeline.MapperPipeline(
            "Query student secondary emails",
            handler_spec=__name__ + ".student_handler",
            input_reader_spec="mapreduce.input_readers.DatastoreInputReader",
            output_writer_spec=__name__ + ".EmailWriter",
            params=email_params,
            shards=2)

        stats_blob = yield mapreduce_pipeline.MapperPipeline(
            "Query Rosh Review for stats",
            handler_spec=__name__ + ".emails_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            output_writer_spec="education.tasks.utils.ValueWriter",
            params=(yield BlobKeys(email_blob)),
            shards=2)

        filtered_stats_blob = yield mapreduce_pipeline.MapperPipeline(
            "Validate Rosh Review for stats",
            handler_spec=__name__ + ".stats_validation_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            output_writer_spec="education.tasks.utils.ValueWriter",
            params=(yield BlobKeys(stats_blob)),
            shards=2)

        results = yield mapreduce_pipeline.MapperPipeline(
            "Process Rosh Review stats",
            handler_spec=__name__ + ".stats_handler",
            input_reader_spec=(
                "mapreduce.input_readers.BlobstoreLineInputReader"),
            params=(yield BlobKeys(filtered_stats_blob)),
            shards=2)

        yield SaveCounters(email_blob.counters, stats_blob.counters,
                           filtered_stats_blob.counters, results.counters)
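`BlobKeys` above adapts the output of one MapperPipeline (a list of blobstore file paths) into the `blob_keys` parameter expected by `BlobstoreLineInputReader`. The class itself is not shown; a minimal sketch following the pattern from the App Engine MapReduce documentation (the exact path format is an assumption):

from mapreduce import base_handler

class BlobKeys(base_handler.PipelineBase):
    """Hypothetical helper: strips '/blobstore/<key>' paths down to blob keys."""

    def run(self, keys):
        return {"blob_keys": [k.split("/")[-1] for k in keys]}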
Code example #13
File: pipelines.py Project: cloudysunny14/lakshmi
 def run(self, job_name, file_names, parser_params, shard_count=8):
     _set_parser_param(_PARSER_PARAM_KEY, parser_params)
     yield mapreduce_pipeline.MapperPipeline(
         job_name,
         __name__ + "._extract_content_urls_map",
         __name__ + "._RobotsLineInputReader",
         output_writer_spec=output_writers.__name__ +
         ".KeyValueBlobstoreOutputWriter",
         params={
             "blob_keys": file_names,
         },
         shards=shard_count)
Code example #14
 def run(self, filenames):
     yield mapreduce_pipeline.MapperPipeline(
         "sort",
         __name__ + ".test_handler_yield_str",
         shuffler.__name__ + "._MergingReader",
         output_writers.__name__ + ".BlobstoreRecordsOutputWriter",
         params={
             shuffler._MergingReader.FILES_PARAM: [filenames],
             shuffler._MergingReader.MAX_VALUES_COUNT_PARAM:
             shuffler._MergePipeline._MAX_VALUES_COUNT,
             shuffler._MergingReader.MAX_VALUES_SIZE_PARAM:
             shuffler._MergePipeline._MAX_VALUES_SIZE,
         },
     )
Code example #15
 def run(self, *args, **kwargs):
     params = {
         'entity_kind': 'todo.models.user.User',
         'output_writer': {
             'bucket_name': app_identity.get_default_gcs_bucket_name(),
             'content_type': 'text/plain',
         },
     }
     yield mapreduce_pipeline.MapperPipeline(
         'export',
         'todo.pipelines.ExportPipeline.map',
         'mapreduce.input_readers.DatastoreInputReader',
         'mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter',
         params=params)
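The handler path 'todo.pipelines.ExportPipeline.map' points at a method that is not included in this snippet. A minimal hypothetical sketch, defined on ExportPipeline and assuming `User` is an ndb model whose yielded strings become lines in the GCS output file:

    @staticmethod
    def map(user):
        # Hypothetical mapper: the DatastoreInputReader passes each User
        # entity; emit one newline-terminated record per entity.
        yield "%s\n" % user.key.id()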
Code example #16
    def run(self, str_start_date, str_end_date):
        start_date = self.str_to_date(str_start_date)
        end_date = self.str_to_date(str_end_date)

        url_pattern = services.config.FIRST_AID_REPORT_URL_PATTERN

        with InOrder():
            query_params = {
                'urls': list(self._urls(url_pattern, start_date, end_date))
            }
            query_results = yield mapreduce_pipeline.MapperPipeline(
                ("Fetch First Aid stats (%s to %s)" % (
                    str_start_date,
                    str_end_date,
                )),
                handler_spec=__name__ + ".query_handler",
                input_reader_spec=__name__ + ".UrlInputReader",
                params=query_params,
                shards=2)

            stats_mapper_params = {
                'entity_kind': 'education.core.models.Student',
                'filters': [(
                    'is_setup',
                    '=',
                    True,
                )]
            }
            build_result = yield mapreduce_pipeline.MapperPipeline(
                "Rebuild stats",
                handler_spec=__name__ + ".student_handler",
                params=stats_mapper_params,
                input_reader_spec=(
                    "mapreduce.input_readers.DatastoreInputReader"),
                shards=4)

        yield SaveCounters(query_results.counters, build_result.counters)
Code example #17
 def run(self, bucket_name, filenames):
     yield mapreduce_pipeline.MapperPipeline(
         "sort",
         __name__ + ".test_handler_yield_str",
         shuffler.__name__ + "._MergingReader",
         output_writers.__name__ + "._GoogleCloudStorageRecordOutputWriter",
         params={
             shuffler._MergingReader.FILES_PARAM: [filenames],
             shuffler._MergingReader.MAX_VALUES_COUNT_PARAM:
             shuffler._MergePipeline._MAX_VALUES_COUNT,
             shuffler._MergingReader.MAX_VALUES_SIZE_PARAM:
             shuffler._MergePipeline._MAX_VALUES_SIZE,
             "output_writer": {
                 "bucket_name": bucket_name,
             },
         },
     )
Code example #18
 def run(self, entity_type):
     output = yield mapreduce_pipeline.MapperPipeline(
         "Datastore Mapper %s" % entity_type,
         "main.datastore_map",
         "mapreduce.input_readers.DatastoreInputReader",
         output_writer_spec="mapreduce.output_writers.FileOutputWriter",
         params={
             "input_reader": {
                 "entity_kind": entity_type,
             },
             "output_writer": {
                 "filesystem": "gs",
                 "gs_bucket_name": GS_BUCKET,
                 "output_sharding": "none",
             }
         },
         shards=12)
     yield CloudStorageToBigQuery(output)
Code example #19
File: log2bq.py Project: pboothe/mlab-ns
 def run(self, start_time, end_time, version_ids):
   # Create a MapperPipeline w/ `LogInputReader`, `FileOutputWriter`
   yield mapreduce_pipeline.MapperPipeline(
       "log2bq",
       "mlabns.handlers.log2bq.log2csv",
       "mapreduce.input_readers.LogInputReader",
       "mapreduce.output_writers.FileOutputWriter",
       params={
           "input_reader" : {
               "start_time": start_time,
               "end_time": end_time,
               "minimum_log_level": logservice.LOG_LEVEL_INFO,
               "version_ids": version_ids,
               },
           "output_writer" : {
               "filesystem": "gs",
               "gs_bucket_name": config.gs_bucket_name,
               }
           },
       shards=16)
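The `log2csv` handler referenced above is not included here. `LogInputReader` feeds `RequestLog` records to the mapper, which is expected to yield newline-terminated rows for `FileOutputWriter`; a minimal sketch under that assumption (the chosen columns are illustrative, not the project's actual schema):

def log2csv(request_log):
    # Hypothetical mapper: serialize a few RequestLog fields as one CSV row.
    yield "%f,%s,%s,%d\n" % (request_log.start_time,
                             request_log.method,
                             request_log.resource,
                             request_log.status)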
Code example #20
    def run(self, bucket_name):
        """Copies all blobs.

    Args:
      bucket_name: the bucket to copy the blobs into.

    Yields:
      A MapperPipeline for the MapReduce job to copy the blobs.
    """
        if not bucket_name:
            raise ValueError('bucket_name is required.')
        params = {
            'entity_kind': 'google.appengine.ext.blobstore.blobstore.BlobInfo',
            'bucket_name': bucket_name,
        }
        yield mapreduce_pipeline.MapperPipeline(
            'iterate_blobs',
            'app.migrator.migrate_blob',
            'app.migrator.BlobstoreDatastoreInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
Code example #21
    def run(self, blob_key_str, filename, content_type, bucket_name):
        """Copies a single blob.

    Args:
      blob_key_str: The BlobKey's encrypted string.
      filename: An optional filename from the blob being copied.
      content_type: The content-type for the blob.
      bucket_name: The bucket to copy the blob into.

    Yields:
      Pipelines to copy the blob and store the mapping results in Datastore.
    """
        output_writer_params = {
            'bucket_name': bucket_name,
            'content_type': content_type,
            'naming_format': build_gcs_filename(blob_key_str,
                                                filename=filename),
        }
        if filename:
            output_writer_params['content_disposition'] = (
                build_content_disposition(filename.encode('utf8')))

        params = {
            'blob_key': blob_key_str,
            'blob_keys': blob_key_str,
            'bucket_name': bucket_name,
            'output_writer': output_writer_params,
        }

        output = yield mapreduce_pipeline.MapperPipeline(
            'copy_blob_to_gcs',
            'app.migrator.yield_data',
            'app.migrator.BlobstoreInputReader',
            output_writer_spec=
            'mapreduce.output_writers.GoogleCloudStorageConsistentOutputWriter',
            params=params,
            shards=1)  # must be 1 because no reducer in MapperPipeline

        yield StoreMappingEntity(blob_key_str, output)
Code example #22
 def run(self,
         job_name,
         input_entity_kind,
         gs_bucket_name,
         shards=8):
   file_names = yield mapreduce_pipeline.MapperPipeline(
       job_name,
       __name__ + "._to_csv_map",
       "mapreduce.input_readers.DatastoreInputReader",
       output_writer_spec="mapreduce.output_writers.FileOutputWriter",
       params={
         "input_reader":{
             "entity_kind": input_entity_kind,
             },
         "output_writer":{
             "filesystem": "gs",
             "gs_bucket_name": gs_bucket_name,
             "output_sharding":"none",
             }
       },
       shards=shards)
   yield SaveResultFileNames(file_names, gs_bucket_name)
Code example #23
    def run(self):
        """Deletes the blobstore blobs.

    Be extremely careful with this pipeline. This pipeline is used
    to delete the blobstore blobs that have been migrated.

    You must ensure that the blobs have been correctly migrated before
    invoking this pipeline.

    THERE IS NO TURNING BACK!

    Yields:
      A MapperPipeline for the MapReduce job to delete the source blobs.
    """
        params = {
            'entity_kind': models.BlobKeyMapping._get_kind(),
        }
        yield mapreduce_pipeline.MapperPipeline(
            'delete_mapping_entities',
            'app.scrubber.delete_blobstore_blob',
            'mapreduce.input_readers.DatastoreKeyInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
Code example #24
    def run(self):
        """Deletes the mapping entiies created in Datastore.

    Be extremely careful with this pipeline. This pipeline is provided
    for convenience in case you need to fully run another blob migration,
    e.g., because you migrated to the wrong bucket the first time.

    If you run this pipeline after deleting the source blobs, you have
    no way to map from old blob keys to new GCS files and it may be
    extremely difficult to use the new GCS files.

    Yields:
      A MapperPipeline for the MapReduce job to delete the mapping entities.
    """
        params = {
            'entity_kind': models.BlobKeyMapping._get_kind(),
        }
        yield mapreduce_pipeline.MapperPipeline(
            'delete_mapping_entities',
            'app.scrubber.delete_mapping_entity',
            'mapreduce.input_readers.DatastoreKeyInputReader',
            params=params,
            shards=config.config.NUM_SHARDS)
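`delete_mapping_entity` is not shown in this snippet. Because `DatastoreKeyInputReader` hands the mapper entity keys rather than full entities, a minimal hypothetical sketch only needs to yield a delete operation per key, assuming the standard `mapreduce.operation` module (whose mutation pool accepts keys as well as model instances):

from mapreduce import operation as op

def delete_mapping_entity(entity_key):
    # Hypothetical mapper: DatastoreKeyInputReader yields datastore keys; queue
    # a delete for each one via the framework's mutation pool.
    yield op.db.Delete(entity_key)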