Example #1
0
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key
  from apache_beam.io.gcp.datastore.v1new.types import Query

  project = 'my_project'
  kind = 'my_kind'
  query = Query(kind, project)

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    key = Key([kind, str(uuid.uuid4())])
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
Example #2
0
def create_multi_datasource_reader(pipeline,
                                   project,
                                   namespace,
                                   kinds,
                                   keys_only=False):
    if not kinds:
        kinds = [None]

    sources = []
    for kind in kinds:
        # namespace を指定しない(==None)と [default] namespace が使われる
        query = Query(project=project, namespace=namespace, kind=kind)
        if keys_only:
            # see
            # https://beam.apache.org/releases/pydoc/2.14.0/_modules/apache_beam/io/gcp/datastore/v1new/types.html#Query
            # https://google-cloud-python.readthedocs.io/en/0.32.0/_modules/google/cloud/datastore/query.html#Query.keys_only
            query.projection = ['__key__']
        if not kind:
            # kind を指定しない場合は明示的に __key__ asc でソートしないとエラーになる
            query.order = ['__key__']

        description = 'ReadFromDatastore kind={}'.format(kind if kind else "*")

        s = pipeline | description >> ReadFromDatastore(query=query)
        sources.append(s)
    return sources
Example #3
0
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://wordcounttest2/data/datatest.txt',
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromDatastore(known_args.input)
        types = type(lines)
        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(unicode))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s, %s' % (word, count, types)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
def read_from_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(project, user_options.kind,
                                user_options.namespace, user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(query)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return word, sum(ones)

    counts = (lines
              | 'split' >> beam.ParDo(WordExtractingDoFn())
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %s' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> beam.io.WriteToText(
        file_path_prefix=user_options.output,
        num_shards=user_options.num_shards)

    result = p.run()
    # Wait until completion, main thread would access post-completion job results.
    result.wait_until_finish()
    return result
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')

    test_metadata_entities = (
        p
        | 'ReadFromDatastore(TestMetadata)' >> ReadFromDatastore(
            Query(project=project, kind='TestMetadata')))

    # TODO: fetch SparseDiagnostics entities and join with TestMetadata here for
    # additional metadata.

    test_metadata_rows = (
        test_metadata_entities
        | 'ConvertEntityToRow(TestMetadata)' >> beam.FlatMap(
            ConvertEntity(TestMetadataEntityToRowDict, entities_read,
                          failed_entity_transforms)))

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.test_metadata`
  (test STRING NOT NULL,
   internal_only BOOLEAN NOT NULL,
   improvement_direction STRING,
   units STRING,
   has_rows BOOLEAN NOT NULL,
   deprecated BOOLEAN NOT NULL,
   description STRING,
   unescaped_story_name STRING,
   parent STRING,
   bot_group STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING NOT NULL,
   )
  CLUSTER BY bot_group, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_testmetadata_schema = {
        'fields': [
            # 'test' corresponds to the same column in the Rows export.
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'internal_only',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'improvement_direction',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'units',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'has_rows',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'deprecated',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'description',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'unescaped_story_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'parent',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            # bot_group, bot, and measurement correspond to same columns in the
            # Rows export.
            {
                'name': 'bot_group',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
        ],
    }

    def TableNameFn(unused_element):
        return '{project}:{dataset}.test_metadata{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (
        test_metadata_rows
        | 'WriteToBigQuery(test_metadata)' >> beam.io.WriteToBigQuery(
            TableNameFn,
            schema=bq_testmetadata_schema,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            # Cluster by the same columns as the Rows export, so that efficient
            # queries work the same way with this table (and to make efficient
            # joins with that table simpler).
            additional_bq_parameters={
                'clustering': {
                    'fields': ['bot_group', 'bot', 'measurement']
                }
            }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
Example #6
0
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        |
        'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)
    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (entities
         | 'To Keys' >> beam.Map(lambda entity: entity.key)
         | 'delete entities' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
Example #7
0
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)

data = p | 'Read from Datastore' >> ReadFromDatastore(
    query=Query('natality-guid', project, limit=5))
scored = data | 'Print' >> beam.Map(print)

# run the pipeline
result = p.run()
result.wait_until_finish()