Example #1
File: beamutil.py  Project: tomoemon/dsflow
def create_multi_datasource_reader(pipeline,
                                   project,
                                   namespace,
                                   kinds,
                                   keys_only=False):
    if not kinds:
        kinds = [None]

    sources = []
    for kind in kinds:
        # If namespace is not specified (i.e. None), the [default] namespace is used
        query = Query(project=project, namespace=namespace, kind=kind)
        if keys_only:
            # see
            # https://beam.apache.org/releases/pydoc/2.14.0/_modules/apache_beam/io/gcp/datastore/v1new/types.html#Query
            # https://google-cloud-python.readthedocs.io/en/0.32.0/_modules/google/cloud/datastore/query.html#Query.keys_only
            query.projection = ['__key__']
        if not kind:
            # If kind is not specified, the query must be explicitly ordered by __key__ asc or it fails
            query.order = ['__key__']

        description = 'ReadFromDatastore kind={}'.format(kind if kind else "*")

        s = pipeline | description >> ReadFromDatastore(query=query)
        sources.append(s)
    return sources
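The function returns one PCollection per kind rather than a single merged PCollection, so the caller usually flattens the list afterwards. A minimal usage sketch, assuming a placeholder project ID and kind names that are not part of the original code:

import apache_beam as beam

with beam.Pipeline() as p:
    # Placeholder project and kinds, for illustration only.
    sources = create_multi_datasource_reader(
        p, 'my-project', None, ['Customer', 'Order'], keys_only=True)
    # A list of PCollections can be flattened into a single one.
    merged = sources | 'Flatten Sources' >> beam.Flatten()
    merged | 'Count Keys' >> beam.combiners.Count.Globally() | beam.Map(print)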
Example #2
  def check_estimated_size_bytes(self, entity_bytes, timestamp, namespace=None):
    """A helper method to test get_estimated_size_bytes"""
    self._mock_client.namespace = namespace
    self._mock_client.query.return_value = self._mock_query
    self._mock_query.project = self._PROJECT
    self._mock_query.namespace = namespace
    self._mock_query.fetch.side_effect = [
        [{'timestamp': timestamp}],
        [{'entity_bytes': entity_bytes}],
    ]
    self._mock_query.kind = self._KIND

    split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits=0)
    self.assertEqual(entity_bytes,
                     split_query_fn.get_estimated_size_bytes(self._mock_client,
                                                             self._mock_query))

    if namespace is None:
      ns_keyword = '_'
    else:
      ns_keyword = '_Ns_'
    self._mock_client.query.assert_has_calls([
        call(kind='__Stat%sTotal__' % ns_keyword, order=['-timestamp']),
        call().fetch(limit=1),
        call(kind='__Stat%sKind__' % ns_keyword),
        call().add_filter('kind_name', '=', self._KIND),
        call().add_filter('timestamp', '=', timestamp),
        call().fetch(limit=1),
    ])
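This test mocks out Beam's statistics-based size estimate: the newest __Stat_Total__ entity (or __Stat_Ns_Total__ when a namespace is set) supplies the timestamp, and the matching __Stat_Kind__ / __Stat_Ns_Kind__ entity at that timestamp supplies entity_bytes for the kind. A sketch of that same lookup done directly with the google-cloud-datastore client, assuming its query API (the add_filter form may differ between library versions):

from google.cloud import datastore

def estimate_kind_size_bytes(project, kind, namespace=None):
    # Same lookup the mocks above simulate; illustrative only.
    client = datastore.Client(project=project, namespace=namespace)
    ns = '_' if namespace is None else '_Ns_'
    total = list(client.query(kind='__Stat%sTotal__' % ns,
                              order=['-timestamp']).fetch(limit=1))
    if not total:
        return None
    timestamp = total[0]['timestamp']
    kind_query = client.query(kind='__Stat%sKind__' % ns)
    kind_query.add_filter('kind_name', '=', kind)
    kind_query.add_filter('timestamp', '=', timestamp)
    stats = list(kind_query.fetch(limit=1))
    return stats[0]['entity_bytes'] if stats else None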
Example #3
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key
  from apache_beam.io.gcp.datastore.v1new.types import Query

  project = 'my_project'
  kind = 'my_kind'
  query = Query(kind, project)

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    key = Key([kind, str(uuid.uuid4())])
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
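Note that this snippet only constructs the two pipelines (the code sits between documentation markers); neither one is executed. Running either pipeline would follow the same pattern used in the other examples on this page, roughly:

# Not part of the original snippet: execute the pipeline and block
# until the Datastore read/write has finished.
result = p.run()
result.wait_until_finish()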
Example #4
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://wordcounttest2/data/datatest.txt',
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromDatastore(known_args.input)
        types = type(lines)
        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(unicode))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s, %s' % (word, count, types)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
Example #5
  def test_SplitQueryFn_with_query_limit(self):
    """A test that verifies no split is performed when the query has a limit."""
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      num_splits = 4
      expected_num_splits = 1
      self._mock_query.limit = 3
      split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
      split_queries = split_query_fn.process(self._mock_query)

      self.assertEqual(expected_num_splits, len(split_queries))
Example #6
  def test_SplitQueryFn_with_num_splits(self):
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      num_splits = 23
      expected_num_splits = 23

      def fake_get_splits(unused_client, query, num_splits):
        return [query] * num_splits

      with patch.object(query_splitter, 'get_splits',
                        side_effect=fake_get_splits):
        split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
        split_queries = split_query_fn.process(self._mock_query)

        self.assertEqual(expected_num_splits, len(split_queries))
Example #7
def run(argv=None):
    from apache_beam.io.gcp.bigquery_file_loads import BigQueryBatchFileLoads
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
    from datetime import datetime

    options = Ds2bqOptions(flags=argv)
    options.view_as(beam.options.pipeline_options.GoogleCloudOptions
                    ).region = "asia-northeast1"
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).num_workers = 2
    options.view_as(
        beam.options.pipeline_options.WorkerOptions).disk_size_gb = 50

    # Setup
    options.view_as(beam.options.pipeline_options.StandardOptions
                    ).runner = 'DataflowRunner'
    options.view_as(
        beam.options.pipeline_options.SetupOptions).setup_file = './setup.py'

    logging.info(options)

    project_id = options.view_as(
        beam.options.pipeline_options.GoogleCloudOptions).project
    gcs_dir = "gs://{}-dataflow/temp/{}".format(
        project_id,
        datetime.now().strftime("%Y%m%d%H%M%S"))

    with beam.Pipeline(options=options) as p:
        from transform.datastore import convert, CreateQuery, GetKinds
        from transform.bigquery import GetBqTableMap, get_partition_conf
        table_names_dict = beam.pvalue.AsDict(
            p | "Get BigQuery Table Map" >> GetBqTableMap(
                project_id, options.dataset))

        entities = (p
                    | 'Get Kinds' >> GetKinds(project_id)
                    | 'Create Query' >> beam.ParDo(CreateQuery(project_id))
                    | 'Get Entity' >> beam.ParDo(ReadFromDatastore._QueryFn()))

        _ = (entities
             | 'Convert Entity' >> beam.Map(convert)
             | 'BigQuery Load' >> BigQueryBatchFileLoads(
                 destination=lambda row, table_dict: table_dict[row["__key__"][
                     "kind"]],
                 custom_gcs_temp_location=gcs_dir,
                 write_disposition='WRITE_TRUNCATE',
                 table_side_inputs=(table_names_dict, ),
                 additional_bq_parameters=get_partition_conf,
                 schema='SCHEMA_AUTODETECT'))
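GetKinds, CreateQuery and convert come from the project's own transform modules, which are not shown here. As an illustration of the contract CreateQuery has to satisfy before ReadFromDatastore._QueryFn can consume its output, here is a hypothetical sketch (names and behaviour are assumptions, not the project's real code):

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.types import Query

class CreateQuery(beam.DoFn):
    """Hypothetical sketch: map each kind name to a Datastore Query."""
    def __init__(self, project_id):
        self._project_id = project_id

    def process(self, kind):
        yield Query(kind=kind, project=self._project_id)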
Example #8
  def test_SplitQueryFn_with_exception(self):
    """A test that verifies that no split is performed when failures occur."""
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      # Force _SplitQueryFn to compute the number of query splits
      num_splits = 0
      expected_num_splits = 1
      entity_bytes = (expected_num_splits *
                      ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
      with patch.object(
          ReadFromDatastore._SplitQueryFn, 'get_estimated_size_bytes',
          return_value=entity_bytes):

        with patch.object(query_splitter, 'get_splits',
                          side_effect=query_splitter.QuerySplitterError(
                              "Testing query split error")):
          split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
          split_queries = split_query_fn.process(self._mock_query)

          self.assertEqual(expected_num_splits, len(split_queries))
          self.assertEqual(self._mock_query, split_queries[0])
Example #9
  def test_SplitQueryFn_without_num_splits(self):
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      # Force _SplitQueryFn to compute the number of query splits
      num_splits = 0
      expected_num_splits = 23
      entity_bytes = (expected_num_splits *
                      ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
      with patch.object(
          ReadFromDatastore._SplitQueryFn, 'get_estimated_size_bytes',
          return_value=entity_bytes):

        def fake_get_splits(unused_client, query, num_splits):
          return [query] * num_splits

        with patch.object(query_splitter, 'get_splits',
                          side_effect=fake_get_splits):
          split_query_fn = ReadFromDatastore._SplitQueryFn(num_splits)
          split_queries = split_query_fn.process(self._mock_query)

          self.assertEqual(expected_num_splits, len(split_queries))
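Read together with example #8, this pins down how _SplitQueryFn chooses a split count when num_splits=0: it estimates the query size from the Datastore statistics entities and divides by _DEFAULT_BUNDLE_SIZE_BYTES, falling back to a single split when splitting fails. Roughly (the real Beam implementation also clamps the value and handles errors):

# Rough sketch of the relationship the two tests rely on, not Beam's exact code.
def estimated_num_splits(entity_bytes, bundle_size_bytes):
    return max(1, int(round(float(entity_bytes) / bundle_size_bytes)))

# 23 * _DEFAULT_BUNDLE_SIZE_BYTES of estimated data -> 23 splits,
# matching expected_num_splits above.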
Example #10
  def test_QueryFn_metric_on_failure(self):
    MetricsEnvironment.process_wide_container().reset()
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      self._mock_query.project = self._PROJECT
      self._mock_query.namespace = self._NAMESPACE
      _query_fn = ReadFromDatastore._QueryFn()
      client_query = self._mock_query._to_client_query()
      # Test with exception
      client_query.fetch.side_effect = [
          exceptions.DeadlineExceeded("Deadline exceed")
      ]
      list(_query_fn.process(self._mock_query))
      self.verify_read_call_metric(self._PROJECT, self._NAMESPACE,
                                   "deadline_exceeded", 1)
      # Test success
      client_query.fetch.side_effect = [[]]
      list(_query_fn.process(self._mock_query))
      self.verify_read_call_metric(self._PROJECT, self._NAMESPACE, "ok", 1)
Example #11
def read_from_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(project, user_options.kind,
                                user_options.namespace, user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(query)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return word, sum(ones)

    counts = (lines
              | 'split' >> beam.ParDo(WordExtractingDoFn())
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %s' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> beam.io.WriteToText(
        file_path_prefix=user_options.output,
        num_shards=user_options.num_shards)

    result = p.run()
    # Wait until completion; the main thread needs the post-completion job results.
    result.wait_until_finish()
    return result
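make_ancestor_query is referenced but not defined in this excerpt. Based on the Query and Key constructors used in example #13, a plausible sketch looks like this (an assumption about the helper, not its actual source):

from apache_beam.io.gcp.datastore.v1new.types import Key, Query

def make_ancestor_query(project, kind, namespace, ancestor):
    # Hypothetical reconstruction: query all entities under the given ancestor.
    ancestor_key = Key([kind, ancestor], project=project, namespace=namespace)
    return Query(kind=kind, project=project, namespace=namespace,
                 ancestor=ancestor_key)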
Example #12
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')

    test_metadata_entities = (
        p
        | 'ReadFromDatastore(TestMetadata)' >> ReadFromDatastore(
            Query(project=project, kind='TestMetadata')))

    # TODO: fetch SparseDiagnostics entities and join with TestMetadata here for
    # additional metadata.

    test_metadata_rows = (
        test_metadata_entities
        | 'ConvertEntityToRow(TestMetadata)' >> beam.FlatMap(
            ConvertEntity(TestMetadataEntityToRowDict, entities_read,
                          failed_entity_transforms)))

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.test_metadata`
  (test STRING NOT NULL,
   internal_only BOOLEAN NOT NULL,
   improvement_direction STRING,
   units STRING,
   has_rows BOOLEAN NOT NULL,
   deprecated BOOLEAN NOT NULL,
   description STRING,
   unescaped_story_name STRING,
   parent STRING,
   bot_group STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING NOT NULL,
   )
  CLUSTER BY bot_group, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_testmetadata_schema = {
        'fields': [
            # 'test' corresponds to the same column in the Rows export.
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'internal_only',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'improvement_direction',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'units',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'has_rows',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'deprecated',
                'type': 'BOOLEAN',
                'mode': 'REQUIRED'
            },
            {
                'name': 'description',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'unescaped_story_name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'parent',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            # bot_group, bot, and measurement correspond to same columns in the
            # Rows export.
            {
                'name': 'bot_group',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
        ],
    }

    def TableNameFn(unused_element):
        return '{project}:{dataset}.test_metadata{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (
        test_metadata_rows
        | 'WriteToBigQuery(test_metadata)' >> beam.io.WriteToBigQuery(
            TableNameFn,
            schema=bq_testmetadata_schema,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            # Cluster by the same columns as the Rows export, so that efficient
            # queries work the same way with this table (and to make efficient
            # joins with that table simpler).
            additional_bq_parameters={
                'clustering': {
                    'fields': ['bot_group', 'bot', 'measurement']
                }
            }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
Example #13
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        |
        'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)
    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (entities
         | 'To Keys' >> beam.Map(lambda entity: entity.key)
         | 'delete entities' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
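EntityWrapper and new_pipeline_with_job_name are helpers defined elsewhere in this test module. For orientation, here is a hedged sketch of what EntityWrapper.make_entity plausibly does, modelled on the Entity/Key usage in example #3 (the real helper may differ, e.g. in how it derives the key name):

import uuid
from apache_beam.io.gcp.datastore.v1new.types import Entity, Key

class EntityWrapper(object):
    """Hypothetical sketch: build an Entity of the given kind under an ancestor."""
    def __init__(self, kind, ancestor_key):
        self._kind = kind
        self._ancestor_key = ancestor_key

    def make_entity(self, content):
        key = Key([self._kind, str(uuid.uuid4())], parent=self._ancestor_key)
        entity = Entity(key)
        entity.set_properties({'content': content})
        return entity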
Example #14
import argparse
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)

data = p | 'Read from Datastore' >> ReadFromDatastore(
    query=Query('natality-guid', project, limit=5))
scored = data | 'Print' >> beam.Map(print)

# run the pipeline
result = p.run()
result.wait_until_finish()