def testValueProviderFilters(self):
    self.vp_filters = [
        [(StaticValueProvider(str, 'property_name'),
          StaticValueProvider(str, '='),
          StaticValueProvider(str, 'value'))],
        [(StaticValueProvider(str, 'property_name'),
          StaticValueProvider(str, '='),
          StaticValueProvider(str, 'value')),
         ('property_name', '=', 'value')],
    ]
    self.expected_filters = [
        [('property_name', '=', 'value')],
        [('property_name', '=', 'value'), ('property_name', '=', 'value')],
    ]
    for vp_filter, exp_filter in zip(self.vp_filters, self.expected_filters):
        q = Query(kind='kind', project=self._PROJECT,
                  namespace=self._NAMESPACE, filters=vp_filter)
        cq = q._to_client_query(self._test_client)
        self.assertEqual(exp_filter, cq.filters)
        logging.info('query: %s', q)  # Test __repr__()
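# A minimal sketch of where ValueProvider filters are useful (not part of the
# test above): in a templated pipeline the filter pieces are only known when the
# template is executed. The options class and flag names below are hypothetical;
# as in the test, every element of a filter tuple is a ValueProvider and is
# resolved when the query is converted for execution.
from apache_beam.io.gcp.datastore.v1new.types import Query
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.value_provider import StaticValueProvider


class DatastoreFilterOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Each flag becomes a ValueProvider resolved at template run time.
        parser.add_value_provider_argument('--filter_property', type=str)
        parser.add_value_provider_argument('--filter_value', type=str)


def build_query(options, project):
    opts = options.view_as(DatastoreFilterOptions)
    return Query(
        kind='kind',
        project=project,
        filters=[(opts.filter_property, StaticValueProvider(str, '='),
                  opts.filter_value)])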
def create_multi_datasource_reader(pipeline, project, namespace, kinds, keys_only=False):
    if not kinds:
        kinds = [None]
    sources = []
    for kind in kinds:
        # If no namespace is given (namespace=None), the [default] namespace is used.
        query = Query(project=project, namespace=namespace, kind=kind)
        if keys_only:
            # See:
            # https://beam.apache.org/releases/pydoc/2.14.0/_modules/apache_beam/io/gcp/datastore/v1new/types.html#Query
            # https://google-cloud-python.readthedocs.io/en/0.32.0/_modules/google/cloud/datastore/query.html#Query.keys_only
            query.projection = ['__key__']
        if not kind:
            # A kindless query (no kind specified) fails unless it is explicitly
            # ordered by __key__ ascending.
            query.order = ['__key__']
        description = 'ReadFromDatastore kind={}'.format(kind if kind else "*")
        s = pipeline | description >> ReadFromDatastore(query=query)
        sources.append(s)
    return sources
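# A usage sketch for the helper above (project and kind names are placeholders,
# not taken from the original): the per-kind sources it returns can be merged
# into a single PCollection with beam.Flatten.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

pipeline = beam.Pipeline(options=PipelineOptions())
sources = create_multi_datasource_reader(
    pipeline, 'my-project', None, ['KindA', 'KindB'], keys_only=True)
merged = sources | 'MergeKinds' >> beam.Flatten()
merged | 'PrintKeys' >> beam.Map(lambda entity: print(entity.key))
pipeline.run().wait_until_finish()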
def testQueryEmptyNamespace(self):
    # Test that we can pass a namespace of None.
    self._test_client.namespace = None
    q = Query(project=self._PROJECT, namespace=None)
    cq = q._to_client_query(self._test_client)
    self.assertEqual(self._test_client.project, cq.project)
    self.assertEqual(None, cq.namespace)
def testQuery(self):
    filters = [('property_name', '=', 'value')]
    projection = ['f1', 'f2']
    order = projection
    distinct_on = projection
    ancestor_key = Key(['kind', 'id'], project=self._PROJECT)
    q = Query(kind='kind', project=self._PROJECT, namespace=self._NAMESPACE,
              ancestor=ancestor_key, filters=filters, projection=projection,
              order=order, distinct_on=distinct_on)
    cq = q._to_client_query(self._test_client)
    self.assertEqual(self._PROJECT, cq.project)
    self.assertEqual(self._NAMESPACE, cq.namespace)
    self.assertEqual('kind', cq.kind)
    self.assertEqual(ancestor_key.to_client_key(), cq.ancestor)
    self.assertEqual(filters, cq.filters)
    self.assertEqual(projection, cq.projection)
    self.assertEqual(order, cq.order)
    self.assertEqual(distinct_on, cq.distinct_on)
    logging.info('query: %s', q)  # Test __repr__()
def testValueProviderNamespace(self):
    self.vp_namespace = StaticValueProvider(str, 'vp_namespace')
    self.expected_namespace = 'vp_namespace'
    q = Query(kind='kind', project=self._PROJECT, namespace=self.vp_namespace)
    cq = q._to_client_query(self._test_client)
    self.assertEqual(self.expected_namespace, cq.namespace)
    _LOGGER.info('query: %s', q)  # Test __repr__()
def model_datastoreio():
    """Using a Read and Write transform to read/write to Cloud Datastore."""

    import uuid

    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
    from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
    from apache_beam.io.gcp.datastore.v1new.types import Entity
    from apache_beam.io.gcp.datastore.v1new.types import Key
    from apache_beam.io.gcp.datastore.v1new.types import Query

    project = 'my_project'
    kind = 'my_kind'
    query = Query(kind, project)

    # [START model_datastoreio_read]
    p = beam.Pipeline(options=PipelineOptions())
    entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
    # [END model_datastoreio_read]

    # [START model_datastoreio_write]
    p = beam.Pipeline(options=PipelineOptions())
    musicians = p | 'Musicians' >> beam.Create(
        ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

    def to_entity(content):
        key = Key([kind, str(uuid.uuid4())])
        entity = Entity(key)
        entity.set_properties({'content': content})
        return entity

    entities = musicians | 'To Entity' >> beam.Map(to_entity)
    entities | 'Write To Datastore' >> WriteToDatastore(project)
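# A follow-up sketch (not part of the Beam documentation snippet above, which
# only constructs its pipelines): entities produced by ReadFromDatastore are
# apache_beam.io.gcp.datastore.v1new.types.Entity objects, whose properties are
# exposed as a plain dict. Project and kind names are placeholders.
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query
from apache_beam.options.pipeline_options import PipelineOptions


def read_contents(project='my_project', kind='my_kind'):
    p = beam.Pipeline(options=PipelineOptions())
    _ = (
        p
        | 'Read From Datastore' >> ReadFromDatastore(Query(kind, project))
        # Each element is a types.Entity; .properties is a dict of its values.
        | 'Extract content' >> beam.Map(lambda entity: entity.properties.get('content'))
        | 'Print' >> beam.Map(print))
    p.run().wait_until_finish()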
def process(self, element):
    """
    :param element: a kind name
    :return: [Query]
    """
    from apache_beam.io.gcp.datastore.v1new.types import Query
    return [Query(kind=element, project=self.project_id)]
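# A usage sketch for the process() method above. The enclosing class is not
# shown in the snippet; here it is assumed to be a beam.DoFn (hypothetically
# named QueryForKindFn) whose constructor stores project_id. It turns a
# PCollection of kind names into a PCollection of Query objects.
import apache_beam as beam


class QueryForKindFn(beam.DoFn):
    def __init__(self, project_id):
        self.project_id = project_id

    def process(self, element):
        # Same body as the snippet above: emit one Query per kind name.
        from apache_beam.io.gcp.datastore.v1new.types import Query
        return [Query(kind=element, project=self.project_id)]


# Example application:
#     queries = kind_names | 'KindToQuery' >> beam.ParDo(QueryForKindFn('my-project'))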
def make_ancestor_query(project, kind, namespace, ancestor):
    """Creates a Cloud Datastore ancestor query.

    The returned query will fetch all the entities that have the parent key name
    set to the given `ancestor`.
    """
    ancestor_key = Key([kind, ancestor], project=project, namespace=namespace)
    return Query(kind, project, namespace, ancestor_key)
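# A usage sketch for make_ancestor_query (project, kind, namespace, and ancestor
# values here are placeholders): the returned Query can be passed directly to
# ReadFromDatastore.
import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())
query = make_ancestor_query('my-project', 'Task', None, 'default-task-list')
descendants = p | 'ReadDescendants' >> ReadFromDatastore(query)
descendants | 'Print' >> beam.Map(print)
p.run().wait_until_finish()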
def main():
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

    test_metadata_entities = (
        p
        | 'ReadFromDatastore(TestMetadata)' >> ReadFromDatastore(
            Query(project=project, kind='TestMetadata')))

    # TODO: fetch SparseDiagnostics entities and join with TestMetadata here for
    # additional metadata.

    test_metadata_rows = (
        test_metadata_entities
        | 'ConvertEntityToRow(TestMetadata)' >> beam.FlatMap(
            ConvertEntity(TestMetadataEntityToRowDict, entities_read,
                          failed_entity_transforms)))

    """
    CREATE TABLE `chromeperf.chromeperf_dashboard_data.test_metadata`
    (test STRING NOT NULL,
     internal_only BOOLEAN NOT NULL,
     improvement_direction STRING,
     units STRING,
     has_rows BOOLEAN NOT NULL,
     deprecated BOOLEAN NOT NULL,
     description STRING,
     unescaped_story_name STRING,
     parent STRING,
     bot_group STRING NOT NULL,
     bot STRING NOT NULL,
     measurement STRING NOT NULL,
    )
    CLUSTER BY bot_group, bot, measurement;
    """  # pylint: disable=pointless-string-statement

    bq_testmetadata_schema = {
        'fields': [
            # 'test' corresponds to the same column in the Rows export.
            {'name': 'test', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'internal_only', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'improvement_direction', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'units', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'has_rows', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'deprecated', 'type': 'BOOLEAN', 'mode': 'REQUIRED'},
            {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'unescaped_story_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'parent', 'type': 'STRING', 'mode': 'NULLABLE'},
            # bot_group, bot, and measurement correspond to same columns in the
            # Rows export.
            {'name': 'bot_group', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'bot', 'type': 'STRING', 'mode': 'REQUIRED'},
            {'name': 'measurement', 'type': 'STRING', 'mode': 'REQUIRED'},
        ],
    }

    def TableNameFn(unused_element):
        return '{project}:{dataset}.test_metadata{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (
        test_metadata_rows
        | 'WriteToBigQuery(test_metadata)' >> beam.io.WriteToBigQuery(
            TableNameFn,
            schema=bq_testmetadata_schema,
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
            # Cluster by the same columns as the Rows export, so that efficient
            # queries work the same way with this table (and to make efficient
            # joins with that table simpler).
            additional_bq_parameters={
                'clustering': {'fields': ['bot_group', 'bot', 'measurement']}
            }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
def run(argv=None):
    """Main entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        | 'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)

    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))
        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))
    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (
        entities
        | 'To Keys' >> beam.Map(lambda entity: entity.key)
        | 'delete entities' >> DeleteFromDatastore(project))
    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))
    p.run()
import argparse

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query

parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# Define the pipeline steps.
p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from Datastore' >> ReadFromDatastore(
    query=Query('natality-guid', project, limit=5))
scored = data | 'Print' >> beam.Map(print)

# Run the pipeline.
result = p.run()
result.wait_until_finish()