def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key
  from apache_beam.io.gcp.datastore.v1new.types import Query

  project = 'my_project'
  kind = 'my_kind'
  query = Query(kind, project)

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    key = Key([kind, str(uuid.uuid4())])
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
  # [END model_datastoreio_write]
def check_DatastoreWriteFn(self, num_entities, use_fixed_batch_size=False):
  """A helper function to test _DatastoreWriteFn."""
  with patch.object(helper, 'get_client', return_value=self._mock_client):
    entities = helper.create_entities(num_entities)
    expected_entities = [entity.to_client_entity() for entity in entities]

    # Infer project from write fn project arg.
    if num_entities:
      key = Key(['k1', 1234], project=self._PROJECT)
      expected_key = key.to_client_key()
      key.project = None
      entities[0].key = key
      expected_entities[0].key = expected_key

    all_batch_entities = []
    commit_count = [0]
    self._mock_client.batch.side_effect = (
        lambda: FakeBatch(
            all_batch_items=all_batch_entities, commit_count=commit_count))

    datastore_write_fn = WriteToDatastore._DatastoreWriteFn(self._PROJECT)

    datastore_write_fn.start_bundle()
    for entity in entities:
      datastore_write_fn.process(entity)
    datastore_write_fn.finish_bundle()

    self.assertListEqual(
        [e.key for e in all_batch_entities],
        [e.key for e in expected_entities])
    batch_count = math.ceil(num_entities / util.WRITE_BATCH_MAX_SIZE)
    self.assertLessEqual(batch_count, commit_count[0])
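# A sketch of how the helper above might be exercised. The test names and
# entity counts are illustrative assumptions, not taken from the source;
# util.WRITE_BATCH_MAX_SIZE is the batch-size constant already referenced
# by the helper.
def test_DatastoreWriteFn_with_empty_input(self):
  self.check_DatastoreWriteFn(0)


def test_DatastoreWriteFn_with_one_batch(self):
  self.check_DatastoreWriteFn(100)


def test_DatastoreWriteFn_with_multiple_batches(self):
  self.check_DatastoreWriteFn(3 * util.WRITE_BATCH_MAX_SIZE + 50)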
def write_to_datastore(project, user_options, pipeline_options):
  """Creates a pipeline that writes entities to Cloud Datastore."""
  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'read' >> ReadFromText(user_options.input)
        | 'create entity' >> beam.Map(
            EntityWrapper(
                project, user_options.namespace, user_options.kind,
                user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(project))
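# write_to_datastore() relies on an EntityWrapper that is not shown above.
# A plausible sketch, assuming each input line becomes one Entity keyed under
# a shared ancestor; the class layout and key structure here are assumptions,
# not the original implementation.
import uuid

from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key


class EntityWrapper(object):
  """Creates a Cloud Datastore Entity from a line of text (sketch)."""

  def __init__(self, project, namespace, kind, ancestor):
    self._project = project
    self._namespace = namespace
    self._kind = kind
    self._ancestor = ancestor

  def make_entity(self, content):
    # All entities share one ancestor key so they land in one entity group.
    ancestor_key = Key([self._kind, self._ancestor],
                       project=self._project,
                       namespace=self._namespace)
    key = Key([self._kind, str(uuid.uuid4())], parent=ancestor_key)
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity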
def test_DatastoreWriteLargeEntities(self):
  """100 entities of 100kB each get split over two Commit RPCs."""
  with patch.object(helper, 'get_client', return_value=self._mock_client):
    entities = helper.create_entities(100)

    commit_count = [0]
    self._mock_client.batch.side_effect = (
        lambda: FakeBatch(commit_count=commit_count))

    datastore_write_fn = WriteToDatastore._DatastoreWriteFn(self._PROJECT)
    datastore_write_fn.start_bundle()
    for entity in entities:
      entity.set_properties({'large': u'A' * 100000})
      datastore_write_fn.process(entity)
    datastore_write_fn.finish_bundle()

    self.assertEqual(2, commit_count[0])
def run():
  import sys
  args = sys.argv[1:]
  options = CopyOptions(args)

  if not options.src.project:
    options.src.project = options.project
  if not options.dst.project:
    options.dst.project = options.project

  kind = options.dst.kinds[0] if options.dst.kinds else None
  changer = ChangeKey(options.dst.project, options.dst.namespace, kind)

  p = beam.Pipeline(options=options)
  sources = create_multi_datasource_reader(
      p, options.src.project, options.src.namespace, options.src.kinds)

  (sources
   | beam.Flatten()
   | 'ChangeKey' >> beam.ParDo(changer)
   | 'OptionalMapper' >> beam.ParDo(OptionalProcess(options.mapper))
   | 'WriteToDatastore' >> WriteToDatastore(options.dst.project))
  p.run().wait_until_finish()
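# A sketch of the create_multi_datasource_reader() helper used above: one
# ReadFromDatastore per source kind, returned as a list so the results can be
# Flatten-ed into a single PCollection. The signature and labels are
# assumptions, not the original helper.
from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query


def create_multi_datasource_reader(pipeline, project, namespace, kinds):
  """Returns a list of PCollections, one per Datastore kind (sketch)."""
  sources = []
  for kind in kinds or [None]:
    query = Query(kind=kind, project=project, namespace=namespace)
    label = 'ReadFromDatastore[{}]'.format(kind)
    sources.append(pipeline | label >> ReadFromDatastore(query))
  return sources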
class CreateEntityDoFn(beam.DoFn):
  """Converts a prediction dict into a Cloud Datastore Entity."""

  def process(self, element):
    key = Key(['natality-guid', element['guid']])
    entity = Entity(key)
    entity.set_properties({
        'weight': element['weight'],
        'time': element['time']
    })
    yield entity


# set up pipeline options
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the topic
topic = "projects/{project}/topics/{topic}"
topic = topic.format(project=project, topic="natality")

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
(p
 | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)
 | 'Apply Model' >> beam.ParDo(ApplyDoFn())
 | 'Create Entities' >> beam.ParDo(CreateEntityDoFn())
 | 'Save to Datastore' >> WriteToDatastore(project))

# run the pipeline
result = p.run()
result.wait_until_finish()
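# The pipeline above references an ApplyDoFn that is not shown. A minimal
# placeholder sketch follows so the pipeline shape is clear: the real DoFn
# presumably loads a trained model and scores each Pub/Sub message. Everything
# here except the output field names (which CreateEntityDoFn consumes) is an
# assumption.
import json
import time
import uuid

import apache_beam as beam


class ApplyDoFn(beam.DoFn):
  """Placeholder: turns a Pub/Sub message into a prediction-like dict."""

  def process(self, element):
    record = json.loads(element)  # assume JSON-encoded messages
    yield {
        'guid': str(uuid.uuid4()),                    # key for the entity
        'weight': float(record.get('weight', 0.0)),   # stand-in "prediction"
        'time': str(time.time()),                     # processing timestamp
    }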
def run(argv=None):
  """Main entry point."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--kind', dest='kind', default='writereadtest', help='Datastore Kind')
  parser.add_argument(
      '--num_entities',
      dest='num_entities',
      type=int,
      required=True,
      help='Number of entities to write')
  parser.add_argument(
      '--limit',
      dest='limit',
      type=int,
      help='Limit of number of entities to write')

  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
  job_name = gcloud_options.job_name
  kind = known_args.kind
  num_entities = known_args.num_entities
  project = gcloud_options.project

  # Pipeline 1: Create and write the specified number of Entities to the
  # Cloud Datastore.
  ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
  _LOGGER.info('Writing %s entities to %s', num_entities, project)
  p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
  _ = (
      p
      | 'Input' >> beam.Create(list(range(num_entities)))
      | 'To String' >> beam.Map(str)
      | 'To Entity' >> beam.Map(EntityWrapper(kind, ancestor_key).make_entity)
      | 'Write to Datastore' >> WriteToDatastore(project))
  p.run()

  query = Query(kind=kind, project=project, ancestor=ancestor_key)

  # Optional Pipeline 2: If a read limit was provided, read it and confirm
  # that the expected entities were read.
  if known_args.limit is not None:
    _LOGGER.info(
        'Querying a limited set of %s entities and verifying count.',
        known_args.limit)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-limit')
    query.limit = known_args.limit
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    assert_that(
        entities | beam.combiners.Count.Globally(),
        equal_to([known_args.limit]))
    p.run()
    query.limit = None

  # Pipeline 3: Query the written Entities and verify result.
  _LOGGER.info('Querying entities, asserting they match.')
  p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
  entities = p | 'read from datastore' >> ReadFromDatastore(query)
  assert_that(
      entities | beam.combiners.Count.Globally(), equal_to([num_entities]))
  p.run()

  # Pipeline 4: Delete Entities.
  _LOGGER.info('Deleting entities.')
  p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
  entities = p | 'read from datastore' >> ReadFromDatastore(query)
  _ = (
      entities
      | 'To Keys' >> beam.Map(lambda entity: entity.key)
      | 'delete entities' >> DeleteFromDatastore(project))
  p.run()

  # Pipeline 5: Query the written Entities, verify no results.
  _LOGGER.info(
      'Querying for the entities to make sure there are none present.')
  p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-deleted')
  entities = p | 'read from datastore' >> ReadFromDatastore(query)
  assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))
  p.run()
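# run() above depends on new_pipeline_with_job_name(), which is not shown.
# A minimal sketch of what it likely does: give each of the five pipelines a
# distinct job name by appending a suffix. Returning a plain beam.Pipeline is
# an assumption; an integration test might construct a test pipeline instead.
import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions


def new_pipeline_with_job_name(pipeline_options, job_name, suffix):
  """Creates a pipeline whose job name is job_name + suffix (sketch)."""
  gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
  if job_name:
    gcloud_options.job_name = job_name + suffix
  return beam.Pipeline(options=pipeline_options)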
# Continues from earlier in the sample, where `p`, `kind`, `project` and
# `lines` (CSV rows already split into field lists) are defined.
def to_entity(fields):
  id = int(fields[0])
  key = Key([kind, id])
  entity = Entity(key)
  president = fields[1]
  names = president.split(' ')
  entity.set_properties({
      'id': id,
      'firstName': names[0],
      'lastName': names[1],
      'startYear': int(fields[2]),
      'endYear': int(fields[3]),
      'party': fields[4],
      'homeState': fields[5],
      'dateOfBirth': datetime.strptime(fields[6], '%Y-%m-%d')
  })
  return entity


entities = lines | 'To Entity' >> beam.Map(to_entity)
entities | 'Write To Datastore' >> WriteToDatastore(project)
# lines | 'Write to Cloud Storage' >> beam.io.WriteToText('gs://[GCLOUD_BUCKET]/out')

p.run().wait_until_finish()