Example #1
def model_datastoreio():
  """Using a Read and Write transform to read/write to Cloud Datastore."""

  import uuid
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
  from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
  from apache_beam.io.gcp.datastore.v1new.types import Entity
  from apache_beam.io.gcp.datastore.v1new.types import Key
  from apache_beam.io.gcp.datastore.v1new.types import Query

  project = 'my_project'
  kind = 'my_kind'
  query = Query(kind, project)

  # [START model_datastoreio_read]
  p = beam.Pipeline(options=PipelineOptions())
  entities = p | 'Read From Datastore' >> ReadFromDatastore(query)
  # [END model_datastoreio_read]

  # [START model_datastoreio_write]
  p = beam.Pipeline(options=PipelineOptions())
  musicians = p | 'Musicians' >> beam.Create(
      ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

  def to_entity(content):
    key = Key([kind, str(uuid.uuid4())])
    entity = Entity(key)
    entity.set_properties({'content': content})
    return entity

  entities = musicians | 'To Entity' >> beam.Map(to_entity)
  entities | 'Write To Datastore' >> WriteToDatastore(project)
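
The snippet above only constructs the read and write pipelines. A minimal sketch of executing one of them directly, using the standard Beam pipeline API, would be:

# Run the pipeline and block until it finishes (sketch).
result = p.run()
result.wait_until_finish()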
Example #2
    def check_DatastoreWriteFn(self, num_entities, use_fixed_batch_size=False):
        """A helper function to test _DatastoreWriteFn."""
        with patch.object(helper, 'get_client',
                          return_value=self._mock_client):
            entities = helper.create_entities(num_entities)
            expected_entities = [
                entity.to_client_entity() for entity in entities
            ]

            # Infer project from write fn project arg.
            if num_entities:
                key = Key(['k1', 1234], project=self._PROJECT)
                expected_key = key.to_client_key()
                key.project = None
                entities[0].key = key
                expected_entities[0].key = expected_key

            all_batch_entities = []
            commit_count = [0]
            self._mock_client.batch.side_effect = (lambda: FakeBatch(
                all_batch_items=all_batch_entities, commit_count=commit_count))

            datastore_write_fn = WriteToDatastore._DatastoreWriteFn(
                self._PROJECT)

            datastore_write_fn.start_bundle()
            for entity in entities:
                datastore_write_fn.process(entity)
            datastore_write_fn.finish_bundle()

            self.assertListEqual([e.key for e in all_batch_entities],
                                 [e.key for e in expected_entities])
            batch_count = math.ceil(num_entities / util.WRITE_BATCH_MAX_SIZE)
            self.assertLessEqual(batch_count, commit_count[0])
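
The FakeBatch test double used above is defined elsewhere in the test module. A hypothetical stand-in, assuming the write DoFn only needs the begin/put/commit surface of a Datastore batch, might look like this:

class FakeBatch(object):
    """Hypothetical stand-in for the object returned by client.batch()."""

    def __init__(self, all_batch_items=None, commit_count=None):
        # Both arguments are shared with the test so it can inspect results.
        self._all_batch_items = all_batch_items
        self._commit_count = commit_count

    def begin(self):
        # The real batch opens a commit context; nothing to fake here.
        pass

    def put(self, client_entity):
        # Record every client entity the write DoFn hands to the batch.
        if self._all_batch_items is not None:
            self._all_batch_items.append(client_entity)

    def commit(self):
        # Count commits so the tests can assert on batching behaviour.
        if self._commit_count is not None:
            self._commit_count[0] += 1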
Example #3
def write_to_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that writes entities to Cloud Datastore."""
    with beam.Pipeline(options=pipeline_options) as p:
        _ = (p
             | 'read' >> ReadFromText(user_options.input)
             | 'create entity' >> beam.Map(
                 EntityWrapper(project, user_options.namespace,
                               user_options.kind,
                               user_options.ancestor).make_entity)
             | 'write to datastore' >> WriteToDatastore(project))
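
EntityWrapper is defined in the surrounding example module and is not shown here. A hypothetical sketch, assuming each input line becomes one entity stored under a shared ancestor key, could look like this:

import uuid

from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key


class EntityWrapper(object):
    """Hypothetical helper that turns a text line into a Datastore entity."""

    def __init__(self, project, namespace, kind, ancestor):
        self._project = project
        self._namespace = namespace
        self._kind = kind
        self._ancestor = ancestor

    def make_entity(self, content):
        # Share one ancestor so all entities land in the same entity group.
        ancestor_key = Key([self._kind, self._ancestor],
                           project=self._project,
                           namespace=self._namespace)
        key = Key([self._kind, str(uuid.uuid4())], parent=ancestor_key)
        entity = Entity(key)
        entity.set_properties({'content': content})
        return entity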
Example #4
  def test_DatastoreWriteLargeEntities(self):
    """100*100kB entities gets split over two Commit RPCs."""
    with patch.object(helper, 'get_client', return_value=self._mock_client):
      entities = helper.create_entities(100)
      commit_count = [0]
      self._mock_client.batch.side_effect = (
          lambda: FakeBatch(commit_count=commit_count))

      datastore_write_fn = WriteToDatastore._DatastoreWriteFn(self._PROJECT)
      datastore_write_fn.start_bundle()
      for entity in entities:
        entity.set_properties({'large': u'A' * 100000})
        datastore_write_fn.process(entity)
      datastore_write_fn.finish_bundle()

      self.assertEqual(2, commit_count[0])
Example #5
def run():
    import sys

    args = sys.argv[1:]
    options = CopyOptions(args)

    if not options.src.project:
        options.src.project = options.project
    if not options.dst.project:
        options.dst.project = options.project

    kind = options.dst.kinds[0] if options.dst.kinds else None
    changer = ChangeKey(options.dst.project, options.dst.namespace, kind)

    p = beam.Pipeline(options=options)
    sources = create_multi_datasource_reader(p, options.src.project,
                                             options.src.namespace,
                                             options.src.kinds)

    (sources
     | beam.Flatten()
     | 'ChangeKey' >> beam.ParDo(changer)
     | 'OptionalMapper' >> beam.ParDo(OptionalProcess(options.mapper))
     | 'WriteToDatastore' >> WriteToDatastore(options.dst.project))
    p.run().wait_until_finish()
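
CopyOptions, ChangeKey, OptionalProcess and create_multi_datasource_reader are helpers from the surrounding project. Purely as an illustration, create_multi_datasource_reader could be a thin wrapper that applies one ReadFromDatastore per source kind (hypothetical sketch):

from apache_beam.io.gcp.datastore.v1new.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1new.types import Query


def create_multi_datasource_reader(pipeline, project, namespace, kinds):
    """Hypothetical: returns one PCollection of entities per source kind."""
    kinds = kinds or [None]  # no kinds given -> one kind-less query
    sources = []
    for kind in kinds:
        query = Query(kind=kind, project=project, namespace=namespace)
        label = 'ReadFromDatastore[{}]'.format(kind)
        sources.append(pipeline | label >> ReadFromDatastore(query))
    return sources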
Example #6
class CreateEntityDoFn(beam.DoFn):
    """Creates a Datastore entity for each model prediction."""

    def process(self, element):
        key = Key(['natality-guid', element['guid']])
        entity = Entity(key)
        entity.set_properties({
            'weight': element['weight'],
            'time': element['time']
        })
        yield entity


# set up pipeline options
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args()
pipeline_options = PipelineOptions(pipeline_args)
project = pipeline_options.view_as(GoogleCloudOptions).project

# define the topics
topic = "projects/{project}/topics/{topic}"
topic = topic.format(project=project, topic="natality")

# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
(p
 | 'Read PubSub' >> beam.io.ReadFromPubSub(topic=topic)
 | 'Apply Model' >> beam.ParDo(ApplyDoFn())
 | 'Create Entities' >> beam.ParDo(CreateEntityDoFn())
 | 'Save to Datastore' >> WriteToDatastore(project))

# run the pipeline
result = p.run()
result.wait_until_finish()
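
ReadFromPubSub is an unbounded source, so this pipeline must run in streaming mode. If that is not already enabled through the command-line flags in pipeline_args (e.g. --streaming), it can be set explicitly before constructing the pipeline:

from apache_beam.options.pipeline_options import StandardOptions

# Pub/Sub reads require a streaming pipeline.
pipeline_options.view_as(StandardOptions).streaming = True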
Example #7
def run(argv=None):
    """Main entry point."""

    parser = argparse.ArgumentParser()

    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit on the number of entities to read back '
                             'for verification')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    ancestor_key = Key([kind, str(uuid.uuid4())], project=project)
    _LOGGER.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')
    _ = (
        p
        | 'Input' >> beam.Create(list(range(num_entities)))
        | 'To String' >> beam.Map(str)
        | 'To Entity' >> beam.Map(
            EntityWrapper(kind, ancestor_key).make_entity)
        | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    query = Query(kind=kind, project=project, ancestor=ancestor_key)
    # Optional Pipeline 2: If a read limit was provided, query with that limit
    # and confirm that the expected number of entities were read.
    if known_args.limit is not None:
        _LOGGER.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name,
                                       '-verify-limit')
        query.limit = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(query)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))

        p.run()
        query.limit = None

    # Pipeline 3: Query the written Entities and verify result.
    _LOGGER.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))

    p.run()

    # Pipeline 4: Delete Entities.
    _LOGGER.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)
    _ = (entities
         | 'To Keys' >> beam.Map(lambda entity: entity.key)
         | 'delete entities' >> DeleteFromDatastore(project))

    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    _LOGGER.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name,
                                   '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(query)

    assert_that(entities | beam.combiners.Count.Globally(), equal_to([0]))

    p.run()
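
new_pipeline_with_job_name is a helper defined alongside this pipeline. A hypothetical sketch, assuming it only suffixes the job name so each stage is distinguishable in the console, would be:

def new_pipeline_with_job_name(pipeline_options, job_name, suffix):
    """Hypothetical: returns a fresh pipeline whose job name carries a suffix."""
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    if job_name:
        gcloud_options.job_name = job_name + suffix
    return beam.Pipeline(options=pipeline_options)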
Example #8

def to_entity(line):
    # Assumes a comma-separated input line describing one president.
    fields = line.split(',')
    id = int(fields[0])
    key = Key([kind, id])
    entity = Entity(key)
    president = fields[1]
    names = president.split(' ')
    entity.set_properties({
        'id': id,
        'firstName': names[0],
        'lastName': names[1],
        'startYear': int(fields[2]),
        'endYear': int(fields[3]),
        'party': fields[4],
        'homeState': fields[5],
        'dateOfBirth': datetime.strptime(fields[6], '%Y-%m-%d')
    })
    return entity


entities = lines | 'To Entity' >> beam.Map(to_entity)
entities | 'Write To Datastore' >> WriteToDatastore(project)
# lines | 'Write to Cloud Storage' >> beam.io.WriteToText('gs://[GCLOUD_BUCKET]/out')
p.run().wait_until_finish()
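
The fragment above assumes a pipeline p and a lines PCollection created earlier in the same script, along with the usual imports. A hypothetical setup, assuming the presidents are read from a CSV file in Cloud Storage, might be:

from datetime import datetime

import apache_beam as beam
from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
from apache_beam.io.gcp.datastore.v1new.types import Entity
from apache_beam.io.gcp.datastore.v1new.types import Key
from apache_beam.options.pipeline_options import PipelineOptions

project = 'my-project'  # assumed GCP project id
kind = 'President'      # assumed Datastore kind
p = beam.Pipeline(options=PipelineOptions())
lines = p | 'Read CSV' >> beam.io.ReadFromText(
    'gs://[GCLOUD_BUCKET]/presidents.csv')  # hypothetical input path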