def model_datastoreio():
    """Using a Read and Write transform to read/write to Cloud Datastore."""

    import uuid
    from google.cloud.proto.datastore.v1 import entity_pb2
    from google.cloud.proto.datastore.v1 import query_pb2
    import googledatastore
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore

    project = 'my_project'
    kind = 'my_kind'
    query = query_pb2.Query()
    query.kind.add().name = kind

    # [START model_datastoreio_read]
    p = beam.Pipeline(options=PipelineOptions())
    entities = p | 'Read From Datastore' >> ReadFromDatastore(project, query)
    # [END model_datastoreio_read]

    # [START model_datastoreio_write]
    p = beam.Pipeline(options=PipelineOptions())
    musicians = p | 'Musicians' >> beam.Create(
        ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

    def to_entity(content):
        entity = entity_pb2.Entity()
        googledatastore.helper.add_key_path(entity.key, kind, str(uuid.uuid4()))
        googledatastore.helper.add_properties(
            entity, {'content': unicode(content)})
        return entity

    entities = musicians | 'To Entity' >> beam.Map(to_entity)
    entities | 'Write To Datastore' >> WriteToDatastore(project)
    # [END model_datastoreio_write]
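# The doc snippet above builds its pipelines but, as an excerpt, never
# executes them. A minimal sketch of the missing step, using the standard
# Beam run API (an illustration, not part of the original snippet):
result = p.run()
result.wait_until_finish()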
def test_get_splits_query_with_unsupported_filter(self):
    query = query_pb2.Query()
    query.kind.add()
    test_filter = query.filter.composite_filter.filters.add()
    test_filter.property_filter.op = PropertyFilter.GREATER_THAN
    self.assertRaises(ValueError, query_splitter.get_splits, None, query, 2)
def test_get_splits_query_with_order(self):
    query = query_pb2.Query()
    query.kind.add()
    query.order.add()
    self.assertRaises(ValueError, query_splitter.get_splits, None, query, 3)
def test__build_protobuf_all_values(self):
    from google.cloud.proto.datastore.v1 import query_pb2
    from google.cloud.datastore.query import Query

    client = _Client(None, None)
    query = Query(client)
    limit = 15
    offset = 9
    start_bytes = b'i\xb7\x1d'
    start_cursor = 'abcd'
    end_bytes = b'\xc3\x1c\xb3'
    end_cursor = 'wxyz'
    iterator = self._make_one(
        query, client, limit=limit, offset=offset,
        start_cursor=start_cursor, end_cursor=end_cursor)
    self.assertEqual(iterator.max_results, limit)
    iterator.num_results = 4
    iterator._skipped_results = 1

    pb = iterator._build_protobuf()
    expected_pb = query_pb2.Query(
        start_cursor=start_bytes,
        end_cursor=end_bytes,
        offset=offset - iterator._skipped_results,
    )
    expected_pb.limit.value = limit - iterator.num_results
    self.assertEqual(pb, expected_pb)
def _next_page_helper(self, txn_id=None):
    from google.cloud.iterator import Page
    from google.cloud.proto.datastore.v1 import datastore_pb2
    from google.cloud.proto.datastore.v1 import entity_pb2
    from google.cloud.proto.datastore.v1 import query_pb2
    from google.cloud.datastore.query import Query

    more_enum = query_pb2.QueryResultBatch.NOT_FINISHED
    result = _make_query_response([], b'', more_enum, 0)
    project = 'prujekt'
    ds_api = _make_datastore_api(result)
    if txn_id is None:
        client = _Client(project, datastore_api=ds_api)
    else:
        transaction = mock.Mock(id=txn_id, spec=['id'])
        client = _Client(
            project, datastore_api=ds_api, transaction=transaction)

    query = Query(client)
    iterator = self._make_one(query, client)

    page = iterator._next_page()

    self.assertIsInstance(page, Page)
    self.assertIs(page._parent, iterator)
    partition_id = entity_pb2.PartitionId(project_id=project)
    if txn_id is None:
        read_options = datastore_pb2.ReadOptions()
    else:
        read_options = datastore_pb2.ReadOptions(transaction=txn_id)
    empty_query = query_pb2.Query()
    ds_api.run_query.assert_called_once_with(
        project, partition_id, read_options, query=empty_query)
def get_namespaces(self):
    # Skip auth-ing to db in test operations
    if not self.argv:
        return ['4952435991248896_1']

    query_pb = query_pb2.Query()
    helper.set_kind(query_pb, "__namespace__")
    client = apache_helper.get_datastore(PROJECT)
    namespace_entities = apache_helper.fetch_entities(
        PROJECT, '', query_pb, client)

    namespaces = []
    for n in namespace_entities:
        # Get namespace name or id
        key_path = n.key.path[-1]
        if key_path.HasField('id'):
            name_or_id = key_path.id
        else:
            name_or_id = key_path.name

        # Avoid duplicates and test namespaces
        if len(str(name_or_id)) > 1 and name_or_id not in namespaces:
            namespaces.append(name_or_id)

    return namespaces
def _create_split(last_key, next_key, query):
    """Creates a new query restricted to the range [last_key, next_key).

    Args:
      last_key: the previous key. If None, the range is unbounded below.
      next_key: the next key. If None, the range is unbounded above.
      query: the desired query.

    Returns:
      A split query that fetches entities in the range [last_key, next_key).
    """
    if not (last_key or next_key):
        return query

    split_query = query_pb2.Query()
    split_query.CopyFrom(query)
    composite_filter = split_query.filter.composite_filter
    composite_filter.op = CompositeFilter.AND

    if query.HasField('filter'):
        composite_filter.filters.add().CopyFrom(query.filter)

    if last_key:
        lower_bound = composite_filter.filters.add()
        lower_bound.property_filter.property.name = KEY_PROPERTY_NAME
        lower_bound.property_filter.op = PropertyFilter.GREATER_THAN_OR_EQUAL
        lower_bound.property_filter.value.key_value.CopyFrom(last_key)

    if next_key:
        upper_bound = composite_filter.filters.add()
        upper_bound.property_filter.property.name = KEY_PROPERTY_NAME
        upper_bound.property_filter.op = PropertyFilter.LESS_THAN
        upper_bound.property_filter.value.key_value.CopyFrom(next_key)

    return split_query
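# A minimal usage sketch for _create_split, assuming it is reachable through
# Beam's query_splitter module; 'my_kind' and the key names are hypothetical.
from google.cloud.proto.datastore.v1 import entity_pb2, query_pb2
from googledatastore import helper as datastore_helper
from apache_beam.io.gcp.datastore.v1 import query_splitter

query = query_pb2.Query()
query.kind.add().name = 'my_kind'

last_key = entity_pb2.Key()
datastore_helper.add_key_path(last_key, 'my_kind', 'key-0001')
next_key = entity_pb2.Key()
datastore_helper.add_key_path(next_key, 'my_kind', 'key-0100')

split = query_splitter._create_split(last_key, next_key, query)
# The split now carries __key__ >= last_key AND __key__ < next_key filters.
assert split.filter.composite_filter.op == query_pb2.CompositeFilter.AND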
def split_query(self, query, num_splits):
    """Generate dummy query splits."""
    split_queries = []
    for _ in range(0, num_splits):
        q = query_pb2.Query()
        q.CopyFrom(query)
        split_queries.append(q)
    return split_queries
def test_get_splits_with_two_splits(self):
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 2
    num_entities = 97
    batch_size = 9

    self.check_get_splits(query, num_splits, num_entities, batch_size)
def test_get_splits_with_multiple_splits(self):
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 4
    num_entities = 369
    batch_size = 12

    self.check_get_splits(query, num_splits, num_entities, batch_size)
def test_get_splits_with_large_num_splits(self):
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 10
    num_entities = 4
    batch_size = 10

    self.check_get_splits(query, num_splits, num_entities, batch_size)
def test_get_splits_with_batch_size_exact_multiple(self):
    """Test get_splits when num scatter keys is a multiple of batch size."""
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 4
    num_entities = 400
    batch_size = 32

    self.check_get_splits(query, num_splits, num_entities, batch_size)
def test_get_splits_with_large_batch_size(self):
    """Test get_splits when all scatter keys are returned in a single req."""
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 4
    num_entities = 400
    batch_size = 500

    self.check_get_splits(query, num_splits, num_entities, batch_size)
def test__build_protobuf_empty(self):
    from google.cloud.proto.datastore.v1 import query_pb2
    from google.cloud.datastore.query import Query

    client = _Client(None, None)
    query = Query(client)
    iterator = self._make_one(query, client)

    pb = iterator._build_protobuf()
    expected_pb = query_pb2.Query()
    self.assertEqual(pb, expected_pb)
def make_latest_timestamp_query(namespace):
    """Make a Query to fetch the latest timestamp statistics."""
    query = query_pb2.Query()
    if namespace is None:
        query.kind.add().name = '__Stat_Total__'
    else:
        query.kind.add().name = '__Stat_Ns_Total__'

    # Descending order of `timestamp`
    datastore_helper.add_property_orders(query, "-timestamp")
    # Only get the latest entity
    query.limit.value = 1
    return query
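# A quick inspection of what make_latest_timestamp_query produces for the
# default namespace; this only examines the protobuf, no Datastore call.
q = make_latest_timestamp_query(None)
assert q.kind[0].name == '__Stat_Total__'
assert q.limit.value == 1
# The single order clause sorts `timestamp` descending.
assert q.order[0].property.name == 'timestamp'
assert q.order[0].direction == query_pb2.PropertyOrder.DESCENDING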
def test_create_scatter_query(self):
    query = query_pb2.Query()
    kind = query.kind.add()
    kind.name = 'shakespeare-demo'
    num_splits = 10
    scatter_query = query_splitter._create_scatter_query(query, num_splits)
    self.assertEqual(scatter_query.kind[0], kind)
    self.assertEqual(scatter_query.limit.value,
                     (num_splits - 1) * query_splitter.KEYS_PER_SPLIT)
    self.assertEqual(scatter_query.order[0].direction,
                     query_pb2.PropertyOrder.ASCENDING)
    self.assertEqual(scatter_query.projection[0].property.name,
                     query_splitter.KEY_PROPERTY_NAME)
def make_ancestor_query(kind, namespace, ancestor):
    """Creates a Cloud Datastore ancestor query."""
    ancestor_key = entity_pb2.Key()
    datastore_helper.add_key_path(ancestor_key, kind, ancestor)
    if namespace is not None:
        ancestor_key.partition_id.namespace_id = namespace

    query = query_pb2.Query()
    query.kind.add().name = kind

    datastore_helper.set_property_filter(
        query.filter, '__key__', PropertyFilter.HAS_ANCESTOR, ancestor_key)
    return query
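# A minimal usage sketch; 'my_kind' and 'root-0001' are made-up names.
q = make_ancestor_query('my_kind', None, 'root-0001')
# The query keeps only entities whose key has ('my_kind', 'root-0001')
# among its ancestors.
assert q.filter.property_filter.property.name == '__key__'
assert q.filter.property_filter.op == query_pb2.PropertyFilter.HAS_ANCESTOR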
def create_query(self, kinds=(), order=False, limit=None, offset=None,
                 inequality_filter=False):
    query = query_pb2.Query()
    for kind in kinds:
        query.kind.add().name = kind
    if order:
        query.order.add()
    if limit is not None:
        query.limit.value = limit
    if offset is not None:
        query.offset = offset
    if inequality_filter:
        test_filter = query.filter.composite_filter.filters.add()
        test_filter.property_filter.op = PropertyFilter.GREATER_THAN
    return query
def make_query(kind):
    """Creates a Cloud Datastore query to retrieve all entities with a
    'created_at' date > N days ago.
    """
    days = 4
    now = datetime.datetime.now()
    earlier = now - datetime.timedelta(days=days)

    query = query_pb2.Query()
    query.kind.add().name = kind
    datastore_helper.set_property_filter(
        query.filter, 'created_at', PropertyFilter.GREATER_THAN, earlier)
    return query
def query(self):
    # Instantiate a filter protobuf.
    # You MUST instantiate the filter before the query, then instantiate
    # the query with the filter.
    filter_pb = query_pb2.Filter()
    # Get all non-deleted model instances.
    helper.set_property_filter(
        filter_pb, 'deleted', query_pb2.PropertyFilter.EQUAL, False)
    # Instantiate a query protobuf.
    query_pb = query_pb2.Query(filter=filter_pb)
    helper.set_kind(query_pb, self.model)
    return query_pb
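# The constructor-with-filter ordering above is one option. Other snippets in
# this section instead mutate query.filter in place after construction; both
# styles yield the same protobuf. A minimal sketch with a made-up kind name:
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper

query_pb = query_pb2.Query()
helper.set_kind(query_pb, 'my_model')
# Touching the nested filter message initializes it in place.
helper.set_property_filter(
    query_pb.filter, 'deleted', query_pb2.PropertyFilter.EQUAL, False)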
def setUp(self):
    self._mock_datastore = MagicMock()
    self._query = query_pb2.Query()
    self._query.kind.add().name = 'dummy_kind'
    patch_retry(self, helper)
    self._retriable_errors = [
        RPCError("dummy", code_pb2.INTERNAL, "failed"),
        SocketError(errno.ECONNRESET, "Connection Reset"),
        SocketError(errno.ETIMEDOUT, "Timed out"),
    ]
    self._non_retriable_errors = [
        RPCError("dummy", code_pb2.UNAUTHENTICATED, "failed"),
        SocketError(errno.EADDRNOTAVAIL, "Address not available"),
    ]
def make_ancestor_query(kind, namespace, ancestor):
    """Creates a Cloud Datastore ancestor query.

    The returned query will fetch all the entities that have the parent key
    name set to the given `ancestor`.
    """
    ancestor_key = entity_pb2.Key()
    datastore_helper.add_key_path(ancestor_key, kind, ancestor)
    if namespace is not None:
        ancestor_key.partition_id.namespace_id = namespace

    query = query_pb2.Query()
    query.kind.add().name = kind

    datastore_helper.set_property_filter(
        query.filter, '__key__', PropertyFilter.HAS_ANCESTOR, ancestor_key)
    return query
def expand(self, pcoll):
    query = query_pb2.Query()
    query.kind.add().name = 'Tweet'
    now = datetime.datetime.now()
    # The 'earlier' var will be set to a static value on template creation.
    # That is, because of the way that templates work, the value is defined
    # at template compile time, not runtime.
    # But defining a filter based on this value will still serve to make the
    # query more efficient than if we didn't filter at all.
    earlier = now - datetime.timedelta(days=self.days)
    datastore_helper.set_property_filter(
        query.filter, 'created_at', PropertyFilter.GREATER_THAN, earlier)

    return (pcoll
            | 'read from datastore' >> ReadFromDatastore(
                self.project, query, None))
def make_kind_stats_query(namespace, kind, latest_timestamp):
    """Make a Query to fetch the latest kind statistics."""
    kind_stat_query = query_pb2.Query()
    if namespace is None:
        kind_stat_query.kind.add().name = '__Stat_Kind__'
    else:
        kind_stat_query.kind.add().name = '__Stat_Ns_Kind__'

    kind_filter = datastore_helper.set_property_filter(
        query_pb2.Filter(), 'kind_name', PropertyFilter.EQUAL, unicode(kind))
    timestamp_filter = datastore_helper.set_property_filter(
        query_pb2.Filter(), 'timestamp', PropertyFilter.EQUAL,
        latest_timestamp)
    datastore_helper.set_composite_filter(
        kind_stat_query.filter, CompositeFilter.AND, kind_filter,
        timestamp_filter)
    return kind_stat_query
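# How the two stats helpers combine, as a sketch: latest_timestamp normally
# comes from running make_latest_timestamp_query and reading back its single
# result entity; a fixed datetime stands in here so the snippet runs offline.
# 'my_kind' is a made-up kind name.
import datetime

latest_timestamp = datetime.datetime(2017, 1, 1)
q = make_kind_stats_query(None, 'my_kind', latest_timestamp)
assert q.kind[0].name == '__Stat_Kind__'
# Both property filters sit under a single AND composite filter.
assert len(q.filter.composite_filter.filters) == 2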
def _create_scatter_query(query, num_splits):
    """Creates a scatter query from the given user query."""
    scatter_query = query_pb2.Query()
    for kind in query.kind:
        scatter_kind = scatter_query.kind.add()
        scatter_kind.CopyFrom(kind)

    # ascending order
    datastore_helper.add_property_orders(scatter_query, SCATTER_PROPERTY_NAME)

    # There is a split containing entities before and after each scatter
    # entity:
    #   ||---*------*------*------*------*------*------*---||  * = scatter entity
    # If we represent each split as a region before a scatter entity, there
    # is an extra region following the last scatter point. Thus, we do not
    # need the scatter entity for the last region.
    scatter_query.limit.value = (num_splits - 1) * KEYS_PER_SPLIT

    datastore_helper.add_projection(scatter_query, KEY_PROPERTY_NAME)
    return scatter_query
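# A worked check of the limit arithmetic above. KEYS_PER_SPLIT is 32 in
# Beam's query_splitter module (treated as an assumption here), so asking
# for 4 splits fetches 3 * 32 = 96 scatter keys.
from apache_beam.io.gcp.datastore.v1 import query_splitter
from google.cloud.proto.datastore.v1 import query_pb2

query = query_pb2.Query()
query.kind.add().name = 'my_kind'  # hypothetical kind
scatter = query_splitter._create_scatter_query(query, 4)
assert scatter.limit.value == 3 * query_splitter.KEYS_PER_SPLIT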
def test__next_page(self):
    from google.cloud.iterator import Page
    from google.cloud.proto.datastore.v1 import query_pb2
    from google.cloud.datastore.query import Query

    connection = _Connection()
    more_enum = query_pb2.QueryResultBatch.NOT_FINISHED
    result = _make_query_response([], b'', more_enum, 0)
    connection._results = [result]
    project = 'prujekt'
    client = _Client(project, connection)
    query = Query(client)
    iterator = self._make_one(query, client)

    page = iterator._next_page()

    self.assertIsInstance(page, Page)
    self.assertIs(page._parent, iterator)
    self.assertEqual(connection._called_with, [{
        'query_pb': query_pb2.Query(),
        'project': project,
        'namespace': None,
        'transaction_id': None,
    }])
def run(argv=None):
    """Main entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project
    # A random ancestor key.
    ancestor = str(uuid.uuid4())
    query = make_ancestor_query(kind, None, ancestor)

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    logging.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')

    # pylint: disable=expression-not-assigned
    (p
     | 'Input' >> beam.Create(list(range(known_args.num_entities)))
     | 'To String' >> beam.Map(str)
     | 'To Entity' >> beam.Map(EntityWrapper(kind, None, ancestor).make_entity)
     | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        logging.info(
            'Querying a limited set of %s entities and verifying count.',
            known_args.limit)
        p = new_pipeline_with_job_name(
            pipeline_options, job_name, '-verify-limit')
        query_with_limit = query_pb2.Query()
        query_with_limit.CopyFrom(query)
        query_with_limit.limit.value = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(
            project, query_with_limit)
        assert_that(
            entities | beam.combiners.Count.Globally(),
            equal_to([known_args.limit]))
        p.run()

    # Pipeline 3: Query the written Entities and verify result.
    logging.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    assert_that(
        entities | beam.combiners.Count.Globally(),
        equal_to([num_entities]))
    p.run()

    # Pipeline 4: Delete Entities.
    logging.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    # pylint: disable=expression-not-assigned
    (entities
     | 'To Keys' >> beam.Map(lambda entity: entity.key)
     | 'Delete keys' >> DeleteFromDatastore(project))
    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    logging.info(
        'Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(
        pipeline_options, job_name, '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    assert_that(
        entities | beam.combiners.Count.Globally(),
        equal_to([0]))
    p.run()
def setUp(self):
    self._mock_datastore = MagicMock()
    self._query = query_pb2.Query()
    self._query.kind.add().name = self._KIND
def test_get_splits_query_with_offset(self):
    query = query_pb2.Query()
    query.kind.add()
    query.offset = 10
    self.assertRaises(ValueError, query_splitter.get_splits, None, query, 2)
def test_get_splits_query_with_multiple_kinds(self):
    query = query_pb2.Query()
    query.kind.add()
    query.kind.add()
    self.assertRaises(ValueError, query_splitter.get_splits, None, query, 4)