def txn(shards):
    marker_key = ShardedTaskMarker.get_key(identifier, query._Query__namespace)
    try:
        rpc.Get(marker_key)

        # If the marker already exists, don't do anything - just return
        return
    except datastore_errors.EntityNotFoundError:
        pass

    marker = ShardedTaskMarker(identifier, query, namespace=query._Query__namespace)

    if shards:
        for shard in shards:
            marker["shards_queued"].append(cPickle.dumps(shard))
    else:
        # No shards, then there is nothing to do!
        marker["is_finished"] = True

    marker["time_started"] = datetime.utcnow()
    marker.put()

    if not marker["is_finished"]:
        deferred.defer(
            marker.begin_processing, operation, operation_method, entities_per_task, queue,
            _transactional=True, _queue=queue
        )

    return marker_key
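
# Illustrative sketch (not part of the original source): `txn(shards)` above is a closure
# over `identifier`, `query`, `operation` and friends, and is presumably run
# transactionally by the surrounding start-mapping code once the shard boundaries have
# been computed. Assuming `rpc.RunInTransaction` follows the datastore API of forwarding
# extra positional arguments to the wrapped function, the call site would look roughly like:
#
#     marker_key = rpc.RunInTransaction(txn, shards)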

def Run(self, limit, offset):
    opts = self._gae_query._Query__query_options
    if opts.keys_only or opts.projection:
        return self._gae_query.Run(limit=limit, offset=offset)

    ret = caching.get_from_cache(self._identifier, self._namespace)
    if ret is not None and not utils.entity_matches_query(ret, self._gae_query):
        ret = None

    if ret is None:
        # We do a fast keys_only query to get the result
        keys_query = rpc.Query(self._gae_query._Query__kind, keys_only=True, namespace=self._namespace)
        keys_query.update(self._gae_query)
        keys = keys_query.Run(limit=limit, offset=offset)

        # Do a consistent get so we don't cache stale data, and recheck the result matches the query
        ret = [
            x for x in rpc.Get(keys)
            if x and utils.entity_matches_query(x, self._gae_query)
        ]
        if len(ret) == 1:
            caching.add_entities_to_cache(
                self._model,
                [ret[0]],
                caching.CachingSituation.DATASTORE_GET,
                self._namespace,
            )

        return iter(ret)

    return iter([ret])

def txn():
    try:
        marker = rpc.Get(self.key())
        marker.__class__ = ShardedTaskMarker

        queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
        processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
        queued_count = len(queued_shards)

        for j in range(min(BATCH_SIZE, queued_count)):
            pickled_shard = queued_shards.pop()
            processing_shards.append(pickled_shard)
            shard = cPickle.loads(str(pickled_shard))
            deferred.defer(
                self.run_shard,
                query,
                shard,
                operation,
                operation_method,
                entities_per_task=entities_per_task,
                # Defer this task onto the correct queue with `_queue`, passing the `queue`
                # parameter back to the function again so that it can do the same next time
                queue=queue,
                _queue=queue,
                _transactional=True,
            )

        marker.put()
    except datastore_errors.EntityNotFoundError:
        logging.error(
            "Unable to start task %s as marker is missing",
            self.key().id_or_name()
        )
        return

def txn():
    pickled_shard = cPickle.dumps(shard)
    marker = rpc.Get(self.key())
    marker.__class__ = ShardedTaskMarker
    marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
    marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
    marker.put()

def is_mapper_running(identifier, namespace):
    """ Returns True if the mapper exists, but it's not finished """
    try:
        marker = rpc.Get(ShardedTaskMarker.get_key(identifier, namespace))
        return not marker["is_finished"]
    except datastore_errors.EntityNotFoundError:
        return False

def mapper_exists(identifier, namespace):
    """ Returns True if the mapper exists, False otherwise """
    try:
        rpc.Get(ShardedTaskMarker.get_key(identifier, namespace))
        return True
    except datastore_errors.EntityNotFoundError:
        return False
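
# Illustrative sketch (an assumption, not part of the original source): the two helpers
# above can act as a guard against queuing a duplicate mapper. `start_mapping` is assumed
# to be the entry point that creates the ShardedTaskMarker via the `txn(shards)`
# transaction shown earlier; its exact signature here is a guess.
def start_mapping_if_idle(identifier, query, operation, namespace):
    if is_mapper_running(identifier, namespace):
        # An unfinished marker already exists for this identifier - don't queue a second run
        return None
    # mapper_exists() would additionally tell us whether a *finished* marker is still present
    return start_mapping(identifier, query, operation)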

def begin_processing(self, operation, operation_method, entities_per_task, queue):
    BATCH_SIZE = 3

    # Unpickle the source query
    query = cPickle.loads(str(self["query"]))

    def txn():
        try:
            marker = rpc.Get(self.key())
            marker.__class__ = ShardedTaskMarker

            queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
            processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
            queued_count = len(queued_shards)

            for j in range(min(BATCH_SIZE, queued_count)):
                pickled_shard = queued_shards.pop()
                processing_shards.append(pickled_shard)
                shard = cPickle.loads(str(pickled_shard))
                deferred.defer(
                    self.run_shard,
                    query,
                    shard,
                    operation,
                    operation_method,
                    entities_per_task=entities_per_task,
                    # Defer this task onto the correct queue with `_queue`, passing the `queue`
                    # parameter back to the function again so that it can do the same next time
                    queue=queue,
                    _queue=queue,
                    _transactional=True,
                )

            marker.put()
        except datastore_errors.EntityNotFoundError:
            logging.error(
                "Unable to start task %s as marker is missing",
                self.key().id_or_name()
            )
            return

    # Reload the marker (non-transactionally) and defer the shards in batches
    # transactionally. If this task fails somewhere, it will resume where it left off
    marker = rpc.Get(self.key())
    for i in range(0, len(marker[ShardedTaskMarker.QUEUED_KEY]), BATCH_SIZE):
        rpc.RunInTransaction(txn)

def delete_batch(key_slice):
    entities = rpc.Get(key_slice)

    # FIXME: We need to make sure the entity still matches the query!
    # entities = (x for x in entities if utils.entity_matches_query(x, self.select.gae_query))

    to_delete = []
    to_update = []
    updated_keys = []

    # Go through the entities
    for entity in entities:
        if entity is None:
            continue

        wipe_polymodel_from_entity(entity, self.table_to_delete)
        if not entity.get('class'):
            to_delete.append(entity.key())
            if constraints_enabled:
                constraints.release(self.model, entity)
        else:
            to_update.append(entity)
        updated_keys.append(entity.key())

    rpc.DeleteAsync(to_delete)
    rpc.PutAsync(to_update)

    # Clean up any special index things that need to be cleaned
    for indexer in indexers_for_model(self.model):
        for key in to_delete:
            indexer.cleanup(key)

    caching.remove_entities_from_cache_by_key(
        updated_keys, self.namespace
    )

    return len(updated_keys)
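
# Illustrative sketch (an assumption, not part of the original source): delete_batch
# operates on a slice of keys, so a caller would typically chunk the full key list.
# The batch size of 25 used here is arbitrary, not a documented constant.
def delete_in_batches(keys, batch_size=25):
    processed = 0
    for start in range(0, len(keys), batch_size):
        # delete_batch returns the number of entities it touched (deleted or updated)
        processed += delete_batch(keys[start:start + batch_size])
    return processed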

def Run(self, limit=None, offset=None):
    """
        Here are the options:

        1. Single key, hit memcache
        2. Multikey projection, async MultiQueries with ancestors chained
        3. Full select, datastore get
    """
    opts = self.queries[0]._Query__query_options
    key_count = len(self.queries_by_key)

    is_projection = False

    max_cache_count = getattr(settings, "DJANGAE_CACHE_MAX_ENTITY_COUNT", DEFAULT_MAX_ENTITY_COUNT)

    cache_results = True
    results = None
    if key_count == 1:
        # FIXME: Potentially could use get_multi in memcache and then make a query
        # for whatever remains
        key = self.queries_by_key.keys()[0]
        result = caching.get_from_cache_by_key(key)
        if result is not None:
            results = [result]
            cache_results = False  # Don't update cache, we just got it from there

    if results is None:
        if opts.projection and self.can_multi_query:
            is_projection = True
            cache_results = False  # Don't cache projection results!

            # If we can multi-query in a single query, we do so using a number of
            # ancestor queries (to stay consistent) otherwise, we just do a
            # datastore Get, but this will return extra data over the RPC
            to_fetch = (offset or 0) + limit if limit else None
            additional_cols = set([x[0] for x in self.ordering if x[0] not in opts.projection])

            multi_query = []
            orderings = self.queries[0]._Query__orderings
            for key, queries in self.queries_by_key.items():
                for query in queries:
                    if additional_cols:
                        # We need to include additional orderings in the projection so that we can
                        # sort them in memory. Annoyingly that means reinstantiating the queries
                        query = rpc.Query(
                            kind=query._Query__kind,
                            filters=query,
                            projection=list(opts.projection) + list(additional_cols),
                            namespace=self.namespace,
                        )

                    query.Ancestor(key)  # Make this an ancestor query
                    multi_query.append(query)

            if len(multi_query) == 1:
                results = multi_query[0].Run(limit=to_fetch)
            else:
                results = AsyncMultiQuery(multi_query, orderings).Run(limit=to_fetch)
        else:
            results = rpc.Get(self.queries_by_key.keys())

    def iter_results(results):
        returned = 0
        # This is safe, because Django is fetching all results anyway :(
        sorted_results = sorted(results, cmp=partial(utils.django_ordering_comparison, self.ordering))
        sorted_results = [result for result in sorted_results if result is not None]

        if cache_results and sorted_results:
            caching.add_entities_to_cache(
                self.model,
                sorted_results[:max_cache_count],
                caching.CachingSituation.DATASTORE_GET,
                self.namespace,
            )

        for result in sorted_results:
            if is_projection:
                entity_matches_query = True
            else:
                entity_matches_query = any(
                    utils.entity_matches_query(result, qry)
                    for qry in self.queries_by_key[result.key()]
                )

            if not entity_matches_query:
                continue

            if offset and returned < offset:
                # Skip entities based on offset
                returned += 1
                continue
            else:
                yield _convert_entity_based_on_query_options(result, opts)

                returned += 1

                # If there is a limit, we might be done!
                if limit is not None and returned == (offset or 0) + limit:
                    break

    return iter_results(results)

def txn():
    caching.remove_entities_from_cache_by_key([key], self.namespace)

    try:
        result = rpc.Get(key)
    except datastore_errors.EntityNotFoundError:
        # Return false to indicate update failure
        return False

    if (
        isinstance(self.select.gae_query, (Query, meta_queries.UniqueQuery))  # ignore QueryByKeys and NoOpQuery
        and not utils.entity_matches_query(result, self.select.gae_query)
    ):
        # Due to eventual consistency the query may have returned an entity which no longer
        # matches the query
        return False

    original = copy.deepcopy(result)

    instance_kwargs = {field.attname: value for field, param, value in self.values}

    # Note: If you replace MockInstance with self.model, you'll find that some delete
    # tests fail in the test app. This is because any unspecified fields would then call
    # get_default (even though we aren't going to use them) which may run a query which
    # fails inside this transaction. Given that we are just using MockInstance so that we can
    # call django_instance_to_entities on it with the subset of fields we pass in,
    # what we have is fine.
    meta = self.model._meta
    instance = MockInstance(
        _original=MockInstance(_meta=meta, **result),
        _meta=meta,
        **instance_kwargs
    )

    # Convert the instance to an entity
    primary, descendents = django_instance_to_entities(
        self.connection,
        [x[0] for x in self.values],  # Pass in the fields that were updated
        True,
        instance,
        model=self.model
    )

    # Update the entity we read above with the new values
    result.update(primary)

    # Remove fields which have been marked to be unindexed
    for col in getattr(primary, "_properties_to_remove", []):
        if col in result:
            del result[col]

    # Make sure that any polymodel classes which were in the original entity are kept,
    # as django_instance_to_entities may have wiped them as well as added them.
    polymodel_classes = list(set(
        original.get(POLYMODEL_CLASS_ATTRIBUTE, []) +
        result.get(POLYMODEL_CLASS_ATTRIBUTE, [])
    ))
    if polymodel_classes:
        result[POLYMODEL_CLASS_ATTRIBUTE] = polymodel_classes

    def perform_insert():
        """ Inserts result, and any descendents with their ancestor value set """
        inserted_key = rpc.Put(result)

        if descendents:
            for i, descendent in enumerate(descendents):
                descendents[i] = Entity(
                    descendent.kind(),
                    parent=inserted_key,
                    namespace=inserted_key.namespace(),
                    id=descendent.key().id() or None,
                    name=descendent.key().name() or None
                )
                descendents[i].update(descendent)
            rpc.Put(descendents)

    if not constraints.has_active_unique_constraints(self.model):
        # The fast path, no constraint checking
        perform_insert()

        caching.add_entities_to_cache(
            self.model,
            [result],
            caching.CachingSituation.DATASTORE_PUT,
            self.namespace,
            skip_memcache=True,
        )
    else:
        markers_to_acquire[:], markers_to_release[:] = constraints.get_markers_for_update(
            self.model, original, result
        )

        perform_insert()

        constraints.update_identifiers(markers_to_acquire, markers_to_release, result.key())

        # If the rpc.Put() fails then the exception will only be raised when the
        # transaction applies, which means that we will still get to here and will still have
        # applied the marker changes (because they're in a nested, independent transaction).
        # Hence we set this flag to tell us that we got this far and that we should roll them back.
        rollback_markers[0] = True

        # If something dies between here and the `return` statement then we'll have stale unique markers

        try:
            # Update the cache before dealing with unique markers, as CachingSituation.DATASTORE_PUT
            # will only update the context cache
            caching.add_entities_to_cache(
                self.model,
                [result],
                caching.CachingSituation.DATASTORE_PUT,
                self.namespace,
                skip_memcache=True,
            )
        except:
            # We ignore the exception because raising will rollback the transaction causing
            # an inconsistent state
            logger.exception("Unable to update the context cache")
            pass

    # Return true to indicate update success
    return True

def run_shard(
    self, original_query, shard, operation, operation_method=None, offset=0,
    entities_per_task=None, queue=_DEFAULT_QUEUE
):
    """ Given a rpc.Query which does not have any high/low bounds on it, apply the bounds
        of the given shard (which is a pair of keys), and run either the given `operation`
        (if it's a function) or the given method of the given operation (if it's an object)
        on each entity that the query returns, starting at entity `offset`, and redeferring
        every `entities_per_task` entities to avoid hitting DeadlineExceededError.
        Tries (but does not guarantee) to avoid processing the same entity more than once.
    """
    entities_per_task = entities_per_task or getattr(
        settings, "DJANGAE_MIGRATION_DEFAULT_ENTITIES_PER_TASK", 100
    )
    if operation_method:
        function = getattr(operation, operation_method)
    else:
        function = operation

    marker = rpc.Get(self.key())
    if cPickle.dumps(shard) not in marker[ShardedTaskMarker.RUNNING_KEY]:
        return

    # Copy the query so that we can re-defer the original, unadulterated version, because once
    # we've applied limits and ordering to the query it causes pickle errors with defer.
    query = copy.deepcopy(original_query)
    query.Order("__key__")
    query["__key__ >="] = shard[0]
    query["__key__ <"] = shard[1]

    num_entities_processed = 0
    try:
        results = query.Run(offset=offset, limit=entities_per_task)
        for entity in results:
            function(entity)
            num_entities_processed += 1
            if num_entities_processed >= entities_per_task:
                raise Redefer()
    except (DeadlineExceededError, Redefer):
        # By keeping track of how many entities we've processed, we can (hopefully) avoid
        # re-processing entities if we hit DeadlineExceededError by redeferring with the
        # incremented offset. But note that if we get crushed by the HARD DeadlineExceededError
        # before we can redefer, then the whole task will retry and so entities will get
        # processed twice.
        deferred.defer(
            self.run_shard,
            original_query,
            shard,
            operation,
            operation_method,
            offset=offset+num_entities_processed,
            entities_per_task=entities_per_task,
            # Defer this task onto the correct queue (with `_queue`), passing the `queue`
            # parameter back to the function again so that it can do the same next time
            queue=queue,
            _queue=queue,
        )
        return  # This is important!

    # Once we've run the operation on all the entities, mark the shard as done
    def txn():
        pickled_shard = cPickle.dumps(shard)
        marker = rpc.Get(self.key())
        marker.__class__ = ShardedTaskMarker
        marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
        marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
        marker.put()

    rpc.RunInTransaction(txn)
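
# Illustrative sketch (an assumption, not part of the original source): each shard that
# run_shard receives is a (lower_key, upper_key) pair which it applies as
# `__key__ >= lower` / `__key__ < upper` bounds on the query. Given a sorted list of
# boundary keys, contiguous shard pairs could be built like this (the library's real
# shard computation may differ):
def shards_from_boundaries(boundary_keys):
    return [
        (boundary_keys[i], boundary_keys[i + 1])
        for i in range(len(boundary_keys) - 1)
    ]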