def start_mapping(
    identifier, query, operation, operation_method=None, shard_count=None,
    entities_per_task=None, queue=None
):
    """ This must *transactionally* defer a task which will call
        `operation._wrapped_map_entity` on all entities of the given query's kind and
        namespace, and will then transactionally update the ShardedTaskMarker entity
        with `is_finished=True` after all entities have been mapped.
    """
    shard_count = shard_count or getattr(settings, "DJANGAE_MIGRATION_DEFAULT_SHARD_COUNT", 32)
    shards_to_run = shard_query(query, shard_count)
    queue = queue or getattr(settings, "DJANGAE_MIGRATION_DEFAULT_QUEUE", _DEFAULT_QUEUE)

    def txn(shards):
        marker_key = ShardedTaskMarker.get_key(identifier, query._Query__namespace)
        try:
            rpc.Get(marker_key)

            # If the marker already exists, don't do anything - just return
            return
        except datastore_errors.EntityNotFoundError:
            pass

        marker = ShardedTaskMarker(identifier, query, namespace=query._Query__namespace)

        if shards:
            for shard in shards:
                marker["shards_queued"].append(cPickle.dumps(shard))
        else:
            # No shards, then there is nothing to do!
            marker["is_finished"] = True

        marker["time_started"] = datetime.utcnow()
        marker.put()

        if not marker["is_finished"]:
            deferred.defer(
                marker.begin_processing,
                operation,
                operation_method,
                entities_per_task,
                queue,
                _transactional=True,
                _queue=queue,
            )

        return marker_key

    return rpc.RunInTransaction(txn, shards_to_run)
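
# --- Illustrative sketch (not part of the library) ---------------------------------
# A minimal, pure-Python picture of the state `start_mapping` sets up: starting is
# idempotent (an existing marker for the same identifier is left untouched), every
# shard is queued on the marker, and an empty shard list means the job is already
# finished. The dict-based marker store and the helper name are assumptions for
# illustration only; the real code uses the ShardedTaskMarker entity inside a
# datastore transaction and pickles each shard with cPickle.dumps before queuing it.
def _example_start_mapping_state(existing_markers, identifier, shards):
    # Idempotent start: mirrors the rpc.Get / EntityNotFoundError check in `txn` above.
    if identifier in existing_markers:
        return existing_markers[identifier]

    marker = {"shards_queued": list(shards), "is_finished": not shards}
    existing_markers[identifier] = marker
    return marker


# e.g. markers = {}
# _example_start_mapping_state(markers, "backfill_users", [("a", "m"), ("m", None)])
# queues two shards; calling it again with the same identifier returns the existing
# marker unchanged, just as `start_mapping` returns early when the marker exists.
# -------------------------------------------------------------------------------------
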
def begin_processing(self, operation, operation_method, entities_per_task, queue):
    BATCH_SIZE = 3

    # Unpickle the source query
    query = cPickle.loads(str(self["query"]))

    def txn():
        try:
            marker = rpc.Get(self.key())
            marker.__class__ = ShardedTaskMarker

            queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
            processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
            queued_count = len(queued_shards)

            for _ in range(min(BATCH_SIZE, queued_count)):
                pickled_shard = queued_shards.pop()
                processing_shards.append(pickled_shard)
                shard = cPickle.loads(str(pickled_shard))
                deferred.defer(
                    self.run_shard,
                    query,
                    shard,
                    operation,
                    operation_method,
                    entities_per_task=entities_per_task,
                    # Defer this task onto the correct queue with `_queue`, passing the `queue`
                    # parameter back to the function again so that it can do the same next time
                    queue=queue,
                    _queue=queue,
                    _transactional=True,
                )

            marker.put()
        except datastore_errors.EntityNotFoundError:
            logging.error(
                "Unable to start task %s as marker is missing",
                self.key().id_or_name()
            )
            return

    # Reload the marker (non-transactionally) and defer the shards in batches
    # transactionally. If this task fails somewhere, it will resume where it left off
    marker = rpc.Get(self.key())
    for _ in range(0, len(marker[ShardedTaskMarker.QUEUED_KEY]), BATCH_SIZE):
        rpc.RunInTransaction(txn)
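
# --- Illustrative sketch (not part of the library) ----------------------------------
# `begin_processing` drains the queued-shard list in transactions of at most BATCH_SIZE
# pops, moving each shard onto the running list before deferring a task for it. The
# sketch below shows the same move-in-batches pattern on plain in-memory lists; the
# helper name, the list arguments and the `dispatch` callback are assumptions standing
# in for the marker entity and deferred.defer(self.run_shard, ...).
def _example_drain_in_batches(queued, running, batch_size=3, dispatch=None):
    while queued:
        # One "transaction": move at most `batch_size` items from queued to running.
        for _ in range(min(batch_size, len(queued))):
            item = queued.pop()
            running.append(item)
            if dispatch is not None:
                dispatch(item)  # stand-in for deferring run_shard for this shard


# e.g. queued = ["s1", "s2", "s3", "s4", "s5"]; running = []
# _example_drain_in_batches(queued, running) empties `queued` over two passes
# (3 items, then 2), leaving running == ["s5", "s4", "s3", "s2", "s1"].
# --------------------------------------------------------------------------------------
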
def run_shard(
    self, original_query, shard, operation, operation_method=None, offset=0,
    entities_per_task=None, queue=_DEFAULT_QUEUE
):
    """ Given a rpc.Query which does not have any high/low bounds on it, apply the bounds
        of the given shard (which is a pair of keys), and run either the given `operation`
        (if it's a function) or the given method of the given operation (if it's an object)
        on each entity that the query returns, starting at entity `offset`, and redeferring
        every `entities_per_task` entities to avoid hitting DeadlineExceededError.
        Tries (but does not guarantee) to avoid processing the same entity more than once.
    """
    entities_per_task = entities_per_task or getattr(
        settings, "DJANGAE_MIGRATION_DEFAULT_ENTITIES_PER_TASK", 100
    )

    if operation_method:
        function = getattr(operation, operation_method)
    else:
        function = operation

    marker = rpc.Get(self.key())
    if cPickle.dumps(shard) not in marker[ShardedTaskMarker.RUNNING_KEY]:
        return

    # Copy the query so that we can re-defer the original, unadulterated version, because once
    # we've applied limits and ordering to the query it causes pickle errors with defer.
    query = copy.deepcopy(original_query)
    query.Order("__key__")
    query["__key__ >="] = shard[0]
    query["__key__ <"] = shard[1]

    num_entities_processed = 0
    try:
        results = query.Run(offset=offset, limit=entities_per_task)
        for entity in results:
            function(entity)
            num_entities_processed += 1
            if num_entities_processed >= entities_per_task:
                raise Redefer()
    except (DeadlineExceededError, Redefer):
        # By keeping track of how many entities we've processed, we can (hopefully) avoid
        # re-processing entities if we hit DeadlineExceededError by redeferring with the
        # incremented offset. But note that if we get crushed by the HARD DeadlineExceededError
        # before we can redefer, then the whole task will retry and so entities will get
        # processed twice.
        deferred.defer(
            self.run_shard,
            original_query,
            shard,
            operation,
            operation_method,
            offset=offset + num_entities_processed,
            entities_per_task=entities_per_task,
            # Defer this task onto the correct queue (with `_queue`), passing the `queue`
            # parameter back to the function again so that it can do the same next time
            queue=queue,
            _queue=queue,
        )
        return  # This is important!

    # Once we've run the operation on all the entities, mark the shard as done
    def txn():
        pickled_shard = cPickle.dumps(shard)
        marker = rpc.Get(self.key())
        marker.__class__ = ShardedTaskMarker
        marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
        marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
        marker.put()

    rpc.RunInTransaction(txn)
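
# --- Illustrative sketch (not part of the library) ----------------------------------
# The offset arithmetic in `run_shard`: process at most `per_task` items starting at
# `offset`, and on interruption resume from `offset + processed`, so already-handled
# entities are (usually) not visited again. `Interrupted` and the helper name are
# assumptions standing in for DeadlineExceededError / Redefer and the deferred retry;
# a plain sequence stands in for the bounded datastore query results.
class Interrupted(Exception):
    pass


def _example_process_with_offset(entities, function, offset=0, per_task=100):
    processed = 0
    try:
        for entity in entities[offset:offset + per_task]:
            function(entity)
            processed += 1
            if processed >= per_task:
                raise Interrupted()
    except Interrupted:
        return offset + processed  # the next run would start here ("redefer")
    return None  # shard fully processed


# e.g. seen = []
# _example_process_with_offset(range(10), seen.append, per_task=4) returns 4 with
# seen == [0, 1, 2, 3]; calling it again with offset=4 continues where it left off.
# --------------------------------------------------------------------------------------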