Example #1
def start_mapping(identifier,
                  query,
                  operation,
                  operation_method=None,
                  shard_count=None,
                  entities_per_task=None,
                  queue=None):
    """ This must *transactionally* defer a task which will call `operation._wrapped_map_entity` on
        all entities of the given `kind` in the given `namespace` and will then transactionally
        update the entity of the given `task_marker_key_key` with `is_finished=True` after all
        entities have been mapped.
    """
    shard_count = shard_count or getattr(
        settings, "DJANGAE_MIGRATION_DEFAULT_SHARD_COUNT", 32)
    shards_to_run = shard_query(query, shard_count)
    queue = queue or getattr(settings, "DJANGAE_MIGRATION_DEFAULT_QUEUE",
                             _DEFAULT_QUEUE)

    def txn(shards):
        marker_key = ShardedTaskMarker.get_key(identifier,
                                               query._Query__namespace)
        try:
            rpc.Get(marker_key)
            # If the marker already exists, don't do anything - just return
            return
        except datastore_errors.EntityNotFoundError:
            pass

        marker = ShardedTaskMarker(identifier,
                                   query,
                                   namespace=query._Query__namespace)

        if shards:
            for shard in shards:
                marker["shards_queued"].append(cPickle.dumps(shard))
        else:
            # No shards means there is nothing to do
            marker["is_finished"] = True
        marker["time_started"] = datetime.utcnow()
        marker.put()
        if not marker["is_finished"]:
            deferred.defer(marker.begin_processing,
                           operation,
                           operation_method,
                           entities_per_task,
                           queue,
                           _transactional=True,
                           _queue=queue)

        return marker_key

    return rpc.RunInTransaction(txn, shards_to_run)
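
A minimal invocation sketch (the `BackfillFlag` operation and the `UserProfile` query
here are hypothetical; `start_mapping` and its module-level imports are assumed to be
available as above):

from google.appengine.api import datastore

class BackfillFlag(object):
    # Hypothetical operation: sets a default value on each entity it maps.
    def _wrapped_map_entity(self, entity):
        entity["is_active"] = entity.get("is_active", True)
        datastore.Put(entity)

# Shard an unbounded query over all UserProfile entities and start mapping.
marker_key = start_mapping(
    "backfill_is_active",            # identifier used to build the marker key
    datastore.Query("UserProfile"),  # query to be split into key-range shards
    BackfillFlag(),
    operation_method="_wrapped_map_entity",
    shard_count=8,
)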
Example #2
    def begin_processing(self, operation, operation_method, entities_per_task, queue):
        """ Move queued shards onto the running list in transactional batches,
            deferring a `run_shard` task for each shard as it is moved.
        """
        BATCH_SIZE = 3

        # Unpickle the source query
        query = cPickle.loads(str(self["query"]))

        def txn():
            try:
                marker = rpc.Get(self.key())
                # Get returns a plain entity; cast it back to a ShardedTaskMarker
                marker.__class__ = ShardedTaskMarker

                queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
                processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
                queued_count = len(queued_shards)

                for j in range(min(BATCH_SIZE, queued_count)):
                    pickled_shard = queued_shards.pop()
                    processing_shards.append(pickled_shard)
                    shard = cPickle.loads(str(pickled_shard))
                    deferred.defer(
                        self.run_shard,
                        query,
                        shard,
                        operation,
                        operation_method,
                        entities_per_task=entities_per_task,
                        # Defer this task onto the correct queue with `_queue`, passing the `queue`
                        # parameter back to the function again so that it can do the same next time
                        queue=queue,
                        _queue=queue,
                        _transactional=True,
                    )

                marker.put()
            except datastore_errors.EntityNotFoundError:
                logging.error(
                    "Unable to start task %s as marker is missing",
                    self.key().id_or_name()
                )
                return

        # Reload the marker (non-transactionally) and defer the shards in batches
        # transactionally. If this task fails somewhere, it will resume where it left off
        marker = rpc.Get(self.key())
        for i in range(0, len(marker[ShardedTaskMarker.QUEUED_KEY]), BATCH_SIZE):
            rpc.RunInTransaction(txn)
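
The non-transactional read at the end only decides how many transactional batches to
run; each call to `txn` re-reads the marker and pops at most `BATCH_SIZE` shards. The
same pop-in-batches pattern in plain Python, with the datastore taken out of the
picture (illustrative only):

queued, running = ["s1", "s2", "s3", "s4", "s5"], []
BATCH_SIZE = 3

for _ in range(0, len(queued), BATCH_SIZE):      # two batches: 3 shards, then 2
    for _ in range(min(BATCH_SIZE, len(queued))):
        running.append(queued.pop())

print(running)  # ['s5', 's4', 's3', 's2', 's1'] - all shards moved to running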
Example #3
    def run_shard(
        self, original_query, shard, operation, operation_method=None, offset=0,
        entities_per_task=None, queue=_DEFAULT_QUEUE
    ):
        """ Given a rpc.Query which does not have any high/low bounds on it, apply the bounds
            of the given shard (which is a pair of keys), and run either the given `operation`
            (if it's a function) or the given method of the given operation (if it's an object) on
            each entity that the query returns, starting at entity `offset`, and redeferring every
            `entities_per_task` entities to avoid hitting DeadlineExceededError.
            Tries (but does not guarantee) to avoid processing the same entity more than once.
        """
        entities_per_task = entities_per_task or getattr(
            settings, "DJANGAE_MIGRATION_DEFAULT_ENTITIES_PER_TASK", 100
        )
        if operation_method:
            function = getattr(operation, operation_method)
        else:
            function = operation

        marker = rpc.Get(self.key())
        if cPickle.dumps(shard) not in marker[ShardedTaskMarker.RUNNING_KEY]:
            return

        # Copy the query so that we can re-defer the original, unadulterated version, because once
        # we've applied limits and ordering to the query it causes pickle errors with defer.
        query = copy.deepcopy(original_query)
        query.Order("__key__")
        query["__key__ >="] = shard[0]
        query["__key__ <"] = shard[1]

        num_entities_processed = 0
        try:
            results = query.Run(offset=offset, limit=entities_per_task)
            for entity in results:
                function(entity)
                num_entities_processed += 1
                if num_entities_processed >= entities_per_task:
                    raise Redefer()
        except (DeadlineExceededError, Redefer):
            # By keeping track of how many entities we've processed, we can (hopefully) avoid
            # re-processing entities if we hit DeadlineExceededError by redeferring with the
            # incremented offset.  But note that if we get crushed by the HARD DeadlineExceededError
            # before we can redefer, then the whole task will retry and so entities will get
            # processed twice.
            deferred.defer(
                self.run_shard,
                original_query,
                shard,
                operation,
                operation_method,
                offset=offset+num_entities_processed,
                entities_per_task=entities_per_task,
                # Defer this task onto the correct queue (with `_queue`), passing the `queue`
                # parameter back to the function again so that it can do the same next time
                queue=queue,
                _queue=queue,
            )
            return  # Important: the shard isn't finished; the redeferred task continues it

        # Once we've run the operation on all the entities, mark the shard as done
        def txn():
            pickled_shard = cPickle.dumps(shard)
            marker = rpc.Get(self.key())
            marker.__class__ = ShardedTaskMarker
            marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
            marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
            marker.put()

        rpc.RunInTransaction(txn)
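
The `Redefer` exception caught above is defined elsewhere in the module; a minimal
sketch of what it must look like for the `except (DeadlineExceededError, Redefer)`
clause to work:

class Redefer(Exception):
    """ Raised once `entities_per_task` entities have been processed in one task,
        so that the shard re-defers itself with an incremented offset instead of
        running on towards DeadlineExceededError.
    """
    pass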