Example #1
    def txn(shards):
        marker_key = ShardedTaskMarker.get_key(identifier, query._Query__namespace)
        try:
            rpc.Get(marker_key)

            # If the marker already exists, don't do anything - just return
            return
        except datastore_errors.EntityNotFoundError:
            pass

        marker = ShardedTaskMarker(identifier, query, namespace=query._Query__namespace)

        if shards:
            for shard in shards:
                marker["shards_queued"].append(cPickle.dumps(shard))
        else:
            # No shards? Then there is nothing to do!
            marker["is_finished"] = True
        marker["time_started"] = datetime.utcnow()
        marker.put()
        if not marker["is_finished"]:
            deferred.defer(
                marker.begin_processing, operation, operation_method, entities_per_task, queue,
                _transactional=True, _queue=queue
            )

        return marker_key
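The check-then-create above happens inside a single transaction, so two concurrent attempts to start the same mapper cannot both create a marker; the early return on an existing marker yields None instead of a key. A minimal sketch of how such a function might be invoked, assuming the enclosing code has already computed `shards` as a list of (start_key, end_key) pairs (a hypothetical value) and using rpc.RunInTransaction as Example #7 does:

    # Hypothetical caller: `shards` is assumed to be a list of (start_key, end_key) pairs
    # produced by the enclosing code. A closure keeps the callable zero-argument so it can
    # be passed to rpc.RunInTransaction exactly as in Example #7.
    marker_key = rpc.RunInTransaction(lambda: txn(shards))
    if marker_key is None:
        pass  # txn returned early: a marker already exists for this identifier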
Example #2
    def Run(self, limit, offset):
        opts = self._gae_query._Query__query_options
        if opts.keys_only or opts.projection:
            return self._gae_query.Run(limit=limit, offset=offset)

        ret = caching.get_from_cache(self._identifier, self._namespace)
        if ret is not None and not utils.entity_matches_query(
                ret, self._gae_query):
            ret = None

        if ret is None:
            # We do a fast keys_only query to get the matching keys
            keys_query = rpc.Query(self._gae_query._Query__kind,
                                   keys_only=True,
                                   namespace=self._namespace)
            keys_query.update(self._gae_query)
            keys = keys_query.Run(limit=limit, offset=offset)

            # Do a consistent Get so that we don't cache stale data, and recheck that the results match the query
            ret = [
                x for x in rpc.Get(keys)
                if x and utils.entity_matches_query(x, self._gae_query)
            ]
            if len(ret) == 1:
                caching.add_entities_to_cache(
                    self._model,
                    [ret[0]],
                    caching.CachingSituation.DATASTORE_GET,
                    self._namespace,
                )
            return iter(ret)

        return iter([ret])
Example #3
        def txn():
            try:
                marker = rpc.Get(self.key())
                marker.__class__ = ShardedTaskMarker

                queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
                processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
                queued_count = len(queued_shards)

                for j in range(min(BATCH_SIZE, queued_count)):
                    pickled_shard = queued_shards.pop()
                    processing_shards.append(pickled_shard)
                    shard = cPickle.loads(str(pickled_shard))
                    deferred.defer(
                        self.run_shard,
                        query,
                        shard,
                        operation,
                        operation_method,
                        entities_per_task=entities_per_task,
                        # Defer this task onto the correct queue with `_queue`, passing the `queue`
                        # parameter back to the function again so that it can do the same next time
                        queue=queue,
                        _queue=queue,
                        _transactional=True,
                    )

                marker.put()
            except datastore_errors.EntityNotFoundError:
                logging.error(
                    "Unable to start task %s as marker is missing",
                    self.key().id_or_name()
                )
                return
Example #4
def txn():
    pickled_shard = cPickle.dumps(shard)
    marker = rpc.Get(self.key())
    marker.__class__ = ShardedTaskMarker
    marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
    marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
    marker.put()
Example #5
def is_mapper_running(identifier, namespace):
    """
        Returns True if the mapper exists but is not yet finished
    """
    try:
        marker = rpc.Get(ShardedTaskMarker.get_key(identifier, namespace))
        return not marker["is_finished"]
    except datastore_errors.EntityNotFoundError:
        return False
Example #6
def mapper_exists(identifier, namespace):
    """
        Returns True if the mapper exists, False otherwise
    """
    try:
        rpc.Get(ShardedTaskMarker.get_key(identifier, namespace))
        return True
    except datastore_errors.EntityNotFoundError:
        return False
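Taken together, Examples #5 and #6 distinguish three marker states: no marker at all, a marker that is still running, and a marker that has finished. A purely illustrative, self-contained model of that truth table, using a plain dict in place of the datastore (none of these local names exist in the library):

    # Illustrative only: `markers` maps an identifier to a dict with an "is_finished" flag.
    markers = {}

    def mapper_exists_local(identifier):
        return identifier in markers

    def is_mapper_running_local(identifier):
        return identifier in markers and not markers[identifier]["is_finished"]

    markers["import_users"] = {"is_finished": False}
    assert mapper_exists_local("import_users") and is_mapper_running_local("import_users")

    markers["import_users"]["is_finished"] = True
    assert mapper_exists_local("import_users") and not is_mapper_running_local("import_users")

    assert not mapper_exists_local("missing") and not is_mapper_running_local("missing")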
Example #7
    def begin_processing(self, operation, operation_method, entities_per_task, queue):
        BATCH_SIZE = 3

        # Unpickle the source query
        query = cPickle.loads(str(self["query"]))

        def txn():
            try:
                marker = rpc.Get(self.key())
                marker.__class__ = ShardedTaskMarker

                queued_shards = marker[ShardedTaskMarker.QUEUED_KEY]
                processing_shards = marker[ShardedTaskMarker.RUNNING_KEY]
                queued_count = len(queued_shards)

                for j in range(min(BATCH_SIZE, queued_count)):
                    pickled_shard = queued_shards.pop()
                    processing_shards.append(pickled_shard)
                    shard = cPickle.loads(str(pickled_shard))
                    deferred.defer(
                        self.run_shard,
                        query,
                        shard,
                        operation,
                        operation_method,
                        entities_per_task=entities_per_task,
                        # Defer this task onto the correct queue with `_queue`, passing the `queue`
                        # parameter back to the function again so that it can do the same next time
                        queue=queue,
                        _queue=queue,
                        _transactional=True,
                    )

                marker.put()
            except datastore_errors.EntityNotFoundError:
                logging.error(
                    "Unable to start task %s as marker is missing",
                    self.key().id_or_name()
                )
                return

        # Reload the marker (non-transactionally) and defer the shards in batches
        # transactionally. If this task fails somewhere, it will resume where it left off
        marker = rpc.Get(self.key())
        for i in range(0, len(marker[ShardedTaskMarker.QUEUED_KEY]), BATCH_SIZE):
            rpc.RunInTransaction(txn)
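The outer loop runs one transaction per batch; because each transaction re-reads the marker and pops from whatever is still queued, a task that fails and retries simply carries on with the shards that remain. A purely illustrative sketch of the batching arithmetic, with plain lists standing in for the marker's shard lists:

    # Illustrative only: with BATCH_SIZE = 3 and 7 queued shards, range(0, 7, 3) yields
    # three "transactions", each moving at most 3 shards from queued to running.
    BATCH_SIZE = 3
    queued = list(range(7))   # stand-ins for the pickled shards
    running = []
    for _ in range(0, len(queued), BATCH_SIZE):
        for _ in range(min(BATCH_SIZE, len(queued))):
            running.append(queued.pop())
    assert queued == [] and len(running) == 7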
Example #8
        def delete_batch(key_slice):
            entities = rpc.Get(key_slice)

            # FIXME: We need to make sure the entity still matches the query!
#            entities = (x for x in entities if utils.entity_matches_query(x, self.select.gae_query))

            to_delete = []
            to_update = []
            updated_keys = []

            # Go through the entities
            for entity in entities:
                if entity is None:
                    continue

                wipe_polymodel_from_entity(entity, self.table_to_delete)
                if not entity.get('class'):
                    to_delete.append(entity.key())
                    if constraints_enabled:
                        constraints.release(self.model, entity)
                else:
                    to_update.append(entity)
                updated_keys.append(entity.key())

            rpc.DeleteAsync(to_delete)
            rpc.PutAsync(to_update)

            # Clean up any special index things that need to be cleaned
            for indexer in indexers_for_model(self.model):
                for key in to_delete:
                    indexer.cleanup(key)

            caching.remove_entities_from_cache_by_key(
                updated_keys, self.namespace
            )

            return len(updated_keys)
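The branch on entity.get('class') is what decides between a real delete and an update: an entity is only deleted once the wiped polymodel class was the last one it had; otherwise it is re-put without that class. A purely illustrative reduction of that decision to plain dicts (classify and its wiping behaviour are assumptions standing in for wipe_polymodel_from_entity, which is not shown here):

    # Illustrative only: classify() mimics the delete-or-update decision in delete_batch.
    def classify(entity, class_to_wipe):
        # Stand-in for wipe_polymodel_from_entity: drop the wiped model's class entry.
        entity["class"] = [c for c in entity.get("class", []) if c != class_to_wipe]
        return "update" if entity.get("class") else "delete"

    assert classify({"class": ["Animal", "Dog"]}, "Dog") == "update"   # other classes remain
    assert classify({"class": ["Dog"]}, "Dog") == "delete"             # nothing left to keep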
Example #9
    def Run(self, limit=None, offset=None):
        """
            Here are the options:

            1. Single key, hit memcache
            2. Multikey projection, async MultiQueries with ancestors chained
            3. Full select, datastore get
        """
        opts = self.queries[0]._Query__query_options
        key_count = len(self.queries_by_key)

        is_projection = False

        max_cache_count = getattr(settings, "DJANGAE_CACHE_MAX_ENTITY_COUNT",
                                  DEFAULT_MAX_ENTITY_COUNT)

        cache_results = True
        results = None
        if key_count == 1:
            # FIXME: Potentially we could use get_multi on memcache and then make a query
            # for whatever remains
            key = self.queries_by_key.keys()[0]
            result = caching.get_from_cache_by_key(key)
            if result is not None:
                results = [result]
                cache_results = False  # Don't update cache, we just got it from there

        if results is None:
            if opts.projection and self.can_multi_query:
                is_projection = True
                cache_results = False  # Don't cache projection results!

                # If we can multi-query in a single query, we do so using a number of
                # ancestor queries (to stay consistent); otherwise, we just do a
                # datastore Get, but this will return extra data over the RPC
                to_fetch = (offset or 0) + limit if limit else None
                additional_cols = set([
                    x[0] for x in self.ordering if x[0] not in opts.projection
                ])

                multi_query = []
                orderings = self.queries[0]._Query__orderings
                for key, queries in self.queries_by_key.items():
                    for query in queries:
                        if additional_cols:
                            # We need to include additional orderings in the projection so that we can
                            # sort them in memory. Annoyingly that means reinstantiating the queries
                            query = rpc.Query(
                                kind=query._Query__kind,
                                filters=query,
                                projection=list(opts.projection) + list(additional_cols),
                                namespace=self.namespace,
                            )

                        query.Ancestor(key)  # Make this an ancestor query
                        multi_query.append(query)

                if len(multi_query) == 1:
                    results = multi_query[0].Run(limit=to_fetch)
                else:
                    results = AsyncMultiQuery(multi_query,
                                              orderings).Run(limit=to_fetch)
            else:
                results = rpc.Get(self.queries_by_key.keys())

        def iter_results(results):
            returned = 0
            # This is safe, because Django is fetching all results anyway :(
            sorted_results = sorted(results,
                                    cmp=partial(
                                        utils.django_ordering_comparison,
                                        self.ordering))
            sorted_results = [
                result for result in sorted_results if result is not None
            ]
            if cache_results and sorted_results:
                caching.add_entities_to_cache(
                    self.model,
                    sorted_results[:max_cache_count],
                    caching.CachingSituation.DATASTORE_GET,
                    self.namespace,
                )

            for result in sorted_results:
                if is_projection:
                    entity_matches_query = True
                else:
                    entity_matches_query = any(
                        utils.entity_matches_query(result, qry)
                        for qry in self.queries_by_key[result.key()])

                if not entity_matches_query:
                    continue

                if offset and returned < offset:
                    # Skip entities based on offset
                    returned += 1
                    continue
                else:

                    yield _convert_entity_based_on_query_options(result, opts)

                    returned += 1

                    # If there is a limit, we might be done!
                    if limit is not None and returned == (offset or 0) + limit:
                        break

        return iter_results(results)
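The offset and limit are applied in memory inside iter_results, after sorting: results are skipped until `offset` of them have been counted, and iteration stops once `(offset or 0) + limit` have been counted in total. A purely illustrative reduction of that skip/stop arithmetic to plain integers:

    # Illustrative only: the same offset/limit counting as iter_results above.
    def take(results, offset=None, limit=None):
        returned = 0
        for result in results:
            if offset and returned < offset:
                returned += 1       # skip results that fall before the offset
                continue
            yield result
            returned += 1
            if limit is not None and returned == (offset or 0) + limit:
                break               # `limit` results have been yielded past the offset

    assert list(take(range(10), offset=2, limit=3)) == [2, 3, 4]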
Example #10
        def txn():
            caching.remove_entities_from_cache_by_key([key], self.namespace)

            try:
                result = rpc.Get(key)
            except datastore_errors.EntityNotFoundError:
                # Return false to indicate update failure
                return False

            if (
                isinstance(self.select.gae_query, (Query, meta_queries.UniqueQuery)) # ignore QueryByKeys and NoOpQuery
                and not utils.entity_matches_query(result, self.select.gae_query)
            ):
                # Due to eventual consistency the query may have returned an entity which no longer
                # matches the query
                return False

            original = copy.deepcopy(result)

            instance_kwargs = {field.attname: value for field, param, value in self.values}

            # Note: If you replace MockInstance with self.model, you'll find that some delete
            # tests fail in the test app. This is because any unspecified fields would then call
            # get_default (even though we aren't going to use them), which may run a query that
            # fails inside this transaction. Given that we are just using MockInstance so that we can
            # call django_instance_to_entities on it with the subset of fields we pass in,
            # what we have is fine.
            meta = self.model._meta
            instance = MockInstance(
                _original=MockInstance(_meta=meta, **result),
                _meta=meta,
                **instance_kwargs
            )

            # Convert the instance to an entity
            primary, descendents = django_instance_to_entities(
                self.connection,
                [x[0] for x in self.values],  # Pass in the fields that were updated
                True, instance,
                model=self.model
            )

            # Update the entity we read above with the new values
            result.update(primary)

            # Remove fields which have been marked to be unindexed
            for col in getattr(primary, "_properties_to_remove", []):
                if col in result:
                    del result[col]

            # Make sure that any polymodel classes which were in the original entity are kept,
            # as django_instance_to_entities may have wiped them as well as added them.
            polymodel_classes = list(set(
                original.get(POLYMODEL_CLASS_ATTRIBUTE, []) + result.get(POLYMODEL_CLASS_ATTRIBUTE, [])
            ))
            if polymodel_classes:
                result[POLYMODEL_CLASS_ATTRIBUTE] = polymodel_classes

            def perform_insert():
                """
                    Inserts result, and any descendents with their ancestor
                    value set
                """
                inserted_key = rpc.Put(result)
                if descendents:
                    for i, descendent in enumerate(descendents):
                        descendents[i] = Entity(
                            descendent.kind(),
                            parent=inserted_key,
                            namespace=inserted_key.namespace(),
                            id=descendent.key().id() or None,
                            name=descendent.key().name() or None
                        )
                        descendents[i].update(descendent)
                    rpc.Put(descendents)

            if not constraints.has_active_unique_constraints(self.model):
                # The fast path, no constraint checking
                perform_insert()

                caching.add_entities_to_cache(
                    self.model,
                    [result],
                    caching.CachingSituation.DATASTORE_PUT,
                    self.namespace,
                    skip_memcache=True,
                )
            else:
                markers_to_acquire[:], markers_to_release[:] = constraints.get_markers_for_update(
                    self.model, original, result
                )

                perform_insert()

                constraints.update_identifiers(markers_to_acquire, markers_to_release, result.key())

                # If the rpc.Put() fails then the exception will only be raised when the
                # transaction applies, which means that we will still get here and will still have
                # applied the marker changes (because they're in a nested, independent transaction).
                # Hence we set this flag to tell us that we got this far and that we should roll them back.
                rollback_markers[0] = True
                # If something dies between here and the `return` statement then we'll have stale unique markers

                try:
                    # Update the cache before dealing with unique markers, as CachingSituation.DATASTORE_PUT
                    # will only update the context cache
                    caching.add_entities_to_cache(
                        self.model,
                        [result],
                        caching.CachingSituation.DATASTORE_PUT,
                        self.namespace,
                        skip_memcache=True,
                    )
                except:
                    # We ignore the exception because raising will rollback the transaction causing
                    # an inconsistent state
                    logger.exception("Unable to update the context cache")
                    pass

            # Return true to indicate update success
            return True
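markers_to_acquire, markers_to_release and rollback_markers are lists defined in the enclosing (not shown) function: the transaction assigns to markers_to_acquire[:] and rollback_markers[0] rather than rebinding the names, because an inner function can mutate a closed-over list but cannot reassign the outer variable (this is Python 2 code, as the cPickle usage elsewhere suggests, so nonlocal is not available). A purely illustrative demonstration of that idiom:

    # Illustrative only: mutating a closed-over list is visible to the outer function,
    # whereas rebinding the name inside txn() would not be.
    def outer():
        rollback = [False]
        acquired = []

        def txn():
            acquired[:] = ["marker-a", "marker-b"]   # replace the contents in place
            rollback[0] = True                       # flag the outer function can read

        txn()
        return rollback[0], acquired

    assert outer() == (True, ["marker-a", "marker-b"])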
Example #11
    def run_shard(
        self, original_query, shard, operation, operation_method=None, offset=0,
        entities_per_task=None, queue=_DEFAULT_QUEUE
    ):
        """ Given a rpc.Query which does not have any high/low bounds on it, apply the bounds
            of the given shard (which is a pair of keys), and run either the given `operation`
            (if it's a function) or the given method of the given operation (if it's an object) on
            each entity that the query returns, starting at entity `offset`, and redeferring every
            `entities_per_task` entities to avoid hitting DeadlineExceededError.
            Tries (but does not guarantee) to avoid processing the same entity more than once.
        """
        entities_per_task = entities_per_task or getattr(
            settings, "DJANGAE_MIGRATION_DEFAULT_ENTITIES_PER_TASK", 100
        )
        if operation_method:
            function = getattr(operation, operation_method)
        else:
            function = operation

        marker = rpc.Get(self.key())
        if cPickle.dumps(shard) not in marker[ShardedTaskMarker.RUNNING_KEY]:
            return

        # Copy the query so that we can re-defer the original, unadulterated version, because once
        # we've applied limits and ordering to the query it causes pickle errors with defer.
        query = copy.deepcopy(original_query)
        query.Order("__key__")
        query["__key__ >="] = shard[0]
        query["__key__ <"] = shard[1]

        num_entities_processed = 0
        try:
            results = query.Run(offset=offset, limit=entities_per_task)
            for entity in results:
                function(entity)
                num_entities_processed += 1
                if num_entities_processed >= entities_per_task:
                    raise Redefer()
        except (DeadlineExceededError, Redefer):
            # By keeping track of how many entities we've processed, we can (hopefully) avoid
            # re-processing entities if we hit DeadlineExceededError by redeferring with the
            # incremented offset.  But note that if we get crushed by the HARD DeadlineExceededError
            # before we can redefer, then the whole task will retry and so entities will get
            # processed twice.
            deferred.defer(
                self.run_shard,
                original_query,
                shard,
                operation,
                operation_method,
                offset=offset+num_entities_processed,
                entities_per_task=entities_per_task,
                # Defer this task onto the correct queue (with `_queue`), passing the `queue`
                # parameter back to the function again so that it can do the same next time
                queue=queue,
                _queue=queue,
            )
            return  # This is important!

        # Once we've run the operation on all the entities, mark the shard as done
        def txn():
            pickled_shard = cPickle.dumps(shard)
            marker = rpc.Get(self.key())
            marker.__class__ = ShardedTaskMarker
            marker[ShardedTaskMarker.RUNNING_KEY].remove(pickled_shard)
            marker[ShardedTaskMarker.FINISHED_KEY].append(pickled_shard)
            marker.put()

        rpc.RunInTransaction(txn)
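The offset passed on each redeferral is what keeps a shard moving forward: every task processes at most entities_per_task entities of its key range and hands the next task an offset advanced by however many it actually finished. A purely illustrative reduction of that arithmetic (the shard size here is made up):

    # Illustrative only: successive deferred tasks cover disjoint offset windows of one shard.
    entities_in_shard = 250
    entities_per_task = 100
    offset = 0
    processed = []
    while offset < entities_in_shard:
        # One "task": handle entities [offset, offset + entities_per_task) of the shard.
        batch = list(range(offset, min(offset + entities_per_task, entities_in_shard)))
        processed.extend(batch)
        offset += len(batch)   # the offset the next deferred task would receive
    assert processed == list(range(entities_in_shard))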