Example #1
0
    def __init__(self, *model_types):
        """Initializes the decorator to target the given types of models.

        Args:
            *model_types: tuple(class). The models the decorator will target. If
                an argument is a base class, all of its subclasses will be
                targeted as well.

        Raises:
            ValueError. When no model types are provided.
            TypeError. When a non-model type is provided.
        """
        if not model_types:
            raise ValueError('Must target at least one model')
        self._targeted_model_types = set()
        for t in model_types:
            if t in _MODEL_TYPES_BY_BASE_CLASS:
                # A base class targets all of its registered subclasses.
                self._targeted_model_types.update(_MODEL_TYPES_BY_BASE_CLASS[t])
            elif t in _ALL_MODEL_TYPES:
                self._targeted_model_types.add(t)
            else:
                raise TypeError(
                    '%r is not a model registered in core.platform' % t)
        # The kind (storage name) of every targeted model type, for quick
        # membership checks against models encountered at runtime.
        self._targeted_kinds = {
            job_utils.get_model_kind(t) for t in self._targeted_model_types
        }
    def __init__(self, message, model_or_kind, model_id=None):
        """Initializes a new audit error.

        Args:
            message: str. A description of the error.
            model_or_kind: Model|bytes. A model (type: BaseModel) when model_id
                is None; otherwise a model's kind (type: bytes).
            model_id: bytes|None. The model's ID, or None when model_or_kind is
                a model.

        Raises:
            TypeError. When the input message is not a string.
            ValueError. When the input message is empty.
        """
        if not python_utils.is_string(message):
            raise TypeError('message must be a string')
        if not message:
            raise ValueError('message must be a non-empty string')

        # When a kind was passed explicitly, use it as-is; otherwise derive
        # both the ID and the kind from the given model.
        if model_id is not None:
            model_kind = model_or_kind
        else:
            model_id = job_utils.get_model_id(model_or_kind)
            model_kind = job_utils.get_model_kind(model_or_kind)

        error_message = '%s in %s(id=%s): %s' % (
            self.__class__.__name__, model_kind,
            utils.quoted(model_id), message)
        super(BaseAuditError, self).__init__(stderr=error_message)
Example #3
0
    def __init__(self, model_class, property_obj):
        """Initializes a new ModelProperty instance.

        Args:
            model_class: type(base_model.BaseModel). The model's class.
            property_obj: datastore_services.Property|@property. An NDB Property
                or a Python @property.

        Raises:
            TypeError. The model_class is not a type.
            TypeError. The model_class is not a subclass of BaseModel.
            TypeError. The property_obj is not an NDB Property.
            ValueError. The property_obj is not in the model_class.
        """
        if not isinstance(model_class, type):
            raise TypeError('%r is not a model class' % model_class)
        if not issubclass(model_class, base_models.BaseModel):
            raise TypeError('%r is not a subclass of BaseModel' % model_class)

        self._model_kind = job_utils.get_model_kind(model_class)

        if property_obj is model_class.id:
            # BaseModel.id is a Python @property, not an NDB Property, so it
            # must be special-cased before the isinstance check below.
            self._property_name = 'id'
        else:
            if not isinstance(property_obj, datastore_services.Property):
                raise TypeError('%r is not an NDB Property' % property_obj)
            if all(p is not property_obj
                   for p in model_class._properties.values()):  # pylint: disable=protected-access
                raise ValueError('%r is not a property of %s' %
                                 (property_obj, self._model_kind))
            self._property_name = property_obj._name  # pylint: disable=protected-access
Example #4
0
    def model_kind(self):
        """Returns the kind of model this instance refers to.

        Returns:
            str. The model's kind (e.g. 'BaseModel'), as reported by
            job_utils.get_model_kind().
        """
        return job_utils.get_model_kind(self._model_class)
Example #5
0
    def from_model(cls, model):
        """Builds the model key that identifies the given model.

        Args:
            model: Model. The model to create a key for.

        Returns:
            ModelKey. The corresponding model key.
        """
        kind = job_utils.get_model_kind(model)
        model_id = job_utils.get_model_id(model)
        return cls(model_kind=kind, model_id=model_id)
Example #6
0
    def __init__(self, model_or_kind, model_id=None):
        """Initializes a new audit error.

        Args:
            model_or_kind: Model|bytes. A model (type: BaseModel) when model_id
                is None; otherwise a model's kind (type: bytes).
            model_id: bytes|None. The model's ID, or None when model_or_kind is
                a model.
        """
        if model_id is None:
            # Only a model was given; pull both identifiers out of it.
            model_id = job_utils.get_model_id(model_or_kind)
            model_kind = job_utils.get_model_kind(model_or_kind)
        else:
            model_kind = model_or_kind
        # self._message starts out as just the model's identifiers, which will
        # be used to annotate the _actual_ message provided by subclasses.
        self._message = (model_kind, model_id)
Example #7
0
    def _get_model_kind(self, model_class):
        """Returns the kind of the model class.

        Args:
            model_class: type(base_model.BaseModel). A subclass of BaseModel
                (the class itself, not an instance of it).

        Returns:
            str. The model's kind.

        Raises:
            TypeError. The model class is not a type at all.
            TypeError. The model class is not a subclass of BaseModel.
        """
        if not isinstance(model_class, type):
            raise TypeError('%r is an instance, not a type' % model_class)
        if not issubclass(model_class, base_models.BaseModel):
            raise TypeError(
                '%s is not a subclass of BaseModel' % model_class.__name__)
        return job_utils.get_model_kind(model_class)
Example #8
0
    def run(self):
        """Returns a PCollection of audit errors aggregated from all models.

        Returns:
            PCollection. A PCollection of audit errors discovered during the
            audit.

        Raises:
            ValueError. When the `datastoreio` option, which provides the
                PTransforms for performing datastore IO operations, is None.
        """
        # NOTE(review): the ValueError documented above is not raised directly
        # in this body; presumably it comes from the `datastoreio_stub`
        # property or from ndb_io.GetModels -- confirm.
        # The partition index is int(model.deleted): slice 0 holds the models
        # that still exist, slice 1 holds the models marked deleted.
        existing_models, deleted_models = (
            self.pipeline
            | 'Get all models' >> ndb_io.GetModels(
                datastore_services.query_everything(), self.datastoreio_stub)
            | 'Partition by model.deleted' >> (
                beam.Partition(lambda model, _: int(model.deleted), 2))
        )

        models_of_kind_by_index = (
            existing_models
            # NOTE: Partition returns a statically-sized list of PCollections.
            # Creating partitions is wasteful when there are fewer items than
            # there are partitions, like in our unit tests. In exchange, in
            # production the job will be able to take advantage of the high
            # parallelizability of PCollections, which are designed for enormous
            # datasets and parallel processing.
            #
            # Alternatively, we could have used GroupBy. However, that returns
            # an _iterable_ of items rather than a PCollection, and so it is
            # vulnerable to out-of-memory errors.
            #
            # Since this job is concerned with running audits on EVERY MODEL IN
            # STORAGE, Partition is the clear winner regardless of the overhead
            # we'll see in unit tests.
            | 'Split models into parallelizable PCollections' >> beam.Partition(
                lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
                # NOTE: Partition requires a hard-coded number of slices; it
                # cannot be used with dynamic numbers generated in a pipeline.
                # KIND_BY_INDEX is a constant tuple so that requirement is
                # satisfied in this case.
                len(KIND_BY_INDEX), KIND_BY_INDEX)
        )

        existing_key_count_pcolls = []
        missing_key_error_pcolls = []
        # Deleted models get their own dedicated validation DoFn; every other
        # audit result is appended/extended onto this list below.
        audit_error_pcolls = [
            deleted_models
            | 'Apply ValidateDeletedModel on deleted models' >> (
                beam.ParDo(base_validation.ValidateDeletedModel()))
        ]

        # KIND_BY_INDEX and models_of_kind_by_index are parallel sequences, so
        # zipping them pairs each kind with the PCollection of its models.
        model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
        for kind, models_of_kind in model_groups:
            audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

            if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
                existing_key_count_pcolls.append(
                    models_of_kind | GetExistingModelKeyCounts(kind))

            if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
                missing_key_error_pcolls.extend(
                    models_of_kind | GetMissingModelKeyErrors(kind))

        existing_key_counts = (
            existing_key_count_pcolls
            | 'Flatten PCollections of existing key counts' >> beam.Flatten()
        )
        missing_key_errors = (
            missing_key_error_pcolls
            | 'Flatten PCollections of missing key errors' >> beam.Flatten()
        )
        # Join the "exists" counts with the "missing" errors by model key, and
        # keep only the keys that actually have errors attached to them.
        audit_error_pcolls.append(
            (existing_key_counts, missing_key_errors)
            | 'Group counts and errors by key' >> beam.CoGroupByKey()
            | 'Filter keys without any errors' >> (
                beam.FlatMapTuple(self._get_model_relationship_errors))
        )

        return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
Example #9
0
 def test_get_from_bad_value(self):
     with self.assertRaisesRegexp(
             TypeError, 'not a model type or instance'):
         job_utils.get_model_kind(123)
Example #10
0
 def test_get_from_datastore_model_class(self):
     kind = job_utils.get_model_kind(base_models.BaseModel)
     self.assertEqual(kind, 'BaseModel')
Example #11
0
    def test_get_from_datastore_model(self):
        self.assertEqual(
            job_utils.get_model_kind(base_models.BaseModel()), 'BaseModel')
Example #12
0
 def test_get_from_bad_value(self) -> None:
     with self.assertRaisesRegexp(  # type: ignore[no-untyped-call]
             TypeError, 'not a model type or instance'):
         job_utils.get_model_kind(123)  # type: ignore[arg-type]
Example #13
0
 def test_get_from_cloud_datastore_entity(self):
     key = cloud_datastore_types.Key('BaseModel', '123', project='foo')
     entity = cloud_datastore_types.Entity(key=key)
     self.assertEqual(job_utils.get_model_kind(entity), 'BaseModel')
Example #14
0
    def run(self):
        """Returns a PCollection of audit errors aggregated from all models.

        Returns:
            PCollection. A PCollection of audit errors discovered during the
            audit.

        Raises:
            ValueError. When the `model_getter` option, which should be the type
                of PTransform we will use to fetch models from the datastore, is
                None.
        """
        if self.job_options.model_getter is None:
            raise ValueError('JobOptions.model_getter must not be None')

        # The partition index is int(model.deleted): slice 0 holds the models
        # that still exist, slice 1 holds the models marked deleted.
        existing_models, deleted_models = (
            self.pipeline
            | 'Get all models' >> self.job_options.model_getter()
            | 'Partition by model.deleted' >>
            (beam.Partition(lambda model, _: int(model.deleted), 2)))

        models_of_kind_by_index = (
            existing_models
            # NOTE: Partition returns a statically-sized list of PCollections.
            # Creating partitions is wasteful when there are fewer items than
            # there are partitions, like in our unit tests. In exchange, in
            # production the job will be able to take advantage of the high
            # parallelizability of PCollections, which are designed for enormous
            # datasets and parallel processing.
            #
            # Alternatively, we could have used GroupBy. However, that returns
            # an _iterable_ of items rather than a PCollection, and so it is
            # vulnerable to out-of-memory errors.
            #
            # Since this job is concerned with running audits on EVERY MODEL IN
            # STORAGE, Partition is the clear winner regardless of the overhead
            # we'll see in unit tests.
            |
            'Split models into parallelizable PCollections' >> beam.Partition(
                lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
                # NOTE: Partition requires a hard-coded number of slices; it
                # cannot be used with dynamic numbers generated in a pipeline.
                # KIND_BY_INDEX is a constant tuple so that requirement is
                # satisfied in this case.
                len(KIND_BY_INDEX),
                KIND_BY_INDEX))

        # Deleted models get their own dedicated validation DoFn; per-kind
        # audit results are extended onto this list below.
        audit_error_pcolls = [
            deleted_models
            | 'Apply ValidateDeletedModel on deleted models' >>
            (beam.ParDo(base_model_audits.ValidateDeletedModel()))
        ]

        # KIND_BY_INDEX and models_of_kind_by_index are parallel sequences, so
        # zipping them pairs each kind with the PCollection of its models.
        model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
        for kind, models_of_kind in model_groups:
            # NOTE: Using extend() instead of append() because ApplyAuditDoFns
            # produces an iterable of PCollections rather than a single one.
            # NOTE: Label is missing because ApplyAuditDoFns labels itself.
            audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

        return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()