def __init__(self, *model_types):
    """Initializes the decorator to target the given types of models.

    Args:
        *model_types: tuple(class). The models the decorator will target.
            If an argument is a base class, all of its subclasses will be
            targeted as well.

    Raises:
        ValueError. When no model types are provided.
        TypeError. When a non-model type is provided.
    """
    if not model_types:
        raise ValueError('Must target at least one model')

    self._targeted_model_types = set()
    for t in model_types:
        if t in _MODEL_TYPES_BY_BASE_CLASS:
            # Base classes expand to every registered subclass.
            self._targeted_model_types.update(_MODEL_TYPES_BY_BASE_CLASS[t])
        elif t in _ALL_MODEL_TYPES:
            self._targeted_model_types.add(t)
        else:
            raise TypeError(
                '%r is not a model registered in core.platform' % t)

    # Pre-compute the kind names so membership checks don't need to touch
    # the model classes again.
    self._targeted_kinds = {
        job_utils.get_model_kind(t) for t in self._targeted_model_types
    }
def __init__(self, message, model_or_kind, model_id=None):
    """Initializes a new audit error.

    Args:
        message: str. The message describing the error.
        model_or_kind: Model|bytes. A model (type: BaseModel) when
            model_id is omitted; otherwise the model's kind
            (type: bytes).
        model_id: bytes|None. The model's ID, or None when model_or_kind
            is a model.

    Raises:
        TypeError. When the input message is not a string.
        ValueError. When the input message is empty.
    """
    if not python_utils.is_string(message):
        raise TypeError('message must be a string')
    if not message:
        raise ValueError('message must be a non-empty string')

    if model_id is not None:
        # Caller passed the kind and ID explicitly.
        model_kind = model_or_kind
    else:
        # Caller passed a model instance; derive both identifiers from it.
        model_id = job_utils.get_model_id(model_or_kind)
        model_kind = job_utils.get_model_kind(model_or_kind)

    error_message = '%s in %s(id=%s): %s' % (
        self.__class__.__name__, model_kind,
        utils.quoted(model_id), message)
    super(BaseAuditError, self).__init__(stderr=error_message)
def __init__(self, model_class, property_obj):
    """Initializes a new ModelProperty instance.

    Args:
        model_class: type(base_model.BaseModel). The model's class.
        property_obj: datastore_services.Property|@property. An NDB
            Property or a Python @property.

    Raises:
        TypeError. The model_class is not a type.
        TypeError. The model_class is not a subclass of BaseModel.
        TypeError. The property_obj is not an NDB Property.
        ValueError. The property_obj is not in the model_class.
    """
    if not isinstance(model_class, type):
        raise TypeError('%r is not a model class' % model_class)
    if not issubclass(model_class, base_models.BaseModel):
        raise TypeError('%r is not a subclass of BaseModel' % model_class)

    self._model_kind = job_utils.get_model_kind(model_class)

    if property_obj is model_class.id:
        # BaseModel.id is a Python @property, not an NDB Property, so it
        # needs special-casing before the isinstance check below.
        self._property_name = 'id'
        return

    if not isinstance(property_obj, datastore_services.Property):
        raise TypeError('%r is not an NDB Property' % property_obj)

    # Identity comparison (not equality) ensures the property belongs to
    # this exact model class.
    registered = model_class._properties.values() # pylint: disable=protected-access
    if not any(p is property_obj for p in registered):
        raise ValueError(
            '%r is not a property of %s' % (property_obj, self._model_kind))

    self._property_name = property_obj._name # pylint: disable=protected-access
def model_kind(self):
    """Returns the kind of model this instance refers to.

    Returns:
        bytes. The model's kind.
    """
    model_class = self._model_class
    return job_utils.get_model_kind(model_class)
def from_model(cls, model):
    """Creates a model key from the given model.

    Args:
        model: Model. The model to create a key for.

    Returns:
        ModelKey. The corresponding model key.
    """
    kind = job_utils.get_model_kind(model)
    model_id = job_utils.get_model_id(model)
    return cls(model_kind=kind, model_id=model_id)
def __init__(self, model_or_kind, model_id=None):
    """Initializes a new audit error.

    Args:
        model_or_kind: Model|bytes. A model (type: BaseModel) when
            model_id is omitted; otherwise the model's kind
            (type: bytes).
        model_id: bytes|None. The model's ID, or None when model_or_kind
            is a model.
    """
    if model_id is None:
        # A model instance was given; derive both identifiers from it.
        model_id = job_utils.get_model_id(model_or_kind)
        model_kind = job_utils.get_model_kind(model_or_kind)
    else:
        model_kind = model_or_kind

    # At first, self._message is a tuple of model identifiers that will be
    # used to annotate the _actual_ message provided by subclasses.
    self._message = (model_kind, model_id)
def _get_model_kind(self, model_class):
    """Returns the kind of the model class.

    Args:
        model_class: BaseModel. A subclass of BaseModel.

    Returns:
        str. The model's kind.

    Raises:
        TypeError. The model class is not a subclass of BaseModel.
    """
    if isinstance(model_class, type):
        if not issubclass(model_class, base_models.BaseModel):
            raise TypeError(
                '%s is not a subclass of BaseModel' % model_class.__name__)
    else:
        raise TypeError('%r is an instance, not a type' % model_class)
    return job_utils.get_model_kind(model_class)
def run(self):
    """Returns a PCollection of audit errors aggregated from all models.

    Returns:
        PCollection. A PCollection of audit errors discovered during the
        audit.

    Raises:
        ValueError. When the `datastoreio` option, which provides the
        PTransforms for performing datastore IO operations, is None.
    """
    # NOTE(review): no explicit ValueError is raised in this body;
    # presumably it comes from ndb_io.GetModels/the datastoreio stub when
    # the option is None — confirm against those helpers.

    # Fetch every model in storage, then split it into two PCollections:
    # index 0 = models with deleted == False, index 1 = deleted == True.
    existing_models, deleted_models = (
        self.pipeline
        | 'Get all models' >> ndb_io.GetModels(
            datastore_services.query_everything(), self.datastoreio_stub)
        | 'Partition by model.deleted' >> (
            beam.Partition(lambda model, _: int(model.deleted), 2))
    )
    models_of_kind_by_index = (
        existing_models
        # NOTE: Partition returns a statically-sized list of PCollections.
        # Creating partitions is wasteful when there are fewer items than
        # there are partitions, like in our unit tests. In exchange, in
        # production the job will be able to take advantage of the high
        # parallelizability of PCollections, which are designed for enormous
        # datasets and parallel processing.
        #
        # Alternatively, we could have used GroupBy. However, that returns
        # an _iterable_ of items rather than a PCollection, and so it is
        # vulnerable to out-of-memory errors.
        #
        # Since this job is concerned with running audits on EVERY MODEL IN
        # STORAGE, Partition is the clear winner regardless of the overhead
        # we'll see in unit tests.
        | 'Split models into parallelizable PCollections' >> beam.Partition(
            lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
            # NOTE: Partition requires a hard-coded number of slices; it
            # cannot be used with dynamic numbers generated in a pipeline.
            # KIND_BY_INDEX is a constant tuple so that requirement is
            # satisfied in this case.
            len(KIND_BY_INDEX), KIND_BY_INDEX)
    )

    # Accumulators for the model-relationship join performed below:
    # (key, count) pairs for keys that exist, and (key, error) pairs for
    # keys referenced by other models' ID properties.
    existing_key_count_pcolls = []
    missing_key_error_pcolls = []
    # Deleted models get their own dedicated validation pass.
    audit_error_pcolls = [
        deleted_models
        | 'Apply ValidateDeletedModel on deleted models' >> (
            beam.ParDo(base_validation.ValidateDeletedModel()))
    ]

    # KIND_BY_INDEX and models_of_kind_by_index are parallel sequences, so
    # zipping them pairs each kind name with its partition.
    model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
    for kind, models_of_kind in model_groups:
        # extend(), not append(): ApplyAuditDoFns yields an iterable of
        # PCollections (one per audit DoFn).
        audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

        if kind in ALL_MODEL_KINDS_REFERENCED_BY_PROPERTIES:
            existing_key_count_pcolls.append(
                models_of_kind | GetExistingModelKeyCounts(kind))

        if kind in ID_REFERENCING_PROPERTIES_BY_KIND_OF_POSSESSOR:
            missing_key_error_pcolls.extend(
                models_of_kind | GetMissingModelKeyErrors(kind))

    # Merge the per-kind accumulators into single PCollections so they can
    # be joined by key.
    existing_key_counts = (
        existing_key_count_pcolls
        | 'Flatten PCollections of existing key counts' >> beam.Flatten()
    )
    missing_key_errors = (
        missing_key_error_pcolls
        | 'Flatten PCollections of missing key errors' >> beam.Flatten()
    )

    # Join counts and errors on the model key; the FlatMapTuple step keeps
    # only the error reports that survive _get_model_relationship_errors.
    audit_error_pcolls.append(
        (existing_key_counts, missing_key_errors)
        | 'Group counts and errors by key' >> beam.CoGroupByKey()
        | 'Filter keys without any errors' >> (
            beam.FlatMapTuple(self._get_model_relationship_errors))
    )

    return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()
def test_get_from_bad_value(self):
    # An int is neither a model type nor a model instance, so the helper
    # must reject it.
    with self.assertRaisesRegexp(TypeError, 'not a model type or instance'):
        job_utils.get_model_kind(123)
def test_get_from_datastore_model_class(self):
    # Passing the class itself (not an instance) should yield its kind.
    kind = job_utils.get_model_kind(base_models.BaseModel)
    self.assertEqual(kind, 'BaseModel')
def test_get_from_datastore_model(self):
    # An instance should report the same kind as its class.
    instance = base_models.BaseModel()
    kind = job_utils.get_model_kind(instance)
    self.assertEqual(kind, 'BaseModel')
def test_get_from_bad_value(self) -> None:
    # An int is neither a model type nor a model instance, so the helper
    # must reject it.
    expected_regexp = 'not a model type or instance'
    with self.assertRaisesRegexp(TypeError, expected_regexp):  # type: ignore[no-untyped-call]
        job_utils.get_model_kind(123)  # type: ignore[arg-type]
def test_get_from_cloud_datastore_entity(self):
    # A raw Cloud Datastore entity's kind comes from its key.
    key = cloud_datastore_types.Key('BaseModel', '123', project='foo')
    entity = cloud_datastore_types.Entity(key=key)
    self.assertEqual(job_utils.get_model_kind(entity), 'BaseModel')
def run(self):
    """Returns a PCollection of audit errors aggregated from all models.

    Returns:
        PCollection. A PCollection of audit errors discovered during the
        audit.

    Raises:
        ValueError. When the `model_getter` option, which should be the
        type of PTransform we will use to fetch models from the datastore,
        is None.
    """
    # Fail fast before constructing any pipeline steps.
    if self.job_options.model_getter is None:
        raise ValueError('JobOptions.model_getter must not be None')

    # Fetch every model, then split into two PCollections:
    # index 0 = models with deleted == False, index 1 = deleted == True.
    existing_models, deleted_models = (
        self.pipeline
        | 'Get all models' >> self.job_options.model_getter()
        | 'Partition by model.deleted' >> (
            beam.Partition(lambda model, _: int(model.deleted), 2)))

    models_of_kind_by_index = (
        existing_models
        # NOTE: Partition returns a statically-sized list of PCollections.
        # Creating partitions is wasteful when there are fewer items than
        # there are partitions, like in our unit tests. In exchange, in
        # production the job will be able to take advantage of the high
        # parallelizability of PCollections, which are designed for enormous
        # datasets and parallel processing.
        #
        # Alternatively, we could have used GroupBy. However, that returns
        # an _iterable_ of items rather than a PCollection, and so it is
        # vulnerable to out-of-memory errors.
        #
        # Since this job is concerned with running audits on EVERY MODEL IN
        # STORAGE, Partition is the clear winner regardless of the overhead
        # we'll see in unit tests.
        | 'Split models into parallelizable PCollections' >> beam.Partition(
            lambda m, _, kinds: kinds.index(job_utils.get_model_kind(m)),
            # NOTE: Partition requires a hard-coded number of slices; it
            # cannot be used with dynamic numbers generated in a pipeline.
            # KIND_BY_INDEX is a constant tuple so that requirement is
            # satisfied in this case.
            len(KIND_BY_INDEX), KIND_BY_INDEX))

    # Deleted models get their own dedicated validation pass.
    audit_error_pcolls = [
        deleted_models
        | 'Apply ValidateDeletedModel on deleted models' >> (
            beam.ParDo(base_model_audits.ValidateDeletedModel()))
    ]

    # KIND_BY_INDEX and models_of_kind_by_index are parallel sequences, so
    # zipping them pairs each kind name with its partition.
    model_groups = python_utils.ZIP(KIND_BY_INDEX, models_of_kind_by_index)
    for kind, models_of_kind in model_groups:
        # NOTE: Using extend() instead of append() because ApplyAuditDoFns
        # produces an iterable of PCollections rather than a single one.
        # NOTE: Label is missing because ApplyAuditDoFns labels itself.
        audit_error_pcolls.extend(models_of_kind | ApplyAuditDoFns(kind))

    return audit_error_pcolls | 'Combine audit results' >> beam.Flatten()