class JSONMultiSchemaModel(fake_models.FakeModel):

    entity = JSONSchemaField(
        schema=multischema(
            {
                'PERSON': object(
                    type=const('PERSON'),
                    name=string(),
                    required=['type', 'name']),
                'ANIMAL': object(
                    type=const('ANIMAL'),
                    age=number(),
                    required=['type', 'age']),
            },
            by_field='type'))
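# A minimal sketch of payloads the field above is meant to accept, assuming
# the `multischema(...)` helper dispatches on the `type` key
# (`by_field='type'`); the concrete values are hypothetical:
#
#     JSONMultiSchemaModel(entity={'type': 'PERSON', 'name': 'John'})  # PERSON branch
#     JSONMultiSchemaModel(entity={'type': 'ANIMAL', 'age': 7})        # ANIMAL branch
#     JSONMultiSchemaModel(entity={'type': 'ANIMAL', 'name': 'Rex'})   # fails validation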
class JSONArrayMultiSchemaModel(fake_models.FakeModel):

    class EntityType(Enum):
        WORKER = 'WORKER'
        FREE = 'FREE'

    entities = JSONSchemaField(
        schema=array(
            one_of(
                object(
                    name=string(),
                    type=enum(EntityType, const=EntityType.WORKER.value),
                    required=['type', 'name']),
                object(
                    age=number(),
                    type=enum(EntityType, const=EntityType.FREE.value),
                    required=['type', 'age']),
                by_field='type')))
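# Analogous sketch for the array field above: each list element must match one
# of the two `one_of` branches, selected by the `type` key. The values below
# are hypothetical:
#
#     JSONArrayMultiSchemaModel(entities=[
#         {'type': 'WORKER', 'name': 'Alice'},
#         {'type': 'FREE', 'age': 42},
#     ])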
class DownloadRequest(ValidatingModel):

    objects = DownloadRequestManager()

    #
    # Version Control
    #
    created_datetime = models.DateTimeField(auto_now_add=True)

    updated_datetime = models.DateTimeField(auto_now=True)

    #
    # Authorship
    #
    created_by = models.ForeignKey(
        Account, null=True, blank=True, on_delete=models.SET_NULL)

    #
    # Waiters
    #
    waiters = models.ManyToManyField(
        Account, related_name='download_requests_as_waiter')

    #
    # Data Related Fields
    #
    @unique
    class FilterOperator(Enum):
        GREATER_THAN = '>'
        GREATER_THAN_EQUAL = '>='
        SMALLER_THAN = '<'
        SMALLER_THAN_EQUAL = '<='
        EQUAL = '='
        NOT_EQUAL = '!='

    column_type_to_operators = {
        CatalogueItem.ColumnType.INTEGER.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.FLOAT.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.STRING.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.BOOLEAN.value: [
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
    }

    spec = JSONSchemaField(
        schema=object(
            columns=array(string()),
            filters=array(
                object(
                    name=string(),
                    operator=enum(*[o.value for o in FilterOperator]),
                    value=one_of(number(), string(), boolean()),
                    required=['name', 'operator', 'value'])),
            randomize_ratio=number(),
            required=['columns', 'filters']))

    normalized_spec = models.TextField(default='', blank=True)

    blob_name = models.CharField(null=True, blank=True, max_length=256)

    real_size = models.IntegerField(null=True, blank=True)

    estimated_size = models.IntegerField(null=True, blank=True)

    #
    # CATALOGER / EXECUTOR
    #
    catalogue_item = models.ForeignKey(
        CatalogueItem,
        on_delete=models.CASCADE,
        related_name='download_requests')

    executor_job_id = models.CharField(null=True, blank=True, max_length=256)

    is_cancelled = models.BooleanField(default=False)

    def execute(self):
        self.blob_name = self.catalogue_item.executor.execute(self)
        self.save()

    def clean(self):
        self.validate_spec_in_context_of_catalogue_item_spec()

    def validate_spec_in_context_of_catalogue_item_spec(self):
        """Validate spec using `CatalogueItem.spec`.
        - `spec.columns` must be taken from the list of registered columns
          as specified in `catalogue_item.spec`
        - `spec.filters[i].name` must be taken from the list of registered
          columns as specified in `catalogue_item.spec`
        - `spec.filters[i].operator` must be taken from the list of operators
          allowed for a column type
        - `spec.filters[i].value` must be of column type (or None if
          `is_nullable` was set)
        - `spec.randomize_ratio` must be in range [0, 1]

        """
        # -- only `catalogue_item.spec` columns are allowed
        # -- in `columns` and `filters` sections
        allowed_columns = set(
            [col['name'] for col in self.catalogue_item.spec])
        columns = set(self.spec['columns'])
        if not columns:
            raise ValidationError(
                "at least one column must be specified in 'columns'")

        if len(self.spec['columns']) != len(columns):
            raise ValidationError("columns must appear only once in 'columns'")

        col_is_nullable = {
            column_spec['name']: column_spec['is_nullable']
            for column_spec in self.catalogue_item.spec
        }
        col_types = {
            column_spec['name']: column_spec['type']
            for column_spec in self.catalogue_item.spec
        }
        filter_columns = set(f['name'] for f in self.spec['filters'])

        if not columns.issubset(allowed_columns):
            unknown_columns = columns - allowed_columns
            unknown_columns = ', '.join([f"'{c}'" for c in unknown_columns])  # noqa
            raise ValidationError(
                f"unknown columns in 'columns' detected: {unknown_columns}")

        if not filter_columns.issubset(allowed_columns):
            unknown_columns = filter_columns - allowed_columns
            unknown_columns = ', '.join([f"'{c}'" for c in unknown_columns])
            raise ValidationError(
                f"unknown columns in 'filters' detected: {unknown_columns}")

        filters = self.spec['filters']
        for f in filters:
            # -- operators in the filters must be valid ones
            operator = f.get('operator')
            name = f.get('name')
            value = f.get('value')
            # -- skip only partially specified filters; compare against None
            # -- so that legitimate falsy values (0, False) are still validated
            if operator is None or name is None or value is None:
                continue

            # -- types used in filter must correspond to the types of their
            # -- respective columns in `catalogue_item.spec`
            col_type = col_types[name]
            col_python_type = (
                CatalogueItem.column_type_to_python_type[col_type])
            expected_types = (col_python_type, )
            if col_is_nullable[name]:
                expected_types += (type(None), )

            if not isinstance(value, expected_types):
                raise ValidationError(
                    f"column type and filter value type "
                    f"mismatch detected for column '{name}'")

            allowed_operators = self.column_type_to_operators[col_type]
            if operator not in allowed_operators:
                raise ValidationError(
                    f"operator '{operator}' not allowed for column '{name}' "
                    f"detected")

        # -- randomize_ratio must be in range [0, 1]
        randomize_ratio = self.spec.get('randomize_ratio')
        if not isinstance(randomize_ratio, float):
            return

        if randomize_ratio < 0 or randomize_ratio > 1:
            raise ValidationError(
                "'randomize_ratio' not in allowed [0, 1] range detected")

    @staticmethod
    def normalize_spec(spec):
        columns = ','.join(sorted(spec['columns']))
        filters = ','.join(
            sorted([
                f"{fltr['name']}{fltr['operator']}{fltr['value']}"
                for fltr in spec['filters']
            ]))
        randomize_ratio = spec.get('randomize_ratio', 1)

        return (f'columns:{columns}|'
                f'filters:{filters}|'
                f'randomize_ratio:{randomize_ratio}')

    def __str__(self):
        return (f'{self.id} - '
                f'{self.created_by.email}: '
                f'requested {self.catalogue_item.name}')

    @property
    def download_uri(self):
        if not self.blob_name:
            return

        account_name = settings.AZURE_BLOB_STORAGE_ACCOUNT_NAME
        container_name = settings.AZURE_BLOB_STORAGE_CONTAINER

        # FIXME: !!!
        # this is a quick fix just to make it work
        blob_name = self.blob_name.replace(container_name + '/', '')

        sas = generate_blob_sas(
            account_name=account_name,
            account_key=settings.AZURE_BLOB_STORAGE_ACCOUNT_KEY,
            container_name=container_name,
            blob_name=blob_name,
            permission='r',
            start=timezone.now(),
            expiry=timezone.now() + timedelta(seconds=3600))

        return (f'https://{account_name}.blob.core.windows.net/'
                f'{container_name}/{blob_name}?{sas}')
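# A sketch of a `DownloadRequest.spec` payload that the schema and
# `validate_spec_in_context_of_catalogue_item_spec` above would accept,
# assuming a catalogue item whose `spec` registers an INTEGER `age` column and
# a STRING `country` column (names and values below are hypothetical):
#
#     spec = {
#         'columns': ['age', 'country'],
#         'filters': [
#             {'name': 'age', 'operator': '>=', 'value': 18},
#             {'name': 'country', 'operator': '=', 'value': 'PL'},
#         ],
#         'randomize_ratio': 0.5,
#     }
#
# `normalize_spec` would then render it as a canonical, order-independent
# string:
#
#     'columns:age,country|filters:age>=18,country=PL|randomize_ratio:0.5'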
class CatalogueItem(ValidatingModel):

    #
    # Version Control
    #
    created_datetime = models.DateTimeField(auto_now_add=True)

    updated_datetime = models.DateTimeField(auto_now=True)

    #
    # Authorship
    #
    created_by = models.ForeignKey(
        Account,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
        related_name='catalogue_item_as_creator')

    updated_by = models.ForeignKey(
        Account,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
        related_name='catalogue_item_as_updater')

    maintained_by = models.ForeignKey(
        Account,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
        related_name='catalogue_item_as_maintainer')

    researchers = models.ManyToManyField(Account)

    #
    # SPEC
    #
    name = models.CharField(max_length=256, unique=True)

    description = models.TextField(blank=True, null=True)

    sample = JSONSchemaField(default=list, blank=True, schema=array(object()))

    @unique
    class ColumnType(Enum):
        INTEGER = 'INTEGER'
        FLOAT = 'FLOAT'
        STRING = 'STRING'
        BOOLEAN = 'BOOLEAN'
        DATETIME = 'DATETIME'

    column_type_to_python_type = {
        ColumnType.INTEGER.value: int,
        ColumnType.FLOAT.value: float,
        ColumnType.STRING.value: str,
        ColumnType.DATETIME.value: str,
        ColumnType.BOOLEAN.value: bool,
    }

    SPEC_SCHEMA = array(
        object(
            name=string(),
            description=string(),
            type=enum(*[t.value for t in ColumnType]),
            is_enum=boolean(),
            size=null_or(number()),
            is_nullable=boolean(),
            distribution=null_or(
                array(
                    object(
                        value=one_of(null(), number(), string(), boolean()),
                        count=number(),
                        required=['value', 'count']))),
            required=[
                'name',
                'type',
                'size',
                'is_nullable',
                'distribution',
                'is_enum',
            ],
        ))

    spec = JSONSchemaField(schema=SPEC_SCHEMA, validators=[spec_validator])

    #
    # EXECUTOR
    #
    @unique
    class Executor(Enum):
        DATABRICKS = 'DATABRICKS'
        ATHENA = 'ATHENA'

    executor_type = EnumChoiceField(max_length=256, enum=Executor)

    @property
    def executor(self):
        if self.executor_type == CatalogueItem.Executor.ATHENA.value:
            return AthenaExecutor()

        elif self.executor_type == CatalogueItem.Executor.DATABRICKS.value:
            return DatabricksExecutor()

        else:
            raise NotImplementedError()

    def clean(self):
        self.validate_samples_in_context_of_spec()

    def validate_samples_in_context_of_spec(self):
        """Validate `samples` using `CatalogueItem.spec` info.
        - `sample` entries must have the same names as registered in `spec`
        - `sample` entry values must be of the types registered in `spec`
          (if `is_nullable` was set to True, None is also allowed)

        """
        if not self.sample:
            return

        to_python_type = CatalogueItem.column_type_to_python_type
        col_name_to_type = {
            column_spec['name']: to_python_type[column_spec['type']]
            for column_spec in self.spec
        }
        col_is_nullable = {
            column_spec['name']: column_spec['is_nullable']
            for column_spec in self.spec
        }

        # -- all sample entries should have the same names
        row_names = set.intersection(
            *[set(row.keys()) for row in self.sample])
        expected_names = set(
            [column_spec['name'] for column_spec in self.spec])
        if row_names != expected_names:
            raise ValidationError(
                "Sample column names and spec names are not identical")

        # -- take into account that some columns are nullable
        for i, row in enumerate(self.sample):
            for name, value in row.items():
                expected_types = (col_name_to_type[name], )
                if col_is_nullable[name]:
                    expected_types += (type(None), )

                if not isinstance(value, expected_types):
                    raise ValidationError(
                        f"column type and sample value type "
                        f"mismatch detected for row number {i} "
                        f"column '{name}'")

    @property
    def database(self):
        return self.name.split('.')[0]

    @property
    def table(self):
        return self.name

    def update_samples_and_distributions(self):
        if self.executor_type == self.Executor.ATHENA.value:
            executor = AthenaExecutor()

        else:
            raise NotImplementedError()

        # -- sample
        self.sample = executor.get_sample(self)

        for column in self.spec:
            # -- size
            column['size'] = executor.get_size(column['name'], self)

            # -- distributions
            column['distribution'] = executor.get_distribution(
                column['name'],
                column['type'],
                column['is_enum'],
                self)

        self.save()

    def __str__(self):
        return self.name
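# A sketch of a `CatalogueItem.spec` entry plus a matching `sample` row,
# consistent with `SPEC_SCHEMA` and `validate_samples_in_context_of_spec`
# above (all concrete values are hypothetical):
#
#     spec = [
#         {
#             'name': 'age',
#             'type': 'INTEGER',
#             'is_enum': False,
#             'is_nullable': True,
#             'size': 1024,
#             'distribution': [
#                 {'value': 18, 'count': 120},
#                 {'value': 21, 'count': 93},
#             ],
#         },
#     ]
#     sample = [{'age': 18}, {'age': None}]  # None passes since `is_nullable` is True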