Beispiel #1
0
class JSONMultiSchemaModel(fake_models.FakeModel):
    """Fake model with a single JSON field validated by a multischema.

    The multischema maps the keys `PERSON` and `ANIMAL` to object schemas
    and is configured with `by_field='type'` (presumably the value of the
    `type` field selects which of the two schemas applies).

    """

    entity = JSONSchemaField(
        schema=multischema(
            {
                # -- a person is identified by its name
                'PERSON': object(
                    type=const('PERSON'),
                    name=string(),
                    required=['type', 'name'],
                ),
                # -- an animal is identified by its age
                'ANIMAL': object(
                    type=const('ANIMAL'),
                    age=number(),
                    required=['type', 'age'],
                ),
            },
            by_field='type',
        ),
    )
Beispiel #2
0
class JSONArrayMultiSchemaModel(fake_models.FakeModel):
    """Fake model with a JSON field holding an array of typed entities.

    Each array item must match one of two object schemas; `one_of` is
    configured with `by_field='type'` (presumably the `type` value of an
    item selects the schema it is validated against).

    """

    class EntityType(Enum):
        """Entity kinds accepted in the `entities` array."""

        WORKER = 'WORKER'

        FREE = 'FREE'

    entities = JSONSchemaField(
        schema=array(
            one_of(
                # -- workers carry a name
                object(
                    name=string(),
                    type=enum(EntityType, const=EntityType.WORKER.value),
                    required=['type', 'name'],
                ),
                # -- free entities carry an age
                object(
                    age=number(),
                    type=enum(EntityType, const=EntityType.FREE.value),
                    required=['type', 'age'],
                ),
                by_field='type',
            ),
        ),
    )
Beispiel #3
0
class DownloadRequest(ValidatingModel):
    """Request to download selected columns of a `CatalogueItem`.

    `spec` lists the requested columns, the filters to apply to them and
    an optional `randomize_ratio`. `clean` checks that spec against the
    column definitions stored in `catalogue_item.spec`.

    """

    objects = DownloadRequestManager()

    #
    # Version Control
    #
    created_datetime = models.DateTimeField(auto_now_add=True)

    updated_datetime = models.DateTimeField(auto_now=True)

    #
    # Authorship
    #
    created_by = models.ForeignKey(Account,
                                   null=True,
                                   blank=True,
                                   on_delete=models.SET_NULL)

    #
    # Waiters
    #
    waiters = models.ManyToManyField(
        Account, related_name='download_requests_as_waiter')

    #
    # Data Related Fields
    #
    @unique
    class FilterOperator(Enum):
        """Comparison operators accepted in `spec.filters[i].operator`."""

        GREATER_THAN = '>'

        GREATER_THAN_EQUAL = '>='

        SMALLER_THAN = '<'

        SMALLER_THAN_EQUAL = '<='

        EQUAL = '='

        NOT_EQUAL = '!='

    # -- operators allowed in a filter, keyed by the type of the column
    # -- the filter refers to
    column_type_to_operators = {
        CatalogueItem.ColumnType.INTEGER.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.FLOAT.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.STRING.value: [
            FilterOperator.GREATER_THAN.value,
            FilterOperator.GREATER_THAN_EQUAL.value,
            FilterOperator.SMALLER_THAN.value,
            FilterOperator.SMALLER_THAN_EQUAL.value,
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
        CatalogueItem.ColumnType.BOOLEAN.value: [
            FilterOperator.EQUAL.value,
            FilterOperator.NOT_EQUAL.value,
        ],
    }

    spec = JSONSchemaField(
        schema=object(columns=array(string()),
                      filters=array(
                          object(name=string(),
                                 operator=enum(
                                     *[o.value for o in FilterOperator]),
                                 value=one_of(number(), string(), boolean()),
                                 required=['name', 'operator', 'value'])),
                      randomize_ratio=number(),
                      required=['columns', 'filters']))

    normalized_spec = models.TextField(default='', blank=True)

    blob_name = models.CharField(null=True, blank=True, max_length=256)

    real_size = models.IntegerField(null=True, blank=True)

    estimated_size = models.IntegerField(null=True, blank=True)

    #
    # CATALOGER / EXECUTOR
    #
    catalogue_item = models.ForeignKey(CatalogueItem,
                                       on_delete=models.CASCADE,
                                       related_name='download_requests')

    executor_job_id = models.CharField(null=True, blank=True, max_length=256)

    is_cancelled = models.BooleanField(default=False)

    def execute(self):
        """Run the request on the catalogue item's executor and save.

        The value returned by the executor is stored in `blob_name`.

        """
        self.blob_name = self.catalogue_item.executor.execute(self)
        self.save()

    def clean(self):
        """Validate `spec` against the related catalogue item."""
        self.validate_spec_in_context_of_catalogue_item_spec()

    def validate_spec_in_context_of_catalogue_item_spec(self):
        """Validate spec using `CatalogueItem.spec`.

        - `spec.columns` must be non-empty, free of duplicates and taken
          from the list of registered columns as specified in
          `catalogue_item.spec`
        - `spec.filters[i].name` must be taken from the list of
          registered columns as specified in `catalogue_item.spec`
        - `spec.filters[i].operator` must be taken from the list of
          operators allowed for a column type
        - `spec.filters[i].value` must be of column type (or None if
          `is_nullable` was set)
        - `spec.randomize_ratio` must be in range [0, 1]

        Raises:
            ValidationError: when any of the above rules is violated.

        """

        column_specs = {
            column_spec['name']: column_spec
            for column_spec in self.catalogue_item.spec
        }

        self._validate_columns(set(column_specs))
        self._validate_filters(column_specs)
        self._validate_randomize_ratio()

    def _validate_columns(self, allowed_columns):
        """Check that `spec.columns` is non-empty, unique and registered."""

        requested = self.spec['columns']
        columns = set(requested)

        if not columns:
            raise ValidationError(
                "at least one column must be specified in 'columns'")

        if len(requested) != len(columns):
            raise ValidationError("columns must appear only once in 'columns'")

        unknown_columns = columns - allowed_columns
        if unknown_columns:
            # -- sorted so that the message does not depend on set order
            unknown_columns = ', '.join(
                f"'{c}'" for c in sorted(unknown_columns))
            raise ValidationError(
                f"unknown columns in 'columns' detected: {unknown_columns}")

    def _validate_filters(self, column_specs):
        """Check filter names, value types and operators.

        `column_specs` maps a column name to its entry in
        `catalogue_item.spec`.

        """

        filters = self.spec['filters']

        unknown_columns = set(f['name'] for f in filters) - set(column_specs)
        if unknown_columns:
            # -- sorted so that the message does not depend on set order
            unknown_columns = ', '.join(
                f"'{c}'" for c in sorted(unknown_columns))
            raise ValidationError(
                f"unknown columns in 'filters' detected: {unknown_columns}")

        for fltr in filters:
            operator = fltr.get('operator')
            name = fltr.get('name')

            # -- incomplete filters are skipped here (presumably they are
            # -- reported by the schema validation of `spec`); the presence
            # -- of `value` is tested by key since 0, False and '' are
            # -- perfectly valid filter values
            if not operator or not name or 'value' not in fltr:
                continue

            value = fltr['value']

            # -- types used in filter must correspond to the types of their
            # -- respective columns in `catalogue_item.spec`
            column_spec = column_specs[name]
            col_type = column_spec['type']
            python_type = CatalogueItem.column_type_to_python_type[col_type]
            expected_types = (python_type, )
            if column_spec['is_nullable']:
                expected_types += (type(None), )

            if not isinstance(value, expected_types):
                raise ValidationError(f"column type and filter value type "
                                      f"mismatch detected for column '{name}'")

            # -- operators in the filters must be valid ones; a column type
            # -- without an entry in `column_type_to_operators` has no
            # -- allowed operators at all
            allowed_operators = self.column_type_to_operators.get(
                col_type, ())
            if operator not in allowed_operators:
                raise ValidationError(
                    f"operator '{operator}' not allowed for column '{name}' "
                    f"detected")

    def _validate_randomize_ratio(self):
        """Check that a numeric `spec.randomize_ratio` lies in [0, 1]."""

        randomize_ratio = self.spec.get('randomize_ratio')

        # -- a missing or non-numeric ratio is not range checked; integers
        # -- must be checked too, booleans (an `int` subclass) must not
        if isinstance(randomize_ratio, bool):
            return

        if not isinstance(randomize_ratio, (int, float)):
            return

        if randomize_ratio < 0 or randomize_ratio > 1:
            raise ValidationError(
                "'randomize_ratio' not in allowed [0, 1] range detected")

    @staticmethod
    def normalize_spec(spec):
        """Return a canonical string form of `spec`.

        Columns and filters are sorted so that their order in `spec` does
        not influence the result; a missing `randomize_ratio` is rendered
        as `1`.

        """

        columns = ','.join(sorted(spec['columns']))
        filters = ','.join(
            sorted(f"{fltr['name']}{fltr['operator']}{fltr['value']}"
                   for fltr in spec['filters']))

        randomize_ratio = spec.get('randomize_ratio', 1)
        return (f'columns:{columns}|'
                f'filters:{filters}|'
                f'randomize_ratio:{randomize_ratio}')

    def __str__(self):
        # -- `created_by` is nullable (SET_NULL), so the author may be gone
        author = self.created_by.email if self.created_by else 'unknown'
        return (f'{self.id} - '
                f'{author}: '
                f'requested {self.catalogue_item.name}')

    @property
    def download_uri(self):
        """Return a SAS-signed URI of the blob, valid for one hour.

        Returns None when `blob_name` is not set.

        """

        if not self.blob_name:
            return None

        account_name = settings.AZURE_BLOB_STORAGE_ACCOUNT_NAME

        container_name = settings.AZURE_BLOB_STORAGE_CONTAINER

        # FIXME: !!! this is a quick fix just to make it work
        blob_name = self.blob_name.replace(container_name + '/', '')

        # -- single timestamp so that the expiry is exactly one hour
        # -- after the start
        now = timezone.now()

        sas = generate_blob_sas(
            account_name=account_name,
            account_key=settings.AZURE_BLOB_STORAGE_ACCOUNT_KEY,
            container_name=container_name,
            blob_name=blob_name,
            permission='r',
            start=now,
            expiry=now + timedelta(seconds=3600))

        return (f'https://{account_name}.blob.core.windows.net/'
                f'{container_name}/{blob_name}?{sas}')
Beispiel #4
0
class CatalogueItem(ValidatingModel):
    """Catalogue entry describing a data set, its columns and a sample.

    `spec` holds one entry per column (name, type, nullability, size and
    distribution); `sample` holds example rows which `clean` validates
    against `spec`.

    """

    #
    # Version Control
    #
    created_datetime = models.DateTimeField(auto_now_add=True)

    updated_datetime = models.DateTimeField(auto_now=True)

    #
    # Authorship
    #
    created_by = models.ForeignKey(Account,
                                   null=True,
                                   blank=True,
                                   on_delete=models.SET_NULL,
                                   related_name='catalogue_item_as_creator')

    updated_by = models.ForeignKey(Account,
                                   null=True,
                                   blank=True,
                                   on_delete=models.SET_NULL,
                                   related_name='catalogue_item_as_updater')

    maintained_by = models.ForeignKey(
        Account,
        null=True,
        blank=True,
        on_delete=models.SET_NULL,
        related_name='catalogue_item_as_maintainer')

    researchers = models.ManyToManyField(Account)

    #
    # SPEC
    #
    name = models.CharField(max_length=256, unique=True)

    description = models.TextField(blank=True, null=True)

    sample = JSONSchemaField(default=list, blank=True, schema=array(object()))

    @unique
    class ColumnType(Enum):
        """Column types accepted in `spec[i].type`."""

        INTEGER = 'INTEGER'

        FLOAT = 'FLOAT'

        STRING = 'STRING'

        BOOLEAN = 'BOOLEAN'

        DATETIME = 'DATETIME'

    # -- python type a value of a given column type is expected to have
    column_type_to_python_type = {
        ColumnType.INTEGER.value: int,
        ColumnType.FLOAT.value: float,
        ColumnType.STRING.value: str,
        ColumnType.DATETIME.value: str,
        ColumnType.BOOLEAN.value: bool,
    }

    SPEC_SCHEMA = array(
        object(
            name=string(),
            description=string(),
            type=enum(*[t.value for t in ColumnType]),
            is_enum=boolean(),
            size=null_or(number()),
            is_nullable=boolean(),
            distribution=null_or(
                array(
                    object(value=one_of(null(), number(), string(), boolean()),
                           count=number(),
                           required=['value', 'count']))),
            required=[
                'name',
                'type',
                'size',
                'is_nullable',
                'distribution',
                'is_enum',
            ],
        ))

    spec = JSONSchemaField(schema=SPEC_SCHEMA, validators=[spec_validator])

    #
    # EXECUTOR
    #
    @unique
    class Executor(Enum):
        """Executor backends selectable through `executor_type`."""

        DATABRICKS = 'DATABRICKS'

        ATHENA = 'ATHENA'

    executor_type = EnumChoiceField(max_length=256, enum=Executor)

    @property
    def executor(self):
        """Return a new executor instance matching `executor_type`.

        Raises:
            NotImplementedError: for an `executor_type` other than
                ATHENA or DATABRICKS.

        """
        if self.executor_type == CatalogueItem.Executor.ATHENA.value:
            return AthenaExecutor()
        elif self.executor_type == CatalogueItem.Executor.DATABRICKS.value:
            return DatabricksExecutor()
        else:
            raise NotImplementedError()

    def clean(self):
        """Validate `sample` against `spec`."""
        self.validate_samples_in_context_of_spec()

    def validate_samples_in_context_of_spec(self):
        """Validate `samples` using `CatalogueItem.spec` info.

        - every `sample` row must have exactly the column names
          registered in `spec`
        - `sample` values must be of the type registered for their column
          in `spec` (if `is_nullable` was set to True also None is allowed)

        An empty `sample` is always valid.

        Raises:
            ValidationError: when a row violates one of the above rules.

        """

        if not self.sample:
            return

        to_python_type = CatalogueItem.column_type_to_python_type
        col_name_to_type = {
            column_spec['name']: to_python_type[column_spec['type']]
            for column_spec in self.spec
        }
        col_is_nullable = {
            column_spec['name']: column_spec['is_nullable']
            for column_spec in self.spec
        }

        # -- every sample row must carry exactly the names from the spec;
        # -- rows are compared one by one since an intersection over all
        # -- rows would hide a row carrying an extra, unregistered name
        expected_names = {column_spec['name'] for column_spec in self.spec}

        if any(set(row.keys()) != expected_names for row in self.sample):
            raise ValidationError(
                "Sample column names and spec names are not identical")

        # -- take into account that some columns are nullable
        for i, row in enumerate(self.sample):
            for name, value in row.items():
                expected_types = (col_name_to_type[name], )
                if col_is_nullable[name]:
                    expected_types += (type(None), )

                if not isinstance(value, expected_types):
                    raise ValidationError(
                        f"column type and sample value type "
                        f"mismatch detected for row number {i} "
                        f"column '{name}'")

    @property
    def database(self):
        """Return the part of `name` before the first dot."""
        return self.name.split('.')[0]

    @property
    def table(self):
        """Return the full `name` of the item."""
        return self.name

    def update_samples_and_distributions(self):
        """Refresh `sample` and per-column size / distribution, then save.

        Raises:
            NotImplementedError: for an `executor_type` other than ATHENA.

        """

        # -- NOTE(review): only ATHENA is handled here although the
        # -- `executor` property also knows DATABRICKS -- confirm whether
        # -- this is intended
        if self.executor_type == self.Executor.ATHENA.value:
            executor = AthenaExecutor()

        else:
            raise NotImplementedError()

        # -- sample
        self.sample = executor.get_sample(self)

        for column in self.spec:

            # -- size
            column['size'] = executor.get_size(column['name'], self)

            # -- distributions
            column['distribution'] = executor.get_distribution(
                column['name'], column['type'], column['is_enum'], self)

        self.save()

    def __str__(self):
        return self.name