def validate_params(cls, values):
    """Make sure that all params for the given aggregation type are set and valid"""
    agg_type = EnumHelper.from_value_safe(AggregationType, values.get('type'))
    if not agg_type:
        return values

    if agg_type == AggregationType.not_set or agg_type in cls._SIMPLE_AGGS:
        # simple aggregation types do not take any params
        values['params'] = None
        return values

    if 'params' not in values:
        raise ValueError('Missing "params" field')

    if agg_type in cls._WITH_SORT_DIMENSION_AGGS:
        values['params'] = AggregationParamsSortDimension(**values['params'])
    elif agg_type == AggregationType.count_distinct:
        values['params'] = AggregationParamsCountDistinct(**values['params'])
    else:
        raise ValueError(f'Unsupported aggregation type - {values["type"]}')

    return values
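Every example in this listing leans on the same contract: EnumHelper.from_value_safe returns the matching enum member, or None when the value is not valid (examples #6 and #7 raise when they get None, example #10 silently keeps the None). Below is a minimal, self-contained sketch of that assumed behaviour; the helper and the enum members are illustrative, not the library's actual implementation.

from enum import Enum
from typing import Optional, Type, TypeVar

E = TypeVar('E', bound=Enum)


class AggregationType(Enum):
    # illustrative members only; the real enum has more values
    not_set = 'not_set'
    sum = 'sum'
    count_distinct = 'count_distinct'


def from_value_safe(enum_cls: Type[E], value: object) -> Optional[E]:
    """Return the enum member for the given value, or None when it is invalid."""
    try:
        return enum_cls(value)
    except ValueError:
        return None


assert from_value_safe(AggregationType, 'sum') is AggregationType.sum
assert from_value_safe(AggregationType, 'median') is None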
Example #2
def scaffold_missing_fields(target_dataset: Optional[str] = None,
                            yes: bool = False,
                            no_remote: bool = True):
    """Scaffold missing field files."""
    echo_info('Loading local state...')
    state = get_local_state(target_dataset=target_dataset)

    errors = []

    for dataset, (fields, models) in state.get_objects_by_package().items():
        for idx, error in enumerate(
                validate_missing_files(fields, models, package_name=dataset)):
            if idx == 0:
                echo_info(
                    f'\nFields referenced in models without definition in dataset {dataset}:'
                )
            echo_info(f'  {error.field_slug}')
            errors.append(error)

    if len(errors) == 0:
        echo_info('No issues found')
        return

    echo_info('')
    if not yes and not click.confirm(
            'You will not be able to query these fields until you define them. Do you want to do that now?'
    ):
        # User decided not to fix issues
        return

    loaded_models: Dict[str, PanoModel] = {}
    if not no_remote:
        connection = Connection.get()
        dialect_name = Connection.get_dialect_name(connection)
        query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime,
                                                   dialect_name)

        scanner_cls = Scanner.get_scanner(query_runtime)
        scanner = scanner_cls()

        echo_info('Scanning remote storage...')
        scanner.scan()
        echo_info('Finished scanning remote storage...')
        loaded_models = scanner.models

    echo_info('Scanning fields...')
    fields = scan_fields_for_errors(errors, loaded_models)
    action_list = ActionList(
        actions=[Action(desired=field) for field in fields])

    echo_info('Updating local state...')

    executor = LocalExecutor()
    for action in action_list.actions:
        try:
            executor.execute(action)
        except Exception:
            echo_error(f'Error: Failed to execute action {action.description}')
    echo_info(
        f'Updated {executor.success_count}/{executor.total_count} fields')
Example #3
    def to_husky(origin: PanoField) -> Taxon:
        """Maps external field definitions to internal taxon representation"""

        slug = origin.slug if origin.data_source is None else f'{origin.data_source}{NAMESPACE_DELIMITER}{origin.slug}'
        aggregation = None
        if origin.aggregation:
            aggregation = AggregationDefinition.parse_obj(
                origin.aggregation.to_dict())

        validation = EnumHelper.from_value(ValidationType, origin.data_type)
        assert validation

        return Taxon(
            slug=slug,
            taxon_group=origin.group,
            display_name=origin.display_name,
            taxon_type=origin.field_type,
            validation_type=validation,
            taxon_description=origin.description,
            data_source=origin.data_source,
            calculation=origin.calculation,
            aggregation=aggregation,
            display_state=DisplayState.visible,
            company_id=get_company_id(),
        )
Example #4
    def _detect_filter_clause_type(
            data: Dict[str, Any], expected_type: FilterClauseType,
            filter_clause: Type['FilterClause']
    ) -> Optional[Type['FilterClause']]:
        if 'type' not in data:
            return None

        if EnumHelper.from_value_safe(FilterClauseType,
                                      data['type']) == expected_type:
            return filter_clause

        return None
Example #5
    def compile_transformation_request(cls, req: TransformRequest, company_id: str) -> Tuple[str, HuskyQueryRuntime]:
        """
        Compiles a Transform request into its SQL representation

        :param req: Input request
        :param company_id: Company ID

        :return: SQL and type of dialect
        """
        sorted_fields = sorted(req.requested_fields)
        # prepare origin description
        origin = DataRequestOrigin(
            {
                'system': 'FDQ',
                'extra': {
                    'purpose': 'taxonomy.transform.compile',
                },
            }
        )

        # get all used taxons in the request
        used_taxons_map = fetch_all_used_taxons_map(company_id, sorted_fields)

        # figure out set of all virtual data sources covered by the taxons in the request
        used_vds = {taxon.data_source for taxon in used_taxons_map.values() if taxon.data_source}

        # generate subrequest for each virtual data source
        # this will allow Husky to push the taxons into relevant subrequests
        subrequests = []
        for vds in sorted(used_vds):
            subrequest = ApiDataRequest({'scope': {'company_id': company_id}, 'properties': {'data_sources': [vds]}})

            subrequests.append(subrequest)

        # finalize the blending husky request
        husky_request_dict = {'data_subrequests': subrequests, 'taxons': req.requested_fields, 'origin': origin}

        husky_request = BlendingDataRequest(husky_request_dict)

        connection = Connection.get()

        query_runtime_name = Connection.get_dialect_name(connection)
        query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, query_runtime_name)
        context = HuskyQueryContext(query_runtime)

        husky_dataframe = QueryBuilder.validate_data_request(context, husky_request)

        # add another layer of query to use correct names
        final_query = cls._correct_column_aliases(context, husky_dataframe)

        return compile_query(final_query, context.dialect), context.query_runtime
Example #6
    def from_request(cls, data_request: Union[BlendingDataRequest, InternalDataRequest]):
        if data_request.physical_data_sources:
            if len(data_request.physical_data_sources) == 1:
                request_pds = data_request.physical_data_sources[0]
                connection = Connections.get_by_name(request_pds, True)

                query_runtime_name = Connections.get_connection_engine(connection).dialect.name
                query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, query_runtime_name)
                if query_runtime is None:
                    raise UnsupportedSQLOutputException(query_runtime_name)

                return cls(query_runtime)
            elif len(data_request.physical_data_sources) > 1:
                raise TooManyPhysicalDataSourcesException(data_request.physical_data_sources)
        else:
            return cls(HuskyQueryRuntime.snowflake)
Example #7
def scan(filter_reg_ex: Optional[str] = None):
    """Scan all metadata for given source and filter."""

    connection_info = Connection.get()
    dialect_name = Connection.get_dialect_name(connection_info)

    query_runtime = EnumHelper.from_value_safe(HuskyQueryRuntime, dialect_name)
    if not query_runtime:
        raise UnsupportedDialectError(dialect_name)

    scanner_cls = Scanner.get_scanner(query_runtime)
    scanner = scanner_cls()

    echo_info('Started scanning the data source')
    scanner.scan(force_reset=True)
    echo_info('Finished scanning the data source')

    # apply regular expression as a filter on model names
    if filter_reg_ex:
        re_compiled = re.compile(filter_reg_ex)
        models = [
            model for model in scanner.models.values()
            if re_compiled.match(model.model_name)
        ]
    else:
        models = list(scanner.models.values())

    if not models:
        echo_info('No tables have been found')
        return

    progress_bar = tqdm(total=len(models))
    writer = FileWriter()
    for model in models:
        writer.write_scanned_model(model)
        progress_bar.write(f'Discovered model {model.model_name}')

        progress_bar.update()

    progress_bar.write(f'Scanned {progress_bar.total} tables')
Example #8
def _parse_taxon_expr(
    ctx: HuskyQueryContext,
    taxon: Taxon,
    tel_prefix: str,
    data_sources: Iterable[str],
    all_taxons: TaxonMap,
    subrequest_only=False,
):
    taxon_type = EnumHelper.from_value(TaxonTypeEnum, taxon.taxon_type)
    try:
        return TaxonTelDialect().render(
            expr=cast(str, taxon.calculation),
            ctx=ctx,
            taxon_map=all_taxons,
            taxon_slug=tel_prefix,
            comparison=taxon.is_comparison_taxon,
            data_sources=data_sources,
            taxon_type=taxon_type,
            aggregation=taxon.aggregation,
            subrequest_only=subrequest_only,
        )
    except TelExpressionException as error:
        raise HuskyInvalidTelException(error, taxon.slug)
Example #9
def map_error_to_field(error: MissingFieldFileError,
                       loaded_models: Dict[str, PanoModel]) -> PanoField:
    # try to find the field in scanned state
    model = loaded_models.get(error.model_name)
    data_type = ValidationType.text

    if model:
        # model with this field was scanned so let's try to find this field
        field = [
            model_field for model_field in model.fields
            if error.field_slug in model_field.field_map
        ]

        if len(field) == 1:
            # exactly this field was scanned so let's determine its correct validation type
            field_data_type = EnumHelper.from_value_safe(
                ValidationType, field[0].data_type)
            if field_data_type:
                data_type = field_data_type

    field_type = TaxonTypeEnum.metric if data_type in METRIC_VALIDATION_TYPES else TaxonTypeEnum.dimension

    if field_type is TaxonTypeEnum.dimension:
        aggregation = Aggregation(type='group_by', params=None)
    else:
        aggregation = Aggregation(type='sum', params=None)

    return PanoField(
        slug=error.field_slug,
        field_type=field_type.value,
        display_name=error.field_slug,
        data_source=error.dataset_slug,
        group='CLI',
        data_type=data_type.value,
        aggregation=aggregation,
    )
Example #10
    def deep_construct(cls: Type[BaseModel],
                       _fields_set: Optional[Set[str]] = None,
                       **values: Any):
        """
        Copied from pydantic BaseModel and modified to be able to construct models recursively from
        primitive data types and enum values. It can deserialize models inheriting from PydanticModel,
        including lists of models.

        WARNING:
        - Dictionaries and sets are copied without any changes, even if they contain Pydantic models
        - Invalid enum values are ignored and replaced with None (no exception is thrown)

        Creates a new model setting __dict__ and __fields_set__ from trusted or pre-validated data.
        Default values are respected, but no other validation is performed.
        """
        m = cls.__new__(cls)

        for field_name, field in m.__fields__.items():
            field_type = field.type_
            if field.shape == SHAPE_LIST:
                # Lists can have their actual types kinda hidden
                # Would need change if we have List[Union[TypeA,TypeB]].. but that is quite an edge case and
                # not sure pydantic even supports that
                list_field_type = field.sub_fields[0].type_
                if (inspect.isclass(list_field_type)
                        and issubclass(list_field_type, PydanticModel)
                        and values.get(field_name) is not None):
                    deserialized_list = []
                    for model in values[field_name]:
                        if model is not None:
                            deserialized_list.append(
                                list_field_type.deep_construct(**model))
                        else:
                            deserialized_list.append(None)
                    values[field_name] = deserialized_list
                if issubclass(type(list_field_type),
                              EnumMeta) and values.get(field_name) is not None:
                    deserialized_list = []
                    for enum in values[field_name]:
                        if enum is not None:
                            # Deserialize enum and replace invalid values with None, do not throw exception
                            deserialized_list.append(
                                EnumHelper.from_value_safe(
                                    list_field_type, enum))
                        else:
                            deserialized_list.append(None)
                    values[field_name] = deserialized_list
            elif (inspect.isclass(field_type)
                  and issubclass(field_type, PydanticModel)
                  and values.get(field_name) is not None):
                values[field_name] = field_type.deep_construct(
                    **values[field_name])
            elif issubclass(type(field_type),
                            EnumMeta) and values.get(field_name) is not None:
                # Deserialize enum and replace invalid values with None, do not throw exception
                values[field_name] = EnumHelper.from_value_safe(
                    field_type, values[field_name])

        object.__setattr__(m, '__dict__', {
            **deepcopy(cls.__field_defaults__),
            **values
        })
        if _fields_set is None:
            _fields_set = set(values.keys())
        object.__setattr__(m, '__fields_set__', _fields_set)
        return m
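To make the docstring's warning about enum handling concrete, here is a hypothetical usage sketch; Color, Child and Parent are made up for illustration, and PydanticModel stands for the project base class referenced above (its import is not shown in the example).

from enum import Enum
from typing import List, Optional


class Color(Enum):
    red = 'red'
    blue = 'blue'


class Child(PydanticModel):  # PydanticModel: assumed project base class
    name: str = ''


class Parent(PydanticModel):
    child: Optional[Child] = None
    colors: List[Color] = []


parent = Parent.deep_construct(
    child={'name': 'inner'},   # nested model is constructed recursively
    colors=['red', 'purple'],  # invalid enum value is replaced with None, no exception
)
# parent.child is a Child instance; parent.colors == [Color.red, None]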