Esempi in Python per ProfilerError.ProfilerError, esempi in Python per great_expectations.exceptions.ProfilerError.ProfilerError

Esempio n. 1

0

Mostra file

File: user_configurable_profiler.py Progetto: yjlee215/great_expectations

    def _validate_semantic_types_dict(self, dataset):
        """
        Validate ``self.semantic_types_dict``: it must be a dict of lists of column names, every key must be a
        recognized semantic type, every listed column must exist in the dataset and not be ignored, and datetime /
        numeric columns must have a compatible profiled type.

        Args:
            dataset: A GE dataset; only ``get_table_columns()`` is called on it.

        Returns:
            The validated semantic_types dictionary (``self.semantic_types_dict``, returned unmodified).

        Raises:
            ValueError: if the semantic types dict is not a dict, if a key is not found (upper-cased) in
                ``profiler_semantic_types``, or if a column is also listed in ``self.ignored_columns``.
            AssertionError: if an entry is not a list, or if a datetime / numeric column has an incompatible type
                recorded in ``self.column_info``.
            ProfilerError: if a listed column is not among the dataset's table columns.
        """
        if not isinstance(self.semantic_types_dict, dict):
            raise ValueError(
                f"The semantic_types dict in the config must be a dictionary, but is currently a "
                f"{type(self.semantic_types_dict)}. Please reformat."
            )
        for k, v in self.semantic_types_dict.items():
            # Raised explicitly (instead of using ``assert``) so the check still runs under ``python -O``;
            # the exception type is kept as AssertionError for backward compatibility.
            if not isinstance(v, list):
                raise AssertionError(
                    "Entries in semantic type dict must be lists of column names e.g. "
                    "{'semantic_types': {'numeric': ['number_of_transactions']}}"
                )
            if k.upper() not in profiler_semantic_types:
                raise ValueError(
                    f"{k} is not a recognized semantic_type. Please only include one of "
                    f"{profiler_semantic_types}"
                )

        selected_columns = [
            column
            for column_list in self.semantic_types_dict.values()
            for column in column_list
        ]
        if selected_columns:
            # Fetch the table columns once rather than once per selected column.
            table_columns = dataset.get_table_columns()
            for column in selected_columns:
                if column not in table_columns:
                    raise ProfilerError(f"Column {column} does not exist.")
                elif column in self.ignored_columns:
                    raise ValueError(
                        f"Column {column} is specified in both the semantic_types_dict and the list of "
                        f"ignored columns. Please remove one of these entries to proceed."
                    )

        for semantic_type, column_list in self.semantic_types_dict.items():
            # Keys were accepted case-insensitively above, so compare case-insensitively here as well;
            # otherwise e.g. "DATETIME" or "Numeric" would silently skip the type check.
            normalized_type = semantic_type.upper()
            for column_name in column_list:
                processed_column = self.column_info.get(column_name)
                if normalized_type == "DATETIME":
                    if processed_column.get("type") not in ("DATETIME", "STRING"):
                        raise AssertionError(
                            f"Column {column_name} must be a datetime column or a string but appears to be "
                            f"{processed_column.get('type')}"
                        )
                elif normalized_type == "NUMERIC":
                    if processed_column.get("type") not in ("INT", "FLOAT", "NUMERIC"):
                        raise AssertionError(
                            f"Column {column_name} must be an int or a float but appears to be "
                            f"{processed_column.get('type')}"
                        )
                # Any other recognized semantic type has no column-type constraint checked here.
        return self.semantic_types_dict

Esempio n. 2

0

Mostra file

File: basic_suite_builder_profiler.py Progetto: rpatil524/great_expectations

def _check_that_columns_exist(dataset, columns) -> None:
    """Raise if any of the requested columns is missing from the dataset.

    Args:
        dataset: Object exposing ``get_table_columns()``.
        columns: Iterable of column names to check; a falsy value (``None``,
            empty collection) means there is nothing to check.

    Raises:
        ProfilerError: for the first column not found among the table columns.
    """
    if not columns:
        return
    # The table's column list does not depend on the loop variable, so fetch
    # it once instead of once per checked column.
    table_columns = dataset.get_table_columns()
    for column in columns:
        if column not in table_columns:
            raise ProfilerError(f"Column {column} does not exist.")

Esempio n. 3

0

Mostra file

File: basic_suite_builder_profiler.py Progetto: rpatil524/great_expectations

def _check_that_expectations_are_available(dataset, expectations) -> None:
    """Raise if any requested expectation type is not offered by the dataset.

    Args:
        dataset: Object exposing ``list_available_expectation_types()``.
        expectations: Iterable of expectation type names to check; a falsy
            value (``None``, empty collection) means there is nothing to check.

    Raises:
        ProfilerError: for the first expectation type that is not available.
    """
    if not expectations:
        return
    # The list of available types does not depend on the loop variable, so
    # compute it once instead of once per checked expectation.
    available_types = dataset.list_available_expectation_types()
    for expectation in expectations:
        if expectation not in available_types:
            raise ProfilerError(
                f"Expectation {expectation} is not available.")

Esempio n. 4

0

Mostra file

File: basic_suite_builder_profiler.py Progetto: rpatil524/great_expectations

    def _profile(cls, dataset, configuration=None):
        """Build expectations on ``dataset`` according to ``configuration``.

        Args:
            dataset: The dataset to profile. Expectations are created by
                calling ``expect_*`` / helper methods on it.
            configuration: ``None``, the string ``"demo"`` (delegates to
                ``cls._demo_profile``), or a mapping that may contain the keys
                ``included_expectations`` / ``excluded_expectations`` (mutually
                exclusive) and ``included_columns`` / ``excluded_columns``
                (mutually exclusive).

        Returns:
            The result of ``cls._build_column_description_metadata(dataset)``
            (or of ``cls._demo_profile(dataset)`` for the demo configuration).
            When ``included_expectations`` is given as ``False``, ``None`` or
            ``[]``, that result is returned with ``expectations`` emptied.

        Raises:
            ProfilerError: if both members of a mutually exclusive pair are
                configured, or if a requested expectation type or selected
                column does not exist.
        """
        logger.debug(f"Running profiler with configuration: {configuration}")
        if configuration == "demo":
            return cls._demo_profile(dataset)

        existing_columns = dataset.get_table_columns()
        selected_columns = existing_columns
        included_expectations = []
        excluded_expectations = []

        if configuration:
            if ("included_expectations" in configuration
                    and "excluded_expectations" in configuration):
                raise ProfilerError(
                    "Please specify either `included_expectations` or `excluded_expectations`."
                )
            if "included_expectations" in configuration:
                included_expectations = configuration["included_expectations"]
                # An explicitly empty include list is mapped to None, which
                # triggers the "no expectations" early return further down.
                if included_expectations in [False, None, []]:
                    included_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       included_expectations)
            if "excluded_expectations" in configuration:
                excluded_expectations = configuration["excluded_expectations"]
                if excluded_expectations in [False, None, []]:
                    excluded_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       excluded_expectations)

            if ("included_columns" in configuration
                    and "excluded_columns" in configuration):
                raise ProfilerError(
                    "Please specify either `excluded_columns` or `included_columns`."
                )
            elif "included_columns" in configuration:
                selected_columns = configuration["included_columns"]
                if selected_columns in [False, None, []]:
                    selected_columns = []
            elif "excluded_columns" in configuration:
                excluded_columns = configuration["excluded_columns"]
                if excluded_columns in [False, None, []]:
                    excluded_columns = []
                # Keep the table's column order (de-duplicated) instead of
                # using a set difference: a set iterates in arbitrary order,
                # which made the order of created expectations vary.
                excluded_column_set = set(excluded_columns)
                selected_columns = [
                    column for column in dict.fromkeys(existing_columns)
                    if column not in excluded_column_set
                ]

        _check_that_columns_exist(dataset, selected_columns)
        if included_expectations is None:
            suite = cls._build_column_description_metadata(dataset)
            # remove column exist expectations
            suite.expectations = []
            return suite

        dataset.set_default_expectation_argument("catch_exceptions", False)
        dataset = cls._build_table_row_count_expectation(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )
        dataset.set_config_value("interactive_evaluation", True)
        dataset = cls._build_table_column_expectations(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        # Shared with the *_with_caching helpers for the selected columns.
        column_cache = {}
        if selected_columns:
            with tqdm(total=len(selected_columns),
                      desc="Profiling Columns",
                      delay=5) as pbar:
                for column in selected_columns:
                    pbar.set_postfix_str(column)
                    cardinality = cls._get_column_cardinality_with_caching(
                        dataset, column, column_cache)
                    column_type = cls._get_column_type_with_caching(
                        dataset, column, column_cache)

                    if cardinality in [
                            ProfilerCardinality.TWO,
                            ProfilerCardinality.VERY_FEW,
                            ProfilerCardinality.FEW,
                    ]:
                        cls._create_expectations_for_low_card_column(
                            dataset, column, column_cache)
                    elif cardinality in [
                            ProfilerCardinality.MANY,
                            ProfilerCardinality.VERY_MANY,
                            ProfilerCardinality.UNIQUE,
                    ]:
                        # TODO we will want to finesse the number and types of
                        #  expectations created here. The simple version is deny/allow list
                        #  and the more complex version is desired per column type and
                        #  cardinality. This deserves more thought on configuration.
                        dataset.expect_column_values_to_be_unique(column)

                        if column_type in [
                                ProfilerDataType.INT,
                                ProfilerDataType.FLOAT,
                        ]:
                            cls._create_expectations_for_numeric_column(
                                dataset, column)
                        elif column_type in [ProfilerDataType.DATETIME]:
                            cls._create_expectations_for_datetime_column(
                                dataset,
                                column,
                                excluded_expectations=excluded_expectations,
                                included_expectations=included_expectations,
                            )
                        elif column_type in [ProfilerDataType.STRING]:
                            cls._create_expectations_for_string_column(
                                dataset,
                                column,
                                excluded_expectations=excluded_expectations,
                                included_expectations=included_expectations,
                            )
                        elif column_type in [ProfilerDataType.UNKNOWN]:
                            logger.debug(
                                f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                            )
                    pbar.update()

        if excluded_expectations:
            # NOTE: we reach into a private member here because of an expected future
            # refactor that will make the suite directly accessible
            dataset._expectation_suite.remove_all_expectations_of_type(
                excluded_expectations)
        if included_expectations:
            # Drop every expectation whose type was not explicitly included.
            for expectation in dataset.get_expectation_suite(
                    discard_failed_expectations=False,
                    suppress_logging=True,
            ).expectations:
                if expectation.expectation_type not in included_expectations:
                    try:
                        dataset.remove_expectation(
                            ExpectationConfiguration(
                                expectation_type=expectation.expectation_type,
                                kwargs=expectation.kwargs,
                            ),
                            match_type="domain",
                            remove_multiple_matches=True,
                        )
                    except ValueError:
                        # Best effort: a ValueError from the removal is only logged.
                        logger.debug(
                            f"Attempted to remove {expectation}, which was not found."
                        )

        expectation_suite = cls._build_column_description_metadata(dataset)

        return expectation_suite

Esempio n. 5

0

Mostra file

File: basic_suite_builder_profiler.py Progetto: tsanikgr/great_expectations

    def _profile(cls, dataset, configuration=None):
        """Build expectations on ``dataset`` according to ``configuration``.

        Args:
            dataset: The dataset to profile. Expectations are created by
                calling ``expect_*`` / helper methods on it.
            configuration: ``None``, the string ``"demo"`` (delegates to
                ``cls._demo_profile``), or a mapping that may contain the keys
                ``included_expectations`` / ``excluded_expectations`` (mutually
                exclusive) and ``included_columns`` / ``excluded_columns``
                (mutually exclusive).

        Returns:
            The result of ``cls._build_column_description_metadata(dataset)``
            (or of ``cls._demo_profile(dataset)`` for the demo configuration).
            When ``included_expectations`` is given as ``False``, ``None`` or
            ``[]``, that result is returned with ``expectations`` emptied.

        Raises:
            ProfilerError: if both members of a mutually exclusive pair are
                configured, or if a requested expectation type or selected
                column does not exist.
        """
        logger.debug(f"Running profiler with configuration: {configuration}")
        if configuration == "demo":
            return cls._demo_profile(dataset)

        existing_columns = dataset.get_table_columns()
        selected_columns = existing_columns
        included_expectations = []
        excluded_expectations = []

        if configuration:
            if ("included_expectations" in configuration
                    and "excluded_expectations" in configuration):
                raise ProfilerError(
                    "Please specify either `included_expectations` or `excluded_expectations`."
                )
            if "included_expectations" in configuration:
                included_expectations = configuration["included_expectations"]
                # An explicitly empty include list is mapped to None, which
                # triggers the "no expectations" early return further down.
                if included_expectations in [False, None, []]:
                    included_expectations = None
                _check_that_expectations_are_available(dataset,
                                                       included_expectations)
            if "excluded_expectations" in configuration:
                excluded_expectations = configuration["excluded_expectations"]
                _check_that_expectations_are_available(dataset,
                                                       excluded_expectations)

            if ("included_columns" in configuration
                    and "excluded_columns" in configuration):
                raise ProfilerError(
                    "Please specify either `excluded_columns` or `included_columns`."
                )
            elif "included_columns" in configuration:
                selected_columns = configuration["included_columns"]
            elif "excluded_columns" in configuration:
                excluded_columns = configuration["excluded_columns"]
                if excluded_columns in [False, None, []]:
                    excluded_columns = []
                # Keep the table's column order (de-duplicated) instead of
                # using a set difference: a set iterates in arbitrary order,
                # which made the order of created expectations vary.
                excluded_column_set = set(excluded_columns)
                selected_columns = [
                    column for column in dict.fromkeys(existing_columns)
                    if column not in excluded_column_set
                ]

        _check_that_columns_exist(dataset, selected_columns)
        if included_expectations is None:
            suite = cls._build_column_description_metadata(dataset)
            # remove column exist expectations
            suite.expectations = []
            return suite

        dataset.set_default_expectation_argument("catch_exceptions", False)
        dataset = cls._build_table_row_count_expectation(dataset,
                                                         tolerance=0.1)
        dataset.set_config_value("interactive_evaluation", True)
        dataset = cls._build_table_column_expectations(dataset)

        # Shared with the *_with_caching helpers for the selected columns.
        column_cache = {}
        if selected_columns:
            for column in selected_columns:
                cardinality = cls._get_column_cardinality_with_caching(
                    dataset, column, column_cache)
                column_type = cls._get_column_type_with_caching(
                    dataset, column, column_cache)

                if cardinality in [
                        ProfilerCardinality.TWO,
                        ProfilerCardinality.VERY_FEW,
                        ProfilerCardinality.FEW,
                ]:
                    cls._create_expectations_for_low_card_column(
                        dataset, column, column_cache)
                elif cardinality in [
                        ProfilerCardinality.MANY,
                        ProfilerCardinality.VERY_MANY,
                        ProfilerCardinality.UNIQUE,
                ]:
                    # TODO we will want to finesse the number and types of
                    #  expectations created here. The simple version is blacklisting
                    #  and the more complex version is desired per column type and
                    #  cardinality. This deserves more thought on configuration.
                    dataset.expect_column_values_to_be_unique(column)

                    if column_type in [
                            ProfilerDataType.INT, ProfilerDataType.FLOAT
                    ]:
                        cls._create_expectations_for_numeric_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.DATETIME]:
                        cls._create_expectations_for_datetime_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.STRING]:
                        cls._create_expectations_for_string_column(
                            dataset, column)
                    elif column_type in [ProfilerDataType.UNKNOWN]:
                        logger.debug(
                            f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                        )

        if excluded_expectations:
            dataset = _remove_table_expectations(dataset,
                                                 excluded_expectations)
            dataset = _remove_column_expectations(dataset,
                                                  excluded_expectations)
        if included_expectations:
            # Drop every expectation whose type was not explicitly included.
            for expectation in dataset.get_expectation_suite().expectations:
                if expectation.expectation_type not in included_expectations:
                    try:
                        dataset.remove_expectation(
                            expectation_type=expectation.expectation_type,
                            expectation_kwargs=expectation.kwargs,
                            column=expectation.kwargs.get("column", None),
                            remove_multiple_matches=True,
                        )
                    except ValueError:
                        # Best effort: a ValueError from the removal is only logged.
                        logger.debug(
                            f"Attempted to remove {expectation}, which was not found."
                        )

        expectation_suite = cls._build_column_description_metadata(dataset)

        return expectation_suite