Code Example #1
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        Generate features for the input.
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        Output:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        """
        # Wrap as container, if needed
        inputs = inputs.copy()
        if not pytypes.is_of_type(inputs, types.Container):
            if isinstance(inputs, pd.DataFrame):
                inputs = container.DataFrame(inputs)
            elif isinstance(inputs, np.matrix):
                inputs = container.matrix(inputs)
            elif isinstance(inputs, np.ndarray):
                inputs = container.ndarray(inputs)
            elif isinstance(inputs, list):
                inputs = container.List(inputs)
            else:
                # Inputs is not a container, and cannot be converted to a container.
                # Nothing to do, since cannot store the computed metadata.
                return CallResult(inputs)

        # calling the utility to detect integer and float datatype columns
        # inputs = dtype_detector.detector(inputs)

        # calling the utility to detect categorical datatype columns
        metadata = self._produce(inputs, inputs.metadata, [])
        # update the inputs' metadata with the profiled column metadata
        inputs.metadata = metadata

        if inputs.shape[0] > 100:
            self._sample_df = inputs.dropna().iloc[0:100, :]
        else:
            self._sample_df = inputs

        # calling date detector

        self._DateFeaturizer = DateFeaturizerOrg(inputs)
        try:
            cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
        except Exception as e:
            _logger.error(traceback.format_exc())
            cols = list()
        if cols:
            indices = [
                inputs.columns.get_loc(c) for c in cols if c in inputs.columns
            ]
            for i in indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                temp_value = list(old_metadata["semantic_types"])
                if len(temp_value) >= 1:
                    # if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in old_metadata.get(
                    #         "semantic_types", []):
                    #     old_metadata["semantic_types"] = (
                    #         'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                    #         'https://metadata.datadrivendiscovery.org/types/Attribute')
                    if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata.get(
                            "semantic_types", []):
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/Time',
                        )
                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )

                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PhoneParser detector

        try:
            PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
        except Exception as e:
            _logger.error(traceback.format_exc())
            PhoneParser_indices = dict()
        if PhoneParser_indices.get("columns_to_perform"):
            for i in PhoneParser_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                # print("old metadata", old_metadata)
                if 'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PunctuationSplitter detector

        try:
            PunctuationSplitter_indices = PunctuationParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len']
            )
        except Exception as e:
            _logger.error(traceback.format_exc())
            PunctuationSplitter_indices = dict()
        if PunctuationSplitter_indices.get("columns_to_perform"):
            for i in PunctuationSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the NumAlphaSplitter detector

        try:
            NumAlphaSplitter_indices = NumAlphaParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len'],
            )
        except Exception as e:
            _logger.error(traceback.format_exc())
            NumAlphaSplitter_indices = dict()

        if NumAlphaSplitter_indices.get("columns_to_perform"):
            for i in NumAlphaSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        inputs = self._relabel_categorical(inputs)
        return CallResult(inputs)
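
The produce method above repeats the same query/update pattern for each detector: read a column's metadata, append a semantic-type URI if it is missing, and write the result back. Below is a minimal standalone sketch of that pattern, assuming only the d3m metadata/container API; the sample column and the chosen URI are illustrative.

from d3m import container
from d3m.metadata import base as mbase

# Illustrative data; generate_metadata=True attaches per-column metadata.
df = container.DataFrame({"phone": ["310-555-0100", "213-555-0199"]},
                         generate_metadata=True)

# Read the current metadata for column 0.
col_meta = dict(df.metadata.query((mbase.ALL_ELEMENTS, 0)))
uri = "https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber"

# Append the semantic type only if it is not already present, then write the
# updated dict back; DataMetadata is immutable, so update() returns a new
# object that is re-assigned to df.metadata.
if uri not in col_meta.get("semantic_types", ()):
    col_meta["semantic_types"] = tuple(col_meta.get("semantic_types", ())) + (uri,)
    df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, 0), col_meta)

print(df.metadata.query((mbase.ALL_ELEMENTS, 0))["semantic_types"])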
Code Example #2
class Profiler(TransformerPrimitiveBase[Input, Output, Hyperparams]):
    """
    data profiler moduel. Now only supports csv data.

    Parameters:
    ----------
    _punctuation_outlier_weight: a integer
        the coefficient used in outlier detection for punctuation. default is 3

    _numerical_outlier_weight

    _token_delimiter: a string
        delimiter that used to seperate tokens, default is blank space " ".

    _detect_language: boolean
        true: do detect language; false: not detect language

    _topk: a integer

    _verbose: boolean
        control the _verbose

    Attributes:
    ----------
    """
    metadata = hyperparams.base.PrimitiveMetadata({
        'id': 'b2612849-39e4-33ce-bfda-24f3e2cb1e93',
        'version': config.VERSION,
        'name': "DSBox Profiler",
        'description': 'Generate profiles of datasets',
        'python_path': 'd3m.primitives.schema_discovery.Profiler.DSBOX',
        'primitive_family': PrimitiveFamily.SCHEMA_DISCOVERY,
        'algorithm_types': [
            PrimitiveAlgorithmType.DATA_PROFILING,
        ],
        'keywords': ['data_profiler'],
        'source': {
            'name': config.D3M_PERFORMER_TEAM,
            'contact': config.D3M_CONTACT,
            'uris': [config.REPOSITORY],
        },
        # The same path the primitive is registered with entry points in setup.py.
        'installation': [config.INSTALLATION],
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        # A metafeature about preconditions required for this primitive to operate well.
        'precondition': [],
        'hyperparams_to_tune': []
    })

    def __init__(self, *, hyperparams: Hyperparams) -> None:
        super().__init__(hyperparams=hyperparams)

        # All other attributes must be private with leading underscore
        self.hyperparams = hyperparams
        self._punctuation_outlier_weight = 3
        self._numerical_outlier_weight = 3
        self._token_delimiter = " "
        self._detect_language = False
        self._topk = 10
        self._verbose = VERBOSE
        self._sample_df = None
        self._DateFeaturizer = None
        # list of specified features to compute
        self._specified_features = hyperparams[
            "metafeatures"] if hyperparams else default_metafeatures

    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        Generate features for the input.
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        Output:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        """
        # Wrap as container, if needed
        inputs = inputs.copy()
        if not pytypes.is_of_type(inputs, types.Container):
            if isinstance(inputs, pd.DataFrame):
                inputs = container.DataFrame(inputs)
            elif isinstance(inputs, np.matrix):
                inputs = container.matrix(inputs)
            elif isinstance(inputs, np.ndarray):
                inputs = container.ndarray(inputs)
            elif isinstance(inputs, list):
                inputs = container.List(inputs)
            else:
                # Inputs is not a container, and cannot be converted to a container.
                # Nothing to do, since cannot store the computed metadata.
                return CallResult(inputs)

        # calling the utility to detect integer and float datatype columns
        # inputs = dtype_detector.detector(inputs)

        # calling the utility to detect categorical datatype columns
        metadata = self._produce(inputs, inputs.metadata, [])
        # update the inputs' metadata with the profiled column metadata
        inputs.metadata = metadata

        if inputs.shape[0] > 100:
            self._sample_df = inputs.dropna().iloc[0:100, :]
        else:
            self._sample_df = inputs

        # calling date detector

        self._DateFeaturizer = DateFeaturizerOrg(inputs)
        try:
            cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
        except Exception as e:
            _logger.error(traceback.format_exc())
            cols = list()
        if cols:
            indices = [
                inputs.columns.get_loc(c) for c in cols if c in inputs.columns
            ]
            for i in indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                temp_value = list(old_metadata["semantic_types"])
                if len(temp_value) >= 1:
                    # if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in old_metadata.get(
                    #         "semantic_types", []):
                    #     old_metadata["semantic_types"] = (
                    #         'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                    #         'https://metadata.datadrivendiscovery.org/types/Attribute')
                    if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata.get(
                            "semantic_types", []):
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/Time',
                        )
                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )

                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PhoneParser detector

        try:
            PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
        except Exception as e:
            _logger.error(traceback.format_exc())
            PhoneParser_indices = dict()
        if PhoneParser_indices.get("columns_to_perform"):
            for i in PhoneParser_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                # print("old metadata", old_metadata)
                if 'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PunctuationSplitter detector

        try:
            PunctuationSplitter_indices = PunctuationParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len']
            )
        except Exception as e:
            _logger.error(traceback.format_exc())
            PunctuationSplitter_indices = dict()
        if PunctuationSplitter_indices.get("columns_to_perform"):
            for i in PunctuationSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the NumAlphaSplitter detector

        try:
            NumAlphaSplitter_indices = NumAlphaParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len'],
            )
        except Exception as e:
            _logger.error(traceback.format_exc())
            NumAlphaSplitter_indices = dict()

        if NumAlphaSplitter_indices.get("columns_to_perform"):
            for i in NumAlphaSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index':
                        i,
                        'old_metadata':
                        dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata':
                        old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        inputs = self._relabel_categorical(inputs)
        return CallResult(inputs)

    @staticmethod
    def _relabel_categorical(inputs: Input) -> Output:
        for col in range(inputs.shape[1]):
            old_metadata = dict(
                inputs.metadata.query((mbase.ALL_ELEMENTS, col)))
            semantic_type = old_metadata.get('semantic_types', [])

            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_type:
                if not HelperFunction.is_categorical(inputs.iloc[:, col]):
                    old_metadata['semantic_types'] = tuple(
                        i for i in old_metadata['semantic_types'] if i !=
                        'https://metadata.datadrivendiscovery.org/types/CategoricalData'
                    )

                    numerics = pd.to_numeric(inputs.iloc[:, col],
                                             errors='coerce')
                    length = numerics.shape[0]
                    nans = numerics.isnull().sum()

                    if nans / length > 0.9:
                        if "http://schema.org/Text" not in old_metadata[
                                'semantic_types']:
                            old_metadata['semantic_types'] += (
                                "http://schema.org/Text", )

                    else:
                        intcheck = (numerics % 1) == 0
                        if np.sum(intcheck) / length > 0.9:
                            if "http://schema.org/Integer" not in old_metadata[
                                    'semantic_types']:
                                old_metadata['semantic_types'] += (
                                    "http://schema.org/Integer", )
                                # old_metadata['structural_type'] = type(10)
                                # inputs.iloc[:, col] = numerics
                        else:
                            if "http://schema.org/Float" not in old_metadata[
                                    'semantic_types']:
                                old_metadata['semantic_types'] += (
                                    "http://schema.org/Float", )
                                # old_metadata['structural_type'] = type(10.2)
                                # inputs.iloc[:, col] = numerics

            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, col),
                                                     old_metadata)

        return inputs

    def _produce(self,
                 inputs: Input,
                 metadata: DataMetadata = None,
                 prefix: Selector = None) -> DataMetadata:
        """
        Parameters:
        -----------
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        metadata: DataMetadata
            Store generate metadata. If metadata is None, then inputs must be container, which has a metadata field to store the generated data.
        prefix: Selector
            Selector prefix into metadata

        """
        if isinstance(inputs, container.Dataset):
            for table_id, resource in inputs.items():
                # Extend the prefix per resource without mutating it across iterations.
                metadata = self._produce(resource, metadata, prefix + [table_id])
        elif isinstance(inputs, list):
            for index, item in enumerate(inputs):
                metadata = self._produce(item, metadata, prefix + [index])
        elif isinstance(inputs, pd.DataFrame):
            metadata = self._profile_data(inputs, metadata, prefix)
        elif isinstance(inputs, np.matrix) or (isinstance(inputs, np.ndarray)
                                               and len(inputs.shape) == 2):
            df = pd.DataFrame(inputs)
            metadata = self._profile_data(df, metadata, prefix)
        elif isinstance(inputs, container.ndarray):
            metadata = self._profile_ndarray(inputs, metadata, prefix)

        return metadata

    def _profile_ndarray(self, array, metadata, prefix):
        # TODO: What to do with ndarrays?
        return metadata

    def _profile_data(self, data, metadata, prefix):
        """
        Main function to profile the data. This functions will
        1. calculate features
        2. update metadata with features

        Parameters
        ----------
        data: pandas.DataFrame that needs to be profiled
        ----------
        """
        if self._verbose:
            print(
                "====================have a look on the data: ====================\n"
            )
            print(data.head(2))

        # calculations
        if self._verbose:
            print(
                "====================calculating the features ... ====================\n"
            )

        # STEP 1: data-level calculations
        if ("pearson_correlation_of_features" in self._specified_features):
            corr_pearson = data.corr()
            corr_columns = list(corr_pearson.columns)
            corr_id = [data.columns.get_loc(n) for n in corr_columns]

        if ("spearman_correlation_of_features" in self._specified_features):
            corr_spearman = data.corr(method='spearman')
            corr_columns = list(corr_spearman.columns)
            corr_id = [data.columns.get_loc(n) for n in corr_columns]

        is_category = category_detection.category_detect(data)

        # STEP 2: column-level calculations
        column_counter = -1
        for column_name in data:
            column_counter += 1
            col = data[column_name]
            # dict: map feature name to content
            each_res = defaultdict(lambda: defaultdict())

            # 17 Feb 2019: Disabling automatic detection of category data. This is dangerous
            # because the data profiler may give different labels on different partitions
            # of the same dataset. Testing partitions are smaller, so they tend to get
            # labelled categorical.

            # if 'semantic_types' in self._specified_features and is_category[column_name]:
            #     # rewrites old metadata
            #     old_metadata = dict(data.metadata.query((mbase.ALL_ELEMENTS, column_counter)))
            #     temp_value = list(old_metadata["semantic_types"])
            #     if len(temp_value) == 2:
            #         ##print("$$$$$$", ('https://metadata.datadrivendiscovery.org/types/CategoricalData', temp_value[1]))
            #         each_res["semantic_types"] = (
            #             'https://metadata.datadrivendiscovery.org/types/CategoricalData', temp_value[-1])
            #     elif len(temp_value) == 1:
            #         each_res["semantic_types"] = (
            #             'https://metadata.datadrivendiscovery.org/types/CategoricalData', temp_value[-1])
            #     elif len(temp_value) == 3:
            #         each_res["semantic_types"] = (
            #             'https://metadata.datadrivendiscovery.org/types/CategoricalData', temp_value[-2],
            #             temp_value[-1])
            #     _logger.info(f'Category type detected "{column_name}": old={temp_value} new={each_res["semantic_types"]}')

            if (("spearman_correlation_of_features"
                 in self._specified_features)
                    and (column_name in corr_columns)):
                stats_sp = corr_spearman[column_name].describe()
                each_res["spearman_correlation_of_features"] = {
                    'min': stats_sp['min'],
                    'max': stats_sp['max'],
                    'mean': stats_sp['mean'],
                    'median': stats_sp['50%'],
                    'std': stats_sp['std']
                }

            if (("spearman_correlation_of_features"
                 in self._specified_features)
                    and (column_name in corr_columns)):
                stats_pr = corr_pearson[column_name].describe()
                each_res["pearson_correlation_of_features"] = {
                    'min': stats_pr['min'],
                    'max': stats_pr['max'],
                    'mean': stats_pr['mean'],
                    'median': stats_pr['50%'],
                    'std': stats_pr['std']
                }

            if col.dtype.kind in np.typecodes['AllInteger'] + 'uMmf':
                if ("number_of_missing_values" in self._specified_features):
                    each_res["number_of_missing_values"] = pd.isnull(col).sum()
                if ("ratio_of_missing_values" in self._specified_features):
                    each_res["ratio_of_missing_values"] = pd.isnull(
                        col).sum() / col.size
                if ("number_of_distinct_values" in self._specified_features):
                    each_res["number_of_distinct_values"] = col.nunique()
                if ("ratio_of_distinct_values" in self._specified_features):
                    each_res["ratio_of_distinct_values"] = col.nunique(
                    ) / float(col.size)

            if col.dtype.kind == 'b':
                if ("most_common_raw_values" in self._specified_features):
                    fc_hih.compute_common_values(col.dropna().astype(str),
                                                 each_res, self._topk)

            elif col.dtype.kind in np.typecodes['AllInteger'] + 'uf':
                fc_hih.compute_numerics(
                    col, each_res, self._specified_features
                )  # TODO: do the checks inside the function
                if ("most_common_raw_values" in self._specified_features):
                    fc_hih.compute_common_values(col.dropna().astype(str),
                                                 each_res, self._topk)

            else:

                # Need to compute str missing values before fillna
                if "number_of_missing_values" in self._specified_features:
                    each_res["number_of_missing_values"] = pd.isnull(col).sum()
                if "ratio_of_missing_values" in self._specified_features:
                    each_res["ratio_of_missing_values"] = pd.isnull(
                        col).sum() / col.size

                col = col.astype(object).fillna('').astype(str)

                # compute_missing_space must be called first because it may change the data content; see the function definition for details
                fc_lfh.compute_missing_space(col, each_res,
                                             self._specified_features)
                # fc_lfh.compute_filename(col, each_res)
                fc_lfh.compute_length_distinct(
                    col,
                    each_res,
                    delimiter=self._token_delimiter,
                    feature_list=self._specified_features)
                if ("natural_language_of_feature" in self._specified_features):
                    fc_lfh.compute_lang(col, each_res)
                if ("most_common_punctuations" in self._specified_features):
                    fc_lfh.compute_punctuation(
                        col,
                        each_res,
                        weight_outlier=self._punctuation_outlier_weight)

                fc_hih.compute_numerics(col, each_res,
                                        self._specified_features)
                if ("most_common_numeric_tokens" in self._specified_features):
                    fc_hih.compute_common_numeric_tokens(
                        col, each_res, self._topk)
                if ("most_common_alphanumeric_tokens"
                        in self._specified_features):
                    fc_hih.compute_common_alphanumeric_tokens(
                        col, each_res, self._topk)
                if ("most_common_raw_values" in self._specified_features):
                    fc_hih.compute_common_values(col, each_res, self._topk)
                fc_hih.compute_common_tokens(col, each_res, self._topk,
                                             self._specified_features)
                if ("numeric_char_density" in self._specified_features):
                    fc_hih.compute_numeric_density(col, each_res)
                fc_hih.compute_contain_numeric_values(col, each_res,
                                                      self._specified_features)
                fc_hih.compute_common_tokens_by_puncs(col, each_res,
                                                      self._topk,
                                                      self._specified_features)

            # update metadata for a specific column

            metadata = metadata.update(prefix + [ALL_ELEMENTS, column_counter],
                                       each_res)

            # _logger.info(
            #     "category detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
            #     {
            #         'column_index': column_counter,
            #         'old_metadata': old_metadata,
            #         'new_metadata': dict(data.metadata.query((mbase.ALL_ELEMENTS, column_counter))),
            #     },
            # )
        return metadata
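
For reference, a minimal usage sketch of the Profiler primitive defined above. It assumes that the Hyperparams class declared alongside this primitive provides usable defaults and that the d3m container API is available; the sample data below is illustrative only.

import pandas as pd
from d3m import container

# Illustrative input wrapped as a d3m container with generated metadata.
raw = pd.DataFrame({
    "date": ["2018-01-01", "2018-02-01", "2018-03-01"],
    "phone": ["310-555-0100", "213-555-0199", "626-555-0123"],
    "value": ["1", "2", "3"],
})
inputs = container.DataFrame(raw, generate_metadata=True)

# Assumes Hyperparams.defaults() yields a valid configuration for this primitive.
profiler = Profiler(hyperparams=Hyperparams.defaults())
profiled = profiler.produce(inputs=inputs).value

# Inspect the semantic types the detectors attached to each column.
for col in range(profiled.shape[1]):
    print(col, profiled.metadata.query_column(col).get("semantic_types"))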
Code Example #3
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        self._input_data_copy = inputs.copy()
        cols_to_drop = list()

        date_cols = self._mapping.get("date_columns")
        if date_cols:
            cols_to_drop += self._mapping.get("date_columns")
            original_cols = self._get_cols(self._input_data_copy)
            dfo = DateFeaturizerOrg(dataframe=self._input_data_copy)
            df = dfo.featurize_date_columns(date_cols)
            current_cols = self._get_cols(df["df"])

            _logger.info(
                "Date Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df["df"]

        phone_cols = self._mapping.get("phone_columns")
        if phone_cols:
            cols_to_drop += phone_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = PhoneParser.perform(df=self._input_data_copy,
                                     columns_perform=phone_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "Phone Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        an_cols = self._mapping.get("alpha_numeric_columns")
        if an_cols:
            cols_to_drop += an_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = NumAlphaParser.perform(df=self._input_data_copy,
                                        columns_perform=an_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "NumAlpha Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        punc_cols = self._mapping.get("punctuation_columns")
        if punc_cols:
            cols_to_drop += punc_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = PunctuationParser.perform(df=self._input_data_copy,
                                           columns_perform=punc_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "Punctuation Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        if cols_to_drop:
            self._input_data_copy = common_utils.remove_columns(
                self._input_data_copy, list(set(cols_to_drop)))
        self._update_structural_type()

        return CallResult(self._input_data_copy, True, 1)