Example #1
    def check_type(cls,
                   name: str,
                   value: OptionType,
                   *,
                   attr_name: Optional[str] = None,
                   expected_type: Optional[Type[OptionType]] = None):
        """
        If name is an option, ensure that value matches its type.

        @param name: Name of the option to check.

        @param value: Value of the option to check.

        @param attr_name: If given, it is used instead of looking up the attribute name.

        @param expected_type: If given, it is used instead of looking up the expected type.

        @raise TypeError: If the value does not match the expected type for the given name.

        @note: If name does not point to an existing option, typing.Any is implied.
        """
        if not _HAS_TYPEGUARD and not _HAS_PYTYPES:
            return

        if attr_name is None and expected_type is None:
            attr_name = cls._option_names.get(name)

            if attr_name is None:
                return

        if expected_type is None:
            expected_type = cls._option_types.get(attr_name, Any)

        if _HAS_TYPEGUARD:
            try:
                typeguard.check_type(name, value, expected_type, None)
            except TypeError:
                valid_type = False
            else:
                valid_type = True
        elif _HAS_PYTYPES:
            valid_type = pytypes.is_of_type(value, expected_type)

        if not valid_type:
            raise TypeError('type of {} must be {}; got {} instead'.format(
                name, _qualified_name(expected_type), _qualified_name(value)))
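For orientation, here is a minimal sketch (not part of the original module) of the two backends this helper falls back between. The typeguard call uses the typeguard 2.x signature (argname, value, expected_type) that the code above targets; pytypes returns a bool instead of raising:

    from typing import List

    import pytypes
    import typeguard

    # pytypes: deep runtime check against a typing construct, returning a bool
    assert pytypes.is_of_type([1, 2, 3], List[int])
    assert not pytypes.is_of_type([1, "2"], List[int])

    # typeguard 2.x: raises TypeError on a mismatch (the error that
    # check_type() above catches and re-raises with qualified names)
    typeguard.check_type('numbers', [1, 2, 3], List[int])  # passes silently
    try:
        typeguard.check_type('numbers', [1, "2"], List[int])
    except TypeError as exc:
        print(exc)  # e.g. "type of numbers[1] must be int; got str instead"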
Example #2
            def assert_is_of_type(field, value, typ):
                """
                Type checker relying on `pytypes` (works on Python 2+).

                :param field: name of the field being checked (used in the error message)
                :param value: value to validate
                :param typ: expected type for `value`
                :return: None; raises FieldTypeError if the value mismatches or the check itself fails
                """
                try:
                    valid = is_of_type(value, typ)
                except Exception as e:
                    # emulate `raise ... from e` (Python 2 has no raise-from syntax)
                    new_e = FieldTypeError(field, value, typ)
                    new_e.__cause__ = e
                    raise new_e
                else:
                    if not valid:
                        raise FieldTypeError(field, value, typ)
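On Python 3, the manual `__cause__` assignment above collapses into the `raise ... from` syntax. A sketch under the same assumptions (`is_of_type` imported from pytypes, `FieldTypeError` defined elsewhere in the surrounding codebase):

            def assert_is_of_type(field, value, typ):
                """Python 3 equivalent: chain the original error via raise-from."""
                try:
                    valid = is_of_type(value, typ)
                except Exception as e:
                    raise FieldTypeError(field, value, typ) from e
                if not valid:
                    raise FieldTypeError(field, value, typ)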
Example #3
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        Generate features for the input.
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        Output:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        """
        # Wrap as container, if needed
        inputs = inputs.copy()
        if not pytypes.is_of_type(inputs, types.Container):
            if isinstance(inputs, pd.DataFrame):
                inputs = container.DataFrame(inputs)
            elif isinstance(inputs, np.matrix):
                inputs = container.matrix(inputs)
            elif isinstance(inputs, np.ndarray):
                inputs = container.ndarray(inputs)
            elif isinstance(inputs, list):
                inputs = container.List(inputs)
            else:
                # Inputs is not a container and cannot be converted to one.
                # Nothing to do, since the computed metadata cannot be stored.
                return CallResult(inputs)

        # calling the utility to detect integer and float datatype columns (currently disabled)
        # inputs = dtype_detector.detector(inputs)

        # calling the utility to detect categorical datatype columns
        metadata = self._produce(inputs, inputs.metadata, [])
        # update the inputs' metadata with the computed result
        inputs.metadata = metadata

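        # Run the detectors on a sample: drop rows containing NaN, keep at most the first 100.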
        if inputs.shape[0] > 100:
            self._sample_df = inputs.dropna().iloc[0:100, :]
        else:
            self._sample_df = inputs

        # calling date detector

        self._DateFeaturizer = DateFeaturizerOrg(inputs)
        try:
            cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
        except Exception:
            _logger.error(traceback.format_exc())
            cols = list()
        if cols:
            indices = [
                inputs.columns.get_loc(c) for c in cols if c in inputs.columns
            ]
            for i in indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if old_metadata.get("semantic_types"):
                    # if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in old_metadata.get(
                    #         "semantic_types", []):
                    #     old_metadata["semantic_types"] = (
                    #         'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                    #         'https://metadata.datadrivendiscovery.org/types/Attribute')
                    if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata.get(
                            "semantic_types", []):
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/Time',
                        )
                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )

                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PhoneParser detector

        try:
            PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
        except Exception:
            _logger.error(traceback.format_exc())
            PhoneParser_indices = dict()
        if PhoneParser_indices.get("columns_to_perform"):
            for i in PhoneParser_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                # print("old metadata", old_metadata)
                if 'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PunctuationSplitter detector

        try:
            PunctuationSplitter_indices = PunctuationParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len']
            )
        except Exception:
            _logger.error(traceback.format_exc())
            PunctuationSplitter_indices = dict()
        if PunctuationSplitter_indices.get("columns_to_perform"):
            for i in PunctuationSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the NumAlphaSplitter detector

        try:
            NumAlphaSplitter_indices = NumAlphaParser.detect(
                df=self._sample_df,
                max_avg_length=self.hyperparams['split_on_column_with_avg_len'],
            )
        except Exception:
            _logger.error(traceback.format_exc())
            NumAlphaSplitter_indices = dict()

        if NumAlphaSplitter_indices.get("columns_to_perform"):
            for i in NumAlphaSplitter_indices["columns_to_perform"]:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens' not in old_metadata.get(
                        "semantic_types", []):
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',
                    )

                # if isinstance(self._sample_df.iloc[:, i].head(1).values[0], str):
                #     old_metadata["structural_type"] = type("str")
                # elif isinstance(self._sample_df.iloc[:, i].head(1).values[0], int):
                #     old_metadata["structural_type"] = type(10)
                # else:
                #     old_metadata["structural_type"] = type(10.2)

                _logger.info(
                    "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        inputs = self._relabel_categorical(inputs)
        return CallResult(inputs)
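Each detector block in this method repeats one pattern: append a semantic-type URI to a column's metadata if it is not already present, then write the metadata back. A hedged sketch of that pattern as a helper (`add_semantic_type` is hypothetical; the `query`/`update` calls mirror the d3m metadata API used above):

    def add_semantic_type(metadata, column_index, semantic_type):
        """Append semantic_type to a column's metadata once; return new metadata."""
        old_metadata = dict(metadata.query((mbase.ALL_ELEMENTS, column_index)))
        if semantic_type not in old_metadata.get("semantic_types", ()):
            old_metadata["semantic_types"] = tuple(
                old_metadata.get("semantic_types", ())) + (semantic_type,)
        return metadata.update((mbase.ALL_ELEMENTS, column_index), old_metadata)

    # e.g. inside the date-detector loop above:
    # inputs.metadata = add_semantic_type(
    #     inputs.metadata, i, 'https://metadata.datadrivendiscovery.org/types/Time')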
Example #4
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        """
        Generate features for the input.
        Input:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        Output:
            typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
        """
        # Wrap as container, if needed
        if not pytypes.is_of_type(inputs, types.Container):
            if isinstance(inputs, pd.DataFrame):
                inputs = container.DataFrame(inputs)
            elif isinstance(inputs, np.matrix):
                inputs = container.matrix(inputs)
            elif isinstance(inputs, np.ndarray):
                inputs = container.ndarray(inputs)
            elif isinstance(inputs, list):
                inputs = container.List(inputs)
            else:
                # Inputs is not a container and cannot be converted to one.
                # Nothing to do, since the computed metadata cannot be stored.
                return CallResult(inputs)

        # calling the utility to detect integer and float datatype columns
        inputs = dtype_detector.detector(inputs)

        # calling date detector

        # self._DateFeaturizer = date_detector.DateFeaturizer(inputs)
        self._DateFeaturizer = DateFeaturizerOrg(inputs)
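        # Sample for the detectors: drop rows containing NaN, keep at most the first 50.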
        if inputs.shape[0] > 50:
            self._sample_df = inputs.dropna().iloc[0:50, :]
        else:
            self._sample_df = inputs
        cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
        if cols:
            indices = [
                inputs.columns.get_loc(c) for c in cols if c in inputs.columns
            ]
            for i in indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if old_metadata.get("semantic_types"):
                    if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in old_metadata[
                            "semantic_types"]:
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
                        )
                    if 'https://metadata.datadrivendiscovery.org/types/Time' not in old_metadata[
                            "semantic_types"]:
                        old_metadata["semantic_types"] += (
                            'https://metadata.datadrivendiscovery.org/types/Time',
                        )
                first_value = self._sample_df.iloc[:, i].head(1).values[0]
                if isinstance(first_value, str):
                    old_metadata["structural_type"] = str
                elif isinstance(first_value, int):
                    old_metadata["structural_type"] = int
                else:
                    old_metadata["structural_type"] = float

                _logger.info(
                    "Date detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )

                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the utility to detect categorical datatype columns
        metadata = self._produce(inputs, inputs.metadata, [])
        # update the inputs' metadata with the computed result
        inputs.metadata = metadata

        # calling the PhoneParser detector

        self._PhoneParser = PhoneParser(self._sample_df)

        PhoneParser_indices = self._PhoneParser.detect()
        if PhoneParser_indices:
            for i in PhoneParser_indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                # print("old metadata", old_metadata)
                if 'https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber' not in old_metadata[
                        "semantic_types"]:
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber',
                    )
                if 'https://metadata.datadrivendiscovery.org/types/UnnormalizedEntity' not in old_metadata[
                        "semantic_types"]:
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/UnnormalizedEntity',
                    )

                first_value = self._sample_df.iloc[:, i].head(1).values[0]
                if isinstance(first_value, str):
                    old_metadata["structural_type"] = str
                elif isinstance(first_value, int):
                    old_metadata["structural_type"] = int
                else:
                    old_metadata["structural_type"] = float

                _logger.info(
                    "Phone detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the PunctuationSplitter detector

        self._PunctuationSplitter = PunctuationParser(self._sample_df)

        PunctuationSplitter_indices = self._PunctuationSplitter.detect()
        if PunctuationSplitter_indices:
            for i in PunctuationSplitter_indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/CanBeSplitByPunctuation' not in old_metadata[
                        "semantic_types"]:
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/CanBeSplitByPunctuation',
                    )

                first_value = self._sample_df.iloc[:, i].head(1).values[0]
                if isinstance(first_value, str):
                    old_metadata["structural_type"] = str
                elif isinstance(first_value, int):
                    old_metadata["structural_type"] = int
                else:
                    old_metadata["structural_type"] = float

                _logger.info(
                    "Punctuation detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        # calling the NumAlphaSplitter detector

        self._NumAlphaSplitter = NumAlphaParser(self._sample_df)

        NumAlphaSplitter_indices = self._NumAlphaSplitter.detect()

        if NumAlphaSplitter_indices:
            for i in NumAlphaSplitter_indices:
                old_metadata = dict(
                    inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
                if 'https://metadata.datadrivendiscovery.org/types/CanBeSplitByAlphanumeric' not in old_metadata[
                        "semantic_types"]:
                    old_metadata["semantic_types"] += (
                        'https://metadata.datadrivendiscovery.org/types/CanBeSplitByAlphanumeric',
                    )

                first_value = self._sample_df.iloc[:, i].head(1).values[0]
                if isinstance(first_value, str):
                    old_metadata["structural_type"] = str
                elif isinstance(first_value, int):
                    old_metadata["structural_type"] = int
                else:
                    old_metadata["structural_type"] = float

                _logger.info(
                    "NumAlpha detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
                    {
                        'column_index': i,
                        'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i))),
                        'new_metadata': old_metadata,
                    },
                )
                inputs.metadata = inputs.metadata.update(
                    (mbase.ALL_ELEMENTS, i), old_metadata)

        return CallResult(inputs)
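Both produce() variants open with the same container-wrapping guard. A standalone sketch of it, under the imports the originals assume (pytypes, numpy as np, pandas as pd, and the D3M `container`/`types` modules):

    def ensure_container(inputs):
        """Wrap plain pandas/numpy/list inputs in their D3M container types."""
        if pytypes.is_of_type(inputs, types.Container):
            return inputs  # already a container; metadata can be attached
        if isinstance(inputs, pd.DataFrame):
            return container.DataFrame(inputs)
        if isinstance(inputs, np.matrix):
            return container.matrix(inputs)
        if isinstance(inputs, np.ndarray):
            return container.ndarray(inputs)
        if isinstance(inputs, list):
            return container.List(inputs)
        return inputs  # not convertible; computed metadata cannot be stored

As in the originals, the np.matrix check must precede np.ndarray, because np.matrix is a subclass of np.ndarray and would otherwise be wrapped as a plain ndarray.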