def check_type(cls, name: str, value: OptionType, *, attr_name: str = None, expected_type: Type[OptionType] = None):
    """
    If name is an option, ensure that value matches its type.

    @param name: Name of the option to check.
    @param value: Value of the option to check.
    @param attr_name: If given, it will be used instead of look up.
    @param expected_type: If given, it will be used instead of look up.
    @raise TypeError: If value mismatches type for a given name.
    @note: If name does not point to an existing option, typing.Any is implied.
    """
    # Without either backend there is nothing to validate with; silently accept.
    if not _HAS_TYPEGUARD and not _HAS_PYTYPES:
        return
    # Only look up the attribute name when the caller supplied neither override;
    # an unknown option name means "no constraint", so return early.
    if attr_name is None and expected_type is None:
        attr_name = cls._option_names.get(name)
        if attr_name is None:
            return
    # Fall back to the registered type for the attribute; Any when unregistered.
    if expected_type is None:
        expected_type = cls._option_types.get(attr_name, Any)
    # Prefer typeguard when available; it signals a mismatch by raising TypeError.
    if _HAS_TYPEGUARD:
        try:
            typeguard.check_type(name, value, expected_type, None)
        except TypeError:
            valid_type = False
        else:
            valid_type = True
    elif _HAS_PYTYPES:
        # pytypes returns a boolean instead of raising.
        valid_type = pytypes.is_of_type(value, expected_type)
    # Re-raise with a uniform message regardless of which backend was used.
    if not valid_type:
        raise TypeError('type of {} must be {}; got {} instead'.format(
            name, _qualified_name(expected_type), _qualified_name(value)))
def assert_is_of_type(field, value, typ):
    """
    Type checker relying on `pytypes`.

    :param field: Name of the field being validated (reported in the error).
    :param value: Value to validate.
    :param typ: Expected type for ``value``.
    :raise FieldTypeError: If ``value`` is not of type ``typ``, or if the
        underlying ``is_of_type`` check itself fails (the original error is
        attached as the cause).
    :return: None on success.
    """
    try:
        valid = is_of_type(value, typ)
    except Exception as e:
        # Native Python 3 exception chaining replaces the manual
        # ``new_e.__cause__ = e; raise new_e`` idiom the original used.
        raise FieldTypeError(field, value, typ) from e
    else:
        if not valid:
            raise FieldTypeError(field, value, typ)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Generate features for the input by running a series of column detectors
    (date, phone number, punctuation-splittable, numeric/alpha-splittable)
    and recording what they find as column semantic types in the metadata.

    Input: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    Output: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    """
    # Wrap as container, if needed
    inputs = inputs.copy()
    if not pytypes.is_of_type(inputs, types.Container):
        if isinstance(inputs, pd.DataFrame):
            inputs = container.DataFrame(inputs)
        elif isinstance(inputs, np.matrix):
            inputs = container.matrix(inputs)
        elif isinstance(inputs, np.ndarray):
            inputs = container.ndarray(inputs)
        elif isinstance(inputs, list):
            inputs = container.List(inputs)
        else:
            # Inputs is not a container, and cannot be converted to a container.
            # Nothing to do, since cannot store the computed metadata.
            return CallResult(inputs)

    # calling the utility to detect integer and float datatype columns
    # inputs = dtype_detector.detector(inputs)

    # calling the utility to categorical datatype columns
    metadata = self._produce(inputs, inputs.metadata, [])
    inputs.metadata = metadata

    # All detectors below run on a NaN-free sample of at most 100 rows.
    if inputs.shape[0] > 100:
        self._sample_df = inputs.dropna().iloc[0:100, :]
    else:
        self._sample_df = inputs

    # calling date detector
    self._DateFeaturizer = DateFeaturizerOrg(inputs)
    try:
        cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
    except Exception:
        # BUG FIX: the original called traceback.print_exc(e), which passes the
        # exception as print_exc's `limit` argument (itself an error) and then
        # logged its None return value. Log the formatted traceback instead.
        _logger.error(traceback.format_exc())
        cols = list()
    if cols:
        indices = [
            inputs.columns.get_loc(c) for c in cols if c in inputs.columns
        ]
        for i in indices:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            # Only tag columns that already carry at least one semantic type.
            if len(list(old_metadata["semantic_types"])) >= 1:
                inputs.metadata = self._tag_column(
                    inputs, i,
                    ('https://metadata.datadrivendiscovery.org/types/Time',),
                    "Date")

    # calling the PhoneParser detector
    try:
        PhoneParser_indices = PhoneParser.detect(df=self._sample_df)
    except Exception:
        _logger.error(traceback.format_exc())
        PhoneParser_indices = dict()
    if PhoneParser_indices.get("columns_to_perform"):
        for i in PhoneParser_indices["columns_to_perform"]:
            inputs.metadata = self._tag_column(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/isAmericanPhoneNumber',),
                "Phone")

    # calling the PunctuationSplitter detector
    try:
        PunctuationSplitter_indices = PunctuationParser.detect(
            df=self._sample_df,
            max_avg_length=self.hyperparams['split_on_column_with_avg_len'])
    except Exception:
        _logger.error(traceback.format_exc())
        PunctuationSplitter_indices = dict()
    if PunctuationSplitter_indices.get("columns_to_perform"):
        for i in PunctuationSplitter_indices["columns_to_perform"]:
            inputs.metadata = self._tag_column(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation',),
                "Punctuation")

    # calling the NumAlphaSplitter detector
    try:
        NumAlphaSplitter_indices = NumAlphaParser.detect(
            df=self._sample_df,
            max_avg_length=self.hyperparams['split_on_column_with_avg_len'])
    except Exception:
        _logger.error(traceback.format_exc())
        NumAlphaSplitter_indices = dict()
    if NumAlphaSplitter_indices.get("columns_to_perform"):
        for i in NumAlphaSplitter_indices["columns_to_perform"]:
            inputs.metadata = self._tag_column(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens',),
                "NumAlpha")

    inputs = self._relabel_categorical(inputs)
    return CallResult(inputs)

def _tag_column(self, inputs, column_index, semantic_types, detector_name):
    """Add each missing semantic type to one column's metadata, log the
    change, and return the updated metadata object (inputs is not mutated).

    Extracted from the four near-identical detector branches in produce().

    :param inputs: Container whose metadata is being annotated.
    :param column_index: Index of the column to tag.
    :param semantic_types: Tuple of semantic-type URIs to ensure are present.
    :param detector_name: Label used in the log message (e.g. "Date").
    :return: New metadata object with the column's entry updated.
    """
    old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, column_index)))
    for semantic_type in semantic_types:
        if semantic_type not in old_metadata.get("semantic_types", []):
            old_metadata["semantic_types"] += (semantic_type,)
    _logger.info(
        detector_name + " detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
        {
            'column_index': column_index,
            # Re-query so the log shows the pre-update state.
            'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, column_index))),
            'new_metadata': old_metadata,
        },
    )
    return inputs.metadata.update((mbase.ALL_ELEMENTS, column_index), old_metadata)
def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
    """
    Generate features for the input by running a series of column detectors
    (date, phone number, punctuation-splittable, numeric/alpha-splittable)
    and recording what they find as column semantic/structural types.

    Input: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    Output: typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.matrix, container.List]
    """
    # Wrap as container, if needed
    if not pytypes.is_of_type(inputs, types.Container):
        if isinstance(inputs, pd.DataFrame):
            inputs = container.DataFrame(inputs)
        elif isinstance(inputs, np.matrix):
            inputs = container.matrix(inputs)
        elif isinstance(inputs, np.ndarray):
            inputs = container.ndarray(inputs)
        elif isinstance(inputs, list):
            inputs = container.List(inputs)
        else:
            # Inputs is not a container, and cannot be converted to a container.
            # Nothing to do, since cannot store the computed metadata.
            return CallResult(inputs)

    # calling the utility to detect integer and float datatype columns
    inputs = dtype_detector.detector(inputs)

    # calling date detector
    self._DateFeaturizer = DateFeaturizerOrg(inputs)

    # All detectors below run on a NaN-free sample of at most 50 rows.
    if inputs.shape[0] > 50:
        self._sample_df = inputs.dropna().iloc[0:50, :]
    else:
        self._sample_df = inputs

    cols = self._DateFeaturizer.detect_date_columns(self._sample_df)
    if cols:
        indices = [
            inputs.columns.get_loc(c) for c in cols if c in inputs.columns
        ]
        for i in indices:
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, i)))
            # Only tag columns that already carry at least one semantic type.
            if len(list(old_metadata["semantic_types"])) >= 1:
                inputs.metadata = self._apply_semantic_tag(
                    inputs, i,
                    ('https://metadata.datadrivendiscovery.org/types/CategoricalData',
                     'https://metadata.datadrivendiscovery.org/types/Time'),
                    "Date")

    # calling the utility to categorical datatype columns
    metadata = self._produce(inputs, inputs.metadata, [])
    inputs.metadata = metadata

    # calling the PhoneParser detector
    self._PhoneParser = PhoneParser(self._sample_df)
    PhoneParser_indices = self._PhoneParser.detect()
    if PhoneParser_indices:
        for i in PhoneParser_indices:
            inputs.metadata = self._apply_semantic_tag(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber',
                 'https://metadata.datadrivendiscovery.org/types/UnnormalizedEntity'),
                "Phone")

    # calling the PunctuationSplitter detector
    self._PunctuationSplitter = PunctuationParser(self._sample_df)
    PunctuationSplitter_indices = self._PunctuationSplitter.detect()
    if PunctuationSplitter_indices:
        for i in PunctuationSplitter_indices:
            inputs.metadata = self._apply_semantic_tag(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/CanBeSplitByPunctuation',),
                "Punctuation")

    # calling the NumAlphaSplitter detector
    self._NumAlphaSplitter = NumAlphaParser(self._sample_df)
    NumAlphaSplitter_indices = self._NumAlphaSplitter.detect()
    if NumAlphaSplitter_indices:
        for i in NumAlphaSplitter_indices:
            inputs.metadata = self._apply_semantic_tag(
                inputs, i,
                ('https://metadata.datadrivendiscovery.org/types/CanBeSplitByAlphanumeric',),
                "NumAlpha")

    return CallResult(inputs)

def _apply_semantic_tag(self, inputs, column_index, semantic_types, detector_name):
    """Add each missing semantic type to one column's metadata, record the
    column's structural type from a sampled value, log the change, and return
    the updated metadata object (inputs is not mutated).

    Extracted from the four near-identical detector branches in produce().

    :param inputs: Container whose metadata is being annotated.
    :param column_index: Index of the column to tag.
    :param semantic_types: Tuple of semantic-type URIs to ensure are present.
    :param detector_name: Label used in the log message (e.g. "Date").
    :return: New metadata object with the column's entry updated.
    """
    old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, column_index)))
    for semantic_type in semantic_types:
        if semantic_type not in old_metadata["semantic_types"]:
            old_metadata["semantic_types"] += (semantic_type,)
    # Infer the structural type from the first sampled value; the original
    # spelled these as type("str") / type(10) / type(10.2), which are just
    # obfuscated ways of writing str / int / float.
    sample_value = self._sample_df.iloc[:, column_index].head(1).values[0]
    if isinstance(sample_value, str):
        old_metadata["structural_type"] = str
    elif isinstance(sample_value, int):
        old_metadata["structural_type"] = int
    else:
        old_metadata["structural_type"] = float
    _logger.info(
        detector_name + " detector. 'column_index': '%(column_index)d', 'old_metadata': '%(old_metadata)s', 'new_metadata': '%(new_metadata)s'",
        {
            'column_index': column_index,
            # Re-query so the log shows the pre-update state.
            'old_metadata': dict(inputs.metadata.query((mbase.ALL_ELEMENTS, column_index))),
            'new_metadata': old_metadata,
        },
    )
    return inputs.metadata.update((mbase.ALL_ELEMENTS, column_index), old_metadata)