class Numeric(BaseIntent):
    """Intent for columns whose values are numbers."""

    # {metric: weight} pairs; the weights add up to 1.0.
    confidence_computation = {
        MetricWrapper(num_valid): 0.3,
        MetricWrapper(unique_heur, invert=True): 0.2,
        MetricWrapper(is_numeric): 0.4,
        MetricWrapper(is_string, invert=True): 0.1,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (unused)
            y: The response variable (unused)
            **fit_params: Additional parameters for the fit (unused)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Coerce the input data to numeric values.

        Args:
            X: The input data
            y: The response variable (unused)

        Returns:
            The result of applying ``pd.to_numeric`` over X with
            ``errors="coerce"``.

        """
        coerced = X.apply(pd.to_numeric, errors="coerce")
        return coerced

    @classmethod
    def column_summary(cls, df):  # noqa
        """Extend the standard column summary with numeric statistics.

        Args:
            df: The data to summarize; the numeric statistics are computed
                from its first column only.

        Returns:
            The ``standard_col_summary`` result, updated with the invalid
            percentage, mean, std, min, quartiles, max and the output of
            ``get_outliers`` called with ``count=5``.

        """
        summary = standard_col_summary(df)
        numeric = pd.to_numeric(df.iloc[:, 0], errors="coerce")

        # Null share after coercion, minus the NaN share already reported by
        # the standard summary.
        null_pct_after_coercion = (
            numeric.isnull().sum() * 100.0 / summary["count"]
        )
        invalid_pct = null_pct_after_coercion - summary["nan_percent"]

        outliers = get_outliers(numeric, count=5).values.tolist()

        summary.update(
            {
                "invalid_percent": invalid_pct,
                "mean": float(numeric.mean()),
                "std": float(numeric.std()),
                "min": float(numeric.min()),
                "25%": float(numeric.quantile(0.25)),
                "50%": float(numeric.quantile(0.5)),
                "75%": float(numeric.quantile(0.75)),
                "max": float(numeric.max()),
                "5_outliers": outliers,
            }
        )
        return summary
def test_metric_invert(retval):
    """Check that an inverted metric yields one minus the wrapped value.

    Args:
        retval: value returned by the wrapped metric function

    """
    from foreshadow.metrics import MetricWrapper

    def test(X):
        # Ignores its input and always reports the parametrized value.
        return retval

    wrapper = MetricWrapper(test, 0, invert=True)

    expected = 1 - retval
    actual = wrapper.calculate([1, 2, 3])
    assert expected == actual
def test_metric_default_return():
    """Check that calculate returns 0 when the wrapped function raises.

    The wrapper is built with 0 as its second positional argument
    (presumably the default return value -- confirm against MetricWrapper).
    """
    from foreshadow.metrics import MetricWrapper

    def test(X):
        # Always fails so the wrapper has to fall back.
        raise Exception

    wrapper = MetricWrapper(test, 0)
    outcome = wrapper.calculate([1, 2, 3])
    assert 0 == outcome
def test_metric_last_call(metric_fn, arg, kwargs):
    """Check that last_call() returns 1 after a single calculate() call.

    Args:
        metric_fn: arbitrary metric function
        arg: positional argument passed to calculate
        kwargs: dict of keyword arguments passed to calculate

    """
    from foreshadow.metrics import MetricWrapper

    wrapper = MetricWrapper(metric_fn)

    # The computed value itself is not of interest here.
    wrapper.calculate(arg, **kwargs)

    recorded = wrapper.last_call()
    assert recorded == 1
def __init__(self):
    """Construct with ``drop_transform`` and a single regex-match metric.

    The parent constructor receives ``drop_transform`` as the only
    transformation and a confidence mapping that gives
    ``calculate_percentage_of_rows_matching_regex`` a weight of 1.
    """
    steps = [drop_transform]
    weights = {MetricWrapper(calculate_percentage_of_rows_matching_regex): 1}
    super().__init__(steps, confidence_computation=weights)
def __init__(
    self,
    transformations,
    output_columns=None,
    confidence_computation=None,
    default=return_original_row,
):
    """Construct any cleaner/flattener.

    Args:
        transformations: The transformation(s) to use; stored as given.
            Described upstream as a callable that takes a string and
            returns a tuple with the length of the transformed characters
            and then the transformed string.
        output_columns: None, an int, or a list. If None, any lists
            returned by the transformations are assumed to be separate
            columns in the new DataFrame. Otherwise, pass the names for
            each desired output column to be used.
        confidence_computation: The dict of {metric: weight} for the
            subclass's metric computation. This implies an OVR model.
            When None, the regex row-match metric (0.8) and
            ``avg_col_regex`` (0.2) are used.
        default: Function that returns the default value for a row if
            the transformation failed. Accepts the row as input.

    Raises:
        ValueError: If not a list, int, or None specifying expected
            output columns.

    """
    output_columns_ok = output_columns is None or isinstance(
        output_columns, (int, list)
    )
    if not output_columns_ok:
        raise ValueError("output columns not a valid type")

    self.default = default
    self.output_columns = output_columns
    self.transformations = transformations

    # The fallback weights are always constructed; they are only kept when
    # the caller did not supply a mapping of their own.
    fallback_weights = {
        MetricWrapper(calculate_percentage_of_rows_matching_regex): 0.8,
        MetricWrapper(avg_col_regex): 0.2,
    }
    self.confidence_computation = (
        fallback_weights
        if confidence_computation is None
        else confidence_computation
    )
class Text(BaseIntent):
    """Intent for columns whose values are text."""

    # {metric: weight} pairs; five equally weighted metrics adding up to 1.0.
    confidence_computation = {
        MetricWrapper(num_valid): 0.2,
        MetricWrapper(unique_heur): 0.2,
        MetricWrapper(is_numeric, invert=True): 0.2,
        MetricWrapper(is_string): 0.2,
        MetricWrapper(has_long_text): 0.2,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (unused)
            y: The response variable (unused)
            **fit_params: Additional parameters for the fit (unused)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Cast the input data to strings.

        Args:
            X: The input data
            y: The response variable (unused)

        Returns:
            The result of ``X.astype(str)``.

        """
        as_text = X.astype(str)
        return as_text

    @classmethod
    def column_summary(cls, df):  # noqa
        """Return the standard column summary for the data.

        Args:
            df: The data to summarize

        Returns:
            Whatever ``standard_col_summary`` produces for ``df``.

        """
        summary = standard_col_summary(df)
        return summary
def test_metric_print(fn, regex):
    """Check that a metric's string-producing method mentions useful info.

    Args:
        fn: name of the MetricWrapper method to call; it must return a
            string
        regex: pattern that has to be found in the returned string

    """
    from foreshadow.metrics import MetricWrapper

    wrapper = MetricWrapper(lambda x: 1)

    # Look the method up by name, call it, then search its output.
    describe = getattr(wrapper, fn)
    text = describe()
    assert re.search(regex, text)
class Categorical(BaseIntent):
    """Intent for columns whose values are categories."""

    # {metric: weight} pairs; the weights add up to 1.0.
    confidence_computation = {
        MetricWrapper(num_valid): 0.25,
        MetricWrapper(unique_heur): 0.65,
        MetricWrapper(is_numeric, invert=True): 0.1,
    }

    def fit(self, X, y=None, **fit_params):
        """Do nothing and hand back the instance.

        Args:
            X: The input data (unused)
            y: The response variable (unused)
            **fit_params: Additional parameters for the fit (unused)

        Returns:
            self

        """
        return self

    def transform(self, X, y=None):
        """Return the input data untouched.

        Args:
            X: The input data
            y: The response variable (unused)

        Returns:
            The same object that was passed in as X.

        """
        return X

    @classmethod
    def column_summary(cls, df):  # noqa
        """Return the standard column summary for the data.

        Args:
            df: The data to summarize

        Returns:
            Whatever ``standard_col_summary`` produces for ``df``.

        """
        summary = standard_col_summary(df)
        return summary