def _process_selection(selection, original_data):
    """Initialize Woodwork on the result of an indexing operation, when applicable.

    Args:
        selection: The object produced by indexing into ``original_data``. May be
            a Series, a DataFrame, or any other value (for example a scalar).
        original_data: The Woodwork-initialized Series or DataFrame that was indexed.

    Returns:
        ``selection``. Series and DataFrame selections have Woodwork initialized
        from the schema of ``original_data``, except for a Series whose index labels
        equal the columns of an original DataFrame, which is returned as-is.
    """
    if _is_series(selection):
        labels = selection.index.values
        if _is_dask_series(selection):
            # Dask index values are a delayed object - can't compare below without computing
            labels = labels.compute()

        if _is_dataframe(original_data):
            if set(labels) == set(original_data.columns):
                # Selecting a single row from a DataFrame, returned as Series
                # without Woodwork initialized
                schema = None
            else:
                # Selecting a single column from a DataFrame
                schema = original_data.ww.schema.columns[selection.name]
        else:
            # Selecting a new Series from an existing Series
            schema = original_data.ww._schema

        if schema:
            # Deep copy so the selection does not share schema state with the source
            selection.ww.init(schema=copy.deepcopy(schema), validate=False)
    elif _is_dataframe(selection):
        # Selecting a new DataFrame from an existing DataFrame
        kept_columns = list(selection.columns)
        subset_schema = original_data.ww.schema.get_subset_schema(kept_columns)
        selection.ww.init_with_full_schema(schema=subset_schema, validate=False)

    # Selecting a single value or return selection from above
    return selection
def test_iloc_column(sample_series):
    """Check that slicing a Series through ``ww.iloc`` keeps its typing information."""
    if _is_dask_series(sample_series):
        pytest.xfail("iloc is not supported with Dask inputs")

    expected_ltype = Categorical
    tags = ["tag1", "tag2"]
    column_description = "custom column description"
    column_origin = "base"
    column_metadata = {"meta_key": "custom metadata"}

    typed = sample_series.copy()
    typed.ww.init(
        logical_type=expected_ltype,
        semantic_tags=tags,
        description=column_description,
        origin=column_origin,
        metadata=column_metadata,
    )

    # A slice taken through the accessor should expose the same schema attributes.
    tail = typed.ww.iloc[2:]
    assert tail.name == "sample_series"
    assert isinstance(tail.ww.logical_type, expected_ltype)
    assert tail.ww.semantic_tags == {"category", "tag1", "tag2"}
    assert tail.ww.description == column_description
    assert tail.ww.origin == column_origin
    assert tail.ww.metadata == column_metadata
    pd.testing.assert_series_equal(to_pandas(tail), to_pandas(typed.iloc[2:]))

    # A single position yields the underlying value.
    assert typed.ww.iloc[0] == "a"

    # Without standard tags, the full slice should carry no semantic tags.
    untagged = sample_series.copy()
    untagged.ww.init(use_standard_tags=False)
    everything = untagged.ww.iloc[:]
    assert everything.name
    assert isinstance(everything.ww.logical_type, expected_ltype)
    assert everything.ww.semantic_tags == set()
def _is_numeric_series(series, logical_type):
    """Decide whether a series counts as numeric when checking if it can be a time_index.

    Args:
        series: The pandas, Dask or Spark series to check.
        logical_type: A logical type, its string name, or None to infer the
            logical type from ``series``.

    Returns:
        bool: False when the data cannot be converted with ``pd.to_numeric``.
        Otherwise True for a Datetime logical type on a numeric dtype, or when the
        (given or inferred) logical type has "numeric" among its standard tags.
    """
    if _is_spark_series(series):
        series = series.to_pandas()
    if _is_dask_series(series):
        # Only the first partition is computed for the checks below
        series = series.get_partition(0).compute()

    # If column can't be made to be numeric, don't bother checking Logical Type
    try:
        pd.to_numeric(series, errors="raise")
    except (ValueError, TypeError):
        return False

    if logical_type is None:
        inferred = ww.type_system.infer_logical_type(series)
        return "numeric" in inferred.standard_tags

    if isinstance(logical_type, str):
        logical_type = ww.type_system.str_to_logical_type(logical_type)

    # Allow numeric columns to be interpreted as Datetimes - doesn't allow
    # strings even if they could be numeric
    is_datetime = _get_ltype_class(logical_type) == ww.logical_types.Datetime
    if is_datetime and pd.api.types.is_numeric_dtype(series):
        return True

    return "numeric" in logical_type.standard_tags
def test_loc_column(sample_series):
    """Check that selecting from a Series through ``ww.loc`` keeps its typing information."""
    expected_ltype = Categorical

    typed = sample_series.copy()
    typed.ww.init(logical_type=expected_ltype, semantic_tags=["tag1", "tag2"])

    tail = typed.ww.loc[2:]
    assert tail.name == "sample_series"
    assert isinstance(tail.ww.logical_type, expected_ltype)
    assert tail.ww.semantic_tags == {"category", "tag1", "tag2"}
    pd.testing.assert_series_equal(to_pandas(tail), to_pandas(typed.loc[2:]))

    first = typed.ww.loc[0]
    if _is_dask_series(typed):
        # Dask returns a series - convert to pandas to check the value
        first = first.compute()
        assert len(first) == 1
        first = first.loc[0]
    assert first == "a"

    # Without standard tags, the full selection should carry no semantic tags.
    untagged = sample_series.copy()
    untagged.ww.init(use_standard_tags=False)
    everything = untagged.ww.loc[:]
    assert everything.name
    assert isinstance(everything.ww.logical_type, expected_ltype)
    assert everything.ww.semantic_tags == set()
def test_iloc_column_does_not_propagate_changes_to_data(sample_series):
    """Check that an ``ww.iloc`` slice does not see later schema edits on its source."""
    if _is_dask_series(sample_series):
        pytest.xfail("iloc is not supported with Dask inputs")

    sample_series.ww.init(
        logical_type=Categorical,
        semantic_tags=["tag1", "tag2"],
        description="custom column description",
        origin="base",
        metadata={"meta_key": "custom metadata"},
        use_standard_tags=False,
    )
    snapshot = sample_series.ww.iloc[2:]

    # Tags added to the source afterwards must not show up on the slice.
    sample_series.ww.add_semantic_tags("new_tag")
    assert snapshot.ww.semantic_tags == {"tag1", "tag2"}
    assert snapshot.ww.semantic_tags is not sample_series.ww.semantic_tags

    # Same for metadata: the slice must hold its own dictionary.
    sample_series.ww.metadata["new_key"] = "new_value"
    assert snapshot.ww.metadata == {"meta_key": "custom metadata"}
    assert snapshot.ww.metadata is not sample_series.ww.metadata
def test_series_methods_on_accessor_other_returns(sample_series):
    """Check that non-Series results accessed through ``ww`` match the plain Series results.

    Covers ``shape``, ``name`` and ``nunique()``.
    """
    sample_series.ww.init()

    col_shape = sample_series.ww.shape
    series_shape = sample_series.shape
    if _is_dask_series(sample_series):
        # The first shape entry must be computed for Dask. Keep BOTH shapes as
        # 1-tuples so the equality checks below compare like with like.
        col_shape = (col_shape[0].compute(),)
        series_shape = (series_shape[0].compute(),)
    assert col_shape == (len(sample_series),)
    assert col_shape == series_shape

    assert sample_series.name == sample_series.ww.name

    series_nunique = sample_series.nunique()
    ww_nunique = sample_series.ww.nunique()
    if _is_dask_series(sample_series):
        series_nunique = series_nunique.compute()
        ww_nunique = ww_nunique.compute()
    assert series_nunique == ww_nunique
def test_ordinal_with_incomplete_ranking(sample_series):
    """Check that an Ordinal order missing a value present in the data is rejected."""
    unsupported_backend = any(
        is_backend(sample_series) for is_backend in (_is_spark_series, _is_dask_series)
    )
    if unsupported_backend:
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )

    incomplete_ordinal = Ordinal(order=["a", "b"])
    # Escaped because the message contains regex metacharacters (brackets).
    expected_error = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']"
    )
    with pytest.raises(ValueError, match=expected_error):
        sample_series.ww.init(logical_type=incomplete_ordinal)
def test_series_methods_on_accessor_inplace(sample_series):
    """Check that an in-place Series method (``pop``) works through the ``ww`` accessor."""
    # TODO: Try to find a supported inplace method for Dask, if one exists
    if _is_dask_series(sample_series):
        pytest.xfail("Dask does not support pop.")

    # Untouched copy used as the reference for schema and length.
    reference = sample_series.copy()
    sample_series.ww.init()
    reference.ww.init()

    popped = sample_series.ww.pop(0)

    assert sample_series.ww._schema == reference.ww._schema
    assert len(sample_series) == len(reference) - 1
    assert popped == "a"
def test_locIndexer_class(sample_df):
    """Check that ``_locIndexer`` stores its data and indexes like ``DataFrame.loc``."""
    sample_df.ww.init()
    indexer = _locIndexer(sample_df)

    stored = to_pandas(indexer.data)
    pd.testing.assert_frame_equal(stored, to_pandas(sample_df))

    row_slice = to_pandas(indexer[1:2])
    pd.testing.assert_frame_equal(row_slice, to_pandas(sample_df.loc[1:2]))

    cell = indexer[0, "id"]
    if _is_dask_series(cell):
        # Dask returns a series - convert to pandas to check the value
        cell = cell.compute()
        assert len(cell) == 1
        cell = cell.loc[0]
    assert cell == 0
def test_latlong_formatting_with_init_series(latlongs):
    """Check that ``init_series`` gives each LatLong input the same data and schema."""
    # Build the expected series in the same backend as the inputs.
    expected = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    first_input = latlongs[0]
    if _is_dask_series(first_input):
        expected = dd.from_pandas(expected, npartitions=2)
    elif _is_spark_series(first_input):
        expected = ps.Series([[1.0, 2.0], [3.0, 4.0]])
    expected.ww.init(logical_type=LatLong)

    for raw in latlongs:
        initialized = init_series(raw, logical_type=LatLong)
        assert isinstance(initialized.ww.logical_type, LatLong)
        pd.testing.assert_series_equal(to_pandas(initialized), to_pandas(expected))
        assert expected.ww._schema == initialized.ww._schema
def test_set_logical_type_invalid_dtype_change(sample_series):
    """Check that ``set_logical_type`` raises when the data cannot be converted."""
    xfail_reasons = (
        (
            _is_dask_series,
            "Dask type conversion with astype does not fail until compute is called",
        ),
        (
            _is_spark_series,
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.",
        ),
    )
    for is_backend, reason in xfail_reasons:
        if is_backend(sample_series):
            pytest.xfail(reason)

    sample_series.ww.init(logical_type="Categorical")

    expected_error = (
        "Error converting datatype for sample_series from type category to "
        "type int64. Please confirm the underlying data is consistent with logical type Integer."
    )
    with pytest.raises(TypeConversionError, match=expected_error):
        sample_series.ww.set_logical_type("Integer")
def test_init_series_error_on_invalid_conversion(sample_series):
    """Check that ``init_series`` raises when the data cannot be converted."""
    xfail_reasons = (
        (
            _is_dask_series,
            "Dask type conversion with astype does not fail until compute is called",
        ),
        (
            _is_spark_series,
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.",
        ),
    )
    for is_backend, reason in xfail_reasons:
        if is_backend(sample_series):
            pytest.xfail(reason)

    expected_error = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=expected_error):
        init_series(sample_series, logical_type="integer_nullable")
def test_ordinal_with_order(sample_series):
    """Check that an Ordinal with a complete order works via ``init`` and ``set_logical_type``."""
    unsupported_backend = any(
        is_backend(sample_series) for is_backend in (_is_spark_series, _is_dask_series)
    )
    if unsupported_backend:
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not compatible"
        )

    ordinal_ltype = Ordinal(order=["a", "b", "c"])

    # Path 1: supply the Ordinal type directly at init time.
    initialized = sample_series.copy()
    initialized.ww.init(logical_type=ordinal_ltype)
    assert isinstance(initialized.ww.logical_type, Ordinal)
    assert initialized.ww.logical_type.order == ["a", "b", "c"]

    # Path 2: start as Categorical, then switch to the Ordinal type.
    categorical = sample_series.copy()
    categorical.ww.init(logical_type="Categorical")
    converted = categorical.ww.set_logical_type(ordinal_ltype)
    assert isinstance(converted.ww.logical_type, Ordinal)
    assert converted.ww.logical_type.order == ["a", "b", "c"]
def __init__(self, data):
    """Store ``data`` on the indexer and reject Dask inputs.

    Args:
        data: The object to index into.

    Raises:
        TypeError: If ``data`` is a Dask DataFrame or a Dask Series.
    """
    self.data = data

    # Checked in order: DataFrame first, then Series.
    unsupported = (
        (_is_dask_dataframe, "iloc is not supported for Dask DataFrames"),
        (_is_dask_series, "iloc is not supported for Dask Series"),
    )
    for is_unsupported, message in unsupported:
        if is_unsupported(data):
            raise TypeError(message)
def infer_logical_type(self, series):
    """Infer the logical type for the given series

    Args:
        series (pandas.Series, or a Dask or Spark series): The series for which
            to infer the LogicalType. Only the first ``INFERENCE_SAMPLE_SIZE``
            rows are used for inference.

    Returns:
        An instance of the inferred logical type. ``Unknown()`` is returned when
        the series (pandas) or its sampled head (Dask/Spark) has no valid values.

    Raises:
        ValueError: If ``series`` is not a pandas, Dask or Spark series.
    """
    if isinstance(series, pd.Series):
        # Special case for series with no valid values
        if series.count() == 0:
            return Unknown()
        series = series.head(INFERENCE_SAMPLE_SIZE)
    else:
        if _is_dask_series(series):
            series = series.head(INFERENCE_SAMPLE_SIZE)
        elif _is_spark_series(series):
            series = series.head(INFERENCE_SAMPLE_SIZE).to_pandas()
        else:
            raise ValueError(
                f"Unsupported series type `{type(series)}`"
            )  # pragma: no cover

        # For dask or spark collections, unknown type special case comes
        # *after* head calls to avoid evaluating a potentially large
        # dataset
        if series.count() == 0:
            return Unknown()

    # Don't include NaturalLanguage as we only want to check that if
    # no other matches are found
    types_to_check = [
        ltype for ltype in self.root_types if ltype != NaturalLanguage
    ]

    # Walk the type tree one level at a time. A type's children are visited when
    # the type matches, or when it has no inference function at all. The latter
    # ensures children of types that aren't inferred by default (such as
    # NaturalLanguage) still get evaluated.
    type_matches = []
    while types_to_check:
        check_next = []
        for candidate in types_to_check:
            inference_func = self.inference_functions.get(candidate)
            if inference_func and inference_func(series):
                type_matches.append(candidate)
                check_next.extend(self._get_children(candidate))
            elif not inference_func:
                check_next.extend(self._get_children(candidate))
        types_to_check = check_next

    if not type_matches:
        # Check if this is NaturalLanguage, otherwise set
        # type to default type (Unknown). Assume that a column
        # can only be natural language if it is not already a
        # match for another type. Also improves performance by
        # limiting the times the natural language inference function
        # is called.
        natural_language_func = self.inference_functions.get(NaturalLanguage)
        if natural_language_func and natural_language_func(series):
            logical_type = NaturalLanguage
        else:
            logical_type = self.default_type
    elif len(type_matches) == 1:
        # If we match only one type, return it
        logical_type = type_matches[0]
    else:
        # If multiple matches, get the most specific one. If multiple
        # matches have the same level of specificity, the first
        # match found at that level will be returned
        best_match = type_matches[0]
        best_depth = self._get_depth(best_match)
        for candidate in type_matches[1:]:
            candidate_depth = self._get_depth(candidate)
            if candidate_depth > best_depth:
                best_match = candidate
                best_depth = candidate_depth
        logical_type = best_match

    return logical_type()
def test_is_spark_series(sample_series_spark):
    """Check that ``_is_spark_series`` accepts a Spark series and rejects a pandas one."""
    assert _is_spark_series(sample_series_spark)
    # The negative case must exercise the Spark helper too, not the Dask one.
    assert not _is_spark_series(pd.Series())