Example 1
def _process_selection(selection, original_data):
    if _is_series(selection):
        if _is_dask_series(selection):
            # Dask index values are delayed objects - can't compare below without computing
            index_vals = selection.index.values.compute()
        else:
            index_vals = selection.index.values
        if _is_dataframe(original_data) and set(index_vals) == set(
                original_data.columns):
            # Selecting a single row from a DataFrame, returned as Series without Woodwork initialized
            schema = None
        elif _is_dataframe(original_data):
            # Selecting a single column from a DataFrame
            schema = original_data.ww.schema.columns[selection.name]
        else:
            # Selecting a new Series from an existing Series
            schema = original_data.ww._schema
        if schema:
            selection.ww.init(schema=copy.deepcopy(schema), validate=False)
    elif _is_dataframe(selection):
        # Selecting a new DataFrame from an existing DataFrame
        schema = original_data.ww.schema
        new_schema = schema.get_subset_schema(list(selection.columns))
        selection.ww.init_with_full_schema(schema=new_schema, validate=False)
    # Return the selection - either a single value or one of the cases handled above
    return selection
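
A minimal sketch of how this routing surfaces through the public accessor (hypothetical data; assumes woodwork and pandas are installed and that importing woodwork registers the .ww accessor):

import pandas as pd
import woodwork as ww  # noqa: F401 - importing registers the .ww accessor

df = pd.DataFrame({"id": [0, 1, 2], "letters": ["a", "b", "c"]})
df.ww.init(logical_types={"letters": "Categorical"})

# Single-column selection: the column's schema is deep-copied onto the Series
col = df.ww["letters"]
print(col.ww.logical_type)  # Categorical

# Multi-column selection: the frame receives a subset of the original schema
subset = df.ww[["id", "letters"]]
print(subset.ww.schema)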
Example 2
def test_iloc_column(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail("iloc is not supported with Dask inputs")
    series = sample_series.copy()
    logical_type = Categorical
    semantic_tags = ["tag1", "tag2"]
    description = "custom column description"
    origin = "base"
    metadata = {"meta_key": "custom metadata"}
    series.ww.init(
        logical_type=logical_type,
        semantic_tags=semantic_tags,
        description=description,
        origin=origin,
        metadata=metadata,
    )

    sliced = series.ww.iloc[2:]
    assert sliced.name == "sample_series"
    assert isinstance(sliced.ww.logical_type, logical_type)
    assert sliced.ww.semantic_tags == {"category", "tag1", "tag2"}
    assert sliced.ww.description == description
    assert sliced.ww.origin == origin
    assert sliced.ww.metadata == metadata
    pd.testing.assert_series_equal(to_pandas(sliced),
                                   to_pandas(series.iloc[2:]))

    assert series.ww.iloc[0] == "a"

    series = sample_series.copy()
    series.ww.init(use_standard_tags=False)
    sliced = series.ww.iloc[:]
    assert sliced.name
    assert isinstance(sliced.ww.logical_type, logical_type)
    assert sliced.ww.semantic_tags == set()
Example 3
def _is_numeric_series(series, logical_type):
    """Determines whether a series will be considered numeric
    for the purposes of determining if it can be a time_index."""
    if _is_spark_series(series):
        series = series.to_pandas()
    if _is_dask_series(series):
        series = series.get_partition(0).compute()

    # If column can't be made to be numeric, don't bother checking Logical Type
    try:
        pd.to_numeric(series, errors="raise")
    except (ValueError, TypeError):
        return False

    if logical_type is not None:
        if isinstance(logical_type, str):
            logical_type = ww.type_system.str_to_logical_type(logical_type)

        # Allow numeric columns to be interpreted as Datetimes - doesn't allow strings even if they could be numeric
        if (_get_ltype_class(logical_type) == ww.logical_types.Datetime
                and pd.api.types.is_numeric_dtype(series)):
            return True
    else:
        logical_type = ww.type_system.infer_logical_type(series)

    return "numeric" in logical_type.standard_tags
Example 4
def test_loc_column(sample_series):
    series = sample_series.copy()
    logical_type = Categorical
    semantic_tags = ["tag1", "tag2"]
    series.ww.init(logical_type=logical_type, semantic_tags=semantic_tags)

    sliced = series.ww.loc[2:]
    assert sliced.name == "sample_series"
    assert isinstance(sliced.ww.logical_type, logical_type)
    assert sliced.ww.semantic_tags == {"category", "tag1", "tag2"}
    pd.testing.assert_series_equal(to_pandas(sliced),
                                   to_pandas(series.loc[2:]))

    single_val = series.ww.loc[0]

    if _is_dask_series(series):
        # Dask returns a series - convert to pandas to check the value
        single_val = single_val.compute()
        assert len(single_val) == 1
        single_val = single_val.loc[0]
    assert single_val == "a"

    series = sample_series.copy()
    series.ww.init(use_standard_tags=False)
    sliced = series.ww.loc[:]
    assert sliced.name
    assert isinstance(sliced.ww.logical_type, logical_type)
    assert sliced.ww.semantic_tags == set()
Example 5
def test_iloc_column_does_not_propagate_changes_to_data(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail("iloc is not supported with Dask inputs")
    logical_type = Categorical
    semantic_tags = ["tag1", "tag2"]
    description = "custom column description"
    origin = "base"
    metadata = {"meta_key": "custom metadata"}
    sample_series.ww.init(
        logical_type=logical_type,
        semantic_tags=semantic_tags,
        description=description,
        origin=origin,
        metadata=metadata,
        use_standard_tags=False,
    )

    sliced = sample_series.ww.iloc[2:]
    sample_series.ww.add_semantic_tags("new_tag")
    assert sliced.ww.semantic_tags == {"tag1", "tag2"}
    assert sliced.ww.semantic_tags is not sample_series.ww.semantic_tags

    sample_series.ww.metadata["new_key"] = "new_value"
    assert sliced.ww.metadata == {"meta_key": "custom metadata"}
    assert sliced.ww.metadata is not sample_series.ww.metadata
Example 6
def test_series_methods_on_accessor_other_returns(sample_series):
    sample_series.ww.init()
    col_shape = sample_series.ww.shape
    series_shape = sample_series.shape
    if _is_dask_series(sample_series):
        col_shape = (col_shape[0].compute(), )
        series_shape = series_shape[0].compute()
    assert col_shape == (len(sample_series), )
    assert col_shape == series_shape

    assert sample_series.name == sample_series.ww.name
    series_nunique = sample_series.nunique()
    ww_nunique = sample_series.ww.nunique()
    if _is_dask_series(sample_series):
        series_nunique = series_nunique.compute()
        ww_nunique = ww_nunique.compute()
    assert series_nunique == ww_nunique
Example 7
def test_ordinal_with_incomplete_ranking(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not supported"
        )

    ordinal_incomplete_order = Ordinal(order=["a", "b"])
    error_msg = re.escape(
        "Ordinal column sample_series contains values that are not "
        "present in the order values provided: ['c']")
    with pytest.raises(ValueError, match=error_msg):
        sample_series.ww.init(logical_type=ordinal_incomplete_order)
Example 8
def test_series_methods_on_accessor_inplace(sample_series):
    # TODO: Try to find a supported inplace method for Dask, if one exists
    if _is_dask_series(sample_series):
        pytest.xfail("Dask does not support pop.")
    comparison_series = sample_series.copy()

    sample_series.ww.init()
    comparison_series.ww.init()

    val = sample_series.ww.pop(0)
    assert sample_series.ww._schema == comparison_series.ww._schema
    assert len(sample_series) == len(comparison_series) - 1
    assert val == "a"
Example 9
def test_locIndexer_class(sample_df):
    sample_df.ww.init()
    ind = _locIndexer(sample_df)
    pd.testing.assert_frame_equal(to_pandas(ind.data), to_pandas(sample_df))
    pd.testing.assert_frame_equal(to_pandas(ind[1:2]),
                                  to_pandas(sample_df.loc[1:2]))
    single_val = ind[0, "id"]
    if _is_dask_series(single_val):
        # Dask returns a series - convert to pandas to check the value
        single_val = single_val.compute()
        assert len(single_val) == 1
        single_val = single_val.loc[0]
    assert single_val == 0
Example 10
def test_latlong_formatting_with_init_series(latlongs):
    expected_series = pd.Series([(1.0, 2.0), (3.0, 4.0)])
    if _is_dask_series(latlongs[0]):
        expected_series = dd.from_pandas(expected_series, npartitions=2)
    elif _is_spark_series(latlongs[0]):
        expected_series = ps.Series([[1.0, 2.0], [3.0, 4.0]])

    expected_series.ww.init(logical_type=LatLong)
    for series in latlongs:
        new_series = init_series(series, logical_type=LatLong)
        assert isinstance(new_series.ww.logical_type, LatLong)
        pd.testing.assert_series_equal(to_pandas(new_series),
                                       to_pandas(expected_series))
        assert expected_series.ww._schema == new_series.ww._schema
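
For reference, a minimal pandas-only sketch of the normalization this test relies on (assuming mixed tuple/list input is normalized to tuples for pandas, as expected_series above suggests):

import pandas as pd
from woodwork import init_series
from woodwork.logical_types import LatLong

raw = pd.Series([(1.0, 2.0), [3.0, 4.0]])
formatted = init_series(raw, logical_type=LatLong)
print(formatted.tolist())  # [(1.0, 2.0), (3.0, 4.0)]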
Example 11
def test_set_logical_type_invalid_dtype_change(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.")
    sample_series.ww.init(logical_type="Categorical")
    error_message = (
        "Error converting datatype for sample_series from type category to "
        "type int64. Please confirm the underlying data is consistent with logical type Integer."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        sample_series.ww.set_logical_type("Integer")
Example 12
def test_init_series_error_on_invalid_conversion(sample_series):
    if _is_dask_series(sample_series):
        pytest.xfail(
            "Dask type conversion with astype does not fail until compute is called"
        )
    if _is_spark_series(sample_series):
        pytest.xfail(
            "Spark allows this conversion, filling values it cannot convert with NaN "
            "and converting dtype to float.")

    error_message = (
        "Error converting datatype for sample_series from type category to type Int64. "
        "Please confirm the underlying data is consistent with logical type IntegerNullable."
    )
    with pytest.raises(TypeConversionError, match=error_message):
        init_series(sample_series, logical_type="integer_nullable")
Example 13
def test_ordinal_with_order(sample_series):
    if _is_spark_series(sample_series) or _is_dask_series(sample_series):
        pytest.xfail(
            "Fails with Dask and Spark - ordinal data validation not compatible"
        )

    series = sample_series.copy()
    ordinal_with_order = Ordinal(order=["a", "b", "c"])
    series.ww.init(logical_type=ordinal_with_order)
    assert isinstance(series.ww.logical_type, Ordinal)
    assert series.ww.logical_type.order == ["a", "b", "c"]

    series = sample_series.copy()
    series.ww.init(logical_type="Categorical")
    new_series = series.ww.set_logical_type(ordinal_with_order)
    assert isinstance(new_series.ww.logical_type, Ordinal)
    assert new_series.ww.logical_type.order == ["a", "b", "c"]
Example 14
    def __init__(self, data):
        self.data = data
        if _is_dask_dataframe(data):
            raise TypeError("iloc is not supported for Dask DataFrames")
        elif _is_dask_series(data):
            raise TypeError("iloc is not supported for Dask Series")
Example 15
    def infer_logical_type(self, series):
        """Infer the logical type for the given series

        Args:
            series (pandas.Series): The series for which to infer the LogicalType.
        """
        if isinstance(series, pd.Series):
            # Special case for series with no valid values
            if series.count() == 0:
                return Unknown()

            series = series.head(INFERENCE_SAMPLE_SIZE)
        else:
            if _is_dask_series(series):
                series = series.head(INFERENCE_SAMPLE_SIZE)
            elif _is_spark_series(series):
                series = series.head(INFERENCE_SAMPLE_SIZE).to_pandas()
            else:
                raise ValueError(f"Unsupported series type `{type(series)}`"
                                 )  # pragma: no cover

            # For dask or spark collections, unknown type special case comes
            # *after* head calls to avoid evaluating a potentially large
            # dataset
            if series.count() == 0:
                return Unknown()

        def get_inference_matches(types_to_check, series, type_matches=None):
            # Avoid a shared mutable default; recursive calls below pass
            # the accumulator explicitly
            if type_matches is None:
                type_matches = []
            # Since NaturalLanguage isn't inferred by default, make sure to check
            # any children of NaturalLanguage, otherwise they never get evaluated
            check_next = []
            for logical_type in types_to_check:
                inference_func = self.inference_functions.get(logical_type)
                if inference_func and inference_func(series):
                    type_matches.append(logical_type)
                    check_next.extend(self._get_children(logical_type))
                elif not inference_func:
                    check_next.extend(self._get_children(logical_type))
            if len(check_next) > 0:
                get_inference_matches(check_next, series, type_matches)
            return type_matches

        # Don't include NaturalLanguage as we only want to check that if
        # no other matches are found
        types_to_check = [
            ltype for ltype in self.root_types if ltype != NaturalLanguage
        ]
        type_matches = get_inference_matches(types_to_check, series)

        if len(type_matches) == 0:
            # Check if this is NaturalLanguage, otherwise set
            # type to default type (Unknown). Assume that a column
            # can only be natural language if it is not already a
            # match for another type. Also improves performance by
            # limiting the times the natural language inference function
            # is called.
            natural_language_func = self.inference_functions.get(NaturalLanguage)
            if natural_language_func and natural_language_func(series):
                logical_type = NaturalLanguage
            else:
                logical_type = self.default_type
        elif len(type_matches) == 1:
            # If we match only one type, return it
            logical_type = type_matches[0]
        else:
            # If multiple matches, get the most specific one. If multiple
            # matches have the same level of specificity, the first
            # match found at that level will be returned
            best_match = type_matches[0]
            best_depth = self._get_depth(best_match)
            for logical_type in type_matches[1:]:
                ltype_depth = self._get_depth(logical_type)
                if ltype_depth > best_depth:
                    best_match = logical_type
                    best_depth = ltype_depth
            logical_type = best_match

        return logical_type()
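
A minimal sketch of the entry point (assumes the default type system; the exact inferred types can vary across woodwork versions):

import pandas as pd
import woodwork as ww

# The most specific match among the registered inference functions wins
print(ww.type_system.infer_logical_type(pd.Series([1, 2, 3])))  # Integer

# All-null series short-circuit to Unknown before any inference runs
print(ww.type_system.infer_logical_type(pd.Series([None, None])))  # Unknown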
Example 16
def test_is_spark_series(sample_series_spark):
    assert _is_spark_series(sample_series_spark)
    assert not _is_dask_series(pd.Series())