Beispiel #1
0
def test_estimate_segments():
    """Check the segment columns chosen for several ``max_segments`` budgets."""
    frame = pd.DataFrame(
        {
            "target": ["hat", "jug", "hat"],
            "confidence": [1.2, 3.4, 4.5],
            "sentiment": ["happy", "sad", "sad"],
        }
    )

    # Each entry pairs a max_segments budget with the expected column list.
    expectations = (
        (4, ["target"]),
        (3, ["target"]),
        (1, []),
    )
    for budget, expected in expectations:
        chosen = _estimate_segments(frame, target_field="confidence", max_segments=budget)
        assert chosen == expected
Beispiel #2
0
    def estimate_segments(
        self,
        df: pd.DataFrame,
        name: str,
        target_field: Optional[str] = None,
        max_segments: int = 30,
        dry_run: bool = False,
    ) -> Optional[Union[List[Dict], List[str]]]:
        """
        Estimates the most important features and values on which to segment
        data profiling using entropy-based methods.

        The estimation itself is delegated to ``_estimate_segments``; unless
        ``dry_run`` is set, the result is also handed to
        ``self.metadata_writer.autosegmentation_write`` under ``name``.

        :param df: the dataframe of data to profile
        :param name: name for discovery in the logger, automatically applied
        to loggers with same dataset_name
        :param target_field: target field (optional, ``None`` when there is
        no target)
        :param max_segments: upper threshold for total combinations of segments,
        default 30
        :param dry_run: run calculation but do not write results to metadata
        :return: the segmentation features exactly as returned by
        ``_estimate_segments`` (a list of segmentation feature names)
        """
        segments = _estimate_segments(df=df, target_field=target_field, max_segments=max_segments)

        # Persisting is the only side effect; a dry run skips it but still
        # returns the computed segments to the caller.
        if not dry_run:
            self.metadata_writer.autosegmentation_write(name, segments)

        return segments
Beispiel #3
0
def test_estimate_segments_empty():
    """A dataframe with no rows should produce an empty segment list."""
    empty_frame = pd.DataFrame(
        {column: [] for column in ("target", "confidence", "sentiment")}
    )
    outcome = _estimate_segments(empty_frame, target_field="confidence", max_segments=4)
    assert outcome == []