Python GeneralHistogramStat Beispiele, parallelm.mlops.stats.health.general_hist_stat.GeneralHistogramStat Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_general_histogram_stat.py Projekt: vakjha1/mlpiper

def test_special_feature_handle_in_continuous_feature():
    """Check feature classification when string columns are mixed in.

    With ``max_cat_unique_values`` set to 3 and an empty model stat, the
    assertions expect ``c0``/``c2`` to be continuous and ``c1``/``c4`` to be
    categorical. ``c3`` is expected in neither set: it has too many distinct
    values to be categorical, yet it is not numeric, so it cannot be
    continuous either.
    """
    names = ["c0", "c1", "c2", "c3", "c4"]
    rows = [
        [1, 2, 3, "A", "1993/12/1"],
        [2, 3, 4, "B", "1993/12/1"],
        [3, 3, 5, "C", "1993/12/1"],
    ]
    empty_model_stat = []

    stat = GeneralHistogramStat()
    stat.max_cat_unique_values = 3
    stat.create_and_set_general_stat(set_of_features_values=rows,
                                     set_of_features_names=names,
                                     model_stat=empty_model_stat)

    # Feature classification.
    continuous = set(stat.set_of_continuous_features)
    categorical = set(stat.set_of_categorical_features)
    assert continuous == {"c0", "c2"}
    assert categorical == {"c1", "c4"}

    # An empty model stat must leave the contender state unset.
    assert stat.contender_continuous_histogram == []
    assert stat.contender_categorical_histogram == []
    assert stat.contender_continuous_hist_bins is None

Beispiel #2

0

Datei anzeigen

def test_generation_of_continuous_hist_for_pandas():
    """Build a categorical histogram from pandas-backed data and check it.

    Despite its name, this test exercises ``CategoricalHistogram``: the
    first categorical histogram is expected to describe feature ``c3`` with
    edges ``['A', 'B']`` and bins ``[0.25, 0.75]``.
    """
    raw_rows = np.array(
        [[1, 2, 3, "A"], [2, 3, 4, "B"], [2, 3, 4, "A"], [3, 4, 4, "B"],
         [2, 3, 4, "B"], [1, 24, 34, "B"], [1, 24, 34, "B"],
         [1, 24, 34, "B"]])

    # Column names are c0..c(n-1), one per column of the first row.
    column_names = ["c" + str(idx) for idx in range(len(raw_rows[0]))]
    frame = pd.DataFrame(data=raw_rows, columns=column_names)

    features_values = np.array(frame.values)
    features_names = list(frame.columns)

    # Derive categorical/continuous features and contender histograms.
    general_hist_stat = GeneralHistogramStat()
    general_hist_stat.max_cat_unique_values = 3
    general_hist_stat.create_and_set_general_stat(
        set_of_features_values=features_values,
        set_of_features_names=features_names,
        model_stat=None)

    # Categorical columns, picked in the order the stat object reports them.
    categorical_features_names = general_hist_stat.set_of_categorical_features
    per_feature_columns = [
        features_values[:, features_names.index(name)]
        for name in categorical_features_names
    ]
    categorical_features_values = np.array(per_feature_columns).T

    # Predefined bins (and representation) of the contender histogram.
    pred_bins_categorical_hist = general_hist_stat.contender_categorical_hist_bins
    contender_categorical_histogram_representation = \
        general_hist_stat.contender_categorical_histogram

    # Nothing to verify when no categorical values were selected.
    if len(categorical_features_values) == 0:
        return

    fitted_histogram = CategoricalHistogram().fit(
        categorical_features_values,
        categorical_features_names,
        num_bins=13,
        pred_bins=pred_bins_categorical_hist)
    histogram_rep = fitted_histogram.get_feature_histogram_rep()

    c3_stat = histogram_rep[0]
    assert isinstance(c3_stat, CategoricalHistogramDataObject)
    assert c3_stat.get_feature_name() == "c3"
    assert False not in (c3_stat.get_bins() == [0.25, 0.75])

    expected_edges = ['A', 'B']
    assert c3_stat.get_edges() == expected_edges
    assert c3_stat.get_edge_list() == expected_edges

Beispiel #3

0

Datei anzeigen

Datei: test_general_analyst_stat.py Projekt: vakjha1/mlpiper

def test_categorical_feature_analysis():
    """Run ``CategoricalDataAnalyst`` on two string-like columns.

    Column ``A`` holds plain strings; column ``B`` is a pandas Categorical
    restricted to the categories ``b``, ``c`` and ``d``, so the two ``"a"``
    entries are not valid categories (the expected result reports 33.33%
    missing values for ``B``).
    """
    df = pd.DataFrame(
        {"A": ["Harshil", "Yakov", "Sriram", "Sindhu", "LiorK", "Harshil"]})
    df["B"] = pd.Categorical(["a", "b", "c", "a", "d", "b"],
                             categories=["b", "c", "d"],
                             ordered=False)

    set_of_features_values = df.values
    set_of_features_names = df.keys()

    stat = GeneralHistogramStat()
    stat.create_and_set_general_stat(
        set_of_features_values=set_of_features_values,
        set_of_features_names=set_of_features_names,
        model_stat=[])

    categorical_features_names = stat.set_of_categorical_features

    # Restrict the value matrix to the categorical columns only.
    categorical_features_values = PythonChannelHealth._create_feature_subset(
        features_values=set_of_features_values,
        features_names=set_of_features_names,
        selection_features_subset=categorical_features_names)

    analysis = CategoricalDataAnalyst.analyze(
        set_of_categorical_feature_names=categorical_features_names,
        set_of_categorical_feature_values=categorical_features_values)

    assert len(analysis) == 2

    expected_a = CategoricalDataAnalysisResult(feature_name="A",
                                               count=6,
                                               NAs="0.0%",
                                               unique=5,
                                               top="Harshil",
                                               freq_top=2,
                                               avg_str_len=6.0)
    assert (analysis["A"] == expected_a)

    expected_b = CategoricalDataAnalysisResult(feature_name="B",
                                               count=6,
                                               NAs="33.33%",
                                               unique=3,
                                               top="b",
                                               freq_top=2,
                                               avg_str_len=1.0)
    assert (analysis["B"] == expected_b)

Beispiel #4

0

Datei anzeigen

Datei: test_general_histogram_stat.py Projekt: vakjha1/mlpiper

def test_comparator():
    """Check overlap scores between inferring and contender histograms.

    Two categorical features (``c1`` with four edges, ``c2`` with three) are
    compared; both overlap scores are expected to truncate to 83 once
    multiplied by 100.
    """
    def build(name, edges, bins):
        # Small factory so each histogram reads as a single line below.
        return CategoricalHistogramDataObject(feature_name=name,
                                              edges=edges,
                                              bins=bins)

    c1_edges = ["a", "b", "c", "d"]
    c2_edges = ["a", "b", "c"]

    inferring_c1 = build("c1", c1_edges, [0.4, 0.2, 0.3, 0.1])
    contender_c1 = build("c1", c1_edges, [0.3, 0.4, 0.1, 0.2])
    inferring_c2 = build("c2", c2_edges, [0.4, 0.2, 0.3])
    contender_c2 = build("c2", c2_edges, [0.3, 0.4, 0.1])

    features, score = GeneralHistogramStat.calculate_overlap_score(
        inferring_hist_rep=[inferring_c1, inferring_c2],
        contender_hist_rep=[contender_c1, contender_c2])

    assert set(features) == {"c1", "c2"}

    # Both features are expected to yield the same truncated score.
    for position in (0, 1):
        assert int(score[position] * 100) == 83

Beispiel #5

0

Datei anzeigen

Datei: test_data_frame.py Projekt: vakjha1/mlpiper

def test_histogram_stats(generate_da_with_missing_data):
    """Classify the columns of a CSV that contains missing data.

    :param generate_da_with_missing_data: fixture value passed straight to
        ``pd.read_csv`` as the CSV source.
    """
    frame = pd.read_csv(generate_da_with_missing_data)

    # Derive categorical/continuous features; no model stat is supplied.
    stat = GeneralHistogramStat()
    stat.max_cat_unique_values = 3
    stat.create_and_set_general_stat(
        set_of_features_values=np.array(frame.values),
        set_of_features_names=list(frame.columns),
        model_stat=None)

    expected_categorical = {
        'Missing::All', 'Missing::Even', 'Missing::Seq', 'emptystrings'
    }
    expected_continuous = {'Missing::Float'}

    assert set(stat.set_of_categorical_features) == expected_categorical
    assert set(stat.set_of_continuous_features) == expected_continuous

Beispiel #6

0

Datei anzeigen

    def _compare_health(current_histogram_representation,
                        contender_histogram_representation,
                        stat_object_method,
                        name_of_stat,
                        model_id):
        """
        Method is responsible for comparing two histogram representation and output score using stat_object_method

        :param current_histogram_representation: inferring histogram representation
        :param contender_histogram_representation: training histogram representation
        :param stat_object_method: stat object method to output stat
        :param name_of_stat: be it continuous or categorical
        :return:
        """
        # If contender_histogram_representation is present then overlap can be calculated """

        contender_histogram_present = False
        if isinstance(contender_histogram_representation, list):
            if len(contender_histogram_representation) > 0:
                contender_histogram_present = True

        elif contender_histogram_representation is not None:
            contender_histogram_present = True

        compared_feature_names = []
        compared_feature_score = []
        if contender_histogram_present:
            compared_feature_names, compared_feature_score = \
                GeneralHistogramStat.calculate_overlap_score(
                    contender_hist_rep=contender_histogram_representation,
                    inferring_hist_rep=current_histogram_representation)

            mlops_stat_hist_score = _HistogramOverlapScoreStat() \
                .name(name_of_stat) \
                .features(list(compared_feature_names)) \
                .data(list(compared_feature_score))

            if stat_object_method is not None:
                stat_object_method(mlops_stat=mlops_stat_hist_score.get_mlops_stat(model_id),
                                   reflex_event_message_type=ReflexEvent.MLHealthModel)

        return compared_feature_names, compared_feature_score

Beispiel #7

0

Datei anzeigen

def test_generation_of_continuous_hist_for_pandas_having_weird_vals():
    """Build continuous histograms from data containing non-numeric values.

    A model stat supplies contender histograms for ``c0``..``c3``
    (continuous) and ``c4`` (categorical). The histograms generated for
    ``c1``, ``c2`` and ``c3`` are then checked against hand-computed bins
    and edges.

    The per-feature checks previously compared the bound method
    ``get_feature_name`` (without calling it) to a string, which is never
    equal, so no assertion was ever executed. The method is now called.
    """
    array_of_data = np.array([["2A", 2, "3", 4, "A"], ["0C", 6, "4.5", 6, "B"],
                              ["2", 2, "3", 4, "C"], ["A0", 6, "4.5", 6, "D"],
                              ["2B", 2, "3", 4, "E"], ["0W", 6, "4.5", 6,
                                                       "F"]])

    # Column names are c0..c(n-1), one per column of the first row.
    array_of_features = ["c" + str(i) for i in range(len(array_of_data[0]))]

    pd_data = pd.DataFrame(data=array_of_data, columns=array_of_features)

    features_values = np.array(pd_data.values)
    features_names = list(pd_data.columns)

    model_stat_with_both_stat = [
        u'{"data":"{\\"c4\\": [{\\"A\\": 0.0}, {\\"B\\": 0.16666666666666666}, {\\"C\\": 0.0}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"categoricalDataHistogram","type":"Health"}',
        u'{"data":"{\\"c1\\": [{\\"1 to 3\\": 0.0}, {\\"3 to 5\\": 0.16666666666666666}, {\\"5 to 7\\": 0.0}], \\"c2\\": [{\\"2 to 3.5\\": 0.0}, {\\"3.5 to 5\\": 0.16666666666666666}, {\\"5 to 6.5\\": 0.0}], \\"c3\\": [{\\"1 to 3\\": 0.0}, {\\"3 to 5\\": 0.16666666666666666}, {\\"5 to 7\\": 0.16666666666666666}], \\"c0\\": [{\\"1 to 3\\": 0.0}, {\\"3 to 5\\": 0.16666666666666666}, {\\"5 to 7\\": 0.0}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"continuousDataHistogram","type":"Health"}'
    ]

    # generating general stats like categorical/continuous features and contender histograms.
    general_hist_stat = GeneralHistogramStat()
    general_hist_stat.max_cat_unique_values = 3
    general_hist_stat \
        .create_and_set_general_stat(set_of_features_values=features_values,
                                     set_of_features_names=features_names,
                                     model_stat=model_stat_with_both_stat)

    # continuous feature names, and their values in the same order
    continuous_features_names = general_hist_stat.set_of_continuous_features
    continuous_features_values = np.array([
        features_values[:, features_names.index(each_continuous_f_name)]
        for each_continuous_f_name in continuous_features_names
    ]).T

    # predefined bins of contender continuous hist
    pred_bins_continuous_hist = general_hist_stat.contender_continuous_hist_bins

    # generating continuous histogram if continuous_features_values exists.
    if len(continuous_features_values) > 0:
        current_continuous_histogram = ContinuousHistogram() \
            .fit(continuous_features_values,
                 continuous_features_names,
                 num_bins=13,
                 pred_bins=pred_bins_continuous_hist)

        current_continuous_histogram_representation = \
            current_continuous_histogram.get_feature_histogram_rep()

        # feature name -> (expected bins, expected edges)
        expected_by_feature = {
            "c1": ([0.5, 0, 0.5], [1.0, 3.0, 5.0, 7.0]),
            "c2": ([0.5, 0.5, 0], [2.0, 3.5, 5.0, 6.5]),
            "c3": ([0, 0.5, 0.5], [1.0, 3.0, 5.0, 7.0]),
        }

        for each_stat in current_continuous_histogram_representation:
            # get_feature_name is a method and must be called before comparing.
            expected = expected_by_feature.get(each_stat.get_feature_name())
            if expected is None:
                # Features without a hand-computed expectation are not checked.
                continue
            expected_bins, expected_edges = expected

            assert isinstance(each_stat, ContinuousHistogramDataObject)

            # Convert to float arrays before scaling: multiplying a plain
            # list by 100 would repeat it 100 times instead of scaling it.
            actual_bins = np.asarray(each_stat.get_bins(), dtype=float)
            assert np.array_equal(
                np.ceil(actual_bins * 100),
                np.ceil(np.asarray(expected_bins, dtype=float) * 100))

            actual_edges = np.asarray(each_stat.get_edge_list(), dtype=float)
            assert np.array_equal(
                np.ceil(actual_edges * 100),
                np.ceil(np.asarray(expected_edges, dtype=float) * 100))

Beispiel #8

0

Datei anzeigen

def test_generation_of_continuous_hist_for_pandas():
    """Build a continuous histogram from pandas-backed data and check it.

    No model stat is supplied. The first continuous histogram is checked
    against fixed bin values and edge values.
    """
    raw_rows = np.array([[1, 2, 3, "A"], [2, 3, 4, "B"], [2, 3, 4, "A"],
                         [3, 4, 4, "B"], [2, 3, 4, "B"], [1, 24, 34, "B"]])

    # Column names are c0..c(n-1), one per column of the first row.
    column_names = ["c" + str(idx) for idx in range(len(raw_rows[0]))]
    frame = pd.DataFrame(data=raw_rows, columns=column_names)

    features_values = np.array(frame.values)
    features_names = list(frame.columns)

    # Derive categorical/continuous features and contender histograms.
    general_hist_stat = GeneralHistogramStat()
    general_hist_stat.max_cat_unique_values = 3
    general_hist_stat.create_and_set_general_stat(
        set_of_features_values=features_values,
        set_of_features_names=features_names,
        model_stat=None)

    # Continuous columns, picked in the order the stat object reports them.
    continuous_features_names = general_hist_stat.set_of_continuous_features
    per_feature_columns = [
        features_values[:, features_names.index(name)]
        for name in continuous_features_names
    ]
    continuous_features_values = np.array(per_feature_columns).T

    # Predefined bins (and representation) of the contender histogram.
    pred_bins_continuous_hist = general_hist_stat.contender_continuous_hist_bins
    contender_continuous_histogram_representation = \
        general_hist_stat.contender_continuous_histogram

    # Nothing to verify when no continuous values were selected.
    if len(continuous_features_values) == 0:
        return

    fitted_histogram = ContinuousHistogram().fit(
        continuous_features_values,
        continuous_features_names,
        num_bins=13,
        pred_bins=pred_bins_continuous_hist)
    histogram_rep = fitted_histogram.get_feature_histogram_rep()

    c0_stat = histogram_rep[0]
    assert isinstance(c0_stat, ContinuousHistogramDataObject)

    expected_bins = np.array([
        0.0, 0.0, 0.0, 0.33333333, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0,
        0.16666667, 0
    ])
    assert False not in (
        np.ceil(c0_stat.get_bins() * 100) == np.ceil(expected_bins * 100))

    expected_edges = [
        -float("inf"), 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0,
        4.0,
        float("inf")
    ]
    # NOTE(review): ``expected_edges`` is a plain list, so ``* 100`` repeats
    # it 100 times rather than scaling it. Presumably get_edge_list() also
    # returns a list, which would make both sides an unscaled comparison -
    # confirm before changing, the expected values may depend on it.
    assert False not in (
        np.ceil(c0_stat.get_edge_list() * 100) == np.ceil(expected_edges * 100))

Beispiel #9

0

Datei anzeigen

    def generate_health_and_heatmap_stat(stat_object_method,
                                         logger,
                                         features_values,
                                         features_names,
                                         model_stat,
                                         model_id,
                                         num_bins=13,
                                         # TODO: Have ability to get this argument from user!
                                         data_analysis=True):
        """
        Create continuous/categorical histograms and, optionally, data-analysis tables for the given features.
        When model_stat is given (inference), also create the continuous heatmap and compare
        the current histograms against the contender ones.

        :param stat_object_method: stat object method to output stat
        :param logger: logger used for debug output
        :param features_values: feature array
        :param features_names: feature names
        :param model_stat: model stat; None means no heatmap/overlap score is generated
        :param model_id: model id attached to every generated stat
        :param num_bins: max number of bins for features.
        :param data_analysis: when True, run the continuous and categorical data analysis
                              and output the result tables
        :return: None
        """
        # generating general stats like categorical/continuous features and contender histograms.
        general_hist_stat = GeneralHistogramStat()
        general_hist_stat \
            .create_and_set_general_stat(set_of_features_values=features_values,
                                         set_of_features_names=features_names,
                                         model_stat=model_stat)

        # For Continuous Values
        # continuous feature names
        continuous_features_names = general_hist_stat.set_of_continuous_features
        # predefined bins of contender continuous hist
        pred_bins_continuous_hist = general_hist_stat.contender_continuous_hist_bins
        contender_continuous_histogram_representation = general_hist_stat.contender_continuous_histogram

        continuous_features_values = PythonChannelHealth. \
            _create_feature_subset(features_values=features_values,
                                   features_names=features_names,
                                   selection_features_subset=continuous_features_names)
        current_continuous_histogram_representation = \
            PythonChannelHealth._create_current_hist_rep(
                features_values=continuous_features_values,
                features_names=continuous_features_names,
                num_bins=num_bins,
                pred_bins_hist=pred_bins_continuous_hist,
                stat_object_method=stat_object_method,
                name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_KEY,
                model_id=model_id)

        # running data analysis for continuous dataset
        if data_analysis:
            continuous_data_analyst_result = ContinuousDataAnalyst \
                .analyze(set_of_continuous_feature_names=continuous_features_names,
                         set_of_continuous_feature_values=continuous_features_values)

            # outputting stat only if analysis result is there
            if len(continuous_data_analyst_result) > 0:
                cont_da = Table() \
                    .name("Continuous Data Analysis") \
                    .cols(["Count",
                           "Missing",
                           "Zeros",
                           "Standard Deviation",
                           "Min",
                           "Mean",
                           "Median",
                           "Max"])

                # one table row per analysed feature, in the column order declared above
                for f_n in continuous_data_analyst_result.keys():
                    f_v = continuous_data_analyst_result[f_n]
                    cont_da.add_row(str(f_v.feature_name),
                                    [f_v.count,
                                     f_v.NAs,
                                     f_v.zeros,
                                     f_v.std,
                                     f_v.min,
                                     f_v.mean,
                                     f_v.median,
                                     f_v.max])

                # outputting stat using stat object as stat message type
                stat_object_method(mlops_stat=cont_da.get_mlops_stat(model_id=model_id),
                                   reflex_event_message_type=ReflexEvent.StatsMessage)

        logger.debug("continuous features values: {}".format(continuous_features_values))
        logger.debug("continuous features names: {}".format(continuous_features_names))
        logger.debug(
            "current histogram representation: {}".format(current_continuous_histogram_representation))
        logger.debug(
            "contender histogram representation: {}".format(contender_continuous_histogram_representation))

        # For Categorical Values
        # categorical feature names
        categorical_features_names = general_hist_stat.set_of_categorical_features

        # predefined bins of contender categorical hist
        pred_bins_categorical_hist = general_hist_stat.contender_categorical_hist_bins
        contender_categorical_histogram_representation = general_hist_stat.contender_categorical_histogram

        categorical_features_values = PythonChannelHealth._create_feature_subset(features_values=features_values,
                                                                                 features_names=features_names,
                                                                                 selection_features_subset=categorical_features_names)
        current_categorical_histogram_representation = \
            PythonChannelHealth._create_current_hist_rep(
                categorical_features_values,
                categorical_features_names,
                num_bins,
                pred_bins_categorical_hist,
                stat_object_method,
                name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_KEY,
                model_id=model_id)

        # running data analysis for categorical dataset
        if data_analysis:
            categorical_data_analyst_result = CategoricalDataAnalyst \
                .analyze(set_of_categorical_feature_names=categorical_features_names,
                         set_of_categorical_feature_values=categorical_features_values)

            # outputting stat only if analysis result is there
            if len(categorical_data_analyst_result) > 0:

                categ_da = Table() \
                    .name("Categorical Data Analysis") \
                    .cols(["Count",
                           "Missing",
                           "Uniques",
                           "Top Frequently Occurring Category",
                           "Top Frequency",
                           "Average String Length"])

                # one table row per analysed feature, in the column order declared above
                for f_n in categorical_data_analyst_result.keys():
                    f_v = categorical_data_analyst_result[f_n]
                    categ_da. \
                        add_row(str(f_v.feature_name),
                                [f_v.count,
                                 f_v.NAs,
                                 f_v.unique,
                                 f_v.top,
                                 f_v.freq_top,
                                 f_v.avg_str_len])

                # outputting stat using stat object as stat message type
                stat_object_method(mlops_stat=categ_da.get_mlops_stat(model_id=model_id),
                                   reflex_event_message_type=ReflexEvent.StatsMessage)

        logger.debug("categorical features values: {}".format(categorical_features_values))
        logger.debug("categorical features names: {}".format(categorical_features_names))
        logger.debug(
            "current histogram representation: {}".format(current_categorical_histogram_representation))
        logger.debug(
            "contender histogram representation: {}".format(contender_categorical_histogram_representation))

        # If model_stat is given, it means it is inference program
        # so it needs to create heatmap and score too.
        if model_stat is not None:
            if continuous_features_values.shape[0] > 0:
                continuous_features_names, heat_map_values = PythonChannelHealth. \
                    _create_current_continuous_heatmap_rep(continuous_features_values=continuous_features_values,
                                                           continuous_features_names=continuous_features_names,
                                                           stat_object_method=stat_object_method,
                                                           model_id=model_id)
                logger.debug("features: {}, heatmap values: {}".format(continuous_features_names,
                                                                       heat_map_values))

                compared_continuous_feature_names, compared_continuous_feature_score = PythonChannelHealth. \
                    _compare_health(
                    current_histogram_representation=current_continuous_histogram_representation,
                    contender_histogram_representation=contender_continuous_histogram_representation,
                    stat_object_method=stat_object_method,
                    name_of_stat=PyHealth.CONTINUOUS_HISTOGRAM_OVERLAP_SCORE_KEY,
                    model_id=model_id)
                logger.debug(
                    "continuous features: {}, overlap scores: {}".format(compared_continuous_feature_names,
                                                                         compared_continuous_feature_score))

            if categorical_features_values.shape[0] > 0:
                # Names and scores need distinct targets; unpacking both into the
                # same variable would overwrite the names with the scores.
                compared_categorical_feature_names, compared_categorical_feature_score = PythonChannelHealth. \
                    _compare_health(
                    current_histogram_representation=current_categorical_histogram_representation,
                    contender_histogram_representation=contender_categorical_histogram_representation,
                    stat_object_method=stat_object_method,
                    name_of_stat=PyHealth.CATEGORICAL_HISTOGRAM_OVERLAP_SCORE_KEY,
                    model_id=model_id)
                logger.debug(
                    "categorical features: {}, overlap scores: {}".format(
                        compared_categorical_feature_names, compared_categorical_feature_score))

Beispiel #10

0

Datei anzeigen

Datei: test_general_analyst_stat.py Projekt: vakjha1/mlpiper

def test_continuous_feature_analysis():
    """Run ``ContinuousDataAnalyst`` on the continuous columns of a mixed dataset.

    The dataset mixes numeric columns (some with missing values) and string
    columns. With ``max_cat_unique_values`` set to 3, ``c0``..``c2`` are
    expected to be continuous and ``c3``/``c4`` categorical; the analysis of
    each continuous column is then compared with hand-computed statistics.
    """
    # np.nan is used throughout: the upper-case alias np.NAN was removed in
    # NumPy 2.0, while np.nan works on every NumPy version.
    set_of_features_values = np.array([[1, 2, 3, "A", "1993/12/1"],
                                       [2, 1, np.nan, "B", "1993/12/1"],
                                       [2, 3, 4, "B", "1993/12/1"],
                                       [0, 9, 5, "A", "1993/12/1"],
                                       [0, 9, 5, "A", np.nan],
                                       [2, 1, np.nan, "B", "1993/12/1"]])

    set_of_features_names = ["c0", "c1", "c2", "c3", "c4"]

    model_stat_with_both = []

    general_histogram_stat_with_both = GeneralHistogramStat()
    general_histogram_stat_with_both.max_cat_unique_values = 3
    general_histogram_stat_with_both.create_and_set_general_stat(
        set_of_features_values=set_of_features_values,
        set_of_features_names=set_of_features_names,
        model_stat=model_stat_with_both)

    # The numeric columns are expected to be continuous; the string columns
    # c3 and c4 are expected to be categorical.
    assert set(
        general_histogram_stat_with_both.set_of_continuous_features) == {
            "c0", "c1", "c2"
        }
    assert set(
        general_histogram_stat_with_both.set_of_categorical_features) == {
            "c3", "c4"
        }

    # For Continuous Values
    # continuous feature names
    continuous_features_names = general_histogram_stat_with_both.set_of_continuous_features

    # Restrict the value matrix to the continuous columns only.
    continuous_features_values = PythonChannelHealth._create_feature_subset(
        features_values=set_of_features_values,
        features_names=set_of_features_names,
        selection_features_subset=continuous_features_names)

    continuous_data_analyst_result = ContinuousDataAnalyst \
        .analyze(set_of_continuous_feature_names=continuous_features_names,
                 set_of_continuous_feature_values=continuous_features_values)

    assert len(continuous_data_analyst_result) == 3

    # c2 has two missing entries out of six.
    c2_feature_analysis = ContinuousDataAnalysisResult(feature_name="c2",
                                                       count=6,
                                                       NAs="33.33%",
                                                       min=3.0,
                                                       max=5.0,
                                                       mean=4.25,
                                                       median=4.5,
                                                       std=0.9574,
                                                       zeros=0)
    assert (continuous_data_analyst_result['c2'] == c2_feature_analysis)

    c1_feature_analysis = ContinuousDataAnalysisResult(feature_name="c1",
                                                       count=6,
                                                       NAs="0.0%",
                                                       min=1.0,
                                                       max=9.0,
                                                       mean=4.1667,
                                                       median=2.5,
                                                       std=3.8166,
                                                       zeros=0)
    assert (continuous_data_analyst_result['c1'] == c1_feature_analysis)

    # c0 contains two zeros.
    c0_feature_analysis = ContinuousDataAnalysisResult(feature_name="c0",
                                                       count=6,
                                                       NAs="0.0%",
                                                       min=0.0,
                                                       max=2.0,
                                                       mean=1.1667,
                                                       median=1.5,
                                                       std=0.9831,
                                                       zeros=2)
    assert (continuous_data_analyst_result['c0'] == c0_feature_analysis)

Beispiel #11

0

Datei anzeigen

Datei: test_general_histogram_stat.py Projekt: vakjha1/mlpiper

def test_general_stat_initialization():
    """Check GeneralHistogramStat initialisation for several ``model_stat`` inputs.

    The same 3x3 numeric data set and feature names are fed to four fresh
    ``GeneralHistogramStat`` instances, each with a different ``model_stat``:

    1. an empty list,
    2. one JSON stat string named ``continuousDataHistogram``,
    3. one JSON stat string named ``categoricalDataHistogram``,
    4. one stat string of each kind.

    For every scenario the test checks which features end up in the
    continuous / categorical sets and what the contender histograms contain.
    """
    feature_rows = [[1, 2, 3], [2, 3, 4], [3, 3, 5]]
    feature_names = ["c0", "c1", "c2"]

    # --- Scenario 1: empty model stat, max_cat_unique_values forced to 3 ---
    empty_model_stat = []

    stat_without_model = GeneralHistogramStat()
    stat_without_model.max_cat_unique_values = 3
    stat_without_model.create_and_set_general_stat(
        set_of_features_values=feature_rows,
        set_of_features_names=feature_names,
        model_stat=empty_model_stat)

    assert set(stat_without_model.set_of_continuous_features) == {"c0", "c2"}
    assert set(stat_without_model.set_of_categorical_features) == {"c1"}
    # With no model stat there is nothing to build contender histograms from.
    assert stat_without_model.contender_continuous_histogram == []
    assert stat_without_model.contender_categorical_histogram == []
    assert stat_without_model.contender_continuous_hist_bins is None

    # --- Scenario 2: model stat holds only a continuous histogram (c2, c0) ---
    continuous_only_model_stat = [
        u'{"data":"{\\"c2\\": [{\\"-inf to -2.1273143387113773\\": 0.0}, {\\"-2.1273143387113773 to -1.1041662771274905\\": 0.16666666666666666}, {\\"-1.1041662771274905 to inf\\": 0.0}], \\"c0\\": [{\\"-inf to -17.229967937210233\\": 0.0}, {\\"-17.229967937210233 to -12.15785255468716\\": 0.0}, {\\"-12.15785255468716 to inf\\": 0.16666666666666666}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"continuousDataHistogram","type":"Health"}'
    ]

    stat_from_continuous = GeneralHistogramStat()
    stat_from_continuous.create_and_set_general_stat(
        set_of_features_values=feature_rows,
        set_of_features_names=feature_names,
        model_stat=continuous_only_model_stat)

    assert set(stat_from_continuous.set_of_continuous_features) == {"c0", "c2"}
    assert stat_from_continuous.set_of_categorical_features == []

    # Expected edges/bins mirror the entries of the stat string above.
    continuous_c2_edges = [
        '-inf to -2.1273143387113773',
        '-2.1273143387113773 to -1.1041662771274905',
        '-1.1041662771274905 to inf',
    ]
    continuous_c2_bins = [0.0, 0.16666666666666666, 0.0]
    continuous_c0_edges = [
        '-inf to -17.229967937210233',
        '-17.229967937210233 to -12.15785255468716',
        '-12.15785255468716 to inf',
    ]
    continuous_c0_bins = [0.0, 0.0, 0.16666666666666666]

    for continuous_hist in stat_from_continuous.contender_continuous_histogram:
        assert isinstance(continuous_hist, ContinuousHistogramDataObject)
        feature = continuous_hist.get_feature_name()
        if feature == "c2":
            assert continuous_hist.get_edges() == continuous_c2_edges
            assert continuous_hist.get_bins() == continuous_c2_bins
        elif feature == "c0":
            assert continuous_hist.get_edges() == continuous_c0_edges
            assert continuous_hist.get_bins() == continuous_c0_bins

    assert stat_from_continuous.contender_categorical_histogram == []

    # --- Scenario 3: model stat holds only a categorical histogram (c2, c0) ---
    categorical_only_model_stat = [
        u'{"data":"{\\"c2\\": [{\\"A\\": 0.0}, {\\"B\\": 0.16666666666666666}, {\\"C\\": 0.0}], \\"c0\\": [{\\"D\\": 0.0}, {\\"E\\": 0.0}, {\\"F\\": 0.16666666666666666}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"categoricalDataHistogram","type":"Health"}'
    ]

    stat_from_categorical = GeneralHistogramStat()
    stat_from_categorical.create_and_set_general_stat(
        set_of_features_values=feature_rows,
        set_of_features_names=feature_names,
        model_stat=categorical_only_model_stat)

    assert stat_from_categorical.set_of_continuous_features == []
    assert set(stat_from_categorical.set_of_categorical_features) == {"c0", "c2"}

    for categorical_hist in stat_from_categorical.contender_categorical_histogram:
        assert isinstance(categorical_hist, CategoricalHistogramDataObject)
        feature = categorical_hist.get_feature_name()
        if feature == "c2":
            assert categorical_hist.get_edges() == ['A', 'B', 'C']
            # Only the c2 bins are converted with list() before comparing;
            # the c0 bins below are compared as returned.
            assert list(categorical_hist.get_bins()) == [
                0.0, 0.16666666666666666, 0.0
            ]
        elif feature == "c0":
            assert categorical_hist.get_edges() == ['D', 'E', 'F']
            assert categorical_hist.get_bins() == [
                0.0, 0.0, 0.16666666666666666
            ]

    assert stat_from_categorical.contender_continuous_histogram == []

    # --- Scenario 4: one categorical stat (c2) plus one continuous stat (c0) ---
    mixed_model_stat = [
        u'{"data":"{\\"c2\\": [{\\"A\\": 0.0}, {\\"B\\": 0.16666666666666666}, {\\"C\\": 0.0}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"categoricalDataHistogram","type":"Health"}',
        u'{"data":"{\\"c0\\": [{\\"1 to 3\\": 0.0}, {\\"3 to 5\\": 0.1}, {\\"5 to 7\\": 0.0}]}","graphType":"BARGRAPH","timestamp":1536614947113726976,"mode":"INSTANT","name":"continuousDataHistogram","type":"Health"}'
    ]

    stat_from_mixed = GeneralHistogramStat()
    stat_from_mixed.create_and_set_general_stat(
        set_of_features_values=feature_rows,
        set_of_features_names=feature_names,
        model_stat=mixed_model_stat)

    assert set(stat_from_mixed.set_of_continuous_features) == {"c0"}
    assert set(stat_from_mixed.set_of_categorical_features) == {"c2"}

    # Each stat string describes a single feature, so no per-name branching.
    for categorical_hist in stat_from_mixed.contender_categorical_histogram:
        assert isinstance(categorical_hist, CategoricalHistogramDataObject)
        assert categorical_hist.get_edges() == ['A', 'B', 'C']
        assert categorical_hist.get_bins() == [0.0, 0.16666666666666666, 0.0]

    for continuous_hist in stat_from_mixed.contender_continuous_histogram:
        assert continuous_hist.get_edges() == ['1 to 3', '3 to 5', '5 to 7']
        assert continuous_hist.get_bins() == [0.0, 0.1, 0.0]