def test_class_prediction_error_quickmethod_X_test_only(self):
        """
        The quick method must fail when X_test is passed without y_test.

        The occupancy data is split into train and test parts, then
        ``class_prediction_error`` is called with the test features but no
        test target; a ``YellowbrickValueError`` matching the expected
        message has to be raised.
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()
        split = tts(features, target, test_size=0.2, shuffle=True, random_state=42)
        X_train, X_test, y_train, _ = split

        ax = plt.figure().add_subplot()
        estimator = LinearSVC(random_state=42)

        expected = "must specify both X_test and y_test or neither"
        with pytest.raises(YellowbrickValueError, match=expected):
            class_prediction_error(
                estimator,
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                ax=ax,
                show=False,
            )
Esempio n. 2
0
    def test_integrated_radviz_pandas_classes_features(self):
        """
        RadViz image comparison on pandas data with explicit features and
        classes
        """
        dataset = load_occupancy(return_dataset=True)
        X, y = dataset.to_pandas()

        features = ["temperature", "relative humidity", "light"]

        # Label names, sorted by the value each one maps to in the metadata
        ordered = sorted(dataset.meta["labels"].items(), key=lambda pair: pair[1])
        classes = [name for name, _ in ordered]

        assert isinstance(X, pd.DataFrame)
        assert isinstance(y, pd.Series)

        # Keep only the chosen columns and cast the target to integers so
        # the visualizer is not simply handed the class names
        X, y = X[features], y.astype(int)

        oz = RadViz(features=features, classes=classes)
        oz.fit_transform(X, y)
        oz.finalize()
        self.assert_images_similar(oz, tol=0.1)
Esempio n. 3
0
    def test_integrated_radviz_numpy_classes_features(self):
        """
        RadViz image comparison on numpy data with explicit features and
        classes
        """
        dataset = load_occupancy(return_dataset=True)
        X, y = dataset.to_numpy()

        # First three feature names listed in the dataset metadata
        features = dataset.meta["features"][0:3]

        # Label names, sorted by the value each one maps to in the metadata
        ordered = sorted(dataset.meta["labels"].items(), key=lambda pair: pair[1])
        classes = [name for name, _ in ordered]

        assert isinstance(X, np.ndarray)
        assert isinstance(y, np.ndarray)

        # Keep only the first three columns and cast the target to integers
        # so the visualizer is not simply handed the class names
        X, y = X[:, :3], y.astype(int)

        oz = RadViz(features=features, classes=classes)
        oz.fit_transform(X, y)
        oz.finalize()
        self.assert_images_similar(oz, tol=0.1)
Esempio n. 4
0
def balance():
    """
    Fit a ClassBalance visualizer on the train and test targets of a split
    of the occupancy data and pass it to ``savefig`` as "class_balance".
    """
    X, y = load_occupancy()

    # Only the two target parts of the split are needed
    y_train, y_test = tts(X, y, test_size=0.2)[2:]

    viz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"])
    viz.fit(y_train, y_test)
    savefig(viz, "class_balance")
Esempio n. 5
0
def classreport():
    """
    Fit a ClassificationReport (GaussianNB, with support shown) on the
    training part of the occupancy data, score it on the test part and
    pass it to ``savefig`` as "classification_report".
    """
    features, target = load_occupancy()
    X_train, X_test, y_train, y_test = tts(features, target, test_size=0.2)

    report = ClassificationReport(GaussianNB(), support=True, ax=newfig())
    report.fit(X_train, y_train)
    report.score(X_test, y_test)

    savefig(report, "classification_report")
    def test_pandas_integration(self):
        """
        PrecisionRecallCurve image comparison using pandas inputs
        """
        features, target = load_occupancy(return_dataset=True).to_pandas()

        estimator = DecisionTreeClassifier(random_state=14)

        X_train, X_test, y_train, y_test = tts(
            features, target, test_size=0.2, shuffle=True, random_state=555
        )

        # Per-class curves with ISO F1 lines; no micro average, no filled
        # area and no average precision score
        options = dict(
            per_class=True,
            micro=False,
            fill_area=False,
            iso_f1_curves=True,
            ap_score=False,
            classes=["unoccupied", "occupied"],
        )
        viz = PrecisionRecallCurve(estimator, **options)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()

        self.assert_images_similar(viz, tol=5.0)
    def test_class_prediction_error_quickmethod_X_test_and_y_test(self):
        """
        Image comparison for the class_prediction_error quick method when
        both X_test and y_test are supplied
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()
        X_train, X_test, y_train, y_test = tts(
            features, target, test_size=0.2, shuffle=True, random_state=42
        )

        ax = plt.figure().add_subplot()
        estimator = LinearSVC(random_state=42)

        viz = class_prediction_error(
            estimator,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            ax=ax,
            show=False,
        )

        # The reason for the large tolerance is unknown: Travis failed with
        # RMS 9.544, and AppVeyor / Linux conda show non-text-based
        # differences with RMS 12.961
        self.assert_images_similar(viz, tol=13, windows_tol=13)
Esempio n. 8
0
    def test_missing_test_data_in_quick_method(self):
        """
        The precision_recall_curve quick method must raise when only one of
        X_test / y_test is given.
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()

        X_train, X_test, y_train, y_test = tts(
            features, target, test_size=0.2, shuffle=True, random_state=55555
        )

        emsg = "both X_test and y_test are required if one is specified"

        # Test target given, test features missing
        with pytest.raises(YellowbrickValueError, match=emsg):
            precision_recall_curve(
                RandomForestClassifier(), X_train, y_train, y_test=y_test, show=False
            )

        # Test features given, test target missing
        with pytest.raises(YellowbrickValueError, match=emsg):
            precision_recall_curve(
                RandomForestClassifier(), X_train, y_train, X_test, show=False
            )
Esempio n. 9
0
    def test_rocauc_quickmethod(self):
        """
        Smoke test: call the roc_auc quick method with a decision tree on
        the occupancy data
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()

        # TODO: image comparison of the quick method
        roc_auc(DecisionTreeClassifier(), features, target)
Esempio n. 10
0
    def test_rank2d_quick_method(self):
        """
        The rank2d quick method returns a Rank2D whose image matches the
        baseline
        """
        features, target = load_occupancy()
        viz = rank2d(features, target, algorithm="spearman", colormap="RdYlGn_r")

        assert isinstance(viz, Rank2D)
        self.assert_images_similar(viz, tol=0.1)
Esempio n. 11
0
    def test_rocauc_quickmethod(self):
        """
        Image comparison for the roc_auc quick method with logistic
        regression on the occupancy data
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()

        # show=False so the returned visualizer can be compared as an image
        viz = roc_auc(LogisticRegression(), features, target, show=False)
        self.assert_images_similar(viz)
    def test_parallel_coordinates_quickmethod(self):
        """
        Image comparison for the parallel_coordinates quick method
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()

        # sample=100 restricts the plot to 100 samples to keep the test fast
        viz = parallel_coordinates(features, target, sample=100, show=False)

        self.assert_images_similar(viz)
Esempio n. 13
0
    def test_quick_method(self):
        """
        The balanced_binning_reference quick method returns a
        BalancedBinningReference whose image matches the baseline
        """
        # Only the target of the occupancy data is used
        target = load_occupancy(return_dataset=True).to_numpy()[1]

        viz = balanced_binning_reference(target, show=False)

        assert isinstance(viz, BalancedBinningReference)
        self.assert_images_similar(viz, tol=0.5)
Esempio n. 14
0
def manifold(dataset, manifold):
    """
    Fit a Manifold visualizer on a named dataset and pass it to ``savefig``.

    Parameters
    ----------
    dataset : str
        Either "concrete" or "occupancy"; selects which loader is called.
    manifold : str
        Forwarded to ``Manifold`` as its ``manifold`` argument and used in
        the output name "<dataset>_<manifold>_manifold".

    Raises
    ------
    ValueError
        If ``dataset`` is neither "concrete" nor "occupancy".
    """
    # Reject unsupported names before any data is loaded
    if dataset not in ("concrete", "occupancy"):
        raise ValueError("unknown dataset")

    X, y = load_concrete() if dataset == "concrete" else load_occupancy()

    viz = Manifold(manifold=manifold, ax=newfig())
    viz.fit_transform(X, y)

    name = "{}_{}_manifold".format(dataset, manifold)
    savefig(viz, name)
    def test_integrated_scatter(self):
        """
        Run ScatterViz on the first two columns of the occupancy data
        """
        data, target = load_occupancy(return_dataset=True).to_numpy()

        # Names supplied for the two columns that are plotted
        names = ["temperature", "relative humidity"]

        viz = ScatterViz(features=names)
        viz.fit_transform_show(data[:, :2], target)
    def test_classes_greater_than_indices(self):
        """
        Expect a ModelError when three class names are given for the
        occupancy target
        """
        X, y = load_occupancy(return_dataset=True).to_numpy()
        too_many = ["unoccupied", "occupied", "partytime"]

        estimator = LinearSVC(random_state=42)
        estimator.fit(X, y)

        with pytest.raises(ModelError):
            viz = ClassPredictionError(estimator, classes=too_many)
            viz.score(X, y)
    def test_classes_less_than_indices(self):
        """
        Expect NotImplementedError when only a single class name is given,
        i.e. on an attempt to filter classes
        """
        X, y = load_occupancy(return_dataset=True).to_numpy()
        too_few = ["unoccupied"]

        estimator = LinearSVC(random_state=42)
        estimator.fit(X, y)

        with pytest.raises(NotImplementedError):
            viz = ClassPredictionError(estimator, classes=too_few)
            viz.score(X, y)
Esempio n. 18
0
    def test_pandas_bins(self):
        """
        BalancedBinningReference image comparison on the pandas target
        """
        dataset = load_occupancy(return_dataset=True)

        # Only the target is fitted; the feature frame is not used
        _, target = dataset.to_pandas()

        viz = BalancedBinningReference()
        viz.fit(target)
        viz.finalize()

        self.assert_images_similar(viz, tol=0.5)
Esempio n. 19
0
    def test_integrated_scatter_with_pandas(self):
        """
        Run ScatterViz on the occupancy data loaded as pandas objects
        """
        frame, target = load_occupancy(return_dataset=True).to_pandas()

        # The two columns handed to the visualizer as features
        names = ["temperature", "relative humidity"]

        viz = ScatterViz(features=names)
        viz.fit_transform_poof(frame, target)
    def test_score_returns_score(self):
        """
        ClassPredictionError.score() must return a value in [0, 1]
        """
        X, y = load_occupancy(return_dataset=True).to_numpy()

        viz = ClassPredictionError(LinearSVC(random_state=42))
        viz.fit(X, y)

        # Scored on the same data the visualizer was fitted on
        result = viz.score(X, y)
        assert 0 <= result <= 1
    def test_scatter_quick_method(self):
        """
        The scatterviz quick method returns a ScatterVisualizer for the
        first two columns of the occupancy data
        """
        data, target = load_occupancy(return_dataset=True).to_numpy()

        names = ["temperature", "relative humidity"]
        result = scatterviz(data[:, :2], y=target, ax=None, features=names)

        # The quick method is expected to hand back the visualizer itself
        assert isinstance(result, ScatterVisualizer)
    def test_numpy_occupancy_balance(self):
        """
        ClassBalance in balance mode on the numpy occupancy target
        """
        dataset = load_occupancy(return_dataset=True)
        _, target = dataset.to_numpy()

        viz = ClassBalance()

        # fit() must return the visualizer itself
        assert viz.fit(target) is viz

        # finalize() is not called before the image comparison
        self.assert_images_similar(viz)
Esempio n. 23
0
def select_features_example(
        algorithm="isomap",
        path="images/occupancy_select_k_best_isomap_manifold.png",
        **kwargs):
    """
    Fit a SelectKBest -> Manifold pipeline on the occupancy data and write
    the resulting figure out via ``poof``.

    Parameters
    ----------
    algorithm : str
        Passed to ``Manifold`` as its ``manifold`` argument.
    path : str
        Passed to ``poof`` as ``outpath``.
    **kwargs
        Forwarded unchanged to ``Manifold``.
    """
    axes = plt.subplots(figsize=(9, 6))[1]

    # Feature selection (k=3, f_classif) followed by the visualizer
    steps = [
        ("selectk", SelectKBest(k=3, score_func=f_classif)),
        ("viz", Manifold(ax=axes, manifold=algorithm, **kwargs)),
    ]
    pipeline = Pipeline(steps)

    X, y = load_occupancy()
    pipeline.fit(X, y)

    pipeline.named_steps["viz"].poof(outpath=path)
Esempio n. 24
0
    def test_integrated_radviz_with_pandas(self):
        """
        RadViz image comparison on the occupancy data as pandas objects
        """
        frame, target = load_occupancy(return_dataset=True).to_pandas()

        # Make sure the loader really produced pandas containers
        assert isinstance(frame, pd.DataFrame)
        assert isinstance(target, pd.Series)

        viz = RadViz()
        viz.fit_transform_poof(frame, target)
        self.assert_images_similar(viz, tol=0.1)
Esempio n. 25
0
    def test_integrated_radviz_with_numpy(self):
        """
        RadViz image comparison on the occupancy data as numpy arrays
        """
        matrix, target = load_occupancy(return_dataset=True).to_numpy()

        # Make sure the loader really produced numpy arrays
        assert isinstance(matrix, np.ndarray)
        assert isinstance(target, np.ndarray)

        viz = RadViz()
        viz.fit_transform_poof(matrix, target)
        self.assert_images_similar(viz, tol=0.1)
Esempio n. 26
0
    def test_quick_method_with_test_set(self):
        """
        Image comparison for the precision_recall_curve quick method when
        train and test data are both passed
        """
        features, target = load_occupancy(return_dataset=True).to_numpy()

        X_train, X_test, y_train, y_test = tts(
            features, target, test_size=0.2, shuffle=True, random_state=555
        )

        estimator = RandomForestClassifier(random_state=72)
        result = precision_recall_curve(estimator, X_train, y_train, X_test, y_test)

        self.assert_images_similar(result)
Esempio n. 27
0
    def test_pandas_occupancy_compare(self):
        """
        ClassBalance in compare mode on pandas train/test targets
        """
        dataset = load_occupancy(return_dataset=True)
        frame, target = dataset.to_pandas()

        # Only the target parts of the split are compared
        y_train, y_test = tts(frame, target, test_size=0.4, random_state=2242)[2:]

        viz = ClassBalance()

        # fit() must return the visualizer itself
        assert viz.fit(y_train, y_test) is viz

        # finalize() is not called before the image comparison; without the
        # tolerance the comparison fails with RMS 0.433
        self.assert_images_similar(viz, tol=0.5)
    def test_numpy_occupancy_compare(self):
        """
        ClassBalance in compare mode on numpy train/test targets
        """
        dataset = load_occupancy(return_dataset=True)
        matrix, target = dataset.to_numpy()

        # Only the target parts of the split are compared
        y_train, y_test = tts(matrix, target, test_size=0.4, random_state=2242)[2:]

        viz = ClassBalance()

        # fit() must return the visualizer itself
        assert viz.fit(y_train, y_test) is viz

        # finalize() is not called before the image comparison
        self.assert_images_similar(viz)
Esempio n. 29
0
    def test_stack_param_incorrectly_used_throws_error(self):
        """
        FeatureImportances with stack=True must raise on the occupancy
        data, where per the expected message the model does not return a
        coef_ array of the required shape.
        """
        X, y = load_occupancy()

        estimator = LogisticRegression(solver="liblinear", random_state=222)
        viz = FeatureImportances(estimator, stack=True)

        expected_error = "The model used does not return coef_ array"

        # The error is expected from fit(), not from construction
        with pytest.raises(YellowbrickValueError, match=expected_error):
            viz.fit(X, y)
    def test_pandas_integration(self):
        """
        ClassPredictionError image comparison using pandas inputs
        """
        frame, target = load_occupancy(return_dataset=True).to_pandas()
        names = ["unoccupied", "occupied"]

        # The estimator is fitted up front; the visualizer is only scored
        estimator = LinearSVC(random_state=42)
        estimator.fit(frame, target)

        viz = ClassPredictionError(estimator, classes=names)
        viz.score(frame, target)
        viz.finalize()

        # AppVeyor and Linux conda show non-text-based differences;
        # AppVeyor fails with RMS 13.161 - 13.289 (python - miniconda)
        self.assert_images_similar(viz, tol=12.5, windows_tol=13.3)
def compare_class_balance(path="images/class_balance_compare.png"):
    """
    Compare the class balance of a train/test split of the occupancy data.

    Parameters
    ----------
    path : str
        Passed to ``poof`` as ``outpath``.

    Returns
    -------
    The value returned by the visualizer's ``poof`` call.
    """
    data = load_occupancy()

    columns = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    labels = ['unoccupied', 'occupied']

    # Select the feature columns and the target column from the loaded data
    X = data[columns]
    y = data["occupancy"]

    # Only the target parts of the split are needed for the comparison
    y_train, y_test = train_test_split(X, y, test_size=0.2)[2:]

    viz = ClassBalance(labels=labels)
    viz.fit(y_train, y_test)

    return viz.poof(outpath=path)