예제 #1
0
def test_score_from_params_clustering():
    """Check `score_from_params_clustering` with and without `return_predict`.

    Both calls must return a one-row DataFrame with the expected score and
    timing columns, must leave the estimator that was passed in unfitted, and
    must agree with each other on the two score columns to within 1e-5.
    With `return_predict=True`, the extra return value must hold one label
    per row of `X`, spread over exactly three distinct values.
    """
    np.random.seed(123)
    X = np.random.randn(100, 10)

    metrics = ["silhouette", "davies_bouldin"]
    expected_columns = ["test_silhouette", "test_davies_bouldin", "fit_time", "score_time"]
    tolerance = 10 ** (-5)

    # --- scores only ---
    model = KMeans(n_clusters=3, random_state=123)
    scores_only = score_from_params_clustering(model, X, scoring=list(metrics))

    # The estimator handed to the function must not have been fitted.
    with pytest.raises(sklearn.exceptions.NotFittedError):
        model.predict(X)

    assert isinstance(scores_only, pd.DataFrame)
    assert list(scores_only.columns) == expected_columns
    assert len(scores_only) == 1

    # --- scores + predicted labels ---
    model = KMeans(n_clusters=3, random_state=123)
    scores_with_labels, labels = score_from_params_clustering(
        model, X, scoring=list(metrics), return_predict=True
    )

    with pytest.raises(sklearn.exceptions.NotFittedError):
        model.predict(X)

    assert isinstance(scores_with_labels, pd.DataFrame)
    assert list(scores_with_labels.columns) == expected_columns

    assert len(scores_with_labels) == 1
    assert labels.shape == (100,)
    assert len(np.unique(labels)) == 3

    # Both runs use the same data and seed, so the two score columns must match.
    for column in (0, 1):
        gap = np.abs(scores_only.iloc[:, column] - scores_with_labels.iloc[:, column])
        assert gap.max() <= tolerance
예제 #2
0
def test_score_from_params_clustering_with_scorer_object():
    """Check that a `SCORERS` entry and its name give the same clustering score.

    For each metric name, `score_from_params_clustering` is called once with
    `SCORERS[name]` and once with the name itself. Each call must return a
    one-row DataFrame, and the first column of the two results must agree to
    within 1e-5.
    """
    # Seed the global RNG, as the sibling tests do, so the data (and therefore
    # any failure) is reproducible from run to run.
    np.random.seed(123)
    X = np.random.randn(100, 10)

    kmeans = KMeans(n_clusters=3, random_state=123)

    for scorer_name in ("silhouette", "calinski_harabasz", "davies_bouldin"):
        # Scoring given as the object stored in SCORERS.
        result1 = score_from_params_clustering(kmeans, X, scoring=SCORERS[scorer_name])
        assert result1.shape[0] == 1
        assert isinstance(result1, pd.DataFrame)

        # Scoring given as the plain metric name.
        result2 = score_from_params_clustering(kmeans, X, scoring=scorer_name)
        assert result2.shape[0] == 1
        assert isinstance(result2, pd.DataFrame)

        assert np.abs(result1.iloc[:, 0] - result2.iloc[:, 0]).max() <= 10 ** (-5)
예제 #3
0
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Exercise `score_from_params_clustering` in its three calling modes.

    Parameters (presumably supplied by pytest parametrization, which is not
    visible in this block):
        x_data_type: output type passed to `convert_generic` for `X`.
        shuffle: when truthy, the rows of `X` are permuted before scoring.
        graph_pipeline: when truthy, the KMeans model is wrapped in a
            `GraphPipeline` behind a `DebugPassThrough` node.

    The modes checked are: scores only, scores plus predicted labels, and
    labels only (`no_scoring=True`). After every call the estimator passed in
    must still raise `NotFittedError` on `predict`.
    """
    np.random.seed(123)
    X = convert_generic(np.random.randn(100, 10), output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % j for j in range(X.shape[1])]

    if shuffle:
        permutation = np.arange(X.shape[0])
        np.random.shuffle(permutation)
        X = X.loc[permutation, :] if isinstance(X, pd.DataFrame) else X[permutation, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if not graph_pipeline:
        estimator = KMeans(n_clusters=3, random_state=123)
    else:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )

    def check_scores(scores):
        # One row, with a "test_<metric>" column for every requested metric.
        assert isinstance(scores, pd.DataFrame)
        assert scores.shape[0] == 1
        for metric in scoring:
            assert ("test_" + metric) in set(scores.columns)

    def check_estimator_unfitted():
        # The estimator handed to the function must not have been fitted.
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    # Mode 1: scores only.
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)
    check_scores(res)
    check_estimator_unfitted()

    # Mode 2: scores together with the predicted labels.
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)
    check_scores(res)
    assert isinstance(label, np.ndarray)
    assert len(np.unique(label)) == 3
    check_estimator_unfitted()

    # Mode 3: predicted labels only, no scores computed.
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )
    assert len(np.unique(label)) == 3
    assert res is None
    check_estimator_unfitted()