コード例 #1
0
def test_edge():
    """Check fit on degenerate data where every sample is identical.

    With a single cluster over a column of zeros and a column of ones,
    the only center must sit at (0, 1).
    """
    test_df = pd.DataFrame({'X1': np.zeros(10), 'X2': np.ones(10)})

    centers, _ = fit(test_df, 1)

    # Compare the values element-wise. `a.all() == b.all()` only compares
    # two booleans and therefore passes for almost any output.
    assert np.allclose(np.ravel(centers), [0, 1]), "Centers did not match"
コード例 #2
0
def test_center():
    """Check the cluster centers returned by fit on two known data sets."""
    # Helper data
    iris = datasets.load_iris()  # loading the iris dataset
    features = iris.data
    test_df1 = pd.DataFrame({'X1': features[:, 2], 'X2': features[:, 3]})

    test_df2 = pd.DataFrame({'X1': np.arange(9), 'X2': np.arange(9)})

    # getting centers of clusters
    centers1, _ = fit(test_df2, 1)

    centers2, _ = fit(test_df1, 2)

    # Which cluster gets which index is arbitrary, so order the rows by
    # their first coordinate before comparing against the reference.
    expected2 = np.array([[1.49215686, 0.2627451],
                          [4.92525253, 1.68181818]])
    found2 = np.asarray(centers2, dtype=float)
    found2 = found2[np.argsort(found2[:, 0])]

    # Compare the values element-wise. `a.all() == b.all()` only compares
    # two booleans and therefore passes for almost any output.
    assert np.allclose(found2, expected2, atol=1e-6), \
        "Centers did not match"

    assert np.allclose(np.ravel(centers1), [4, 4]), "Centers did not match"
コード例 #3
0
def silhouette(X, k_array):
    """
    Computes a silhouette score for each k value in the given array
    using fit, and plots the scores against k.

    Parameters
    ----------
    X : 2-d array, shape=(n_samples, n_features)
        - The data to be clustered. Must have an integer or
        floating point dtype.
    k_array : array
         - An array of all contending k values. Every entry must be
         an integer (Python int or numpy integer).

    Returns
    -------
    pandas Series
        - The silhouette scores in the same order as k_array.

    Altair chart object
        - An Altair chart displaying silhouette scores
        with their corresponding k values.

    Raises
    ------
    ValueError
        If X is not numeric, if k_array is empty, or if any entry of
        k_array is not an integer.

    Examples
    --------
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> k_array = [2, 3, 4, 5]
    >>> scores, chart = silhouette(X, k_array)

    """

    # Inspect the dtype through np.asarray so that any integer or float
    # width is accepted and inputs without a `.dtype` attribute do not
    # crash the check.
    x_dtype = np.asarray(X).dtype
    if not (np.issubdtype(x_dtype, np.integer)
            or np.issubdtype(x_dtype, np.floating)):
        raise ValueError("Input X must be numeric")

    if len(k_array) == 0:
        raise ValueError("Input k_array must not be empty")

    # Validate every entry, not just the first one.
    if not all(isinstance(k, (int, np.integer)) for k in k_array):
        raise ValueError("Input k_array must be type int")

    rows = []
    for k in k_array:
        # Hand fit a plain Python int even when k is a numpy integer.
        k = int(k)
        _, labels = fit(X, k)
        rows.append([k, sil_score(X, labels)])

    scores = pd.DataFrame(rows, columns=["k", "Score"])

    chart = (alt.Chart(scores).mark_line().encode(
        alt.X('k:O', axis=alt.Axis(title='k')),
        alt.Y('Score:Q', axis=alt.Axis(title='Silhouette score')),
    ).properties(title="Silhouette scores", width=800))

    return (scores["Score"], chart)
コード例 #4
0
def test_exceptions():
    """Check that fit raises ValueError for each kind of invalid input.

    Every case fails the test explicitly when no ValueError is raised,
    instead of passing silently.
    """
    # Helper data
    test_df4 = "this is a python package"
    test_df2 = pd.DataFrame({'X1': np.arange(9), 'X2': np.arange(9)})
    test_df5 = pd.DataFrame({'X1': [1, 2, 3, 4], 'X2': [1, 2, "A", "BC"]})

    K = -2
    num_init = 0
    max_iteration = 4.5

    # Each entry pairs a description of the bad input with a deferred
    # call, so that all cases go through the same check below.
    invalid_calls = [
        ("data is not in a dataframe or numpy array",
         lambda: fit(test_df4, 2)),
        ("the number of clusters is negative",
         lambda: fit(test_df2, K)),
        ("data contains strings",
         lambda: fit(test_df5, 2)),
        ("n_init is zero",
         lambda: fit(test_df2, 1, n_init=num_init)),
        ("max_iter is not a whole number",
         lambda: fit(test_df2, 1, max_iter=max_iteration)),
    ]

    # checking the exception handling of the function
    for reason, call in invalid_calls:
        try:
            call()
        except ValueError:
            continue
        raise AssertionError("Should throw a ValueError if " + reason)
コード例 #5
0
def test_label():
    """Check that fit separates two well-separated groups of points."""
    # Helper data: five points on 5..9 and five points on 15..19
    group_values = np.concatenate((np.arange(5, 10), np.arange(15, 20)),
                                  axis=0)
    test_df3 = pd.DataFrame({'X1': group_values, 'X2': group_values})

    # getting the labels for the helper data
    _, labels = fit(test_df3, 2)
    labels = np.ravel(labels)

    assert len(labels) == 10, "labels did not match"

    # Which group receives label 0 and which label 1 is arbitrary, so
    # check the partition itself: each group is uniform and the two
    # groups differ. `a.all() == b.all()` would only compare two
    # booleans and pass for almost any labelling.
    low, high = labels[:5], labels[5:]
    assert np.all(low == low[0]), "labels did not match"
    assert np.all(high == high[0]), "labels did not match"
    assert low[0] != high[0], "labels did not match"
コード例 #6
0
def elbow(X, centers_list):
    """
    Creates a plot of inertia vs number of cluster centers
    as per the elbow method. Calculates and returns the inertia
    values for all cluster centers. Useful for identifying the optimal
    number of clusters while using k-means clustering algorithm.

    Parameters
    ----------
    X : pandas DataFrame or numpy array, shape=(n_samples, n_features)
        Input data that is to be clustered. Must have an integer or
        floating point dtype and contain at least two samples.
    centers_list : list or numpy array
        A list of all possible numbers of cluster centers. Every entry
        must be a whole number between 1 and n_samples.

    Returns
    -------
    tuple
        A tuple of an altair plot object containing a line plot of
        k (number of cluster centers) vs inertia, and a list holding
        the inertia for each k in the order given.

    Raises
    ------
    ValueError
        If either argument has the wrong type, X has fewer than two
        samples or is one-dimensional, X or centers_list is not
        numeric, centers_list is empty, or a number of centers is not
        a whole number between 1 and n_samples.

    Notes
    -----
    The value reported for each k is the sum, over all k clusters, of
    ``np.linalg.norm(points_in_cluster - cluster_center)``.

    Examples
    --------
    >>> from Kmeans_python.elbow import elbow
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> centers = [2, 3, 4, 5]
    >>> chart, inertia = elbow(X, centers)
    """

    def is_real_numeric(dtype):
        # True for any integer or floating point width.
        return (np.issubdtype(dtype, np.integer)
                or np.issubdtype(dtype, np.floating))

    # Check if number of centers is contained in an array or list
    if not isinstance(centers_list, (list, np.ndarray)):
        raise ValueError(
            "Invalid input type for list of numbers of clusters. "
            "centers_list must be list or a numpy array.")

    # Ensure input arguments are valid
    if not isinstance(X, (pd.DataFrame, np.ndarray)):
        raise ValueError(
            "Invalid input type for samples. X must be "
            "pandas dataframe or a numpy array.")

    # Check if there are at least two samples
    if X.ndim < 1 or X.shape[0] < 2:
        raise ValueError("At least two samples should be there in data")

    # Prompt user to reshape if data has only one feature
    if X.ndim == 1:
        raise ValueError(
            "If you have only one feature in the dataset "
            "please reshape your data using X.reshape(-1, 1)")

    # Check if number of centers are numeric values. The dtype of the
    # flattened array answers this for every entry at once and does not
    # depend on the platform's default integer width.
    k_values = np.asarray(centers_list).reshape(-1)
    if k_values.size == 0:
        raise ValueError(
            "centers_list must contain at least one number of clusters.")
    if not is_real_numeric(k_values.dtype):
        raise ValueError(
            "Invalid input type for centers. Centers_list must contain "
            "only numeric values.")

    # Check if all number of centers are integers
    if np.any(k_values != np.floor(k_values)):
        raise ValueError("Number of centers should be integers")

    # Check if data points are numbers. Float data is valid input; mixed
    # or textual data shows up as a non-numeric dtype.
    samples = np.asarray(X)
    if not is_real_numeric(samples.dtype):
        raise ValueError(
            "Invalid input type for samples. X must contain "
            "only numeric values.")

    # Check if the range of number of centers is valid
    if k_values.min() < 1 or k_values.max() > samples.shape[0]:
        raise ValueError(
            "Invalid values in list of numbers of clusters. "
            "Number of clusters should be between 1 and number of samples")

    # Convert all integer types to int
    centers_list = [int(k) for k in k_values]

    # Iterate through centers list and get inertia
    inertia = []
    for k in centers_list:
        # Fit Kmeans algorithm to get cluster centers and labels
        centers, labels = fit(samples, k, n_init=10, max_iter=200)
        # Accumulate the contribution of every cluster; keeping only the
        # value of the last cluster would under-report the inertia.
        per_cluster = [
            np.linalg.norm(samples[np.where(labels == cluster)]
                           - centers[cluster])
            for cluster in range(k)
        ]
        inertia.append(np.sum(per_cluster))

    # Save results to a dataframe
    results = pd.DataFrame({"k": centers_list, "inertia": inertia})

    # Create a plot object of K vs Inertia
    p = alt.Chart(results).mark_line().encode(
        alt.X("k:Q", title="k"),
        alt.Y("inertia:Q", title="Inertia")).properties(
            title="Optimal K Using Elbow Method", width=700,
            height=300).configure_axis(
                labelFontSize=20,
                titleFontSize=20).configure_title(fontSize=20)

    return p, inertia