Code Example #1
# Imports assumed by this excerpt (defined in the surrounding file).
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold
from data_cleaner import create_dataset


def plot_accuracy(seed):
    colors = ['navy', 'darkorange']

    target_names = ['High Risk', 'Low Risk']

    def make_ellipses(gmm, ax):
        for n, color in enumerate(colors):
            if gmm.covariance_type == 'full':
                covariances = gmm.covariances_[n][:2, :2]
            elif gmm.covariance_type == 'tied':
                covariances = gmm.covariances_[:2, :2]
            elif gmm.covariance_type == 'diag':
                covariances = np.diag(gmm.covariances_[n][:2])
            elif gmm.covariance_type == 'spherical':
                covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
            v, w = np.linalg.eigh(covariances)
            u = w[0] / np.linalg.norm(w[0])
            angle = np.arctan2(u[1], u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            v = 2. * np.sqrt(2.) * np.sqrt(v)
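            # With these axis lengths (2 * sqrt(2 * eigenvalue)) the ellipse
            # traces the contour at Mahalanobis distance sqrt(2), enclosing
            # roughly 63% of the probability mass of a 2-D Gaussian.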
            ell = mpl.patches.Ellipse(gmm.means_[n, :2],
                                      v[0],
                                      v[1],
                                      angle=180 + angle,
                                      color=color)
            ell.set_clip_box(ax.bbox)
            ell.set_alpha(0.5)
            ax.add_artist(ell)
            ax.set_aspect('equal', 'datalim')

    X, y = create_dataset()
    # Split the data into non-overlapping training (90%) and test (10%) sets
    # by taking one fold of a 10-fold stratified split.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    # Only take the first fold.
    train_index, test_index = next(iter(skf.split(X, y)))

    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    n_classes = len(np.unique(y_train))

    # Try GMMs using different types of covariances.
    estimators = {
        cov_type: GaussianMixture(n_components=n_classes,
                                  covariance_type=cov_type,
                                  max_iter=20,
                                  random_state=0)
        for cov_type in ['spherical', 'diag', 'tied', 'full']
    }

    n_estimators = len(estimators)

    plt.figure(figsize=(3 * n_estimators // 2, 6))
    plt.subplots_adjust(bottom=.05,
                        top=0.95,
                        hspace=.15,
                        wspace=.05,
                        left=.02,
                        right=.98)

    for index, (name, estimator) in enumerate(estimators.items()):
        # Since we have class labels for the training data, we can
        # initialize the GMM parameters in a supervised manner.
        estimator.means_init = np.array(
            [X_train[y_train == i].mean(axis=0) for i in range(n_classes)])

        # Train the other parameters using the EM algorithm.
        estimator.fit(X_train)

        h = plt.subplot(2, n_estimators // 2, index + 1)
        make_ellipses(estimator, h)

        for n, color in enumerate(colors):
            data = X[y == n]
            plt.scatter(data[:, 0],
                        data[:, 1],
                        s=2,
                        color=color,
                        label=target_names[n])
        # Plot the test data with crosses
        for n, color in enumerate(colors):
            data = X_test[y_test == n]
            plt.scatter(data[:, 0],
                        data[:, 1],
                        s=20 * 8,
                        marker='x',
                        color=color)

        y_train_pred = estimator.predict(X_train)
        train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        plt.text(0.05,
                 0.9,
                 'Train accuracy: %.1f' % train_accuracy,
                 transform=h.transAxes,
                 fontsize=18)

        y_test_pred = estimator.predict(X_test)
        test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
        plt.text(0.05,
                 0.8,
                 'Test accuracy: %.1f' % test_accuracy,
                 transform=h.transAxes,
                 fontsize=18)

        plt.title('GMM: K=2, covariance type = {}'.format(name))

    plt.legend(scatterpoints=40, loc='lower right', prop=dict(size=25))

    plt.show()
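
# Every snippet in this listing loads the project's dataset via create_dataset
# from data_cleaner, which is not shown here. A hypothetical stand-in for
# running the plotting code on its own (the blob parameters are placeholders,
# not the project's actual data):
def create_dataset_stub():
    from sklearn.datasets import make_blobs
    # Two clusters standing in for the 'High Risk' / 'Low Risk' classes.
    X, y = make_blobs(n_samples=500, centers=2, n_features=2, random_state=56)
    return X, y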
Code Example #2
        # (Excerpt begins mid-loop: draw_ellipse is called once per fitted
        # component with that component's mean, covariance, and mixing weight.)
        draw_ellipse(mean,
                     cov,
                     alpha=weights * w_factor,
                     edgecolor=color,
                     facecolor=color,
                     linewidth=4)

    ax.set(title='Plot of the best fitting GMM')
    plt.show()


if __name__ == '__main__':
    # Create data set
    seed = 56
    np.random.seed(seed)
    X, y = create_dataset()

    n_components_range = range(1, 3)
    cv_types = ['full']

    models = create_gmm_models(cv_types, n_components_range)
    best_gmm, score = my_cv(X, y, models, K_out=10, K_in=10, seed=seed)
    best_gmm.fit(X)
    clf = best_gmm.predict(X)        # component assignment per observation
    cent = best_gmm.means_           # fitted component means
    covars = best_gmm.covariances_   # fitted component covariances

    plot_accuracy(seed)
    plot_gmms(cv_types, n_components_range, best_gmm)

    # Cluster validity
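
# The helper my_cv called above is defined elsewhere in the project; its
# signature suggests two-level (nested) cross-validation: K_in inner folds
# pick the best candidate, K_out outer folds estimate its performance. A
# minimal sketch of that pattern, assuming candidates are compared by held-out
# log-likelihood (names and scoring are illustrative assumptions, not the
# project's implementation; the real my_cv also receives y, perhaps for
# stratified splitting):
def my_cv_sketch(X, models, K_out=10, K_in=10, seed=0):
    from sklearn.model_selection import KFold
    outer = KFold(n_splits=K_out, shuffle=True, random_state=seed)
    outer_scores = []
    for train_idx, test_idx in outer.split(X):
        X_tr, X_te = X[train_idx], X[test_idx]
        inner = KFold(n_splits=K_in, shuffle=True, random_state=seed)
        inner_scores = []
        for m in models:
            # Mean held-out log-likelihood of this candidate over inner folds.
            folds = [m.fit(X_tr[tr]).score(X_tr[va])
                     for tr, va in inner.split(X_tr)]
            inner_scores.append(np.mean(folds))
        best = models[int(np.argmax(inner_scores))]
        # Refit the inner winner on the full outer training split and score it.
        outer_scores.append(best.fit(X_tr).score(X_te))
    return best, float(np.mean(outer_scores))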
Code Example #3
import numpy as np
import matplotlib.pyplot as plt
from data_cleaner import create_dataset


# Reconstructed header: the excerpt began mid-function, and ard is called
# below as ard(data, 10).
def ard(data, K):
    # Euclidean distance between two observations.
    d = lambda x, y: np.sqrt(((x - y)**2).sum())

    # kde is defined elsewhere in the project's file.
    densities = kde(data, K)

    N = len(data)
    ards = np.zeros(N)
    for i, obs in enumerate(data):
        distances = np.zeros(N)
        for j, obs2 in enumerate(data):
            if i == j:
                continue
            distances[j] = d(obs, obs2)
        distances.sort()
        # distances[0] is the zero left in the self-distance slot; skip it so
        # the sum covers the K nearest neighbours rather than only K - 1.
        ards[i] = distances[1:K + 1].sum()

    # Relative density: KDE density divided by the mean distance to the K
    # nearest neighbours.
    ards = densities / ards * K
    ards.sort()

    plt.bar(np.arange(1, N + 1), ards)
    plt.title("KNN Average relative density")
    plt.xlabel("Observation")
    plt.ylabel("ARD")
    plt.show()


if __name__ == "__main__":

    data, target = create_dataset()
    # gdk(data)
    ard(data, 10)
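
# The kde helper used in ard above is defined elsewhere in the project's file.
# A hypothetical sketch matching how it is used here (one density value per
# observation, K nearest neighbours, Euclidean distance); the real
# implementation may differ:
def kde_sketch(data, K):
    data = np.asarray(data)
    N = len(data)
    densities = np.zeros(N)
    for i in range(N):
        # Distances from observation i to every observation (self included).
        dist = np.sqrt(((data - data[i])**2).sum(axis=1))
        dist.sort()
        # dist[0] is the zero self-distance; take the inverse of the mean
        # distance to the K nearest neighbours as the density estimate.
        densities[i] = 1.0 / dist[1:K + 1].mean()
    return densities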
Code Example #4
from data_cleaner import create_dataset

from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.pyplot import figure, title, plot, ylim, legend, show

import numpy as np
import matplotlib.pyplot as plt

data, y_true = create_dataset()

Rand = np.zeros(4)
Jaccard = np.zeros(4)
NMI = np.zeros(4)
KCluster = np.zeros(4)

linkage_list = ["complete", "average", "single", "ward"]

for i, link in enumerate(linkage_list):
    model = AgglomerativeClustering(n_clusters=2,
                                    metric='euclidean',  # 'affinity' before scikit-learn 1.2
                                    linkage=link)
    y_pred = model.fit_predict(data)
    Rand[i] = metrics.adjusted_rand_score(y_true, y_pred)
    Jaccard[i] = metrics.jaccard_score(y_true, y_pred)
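    # Illustrative continuation (assumption — the excerpt is cut off here):
    # record NMI per linkage as well, then report all scores.
    NMI[i] = metrics.normalized_mutual_info_score(y_true, y_pred)

for link, r, j, n in zip(linkage_list, Rand, Jaccard, NMI):
    print(f"{link:>8}: Rand={r:.3f}  Jaccard={j:.3f}  NMI={n:.3f}")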
Code Example #5
# Imports assumed by this excerpt (defined in the surrounding file).
import itertools

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import linalg
from sklearn import mixture
from data_cleaner import create_dataset


def plot_bic(n_components_range):
    # Load the project's dataset with a fixed seed for reproducibility.
    np.random.seed(56)
    X, y = create_dataset()

    lowest_bic = np.inf  # (the np.infty alias was removed in NumPy 2.0)
    bic = []
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm

    bic = np.array(bic)
    color_iter = itertools.cycle(
        ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    clf = best_gmm
    bars = []

    # Plot the BIC scores
    plt.figure(figsize=(8, 6))
    spl = plt.subplot(2, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(
            plt.bar(xpos,
                    bic[i * len(n_components_range):(i + 1) *
                        len(n_components_range)],
                    width=.2,
                    color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('BIC score per model')
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
        .2 * np.floor(bic.argmin() / len(n_components_range))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types)

    # Plot the winner
    splot = plt.subplot(2, 1, 2)
    Y_ = clf.predict(X)
    for i, (mean, cov,
            color) in enumerate(zip(clf.means_, clf.covariances_, color_iter)):
        v, w = linalg.eigh(cov)
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

        # Plot an ellipse to show the Gaussian component
        angle = np.arctan2(w[0][1], w[0][0])
        angle = 180. * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(.5)
        splot.add_artist(ell)

    plt.xticks(())
    plt.yticks(())
    plt.title('Selected GMM: full model, 2 components')
    plt.subplots_adjust(hspace=.35, bottom=.02)
    plt.show()
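
# For reference, GaussianMixture.bic(X) implements
#   BIC = -2 * total log-likelihood + p * log(N),
# where p counts the free parameters (means, covariance entries, and mixing
# weights) and N is the sample size. The same bookkeeping via the public API
# only (p must be supplied by the caller, since sklearn counts it internally):
def bic_by_hand(gmm, X, n_params):
    # gmm.score returns the *mean* per-sample log-likelihood.
    N = X.shape[0]
    return -2.0 * gmm.score(X) * N + n_params * np.log(N)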
Code Example #6
File: mining.py  Project: wdmdev/ML_proj_3
from data_cleaner import create_dataset
import numpy as np
from apyori import apriori

data, pred = create_dataset()

data = np.asarray(data)

# Binarise each attribute: 1 where the observation exceeds that attribute's
# mean, 0 otherwise.
X = np.zeros(data.shape)

means = np.mean(data, axis=0)

for i, obs in enumerate(data):
    X[i] = obs > means

label = [
  "PV",
  "TA",
  "GD",
  "VE",
  "TE",
  "PS",
  "NW",
  "JA"
]
print(X)
def mat2transactions(X, labels=[]):
    T = []
    for i in range(X.shape[0]):
        l = np.nonzero(X[i, :])[0].tolist()
        if labels:
            # Map column indices to their attribute labels.
            l = [labels[j] for j in l]
        T.append(l)
    return T
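
# Illustrative use of the apyori import above (the support and confidence
# thresholds are placeholders, not the project's chosen values):
T = mat2transactions(X, label)
rules = apriori(T, min_support=0.3, min_confidence=0.6)
for rule in rules:
    print(list(rule.items), rule.support)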