# Imports used below
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold

from data_cleaner import create_dataset


def plot_accuracy(seed):
    colors = ['navy', 'darkorange']
    target_names = ['High Risk', 'Low Risk']

    def make_ellipses(gmm, ax):
        # Draw a covariance ellipse for each mixture component; each
        # covariance type stores its parameters in a different shape.
        for n, color in enumerate(colors):
            if gmm.covariance_type == 'full':
                covariances = gmm.covariances_[n][:2, :2]
            elif gmm.covariance_type == 'tied':
                covariances = gmm.covariances_[:2, :2]
            elif gmm.covariance_type == 'diag':
                covariances = np.diag(gmm.covariances_[n][:2])
            elif gmm.covariance_type == 'spherical':
                covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
            v, w = np.linalg.eigh(covariances)
            u = w[0] / np.linalg.norm(w[0])
            angle = np.arctan2(u[1], u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            v = 2. * np.sqrt(2.) * np.sqrt(v)
            ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                      angle=180 + angle, color=color)
            ell.set_clip_box(ax.bbox)
            ell.set_alpha(0.5)
            ax.add_artist(ell)
            ax.set_aspect('equal', 'datalim')

    X, y = create_dataset()

    # Split into non-overlapping training (90%) and test (10%) sets by
    # taking only the first fold of a 10-fold stratified split.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    train_index, test_index = next(iter(skf.split(X, y)))

    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    n_classes = len(np.unique(y_train))

    # Try GMMs using different types of covariances.
    estimators = {
        cov_type: GaussianMixture(n_components=n_classes,
                                  covariance_type=cov_type,
                                  max_iter=20, random_state=0)
        for cov_type in ['spherical', 'diag', 'tied', 'full']
    }
    n_estimators = len(estimators)

    plt.figure(figsize=(3 * n_estimators // 2, 6))
    plt.subplots_adjust(bottom=.05, top=0.95, hspace=.15, wspace=.05,
                        left=.02, right=.98)

    for index, (name, estimator) in enumerate(estimators.items()):
        # Since we have class labels for the training data, we can
        # initialize the GMM parameters in a supervised manner.
        estimator.means_init = np.array(
            [X_train[y_train == i].mean(axis=0) for i in range(n_classes)])

        # Train the other parameters using the EM algorithm.
        estimator.fit(X_train)

        h = plt.subplot(2, n_estimators // 2, index + 1)
        make_ellipses(estimator, h)

        for n, color in enumerate(colors):
            data = X[y == n]
            plt.scatter(data[:, 0], data[:, 1], s=2, color=color,
                        label=target_names[n])
        # Plot the test data with crosses.
        for n, color in enumerate(colors):
            data = X_test[y_test == n]
            plt.scatter(data[:, 0], data[:, 1], s=20 * 8, marker='x',
                        color=color)

        y_train_pred = estimator.predict(X_train)
        train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy,
                 transform=h.transAxes, fontsize=18)

        y_test_pred = estimator.predict(X_test)
        test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
        plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy,
                 transform=h.transAxes, fontsize=18)

        plt.title('GMM: K=2, Cv-type = {}'.format(name))

    plt.legend(scatterpoints=40, loc='lower right', prop=dict(size=25))
    plt.show()
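# The plot_gmms fragment below calls a draw_ellipse helper that is not part
# of this excerpt. A minimal sketch of what such a helper could look like,
# with the signature inferred from the call site (position, covariance,
# plus matplotlib patch keyword arguments):
def draw_ellipse(position, covariance, ax=None, **kwargs):
    # Draw an ellipse at a given position with a given covariance.
    ax = ax or plt.gca()
    if covariance.shape == (2, 2):
        # Principal axes of the covariance give the orientation and radii.
        U, s, _ = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        # Spherical/diagonal case: axis-aligned radii, no rotation.
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    # Draw contours at 1-4 standard deviations.
    for nsig in range(1, 5):
        ax.add_patch(mpl.patches.Ellipse(position, nsig * width,
                                         nsig * height, angle=angle,
                                         **kwargs))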
        draw_ellipse(mean, cov, alpha=weights * w_factor, edgecolor=color,
                     facecolor=color, linewidth=4)
    ax.set(title='Plot of the best fitting GMM')
    plt.show()


if __name__ == '__main__':
    # Create the data set.
    seed = 56
    np.random.seed(seed)
    X, y = create_dataset()

    # Select the best GMM by two-level (outer/inner) cross-validation.
    n_components_range = range(1, 3)
    cv_types = ['full']
    models = create_gmm_models(cv_types, n_components_range)
    best_gmm, score = my_cv(X, y, models, K_out=10, K_in=10, seed=seed)

    # Refit the winning model on all data and extract its parameters.
    best_gmm.fit(X)
    clf = best_gmm.predict(X)
    cent = best_gmm.means_
    covars = best_gmm.covariances_

    plot_accuracy(seed)
    plot_gmms(cv_types, n_components_range, best_gmm)


# Cluster validity
def ard(data, K):
    # Average relative density of each observation based on its K nearest
    # neighbours (Euclidean distance).
    d = lambda x, y: np.sqrt(((x - y)**2).sum())
    densities = kde(data, K)
    N = len(data)
    ards = np.zeros(N)
    for i, obs in enumerate(data):
        distances = np.zeros(N)
        for j, obs2 in enumerate(data):
            if i == j:
                continue
            distances[j] = d(obs, obs2)
        distances.sort()
        # After sorting, distances[0] is the zero self-distance, so skip it
        # and sum the distances to the K true nearest neighbours.
        ards[i] = distances[1:K + 1].sum()
    ards = densities / ards * K
    ards.sort()
    plt.bar(np.linspace(1, N, N), ards)
    plt.title("KNN average relative density")
    plt.xlabel("Observation")
    plt.ylabel("ARD")
    plt.show()


if __name__ == "__main__":
    data, target = create_dataset()
    # gdk(data)
    ard(data, 10)
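# The kde helper called in ard above is not shown in this excerpt. A
# plausible minimal version, assuming it returns the K-nearest-neighbour
# density 1 / (average distance to the K nearest neighbours):
from scipy.spatial import distance

def kde(data, K):
    # Pairwise Euclidean distances, with self-distances masked out.
    D = distance.squareform(distance.pdist(data))
    np.fill_diagonal(D, np.inf)
    D.sort(axis=1)
    # Inverse of the mean distance to the K nearest neighbours.
    return 1.0 / D[:, :K].mean(axis=1)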
from data_cleaner import create_dataset
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.pyplot import figure, title, plot, ylim, legend, show
import numpy as np
import matplotlib.pyplot as plt

data, y_true = create_dataset()

# One score per linkage type.
Rand = np.zeros((4, ))
Jaccard = np.zeros((4, ))
NMI = np.zeros((4, ))
KCluster = np.zeros((4, ))

linkage_list = ["complete", "average", "single", "ward"]
for i, link in enumerate(linkage_list):
    model = AgglomerativeClustering(n_clusters=2, affinity='euclidean',
                                    linkage=link)
    y_pred = model.fit_predict(data)
    Rand[i] = metrics.adjusted_rand_score(y_true, y_pred)
    Jaccard[i] = metrics.jaccard_score(y_true, y_pred)
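# The scipy hierarchy/distance imports above are unused in this excerpt;
# a minimal sketch of the dendrogram they suggest (the linkage method is
# chosen here purely for illustration):
Z = hierarchy.linkage(distance.pdist(data), method="ward")
plt.figure()
hierarchy.dendrogram(Z, truncate_mode="level", p=5)
plt.title("Dendrogram (ward linkage)")
plt.show()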
def plot_bic(n_components_range):
    # Load the data set (seeded for reproducibility).
    np.random.seed(56)
    X, y = create_dataset()

    # Fit a Gaussian mixture with EM for every covariance type and number
    # of components, keeping the model with the lowest BIC.
    lowest_bic = np.inf
    bic = []
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm

    bic = np.array(bic)
    color_iter = itertools.cycle(
        ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    clf = best_gmm
    bars = []

    # Plot the BIC scores.
    plt.figure(figsize=(8, 6))
    spl = plt.subplot(2, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(
            plt.bar(xpos,
                    bic[i * len(n_components_range):
                        (i + 1) * len(n_components_range)],
                    width=.2, color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('BIC score per model')
    # Mark the winning model with a star.
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + \
        .2 * np.floor(bic.argmin() / len(n_components_range))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types)

    # Plot the winner.
    splot = plt.subplot(2, 1, 2)
    Y_ = clf.predict(X)
    for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                               color_iter)):
        v, w = linalg.eigh(cov)
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

        # Plot an ellipse to show the Gaussian component.
        angle = np.arctan2(w[0][1], w[0][0])
        angle = 180. * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle,
                                  color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(.5)
        splot.add_artist(ell)

    plt.xticks(())
    plt.yticks(())
    plt.title('Selected GMM: full model, 2 components')
    plt.subplots_adjust(hspace=.35, bottom=.02)
    plt.show()
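# For reference, the score minimised above is
#     BIC = -2 * ln(L) + p * ln(N)
# where L is the maximised likelihood, p the number of free parameters and
# N the number of observations. Example invocation (the range here is
# illustrative; the main script above uses range(1, 3)):
#
#     plot_bic(range(1, 7))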
from data_cleaner import create_dataset
import numpy as np
from apyori import apriori

data, pred = create_dataset()
data = np.asarray(data)

# Binarize the attributes: an item is "present" in an observation when its
# value exceeds the attribute's mean.
X = np.zeros(data.shape)
means = np.mean(data, axis=0)
for i, obs in enumerate(data):
    X[i] = obs > means

label = ["PV", "TA", "GD", "VE", "TE", "PS", "NW", "JA"]

print(X)


def mat2transactions(X, labels=[]):
    # Convert a binary matrix to a list of transactions, optionally mapping
    # column indices to attribute labels.
    T = []
    for i in range(X.shape[0]):
        l = np.nonzero(X[i, :])[0].tolist()
        if labels:
            l = [labels[j] for j in l]
        T.append(l)
    return T
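# A minimal usage sketch for the helper above; the support and confidence
# thresholds are illustrative, not taken from the original analysis:
T = mat2transactions(X, labels=label)
rules = apriori(T, min_support=0.3, min_confidence=0.6)
for record in rules:
    for stat in record.ordered_statistics:
        print(list(stat.items_base), '->', list(stat.items_add),
              'confidence: %.2f' % stat.confidence)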