def compute_clusters(self, n_ones_clusters=1000, n_zeros_clusters=1000):
    """
    Compute cluster centers using a MiniBatch K-means algorithm.

    Also compute weights for each centroid, where the weight is equivalent
    to the number of points assigned to that centroid.
    """
    ones_kmeans = cluster.MiniBatchKMeans(n_clusters=n_ones_clusters)
    zeros_kmeans = cluster.MiniBatchKMeans(n_clusters=n_zeros_clusters)

    ones_idx = np.where(self.targets == 1)
    zeros_idx = np.where(self.targets == 0)

    normalized_training, normalized_targets, normalized_tests = self.get_normalized_production_set()

    ones_labels = ones_kmeans.fit_predict(normalized_training[ones_idx])
    zeros_labels = zeros_kmeans.fit_predict(normalized_training[zeros_idx])

    # Count how many points were assigned to each centroid.
    ones_weights = np.zeros(n_ones_clusters)
    zeros_weights = np.zeros(n_zeros_clusters)
    for label in ones_labels:
        ones_weights[label] += 1
    for label in zeros_labels:
        zeros_weights[label] += 1

    np.savetxt("%s/data/ones_cluster_centers_n%d.dat" % (self.cwd, n_ones_clusters),
               ones_kmeans.cluster_centers_)
    np.savetxt("%s/data/ones_weights_n%d.dat" % (self.cwd, n_ones_clusters),
               ones_weights)
    np.savetxt("%s/data/zeros_cluster_centers_n%d.dat" % (self.cwd, n_zeros_clusters),
               zeros_kmeans.cluster_centers_)
    np.savetxt("%s/data/zeros_weights_n%d.dat" % (self.cwd, n_zeros_clusters),
               zeros_weights)

def make_folds(X, y, target_size, method='random'):
    n_Y = y.shape[0]
    n_folds = int(n_Y / target_size) + int(target_size > n_Y)

    if method == 'random':
        fold_assignment = np.random.permutation(n_Y) % n_folds

    elif method == 'cluster':
        # Thanks scikit
        print('Clustering [sklearn.cluster] inputs')
        clusterer = skcluster.MiniBatchKMeans(n_clusters=n_folds, batch_size=1000)
        fold_assignment = clusterer.fit_predict(X)

    elif method == 'rcluster':
        print('Clustering [sklearn.cluster] inputs')
        clusters = skcluster.MiniBatchKMeans(n_clusters=n_folds, batch_size=1000,
                                             compute_labels=True).fit(X)
        Xcluster = clusters.cluster_centers_

        print('Interpolating probability')
        n_X = X.shape[0]
        assign_prob = np.zeros((n_folds, n_X))
        tris = Delaunay(Xcluster)
        base_labels = clusters.labels_
        for i in range(n_folds):
            indicator = np.zeros(n_folds)
            indicator[i] = 1.
            row = interp.LinearNDInterpolator(tris, indicator, fill_value=-1)(X)
            row[row < 0] = base_labels[row < 0] == i
            assign_prob[i] = row

        # now use these as selection probabilities
        assign_prob = np.cumsum(assign_prob, axis=0)
        rvec = np.random.random(n_X)
        fold_assignment = np.sum(rvec[np.newaxis, :] < assign_prob, axis=0)

        # verify fold assignment?
        # pl.scatter(X[:, 0], X[:, 1], c=fold_assignment)
        # pl.show()
        # exit()

    else:
        raise NameError('Unrecognised fold method:' + method)

    fold_inds = np.unique(fold_assignment)
    folds = Folds(n_folds, [], [], [])  # might contain lists in the multitask case
    where = lambda y, v: y[np.where(v)[0]]
    for f in fold_inds:
        folds.X.append(where(X, fold_assignment == f))
        folds.Y.append(where(y, fold_assignment == f))
        folds.flat_y.append(where(y, fold_assignment == f))
    return folds

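# A minimal usage sketch for make_folds() above (not from the original source).
# It assumes the module-level names the snippet relies on are in scope:
# skcluster for sklearn.cluster, Delaunay from scipy.spatial, interp for
# scipy.interpolate, and a Folds container whose fields are assumed here to be
# (n_folds, X, Y, flat_y).
import numpy as np

X_demo = np.random.rand(5000, 2)
y_demo = np.random.rand(5000, 1)
folds = make_folds(X_demo, y_demo, target_size=1000, method='cluster')
print(folds.n_folds, [fx.shape[0] for fx in folds.X])
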
def mini_cv(df):
    df1 = df[['pickup_x', 'pickup_y']].rename(columns={
        'pickup_x': 'x',
        'pickup_y': 'y'
    })
    df2 = df[['dropoff_x', 'dropoff_y']].rename(columns={
        'dropoff_x': 'x',
        'dropoff_y': 'y'
    })
    df3 = pd.concat([df1, df2])
    x = df3[['x', 'y']].as_matrix()

    nlist = list(range(3, 61))
    hyperparams = {
        'n_clusters': nlist,
        'init': ['k-means++', 'random'],
        'batch_size': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    }
    l1 = list(ParameterGrid(hyperparams))
    l2 = []
    for i in l1:
        gc.enable()
        gc.collect()
        model = cluster.MiniBatchKMeans(**i)
        y_pre = model.fit_predict(x)
        name = str(i)
        # plt.figure(figsize=(12, 12))
        # plt.title(name)
        # plt.scatter(x[:, 0], x[:, 1], c=y_pre)
        # plt.show()
        chs = metrics.calinski_harabaz_score(x, y_pre)
        l2.append((chs, i))
        print('Score for this fit is', chs)
    return max(l2)

def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)
    np.random.seed(123)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    if init == 'subspaces':
        sqrt_k = int(np.sqrt(k) + .5)
        if sqrt_k ** 2 != k:
            raise ValueError("K must be a square number if init='subspaces'")

        _, D = X.shape
        # use integer division so the column slices stay valid indices
        centroids0, _ = kmeans(X[:, :D // 2], sqrt_k, max_iter=2)
        centroids1, _ = kmeans(X[:, D // 2:], sqrt_k, max_iter=2)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D // 2] = centroids0[i]
                seeds[row, D // 2:] = centroids1[j]
    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds, max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_

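# A minimal usage sketch for kmeans() above (not part of the original source).
# It assumes the snippet's own imports (numpy as np, sklearn.cluster as cluster,
# kmc2) are available; the recursive calls used by init='subspaces' fall back to
# kmc2 seeding, and k must be a perfect square for that path.
import numpy as np

X_demo = np.random.rand(2000, 32).astype(np.float32)
centroids, labels = kmeans(X_demo, 16, max_iter=8, init='subspaces')
print(centroids.shape)  # (16, 32)
print(labels.shape)     # (2000,)
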
def compute_codebook(D, code_size, nfeatures, fold_i=None, features='sift'):
    if features == 'sift':
        features = ''  # do not change filename for basic sift
    elif features == 'dense_sift':
        features = 'dense_sift_'

    if fold_i is not None:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + "_fold_" + str(fold_i) + ".dat"
    else:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + ".dat"

    if not os.path.isfile(code_name):
        print 'Computing kmeans with ' + str(code_size) + ' centroids'
        init = time.time()
        codebook = cluster.MiniBatchKMeans(n_clusters=code_size,
                                           verbose=False,
                                           batch_size=code_size * 20,
                                           compute_labels=False,
                                           reassignment_ratio=10**-4)
        codebook.fit(D)
        cPickle.dump(codebook, open(code_name, "wb"))
        end = time.time()
        print 'Done in ' + str(end - init) + ' secs.'
    else:
        codebook = cPickle.load(open(code_name, "r"))
    return codebook

def k_means(n_clusters, samples):
    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """
    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      init='k-means++',
                                      max_iter=1000,
                                      batch_size=10000,
                                      verbose=False,
                                      compute_labels=True,
                                      max_no_improvement=100,
                                      n_init=5,
                                      reassignment_ratio=0.1)
    k_means.fit(samples)

    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32) + 1

    return labels

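# A minimal usage sketch for k_means() above (not from the original source):
# cluster hypothetical 3-D vertex coordinates into 10 parcels. Note that the
# returned labels are shifted to start at 1 rather than 0.
import numpy as np

coords = np.random.rand(5000, 3)     # hypothetical vertex coordinates
parcel_labels = k_means(10, coords)
print(np.unique(parcel_labels))      # values 1..10
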
def create_clusters_batch(self, models):
    all_purity = {'MiniBatchKMeans': [], 'AgglomerativeClustering': []}
    two_means = cluster.MiniBatchKMeans(init='k-means++',
                                        n_clusters=len(self.categories))
    average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                      affinity="cosine",
                                                      n_clusters=len(self.categories))
    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AgglomerativeClustering', average_linkage))

    for name, algorithm in clustering_algorithms:
        print(name)
        for m in models:
            self.model = m
            labels, embeddings, colors, _, cats = self.get_embeddings_and_labels()
            algorithm.fit(embeddings)
            if hasattr(algorithm, 'labels_'):
                cluster_labels = algorithm.labels_.astype(np.int)
            else:
                cluster_labels = algorithm.predict(embeddings)
            purity = self.purity_score(np.array(cats), np.array(cluster_labels))
            all_purity[name].append(purity)
            print(round(purity, 3))

    print("Average Purity for Kmeans: {} for Agg: {}".format(
        (sum(all_purity['MiniBatchKMeans']) /
         len(all_purity['MiniBatchKMeans'])),
        (sum(all_purity['AgglomerativeClustering']) /
         len(all_purity['AgglomerativeClustering']))))

def prepare_input(self):
    targets = []
    classifier = cluster.MiniBatchKMeans(n_clusters=self.n_bins_default,
                                         batch_size=BATCH_SIZE,
                                         compute_labels=False)
    for images, labels in tqdm(
            self.eval_batches(),
            total=len(self.eval_loader),
            desc="MutualInfo: quantizing input data. Stage 1"):
        images = images.flatten(start_dim=1)
        classifier.partial_fit(images, labels)
        targets.append(labels)
    targets = torch.cat(targets, dim=0)
    self.accuracy_estimator = AccuracyFromMutualInfo(
        n_classes=len(targets.unique()))
    self.quantized['target'] = targets.numpy()

    centroids_predicted = []
    for images, _ in tqdm(
            self.eval_batches(),
            total=len(self.eval_loader),
            desc="MutualInfo: quantizing input data. Stage 2"):
        images = images.flatten(start_dim=1)
        centroids_predicted.append(classifier.predict(images))
    self.quantized['input'] = np.hstack(centroids_predicted)

def minibatchkmeans(self):
    minibatch_kmeans = cluster.MiniBatchKMeans(n_clusters=self.n_clusters,
                                               init='k-means++',
                                               batch_size=50)
    minibatch_kmeans.fit(self.data)
    # print minibatch_kmeans.labels_
    # print self.labels
    return self.report(self.labels, minibatch_kmeans.labels_), minibatch_kmeans.labels_

def clusterKmeans(self, file, numClus, pca=False):
    print("Clustering...")
    x = self.loadAndPCA(file, pca)
    self.numClusters = numClus

    # Check nltk clustering with cosine distance
    clusterer = clus.MiniBatchKMeans(numClus,
                                     verbose=True,
                                     batch_size=5000,
                                     max_no_improvement=1000,
                                     compute_labels=True,
                                     reassignment_ratio=0.001)
    # clusterer = clus.KMeans(n_clusters=numClus, n_jobs=-1, verbose=1)
    scores = clusterer.fit_transform(x)
    print("Clustering done.")

    counts = Counter(clusterer.labels_)
    # Add counts
    for i in range(0, len(counts)):
        self.clusSizes.append(counts[i])
    print("Clustering output: ")
    print(self.clusSizes)

    # TODO: Check the outcome of clustering from different
    # embedding sizes and with/without PCA
    return clusterer.labels_, scores

def _load_cluster(self, cluster_file):
    from mpl_toolkits.basemap import Basemap
    from sklearn import cluster

    bm_param, km_param = np.load(cluster_file)
    self.m = Basemap(resolution='h', **bm_param)
    self.km = cluster.MiniBatchKMeans(n_clusters=km_param.shape[0])
    self.km.cluster_centers_ = km_param

def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 2),
                                        min_df=30,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
        'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
    ]
    self.linear_feature_selector = None

def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choice for feature selection
    ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering
    self.counter = text.CountVectorizer(stop_words='english',
                                        charset='utf-8',
                                        charset_error='ignore',
                                        ngram_range=(1, 1),
                                        min_df=0.001,
                                        max_df=0.05,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
        'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
    ]
    self.linear_feature_selector = None

def build_codebook(self, k):
    return cluster.MiniBatchKMeans(n_clusters=k,
                                   verbose=False,
                                   batch_size=k * 20,
                                   compute_labels=False,
                                   reassignment_ratio=10**-4,
                                   random_state=42)

def cluster(self, data) -> List[int]:
    c = cluster.MiniBatchKMeans(n_clusters=self.NumVisualWords,
                                init='k-means++',
                                init_size=self.NumVisualWords * 3,
                                max_iter=100).fit(data)
    self.WordCenters = c.cluster_centers_
    return c.labels_

def minibatch_kmeans(n_clusters: int, name: str = "minibatch_kmeans",
                     **kwargs) -> ClusterOperation:
    """Returns a ClusterOperation using the mini-batch k-means algorithm.

    Parameters
    ----------
    n_clusters : int
        number of clusters to create
    name : str
        name of this operation, default `minibatch_kmeans`
    kwargs :
        keyword arguments to pass to the sklearn.cluster.MiniBatchKMeans class

    Returns
    -------
    ClusterOperation
        Operation with MiniBatchKMeans algorithm

    Example
    -------
    >>> op = minibatch_kmeans(n_clusters=10)
    """
    model = skcluster.MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    return ClusterOperation(model=model, name=name)

def ClusterTrain(self, component=2, model='Agglomerative'):
    """Use a clustering method to divide the samples into categories
    unsupervisedly. Different models can be used:

    1. Spectral Clustering
    2. Agglomerative Clustering
    3. MiniBatch KMeans

    Parameters
    ----------
    component: int, the number of clusters to form.
    model: string, the clustering model to use ('spectral', 'Agglomerative'
        or 'MiniBatch').
    """
    print '-' * 49 + '\n' + 'Clustering\n' + '-' * 49
    clusterlist = {
        'spectral':
        cluster.SpectralClustering(n_clusters=component,
                                   eigen_solver='arpack',
                                   affinity="nearest_neighbors",
                                   random_state=0),
        'Agglomerative':
        cluster.AgglomerativeClustering(n_clusters=component,
                                        linkage='ward'),  # nice
        'MiniBatch':
        cluster.MiniBatchKMeans(n_clusters=component)
    }
    MyCluster = clusterlist[model]
    return MyCluster.fit_predict(self.Feature)

def main(args):
    print("Reading Data ...")
    ann_file = json.load(open(os.path.join(args.root, args.file_list), 'r'))
    data = []
    for _i, _a in enumerate(tqdm(ann_file['annotations'])):
        _, _, w, h = _a['bbox']
        data.append([w / 1920, h / 1920])
    data = np.array(data)

    if args.engine.startswith("sklearn"):
        if args.engine == "sklearn":
            km = cluster.KMeans(n_clusters=args.num_clusters,
                                tol=args.tol,
                                verbose=True)
        elif args.engine == "sklearn-mini":
            km = cluster.MiniBatchKMeans(n_clusters=args.num_clusters,
                                         tol=args.tol,
                                         verbose=True)
        km.fit(data)
        result = km.cluster_centers_
        # distance = km.inertia_ / data.shape[0]
        distance = avg_iou(data, result)
    else:
        result = k_means(data, args.num_clusters, args.tol)
        distance = avg_iou(data, result)

    write_anchors_to_file(result, distance, args.output)

def BoW_hardAssignment(k, D, Train_descriptors):
    # compute the codebook
    print 'Computing kmeans with ' + str(k) + ' centroids'
    init = time.time()
    codebook = cluster.MiniBatchKMeans(n_clusters=k,
                                       verbose=False,
                                       batch_size=k * 20,
                                       compute_labels=False,
                                       reassignment_ratio=10**-4,
                                       random_state=42)
    codebook.fit(D)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    # get train visual word encoding
    print 'Getting Train BoVW representation'
    init = time.time()
    visual_words = np.zeros((len(Train_descriptors), k), dtype=np.float32)
    for i in xrange(len(Train_descriptors)):
        words = codebook.predict(Train_descriptors[i])
        visual_words[i, :] = np.bincount(words, minlength=k)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    return words, visual_words, codebook

def cluster(file_list, output, n_clusters=None, max_files=None):
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from mpl_toolkits.basemap import Basemap
    import numpy as np

    if n_clusters is None:
        n_clusters = 100

    # Parse the coordinates
    parser = CoordParser()
    c = np.array([parser(l) for l in open(file_list, 'r')])

    # Create the basemap parameters
    bnd = 0
    basemap_params = dict(projection='merc',
                          llcrnrlat=np.min(c[:, 0]) - bnd,
                          urcrnrlat=np.max(c[:, 0]) + bnd,
                          llcrnrlon=np.min(c[:, 1]) - bnd,
                          urcrnrlon=np.max(c[:, 1]) + bnd)

    # Select a subset of the coordinates to cluster
    if max_files is None:
        max_files = 100000
    np.random.shuffle(c)
    c = c[:max_files]

    # Project the coordinates into x, y coordinates
    m = Basemap(**basemap_params)
    x, y = m(c[:, 1], c[:, 0])

    from sklearn import cluster
    km = cluster.MiniBatchKMeans(n_clusters=n_clusters).fit(
        np.concatenate((x[:, None], y[:, None]), axis=1))
    np.save(output, (basemap_params, km.cluster_centers_))

def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param

    if str.lower(model_name) == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MiniBatchKMeans'):
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('SpectralClustering'):
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MeanShift'):
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    if str.lower(model_name) == str.lower('DBSCAN'):
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    if str.lower(model_name) == str.lower('AffinityPropagation'):
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    if str.lower(model_name) == str.lower('Birch'):
        cluster_obj = cluster.Birch(n_clusters=input_param['n_clusters'])
    if str.lower(model_name) == str.lower('GaussianMixture'):
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)

    if str.lower(model_name) in ['affinitypropagation', 'gaussianmixture']:
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)

    newdata[model_name] = pd.DataFrame(model_result)
    return newdata

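# A minimal usage sketch for cluster_model() above (not from the original
# source). Which keys input_param needs depends on the algorithm: 'n_clusters'
# for KMeans, MiniBatchKMeans, SpectralClustering, Birch and GaussianMixture;
# 'bandwidth' for MeanShift; 'eps' for DBSCAN; 'damping' and 'preference' for
# AffinityPropagation. A plain dict is used for newdata here; the original
# caller's container type is not shown in the snippet.
import numpy as np

data_demo = np.random.rand(500, 4)   # hypothetical feature matrix
results = {}
results = cluster_model(results, data_demo, 'MiniBatchKMeans', {'n_clusters': 3})
results = cluster_model(results, data_demo, 'DBSCAN', {'eps': 0.5})
print(results['MiniBatchKMeans'].head())
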
def _initialize_parameters(self, X, random_state):
    """Initialize the model parameters.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)

    random_state : RandomState
        A random number generator instance.
    """
    n_samples, _ = X.shape

    if self.init_params == 'kmeans':
        resp = np.zeros((n_samples, self.n_components))
        label = cluster.MiniBatchKMeans(
            n_clusters=self.n_components,
            n_init=1,
            random_state=random_state).fit(X).labels_
        resp[np.arange(n_samples), label] = 1
    elif self.init_params == 'random':
        resp = random_state.rand(n_samples, self.n_components)
        resp /= resp.sum(axis=1)[:, np.newaxis]
    else:
        raise ValueError("Unimplemented initialization method '%s'"
                         % self.init_params)

    self._initialize(X, resp)

def update_data(self, attrname, old, new):
    # store the models here
    models = [
        cluster.MiniBatchKMeans(n_clusters=self.k_means_slider.value),
        cluster.DBSCAN(eps=self.DBSCAN_slider.value),
        cluster.Birch(n_clusters=self.birch_slider.value),
        cluster.MeanShift(bandwidth=self.bandwidth, bin_seeding=True)
    ]
    # AgglomerativeClustering
    assert len(models) == 4

    for model in models:
        model.fit(self.X)

    for i in range(4):
        # check each fitted model (not the loop variable left over from above)
        if hasattr(models[i], 'labels_'):
            y_pred = models[i].labels_.astype(np.int)
        else:
            y_pred = models[i].predict(self.X)
        self.colors[i] = [Spectral6[f % 6] for f in y_pred]
        self.source[i].data['colors'] = self.colors[i]

def definition_clusters(subset):
    # Important -> normalize the data set that we use
    normalized_set = preprocessing.normalize(subset, norm='l2')

    print("-------- Defining the clusters...")

    k_means = cl.KMeans(init='k-means++', n_clusters=5, n_init=100)
    two_means = cl.MiniBatchKMeans(n_clusters=5, init='k-means++')

    # estimate bandwidth for mean shift
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)
    ms = cl.MeanShift(bandwidth=bandwidth)

    # connectivity matrix for structured Ward
    # connectivity = kneighbors_graph(normalized_set, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    # connectivity = 0.5 * (connectivity + connectivity.T)
    ward = cl.AgglomerativeClustering(n_clusters=100, linkage='ward')
    average = cl.AgglomerativeClustering(n_clusters=100, linkage='average')

    # Use these for small case studies
    # n_jobs=-1 so it runs in parallel
    # spectral = cl.SpectralClustering(n_clusters=3, affinity="nearest_neighbors", n_jobs=-1, n_neighbors=3)
    # dbscan = cl.DBSCAN(eps=0.3)

    # Add them to a list
    clustering_algorithms = (('K-Means', k_means),
                             ('MeanShift', ms),
                             ('MiniBatchMeans', two_means),
                             ('AgglomerativeWard', ward),
                             ('AgglomerativeAverage', average))
    return clustering_algorithms

def plotTripCluster(data, numClusters):
    '''
    Cluster all 1.4 million trips into a set of stereotypical template trips
    (e.g. 80) and then look at the distribution of this "bag of trips" and how
    it changes over time.
    '''
    tripAttributes = np.array(data.loc[:, [
        'src lat [km]', 'src long [km]', 'dst lat [km]', 'dst long [km]',
        'duration [min]'
    ]])
    meanTripAttr = tripAttributes.mean(axis=0)
    stdTripAttr = tripAttributes.std(axis=0)
    tripAttributes = stats.zscore(tripAttributes, axis=0)

    TripKmeansModel = cluster.MiniBatchKMeans(n_clusters=numClusters,
                                              batch_size=120000,
                                              n_init=100,
                                              random_state=1)
    clusterInds = TripKmeansModel.fit_predict(tripAttributes)

    clusterTotalCounts, _ = np.histogram(clusterInds, bins=numClusters)
    sortedClusterInds = np.flipud(np.argsort(clusterTotalCounts))

    plt.figure(figsize=(12, 4))
    plt.title('Cluster Histogram of all trip')
    plt.bar(range(1, numClusters + 1), clusterTotalCounts[sortedClusterInds])
    plt.ylabel('Frequency [counts]')
    plt.xlabel('Cluster index (sorted by cluster frequency)')
    plt.xlim(0, numClusters + 1)
    plt.savefig('Figures/cluster-histogram-trip.png')

    return meanTripAttr, stdTripAttr

def select_cluster_algorithm(algorithm, no_clusters):
    if algorithm == 'SpectralClustering':
        return cluster.SpectralClustering(n_clusters=no_clusters)
    elif algorithm == 'MiniBatchKMeans':
        return cluster.MiniBatchKMeans(n_clusters=no_clusters)
    elif algorithm == 'AgglomerativeClustering':
        return cluster.AgglomerativeClustering(n_clusters=no_clusters)

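# A minimal usage sketch for select_cluster_algorithm() above (not from the
# original source), assuming the snippet's own `from sklearn import cluster`
# import is in scope. Any algorithm name other than the three handled strings
# falls through and the function returns None.
import numpy as np

X_demo = np.random.rand(300, 2)
estimator = select_cluster_algorithm('MiniBatchKMeans', no_clusters=4)
labels = estimator.fit_predict(X_demo)
print(labels[:10])
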
def test_basic(self, single_chunk_blobs):
    X, y = single_chunk_blobs
    a = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
    b = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)
    a.fit(X)
    b.partial_fit(X)
    assert_estimator_equal(a, b, exclude=['random_state_'])

def clustering_K_means(pontos):
    from sklearn import datasets
    import matplotlib.pyplot as plt
    from sklearn import cluster

    y_kmeans = []
    # print(len(pontos), len(y), type(pontos), type(y_kmeans))

    # cluster into 2 groups: one will be the keyword group and the other
    # will be the non-keyword group
    kmeans = cluster.MiniBatchKMeans(n_clusters=2, batch_size=10)
    y_kmeans = kmeans.fit_predict(pontos)

    for i in range(0, len(pontos)):
        if y_kmeans[i] == 0:
            print('\033[31m' + '0' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")
    for i in range(0, len(pontos)):
        if y_kmeans[i] == 1:
            print('\033[32m' + '1' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")

    # draw the points on the plot
    # the colours are defined by the value of y (group) and
    # the circles have an outline (edgecolor)
    plt.scatter(pontos[:, 0], pontos[:, 1], marker='o', c=y_kmeans, s=25,
                edgecolor='k')
    plt.show()

def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    sqrt_k = int(np.sqrt(k) + .5)
    if k > 256 and sqrt_k**2 == k and init == 'subspaces':
        print "kmeans: clustering in subspaces first; k, sqrt(k) =" \
            " {}, {}".format(k, sqrt_k)
        _, D = X.shape
        centroids0, _ = kmeans(X[:, :D / 2], sqrt_k, max_iter=1)
        centroids1, _ = kmeans(X[:, D / 2:], sqrt_k, max_iter=1)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D / 2] = centroids0[i]
                seeds[row, D / 2:] = centroids1[j]
    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds, max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_

def K_means(coords, hyper_params={}):
    params = {'n_clusters': 2}  # default values
    params.update(hyper_params)

    clustering_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    clustering_obj.fit(coords)
    y_pred = clustering_obj.labels_.astype(np.int)
    return y_pred

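# A minimal usage sketch for K_means() above (not from the original source),
# assuming the snippet's own imports (numpy as np, sklearn.cluster as cluster)
# are in scope and a NumPy version where np.int is still available, since the
# snippet casts the labels with it.
import numpy as np

coords_demo = np.random.rand(1000, 2)    # hypothetical 2-D coordinates
labels = K_means(coords_demo, hyper_params={'n_clusters': 5})
print(np.bincount(labels))               # cluster sizes
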