Code example #1
    def compute_clusters(self, n_ones_clusters=1000, n_zeros_clusters=1000):
        """ Compute cluster centers using a MiniBatch K-means algorithm

        Also compute weights for each centroid, where the weight is equivalent
        to the number of points assigned to that centroid

        """
        ones_kmeans = cluster.MiniBatchKMeans(n_clusters=n_ones_clusters)
        zeros_kmeans = cluster.MiniBatchKMeans(n_clusters=n_zeros_clusters)

        ones_idx = np.where(self.targets == 1)
        zeros_idx = np.where(self.targets == 0)

        normalized_training, normalized_targets, normalized_tests = self.get_normalized_production_set()

        ones_labels = ones_kmeans.fit_predict(normalized_training[ones_idx])
        zeros_labels = zeros_kmeans.fit_predict(normalized_training[zeros_idx])

        ones_weights = np.zeros(n_ones_clusters)
        zeros_weights = np.zeros(n_zeros_clusters)

        for label in ones_labels:
            ones_weights[label] += 1
        for label in zeros_labels:
            zeros_weights[label] += 1

        np.savetxt("%s/data/ones_cluster_centers_n%d.dat" % (self.cwd, n_ones_clusters), ones_kmeans.cluster_centers_)
        np.savetxt("%s/data/ones_weights_n%d.dat" % (self.cwd, n_ones_clusters), ones_weights)

        np.savetxt("%s/data/zeros_cluster_centers_n%d.dat" % (self.cwd, n_zeros_clusters), zeros_kmeans.cluster_centers_)
        np.savetxt("%s/data/zeros_weights_n%d.dat" % (self.cwd, n_zeros_clusters), zeros_weights)
Code example #2
def make_folds(X, y, target_size, method='random'):
    n_Y = y.shape[0]
    n_folds = int(n_Y / target_size) + int(target_size > n_Y)

    if method == 'random':
        fold_assignment = np.random.permutation(n_Y) % n_folds
    elif method == 'cluster':
        # Thanks scikit
        print('Clustering [sklearn.cluster] inputs')
        clusterer = skcluster.MiniBatchKMeans(n_clusters=n_folds,
                                              batch_size=1000)
        fold_assignment = clusterer.fit_predict(X)
    elif method == 'rcluster':
        print('Clustering [sklearn.cluster] inputs')
        clusters = skcluster.MiniBatchKMeans(n_clusters=n_folds,
                                             batch_size=1000,
                                             compute_labels=True).fit(X)
        Xcluster = clusters.cluster_centers_
        print('Interpolating probability')
        n_X = X.shape[0]
        assign_prob = np.zeros((n_folds, n_X))
        tris = Delaunay(Xcluster)
        base_labels = clusters.labels_
        for i in range(n_folds):
            indicator = np.zeros(n_folds)
            indicator[i] = 1.
            row = interp.LinearNDInterpolator(tris, indicator,
                                              fill_value=-1)(X)
            row[row < 0] = base_labels[row < 0] == i
            assign_prob[i] = row

        # now use these as selection probabilities
        assign_prob = np.cumsum(assign_prob, axis=0)

        rvec = np.random.random(n_X)
        fold_assignment = np.sum(rvec[np.newaxis, :] < assign_prob, axis=0)

        # verify fold assignment?
        # pl.scatter(X[:, 0], X[:, 1], c=fold_assignment)
        # pl.show()
        # exit()

    else:
        raise NameError('Unrecognised fold method:' + method)

    fold_inds = np.unique(fold_assignment)
    folds = Folds(n_folds, [], [],
                  [])  # might contain lists in the multitask case
    where = lambda y, v: y[np.where(v)[0]]
    for f in fold_inds:
        folds.X.append(where(X, fold_assignment == f))
        folds.Y.append(where(y, fold_assignment == f))
        folds.flat_y.append(where(y, fold_assignment == f))

    return folds
Code example #3
def mini_cv(df):
    df1 = df[['pickup_x', 'pickup_y']].rename(columns={
        'pickup_x': 'x',
        'pickup_y': 'y'
    })
    df2 = df[['dropoff_x', 'dropoff_y']].rename(columns={
        'dropoff_x': 'x',
        'dropoff_y': 'y'
    })
    df3 = pd.concat([df1, df2])
    x = df3[['x', 'y']].values  # .as_matrix() was removed in recent pandas
    nlist = list(range(3, 61))
    hyperparams = {
        'n_clusters': nlist,
        'init': ['k-means++', 'random'],
        'batch_size': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    }
    l1 = list(ParameterGrid(hyperparams))
    l2 = []
    for i in l1:
        gc.enable()
        gc.collect()
        model = cluster.MiniBatchKMeans(**i)
        y_pre = model.fit_predict(x)
        name = str(i)
        # plt.figure(figsize=(12,12))
        # plt.title(name)
        # plt.scatter(x[:, 0], x[:, 1], c=y_pre)
        # plt.show()
        chs = metrics.calinski_harabasz_score(x, y_pre)  # spelled calinski_harabaz_score in older scikit-learn
        l2.append((chs, i))
        print('Score for this fit is', chs)
    return max(l2)
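
A minimal standalone sketch of the scoring step on toy data; the metric lives in sklearn.metrics, and higher Calinski-Harabasz scores indicate better-separated clusters.

import numpy as np
from sklearn import cluster, metrics

x = np.random.rand(1000, 2)
model = cluster.MiniBatchKMeans(n_clusters=8, batch_size=256, n_init=3)
labels = model.fit_predict(x)
print('Score for this fit is', metrics.calinski_harabasz_score(x, labels))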
Code example #4
def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)
    np.random.seed(123)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    if init == 'subspaces':
        sqrt_k = int(np.sqrt(k) + .5)
        if sqrt_k ** 2 != k:
            raise ValueError("K must be a square number if init='subspaces'")

        _, D = X.shape
        centroids0, _ = kmeans(X[:, :D // 2], sqrt_k, max_iter=2)
        centroids1, _ = kmeans(X[:, D // 2:], sqrt_k, max_iter=2)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D // 2] = centroids0[i]
                seeds[row, D // 2:] = centroids1[j]

    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds, max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_
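
A minimal usage sketch of the wrapper above; k must be a perfect square for init='subspaces', and the recursive calls still rely on the external kmc2 package for their default seeding.

import numpy as np

X = np.random.rand(2000, 8).astype(np.float32)
centers, labels = kmeans(X, 16, init='subspaces')   # 16 = 4 ** 2
print(centers.shape, labels.shape)                  # (16, 8), (2000,)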
Code example #5
File: codebooks.py  Project: vcampmany/M3_ImageClassi
def compute_codebook(D, code_size, nfeatures, fold_i=None, features='sift'):
    if features == 'sift':
        features = ''  # do not change filename for basic sift
    elif features == 'dense_sift':
        features = 'dense_sift_'

    if fold_i is not None:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + "_fold_" + str(fold_i) + ".dat"
    else:
        code_name = "codebooks/" + str(code_size) + "_" + features + str(
            nfeatures) + ".dat"
    if not os.path.isfile(code_name):
        print 'Computing kmeans with ' + str(code_size) + ' centroids'
        init = time.time()
        codebook = cluster.MiniBatchKMeans(n_clusters=code_size,
                                           verbose=False,
                                           batch_size=code_size * 20,
                                           compute_labels=False,
                                           reassignment_ratio=10**-4)
        codebook.fit(D)
        cPickle.dump(codebook, open(code_name, "wb"))
        end = time.time()
        print 'Done in ' + str(end - init) + ' secs.'
    else:
        codebook = cPickle.load(open(code_name, "rb"))

    return codebook
Code example #6
def k_means(n_clusters, samples):
    """
    Run k-means clustering on vertex coordinates.

    Parameters:
    - - - - -
    n_clusters : int
        number of clusters to generate
    samples : array
        Euclidean-space coordinates of vertices
    """

    # Run Mini-Batch K-Means
    k_means = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      init='k-means++',
                                      max_iter=1000,
                                      batch_size=10000,
                                      verbose=False,
                                      compute_labels=True,
                                      max_no_improvement=100,
                                      n_init=5,
                                      reassignment_ratio=0.1)
    k_means.fit(samples)

    labels = k_means.labels_.copy()
    labels = labels.astype(np.int32) + 1

    return labels
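
A minimal usage sketch with synthetic coordinates; note that the returned labels are shifted to start at 1.

import numpy as np

coords = np.random.rand(5000, 3)                  # e.g. vertex x, y, z positions
labels = k_means(n_clusters=20, samples=coords)
print(labels.dtype, labels.min(), labels.max())   # int32, clusters numbered from 1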
Code example #7
    def create_clusters_batch(self, models):
        all_purity = {'MiniBatchKMeans': [], 'AgglomerativeClustering': []}

        two_means = cluster.MiniBatchKMeans(init='k-means++',
                                            n_clusters=len(self.categories))
        average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                          affinity="cosine",
                                                          n_clusters=len(
                                                              self.categories))

        clustering_algorithms = (('MiniBatchKMeans', two_means),
                                 ('AgglomerativeClustering', average_linkage))
        for name, algorithm in clustering_algorithms:
            print(name)
            for m in models:
                self.model = m
                labels, embeddings, colors, _, cats = self.get_embeddings_and_labels(
                )

                algorithm.fit(embeddings)

                if hasattr(algorithm, 'labels_'):
                    cluster_labels = algorithm.labels_.astype(int)
                else:
                    cluster_labels = algorithm.predict(embeddings)
                purity = self.purity_score(np.array(cats),
                                           np.array(cluster_labels))
                all_purity[name].append(purity)
                print(round(purity, 3))
        print("Average purity for KMeans: {} for Agg: {}".format(
            (sum(all_purity['MiniBatchKMeans']) /
             len(all_purity['MiniBatchKMeans'])),
            (sum(all_purity['AgglomerativeClustering']) /
             len(all_purity['AgglomerativeClustering']))))
Code example #8
File: mutual_info.py  Project: hronoses/EmbedderSDR
    def prepare_input(self):
        targets = []
        classifier = cluster.MiniBatchKMeans(n_clusters=self.n_bins_default,
                                             batch_size=BATCH_SIZE,
                                             compute_labels=False)
        for images, labels in tqdm(
                self.eval_batches(),
                total=len(self.eval_loader),
                desc="MutualInfo: quantizing input data. Stage 1"):
            images = images.flatten(start_dim=1)
            classifier.partial_fit(images, labels)
            targets.append(labels)
        targets = torch.cat(targets, dim=0)
        self.accuracy_estimator = AccuracyFromMutualInfo(
            n_classes=len(targets.unique()))
        self.quantized['target'] = targets.numpy()

        centroids_predicted = []
        for images, _ in tqdm(
                self.eval_batches(),
                total=len(self.eval_loader),
                desc="MutualInfo: quantizing input data. Stage 2"):
            images = images.flatten(start_dim=1)
            centroids_predicted.append(classifier.predict(images))
        self.quantized['input'] = np.hstack(centroids_predicted)
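
A minimal standalone sketch of the same two-stage idea: partial_fit consumes mini-batches to learn the centroids, then predict assigns a centroid index to each sample.

import numpy as np
from sklearn import cluster

quantizer = cluster.MiniBatchKMeans(n_clusters=16, compute_labels=False)
for _ in range(10):                       # stand-in for data-loader batches
    batch = np.random.rand(256, 64)
    quantizer.partial_fit(batch)

codes = quantizer.predict(np.random.rand(256, 64))   # quantized "input" codes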
Code example #9
File: utils.py  Project: gro-mit/fproject
    def minibatchkmeans(self):
        minibatch_kmeans = cluster.MiniBatchKMeans(n_clusters=self.n_clusters, init='k-means++', batch_size=50)
        minibatch_kmeans.fit(self.data)
        #print minibatch_kmeans.labels_
        #print self.labels

        return self.report(self.labels, minibatch_kmeans.labels_), minibatch_kmeans.labels_
Code example #10
    def clusterKmeans(self, file, numClus, pca=False):

        print("Clustering...")
        x = self.loadAndPCA(file, pca)

        self.numClusters = numClus

        # Check nltk clustering with cosine distance

        clusterer = clus.MiniBatchKMeans(numClus,
                                         verbose=True,
                                         batch_size=5000,
                                         max_no_improvement=1000,
                                         compute_labels=True,
                                         reassignment_ratio=0.001)
        #clusterer = clus.KMeans(n_clusters=numClus, n_jobs=-1, verbose=1)
        scores = clusterer.fit_transform(x)
        print("Clustering done.")

        counts = Counter(clusterer.labels_)

        # Add counts
        for i in range(0, len(counts)):
            self.clusSizes.append(counts[i])

        print("Clustering output: ")
        print(self.clusSizes)

        # TODO : Check the outcome of clustering from different
        # Embedding sizes and with/without PCA

        return clusterer.labels_, scores
Code example #11
File: server.py  Project: philkr/geoloc
 def _load_cluster(self, cluster_file):
     from mpl_toolkits.basemap import Basemap
     from sklearn import cluster
     bm_param, km_param = np.load(cluster_file)
     self.m = Basemap(resolution='h', **bm_param)
     self.km = cluster.MiniBatchKMeans(n_clusters=km_param.shape[0])
     self.km.cluster_centers_ = km_param
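
Assigning cluster_centers_ by hand as above relies on scikit-learn internals; a minimal sketch of the same "reuse saved centers without refitting" idea with an explicit nearest-centre lookup (the file name is hypothetical).

import numpy as np
from sklearn.metrics import pairwise_distances_argmin

centers = np.load('cluster_centers.npy')               # shape (n_clusters, n_features)
points = np.random.rand(100, centers.shape[1])
labels = pairwise_distances_argmin(points, centers)    # index of nearest centre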
Code example #12
 def __init__(self,
              n_clusters=50,
              pca_n_components=20,
              kmpca_n_components=3,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 2),
                                         min_df=30,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
         'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
     ]
     self.linear_feature_selector = None
Code example #13
 def __init__(self,
              n_clusters=50,
              pca_n_components=30,
              kmpca_n_components=3,
              kernel_n_components=30):
     ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choice for feature selection
     ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering
     self.counter = text.CountVectorizer(stop_words='english',
                                         charset='utf-8',
                                         charset_error='ignore',
                                         ngram_range=(1, 1),
                                         min_df=0.001,
                                         max_df=0.05,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
         'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
     ]
     self.linear_feature_selector = None
Code example #14
 def build_codebook(self, k):
     return cluster.MiniBatchKMeans(n_clusters=k,
                                    verbose=False,
                                    batch_size=k * 20,
                                    compute_labels=False,
                                    reassignment_ratio=10**-4,
                                    random_state=42)
Code example #15
File: PLSA.py  Project: pjmore/ECE471Project
 def cluster(self, data) -> List[int]:
     c = cluster.MiniBatchKMeans(n_clusters=self.NumVisualWords,
                                 init='k-means++',
                                 init_size=self.NumVisualWords * 3,
                                 max_iter=100).fit(data)
     self.WordCenters = c.cluster_centers_
     return c.labels_
Code example #16
def minibatch_kmeans(n_clusters: int,
                     name: str = "minibatch_kmeans",
                     **kwargs) -> ClusterOperation:
    """Returns ClusterOperation with mini-batchkmeans algorithm

    Parameters
    ----------
    n_clusters : int
        number of clusters to create

    name : str
        name of this operation, default `minibatch_kmeans`

    kwargs :
        keyword arguments to pass to sklearn.cluster.MiniBatchKMeans class

    Returns
    -------
    ClusterOperation
        Operation with MiniBatchKMeans algorithm

    Example
    -------
    >>> op = minibatch_kmeans(n_clusters=10)
    """
    model = skcluster.MiniBatchKMeans(n_clusters=n_clusters, **kwargs)
    return ClusterOperation(model=model, name=name)
Code example #17
 def ClusterTrain(self, component=2, model='Agglomerative'):
     """Using cluster method to divide the sample into different category
     unsupervisedly. Different model can be used.
         1. Spectral Clustering
         2. Agglomerative Clustering
         3. MiniBatch KMeans
     Parameters
     ----------
     component: int, the dimension that convert to.
     model: string, the model you select for manifold learning
     """
     print '-' * 49 + '\n' + 'Clustering\n' + '-' * 49
     clusterlist = {
         'spectral':
         cluster.SpectralClustering(n_clusters=component,
                                    eigen_solver='arpack',
                                    affinity="nearest_neighbors",
                                    random_state=0),
         'Agglomerative':
         cluster.AgglomerativeClustering(n_clusters=component,
                                         linkage='ward'),  #nice
         'MiniBatch':
         cluster.MiniBatchKMeans(n_clusters=component)
     }
     MyCluster = clusterlist[model]
     return MyCluster.fit_predict(self.Feature)
Code example #18
def main(args):
    print("Reading Data ...")

    ann_file = json.load(open(os.path.join(args.root, args.file_list), 'r'))
    data = []
    for _i, _a in enumerate(tqdm(ann_file['annotations'])):
        _, _, w, h = _a['bbox']
        data.append([w / 1920, h / 1920])

    data = np.array(data)
    if args.engine.startswith("sklearn"):
        if args.engine == "sklearn":
            km = cluster.KMeans(n_clusters=args.num_clusters,
                                tol=args.tol,
                                verbose=True)
        elif args.engine == "sklearn-mini":
            km = cluster.MiniBatchKMeans(n_clusters=args.num_clusters,
                                         tol=args.tol,
                                         verbose=True)
        km.fit(data)
        result = km.cluster_centers_
        # distance = km.inertia_ / data.shape[0]
        distance = avg_iou(data, result)
    else:
        result = k_means(data, args.num_clusters, args.tol)
        distance = avg_iou(data, result)

    write_anchors_to_file(result, distance, args.output)
Code example #19
def BoW_hardAssignment(k, D, Train_descriptors):
    #compute the codebook
    print 'Computing kmeans with ' + str(k) + ' centroids'
    init = time.time()
    codebook = cluster.MiniBatchKMeans(n_clusters=k,
                                       verbose=False,
                                       batch_size=k * 20,
                                       compute_labels=False,
                                       reassignment_ratio=10**-4,
                                       random_state=42)
    codebook.fit(D)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    # get train visual word encoding
    print 'Getting Train BoVW representation'
    init = time.time()
    visual_words = np.zeros((len(Train_descriptors), k), dtype=np.float32)
    for i in xrange(len(Train_descriptors)):
        words = codebook.predict(Train_descriptors[i])
        visual_words[i, :] = np.bincount(words, minlength=k)
    end = time.time()
    print 'Done in ' + str(end - init) + ' secs.'

    return words, visual_words, codebook
Code example #20
def cluster(file_list, output, n_clusters=None, max_files=None):
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from mpl_toolkits.basemap import Basemap
    import numpy as np

    if n_clusters is None: n_clusters = 100

    # Parse the coordinates
    parser = CoordParser()
    c = np.array([parser(l) for l in open(file_list, 'r')])

    # Create the basemap parameters
    bnd = 0
    basemap_params = dict(projection='merc',
                          llcrnrlat=np.min(c[:, 0]) - bnd,
                          urcrnrlat=np.max(c[:, 0]) + bnd,
                          llcrnrlon=np.min(c[:, 1]) - bnd,
                          urcrnrlon=np.max(c[:, 1]) + bnd)

    # Select a subset of the coordinates to cluster
    if max_files is None:
        max_files = 100000
    np.random.shuffle(c)
    c = c[:max_files]

    # Project the coordinates into x, y coordinates
    m = Basemap(**basemap_params)
    x, y = m(c[:, 1], c[:, 0])

    from sklearn import cluster
    km = cluster.MiniBatchKMeans(n_clusters=n_clusters).fit(
        np.concatenate((x[:, None], y[:, None]), axis=1))

    np.save(output, (basemap_params, km.cluster_centers_))
Code example #21
File: utils.py  Project: AgrielScience/Boosters
def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param
    if str.lower(model_name) == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MiniBatchKMeans'):
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('SpectralClustering'):
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    if str.lower(model_name) == str.lower('MeanShift'):
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    if str.lower(model_name) == str.lower('DBSCAN'):
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    if str.lower(model_name) == str.lower('AffinityPropagation'):
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    if str.lower(model_name) == str.lower('Birch'):
        cluster_obj = cluster.Birch(n_clusters=input_param['n_clusters'])
    if str.lower(model_name) == str.lower('GaussianMixture'):
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)

    if str.lower(model_name) in ['affinitypropagation', 'gaussianmixture']:
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)

    newdata[model_name] = pd.DataFrame(model_result)

    return newdata
Code example #22
    def _initialize_parameters(self, X, random_state):
        """Initialize the model parameters.

        Parameters
        ----------
        X : array-like, shape  (n_samples, n_features)

        random_state : RandomState
            A random number generator instance.
        """
        n_samples, _ = X.shape

        if self.init_params == 'kmeans':
            resp = np.zeros((n_samples, self.n_components))
            label = cluster.MiniBatchKMeans(
                n_clusters=self.n_components,
                n_init=1,
                random_state=random_state).fit(X).labels_
            resp[np.arange(n_samples), label] = 1
        elif self.init_params == 'random':
            resp = random_state.rand(n_samples, self.n_components)
            resp /= resp.sum(axis=1)[:, np.newaxis]
        else:
            raise ValueError("Unimplemented initialization method '%s'" %
                             self.init_params)

        self._initialize(X, resp)
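
A minimal standalone sketch of the 'kmeans' branch above on toy data: each sample gets responsibility 1 for the cluster MiniBatchKMeans assigns it to, and 0 elsewhere.

import numpy as np
from sklearn import cluster

X = np.random.rand(300, 5)
n_components = 4
labels = cluster.MiniBatchKMeans(n_clusters=n_components, n_init=1,
                                 random_state=0).fit(X).labels_
resp = np.zeros((X.shape[0], n_components))
resp[np.arange(X.shape[0]), labels] = 1     # one-hot responsibilities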
Code example #23
    def update_data(self, attrname, old, new):

        #store the models here
        models = [
            cluster.MiniBatchKMeans(n_clusters=self.k_means_slider.value),
            cluster.DBSCAN(eps=self.DBSCAN_slider.value),
            cluster.Birch(n_clusters=self.birch_slider.value),
            cluster.MeanShift(bandwidth=self.bandwidth, bin_seeding=True)
        ]
        #AgglomerativeClustering

        assert len(models) == 4

        for model in models:
            model.fit(self.X)

        for i in range(4):
            if hasattr(models[i], 'labels_'):
                y_pred = models[i].labels_.astype(int)
            else:
                y_pred = models[i].predict(self.X)

            self.colors[i] = [Spectral6[f % 6] for f in y_pred]

            self.source[i].data['colors'] = self.colors[i]
Code example #24
File: case1.py  Project: rbnuria/Clustering
def definition_clusters(subset):
    # Important -> normalize the data set we use
    normalized_set = preprocessing.normalize(subset, norm='l2')

    print("-------- Definiendo los clusteres...")

    k_means = cl.KMeans(init='k-means++', n_clusters=5, n_init=100)

    two_means = cl.MiniBatchKMeans(n_clusters=5, init='k-means++')

    # estimate bandwidth for mean shift
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)
    ms = cl.MeanShift(bandwidth=bandwidth)

    # connectivity matrix for structured Ward
    #connectivity = kneighbors_graph(normalized_set, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    #connectivity = 0.5 * (connectivity + connectivity.T)
    ward = cl.AgglomerativeClustering(n_clusters=100, linkage='ward')

    average = cl.AgglomerativeClustering(n_clusters=100, linkage='average')

    # Use this for small case studies
    # n_jobs = -1 so it runs in parallel
    #spectral = cl.SpectralClustering(n_clusters=3, affinity="nearest_neighbors",n_jobs=-1, n_neighbors = 3)

    #dbscan = cl.DBSCAN(eps=0.3)

    # Add them to a list
    clustering_algorithms = (('K-Means', k_means), ('MeanShift', ms),
                             ('MiniBatchMeans',
                              two_means), ('AgglomerativeWard', ward),
                             ('AgglomerativeAverage', average))

    return clustering_algorithms
Code example #25
def plotTripCluster(data, numClusters):
    '''
    Cluster all trips (~1.4 million) into numClusters stereotypical template trips,
    then look at the distribution of this "bag of trips" and how it changes over time.
    '''

    tripAttributes = np.array(data.loc[:, [
        'src lat [km]', 'src long [km]', 'dst lat [km]', 'dst long [km]',
        'duration [min]'
    ]])
    meanTripAttr = tripAttributes.mean(axis=0)
    stdTripAttr = tripAttributes.std(axis=0)
    tripAttributes = stats.zscore(tripAttributes, axis=0)

    TripKmeansModel = cluster.MiniBatchKMeans(n_clusters=numClusters,
                                              batch_size=120000,
                                              n_init=100,
                                              random_state=1)
    clusterInds = TripKmeansModel.fit_predict(tripAttributes)

    clusterTotalCounts, _ = np.histogram(clusterInds, bins=numClusters)
    sortedClusterInds = np.flipud(np.argsort(clusterTotalCounts))

    plt.figure(figsize=(12, 4))
    plt.title('Cluster histogram of all trips')
    plt.bar(range(1, numClusters + 1), clusterTotalCounts[sortedClusterInds])
    plt.ylabel('Frequency [counts]')
    plt.xlabel('Cluster index (sorted by cluster frequency)')
    plt.xlim(0, numClusters + 1)

    plt.savefig('Figures/cluster-histogram-trip.png')

    return meanTripAttr, stdTripAttr
Code example #26
def select_cluster_algorithm(algorithm, no_clusters):
    if algorithm == 'SpectralClustering':
        return cluster.SpectralClustering(n_clusters=no_clusters)
    elif algorithm == 'MiniBatchKMeans':
        return cluster.MiniBatchKMeans(n_clusters=no_clusters)
    elif algorithm == 'AgglomerativeClustering':
        return cluster.AgglomerativeClustering(n_clusters=no_clusters)
Code example #27
File: test_minibatch.py  Project: lindajoy/dask-ml
 def test_basic(self, single_chunk_blobs):
     X, y = single_chunk_blobs
     a = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
     b = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)
     a.fit(X)
     b.partial_fit(X)
     assert_estimator_equal(a, b, exclude=['random_state_'])
Code example #28
def clustering_K_means(pontos):
    from sklearn import datasets
    import matplotlib.pyplot as plt
    from sklearn import cluster

    y_kmeans = []
    #print(len(pontos),len(y),type(pontos),type(y_kmeans))

    # will cluster into 2 groups: one will be the keyword group and the other the non-keyword group
    kmeans = cluster.MiniBatchKMeans(n_clusters=2, batch_size=10)

    y_kmeans = kmeans.fit_predict(pontos)

    for i in range(0, len(pontos)):

        if y_kmeans[i] == 0:
            print('\033[31m' + '0' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")
    for i in range(0, len(pontos)):
        if y_kmeans[i] == 1:
            print('\033[32m' + '1' + '\033[0;0m', pontos[i], y_kmeans[i],
                  listPhrase[i].phrase, "\n")

    # draw the points on the plot
    # colors are set by the value of y (group) and
    # the circles get an outline (edgecolor)
    plt.scatter(pontos[:, 0],
                pontos[:, 1],
                marker='o',
                c=y_kmeans,
                s=25,
                edgecolor='k')
    plt.show()
Code example #29
File: utils.py  Project: ninickl/bolt
def kmeans(X, k, max_iter=16, init='kmc2'):
    X = X.astype(np.float32)

    # if k is huge, initialize centers with cartesian product of centroids
    # in two subspaces
    sqrt_k = int(np.sqrt(k) + .5)
    if k > 256 and sqrt_k**2 == k and init == 'subspaces':
        print "kmeans: clustering in subspaces first; k, sqrt(k) =" \
            " {}, {}".format(k, sqrt_k)
        _, D = X.shape
        centroids0, _ = kmeans(X[:, :D / 2], sqrt_k, max_iter=1)
        centroids1, _ = kmeans(X[:, D / 2:], sqrt_k, max_iter=1)
        seeds = np.empty((k, D), dtype=np.float32)
        for i in range(sqrt_k):
            for j in range(sqrt_k):
                row = i * sqrt_k + j
                seeds[row, :D / 2] = centroids0[i]
                seeds[row, D / 2:] = centroids1[j]
    elif init == 'kmc2':
        seeds = kmc2.kmc2(X, k).astype(np.float32)
    else:
        raise ValueError("init parameter must be one of {'kmc2', 'subspaces'}")

    estimator = cluster.MiniBatchKMeans(k, init=seeds,
                                        max_iter=max_iter).fit(X)
    return estimator.cluster_centers_, estimator.labels_
Code example #30
def K_means(coords, hyper_params={}):
    params = {'n_clusters': 2} # default values
    params.update(hyper_params)
    clustering_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    clustering_obj.fit(coords)
    y_pred = clustering_obj.labels_.astype(int)
    return y_pred
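
A minimal usage sketch with toy coordinates (assumes numpy and sklearn.cluster are imported where K_means is defined).

import numpy as np

coords = np.random.rand(200, 2)
labels = K_means(coords, hyper_params={'n_clusters': 3})
print(np.unique(labels))    # typically array([0, 1, 2])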