def fit(method, X, n_clusters, samples_by_cluster, max_iter):
    if X.shape[0] <= samples_by_cluster * n_clusters:
        n_clusters = int(X.shape[0] / samples_by_cluster)

    if X.shape[0] < n_clusters or n_clusters == 0:
        n_clusters = 1

    if method == "kmeans":
        model = cluster.KMeans(n_clusters=n_clusters, max_iter=max_iter)
    elif method == "kmeans":
        model = mixture.GaussianMixture(n_components=n_clusters,
                                        max_iter=max_iter)
    elif method == "bgmm":
        model = mixture.BayesianGaussianMixture(
            n_components=n_clusters, max_iter=max_iter)
    else:
        model = cluster.Birch(n_clusters=n_clusters, compute_labels=False)

    while True:
        try:
            model.fit(X)
            return model
        except Exception:
            if isinstance(model, cluster.Birch):
                # Birch failed for this cluster count; retry with one fewer.
                if n_clusters > 1:
                    n_clusters -= 1
                model = cluster.Birch(n_clusters=n_clusters,
                                      compute_labels=False)
                continue
            else:
                raise
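A minimal usage sketch for the helper above; the data shape and parameter values are illustrative, and the imports are the ones the function body assumes:

import numpy as np
from sklearn import cluster, mixture

X = np.random.rand(200, 4)  # 200 samples, 4 features (illustrative)
model = fit("kmeans", X, n_clusters=10, samples_by_cluster=5, max_iter=300)
print(model.predict(X)[:10])  # first ten cluster assignments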
Example #2
def configuraciones_birch():
    brc_1 = cl.Birch(n_clusters=5, threshold=0.1)
    brc_05 = cl.Birch(n_clusters=5, threshold=0.05)
    brc_01 = cl.Birch(n_clusters=5, threshold=0.01)

    # Add them to a list
    clustering_algorithms = (('Birch threshold=0.1', brc_1),
                             ('Birch threshold=0.05', brc_05),
                             ('Birch threshold=0.01', brc_01))

    return clustering_algorithms
Example #3
def configuraciones_birch2():
    brc_01 = cl.Birch(n_clusters=10, threshold=0.01)
    brc_05 = cl.Birch(n_clusters=10, threshold=0.05)
    brc_07 = cl.Birch(n_clusters=10, threshold=0.07)

    # Add them to a list
    clustering_algorithms = (
        ('Birch-01', brc_01),
        ('Birch-05', brc_05),
        ('Birch-07', brc_07),
    )

    return clustering_algorithms
Example #4
def definition_clusters(subset):
    # Important -> normalize the data set we use
    normalized_set = preprocessing.normalize(subset, norm='l2')

    print("-------- Definiendo los clusteres...")

    k_means = cl.KMeans(init='k-means++', n_clusters=5, n_init=100)

    # estimate bandwidth for mean shift
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)
    ms = cl.MeanShift(bandwidth=bandwidth)

    two_means = cl.MiniBatchKMeans(n_clusters=5, init='k-means++')

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(normalized_set,
                                    n_neighbors=10,
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward = cl.AgglomerativeClustering(n_clusters=5, linkage='ward',
                                      connectivity=connectivity)

    #dbscan = cl.DBSCAN(eps=0.3)

    brc = cl.Birch(n_clusters=5, threshold=0.1)

    # Add them to a list
    clustering_algorithms = (('K-Means', k_means),
                             ('MiniBatchKMeans', two_means),
                             ('MeanShift', ms),
                             ('Agglomerative', ward),
                             ('Birch', brc))

    return clustering_algorithms
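The (name, estimator) pairs returned by definition_clusters can be consumed with a loop like the following sketch; it re-normalizes the input because the function does not return normalized_set, and it assumes subset is the same numeric array passed to the function (illustrative only):

from sklearn import preprocessing

normalized = preprocessing.normalize(subset, norm='l2')
for name, algorithm in definition_clusters(subset):
    labels = algorithm.fit_predict(normalized)
    print(name, '->', len(set(labels)), 'clusters')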
Example #5
def evaluate(df, thres_list=(0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5)):

    # preprocessing
    x_train_list = preprocessing(df.T.to_dict().values())

    # modeling and embedding projection
    model_dm = algo.document_embeddings(x_train_list, vector_size=3, epochs=100)

    vectors = []
    for text_list, label_str in x_train_list:
        vector = model_dm.infer_vector(text_list)
        vectors.append(vector)

    # clustering with the average-silhouette method
    results = []
    for thres in thres_list:
        brc = cluster.Birch(
            branching_factor=50, n_clusters=None, threshold=thres, compute_labels=True,
            )
        clrs = brc.fit_predict(vectors)
        logger.warning("clrs: {0}".format(clrs))
        silhouette_avg = metrics.silhouette_score(vectors, clrs)
        logger.warning("[thres {0}] silhouette_avg: {1}".format(thres, silhouette_avg))
        results.append(
            {"score": silhouette_avg, "clrs": clrs, "thres": thres, "vectors": vectors}
            )

    return results
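Each entry in results pairs a threshold with its silhouette score, so the best setting can be picked directly; a minimal sketch, assuming results = evaluate(df) has been run:

best = max(results, key=lambda r: r["score"])
print("best threshold:", best["thres"], "silhouette:", best["score"])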
Example #6
    def birch(self, data):
        """
        Wrapper for sklearn.cluster.birch with parameters from the dynamic
        reconfigure config.

        Parameters
        ----------
        data : numpy.array
            Points in order: [[x1,y1,z1], [x2,y2,z2], ....]

        Returns
        -------
        labels : ndarray
            The cluster labels

        Notes
        -----
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html

        """

        params = {'branching_factor': self.B_branching_factor,
                  'threshold': self.B_threshold,
                  'n_clusters': None,
                  'compute_labels': True}

        return cluster.Birch(**params).fit_predict(data)
Example #7
def use_birch(mat, n_cluster):
    clusters = cls.Birch(threshold=0.0005, n_clusters=n_cluster).fit(mat)
    hist, bin_edges = np.histogram(clusters.labels_,
                                   bins=np.arange(n_cluster + 1))
    print('Birch clustering:', clusters.labels_)
    print(hist)
    return clusters.labels_
Example #8
	def Birch(self):
		"""
		Uses `sklearn's Birch <http://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html>`_
		**Defaults and var_params:** sklearn.cluster.Birch(threshold=0.5, branching_factor=50, compute_labels=True, copy=True)

		Other Parameters
		----------------
		var_params: dict
			Pass variable params through constructor as dictionary pairs. Current default parameters are listed above

		Returns
		-------
		labels: list of ints
			Solution of clustering labels for each object (updated in object.out)
		"""
		params = {}
		params['distance'] = 'euclidean' #not mutable
		params['threshold'] = 0.5
		params['branching_factor'] = 50
		params['n_clusters'] = self.K
		params['compute_labels'] = True
		params['copy'] = True
		
		if not self.K:
			raise ValueError('Birch clustering requires an argument K=<integer value>')

		params = returnParams(self.var_params, params, 'Birch')
		d = returnDistanceMatrix(self.data, params['distance'])

		solution = skc.Birch(threshold=params['threshold'], branching_factor=params['branching_factor'], n_clusters=params['n_clusters'],
			compute_labels=params['compute_labels'], copy=params['copy'])
		solution.fit(d)

		self.out = solution.labels_
		self.var_params = params
Example #9
def get_train_test_idx(X, n_interp, do_clustering=True, do_birch=True):
    if do_clustering:
        Xscaled = prep.StandardScaler().fit_transform(X)
        if do_birch:
            threshold_birch = 1.
            n_clusters = int(len(Xscaled) / n_interp)
            while True:
                try:
                    clus = cluster.Birch(threshold=threshold_birch, n_clusters=n_clusters)
                    clus.fit(Xscaled)
                    if len(np.unique(clus.labels_)) == n_clusters:
                        break
                    else:
                        threshold_birch *= 0.5
                except Exception:
                    threshold_birch *= 0.5
        else:
            n_clusters = int(len(Xscaled) / n_interp)
            clus = cluster.KMeans(n_clusters=n_clusters)
            clus.fit(Xscaled)
        clus_indices = [np.where(clus.labels_ == i)[0] for i in np.unique(clus.labels_)]
        idx_train = []
        for label in np.unique(clus.labels_):
            cluster_center = Xscaled[clus_indices[label]].mean(axis=0)
            dists = ((Xscaled - cluster_center)**2).sum(axis=1)**0.5
            idx_train.append(np.argmin(dists))

    else:
        idx_train = list(np.arange(0, len(X), n_interp))

    idx_test = set(np.arange(0, len(X))).difference(idx_train)
    idx_test = list(idx_test)
    return idx_train, idx_test
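A hedged usage sketch for get_train_test_idx, with an illustrative array; it assumes the imports the function body implies (numpy as np, sklearn.preprocessing as prep, sklearn.cluster as cluster):

import numpy as np

X = np.random.rand(100, 3)  # illustrative data
idx_train, idx_test = get_train_test_idx(X, n_interp=10)
print(len(idx_train), 'training points,', len(idx_test), 'test points')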
Example #10
def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param
    name = model_name.lower()
    if name == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    elif name == 'minibatchkmeans':
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    elif name == 'spectralclustering':
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    elif name == 'meanshift':
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    elif name == 'dbscan':
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    elif name == 'affinitypropagation':
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    elif name == 'birch':
        cluster_obj = cluster.Birch(n_clusters=params['n_clusters'])
    elif name == 'gaussianmixture':
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)
    else:
        raise ValueError('unknown model: {0}'.format(model_name))

    if name in ('affinitypropagation', 'gaussianmixture'):
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)

    newdata[model_name] = pd.DataFrame(model_result)

    return newdata
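A sketch of calling cluster_model; the frame and parameter dict are illustrative, and newdata is seeded with a copy of the data so the indices align:

import numpy as np
import pandas as pd

ds = pd.DataFrame(np.random.rand(60, 2), columns=['x', 'y'])
out = cluster_model(ds.copy(), ds, 'Birch', {'n_clusters': 3})
print(out['Birch'].value_counts())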
Example #11
    def update_data(self, attrname, old, new):

        #store the models here
        models = [
            cluster.MiniBatchKMeans(n_clusters=self.k_means_slider.value),
            cluster.DBSCAN(eps=self.DBSCAN_slider.value),
            cluster.Birch(n_clusters=self.birch_slider.value),
            cluster.MeanShift(bandwidth=self.bandwidth, bin_seeding=True)
        ]
        #AgglomerativeClustering

        assert len(models) == 4

        for model in models:
            model.fit(self.X)

        for i in range(4):
            if hasattr(models[i], 'labels_'):
                y_pred = models[i].labels_.astype(int)
            else:
                y_pred = models[i].predict(self.X)

            self.colors[i] = [Spectral6[f % 6] for f in y_pred]

            self.source[i].data['colors'] = self.colors[i]
Example #12
def definition_clusters(subset):
    # Important -> normalize the data set we use
    normalized_set = preprocessing.normalize(subset, norm='l2')

    print("-------- Definiendo los clusteres...")

    k_means = cl.KMeans(init='k-means++', n_clusters=5, n_init=100)

    # estimate bandwidth for mean shift
    bandwidth = cl.estimate_bandwidth(normalized_set, quantile=0.3)
    ms = cl.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    # Use it for small case studies
    spectral = cl.SpectralClustering(n_clusters=5, affinity="rbf")

    dbscan = cl.DBSCAN(eps=0.1)

    # We set a low threshold because fit_predict gave us a warning
    brc = cl.Birch(n_clusters=5, threshold=0.1)

    # Add them to a list
    clustering_algorithms = (('K-Means', k_means),
                             ('MeanShift', ms),
                             ('DBSCAN', dbscan),
                             ('Birch', brc),
                             ('SpectralClustering', spectral))

    return clustering_algorithms
Example #13
def main():
    path = '/home/s/Documents/Taxi/Taxi-Stops/stops.csv'
    df = pd.read_csv(path)
    df = df.sort_values('Latitude')
    lon0 = df['Longitude']
    lat0 = df['Latitude']
    time1 = [t[1:-1].split(', ') for t in list(df['Time'])]
    # print(time1)
    vehicle1 = df['Vehicle_No']
    n = len(df.index)
    lon = np.asarray(lon0[:n]).reshape(-1, 1)
    lat = np.asarray(lat0[:n]).reshape(-1, 1)
    points = np.concatenate((lat, lon), axis=1) * (6378137 / 180) * math.pi
    radius = input('Enter the radius of a cluster (recommended 20-25 meters): ')
    # clustering = cluster.OPTICS(min_samples=5, max_eps=5, metric='euclidean', xi=0.05).fit(points)
    clustering = cluster.Birch(threshold=int(radius),
                               branching_factor=500,
                               n_clusters=None,
                               compute_labels=True,
                               copy=True).fit(points)
    # clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=40).fit(points)
    col = []
    cent_lat1 = []
    cent_lon1 = []
    min_taxis = input(
        'Enter minimum number of Taxis for a location to be considered a stop: '
    )
    res = dict(collections.Counter(clustering.labels_))
    for key, value in res.items():
        cent_lat1.append(clustering.subcluster_centers_[key][0] *
                         (180 / (6378137 * math.pi)))
        cent_lon1.append(clustering.subcluster_centers_[key][1] *
                         (180 / (6378137 * math.pi)))
        col.append(len(cent_lat1))
    num_clusters = len(cent_lat1)
    df = pd.DataFrame(list(zip(cent_lon1, cent_lat1)),
                      columns=['Longitude', 'Latitude'])
    df.to_csv('centers.csv', index=False)
    print('Cluster centers are stored in "centers.csv"')
    lat1 = []
    lon1 = []
    col = []
    time = [[] for i in range(num_clusters)]
    vehicle = [[] for i in range(num_clusters)]
    for i in range(n):
        if (res[clustering.labels_[i]] > int(min_taxis)):
            lat1.append(points[i][0] * (180 / (6378137 * math.pi)))
            lon1.append(points[i][1] * (180 / (6378137 * math.pi)))
            time[clustering.labels_[i]].extend(time1[i])
            vehicle[clustering.labels_[i]].append(vehicle1[i])
            col.append(clustering.labels_[i])
    print(time)
    df = pd.DataFrame(list(zip(lon1, lat1)), columns=['Longitude', 'Latitude'])
    df.to_csv('clusters.csv', index=False)
    print('Clusters are stored in "clusters.csv"')
    fig = go.Figure(data=go.Scatter(
        x=lat1, y=lon1, mode='markers', marker=dict(color=col), text=col))
    # plotly.offline.plot(fig, filename='stops.html')
    print('Plot is stored in "stops.html".')
Example #14
def get_algorithm(algorithm_name: str, clusters: int):
    if algorithm_name == "Birch":
        return cluster.Birch(n_clusters=clusters)
    elif algorithm_name == "Spectral Clustering":
        return cluster.SpectralClustering(n_clusters=clusters)
    elif algorithm_name == 'Affinity Propagation':
        return cluster.AffinityPropagation()
    else:
        raise NotImplementedError(f'algorithm: {algorithm_name} not implemented')
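A hypothetical call, assuming sklearn's cluster module is imported as in the snippet:

import numpy as np

algo = get_algorithm('Birch', clusters=4)
labels = algo.fit_predict(np.random.rand(50, 2))  # illustrative data
print(np.bincount(labels))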
Example #15
def birch():
    data_1 = numpy.random.normal(loc=0.0, scale=0.1, size=[100, 2])
    data_2 = numpy.random.normal(loc=0.1, scale=0.1, size=[100, 2])
    data = numpy.concatenate([data_1, data_2], axis=0)
    x = [item[0] for item in data]
    y = [item[1] for item in data]
    y_pre = cluster.Birch(threshold=0.05, branching_factor=50, n_clusters=2).fit_predict(data)
    plt.scatter(x, y, c=y_pre)
    plt.show()
Example #16
def clustering(X, algorithm, n_clusters=2):

    X = np.transpose(X)

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)

    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # Generate the new colors:
    if algorithm == 'KMeans':
        model = cluster.KMeans(n_clusters=n_clusters, random_state=0)

    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=n_clusters)

    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)

    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)

    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")

    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                linkage='ward',
                                                connectivity=connectivity)

    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=n_clusters,
                                                connectivity=connectivity)

    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    return X, y_pred
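A usage sketch for clustering(); note the function transposes its input, so this passes features as rows and samples as columns (shapes illustrative):

import numpy as np

signals = np.random.rand(4, 300)  # 4 features x 300 samples
X_scaled, y_pred = clustering(signals, algorithm='Birch', n_clusters=3)
print(np.bincount(y_pred))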
Example #17
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    # select clustering algorithm
    if algorithm == 'kmeans':
        algorithm = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        algorithm = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        algorithm = cluster.OPTICS(min_samples=10,
                                   eps=10,
                                   cluster_method='dbscan',
                                   n_jobs=-1)
    elif algorithm == 'birch':
        algorithm = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        algorithm = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                               eigen_solver='arpack',
                                               affinity="nearest_neighbors",
                                               n_jobs=-1)
    elif algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # predict cluster memberships
    algorithm.fit(vectors)
    if hasattr(algorithm, 'labels_'):
        labels = algorithm.labels_.astype(int)
    else:
        labels = algorithm.predict(vectors)

    # transform categorical labels to digits
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    elif isinstance(clusters[0], (int, np.integer)):
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" %
          metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(vectors, labels))

    return labels, algorithm
Example #18
def birch_clustering(options, all_text):
	print("Running Birch Clustering...")
	X = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2, stop_words='english', use_idf=True).fit_transform(all_text)
	c = cluster.Birch(n_clusters=options.num_clusters).fit(X)	
	print("Label counts: ", Counter(c.labels_))
	
	if options.save_intermediate:
		pickle.dump(c, open(os.path.join(options.intermediate_out_directory, 'cluster_birch.pkl'), 'wb'))
		pickle.dump(X, open(os.path.join(options.intermediate_out_directory, 'cluster_tfidf.pkl'), 'wb'))

	return X, c
Example #19
def select_n_clusters(data, data_pca, n_clusters_range):
    scores = []
    for n in n_clusters_range:
        birch = cluster.Birch(n_clusters=n).fit(data_pca)
        score = get_score(data, birch)
        scores.append(score)
    for i, score_function in enumerate(['silhouette_score', 'calinski_harabaz_score']):
        plt.subplot(1, 2, i+1)
        plt.title(score_function)
        plt.plot(n_clusters_range, [item[score_function] for item in scores])
    plt.show()
Example #20
def findClusters_Birch(data):
    '''
        Cluster data using BIRCH algorithm
    '''
    # create the classifier object
    birch = cl.Birch(branching_factor=100,
                     n_clusters=4,
                     compute_labels=True,
                     copy=True)

    # fit the data
    return birch.fit(data)
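A minimal sketch of fitting and reading the labels, assuming `import sklearn.cluster as cl` per this example's alias and illustrative data:

import numpy as np

data = np.random.rand(200, 2)  # illustrative points
model = findClusters_Birch(data)
print(np.unique(model.labels_))  # at most the 4 requested cluster ids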
Example #21
    def __init__(self, conn, args, data, split_type, num_clusters):
        """Constructor for Cluster object.

        :param conn: database connection object.
        :param args: dict of arguments read from the arguments file.
        :param data: data to cluster.
        :param split_type: Split train test data randomly or by date to allow testing by specific date ranges.
        :param num_clusters: Number of clusters to create.
        :return: Cluster instance.
        """

        self.conn = conn
        self.args = args
        self.data = data
        self.split_type = split_type

        self.pca_model = None
        self.cluster_model = None
        self.algorithm = args['cluster_algorithm']

        # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
        hdbsc = hdbscan.HDBSCAN(min_cluster_size=10)
        affinity_propagation = cluster.AffinityPropagation()
        ms = cluster.MeanShift(bin_seeding=True)
        spectral = cluster.SpectralClustering(n_clusters=num_clusters,
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors",
                                              random_state=self.args['seed'])
        ward = cluster.AgglomerativeClustering(n_clusters=num_clusters,
                                               linkage='ward')
        birch = cluster.Birch(n_clusters=num_clusters)
        two_means = cluster.MiniBatchKMeans(n_clusters=num_clusters,
                                            random_state=self.args['seed'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", n_clusters=num_clusters)
        kmeans = cluster.KMeans(n_clusters=num_clusters,
                                random_state=self.args['seed'])
        dbscan = cluster.DBSCAN()

        self.clustering_algorithms = {
            'MiniBatchKMeans': two_means,
            'AffinityPropagation': affinity_propagation,
            'MeanShift': ms,
            'SpectralClustering': spectral,
            'Ward': ward,
            'AgglomerativeClustering': average_linkage,
            'DBSCAN': dbscan,
            'Birch': birch,
            'HDBSCAN': hdbsc,
            'KMeans': kmeans
        }
Example #22
    def fit(self, dataset):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            requested_n_clusters = self.configuration["NCLU"]
            engine = cluster.Birch(n_clusters=requested_n_clusters)
            self.model = engine.fit(dataset)
            fitted_n_clusters = len(self.model.subcluster_centers_)
            if fitted_n_clusters < requested_n_clusters:
                # INFO: Birch must have issued a warning
                result = fitted_n_clusters
            else:
                result = requested_n_clusters

        return result
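The branch above relies on Birch returning fewer subcluster centers than requested when the data cannot support the configured NCLU clusters; a tiny illustrative check:

import warnings
import numpy as np
from sklearn import cluster

# Three distinct points cannot form 10 clusters: Birch warns and the fitted
# model exposes fewer subcluster centers than requested.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model = cluster.Birch(n_clusters=10).fit(np.array([[0.0], [1.0], [2.0]]))
print(len(model.subcluster_centers_))  # fewer than 10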
Example #23
    def Birch(self, parameters):  # data, threshold, branching_factor
        result = {}
        default_threshold = 3
        default_branching_factor = 3
        data = np.array(parameters['data'])
        data = preprocessing.MinMaxScaler().fit_transform(data)
        if parameters.get('threshold') is not None:
            # threshold is a distance, so keep fractional values intact
            default_threshold = float(parameters['threshold'])
        if parameters.get('branching_factor') is not None:
            default_branching_factor = int(parameters['branching_factor'])
        model = skc.Birch(threshold=default_threshold,
                          branching_factor=default_branching_factor)
        clustering = model.fit(data)
        result['labels'] = clustering.labels_
        return result
Example #24
def detection_with_birch(image_set):
    """

    :param image_set: The bottleneck values of the relevant images.
    :return: Predictions vector
    """

    # The branching_factor may need fine-tuning for better results
    clf = cluster.Birch(n_clusters=2)

    clf.fit(image_set)

    predictions = clf.labels_
    predictions = normalize_predictions(predictions)

    return predictions
Example #25
def birch(data):
    '''
	for branching_factor in np.arange(50,60,10):
		print "\nBranch factor = "+str(branching_factor)
		clusterer = skcluster.Birch(branching_factor=branching_factor, n_clusters=None, threshold=0.5, compute_labels=True)
		clusterer.fit(data)
		clusterer.fit_predict(data)
		cluster_labels = clusterer.fit_predict(data)
		silhouette_avg = silhouette_score(data, cluster_labels) 
		print "Default cluster"
 		print (len(set(cluster_labels)), silhouette_avg)

		for ncluster in np.arange(3,4,1):
		'''
    maxsilh = float('-inf')
    centroid_best = []

    for ncluster in range(3, 11):
        clusterer = skcluster.Birch(n_clusters=ncluster, compute_labels=True)
        cluster_labels = clusterer.fit_predict(data)
        silhouette_avg = silhouette_score(data, cluster_labels)

        if silhouette_avg > maxsilh:
            maxsilh = silhouette_avg
            kbest = ncluster

            center_avg_hash = dict()
            center_num_hash = dict()

            for label, centers in zip(clusterer.subcluster_labels_,
                                      clusterer.subcluster_centers_):
                if label not in center_avg_hash:
                    center_avg_hash[label] = np.array(centers)
                    center_num_hash[label] = 1
                else:
                    center_avg_hash[label] += np.array(centers)
                    center_num_hash[label] += 1
            centroid_best = []
            for label, sum_center in center_avg_hash.items():
                #print label
                avg_center = sum_center / (center_num_hash[label] * 1.)
                centroid_best.append(avg_center)

    print(kbest, maxsilh)
    return np.array(centroid_best), kbest
Example #26
def birch_clustering(data, filename):
    print("Executing birch...")
    model = cluster.Birch(n_clusters=_cluster_size).fit(data.values)
    cluster_ids = model.labels_
    with open("results/birch_" + filename[4:-4] + "_model.txt", "w") as model_output:
        for gene, label in zip(data.index.values, cluster_ids):
            model_output.write(str(gene) + ": " + str(label) + "\n")
    print("Counting cluster size...")
    cluster_size = {i: 0 for i in range(_cluster_size)}
    for label in cluster_ids:
        cluster_size[label] += 1
    print("Final results...")
    for cid, csize in cluster_size.items():
        print("Size of " + str(cid) + " is:\t" + str(csize))

    joblib.dump(model, "models/birch_" + filename[4:-4] + "_model.sav")
    return
Example #27
def main():
    data_origin = read_data('iris.data')
    data_converted = convert_data(data_origin, 0)
    true_labels = data_converted.iloc[:, -1]
    data_clean = clean_data(data_converted.iloc[:, :-2])
    plot_distribution(data_clean, size=[1, 3], title='data_clean distribution')

    scaler = sk.preprocessing.StandardScaler().fit(data_clean)
    data_standard = scaler.transform(data_clean)
    plot_distribution(data_standard,
                      size=[1, 3],
                      title='data_standard distribution')

    pca_components = 2
    pca = sk.decomposition.PCA(n_components=pca_components)
    pca.fit(data_standard)
    print('The sum of explained_variance_ratio_ is: ',
          sum(pca.explained_variance_ratio_))
    data_pca = pca.transform(data_standard)
    plot_distribution(data_pca,
                      size=[1, pca_components],
                      title='data_pca distribution')
    plt.scatter(data_pca[:, 0], data_pca[:, 1])
    plt.show()

    n_clusters = 3
    dimension_show = [1, 2]
    scores = dict()
    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(data_pca)
    scores['kmeans'] = show_result(data_clean, data_pca, true_labels, kmeans,
                                   n_clusters, dimension_show)

    ap = cluster.AffinityPropagation(preference=-100).fit(data_pca)
    scores['ap'] = show_result(data_clean, data_pca, true_labels, ap,
                               max(ap.labels_) + 1, dimension_show)

    dbscan = cluster.DBSCAN(eps=0.38, min_samples=10).fit(data_pca)
    scores['dbscan'] = show_result(data_clean, data_pca, true_labels, dbscan,
                                   n_clusters, dimension_show)

    birch = cluster.Birch(n_clusters=3).fit(data_pca)
    scores['birch'] = show_result(data_clean, data_pca, true_labels, birch,
                                  n_clusters, dimension_show)

    compare_scores(scores)
    return 0
Example #28
def call_algo(name, params):
    algo = None
    if name == "dbscan" or name == "DBSCAN":
        algo = cluster.DBSCAN(eps=params['eps'],
                              min_samples=params['n_neighbors'])
    elif name == "spectral":
        algo = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    elif name == "birch" or name == "Birch":
        algo = cluster.Birch(n_clusters=params['n_clusters'])
    elif name == "gmm" or name == "GMM":
        algo = mixture.GaussianMixture(n_components=params['n_clusters'],
                                       covariance_type='full')
    else:
        print("unknown algo; exit")
        exit(1)
    return algo
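A hedged usage sketch, assuming the sklearn cluster and mixture modules are imported as the snippet implies:

import numpy as np

algo = call_algo('birch', {'n_clusters': 3})
labels = algo.fit_predict(np.random.rand(90, 2))  # illustrative data
print(np.bincount(labels))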
Example #29
    def get_algorithm(self):

        if(self.algorithmName == "kmeans"):

            cluster_alg = cluster.MiniBatchKMeans(n_clusters=int(self.parms['k']))

        elif(self.algorithmName == "mean_shift"):

            bandwidth = cluster.estimate_bandwidth(self.X, quantile=float(self.parms['quantile']))
            cluster_alg = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

        elif(self.algorithmName == "affinity_propagation"):

            cluster_alg = cluster.AffinityPropagation(damping=float(self.parms['damping']))

        elif(self.algorithmName == "birch"):

            cluster_alg = cluster.Birch(n_clusters=int(self.parms['k']))

        elif(self.algorithmName == "ward"):

            connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
            connectivity = 0.5 * (connectivity + connectivity.T)
            cluster_alg = cluster.AgglomerativeClustering(n_clusters=int(self.parms['k']), linkage='ward', connectivity=connectivity)

        elif(self.algorithmName == "spectral"):

            cluster_alg = cluster.SpectralClustering(n_clusters=int(self.parms['k']), eigen_solver='arpack', affinity="nearest_neighbors")

        elif(self.algorithmName == "dbscan"):

            cluster_alg = cluster.DBSCAN(eps=float(self.parms['eps']))

        elif(self.algorithmName == "agglomerative"):

            connectivity = kneighbors_graph(self.X, n_neighbors=int(self.parms['n_neighbors']), include_self=False)
            connectivity = 0.5 * (connectivity + connectivity.T)

            cluster_alg = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=int(self.parms['k']), connectivity=connectivity)

        else:
            return None

        return cluster_alg
Example #30
def update_data(attrname, old, new):

    # Get the drop down values
    algorithm = dropdown.value
    global X

    # Generate the new colors:
    if algorithm == 'MiniBatchKMeans':
        model = cluster.MiniBatchKMeans(n_clusters=2)
    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)
    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=2,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")
    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=2,
                                                linkage='ward',
                                                connectivity=connectivity)
    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=2,
                                                connectivity=connectivity)
    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=2)
    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)
    else:
        print('No Algorithm selected')
        return
    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    colors = [Spectral6[i] for i in y_pred]

    source.data['colors'] = colors
    plot.title = algorithm