Example #1
def map_clusters(n_list, n_clusters):
    # define the model
    model = Birch(threshold=0.01, n_clusters=n_clusters)
    # fit the model
    model.fit(n_list)
    # assign a cluster to each example
    yhat = model.predict(n_list)
    # retrieve unique clusters
    clusters = unique(yhat)
    dic = {}
    # collect the row indexes of the samples in each cluster
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        dic[cluster] = row_ix[0]
    return dic
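For instance, assuming numpy (as np and its unique/where) and sklearn's Birch are imported at module level as the snippet expects, a minimal call could look like this (hypothetical data; the cluster numbering is arbitrary):

points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
print(map_clusters(points, n_clusters=2))  # e.g. {0: array([0, 1]), 1: array([2])}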
    def _runAlgorithm(self):
        birch = Birch(branching_factor=50,
                      n_clusters=self.params['birch'],
                      threshold=0.5)
        birch.fit(self.m_data)
        self.m_resultLabels = birch.labels_
Example #3
def birch_clustering(principal_components, principal_df, number_of_clusters):
    final_df = principal_df.copy()  # concat of a single frame is just a copy
    model = Birch(threshold=0.01, n_clusters=number_of_clusters)
    # fit the model
    model.fit(principal_components)
    # assign a cluster to each example
    yhat = model.predict(principal_components)
    # retrieve unique clusters
    clusters = unique(yhat)
    final_df['Segment'] = model.labels_
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples
        plt.scatter(principal_components[row_ix, 0],
                    principal_components[row_ix, 1],
                    s=75)
    final_df.rename({
        0: 'PC1',
        1: 'PC2',
        2: 'PC3',
        'y': 'Race'
    },
                    axis=1,
                    inplace=True)
    plt.title("BIRCH Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=yhat,
                    n_clusters=len(clusters))
    return final_df
Example #4
    def birch_enrich(self, input_clustering, numclusters=10, threshold=1.7):
        """Enrich the training set with the BIRCH clustering algorithm.
        BIRCH (balanced iterative reducing and clustering using hierarchies) is an unsupervised data mining algorithm
        used to perform hierarchical clustering over particularly large data sets. An advantage of BIRCH is its ability
        to incrementally and dynamically cluster incoming multi-dimensional metric data points in an attempt to produce
        the best-quality clustering for a given set of resources (memory and time constraints). In most cases, BIRCH
        only requires a single scan of the database.
        :param numclusters: Number of clusters
        :type numclusters: int
        :param threshold: The radius of the subcluster obtained by merging a new sample and the closest subcluster
        must be less than this threshold.
        :type threshold: float
        """
        self.X = self.X.astype(float)
        birch = Birch(threshold=threshold, n_clusters=numclusters)
        birch.fit(input_clustering)
        labels = birch.labels_
        cluster_centers = birch.subcluster_centers_
        n_features = len(self.vocabulary)

        total = 0
        for x in range(len(self.X)):
            total = total + np.count_nonzero(self.X[x])
        mean_n_features_in_docs = total / len(self.X)

        for x in range(len(self.X)):
            # gamma scales the update by document length: shorter documents get a stronger pull
            gamma = mean_n_features_in_docs / np.count_nonzero(self.X[x])
            x_label = labels[x]
            center_vector = cluster_centers[x_label]
            for i in range(n_features):
                self.X[x][i] = self.X[x][i] + gamma * center_vector[i]
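The per-feature inner loop above can be vectorized. Here is a minimal standalone sketch of the same enrichment step; the names (X, rng) are illustrative stand-ins for the class attributes, not part of any library API, and n_clusters=None is used so that labels index subcluster_centers_ directly:

import numpy as np
from sklearn.cluster import Birch

rng = np.random.default_rng(0)
X = rng.random((20, 5))

# n_clusters=None keeps the raw subclusters, so each label is a valid row of subcluster_centers_
birch = Birch(threshold=0.25, n_clusters=None)
labels = birch.fit_predict(X)
centers = birch.subcluster_centers_

# gamma per row: mean non-zero count over this row's non-zero count
nonzero = np.count_nonzero(X, axis=1)
gamma = nonzero.mean() / nonzero

# add each row's cluster center, scaled by gamma, in one vectorized step
X += gamma[:, None] * centers[labels]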
def clusteringReminMost(window):
    brc = Birch(branching_factor=50,
                n_clusters=3,
                threshold=0.5,
                compute_labels=True)
    brc.fit(window)
    Class = brc.predict(window)
    #Tally each cluster, find the most populous one, and keep its rows to reinforce the historical data
    num0 = 0
    num1 = 0
    num2 = 0

    for i in Class:
        if i == 0:
            num0 += 1
        elif i == 1:
            num1 += 1
        else:
            num2 += 1
    label = chooseMax(num0, num1, num2)
    newwindow = window[0:1]
    for i in range(1, len(Class)):
        if Class[i] == label:  # row i belongs to the target cluster, so keep it
            # DataFrame.append was removed from pandas; concat the matching row instead
            newwindow = pd.concat([newwindow, window[i:i + 1]])  # all pandas DataFrames
    return newwindow
Example #6
def birchclustering(datalist):
    brc = Birch(branching_factor=50,
                n_clusters=None,
                threshold=0.17,
                compute_labels=True)
    brc.fit(datalist)
    return brc
def BirchModel(data, actualLabels):
    pca = PCA(n_components=2).fit(data)
    pca_2d = pca.transform(data)
    birch_model = Birch(threshold=0.1, n_clusters=10)
    t0 = time()
    birch_model.fit(pca_2d)
    labels = birch_model.labels_
    centroids = birch_model.subcluster_centers_
    n_clusters = np.unique(labels).size
    print('% 9s' % 'init'
          '    time   homo   compl  v-meas     ARI AMI  silhouette')
    print(
        '% 9s   %.2fs   %.3f   %.3f   %.3f   %.3f   %.3f   %.3f' %
        ('Birch Model', (time() - t0),
         metrics.homogeneity_score(actualLabels, birch_model.labels_),
         metrics.completeness_score(actualLabels, birch_model.labels_),
         metrics.v_measure_score(actualLabels, birch_model.labels_),
         metrics.adjusted_rand_score(actualLabels, birch_model.labels_),
         metrics.adjusted_mutual_info_score(actualLabels, birch_model.labels_),
         metrics.silhouette_score(
             data, birch_model.labels_, metric='euclidean',
             sample_size=10000)))

    scatter = plt.scatter(pca_2d[:, 0], pca_2d[:, 1], c=labels, marker='*')
    plt.plot(centroids[:, 0],
             centroids[:, 1],
             'X',
             markeredgecolor='k',
             markersize=3)
    plt.colorbar(scatter)

    plt.title('Birch Model Clustering')
    plt.show()
    def scan_callback(self, msg):
        pose = self.pose.copy()
        bearings = self.bearings.copy()

        ranges = np.array(msg.ranges)
        inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
        ranges = np.nan_to_num(ranges) * inf_flag

        euc_coord_x = pose[0] + np.cos(bearings - pose[2]) * ranges
        euc_coord_y = pose[1] + np.sin(bearings - pose[2]) * ranges
        dist_flag = np.where( (euc_coord_x-pose[0])**2 + \
                        (euc_coord_y-pose[1])**2 != 0.0)[0]
        points = np.array([euc_coord_x, euc_coord_y]).T
        points = points[dist_flag]

        self.obsv = []
        if len(points) > 0:
            brc = Birch(n_clusters=None, threshold=0.05)
            brc.fit(points)
            labels = brc.predict(points)
            u_labels = np.unique(labels)
            for l in u_labels:
                seg_idx = np.where(labels == l)
                seg = points[seg_idx]
                if seg.shape[0] <= 1:
                    fit_cov = 10
                else:
                    fit_cov = np.trace(np.cov(seg.T))
                if fit_cov < 0.001 and seg.shape[0] >= 3:
                    self.obsv.append(seg.mean(axis=0))
            print(self.obsv)
Example #9
def birch_ad_with_smoothing(latency_df, threshold):
    # anomaly detection on the response time of service invocations
    # input: response times of service invocations, threshold for BIRCH clustering
    # output: list of anomalous service invocations

    anomalies = []
    for svc, latency in latency_df.items():
        # no anomaly detection on the timestamp column, unnamed columns, or db/queue services
        if svc != 'timestamp' and 'Unnamed' not in svc and 'rabbitmq' not in svc and 'db' not in svc:
            latency = latency.rolling(window=smoothing_window, min_periods=1).mean()
            x = np.array(latency)
            x = np.where(np.isnan(x), 0, x)
            normalized_x = preprocessing.normalize([x])

            X = normalized_x.reshape(-1, 1)

            brc = Birch(branching_factor=50, n_clusters=None, threshold=threshold, compute_labels=True)
            brc.fit(X)
            brc.predict(X)

            labels = brc.labels_
            n_clusters = np.unique(labels).size
            # a single cluster means a flat latency profile; more than one flags an anomaly
            if n_clusters > 1:
                anomalies.append(svc)
    return anomalies
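A toy invocation of the function above; note that smoothing_window is read as a module-level global, and the service column names here are purely illustrative:

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import Birch

smoothing_window = 12  # assumed global used by the rolling mean

latency_df = pd.DataFrame({
    'timestamp': range(100),
    'front-end_orders': np.r_[np.ones(90), 10.0 * np.ones(10)],  # latency spike at the end
    'orders_orders-db': np.ones(100),                            # skipped: name contains 'db'
})
print(birch_ad_with_smoothing(latency_df, threshold=0.05))
# expected to print ['front-end_orders']: the spike splits into a second BIRCH cluster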
class BirchColorExtractor:
    def __init__(self,
                 n_colors=None,
                 threshold=0.5,
                 branching_factor=50,
                 compute_labels=True,
                 copy=True):
        self.birch = Birch(n_clusters=n_colors,
                           threshold=threshold,
                           branching_factor=branching_factor,
                           compute_labels=compute_labels,
                           copy=copy)

    def extract(self, img):
        img_array = np.array(img, dtype=np.float64) / 255

        # Load Image and transform to a 2D numpy array.
        w, h, d = tuple(img_array.shape)
        assert d == 3
        image_array = np.reshape(img_array, (w * h, d))

        print("Fitting model on a small sub-sample of the data")
        # manually fit on batches
        self.birch.fit(image_array)

        # Get labels for all points
        print("Predicting color indices on the full image (birch)")
        labels = self.birch.labels_

        main_color_array = 255 * self.birch.subcluster_centers_
        return [
            dict(color=dict(r=color[0], g=color[1], b=color[2]),
                 count=labels[labels == i].shape[0])
            for i, color in enumerate(main_color_array)
        ]
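Hypothetical usage of the extractor with a Pillow image ('photo.jpg' is a placeholder path; the class defaults to n_colors=None, which keeps one entry per BIRCH subcluster):

from PIL import Image

img = Image.open('photo.jpg').convert('RGB')
extractor = BirchColorExtractor(threshold=0.1)
palette = extractor.extract(img)
for entry in sorted(palette, key=lambda e: -e['count'])[:5]:
    print(entry['color'], entry['count'])  # the five most common colors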
Example #11
def birch_clusters(textdata,
                   trained_doc2vec,
                   n_clusters,
                   start_alpha=0.025,
                   infer_epoch=100,
                   branching_factor=10,
                   threshold=0.01,
                   compute_labels=True,
                   metric='cosine',
                   **kwargs):
    infer_list = []

    for doc in textdata:
        infer_list.append(
            trained_doc2vec.infer_vector(doc,
                                         alpha=start_alpha,
                                         steps=infer_epoch,
                                         **kwargs))
    brc = Birch(branching_factor=branching_factor,
                n_clusters=int(n_clusters),
                threshold=threshold,
                compute_labels=compute_labels)

    brc.fit(infer_list)
    clusters = brc.predict(infer_list)
    birch_labels = brc.labels_

    silhouette_score = metrics.silhouette_score(infer_list,
                                                birch_labels,
                                                metric=metric)

    return silhouette_score, clusters
Example #12
def cluster_latlon(n_clusters, data):
    #split the data between "around NYC" and "other locations", basically our first two clusters
    data_c = data[(data.longitude > -74.05) & (data.longitude < -73.75) &
                  (data.latitude > 40.4) & (data.latitude < 40.9)]
    data_e = data[~((data.longitude > -74.05) & (data.longitude < -73.75) &
                    (data.latitude > 40.4) & (data.latitude < 40.9))]
    #put it in matrix form (DataFrame.as_matrix was removed from pandas)
    coords = data_c[['latitude', 'longitude']].to_numpy()

    brc = Birch(branching_factor=100,
                n_clusters=n_clusters,
                threshold=0.01,
                compute_labels=True)

    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["cluster_" + str(n_clusters)] = clusters
    data_e["cluster_" + str(
        n_clusters)] = -1  #assign cluster label -1 for the non NYC listings
    data = pd.concat([data_c, data_e])
    plt.scatter(data_c["longitude"],
                data_c["latitude"],
                c=data_c["cluster_" + str(n_clusters)],
                s=10,
                linewidth=0.1)
    plt.title(str(n_clusters) + " Neighbourhoods from clustering")
    plt.show()
    return data
Example #13
def birch_skm_part1_helper(data, m, k, delta):
    """
    The function receive data and calculates k centers using the birch function in sklearn, and their quantile radius
    :param data: numpy array
    :param m: Size of the data
    :param k: Number of centers.
    :param delta: int
    :return: tuple of two numpy array. (k_medoids, k_distances).
    """
    birch_instance = Birch(n_clusters=k, threshold=0.1)  # birch instance
    birch_instance.fit(data)  # Run birch on the data
    labels = birch_instance.predict(data) # calculate the cluster number for each point
    l_medoids = []
    # since birch does not return centers, I have to calculate them
    for label in range(np.unique(labels).size):
        # calculate the center of each cluster
        cluster = data[labels == label]
        kmedoids_instance_for_birch = kmedoids(cluster.tolist(), init_centers(cluster, 1))
        kmedoids_instance_for_birch.process()
        l_medoids.append(cluster[kmedoids_instance_for_birch.get_medoids()][0])
    l_medoids = np.array(l_medoids)
    q = calc_q(m, delta)  # calculate q
    # calculate the distance to the quantile points around each center
    l_distances = calc_quantile_radius_around_centers(data, l_medoids, q, k)
    return l_medoids, l_distances
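For reference, the medoid that pyclustering computes for each cluster above is just the member minimizing the total distance to the rest. A minimal numpy equivalent (illustrative, not the code used above):

import numpy as np

def medoid(cluster: np.ndarray) -> np.ndarray:
    # pairwise Euclidean distances, then the row with the smallest total
    d = np.linalg.norm(cluster[:, None, :] - cluster[None, :, :], axis=-1)
    return cluster[d.sum(axis=1).argmin()]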
Example #14
    def birch(self, x, threshold=0.01):
        """Fit a Birch model on x using the given threshold and self.max_clusters clusters."""
        model = Birch(threshold=threshold, n_clusters=self.max_clusters)
        model.fit(x)
        return model
Example #15
def train(feature, weights, cluster_num, feature_path=None, down=0.006, up=0.0085, bf_index=2):
    if feature_path is not None:
        feature = pd.read_csv(feature_path)
    X = []
    print("Training...\n")
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            f_w = combine(feature.iloc[i][1:], weights)
            X.append(f_w)
    clf = Birch(n_clusters=cluster_num)
    clf.fit(X)
    pred = []
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            p = clf.predict([combine(f, weights)])
            pred.append(p[0])
        elif key < down:
            pred.append(cluster_num)
        else:  # down <= key <= up
            pred.append(cluster_num + 1)
    joblib.dump(clf, 'curve_model_Birch.pkl')
    print(pred)
    return pred
def add_cluster_column(train_df, test_df, n_clusters):
    train_df['source'] = 'train'
    test_df['source'] = 'test'

    total_rows = train_df.shape[0] + test_df.shape[0]

    data = pd.concat([train_df, test_df])

    #split the data between "around NYC" and "other locations"
    data_c = data[(data.longitude > -74.05) & (data.longitude < -73.75) &
                  (data.latitude > 40.4) & (data.latitude < 40.9)]
    data_e = data[~((data.longitude > -74.05) & (data.longitude < -73.75) &
                    (data.latitude > 40.4) & (data.latitude < 40.9))]
    #put it in matrix form (DataFrame.as_matrix was removed from pandas)
    coords = data_c[['latitude', 'longitude']].to_numpy()

    brc = Birch(branching_factor=100,
                n_clusters=n_clusters,
                threshold=0.01,
                compute_labels=True)

    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["num_cluster_" + str(n_clusters)] = clusters
    data_e["num_cluster_" + str(
        n_clusters)] = -1  #assign cluster label -1 for the non NYC listings
    data = pd.concat([data_c, data_e])

    print('lost: {}'.format(total_rows -
                            data[data['source'] == 'train'].shape[0] -
                            data[data['source'] == 'test'].shape[0]))
    return data[data['source'] == 'train'], data[data['source'] == 'test']
def get_clustered_data(data_matrix,
                       clustering_algorithm=model_constants.KMEANS,
                       distance_metric='euclidean',
                       num_clusters=3):
    if clustering_algorithm.lower() == model_constants.AFFINITY_PROP:
        aff_prop = AffinityPropagation(affinity=distance_metric)
        aff_prop.fit(data_matrix)
        return aff_prop.labels_, aff_prop
    elif clustering_algorithm.lower() == model_constants.DBSCAN:
        dbscan = DBSCAN(metric=distance_metric)
        dbscan.fit(data_matrix)
        return dbscan.labels_, dbscan
    elif clustering_algorithm.lower() == model_constants.OPTICS:
        optics = OPTICS(metric=distance_metric)
        optics.fit(data_matrix)
        return optics.labels_, optics
    elif clustering_algorithm.lower() == model_constants.MEANSHIFT:
        mean_shift = MeanShift()
        mean_shift.fit(data_matrix)
        return mean_shift.labels_, mean_shift
    elif clustering_algorithm.lower() == model_constants.BIRCH:
        birch = Birch(n_clusters=num_clusters)
        birch.fit(data_matrix)
        return birch.labels_, birch
    elif clustering_algorithm.lower() == model_constants.AGGLOMERATIVE:
        agglomerative = AgglomerativeClustering(n_clusters=num_clusters,
                                                affinity=distance_metric)
        agglomerative.fit(data_matrix)
        return agglomerative.labels_, agglomerative
    else:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(data_matrix)
        return kmeans.labels_, kmeans
Example #18
def birchcluster(X):
    brc = Birch()
    brc.fit(X)
    # Plot result
    labels = brc.labels_
    centroids = brc.subcluster_centers_
    n_clusters = np.unique(labels).size
    print("n_clusters : %d" % n_clusters)
    return labels
def birch_algo(X, threshold=1.7, clustering=None):
    birch = Birch(threshold=threshold, n_clusters=clustering)
    t = time()
    birch.fit(X)
    time_ = time() - t
    labels = birch.labels_
    centroids = birch.subcluster_centers_
    n_clusters = np.unique(labels).size
    print(" The number of clusters is : %d" % n_clusters)
def birch_algo(X, threshold=1.7, clustering=None):
    birch = Birch(threshold=threshold, n_clusters=clustering)
    birch.fit(X)
    labels = birch.labels_
    centroids = birch.subcluster_centers_
    labels_unique = np.unique(labels)
    n_clusters = labels_unique.size
    print(" The number of clusters is : %d" % n_clusters)
    return labels, centroids, n_clusters
Example #22
def birch(data, threshold, branching_factor):
    db = Birch(threshold=threshold, branching_factor=branching_factor)
    # fit_predict fits the model and returns the labels in one call
    pred = db.fit_predict(data)
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
Example #23
    def cluster_birch(self):
        print("Starting Birch clustering")
        brc = Birch(branching_factor=10,
                    n_clusters=40,
                    threshold=self.cluster_distance,
                    compute_labels=False)
        brc.fit(self.all_frames_xy)
        clusters = brc.predict(self.all_frames_xy)
        return clusters
def test_feature_names_out():
    """Check `get_feature_names_out` for `Birch`."""
    X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    brc = Birch(n_clusters=4)
    brc.fit(X)
    n_clusters = brc.subcluster_centers_.shape[0]

    names_out = brc.get_feature_names_out()
    assert_array_equal([f"birch{i}" for i in range(n_clusters)], names_out)
Example #26
def test_n_samples_leaves_roots():
    # Sanity check for the number of samples in leaves and roots
    X, y = make_blobs(n_samples=10)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
                            for sc in leaf.subclusters_])
    assert n_samples_leaves == X.shape[0]
    assert n_samples_root == X.shape[0]
def test_threshold():
    # Test that the radius of each leaf subcluster stays below the threshold
    X, y = make_blobs(n_samples=80, centers=4)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.0)
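check_threshold is a helper from sklearn's test suite. A sketch of what it does, walking the linked list of leaf nodes (these are internal Birch attributes, so treat this as illustrative):

def check_threshold(birch_instance, threshold):
    # traverse the leaf linked list and check every subcluster's radius
    current_leaf = birch_instance.dummy_leaf_.next_leaf_
    while current_leaf:
        for sc in current_leaf.subclusters_:
            assert sc.radius <= threshold
        current_leaf = current_leaf.next_leaf_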
Example #28
def update_k_clusters(attrname, old, new):
    k_cluster = int(k_slider.value)
    brc = Birch(branching_factor=50,
                n_clusters=k_cluster,
                threshold=0.5,
                compute_labels=True)
    brc.fit(tweet_vecs)
    predictions = brc.predict(tweet_vecs)
    colors = get_colors(predictions)
    brc_data.data = dict(colors=colors, x=tsne_vecs[:, 0], y=tsne_vecs[:, 1])
Example #29
def BIRCH2_duplicate_removal(dataframe, threshold=0.8):
    # Note this method now takes a dataframe as input

    if len(dataframe) < 2:
        # nothing to do
        return dataframe

    Crater_data = dataframe
    # extract axes
    x = Crater_data[0].values.tolist()
    y = Crater_data[1].values.tolist()
    r = Crater_data[2].values.tolist()
    p = Crater_data[3].values.tolist()
    Points = []

    X = np.column_stack((x, y))
    brc = Birch(branching_factor=50,
                n_clusters=int(threshold * len(x)),
                threshold=0.5,
                compute_labels=True)
    brc.fit(X)
    groups_pred = brc.predict(X)

    for c in set(groups_pred):
        idx = [i for i, e in enumerate(groups_pred) if e == c]

        Group_x = []
        Group_y = []
        Group_r = []
        Group_p = []
        index = []

        for i in idx:
            if i in range(0, len(x)):
                Group_x.append(x[i])
                Group_y.append(y[i])
                Group_r.append(r[i])
                Group_p.append(p[i])
                index.append(i)

        # after group is defined, extract its elements from list
        Points.append([Group_x, Group_y, Group_r, Group_p])

    # now reduce groups
    center_size = []
    for i, (Xs, Ys, Rr, Ps) in enumerate(Points):
        # we take the point with best prediction confidence
        best_index = np.argmax(Ps)
        x_center = Xs[best_index]
        y_center = Ys[best_index]
        radius = Rr[best_index]
        prob = Ps[best_index]
        center_size += [[x_center, y_center, radius, prob]]

    return pd.DataFrame(center_size)
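A toy run of the duplicate removal above, assuming columns 0-3 hold x, y, radius, and detection confidence:

import pandas as pd

craters = pd.DataFrame([[10.0, 10.0, 3.0, 0.9],
                        [10.2, 10.1, 3.1, 0.7],   # near-duplicate of the first crater
                        [50.0, 50.0, 5.0, 0.8]])
print(BIRCH2_duplicate_removal(craters, threshold=0.8))
# two rows survive: the highest-confidence member of each spatial group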
def compute_clusters(data: List) -> np.ndarray:
    print("--->Computing clusters")
    birch = Birch(branching_factor=50,
                  n_clusters=5,
                  threshold=0.3,
                  copy=True,
                  compute_labels=True)

    birch.fit(data)
    predictions = np.array(birch.predict(data))
    return predictions
class BirchSklearn(AbstractClusteringAlgorithm):
    def __init__(self, **kwargs):
        from sklearn.cluster import Birch
        self.model = Birch(**kwargs)

    def fit(self, x: [np.ndarray]):
        self.model.fit(x)

    @property
    def labels_(self):
        return self.model.labels_
Example #32
def skitleanBirch():
    data = pd.read_csv("soy_rock.csv", header=None)
    X = data.values.tolist()
    randomm = randint(5, 20)

    brc = Birch(branching_factor=randomm,
                n_clusters=4,
                threshold=0.1,
                compute_labels=True)
    brc.fit(X)
    pred = brc.predict(X)
    return pred
def birch(x, n_clusters=None, threshold=0.5, branching_factor=5):
  birch_model = Birch(
    threshold=threshold, 
    n_clusters=n_clusters, 
    branching_factor=branching_factor
  )
  birch_model.fit(x)

  centroids = birch_model.subcluster_centers_
  c = birch_model.labels_
  k = len(centroids)

  return birch_model, (centroids, c, k)
def main():
    #remove sub folders
    removeSubFolders(path+algorithm+'\\')
    
    for file in os.listdir(path):
        if file.endswith("-d.txt"):
            text_file = open(path+file,'r')
            
            ar = (text_file.readline().split(' '))
            ar.remove('\n')
            if(len(ar)>0):
                #print map(int,ar)
                row = map(int,ar);
                data.append(row)
                fileNames.append(file)
            #print(row)

    #create np array

    npData = np.array(data)
    n_samples, n_features = npData.shape
    brc = Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5,compute_labels=True)
    #kmeans = KMeans(init='random', n_clusters=n_digits, n_init=500)
    brc.fit(npData)
    list1 = brc.labels_
    list2 = fileNames
    print(brc.labels_)
    print(fileNames)

    list1, list2 = zip(*sorted(zip(list1, list2)))

    print(list1)
    print(list2)
    '''
    k=0
    lim = len(list1)-1
    for i in range(0,n_digits):
        
        while(list1[k]==i):
            # want to copy these into folders
            copychar(list1[k],list2[k])
            print(list1[k], list2[k])
            k+=1
            if k==lim:
                break
    '''
    for i in range(0, len(list1)):
        print(list1[i], list2[i])
        copychar(list1[i], list2[i])
def runBrich(K_cluster, cluster_input):
    # clustering by topic-probability vector of each category
    t0 = time()
    bri = Birch(n_clusters=K_cluster)
    bri.fit(cluster_input)
    print("done in %0.3fs" % (time() - t0))

    with open('result/brich_cluster_' + str(K_cluster) + '.txt', 'w') as f:
        f.write("cluster_centers\n")
        f.write(str(bri.subcluster_centers_))
        f.write("\n==========\n")
        f.write("labels (sequence of cluster # which input belongs to )\n")
        f.write(str(bri.labels_))
        f.write("\n==========\n")
        f.write("inertia\n")
        f.write(str(bri.subcluster_labels_))
        f.write("\n==========\n")

    return bri.labels_
    def split_birch(self, branching_factor, threshold):

        # Extract dataset from files
        dataset = [f.dataset for f in self.files]

        # Initialize classifier
        classifier = Birch(branching_factor=branching_factor, n_clusters=None, threshold=threshold)

        classifier.fit(dataset)

        # Get index
        index = classifier.predict(dataset)

        count = max(index) + 1

        # Create new clusters
        clusters = [Cluster(self.directory, self.name + '-' + str(i)) for i in range(count)]
        for i in range(0, len(self.files), 1):
            clusters[index[i]].add_file(self.files[i])

        return clusters
Example #37
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  #for now - do hard clustering, take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]
        all_dist = []
        for line_idx in range(len(df_array)):
            label = model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx],dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index,res)
    return docs_clusteres
Example #38
def test_birch_with_depot_calculation():
    points = points_from_file('tsps/berlin52.txt')
    matrix = load_matrix(points)
    X = [[p[1],p[2]] for p in points]
    est = Birch(n_clusters=3)
    est.fit(X)
    labels = est.labels_
    hl_matrix, clusters, G = load_matrices_from_labels(points,labels)
    depots, C = compute_depots(clusters, matrix, G, per_cluster=True)
    depots_actual, _ = compute_depots(clusters, matrix, G)
    cluster_optimal_cost, R, hl_route = clustered_tsp_solve(points, 3, labels=labels, depots=depots)
    cluster_optimal_cost += C

    print(depots_actual)
    print(R,C)

    for depot in depots_actual:
        for r in R:
            if r[1][0] == depot:
                for point in r[1]:
                    print(matrix.points[point])
        print('')
Example #39
    def obtainCodebook(self, sampled_x, x):

        print('Obtaining codebook using Birch from sklearn...')

        scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
        scaled_x = StandardScaler().fit_transform(x)

        brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)

        # obtain the codebook and the projections of the images onto it (clusters of words)
        codebook = brc.fit(scaled_x_sampled)
        clusters = brc.predict(scaled_x)

        print('Clusters obtained.')

        return codebook, clusters
Example #40
    def obtainClusters(self, hist):

        print('Obtaining clusters using Birch from sklearn...')

        hist = np.array(hist)
        hist = hist.astype(float)
        scaled_vec = StandardScaler().fit_transform(hist)

        brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)

        # obtain the projections of the images onto the codebook (clusters of words)
        codebook = brc.fit(scaled_vec)
        clusters = brc.predict(scaled_vec)

        print('Clusters obtained.')

        return clusters
import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle

# Generates random vectors to cluster
n_samples = 50
centers = [[0, 1], [4, -2], [-2, 2], [0, -1]]
X, _ = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.2)

# Creates the Birch clusterer and fits it on the vectors
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.8, compute_labels=True)
brc.fit(X)

labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Plot the clustered points
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
plt.axis([-4, 12, -4, 12])
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
def cluster_junctions(juncs):
    birch_model = Birch(threshold=3, n_clusters=None)
    X = np.array(juncs)
    birch_model.fit(X)

    return birch_model.labels_
Example #45
station_array = np.array(station_list)
dsp_array = np.array(dsp_list)

# extract the unique station names
stations = np.unique(station_array)
print(stations)

for sta in stations:
    events = event_array[station_array == sta, :]
    dsp_shortlist = dsp_array[station_array == sta]
    print(sta, events.shape, dsp_shortlist.shape)

    # cluster on events so as to compare dispersion curves for nearby
    # events
    brc = Birch(branching_factor=50, n_clusters=None, threshold=dist, compute_labels=True)
    brc.fit(events)
    labels = brc.predict(events)
    print(np.max(labels))
    for lab in np.unique(labels):
        dsp_this_label_list = dsp_shortlist[labels == lab]
        cluster_name = os.path.join(dirname, "cluster_%s_%03d" % (sta, lab))
        plot_all_dsp(dsp_this_label_list, legend=False, fname="%s_gvel.png" % cluster_name)
        plot_all_map(dsp_this_label_list, fname="%s_map.png" % cluster_name, legend=False)
        f = open("%s_info.txt" % cluster_name, "w")
        for (dsp, dsp_dict) in dsp_this_label_list:
            f.write(
                "%s %s %d %03d %02d %02d %.3f %.3f\n"
                % (
                    dsp_dict["STA"],
                    dsp_dict["COMP"],
                    dsp_dict["YEAR"],