Example #1
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import Birch

def birch_ad_with_smoothing(latency_df, threshold, smoothing_window=12):
    # anomaly detection on the response time of service invocations
    # input: response times of service invocations, threshold for BIRCH clustering
    #        (smoothing_window was a free variable in the original; surfaced here as a parameter)
    # output: list of anomalous service invocations

    anomalies = []
    for svc, latency in latency_df.items():  # iteritems() was removed in pandas 2.0
        # No anomaly detection in db / rabbitmq / unnamed columns
        if svc != 'timestamp' and 'Unnamed' not in svc and 'rabbitmq' not in svc and 'db' not in svc:
            latency = latency.rolling(window=smoothing_window, min_periods=1).mean()
            x = np.array(latency)
            x = np.where(np.isnan(x), 0, x)
            normalized_x = preprocessing.normalize([x])

            X = normalized_x.reshape(-1,1)

#            threshold = 0.05

            brc = Birch(branching_factor=50, n_clusters=None, threshold=threshold, compute_labels=True)
            brc.fit(X)
            brc.predict(X)

            labels = brc.labels_
#            centroids = brc.subcluster_centers_
            n_clusters = np.unique(labels).size
            if n_clusters > 1:
                anomalies.append(svc)
    return anomalies
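A minimal usage sketch for the function above (the toy DataFrame, service names, and injected anomaly are made up; the function expects one latency column per service):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
latency_df = pd.DataFrame({
    'timestamp': np.arange(100),
    'frontend': rng.normal(10.0, 0.1, 100),
    # 'checkout' jumps for its last 10 samples, so it should come back flagged
    'checkout': np.concatenate([rng.normal(10.0, 0.1, 90),
                                rng.normal(50.0, 0.1, 10)]),
})
print(birch_ad_with_smoothing(latency_df, threshold=0.05))  # likely ['checkout']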
Example #2
class Birch_(ClusterModel):
    def __init__(self, num_clusters, feature_names, train_x, train_y, rep):
        ClusterModel.__init__(self, train_x, train_y, feature_names, rep)
        self.birch_model = Birch(n_clusters=num_clusters).fit(train_x)
        self.birch_model.predict(train_x)
        self.labels = self.birch_model.labels_
        self.num_clusters = num_clusters
Example #3
    def _get_centers(self, x):
        x = np.array(x)
        if self.hidden is None:
            brc = Birch()
            brc.fit(x)
            brc.predict(x)
            return brc.subcluster_centers_
        else:
            if x.shape[0] == 1:
                x = x.T
                print(x.shape)
            idx = np.random.choice(x.shape[0], self.hidden, replace=False)
            print(idx)
            return x[idx]
Example #4
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters=None,
                             threshold=0.5,
                             branching_factor=50)

    def fit(self, data):
        return self.wrapped.fit(data)

    def fit_predict(self, data):
        # fold the new batch into the existing CF-tree, then label it
        self.wrapped = self.wrapped.partial_fit(data)
        return self.wrapped.predict(data)

    def predict(self, data):
        return self.wrapped.predict(data)
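A short usage sketch for the wrapper above (toy batches; assumes `from sklearn.cluster import Birch` at module top). Note that fit_predict folds each new batch into the existing CF-tree via partial_fit instead of refitting from scratch:

import numpy as np

wrapper = Birch_algo_wrapper()
batch1 = np.random.rand(50, 2)
batch2 = np.random.rand(50, 2) + 5.0   # a second, well-separated blob
wrapper.fit(batch1)                    # build the initial CF-tree
labels = wrapper.fit_predict(batch2)   # update the tree, then label batch2
print(set(labels))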
Example #5
def bclustering(matlist, numlist, thre):
    ids = []
    brc = Birch(branching_factor=80,
                n_clusters=300,
                threshold=thre,
                compute_labels=True)
    brc.fit(np.asarray(matlist, dtype=float))
    brc.predict(np.asarray(matlist, dtype=float))
    labels = brc.labels_
    k = len(set(labels))
    # k is the number of distinct cluster labels (Birch itself never emits a noise label such as -1)
    for i in range(0, k):
        list_id = np.asarray(numlist)[labels == i]
        ids.append(list(list_id))
    return ids
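A usage sketch for bclustering (random vectors and sequential ids, purely illustrative):

import numpy as np

vecs = np.random.rand(600, 3).tolist()
groups = bclustering(vecs, list(range(600)), thre=0.2)
print(len(groups))   # number of id-groups, at most the 300 clusters requested above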
Example #6
def train(feature, weights, cluster_num, feature_path=None, down=0.006, up=0.0085, bf_index=2):
    if feature_path is not None:
        feature = pd.read_csv(feature_path)
    X = []
    print("Training...\n")
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            f_w = combine(feature.iloc[i][1:], weights)
            X.append(f_w)
    clf = Birch(n_clusters=cluster_num)
    # clf = KMeans(n_clusters=cluster_num)  # leftover experiment; it silently overwrote the Birch model
    clf.fit(X)
    pred = []
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        key = f[bf_index]
        if key > up:
            p = clf.predict([combine(f, weights)])
            pred.append(p[0])
        elif key < down:
            pred.append(cluster_num)
        else:  # down <= key <= up; the original's strict checks skipped the boundaries
            pred.append(cluster_num + 1)
    joblib.dump(clf, 'curve_model_Birch.pkl')
    print(pred)
    return pred
Example #7
def birch_skm_part1_helper(data, m, k, delta):
    """
    Receives data, calculates k centers using sklearn's Birch, and computes their quantile radii.
    :param data: numpy array
    :param m: Size of the data
    :param k: Number of centers.
    :param delta: int
    :return: tuple of two numpy array. (k_medoids, k_distances).
    """
    birch_instance = Birch(n_clusters=k, threshold=0.1)  # birch instance
    birch_instance.fit(data)  # Run birch on the data
    labels = birch_instance.predict(data) # calculate the cluster number for each point
    l_medoids = []
    # since birch does not return centers, I have to calculate them
    for label in range(np.unique(labels).size):
        # calculate the center for each cluster
        cluster = data[labels == label]
        kmedoids_instance_for_birch = kmedoids(cluster.tolist(), init_centers(cluster, 1))
        kmedoids_instance_for_birch.process()
        l_medoids.append(cluster[kmedoids_instance_for_birch.get_medoids()][0])
    l_medoids = np.array(l_medoids)
    q = calc_q(m, delta)  # calculate q
    # calculate the distance to the quantile points around each center
    l_distances = calc_quantile_radius_around_centers(data, l_medoids, q, k)
    return l_medoids, l_distances
Example #8
class ClusteringObjectClassifierModel(object):
    def __init__(self):
        self.learned_classes = dict()
        self.max_classes = 10
        self.estimator = Birch(n_clusters=None, threshold=10.0)

    def online_fit(self, X, class_name):
        self.estimator.partial_fit(X)

        cluster_id = self.estimator.labels_.item()  # np.asscalar() was removed in NumPy 1.23
        if cluster_id not in self.learned_classes:
            print("Assigning cluster id %d to class %s" %
                  (cluster_id, class_name))
            self.learned_classes[cluster_id] = class_name

        return self.__pca_on_cluster_centers(
            self.estimator.subcluster_centers_)

    def __pca_on_cluster_centers(self, cluster_centers):
        if len(cluster_centers) < 2:
            # PCA(n_components=2) needs at least two centers; guard before fitting
            return np.zeros(1), np.zeros(1)

        pca = PCA(n_components=2)
        coords = np.atleast_2d(pca.fit_transform(cluster_centers))
        return coords[:, 0], coords[:, 1]

    def predict_class(self, X):
        if not hasattr(self.estimator, "root_"):
            return False, False

        cluster_id = self.estimator.predict(X).item()
        if cluster_id not in self.learned_classes:
            return False, False

        return self.learned_classes[cluster_id], cluster_id
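A toy sketch of the online workflow above (invented 3-D descriptors and class names; assumes `from sklearn.cluster import Birch` and `from sklearn.decomposition import PCA` at module top; each call passes a single observation of shape (1, n_features)):

import numpy as np

model = ClusteringObjectClassifierModel()
model.online_fit(np.array([[100.0, 0.0, 0.0]]), 'mug')      # first subcluster -> 'mug'
model.online_fit(np.array([[0.0, 100.0, 0.0]]), 'bottle')   # far away -> new subcluster
print(model.predict_class(np.array([[98.0, 1.0, 0.0]])))    # likely ('mug', 0)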
Example #9
def birch_clusters(textdata,
                   trained_doc2vec,
                   n_clusters,
                   start_alpha=0.025,
                   infer_epoch=100,
                   branching_factor=10,
                   threshold=0.01,
                   compute_labels=True,
                   metric='cosine',
                   **kwargs):
    infer_list = []

    for doc in textdata:
        infer_list.append(
            trained_doc2vec.infer_vector(doc,
                                         alpha=start_alpha,
                                         steps=infer_epoch,  # renamed to epochs= in gensim 4
                                         **kwargs))
    brc = Birch(branching_factor=branching_factor,
                n_clusters=int(n_clusters),
                threshold=threshold,
                compute_labels=compute_labels)

    brc.fit(infer_list)
    clusters = brc.predict(infer_list)
    birch_labels = brc.labels_

    silhouette_score = metrics.silhouette_score(infer_list,
                                                birch_labels,
                                                metric=metric)

    return silhouette_score, clusters
Example #10
def cluster_latlon(n_clusters, data):
    #split the data between "around NYC" and "other locations" basically our first two clusters
    data_c = data[(data.longitude > -74.05) & (data.longitude < -73.75) &
                  (data.latitude > 40.4) & (data.latitude < 40.9)].copy()
    data_e = data[~((data.longitude > -74.05) & (data.longitude < -73.75) &
                    (data.latitude > 40.4) & (data.latitude < 40.9))].copy()
    #put it in matrix form
    coords = data_c[['latitude', 'longitude']].to_numpy()  # as_matrix() was removed in pandas 1.0

    brc = Birch(branching_factor=100,
                n_clusters=n_clusters,
                threshold=0.01,
                compute_labels=True)

    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["cluster_" + str(n_clusters)] = clusters
    data_e["cluster_" + str(
        n_clusters)] = -1  #assign cluster label -1 for the non NYC listings
    data = pd.concat([data_c, data_e])
    plt.scatter(data_c["longitude"],
                data_c["latitude"],
                c=data_c["cluster_" + str(n_clusters)],
                s=10,
                linewidth=0.1)
    plt.title(str(n_clusters) + " Neighbourhoods from clustering")
    plt.show()
    return data
Example #11
def do_BIRCH(nc = 100):
    os.chdir("/home/admin123/Clustering_MD/Paper/clustering.experiments/")
    fp = "Jan_2016_Delays_Recoded.csv"
    df = pd.read_csv(fp)
    X = df.to_numpy()  # as_matrix() was removed in pandas 1.0
    del df
    ipca = IncrementalPCA(n_components = 2)
    X_ipca = ipca.fit_transform(X)
    del X

    
    logger.debug("Starting BIRCH on large dataset - " + str(X_ipca.shape[0]) + " rows!")
    brc = Birch(branching_factor=50, n_clusters=nc,\
                threshold=0.25,compute_labels=True)
    brc = brc.fit(X_ipca)
    labels = brc.predict(X_ipca)
    logger.debug("Done with BIRCH !")    
    chis = metrics.calinski_harabasz_score(X_ipca, labels)  # the 'harabaz' spelling was removed in newer sklearn
    logger.debug("CH index score : " + str(chis))
    colors = cm.rainbow(np.linspace(0, 1, nc))
    ax = plt.gca()
    for l, c in zip(np.unique(labels), colors):  # iterate unique labels, not per-sample labels
        mask = labels == l
        ax.plot(X_ipca[mask, 0], X_ipca[mask, 1], 'w',
                markerfacecolor=c, marker='.')
    ax.set_title("BIRCH Airline Delay for January 2016")
    ax.set_xlabel("Principal Component 1")
    ax.set_ylabel("Principal Component 2")
    plt.grid()
    plt.show()
        

    return
Example #12
def birch(df):
    print("                                    ----------------------")
    print("                                      Birch Clustering")
    print("                                    ----------------------")
    df1 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Course',
        'Course instructor'
    ])
    df2 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Course',
        'Class Size'
    ])
    df3 = df.drop(columns=[
        'Class Attribute', 'Semester type', 'Speaker Type', 'Class Size',
        'Course instructor'
    ])
    p1 = df1.to_numpy()
    p2 = df2.to_numpy()
    p3 = df3.to_numpy()

    # each of the three frames is assumed to reduce to a single column, so flatten to 1-D
    data = np.concatenate([p1, p2, p3]).ravel()

    x_range = range(len(data))
    x = np.array(list(zip(x_range, data)))  # (index, value) pairs as 2-D points

    plt.scatter(x[:, 0], x[:, 1])
    plt.show()

    bclust = Birch(branching_factor=100, threshold=.5).fit(x)
    print(bclust)

    labels = bclust.predict(x)

    plt.scatter(x[:, 0], x[:, 1], c=labels)
    plt.show()
Example #13
def detect_segments(data):
    # rho = [normal_to_angle(row[2], row[3]) for row in data]
    rho = np.arctan2(data[:,3],data[:,2])
    rho[rho < 0] += 2*math.pi
    #dist = [point_to_dist(row[0],row[1],row[2],row[3]) for row in data]
    dist = np.fabs(data[:,0]*data[:,2] + data[:,1]*data[:,3])

    # X = [(r,d) for (r,d) in zip(rho,dist)]
    X = list(zip(rho,dist))

    brc = Birch(branching_factor=50,n_clusters=None, threshold=0.5)
    rrr = brc.fit(X)
    labels = brc.predict(X)

    # sorted_data = [row + [label] for (row,label) in zip(data,labels)]
    # sorted_data = np.concatenate((data,np.array([labels],dtype=float).T),axis=1)
    sorted_data = np.zeros((data.shape[0],data.shape[1]+1))
    sorted_data[:,0:4] = data
    sorted_data[:,4:5] = np.array([labels],dtype=float).T

    # sorted_data = sorted(sorted_data, key=lambda row: row[4])
    sorted_data = sorted_data[sorted_data[:,4].argsort()]

    segments = extract_segments(sorted_data)

    filtered_data = list(filter(lambda row: row[4] not in segments, sorted_data))

    return segments, filtered_data
Example #14
def density(df):
    print(
        "                                   ------------------------------------"
    )
    print(
        "                                     Density Based Spatial Clustering"
    )
    print(
        "                                   ------------------------------------"
    )
    df = df.drop(columns=['Class Attribute', 'Semester type', 'Speaker Type'])
    data = df.to_numpy()  # note: overwritten below by the synthetic demo data

    np.random.seed(12)
    p1 = np.random.randint(5, 21, 110)
    p2 = np.random.randint(20, 30, 120)
    p3 = np.random.randint(8, 21, 90)

    data = np.array(np.concatenate([p1, p2, p3]))
    x_range = range(len(data))
    x = np.array(list(zip(x_range, data))).reshape(len(x_range), 2)

    plt.scatter(x[:, 0], x[:, 1])
    plt.show()

    bclust = Birch(branching_factor=100, threshold=.5).fit(x)
    print(bclust)

    labels = bclust.predict(x)

    plt.scatter(x[:, 0], x[:, 1], c=labels)
    plt.show()
    print(
        "--------------------------------------------------------------------------------------------------------"
    )
Example #15
    def scan_callback(self, msg):
        pose = self.pose.copy()
        bearings = self.bearings.copy()

        ranges = np.array(msg.ranges)
        inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
        ranges = np.nan_to_num(ranges) * inf_flag

        euc_coord_x = pose[0] + np.cos(bearings - pose[2]) * ranges
        euc_coord_y = pose[1] + np.sin(bearings - pose[2]) * ranges
        dist_flag = np.where( (euc_coord_x-pose[0])**2 + \
                        (euc_coord_y-pose[1])**2 != 0.0)[0]
        points = np.array([euc_coord_x, euc_coord_y]).T
        points = points[dist_flag]

        self.obsv = []
        if len(points) > 0:
            brc = Birch(n_clusters=None, threshold=0.05)
            brc.fit(points)
            labels = brc.predict(points)
            u_labels = np.unique(labels)
            for l in u_labels:
                seg_idx = np.where(labels == l)
                seg = points[seg_idx]
                if seg.shape[0] <= 1:
                    fit_cov = 10
                else:
                    fit_cov = np.trace(np.cov(seg.T))
                if fit_cov < 0.001 and seg.shape[0] >= 3:
                    self.obsv.append(seg.mean(axis=0))
            print(self.obsv)
Example #16
def map_clusters(n_list, n_clusters):
    # x = np.array([[28.596596, 77.344098], [28.574783, 77.333393]])
    # x = np.append(x, [[28.596596, 77.344098], [28.574783, 77.333393], [28.582515, 77.246735],
    #                   [28.582915, 77.215735], [28.635639, 77.201197], [28.464873, 76.995451]], axis=0)
    x = np.array([[28.596596, 0], [28.574783, 0], [28.996596,
                                                   0], [28.674783, 0],
                  [28.582515, 0], [28.582915, 0], [28.635639, 0],
                  [28.464873, 0]])
    # x = np.append(x, n_list, axis=0)
    # define the model
    model = Birch(threshold=0.01, n_clusters=n_clusters)
    # fit the model
    model.fit(n_list)
    # assign a cluster to each example
    yhat = model.predict(n_list)
    # retrieve unique clusters
    clusters = unique(yhat)
    dic = {}
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples (plotting left disabled below)
        dic[cluster] = row_ix[0]
        # pyplot.scatter(x[row_ix, 0], x[row_ix, 1])
    # print(dic)
    # pyplot.show()
    return dic
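A quick usage sketch (toy coordinates; assumes the module's `from numpy import unique, where` and `from sklearn.cluster import Birch` imports):

import numpy as np

coords = np.array([[28.5966, 77.3441], [28.5748, 77.3334],
                   [28.5825, 77.2467], [28.4649, 76.9955]])
print(map_clusters(coords, n_clusters=2))   # {cluster_id: array of row indexes}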
Example #17
def birch_clustering(principal_components, principal_df, number_of_clusters):
    final_df = principal_df.copy()  # pd.concat of a single frame was just a copy
    model = Birch(threshold=0.01, n_clusters=number_of_clusters)
    # fit the model
    model.fit(principal_components)
    # assign a cluster to each example
    yhat = model.predict(principal_components)
    # retrieve unique clusters
    clusters = unique(yhat)
    final_df['Segment'] = model.labels_
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples
        plt.scatter(principal_components[row_ix, 0],
                    principal_components[row_ix, 1],
                    s=75)
    final_df.rename({
        0: 'PC1',
        1: 'PC2',
        2: 'PC3',
        'y': 'Race'
    },
                    axis=1,
                    inplace=True)
    plt.title("BIRCH Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=yhat,
                    n_clusters=len(clusters))
    return final_df
Example #18
def clusteringReminMost(window):
    brc = Birch(branching_factor=50,
                n_clusters=3,
                threshold=0.5,
                compute_labels=True)
    brc.fit(window)
    Class = brc.predict(window)
    # Count the samples in each cluster, find the largest cluster,
    # and keep its rows so the dominant history is reinforced.
    num0 = 0
    num1 = 0
    num2 = 0

    for i in Class:
        if i == 0:
            num0 += 1
        elif i == 1:
            num1 += 1
        else:
            num2 += 1
    label = chooseMax(num0, num1, num2)
    # keep only the rows in the dominant cluster (boolean indexing replaces the
    # removed DataFrame.append and fixes an off-by-one in the original loop)
    newwindow = window[np.asarray(Class) == label]
    return newwindow
Example #19
def add_cluster_column(train_df, test_df, n_clusters):
    train_df['source'] = 'train'
    test_df['source'] = 'test'

    total_rows = train_df.shape[0] + test_df.shape[0]

    data = pd.concat([train_df, test_df])

    #split the data between "around NYC" and "other locations"
    data_c = data[(data.longitude > -74.05) & (data.longitude < -73.75) &
                  (data.latitude > 40.4) & (data.latitude < 40.9)].copy()
    data_e = data[~((data.longitude > -74.05) & (data.longitude < -73.75) &
                    (data.latitude > 40.4) & (data.latitude < 40.9))].copy()
    #put it in matrix form
    coords = data_c[['latitude', 'longitude']].to_numpy()  # as_matrix() was removed in pandas 1.0

    brc = Birch(branching_factor=100,
                n_clusters=n_clusters,
                threshold=0.01,
                compute_labels=True)

    brc.fit(coords)
    clusters = brc.predict(coords)
    data_c["num_cluster_" + str(n_clusters)] = clusters
    data_e["num_cluster_" + str(
        n_clusters)] = -1  #assign cluster label -1 for the non NYC listings
    data = pd.concat([data_c, data_e])

    print('lost: {}'.format(total_rows -
                            data[data['source'] == 'train'].shape[0] -
                            data[data['source'] == 'test'].shape[0]))
    return data[data['source'] == 'train'], data[data['source'] == 'test']
Example #20
    def clusterize_birch(self, vectors):
        brc = Birch(branching_factor=8,
                    n_clusters=(int(len(vectors) / 6))).fit(vectors)
        print('Fit ready')
        predictions = brc.predict(vectors)
        print('Predict ready')

        return predictions
Example #21
def birch(data_train, data_test, label_train, label_test, args):
    print('birch')
    birch = Birch(n_clusters=10).fit(data_train)
    predict = birch.predict(data_test)
    print('birch done')
    compare_class(predict, label_test)
    if args.create_mean:
        create_images_from_rows('bi', mean_image(predict, data_test))
Example #22
    def cluster_birch(self):
        print("Starting Birch clustering")
        brc = Birch(branching_factor=10,
                    n_clusters=40,
                    threshold=self.cluster_distance,
                    compute_labels=False)
        brc.fit(self.all_frames_xy)
        clusters = brc.predict(self.all_frames_xy)
        return clusters
Example #23
    def find_outliers(self, values, dodgy_node='hello'):
        # flag if a KPI is exhibiting anomalous behaviour
        if self.find_root_cause_with_KDE:
            X = np.reshape(values, (-1, 1))
            KDE = KernelDensity(kernel='gaussian', bandwidth=1.0).fit(X)
            KDE_scores = KDE.score_samples(X)
            outliers = np.where(KDE_scores < np.percentile(KDE_scores, 1))[0]
            return (len(outliers) > 0), -np.mean(KDE_scores)
        else:
            normalized_values = preprocessing.normalize([values]).reshape(-1, 1)
            birch = Birch(n_clusters=None, threshold=0.06, compute_labels=True)
            birch.fit(normalized_values)
            birch.predict(normalized_values)
            labels = birch.labels_
            birch_clustering_score = len(labels[np.where(labels != 0)]) / len(labels)
            return (birch_clustering_score > 0), birch_clustering_score
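The else branch above scores a KPI by the fraction of points that fall outside Birch cluster 0. A standalone sketch of the same idea on synthetic data:

import numpy as np
from sklearn import preprocessing
from sklearn.cluster import Birch

values = np.concatenate([np.random.normal(10.0, 0.05, 95),
                         np.random.normal(30.0, 0.05, 5)])   # 5 anomalous samples
X = preprocessing.normalize([values]).reshape(-1, 1)
birch = Birch(n_clusters=None, threshold=0.06, compute_labels=True).fit(X)
labels = birch.labels_
score = len(labels[np.where(labels != 0)]) / len(labels)
print(score > 0, score)   # anomalous if anything lands outside cluster 0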
Example #24
    def birch(self, number_of_clusters, output_file_path):
        print("Birch clustering in progress...")
        arr = np.array(self.__data_set)
        birch_clustering = Birch(branching_factor=50, n_clusters=number_of_clusters, threshold=20,
                                 compute_labels=False).fit(arr)
        labels = birch_clustering.predict(arr)
        print("Birch clustering done!")
        print("Generating Birch clustering csv...")
        self.__generate_result_clustering_csv(labels, output_file_path)
        print("Birch clustering csv created successfully!")
Example #25
def update_k_clusters(attrname, old, new):
    k_cluster = int(k_slider.value)
    brc = Birch(branching_factor=50,
                n_clusters=k_cluster,
                threshold=0.5,
                compute_labels=True)
    brc.fit(tweet_vecs)
    predictions = brc.predict(tweet_vecs)
    colors = get_colors(predictions)
    brc_data.data = dict(colors=colors, x=tsne_vecs[:, 0], y=tsne_vecs[:, 1])
Example #26
def BIRCH2_duplicate_removal(dataframe, threshold=0.8):
    # Note this method now takes a dataframe as input

    if len(dataframe) < 2:
        # nothing to do
        return dataframe

    Crater_data = dataframe
    # extract axes
    x = Crater_data[0].values.tolist()
    y = Crater_data[1].values.tolist()
    r = Crater_data[2].values.tolist()
    p = Crater_data[3].values.tolist()
    Points = []

    X = np.column_stack((x, y))
    brc = Birch(branching_factor=50,
                n_clusters=int(threshold * len(x)),
                threshold=0.5,
                compute_labels=True)
    brc.fit(X)
    groups_pred = brc.predict(X)

    for c in set(groups_pred):
        idx = [i for i, e in enumerate(groups_pred) if e == c]

        Group_x = []
        Group_y = []
        Group_r = []
        Group_p = []
        index = []

        for i in idx:
            if i in range(0, len(x)):
                Group_x.append(x[i])
                Group_y.append(y[i])
                Group_r.append(r[i])
                Group_p.append(p[i])
                index.append(i)

        # after group is defined, extract its elements from list
        Points.append([Group_x, Group_y, Group_r, Group_p])

    # now reduce groups
    center_size = []
    for i, (Xs, Ys, Rr, Ps) in enumerate(Points):
        # we take the point with best prediction confidence
        best_index = np.argmax(Ps)
        x_center = Xs[best_index]
        y_center = Ys[best_index]
        radius = Rr[best_index]
        prob = Ps[best_index]
        center_size += [[x_center, y_center, radius, prob]]

    return pd.DataFrame(center_size)
Example #27
def compute_clusters(data: List) -> np.ndarray:
    print("--->Computing clusters")
    birch = Birch(branching_factor=50,
                  n_clusters=5,
                  threshold=0.3,
                  copy=True,
                  compute_labels=True)

    birch.fit(data)
    predictions = np.array(birch.predict(data))
    return predictions
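Usage is straightforward (random data for illustration; the function expects a plain list of feature vectors and assumes `from typing import List` plus the sklearn import at module top):

import numpy as np

preds = compute_clusters(np.random.rand(200, 4).tolist())
print(np.bincount(preds))   # sizes of the 5 clusters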
Example #28
def skitleanBirch():
    data = pd.read_csv("soy_rock.csv", header=None)
    X = data.values.tolist()
    randomm = randint(5, 20)

    brc = Birch(branching_factor=randomm,
                n_clusters=4,
                threshold=0.1,
                compute_labels=True)
    brc.fit(X)
    pred = brc.predict(X)
    return pred
Example #29
def cluster_sentences(sentences):
    X = vectorize(sentences)
    # with n_clusters=None the algorithm keeps whatever clusters it finds
    bcl = Birch(branching_factor=10, n_clusters=None).fit(X)
    clusters = bcl.predict(X)
    labels = bcl.labels_
    norm_X = normalize_vectors(X, labels)
    cluster_means = calculate_mean(norm_X, clusters)
    cluster_sentences = find_minimum_from_mean(cluster_means, norm_X)
    sents = vectors_to_sentences(cluster_sentences)
    print(sents)
    return sents
Example #30
class Birch_algo_wrapper:
    def __init__(self):
        self.wrapped = Birch(n_clusters = 2)
        self.data = []
        self.indexes = []

    def fit(self,data):
        self.wrapped.fit(data)
        self.data = data
        self.indexes = self.wrapped.labels_

    def predict(self,data):
        return self.wrapped.predict(data)
Example #31
   def obtainCodebook(self, sampled_x, x):

      print('Obtaining codebook using Birch from sklearn...')

      scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
      scaled_x = StandardScaler().fit_transform(x)

      brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)

      # obtain the codebook and the projections of the images on the codebook (clusters of words)
      codebook = brc.fit(scaled_x_sampled)
      clusters = brc.predict(scaled_x)

      print('Clusters obtained.')

      return codebook, clusters
Example #32
   def obtainClusters(self, hist):

      print('Obtaining clusters using Birch from sklearn...')

      hist = np.array(hist)
      hist = hist.astype(float)
      scaled_vec = StandardScaler().fit_transform(hist)

      brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)

      # obtain the projections of the histograms on the codebook (clusters of words)
      codebook = brc.fit(scaled_vec)
      clusters = brc.predict(scaled_vec)

      print('Clusters obtained.')

      return clusters
Example #33
    def split_birch(self, branching_factor, threshold):

        # Extract dataset from files
        dataset = [f.dataset for f in self.files]

        # Initialize classifier
        classifier = Birch(branching_factor=branching_factor, n_clusters=None, threshold=threshold)

        classifier.fit(dataset)

        # Get index
        index = classifier.predict(dataset)

        count = max(index) + 1

        # Create new clusters
        clusters = [Cluster(self.directory, self.name + '-' + str(i)) for i in range(count)]
        for i in range(len(self.files)):
            clusters[index[i]].add_file(self.files[i])

        return clusters
Example #34
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GaussianMixture(n_components=N_CLUSTERS)  # mixture.GMM was removed from sklearn
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  # for now, do hard clustering: take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]  # `clusters_centers` was undefined in the original
        all_dist = []
        for line_idx in range(len(df_array)):
            label =  model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx],dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index,res)
    return docs_clusteres
Example #35
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(affinity_propagation_parameter_search_space_for_plotting,
                                                                        affinity_propagation_valid_performance_metrics_for_plotting,
                                                                        affinity_propagation_test_performance_metrics_for_plotting,
                                                                        'Adjusted Mutual Information Score',
                                                                        'AffinityPropagation Clustering damping parameter',
                                                                        'Affinity_Propagation_Performance',
                                                                        0,
                                                                        0.5,
                                                                        left_horizontal_limit=0.5)

# Do BIRCH, optimizing number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
initial_optimal_birch_clusterer.partial_fit()  # X=None reruns only the global clustering with the new n_clusters
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)

# Add one to the predictions to make them match up with range of labels, then apply Hungarian Fix
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(initial_birch_valid_predictions,
                                                           valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(initial_birch_test_predictions,
                                                          test_labels).astype('int')

# Set a starting point for optimality of the initial performance metric, to be possibly adjusted later
birch_number_of_calls_integer_search_space_start = current_optimal_birch_number_of_calls + 1
birch_number_of_calls_integer_search_space_stop = current_optimal_birch_number_of_calls + 9
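A minimal sketch of the search loop those comments describe: try successive numbers of partial_fit passes and keep the count that scores best on the validation set. Plain adjusted mutual information stands in here for the project's Hungarian-fixed metric:

from sklearn.cluster import Birch
from sklearn.metrics import adjusted_mutual_info_score

best_score, best_calls = -1.0, current_optimal_birch_number_of_calls
for n_calls in range(birch_number_of_calls_integer_search_space_start,
                     birch_number_of_calls_integer_search_space_stop + 1):
    brc = Birch()
    for _ in range(n_calls):   # repeated partial_fit passes over the training set
        brc.partial_fit(train_data_set)
    brc.set_params(n_clusters=number_of_classes)
    brc.partial_fit()          # X=None reruns only the global clustering step
    score = adjusted_mutual_info_score(valid_labels, brc.predict(valid_data_set))
    if score > best_score:
        best_score, best_calls = score, n_calls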
Example #37
dsp_array = np.array(dsp_list)

# extract the unique station names
stations = np.unique(station_array)
print(stations)

for sta in stations:
    events = event_array[station_array == sta, :]
    dsp_shortlist = dsp_array[station_array == sta]
    print(sta, events.shape, dsp_shortlist.shape)

    # cluster on events so as to compare dispersion curves for nearby
    # events
    brc = Birch(branching_factor=50, n_clusters=None, threshold=dist, compute_labels=True)
    brc.fit(events)
    labels = brc.predict(events)
    print(np.max(labels))
    for lab in np.unique(labels):
        dsp_this_label_list = dsp_shortlist[labels == lab]
        cluster_name = os.path.join(dirname, "cluster_%s_%03d" % (sta, lab))
        plot_all_dsp(dsp_this_label_list, legend=False, fname="%s_gvel.png" % cluster_name)
        plot_all_map(dsp_this_label_list, fname="%s_map.png" % cluster_name, legend=False)
        f = open("%s_info.txt" % cluster_name, "w")
        for (dsp, dsp_dict) in dsp_this_label_list:
            f.write(
                "%s %s %d %03d %02d %02d %.3f %.3f\n"
                % (
                    dsp_dict["STA"],
                    dsp_dict["COMP"],
                    dsp_dict["YEAR"],
                    dsp_dict["JDAY"],
Example #38
for idx, label in enumerate(labels):
  if label in plays_sums:
    plays_sums[label].append(plays[idx])
  else:
    plays_sums[label] = [plays[idx]]
  # cluster_size[label] += 1

for label in plays_sums:
  median = np.median(np.array(plays_sums[label]))
  plays_sums[label] = median
  
# for idx, size in enumerate(cluster_size):
  # plays_sums[idx] /= size

Y = cluster.get_test_matrix()
# print len(Y)
Y = np.array(Y, dtype=float)
print "Running Birch on test data...",
test_predicts = brc.predict(Y)
print "Done!"
print test_predicts

with open(submit_file, 'w') as submit_fh:
    submit_csv = csv.writer(submit_fh, delimiter=',', quotechar='"',
        quoting=csv.QUOTE_MINIMAL)
    submit_csv.writerow(['Id', 'plays'])

    for idx, test_predict in enumerate(test_predicts):
      submit_csv.writerow([idx+1, plays_sums[test_predict]])
      if idx % 10000 == 0:
        print("Row", idx)