def birch_clustering(data):
    # set up the Birch model
    birch = Birch(n_clusters=3)
    # fit the model to the data
    birch.fit_predict(data)
    label_pred = birch.labels_
    x0 = data[label_pred == 0]
    x1 = data[label_pred == 1]
    x2 = data[label_pred == 2]

    x00 = x0[:, 0]
    y00 = x0[:, 1]
    z00 = x0[:, 2]
    x11 = x1[:, 0]
    y11 = x1[:, 1]
    z11 = x1[:, 2]
    x22 = x2[:, 0]
    y22 = x2[:, 1]
    z22 = x2[:, 2]
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # Axes3D(fig) no longer attaches itself to the figure in modern matplotlib
    ax.scatter(x00, y00, z00, c="red", marker='o', label='label0')
    ax.scatter(x11, y11, z11, c="green", marker='*', label='label1')
    ax.scatter(x22, y22, z22, c="blue", marker='+', label='label2')
    '''plt.scatter(x0[:, 1], x0[:, 2], c="red", marker='o', label='label0')
    plt.scatter(x1[:, 1], x1[:, 2], c="green", marker='*', label='label1')
    plt.scatter(x2[:, 1], x2[:, 2], c="blue", marker='+', label='label2')
    plt.xlabel('petal length')
    plt.ylabel('petal width')'''
    plt.legend(loc=2)
    plt.show()
    return x0, x1, x2
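A minimal usage sketch (not from the original source) for the function above: it expects an (n, 3) array, here synthetic blobs from sklearn.

# Hypothetical driver; assumes birch_clustering() and its imports
# (Birch, matplotlib) are defined as in the example above.
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, n_features=3, centers=3, random_state=42)
x0, x1, x2 = birch_clustering(data)  # plots the clusters and returns them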
Example #2
 def do_birch(self, values, threshold):
     values = np.array(values)
     normalized_time = preprocessing.normalize([values]).reshape(-1, 1)
     birch = Birch(branching_factor=50,
                   n_clusters=None,
                   threshold=threshold,
                   compute_labels=True)
     birch.fit_predict(normalized_time)
     return (np.unique(birch.labels_).size > 1)
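The check above is the recurring anomaly-flag idiom in this collection: with n_clusters=None, Birch skips its global clustering step and returns the raw CF-subcluster labels, so more than one unique label means the normalized series did not fit inside a single subcluster of radius `threshold`. A minimal sketch on made-up series (not from the original source):

import numpy as np
from sklearn import preprocessing
from sklearn.cluster import Birch

steady = np.full(100, 10.0)
spiky = np.concatenate([np.full(95, 10.0), np.full(5, 500.0)])
for series in (steady, spiky):
    normalized = preprocessing.normalize([series]).reshape(-1, 1)
    labels = Birch(n_clusters=None, threshold=0.001).fit_predict(normalized)
    print(np.unique(labels).size > 1)  # False for steady, True for spiky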
Example #3
    def get_personalization(self, service):
        weight_average = 0.0
        num = 0
        max_corr = 0.01
        metrics = []
        for _, _, data in self.anomalous_subgraph.in_edges(service, data=True):
            weight_average += data['weight']
            num += 1

        for _, destination, data in self.anomalous_subgraph.out_edges(
                service, data=True):
            if self.anomalous_subgraph.nodes[destination]['type'] == 'service':
                num += 1
                weight_average += data['weight']

        hosts = self.trace_data.loc[self.trace_data.serviceName ==
                                    service].cmdb_id.unique()
        host_groups = self.host_data[self.host_data['cmdb_id'].isin(
            hosts)].groupby('cmdb_id')[['name', 'value']]

        for host, host_data_subset in host_groups:
            for KPI, values in host_data_subset.groupby('name')['value']:
                anomalous_data = pd.Series(
                    list(self.trace_data.loc[
                        (self.trace_data.path == self.anomalous_edges[service])
                        & (self.trace_data.cmdb_id == host)]['elapsedTime']))
                values = pd.Series(list(values))
                correlation = 0
                if len(set(anomalous_data)) > 1 and len(set(values)) > 1:
                    correlation = abs(anomalous_data.corr(values))
                    normalized_time = preprocessing.normalize(
                        [np.array(values)]).reshape(-1, 1)
                    birch = Birch(branching_factor=50,
                                  n_clusters=None,
                                  threshold=0.005,
                                  compute_labels=True)
                    birch.fit_predict(normalized_time)
                    labels = birch.labels_
                    coefficient = int(np.unique(labels).size > 1)
                    correlation = coefficient * correlation
                if pd.isna(correlation):
                    correlation = 0
                if correlation > max_corr:
                    metrics.append((host, KPI, correlation))
                    max_corr = correlation

        data = weight_average * max_corr
        metrics.sort(key=lambda tup: tup[2], reverse=True)
        if len(metrics) > 1:
            if metrics[1][2] / metrics[0][2] > 0.9:
                return data, metrics[:2]
            else:
                return data, metrics[:1]
        else:
            return data, metrics
Example #4
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    #t = time.clock()
    global quota_for_each_cluster
    global brc
    global v
    global quota
    global select
    quota = 10000
    result_arr = QLINK_URLS + UNKNOWN_URLS
    for i, url in enumerate(result_arr):
        result_arr[i] = urlparse.urlparse(unquote(url.strip()))

    #l_dict =
    v = DictVectorizer(sparse=False)
    data = v.fit_transform(extract_features(result_arr))
    ind_list = []
    ind_list_data = []
    low_bound = 8

    for col in range(data.shape[1]):  # xrange in the Python 2 original
        if (np.sum(data[:, col]) > low_bound):
            ind_list.append(1)
            ind_list_data.append(col)
        else:
            ind_list.append(0)

    v = v.restrict(ind_list)
    data = data[:, ind_list_data]
    #if (start_url[0].find("wikipedia") != -1):
    #	out_data("som_data_wiki/qlink.tfxidf", data[:500], start_url[:500])
    #	out_data("som_data_wiki/notqlink.tfxidf", data[500:], start_url[500:])
    #	out_data("som_data_wiki/data.tfxidf", data, start_url)
    #	out_template("som_data_wiki/data_features.tv", v.get_feature_names(), len(data))
    #	out_template("som_data_wiki/qlink_features.tv", v.get_feature_names(), len(data) / 2)
    #	out_template("som_data_wiki/notqlink_features.tv", v.get_feature_names(), len(data) / 2)
    #	return 0
    best_cou_clusters = data.shape[1]
    #k_means = KMeans(n_clusters=best_cou_clusters, init = 'random')
    #clust = k_means.fit_predict(data)
    brc = Birch(branching_factor=50,
                n_clusters=best_cou_clusters,
                threshold=0.2,
                compute_labels=True)
    clust = brc.fit_predict(data)
    select = SelectKBest(k=min(data.shape[1], 30))
    data = select.fit_transform(data, clust)
    clust = brc.fit_predict(data)
    #print data.shape

    quota_for_each_cluster = np.zeros(best_cou_clusters)
    clust_qlink = list(clust[:500])
    for i in range(best_cou_clusters):
        quota_for_each_cluster[i] = clust_qlink.count(i) / 500.0 * QUOTA
    quota_for_each_cluster *= 2.0
Example #5
def birch(data):
    space = {
        'threshold': hp.uniform('threshold', 0, 1),
        'branching_factor': hp.choice('branching_factor', range(25, 75)),
    }
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_birch, space, algo=algo, max_evals=50)
    model = Birch(threshold=best['threshold'],
                  branching_factor=int(best['branching_factor'] + 25))  # hp.choice returns an index into range(25, 75)
    labels = model.fit_predict(data)  # fit once instead of three times
    return best, labels, sil_score(data, labels), model
Example #7
def skLearnBirch(data):
    threshold = getOptimalClustersSilhoutte(data, ClusteringAlgorithm.skLearnBirch)
    brc = Birch(branching_factor=50, n_clusters=None, threshold=threshold, compute_labels=True)
    labels = brc.fit_predict(data)
    selectedClusterNumber = len(brc.subcluster_centers_)

    return (selectedClusterNumber, brc)
Example #8
def birch(tfidf_matrix):
    b_cluster = Birch(n_clusters=90, threshold=0.7)

    result = b_cluster.fit_predict(tfidf_matrix)
    # `cluster` is assumed to hold the ground-truth labels in the enclosing module
    rbirch = sklearn.metrics.normalized_mutual_info_score(
        cluster, result, average_method='arithmetic')
    print(rbirch)
Example #9
def hyper_birch(args):
    global data_file
    bir = Birch(threshold=args['threshold'],
                branching_factor=int(args['branching_factor']))
    pred = bir.fit_predict(data_file.data)
    temp = sil_score(data_file.data, pred)
    return -temp  # fmin minimizes, so negate the silhouette score
Example #10
def compute_optimal_birch_clustering(node_path_counts: np.ndarray,
                                     pca_target_dimension: int,
                                     number_of_walks: int,
                                     significance_level: float):
    """
    Given an array of node path counts, clusters the nodes into an optimal number of clusters using birch clustering.
    The number of clusters is incrementally increased. The optimal number of clusters is the smallest number of clusters
    such that have statistically similar path count distributions at a specified significance level.
    """
    standardized_path_counts = (
            (node_path_counts - np.mean(node_path_counts, axis=1)[:, None]) / np.mean(node_path_counts, axis=1)[:,
                                                                              None]).T

    feature_vectors = compute_principal_components(feature_vectors=standardized_path_counts,
                                                   target_dimension=pca_target_dimension)

    number_of_feature_vectors = feature_vectors.shape[0]

    cluster_labels = None
    for number_of_clusters in range(2, number_of_feature_vectors):  # start from 2 since zero/one clusters is invalid
        # clusterer = KMeans(n_clusters=number_of_clusters, max_iter=30, n_init=8)
        clusterer = Birch(n_clusters=number_of_clusters, threshold=0.05)

        cluster_labels = clusterer.fit_predict(feature_vectors)
        node_path_counts_of_clusters = get_node_path_counts_of_clusters(node_path_counts, cluster_labels)
        if test_quality_of_clusters(node_path_counts_of_clusters, number_of_walks, significance_level):
            return cluster_labels

    return cluster_labels
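A small numeric check of the standardization step (made-up counts, not from the source): each row becomes its relative deviation from the row mean, and the transpose then orients the matrix for the PCA step.

import numpy as np

node_path_counts = np.array([[2.0, 4.0, 6.0],
                             [10.0, 10.0, 10.0]])
row_means = np.mean(node_path_counts, axis=1)[:, None]   # [[4.], [10.]]
standardized = ((node_path_counts - row_means) / row_means).T
print(standardized)  # first column: [-0.5, 0.0, 0.5]; second: all zeros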
Example #11
def birch(data):
    X = data
    birch = Birch(n_clusters=2, threshold=0.5)
    # fit the model and predict labels
    labels = birch.fit_predict(X)
    print(Counter(labels))
    return labels
Example #12
def birch(X_input, k):
    from sklearn.cluster import Birch
    print('start birch cluster:')
    clusterer = Birch(n_clusters=k, threshold=1)
    y = clusterer.fit_predict(X_input)
    print(y)
    return y
Example #13
def birch_classer():
    data = get_feature()
    libs.logger.log(data)
    libs.logger.log('birch cluster beginning......')
    sse = []
    for clust in range(100, 101):
        libs.logger.log('clust [' + str(clust) + '] is beginning.....')
        birch_cluster = Birch(n_clusters=clust)
        result = birch_cluster.fit_predict(data)

        #store model
        joblib.dump(
            birch_cluster,
            config.MODEL_PATH + 'web_fingerprint_birch_cluster_fit_result.pkl')

        calinski_harabasz_score = metrics.calinski_harabasz_score(data, result)  # the misspelled calinski_harabaz_score alias was removed from sklearn

        f = open('web_fingerprint_birch_10w_' + str(clust) + '.txt', 'w')
        for i in range(len(result)):
            info = str(result[i]) + '\n'
            f.write(info)
        f.write('calinski_harabasz_score:' + str(calinski_harabasz_score))
        f.close()
        libs.logger.log('clust [' + str(clust) + '] finish')
        libs.logger.log('calinski_harabasz_score: ' +
                        str(calinski_harabasz_score))
        libs.logger.log(result)
Example #14
def visual(c, X, y):
    from sklearn.cluster import Birch
    cluster_object = Birch()
    y_pred = cluster_object.fit_predict(X)
    colors = [
        'red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta', 'brown',
        'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue'
    ]
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)
    for cluster in np.unique(y):
        row_idx = np.where(y == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=cluster)  # label so plt.legend() has entries
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=cluster)
    plt.title('Clusters')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
Example #15
def birch(X, y, n):
    """
    Birchによるクラスタリング

    Parameters
    ----------
    X : numpy array
        データ
    y : numpy array
        正解ラベル
    n : int
        クラスタ数

    Returns
    -------
    acc_br : float
        正解率
    time_br : float
        実行時間
    """
    br = Birch(n_clusters=n)  # use the requested cluster count; the original hard-coded 2
    start_br = time.time()
    y_br = br.fit_predict(X)
    end_br = time.time()
    y_br = np.reshape(y_br, (1, len(y[0])))
    acc_br, _, _ = acc(y_br, y)
    time_br = round(end_br - start_br, 2)

    make_graph(X, y_br, n, "Birch")

    return acc_br, time_br
Example #16
    def choose_stocks_index(self):
        stock_choosen_num = {}
        for i in range(self.__X.shape[0]):
            birch = Birch(threshold=0.001, n_clusters=self.__n_clusters)
            y_pred = birch.fit_predict(self.__X[i, :, :])
            subcluster_centers = birch.subcluster_centers_

            choosen_stock = np.array([0 for _ in range(self.__n_clusters)])
            min_distance = np.array([-1.0 for _ in range(self.__n_clusters)])
            for ind in range(self.__stock_num):
                stock = self.__X[i, ind, :]
                stock_label = y_pred[ind]
                distance = np.linalg.norm(stock -
                                          subcluster_centers[stock_label],
                                          ord=2)
                if min_distance[stock_label] == -1 or min_distance[
                        stock_label] > distance:
                    min_distance[stock_label] = distance
                    choosen_stock[stock_label] = ind
            for stock in choosen_stock:
                if stock in stock_choosen_num.keys():
                    stock_choosen_num[stock] += 1
                else:
                    stock_choosen_num[stock] = 1
        stock_choosen_num = list(
            sorted(stock_choosen_num.items(), key=lambda x: x[1],
                   reverse=True))
        choosen_stock_ind = list(map(lambda x: x[0],
                                     stock_choosen_num))[:self.__n_clusters]
        return choosen_stock_ind
Example #17
def birch(X, k):  # points to cluster, number of clusters

    from sklearn.cluster import Birch
    clusterer = Birch(n_clusters=k)
    y = clusterer.fit_predict(X)

    return y
Example #18
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    # url to obj
    qlinks = list(map(parse_url, QLINK_URLS))  # list() so qlinks[0] works on Python 3
    ulinks = list(map(parse_url, UNKNOWN_URLS))

    # check netloc
    # print qlinks[0].netloc

    # extract features
    start = time.time()
    qlinks_f = [dict(Counter(list(zip(*extract_features([link], 0)))[0])) for link in qlinks]
    ulinks_f = [dict(Counter(list(zip(*extract_features([link], 0)))[0])) for link in ulinks]
    # print time.time() - start
    # start = time.time()

    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)

    best_features = np.sum(x_, axis=0) > 5

    m_features = np.sum(best_features)

    v = v.restrict(best_features)
    x_ = x_[:, best_features]

    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR, n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD, compute_labels=True)
    y_ = clustering.fit_predict(x_)

    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)

    y = clustering.fit_predict(x)
    q_or_u = np.repeat([1, 0], [len(QLINK_URLS), len(UNKNOWN_URLS)])
    q_ = np.vstack((y, q_or_u)).T

    quota = zip(np.unique(y),
                (np.array([np.sum(q_[q_[:, 0] == c, 1]) for c in np.unique(y)]) / float(len(QLINK_URLS))) * QUOTA * 2)
    quota = {c: int(q) for c, q in quota}

    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }
Example #19
def clusteringBirch(X, nclusters, paramlist):
    bcl = Birch(threshold=0.5,
                branching_factor=50,
                n_clusters=nclusters,
                compute_labels=True,
                copy=True)
    labels = bcl.fit_predict(X)
    return labels
Example #20
def main():
    filename = 'dataset.txt'
    x = convert_to_int(load_input(filename))
    brc = Birch(branching_factor=50,
                n_clusters=7,
                threshold=0.5,
                compute_labels=True)
    ans = brc.fit_predict(x)
    plot_points(ans, x)
Example #21
def birch(data, threshold, branching_factor):
    # bir = Birch(threshold=args['threshold'], branching_factor=int(args['branching_factor']))

    db = Birch(threshold=threshold, branching_factor=branching_factor)
    pred = db.fit_predict(data)  # fit_predict already fits, so the original's extra db.fit(data) was redundant
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
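`sil_score` in these snippets presumably wraps sklearn's silhouette_score; a self-contained sketch of equivalent scoring on synthetic data (names and parameters assumed, not from the source):

from sklearn.cluster import Birch
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

data, _ = make_blobs(n_samples=200, centers=3, random_state=0)
pred = Birch(threshold=0.5, branching_factor=50).fit_predict(data)
print(silhouette_score(data, pred))  # in [-1, 1]; higher means tighter, better-separated clusters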
Example #22
def hyper_birch(args):
    global basic_data
    global all_data

    bir = Birch(threshold=args['threshold'], branching_factor=int(args['branching_factor']))
    pred = bir.fit_predict(basic_data)
    temp = sil_score(all_data, pred)
    # print(args)
    return -temp  # fmin minimizes, so negate the silhouette score
Example #23
def birch(test_arr, testDt_List, T, B):
    cluster = Birch(n_clusters=None, threshold=T,
                    branching_factor=B)  # the threshold parameter may need tuning
    y = cluster.fit_predict(test_arr)
    print(y)
    label = []  # the cluster each sample belongs to
    for i in range(len(cluster.labels_)):  # the original's range(1, len) with i - 1 dropped the last sample
        label.append((testDt_List[i], cluster.labels_[i]))
    return label
Example #24
def getOptimalClustersSilhoutte(data,
                                algorithm=ClusteringAlgorithm.skLearnKMeans):
    silhoutteScores = {}
    rotationStored = {}
    thresholdValues = {}
    if algorithm == ClusteringAlgorithm.customKMeans:
        for clusterKmeansNumber in range(2, 20):
            try:
                clf = kMeans.K_Means(clusterKmeansNumber,
                                     tolerance=0.00001,
                                     max_iterations=800)
                rotation = randamozieSeed(data, clusterKmeansNumber)
                clf.fit(data, spherical=True, rotationArray=rotation)
                labels = clf.getLabels(data)
                silhouette_avg = silhouette_score(data, labels)
                silhoutteScores[clusterKmeansNumber] = silhouette_avg
                rotationStored[clusterKmeansNumber] = rotation
                # print(clusterKmeansNumber,">>>>>>>",rotation)
            except Exception:  # a bare except would also swallow KeyboardInterrupt
                continue
                # print(clusterKmeansNumber," chucked")
    elif algorithm == ClusteringAlgorithm.skLearnKMeans:
        for clusterKmeansNumber in range(2, 20):
            clf = KMeans(n_clusters=clusterKmeansNumber)
            labels = clf.fit_predict(data)
            silhouette_avg = silhouette_score(data, labels)
            silhoutteScores[clusterKmeansNumber] = silhouette_avg

    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        for i in range(2, 100):
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=0.01 * i,
                        compute_labels=True)

            labels = brc.fit_predict(data)
            print(len(labels))
            try:
                silhouette_avg = silhouette_score(data, labels)
                clusterNumber = len(set(labels))
                silhoutteScores[clusterNumber] = silhouette_avg
                thresholdValues[clusterNumber] = i * 0.01

            except Exception:
                continue

    sortedSil = sorted(silhoutteScores.items(), key=itemgetter(1))
    selectedClusterNumber = sortedSil[-1][0]
    print("selected number of clusters=", selectedClusterNumber)
    if algorithm == ClusteringAlgorithm.customKMeans:
        return (selectedClusterNumber, rotationStored[selectedClusterNumber])
    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        # return the threshold that produced the best silhouette; thresholdValues is
        # only populated in the Birch branch, and callers such as skLearnBirch()
        # above pass this value straight into Birch(threshold=...); the original
        # had these last two returns swapped
        return thresholdValues[selectedClusterNumber]
    else:
        return selectedClusterNumber
Example #25
 def find_anomalous_edges(self):
     for edge in self.edges:
         elapsed_time = np.array(
             list(self.trace_data[self.trace_data.path == edge]
                  ['elapsedTime']))
         normalized_time = preprocessing.normalize([elapsed_time]).reshape(-1, 1)
         if self.take_minute_averages_of_trace_data:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.05,
                           compute_labels=True)
         else:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.001,
                           compute_labels=True)
         birch.fit_predict(normalized_time)
         labels = birch.labels_
         if np.unique(labels).size > 1:
             self.anomalous_edges[edge.split('-')[1]] = edge
Example #26
def julei(word, weight):  # "julei" means clustering
    clusterer = Birch(n_clusters=3)
    y = clusterer.fit_predict(weight)
    print(y)
    print(y.shape)

    # write the word/label listing into each of the 14 output files, closing
    # every handle (the original closed only the last file it opened)
    for i in range(14):
        with open(file3[i], 'w+') as f2:
            for j in range(len(y)):
                f2.write(word[j] + "   " + str(y[j]) + "\n")
                # print(word[j] + "   " + str(y[j]))
    return y
Example #27
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  # for now, do hard clustering: take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]  # `clusters_centers` was undefined in the original
        all_dist = []
        for line_idx in range(len(df_array)):
            label = model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx],dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = list(zip(df.index, res))  # materialize for Python 3, where zip is lazy
    return docs_clusteres
Example #28
    def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("Расчет TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts,
                                                  self.tf_idf_norm,
                                                  self.tf_idf_is_smooth_idf,
                                                  self.tf_idf_sublinear_tf)
            self.signals.PrintInfo.emit(msg)

        if self.need_tf_idf_formula:
            self.signals.PrintInfo.emit(
                "Computing TF-IDF using the formula in the image...")
            idf_filename = output_dir + 'tf_idf_formula.csv'
            msg = self.calculate_and_write_tf_idf_formula(
                idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nPer-document predictions:\n')

        clusters_output = ''
        for cluster_index in range(max(predict_result) + 1):
            clusters_output += ('Cluster ' + str(cluster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == cluster_index:
                    clusters_output += ('  ' + str(document) + '\n')
            clusters_output += '\n'
        self.signals.PrintInfo.emit(clusters_output)
        self.signals.PrintInfo.emit('Saved to: ' +
                                    str(output_dir + 'clusters.txt'))
        writeStringToFile(clusters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
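The vectorize -> LSA -> Birch pipeline above compresses sparse term counts into two dense, normalized dimensions before clustering. A condensed, self-contained sketch with made-up documents and parameters (not from the original source):

from sklearn.cluster import Birch
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

docs = ["apples and oranges", "oranges and pears",
        "cars and trucks", "trucks and buses"]
X = CountVectorizer().fit_transform(docs)
# reduce to 2 LSA components and L2-normalize, as in the example above
X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)
print(Birch(threshold=0.5, branching_factor=50, n_clusters=2).fit_predict(X))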
Example #29
def BIR(data_matrix, C=None, model_path=None):
    '''
    Hierarchical (BIRCH) clustering
    :param data_matrix: input matrix
    :param C: number of clusters
    :return: cluster labels
    '''
    BIR_model = Birch(n_clusters=C)
    labels = BIR_model.fit_predict(data_matrix)
    if model_path is not None:
        joblib.dump(value=BIR_model, filename=model_path)
    # print(BIR_model)
    # labels = get_trans_label(model=BIR_model,labels=labels)
    return labels
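A hypothetical round trip for the persisted model (the file name and data below are made up):

import joblib
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=4, random_state=1)
labels = BIR(X, C=4, model_path='birch_model.pkl')
restored = joblib.load('birch_model.pkl')
print((restored.predict(X) == labels).all())  # should print True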
Example #30
def Bir(data, Data_for_Cluster, k, threshold, branching_factor):
    # Parameter choices for Birch clustering:
    # k = 2  # [4-15, None]
    # threshold = 0.5  # [0.5, 0.3, 0.1]
    # branching_factor = 50  # [50, 20, 10]
    print('Bir', 'number of clusters:', k, 'threshold:', threshold,
          'branching_factor:', branching_factor)
    Birmod = Birch(n_clusters=k,
                   threshold=threshold,
                   branching_factor=branching_factor)
    pred = Birmod.fit_predict(Data_for_Cluster)
    for i in data.index:
        data.loc[i, 'clustering'] = pred[i]
    return data
Example #31
def cluster_birch(n_clusters):
    """
    Birch clustering of the PCA-processed feature vectors
    :param n_clusters: number of centroids
    :return:
    """
    data = get_data("../data/feature_vector_pca.csv")
    birch = Birch(n_clusters=n_clusters, threshold=0.4, branching_factor=50)
    clusters = birch.fit_predict(data)
    print("Calinski-Harabasz Score",
          metrics.calinski_harabasz_score(data, clusters))
    print("Cluster index of each sample", clusters)
    # print("Cluster centers", birch.cluster_centers_)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")
Example #32
def birch_cluster(init_ds, ts_flag=False):
    '''
    Parameters: init_ds - 2D list of data
                ts_flag - boolean specifying if the first column of init_ds is a datetime object or not
    Returns: 2D list with additional column denoting which cluster said row falls into
    '''

    if ts_flag:
        init_ds = [i[1:] for i in init_ds]

    brc = Birch()
    labels = brc.fit_predict(init_ds)
    
    return [init_ds[i]+[labels[i]] for i in range(len(init_ds)) ]
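A hypothetical call (made-up rows): with ts_flag=True the leading datetime column is stripped before clustering.

import datetime

rows = [[datetime.datetime(2020, 1, 1), 1.0, 2.0],
        [datetime.datetime(2020, 1, 2), 1.1, 2.1],
        [datetime.datetime(2020, 1, 3), 5.0, 5.0],
        [datetime.datetime(2020, 1, 4), 5.1, 5.1],
        [datetime.datetime(2020, 1, 5), 9.0, 8.0],
        [datetime.datetime(2020, 1, 6), 9.1, 8.1]]
labelled = birch_cluster(rows, ts_flag=True)
print(labelled)  # each row now ends with its cluster label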
Example #33
def birch(filename,output,ktype):
    """
        use BIRCH cluster training
    """
    # model, word_vectors = w2v(filename)

    # n_words = word_vectors.shape[0]
    # vec_size = word_vectors.shape[1]

    # #  K means training
    # kmeans = KMeans(n_clusters= ktype, n_jobs=-1, random_state=0)
    # idx = kmeans.fit_predict(word_vectors)

    # # Use Simhash
    # word_centroid_list = list(zip(model.wv.index2word, idx))
    # word_centroid_list_simhash = [(Simhash(get_features(item[0])).value,item[1]) for item in word_centroid_list]

    # Use BIRCH training
    # better: cf is 4, sample in cs is 20
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5, compute_labels=True)
    # brc.fit(word_centroid_list_simhash)
    # NOTE: word_centroid_list_simhash is built by the commented-out block above;
    # re-enable that code before calling fit_predict here.
    brc.fit_predict(word_centroid_list_simhash)
Example #34
    def birch_sample(self):
        """Applies Birch and DBSCAN to the image. Not for consumer use.

        """
        self.birch_thr = self.eps_filter/10.
        brc = Birch(branching_factor=50, n_clusters=None, threshold=self.birch_thr, compute_labels=True)
        self.divide_labels = brc.fit_predict(self.xyz)
        tmp_brc = brc.subcluster_centers_
        _frac = 100.*brc.subcluster_centers_.shape[0]/np.float64((self.img_original_reshape.shape[0]*self.img_original_reshape.shape[1]))
        lab_out = np.ones(brc.subcluster_centers_.shape[0], dtype=np.int32)
        agal = DBSCAN(eps=self.eps_filter, min_samples=self.min_samples_reduce, algorithm='ball_tree', n_jobs=-1)
        lab_out = agal.fit_predict(tmp_brc).astype(np.int32)
        _frac = 100.*np.sum(lab_out > -1)/np.float64((self.img_original_reshape.shape[0]*self.img_original_reshape.shape[1]))
        self.dbs_samp_frac = _frac
        return lab_out, tmp_brc
Example #35
print(data, citypos.shape)

# KMeans
km = KMeans(n_clusters=100, n_init=1)
itime = time.perf_counter()
kmlabels = km.fit_predict(citypos)
etime = time.perf_counter()
print ('K-means Time = ', etime-itime)

# Minibatch Kmeans
itime = time.perf_counter()
mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000)
mbkmlabels = mbkm.fit_predict(citypos)
etime = time.perf_counter()
print ('MB K-means Time = ', etime-itime)

print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels))

# Birch
itime = time.perf_counter()
birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
birchlabels = birch.fit_predict(citypos)
etime = time.perf_counter()
print ('BIRCH Time = ',etime-itime)

print('Similarity Km vs BIRCH',adjusted_mutual_info_score(kmlabels, birchlabels))
Example #36
import numpy as np
from sklearn.cluster import Birch
import cluster
import csv

clusters = 20
submit_file = 'submit_birch.csv'

X, plays = cluster.get_matrix()
X = np.array(X, dtype=float)
plays = np.array(plays, dtype=float)
# print(X.shape)
print("Running Birch on training data...", end="")
brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5, compute_labels=True)
labels = brc.fit_predict(X)
print("Done!")

print(labels)
# plays_sums = [0] * clusters 
# cluster_size = [0] * clusters
plays_sums = {}

# Median
for idx, label in enumerate(labels):
  if label in plays_sums:
    plays_sums[label].append(plays[idx])
  else:
    plays_sums[label] = [plays[idx]]
  # cluster_size[label] += 1
Example #37

np.random.seed(0)

X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]
Html_file = open("clustering_files/birch.html", "w")

scaler = StandardScaler()
X = scaler.fit_transform(X)


for n_clusters in range(2, 10):

    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)

    print "components:", set(preds)
    print np.bincount(preds)

    data_thr['preds'] = pd.Series(preds).astype("category")

    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"] * 2  # Spectral9
    # color_key = color_key[:len(set(preds))+2]


    # single plot rateCA vs rate with predicted classes and ellipses:

    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None,
                                        x_name='rateCA',
Example #38
# set up clustering algorithms
db = DBSCAN(eps=0.3, min_samples=5)
ac = AgglomerativeClustering(n_clusters=2, metric='euclidean',
                             linkage='average')  # `affinity` was renamed to `metric` in recent sklearn
#km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
#sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1) 
#bandwidth = estimate_bandwidth(X, quantile=0.3)
#ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
#ap= AffinityPropagation(damping=.9, preference=-200)

#y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
y_db[y_db==-1] = 1
#print np.unique(y_db)
#y_sp = sp.fit_predict(X)
#y_ms = ms.fit_predict(X)
#y_ap = ap.fit_predict(X)

labels = {'AgglomerativeClustering':y_ac}
#labels['MiniBatchKMeans'] = y_km 
labels['DBSCAN'] = y_db
labels['Birch'] = y_bc

# make plot about the clustering results
fig, axes = plt.subplots(3,len(labels), figsize=(17,10))