def birch_clustering(data):
    # set up the Birch model
    birch = Birch(n_clusters=3)
    # fit the model to the data
    birch.fit_predict(data)
    label_pred = birch.labels_
    x0 = data[label_pred == 0]
    x1 = data[label_pred == 1]
    x2 = data[label_pred == 2]

    x00 = x0[:, 0]
    y00 = x0[:, 1]
    z00 = x0[:, 2]
    x11 = x1[:, 0]
    y11 = x1[:, 1]
    z11 = x1[:, 2]
    x22 = x2[:, 0]
    y22 = x2[:, 1]
    z22 = x2[:, 2]
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # Axes3D(fig) no longer attaches itself to the figure in modern matplotlib
    ax.scatter(x00, y00, z00, c="red", marker='o', label='label0')
    ax.scatter(x11, y11, z11, c="green", marker='*', label='label1')
    ax.scatter(x22, y22, z22, c="blue", marker='+', label='label2')
    '''plt.scatter(x0[:, 1], x0[:, 2], c="red", marker='o', label='label0')
    plt.scatter(x1[:, 1], x1[:, 2], c="green", marker='*', label='label1')
    plt.scatter(x2[:, 1], x2[:, 2], c="blue", marker='+', label='label2')
    plt.xlabel('petal length')
    plt.ylabel('petal width')'''
    plt.legend(loc=2)
    plt.show()
    return x0, x1, x2
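A minimal usage sketch (not from the original source) for the function above: it expects an (n, 3) array, here synthetic blobs from sklearn.

# Hypothetical driver; assumes birch_clustering() and its imports
# (Birch, matplotlib) are defined as in the example above.
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, n_features=3, centers=3, random_state=42)
x0, x1, x2 = birch_clustering(data)  # plots the clusters and returns them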
Example #2
 def do_birch(self, values, threshold):
     values = np.array(values)
     normalized_time = preprocessing.normalize([values]).reshape(-1, 1)
     birch = Birch(branching_factor=50,
                   n_clusters=None,
                   threshold=threshold,
                   compute_labels=True)
     birch.fit_predict(normalized_time)
     return (np.unique(birch.labels_).size > 1)
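The check above is the recurring anomaly-flag idiom in this collection: with n_clusters=None, Birch skips its global clustering step and returns the raw CF-subcluster labels, so more than one unique label means the normalized series did not fit inside a single subcluster of radius `threshold`. A minimal sketch on made-up series (not from the original source):

import numpy as np
from sklearn import preprocessing
from sklearn.cluster import Birch

steady = np.full(100, 10.0)
spiky = np.concatenate([np.full(95, 10.0), np.full(5, 500.0)])
for series in (steady, spiky):
    normalized = preprocessing.normalize([series]).reshape(-1, 1)
    labels = Birch(n_clusters=None, threshold=0.001).fit_predict(normalized)
    print(np.unique(labels).size > 1)  # False for steady, True for spiky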
Example #3
    def get_personalization(self, service):
        weight_average = 0.0
        num = 0
        max_corr = 0.01
        metrics = []
        for _, _, data in self.anomalous_subgraph.in_edges(service, data=True):
            weight_average += data['weight']
            num += 1

        for _, destination, data in self.anomalous_subgraph.out_edges(
                service, data=True):
            if self.anomalous_subgraph.nodes[destination]['type'] == 'service':
                num += 1
                weight_average += data['weight']

        hosts = self.trace_data.loc[self.trace_data.serviceName ==
                                    service].cmdb_id.unique()
        host_groups = self.host_data[self.host_data['cmdb_id'].isin(
            hosts)].groupby('cmdb_id')[['name', 'value']]

        for host, host_data_subset in host_groups:
            for KPI, values in host_data_subset.groupby('name')['value']:
                anomalous_data = pd.Series(
                    list(self.trace_data.loc[
                        (self.trace_data.path == self.anomalous_edges[service])
                        & (self.trace_data.cmdb_id == host)]['elapsedTime']))
                values = pd.Series(list(values))
                correlation = 0
                if len(set(anomalous_data)) > 1 and len(set(values)) > 1:
                    correlation = abs(anomalous_data.corr(values))
                    normalized_time = preprocessing.normalize(
                        [np.array(values)]).reshape(-1, 1)
                    birch = Birch(branching_factor=50,
                                  n_clusters=None,
                                  threshold=0.005,
                                  compute_labels=True)
                    birch.fit_predict(normalized_time)
                    labels = birch.labels_
                    coefficient = int(np.unique(labels).size > 1)
                    correlation = coefficient * correlation
                if pd.isna(correlation):
                    correlation = 0
                if correlation > max_corr:
                    metrics.append((host, KPI, correlation))
                    max_corr = correlation

        data = weight_average * max_corr
        metrics.sort(key=lambda tup: tup[2], reverse=True)
        if len(metrics) > 1:
            if metrics[1][2] / metrics[0][2] > 0.9:
                return data, metrics[:2]
            else:
                return data, metrics[:1]
        else:
            return data, metrics
Example #4
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    #t = time.clock()
    global quota_for_each_cluster
    global brc
    global v
    global quota
    global select
    quota = 10000
    result_arr = QLINK_URLS + UNKNOWN_URLS
    for i, url in enumerate(result_arr):
        result_arr[i] = urlparse.urlparse(unquote(url.strip()))

    #l_dict =
    v = DictVectorizer(sparse=False)
    data = v.fit_transform(extract_features(result_arr))
    ind_list = []
    ind_list_data = []
    low_bound = 8

    for col in range(data.shape[1]):  # xrange in the Python 2 original
        if (np.sum(data[:, col]) > low_bound):
            ind_list.append(1)
            ind_list_data.append(col)
        else:
            ind_list.append(0)

    v = v.restrict(ind_list)
    data = data[:, ind_list_data]
    #if (start_url[0].find("wikipedia") != -1):
    #	out_data("som_data_wiki/qlink.tfxidf", data[:500], start_url[:500])
    #	out_data("som_data_wiki/notqlink.tfxidf", data[500:], start_url[500:])
    #	out_data("som_data_wiki/data.tfxidf", data, start_url)
    #	out_template("som_data_wiki/data_features.tv", v.get_feature_names(), len(data))
    #	out_template("som_data_wiki/qlink_features.tv", v.get_feature_names(), len(data) / 2)
    #	out_template("som_data_wiki/notqlink_features.tv", v.get_feature_names(), len(data) / 2)
    #	return 0
    best_cou_clusters = data.shape[1]
    #k_means = KMeans(n_clusters=best_cou_clusters, init = 'random')
    #clust = k_means.fit_predict(data)
    brc = Birch(branching_factor=50,
                n_clusters=best_cou_clusters,
                threshold=0.2,
                compute_labels=True)
    clust = brc.fit_predict(data)
    select = SelectKBest(k=min(data.shape[1], 30))
    data = select.fit_transform(data, clust)
    clust = brc.fit_predict(data)
    #print data.shape

    quota_for_each_cluster = np.zeros(best_cou_clusters)
    clust_qlink = list(clust[:500])
    for i in range(best_cou_clusters):
        quota_for_each_cluster[i] = clust_qlink.count(i) / 500.0 * QUOTA
    quota_for_each_cluster *= 2.0
Example #5
def birch(data):
    space = {
        'threshold': hp.uniform('threshold', 0, 1),
        'branching_factor': hp.choice('branching_factor', range(25, 75)),
    }
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_birch, space, algo=algo, max_evals=50)
    model = Birch(threshold=best['threshold'],
                  branching_factor=int(best['branching_factor'] + 25))  # hp.choice returns an index into range(25, 75)
    labels = model.fit_predict(data)  # fit once instead of three times
    return best, labels, sil_score(data, labels), model
Example #7
def skLearnBirch(data):
    threshold = getOptimalClustersSilhoutte(data, ClusteringAlgorithm.skLearnBirch)
    brc = Birch(branching_factor=50, n_clusters=None, threshold=threshold, compute_labels=True)
    labels = brc.fit_predict(data)
    selectedClusterNumber = len(brc.subcluster_centers_)

    return (selectedClusterNumber, brc)
Example #8
def birch(tfidf_matrix):
    b_cluster = Birch(n_clusters=90, threshold=0.7)

    result = b_cluster.fit_predict(tfidf_matrix)
    # `cluster` is assumed to hold the ground-truth labels in the enclosing module
    rbirch = sklearn.metrics.normalized_mutual_info_score(
        cluster, result, average_method='arithmetic')
    print(rbirch)
Example #9
def hyper_birch(args):
    global data_file
    bir = Birch(threshold=args['threshold'],
                branching_factor=int(args['branching_factor']))
    pred = bir.fit_predict(data_file.data)
    temp = sil_score(data_file.data, pred)
    return -temp  # fmin minimizes, so negate the silhouette score
Example #10
def compute_optimal_birch_clustering(node_path_counts: np.ndarray,
                                     pca_target_dimension: int,
                                     number_of_walks: int,
                                     significance_level: float):
    """
    Given an array of node path counts, clusters the nodes into an optimal number of clusters using birch clustering.
    The number of clusters is incrementally increased. The optimal number of clusters is the smallest number of clusters
    such that have statistically similar path count distributions at a specified significance level.
    """
    standardized_path_counts = (
            (node_path_counts - np.mean(node_path_counts, axis=1)[:, None]) / np.mean(node_path_counts, axis=1)[:,
                                                                              None]).T

    feature_vectors = compute_principal_components(feature_vectors=standardized_path_counts,
                                                   target_dimension=pca_target_dimension)

    number_of_feature_vectors = feature_vectors.shape[0]

    cluster_labels = None
    for number_of_clusters in range(2, number_of_feature_vectors):  # start from 2 since zero/one clusters is invalid
        # clusterer = KMeans(n_clusters=number_of_clusters, max_iter=30, n_init=8)
        clusterer = Birch(n_clusters=number_of_clusters, threshold=0.05)

        cluster_labels = clusterer.fit_predict(feature_vectors)
        node_path_counts_of_clusters = get_node_path_counts_of_clusters(node_path_counts, cluster_labels)
        if test_quality_of_clusters(node_path_counts_of_clusters, number_of_walks, significance_level):
            return cluster_labels

    return cluster_labels
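A small numeric check of the standardization step (made-up counts, not from the source): each row becomes its relative deviation from the row mean, and the transpose then orients the matrix for the PCA step.

import numpy as np

node_path_counts = np.array([[2.0, 4.0, 6.0],
                             [10.0, 10.0, 10.0]])
row_means = np.mean(node_path_counts, axis=1)[:, None]   # [[4.], [10.]]
standardized = ((node_path_counts - row_means) / row_means).T
print(standardized)  # first column: [-0.5, 0.0, 0.5]; second: all zeros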
Example #11
def birch(data):
    X = data
    birch = Birch(n_clusters=2, threshold=0.5)
    # fit the model and predict labels
    labels = birch.fit_predict(X)
    print(Counter(labels))
    return labels
Example #12
def birch(X_input, k):
    from sklearn.cluster import Birch
    print('start birch cluster:')
    clusterer = Birch(n_clusters=k, threshold=1)
    y = clusterer.fit_predict(X_input)
    print(y)
    return y
Example #13
def birch_classer():
    data = get_feature()
    libs.logger.log(data)
    libs.logger.log('birch cluster beginning......')
    sse = []
    for clust in range(100, 101):
        libs.logger.log('clust [' + str(clust) + '] is beginning.....')
        birch_cluster = Birch(n_clusters=clust)
        result = birch_cluster.fit_predict(data)

        #store model
        joblib.dump(
            birch_cluster,
            config.MODEL_PATH + 'web_fingerprint_birch_cluster_fit_result.pkl')

        calinski_harabasz_score = metrics.calinski_harabasz_score(data, result)  # the misspelled calinski_harabaz_score alias was removed from sklearn

        f = open('web_fingerprint_birch_10w_' + str(clust) + '.txt', 'w')
        for i in range(len(result)):
            info = str(result[i]) + '\n'
            f.write(info)
        f.write('calinski_harabasz_score:' + str(calinski_harabasz_score))
        f.close()
        libs.logger.log('clust [' + str(clust) + '] finish')
        libs.logger.log('calinski_harabasz_score: ' +
                        str(calinski_harabasz_score))
        libs.logger.log(result)
Example #14
def visual(c, X, y):
    from sklearn.cluster import Birch
    cluster_object = Birch()
    y_pred = cluster_object.fit_predict(X)
    colors = [
        'red', 'green', 'blue', 'cyan', 'black', 'yellow', 'magenta', 'brown',
        'orange', 'silver', 'goldenrod', 'olive', 'dodgerblue'
    ]
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)
    for cluster in np.unique(y):
        row_idx = np.where(y == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=cluster)  # label so plt.legend() has entries
    plt.title('Dataset')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
    for cluster in clusters:
        row_idx = np.where(y_pred == cluster)
        plt.scatter(X[row_idx, 0], X[row_idx, 1], label=cluster)
    plt.title('Clusters')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
    plt.show()
Example #15
def birch(X, y, n):
    """
    Birchによるクラスタリング

    Parameters
    ----------
    X : numpy array
        データ
    y : numpy array
        正解ラベル
    n : int
        クラスタ数

    Returns
    -------
    acc_br : float
        正解率
    time_br : float
        実行時間
    """
    br = Birch(n_clusters=n)  # use the requested cluster count; the original hard-coded 2
    start_br = time.time()
    y_br = br.fit_predict(X)
    end_br = time.time()
    y_br = np.reshape(y_br, (1, len(y[0])))
    acc_br, _, _ = acc(y_br, y)
    time_br = round(end_br - start_br, 2)

    make_graph(X, y_br, n, "Birch")

    return acc_br, time_br
Example #16
    def choose_stocks_index(self):
        stock_choosen_num = {}
        for i in range(self.__X.shape[0]):
            birch = Birch(threshold=0.001, n_clusters=self.__n_clusters)
            y_pred = birch.fit_predict(self.__X[i, :, :])
            subcluster_centers = birch.subcluster_centers_

            choosen_stock = np.array([0 for _ in range(self.__n_clusters)])
            min_distance = np.array([-1.0 for _ in range(self.__n_clusters)])
            for ind in range(self.__stock_num):
                stock = self.__X[i, ind, :]
                stock_label = y_pred[ind]
                distance = np.linalg.norm(stock -
                                          subcluster_centers[stock_label],
                                          ord=2)
                if min_distance[stock_label] == -1 or min_distance[
                        stock_label] > distance:
                    min_distance[stock_label] = distance
                    choosen_stock[stock_label] = ind
            for stock in choosen_stock:
                if stock in stock_choosen_num.keys():
                    stock_choosen_num[stock] += 1
                else:
                    stock_choosen_num[stock] = 1
        stock_choosen_num = list(
            sorted(stock_choosen_num.items(), key=lambda x: x[1],
                   reverse=True))
        choosen_stock_ind = list(map(lambda x: x[0],
                                     stock_choosen_num))[:self.__n_clusters]
        return choosen_stock_ind
Example #17
def birch(X, k):  # points to cluster, number of clusters

    from sklearn.cluster import Birch
    clusterer = Birch(n_clusters=k)
    y = clusterer.fit_predict(X)

    return y
Example #18
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    # url to obj
    qlinks = list(map(parse_url, QLINK_URLS))  # list() so qlinks[0] works on Python 3
    ulinks = list(map(parse_url, UNKNOWN_URLS))

    # check netloc
    # print qlinks[0].netloc

    # extract features
    start = time.time()
    qlinks_f = [dict(Counter(list(zip(*extract_features([link], 0)))[0])) for link in qlinks]
    ulinks_f = [dict(Counter(list(zip(*extract_features([link], 0)))[0])) for link in ulinks]
    # print time.time() - start
    # start = time.time()

    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)

    best_features = np.sum(x_, axis=0) > 5

    m_features = np.sum(best_features)

    v = v.restrict(best_features)
    x_ = x_[:, best_features]

    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR, n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD, compute_labels=True)
    y_ = clustering.fit_predict(x_)

    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)

    y = clustering.fit_predict(x)
    q_or_u = np.repeat([1, 0], [len(QLINK_URLS), len(UNKNOWN_URLS)])
    q_ = np.vstack((y, q_or_u)).T

    quota = zip(np.unique(y),
                (np.array([np.sum(q_[q_[:, 0] == c, 1]) for c in np.unique(y)]) / float(len(QLINK_URLS))) * QUOTA * 2)
    quota = {c: int(q) for c, q in quota}

    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }
Example #19
def clusteringBirch(X, nclusters, paramlist):
    bcl = Birch(threshold=0.5,
                branching_factor=50,
                n_clusters=nclusters,
                compute_labels=True,
                copy=True)
    labels = bcl.fit_predict(X)
    return labels
Example #20
def main():
    filename = 'dataset.txt'
    x = convert_to_int(load_input(filename))
    brc = Birch(branching_factor=50,
                n_clusters=7,
                threshold=0.5,
                compute_labels=True)
    ans = brc.fit_predict(x)
    plot_points(ans, x)
Example #21
def birch(data, threshold, branching_factor):
    # bir = Birch(threshold=args['threshold'], branching_factor=int(args['branching_factor']))

    db = Birch(threshold=threshold, branching_factor=branching_factor)
    pred = db.fit_predict(data)  # fit_predict already fits, so the original's extra db.fit(data) was redundant
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
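`sil_score` in these snippets presumably wraps sklearn's silhouette_score; a self-contained sketch of equivalent scoring on synthetic data (names and parameters assumed, not from the source):

from sklearn.cluster import Birch
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

data, _ = make_blobs(n_samples=200, centers=3, random_state=0)
pred = Birch(threshold=0.5, branching_factor=50).fit_predict(data)
print(silhouette_score(data, pred))  # in [-1, 1]; higher means tighter, better-separated clusters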
Example #22
def hyper_birch(args):
    global basic_data
    global all_data

    bir = Birch(threshold=args['threshold'], branching_factor=int(args['branching_factor']))
    pred = bir.fit_predict(basic_data)
    temp = sil_score(all_data, pred)
    # print(args)
    return -temp  # fmin minimizes, so negate the silhouette score
Example #23
def birch(test_arr, testDt_List, T, B):
    cluster = Birch(n_clusters=None, threshold=T,
                    branching_factor=B)  # the threshold parameter may need tuning
    y = cluster.fit_predict(test_arr)
    print(y)
    label = []  # the cluster each sample belongs to
    for i in range(len(cluster.labels_)):  # the original's range(1, len) with i - 1 dropped the last sample
        label.append((testDt_List[i], cluster.labels_[i]))
    return label
Example #24
def getOptimalClustersSilhoutte(data,
                                algorithm=ClusteringAlgorithm.skLearnKMeans):
    silhoutteScores = {}
    rotationStored = {}
    thresholdValues = {}
    if algorithm == ClusteringAlgorithm.customKMeans:
        for clusterKmeansNumber in range(2, 20):
            try:
                clf = kMeans.K_Means(clusterKmeansNumber,
                                     tolerance=0.00001,
                                     max_iterations=800)
                rotation = randamozieSeed(data, clusterKmeansNumber)
                clf.fit(data, spherical=True, rotationArray=rotation)
                labels = clf.getLabels(data)
                silhouette_avg = silhouette_score(data, labels)
                silhoutteScores[clusterKmeansNumber] = silhouette_avg
                rotationStored[clusterKmeansNumber] = rotation
                # print(clusterKmeansNumber,">>>>>>>",rotation)
            except Exception:  # a bare except would also swallow KeyboardInterrupt
                continue
                # print(clusterKmeansNumber," chucked")
    elif algorithm == ClusteringAlgorithm.skLearnKMeans:
        for clusterKmeansNumber in range(2, 20):
            clf = KMeans(n_clusters=clusterKmeansNumber)
            labels = clf.fit_predict(data)
            silhouette_avg = silhouette_score(data, labels)
            silhoutteScores[clusterKmeansNumber] = silhouette_avg

    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        for i in range(2, 100):
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=0.01 * i,
                        compute_labels=True)

            labels = brc.fit_predict(data)
            print(len(labels))
            try:
                silhouette_avg = silhouette_score(data, labels)
                clusterNumber = len(set(labels))
                silhoutteScores[clusterNumber] = silhouette_avg
                thresholdValues[clusterNumber] = i * 0.01

            except Exception:
                continue

    sortedSil = sorted(silhoutteScores.items(), key=itemgetter(1))
    selectedClusterNumber = sortedSil[-1][0]
    print("selected number of clusters=", selectedClusterNumber)
    if algorithm == ClusteringAlgorithm.customKMeans:
        return (selectedClusterNumber, rotationStored[selectedClusterNumber])
    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        # return the threshold that produced the best silhouette; thresholdValues is
        # only populated in the Birch branch, and callers such as skLearnBirch()
        # above pass this value straight into Birch(threshold=...); the original
        # had these last two returns swapped
        return thresholdValues[selectedClusterNumber]
    else:
        return selectedClusterNumber
Example #25
 def find_anomalous_edges(self):
     for edge in self.edges:
         elapsed_time = np.array(
             list(self.trace_data[self.trace_data.path == edge]
                  ['elapsedTime']))
         normalized_time = preprocessing.normalize([elapsed_time]).reshape(-1, 1)
         if self.take_minute_averages_of_trace_data:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.05,
                           compute_labels=True)
         else:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.001,
                           compute_labels=True)
         birch.fit_predict(normalized_time)
         labels = birch.labels_
         if np.unique(labels).size > 1:
             self.anomalous_edges[edge.split('-')[1]] = edge
Example #26
def julei(word, weight):  # "julei" means clustering
    clusterer = Birch(n_clusters=3)
    y = clusterer.fit_predict(weight)
    print(y)
    print(y.shape)

    # write the word/label listing into each of the 14 output files, closing
    # every handle (the original closed only the last file it opened)
    for i in range(14):
        with open(file3[i], 'w+') as f2:
            for j in range(len(y)):
                f2.write(word[j] + "   " + str(y[j]) + "\n")
                # print(word[j] + "   " + str(y[j]))
    return y
Example #27
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  # for now, do hard clustering: take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]  # `clusters_centers` was undefined in the original
        all_dist = []
        for line_idx in range(len(df_array)):
            label = model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx],dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = list(zip(df.index, res))  # materialize for Python 3, where zip is lazy
    return docs_clusteres
Example #28
    def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("Расчет TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts,
                                                  self.tf_idf_norm,
                                                  self.tf_idf_is_smooth_idf,
                                                  self.tf_idf_sublinear_tf)
            self.signals.PrintInfo.emit(msg)

        if self.need_tf_idf_formula:
            self.signals.PrintInfo.emit(
                "Computing TF-IDF using the formula in the image...")
            idf_filename = output_dir + 'tf_idf_formula.csv'
            msg = self.calculate_and_write_tf_idf_formula(
                idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nPer-document predictions:\n')

        clusters_output = ''
        for cluster_index in range(max(predict_result) + 1):
            clusters_output += ('Cluster ' + str(cluster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == cluster_index:
                    clusters_output += ('  ' + str(document) + '\n')
            clusters_output += '\n'
        self.signals.PrintInfo.emit(clusters_output)
        self.signals.PrintInfo.emit('Saved to: ' +
                                    str(output_dir + 'clusters.txt'))
        writeStringToFile(clusters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
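The vectorize -> LSA -> Birch pipeline above compresses sparse term counts into two dense, normalized dimensions before clustering. A condensed, self-contained sketch with made-up documents and parameters (not from the original source):

from sklearn.cluster import Birch
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

docs = ["apples and oranges", "oranges and pears",
        "cars and trucks", "trucks and buses"]
X = CountVectorizer().fit_transform(docs)
# reduce to 2 LSA components and L2-normalize, as in the example above
X = make_pipeline(TruncatedSVD(2), Normalizer(copy=False)).fit_transform(X)
print(Birch(threshold=0.5, branching_factor=50, n_clusters=2).fit_predict(X))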
Example #29
def BIR(data_matrix, C=None, model_path=None):
    '''
    Hierarchical (BIRCH) clustering
    :param data_matrix: input matrix
    :param C: number of clusters
    :return: cluster labels
    '''
    BIR_model = Birch(n_clusters=C)
    labels = BIR_model.fit_predict(data_matrix)
    if model_path is not None:
        joblib.dump(value=BIR_model, filename=model_path)
    # print(BIR_model)
    # labels = get_trans_label(model=BIR_model,labels=labels)
    return labels
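A hypothetical round trip for the persisted model (the file name and data below are made up):

import joblib
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=4, random_state=1)
labels = BIR(X, C=4, model_path='birch_model.pkl')
restored = joblib.load('birch_model.pkl')
print((restored.predict(X) == labels).all())  # should print True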
Example #30
def Bir(data, Data_for_Cluster, k, threshold, branching_factor):
    # Parameter choices for Birch clustering:
    # k = 2  # [4-15, None]
    # threshold = 0.5  # [0.5, 0.3, 0.1]
    # branching_factor = 50  # [50, 20, 10]
    print('Bir', 'number of clusters:', k, 'threshold:', threshold,
          'branching_factor:', branching_factor)
    Birmod = Birch(n_clusters=k,
                   threshold=threshold,
                   branching_factor=branching_factor)
    pred = Birmod.fit_predict(Data_for_Cluster)
    for i in data.index:
        data.loc[i, 'clustering'] = pred[i]
    return data
Example #31
def cluster_birch(n_clusters):
    """
    Birch clustering of the PCA-processed feature vectors
    :param n_clusters: number of centroids
    :return:
    """
    data = get_data("../data/feature_vector_pca.csv")
    birch = Birch(n_clusters=n_clusters, threshold=0.4, branching_factor=50)
    clusters = birch.fit_predict(data)
    print("Calinski-Harabasz Score",
          metrics.calinski_harabasz_score(data, clusters))
    print("Cluster index of each sample", clusters)
    # print("Cluster centers", birch.cluster_centers_)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")
Example #32
def birch_cluster(init_ds, ts_flag=False):
    '''
    Parameters: init_ds - 2D list of data
                ts_flag - boolean specifying if the first column of init_ds is a datetime object or not
    Returns: 2D list with additional column denoting which cluster said row falls into
    '''

    if ts_flag:
        init_ds = [i[1:] for i in init_ds]

    brc = Birch()
    labels = brc.fit_predict(init_ds)
    
    return [init_ds[i]+[labels[i]] for i in range(len(init_ds)) ]
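A hypothetical call (made-up rows): with ts_flag=True the leading datetime column is stripped before clustering.

import datetime

rows = [[datetime.datetime(2020, 1, 1), 1.0, 2.0],
        [datetime.datetime(2020, 1, 2), 1.1, 2.1],
        [datetime.datetime(2020, 1, 3), 5.0, 5.0],
        [datetime.datetime(2020, 1, 4), 5.1, 5.1],
        [datetime.datetime(2020, 1, 5), 9.0, 8.0],
        [datetime.datetime(2020, 1, 6), 9.1, 8.1]]
labelled = birch_cluster(rows, ts_flag=True)
print(labelled)  # each row now ends with its cluster label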
Example #33
def birch(filename,output,ktype):
    """
        use BIRCH cluster training
    """
    # model, word_vectors = w2v(filename)

    # n_words = word_vectors.shape[0]
    # vec_size = word_vectors.shape[1]

    # #  K means training
    # kmeans = KMeans(n_clusters= ktype, n_jobs=-1, random_state=0)
    # idx = kmeans.fit_predict(word_vectors)

    # # Use Simhash
    # word_centroid_list = list(zip(model.wv.index2word, idx))
    # word_centroid_list_simhash = [(Simhash(get_features(item[0])).value,item[1]) for item in word_centroid_list]

    # Use BIRCH training
    # better: cf is 4, sample in cs is 20
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5, compute_labels=True)
    # brc.fit(word_centroid_list_simhash)
    # NOTE: word_centroid_list_simhash is built by the commented-out block above;
    # re-enable that code before calling fit_predict here.
    brc.fit_predict(word_centroid_list_simhash)
Example #34
    def birch_sample(self):
        """Applies Birch and DBSCAN to the image. Not for consumer use.

        """
        self.birch_thr = self.eps_filter/10.
        brc = Birch(branching_factor=50, n_clusters=None, threshold=self.birch_thr, compute_labels=True)
        self.divide_labels = brc.fit_predict(self.xyz)
        tmp_brc = brc.subcluster_centers_
        _frac = 100.*brc.subcluster_centers_.shape[0]/np.float64((self.img_original_reshape.shape[0]*self.img_original_reshape.shape[1]))
        lab_out = np.ones(brc.subcluster_centers_.shape[0], dtype=np.int32)
        agal = DBSCAN(eps=self.eps_filter, min_samples=self.min_samples_reduce, algorithm='ball_tree', n_jobs=-1)
        lab_out = agal.fit_predict(tmp_brc).astype(np.int32)
        _frac = 100.*np.sum(lab_out > -1)/np.float64((self.img_original_reshape.shape[0]*self.img_original_reshape.shape[1]))
        self.dbs_samp_frac = _frac
        return lab_out, tmp_brc
Example #35
print(data, citypos.shape)

# KMeans
km = KMeans(n_clusters=100, n_init=1)
itime = time.perf_counter()
kmlabels = km.fit_predict(citypos)
etime = time.perf_counter()
print ('K-means Time = ', etime-itime)

# Minibatch Kmeans
itime = time.perf_counter()
mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000)
mbkmlabels = mbkm.fit_predict(citypos)
etime = time.perf_counter()
print ('MB K-means Time = ', etime-itime)

print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels))

# Birch
itime = time.perf_counter()
birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
birchlabels = birch.fit_predict(citypos)
etime = time.perf_counter()
print ('BIRCH Time = ',etime-itime)

print('Similarity Km vs BIRCH',adjusted_mutual_info_score(kmlabels, birchlabels))
Example #36
import numpy as np
from sklearn.cluster import Birch
import cluster
import csv

clusters = 20
submit_file = 'submit_birch.csv'

X, plays = cluster.get_matrix()
X = np.array(X, dtype=float)
plays = np.array(plays, dtype=float)
# print(X.shape)
print("Running Birch on training data...", end="")
brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5, compute_labels=True)
labels = brc.fit_predict(X)
print("Done!")

print(labels)
# plays_sums = [0] * clusters 
# cluster_size = [0] * clusters
plays_sums = {}

# Median
for idx, label in enumerate(labels):
  if label in plays_sums:
    plays_sums[label].append(plays[idx])
  else:
    plays_sums[label] = [plays[idx]]
  # cluster_size[label] += 1
Example #37

np.random.seed(0)

X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]
Html_file = open("clustering_files/birch.html", "w")

scaler = StandardScaler()
X = scaler.fit_transform(X)


for n_clusters in range(2, 10):

    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)

    print "components:", set(preds)
    print np.bincount(preds)

    data_thr['preds'] = pd.Series(preds).astype("category")

    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"] * 2  # Spectral9
    # color_key = color_key[:len(set(preds))+2]


    # single plot rateCA vs rate with predicted classes and ellipses:

    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None,
                                        x_name='rateCA',
Example #38
# set up clustering algorithms
db = DBSCAN(eps=0.3, min_samples=5)
ac = AgglomerativeClustering(n_clusters=2, metric='euclidean',
                             linkage='average')  # `affinity` was renamed to `metric` in recent sklearn
#km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
#sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1) 
#bandwidth = estimate_bandwidth(X, quantile=0.3)
#ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
#ap= AffinityPropagation(damping=.9, preference=-200)

#y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
y_db[y_db==-1] = 1
#print np.unique(y_db)
#y_sp = sp.fit_predict(X)
#y_ms = ms.fit_predict(X)
#y_ap = ap.fit_predict(X)

labels = {'AgglomerativeClustering':y_ac}
#labels['MiniBatchKMeans'] = y_km 
labels['DBSCAN'] = y_db
labels['Birch'] = y_bc

# make plot about the clustering results
fig, axes = plt.subplots(3,len(labels), figsize=(17,10))