Python Birch.fit_predict Exemples, sklearn.cluster.Birch.fit_predict Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : dim_reduction_clustering.py Projet : idealslee/driving-cycle

def birch_clustering(data):
    """Cluster ``data`` into three groups with BIRCH and plot them in 3-D.

    Parameters
    ----------
    data : array supporting boolean-mask and ``[:, k]`` indexing
        Samples to cluster. Columns 0, 1 and 2 are used as the x, y and z
        coordinates of the scatter plot.

    Returns
    -------
    tuple
        ``(x0, x1, x2)``: the rows of ``data`` labelled 0, 1 and 2.
    """
    # Configure the BIRCH model and fit it to the data.
    birch = Birch(n_clusters=3)
    label_pred = birch.fit_predict(data)
    groups = tuple(data[label_pred == k] for k in range(3))
    styles = (
        ("red", 'o', 'label0'),
        ("green", '*', 'label1'),
        ("blue", '+', 'label2'),
    )

    fig = plt.figure()
    # Axes3D(fig) no longer adds itself to the figure on current matplotlib
    # releases, which leaves the window empty. Requesting a 3-D subplot from
    # the figure works on old and new versions alike.
    ax = fig.add_subplot(111, projection='3d')
    for points, (color, marker, label) in zip(groups, styles):
        ax.scatter(points[:, 0], points[:, 1], points[:, 2],
                   c=color, marker=marker, label=label)
    plt.legend(loc=2)
    plt.show()
    return groups

Exemple #2

0

Afficher le fichier

 def do_birch(self, values, threshold):
     """Tell whether BIRCH finds more than one cluster in ``values``.

     ``values`` is normalised as a single row, turned into a column of
     one-feature samples and clustered with ``n_clusters=None`` and the
     given ``threshold``. Returns ``True`` when at least two distinct
     labels were assigned.
     """
     samples = np.array(values)
     # normalize() is fed one row holding the whole series; the result is
     # reshaped so every value becomes its own sample.
     column = preprocessing.normalize([samples]).reshape(-1, 1)
     model = Birch(branching_factor=50,
                   n_clusters=None,
                   threshold=threshold,
                   compute_labels=True)
     model.fit_predict(column)
     distinct_labels = np.unique(model.labels_)
     return distinct_labels.size > 1

Exemple #3

0

Afficher le fichier

    def get_personalization(self, service):
        """Score ``service`` and list the host KPIs most correlated with it.

        Returns a pair ``(data, metrics)``:

        * ``data`` is the summed edge weight around ``service`` multiplied
          by the largest correlation found (never below the 0.01 floor).
        * ``metrics`` holds at most two ``(host, KPI, correlation)`` tuples,
          strongest first; the runner-up is kept only when it reaches more
          than 90% of the best correlation.
        """
        weight_average = 0.0
        num = 0
        max_corr = 0.01
        metrics = []
        # Sum the weights of every incoming edge of the service.
        for _, _, data in self.anomalous_subgraph.in_edges(service, data=True):
            weight_average += data['weight']
            num += 1

        # Outgoing edges count only when they lead to another service node.
        for _, destination, data in self.anomalous_subgraph.out_edges(
                service, data=True):
            if self.anomalous_subgraph.nodes[destination]['type'] == 'service':
                num += 1
                weight_average += data['weight']
        # NOTE(review): `num` is counted but never used, so `weight_average`
        # is really a sum of weights - confirm whether a division is missing.

        # Hosts (cmdb_id) seen in traces of this service, and their KPI rows.
        hosts = self.trace_data.loc[self.trace_data.serviceName ==
                                    service].cmdb_id.unique()
        host_groups = self.host_data[self.host_data['cmdb_id'].isin(
            hosts)].groupby('cmdb_id')[['name', 'value']]

        for host, host_data_subset in host_groups:
            for KPI, values in host_data_subset.groupby('name')['value']:
                # Elapsed times on the service's anomalous edge for this host.
                anomalous_data = pd.Series(
                    list(self.trace_data.loc[
                        (self.trace_data.path == self.anomalous_edges[service])
                        & (self.trace_data.cmdb_id == host)]['elapsedTime']))
                # Rebuild the series so both sides share a fresh 0..n-1 index.
                values = pd.Series(list(values))
                correlation = 0
                # A constant series has no defined correlation; skip those.
                if len(set(anomalous_data)) > 1 and len(set(values)) > 1:
                    correlation = abs(anomalous_data.corr(values))
                    normalized_time = preprocessing.normalize(
                        [np.array(values)]).reshape(-1, 1)
                    birch = Birch(branching_factor=50,
                                  n_clusters=None,
                                  threshold=0.005,
                                  compute_labels=True)
                    birch.fit_predict(normalized_time)
                    labels = birch.labels_
                    # Keep the correlation only when BIRCH splits the KPI
                    # values into more than one cluster; otherwise zero it.
                    coefficient = int(np.unique(labels).size > 1)
                    correlation = coefficient * correlation
                if pd.isna(correlation):
                    correlation = 0
                # Only record KPIs that beat the best correlation so far.
                if correlation > max_corr:
                    metrics.append((host, KPI, correlation))
                    max_corr = correlation

        data = weight_average * max_corr
        metrics.sort(key=lambda tup: tup[2], reverse=True)
        if len(metrics) > 1:
            # Every stored correlation exceeds 0.01, so the divisor is > 0.
            if metrics[1][2] / metrics[0][2] > 0.9:
                return data, metrics[:2]
            else:
                return data, metrics[:1]
        else:
            return data, metrics

Exemple #4

0

Afficher le fichier

def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Fit the URL feature pipeline and derive a per-cluster quota.

    The URLs are parsed, vectorised with a ``DictVectorizer`` and reduced to
    the feature columns whose sum exceeds 8. BIRCH clusters the result, the
    (at most 30) best features are selected against those labels, and BIRCH
    is fitted again on the reduced matrix.

    Nothing is returned; the fitted objects and the quota array are stored
    in the module globals ``v``, ``brc``, ``select``, ``quota`` and
    ``quota_for_each_cluster``.
    """
    global quota_for_each_cluster
    global brc
    global v
    global quota
    global select
    quota = 10000

    # Parse every URL (qlinks first, then unknown links) after unquoting it.
    parsed_urls = [
        urlparse.urlparse(unquote(raw_url.strip()))
        for raw_url in QLINK_URLS + UNKNOWN_URLS
    ]

    v = DictVectorizer(sparse=False)
    data = v.fit_transform(extract_features(parsed_urls))

    # Keep only the feature columns whose total is above the lower bound.
    min_column_sum = 8
    keep_mask = []
    kept_columns = []
    for column in xrange(data.shape[1]):
        is_frequent = np.sum(data[:, column]) > min_column_sum
        keep_mask.append(1 if is_frequent else 0)
        if is_frequent:
            kept_columns.append(column)

    v = v.restrict(keep_mask)
    data = data[:, kept_columns]

    # One cluster is requested per surviving feature column.
    best_cou_clusters = data.shape[1]
    brc = Birch(branching_factor=50,
                n_clusters=best_cou_clusters,
                threshold=0.2,
                compute_labels=True)
    first_pass_labels = brc.fit_predict(data)
    select = SelectKBest(k=min(data.shape[1], 30))
    data = select.fit_transform(data, first_pass_labels)
    clust = brc.fit_predict(data)

    # The first 500 samples are treated as the qlinks: each cluster's quota
    # is twice its share of them, scaled by QUOTA.
    quota_for_each_cluster = np.zeros(best_cou_clusters)
    qlink_labels = list(clust[:500])
    for cluster_id in xrange(best_cou_clusters):
        share = qlink_labels.count(cluster_id) / 500.0
        quota_for_each_cluster[cluster_id] = share * QUOTA
    quota_for_each_cluster *= 2.0

Exemple #5

0

Afficher le fichier

Fichier : Database_Auto_Cluster.py Projet : wujunming1/superalloy-project

def birch(data):
    """Tune BIRCH with hyperopt's TPE search, then cluster ``data``.

    The search minimises ``hyper_birch`` over ``threshold`` in [0, 1) and
    ``branching_factor`` in 25..74 for 50 evaluations.

    Returns
    -------
    tuple
        ``(best, labels, score, model)``: the raw ``fmin`` result, the labels
        of ``data``, ``sil_score(data, labels)`` and the fitted model.
    """
    space = {
        'threshold': hp.uniform('threshold', 0, 1),
        'branching_factor': hp.choice('branching_factor', range(25, 75)),
    }
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_birch, space, algo=algo, max_evals=50)
    # The +25 maps the reported choice back into range(25, 75); presumably
    # fmin returns the index of the selected hp.choice option - verify.
    model = Birch(threshold=best['threshold'],
                  branching_factor=int(best['branching_factor'] + 25))
    # Fit once and reuse the labels. The previous code fitted the same model
    # three times (fit_predict twice, fit once) to get the same outputs.
    labels = model.fit_predict(data)
    return best, labels, sil_score(data, labels), model

Exemple #6

0

Afficher le fichier

Fichier : sekitei_segments.py Projet : alex0parhomenko/technosfera

def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
	"""Fit the URL feature pipeline and derive a quota for every cluster.

	Parses and vectorises all URLs, keeps the feature columns whose sum is
	above ``low_bound``, clusters with BIRCH, selects the best features
	against those labels and clusters again. Returns nothing: the results
	are stored in the globals ``v``, ``brc``, ``select``, ``quota`` and
	``quota_for_each_cluster``.
	"""
	#t = time.clock()
	global quota_for_each_cluster
	global brc
	global v
	global quota
	global select
	quota = 10000
	# Concatenation builds a new list, so the in-place rewrite below does not
	# touch the caller's lists. Qlinks come first, unknown links after.
	result_arr = QLINK_URLS + UNKNOWN_URLS
	for i, url in enumerate(result_arr):
		result_arr[i] = urlparse.urlparse(unquote(url.strip()))

	#l_dict = 
	v = DictVectorizer(sparse=False)
	data = v.fit_transform(extract_features(result_arr))
	# ind_list is a 0/1 flag per column (for v.restrict); ind_list_data holds
	# the indices of the kept columns (for slicing the matrix).
	ind_list = []
	ind_list_data = []
	low_bound = 8

	for col in xrange(data.shape[1]):
		if (np.sum(data[:, col]) > low_bound):
			ind_list.append(1)
			ind_list_data.append(col)
		else:
			ind_list.append(0)

	v = v.restrict(ind_list)
	data = data[:, ind_list_data] 
	#if (start_url[0].find("wikipedia") != -1):
	#	out_data("som_data_wiki/qlink.tfxidf", data[:500], start_url[:500])
	#	out_data("som_data_wiki/notqlink.tfxidf", data[500:], start_url[500:])
	#	out_data("som_data_wiki/data.tfxidf", data, start_url)
	#	out_template("som_data_wiki/data_features.tv", v.get_feature_names(), len(data))
	#	out_template("som_data_wiki/qlink_features.tv", v.get_feature_names(), len(data) / 2)
	#	out_template("som_data_wiki/notqlink_features.tv", v.get_feature_names(), len(data) / 2)
	#	return 0
	# One cluster is requested per surviving feature column.
	best_cou_clusters = data.shape[1]
	#k_means = KMeans(n_clusters=best_cou_clusters, init = 'random')
	#clust = k_means.fit_predict(data)
	brc = Birch(branching_factor=50, n_clusters=best_cou_clusters, threshold=0.2, compute_labels=True)
	clust = brc.fit_predict(data)
	# Select at most 30 features against the first-pass labels, then refit
	# the same BIRCH object on the reduced matrix.
	select = SelectKBest(k=min(data.shape[1], 30))
	data = select.fit_transform(data, clust)
	clust = brc.fit_predict(data)
	#print data.shape

	quota_for_each_cluster = np.zeros(best_cou_clusters)
	# NOTE(review): the slice and the divisor assume exactly 500 qlinks -
	# confirm against the caller or use len(QLINK_URLS).
	clust_qlink = list(clust[:500])
	for i in xrange(best_cou_clusters):
		quota_for_each_cluster[i] = clust_qlink.count(i) / 500.0 * QUOTA 
	quota_for_each_cluster *= 2.0

Exemple #7

0

Afficher le fichier

def skLearnBirch(data):
    """Fit BIRCH on ``data`` with the setting chosen by the silhouette search.

    The value returned by ``getOptimalClustersSilhoutte`` for the Birch
    algorithm is used as the ``threshold``. Returns a pair
    ``(number_of_subclusters, fitted_model)``.
    """
    chosen_threshold = getOptimalClustersSilhoutte(
        data, ClusteringAlgorithm.skLearnBirch)
    model = Birch(branching_factor=50,
                  n_clusters=None,
                  threshold=chosen_threshold,
                  compute_labels=True)
    # Only the fitted state is needed; the labels themselves are discarded.
    model.fit_predict(data)
    subcluster_count = len(model.subcluster_centers_)
    return (subcluster_count, model)

Exemple #8

0

Afficher le fichier

Fichier : Cluster.py Projet : AI3luckydog/201834882xieshengjun

def birch(tfidf_matrix):
    """Cluster ``tfidf_matrix`` with BIRCH and print its NMI score.

    The predicted labels are compared with ``cluster``, a name defined
    outside this function (presumably the reference labels - verify), using
    normalized mutual information. Nothing is returned.
    """
    b_cluster = Birch(n_clusters=90, threshold=0.7)

    result = b_cluster.fit_predict(tfidf_matrix)
    # 'warn' was only the transitional default of scikit-learn 0.20/0.21 and
    # is rejected by later releases. It computed the geometric normaliser,
    # so naming it explicitly keeps the printed score unchanged.
    rbirch = sklearn.metrics.normalized_mutual_info_score(
        cluster, result, average_method='geometric')
    print(rbirch)

Exemple #9

0

Afficher le fichier

Fichier : Database_Auto_Cluster.py Projet : wujunming1/superalloy-project

def hyper_birch(args):
    """Objective for the hyper-parameter search over BIRCH settings.

    Fits BIRCH with ``args['threshold']`` and ``args['branching_factor']``
    on the global ``data_file.data`` and returns the negated ``sil_score``
    of the resulting labels, so that minimising it maximises the score.
    """
    global data_file
    model = Birch(threshold=args['threshold'],
                  branching_factor=int(args['branching_factor']))
    labels = model.fit_predict(data_file.data)
    return -sil_score(data_file.data, labels)

Exemple #10

0

Afficher le fichier

def compute_optimal_birch_clustering(node_path_counts: np.array,
                                     pca_target_dimension: int,
                                     number_of_walks: int,
                                     significance_level: float):
    """
    Cluster nodes by their path counts, using as few BIRCH clusters as possible.

    Each row of ``node_path_counts`` is centred on and divided by its own
    mean, the result is transposed and reduced with
    ``compute_principal_components``. The number of clusters is then raised
    from 2 upwards; the first labelling accepted by
    ``test_quality_of_clusters`` is returned. If none is accepted, the
    labels of the last attempt are returned (``None`` when no attempt could
    be made).
    """
    # Column vector of per-row means, reused for centring and scaling.
    row_means = np.mean(node_path_counts, axis=1)[:, None]
    standardized_path_counts = ((node_path_counts - row_means) / row_means).T

    feature_vectors = compute_principal_components(
        feature_vectors=standardized_path_counts,
        target_dimension=pca_target_dimension)

    cluster_labels = None
    # Zero or one cluster is meaningless, so the search starts at two.
    for number_of_clusters in range(2, feature_vectors.shape[0]):
        model = Birch(n_clusters=number_of_clusters, threshold=0.05)
        cluster_labels = model.fit_predict(feature_vectors)
        counts_per_cluster = get_node_path_counts_of_clusters(
            node_path_counts, cluster_labels)
        if test_quality_of_clusters(counts_per_cluster, number_of_walks,
                                    significance_level):
            break

    return cluster_labels

Exemple #11

0

Afficher le fichier

def birch(data):
    """Split ``data`` into two BIRCH clusters and return the labels.

    The number of samples per label is printed as a ``Counter``.
    """
    model = Birch(n_clusters=2, threshold=0.5)
    # Fit the model and label every sample in a single step.
    cluster_ids = model.fit_predict(data)
    print(Counter(cluster_ids))
    return cluster_ids

Exemple #12

0

Afficher le fichier

Fichier : cluster.py Projet : obitoqiu/Evaluation-in-Literary

def birch(X_input, k):
    """Cluster ``X_input`` into ``k`` BIRCH clusters; print and return the labels."""
    from sklearn.cluster import Birch
    print('start birch cluster:')
    labels = Birch(n_clusters=k, threshold=1).fit_predict(X_input)
    print(labels)
    return labels

Exemple #13

0

Afficher le fichier

def birch_classer():
    """Cluster the extracted features with BIRCH and persist the outcome.

    For each cluster count in ``range(100, 101)`` the model is fitted on
    ``get_feature()``, dumped with joblib under ``config.MODEL_PATH``, and
    the labels plus the Calinski-Harabasz score are written to a text file
    and logged. Returns nothing.
    """
    data = get_feature()
    libs.logger.log(data)
    libs.logger.log('birch cluster begining......')
    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score
    # and later removed the old spelling; use whichever this install offers.
    score_fn = getattr(metrics, 'calinski_harabasz_score', None)
    if score_fn is None:
        score_fn = metrics.calinski_harabaz_score
    for clust in range(100, 101):
        libs.logger.log('clust [' + str(clust) + '] is begining.....')
        birch_cluster = Birch(n_clusters=clust)
        result = birch_cluster.fit_predict(data)

        # Store the fitted model.
        joblib.dump(
            birch_cluster,
            config.MODEL_PATH + 'web_fingerprint_birch_cluster_fit_result.pkl')

        calinski_harabasz_ccore = score_fn(data, result)

        # The context manager closes the file even if a write fails.
        with open('web_fingerprint_birch_10w_' + str(clust) + '.txt',
                  'w') as f:
            f.writelines(str(label) + '\n' for label in result)
            f.write('calinski_harabasz_ccore:' + str(calinski_harabasz_ccore))
        libs.logger.log('clust [' + str(clust) + '] finish')
        libs.logger.log('calinski_harabasz_ccore: ' +
                        str(calinski_harabasz_ccore))
        libs.logger.log(result)

Exemple #14

0

Afficher le fichier

def visual(c, X, y):
    """Cluster ``X`` with default BIRCH, evaluate it and plot both labelings.

    Prints the predicted cluster labels, calls ``evaluation_labels`` and
    ``evaluation``, then shows two scatter plots of the first two columns of
    ``X``: one coloured by the given labels ``y`` ('Dataset') and one by the
    predicted labels ('Clusters'). ``c`` is accepted but not used.
    """
    from sklearn.cluster import Birch
    cluster_object = Birch()
    y_pred = cluster_object.fit_predict(X)
    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)
    for labels, title in ((y, 'Dataset'), (y_pred, 'Clusters')):
        for cluster in np.unique(labels):
            row_idx = np.where(labels == cluster)
            # Label each group: without labelled artists plt.legend() has
            # nothing to show and only emits a warning.
            plt.scatter(X[row_idx, 0], X[row_idx, 1], label=str(cluster))
        plt.title(title)
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.legend()
        plt.show()

Exemple #15

0

Afficher le fichier

Fichier : birch.py Projet : Yutaro-Sanada/scikit-learn

def birch(X, y, n):
    """
    Cluster with BIRCH and report accuracy and run time.

    Parameters
    ----------
    X : numpy array
        Data.
    y : numpy array
        Ground-truth labels.
    n : int
        Number of clusters.

    Returns
    -------
    acc_br : float
        Accuracy.
    time_br : float
        Execution time in seconds, rounded to two decimals.
    """
    # Honour the documented cluster count; it used to be hard-coded to 2
    # while ``n`` was only forwarded to the plot.
    br = Birch(n_clusters=n)
    start_br = time.time()
    y_br = br.fit_predict(X)
    end_br = time.time()
    # Reshape the labels into a single row of len(y[0]) entries before
    # handing them to acc() together with y.
    y_br = np.reshape(y_br, (1, len(y[0])))
    acc_br, _, _ = acc(y_br, y)
    time_br = round(end_br - start_br, 2)

    make_graph(X, y_br, n, "Birch")

    return acc_br, time_br

Exemple #16

0

Afficher le fichier

    def choose_stocks_index(self):
        """Return the indices of the stocks most often picked as cluster representatives.

        For every slice ``self.__X[i]`` the stocks are clustered with BIRCH
        and, per label, the stock closest to a reference centre is chosen.
        The choices are tallied over all slices and the ``n_clusters`` most
        frequently chosen stock indices are returned, most frequent first.
        """
        # Maps stock index -> number of slices in which it was chosen.
        stock_choosen_num = {}
        for i in range(self.__X.shape[0]):
            birch = Birch(threshold=0.001, n_clusters=self.__n_clusters)
            y_pred = birch.fit_predict(self.__X[i, :, :])
            subcluster_centers = birch.subcluster_centers_

            # Per label: the best stock so far and its distance; -1 marks
            # "no stock seen yet" for that label.
            choosen_stock = np.array([0 for _ in range(self.__n_clusters)])
            min_distance = np.array([-1.0 for _ in range(self.__n_clusters)])
            for ind in range(self.__stock_num):
                stock = self.__X[i, ind, :]
                stock_label = y_pred[ind]
                # NOTE(review): subcluster_centers_ is indexed here by the
                # predicted label; with an integer n_clusters the labels may
                # not line up with subcluster indices - confirm against the
                # scikit-learn Birch documentation.
                distance = np.linalg.norm(stock -
                                          subcluster_centers[stock_label],
                                          ord=2)
                if min_distance[stock_label] == -1 or min_distance[
                        stock_label] > distance:
                    min_distance[stock_label] = distance
                    choosen_stock[stock_label] = ind
            # Labels that received no stock keep the initial index 0 and
            # therefore also add a vote for stock 0.
            for stock in choosen_stock:
                if stock in stock_choosen_num.keys():
                    stock_choosen_num[stock] += 1
                else:
                    stock_choosen_num[stock] = 1
        # Rank stocks by how often they were chosen, highest count first.
        stock_choosen_num = list(
            sorted(stock_choosen_num.items(), key=lambda x: x[1],
                   reverse=True))
        choosen_stock_ind = list(map(lambda x: x[0],
                                     stock_choosen_num))[:self.__n_clusters]
        return choosen_stock_ind

Exemple #17

0

Afficher le fichier

def birch(X, k):  # X: points to cluster, k: number of clusters
    """Return the BIRCH cluster label of every row of ``X`` for ``k`` clusters."""
    from sklearn.cluster import Birch

    return Birch(n_clusters=k).fit_predict(X)

Exemple #18

0

Afficher le fichier

Fichier : sekitei_segments.py Projet : KopBob/technosphere

def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Fit the per-site clustering pipeline and register it in ``algos``.

    URLs are parsed and turned into feature-count dicts, vectorised, reduced
    to the features occurring more than 5 times, clustered with BIRCH,
    reduced again with ``SelectKBest`` and re-clustered. Every cluster gets
    a quota of twice its share of the qlinks times ``QUOTA`` (truncated to
    int). The fitted objects are stored in the global ``algos`` under the
    network location of the first qlink. Returns nothing.
    """
    # Build real lists: the results are indexed below (qlinks[0]), which the
    # lazy map/zip objects of Python 3 do not support.
    qlinks = [parse_url(url) for url in QLINK_URLS]
    ulinks = [parse_url(url) for url in UNKNOWN_URLS]

    def feature_counts(link):
        # Count the first component of every item extract_features yields.
        names = list(zip(*extract_features([link], 0)))[0]
        return dict(Counter(names))

    qlinks_f = [feature_counts(link) for link in qlinks]
    ulinks_f = [feature_counts(link) for link in ulinks]

    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)

    # Boolean mask of the features seen more than 5 times overall.
    best_features = np.sum(x_, axis=0) > 5
    m_features = np.sum(best_features)

    v = v.restrict(best_features)
    x_ = x_[:, best_features]

    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR, n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD, compute_labels=True)
    y_ = clustering.fit_predict(x_)

    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)

    y = clustering.fit_predict(x)
    # Column 0: cluster label, column 1: 1 for a qlink and 0 for an unknown
    # link (qlinks were vectorised first).
    q_or_u = np.repeat([1, 0], [len(QLINK_URLS), len(UNKNOWN_URLS)])
    q_ = np.vstack((y, q_or_u)).T

    cluster_ids = np.unique(y)
    qlinks_per_cluster = np.array(
        [np.sum(q_[q_[:, 0] == c, 1]) for c in cluster_ids])
    scaled = (qlinks_per_cluster / float(len(QLINK_URLS))) * QUOTA * 2
    quota = {c: int(q) for c, q in zip(cluster_ids, scaled)}

    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }

Exemple #19

0

Afficher le fichier

Fichier : clustering.py Projet : agorbanev/python_repository

def clusteringBirch(X, nclusters, paramlist):
    """Label the rows of ``X`` with BIRCH using ``nclusters`` clusters.

    ``paramlist`` is accepted but not used.
    """
    settings = dict(threshold=0.5,
                    branching_factor=50,
                    n_clusters=nclusters,
                    compute_labels=True,
                    copy=True)
    return Birch(**settings).fit_predict(X)

Exemple #20

0

Afficher le fichier

Fichier : birch.py Projet : h0lmes221B/Data-Mining-Basic-Codes

def main():
    """Load ``dataset.txt``, cluster it into 7 BIRCH clusters and plot the result."""
    points = convert_to_int(load_input('dataset.txt'))
    model = Birch(branching_factor=50,
                  n_clusters=7,
                  threshold=0.5,
                  compute_labels=True)
    labels = model.fit_predict(points)
    plot_points(labels, points)

Exemple #21

0

Afficher le fichier

def birch(data,threshold,branching_factor):
    """Fit BIRCH on ``data`` and score the clustering.

    Returns ``(model, labels, score)`` where ``score`` is
    ``sil_score(data, labels)``; the score is also printed.
    """
    db = Birch(threshold=threshold, branching_factor=branching_factor)
    # fit_predict fits the model itself, so the separate fit() that used to
    # precede it only repeated the same work.
    pred = db.fit_predict(data)
    score = sil_score(data,pred)
    print(score)
    return db,pred,score

Exemple #22

0

Afficher le fichier

Fichier : GA_new.py Projet : wujunming1/superalloy-project

def hyper_birch(args):
    """Objective for the hyper-parameter search over BIRCH settings.

    Labels come from fitting BIRCH on the global ``basic_data``; the score
    is ``sil_score`` of those labels evaluated against the global
    ``all_data``. The negated score is returned so a minimiser maximises it.
    """
    global basic_data
    global all_data

    model = Birch(threshold=args['threshold'],
                  branching_factor=int(args['branching_factor']))
    labels = model.fit_predict(basic_data)
    return -sil_score(all_data, labels)

Exemple #23

0

Afficher le fichier

def birch(test_arr, testDt_List, T, B):
    """Cluster ``test_arr`` with BIRCH and pair every sample with its label.

    Parameters
    ----------
    test_arr : array-like
        Samples to cluster.
    testDt_List : sequence
        One entry per sample, in the same order as ``test_arr``.
    T, B
        BIRCH ``threshold`` and ``branching_factor``.

    Returns
    -------
    list of tuple
        ``(testDt_List[i], label_i)`` for every sample.
    """
    cluster = Birch(n_clusters=None, threshold=T,
                    branching_factor=B)  # threshold may need tuning
    y = cluster.fit_predict(test_arr)
    print(y)
    # Cluster each sample belongs to. The earlier loop ran over range(1, n)
    # while indexing with i - 1, which silently dropped the last sample.
    return [(testDt_List[i], sample_label)
            for i, sample_label in enumerate(cluster.labels_)]

Exemple #24

0

Afficher le fichier

def getOptimalClustersSilhoutte(data,
                                algorithm=ClusteringAlgorithm.skLearnKMeans):
    """Search for the clustering setting with the highest silhouette score.

    What is tried and returned depends on ``algorithm``:

    * ``customKMeans``: k = 2..19 with the project's ``K_Means``; returns
      ``(best_k, rotation_used_for_best_k)``.
    * ``skLearnKMeans``: k = 2..19 with ``KMeans``; returns ``best_k``.
    * ``skLearnBirch``: thresholds 0.02..0.99 with ``n_clusters=None``;
      returns the threshold recorded for the best-scoring cluster count.
    """
    silhoutteScores = {}
    rotationStored = {}
    thresholdValues = {}
    if algorithm == ClusteringAlgorithm.customKMeans:
        for clusterKmeansNumber in range(2, 20):
            try:
                clf = kMeans.K_Means(clusterKmeansNumber,
                                     tolerance=0.00001,
                                     max_iterations=800)
                rotation = randamozieSeed(data, clusterKmeansNumber)
                clf.fit(data, spherical=True, rotationArray=rotation)
                labels = clf.getLabels(data)
                silhouette_avg = silhouette_score(data, labels)
                silhoutteScores[clusterKmeansNumber] = silhouette_avg
                rotationStored[clusterKmeansNumber] = rotation
            except Exception:
                # Best effort: skip cluster counts the custom implementation
                # cannot handle, but let KeyboardInterrupt/SystemExit through.
                continue
    elif algorithm == ClusteringAlgorithm.skLearnKMeans:
        for clusterKmeansNumber in range(2, 20):
            clf = KMeans(n_clusters=clusterKmeansNumber)
            labels = clf.fit_predict(data)
            silhouette_avg = silhouette_score(data, labels)
            silhoutteScores[clusterKmeansNumber] = silhouette_avg

    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        for i in range(2, 100):
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=0.01 * i,
                        compute_labels=True)

            labels = brc.fit_predict(data)
            print(len(labels))
            try:
                silhouette_avg = silhouette_score(data, labels)
            except ValueError:
                # The score is undefined for this labelling (for example a
                # single cluster); try the next threshold.
                continue
            # Scores are keyed by cluster count, so a later threshold giving
            # the same count overwrites the earlier entry.
            clusterNumber = len(set(labels))
            silhoutteScores[clusterNumber] = silhouette_avg
            thresholdValues[clusterNumber] = i * 0.01

    sortedSil = sorted(silhoutteScores.items(), key=itemgetter(1))
    selectedClusterNumber = sortedSil[-1][0]
    print("selected number of clusters=", selectedClusterNumber)
    if algorithm == ClusteringAlgorithm.customKMeans:
        return (selectedClusterNumber, rotationStored[selectedClusterNumber])
    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        # Thresholds are only recorded in the Birch branch. The two returns
        # below used to be swapped, so Birch callers got a cluster count and
        # the KMeans path looked up an always-empty dict.
        return thresholdValues[selectedClusterNumber]
    else:
        return selectedClusterNumber

Exemple #25

0

Afficher le fichier

 def find_anomalous_edges(self):
     """Record every edge whose elapsed times BIRCH splits into several clusters.

     For each edge in ``self.edges`` the ``elapsedTime`` values of the
     matching trace rows are normalised and clustered. When more than one
     label appears, the edge is stored in ``self.anomalous_edges`` under
     the part of its name after the first '-'.
     """
     for edge in self.edges:
         rows_on_edge = self.trace_data[self.trace_data.path == edge]
         elapsed_time = np.array(list(rows_on_edge['elapsedTime']))
         # One row in, one column of single-feature samples out.
         normalized_time = preprocessing.normalize(
             [elapsed_time]).reshape(-1, 1)
         # A larger threshold is used when minute averages are enabled.
         threshold = (0.05 if self.take_minute_averages_of_trace_data
                      else 0.001)
         model = Birch(branching_factor=50,
                       n_clusters=None,
                       threshold=threshold,
                       compute_labels=True)
         model.fit_predict(normalized_time)
         if np.unique(model.labels_).size > 1:
             self.anomalous_edges[edge.split('-')[1]] = edge

Exemple #26

0

Afficher le fichier

def julei(word, weight):
    """Cluster ``weight`` into three BIRCH clusters and dump word/label pairs.

    The same ``word   label`` listing is written to each of the first 14
    paths in ``file3`` (a name defined outside this function). The labels
    are printed and returned.
    """
    clusterer = Birch(n_clusters=3)
    y = clusterer.fit_predict(weight)
    print(y)
    print(y.shape)

    # The listing is identical for every file, so build it once.
    lines = [word[j] + "   " + str(label) + "\n"
             for j, label in enumerate(y)]
    for i in range(14):
        # Previously only the last of the 14 handles was closed; the context
        # manager closes each file as soon as it has been written.
        with open(file3[i], 'w+') as f2:
            f2.writelines(lines)
    return y

Exemple #27

0

Afficher le fichier

Fichier : clustering_python.py Projet : NoamGit/Data-Hack

def build_model(df, cluster_type="kmeans", seed=1):
    """Cluster the rows of ``df`` with the requested algorithm.

    Parameters
    ----------
    df : DataFrame-like
        Rows to cluster; its ``index`` identifies the documents.
    cluster_type : str
        ``"birch"``, ``"minibatch"``, ``"em"`` or ``"lda"``; any other value
        selects k-means, which additionally stores each row's distance to
        its cluster centre in ``df["distance_from_cluster"]``.
    seed : int
        Random state for the algorithms that accept one.

    Returns
    -------
    ``zip(df.index, labels)`` with the labels converted to strings.
    """
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        # Hard clustering for now: take the most probable topic per row.
        res = [topic_weights.argmax() for topic_weights in lda_res]
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        # The centres come from the fitted model; the code used to read an
        # undefined name (`clusters_centers`) here.
        centers = model.cluster_centers_
        all_dist = [calc_distance(row, centers[label])
                    for row, label in zip(df_array, model.labels_)]
        df["distance_from_cluster"] = all_dist

    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index,res)
    return docs_clusteres

Exemple #28

0

Afficher le fichier

    def make_birch_clustering(self, short_filenames, input_texts):
        """Cluster the input texts with BIRCH and report the grouping.

        Optionally writes the TF-IDF tables first, then vectorises the texts
        with word counts, reduces them to two normalised SVD components and
        clusters them. The per-cluster document listing is emitted through
        ``self.signals.PrintInfo``, saved to ``birch/clusters.txt`` below
        the output directory, and the clusters are plotted.
        """
        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Optional TF-IDF exports, each announced before and reported after.
        if self.need_tf_idf:
            self.signals.PrintInfo.emit("Расчет TF-IDF...")
            tf_idf_path = output_dir + 'tf_idf.csv'
            report = self.calculate_and_write_tf_idf(
                tf_idf_path, input_texts, self.tf_idf_norm,
                self.tf_idf_is_smooth_idf, self.tf_idf_sublinear_tf)
            self.signals.PrintInfo.emit(report)

        if self.need_tf_idf_formula:
            self.signals.PrintInfo.emit(
                "Расчет TF-IDF по формуле на изображении...")
            formula_path = output_dir + 'tf_idf_formula.csv'
            report = self.calculate_and_write_tf_idf_formula(
                formula_path, input_texts)
            self.signals.PrintInfo.emit(report)

        # Word counts -> 2 SVD components -> normalised vectors.
        counts = CountVectorizer().fit_transform(input_texts)
        lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
        X = lsa.fit_transform(counts)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)
        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nПрогноз по документам:\n')

        # One section per label from 0 up to the largest predicted label.
        report_parts = []
        for claster_index in range(max(predict_result) + 1):
            report_parts.append('Кластер ' + str(claster_index) + ':\n')
            report_parts.extend(
                '  ' + str(document) + '\n'
                for predict, document in zip(predict_result, short_filenames)
                if predict == claster_index)
            report_parts.append('\n')
        clasters_output = ''.join(report_parts)

        clusters_path = output_dir + 'clusters.txt'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Сохранено в:' + str(clusters_path))
        writeStringToFile(clasters_output, clusters_path)

        self.draw_clusters_plot(X, predict_result, short_filenames)

Exemple #29

0

Afficher le fichier

Fichier : MachineLearning.py Projet : tony-coder/2020-FC-Unsupervised-Classification-System-for-FintechEnterprises

def BIR(data_matrix, C=None, model_path=None):
    '''
    Hierarchical clustering with BIRCH.
    :param data_matrix: input matrix
    :param C: number of clusters
    :param model_path: when given, the fitted model is dumped there with joblib
    :return: cluster label of every row of data_matrix
    '''
    BIR_model = Birch(n_clusters=C)
    labels = BIR_model.fit_predict(data_matrix)
    if model_path is None:
        return labels
    joblib.dump(value=BIR_model, filename=model_path)
    return labels

Exemple #30

0

Afficher le fichier

def Bir(data, Data_for_Cluster, k, threshold, branching_factor):
    """Cluster ``Data_for_Cluster`` with BIRCH and store the labels in ``data``.

    The chosen parameters are printed, then for every index value ``i`` of
    ``data`` the entry ``pred[i]`` is written to ``data.loc[i, 'clustering']``.
    ``data`` is modified in place and returned.
    """
    # Parameter choices tried for BIRCH:
    #   k: 4-15 or None; threshold: 0.5, 0.3, 0.1; branching_factor: 50, 20, 10
    print('Bir', '聚类数：', k, 'threshold：', threshold, 'branching_factor:',
          branching_factor)
    model = Birch(n_clusters=k,
                  threshold=threshold,
                  branching_factor=branching_factor)
    labels = model.fit_predict(Data_for_Cluster)
    for row_label in data.index:
        data.loc[row_label, 'clustering'] = labels[row_label]
    return data

Exemple #31

0

Afficher le fichier

def cluster_birch(n_clusters):
    """
    BIRCH clustering of the PCA-reduced feature vectors.

    Loads ``../data/feature_vector_pca.csv``, clusters it, prints the
    Calinski-Harabasz score and the label of every sample, and writes the
    labelled data to ``data/data_labeld_birch.csv``.

    :param n_clusters: number of clusters
    :return: None
    """
    data = get_data("../data/feature_vector_pca.csv")
    birch = Birch(n_clusters=n_clusters, threshold=0.4, branching_factor=50)
    clusters = birch.fit_predict(data)
    # scikit-learn renamed calinski_harabaz_score to calinski_harabasz_score
    # and later removed the old spelling; use whichever this install offers.
    score_fn = getattr(metrics, "calinski_harabasz_score", None)
    if score_fn is None:
        score_fn = metrics.calinski_harabaz_score
    print("Calinski-Harabasz Score", score_fn(data, clusters))
    print("每个样本点所属类别索引", clusters)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")

Exemple #32

0

Afficher le fichier

Fichier : birch_cluster.py Projet : debasishdebs/parameterTesting

def birch_cluster(init_ds,ts_flag = False):
    '''
    Append a BIRCH cluster label to every row of a 2D list.

    Parameters: init_ds - 2D list of data
                ts_flag - True when the first column of init_ds is a
                          datetime object; that column is then dropped
                          before clustering
    Returns: 2D list whose rows carry one extra trailing element, the
             cluster the row falls into
    '''
    rows = [record[1:] for record in init_ds] if ts_flag else init_ds
    labels = Birch().fit_predict(rows)
    return [row + [label] for row, label in zip(rows, labels)]

Exemple #33

0

Afficher le fichier

Fichier : cluster.py Projet : mylamour/w2vcluster

def birch(filename,output,ktype):
    """
        Use BIRCH cluster training.

        NOTE(review): as written, the function reads
        ``word_centroid_list_simhash`` without defining it; the only
        definition in this block is commented out below. Unless that name
        exists at module level, the final call raises NameError - confirm
        and restore the preparation code. ``filename``, ``output`` and
        ``ktype`` are currently unused by the active code.
    """
    pass
    # model, word_vectors = w2v(filename)

    # n_words = word_vectors.shape[0]
    # vec_size = word_vectors.shape[1]

    # #  K means training
    # kmeans = KMeans(n_clusters= ktype, n_jobs=-1, random_state=0)
    # idx = kmeans.fit_predict(word_vectors)

    # # Use Simhash
    # word_centroid_list = list(zip(model.wv.index2word, idx))
    # word_centroid_list_simhash = [(Simhash(get_features(item[0])).value,item[1]) for item in word_centroid_list]

    # Use BIRCH training
    # better: cf is 4, sample in cs is 20
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5,compute_labels=True)
    # brc.fit(word_centroid_list_simhash)
    # The predicted labels are not stored or returned.
    brc.fit_predict(word_centroid_list_simhash)

Exemple #34

0

Afficher le fichier

    def birch_sample(self):
        """Applies Birch and DBSCAN to the image. Not for consumer use.

        BIRCH (threshold ``eps_filter / 10``) is fitted on ``self.xyz``; its
        labels are stored in ``self.divide_labels``. DBSCAN is then run on
        the BIRCH subcluster centres, and ``self.dbs_samp_frac`` is set to
        the percentage of non-noise centres relative to the number of image
        pixels.

        Returns
        -------
        tuple
            ``(lab_out, tmp_brc)``: the int32 DBSCAN labels of the
            subcluster centres and the centres themselves.
        """
        self.birch_thr = self.eps_filter/10.
        brc = Birch(branching_factor=50, n_clusters=None, threshold=self.birch_thr, compute_labels=True)
        self.divide_labels = brc.fit_predict(self.xyz)
        tmp_brc = brc.subcluster_centers_
        # The placeholder label array and the first percentage that used to
        # be computed here were overwritten before any use; both are dropped.
        agal = DBSCAN(eps=self.eps_filter, min_samples=self.min_samples_reduce, algorithm='ball_tree', n_jobs=-1)
        lab_out = agal.fit_predict(tmp_brc).astype(np.int32)
        n_pixels = np.float64((self.img_original_reshape.shape[0]*self.img_original_reshape.shape[1]))
        # DBSCAN marks noise with -1, so count only labelled centres.
        self.dbs_samp_frac = 100.*np.sum(lab_out > -1)/n_pixels
        return lab_out, tmp_brc

Exemple #35

0

Afficher le fichier

Fichier : CityClustering.py Projet : bejar/AMLTNotebooks

# Compare three clusterers on the same city positions: time each fit and
# measure how closely the faster ones agree with plain K-means.
print(data, citypos.shape)

# KMeans
km = KMeans(n_clusters=100, n_init=1)
itime = time.perf_counter()
kmlabels = km.fit_predict(citypos)
etime = time.perf_counter()
print ('K-means Time = ', etime-itime)

# Minibatch Kmeans
# Unlike the K-means block above, the timer here (and for BIRCH below) is
# started before the estimator is constructed.
itime = time.perf_counter()
mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000)
mbkmlabels = mbkm.fit_predict(citypos)
etime = time.perf_counter()
print ('MB K-means Time = ', etime-itime)

# Agreement between the two labelings (adjusted mutual information).
print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels))

# Birch
itime = time.perf_counter()
birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
birchlabels = birch.fit_predict(citypos)
etime = time.perf_counter()
print ('BIRCH Time = ',etime-itime)

print('Similarity Km vs BIRCH',adjusted_mutual_info_score(kmlabels, birchlabels))

Exemple #36

0

Afficher le fichier

Fichier : birch.py Projet : davidbecerra/practical-3

import numpy as np
from sklearn.cluster import Birch
import cluster
import csv

# Number of BIRCH clusters and the name of the submission file.
clusters = 20
submit_file = 'submit_birch.csv'

# Feature matrix and play counts come from the project's `cluster` module.
X, plays = cluster.get_matrix()
# NOTE(review): this default model is replaced a few lines below before it
# is ever fitted - it looks redundant.
brc = Birch()
X = np.array(X, dtype=float)
plays = np.array(plays, dtype=float)
# print X.shape
print "Running Birch on training data...",
brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5, compute_labels=True)
labels = brc.fit_predict(X)
print "Done!"

print labels
# plays_sums = [0] * clusters 
# cluster_size = [0] * clusters
# Maps cluster label -> list of the play counts of the rows in that cluster.
plays_sums = {}

# Median
# Group the play counts by cluster label.
for idx, label in enumerate(labels):
  if label in plays_sums:
    plays_sums[label].append(plays[idx])
  else:
    plays_sums[label] = [plays[idx]]
  # cluster_size[label] += 1

Exemple #37

0

Afficher le fichier

Fichier : birch.py Projet : ngoix/cyg-x1


# Fix NumPy's global random seed.
np.random.seed(0)

# Feature matrix: one column per listed attribute of data_thr.
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]
# NOTE(review): this handle is opened for writing here; no matching close()
# is visible in this part of the script - confirm it is closed later.
Html_file = open("clustering_files/birch.html", "w")

# Standardise the features before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)


for n_clusters in range(2, 10):

    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)

    print "components:", set(preds)
    print np.bincount(preds)

    data_thr['preds'] = pd.Series(preds).astype("category")

    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"] * 2  # Spectral9
    # color_key = color_key[:len(set(preds))+2]


    # single plot rateCA vs rate with predicted classes and ellipses:

    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None,
                                        x_name='rateCA',

Exemple #38

0

Afficher le fichier

Fichier : main.py Projet : nosarthur/providers

# set up clustering algorithms
# Only DBSCAN, agglomerative clustering and BIRCH are active; the other
# candidates are kept commented out.
db = DBSCAN(eps=0.3, min_samples=5)
ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean',
                             linkage='average')
#km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
#sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1) 
#bandwidth = estimate_bandwidth(X, quantile=0.3)
#ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
#ap= AffinityPropagation(damping=.9, preference=-200)

#y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
# The return value of swap_label is ignored, so it presumably rewrites the
# label array in place - verify in utils.
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
# Relabel DBSCAN's noise points (-1) as class 1.
y_db[y_db==-1] = 1
#print np.unique(y_db)
#y_sp = sp.fit_predict(X)
#y_ms = ms.fit_predict(X)
#y_ap = ap.fit_predict(X)

# Collect the label arrays by algorithm name.
labels = {'AgglomerativeClustering':y_ac}
#labels['MiniBatchKMeans'] = y_km 
labels['DBSCAN'] = y_db
labels['Birch'] = y_bc

# make plot about the clustering results
# Grid of 3 rows with one column per algorithm.
fig, axes = plt.subplots(3,len(labels), figsize=(17,10))