Code example #1
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances


def outlier_clusters_ward(x, y, skill=None, memory=None):
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # degenerate case: no points to cluster
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        dist_within = np.array([])

    elif len(data) == 1:
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within = np.array([0])

    else:
        dist_within = 1000
        dist_max = 75
        n_clusters = 0
        n_clusters_max = 10

        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            memory=memory)

        # while dist_within > dist_max, keep adding clusters
        while dist_within > dist_max and n_clusters < n_clusters_max:
            # iterate n_clusters
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)

            # cluster
            labels = clusterer.fit_predict(data)

            # get cluster_centers
            cluster_labels = list(range(n_clusters))
            cluster_centers = np.array(
                [np.mean(data[labels == i], axis=0) for i in cluster_labels])

            # find dist_within: the maximum pairwise distance inside a cluster
            dist_within = np.max([
                np.max(pairwise_distances(data[labels == i]))
                for i in cluster_labels
            ])

    dist_within_final = np.array([
        np.max(pairwise_distances(data[labels == i])) for i in cluster_labels
    ])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
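A minimal way to exercise this function, assuming the imports above and two synthetic blobs of points; the 300-unit separation is arbitrary and chosen to exceed the hard-coded dist_max of 75:

import numpy as np

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(0, 5, 20), rng.normal(300, 5, 20)])
y = np.concatenate([rng.normal(0, 5, 20), rng.normal(300, 5, 20)])

centers, cluster_labels, labels, n_clusters, dist_within = outlier_clusters_ward(x, y)
print(n_clusters)   # expected: 2 for two well-separated blobs
print(centers)      # roughly (0, 0) and (300, 300)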
Code example #2
File: activeClustering.py  Project: eltix/seqomm
print('  ' + str(numAS) + '/' + str(nT) + ' physical DOFs selected after thresholding.')
maxK = min(maxK,numAS)
X = sp.array(E)[idx,:]
D = 1.-sp.absolute(sp.dot(X,X.T))

print(' agglomerative clustering:')
print('  - initializing clustering tree...', end='')
est = AgglomerativeClustering(linkage="complete", affinity='precomputed',
                              compute_full_tree=True,
                              memory='./outputs/'+caseName+'/cache')
print(' done.')

S = []

print('  - agglomerative clustering for K = 1 to ' + str(maxK) + '...', end='')
for nK in range(1,maxK+1):
  #print nK
  est.set_params(n_clusters=nK)
  est.fit(D)
  labels = est.labels_

  selec = []
  for k in range(nK):
    idk = sp.nonzero(labels==k)[0]
    best_k = idx[idk[sp.argmax(trace[idx[idk]])]]
    selec.append(best_k)
  S.append(selec)
print(' done.')

selDir = './outputs/'+caseName+'/DOFSelection'
os.system('mkdir -pv '+selDir)
print(' write list of clusters...', end='')
f = open(selDir+'/selList_'+str(glob_iter)+'.txt', 'w')
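The pattern above (build the merge tree once, then re-cut it at several cluster counts) can be reproduced on its own. This is a minimal sketch with made-up data, assuming an sklearn version that still accepts the affinity keyword used above:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(1)
X = rng.normal(size=(30, 3))
D = 1. - np.absolute(np.corrcoef(X))   # symmetric dissimilarity matrix standing in for D above

est = AgglomerativeClustering(linkage="complete", affinity='precomputed',
                              compute_full_tree=True, memory='./cache')
for nK in range(1, 6):
    est.set_params(n_clusters=nK)
    est.fit(D)                          # the cached tree is reused across fits
    print(nK, np.bincount(est.labels_))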
Code example #3
File: analysis.py  Project: rht/panopticon
def get_subclustering(X,
                      score_threshold,
                      max_clusters=50,
                      min_input_size=10,
                      silhouette_threshold=0.2,
                      regularization_factor=0.01,
                      clusteringcachedir='clusteringcachedir/'):
    """

    Parameters
    ----------
    embedding :
        
    score_threshold :
        
    max_clusters :
        (Default value = 10)
    X :
        
    min_input_size :
        (Default value = 5)
    silhouette_threshold :
        (Default value = 0.2)
    regularization_factor :
        (Default value = 0.01)
    clusteringcachedir :
         (Default value = 'clusteringcachedir/')

    Returns
    -------

    
    """
    import numpy as np
    from sklearn.metrics import silhouette_score
    from sklearn.cluster import AgglomerativeClustering
    from tqdm import tqdm

    if X.shape[0] < min_input_size:
        return np.array([0] * X.shape[0])
    else:
        clustering = AgglomerativeClustering(n_clusters=2,
                                             memory=clusteringcachedir,
                                             affinity='cosine',
                                             compute_full_tree=True,
                                             linkage='average')
        scores = []
        minnk = 2
        for nk in tqdm(range(minnk, np.min([max_clusters, X.shape[0]]), 1)):
            clustering.set_params(n_clusters=nk)
            clustering.fit(X)

            score = silhouette_score(X,
                                     clustering.labels_,
                                     metric='cosine',
                                     sample_size=None)
            # sample_size=np.min([5000, X.shape[0]]))
            scores.append(score)
            #break
        print("scores", np.array(scores))
        print("ignoring regularization factor")
        #        scores = scores - np.arange(len(scores))*regularization_factor
        #        print("corrected scores",np.array(scores))
        if np.max(scores) >= score_threshold:
            print("Number of clusters:", np.argmax(scores) + minnk)
            clustering.set_params(n_clusters=np.argmax(scores) + minnk)
            clustering.fit(X)
            return clustering.labels_
        else:
            return np.array([0] * X.shape[0])
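A hypothetical call, using synthetic data that forms two clearly separated directions in cosine space; this assumes an sklearn version that still accepts the affinity keyword used above, and it creates the cache directory clusteringcachedir/:

import numpy as np

rng = np.random.default_rng(0)
base1 = np.ones(8)
base2 = np.concatenate([np.ones(4), -np.ones(4)])
X = np.vstack([base1 + rng.normal(0, 0.05, (40, 8)),
               base2 + rng.normal(0, 0.05, (40, 8))])

labels = get_subclustering(X, score_threshold=0.5)
print(np.unique(labels))   # expected: two labels for this synthetic data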
Code example #4
File: lab4.py  Project: h11128/project
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples

X = np.array(data.iloc[:, 0:8])
clustering = AgglomerativeClustering(affinity='euclidean',
                                     compute_full_tree='auto',
                                     connectivity=None,
                                     linkage='ward',
                                     memory=None,
                                     n_clusters=3,
                                     pooling_func='deprecated').fit(X)

Linkage = ["ward", "complete", "average", "single"]
j = 0
for K in Linkage:
    j = j + 1
    clustering.set_params(linkage=K)
    y = clustering.fit_predict(X)
    fig = plt.figure(figsize=(12, 4), dpi=100)
    ax1 = fig.add_subplot(1, 2, 1)
    ax2 = fig.add_subplot(1, 2, 2)
    ax1.set_xlim([-0.1, 1.0])
    vertical_spacing = (j + 1) * 10  # extra spacing to separate the silhouettes
    ax1.set_ylim([0, len(X) + vertical_spacing])

    silhouette_avg = silhouette_score(X, y)
    print("For K = {}, the average silhouette score is {:.3f}".format(
        K, silhouette_avg))

    sample_silhouette_values = silhouette_samples(X, y)
    y_lower = 10
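A self-contained variant of the loop above that only compares linkages by their average silhouette score (synthetic blobs, no plotting) might look like:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(c, 0.3, (50, 8)) for c in (0, 3, 6)])

clustering = AgglomerativeClustering(n_clusters=3)
for K in ["ward", "complete", "average", "single"]:
    clustering.set_params(linkage=K)
    y = clustering.fit_predict(X)
    print(K, round(silhouette_score(X, y), 3))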
Code example #5
params = {
    'svd__n_components': np.arange(80, 125, 10),
    'n_clusters': np.arange(12, 15)
}
texf_svd = Pipeline([('vect', text.TfidfVectorizer()),
                     ('svd', TruncatedSVD())])
result = []
Aggl = AgglomerativeClustering(compute_full_tree=True,
                               affinity='cosine',
                               linkage='average')
for g in list(model_selection.ParameterGrid(params)):
    print()
    print(g)
    n_clusters = g.pop('n_clusters')
    texf_svd.set_params(**g)
    svd_result = texf_svd.fit_transform(data)
    Aggl.set_params(n_clusters=n_clusters, memory=os.getcwd() + "\\tree")
    labels_pred = Aggl.fit_predict(mark_allzeros(svd_result))
    print(labels_pred)
    count_table = score_data.count_table(text_data.init_num_by_cls,
                                         labels_pred, n_clusters)
    print(
        "homogeneity score, completeness score, v score:",
        metrics.homogeneity_completeness_v_measure(text_data.labels_true(),
                                                   labels_pred))
    print(
        "Adjusted Mutual Information:",
        metrics.adjusted_mutual_info_score(text_data.labels_true(),
                                           labels_pred))
    result.append(
        (g,
         metrics.adjusted_mutual_info_score(text_data.labels_true(),