Example 1
# In[ ]:

mat = np.asarray(pairwise_similarity.A)  # dense ndarray; avoids the deprecated np.matrix

# In[ ]:

temp_dict = {}

# In[ ]:

start = time.time()
for i in range(10, 20, 1):
    if i == 13:
        continue
    nombre = "C_" + str(i)
    Abstracts_Category = SpectralClustering(
        i, affinity='precomputed').fit_predict(mat)
    temp_dict[nombre] = Abstracts_Category
    end = time.time()
    #print((end-start)/60)

# In[ ]:

df_category = pd.DataFrame(temp_dict)

# In[ ]:

#df_category

# In[ ]:

df_temp = pd.concat([Abstracts, df_category], axis=1)
y = y.reshape(-1, 16)

print(np.shape(y))

kmeans = KMeans(n_clusters=2, random_state=0).fit(y)
#print(used_data)

print("Calving count: " + str(calved_count))
print("Random count: " + str(random_count))

labels = kmeans.labels_

print("KMeans")
tabulate2Clusters(labels, calved_count)

spectral = SpectralClustering(n_clusters=2, assign_labels="kmeans").fit(y)
print("\n\nSpectral Clustering")
tabulate2Clusters(spectral.labels_, calved_count)


agg = AgglomerativeClustering(n_clusters=2, linkage="average").fit(y)
print("\n\nAgglomerative Clustering AVG")
tabulate2Clusters(agg.labels_, calved_count)

agg = AgglomerativeClustering(n_clusters=2, linkage="complete").fit(y)
print("\n\nAgglomerative Clustering COMPLETE")
tabulate2Clusters(agg.labels_, calved_count)

agg = AgglomerativeClustering(n_clusters=2, linkage="ward").fit(y)
print("\n\nAgglomerative Clustering WARD")
tabulate2Clusters(agg.labels_, calved_count)
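`tabulate2Clusters` is not defined in this snippet. A minimal sketch of what it presumably does, assuming the first `calved_count` rows of `y` are the calving samples and the remaining rows are the random samples (both counts are taken from the prints above):

import numpy as np

def tabulate2Clusters(labels, calved_count):
    # Cross-tabulate the two cluster labels against the calved/random split.
    labels = np.asarray(labels)
    for c in (0, 1):
        in_calved = int(np.sum(labels[:calved_count] == c))
        in_random = int(np.sum(labels[calved_count:] == c))
        print("cluster %d: %d calved, %d random" % (c, in_calved, in_random))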
Example 3
def evaluation(y_pred,
               cluster_method="Kmeans",
               num_cluster=25,
               n_neighbors=20,
               min_dist=0.0):

    if cluster_method == "Kmeans":
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)

        kmeans = KMeans(n_clusters=num_cluster, random_state=1).fit(embedding)
        centroid = kmeans.cluster_centers_.copy()
        y_label = kmeans.labels_.copy()
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))
    elif cluster_method == "SC":
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        assign_labels="discretize",
                                        random_state=0).fit(embedding)
        y_label = clustering.labels_.copy()
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values
        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))

    else:
        embedding = umap.UMAP(n_neighbors=n_neighbors,
                              min_dist=min_dist,
                              n_components=num_cluster,
                              metric="euclidean").fit_transform(y_pred)
        gmm = GaussianMixture(n_components=num_cluster).fit(embedding)
        y_label = gmm.predict(embedding)
        centroid = pd.DataFrame(embedding.copy())
        centroid['label'] = y_label
        centroid = centroid.groupby('label').mean().values

        y_pseudo = np.zeros((y_pred.shape[0], num_cluster))

    ## Student's t distribution kernel soft-assignment, alpha=1
    #for j in range(centroid.shape[0]):
    #    y_pseudo[:,j]=(np.linalg.norm(embedding-centroid[j,:],axis=1)+1)**(-1)
    ##cosine distance
    #y_pseudo[:,j]=((1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1]))+1)**(-1))[:,0]
    #y_pseudo = pd.DataFrame(y_pseudo)
    #y_pseudo2=np.zeros((y_pred.shape[0],centroid.shape[0]))
    #for j in range(centroid.shape[0]):
    #    y_pseudo2[:,j]=y_pseudo.iloc[:,j].values/np.sum(
    #        y_pseudo[y_pseudo.columns.difference([j])].values,axis=1)
    #y_pseudo = y_pseudo2

    ##distance based soft-assignment
    for j in range(centroid.shape[0]):
        ##euclidean distance
        y_pseudo[:, j] = 1 / np.linalg.norm(embedding - centroid[j, :], axis=1)
        ##cosine similarity
        #y_pseudo[:,j]=1/(1-cosine_similarity(embedding,centroid[j,:].reshape(1,embedding.shape[1])))[:,0]
    y_pseudo = softmax(y_pseudo, axis=1)

    ##auxiliary target distribution
    f = np.sum(np.square(y_pseudo) / np.sum(y_pseudo, axis=0), axis=1)
    y2 = np.square(y_pseudo) / np.sum(y_pseudo, axis=0)
    au_tar = (y2.T / f).T

    return au_tar, y_label, embedding
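A minimal usage sketch for `evaluation`, assuming `y_pred` holds one row per sample (random data here, purely to illustrate the shapes):

import numpy as np

y_pred = np.random.rand(500, 64)  # stand-in for model outputs
au_tar, y_label, embedding = evaluation(y_pred, cluster_method="Kmeans", num_cluster=10)
print(au_tar.shape, y_label.shape, embedding.shape)  # (500, 10) (500,) (500, 10)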
Example 4
def spectralClustering(df, k):
    spectral = SpectralClustering(n_clusters=k, random_state=0).fit(df)
    return spectral
Example 5
def get_spectral_clusters(A, k):
    from sklearn.cluster import SpectralClustering
    spec = SpectralClustering(n_clusters=k, random_state=0, affinity='precomputed', assign_labels='discretize')
    return spec.fit_predict(A)
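A usage sketch: `A` must be a square, symmetric, non-negative affinity matrix (here built with an RBF kernel on random data, just for illustration):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.rand(100, 5)
A = rbf_kernel(X)                      # (100, 100) similarity matrix
labels = get_spectral_clusters(A, k=4)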
Example 6
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="Name of the config file.")
    args = parser.parse_args()

    if not exists(args.config):
        raise FileNotFoundError(args.config + " not found.")

    with open(args.config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.Loader)

    patch_data_path = config['patch_data_path']
    base_neuron_act_path = config['base_neuron_act_path']
    train_activations = config['train_activations']
    val_activations = config['val_activations']
    test_activations = config['test_activations']
    output_path = config['output_path']
    cluster_types = config['cluster_types']
    n_clusters = config['n_clusters']
    n_samps = config['n_samps']
    plot_mode = config['plot_mode']
    seed = config['seed']

    if not exists(output_path):
        makedirs(output_path)

    train = pd.read_csv(join(base_neuron_act_path, train_activations))

    if plot_mode == 'train':
        plot_data = train
    elif plot_mode == 'val':
        plot_data = pd.read_csv(join(base_neuron_act_path, val_activations))
    elif plot_mode == 'test':
        plot_data = pd.read_csv(join(base_neuron_act_path, test_activations))

    cluster_dfs = {}

    for cluster_type in cluster_types:

        for n_cluster in n_clusters:

            if cluster_type == 'GMM':

                cluster_dfs[cluster_type] = plot_data
                X = train.loc[:, train.columns.str.contains('neuron')]
                mod = GaussianMixture(n_components=n_cluster,
                                      **config['GMM_kwargs']).fit(X)
                cluster_prob = mod.predict_proba(
                    cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].
                                                  columns.str.contains('neuron'
                                                                       )])
                cluster_dfs[cluster_type]['cluster_prob'] = cluster_prob.max(
                    axis=1)
                cluster_dfs[cluster_type]['cluster'] = mod.predict(
                    cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].
                                                  columns.str.contains('neuron'
                                                                       )])
                plot_prob_dist(cluster_dfs[cluster_type], output_path,
                               cluster_type, n_cluster)
                plot_prob_cdf(cluster_dfs[cluster_type], output_path,
                              cluster_type, n_cluster)

            elif cluster_type == 'Spectral':

                cluster_dfs[cluster_type] = train.sample(n_samps,
                                                         random_state=seed)
                X = cluster_dfs[cluster_type].loc[:, cluster_dfs[cluster_type].
                                                  columns.str.contains('neuron'
                                                                       )]
                mod = SpectralClustering(n_clusters=n_cluster,
                                         **config["Spectral_kwargs"]).fit(X)
                cluster_dfs[cluster_type]['cluster'] = mod.labels_

            joblib.dump(
                mod,
                join(output_path, f'{cluster_type}_{n_cluster}_clusters.mod'))

            plot_cluster_dist(cluster_dfs[cluster_type], output_path,
                              cluster_type, n_cluster)
            plot_storm_clusters(patch_data_path, output_path,
                                cluster_dfs[cluster_type], cluster_type, seed,
                                **config['plot_kwargs'])

    return
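The config keys above come straight from `main()`; a hedged sketch of a matching config file, with purely illustrative paths and values:

import yaml

example_config = {
    'patch_data_path': 'data/patches',           # illustrative values only
    'base_neuron_act_path': 'data/activations',
    'train_activations': 'train.csv',
    'val_activations': 'val.csv',
    'test_activations': 'test.csv',
    'output_path': 'results',
    'cluster_types': ['GMM', 'Spectral'],
    'n_clusters': [4, 8],
    'n_samps': 5000,
    'plot_mode': 'train',
    'seed': 0,
    'GMM_kwargs': {'covariance_type': 'full'},
    'Spectral_kwargs': {'affinity': 'nearest_neighbors'},
    'plot_kwargs': {},
}
with open('config.yaml', 'w') as f:
    yaml.safe_dump(example_config, f)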
def interchromosomal_clusters(cf,k,cluster_file,algorithm='eigh-kmeans',interchr_mat=None,use_builtin_ice=False,use_ice=False,use_cpb=False,save_mat=False,mat_save_dir='.',plot_mat=False,out_file='global_clusters.out'):
    if algorithm not in ['eigh-gmix','eigh-kmeans','spec-kmeans']:
        print "error: algorithm must be either 'eigh-gmix', 'eigh-kmeans' or 'spec-kmeans'"
        
    clusters = {}
    print "[interchromosomal_clusters] k={}, cluster_file={}, out_file={}, algorithm={}".format(k,cluster_file,out_file,algorithm)
    # Read and parse intrachromosomal clusters.
    fc = open(cluster_file,"r")
    for line in fc:
        m = line.rstrip().split('\t')
        chr = m[0]
        clust = m[1].split(',')
        if chr not in clusters:
            clusters[chr] = []
        clusters[chr].append(np.array([int(v) for v in clust]))

    # Compute interchromosomal sums.
    if interchr_mat is None:
        print "computing interchromosomal matrix..."
        mat_shape = 0
        offset = {}
        for chr in clusters:
            offset[chr] = mat_shape
            mat_shape += len(clusters[chr])

        mat    = np.zeros((mat_shape,mat_shape))
        normat = np.zeros((mat_shape,mat_shape))

        # Iterate over all interchromosomal combinations
        chr_names = list(clusters)
        for i, chr_i in enumerate(chr_names):
            ploid_i = (1 if chr_i != 'chrX' else 2)
            sys.stdout.write("{} -> ".format(chr_i))
            for chr_j in chr_names[i+1:]:
                sys.stdout.write("{},".format(chr_j))
                ploid_j = (1 if chr_j != 'chrX' else 2)
                hic_m = cf.matrix(balance=use_builtin_ice).fetch(chr_i,chr_j)
                if use_ice:
                   hic_m = inter_ice(hic_m,20)
                elif use_cpb:
                   hic_m = inter_cpb(hic_m,20)
                   
                # Cluster combinations.
                for ki,clu_i in enumerate(clusters[chr_i]):
                    for kj,clu_j in enumerate(clusters[chr_j]):
                        sumv = np.sum(hic_m[clu_i,:][:,clu_j]) * ploid_i * ploid_j
                        mat[offset[chr_i]+ki,offset[chr_j]+kj] = mat[offset[chr_j]+kj,offset[chr_i]+ki] = sumv
                        normat[offset[chr_i]+ki,offset[chr_j]+kj] = normat[offset[chr_j]+kj,offset[chr_i]+ki] = sumv/float(len(clu_i))/float(len(clu_j))
            # Newline.
            sys.stdout.write('\n')
        # Store matrices for future processing
        if save_mat:
           np.save('{}/interchr_normat.npy'.format(mat_save_dir),normat)
           np.save('{}/interchr_sums.npy'.format(mat_save_dir),mat)
        
    else:
        print "Using user-provided interchromosomal matrix for cluster computation."
        normat = interchr_mat
        
    N = normat.shape[0]
    print "computing clusters, algorithm {}...".format(algorithm)
    if algorithm == 'spec-kmeans':
        spect_clu = SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity='rbf', assign_labels='kmeans', n_jobs=8)
        hic_clust = spect_clu.fit_predict(normat)
    else:
        w, v = scipy.linalg.eigh(normat, eigvals=(N-k,N-1))
        if algorithm == 'eigh-gmix':
            gmix = mixture.GaussianMixture(n_components=k, covariance_type='full', tol=1e-4, max_iter=1000)
            gmix.fit(v)
            hic_clust = gmix.predict(v)
        elif algorithm == 'eigh-kmeans':
            km = KMeans(n_clusters=k,n_jobs=8)
            hic_clust = km.fit_predict(np.sqrt(w)*v)
            with open('{}'.format(out_file+'.weig'),'w+') as outdata:
                for (c,ev) in zip(hic_clust,w*v):
                    outdata.write(str(c)+'\t'+'\t'.join([str(x) for x in ev[::-1]]) + '\n')


    clu_idx = np.argsort(hic_clust)
    P = np.zeros(normat.shape)
    P[np.arange(0,len(clu_idx)),clu_idx] = 1
    # Permute rows and columns.
    W_clust = np.dot(np.dot(P,normat),np.linalg.inv(P))
    if plot_mat:
       plt.matshow(W_clust,cmap=plt.cm.bwr)

    clust_cnt = [(g[0], len(list(g[1]))) for g in itertools.groupby(sorted(hic_clust))]
    # Compute cluster limits
    cnt = np.zeros(k+1,dtype=int)
    for i in range(k):
        cnt[i] += clust_cnt[i][1]
    cmcnt = np.cumsum(cnt)
    l = W_clust.shape[0]-1
    if plot_mat:
       for x in cmcnt:
          plt.plot([0,l], [x,x], color='k', linestyle='-', linewidth=1)
          plt.plot([x,x], [0,l], color='k', linestyle='-', linewidth=1)

    print "writing results to {}...".format(out_file)
    with open(out_file,'w+') as fo:
        c_id = 0
        for chr_i in clusters:
            for ki,clu_i in enumerate(clusters[chr_i]):
                fo.write("{}\t{}\t{}\t{}\n".format(chr_i,ki,hic_clust[c_id],','.join([str(n) for n in clu_i])))
                c_id += 1
    print "[interchromosomal_clusters] Done."
Example 8
# in this case the seeding of the centers is deterministic, hence we run the
# kmeans algorithm only once with n_init=1
pca = PCA(n_components=n_digits).fit(data)
bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
              name="PCA-based",
              data=data)
bench_k_means(AffinityPropagation(), name="AP", data=data)
bench_k_means(MeanShift(), name="MS", data=data)
bench_k_means(AgglomerativeClustering(n_clusters=n_digits),
              name="AC",
              data=data)
# "WD" presumably means ward linkage; ward is also the AgglomerativeClustering
# default, so this run duplicates "AC" (made explicit here for clarity).
bench_k_means(AgglomerativeClustering(n_clusters=n_digits, linkage="ward"),
              name="WD",
              data=data)
bench_k_means(DBSCAN(), name="DB", data=data)
bench_k_means(SpectralClustering(n_clusters=n_digits), name="SC", data=data)

GM = GaussianMixture(n_components=10)
t0 = time()
GM.fit(data)
gm_pred = GM.predict(data)  # predict once, reuse for all three metrics
print('%-5s\t%.2fs\t%.3f\t%.3f\t%.3f' %
      ('GM', (time() - t0),
       metrics.normalized_mutual_info_score(labels, gm_pred),
       metrics.homogeneity_score(labels, gm_pred),
       metrics.completeness_score(labels, gm_pred)))

print(82 * '_')

# #############################################################################
# Visualize the results on PCA-reduced data
# reduced_data = PCA(n_components=2).fit_transform(data)
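`bench_k_means` is not shown; it looks like the helper from scikit-learn's digits-clustering demo. A minimal stand-in, assuming the global `labels` used above holds the ground-truth classes:

from time import time
from sklearn import metrics

def bench_k_means(estimator, name, data):
    # Fit, then report wall time and agreement with the ground-truth labels.
    t0 = time()
    estimator.fit(data)
    pred = getattr(estimator, 'labels_', None)
    if pred is None:
        pred = estimator.predict(data)
    print('%-5s\t%.2fs\t%.3f\t%.3f\t%.3f' %
          (name, time() - t0,
           metrics.normalized_mutual_info_score(labels, pred),
           metrics.homogeneity_score(labels, pred),
           metrics.completeness_score(labels, pred)))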
Example 9
         strsub1 = 'K'+str(K)+'N'+str(N)+'c'+str(c)+'la'+str(lambda_n)+'rd'+str(rand) # for saving results
         # simulate graph            
         G = SBM.SBM_simulate_fast(model_sbm1)
         ln, nodeslist = get_label_list(G)
         # algo1: proposed deepwalk algorithm
         model_w2v = SBM.SBM_learn_deepwalk_1(G, num_paths, length_path, emb_dim, rw_filename, emb_filename, winsize)
         X = model_w2v[nodeslist]
         k_means = KMeans(n_clusters=K, max_iter=100, precompute_distances=False)
         k_means.fit(X)
         y_our = k_means.labels_ 
         nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry, ln, y_our, 'deep', 'N', N, rand)
         # algo2: spectral clustering 
         A = nx.to_scipy_sparse_matrix(G)
         print('start spectral clustering')
         if N<10000:
             sc = SpectralClustering(n_clusters = K, affinity = 'precomputed', eigen_solver='arpack')
             sc.fit(A)
             y_sc= sc.labels_ 
         else:
             y_sc = ln
         nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry, ln, y_sc, 'sc', 'N', N, rand)
         # algo3: belief propogation
         print('start ABP algorithm')
         r = 3
         m, mp, lambda1 = SBM.abp_params(model_sbm1)
         y_abp = SBM.SBM_ABP(G, r, lambda1, m, mp)
         nmi_arry, ccr_arry, ars_arry = summary_res(nmi_arry, ccr_arry, ars_arry, ln, y_abp, 'abp', 'N', N, rand)
 import pickle
 savename = 'exp3-3.pkl'
 res = [nmi_arry, ccr_arry, ars_arry]
 pickle.dump(res, open(savename, 'wb'), protocol=2)
Example 10
def Spectral_clustering():  #4
    global cluster
    global labels
    cluster = SpectralClustering(n_clusters=110).fit_predict(text)
    print("Spectral clustering NMI:%s" %
          (metrics.normalized_mutual_info_score(labels, cluster)))
Example 11
    data = pickle.load(open(text_path, "rb"))  # cPickle in the original (Python 2)

    wdict = data['wdict']
    s_l_train = data['s_l_train']
    y_l_train = data['y_l_train']

    return [wdict, s_l_train, y_l_train]

wdict, s_l_train, y_l_train = load_data( text_path)
data = pickle.load(open(data_path, 'rb'))
X = data['z']
tsne = TSNE(n_components=3)
X = tsne.fit_transform(X)
cluster_num = 8
stat = np.zeros((cluster_num, 20000),dtype='float64')
cluster_label = SpectralClustering(n_clusters=cluster_num).fit(X).labels_
print(len(cluster_label), len(s_l_train))

idict = dict()
for k in wdict.keys():
    idict[wdict[k]] = k
idict[0] = '<eos>'
idict[1] = '<unk>'

c_num = np.zeros((cluster_num,))
for i, s in enumerate(s_l_train):
    c_num[cluster_label[i]] += 1.0
    l = []
    for j in s:
       l.extend(j)
Example 12
File: main.py Project: xbassi/adi
# x = pca.fit_transform(data1[:,2:])

x = TSNE(n_components=3).fit_transform(data1[:,2:])


print(x.shape)

# Kmeans Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
pred = KMeans(n_clusters=4,max_iter=30000,random_state=0).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Kmeans F1 Score:",fs)

# Spectral Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering
pred = SpectralClustering(n_clusters=4).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Spectral F1 Score:",fs)

# Agglomerative Clustering
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering
pred = AgglomerativeClustering(n_clusters=4).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Agglomerative F1 Score:",fs)

# Density Based
# needs work https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN
pred = DBSCAN(eps=1, min_samples=2).fit_predict(x)
fs = f1_score(y_true, pred, average='weighted')
print("Density Based F1 Score:",fs)
ms.fit_predict(xtrain_lsa)
#cluster_centers_indices = ms.cluster_centers_indices_

labels = ms.labels_  # get clusters
n_clusters = len(set(labels))  # number of distinct clusters, not of points
plt.title("Mean Shift")
plt.scatter(xtrain_lsa[:,0], xtrain_lsa[:,1], c =labels )
plt.show()
#print("Silhouette Coefficient: %0.3f"
#    % metrics.silhouette_score(xtrain_lsa, labels, metric='sqeuclidean'))
#print("mean shift, cluster_centers_indices", n_clusters)
"""

#SKIP=True
if DO_ALL is True:
    sc = SpectralClustering(n_clusters=6)  #n_clusters)
    predict_sc = sc.fit_predict(xtrain_lsa)  # fit once; fit_predict sets labels_
    labels = sc.labels_
    plt.title("SpectralClustering")
    plt.scatter(xtrain_lsa[:, 0], xtrain_lsa[:, 1], c=labels)
    plt.show()
    clusters = len(set(labels))  # number of distinct clusters, not of points
    #pd.crosstab(predict_sc, c)

    print("spectral: Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(xtrain_lsa, labels, metric='sqeuclidean'))
    #print(len(labels)," clusters")

    #KMeans
### So if average number of gates from user is 3 (n_gates_users_avg = 3), n_clusters should be 4

n_c_avg_users = n_gates_users_avg + 1

n_cluster_list = [int(n_c_avg_users)]
#n_cluster_list = [int(n_c_avg_users -1), int(n_c_avg_users), int(n_c_avg_users +1)]

df_pixel_final_labels_hybrid_all_n_c = pd.DataFrame()
final_e_df_example_all_n_c = pd.DataFrame()

for n_clusters in n_cluster_list:

    print(n_clusters)

    clustering_SC = SpectralClustering(n_clusters=n_clusters,
                                       affinity="precomputed",
                                       assign_labels=choose_assign_labels,
                                       random_state=0).fit(hybrid_matrix)

    pixel_final_labels_hybrid = clustering_SC.labels_[:len(
        df_pixel_info_example["x_pixel_coor"])]

    df_pixel_final_labels_hybrid = pd.DataFrame(
        {"pixel_final_label": pixel_final_labels_hybrid})

    ### After clustering the pixels, we can go back to our events and label the events inside each pixel.
    ### First, we update our dataframe that contains pixel information with the labels we found for pixels.

    ### Update the dataframe with pixel labels

    df_pixel_info_example["pixel_final_label"] = pixel_final_labels_hybrid
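The comments above describe propagating the pixel labels back to individual events; a hypothetical sketch, assuming an events dataframe `df_event_info_example` with a `pixel_id` column indexing into `df_pixel_info_example` (both the dataframe and the column name are illustrative, not from the original code):

# Hypothetical: attach each event's pixel label via its pixel index.
df_event_labels = df_event_info_example.merge(
    df_pixel_info_example[["pixel_final_label"]],
    left_on="pixel_id", right_index=True, how="left")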
Example 15
    A[tup[0], tup[1]] = tup[2]*10
    A[tup[1], tup[0]] = tup[2]*10

adj_mat = [[3,2,2,0,0,0,0,0,0],
           [2,3,2,0,0,0,0,0,0],
           [2,2,3,1,0,0,0,0,0],
           [0,0,1,3,3,3,0,0,0],
           [0,0,0,3,3,3,0,0,0],
           [0,0,0,3,3,3,1,0,0],
           [0,0,0,0,0,1,3,1,1],
           [0,0,0,0,0,0,1,3,1],
           [0,0,0,0,0,0,1,1,3]]

#adj_mat = np.array(adj_mat)

sc = SpectralClustering(100, affinity='precomputed', n_init=100)
sc.fit(A)



print('spectral clustering')
print(type(sc.labels_))
#print (len(sc.labels_))
#print(sc.labels_.tolist())
node_map = {}
cluster = sc.labels_.tolist()
for i in range(len(cluster)):
    if cluster[i] in node_map:
        node_map[cluster[i]].append(i)
    else:
        node_map[cluster[i]] = [i]  # start the list with this node, not empty
def spectral_cluster(vectors):
    print('spectral clustering...')
    return SpectralClustering(n_clusters=4).fit_predict(vectors)
Example 17
        X.append(data[:, :, i].reshape(h * w))
        y.append(tag)
    return X, y


data = sio.loadmat("../ExtYaleB10.mat")
data_train, data_test = data["train"], data["test"]

X_train, y_train = [], []
for tag in range(10):
    X_add, y_add = flatImageByClass(data_train[0][tag], tag)
    X_train += X_add
    y_train += y_add
X_train, y_train = np.array(X_train), np.array(y_train)

X_test, y_test = [], []
for tag in range(10):
    X_add, y_add = flatImageByClass(data_test[0][tag], tag)
    X_test += X_add
    y_test += y_add
X_test, y_test = np.array(X_test), np.array(y_test)

for gamma in [.01, .1, 1, 10, 100]:
    clf = SpectralClustering(n_clusters=10, gamma=gamma)
    labels = clf.fit_predict(X_train)
    # Error
    error = 0
    for i in range(len(labels)):
        if labels[i] != y_train[i]:
            error += 1
    print("gamma = %f: Error ratio: %f" % (gamma, float(error) / len(labels)))
Example 18
# For reproducibility
np.random.seed(1000)

nb_samples = 1000

if __name__ == '__main__':
    # Create dataset
    X, Y = make_moons(n_samples=nb_samples, noise=0.05)

    # Try different gammas with a RBF affinity
    Yss = []
    gammas = np.linspace(0, 12, 4)

    for gamma in gammas:
        sc = SpectralClustering(n_clusters=2, affinity='rbf', gamma=gamma)
        Yss.append(sc.fit_predict(X))

    # Show data
    fig, ax = plt.subplots(1, 4, figsize=(30, 10), sharey=True)

    for x in range(4):
        ax[x].grid()
        ax[x].set_title('Gamma = %.0f' % gammas[x])

        for i in range(nb_samples):
            c = Yss[x][i]

            if c == 0:
                ax[x].scatter(X[i, 0], X[i, 1], marker='o', color='r')
            else:
                # complete the truncated branch: plot the second cluster
                ax[x].scatter(X[i, 0], X[i, 1], marker='^', color='b')

    plt.show()
def cluster_compartments(cf,k,chrlist,eig_dim=None,contact_thr=1,max_sample_size=50000,outlier_pctl=90,corr_outlier_pctl=[5,95],balance_corr_median=False,rm_diags=0,coeffs=None,coeffs_gw=None,seed=None,max_resampling_attempts=10,rearrange_clusters=False,use_oe=True,use_builtin_ice=False,use_ice=False,use_cpb=False,compute_abscore=True,algorithm='eigh-kmeans',outdir='.',out_allchr='clusters_all.txt'):
    if algorithm not in ['eigh-gmix','eigh-kmeans','spec-kmeans']:
        print "error: algorithm must be either 'eigh-gmix', 'eigh-kmeans' or 'spec-kmeans'"
        return

    print "[intrachromosomal_clusters] k={}, outdir={}, algorithm={}".format(k,outdir,algorithm)

    if eig_dim is None:
        eig_dim = k

    if coeffs is None:
        coeffs = {}

    clusters = {}
    sample_idx = {}
    clusters_idx = {}
    snapshot_mat = None
    snapshot_cor = None

    for chr in chrlist:
        if os.path.isfile('{}/clusters_{}.txt'.format(outdir,chr)):
            print "Warning: {} clusters ({}/clusters_{}.txt) already exist. Skipping chromosome.".format(chr,outdir,chr)
            continue
        print "[{}] balancing matrix...".format(chr)
        
        m = cf.matrix(balance=use_builtin_ice).fetch(chr)

        # Threshold contacts.
        m[np.where(m<contact_thr)] = 0

        print "[{}] computing O/E...".format(chr)
        if use_oe is True:
            if coeffs_gw is not None:
                m_oe = oe_apply(m,coeffs_gw).toarray()
            else:
                if chr not in coeffs:
                    coeffs[chr] = oe_coeffs_mask(cf,chr,use_builtin_ice=use_builtin_ice)[chr]
                m_oe = oe_apply(m,coeffs[chr]).toarray()
        else:
            m_oe = m

        # Remove diagonals
        if rm_diags > 0:
            m_oe = np.triu(m_oe, k=rm_diags) + np.tril(m_oe,k=-rm_diags)

        # Get idx of high quality regions (measured in raw matrix).
        samp_idx = matrix_mask_idx(m_oe)
        sample_idx[chr] = samp_idx
        print "[{}] removing low-quality regions (matrix rows: {}, sample rows: {})...".format(chr,m.shape[0],samp_idx.shape[0])
        # High-quality matrix size
        l = len(samp_idx)
        ssize = min(l,max_sample_size)
        # Sample iteration (keep sampling while clustering fails).
        np.random.seed(seed)
        successful = False
        cnt = 0
        while not successful and cnt < max_resampling_attempts:
            cnt += 1
            # Get sample
            if ssize < l:
                s = np.sort(np.random.choice(samp_idx,ssize,replace=False))
            else:
                s = np.array(samp_idx)
            m_samp = m_oe[s,:][:,s]

            # Apply CPB.
            if use_cpb:
                print("[{}] computing CPB...".format(chr))
                m_samp = cpb(m_samp,20)
            elif use_ice:
                print("[{}] computing ICE...".format(chr))
                m_samp = ice(m_samp,20)

            # Relax outliers
            m_max = np.percentile(m_samp[np.where(m_samp>0)],outlier_pctl)
            m_samp[np.where(m_samp > m_max)] = m_max
            if (~m_samp.any(axis=1)).any():
                print "[{}] sample contains empty rows (singular matrix). resampling ({})...".format(chr,cnt)
                continue
                
            # Compute correlation and remove diagonal
            print "[{}] computing correlation matrix and balancing...".format(chr)
            m_cor = np.corrcoef(m_samp)
            np.fill_diagonal(m_cor,0)
     
            # Increase correlation contrast (5-95 percentiles by default)
            if balance_corr_median:
                m_cor = m_cor - np.median(m_cor[np.triu_indices(ssize,1)])
            min_cor_val = np.percentile(m_cor[np.triu_indices(ssize,1)],corr_outlier_pctl[0])
            max_cor_val = np.percentile(m_cor[np.triu_indices(ssize,1)],corr_outlier_pctl[1])
            m_cor[np.where(m_cor < min_cor_val)] = min_cor_val
            m_cor[np.where(m_cor > max_cor_val)] = max_cor_val

            N = m_cor.shape[0]
            eig_dim = min(N,eig_dim)

            if compute_abscore:
                print("[{}] computing a/b scores...".format(chr))
                eigv, abscore = eigs_ab_score(m_cor)

                # Write A/B scores
                with open('{}/clusters_{}.abscore'.format(outdir,chr),'w') as outdata:
                    for i in range(len(abscore)):
                        outdata.write(str(sample_idx[chr][i])+'\t'+str(abscore[i])+'\n')

            try:
                print "[{}] computing clusters, algorithm {}...".format(chr,algorithm)
                if algorithm == 'spec-kmeans':
                    # some chromosomes crash when using precomputed similarity matrices.
                    # however using RBF seems to give meaningful clustering.
                    spect_clu = SpectralClustering(n_clusters=k, eigen_solver='arpack', affinity='precomputed', assign_labels='kmeans', n_jobs=8)
                    hic_clust = spect_clu.fit_predict(m_cor)
                else:
                    print "[{}] computing eigh...".format(chr)
                    w, v = scipy.linalg.eigh(m_cor, eigvals=(N-eig_dim,N-1))

                    if algorithm == 'eigh-gmix':
                        # Cluster eigenvectors using Gaussian Mixture
                        gmix = mixture.GaussianMixture(n_components=k, covariance_type='full', tol=1e-4, max_iter=1000)
                        gmix.fit(v)
                        hic_clust = gmix.predict(v)
                    elif algorithm == 'eigh-kmeans':
                        # Cluster eigenvalue/eigenvector products with kmeans.
                        print "[{}] computing clusters (k-means)...".format(chr)
                        km = KMeans(n_clusters=k,n_jobs=8)
                        weig = np.sqrt(w)*v
                        hic_clust = km.fit_predict(weig)
                        # Write weighted eigenvectors
                        with open('{}/clusters_{}.weig'.format(outdir,chr),'w') as outdata:
                            for i in range(len(hic_clust)):
                                outdata.write(str(sample_idx[chr][i])+'\t'+str(hic_clust[i])+'\t'+'\t'.join([str(x) for x in weig[i][::-1]])+'\n')
            except Exception as e:
                print("[{}] error while clustering (attempt {}): {}".format(chr,cnt,e))
                cnt = max_resampling_attempts
                break
            successful = True

        if cnt >= max_resampling_attempts:
            print "[{}] max reampling attempts reached, skipping chromosome.".format(chr)
            continue

        # Rearrange clusters for visualization
        # Make cluster index list
        clu_idx = [list() for _ in range(k)]
        for i,c in enumerate(hic_clust):
            clu_idx[c].append(i)

        if not rearrange_clusters:
            # Map again to matrix indices
            clusters_idx[chr] = [sample_idx[chr][x] for x in clu_idx]

        else:
            print "[{}] rearranging clusters by similarity...".format(chr)
            for i in xrange(k):
                clu_idx[i] = np.array(clu_idx[i])

            clusters[chr] = list()

            # Find most distant blocks
            l_r = (0,0)
            val = np.inf
            d_sum = np.zeros((k,k))
            for i in range(k):
                l_i = len(clu_idx[i])
                for j in range(i+1,k):
                    l_j = len(clu_idx[j])
                    d_sum[i,j] = np.sum(m_cor[clu_idx[i],:][:,clu_idx[j]])
                    d = float(d_sum[i,j])/(l_i*l_j)
                    if d < val:
                        l_r = (i,j)
                        val = d

            # Pop left and right blocks (important to do it in this order for index consistency).
            r_idx = clu_idx.pop(l_r[1])
            l_idx = clu_idx.pop(l_r[0])
            r_clusters = [r_idx.copy(),]
            l_clusters = [l_idx.copy(),]

            iters = len(clu_idx)//2 + len(clu_idx)%2  # integer division (Python 3)
            for i in range(iters):
                # Find nearest blocks to L/R.
                len_l = len(l_idx)
                len_r = len(r_idx)
                min_d = np.inf
                max_d = -np.inf
                min_idx = 0
                max_idx = 0
                for i in range(len(clu_idx)):
                    len_block = len(clu_idx[i])
                    d_l = float(np.sum(m_cor[l_idx,:][:,clu_idx[i]]))/(len_l*len_block) - val
                    d_r = float(np.sum(m_cor[r_idx,:][:,clu_idx[i]]))/(len_r*len_block) - val
                    r = d_l/d_r
                    if r < min_d:
                        min_idx = i
                        min_d = r
                    if r >= max_d:
                        max_idx = i
                        max_d = r
                # Pop from idx and add to L/R.
                if min_idx > max_idx:
                    r_clusters.append(clu_idx[min_idx].copy())
                    l_clusters.append(clu_idx[max_idx].copy())
                    r_idx = np.append(clu_idx.pop(min_idx),r_idx)
                    l_idx = np.append(l_idx,clu_idx.pop(max_idx))
                elif min_idx < max_idx:
                    r_clusters.append(clu_idx[min_idx].copy())
                    l_clusters.append(clu_idx[max_idx].copy())
                    l_idx = np.append(l_idx,clu_idx.pop(max_idx))
                    r_idx = np.append(clu_idx.pop(min_idx),r_idx)
                else:
                    l_clusters.append(clu_idx[max_idx].copy())
                    l_idx = np.append(l_idx,clu_idx.pop(max_idx))
            # Make final index list.
            clu_idx = np.append(l_idx,r_idx)

            # Make final cluster index list.
            clusters[chr] = l_clusters + list(reversed(r_clusters))

            # Map again to matrix indices
            clusters_idx[chr] = [sample_idx[chr][x] for x in clusters[chr]]

        # Store in disk
        print "[{}] writing clusters to {}/clusters_{}.txt...".format(chr,outdir,chr)
        fout = open('{}/clusters_{}.txt'.format(outdir,chr),'w+')
        for c in clusters_idx[chr]:
            fout.write("{}\t".format(chr))
            fout.write(','.join([str(i) for i in c]))
            fout.write('\n')
        fout.close()
        fall = open('{}/{}'.format(outdir,out_allchr),"a")
        for c in clusters_idx[chr]:
            fall.write("{}\t".format(chr))
            fall.write(','.join([str(i) for i in c]))
            fall.write('\n')
        fall.close()

import os
# Change the current sys path
os.chdir(
    "/Users/davidlin/Desktop/School/Master/2021_secondSem/SC/image-segmentation/"
)
from os import walk
from Code.lib import tools
import numpy as np
from skimage import segmentation, color
from hmmlearn import hmm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans, SpectralClustering
from time import time
import cv2
from PIL import Image
# ---------------------------------------------------------------------------------------
####################################################################
###                                                              ###
### Part 1:  Vector Quantization using K-means with different K  ###
###                                                              ###
####################################################################
# K-means for VQ
# Specify the number of colors
n1_colors = 5
n2_colors = 10
n3_colors = 64
# Load the photo and convert from BGR to RGB model
img_bgr = cv2.imread("./Images/shiba.jpeg")
Example 21
def spectral_clustering(data, n_clusters):
    spec = SpectralClustering(n_clusters=n_clusters, gamma=2.)
    spec.fit(data)
    return spec.labels_
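A quick usage sketch on toy data (the two-moons set is a standard case where spectral clustering with an RBF affinity shines):

from sklearn.datasets import make_moons

X, _ = make_moons(n_samples=200, noise=0.05)
print(spectral_cluster... := spectral_clustering(X, n_clusters=2))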
Example 22
X_id = []
y = []
labels = []
# a subset of X_id []
X_IDs = []
# generate X_train using X_id
# .iloc replaces the removed DataFrame.ix accessor
#ID_list = df.iloc[:, 0]
ID_bipart = df_bipart.iloc[:, 0]
data_bipart = df_bipart.iloc[:, 1:]
# need to determine how to get num_group
num_group = 3
random.seed(17)
kmeans_bipart = KMeans(n_clusters=num_group, random_state=0).fit(data_bipart)
labels_bipart = kmeans_bipart.labels_  # overwritten by the spectral labels below
labels_bipart = SpectralClustering(num_group, gamma=0.5,
                                   affinity='rbf').fit(data_bipart).labels_

# add new column group to data
df_bipart['group'] = labels_bipart
#############
#group data by their 'group'
# df_bipart = df_bipart.sort_values('group')
#divide by group i
# @i means i is a variable in group
global_y = 0
global_len = 0
global_truth = []
global_fitted = []
print "number of groups" + str(num_group)
for i_group in range(num_group):
    df_bipart_i = df_bipart.query('group == @i_group ')
Example 23
     clusters = np.array(range(len(latlongs)))
 else:
     if CLST_METHOD == 1:
         clObj = KMeans(n_clusters=clustersNo, random_state=seed, n_jobs=4)
         clustersObj = clObj.fit(latlongs)
         (clusters, centroids) = (clustersObj.labels_,
                                  clustersObj.cluster_centers_)
     elif CLST_METHOD == 2:
         clObj = AgglomerativeClustering(n_clusters=clustersNo,
                                         affinity='euclidean',
                                         compute_full_tree='auto',
                                         memory='mycachedir')
         clustersObj = clObj.fit(latlongs)
         clusters = clustersObj.labels_
     elif CLST_METHOD == 3:
         clObj = SpectralClustering(n_clusters=clustersNo,
                                    random_state=seed)
         clustersObj = clObj.fit(latlongs)
         clusters = clustersObj.labels_
 ###########################################################################
 # Clustering the pointset
 ###########################################################################
 cLatlongs = aux.appendClustersToLatlongs(latlongs, clusters)
 csvPath = PATH + PLACE + "_CLS_" + str(
     CLST_METHOD) + "_" + namePad + ".csv"
 csvPathSz = PATH + PLACE + "_CLL_" + str(
     CLST_METHOD) + "_" + namePad + ".csv"
 clustersSizes = [sorted(clusters).count(x) for x in range(clustersNo)]
 aux.exportListToCSV(csvPath, cLatlongs)
 np.savetxt(csvPathSz, clustersSizes, fmt='%i', delimiter='\n')
 ###########################################################################
 # Plotting
Example 24
class main():

    with open('/Users/kat/Desktop/Kaggle/Graph.csv', 'r', newline='') as csvfile1:
        graphreader = csv.reader(csvfile1, delimiter=' ', quotechar='|')
        adjgraph = np.empty((6000, 6000))
        adjgraph.fill(0)
        for row in graphreader:
            arr = row[0].split(",")
            adjgraph[int(arr[0]) - 1][int(arr[1]) - 1] = 1
            adjgraph[int(arr[1]) - 1][int(arr[0]) - 1] = 1

    # get features into matrix
    with open('/Users/kat/Desktop/reduced_1035_all_points.csv',
              'r', newline='') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        newEF = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            arr3 = arr2.astype(float)  # np.float was removed from NumPy
            newEF.append(arr3)

    with open('/Users/kat/Desktop/reduced_3000.csv', 'r', newline='') as csvfile3:
        EF = csv.reader(csvfile3, delimiter=' ', quotechar='|')
        adj_new = []
        for row in EF:
            arr = row[0].split(",")
            arr2 = np.asarray(arr)
            arr3 = arr2.astype(float)
            adj_new.append(arr3)

    # spectral clustering on adjacency matrix
    spectral = SpectralClustering(10, affinity="precomputed")
    new_plot = spectral.fit_predict(
        adj_new)  #6000 x 1 Array with cluster labels

    matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    clusters = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }

    # get cluster matchings for first 60 points
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r', newline='') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            findClust = new_plot[int(arr[0]) - 1]
            matching[int(arr[1])].append(
                [int(arr[0]), newEF[int(arr[0]) - 1], findClust])

    for i in range(1, 6001):
        findClust = new_plot[i - 1]
        clusters[findClust].append(newEF[i - 1])

    for i in range(10):
        print "item is " + str(i)
        for item in matching[i]:
            print item[2]

    filtered_features = []
    filtered_features_idx = []
    cluster_5_digit_6 = []
    cluster_8_digit_1 = []

    for i in range(len(new_plot)):
        if new_plot[i] == 5:
            cluster_5_digit_6.append([i + 1, red_pca[i]])
        elif new_plot[i] == 8:
            cluster_8_digit_1.append([i + 1, red_pca[i]])
        else:
            filtered_features.append(red_pca[i])
            filtered_features_idx.append(i + 1)

    cluster_centers_pca_8_clusters = []
    for i in range(10):
        newarray = []
        if i == 1 or i == 6:
            pass
        else:
            for j in range(len(matching[i])):
                newarray.append(np.asarray(matching[i][j][1]))
            newa = np.asarray(newarray)
            cluster_centers_pca_8_clusters.append(newa.mean(axis=0))

    cluster_centers_pca_8_clusters = np.asarray(cluster_centers_pca_8_clusters)

    kmeans_8 = KMeans(
        n_clusters=8,
        init=cluster_centers_pca_8_clusters).fit_predict(filtered_features)

    updated_matching = {
        0: [],
        1: [],
        2: [],
        3: [],
        4: [],
        5: [],
        6: [],
        7: [],
        8: [],
        9: []
    }
    with open('/Users/kat/Desktop/Kaggle/Seed.csv', 'r', newline='') as csvfile2:
        seedreader = csv.reader(csvfile2, delimiter=' ', quotechar='|')
        for row in seedreader:
            arr = row[0].split(",")
            if int(arr[1]) == 1 or int(arr[1]) == 6:
                pass
            else:
                try:
                    idx = filtered_features_idx.index(int(arr[0]))
                    updated_matching[int(arr[1])].append(
                        [int(arr[0]), red_pca[int(arr[0]) - 1], kmeans_8[idx]])
                except ValueError:
                    pass

    finalmatches = {0: 4, 1: 2, 2: 5, 3: 9, 4: 7, 5: 3, 6: 0, 7: 8, 8: 1, 9: 6}

    adjustedcluster = {}
    for i in range(10):
        index = finalmatches[i]
        adjustedcluster[i] = clusters[index]

    cluster_centers = []

    for i in range(10):
        newa = np.asarray(adjustedcluster[i])
        cluster_centers.append(newa.mean(axis=0))

    # Build the k-means cluster centres; without this, the two inserts below
    # would fail on an undefined cluster_centers_kmeans.
    cluster_centers_kmeans = []
    for i in range(10):
        newarray = []
        if i == 1 or i == 6:
            pass
        else:
            for j in range(len(updated_matching[i])):
                newarray.append(np.asarray(updated_matching[i][j][1]))
            newa = np.asarray(newarray)
            cluster_centers_kmeans.append(newa.mean(axis=0))

    digit_6_feat_pca = []
    for item in cluster_5_digit_6:
        digit_6_feat_pca.append(item[1])

    digit_1_feat_pca = []
    for item in cluster_8_digit_1:
        digit_1_feat_pca.append(item[1])

    digit_6_feat_pca = np.asarray(digit_6_feat_pca)
    digit_6_centroid = digit_6_feat_pca.mean(axis=0)
    digit_1_feat_pca = np.asarray(digit_1_feat_pca)
    digit_1_centroid = digit_1_feat_pca.mean(axis=0)

    cluster_centers_kmeans.insert(1, digit_1_centroid)
    cluster_centers_kmeans.insert(6, digit_6_centroid)

    finalclusters = [[0 for i in range(2)] for j in range(4001)]
    finalclusters[0][0] = 'Id'
    finalclusters[0][1] = 'Label'

    for i in range(1, 4001):
        finalclusters[i][0] = 6000 + i
        newdist = []
        for j in range(10):
            newdist.append(dist.euclidean(newEF[i + 5999], cluster_centers[j]))
        label = np.argmin(newdist)
        finalclusters[i][1] = label

    with open('submission7.csv', "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(finalclusters)
Example 25
def parse_clustering(key, content):
    """Parse the options of the clustering step.

    This function does the same as parse_preproc but works on the clustering
    options.

    Parameters
    -----------
    key : class or str, like {'KMeans', 'AP', 'MS', 'Spectral', 'Hierarchical'}
        The selected clustering algorithm. In case in which key
        is a `class`, it must contain a `fit` method.

    content : dict
        A dictionary containing parameters for each clustering class.
        Each parameter can be a list; in this case for each combination
        of parameters a different pipeline will be created.

    Returns
    -----------
    tpl : tuple
        A tuple made like ('clust_name', clust_obj, 'clustering'), where
        clust_obj implements the `fit` method.
    """
    if inspect.isclass(key):
        cl = key(**content)
        key = cl.__class__.__name__.lower()

    elif 'auto' in (content.get('n_clusters', ''),
                    content.get('preference', '')) \
            and key.lower() != 'hierarchical':
        # Wrapper class that automatically detects the best number of clusters
        # via 10-Fold CV
        content.pop('n_clusters', '')
        content.pop('preference', '')

        kwargs = {
            'param_grid': [],
            'n_jobs': -1,
            'scoring': silhouette_score,
            'cv': 10
        }

        if key.lower() == 'kmeans':
            content.setdefault('init', 'k-means++')
            content.setdefault('n_jobs', 1)
            kwargs['estimator'] = KMeans(**content)
        elif key.lower() == 'ap':
            kwargs['estimator'] = AffinityPropagation(**content)
            kwargs['affinity'] = kwargs['estimator'].affinity
        else:
            logging.error("n_clusters = 'auto' specified outside kmeans or "
                          "ap. Trying to create GridSearchCV pipeline anyway "
                          " ...")
        cl = GridSearchCV(**kwargs)
    elif 'auto' in (content.get('n_clusters', ''),
                    content.get('preference', '')) \
            and key.lower() == 'hierarchical':
        # TODO implement this
        # from adenine.utils.extensions import AgglomerativeClustering
        cl = AgglomerativeClustering(**content)
    else:
        if key.lower() == 'kmeans':
            content.setdefault('n_jobs', -1)
            cl = KMeans(**content)
        elif key.lower() == 'ap':
            content.setdefault('preference', 1)
            cl = AffinityPropagation(**content)
        elif key.lower() == 'ms':
            cl = MeanShift(**content)
        elif key.lower() == 'spectral':
            cl = SpectralClustering(**content)
        elif key.lower() == 'hierarchical':
            cl = AgglomerativeClustering(**content)
        else:
            cl = DummyNone()
    return (key, cl, 'clustering')
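A usage sketch per the docstring: passing a string key and a parameter dict returns the tuple consumed by the pipeline builder:

name, clust_obj, step = parse_clustering('Spectral', {'n_clusters': 3})
# name == 'Spectral', clust_obj is a SpectralClustering instance,
# step == 'clustering'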
Example 26
def run_SpectralClustering(data, clusters):
    estimator = SpectralClustering(n_clusters=clusters).fit(data)
    return estimator.labels_
Example 27
    5: 'sixth'
})
x = df_segm['COMPONENT 2']
y = df_segm['COMPONENT 1']
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x,
                y=y,
                hue=df_segm['Segment'],
                palette=['g', 'r', 'c', 'm', '#95a5a6', '#3498db'])
plt.title("CLUSTERS BY PCA COMPONENTS")
plt.show()

######SPECTRAL CLUSTERING ON MFEAT-MO
from sklearn.cluster import SpectralClustering
spectral_model_rbf = SpectralClustering(n_clusters=6, affinity='rbf')
labels_rbf = spectral_model_rbf.fit_predict(s)
colours = {}
colours[0] = 'b'
colours[1] = 'y'
colours[2] = 'g'
colours[3] = 'c'
colours[4] = 'r'
colours[5] = 'm'
cvec = [colours[label] for label in labels_rbf]
b = plt.scatter(s[:, 0], s[:, 1], color='b')
y = plt.scatter(s[:, 0], s[:, 1], color='y')
g = plt.scatter(s[:, 0], s[:, 1], color='g')
c = plt.scatter(s[:, 0], s[:, 1], color='c')
r = plt.scatter(s[:, 0], s[:, 1], color='r')
m = plt.scatter(s[:, 0], s[:, 1], color='m')
# The six single-colour scatters above only create legend handles; the
# clustered view itself uses the per-label colours in cvec:
plt.figure(figsize=(9, 9))
plt.scatter(s[:, 0], s[:, 1], c=cvec)
plt.legend((b, y, g, c, r, m),
           ('Label 0', 'Label 1', 'Label 2', 'Label 3', 'Label 4', 'Label 5'))
plt.show()
Example 28
X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
# print("Explained variance of the SVD step: {}%".format(
#     int(explained_variance * 100)))

# #############################################################################
# Do the actual clustering

sp = SpectralClustering(n_clusters=4, affinity='nearest_neighbors')
print("Clustering sparse data with %s" % sp)
t0 = time()
sp.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

#metrics.normalized_mutual_info_score
print("Normalized: %0.3f" % metrics.normalized_mutual_info_score(labels, sp.labels_))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, sp.labels_))

print("Completeness: %0.3f" % metrics.completeness_score(labels, sp.labels_))

print()

Example 29
from sklearn import metrics
from sklearn.datasets import make_blobs  # samples_generator was removed from sklearn
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib as mpl

# Load cluster data

X = np.loadtxt('data1.txt')

# Compute SpectralClustering

spc = SpectralClustering(
    n_clusters=6,
    # eigen_solver='arpack',  # {None, 'arpack', 'lobpcg', 'amg'}
    kernel_params=None,       # dict of kernel kwargs, used when affinity is a callable
    assign_labels='kmeans',   # {'kmeans', 'discretize'}
    affinity="nearest_neighbors").fit(X)
# affinity may also be 'precomputed', 'rbf', or a callable kernel such as
# cosine, linear, polynomial, sigmoid, RBF, Laplacian or chi-squared.

# Hyperparameters:
# affinity matrix construction: distance and kernel;
# kernel parameter (scaling factor);
# number of clusters k;
# clustering method

# Leftover from a DBSCAN template; SpectralClustering has no core_sample_indices_.
# core_samples_mask = np.zeros_like(spc.labels_, dtype=bool)
# core_samples_mask[spc.core_sample_indices_] = True
Example 30
aaa = aaa.reshape(length, length)
aaa = np.transpose(aaa)
# aaa=np.log(aaa+1)
# aaa=(aaa-aaa.min())/(aaa.max()-aaa.min())

p = count_percent(D3, D2)
p = p * aaa
D = getD(p)
L = getL(D, p)
eigvec = getEigen(L, n)
eigvec = np.real(eigvec)
clf = KMeans(n_clusters=n)
s = clf.fit(eigvec)
C = s.labels_
print('processed data using sc ARI:', metrics.adjusted_rand_score(y, C))
print('NMI:', normalized_mutual_info_score(y, C))
print('ACC:', acc(y, C))

from sklearn.cluster import SpectralClustering

sc1 = SpectralClustering(n_clusters=n,
                         affinity='nearest_neighbors',
                         n_neighbors=KNN_for_neighbord)
sc1_labels = sc1.fit_predict(x1)  # cluster once and reuse the labels
print('SC KNN ARI:', metrics.adjusted_rand_score(y, sc1_labels))
c = 'ARI:' + str(metrics.adjusted_rand_score(y, C)) + '\n' + 'NMI:' + str(
    normalized_mutual_info_score(y, C)) + '\n'
c = c + 'ACC:' + str(acc(y, C)) + '\n' + 'SKARI:' + str(
    metrics.adjusted_rand_score(y, sc1_labels))
fh = open('performanceusoskinimproved.txt', 'w', encoding='utf-8')
fh.write(c)
fh.close()