Example 1
    #for j in : # I need to change this to run from 2 to the number of rows
    #    list_met.append(j)
    print("Computing silhouette " + str(i + 1) + "\r\n")
    numero_k_test = compare_k_AggClustering(List_k, train_np[:, 1:])
    #numero_k_test = int((4572/41148)*numero_k)
    #numero_k_test =10
    f = open("TESTE.txt", "a+")
    f.write("k-Teste:" + str(numero_k_test) + "\r\n")
    f.close()

    print("Loading Clustering....")
    #    print("Tempo inicio: "+ str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    print("Tempo inicio: " + str(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    clusterer = AgglomerativeClustering(
        n_clusters=numero_k_test, linkage="average",
        affinity='cosine')  #------------------------------ CHANGE
    clusterer.fit(test_np[:, 1:])

    #--------------------------- KMEANS -------##

    labels = []
    labels = clusterer.labels_
    #----------------------------------------------------------------------##
    #print(oi[0,:])
    #id_ =list(oi)
    id_ = test_np[:, 0]
    #print(oi[0,0])
    #print()
    #print("_______________________________")
# %%
# Create the dendrogram
# -------------------------
fig = ff.create_dendrogram(irisPCADF, color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()
# => We know the iris dataset contains three clusters, so the cutoff will be set at five to obtain three clusters.
#    We knew the number of clusters ahead of time; however, the cutoff line on the dendrogram seems
#    high in terms of distance. This is one of the difficulties of using a dendrogram.
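
# %%
# Illustrative sketch (not in the original notebook): cut the hierarchy at the distance
# threshold discussed above to obtain flat cluster labels. The 'ward' linkage used here
# is an assumption and may not match the linkage plotly used to draw the dendrogram, so
# the number of clusters obtained at a cutoff of five may differ.
from scipy.cluster import hierarchy

Z = hierarchy.linkage(irisPCADF.values, method="ward")
flat_labels = hierarchy.fcluster(Z, t=5, criterion="distance")  # cut at distance 5
print(len(set(flat_labels)), "clusters at this cutoff")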


# %%
# Hierarchical clustering
# ------------------------------
# agg = AgglomerativeClustering(n_clusters=7)
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(irisPCADF)

# %%
# Add a new class column to irisPCADF
irisPCADF["class"] = model.labels_
irisPCADF.head()

# %%
# Plot to show the results of the hierarchical clustering algorithm
irisPCADF.hvplot.scatter(
    x="Principal Component 1",
    y="Principal Component 2",
    hover_cols=["class"],
    by="class",
)
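
# %%
# Quick sanity check (not part of the original snippet): mean silhouette score of the
# three agglomerative clusters, computed on the two principal components only; the
# column names are taken from the plot above and the added "class" column is excluded.
from sklearn.metrics import silhouette_score

features = irisPCADF[["Principal Component 1", "Principal Component 2"]]
print("silhouette:", silhouette_score(features, model.labels_))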
Example 3
# In[10]:

plt.figure(figsize=(30, 10))

# In[11]:

plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index')
plt.ylabel('Distance')
sch.dendrogram(
    z,
    leaf_rotation=0.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

# In[12]:

# Now applying AgglomerativeClustering, choosing the number of clusters from the dendrogram
from sklearn.cluster import AgglomerativeClustering
h_complete = AgglomerativeClustering(n_clusters=7,
                                     linkage='complete',
                                     affinity="euclidean").fit(df_norm)

# In[13]:

cluster_labels = pd.Series(h_complete.labels_)
air['clust'] = cluster_labels  # create a new column holding the cluster labels
air = air.iloc[:, 1:]
air.head()
from sklearn import metrics
from sklearn import datasets
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, AffinityPropagation, SpectralClustering

data = datasets.load_digits()
X, y = data.data, data.target

algorithms = []
algorithms.append(KMeans(n_clusters=10, random_state=1))
algorithms.append(AffinityPropagation())
algorithms.append(
    SpectralClustering(n_clusters=10,
                       random_state=1,
                       affinity='nearest_neighbors'))
algorithms.append(AgglomerativeClustering(n_clusters=10))

data = []
for algo in algorithms:
    algo.fit(X)
    data.append(({
        'ARI':
        metrics.adjusted_rand_score(y, algo.labels_),
        'AMI':
        metrics.adjusted_mutual_info_score(y,
                                           algo.labels_,
                                           average_method='arithmetic'),
        'Homogeneity':
        metrics.homogeneity_score(y, algo.labels_),
        'Completeness':
        metrics.completeness_score(y, algo.labels_),
Example 5
import pandas as pd
from scipy import ndimage 
from scipy.cluster import hierarchy 
from scipy.spatial import distance_matrix 
from matplotlib import pyplot as plt 
from sklearn import manifold, datasets 
from sklearn.cluster import AgglomerativeClustering 
import numpy as np
from sklearn.datasets import make_blobs
%matplotlib inline

#generate random data
X1, y1 = make_blobs(n_samples=50, centers=[[4,4], [-2, -1], [1, 1], [10,4]], cluster_std=0.9)
plt.scatter(X1[:, 0], X1[:, 1], marker='o') 

#set up the model
agglom = AgglomerativeClustering(n_clusters = 4, linkage = 'average')
#fit
agglom.fit(X1,y1)
#plot
# Create a figure of size 6 inches by 4 inches.
plt.figure(figsize=(6,4))

# These two lines of code are used to scale the data points down,
# Or else the data points will be scattered very far apart.

# Create a minimum and maximum range of X1.
x_min, x_max = np.min(X1, axis=0), np.max(X1, axis=0)

# Min-max scale X1 so all values fall between 0 and 1.
X1 = (X1 - x_min) / (x_max - x_min)
svd = TruncatedSVD(n_components=k)
get_ipython().run_line_magic('time', 'U = svd.fit_transform(X)')
S = svd.singular_values_
V = svd.components_
print(U.shape, S.shape, V.shape)
sorted(show_topics(V, terms))  #Show the topics obtained with SVD Factorization

# ### 2.7- CLUSTERING
# <a id="clustering"></a>

# In[53]:

k = 4  #Number of clusters chosen assuming each pair of books could be around one cluster
get_ipython().run_line_magic(
    'time', 'X2 = TruncatedSVD(n_components=800).fit_transform(X)')
agg = AgglomerativeClustering(n_clusters=k)

# In[54]:

sample = np.random.choice(len(X2), replace=False, size=3000)
get_ipython().run_line_magic('time',
                             'agg_sample = agg.fit_predict(X2[sample])')

# In[55]:

# let's get the centroid/average of each cluster
centroids = np.array(
    [X2[sample][agg_sample == c].mean(axis=0) for c in range(k)])
print(centroids.shape)

# In[56]:
Example 7
if args.method == 'scikit':
    X = np.array(list(all_reps.values()))

    if True:
        D = dist.squareform(dist.pdist(X, 'euclidean'))
        plt.imshow(D)
        plt.show()

    ms = MeanShift(bandwidth=0.9)
    ms.fit(X)

    labels_ms = ms.labels_
    cluster_centers = ms.cluster_centers_

    # AgglomerativeClustering
    clustering = AgglomerativeClustering(linkage='ward')
    clustering.fit(X)

    labels = clustering.labels_

    labels_unique = np.unique(labels)
    n_clusters = len(labels_unique)

    print(n_clusters)
    print(labels)

    all_mean = X.mean(axis=0)
    main_mean = X[labels==0].mean(axis=0)
    others_mean = X[labels!=0].mean(axis=0)

    d1 = np.linalg.norm(all_mean-main_mean)
Example 8
        pic_output = 'E:/Work/python/'  # prefix for the probability-density plot file names
        for i in range(k):
            print(u'%s%s.png' % (pic_output, i))
            if i > 1:
                density_plot(data[r[u'聚类类别'] == i]).savefig(u'%s%s.png' %
                                                            (pic_output, i))

if __name__ == "__main__2":
    print("AgglomerativeClustering")
    for k in range(mink, maxk):  # for each k, run the clustering and compare the within-cluster error across the different k values
        outputfile = 'E:/Work/python/fenlei%d.xlsx' % k
        # read the data and run the cluster analysis
        # call the hierarchical clustering algorithm for the analysis
        linkages = ['ward', 'average', 'complete']
        kmodel = AgglomerativeClustering(linkage=linkages[2],
                                         n_clusters=k,
                                         affinity='manhattan')

        kmodel.fit(data)  # fit the model
        r1 = pd.Series(kmodel.labels_).value_counts()  # count the number of samples in each cluster
        print(r1)
        #r2 = pd.DataFrame(kmodel.cluster_centers_)  # find the cluster centers

        kmlabels = kmodel.labels_
        r = pd.concat([data, pd.Series(kmlabels, index=data.index)],
                      axis=1)  # attach each sample's cluster label via horizontal concatenation (axis=0 would be vertical)
        r.columns = list(data.columns) + [u'聚类类别']  # rename the header
        #print(outputfile)
        r.to_excel(outputfile)  # save the classification result

        if mink >= maxk - 1:
def clustering(refFiles, species, predFiles, sppred, predAn, param, nbC=1000, nbC2=None, save='show', var=0.05,
               gating='line', showgat=None, method=1, fc='Accuri',channels=[], dicChannels={}):
    """

    :param refFiles:
    :param species:
    :param predFiles:
    :param sppred:
    :param predAn:
    :param param:
    :param nbC:
    :param nbC2:
    :param save:
    :param var:
    :param gating:
    :param showgat:
    :param method:
    :param fc:
    :return:
    """
    fileNames = [a.split('/')[-1][:-4]for a in predFiles]
    # create directory & file option
    cwd = 'Results/'
    now = f.datetime.now()
    dirName = now.strftime("%Y%m%d-%H_%M_%S/")
    f.os.mkdir(cwd + dirName)
    cwd = cwd + dirName
    if showgat and save is not None:
        showgat = save
    else:
        showgat = None
    f.fileOption(cwd, refFiles, species, predFiles, sppred, nbC, nbC2, gating, 'prediction', '', var, 0, True, 0,channels=channels,dicChannels=dicChannels)
    # ## import and treat data
    refArrays = f.importFile(refFiles, gating=gating, save=showgat, fc=fc, cwd=cwd, channels=channels,dicChannels=dicChannels)
    if refArrays == []:
        return 'None'
    predArrays = f.importFile(predFiles, gating=gating, save=showgat, fc=fc, cwd=cwd, channels=channels,dicChannels=dicChannels)
    if predArrays == []:
        return 'None'
    Data = []
    targetPred = []
    if predAn == 'prediction':
        for anArray in predArrays:
            print(1)
            data2, atarget2, species2 = f.treat([anArray], sppred, None, mode='clustering', cluster=True)
            Data.append(data2)
            targetPred.append(atarget2)
            print(2)
    else:
        print(1.1)
        data2, target2, species2 = f.treat(predArrays, sppred, nbC2, mode='clustering', cluster=True)
        Data.append(data2)
        targetPred.append(target2)
        print(1.2)
    dataRef, targetRef, species = f.treat(refArrays, species, nbC, mode='analysis', cluster=True)
    print(3)

    # ##################CALCULATION############################################
    nmax = len(species)
    n = nmax
    species.sort()
    # reference mean point:
    print(4)
    r = f.pd.concat([dataRef, targetRef], axis=1, join='inner')
    mpRef = r.groupby('SPECIES').mean()
    print(5)
    if method == 1:
        for k in range(len(Data)):
            S = []
            P = []
            for nb in range(2, n + 1):
                clusters = AgglomerativeClustering(n_clusters=nb).fit(Data[k])
                predictL = clusters.labels_
                P.append(predictL)
                S.append((silhouette_score(Data[k], predictL)))

            posMax = S.index(max(S))
            clust_nb = posMax + 2
            clusterSP, dicSP, distance = annotation(Data[k], P[posMax], clust_nb, nmax, mpRef, param, species)

            label = P[posMax]
            print(5.1)
            if clust_nb == 2:
                clusters = AgglomerativeClustering(n_clusters=1).fit(Data[k])
                predictL = clusters.labels_
                spp, dsp, dist = annotation(Data[k], predictL, 1, nmax, mpRef, param, species)

                if f.isBetter(clusterSP, dicSP, spp, dsp, var):
                    clust_nb = 1
                    distance = dist
                    label = predictL

            clusterSP = assignCluster(distance, species)
            predLabel = []
            i = 0
            for i in range(len(label)):
                print(5.2)
                if clusterSP[label[i]] != 0:
                    predLabel.append(clusterSP[label[i]])
                else:
                    predLabel.append(label[i])
            allsp = species + [i for i, x in enumerate(clusterSP) if x == 0]
            if predAn == 'analysis':
                conf = f.r.confusion_matrix(targetPred[k], predLabel, allsp)
                pA = f.pd.DataFrame({'exp': targetPred[k], 'pred': predLabel})
                pA.to_csv(cwd + 'results.csv', sep=';', mode='a')
                f.cmFile(conf, allsp, cwd, 'Prediction CM')
                f.plotConfusionMatrix(conf, allsp, save, cwd, normalize=True,
                                      name='CM with' + str(len(species)) + ' species', predAn='analysis')
            # newSp = list(set(species)) TODO check why I created it

            statistics = f.statAnalysis(predLabel, targetPred[k], allsp)

            f.exportStatistics(statistics, [''], cwd, 'predict')
            f.assessmentValue([statistics], allsp, cwd, [''], 'predict')
            if predAn == 'analysis' and save is not None:
                print(5.3)
                f.graph3d(Data[k], predLabel, targetPred[k], allsp + ['unknown'], param, statistics, save, cwd, 0,
                            name='with' + ' '.join(species), predtype=predAn, clust=True)
            elif predAn == 'prediction' and save is not None:
                print(5.4)
                f.graph3dRef(Data[k], predLabel, label, allsp + list(range(0, clust_nb)), param, statistics, save,
                               cwd,
                               refdata=dataRef, reflabel=targetRef, repeat=0,
                               name='with' + ' '.join(species), predtype=predAn,
                               clust=True)  # TODO: change the graphs to use the cluster numbers

    elif method == 2:
        print(6)
        # reference mean point:
        for k in range(len(Data)):
            print(6.1)
            clusterSP = []
            dicSP = {}
            label = []
            clust_nb = 0
            distance = []
            predLabel = []
            n = nmax
            label =[]
            while n > 0:
                print(6.2)
                # ##PREDICTION##
                clusters = AgglomerativeClustering(n_clusters=n).fit(Data[k])
                predictL = clusters.labels_
                if n > 1:
                    print(n, ':', silhouette_score(Data[k], predictL))

                spp, dsp, dist = annotation(Data[k], predictL, n, nmax, mpRef, param, species)

                if f.isBetter(clusterSP, dicSP, spp, dsp, var):
                    dicSP = dsp
                    clust_nb = n
                    distance = dist
                    n = n - 1
                    label = predictL

                else:
                    n = 0
            clusterSP = assignCluster(distance, species)
            print(6.3)
            i = 0
            for i in range(len(label)):
                if clusterSP[label[i]] != 0:
                    predLabel.append(clusterSP[label[i]])
                else:
                    predLabel.append(label[i])
            print(6.4)
            # #### ASSESSMENT PART, GRAPH AND ACCURACY ####
            allsp = species + list(range(0, clust_nb))
            if predAn == 'analysis':
                print(6.7)
                conf = f.r.confusion_matrix(targetPred[k], predLabel, allsp)
                print(6.71)
                pA = f.pd.DataFrame({'exp': targetPred[k], 'pred': predLabel})
                print(6.72)
                pA.to_csv(cwd + 'results.csv', sep=';', mode='a')
                print(6.73)
                # newSp = list(set(species)) TODO why same question
                print(6.74)
                f.plotConfusionMatrix(conf, allsp, save, cwd, normalize=True, name='CM with' + str(len(species)) + ' species', predAn='analysis')
                print(6.75)
                f.cmFile(conf, allsp, cwd, 'Prediction CM')
            print(6.8)
            statistics = f.statAnalysis(predLabel, targetPred[k], allsp)
            f.exportStatistics(statistics, [''], cwd, 'predict')
            f.assessmentValue([statistics], allsp, cwd, [''], 'predict')
            if predAn == 'analysis' and save is not None:
                print(6.9)
                f.graph3d(Data[k], predLabel, targetPred[k], allsp + ['unknown'], param, statistics, save, cwd, 0,
                            name='with' + ' '.join(species), predtype=predAn, clust=True)
            elif predAn == 'prediction' and save is not None:
                print(6.10)
                f.graph3dRef(Data[k], predLabel, label, allsp, param, statistics, save, cwd, refdata=dataRef,
                               reflabel=targetRef, repeat=0, name=fileNames[k], predtype=predAn,
                               clust=True)
                # TODO: change the graphs to use the cluster numbers

    return f.os.getcwd().replace('\\', '/') + '/' + cwd
Example 10
def hierarchical_clustering(test_df, cluster_num, item_size, affinity_type, linkage_type):
    hierarchy_cluster = AgglomerativeClustering(affinity=affinity_type, n_clusters=cluster_num, linkage=linkage_type).fit(test_df.values[:, :item_size])
    P = hierarchy_cluster.labels_
    dfn = test_df.copy()
    dfn['cluster'] = P + 1
    return dfn
def WSI_Performance(Semeval_evaluation_directory, n_clusters, filedir,
                    topic_JS, suffix_of_inferrred_folders):
    print('######### Results with #clusters = ' + str(n_clusters) +
          ' ############')
    wtfile_name = Semeval_evaluation_directory + '/unsup_eval/WSI_predicted_' + str(
        n_clusters) + 'cluster.txt'
    wtfile = open(wtfile_name, 'w')

    #Run for both the noun and verb tasks in Semeval 2010 dataset
    for subfolder in ['verbs', 'nouns']:
        rootdir = '/mnt/WSI_new/' + subfolder + '/parsed_files/'
        list_of_folders = []
        for subdir, dirs, files in os.walk(rootdir):
            if '.clean.tsvd.topic.1e-2_new_KL_exp' in subdir:
                list_of_folders.append(subdir)

        for folder in list_of_folders:
            f = open(folder + '/inferred_topics.txt')
            word = folder.strip().split('/')[-1].split('.')[0]
            doc = {}
            min_doc = 100
            topic_pool = []

            for line in f:
                elements = line.strip().split()
                try:
                    tmp = doc[int(elements[0])]
                except:
                    doc[int(elements[0])] = int(elements[1]) - 1
                    topic_pool.append(int(elements[1]) - 1)

                if int(elements[0]) < min_doc:
                    min_doc = int(elements[0])

            topic_pool = np.unique(np.asarray(topic_pool))
            reverse_index = {}

            for i in range(len(topic_pool)):
                reverse_index[topic_pool[i]] = i

            sim_matrix = topic_JS[topic_pool]
            sim_matrix = sim_matrix[:, topic_pool]

            clustering = AgglomerativeClustering(
                n_clusters=min(n_clusters, len(topic_pool)),
                affinity='precomputed',
                linkage='complete').fit(sim_matrix)
            group = clustering.labels_

            for key in doc:
                wtfile.write(word + '.' + subfolder[0] + ' ' + word + '.' +
                             subfolder[0] + '.' + str(key - min_doc + 1) +
                             ' ' + word + '.n.' +
                             str(group[reverse_index[doc[key]]] + 1) + '\n')
            f.close()
    wtfile.close()
    os.system('java -jar ' + Semeval_evaluation_directory +
              '/unsup_eval/vmeasure.jar ' + wtfile_name + ' ' +
              Semeval_evaluation_directory + '/unsup_eval/keys/all.key all')
    os.system('java -jar ' + Semeval_evaluation_directory +
              '/unsup_eval/fscore.jar ' + wtfile_name + ' ' +
              Semeval_evaluation_directory + '/unsup_eval/keys/all.key all')
df = preProcess(df)
print("Pre Processing Time Taken: " + str(time.time() - start_time))
# Get the attribute for processing

for catId in range(len(catList)):
    catbool = df['category']== catList[catId]
    sectDf = df[catbool]
    attribute = sectDf['attributes'].values
    masterDict = at.createAttributeList(attribute)
    #print(masterDict)
    saveDictToFile(masterDict,'MasterDictionary' + str(catList[catId]) + '.csv')
    #np.set_printoptions(threshold=sys.maxsize)
    dataMatrix = tk.tokenizeData(sectDf,masterDict)
    print("Starting Clustering for Group : " + str(catList[catId]) + "Time: " + str(time.time() - start_time))

    cluster = AgglomerativeClustering(n_clusters = None,
    distance_threshold = 0.005,compute_full_tree=True, linkage = "ward")
    cluster.fit_predict(dataMatrix)
    labels = cluster.labels_
    print("End of Clustering for Group : " + str(catList[catId]) + "Time: " + str(time.time() - start_time))

    clusterDict = dict()

    index1 = sectDf['index'].values
    for i in range(len(index1)):
        clusterDict[index1[i]] = labels[i]
    fname = 'ClusterAssignment_Group_' + str(catId) + '.csv'
    saveDictToFile(clusterDict,fname)


'''
    print(np.unique(labels))
Example 13
def run(load_sims=False, out_loc=OUT_LOC):

    if not load_sims:

        # train on patentsview
        print('Querying Author Map')
        mycursor.execute("USE aps")

        mycursor.execute("SELECT COUNT(record_id) FROM authors_raw")
        res = mycursor.fetchall()
        rows = res[0][0]
        mycursor.execute("SELECT record_id, firstname, surname, name FROM authors_raw")

        A_train = {}
        A_papers = {}
        i = 0
        Xs = mycursor.fetchall()
        for x in Xs:
            if i % 10000 == 0:
                flushPrint(i/10000, rows//10000)
            if x[2] is not None:
                surname = str(x[2].decode('utf-8'))
                if x[1] is not None:
                    firstname = str(x[1].decode('utf-8'))
                    if len(firstname) > 0:
                        fi = firstname[0].lower()
                    else:
                        fi = ''
                else:
                    fi = ''
                author = Author(fi + ' ' + surname.lower(), i, firstname=firstname, surname=surname)
            else:
                try:
                    name = str(x[3].decode('utf-8'))
                    author = Author(name.lower(), i)
                except AttributeError:
                    print(x[1], x[2], x[3])
                    continue
            publication = str(x[0].decode('utf-8'))
            if publication in A_papers:
                A_papers[publication].append(author)
            else:
                A_papers[publication] = [author]
            if author in A_train:
                A_train[author].append(publication)
            else:
                A_train[author] = [publication]
            i += 1

        print('\n Querying Reference/Citation Map')

        mycursor.execute('SELECT COUNT(citing_doi) FROM citations_raw')
        res = mycursor.fetchall()
        rows = res[0][0]

        q = 'SELECT citing_doi, cited_doi FROM citations_raw'
        mycursor.execute(q)

        C_train = {}
        R_train = {}
        Xs = mycursor.fetchall()
        i = 0
        for x in Xs:
            if i % 10000 == 0:
                flushPrint(i/10000,rows//10000)
            try:
                citing = str(x[0].decode('utf-8'))
                cited = str(x[1].decode('utf-8'))
            except AttributeError:
                print('Continuing', i)

            if cited in C_train:
                C_train[cited].append(citing)
            else:
                C_train[cited] = [citing]
            if citing in R_train:
                R_train[citing].append(cited)
            else:
                R_train[citing] = [cited]
            i += 1

        with open(os.path.join(out_loc, 'A_train.pkl'), 'wb') as f:
            pickle.dump(A_train, f)
        with open(os.path.join(out_loc, 'A_papers.pkl'), 'wb') as f:
            pickle.dump(A_papers, f)
        with open(os.path.join(out_loc, 'R_train.pkl'), 'wb') as f:
            pickle.dump(R_train, f)
        with open(os.path.join(out_loc, 'C_train.pkl'), 'wb') as f:
            pickle.dump(C_train, f)

    else:

        with open(os.path.join(out_loc,'A_train.pkl'), 'rb') as f:
            A_train = pickle.load(f)
        with open(os.path.join(out_loc, 'A_papers.pkl'), 'rb') as f:
            A_papers = pickle.load(f)
        with open(os.path.join(out_loc, 'R_train.pkl'), 'rb') as f:
            R_train = pickle.load(f)
        with open(os.path.join(out_loc, 'C_train.pkl'), 'rb') as f:
            C_train = pickle.load(f)


    print('\n Beginning Similarity Calculations ... ')
    Sims = {}
    Labels={}
    Pixmaps = {}
    disambs = {}
    rows = len(A_train)
    n=0
    es = list(A_train.keys())
    for e in es:
        if n%10==0:
            flushPrint(n/10, rows//10) #868
        # merge all papers
        papers = A_train[e]
        if 2<=len(papers):
            sim = np.zeros((len(papers),len(papers)))
            # sim = lil_matrix((len(papers),len(papers)))
            pixmap = {papers[i]:i for i in range(len(papers))}
            rev_pixmap = {i:papers[i] for i in range(len(papers))}
            for i,j in itertools.combinations(papers, 2):
                try:
                    sas = sa(A_papers,i,j)
                except Exception as ex:
                    # print('Error in sas', ex)
                    sas = 0.0
                try:
                    srs = sr(R_train,i,j)
                except Exception as ex:
                    # print('Error in srs', ex)
                    srs = 0.0
                try:
                    scs = sc(C_train,i,j)
                except Exception as ex:
                    # print('Error in scs', ex)
                    scs = 0.0
                try:
                    sxs = sx(R_train,i,j)
                except Exception as ex:
                    # print('Error in sxs', ex)
                    sxs = 0.0
                try:
                    sim[pixmap[i],pixmap[j]] = alpha_A*sas + alpha_R*srs + alpha_C*scs + alpha_S*sxs
                    # sim[pixmap[j],pixmap[i]] = alpha_A*sas + alpha_R*srs + alpha_C*scs + alpha_S*sxs
                except Exception as ex:
                    print(ex)


        del A_train[e]
        n+=1

        clustering = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='single', distance_threshold=beta_1)
        clustering2 = AgglomerativeClustering(n_clusters=None, affinity='precomputed', linkage='single', distance_threshold=beta_3)
        X = symmetrize(sim)
        labels = clustering.fit_predict(X)
        n_clusters = clustering.n_clusters_
        if n_clusters > 2:
            S_Sims = np.zeros(shape=(n_clusters,n_clusters))
            for c,d in itertools.combinations(list(range(n_clusters)),2):
                S = 0
                cidxs = np.argwhere(labels == c)
                didxs = np.argwhere(labels == d)
                for cidx in cidxs:
                    for didx in didxs:
                        i = rev_pixmap[cidx[0]]
                        j = rev_pixmap[didx[0]]
                        try:
                            sas = sa(A_papers,i,j)
                        except Exception as ex:
                            # print('Error in sas', ex)
                            sas = 0.0
                        try:
                            srs = sr(R_train,i,j)
                        except Exception as ex:
                            # print('Error in srs', ex)
                            srs = 0.0
                        try:
                            scs = sc(C_train,i,j)
                        except Exception as ex:
                            # print('Error in scs', ex)
                            scs = 0.0
                        try:
                            sxs = sx(R_train,i,j)
                        except Exception as ex:
                            # print('Error in sxs', ex)
                            sxs = 0.0
                        s = alpha_A*sas + alpha_R*srs + alpha_C*scs + alpha_S*sxs
                        if s > beta_2:
                            S += s/(cidxs.shape[0]*didxs.shape[0])
                            S_Sims[c,d] = S
                            S_Sims[d,c] = S
            labels2 = clustering2.fit_predict(S_Sims)
            n_clusters_2 = clustering2.n_clusters_
            t_disam = {}
            for i in range(n_clusters_2):
                u_paper_idxs = np.argwhere(labels2 == i)
                t_disam[i] = []
                for idx in u_paper_idxs:
                    cidxs = np.argwhere(labels == idx)
                    for pidx in cidxs:
                        t_disam[i].append(rev_pixmap[pidx[0]])
            disambs[e] = t_disam
        else:
            disambs[e] = {0:[rev_pixmap[i] for i in range(sim.shape[0])]}

    with open(os.path.join(out_loc, 'Disambiguated.pkl'), 'wb') as f:
        pickle.dump(disambs, f)
def find_clusters(g,
                  nclusters,
                  rho_list,
                  alpha_list,
                  localmethod: str = 'l1reg-rand',
                  normalize: bool = False,
                  normalized_objective: bool = False,
                  cpp: bool = True,
                  epsilon: float = 1.0e-2,
                  iterations: int = 10000000,
                  nsamples_from_rho: int = 50,
                  nsamples_from_alpha: int = 50,
                  linkage: str = 'average',
                  norm_type: int = 2,
                  njobs: int = 1,
                  prefer: str = 'threads',
                  backend: str = 'multiprocessing',
                  metric: str = 'euclidean'):
    """
    Find clusters in a graph using local graph clustering.
    --------------------------------

    This method runs local graph clustering for each node in the graph in parallel.
    It aggregates the embeddings and computes a pairwise distance matrix,
    then uses agglomerative clustering to find the clusters.

    Parameters
    ----------

    g: GraphLocal

    nclusters: int
        Number of clusters to be returned
        
    rho_list: 2D list of floats
        This is an interval of rhos, the regularization parameter for l1-regularized PageRank.
        The first element should be smaller than the second element of the list.
        
    alpha_list: 2D list of floats
        This is an interval of alphas, the teleportation parameter for l1-regularized PageRank.
        The first element should be smaller than the second element of the list.
        The smaller alpha is, the more global the personalized PageRank vector is.

    Parameters (optional)
    ---------------------
        
    nsamples_from_rho: int
        Number of samples of rho parameters to be selected from interval rho_list.
        
    nsamples_from_alpha: int
        Number of samples of alpha parameters to be selected from interval alpha_list.

    localmethod: string
        Default = 'l1reg-rand'
        Which method to use.
        Options: 'l1reg', 'l1reg-rand'.
        
    iterations: int
        Default = 10000000
        Maximum number of iterations of ACL algorithm.
        
    epsilon: float
        Default = 1.0e-2
        Tolerance for localmethod

    normalize: bool
        Default = False
        Normalize the output to be directly input into sweepcut routines.

    normalized_objective: bool
        Default = False
        Use normalized Laplacian in the objective function, works only for "method=l1reg" and "cpp=True"
        
    cpp: bool
        Default = True
        If true calls the cpp code for approximate pagerank, otherwise, it calls the python code.

    linkage: str
        Default = 'average'
        Which linkage criterion to use for agglomerative clustering. 
        For other options check: 
        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html 
        
    metric: str
        Default = 'euclidean'
        Metric for measuring distances among nodes.
        For details check:
        https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
        
    norm_type: int
        Default = 2
        Norm for normalization of the embeddings.
        
    njobs: int
        Default = 1
        Number of jobs to be run in parallel
        
    prefer, backend: str
        Check documentation of https://joblib.readthedocs.io/en/latest/


    Returns
    -------

    labels: np.ndarray
    An np.ndarray of the cluster allocation of each node.
    For example labels[i] is the cluster of node i.
    """

    n = g._num_vertices

    #     is_weighted = g._weighted

    if njobs > 1:
        results = Parallel(n_jobs=njobs, prefer=prefer, backend=backend)(
            delayed(compute_embedding)
            (g, node, rho_list, alpha_list, nsamples_from_rho,
             nsamples_from_alpha, localmethod, normalize, normalized_objective,
             epsilon, iterations, cpp) for node in range(n))
    else:
        results = [
            compute_embedding(g, node, rho_list, alpha_list, nsamples_from_rho,
                              nsamples_from_alpha, localmethod, normalize,
                              normalized_objective, epsilon, iterations, cpp)
            for node in range(n)
        ]

    sum_ = 0
    JA = [0]
    IA = []
    A = []
    for data in results:
        vec = data[1] / np.linalg.norm(data[1], norm_type)
        how_many = len(data[0])
        sum_ += how_many
        JA.append(sum_)
        IA.extend(list(data[0]))
        A.extend(list(vec))

    X = sp.sparse.csc_matrix((A, IA, JA), shape=(n, n))

    X = X.transpose()

    Z = pairwise_distances(X, metric=metric, n_jobs=njobs)

    clustering = AgglomerativeClustering(n_clusters=nclusters,
                                         affinity="precomputed",
                                         linkage=linkage).fit(Z)
    labels = clustering.labels_

    return labels
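

# Hypothetical usage sketch, not part of the original module: `GraphLocal` is assumed
# to come from the localgraphclustering package and to be built from an edge-list file
# as in that package's examples (the file name and separator below are placeholders).
# The rho_list / alpha_list intervals follow the docstring above.
if __name__ == "__main__":
    import localgraphclustering as lgc
    g = lgc.GraphLocal("example.edgelist", "edgelist", " ")
    example_labels = find_clusters(g,
                                   nclusters=3,
                                   rho_list=[1.0e-4, 1.0e-2],
                                   alpha_list=[0.1, 0.5],
                                   nsamples_from_rho=5,
                                   nsamples_from_alpha=5)
    print(np.unique(example_labels, return_counts=True))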
Example 15
    data = []
    vocabulary = {}
    for datum in json_data:
        nameList.append(datum['name'])
        for label in datum['labels_m']:
            index = vocabulary.setdefault(label, len(vocabulary))
            indices.append(index)
            data.append(datum['labels_m'][label])
        indptr.append(len(indices))
    X = csr_matrix((data, indices, indptr), dtype=float).toarray()
    # initialize data if first
    if n == 0:
        for name in nameList:
            accData[name] = []
    # clustering
    AC = AgglomerativeClustering(n_clusters=NUM_OF_CLUSTERS)
    result = AC.fit(X[0:14400])
    print(result.labels_)

    # lenX = len(X)
    # result_total   = []
    # for i in range(DIVISION):
    #     start = int(lenX * i / DIVISION)
    #     end = int(lenX * (i + 1) / DIVISION)
    #     result_trial = AC.fit_predict(X[start: end])
    #     result_total.extend(result_trial)
    # # push data into accData
    # for j in range(len(nameList)):
    #     accData[nameList[j]].append(result_total[j])

# print(accData[list(accData.keys())[0]])
Example 16
descrps = ['euclidean_distance', 'cosine_distance', 'manhattan_distance']
wts = [0, 0.25]
#wt=0.25
for func, ddes in zip(funcs, descrps):
    for wt in wts:
        dp, dv = my_dist(data1, func, wt)
        del dv
        dmtx = dp
        print(f'{ddes}_wt_{wt}')
        K = range(0, 1001, 50)
        distance = []
        for k in K:
            n = 2 if k == 0 else k
            print(n)
            clstmodel = AgglomerativeClustering(n_clusters=n,
                                                affinity="precomputed",
                                                linkage="average").fit(dmtx)
            tot_d = 0  #total distance between each node
            tot_n = 0  #total # of connections
            for i in range(n):
                #calculate average distance
                subdmtx = dmtx[clstmodel.labels_ == i]
                subdmtx = subdmtx[:, clstmodel.labels_ == i]
                tot_d = tot_d + np.sum(np.triu(subdmtx))
                tot_n = tot_n + np.sum(np.triu(subdmtx) != 0)
            distance.append(
                tot_d /
                tot_n)  # average distance between any two connected data points
        wt1 = int(wt * 100)
        dstns = plt.figure()
        plt.plot(K, distance, 'bx-')
Example 17
def cluster(signatures, clusters):
    clusterer = AgglomerativeClustering(n_clusters=clusters)
    return clusterer.fit_predict(signatures)
Example 18
def Hier_BarGraph_2(Data, title, folder, **kwargs):

    Nodes = kwargs['Dims']

    NODES = []
    for node in open('sclcnetwork.ids').readlines():
        NODES.append(str(node.split('\t')[0].strip()))
    upper_square = [
        'ASCL1', 'ATF2', 'CBFA2T2', 'CEBPD', 'ELF3', 'ETS2', 'FOXA1', 'FOXA2',
        'FLI1', 'INSM1', 'KDM5B', 'LEF1', 'MYB', 'OVOL2', 'PAX5', 'PBX1',
        'POU3F2', 'SOX11', 'SOX2', 'TCF12', 'TCF3', 'TCF4', 'NEUROD1'
    ]
    lower_square = [i for i in NODES if i not in upper_square]

    top = upper_square + lower_square
    NODES = top + ["POU2F3", "YAP1"]

    if Nodes == ['']:
        Nodes = NODES

    colours = [
        "#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"
    ]

    Remove = []
    for node in NODES:
        if not node in list(Data.index):
            Remove.append(node)

    for node in Remove:
        try:
            Nodes.remove(node)
        except ValueError:
            pass
        try:
            upper_square.remove(node)
        except ValueError:
            pass
        try:
            lower_square.remove(node)
        except ValueError:
            pass

    NODES = upper_square + lower_square + ["POU2F3", "YAP1"]

    if not os.path.exists(Path(folder, "bargraph")):
        os.mkdir(Path(folder, "bargraph"))
    data_n = Data.loc[Nodes]
    data_n = data_n.astype(float)
    scaled_data_n = preprocessing.scale(data_n.T)

    data_N = Data.loc[NODES]
    data_N = data_N.astype(float)
    scaled_data_N = preprocessing.scale(data_N.T)

    columns = np.array(data_n.columns)
    Data_n = pd.DataFrame(scaled_data_n.T, index=Nodes, columns=columns).T

    hc_labels_n = []

    patches = []
    for i in range(len(Nodes)):
        patches.append(mpatches.Patch(color=colours[i], label=Nodes[i]))

    for h in range(2, 7):

        hc_n = AgglomerativeClustering(n_clusters=h,
                                       affinity='euclidean',
                                       linkage='ward')
        y_n = hc_n.fit_predict(scaled_data_n)
        hc_labels_n.append(np.array(hc_n.labels_))

    fig, ax = plt.subplots()
    sns.set_context("paper", font_scale=1.5)

    for h in range(2, 7):

        ticks = []
        tick_labels = []
        x = np.arange(len(NODES)) * 3

        for i in range(h):
            #ticks.append(x[int(len(NODES)/2)])
            #tick_labels.append(np.sum(hc_labels_n[h-2] == i))
            y = np.array(
                np.mean(scaled_data_N[hc_labels_n[h - 2] == i], axis=0))
            y_error = np.array(
                np.std(scaled_data_N[hc_labels_n[h - 2] == i], axis=0)) / (
                    (scaled_data_N[hc_labels_n[h - 2] == i]).shape[0])**(0.5)
            barlist = ax.bar(x, y, yerr=y_error, width=2.5, label=str(i))
            #fill the colors
            for n in range(0, len(upper_square)):
                barlist[n].set_color('r')
            for n in range(1 + len(upper_square),
                           len(upper_square) + len(lower_square)):
                barlist[n].set_color('b')
            barlist[len(upper_square)].set_color('orange')
            barlist[-2].set_color('y')
            barlist[-1].set_color('g')

            ax.set_xticks(x)
            ax.set_xticklabels(range(len(NODES)))
            plt.setp(ax.get_xticklabels(), fontsize='8')
            ax.set_xlabel('Nodes')
            ax.set_ylabel("Expression value")
            ax.set_title(title +
                         ": Exp_of_All_Genes_hier={}_cluster={}".format(h, i))
            plt.savefig(Path(
                folder, "bargraph", title +
                "_Exp_of_All_genes_hier_{}_cluster_{}.png".format(h, i)),
                        format='png',
                        bbox_inches="tight")
            plt.cla()
            #x = x + 0.6
        '''
Example 19
mAP_l2 = []
acc1_l2 = []
acc10_l2 = []

mAP_cos = []
acc1_cos = []
acc10_cos = []

mAP_corr = []
acc1_corr = []
acc10_corr = []

for n in n_list:
    clustering = AgglomerativeClustering(n_clusters=n,
                                         affinity='euclidean',
                                         linkage='ward',
                                         distance_threshold=None)
    clustering_train = clustering.fit(original_train.T)
    train_labels = clustering_train.labels_
    #n_connected = clustering_train.n_connected_components_
    train_labels = np.array(train_labels)

    optimised_train_labels = hungarian_algo(train_labels, Y_train, n)
    #print(optimised_train_labels)

    centroid_array = np.zeros((2576, 32))

    for i in range(len(optimised_train_labels)):
        index = optimised_train_labels[i] - 1
        centroid_array[:, index] += original_train[:, i]
Example 20
def Implement_Algorithm(algorithm):
    global G_X_Dataset, G_y_Dataset, G_TIME_Dataset, G_SSE_Dataset, G_CSM_Dataset, G_DBI_Dataset
    print("\n#Implement_Algorithm( " + algorithm + " )")

    if algorithm == "K means":
        for k in n_Clusters:
            print(algorithm, 'Cluster = ', k)
            for i in range(0, 4):
                starttime = time.time()
                algo = KMeans(n_clusters=k,
                              init='k-means++',
                              n_init=10,
                              max_iter=len(G_X_Dataset[i]),
                              random_state=0)
                y_pred = algo.fit_predict(G_X_Dataset[i])
                endtime = time.time()
                G_TIME_Dataset[i].append(endtime - starttime)  # Time Taken
                G_SSE_Dataset[i].append(algo.inertia_)  # SSE
                score = silhouette_score(G_X_Dataset[i],
                                         y_pred,
                                         metric='euclidean')
                G_CSM_Dataset[i].append(score)  # CSM
                #print('Time Taken of Dataset', i+1, 'is ', G_TIME_Dataset[i])
                #print('SSE of Dataset', i+1, 'is ', G_SSE_Dataset[i])
                #print('CSM of Dataset', i+1, 'is ', G_CSM_Dataset[i])

    elif algorithm == "DBSCAN":
        #test_params_DBSCAN(np.int(0))
        #test_params_DBSCAN(np.int(1))
        #test_params_DBSCAN(np.int(2))
        #test_params_DBSCAN(np.int(3))
        for k in n_Clusters:
            for i in range(0, 4):
                starttime = time.time()
                if i == 0: algo = DBSCAN(eps=1.2, min_samples=6)
                elif i == 1: algo = DBSCAN(eps=1.2, min_samples=6)
                elif i == 2: algo = DBSCAN(eps=1.2, min_samples=6)
                elif i == 3: algo = DBSCAN(eps=1.2, min_samples=6)
                y_pred = algo.fit_predict(G_X_Dataset[i])
                endtime = time.time()
                if len(Counter(y_pred)) == 1:
                    print('Error!', Counter(y_pred))
                    break
                G_TIME_Dataset[i].append(endtime - starttime)  # Time Taken
                G_DBI_Dataset[i].append(
                    davies_bouldin_score(G_X_Dataset[i], y_pred))  # DBI
                G_CSM_Dataset[i].append(
                    silhouette_score(G_X_Dataset[i],
                                     y_pred,
                                     metric='euclidean'))  # CSM
            print('Time Taken of Dataset', i + 1, 'is ', G_TIME_Dataset[i])
            print('DBI of Dataset', i + 1, 'is ', G_DBI_Dataset[i])
            print('CSM of Dataset', i + 1, 'is ', G_CSM_Dataset[i])

    elif algorithm == "Agglomerative":
        for k in n_Clusters:
            print(algorithm, 'Cluster = ', k)
            for i in range(0, 4):
                starttime = time.time()
                algo = AgglomerativeClustering(n_clusters=k,
                                               affinity='euclidean',
                                               linkage='complete')
                y_pred = algo.fit_predict(G_X_Dataset[i])
                endtime = time.time()
                if len(Counter(y_pred)) == 1:
                    print('Error!', Counter(y_pred))
                    break
                G_TIME_Dataset[i].append(endtime - starttime)  # Time Taken
                G_DBI_Dataset[i].append(
                    davies_bouldin_score(G_X_Dataset[i], y_pred))  # DBI
                G_CSM_Dataset[i].append(
                    silhouette_score(G_X_Dataset[i],
                                     y_pred,
                                     metric='euclidean'))  # CSM
            print('Time Taken of Dataset', i + 1, 'is ', G_TIME_Dataset[i])
            print('DBI of Dataset', i + 1, 'is ', G_DBI_Dataset[i])
            print('CSM of Dataset', i + 1, 'is ', G_CSM_Dataset[i])

    else:
        print("#[Algorithm_error]")
Example 21
#3 clusters
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))

#2 clusters
plt.figure(figsize=(10, 7))
plt.title("Dendrograms")
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))
plt.axhline(y=7, color="r", linestyle='--')

#let's make the clusters
#hierarchical clustering for 2 clusters
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2,
                                  affinity='euclidean',
                                  linkage='ward')
cluster.fit_predict(data_scaled)
#scatter plot
plt.figure(figsize=(10, 7))
plt.scatter(data_scaled['Milk'], data_scaled['Grocery'], c=cluster.labels_)

#K means
#importing needed libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
Example 22
def Display_CSM_plot(algorithm):
    print(
        '\n#Display the CSM plot for the best value of the K parameter for each dataset'
    )
    global G_X_Dataset

    # [Note] this should be set according to the best K value found for each algorithm
    n_clusters = 3  # set the number of clusters up front; use a generic value of 3 for now
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4)  # create one figure with a single row of four subplots
    fig.set_size_inches(17, 10)  # figure size
    plt.suptitle("Algorithm : " + algorithm, fontsize=18, fontweight='bold')
    # Each plot is a silhouette plot: a horizontal bar chart built from the silhouette coefficients of every cluster.
    # Its x axis is the silhouette coefficient value and its y axis is the samples, since the coefficient is computed per sample.
    # First set up the x axis.
    # Silhouette coefficients lie in [-1, 1], but we at least want them to be greater than 0,
    # and an overly wide x axis hurts the visualization, so we restrict it to roughly [-0.1, 1].
    ax1.set_xlim([0, 1])
    ax2.set_xlim([0, 1])
    ax3.set_xlim([0, 1])
    ax4.set_xlim([0, 1])

    # Next set up the y axis. Normally it would run from 0 up to X.shape[0],
    # but we want the samples of each cluster to stay together with some spacing between clusters,
    # so that each block of bars is visually tied to the cluster it corresponds to.
    # We therefore extend the y range beyond X.shape[0] by (n_clusters + 1) * 10 to leave room for the gaps.
    ax1.set_ylim([0, G_X_Dataset[0].shape[0] + (n_clusters + 1) * 10])
    ax2.set_ylim([0, G_X_Dataset[1].shape[0] + (n_clusters + 1) * 10])
    ax3.set_ylim([0, G_X_Dataset[2].shape[0] + (n_clusters + 1) * 10])
    ax4.set_ylim([0, G_X_Dataset[3].shape[0] + (n_clusters + 1) * 10])

    # Fit the models and fetch the cluster labels

    for i in range(0, 4):
        if algorithm == "K means":
            n_clusters = 3
            clusterer_Dataset1 = KMeans(n_clusters=n_clusters,
                                        random_state=10).fit(G_X_Dataset[0])
            clusterer_Dataset2 = KMeans(n_clusters=n_clusters,
                                        random_state=10).fit(G_X_Dataset[1])
            clusterer_Dataset3 = KMeans(n_clusters=n_clusters,
                                        random_state=10).fit(G_X_Dataset[2])
            clusterer_Dataset4 = KMeans(n_clusters=n_clusters,
                                        random_state=10).fit(G_X_Dataset[3])
        elif algorithm == "DBSCAN":
            print('DBSCAN')
            # n_clusters = 4
            clusterer_Dataset1 = DBSCAN(
                1.2,
                min_samples=np.int(2 * PCA_n_components)).fit(G_X_Dataset[0])
            clusterer_Dataset2 = DBSCAN(
                1.2,
                min_samples=np.int(2 * PCA_n_components)).fit(G_X_Dataset[1])
            clusterer_Dataset3 = DBSCAN(
                1.2,
                min_samples=np.int(2 * PCA_n_components)).fit(G_X_Dataset[2])
            clusterer_Dataset4 = DBSCAN(
                1.2,
                min_samples=np.int(2 * PCA_n_components)).fit(G_X_Dataset[3])
        elif algorithm == "Agglomerative":
            print('Agglomerative')
            n_clusters = 5
            clusterer_Dataset1 = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='euclidean',
                linkage='complete').fit(G_X_Dataset[0])
            clusterer_Dataset2 = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='euclidean',
                linkage='complete').fit(G_X_Dataset[1])
            clusterer_Dataset3 = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='euclidean',
                linkage='complete').fit(G_X_Dataset[2])
            clusterer_Dataset4 = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='euclidean',
                linkage='complete').fit(G_X_Dataset[3])
        else:
            print("algorithm error!")

        cluster_labels_Dataset1 = clusterer_Dataset1.labels_
        cluster_labels_Dataset2 = clusterer_Dataset2.labels_
        cluster_labels_Dataset3 = clusterer_Dataset3.labels_
        cluster_labels_Dataset4 = clusterer_Dataset4.labels_

        #print(G_X_Dataset[0].shape[0],G_X_Dataset[1].shape[0],G_X_Dataset[2].shape[0],G_X_Dataset[3].shape[0])
        #print(cluster_labels_Dataset1,cluster_labels_Dataset2,cluster_labels_Dataset3,cluster_labels_Dataset4)

        # Compute the silhouette score; note that silhouette_score returns the mean silhouette coefficient over all samples.
        # It takes two inputs: the feature matrix X and the labels produced by the clustering.
        silhouette_avg_Dataset1 = silhouette_score(G_X_Dataset[0],
                                                   cluster_labels_Dataset1)
        silhouette_avg_Dataset2 = silhouette_score(G_X_Dataset[1],
                                                   cluster_labels_Dataset2)
        silhouette_avg_Dataset3 = silhouette_score(G_X_Dataset[2],
                                                   cluster_labels_Dataset3)
        silhouette_avg_Dataset4 = silhouette_score(G_X_Dataset[3],
                                                   cluster_labels_Dataset4)

        # Print the overall silhouette score for the current number of clusters
        print("For n_clusters =", n_clusters,
              "The average silhouette_score of Dataset1 is :",
              silhouette_avg_Dataset1)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score of Dataset2 is :",
              silhouette_avg_Dataset2)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score of Dataset3 is :",
              silhouette_avg_Dataset3)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score of Dataset4 is :",
              silhouette_avg_Dataset4)

        # Call silhouette_samples to get each sample's silhouette coefficient: these are our x values
        sample_silhouette_values_Dataset1 = silhouette_samples(
            G_X_Dataset[0], cluster_labels_Dataset1)
        sample_silhouette_values_Dataset2 = silhouette_samples(
            G_X_Dataset[1], cluster_labels_Dataset2)
        sample_silhouette_values_Dataset3 = silhouette_samples(
            G_X_Dataset[2], cluster_labels_Dataset3)
        sample_silhouette_values_Dataset4 = silhouette_samples(
            G_X_Dataset[3], cluster_labels_Dataset4)

        # Initial y-axis offset
        y_lower_Dataset1 = 10
        y_lower_Dataset2 = 10
        y_lower_Dataset3 = 10
        y_lower_Dataset4 = 10

        # Now loop over each cluster
        for j in range(n_clusters):
            # Extract the silhouette coefficients of the samples in cluster j and sort them

            ith_cluster_silhouette_values_Dataset1 = sample_silhouette_values_Dataset1[
                cluster_labels_Dataset1 == j]
            ith_cluster_silhouette_values_Dataset2 = sample_silhouette_values_Dataset2[
                cluster_labels_Dataset2 == j]
            ith_cluster_silhouette_values_Dataset3 = sample_silhouette_values_Dataset3[
                cluster_labels_Dataset3 == j]
            ith_cluster_silhouette_values_Dataset4 = sample_silhouette_values_Dataset4[
                cluster_labels_Dataset4 == j]

            # Note: .sort() sorts in place and changes the original order of the data
            ith_cluster_silhouette_values_Dataset1.sort()
            ith_cluster_silhouette_values_Dataset2.sort()
            ith_cluster_silhouette_values_Dataset3.sort()
            ith_cluster_silhouette_values_Dataset4.sort()

            # Check how many samples are in this cluster
            size_cluster_j_Dataset1 = ith_cluster_silhouette_values_Dataset1.shape[
                0]
            size_cluster_j_Dataset2 = ith_cluster_silhouette_values_Dataset2.shape[
                0]
            size_cluster_j_Dataset3 = ith_cluster_silhouette_values_Dataset3.shape[
                0]
            size_cluster_j_Dataset4 = ith_cluster_silhouette_values_Dataset4.shape[
                0]
            # The y range of this cluster starts at the current offset (y_lower) and ends at the offset plus the number of samples in the cluster (y_upper)
            y_upper_Dataset1 = y_lower_Dataset1 + size_cluster_j_Dataset1
            y_upper_Dataset2 = y_lower_Dataset2 + size_cluster_j_Dataset2
            y_upper_Dataset3 = y_lower_Dataset3 + size_cluster_j_Dataset3
            y_upper_Dataset4 = y_lower_Dataset4 + size_cluster_j_Dataset4

            # nipy_spectral maps a float in [0, 1] to a colormap color.
            # We want a different color for each cluster, and we need exactly as many colors as loop iterations.
            # Any scheme that yields a distinct float per iteration works;
            # here we divide the cluster index j by n_clusters, which gives a different fraction for each j
            # and therefore a different color for every cluster.
            color = cm.nipy_spectral(float(j) / n_clusters)

            # Fill in the content of each subplot.
            # fill_between fills the area between a curve and an axis:
            # fill_betweenx fills horizontally against the y axis,
            # fill_between fills vertically against the x axis.
            # fill_betweenx takes (the y coordinates, the x coordinates of the curve, the bar color).
            ax1.fill_betweenx(np.arange(y_lower_Dataset1, y_upper_Dataset1),
                              ith_cluster_silhouette_values_Dataset1,
                              facecolor=color,
                              alpha=0.7)
            ax2.fill_betweenx(np.arange(y_lower_Dataset2, y_upper_Dataset2),
                              ith_cluster_silhouette_values_Dataset2,
                              facecolor=color,
                              alpha=0.7)
            ax3.fill_betweenx(np.arange(y_lower_Dataset3, y_upper_Dataset3),
                              ith_cluster_silhouette_values_Dataset3,
                              facecolor=color,
                              alpha=0.7)
            ax4.fill_betweenx(np.arange(y_lower_Dataset4, y_upper_Dataset4),
                              ith_cluster_silhouette_values_Dataset4,
                              facecolor=color,
                              alpha=0.7)
            # Write the cluster number next to each cluster's silhouette bars, centred on that cluster's block.
            # text takes (x position of the label, y position of the label, label text).
            ax1.text(-0.05, y_lower_Dataset1 + 0.5 * size_cluster_j_Dataset1,
                     str(j))
            ax2.text(-0.05, y_lower_Dataset2 + 0.5 * size_cluster_j_Dataset2,
                     str(j))
            ax3.text(-0.05, y_lower_Dataset3 + 0.5 * size_cluster_j_Dataset3,
                     str(j))
            ax4.text(-0.05, y_lower_Dataset4 + 0.5 * size_cluster_j_Dataset4,
                     str(j))
            # Compute the starting y value for the next cluster: the previous upper bound plus a small gap,
            # so that the blocks of different clusters are visually separated.
            y_lower_Dataset1 = y_upper_Dataset1 + 5
            y_lower_Dataset2 = y_upper_Dataset2 + 5
            y_lower_Dataset3 = y_upper_Dataset3 + 5
            y_lower_Dataset4 = y_upper_Dataset4 + 5

    # Add a title and x/y axis labels to each subplot
    ax1.set_title("The CSM plot for [Dow Jones Index]")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax2.set_title("The CSM plot for [Facebook Live Sellers in Thailand]")
    ax2.set_xlabel("The silhouette coefficient values")
    ax2.set_ylabel("Cluster label")
    ax3.set_title("The CSM plot for [Sales Transactions]")
    ax3.set_xlabel("The silhouette coefficient values")
    ax3.set_ylabel("Cluster label")
    ax4.set_title("The CSM plot for [Water Treatment Plant]")
    ax4.set_xlabel("The silhouette coefficient values")
    ax4.set_ylabel("Cluster label")
    # Draw the mean silhouette score over the whole dataset as a dashed vertical line
    ax1.axvline(x=silhouette_avg_Dataset1, color="red", linestyle="--")
    ax2.axvline(x=silhouette_avg_Dataset2, color="red", linestyle="--")
    ax3.axvline(x=silhouette_avg_Dataset3, color="red", linestyle="--")
    ax4.axvline(x=silhouette_avg_Dataset4, color="red", linestyle="--")

    # Hide the y-axis ticks
    ax1.set_yticks([])
    ax2.set_yticks([])
    ax3.set_yticks([])
    ax4.set_yticks([])

    # Show only the specified list of ticks on the x axis
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax2.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax3.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    ax4.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    fig.tight_layout()
    plt.savefig(File_Path_DataSet + 'Result - ' + algorithm + '_CSM Plot',
                dpi=300)
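
# A minimal sketch (not from the original script) of how the values consumed
# by the plotting code above are usually produced: silhouette_score yields the
# dataset-wide average drawn as the dashed red line, while silhouette_samples
# yields the per-sample values stacked by fill_betweenx. X, labels and
# n_clusters are placeholders for one of the four datasets and its fitted labels.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

X = np.random.RandomState(0).rand(200, 5)      # placeholder feature matrix
n_clusters = 4
labels = KMeans(n_clusters=n_clusters, n_init=10,
                random_state=0).fit_predict(X)

silhouette_avg = silhouette_score(X, labels)   # -> position of the axvline
sample_values = silhouette_samples(X, labels)
for j in range(n_clusters):
    ith_cluster_silhouette_values = np.sort(sample_values[labels == j])
    size_cluster_j = ith_cluster_silhouette_values.shape[0]
    # these two per-cluster arrays are what each fill_betweenx call receives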
Esempio n. 23
0
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        batch_size = 32
        first=True
        for start_idx in tqdm.tqdm(range(0, len(documents), batch_size)):
            messages = documents[start_idx: min(start_idx+batch_size, len(documents))]
            message_embeddings = session.run(embed(messages))
            if first:
                embeddings = message_embeddings
                first = False
            else:
                embeddings = np.concatenate((embeddings, message_embeddings), axis=0)
            np.save(embedding_file, embeddings)

cls_num = 200
cluster = AgglomerativeClustering(n_clusters=cls_num, affinity='cosine', linkage='complete').fit(embeddings)
labels = cluster.labels_
output_file = 'caption_%s_cluster_%d.txt' % (caption_type, cls_num)
cluster_items = np.zeros((cls_num,))
for clss_id in range(cls_num):
    # print('-' * 50)
    for idx, sentence in enumerate(documents):
        if labels[idx] == clss_id:
        #     print(sentence, counts[idx])
            cluster_items[clss_id] += counts[idx]

# sort cluster_items
lines = []
idxs = np.argsort(cluster_items).tolist()[::-1]
for clss_id in idxs:
    print('-' * 20 + '%d/%d' %(cluster_items[clss_id], np.sum(counts)) + '-' * 20)
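
# Minimal sketch of the clustering step above on a toy embedding matrix:
# complete linkage with cosine affinity groups vectors by direction rather
# than magnitude. toy_docs / toy_embeddings stand in for `documents` and the
# sentence-encoder output; note that recent scikit-learn releases rename the
# `affinity` argument to `metric`.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

toy_docs = ["a red car", "a blue car", "a dog runs", "the dog barks"]
toy_embeddings = np.random.RandomState(0).rand(len(toy_docs), 16)

toy_cluster = AgglomerativeClustering(n_clusters=2, affinity='cosine',
                                      linkage='complete').fit(toy_embeddings)
for doc, lab in zip(toy_docs, toy_cluster.labels_):
    print(lab, doc)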
Esempio n. 24
0
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(n_clusters=10,
                                                 connectivity=connectivity,
                                                 memory=tempdir,
                                                 linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10
        # Check that a connectivity matrix with the wrong shape raises a ValueError
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity.toarray(),
                                         affinity="manhattan",
                                         linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=np.ones(
                                                 (n_samples, n_samples)),
                                             affinity=affinity,
                                             linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
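
# Hedged sketch of the connectivity constraint exercised above: a k-nearest-
# neighbours graph restricts which samples may be merged, analogous to what
# grid_to_graph does for pixel grids. All names below are placeholders.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

X_toy = np.random.RandomState(0).randn(100, 5)
knn_graph = kneighbors_graph(X_toy, n_neighbors=10, include_self=False)
constrained = AgglomerativeClustering(n_clusters=10, connectivity=knn_graph,
                                      linkage='ward').fit(X_toy)
print(np.unique(constrained.labels_).size)     # -> 10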
def plot_clustering(X_red, X, labels, title=None):
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)

    plt.figure(figsize=(6, 4))
    for i in range(X_red.shape[0]):
        plt.text(X_red[i, 0], X_red[i, 1], str(y[i]),
                 color=plt.cm.nipy_spectral(labels[i] / 10.),
                 fontdict={'weight': 'bold',
                           'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout()


#----------------------------------------------------------------------
# 2D embedding of the digits dataset
print("Computing embedding")
X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X)
print("Done.")

from sklearn.cluster import AgglomerativeClustering

for linkage in ('ward', 'average', 'complete'):
    clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10)
    t0 = time()
    clustering.fit(X_red)
    print("%s : %.2fs" % (linkage, time() - t0))

    plot_clustering(X_red, X, clustering.labels_, "%s linkage" % linkage)

plt.show()

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, clustering.labels_))
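
# Cluster indices are arbitrary permutations of the digit classes, so the
# confusion matrix above is only readable up to a column permutation. A hedged
# complement, assuming `y` holds the digit targets from the part of the script
# not shown here:
from sklearn.metrics import adjusted_rand_score
print("ARI: %.3f" % adjusted_rand_score(y, clustering.labels_))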
def evaluate_categorization(w, X, y, method="kmeans", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "kmeans"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    new_x = []
    new_y = []
    exist_cnt = 0

    for idx, word in enumerate(X.flatten()):
        if word in w:
            new_x.append(X[idx])
            new_y.append(y[idx])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)),
                                             len(X),
                                             replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(
            y[ids],
            AgglomerativeClustering(n_clusters=len(set(y)),
                                    affinity="euclidean",
                                    linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
            best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(
                    y[ids],
                    AgglomerativeClustering(n_clusters=len(set(y)),
                                            affinity=affinity,
                                            linkage=linkage).fit_predict(
                                                words[ids]))
                logger.debug(
                    "Purity={:.3f} using affinity={} linkage={}".format(
                        purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(
            y[ids],
            KMeans(random_state=seed, n_init=10,
                   n_clusters=len(set(y))).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
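
# `calculate_purity` is not defined in this snippet; a minimal sketch of the
# usual definition (fraction of samples assigned to the majority true class of
# their cluster), assuming it matches the helper used above:
import numpy as np

def calculate_purity(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    correct = 0
    for c in np.unique(y_pred):
        members = y_true[y_pred == c]
        # credit the most frequent true label within cluster c
        correct += np.unique(members, return_counts=True)[1].max()
    return correct / len(y_true)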
import sys
sys.path.append("../")
import mglearn

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

X, y = make_moons(n_samples=200, noise=0.05, random_state=0) 
# rescale the data to zero mean and unit variance 
scaler = StandardScaler() 
scaler.fit(X) 
X_scaled = scaler.transform(X)

fig, axes = plt.subplots(1, 4, figsize=(15, 3), subplot_kw={'xticks': (), 'yticks': ()})

# create a random cluster assignment for reference 
random_state = np.random.RandomState(seed=0) 
random_clusters = random_state.randint(low=0, high=2, size=len(X))

# plot random assignment 
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters, cmap=mglearn.cm3, s=60) 
axes[0].set_title("Random assignment: {:.2f}".format(silhouette_score(X_scaled, random_clusters)))

algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]

for ax, algorithm in zip(axes[1:], algorithms):
    clusters = algorithm.fit_predict(X_scaled) 
    # plot the cluster assignments and cluster centers 
    ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=60) 
    ax.set_title("{} : {:.2f}".format(algorithm.__class__.__name__, silhouette_score(X_scaled, clusters)))

plt.show()
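
# The silhouette score rewards compact, convex clusters, so on two_moons it
# tends to rank KMeans above DBSCAN even though DBSCAN recovers the intended
# half-moon shapes. A hedged cross-check against the generating labels y:
from sklearn.metrics import adjusted_rand_score

for algorithm in algorithms:
    clusters = algorithm.fit_predict(X_scaled)
    print("{}: ARI = {:.2f}".format(algorithm.__class__.__name__,
                                    adjusted_rand_score(y, clusters)))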

Esempio n. 28
0
def clusterHierarchical(df, n_clusters, distance, linkage):
    # Fit agglomerative (hierarchical) clustering on the rows of df and return
    # the fitted model; `distance` is passed through as the affinity metric.
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity=distance,
                                    linkage=linkage)
    clustering = model.fit(df)
    return clustering
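
# Hedged usage sketch for the helper above on a throw-away DataFrame; column
# names and parameter values are placeholders only.
import numpy as np
import pandas as pd

toy_df = pd.DataFrame(np.random.RandomState(0).rand(50, 3),
                      columns=["f1", "f2", "f3"])
toy_model = clusterHierarchical(toy_df, n_clusters=3,
                                distance="euclidean", linkage="ward")
print(toy_model.labels_[:10])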
Esempio n. 29
0
@author: sunil
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#importing data sets
ds=pd.read_csv('Mall_Customers.csv')
X=ds.iloc[:,[3,4]].values

# plotting the dendrogram
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distance')
plt.show()

# fitting the hierarchical clustering model
from sklearn.cluster import AgglomerativeClustering
hc=AgglomerativeClustering(affinity='euclidean',linkage='ward',n_clusters=5)
y_hc=hc.fit_predict(X)

#plotting graph
plt.scatter(X[y_hc==0,0],X[y_hc==0,1],color='red',label='Cautious',s=100)
plt.scatter(X[y_hc==1,0],X[y_hc==1,1],color='green',label='Standard',s=100)
plt.scatter(X[y_hc==2,0],X[y_hc==2,1],color='blue',label='Target',s=100)
plt.scatter(X[y_hc==3,0],X[y_hc==3,1],color='black',label='Careless',s=100)
plt.scatter(X[y_hc==4,0],X[y_hc==4,1],color='magenta',label='Sensible',s=100)
plt.legend()
plt.show()
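
# A hedged sketch of reading the cluster count off the linkage matrix
# programmatically instead of by eye: scipy's fcluster can cut the same ward
# tree either at a distance threshold or at a requested number of clusters;
# here it is asked for the 5 groups used above. Illustrative only.
from scipy.cluster.hierarchy import fcluster

Z = sch.linkage(X, method='ward')
flat_labels = fcluster(Z, t=5, criterion='maxclust')
print(np.unique(flat_labels))   # -> [1 2 3 4 5]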
Esempio n. 30
0
from sklearn import datasets
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data
y = iris.target
# Step 1 Model
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)
# Step 2 Training
kmeans.fit(X)
# Step 3 Evaluation
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
#Mean Shift Clustering
from sklearn.cluster import MeanShift
ms = MeanShift()
ms.fit(iris.data)
from sklearn.cluster import AgglomerativeClustering
groups = AgglomerativeClustering(n_clusters=2)
groups.fit_predict(iris.data)

from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit(iris.data)
pca_2d = pca.transform(iris.data)
for i in range(0, pca_2d.shape[0]):
    if ms.labels_[i] == 1:
        c1 = plt.scatter(pca_2d[i, 0], pca_2d[i, 1], c='r', marker='+')
    elif ms.labels_[i] == 0:
        c2 = plt.scatter(pca_2d[i, 0], pca_2d[i, 1], c='g', marker='o')
#plt.title('Mean shift finds 2 clusters')
plt.show()
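
# Hedged companion plot for the AgglomerativeClustering fit above: colouring
# the same PCA projection by groups.labels_ lets the two partitions be
# compared side by side. Purely illustrative.
plt.scatter(pca_2d[:, 0], pca_2d[:, 1], c=groups.labels_)
plt.title('Agglomerative clustering with 2 clusters')
plt.show()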

#Another Method