Example #1
def hierarchical(similarity, concepts=2, euclid=False):
    if euclid:
        model = AgglomerativeClustering(n_clusters=concepts)
        return model.fit_predict(similarity)
    else:
        model = AgglomerativeClustering(n_clusters=concepts, affinity='precomputed', linkage='complete')
        return model.fit_predict(1 - similarity)
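A minimal usage sketch for the helper above, assuming the hierarchical() function and the AgglomerativeClustering import are already in scope (note that scikit-learn 1.2+ renames the affinity argument to metric):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

X = np.random.RandomState(0).rand(10, 5)        # 10 toy samples, 5 features
similarity = cosine_similarity(X)               # symmetric, values in (0, 1]
labels = hierarchical(similarity, concepts=2)   # precomputed branch: distance = 1 - similarity
print(labels)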
Example #2
class HierarchicalTopics(object):

    def __init__(self, corpus):
        """
        corpus is a corpus object, e.g. an HTMLCorpusReader()
        or an HTMLPickledCorpusReader() object
        """
        self.model = None
        self.vocab = list(
            set(normalize(corpus.words(categories=['news'])))
        )

    def vectorize(self, document):
        """
        Vectorizes a document consisting of a list of part of speech
        tagged tokens using the segmentation and tokenization methods.

        One-hot encode the set of documents
        """
        features = set(normalize(document))
        return np.array([
            token in features for token in self.vocab], np.short)

    def cluster(self, corpus):
        """
        Fits the AgglomerativeClustering model to the given data.
        """
        self.model = AgglomerativeClustering()

        self.model.fit_predict([
            self.vectorize(
                corpus.words(fileid)) for fileid in
            corpus.fileids(categories=['news']
                           )
        ])

        self.labels = self.model.labels_
        self.children = self.model.children_

    def plot_dendrogram(self, **kwargs):
        # Distances between each pair of children
        distance = np.arange(self.children.shape[0])
        position = np.arange(self.children.shape[0])

        # Create linkage matrix and then plot the dendrogram
        linkage_matrix = np.column_stack([
            self.children, distance, position]
        ).astype(float)

        # Plot the corresponding dendrogram
        fig, ax = plt.subplots(figsize=(15, 7))  # set size
        ax = dendrogram(linkage_matrix, **kwargs)
        plt.tick_params(axis='x', bottom='off', top='off', labelbottom='off')
        plt.tight_layout()
        plt.show()
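The plot_dendrogram helper above fakes the merge heights with np.arange because AgglomerativeClustering does not expose them by default. A hedged alternative sketch (scikit-learn 0.22 or later): fit with distance_threshold=0 and n_clusters=None so that distances_ is populated, then build a proper SciPy linkage matrix:

import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

X = np.random.RandomState(1).rand(20, 4)                       # toy data
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(X)

# number of original samples under each merged node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
    counts[i] = sum(1 if child < n_samples else counts[child - n_samples]
                    for child in merge)

linkage_matrix = np.column_stack(
    [model.children_, model.distances_, counts]).astype(float)
dendrogram(linkage_matrix)
plt.show()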
Example #3
def agglom(data, n_clusters):
    knn_graph = kneighbors_graph(data, 30, include_self=False)
    
    cluster = AgglomerativeClustering(n_clusters=n_clusters, connectivity=knn_graph, linkage='ward') # use ward / average / complete for different results
    model = cluster.fit(data)
    
    return cluster.fit_predict(data)
Example #4
def buckshot(k, mat):
    size = int((k*mat.shape[0])**.5)
    print(size)
    samp = np.zeros((size, mat.shape[1]))
    inds = np.random.randint(0, mat.shape[0], size)
    print(inds)

    for i in range(size):
        samp[i] = mat[inds[i]]

    # agglomerative clustering on the sample
    hier = AgglomerativeClustering(n_clusters=k, linkage='average', affinity='euclidean', compute_full_tree=True)
    flat = hier.fit_predict(samp)

    centroids = []
    # find the centroid of each cluster
    for j in range(k):
        i_s = [i for i, l in enumerate(flat) if l == j]
        print(len(i_s))
        points = [samp[m] for m in i_s]
        points = np.array(points)
        cent = np.mean(points, axis=0)
        centroids.append(cent)

    return centroids
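Buckshot centroids are typically used to seed k-means. A hedged sketch, assuming the buckshot() function above and its numpy / AgglomerativeClustering imports live in the same script:

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
mat = rng.rand(200, 8)                     # toy data matrix
k = 4
centroids = np.array(buckshot(k, mat))     # k seed centroids from the sample
km = KMeans(n_clusters=k, init=centroids, n_init=1).fit(mat)
print(km.labels_[:10])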
def calculateNumberOfIdealClusters(maxAmount, corpus):
    print("Initializing silhouette analysis")
    range_n_clusters = range(2, maxAmount)  # maximum number of clusters equals the number of jobs

    silhouette_high = 0
    silhouette_high_n_clusters = 2

    for n_clusters in range_n_clusters:
        # Initialize the clusterer with the n_clusters value
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(corpus, cluster_labels)

        print("For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

    print("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
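A hedged driver for the silhouette search above, on toy blobs, assuming the function and its AgglomerativeClustering / silhouette_score / silhouette_samples imports are available:

from sklearn.datasets import make_blobs

corpus, _ = make_blobs(n_samples=300, centers=4, random_state=42)
best_k = calculateNumberOfIdealClusters(10, corpus)
print("Best number of clusters:", best_k)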
    def sp_connectivity(self, X, connectivity, n_clusters, n):
        # plt.figure(figsize=(10, 4))
        # plt.subplot(1, 3, index + 1)
        model = AgglomerativeClustering(linkage="ward",
                                        connectivity=connectivity,
                                        n_clusters=n_clusters)
        # t0 = time.time()
        y = np.zeros(shape=(n))
        y = model.fit_predict(X, None)
        # elapsed_time = time.time() - t0
        return y

        # plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
        #             cmap=plt.cm.spectral)
        # plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
        #           fontdict=dict(verticalalignment='top'))
        # plt.axis('equal')
        # plt.axis('off')
        # plt.subplots_adjust(bottom=0, top=.89, wspace=0,
        #                     left=0, right=1)
        # plt.suptitle('n_cluster=%i, connectivity=%r' %
        #              (n_clusters, connectivity is not None), size=17)
        # plt.show()
Example #7
    def clustering_approach(self):
        '''
        Cluster user data using various clustering algos
        IN: self.df_full and self.labels
        OUT: results to stdout
        '''
        print('Fitting clustering model')
        X = self.df_full.values
        y = self.labels

        # scale data
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

        # KMeans
        km_clf = KMeans(n_clusters=2, n_jobs=6)
        km_clf.fit(X)

        # swap labels as super-users are in cluster 0 (messy!!)
        temp = y.apply(lambda x: 0 if x == 1 else 1)
        print('\nKMeans clustering: ')
        self.analyse_preds(temp, km_clf.labels_)

        # Agglomerative clustering
        print('\nAgglomerative clustering approach: ')
        ac_clf = AgglomerativeClustering()
        ac_labels = ac_clf.fit_predict(X)
        self.analyse_preds(y, ac_labels)

        return None
Example #8
 def Create_Ext_Agg_cluster(self,stem,stop,processing,remS): 
      
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.ExtStringCSv)
     Allstrings=list()
     #Allstrings=[rowdict_str["Text_original"] for rowdict_str in Allrow_dicts]
     for row_dict in Allrow_dicts:
         if self.POS =="ALL_EXT":
             Stringrow=row_dict["Text_original"]+row_dict["Adj_Extended"]+row_dict["Noun_Extended"] +row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
         else:
             Stringrow=row_dict["Adj"]+row_dict["Adj_Extended"]+row_dict["Noun"]+row_dict["Noun_Extended"]#+row_dict["Verb"]#+row_dict["Verb_Extended"]
             Allstrings.append(Stringrow)
              
     Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings]  
      
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     vectorizer = CountVectorizer()    
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #---------------------------------------------- Array=term_doc.toarray
      
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,affinity='euclidean')
     if self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,linkage='average',affinity='cosine')
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_Ext_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #term_doc_lsa = lsa.fit_transform(term_doc)
     print(type(term_doc))
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
     print(Res_Labels)
     print("n_samples: %d, n_features: %d" % term_doc.shape) 
Example #9
 def CreateCluster(self):
     Fileobj = open(self.DistanceFile, "rb")
     SimArray = np.load(self.DistanceFile)
     Fileobj.close()

     print(SimArray)
     AggClusterDistObj = AgglomerativeClustering(n_clusters=self.num_cluster, linkage='average', affinity=self.affinity)
     Res_Labels = AggClusterDistObj.fit_predict(SimArray)
     print(Res_Labels)
Example #10
def hierarchicalCluster(corr_matrix_df, n_clusters):
	"""calculate clustering from the correlation matrix using the hierarchical Ward method"""
	#set method
	ward = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward',affinity='euclidean')

	result=ward.fit_predict(corr_matrix_df)
	cluster_df=pd.DataFrame(result, index=corr_matrix_df.index, columns= ['Cluster'])

	return cluster_df
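A hedged usage sketch for hierarchicalCluster() above, clustering a toy correlation matrix (assumes pandas, numpy and the AgglomerativeClustering import used by the function):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
returns = pd.DataFrame(rng.randn(100, 6), columns=list('ABCDEF'))
corr_matrix_df = returns.corr()                 # 6 x 6 correlation matrix
print(hierarchicalCluster(corr_matrix_df, n_clusters=2))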
def get_topics(X_lsi, text_names, nk=1):
    ag = AgglomerativeClustering(n_clusters=nk, affinity='cosine', linkage='average')
    topics = ag.fit_predict(X_lsi)
    paper_to_topic = defaultdict(int)
    topic_to_papers = defaultdict(list)
    for paper,topic in zip(text_names,topics):
        paper_to_topic[paper] = topic
        topic_to_papers[topic].append(paper)
    return (paper_to_topic, topic_to_papers)
def openfaceExp(lfwAligned, net, cls):
    df = pd.DataFrame(columns=('nPpl', 'nImgs',
                               'trainTimeSecMean', 'trainTimeSecStd',
                               'predictTimeSecMean', 'predictTimeSecStd',
                               'accsMean', 'accsStd'))

    repCache = {}

    df_i = 0
    for nPpl in nPplVals:
        print(" + nPpl: {}".format(nPpl))
        cls = AgglomerativeClustering(n_clusters=nPpl)
        (X, y) = getData(lfwAligned, nPpl, nImgs, size=96, mode='rgb')
        nSampled = X.shape[0]
        ss = ShuffleSplit(nSampled, n_iter=10, test_size=0.1, random_state=0)

        allTrainTimeSec = []
        allPredictTimeSec = []
        accs = []

        for train, test in ss:
            X_train = []
            for img in X[train]:
                h = hash(str(img.data))
                if h in repCache:
                    rep = repCache[h]
                else:
                    rep = net.forward(img)
                    repCache[h] = rep
                X_train.append(rep)

            start = time.time()
            X_train = np.array(X_train)
            cls.fit(X_train, y[train])
            trainTimeSec = time.time() - start
            allTrainTimeSec.append(trainTimeSec)

            start = time.time()
            X_test = []
            for img in X[test]:
                X_test.append(net.forward(img))
            y_predict = cls.fit_predict(X_test)
            predictTimeSec = time.time() - start
            allPredictTimeSec.append(predictTimeSec / len(test))
            y_predict = np.array(y_predict)
            print(y[test], y_predict)
            acc = accuracy_score(y[test], y_predict)
            print(acc)
            accs.append(acc)

        df.loc[df_i] = [nPpl, nImgs,
                        np.mean(allTrainTimeSec), np.std(allTrainTimeSec),
                        np.mean(allPredictTimeSec), np.std(allPredictTimeSec),
                        np.mean(accs), np.std(accs)]
        df_i += 1

    return df
def test_agglomerative_clustering_with_distance_threshold_edge_case(
        linkage, threshold, y_true):
    # test boundary case of distance_threshold matching the distance
    X = [[0], [1]]
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        linkage=linkage)
    y_pred = clusterer.fit_predict(X)
    assert adjusted_rand_score(y_true, y_pred) == 1
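The test above is presumably parametrized over linkage, threshold and y_true by the surrounding test harness; a standalone, hedged illustration of the same boundary behaviour:

from sklearn.cluster import AgglomerativeClustering

X = [[0], [1]]
# the single merge distance is 1.0: clusters are not merged at or above the
# threshold, so 1.0 keeps two clusters while 1.5 collapses them into one
for threshold, expected_n in [(1.0, 2), (1.5, 1)]:
    labels = AgglomerativeClustering(n_clusters=None,
                                     distance_threshold=threshold).fit_predict(X)
    print(threshold, labels, len(set(labels)) == expected_n)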
Example #14
def clusterize(matrices):
    #dbscan = DBSCAN(metric="precomputed", eps=25, min_samples=50)
    cluster = AgglomerativeClustering(n_clusters=2, affinity="precomputed", linkage="complete")
    distances = distance_matrix(matrices)
    print("mean of distances is {} and std of norms is {}".format(numpy.mean(distances), numpy.std([numpy.linalg.norm(m, numpy.inf) for m in matrices])))
    #pyplot.plot([numpy.linalg.norm(m, numpy.inf) for m in matrices], 'ro')
    #pyplot.show()
    #pyplot.hist(distances.flatten(), bins=20)
    #pyplot.show()
    return cluster.fit_predict(distances)
def agglomerative_clustering(crime_rows, column_names, num_clusters):
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Agglomerative Clustering")
    agglo_clustering = AgglomerativeClustering(n_clusters=num_clusters, 
            connectivity=neighbors.kneighbors_graph(crime_xy, n_neighbors=2))
    agglomerative_clustering_labels = agglo_clustering.fit_predict(crime_xy)
    print("formatting....")
    return _format_clustering(agglomerative_clustering_labels, 
            crime_xy, crime_info, column_names)
    def agglomorative_clustering(df_in):
        # Set model input args
        n_clusters = 8
        linkage = 'ward'

        model = AgglomerativeClustering(linkage=linkage,
                                        n_clusters=n_clusters)

        # attach cluster-label to dataframe
        df_in['cluster'] = model.fit_predict(df_in)
Example #17
def find_steady_coalition():

    working_direcotry = r"C:\Users\ORI\Documents\IDC-non-sync\ML_Course\Election\Data\\"
    file_name = os.path.join(working_direcotry, r'ElectionsData.csv')
    train, validation, test, feature_categorical_dictionary, train_idx, test_idx, number_to_party_dictionary = prepare_the_data(file_name,

                                                                                                        working_direcotry)

    good_colation_found = False
    for n_clusters in [5,4,3]:
        print ("---------------")
        linkage = 'ward'
        X = train.data
        clusters = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
        clusters.fit_predict(X)
        bin_count_of_kmeans_clusters = np.bincount(clusters.labels_)
        normalized_bin_count_of_kmeans_clusters = bin_count_of_kmeans_clusters/np.sum(bin_count_of_kmeans_clusters).astype('float32')
        #is there any cluster with more than 50% of the votes?
        coalition_exists = np.any(normalized_bin_count_of_kmeans_clusters > 0.5)
        print "number_of_clustes {0}".format(n_clusters)
        print "coalition_exists: {0} ".format(coalition_exists)

        # find all the parties belong to the cluster
        biggest_cluster = np.argmax(normalized_bin_count_of_kmeans_clusters)
        biggest_cluster_voters = np.bincount(train.labels[clusters.labels_ == biggest_cluster].astype('int64'))

        #normalize the votes by the size of their parties:
        votes_out_of_party =  biggest_cluster_voters/np.bincount( train.labels.astype('int32')).astype('float32')
        # commited_to_coalition_parties = parties with a majority of their votes in the cluster
        commited_to_coalition_parties = votes_out_of_party > 0.5

        percentage_of_voters_in_commited_coalition = np.sum(biggest_cluster_voters[votes_out_of_party > 0.5])*1.0/len(train.labels)*1.0

        print(percentage_of_voters_in_commited_coalition)
        if percentage_of_voters_in_commited_coalition > 0.5:
            print("coalition found")
            parties_in_coalition = list(number_to_party_dictionary.keys())
            print("parties in coalition:{0}".format([number_to_party_dictionary[k] for k in np.array(list(number_to_party_dictionary.keys()))[votes_out_of_party > 0.5]]))

            break
        print ("---------------")
def outlier_clusters_ward(x, y, skill=None, memory=None):
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # uh.
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        dist_within = np.array([])

    elif len(data) == 1:
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within = np.array([0])

    else:
        dist_within = 1000
        dist_max = 75
        n_clusters = 0
        n_clusters_max = 10

        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                memory=memory)

        # while dist_within > dist_max, keep adding clusters
        while (dist_within > dist_max) * (n_clusters < n_clusters_max):
            # iterate n_clusters
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)

            # cluster
            labels = clusterer.fit_predict(data)

            # get cluster_centers
            cluster_labels = range(n_clusters)
            cluster_centers = np.array([np.mean(data[labels == i], axis=0)
                                        for i in cluster_labels])

            # find dist_within: the maximum pairwise distance inside a cluster
            dist_within = np.max([np.max(pairwise_distances(
                                  data[labels == i]))
                                  for i in cluster_labels])

    dist_within_final = np.array([np.max(pairwise_distances(
            data[labels == i])) for i in cluster_labels])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
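A hedged smoke test for outlier_clusters_ward() above (assumes its numpy, AgglomerativeClustering and pairwise_distances imports are in scope):

import numpy as np

rng = np.random.RandomState(3)
x, y = rng.rand(60) * 200, rng.rand(60) * 200   # toy 2-D point cloud
centers, cluster_labels, labels, n_clusters, dist_within = outlier_clusters_ward(x, y)
print(n_clusters, centers.shape, dist_within)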
def hierarchical_clustering(g, n_clusters=3):
    """performs hierarchical_clustering to specified number fo clusters"""
    from sklearn.cluster import AgglomerativeClustering
    
    distances = get_shortest_path_distance_matrix(g)
    ac = AgglomerativeClustering(n_clusters, linkage='average')
    labels = ac.fit_predict(distances)
    
    clusters = defaultdict(list)
    for node, label in zip(g.nodes(), labels):
        clusters[label].append(node)
    return clusters
def ward_cluster(file_topic_matrix="PCA_matrix.pkl",
                 file_term_matrix="Tfidf_Matrix.pkl",
                 file_term_list="features.pkl",n_clusters=500,
                 truncate=25000,output="",tolerance=-0.2):
    #Only keep the first 10k articles by default.  I couldn't get it to run with 50k.
    #However, I got it with 25k, and it took 45 minutes.
    start_time = time.time()
    #Load pickles
    topic_matrix = pd.read_pickle(file_topic_matrix)[:truncate,:]
    term_matrix = pd.read_pickle(file_term_matrix)[:truncate,:]
    term_list = pd.read_pickle(file_term_list)
    processing_time = (time.time() - start_time)/60
    print("Current time: %.2f minutes.  Files loaded." % processing_time )

    #Apply clustering
    clustering = AgglomerativeClustering(linkage="ward", n_clusters=n_clusters)
    classification = clustering.fit_predict(topic_matrix)
    processing_time = (time.time() - start_time)/60
    print("Current time: %.2f minutes.  Clustering done." % processing_time )

    #Translate to tree_node, generate label_tree and collapsed_tree
    full_tree = tree_to_nodes(clustering.children_,topic_matrix.shape[0])
    label_tree = get_label_tree(classify_tree(full_tree,classification))
    (topic_means,docs_in_cluster) = get_means(classification,topic_matrix)
    collapsed_tree = collapse_label_tree(label_tree,topic_means,docs_in_cluster,tolerance)
    processing_time = (time.time() - start_time)/60
    print("Current time: %.2f minutes.  Trees collapsed." % processing_time )

    #Assign names to each node of tree based on most common links
    (term_means,docs_in_cluster) = get_means(classification,term_matrix)
    descriptive_tree = get_name_tree(collapsed_tree,term_means,docs_in_cluster,term_list)
    processing_time = (time.time() - start_time)/60
    print("Current time: %.2f minutes.  Node descriptions generated." % processing_time )

    #Write pickles
    #c_labels tells you which cluster each document is in
    pd.to_pickle(classification,output+'c_labels.pkl')
    #save the uncollapsed tree in case I want to tweak that process later.
    pd.to_pickle(label_tree,output+'uncollapsed_tree.pkl')
    #ward_tree shows how the clusters fit together in a tree structure
    pd.to_pickle(collapsed_tree,output+'ward_tree.pkl')
    #descriptive_tree the same as ward_tree, except that nodes are named after most common links
    pd.to_pickle(descriptive_tree,output+'descriptive_tree.pkl')
    #cluster_means are the vectors (in the PCA space) of each cluster
    pd.to_pickle(topic_means,output+'topic_means.pkl')
    #link_means are the vectors (in link space) of each cluster
    pd.to_pickle(term_means,output+'term_means.pkl')
    #docs_in_cluster tells you the size of each cluster
    pd.to_pickle(docs_in_cluster,output+'docs_in_cluster.pkl')
    processing_time = (time.time() - start_time)/60
    print("Current time: %.2f minutes.  Pickles written." % processing_time )
Example #21
 def Create_Agg_cluster(self,stem,stop,processing,remS): 
     
     Allrow_dicts=data_pkg.FileHandling.read_csv(self.StringsFile)
     Allstrings=[rowdict_str[self.clusterdfield] for rowdict_str in Allrow_dicts]
     if self.POS=="ALL":
         Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in Allstrings] 
     else:
         POS_Strings=list()
         if self.POS=="Noun_Verb_AdJ" :
             POS_List=["Noun","Adj","Verb"]
         else:    
             if  self.POS=="Noun_AdJ" :
                 POS_List=["Noun","Adj"] 
             else:
                 print "Error in Part of speech in function Create_Agg_cluster"
                 sys.exit(0)
                 
         
         for string in Allstrings:
             POS_String=Add_POS.ADDPOS_string(string,POS_List)["AllPOSstring"] 
             POS_Strings.append(POS_String)                  
         Allstrings_process=[preprocess_text.preprocess(string_text, stem,stop) for string_text in POS_Strings]  
     
     if remS:
         Allstrings_process=[preprocess_text.removeS(text) for text in Allstrings_process]            
     if self.vec=="CountVectorizer":
         vectorizer = CountVectorizer()
     else:
         if self.vec=="TFIdfCountVectorizer":
             vectorizer= TfidfVectorizer()      
     term_doc=vectorizer.fit_transform(Allstrings_process)
     #=======================================================================
     # svd = TruncatedSVD(n_components=5, random_state=42)
     # lsa = make_pipeline(svd, Normalizer(copy=False))
     # term_doc = lsa.fit_transform(term_doc)
     # term_doc = svd.fit_transform(term_doc)
     #=======================================================================
     
     #-------------------------- feature_names=vectorizer.get_feature_names()
     #------------------------------------------------ Array=term_doc.toarray
     if self.affinity=='euclidean':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,affinity='euclidean')
     if self.affinity=='cosine':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,linkage='average',affinity=self.affinity)
     if self.affinity=='l1':
         Agg_cluster=AgglomerativeClustering(n_clusters=self.num_cluster,linkage='average',affinity=self.affinity)    
     Res_Labels=Agg_cluster.fit_predict(term_doc.toarray())
     self.cluster_tup_list=self.tuple_cluster_doc(Res_Labels,Allstrings,Allrow_dicts)
     #print type (term_doc)
     self.metric=metrics.silhouette_score(term_doc.toarray(), Res_Labels, metric=self.affinity)
Example #22
def hierarchical_cluster(champ_data, num_clusters):
    clusterer = AgglomerativeClustering(num_clusters)
    ids = sorted(champ_data.keys())
    X = []
    ordered_atts = sorted(champ_data[ids[0]].keys())
    for ID in ids:
        champ = champ_data[ID]
        X.append([champ[att] for att in ordered_atts])
    X = array(X)
    X_scaled = preprocessing.scale(X)
    clusters = clusterer.fit_predict(X_scaled)
    id_to_cluster = {ids[i]: clusters[i] for i in range(len(ids))}
    return id_to_cluster
   def obtainClusters(self, hist):

      print('Obtaining clusters using Agglomerative Clustering from sklearn...')

      hist = np.array(hist)
      hist = hist.astype(float)
      scaled_vec = StandardScaler().fit_transform(hist)

      hc = AgglomerativeClustering(n_clusters=self.nclusters, linkage=self.linkage, affinity=self.dist)

      # obtain the clusters
      clusters = hc.fit_predict(scaled_vec, None)

      print('Clusters obtained.')

      return clusters
def cv_iteration(n_clusters=1, affinity='euclidean', linkage='ward'):
    X, y_train, _ = load_data()
    scores = []
    cms = []  # confusion matrices
    cluster_sizes = []
    random_states = (666, 69, 7, 13, 1337)
    for i in random_states:
        model = AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity, linkage=linkage)
        predictions = model.fit_predict(X)
        score, confusion_matrix = scoring_function(y_train, predictions)
        scores.append(score)
        cms.append(serialise_confusion_matrix(confusion_matrix))
        cluster_sizes.append(serialise_confusion_matrix(np.unique(predictions, return_counts=True)))
                  
    return {'result': scores,
           'confusion_matrices': eval(str(cms)),
           'score_name': string_enhancer(str(scoring_function)),
           'cluster_sizes': eval(str(cluster_sizes))}
   def obtainCodebook(self, hist):

      print('Obtaining clusters using Agglomerative Clustering from sklearn...')

      scaled_vec = StandardScaler().fit_transform(hist)

      # connectivity matrix for structured Ward
      connectivity = kneighbors_graph(scaled_vec, n_neighbors=3, include_self=False)
      # make connectivity symmetric
      connectivity = 0.5 * (connectivity + connectivity.T)

      hc = AgglomerativeClustering(n_clusters=self.nclusters, linkage=self.linkage, connectivity=connectivity, compute_full_tree=False, affinity=self.dist)

      # obtain the codebook and the projections of the images on the codebook (clusters of words)
      clusters = hc.fit_predict(scaled_vec, None)

      print('Clusters obtained.')

      return None, clusters
Example #26
def _sort_clustering(data):
    try:
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.metrics.pairwise import euclidean_distances
    except ImportError:
        logging.error('Cannot use _sort_clustering without scikit-learn '
                      'installed.')
        raise

    dbscan = AgglomerativeClustering(n_clusters=len(data) // 10,
                                     compute_full_tree=False)
    labels = dbscan.fit_predict(data)

    logging.debug('Clustering sort, assigned labels: {0}'
                  ''.format({l: len([x for x in labels if x == l])
                             for l in set(labels)}))

    data_by_labels = collections.defaultdict(list)
    for i, l in enumerate(labels):
        data_by_labels[l].append(data[i])

    numpy_by_labels = {l: numpy.array(data_by_labels[l])
                       for l in data_by_labels}
    centroids = {l: numpy.average(numpy_by_labels[l], axis=0)
                 for l in numpy_by_labels}
    pivot_centroid = centroids[0]
    # Get distances to pivot centroid
    distances_to_pivot = {l: euclidean_distances(numpy.array([centroids[l],
                                                              pivot_centroid]))[0][1]
                          for l in centroids}

    # sort Labels by Distances To Pivot (ldtp)
    ldtp = map(operator.itemgetter(0),
               sorted(distances_to_pivot.items(),
                      key=operator.itemgetter(1)))

    sorted_data_by_cluster = []
    for label in ldtp:
        for item in numpy_by_labels[label]:
            sorted_data_by_cluster.append(item)

    return numpy.array(sorted_data_by_cluster)
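A hedged smoke test for _sort_clustering() above; it also relies on the module's numpy, logging, collections and operator imports being present:

import numpy

data = numpy.random.RandomState(0).rand(100, 3)   # 100 toy vectors
sorted_data = _sort_clustering(data)
print(sorted_data.shape)                           # still (100, 3), rows regrouped by cluster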
def Aglomerative_cl(feat_name):
	features = opening_data_target(feat_name)
	features = scale(features)

	ac = AgglomerativeClustering(n_clusters=4, linkage='average', affinity='cosine') #cosine

	ac_preds = ac.fit_predict(features)

	# getting aglomerative tags
	# need only for evaluation
	#ac_tags = validate_with_mappings(ac_preds, target, features)

	# Aglomerative results
	'''
	print 'Accuracy ', accuracy_score(target, ac_tags)
	print 'Precision ', precision_score(target, ac_tags)
	print 'Recall ', recall_score(target, ac_tags)
	print 'F1 ', f1_score(target, ac_tags)
	'''
	return ac_preds
def main():
    ratings, users, movies = readFromFile()
    global NO_OF_CLUSTERS, NO_OF_USERS

    userDataMatrix = getUserDataMatrix(ratings, users, movies)

    kmeansModel = KMeans(n_clusters=NO_OF_CLUSTERS, init="k-means++")
    agglomerativeClusteringModel = AgglomerativeClustering(n_clusters=NO_OF_CLUSTERS, affinity="euclidean")

    kmeansPredictedClusters = kmeansModel.fit_predict(userDataMatrix)

    aggClusteringPredictedClusters = agglomerativeClusteringModel.fit_predict(userDataMatrix)

    with open("../data/UserClustersKMeans.txt", "w") as KMeansOutFile:
        for i in range(1, NO_OF_USERS):
            KMeansOutFile.write("\t".join([str(i), str(kmeansPredictedClusters[i - 1])]) + "\n")

    with open("../data/UserClustersAgglomerativeClustering.txt", "w") as aggClusOutFile:
        for i in range(1, NO_OF_USERS):
            aggClusOutFile.write("\t".join([str(i), str(aggClusteringPredictedClusters[i - 1])]) + "\n")
Example #29
def agglomerativeClustering(sourceFiles, fileExtension):
    """ Performs agglomerative hierarchical clustering using files with <fileExtension> in the <sourceFiles> directory and return accuracy measure"""
    try:
        accuracy = 0
        # Step 1 - Check the required algorithm to specify the data type to load
        dataFiles = glob.glob("%s/*.%s" % (arguments.sourcedir, arguments.datatype)) # Get the paths of files to load
        dataSamples, dataLabels, loadedClusters = [], [], []
        for dataPoint in dataFiles:
            dataSamples.append([float(x) for x in open(dataPoint).read()[1:-1].split(",")])
            # Also load its cluster
            clusterName, paramNames = loadLabelFromFile(dataPoint.replace(".%s" % arguments.datatype, ".metadata"))
            if not clusterName in loadedClusters:
                loadedClusters.append(clusterName)
            dataLabels.append(loadedClusters.index(clusterName))
        prettyPrint("Successfully retrieved %s instances for clustering" % len(dataSamples))
        # Step 2 - Perform clustering
        clusterer = AgglomerativeClustering(n_clusters=len(loadedClusters))
        predicted = clusterer.fit_predict(numpy.array(dataSamples), dataLabels)
        accuracy = round(metrics.accuracy_score(dataLabels, predicted), 2)

    except Exception as e:
        prettyPrint("Error encountered: %s" % e, "error")

    return accuracy
Example #30
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import ConvexHull
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import Data
df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/USArrests.csv')

# Agglomerative Clustering
cluster = AgglomerativeClustering(n_clusters=5,
                                  affinity='euclidean',
                                  linkage='ward')
cluster.fit_predict(df[['Murder', 'Assault', 'UrbanPop', 'Rape']])

# Plot
plt.figure(figsize=(14, 10), dpi=80)
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=cluster.labels_, cmap='tab10')


# Encircle
def encircle(x, y, ax=None, **kw):
    if not ax: ax = plt.gca()
    p = np.c_[x, y]
    hull = ConvexHull(p)
    poly = plt.Polygon(p[hull.vertices, :], **kw)
    ax.add_patch(poly)


# Draw polygon surrounding vertices
Example #31
        coo_Y = []    # list of y coordinates
        for j in range(len(C[i])):
            coo_X.append(C[i][j][0])
            coo_Y.append(C[i][j][1])
        pl.scatter(coo_X, coo_Y, marker='x', color=colValue[i%len(colValue)], label=i)

    pl.legend(loc='upper right')
    pl.show()
if __name__ == '__main__':
    data=DataS()
    data=np.array(data)
    labels_true=label_init()
    labels_super=label_init()
    #C = AGNES(data, dist_avg, 6)
    c = ['standard', 'periodic', 'increasing', 'decreasing', 'increasing upward', 'decreasing downward']
    color = ['dodgerblue', 'orange', 'green', 'tomato', 'yellow', 'brown']
    pca = PCA(n_components=2)  # reduce to two dimensions with PCA
    newdata = pca.fit_transform(data)
    patches = [mpatches.Patch(color=color[i], label="{:s}".format(c[i])) for i in range(len(color))]
    for i in range(6):
        for j in range(100):
            plt.scatter(newdata[i*100+j][0],newdata[i*100+j][1],c=color[i])
    plt.legend(handles=patches,loc='upper right')
    plt.show()
    ac = AgglomerativeClustering(n_clusters=6, affinity='euclidean', linkage='complete')
    labels_super=ac.fit_predict(data)
    plt.scatter(newdata[:, 0], newdata[:, 1], c=labels_super)
    plt.show()
    print(metrics.adjusted_rand_score(labels_true=labels_true, labels_pred=labels_super))  # ARI score in [-1, 1]; the closer to 1, the better the clustering
    #draw(C)
Example #32
#****************************
# Hierarchical clustering
#****************************

hierarch_res = pd.DataFrame(
    columns=['it_id', 'linkage', 'micro', 'macro', 'silhouette'])

linkages = ['complete', 'average', 'single']

for linky in linkages:
    for i in range(nb_trials):
        aglo = AgglomerativeClustering(n_clusters=n_clusters,
                                       affinity='precomputed',
                                       linkage=linky)
        aglo_preds = aglo.fit_predict(dm)
        m, pred = misc(labels_oh, aglo_preds, True)

        sil = silhouette_score(dm, pred, metric='precomputed')
        micro = precision_score(labels_oh, pred, average='micro')
        macro = precision_score(labels_oh, pred, average='macro')

        hierarch_res = hierarch_res.append({'it_id': i + 1, 'linkage': linky, \
                            'micro': micro, 'macro': macro, 'silhouette': sil},\
                                           ignore_index=True)

hierarch_res.groupby('linkage').mean()
hierarch_res.groupby('linkage').std()

hierarch_res.to_csv(res_folder + '/hierarch_res_categ_encoded.csv')
arr = np.array(images)
arr = arr.reshape(2006, 1024)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

scaler = StandardScaler()
arr = scaler.fit_transform(arr)
arr = normalize(arr)
import pandas as pd
arr = pd.DataFrame(arr * 2550)

from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering().fit(arr)
opt = clustering.fit_predict(arr)
"""
sources = {}
for i in range(2006):
    if opt[i] not in sources:
        sources[opt[i]] = []
        sources[opt[i]].append(files[i])
    else:
        sources[opt[i]].append(files[i])
"""

import shutil
for i in range(0, 2006):
    if opt[i] == 0:
        shutil.copy("C:/Users/ManavChordia/Work/IUCAA/DI/cutouts/" + files[i],
                    "C:/Users/ManavChordia/Work/IUCAA/DI/cutout_0")
Example #34
data = sc_X.fit_transform(data)

# Using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(data, method='ward'))
plt.title('Dendrogram')
plt.xlabel('TimeStamp')
plt.ylabel('Total Expenditure')
plt.show()

# Fitting Hierarchical Clustering to the dataset
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=4,
                             affinity='euclidean',
                             linkage='ward')
y_hc = hc.fit_predict(data)

# Visualising the clusters
plt.scatter(data[y_hc == 0, 0],
            data[y_hc == 0, 1],
            s=100,
            c='red',
            label='Cluster 1')
plt.scatter(data[y_hc == 1, 0],
            data[y_hc == 1, 1],
            s=100,
            c='blue',
            label='Cluster 2')
plt.scatter(data[y_hc == 2, 0],
            data[y_hc == 2, 1],
            s=100,
Example #35
plt.axhline(y=1,color='r',linestyle='--')


# In[94]:


## we have 2 clusters as the line cuts the dendrogram at two points


# In[95]:



from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')  
cluster.fit_predict(cluster_data)


# In[97]:


plt.figure(figsize=(10, 7))  
plt.scatter(cluster_data['SepalWidthCm'], cluster_data['PetalLengthCm'], c=cluster.labels_) 


# ## USING K MEANS ON THE DATASET

# In[65]:


import numpy as np
X = data_drop.values

from sklearn.manifold import TSNE
tsne = TSNE(verbose=1, perplexity=40, n_iter= 4000)
Y = tsne.fit_transform(X)

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
kY = kmeans.fit_predict(X)

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)

ax1.scatter(Y[:,0],Y[:,1],  c=kY, cmap = "jet", edgecolor = "None", alpha=0.35)
ax1.set_title('k-means clustering')

ax2.scatter(Y[:,0],Y[:,1],  c = datas['diagnosis'], cmap = "jet", edgecolor = "None", alpha=0.35)
ax2.set_title('Actual clusters')

from sklearn.cluster import AgglomerativeClustering
aggC = AgglomerativeClustering(n_clusters=2, linkage='ward')
kY = aggC.fit_predict(X)


f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)


ax1.scatter(Y[:,0],Y[:,1],  c=kY, cmap = "jet", edgecolor = "None", alpha=0.35)
ax1.set_title('Hierarchical clustering ')

ax2.scatter(Y[:,0],Y[:,1],  c = datas['diagnosis'], cmap = "jet", edgecolor = "None", alpha=0.35)
ax2.set_title('Actual clusters')
Example #37
tot_word = pd.read_csv('C:\\Users\Salvo\Desktop\IFISC\data\\no_words_per_county.csv', sep=",", low_memory=False,index_col=[0])
print(tot_word)
word = np.asarray(tot_word.iloc[1:-1, 1].to_numpy(), dtype=float)
print(word)

#Plotting a scatter plot for each number of clusters
which_clusters = [3,5,7,9]


#"""
#KS
for i in which_clusters:

    #calling the object
    ks_cluster = AgglomerativeClustering(n_clusters=i, affinity='euclidean', linkage='ward')
    a = ks_cluster.fit_predict(myD)
    print(a)
    # a has, in position i, the number of the cluster that the i-th county belongs to
    # (example, 3 clusters: a = [1,1,1,1,0,0,0,2,1,1])

    # Here we extract how many counties fall in each cluster (example, 3 clusters: bin_count = [0, 3, 6, 1])
    unique, bin_count_temp = np.unique(a, return_counts=True)
    #with concatenate I put a 0 as a first element of bin count
    bin_count = np.concatenate(([0], bin_count_temp))

    print(bin_count, len(bin_count))

    #b are the counties indexes
    b = np.arange(len(a))

    #Here I create a pandas dataframe to easily sort counties, latitude and longitude arrays, following the clusters sorting.
Example #38
    df["Connectivity"] = df["Cellular"] | df["WiFi"]
    df = df.drop(["WiFi", "Cellular", "isInteractive"],
                 axis=1)  # Cause of correlation between columns

    # After the first run, you don't have to compute the distance matrix again. You can read it from the pickle file
    distance = gower.gower_matrix(df)
    with open("distance.pickle", "wb") as f:
        pickle.dump(distance, f)
    #distance = pickle.load( open( "distance.pickle", "rb" ) )
    print("Done with distance!")
    del df

    modelAverage = AgglomerativeClustering(n_clusters=4,
                                           affinity="precomputed",
                                           linkage='average').fit(distance)
    labels = modelAverage.fit_predict(distance)
    silhouette_score = metrics.silhouette_score(distance,
                                                labels,
                                                metric="precomputed")
    print(f"silhouette = {silhouette_score} for average and 4 clusters.")

    modelComplete = AgglomerativeClustering(n_clusters=5,
                                            affinity="precomputed",
                                            linkage='complete').fit(distance)
    labels = modelComplete.fit_predict(distance)
    silhouette_score = metrics.silhouette_score(distance,
                                                labels,
                                                metric="precomputed")
    print(f"silhouette = {silhouette_score} for complete and 5 clusters.")

    # Save models to keep the same numbering on clusters ...
def get_tasks():
    pca = PCA()
    data = request.json
    dataset = pd.read_csv(data['filename'])
    # dataset =  pd.read_csv('D:/sem1/VDS/symptom_project/cs529-project/data/output/multi-dim-5-timepoints-0_t.csv')
    symp = data['symptoms']

    symp1 = deepcopy(symp)

    # for el in symp1:
    # 	symp.append(el+'1')

    # for el in symp1:
    # 	symp.append(el+'2')

    # for el in symp1:
    # 	symp.append(el+'3')

    # for el in symp1:
    # 	symp.append(el+'4')

    z = dataset[symp]
    patientId = data['patientId']
    x = z.values
    patients = dataset.iloc[:, list(range(28))]

    # patients = dataset.iloc[:, list(range(59))]
    # patients = dataset.iloc[:, list(range(87))]
    # patients = dataset.iloc[:, list(range(115))]
    # patients = dataset.iloc[:, list(range(142))]

    print(patients)
    p = patients[symp]
    x_pca = pca.fit_transform(p)
    x_pca = pd.DataFrame(x_pca)
    if len(symp) == 1:
        x_pca['PC2'] = [[0]] * len(patients)
    x_pca = x_pca.iloc[:, list(range(2))]
    x_pca.columns = ['PC1', 'PC2']
    dataset['PC1'] = x_pca['PC1']
    dataset['PC2'] = x_pca['PC2']

    x_pca_pc1 = x_pca['PC1'].to_numpy()
    x_pca_pc1 = (x_pca_pc1 - x_pca_pc1.mean()) / np.std(x_pca_pc1)

    if len(symp) != 1:
        x_pca_pc2 = x_pca['PC2'].to_numpy()
        x_pca_pc2 = (x_pca_pc2 - x_pca_pc2.mean()) / np.std(x_pca_pc2)
        dataset['PC2'] = x_pca_pc2
    dataset['PC1'] = x_pca_pc1

    print(x_pca_pc1)

    if (len(patientId) > 0):
        l = []
        d = {}
        dp = {}
        for i, row in dataset.iterrows():
            l.append([row['PC1'], row['PC2']])
            d[int(row['patientId'])] = i
            dp[i] = int(row['patientId'])

        crt_patient = d[int(patientId)]

        distance_list = []
        for i, elem in enumerate(l):
            distance_list.append([
                i,
                math.sqrt((elem[0] - l[crt_patient][0])**2 +
                          (elem[1] - l[crt_patient][1])**2)
            ])

        distance_list.sort(key=lambda x: x[1])

        return jsonify([
            dp[distance_list[1][0]], dp[distance_list[2][0]],
            dp[distance_list[3][0]]
        ])

    h = AgglomerativeClustering(n_clusters=2,
                                affinity='euclidean',
                                linkage='ward')
    y = h.fit_predict(x)
    dataset['cluster'] = y

    dataset = dataset.transpose()
    dataset = dataset.to_dict()

    sum0 = -1
    sum1 = -1

    for index, row in dataset.items():
        if row['cluster'] == 0:
            sum0 = row['sum']
        if row['cluster'] == 1 and sum0 != -1:
            sum1 = row['sum']
            break

    while sum1 == sum0:
        sum0 = -1
        for index, row in dataset.items():
            if index > 0:
                if row['cluster'] == 0:
                    sum0 = row['sum']
                    break
    if sum0 > sum1:
        for index, row in dataset.items():
            if int(row['cluster']) == 0:
                row['cluster'] = 1
            else:
                row['cluster'] = 0

    return jsonify(dataset)
Example #40
    index=False)

# K-Means elbow method: compute the sum of squared errors (SSE) for different values of K
import matplotlib.pyplot as plt

sse = []
for k in range(1, 204):
    # k-means algorithm
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(train_x)
    # compute inertia, the within-cluster sum of squared errors
    sse.append(kmeans.inertia_)
x = range(1, 204)
plt.xlabel('K')
plt.ylabel('SSE')
plt.plot(x, sse, 'o-')
plt.show()

# use hierarchical clustering
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt

model = AgglomerativeClustering(linkage='ward', n_clusters=3)
y = model.fit_predict(train_x)
print(y)

linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
x = [station['loc']['coordinates'][0] for station in all_stations]
y = [station['loc']['coordinates'][1] for station in all_stations]
X = np.array((x, y))
X = X.T

try:
    mongo_bulk = mongo_db.stations.initialize_ordered_bulk_op()
    mongo_bulk.find({}).update({'$set': {'clusters': []}})

    for n_clusters in reversed(range_clusters):

        model = AgglomerativeClustering(linkage='ward',
                                        connectivity=None,
                                        n_clusters=n_clusters)
        labels = model.fit_predict(X)

        for label in range(len(np.unique(labels))):
            cluster_assign = labels == label
            cluster = X[cluster_assign]

            average = np.average(cluster, 0)
            middle = cluster[KDTree(cluster).query(average)[1]]

            indexes = np.where((X == middle).all(axis=1))[0]
            if len(indexes) > 1:
                stations = list(
                    mongo_db.stations.find(
                        {
                            '_id': {
                                '$in':
Example #42
data = customer_data.iloc[:, 3:5].values

# In[10]:

# create the dendrograms for our dataset.
import scipy.cluster.hierarchy as shc  # import scipy for dendrograms

plt.figure(figsize=(10, 7))
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

# In[11]:

#group the data points into these k(5) clusters

# In[12]:

from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5,
                                  affinity='euclidean',
                                  linkage='ward')
cluster.fit_predict(data)

# In[13]:

plt.figure(figsize=(10, 7))
plt.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')

# In[ ]:
# dendrogram method - finding the optimal number of clusters
import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(x, method='ward'))

plt.title('dendrogram')
plt.xlabel('customers')
plt.ylabel('distance')
plt.show()

#fitting hierarchical clustering
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters=5,
                             affinity='euclidean',
                             linkage='ward')
yhc = hc.fit_predict(x)

# visualising clusters
plt.scatter(x[yhc == 0, 0], x[yhc == 0, 1], s=100, c='red', label='cluster 1')
plt.scatter(x[yhc == 1, 0], x[yhc == 1, 1], s=100, c='blue', label='cluster 2')
plt.scatter(x[yhc == 2, 0],
            x[yhc == 2, 1],
            s=100,
            c='green',
            label='cluster 3')
plt.scatter(x[yhc == 3, 0],
            x[yhc == 3, 1],
            s=100,
            c='yellow',
            label='cluster 4')
plt.scatter(x[yhc == 4, 0],
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/content/gdrive/My Drive/Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values
y = dataset.iloc[:, 3].values



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters = 5, affinity = 'euclidean', linkage = 'ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

Example #45
def silhouette(X, alg = "kmeans", max_dec = 5):

    assert alg in ['agglomerative', 'kmeans'], "alg must be kmeans or agglomerative"

    x_plot = []
    silhuette_plot = []
    sse_plot = []

    max_silhouette = -1

    #x_plot.append(1)
    #silhuette_plot.append(max_silhouette)   

    num_dec = 0
    n_clusters = 1

    fig, ax1 = plt.subplots(1)
    ax1.set_title(("Silhouette score for each cluster number"),fontsize=16, fontweight='bold')
    fig.set_size_inches(18, 7)
    plt.grid(True, which='major', color='#666666', linestyle='-')
    plt.minorticks_on()
    plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)

    hl, = plt.plot([], [])
    best_labels = []

    while num_dec <= max_dec:
        n_clusters+=1
        if alg == 'agglomerative':
            clusterer = AgglomerativeClustering(n_clusters=n_clusters)
        elif alg == 'kmeans':
            clusterer = KMeans(n_clusters=n_clusters, random_state=42)

        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        x_plot.append(n_clusters)
        silhuette_plot.append(silhouette_avg)        

        if silhouette_avg < max_silhouette:
            num_dec += 1
        else:
            best_labels = cluster_labels
            max_silhouette = silhouette_avg
            num_dec = 0
    
    if max_silhouette < 0.2:
        best_labels = [0]*len(X)   

    #silhuette_plot = silhuette_plot/max(silhuette_plot)
    ax1.plot(x_plot, silhuette_plot, label='silhouette')
    #sse_plot = sse_plot/max(sse_plot)
    #ax1.plot(x_plot , sse_plot, label='inertia')
    ax1.set_xticks([i+1 for i in range(n_clusters)])
    ax1.legend()
    ax1.set_xlabel("Number of clusters")
    ax1.set_ylabel("Silhouette score")
    plt.show()

    print(f"Best cluster number is {len(set(best_labels))}")

    return best_labels
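A hedged driver for the silhouette() helper above, assuming its matplotlib, KMeans, AgglomerativeClustering and silhouette_score imports are present:

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=7)
labels = silhouette(X, alg="agglomerative", max_dec=3)
print("clusters found:", len(set(labels)))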
Example #46
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from matplotlib import  pyplot as plt

X,Y = make_blobs(n_samples=2000, n_features=2, centers=8, cluster_std=2.0)
plt.scatter(X[:,0], X[:,1], s=4)
plt.title('Generated Data')
plt.show()

Z = AgglomerativeClustering(n_clusters=8, linkage='complete')
P = Z.fit_predict(X)
colormap = np.array(['r', 'g', 'b', 'k', 'y', 'c', 'm', 'orange'])
plt.scatter(X[:,0], X[:,1], s=4, c=colormap[P])
plt.title('Clustering results')
plt.show()
Example #47
def HC_predict(data, kclusters):
    """Fit the HC on data"""
    hc = AgglomerativeClustering(n_clusters=kclusters,
                                 affinity='euclidean',
                                 linkage='ward')
    return hc.fit_predict(data)
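A hedged usage sketch for HC_predict() above, on scikit-learn versions that still accept the affinity argument:

from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=150, centers=3, random_state=0)
print(HC_predict(data, kclusters=3)[:10])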
Example #48
data = pd.DataFrame(dictionary)

plt.scatter(x1, y1)
plt.scatter(x2, y2)
plt.scatter(x3, y3)
plt.show()

# %% dendogram
from scipy.cluster.hierarchy import linkage, dendrogram

merg = linkage(data, method="ward")
dendrogram(merg, leaf_rotation=90)
plt.xlabel("data points")
plt.ylabel("euclidean distance")
plt.show()

# %% HC
from sklearn.cluster import AgglomerativeClustering

hiyerartical_cluster = AgglomerativeClustering(n_clusters=3,
                                               affinity="euclidean",
                                               linkage="ward")
cluster = hiyerartical_cluster.fit_predict(data)

data["label"] = cluster

plt.scatter(data.x[data.label == 0], data.y[data.label == 0], color="red")
plt.scatter(data.x[data.label == 1], data.y[data.label == 1], color="green")
plt.scatter(data.x[data.label == 2], data.y[data.label == 2], color="blue")
plt.show()
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as shc
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering

customer_data = pd.read_csv('shopping_data.csv')
print(customer_data.shape)
dt = np.array(customer_data)
# print(dt[0:10,:])
print(customer_data.head())
print(customer_data.iloc[0:10, 0:5].values)

data = customer_data.iloc[0:15, 3:5].values
print('--Linkage Matrix-only 10 rows------')
link = shc.linkage(data, method='ward')
print(link[0:10, ])
plt.figure(figsize=(6, 4))
plt.title("Customer Dendograms")
dend = shc.dendrogram(shc.linkage(data, method='ward'))

cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
clt = cluster.fit_predict(data)
print('--Clusters formed by program-----')
print(cluster.fit_predict(data))
plt.figure(figsize=(6, 4))
plt.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')

plt.show()
Example #50
        with open(file, "rb") as f:
            pdf = pdftotext.PDF(f)
            score_sheet = pdf[2]
        SS.append(" ".join(
            jieba.cut(re.sub('\W|\d|[a-zA-Z]', '', score_sheet),
                      cut_all=False)))

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(SS)
    SS_sim_mat = (tfidf * tfidf.T).A
    linkage_matrix = ward(SS_sim_mat)

    cluster = AgglomerativeClustering(n_clusters=N_Group,
                                      affinity='euclidean',
                                      linkage='ward')
    cluster.fit_predict(SS_sim_mat)

    os.mkdir(f'{ODIR}/manual')
    for i in range(N_Group):
        os.mkdir(f'{ODIR}/manual/G{i}')

    for i, file in enumerate(manual_files):
        copyfile(
            file, f'{ODIR}/manual/G' + str(cluster.labels_[i]) + '/' +
            file.split('/')[-1])

    # create template excel files to start manual work
    for i in range(N_Group):
        temp = [
            f for j, f in enumerate(manual_files) if cluster.labels_[j] == i
        ]
Example #51
plt.axhline(y=8, c='black', lw=2, linestyle='dashed')

#from scipy.cluster.hierarchy import fcluster
#d=shc.linkage(X_principal, method ='ward')

ac2 = AgglomerativeClustering(n_clusters=2, compute_full_tree=True)

# Visualizing the clustering
plt.figure(figsize=(6, 6))

color = ['b', 'r']

for i in range(X_new.shape[0]):
    plt.scatter(X_principal[i, 0],
                X_principal[i, 1],
                c=color[ac2.fit_predict(X_new)[i]],
                cmap='rainbow')
'''    
for i in range (35):
    
    plt.annotate(z[i],(X_principal[i, 0], X_principal[i, 1]))
'''
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.axis('off')
labels = ac2.fit_predict(X_new)

DX = pd.DataFrame(labels, columns=["label"])
df_concat = pd.concat([dfnew, DX], axis=1)

print(("total samples is: ") + str(df_concat.shape[0]))
# for i in clusted_list_title:
#     print("Cluser: "******"Quality: ", len(clusted_list_title.get(i)))
#     for title in clusted_list_title.get(i):
#         # print(title)
#         dem+= 1
# print("Title Quality: ", dem)

print("Agglomerative")
# https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/
# cluster5 = AgglomerativeClustering(n_clusters=12, affinity='precomputed', linkage='complete')
cluster5 = AgglomerativeClustering(n_clusters=None,
                                   affinity='precomputed',
                                   linkage='complete',
                                   distance_threshold=0.88)
# clusted = model.fit_predict(linkage_matrix)
cluster5.fit_predict(dist2)
clus_list = []
for i in range(0, len(cluster5.labels_)):
    clus_dict = {
        'id': list_id[i],
        'old_cluster': list_old_cluster[i],
        'new_cluster': cluster5.labels_.item(i),
        'title': list_title[i]
    }
    clus_list.append(clus_dict)
clus_list.sort(key=lambda item: item.get("new_cluster"))
for cl in clus_list:
    print(cl)

# for i in range(0, 209):
#     print(i, ": ", linkage_matrix[i])
Example #53
import scipy.cluster.hierarchy as shc
plt.figure(figsize=(10, 7))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))

#%%%%
# The x-axis contains the samples and the y-axis represents the distance between them. The vertical line with the maximum distance is the blue one, so we can set a threshold of 6 and cut the dendrogram:
    
plt.figure(figsize=(10, 7))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(data_scaled, method='ward'))
plt.axhline(y=6, color='r', linestyle='--')


#We have two clusters as this line cuts the dendrogram at two points. Let’s now apply hierarchical clustering for 2 clusters:

from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')  
cluster.fit_predict(data_scaled)

#%%%
#We can see the values of 0s and 1s in the output since we defined 2 clusters. 0 represents the points that belong to the first cluster and 1 represents points in the second cluster. Let’s now visualize the two clusters:

plt.figure(figsize=(10, 7))  
plt.scatter(data_scaled['Milk'], data_scaled['Grocery'], c=cluster.labels_) 


#%%%%
#selecting number of clusters
#https://www.analyticsvidhya.com/wp-content/uploads/2016/11/clustering-7.png
#https://www.analyticsvidhya.com/blog/2016/11/an-introduction-to-clustering-and-different-methods-of-clustering/
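
# A minimal sketch of one common way to choose the number of clusters
# programmatically -- scan a range of candidate values and keep the one with
# the highest silhouette score (illustrative only; the articles linked above
# discuss this and other criteria):
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

sil_scores = {}
for k in range(2, 10):
    labels_k = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(data_scaled)
    sil_scores[k] = silhouette_score(data_scaled, labels_k)
best_k = max(sil_scores, key=sil_scores.get)
print('Best k by silhouette: %d (score %.3f)' % (best_k, sil_scores[best_k]))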
Exemple #54
0
plt.show()

fig = plt.figure(figsize=(8, 8), facecolor="white")
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])
row_dendr = dendrogram(row_clusters, orientation='left')

df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')

axd.set_xticks([])
axd.set_yticks([])
for i in axd.spines.values():
    i.set_visible(False)
fig.colorbar(cax)
axm.set_xticklabels([''] + list(df_rowclust.columns))
axm.set_yticklabels([''] + list(df_rowclust.index))
plt.show()

ac = AgglomerativeClustering(n_clusters=3,
                             affinity='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('Cluster Label: %s' % labels)

ac = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='complete')
labels = ac.fit_predict(X)
print('Cluster Label: %s' % labels)
Exemple #55
0
# -*- coding: utf-8 -*-
from numpy import *
from matplotlib.pyplot import *
from sklearn.cluster import AgglomerativeClustering

# clustering based on cosine similarity

random.seed(0)

N = 300

x = random.uniform(-1, 1, N)
y = random.uniform(-1, 1, N)

xlim(-1, 1)
ylim(-1, 1)
scatter(x, y, s=50, cmap=cm.rainbow)
show()

cls = AgglomerativeClustering(n_clusters = 10, affinity="cosine", linkage="average")
labels = cls.fit_predict(c_[x, y])

xlim(-1, 1)
ylim(-1, 1)
scatter(x, y, s=50, c=labels, cmap=cm.rainbow)
show()
def clustering_tfidf(indir, level=None):

    datadir = indir + '/level-' + level

    lab_to_idx, idx_to_lab = _load_vocab(datadir, ut.file_names['vocab'])
    _, behrs = _load_data(datadir, ut.file_names['behr'])

    terms = []
    for vec in behrs.values():
        terms.extend(vec)

    count = 0
    list_count = {}
    for idx, lab in idx_to_lab.items():
        co = terms.count(str(idx))
        list_count[lab] = co
        if co > 1:
            count += 1
    print("Number of repeated terms: {0} -- Terms with one occurrence: {1}\n".format(count, len(lab_to_idx)-count))

    print('Most frequent terms (TF>20)')
    x = []
    y = []
    for lab, co in list_count.items():
        if co > 20:
            x.append(lab)
            y.append(co)
            print('%s, %d' % (lab, co))
        else:
            x.append('TF<20')
            y.append(co)

    plt.figure(figsize=(30, 20))
    plt.bar(x, y)
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(datadir, 'term20-distribution.png'))

    plt.figure(figsize=(20, 10))
    plt.bar(range(len(list_count.values())), list(list_count.values()))
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(datadir, 'term-distribution.png'))

    print('\n')

    # TF-IDF
    print('Computing TF-IDF matrix...')
    doc_list = list(map(lambda x: ' '.join(x), list(behrs.values())))
    id_subj = [id_lab for id_lab in behrs]

    vectorizer = TfidfVectorizer(norm='l2')
    tfidf_mtx = vectorizer.fit_transform(doc_list)

    print('Performing SVD on the TF-IDF matrix...')
    reducer = TruncatedSVD(n_components=ut.n_dim, random_state=123)
    encoded_dt = reducer.fit_transform(tfidf_mtx)

    # Internal clustering validation
    rf = RandomForestClassifier(criterion='entropy', random_state=42)
    best = 0
    for n_clu in range(ut.min_cl, ut.max_cl):
        hclu = AgglomerativeClustering(n_clusters=n_clu)
        lab_cl = hclu.fit_predict(encoded_dt)
        tmp_silh = silhouette_score(encoded_dt, lab_cl)
        print('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, tmp_silh))
        enc_tr, enc_ts, lab_tr, lab_ts = train_test_split(encoded_dt, lab_cl,
                                                          stratify=lab_cl,
                                                          test_size=0.25,
                                                          random_state=42)
        rf.fit(enc_tr, lab_tr)
        rf_predict = rf.predict(enc_ts)
        tmp_mcc = matthews_corrcoef(lab_ts, rf_predict)
        print('    MCC RF classifier: %.2f' % tmp_mcc)
        mu = np.mean([tmp_mcc, tmp_silh])
        if mu > best:
            best_mcc = tmp_mcc
            best_silh = tmp_silh
            best_lab_cl = lab_cl
            best_n_clu = n_clu
            best = mu

    print('\n')
    print("MCC: %.4f -- silhouette score: %.4f -- Number of clusters: %d\n" % (best_mcc, best_silh, best_n_clu))

    num_count = np.unique(best_lab_cl, return_counts=True)[1]
    for idx, nc in enumerate(num_count):
        print("Cluster {0} -- Numerosity {1}".format(idx, nc))
    print('\n')

    colormap = [c for c in ut.col_dict if c not in ut.c_out]
    colormap_rid = [colormap[cl] for cl in sorted(list(set(best_lab_cl)))]
    colors_en = [colormap_rid[v] for v in best_lab_cl]
    umap_mtx = umap.UMAP(random_state=42).fit_transform(encoded_dt)
    single_plot(datadir, umap_mtx, best_lab_cl, colors_en)

    linked = linkage(encoded_dt, 'ward')
    # Color mapping
    dflt_col = "#808080"  # Unclustered gray
    # * rows in Z correspond to "inverted U" links that connect clusters
    # * rows are ordered by increasing distance
    # * if the colors of the connected clusters match, use that color for link
    link_cols = {}
    for i, i12 in enumerate(linked[:, :2].astype(int)):
        c1, c2 = (link_cols[x] if x > len(linked) else colormap_rid[best_lab_cl[x]]
                  for x in i12)
        link_cols[i + 1 + len(linked)] = c1 if c1 == c2 else dflt_col

    plt.figure(figsize=(20, 10))
    # Dendrogram
    dendrogram(Z=linked, labels=best_lab_cl, color_threshold=None,
                   leaf_font_size=5, leaf_rotation=0, link_color_func=lambda x: link_cols[x])
    plt.savefig(os.path.join(datadir, 'dendrogram-tfidf.png'))

    with open(os.path.join(indir, 'person-demographics.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        dem = {r[0]: r[1::] for r in rd}

    df_ar = []
    for id_name, coord, cl_lab in zip(id_subj, umap_mtx, best_lab_cl):
        df_ar.append([id_name, coord[0], coord[1], cl_lab, age(dem[id_name][0]),
                          dem[id_name][2], dem[id_name][3]])
    df_ar = np.array(df_ar)
    df = pd.DataFrame(df_ar, columns=['id_subj', 'x', 'y', 'cluster', 'age', 'sex', 'n_enc'])
    df['x'] = df['x'].astype('float64')
    df['y'] = df['y'].astype('float64')
    df['age'] = df['age'].astype('float64')
    df['n_enc'] = df['n_enc'].astype('int')

    p_clu = {}
    with open(os.path.join(datadir, 'person-cluster.txt'), 'w') as f:
        wr = csv.writer(f)
        wr.writerow(['ID_LAB', 'CLUSTER'])
        for el in df_ar:
            wr.writerow([el[0], el[3]])
            p_clu[el[0]] = el[3]


    source = ColumnDataSource(dict(
        x=df['x'].tolist(),
        y=df['y'].tolist(),
        id_subj=df['id_subj'].tolist(),
        cluster=[str(i) for i in df['cluster'].tolist()],
        age=df['age'].tolist(),
        sex=df['sex'].tolist(),
        n_enc=df['n_enc'].tolist()))

    labels = [str(i) for i in df['cluster'].tolist()]
    cmap = CategoricalColorMapper(factors=sorted(pd.unique(labels)), palette=colormap_rid)

    TOOLTIPS = [('id_subj', '@id_subj'),
                ('cluster', '@cluster'),
                ('sex', '@sex'),
                ('age', '@age'),
                ('n_enc', '@n_enc')]

    plotTools = 'box_zoom, wheel_zoom, pan,  crosshair, reset, save'

    output_file(filename=os.path.join(datadir, 'tfidf-plot-interactive.html'), mode='inline')
    p = figure(plot_width=800, plot_height=800, tools=plotTools)
    p.add_tools(HoverTool(tooltips=TOOLTIPS))
    p.circle('x', 'y', legend='cluster', source=source, color={"field": 'cluster', "transform": cmap})
    save(p)

    freq_term(best_lab_cl, idx_to_lab, behrs, p_clu)