Code Example #1
def demoFourGs():
    '''
    Demonstrate the performance of LCC
    on points drawn from four Gaussians
    '''           
    s=(640,480)
    dat = genNormalClusters(N=100, size=s)
    cList = ['red', 'blue','green','yellow']
    img_truth = plotClusts(dat[0], dat[1], size=s, 
                           colors=[cList[i] for i in dat[1]], window=None)
    
    #generate normal hierarchical clustering of euclidean data points
    print "Generating Hierarchical Clustering on Raw Data"
    Z2 = spc.ward(scipy.array(dat[0]))
    clusts2 = spc.fcluster(Z2, 4, criterion="maxclust")
    img_HC = plotClusts(dat[0], clusts2, size=s, 
                           colors=[cList[i-1] for i in clusts2], window=None)
    
    #generate LCC clustering
    print "Generating LCC Clustering"
    (clusts, _,_,_) = pf.LatentConfigurationClustering(dat[0], pt_dist, 4, numtrees=27)
    img_LCC = plotClusts(dat[0], clusts, size=s, 
                           colors=[cList[i-1] for i in clusts], window=None)
    
    im = pv.ImageMontage([img_truth, img_LCC, img_HC], layout=(1,3), gutter=3,
                          tileSize=(320,240), labels=None )
    im.show(window="Truth vs. LCC vs. HC")
Code Example #2
def test_scikit_vs_scipy():
    """Test scikit ward with full connectivity (i.e. unstructured) vs scipy
    """
    from scipy.sparse import lil_matrix

    n, p, k = 10, 5, 3
    rnd = np.random.RandomState(0)

    connectivity = lil_matrix(np.ones((n, n)))
    for i in range(5):
        X = 0.1 * rnd.normal(size=(n, p))
        X -= 4 * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out = hierarchy.ward(X)

        children_ = out[:, :2].astype(np.int)
        children, _, n_leaves, _ = ward_tree(X, connectivity)

        cut = _hc_cut(k, children, n_leaves)
        cut_ = _hc_cut(k, children_, n_leaves)
        assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    assert_raises(ValueError, _hc_cut, n_leaves + 1, children, n_leaves)
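For readers less familiar with the scipy side of the comparison above, here is a minimal standalone sketch (synthetic data, not part of the test) of the linkage matrix that hierarchy.ward returns: one row per merge, with columns [child_a, child_b, merge_distance, cluster_size], which is why the test reads the children from out[:, :2].

import numpy as np
from scipy.cluster import hierarchy

X = np.random.RandomState(0).normal(size=(10, 5))
Z = hierarchy.ward(X)                    # shape (n_samples - 1, 4)
children = Z[:, :2].astype(int)          # indices of the two clusters merged at each step
heights = Z[:, 2]                        # Ward merge distances (non-decreasing)
sizes = Z[:, 3].astype(int)              # number of original samples in each merged cluster
print(children.shape, sizes[-1])         # (9, 2) 10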
Code Example #3
File: agglomerative.py  Project: sharadmv/trees
def make_tree(X, C, method='single'):
    if method == 'single':
        tree = to_tree(single(C))
    elif method == 'ward':
        tree = to_tree(ward(X))
    elif method == 'average':
        tree = to_tree(average(C))
    return Tree(root=construct_node(tree))
Code Example #4
def plotHierarchichalClusterGraph(tf_idf_matrix, headlines_utf):
    dist = 1 - cosine_similarity(tf_idf_matrix)
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20)) # set size
    dendrogram(linkage_matrix, orientation="right", labels=headlines_utf);

    plt.tick_params(axis= 'x', which='both', bottom='off', top='off', labelbottom='off')
    plt.tight_layout()
    plt.savefig('../plots/hierachichal_clusters.png', dpi=200) 
Code Example #5
File: test_dendrogram.py  Project: biocore/gneiss
    def setUp(self):
        np.random.seed(0)
        x = np.random.rand(10)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        ids = np.arange(len(x)).astype(np.str)
        self.tree = TreeNode.from_linkage_matrix(lm, ids)

        # initialize tree with branch length and named internal nodes
        for i, n in enumerate(self.tree.postorder(include_self=True)):
            n.length = 1
            if not n.is_tip():
                n.name = "y%d" % i
Code Example #6
File: mtextcluster_fun.py  Project: tuling56/Python
def hierarchyCluster(dist,titles):
    linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
    fig, ax = plt.subplots(figsize=(15, 20)) # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles);

    plt.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='major',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='on')

    plt.tight_layout() #show plot with tight layout
    plt.show()
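A caveat on the pattern used in the two examples above (a note added here, not part of either project): scipy's ward() only treats its input as precomputed distances when it is a 1-D condensed vector; a square 2-D array such as 1 - cosine_similarity(...) is interpreted as an observation matrix and re-measured with Euclidean distance. A minimal sketch, assuming synthetic data, of the condensed-distance alternative:

import numpy as np
from scipy.cluster.hierarchy import ward
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity

docs = np.random.RandomState(0).rand(8, 30)   # stand-in for a tf-idf matrix
dist = 1 - cosine_similarity(docs)            # square, symmetric cosine-distance matrix
np.fill_diagonal(dist, 0.0)                   # squareform expects an exactly zero diagonal
condensed = squareform(dist, checks=False)    # 1-D vector of the n*(n-1)/2 pairwise distances
linkage_matrix = ward(condensed)              # Ward linkage on the precomputed distances
print(linkage_matrix.shape)                   # (7, 4)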
Code Example #7
File: clustering.py  Project: jknox13/cortical_paper
def _ward_cluster(X):
    """Clusters 1-corr using Ward distance

    Parameters
    ----------
    X
    Returns
    -------
    """
    # pairwise (1-corr) of zscores
    D = pdist( X, metric="correlation" )

    # return top branch split using ward linkage
    return fcluster( ward(D), 2, criterion="maxclust" )
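A short usage sketch for the helper above, with made-up data; it assumes _ward_cluster and the module-level imports of pdist, ward and fcluster from the original file:

import numpy as np

rng = np.random.RandomState(0)
zscores = np.vstack([rng.normal(0.0, 1.0, (20, 50)),
                     rng.normal(3.0, 1.0, (20, 50))])
labels = _ward_cluster(zscores)   # one label per row, taken from the top branch split
print(np.unique(labels))          # two clusters, labelled 1 and 2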
Code Example #8
File: test_dendrogram.py  Project: biocore/gneiss
    def setUp(self):
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)))
        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))
        self.tree = SquareDendrogram.from_tree(t)

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3
Code Example #9
File: AJTokenizer.py  Project: adisorn711/comp6237cw2
    def hierachical_clustering(self):
        linkage_matrix = ward(self.__dist_matrix) #define the linkage_matrix using ward clustering pre-computed distances

        fig, ax = plt.subplots(figsize=(15, 9)) # set size
        ax = dendrogram(linkage_matrix, orientation="right", labels=titles);

        plt.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')

        fig.set_tight_layout(True) #show plot with tight layout
        plt.show()
Code Example #10
File: test_dendrogram.py  Project: biocore/gneiss
    def test_cache_ntips(self):
        dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                          lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        ids = np.arange(4).astype(np.str)
        t = mock.from_linkage_matrix(lm, ids)

        t._cache_ntips()

        self.assertEquals(t.leafcount, 4)
        self.assertEquals(t.children[0].leafcount, 2)
        self.assertEquals(t.children[1].leafcount, 2)
        self.assertEquals(t.children[0].children[0].leafcount, 1)
        self.assertEquals(t.children[0].children[1].leafcount, 1)
        self.assertEquals(t.children[1].children[0].leafcount, 1)
        self.assertEquals(t.children[1].children[1].leafcount, 1)
Code Example #11
def knn(df, axis=None, labels=None):
    dist = 1 - cosine_similarity(df.values)
    # define the linkage_matrix using ward clustering pre-computed distances
    linkage_matrix = ward(dist)

    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=labels)

    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')

    plt.tight_layout()
Code Example #12
 def find_clusters(self, features):
     ''' Returns the clusters and their centroids.'''
     # 1. Cluster the data.
     totalClusters = int(round(features.shape[0] / 2))
     distance = 1 - pairwise_distances(features, metric = "cosine")
     # Ward minimizes the sum of squared differences within all clusters.
     # It is a variance-minimizing approach, which is similar to the k-means objective function.
     linkage_matrix = ward(distance)
     clusters = fcluster(linkage_matrix, totalClusters, criterion = 'maxclust')
     print "Number of clusters:", totalClusters
     
     # 2. Find the centroid for each cluster.
     centroid = np.empty([totalClusters, features.shape[1]])
     for i in range(1, totalClusters + 1):
         nCluster = np.where(clusters == i)
         centroid[i-1,:] = np.mean(features[nCluster], axis = 0)
     return (clusters, centroid)
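Two details worth flagging in the method above (observations about the snippet, not changes to it): pairwise_distances(features, metric="cosine") already returns cosine distances, so 1 - pairwise_distances(...) is a similarity matrix, and ward() treats that square array as an observation matrix rather than precomputed distances. The centroid loop also runs from 1 because fcluster(..., criterion='maxclust') numbers clusters starting at 1, not 0. A tiny illustration of that labelling, with assumed data:

import numpy as np
from scipy.cluster.hierarchy import ward, fcluster

features = np.random.RandomState(0).rand(12, 4)
Z = ward(features)                              # observation matrix -> Euclidean Ward linkage
labels = fcluster(Z, 3, criterion='maxclust')   # flat labels, numbered starting at 1
print(labels.min(), labels.max())               # 1 and (at most) 3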
Code Example #13
    def create_hierarchy(self, sim_matrix):
        linkage_matrix = ward(sim_matrix)
        fig, ax = plt.subplots(figsize=(15, 20)) # set size
        ax = dendrogram(linkage_matrix, orientation="right", labels=self.titles);

        plt.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')

        plt.tight_layout() #show plot with tight layout

        #uncomment below to save figure
        plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters
        return
Code Example #14
File: LSA_code.py  Project: dizcology/cogitatio_2
def lsa_dendrogram(lessonpath):
    # document-term matrix and document indices
    dtm, docindex, lessonname = dtm_matrix(lessonpath)

    # reconstructed dtm matrix using LSA and a reduced subspace of dimension 3
    dtm2 = LSA_dtm(dtm, 3)

    # distance metric based on cosine similarity
    dist = 1 - cosine_similarity(dtm)

    dist = np.round(dist, 10)

    # linkage matrix
    linkage_matrix = ward(dist)

    # dendrogram
    show(dendrogram(linkage_matrix, orientation="right", labels=docindex))
Code Example #15
File: ward.py  Project: fahadsultan/datalib
	def get_clusters(self, data, features=None, text_features=[], n_clusters=8, centroid_features=10, random_seeds=True, 
		weights=[]):

		"""
		Applies agglomerative hierarchical clustering using Ward's linkage

		Parameters
		----------
		data : Pandas DataFrame
			Data on which to apply clustering
		features : list, optional, default : all columns used as features
			Subset of columns in the data frame to be used as features
		text_features : list, optional, default : None
			List of features that are of type text. These are then vectorized using
			TfidfVectorizer.
		n_clusters : int, optional, default: 8
			The number of clusters to form as well as the number of centroids to generate.
		centroid_features : int, optional, default: 10
			The number of most-important-features to return against each cluster centroid
		random_seeds : boolean, optional, default: False
			If False, uses clusters from kernel density estimation followed by thresholding
			as initial seeds. The number of clusters is also determined by results of kde and
			thus n_clusters parameter is ignored. 

		Returns
		-------
		result : tuple (labels, centroid_features)
			labels : 
				cluster numbers against each row of the data passed
			centroids : dictionary
				map of most important features of each cluster 
		"""

		X = self.encode_features(data, features, text_features)

		ipshell()

		dist = 1 - cosine_similarity(X)

		self.linkage_matrix = ward(dist)

		return (km.labels_, centroids)
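As excerpted, the method above ends by returning km.labels_ and centroids, neither of which is defined in the snippet (and ipshell() drops into an interactive shell). A hedged sketch of how the Ward linkage it builds could yield the documented (labels, centroids) result; the helper name and the use of fcluster are assumptions for illustration, not the datalib implementation:

import numpy as np
from scipy.cluster.hierarchy import ward, fcluster
from sklearn.metrics.pairwise import cosine_similarity

def ward_labels_and_centroids(X, n_clusters=8):
    dist = 1 - cosine_similarity(X)
    linkage_matrix = ward(dist)                              # same linkage step as get_clusters above
    labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
    centroids = {c: X[labels == c].mean(axis=0)              # per-cluster mean feature vector
                 for c in np.unique(labels)}
    return labels, centroids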
Code Example #16
File: test_heatmap.py  Project: biocore/gneiss
    def setUp(self):
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)),
                                  index=['0', '1', '2', '3', '4'],
                                  columns=['0', '1', '2', '3', '4'])

        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))
        self.t = SquareDendrogram.from_tree(t)
        self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                            index=['0', '1', '2', '3', '4'])
        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand()*3

        self.highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'],
                                        'y6': ['#0000FF', '#F0000F']}).T
Code Example #17
def cluster_ndarray(
    profiles_arr,
    output_prefix="clustered",
    output_lists=False,
    threshold=25,
    criterion="maxclust",
    min_num_images=50,
):
    """
    cluster_ndarray clusters images based on their radial profiles

    Parameters
    ----------
    profiles_arr : np.ndarray
        radial profiles (or any other profiles, honestly) 2D np.ndarray
    output_prefix : str, optional
        output prefix for image lists, by default "clustered"
    output_lists : bool, optional
        whether to output lists as text files, by default False
    threshold : int, optional
        distance according to criterion, by default 25
    criterion : str, optional
        criterion for clustering, by default "maxclust"
    min_num_images : int, optional
        minimal number of images in a single cluster; smaller clusters go to the singletone list, by default 50

    Returns
    -------
    Union[dict, list]
        Either:
           - Dictionary {cluster_num:[*image_and_event_lines]} -- if output_lists == False
           - List [output_list_1.lst, output_list_2.lst, ...] -- if output_lists == True
    """
    profiles = np.array([elem[1] for elem in profiles_arr])
    names = np.array([elem[0] for elem in profiles_arr])

    # this actually does clustering
    Z = ward(pdist(profiles))
    idx = fcluster(Z, t=threshold, criterion=criterion)

    # output lists
    clusters = defaultdict(lambda: set())
    out_lists = set()
    for list_idx in tqdm(list(set(idx)), desc="Output lists"):
        belong_to_this_idx = np.where(idx == list_idx)[0]
        if len(belong_to_this_idx) < min_num_images:
            fout_name = f"{output_prefix}_singletone.lst"
            out_cluster_idx = -1
        else:
            fout_name = f"{output_prefix}_{list_idx}.lst"
            out_cluster_idx = list_idx
        out_lists.add(fout_name)
        try:
            os.remove(fout_name)
        except OSError:
            pass

        # print output lists if you want to
        for name in names[belong_to_this_idx]:
            clusters[out_cluster_idx].add(name)
        if output_lists:
            with open(fout_name, "a") as fout:
                print(*clusters[out_cluster_idx], sep="\n", file=fout)

    if output_lists:
        return list(out_lists)
    else:
        return clusters
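A usage sketch with synthetic input for the function above; each element of profiles_arr is assumed to be a (name, profile) pair, matching how the function unpacks elem[0] and elem[1], and the module-level imports it relies on (numpy, os, tqdm, defaultdict, ward, fcluster, pdist) are assumed to be in place:

import numpy as np

rng = np.random.RandomState(0)
profiles_arr = [("image_%04d" % i, rng.rand(100) + (i % 3)) for i in range(150)]
clusters = cluster_ndarray(profiles_arr, threshold=3, criterion="maxclust",
                           min_num_images=10, output_lists=False)
print(sorted(clusters.keys()))   # cluster ids; -1 would collect clusters smaller than min_num_images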
Code Example #18
def cluster(carrel, type):
    """Apply dimension reduction to <carrel> and visualize the result.
	
	This is useful for determining how holistic <carrel> is. A carrel with many clusters is less holistic and probably means the number of latent topics (think "subjects") is high. On the other hand, you may observe clusters falling into distinct groups surrounding authors, titles, or sources. In other words, use this subcommand to learn the degree to which <carrel> is a hodgepodge of unrelated items or a more cohesive collection.
	
	Example: rdr cluster homer
	
	See also: rdr tm --help"""

    # configure
    MAXIMUM = 0.95
    MINIMUM = 2
    STOPWORDS = 'english'
    EXTENSION = '.txt'

    # require
    from os import path, system, listdir
    from scipy.cluster.hierarchy import ward, dendrogram
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.manifold import MDS
    from sklearn.metrics.pairwise import cosine_similarity
    import matplotlib.pyplot as plt

    # sanity check
    checkForCarrel(carrel)

    # initialize
    localLibrary = configuration('localLibrary')
    directory = localLibrary / carrel / TXT
    filenames = [
        path.join(directory, filename) for filename in listdir(directory)
    ]
    vectorizer = TfidfVectorizer(input='filename',
                                 max_df=MAXIMUM,
                                 min_df=MINIMUM,
                                 stop_words=STOPWORDS)
    matrix = vectorizer.fit_transform(filenames).toarray()
    distance = 1 - cosine_similarity(matrix)
    keys = [
        path.basename(filename).replace(EXTENSION, '')
        for filename in filenames
    ]

    # branch according to type; dendrogram
    if type == 'dendrogram':
        linkage_matrix = ward(distance)
        dendrogram(linkage_matrix, orientation="right", labels=keys)
        plt.tight_layout()

    # cube
    elif type == 'cube':
        mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
        pos = mds.fit_transform(distance)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
        for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], keys):
            ax.text(x, y, z, s)

    # error
    else:
        click.echo(f"Error: Unknown value for TYPE: { type }")
        system('rdr cluster --help')

    # output
    plt.show()
Code Example #19
File: test_radial.py  Project: biocore/gneiss
    def test_basic_plot(self):
        self.maxDiff = None
        exp_edges = {'dest_node': ['0', '1', '2', 'y3'],
                     'edge_color': ['#00FF00', '#00FF00',
                                    '#00FF00', '#FF0000'],
                     'edge_width': [2, 2, 2, 2],
                     'src_node': ['y3', 'y4', 'y3', 'y4'],
                     'x0': [338.2612593838583,
                            193.1688862557773,
                            338.2612593838583,
                            193.1688862557773],
                     'x1': [487.5, 12.499999999999972,
                            324.89684138234867, 338.2612593838583],
                     'y0': [271.7282256126416,
                            365.95231443706376,
                            271.7282256126416,
                            365.95231443706376],
                     'y1': [347.7691620070637,
                            483.2800610261029,
                            16.719938973897143,
                            271.7282256126416]}

        exp_nodes = {'child0': [np.nan, np.nan, np.nan, '0', '1'],
                     'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
                     'color': ['#1C9099', '#1C9099', '#1C9099',
                               '#FF999F', '#FF999F'],
                     'hover_var': [None, None, None, None, None],
                     'is_tip': [True, True, True, False, False],
                     'node_size': [10, 10, 10, 10, 10],
                     'x': [487.5,
                           12.499999999999972,
                           324.89684138234867,
                           338.26125938385832,
                           193.16888625577729],
                     'y': [347.7691620070637,
                           483.28006102610289,
                           16.719938973897143,
                           271.72822561264161,
                           365.95231443706376]}
        np.random.seed(0)
        num_otus = 3  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
        lm = ward(dm.condensed_form())
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(np.str))
        t = UnrootedDendrogram.from_tree(t)
        # incorporate colors in tree
        for i, n in enumerate(t.postorder(include_self=True)):
            if not n.is_tip():
                n.name = "y%d" % i
                n.color = '#FF999F'
                n.edge_color = '#FF0000'
                n.node_size = 10
            else:
                n.color = '#1C9099'
                n.edge_color = '#00FF00'
                n.node_size = 10
            n.length = np.random.rand()*3
            n.edge_width = 2
        p = radialplot(t, node_color='color', edge_color='edge_color',
                       node_size='node_size', edge_width='edge_width')

        for e in exp_edges.keys():
            self.assertListEqual(
                list(p.renderers[0].data_source.data[e]),
                exp_edges[e])

        for e in exp_nodes.keys():
            self.assertListEqual(
                list(p.renderers[1].data_source.data[e]),
                exp_nodes[e])

        self.assertTrue(isinstance(t, TreeNode))
Code Example #20
# compute distance matrix
distance_matrix = manhattan_distances(activities_binary_matrix)
print(distance_matrix.shape)

activity_names = [
    'Shopping', 'Antiquing', 'Site Seeing', 'Fine Dining', 'Casual Dining',
    'Family Style Dining', 'Fast Food Dining', 'Museums', 'Indoor Pool',
    'Outdoor Pool', 'Hiking', 'Gambling', 'Boating/Swimming', 'Fishing',
    'Golfing', 'Boat Tours', 'Ride the Ducks', 'Amusement Park', 'Minigolf',
    'Go-carting', 'Waterpark', 'Circus World', 'Tommy Bartlett Ski Show',
    'Helicopter Rides', 'Horseback Riding', 'Stand Rock',
    'Outdoor Attractions', 'Nearby Attractions', 'Movie Theater',
    'Concert Theater', 'Bar/Pub Dancing', 'Shop Broadway', 'Bungee Jumping'
]

linkage_matrix = ward(distance_matrix)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=activity_names)

plt.tick_params(\
    axis = 'x',          # changes apply to the x-axis
    which = 'both',      # both major and minor ticks are affected
    bottom = 'off',      # ticks along the bottom edge are off
    top = 'off',         # ticks along the top edge are off
    labelbottom = 'off')

plt.tight_layout()  # show plot with tight layout

# route figure to external file
plt.savefig('plot_hierarchical_clustering_solution.png', dpi=200)
Code Example #21
# -*- coding:utf-8 -*-
import pickle
import numpy as np
from scipy.cluster.hierarchy import ward, dendrogram
from matplotlib import pyplot as plt

with open('countries_vectors.pickle', 'rb') as f:
    xs = pickle.load(f)
countries = []
with open('countries2.txt', 'r') as f:
    for c in f:
        countries.append(c.strip())
X = np.array(xs)
cluster = ward(X)
print(cluster)
dendrogram(cluster, labels=countries)
plt.show()
Code Example #22
#add x, y labels to each point
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)

plt.show()  # display the plot

#to save the figure, remove the # from the line below
#plt.savefig('clusters_small_noaxes.png', dpi=200)

plt.close()
# Looks reasonable to the eye: items on related topics mostly sit near each other. Now let's also try hierarchical clustering.
#########################
# hierarchical clustering of the texts
from scipy.cluster.hierarchy import ward, dendrogram

linkage_matrix = ward(dist)  # define the linkage_matrix using Ward clustering on the precomputed distances

fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

plt.tick_params( \
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout()  # show plot with tight layout

# to save the figure, remove the # from the line below
# plt.savefig('ward_clusters.png', dpi=200)
Code Example #23
"""
Perform hierarchical clustering with Ward's method on the 96 word vectors, and visualise the clustering result as a dendrogram.
"""

import pickle
from collections import OrderedDict
from scipy import io
import numpy as np
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, dendrogram
from matplotlib import pyplot as plt

f_in_dict = "dict_countries"
f_in_matrix = "matrix_counties"

with open(f_in_dict, "rb") as f:
    dict_index_t = pickle.load(f)

matrix_x_300 = io.loadmat(f_in_matrix)['matrix_x_300']

ward = ward(matrix_x_300)
print(ward)

dendrogram(ward, labels=list(dict_index_t.keys()), leaf_font_size=8)
plt.show()
Code Example #24
centroids = kmeans.cluster_centers_
print(centroids)
predict_label = kmeans.predict(train_x)
for i in range(k):
    plt.scatter(centroids[i][2], centroids[i][3], c=color[i], marker='X', s=60)
print(data)

for (_data, _label) in zip(train_x, predict_label):
    plt.scatter(_data[2], _data[3], color=color[_label],alpha=0.3)
plt.show()
"""

#merge the clustering result back into the original dataset
result = pd.concat((data, pd.DataFrame(kmeans.labels_)), axis=1)
#name the new column
result.rename({0: u'聚类'}, axis=1, inplace=True)
#print(result)
result.to_csv("car_cluster_result.csv", index=True)
"""
#hierarchical clustering modelling approach
from sklearn.cluster import KMeans, AgglomerativeClustering
model = AgglomerativeClustering(linkage='ward', n_clusters=3)
y = model.fit_predict(train_x)
print(y)
"""
#hierarchical clustering can be used to visualise the KMeans result; the two usually agree
#print(train_x)
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
Code Example #25
def cluster_dendogram(corpus,
                      titles=None,
                      stemming=True,
                      max_df=0.95,
                      min_df=2,
                      ngram=(1, 3),
                      cleaning=simple_textcleaning,
                      vectorizer='bow',
                      stop_words=STOPWORDS,
                      random_samples=0.3,
                      figsize=(17, 9),
                      **kwargs):
    """
    Plot a hierarchical dendrogram grouping similar texts.

    Parameters
    ----------

    corpus: list
    titles: list
        list of titles; length must be the same as the corpus.
    stemming: bool, (default=True)
        If True, sastrawi_stemmer will be applied.
    max_df: float, (default=0.95)
        maximum document frequency for a word to be selected.
    min_df: int, (default=2)
        minimum document frequency for a word to be selected.
    ngram: tuple, (default=(1,3))
        n-gram range used to train on the corpus.
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    stop_words: list, (default=STOPWORDS)
        list of stop words to remove.
    vectorizer: str, (default='bow')
        vectorizer technique. Allowed values:

        * ``'bow'`` - Bag of Word.
        * ``'tfidf'`` - Term frequency inverse Document Frequency.
        * ``'skip-gram'`` - Bag of Word with skipping certain n-grams.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """
    if not isinstance(corpus, list):
        raise ValueError('corpus must be a list')
    if not isinstance(corpus[0], str):
        raise ValueError('corpus must be list of strings')
    if not isinstance(titles, list) and titles is not None:
        raise ValueError('titles must be a list or None')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')
    if not isinstance(vectorizer, str):
        raise ValueError('vectorizer must be a string')
    if not isinstance(stemming, bool):
        raise ValueError('stemming must be a boolean')
    vectorizer = vectorizer.lower()
    if not vectorizer in ['tfidf', 'bow', 'skip-gram']:
        raise ValueError(
            "vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")
    if not isinstance(ngram, tuple):
        raise ValueError('ngram must be a tuple')
    if not len(ngram) == 2:
        raise ValueError('ngram size must equal to 2')
    if not isinstance(min_df, int):
        raise ValueError('min_df must be an integer')
    if not isinstance(max_df, float):
        raise ValueError('max_df must be a float')
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if vectorizer == 'tfidf':
        Vectorizer = TfidfVectorizer
    elif vectorizer == 'bow':
        Vectorizer = CountVectorizer
    elif vectorizer == 'skip-gram':
        Vectorizer = SkipGramVectorizer
    else:
        raise ValueError(
            "vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        sns.set()
    except:
        raise Exception(
            'matplotlib and seaborn not installed. Please install it and try again.'
        )

    tf_vectorizer = Vectorizer(ngram_range=ngram,
                               min_df=min_df,
                               max_df=max_df,
                               stop_words=stop_words,
                               **kwargs)
    corpus = random.sample(corpus, k=int(random_samples * len(corpus)))
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = sastrawi(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(' '.join(
            [word for word in text.split() if word not in stop_words]))
    tf_vectorizer.fit(text_clean)
    transformed_text_clean = tf_vectorizer.transform(text_clean)
    features = tf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):
            indices = np.argsort(
                np.array(transformed_text_clean[i].todense())[0])[::-1]
            titles.append(' '.join([features[i] for i in indices[:ngram[1]]]))
    plt.figure(figsize=figsize)
    ax = dendrogram(linkage_matrix, orientation='right', labels=titles)
    plt.tick_params(
        axis='x',
        which='both',
        bottom='off',
        top='off',
        labelbottom='off',
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
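A usage sketch for cluster_dendogram with a small hypothetical corpus; it assumes the module-level helpers the function already relies on (STOPWORDS, simple_textcleaning, the vectorizer classes, random) are importable, and random_samples=1.0 keeps every document:

corpus = [
    'kerajaan umum bajet baharu untuk sektor pendidikan',
    'pasukan bola sepak negara menang perlawanan akhir',
    'harga minyak dunia jatuh selepas mesyuarat',
] * 10
result = cluster_dendogram(corpus, vectorizer='tfidf', stemming=False,
                           random_samples=1.0, min_df=1, max_df=0.95)
print(result['linkage_matrix'].shape)   # (len(corpus) - 1, 4)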
Code Example #26
File: clustering.py  Project: armijoalb/IN
            clusterFrame.groupby('cluster').cluster.transform(len) > min_size]
    else:
        X_filtrado = clusterFrame

    makeScatterPlot(data=clusterFrame,
                    outputName="./imagenes/scatterMatrix_caso3_" +
                    algorithm_name,
                    displayOutput=False)

    makeHeatmap(data=X_filtrado,
                outputName="./imagenes/heatmap_caso3_" + algorithm_name,
                displayOutput=False)

    if algorithm_name == 'AC':
        X_filtrado_normal = preprocessing.normalize(X_filtrado, norm='l2')
        linkage_array = ward(X_filtrado_normal)

        dendrogram(linkage_array, leaf_rotation=90., leaf_font_size=5.)
        plt.show()
        #plt.clf()

    results['N Clusters'] = n_clusters
    results['HC metric'] = met[0]
    results['SC metric'] = met[1]
    results['Time'] = timeAlg

    outputData[algorithm_name] = results

latexCaso1 = createLatexDataFrame(data=outputData)
f = open('caso3.txt', 'w')
f.write(latexCaso1.to_latex())
Code Example #27
plt.show()

mglearn.plots.plot_kmeans_faces(km, pca, pca_x, x_people, y_people,
                                people.target_names)

##    Agglomerative Clustering
from scipy.cluster.hierarchy import dendrogram, ward

agglomerative = AgglomerativeClustering(n_clusters=40)
labels_agg = agglomerative.fit_predict(pca_x)
print("Cluster size with agglomerative clustering: {}"\
    .format(np.bincount(labels_agg)))
print("ARI btw KMeans and Agglomerative {:.2f}"\
    .format(adjusted_rand_score(labels_agg, labels_km))) # low commonality

linkage_array = ward(pca_x)
plt.figure(figsize=(20, 5))
dendrogram(linkage_array, p=7, truncate_mode='level', no_labels=True)
plt.xlabel("Simple index")
plt.ylabel("Cluster distance")
plt.show()

for cluster in [10, 13, 19, 22, 36]:
    mask = labels_agg == cluster
    cluster_size = np.sum(mask)

    fig, axes = plt.subplots(1,
                             15,
                             subplot_kw={
                                 'xticks': (),
                                 'yticks': ()
Code Example #28
File: main.py  Project: Mejay013/ML_ezam
plt.ylabel("pc2")
plt.axis('equal')
plt.show()

df_new = pd.DataFrame(pca.get_covariance())
print(df_new)
plt.matshow(pca.components_, cmap='twilight')
plt.colorbar()
plt.gca().xaxis.tick_bottom()
plt.xticks(range(len(df.columns)), df.iloc[:, :].columns, rotation=90)
plt.yticks(range(len(df2.columns)), df2.iloc[:, :].columns)
plt.title("Main features")
i, k = plt.ylim()
plt.ylim(i + 0.5, k - 0.5)
plt.show()

from scipy.cluster.hierarchy import dendrogram, ward
linkage_array = ward(df2)
plt.figure(figsize=(20, 10))
dendrogram(linkage_array, truncate_mode='level', no_labels=True, p=10)
plt.title("Dendrogram")
plt.show()

import SimpSOM as sps

net = sps.somNet(20, 20, df2.values, PBC=True)
net.train(0.01, 10000)
net.save('filename_weights')
# net.nodes_graph(colnum=0)
# net.diff_graph()
net.cluster(df2.values, type='qthresh')
Code Example #29
## Distance metrics
dist = euclidean_distances(dtm)
cosdist = 1 - cosine_similarity(dtm)

## 2D Visualization
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=5193)
pos = mds.fit_transform(cosdist)
xs, ys = pos[:, 0], pos[:, 1]
for x, y, name in zip(xs, ys, names):
    plt.scatter(x, y)
    plt.text(x, y, name)
plt.title("Document Distances 2D Cartesian")
plt.show()

## 3D Visualization
mds = MDS(n_components=3, dissimilarity="precomputed", random_state=5193)
pos = mds.fit_transform(cosdist)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], names):
    ax.text(x, y, z, s)
plt.title("Document Distances 3D Cartesian")
plt.show()

## Hierarchical Clustering Visualization
linkage_matrix = ward(cosdist)
dendrogram(linkage_matrix, orientation="right", labels=names)
plt.tight_layout()
plt.title("Document Distances Hierarchical Clustering")
plt.show()
Code Example #30
def magic_cluster(input_matrix,
                  output_path,
                  out_name,
                  num_clusters=5,
                  num_terms_in_cluster=20,
                  cluster_seed=3425,
                  source=''):
    #todo: add logic to dynamically select optimal number of clusters
    #note: num_terms_in_cluster is only the number of terms to be REPORTED, the actual
    #   number of terms is the full collection of all terms

    #convert our term tf_idf into a proper matrix (get rid of extraneous columns, and transpose)
    tfidf_matrix = input_matrix.copy()
    #were we given the terms matrix to cluster, or the concept matrix?
    if 'concept' in tfidf_matrix.columns:
        tfidf_matrix.drop(
            ['t_count', 'concept', 'd_count', 'tf', 'idf', 'tf_idf', 'weight'],
            axis=1,
            inplace=True)
    else:
        tfidf_matrix.drop(
            ['t_count', 'd_count', 'tf', 'idf', 'tf_idf', 'weight'],
            axis=1,
            inplace=True)
    tfidf_matrix.fillna(0, inplace=True)
    tfidf_matrix = tfidf_matrix.transpose()

    #Calculate cosine similarity of all terms to each other...
    dist = 1 - cosine_similarity(tfidf_matrix)

    #determine k-means clustering
    #km = KMeans(n_clusters=num_clusters)
    km = KMeans(n_clusters=num_clusters,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=0,
                random_state=cluster_seed)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    vocab_frame = pd.DataFrame(tfidf_matrix.columns)

    tfidf_matrix['cluster'] = clusters
    #tfidf_matrix['cluster'].value_counts() #how many DSI belong to each cluster?

    #what are the top terms from each of the clusters?

    cluster_terms = {}
    cluster_names = {}
    seed_name = "B"
    if cluster_seed == 3425: seed_name = "A"
    text_file = open(
        output_path + source + '_' + out_name + '_clusters.' + seed_name +
        '.txt', "w")
    #print("Top terms per cluster:\n")
    text_file.write("Top terms per cluster:\n")
    #sort cluster centers by proximity to centroid
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for i in range(num_clusters):
        terms_in_cluster = ''
        dsi_in_cluster = ''
        #print("Cluster %d words:" % i)
        text_file.write("Cluster %d words:\n" % i)
        for ind in order_centroids[i, :num_terms_in_cluster]:
            terms_in_cluster = terms_in_cluster + vocab_frame.iloc[ind][
                0] + ", "
        #print(terms_in_cluster[:-2]) #don't print the trailing ', '
        text_file.write(terms_in_cluster[:-2])  #don't print the trailing ', '
        cluster_terms[i] = terms_in_cluster[:-2]
        cluster_names[i] = cluster_terms[i].split(
            ', ')[:4]  #only use the first 4 terms to "name" the cluster
        #print()
        text_file.write('\n\n')
        for dsi in tfidf_matrix[tfidf_matrix['cluster'] == i].index:
            dsi_in_cluster = dsi_in_cluster + dsi + ", "
        #print("DSI in cluster %d:" % i)
        text_file.write("DSI in cluster %d:\n" % i)
        #print(dsi_in_cluster[:-2]) #don't print the trailing ', '
        text_file.write(dsi_in_cluster[:-2])  #don't print the trailing ', '
        #print('\n\n')
        text_file.write('\n\n')
    text_file.close()
    del i, terms_in_cluster, dsi_in_cluster, ind, order_centroids, text_file

    MDS()
    # use two components since we're plotting points on a two-dimensional plane
    #   "precomputed" because we provide a distance matrix
    #   we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]

    #un-comment below to manually set up colors per clusters using a dict
    #   also, find "cluster_colors" below, and un-comment that line to enable it
    #cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}
    #un-comment the below section if you want to manually name the clusters
    #   Be certain the dictionary is the same length as your declared number of clusters
    # cluster_names = {0: 'Immigration and border control',
    #                 1: 'American governmental policy',
    #                 2: 'Russian interference',
    #                 3: 'International trade',
    #                 4: 'Tax reform'}

    #create data frame that has the result of the MDS plus the cluster numbers and titles
    mappedDF = pd.DataFrame(
        dict(x=xs, y=ys, label=clusters, title=list(tfidf_matrix.index)))
    #group by cluster
    groups = mappedDF.groupby('label')
    # set up plot
    fig, ax = plt.subplots(figsize=(16, 12))  # set size
    ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
    #iterate through groups to layer the plot
    for name, group in groups:
        ax.plot(
            group.x,
            group.y,
            marker='o',
            linestyle='',
            ms=12,
            label=cluster_names[name],
            #color=cluster_colors[name],
            mec='none')
        ax.set_aspect('auto')
        ax.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(\
            axis= 'y',         # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelleft='off')
    lgd = ax.legend(numpoints=1,
                    bbox_to_anchor=(.5, -.3),
                    loc=8,
                    borderaxespad=0.)
    #lgd = ax.legend(numpoints=1,  loc=0)
    #add label in x,y position with the label as the DSI#
    for i in range(len(mappedDF)):
        ax.text(mappedDF.loc[i]['x'],
                mappedDF.loc[i]['y'],
                mappedDF.loc[i]['title'],
                size=8)
    plt.title('DSI K-Means cluster assignment: ' + out_name)
    plt.margins(0.05, 0.1)
    #plt.show() #show the plot
    plt.savefig(output_path + source + out_name + '_kmeans.' + seed_name +
                '.png',
                bbox_extra_artists=(lgd, ),
                bbox_inches='tight',
                dpi=200)
    plt.close(
        'all'
    )  #even though we don't show the plot, you need to explicitly close to free the memory

    #the 2D map is done, now prepare a dendrogram to visualize how clustering splits
    linkage_matrix = ward(
        dist
    )  #define the linkage_matrix using ward clustering pre-computed distances
    fig, ax = plt.subplots(figsize=(15, 30))  # set size
    ax = dendrogram(linkage_matrix,
                    orientation="right",
                    labels=list(tfidf_matrix.index))

    plt.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    plt.yticks(fontsize=14)
    plt.title('DSI Ward clustering dendrogram: ' + out_name)
    #plt.show()
    #plt.savefig(output_path + out_name + '_dendrogram' + seed_name + '.png', bbox_inches='tight', dpi=72)
    #Dendrogram is generated with ward distances on pre-computed values, it does not change based on KMeans seed
    #   Therefore, don't write out an "A", and "B" version of the dendrogram
    #   Todo: update dendrogram to better reflect the KMeans 2D map
    #   Recommend you start here: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
    plt.savefig(output_path + source + '_' + out_name + '_dendrogram.png',
                bbox_inches='tight',
                dpi=200)
    plt.close('all')  #clear plot from memory

    return tfidf_matrix
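A minimal, standalone version (synthetic data) of the MDS step the comments in the function above describe: dissimilarity="precomputed" because dist is already a distance matrix, and random_state fixes the layout so the plot is reproducible.

import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

tfidf = np.random.RandomState(0).rand(12, 40)       # stand-in for the real tf-idf matrix
dist = 1 - cosine_similarity(tfidf)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)                        # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
print(pos.shape)                                     # (12, 2)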
Code Example #31
File: hierarchy.py  Project: imfht/flaskapps
count_vec = CountVectorizer(min_df=3)
xx1 = count_vec.fit_transform(list1).toarray()
word = count_vec.get_feature_names()
print("word feature length: {}".format(len(word)))
print(word)
print(xx1.shape)
print(xx1[0])
titles = word

#------------------------------ Step 4: similarity computation ------------------------------
df = pd.DataFrame(xx1)
print(df.corr())
print(df.corr('spearman'))
print(df.corr('kendall'))

dist = df.corr()
print(dist)
print(dist.shape)

#------------------------------ Step 5: visualisation and analysis ------------------------------
# define the linkage_matrix using ward clustering pre-computed distances
linkage_matrix = ward(dist)
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

# show plot with tight layout
plt.tight_layout()

# save figure as ward_clusters
plt.savefig('Tree_word.png', dpi=200)
Code Example #32
ward = AgglomerativeClustering(n_clusters=3, linkage="ward")

n_samples = np.logspace(0.5, 3, 9)
n_features = np.logspace(1, 3.5, 7)
N_samples, N_features = np.meshgrid(n_samples, n_features)
scikits_time = np.zeros(N_samples.shape)
scipy_time = np.zeros(N_samples.shape)

for i, n in enumerate(n_samples):
    for j, p in enumerate(n_features):
        X = np.random.normal(size=(n, p))
        t0 = time.time()
        ward.fit(X)
        scikits_time[j, i] = time.time() - t0
        t0 = time.time()
        hierarchy.ward(X)
        scipy_time[j, i] = time.time() - t0

ratio = scikits_time / scipy_time

plt.figure("scikit-learn Ward's method benchmark results")
plt.imshow(np.log(ratio), aspect="auto", origin="lower")
plt.colorbar()
plt.contour(ratio, levels=[1], colors="k")
plt.yticks(range(len(n_features)), n_features.astype(np.int))
plt.ylabel("N features")
plt.xticks(range(len(n_samples)), n_samples.astype(np.int))
plt.xlabel("N samples")
plt.title("Scikit's time, in units of scipy time (log)")
plt.show()
Code Example #33
    print("{:6.2f} segundos, ".format(tiempo),end='')
    if (k[name]>1):
        metric_CH[name] = metrics.calinski_harabaz_score(X_normal, cluster_predict[name])
        metric_SC[name] = metrics.silhouette_score(X_normal, cluster_predict[name], metric='euclidean', sample_size=floor(0.1*len(X)), random_state=123456)

    print("CH index: {:9.3f}, ".format(metric_CH[name]),end='')
    print("SC: {:.5f}".format(metric_SC[name]))
    
    clusters = pd.DataFrame(cluster_predict[name],index=X.index,columns=['cluster'])
    X_cluster = pd.concat([X,clusters],axis=1)
    min_size = 5
    X_filtrado = X_cluster[X_cluster.groupby('cluster').cluster.transform(len) > min_size]

    makeScatterPlot(X_filtrado)
    makeHeatmap(X_filtrado)
    
    
clusters = pd.DataFrame(cluster_predict['Ward'],index=X.index,columns=['cluster'])
X_cluster = pd.concat([X,clusters],axis=1)

min_size = 5
X_filtrado = X_cluster[X_cluster.groupby('cluster').cluster.transform(len) > min_size]
k_filtrado = len(set(X_filtrado['cluster']))

X_filtrado = X_filtrado.drop('cluster',1)
X_filtrado_normal = X_filtrado.apply(norm_to_zero_one)

linkage_array = hierarchy.ward(X_filtrado_normal)
h_dict = hierarchy.dendrogram(linkage_array,orientation='left')

sns.clustermap(X_filtrado_normal, method='ward', col_cluster=False, figsize=(15,7), cmap='YlGnBu', yticklabels=False)
Code Example #34
ax.legend(numpoints=1)  # show only one point per legend entry

# add the movie title as a label at each point's (x, y) coordinates
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)

plt.show()  # display the plot

# uncomment the line below to save the figure
#plt.savefig('clusters_small_noaxes.png', dpi=200)

plt.close()

## hierarchical clustering
from scipy.cluster.hierarchy import ward, dendrogram
linkage_matrix = ward(dist)  # cluster the precomputed distances; the result is the linkage_matrix
fig, ax = plt.subplots(figsize=(15, 20))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
plt.tick_params(
    axis='x',  # changes apply to the x-axis
    which='both',  # both major and minor ticks are affected
    bottom='off',  # turn off ticks along the bottom edge
    top='off',  # turn off ticks along the top edge
    labelbottom='off')

plt.tight_layout()  # show the plot with a tight layout
# uncomment the line below to save the figure
#plt.savefig('ward_clusters.png', dpi=200)  # save the figure as ward_clusters

plt.close()
Code Example #35
from sklearn.datasets import make_blobs
X, y = make_blobs(random_state=1)

agg = AgglomerativeClustering(n_clusters=3)
assignment = agg.fit_predict(X)

mglearn.discrete_scatter(X[:, 0], X[:, 1], assignment)
import matplotlib.pyplot as plt
plt.legend(["cluster 0", "cluster 1", "cluster 2"])

mglearn.plots.plot_agglomerative()

#dendrogram
from scipy.cluster.hierarchy import dendrogram, ward
X, y = make_blobs(random_state=0, n_samples=12)
linkage_array = ward(X)
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
ax.plot(bounds, [7.25, 7.25], "--", c="k")
ax.plot(bounds, [4, 4], "--", c="k")

ax.text(bounds[1], 7.25, "two cluster", va="center", fontdict={"size": 15})
ax.text(bounds[1], 4, "three cluster", va="center", fontdict={"size": 15})
plt.xlabel("sample num")
plt.ylabel("cluster distance")

##DBSCAN
#random data
from sklearn.cluster import DBSCAN
Code Example #36
File: dendroToNewick.py  Project: bbrule/Lexos
def cluster():
    fileManager = managers.utility.loadFileManager()
    leq = '≤'.decode('utf-8')

    if request.method == "GET":
        # "GET" request occurs when the page is first loaded.
        if 'analyoption' not in session:
            session['analyoption'] = constants.DEFAULT_ANALIZE_OPTIONS
        if 'hierarchyoption' not in session:
            session['hierarchyoption'] = constants.DEFAULT_HIERARCHICAL_OPTIONS
        labels = fileManager.getActiveLabels()
        thresholdOps = {}
        return render_template('cluster.html', labels=labels, thresholdOps=thresholdOps)

    if 'getdendro' in request.form:
        labelDict = fileManager.getActiveLabels()
        labels = []
        for ind, label in labelDict.items():
            labels.append(label)
        # Apply re-tokenisation and filters to DTM 
        #countMatrix = fileManager.getMatrix(ARGUMENTS OMITTED)

        # Get options from request.form
        orientation = str(request.form['orientation'])
        title = request.form['title']
        pruning = request.form['pruning']
        pruning = int(request.form['pruning']) if pruning else 0
        linkage = str(request.form['linkage'])
        metric = str(request.form['metric'])

        # Get active files
        allContents = []  # list of strings-of-text for each segment
        tempLabels = []  # list of labels for each segment
        for lFile in fileManager.files.values():
            if lFile.active:
                contentElement = lFile.loadContents()
                allContents.append(contentElement)

                if request.form["file_" + str(lFile.id)] == lFile.label:
                    tempLabels.append(lFile.label.encode("utf-8"))
                else:
                    newLabel = request.form["file_" + str(lFile.id)].encode("utf-8")
                    tempLabels.append(newLabel)

        # More options
        ngramSize = int(request.form['tokenSize'])
        useWordTokens = request.form['tokenType'] == 'word'
        try:
            useFreq = request.form['normalizeType'] == 'freq'

            useTfidf = request.form['normalizeType'] == 'tfidf'  # if use TF/IDF
            normOption = "N/A"  # only applicable when using "TF/IDF", set default value to N/A
            if useTfidf:
                if request.form['norm'] == 'l1':
                    normOption = u'l1'
                elif request.form['norm'] == 'l2':
                    normOption = u'l2'
                else:
                    normOption = None
        except:
            useFreq = useTfidf = False
            normOption = None

        onlyCharGramsWithinWords = False
        if not useWordTokens:  # if using character-grams
            # this option is disabled on the GUI, because countVectorizer count front and end markers as ' ' if this is true
            onlyCharGramsWithinWords = 'inWordsOnly' in request.form

        greyWord = 'greyword' in request.form
        MostFrequenWord = 'mfwcheckbox' in request.form
        Culling = 'cullcheckbox' in request.form

        showDeletedWord = False
        if 'greyword' in request.form or 'mfwcheckbox' in request.form or 'cullcheckbox' in request.form:
            if 'onlygreyword' in request.form:
                showDeletedWord = True

        if useWordTokens:
            tokenType = u'word'
        else:
            tokenType = u'char'
            if onlyCharGramsWithinWords:
                tokenType = u'char_wb'

        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        vectorizer = CountVectorizer(input=u'content', encoding=u'utf-8', min_df=1,
                                      analyzer=tokenType, token_pattern=ur'(?u)\b[\w\']+\b',
                                      ngram_range=(ngramSize, ngramSize),
                                      stop_words=[], dtype=float, max_df=1.0)

        # make a (sparse) Document-Term-Matrix (DTM) to hold all counts
        DocTermSparseMatrix = vectorizer.fit_transform(allContents)
        dtm = DocTermSparseMatrix.toarray()

        from sklearn.metrics.pairwise import euclidean_distances
        from scipy.cluster.hierarchy import ward

        import matplotlib.pyplot as plt
        from scipy.cluster.hierarchy import average, weighted, ward, single, complete, dendrogram
        from scipy.cluster import hierarchy
        from scipy.spatial.distance import pdist

        if orientation == "left":
            orientation = "right"
        if orientation == "top":
            LEAF_ROTATION_DEGREE = 90
        else:
            LEAF_ROTATION_DEGREE = 0

        if linkage == "ward":
            dist = euclidean_distances(dtm)
            np.round(dist, 1)
            linkage_matrix = ward(dist)
            dendrogram(linkage_matrix, orientation=orientation, leaf_rotation=LEAF_ROTATION_DEGREE, labels=labels)
            Z = linkage_matrix
        else:
            Y = pdist(dtm, metric)
            Z = hierarchy.linkage(Y, method=linkage)
            dendrogram(Z, orientation=orientation, leaf_rotation=LEAF_ROTATION_DEGREE, labels=labels)

        plt.tight_layout()  # fixes margins

        ## Conversion to Newick/ETE
        # Stuff we need
        from scipy.cluster.hierarchy import average, linkage, to_tree
        #from hcluster import linkage, to_tree
        from ete2 import Tree, TreeStyle, NodeStyle

        # Change it to a distance matrix
        T = to_tree(Z)

        # ete2 section
        root = Tree()
        root.dist = 0
        root.name = "root"
        item2node = {T: root}

        to_visit = [T]
        while to_visit:
            node = to_visit.pop()
            cl_dist = node.dist /2.0
            for ch_node in [node.left, node.right]:
                if ch_node:
                    ch = Tree()
                    ch.dist = cl_dist
                    ch.name = str(ch_node.id)
                    item2node[node].add_child(ch)
                    item2node[ch_node] = ch
                    to_visit.append(ch_node)

        # This is the ETE tree structure
        tree = root
        ts = TreeStyle()
        ts.show_leaf_name = True
        ts.show_branch_length = True
        ts.show_scale = False
        ts.scale =  None
        if orientation == "top":
            ts.rotation = 90
            ts.branch_vertical_margin = 10 # 10 pixels between adjacent branches

        # Draws nodes as small red spheres of diameter equal to 10 pixels
        nstyle = NodeStyle()
        nstyle["size"] = 0

        # Replace the node labels
        for leaf in tree:
            k = leaf.name
            k = int(k)
            leaf.name = labels[k]

        # Apply node styles to nodes
        for n in tree.traverse():
           n.set_style(nstyle)

        # Convert the ETE tree to Newick
        newick = tree.write()
        f = open('C:\\Users\\Scott\\Documents\\GitHub\\d3-dendro\\newickStr.txt', 'w')
        f.write(newick)
        f.close()

        # Save the image as .png...
        from os import path, makedirs

        # Using ETE
        folder = pathjoin(session_manager.session_folder(), constants.RESULTS_FOLDER)
        if (not os.path.isdir(folder)):
            makedirs(folder)

        # saves dendrogram as a .png with pyplot
        plt.savefig(path.join(folder, constants.DENDROGRAM_PNG_FILENAME))
        plt.close()
        # if orientation == "top":
        #     plt.figure(figsize=(20,80))
        # else:
        #     plt.figure(figsize=(80,20))

        pdfPageNumber, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold = utility.generateDendrogram(
            fileManager)
        session['dengenerated'] = True
        labels = fileManager.getActiveLabels()

        inconsistentOp = "0 " + leq + " t " + leq + " " + str(inconsistentMax)
        maxclustOp = "2 " + leq + " t " + leq + " " + str(maxclustMax)
        distanceOp = str(distanceMin) + " " + leq + " t " + leq + " " + str(distanceMax)
        monocritOp = str(monocritMin) + " " + leq + " t " + leq + " " + str(monocritMax)

        thresholdOps = {"inconsistent": inconsistentOp, "maxclust": maxclustOp, "distance": distanceOp,
                        "monocrit": monocritOp}

        managers.utility.saveFileManager(fileManager)
        session_manager.cacheAnalysisOption()
        session_manager.cacheHierarchyOption()
        import random
        ver = random.random() * 100
        return render_template('cluster.html', labels=labels, pdfPageNumber=pdfPageNumber, score=score,
                               inconsistentMax=inconsistentMax, maxclustMax=maxclustMax, distanceMax=distanceMax,
                               distanceMin=distanceMin, monocritMax=monocritMax, monocritMin=monocritMin,
                               threshold=threshold, thresholdOps=thresholdOps, ver=ver)
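The Newick conversion above walks scipy's to_tree() output by hand and routes it through ete2. A compact sketch of the same idea without the ete2 dependency (an illustration, not the Lexos code): the ClusterNode objects returned by to_tree() expose .left, .right, .dist, .id and .is_leaf(), which is all a recursive Newick writer needs.

import numpy as np
from scipy.cluster.hierarchy import ward, to_tree

def linkage_to_newick(Z, labels):
    tree = to_tree(Z)
    def walk(node, parent_dist):
        branch = parent_dist - node.dist        # branch length back to the parent merge
        if node.is_leaf():
            return f"{labels[node.id]}:{branch:.3f}"
        left = walk(node.left, node.dist)
        right = walk(node.right, node.dist)
        return f"({left},{right}):{branch:.3f}"
    return walk(tree, tree.dist) + ";"

X = np.random.RandomState(0).rand(5, 3)
print(linkage_to_newick(ward(X), labels=list("ABCDE")))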
Code Example #37
def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix, default=None
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_clusters : int, default=None
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    return_distance : bool, default=False
        If True, return the distance between the clusters.

    Returns
    -------
    children : ndarray of shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_connected_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : ndarray of shape (n_nodes,) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.

    distances : ndarray of shape (n_nodes-1,)
        Only returned if return_distance is set to True (for compatibility).
        The distances between the centers of the nodes. `distances[i]`
        corresponds to a weighted euclidean distance between
        the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to
        leaves of the tree, then `distances[i]` is their unweighted euclidean
        distance. Distances are updated in the following way
        (from scipy.hierarchy.linkage):

        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

           d(u,v) = \\sqrt{\\frac{|v|+|s|}
                               {T}d(v,s)^2
                        + \\frac{|v|+|t|}
                               {T}d(v,t)^2
                        - \\frac{|v|}
                               {T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)
        X = np.require(X, requirements="W")
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        else:
            return children_, 1, n_samples, None

    connectivity, n_connected_components = _fix_connectivity(
                                                X, connectivity,
                                                affinity='euclidean')
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError('Cannot provide more clusters than samples. '
                             '%i n_clusters was asked, and there are %i '
                             'samples.' % (n_clusters, n_samples))
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp, order='C')
    coord_col = np.array(coord_col, dtype=np.intp, order='C')

    # build moments as a list
    moments_1 = np.zeros(n_nodes, order='C')
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order='C')
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64, order='C')
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
                                    inertia)
    inertia = list(zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []
    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    not_visited = np.empty(n_nodes, dtype=np.int8, order='C')

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False
        if return_distance:  # store inertia value
            distances[k - n_samples] = inert

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[col].append(k) for col in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp, order='C')
        coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float64, order='C')

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)

        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
            for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    # sort children to get consistent output with unstructured version
    children = [c[::-1] for c in children]
    children = np.array(children)  # return numpy array for efficient caching

    if return_distance:
        # 2 is scaling factor to compare w/ unstructured version
        distances = np.sqrt(2. * distances)
        return children, n_connected_components, n_leaves, parent, distances
    else:
        return children, n_connected_components, n_leaves, parent
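
A minimal usage sketch for the function above, assuming a scikit-learn environment where `ward_tree` and `kneighbors_graph` are importable; the data shape and neighbor count are arbitrary:

import numpy as np
from sklearn.cluster import ward_tree
from sklearn.neighbors import kneighbors_graph

X = np.random.RandomState(0).rand(20, 3)

# Unstructured: with connectivity=None the call falls back to scipy's hierarchy.ward
children, n_cc, n_leaves, parents = ward_tree(X)

# Structured: a sparse k-NN graph restricts which samples are allowed to merge
conn = kneighbors_graph(X, n_neighbors=5, include_self=False)
children_s, n_cc_s, n_leaves_s, parents_s = ward_tree(X, connectivity=conn)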
コード例 #38
0
ファイル: text_sim.py プロジェクト: nevmenandr/Zhukovsky
def main():
    fwr = codecs.open('lemmed_cluster_no_stopwords.txt', 'w', 'utf-8')
    lst = os.listdir(pth)
    titles = []
    contents = []
    for fl in lst:
        titles.append(fl.replace('.txt', ''))
        f = codecs.open(pth + fl, 'r', 'utf-8')
        cont = f.read()
        f.close()
        contents.append(cont)
    
    totalvocab_tokenized = []
    for i in contents:
        allwords = tokenize_only(i)
        totalvocab_tokenized.extend(allwords)
        
    vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_tokenized)
    fwr.write('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame\n')
    
    #vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
    
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, use_idf=True, tokenizer=tokenize_only, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(contents)
    
    for an in tfidf_matrix.shape:
        fwr.write(str(an) + '\t')
    fwr.write('\n')
    #fwr.write('\t'.join(tfidf_matrix.shape))
    terms = tfidf_vectorizer.get_feature_names()
    dist = 1 - cosine_similarity(tfidf_matrix)
    num_clusters = 2
    km = KMeans(n_clusters=num_clusters)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    
    joblib.dump(km, 'songs_cluster.pkl')
        
    #km = joblib.load('songs_cluster.pkl')
    #clusters = km.labels_.tolist()    


    texts = { 'title': titles, 'content': contents, 'cluster': clusters }
    frame = pd.DataFrame(texts, index=[clusters], columns=['title', 'cluster'])
    fwr.write(str(frame['cluster'].value_counts()))
    
    fwr.write(u'Top terms per cluster:\n')
    order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
    
    
    
    for i in range(num_clusters):
        fwr.write("\nCluster %d words:" % i)
        for ind in order_centroids[i, :20]:
            try:
                #fwr.write(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore') + ', ')
                fwr.write(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0] + ', ')
            #print u' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore')
            except:
                pass
            #pass
        fwr.write("\nCluster %d titles:" % i)
        for title in frame.ix[i]['title'].values.tolist():
            fwr.write(' %s,' % title)
            
    MDS()
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
    xs, ys = pos[:, 0], pos[:, 1]
    
    #Visualizing document clusters
    
    #cluster_colors = {0: '#0000A0', 1: '#FF0000'} # #000000, #C0C0C0
    cluster_colors = {0: '#000000', 1: '#C0C0C0'}
    cluster_names = {0: u'Meter anomaly', 
                 1: u'Regular meter'}
    
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 
    groups = df.groupby('label')
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.margins(0.2)
    
    for name, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
                label=cluster_names[name], color=cluster_colors[name], 
                mec='none')
        ax.set_aspect('auto')
        ax.tick_params(
            axis= 'x',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            bottom='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelbottom='off')
        ax.tick_params(
            axis= 'y',         # changes apply to the y-axis
            which='both',      # both major and minor ticks are affected
            left='off',      # ticks along the bottom edge are off
            top='off',         # ticks along the top edge are off
            labelleft='off')
        
    ax.legend(numpoints=1)  #show legend with only 1 point
    for i in range(len(df)):
        ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=25) 
    
    #plt.show()
    pylab.savefig('forms_cluster.png')
    
    # dendro
    
    linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances

    fig, ax = plt.subplots(figsize=(12, 10))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=titles)
    
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    
    plt.tight_layout() #show plot with tight layout
    
    #uncomment below to save figure
    plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

    fwr.close()
    return 0
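
One caveat about the `ward(dist)` call above: scipy's `ward` accepts either a 2-D observation matrix or a 1-D condensed distance vector, so a square cosine-distance matrix is treated as rows of observations rather than as pairwise distances. A minimal sketch of the condensed-distance variant, assuming the same `dist` and `titles` as in the function above:

from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform

# Convert the symmetric square matrix to condensed form; checks=False tolerates
# small numerical asymmetries and a not-exactly-zero diagonal.
condensed_dist = squareform(dist, checks=False)
linkage_matrix = ward(condensed_dist)
dendrogram(linkage_matrix, orientation="right", labels=titles)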
コード例 #39
0
ファイル: knock98.py プロジェクト: tmu-nlp/100knock2017
from scipy.cluster.hierarchy import dendrogram, ward, leaves_list
import numpy as np
import matplotlib.pyplot as plt
import pickle

if __name__ == '__main__':
    #country_name = dict()
    country_name = list()
    with open('country_vec.dump', 'rb') as feat_f:
        country_dict = pickle.load(feat_f)
    for id_num, item in enumerate(country_dict.items()):
        #country_name[item[0]] = id_num
        country_name.append(item[0])
        if id_num == 0:
            country_mat = item[1]
            continue
        country_mat = np.vstack((country_mat, item[1]))
    h_cls = ward(country_mat)
    dendrogram(h_cls, labels=country_name)
    plt.show()
コード例 #40
0
ファイル: 9.0.baselines.py プロジェクト: liusida/ds2_arxiv
# ShowCase II: tsp is good for recovering images:
elements = cv2.imread("shared/demo/zimo.jpg", flags=cv2.IMREAD_COLOR)
elements = elements[:, :, 0]
rng = default_rng(seed=1)
i = rng.permutation(np.arange(elements.shape[0]))
# j = rng.permutation(np.arange(elements.shape[0]))
elements = elements[i]
elements = elements[:, i]

elements = np.load("shared/author_similarity_matrix.npy")
print(elements.shape)

save_pic(elements, "randomized")

for i in range(1):
    Z = hierarchy.ward(elements)
    indices = hierarchy.leaves_list(
        hierarchy.optimal_leaf_ordering(Z, elements))

    # permute rows, transpose, permute rows again: this applies the leaf
    # ordering to both rows and columns of the symmetric matrix
    elements = elements[indices].T
    elements = elements[indices].T

    save_pic(elements, f"processed_{i}_olo")

    pca = PCA(n_components=1)
    pca.fit(elements)
    print(pca.components_.shape)
    indices = np.argsort(pca.components_.flatten())

    # apply the PCA-based ordering to both axes in the same way
    elements = elements[indices].T
    elements = elements[indices].T
コード例 #41
0
ファイル: analyze.py プロジェクト: kidaak/ineffable
		return summs

	def similarity(data):
		from sklearn.metrics.pairwise import cosine_similarity
		sims = cosine_similarity(data)
		return sims




if True:
	reduced, v = reduce_data(bdata)
	simplified = summ_subs(reduced)
	similar = similarity(simplified)
	from scipy.cluster.hierarchy import ward
	clusters = ward(similar)
	subnames = sorted(substance_count.keys())
	subcounts = [substance_count[key] for key in subnames]

if True:
	tree = jsontree(clusters,2*clusters.shape[0],subnames,subcounts,1,np.nan)
	#tree = jsontree(clusters,2*clusters.shape[0],subnames,subcounts,np.nan,1000)
	with open(path+"gh-pages/tagtree.json","wb") as j:
		import json
		json.dump(tree,j)

if True:
	reduced, v = reduce_data(ldata)
	similar = similarity(reduced.T)
	from scipy.cluster.hierarchy import ward
	clusters = ward(similar)
コード例 #42
0
def getcountry(file_name):
    with open(file_name, 'r') as ff:
        country_l = []
        for ii, line in enumerate(ff):
            if (ii % 2) == 1:
                if ' ' in line.strip():
                    line2 = line.replace(' ', '_', 100)
                    country_l.append(line2.strip())
                else:
                    country_l.append(line.strip())
    return country_l
    #print(country)


if __name__ == "__main__":

    #country_l = getcountry('../chapter09/countries2.tsv')

    index_file = 'country_idx'
    ntx_file = 'country_MTX'

    MT_X = Get_MT_X(ntx_file)
    t_i = get_t_i(index_file)

    #ward = cl.AgglomerativeClustering(linkage='ward').fit_predict(MT_X)
    linkage_matrix = ward(MT_X)
    print(linkage_matrix)

    dendrogram(linkage_matrix, labels=list(t_i.keys()), leaf_font_size=8)
    plt.show()
コード例 #43
0
ファイル: n98.py プロジェクト: chantera/nlp100
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
98. Clustering with Ward's method
Apply hierarchical clustering with Ward's method to the word vectors from task 96, and visualize the clustering result as a dendrogram.
"""

from n90 import load_model
from n96 import get_country_vector
import numpy as np
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt
import sys


vector = get_country_vector(load_model(sys.argv[1]))
dendrogram(ward(np.array(list(vector.values()))), labels=list(vector.keys()))
plt.show()
コード例 #44
0
#!/usr/bin/env python3

import sys
from scipy.cluster.hierarchy import ward, dendrogram, linkage, leaves_list
import numpy as np
from matplotlib import pyplot as plt
from scipy.spatial.distance import pdist

data = []

for line in open(sys.argv[1]):
	fields = line.rstrip("\r\n").split()
	gene = fields[0]
	# convert to floats so pdist() receives numeric values
	cfu = float(fields[1])
	poly = float(fields[2])
	data.append([cfu, poly])
#print(data)

z = ward(pdist(data))
y = leaves_list(z)
fig = plt.figure(figsize=(20, 10))
dn = dendrogram(z)
plt.tight_layout()
plt.ylabel("")        
plt.xlabel("")   
#plt.set_title("")
fig.savefig("dendro.png")
plt.close(fig)
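
If flat cluster labels are wanted in addition to the dendrogram, the linkage matrix built above can be cut with `fcluster`; a small sketch (the choice of 3 clusters is an arbitrary assumption):

from scipy.cluster.hierarchy import fcluster

# Cut the Ward tree into a fixed number of flat clusters
labels = fcluster(z, t=3, criterion="maxclust")
print(labels)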
コード例 #45
0
# compute distance matrix
distance_matrix = manhattan_distances(activities_binary_matrix)
print(distance_matrix.shape)
                                                 
activity_names = ['Shopping', 'Antiquing',     
'Site Seeing', 'Fine Dining', 'Casual Dining', 
'Family Style Dining', 'Fast Food Dining', 'Museums',       
'Indoor Pool', 'Outdoor Pool', 'Hiking', 'Gambling', 
'Boating/Swimming', 'Fishing', 'Golfing', 'Boat Tours', 
'Ride the Ducks', 'Amusement Park', 'Minigolf', 'Go-carting',     
'Waterpark', 'Circus World', 'Tommy Bartlett Ski Show', 
'Helicopter Rides', 'Horseback Riding', 'Stand Rock',     
'Outdoor Attractions', 'Nearby Attractions', 
'Movie Theater', 'Concert Theater', 'Bar/Pub Dancing',
'Shop Broadway', 'Bungee Jumping']

linkage_matrix = ward(distance_matrix) 
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=activity_names)

plt.tick_params(\
    axis = 'x',          # changes apply to the x-axis
    which = 'both',      # both major and minor ticks are affected
    bottom = 'off',      # ticks along the bottom edge are off
    top = 'off',         # ticks along the top edge are off
    labelbottom = 'off')

plt.tight_layout()  # show plot with tight layout

# route figure to external file
plt.savefig('plot_hierarchical_clustering_solution.png', dpi = 200) 
コード例 #46
0
def ward_tree(X, connectivity=None, n_components=None, copy=True,
              n_clusters=None):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix  representing n_samples samples to be clustered

    connectivity : sparse matrix.
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        if n_clusters is not None:
            warnings.warn('Early stopping is implemented only for '
                          'structured Ward clustering (i.e. with '
                          'explicit connectivity).', stacklevel=2)
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.int)
        return children_, 1, n_samples, None

    # Compute the number of nodes
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    elif not sparse.isspmatrix(connectivity):
        connectivity = sparse.lil_matrix(connectivity)
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the "
                      "connectivity matrix is %d > 1. Completing it to avoid "
                      "stopping the tree early." % n_components)
        connectivity = _fix_connectivity(X, connectivity, n_components, labels)
        n_components = 1

    if n_clusters is None:
        n_nodes = 2 * n_samples - n_components
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if (connectivity.shape[0] != n_samples
            or connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.int)
    coord_col = np.array(coord_col, dtype=np.int)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float)
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
                                    inertia)
    inertia = list(six.moves.zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.int)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    not_visited = np.empty(n_nodes, dtype=np.int8)

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[l].append(k) for l in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.int)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float)

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)
        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
            for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves, parent
コード例 #47
0
ファイル: cluster.py プロジェクト: houyushan/python--
def ward_hierarchical_clustering(feature_matrix):
    cosine_distance = 1 - cosine_similarity(feature_matrix)
    linkage_matrix = ward(cosine_distance)
    return linkage_matrix
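
A brief usage sketch for this helper, assuming its module already imports `cosine_similarity` and `ward` as the body implies; the toy documents and labels are placeholders:

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

docs = ["ward links clusters by variance",
        "clusters merge to minimise variance",
        "a completely unrelated sentence"]
tfidf_matrix = TfidfVectorizer().fit_transform(docs)

linkage_matrix = ward_hierarchical_clustering(tfidf_matrix)
dendrogram(linkage_matrix, labels=["doc1", "doc2", "doc3"], orientation="right")
plt.tight_layout()
plt.show()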
コード例 #48
0
'''
fig,axes = plt.subplots(2,5,subplot_kw={'xticks':(),'yticks':()},figsize=(12,4))
for center,ax in zip(km.cluster_centers_,axes.ravel()):
    ax.imshow(pca.inverse_transform(center).reshape(image_shape),vmin=0,vmax=1)

plt.show()
'''

agglomerative = AgglomerativeClustering(n_clusters=40)
labels_agg = agglomerative.fit_predict(X_pca)
print("Cluster sizes agglomerative clustering:{}".format(
    np.bincount(labels_agg)))

print("ARI:{:.2f}".format(adjusted_rand_score(labels_agg, labels_km)))

linkage_array = ward(X_pca)
plt.figure(figsize=(20, 5))
dendrogram(linkage_array, p=7, truncate_mode='level', no_labels=True)
plt.xlabel("Sample index")
plt.ylabel("Cluster distance")
'''
for cluster in range(max(labels)+1):
    mask = labels == cluster
    n_images = np.sum(mask)
    fig,axes = plt.subplots(1,n_images,figsize=(n_images*1.5,4),subplot_kw={'xticks':(),'yticks':()})
    for image,label,ax in zip(X_people[mask],y_people[mask],axes):
        ax.imshow(image.reshape(image_shape), vmin=0, vmax=1)
        ax.set_title(people.target_names[label].split()[-1])
'''

n_clusters = 40
コード例 #49
0
ファイル: hierarchical.py プロジェクト: 93sam/scikit-learn
def ward_tree(X, connectivity=None, n_components=None, copy=None,
              n_clusters=None):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix  representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.
    """
    if copy is not None:
        warnings.warn("The copy argument is deprecated and will be removed "
                      "in 0.16. The connectivity is now always copied.",
                      DeprecationWarning)

    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        from scipy.cluster import hierarchy     # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)
        return children_, 1, n_samples, None

    connectivity = _fix_connectivity(X, connectivity,
                                     n_components=n_components)
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError('Cannot provide more clusters than samples. '
                '%i n_clusters was asked, and there are %i samples.'
                % (n_clusters, n_samples))
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp, order='C')
    coord_col = np.array(coord_col, dtype=np.intp, order='C')

    # build moments as a list
    moments_1 = np.zeros(n_nodes, order='C')
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order='C')
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float, order='C')
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
                                    inertia)
    inertia = list(six.moves.zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    not_visited = np.empty(n_nodes, dtype=np.int8, order='C')

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[l].append(k) for l in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp, order='C')
        coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float, order='C')

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)

        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
            for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves, parent
コード例 #50
0
def ejecutarPrograma(opcion):
    '''
    Read the input files into a list
    '''
    path = '/Users/ulysesrico/data'
    documents = []
    titles = []
    dirs = os.listdir(path)
    for doc in dirs:
        if doc.endswith('.txt'):
            titles.append(doc)
            f = open(os.path.join(path, doc), 'r')
            words = f.read()
            documents.append(words)
            f.close()

    # Generate the stopword list (Spanish)
    sw = stopwords.words('spanish')

    # Create the vectors without stopwords and build the tf-idf matrix
    tfidf_vectorizer = TfidfVectorizer(stop_words=sw)
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

    # Build the vocabulary
    diccionario = tfidf_vectorizer.get_feature_names()

    print
    print 'Corroborar tamaño de la matriz Documentos vs Términos'
    print tfidf_matrix.shape
    print

    print
    print 'Obteniendo similitud de coseno entre 2 documentos (si son iguales el valor es 1)'
    cosine = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[99:100])
    print cosine
    print 'Cálculo de distancia'
    dist = 1 - cosine
    print dist
    print
    print 'Ángulo de separación de los documentos (grados)'
    angle_in_radians = math.acos(cosine)
    print math.degrees(angle_in_radians)
    print
    print 'Área de gráficos'
    print
    dist = 1 - cosine_similarity(tfidf_matrix)
    np.round(dist, 2)
    if opcion == 1:
        print 'Inicio'
        print 'Impresión de similitud de documentos por método de coseno'
        r = 1
        d = 2 * r * (1 - cosine)
        circle1 = plt.Circle((0, 0), r, alpha=.5)
        circle2 = plt.Circle((d, 0), r, alpha=.5)
        ## set axis limits
        plt.ylim([-1.1, 1.1])
        plt.xlim([-1.1, 1.1 + d])
        fig = plt.gcf()
        fig.gca().add_artist(circle1)
        fig.gca().add_artist(circle2)
        print 'Fin'
    elif opcion == 2:
        print 'Inicio'
        print 'Clustering de distancia entre documntos'
        mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
        pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
        xs, ys = pos[:, 0], pos[:, 1]
        names = [os.path.basename(fn).replace('.txt', '') for fn in titles]
        # color-blind-friendly palette
        for x, y, name in zip(xs, ys, names):
            color = 'orange' if "d1" in name else 'blue'
            plt.scatter(x, y, c=color)
            plt.text(x, y, name)
        plt.show()
        print 'Fin'
    elif opcion == 3:
        print 'Inicio'
        print 'Clustering de documentos en 3D'
        mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
        pos = mds.fit_transform(dist)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
        for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], titles):
            ax.text(x, y, z, s)
        plt.show()
        print 'Fin'
    else:
        print 'Similitud entre documentos (Dibujar distancia entre ellos)'
        print 'Inicio'
        linkage_matrix = ward(dist)
        dendrogram(linkage_matrix, orientation="right", labels=titles)
        plt.tight_layout()
        plt.show()
        print 'Fin'
コード例 #51
0
#samples = [2,3,4,5,6,7,8,9,10,11,13]
samples = open("FDC.csv", "r").read().split(",")
samples = [ float(value) for value in samples]


"""
diff_samples = numpy.array(original_samples + [0])-numpy.array([0] + original_samples)
diff_samples = list(diff_samples)
difff_samples = numpy.array((diff_samples + [0])) - numpy.array([0] + diff_samples)
"""


# Run hierarchical clustering
tsamples = numpy.array([samples]).transpose()
distance = distance_matrix(tsamples, tsamples)
hc = ward(distance)

#print(hc)
dendrogram(hc)

def find_majority( array, index):
    if index < 5:
        start = 0
    else:
        start = index - 5
    
    end = index + 6
    datas = array[start:end]
    
    counter = Counter(datas)
    [(majority, count)] = counter.most_common(1)
コード例 #52
0
plt.show()
# plots each center, the 5 closest faces to center,
# and the 5 farthest in each cluster
# As expected faces closer to the smoothed faces are facing
# similar directions and have similar facial expressions
# Faces that are far from center may have different orientations,
# headwear, or facial expressions

agglom = AgglomerativeClustering(n_clusters=10)
labels_agg = agglom.fit_predict(X_pca)
print 'Cluster sizes: {}'.format(np.bincount(labels_agg))
# Like kMeans, it creates relatively similarly sized clusters
# print 'ARI: {:.2f}'.format(adjusted_rand_score(labels_agg, labels_km))
# They seem to be rather uncorrelated (0.09)

linkage_arr = ward(X_pca)
plt.figure(figsize=(20, 5))
dendrogram(linkage_arr, p=7, truncate_mode='level', no_labels=True)
plt.xlabel('Sample index')
plt.ylabel('Cluster distance')
plt.show()
# The plot shows branches vary in length
# There does not seem to be a good cutoff for
# classifying the data

for cluster in range(10):
    mask = labels_agg == cluster
    fig, axes = plt.subplots(1,
                             10,
                             subplot_kw={
                                 'xticks': (),
コード例 #53
0
def ward_tree(X, connectivity=None, n_components=None, copy=True):
    """Ward clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        feature matrix  representing n_samples samples to be clustered

    connectivity : sparse matrix.
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    copy : bool (optional)
        Make a copy of connectivity or work inplace. If connectivity
        is not of LIL type there will be a copy in any case.

    Returns
    -------
    children : list of pairs. Length of n_nodes
               List of the children of each node.
               Leaves of the tree have an empty list of children.

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree
    """
    X = np.asarray(X)
    n_samples, n_features = X.shape
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))

    if connectivity is None:
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.int)
        return children_, 1, n_samples

    # Compute the number of nodes
    if n_components is None:
        n_components, labels = cs_graph_components(connectivity)

    # Convert connectivity matrix to LIL with a copy if needed
    if sparse.isspmatrix_lil(connectivity) and copy:
        connectivity = connectivity.copy()
    else:
        connectivity = connectivity.tolil()

    if n_components > 1:
        warnings.warn("the number of connected components of the"
        " connectivity matrix is %d > 1. Completing it to avoid"
        " stopping the tree early."
        % n_components)
        connectivity = _fix_connectivity(X, connectivity,
                                            n_components, labels)
        n_components = 1

    n_nodes = 2 * n_samples - n_components

    if (connectivity.shape[0] != n_samples or
        connectivity.shape[1] != n_samples):
        raise ValueError('Wrong shape for connectivity matrix: %s '
                         'when X is %s' % (connectivity.shape, X.shape))

    # Remove diagonal from connectivity matrix
    connectivity.setdiag(np.zeros(connectivity.shape[0]))

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.int)
    coord_col = np.array(coord_col, dtype=np.int)

    # build moments as a list
    moments_1 = np.zeros(n_nodes)
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features))
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float)
    _hierarchical.compute_ward_dist(moments_1, moments_2,
                             coord_row, coord_col, inertia)
    inertia = zip(inertia, coord_row, coord_col)
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.int)
    heights = np.zeros(n_nodes)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []

    visited = np.empty(n_nodes, dtype=bool)

    # recursive merge loop
    for k in xrange(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j], heights[k] = k, k, inert
        children.append([i, j])
        used_node[i] = used_node[j] = False

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        visited[:] = False
        visited[k] = True
        for l in set(A[i]).union(A[j]):
            l = _hierarchical._get_parent(l, parent)
            if not visited[l]:
                visited[l] = True
                coord_col.append(l)
                A[l].append(k)
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.int)
        coord_row = np.empty_like(coord_col)
        coord_row.fill(k)
        ini = np.empty(len(coord_row), dtype=np.float)

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                   coord_row, coord_col, ini)
        for tupl in itertools.izip(ini, coord_row, coord_col):
            heappush(inertia, tupl)

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    children = np.array(children)  # return numpy array for efficient caching

    return children, n_components, n_leaves
コード例 #54
0
ファイル: vb_summary.py プロジェクト: DouglasPatton/vbflow
    def missingVals(self):
        n = self.X_nan_bool_df.shape[0]
        if np.sum(self.X_nan_bool_df.to_numpy().ravel()) == 0:
            print(f'no missing values found')
            return

        nan_01 = self.X_nan_bool_df.to_numpy().astype(np.int16)
        feature_names = self.X_nan_bool_df.columns.to_list()
        feature_idx = np.arange(len(feature_names))
        #nan_bool_stack=self.X_nan_bool_df.reset_index(drop=True,inplace=False).to_numpy().astype(np.uint8)

        plt.rcParams['font.size'] = '8'
        fig, (ax0, ax1, ax2, ax3) = plt.subplots(4,
                                                 1,
                                                 figsize=(12, 16),
                                                 dpi=200)
        feat_miss_count_ser = self.X_nan_bool_df.astype(np.int16).sum(axis=0)
        feat_miss_count_ser.plot.bar(ax=ax0, )
        ax0.set_title('Missing Data Counts by Feature')
        pct_missing_list = [
            f'{round(pct)}%'
            for pct in (100 * feat_miss_count_ser / n).tolist()
        ]
        self.addAnnotations(ax0, pct_missing_list)

        row_miss_count_ser = self.X_nan_bool_df.astype(
            np.int16).sum(axis=1)
        ax1.bar(np.arange(n), row_miss_count_ser.to_numpy(), width=1)
        ax1.set_title('Missing Data Counts by Row')

        nan_01_sum = nan_01.sum(axis=0)
        has_nan_features = nan_01_sum > 0
        nan_01_hasnan = nan_01[:, has_nan_features]
        hasnan_features = [
            name for i, name in enumerate(feature_names) if has_nan_features[i]
        ]
        nan_corr = self.pearsonCorrelationMatrix(nan_01_hasnan)
        nan_corr_df = pd.DataFrame(data=nan_corr, columns=hasnan_features)
        self.nan_corr = nan_corr
        self.nan_corr_df = nan_corr_df
        corr_linkage = hierarchy.ward(nan_corr)
        dendro = hierarchy.dendrogram(  #just used for ordering the features by the grouping
            corr_linkage,
            labels=hasnan_features,
            ax=None,
            no_plot=True,
            leaf_rotation=90)

        ax2.imshow(nan_01, aspect='auto', interpolation='none', cmap='plasma')
        colors = [plt.get_cmap('plasma')(value) for value in [255]]
        labels = ['missing data']
        patches = [Patch(color=colors[i], label=labels[i]) for i in [0]]
        ax2.legend(handles=patches,
                   bbox_to_anchor=(0, 1.1),
                   loc=9,
                   ncol=2,
                   fontsize='large')
        ax2.set_xticks(feature_idx)
        ax2.set_xticklabels(feature_names, rotation='vertical', fontsize=6)
        ax2.set_title('Missing Data Layout')

        cp = ax3.imshow(nan_corr[dendro['leaves'], :][:, dendro['leaves']],
                        aspect='equal',
                        interpolation='none')
        fig.colorbar(cp, shrink=0.5)
        hasnan_feature_idx = np.arange(len(hasnan_features))
        ax3.set_yticks(hasnan_feature_idx)
        ax3.set_xticks(hasnan_feature_idx)
        ax3.set_xticklabels(dendro['ivl'], rotation='vertical', fontsize=6)
        ax3.set_yticklabels(dendro['ivl'], fontsize=6)
        ax3.set_title('Missing Data Clustering Across Features')
        fig.tight_layout()
コード例 #55
0
def ward_tree(X, connectivity=None, n_components=None, n_clusters=None,
              return_distance=False):
    """Ward clustering based on a Feature matrix.

    Recursively merges the pair of clusters that minimally increases
    within-cluster variance.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Read more in the :ref:`User Guide <hierarchical_clustering>`.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix  representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e., the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.
        NOTE: This parameter is now directly determined
        from the connectivity matrix and will be removed in 0.18

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    return_distance : bool (optional)
        If True, return the distance between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes-1, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified, elsewhere 'None' is returned.

    distances : 1D array, shape (n_nodes-1, )
        Only returned if return_distance is set to True (for compatibility).
        The distances between the centers of the nodes. `distances[i]`
        corresponds to a weighted euclidean distance between
        the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to
        leaves of the tree, then `distances[i]` is their unweighted euclidean
        distance. Distances are updated in the following way
        (from scipy.hierarchy.linkage):

        The new entry :math:`d(u,v)` is computed as follows,

        .. math::

           d(u,v) = \\sqrt{\\frac{|v|+|s|}
                               {T}d(v,s)^2
                        + \\frac{|v|+|t|}
                               {T}d(v,t)^2
                        - \\frac{|v|}
                               {T}d(s,t)^2}

        where :math:`u` is the newly joined cluster consisting of
        clusters :math:`s` and :math:`t`, :math:`v` is an unused
        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
        :math:`|*|` is the cardinality of its argument. This is also
        known as the incremental algorithm.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    if connectivity is None:
        from scipy.cluster import hierarchy     # imports PIL

        if n_clusters is not None:
            warnings.warn('Partial build of the tree is implemented '
                          'only for structured clustering (i.e. with '
                          'explicit connectivity). The algorithm '
                          'will build the full tree and only '
                          'retain the lower branches required '
                          'for the specified number of clusters',
                          stacklevel=2)
        out = hierarchy.ward(X)
        children_ = out[:, :2].astype(np.intp)

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        else:
            return children_, 1, n_samples, None

    if n_components is not None:
        warnings.warn(
            "n_components is now directly calculated from the connectivity "
            "matrix and will be removed in 0.18",
            DeprecationWarning)
    connectivity, n_components = _fix_connectivity(X, connectivity)
    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        if n_clusters > n_samples:
            raise ValueError('Cannot provide more clusters than samples. '
                             '%i n_clusters was asked, and there are %i samples.'
                             % (n_clusters, n_samples))
        n_nodes = 2 * n_samples - n_clusters

    # create inertia matrix
    coord_row = []
    coord_col = []
    A = []
    for ind, row in enumerate(connectivity.rows):
        A.append(row)
        # We keep only the upper triangular for the moments
        # Generator expressions are faster than arrays on the following
        row = [i for i in row if i < ind]
        coord_row.extend(len(row) * [ind, ])
        coord_col.extend(row)

    coord_row = np.array(coord_row, dtype=np.intp, order='C')
    coord_col = np.array(coord_col, dtype=np.intp, order='C')

    # build moments as a list
    moments_1 = np.zeros(n_nodes, order='C')
    moments_1[:n_samples] = 1
    moments_2 = np.zeros((n_nodes, n_features), order='C')
    moments_2[:n_samples] = X
    inertia = np.empty(len(coord_row), dtype=np.float64, order='C')
    _hierarchical.compute_ward_dist(moments_1, moments_2, coord_row, coord_col,
                                    inertia)
    inertia = list(six.moves.zip(inertia, coord_row, coord_col))
    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=bool)
    children = []
    if return_distance:
        distances = np.empty(n_nodes - n_samples)

    not_visited = np.empty(n_nodes, dtype=np.int8, order='C')

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            inert, i, j = heappop(inertia)
            if used_node[i] and used_node[j]:
                break
        parent[i], parent[j] = k, k
        children.append((i, j))
        used_node[i] = used_node[j] = False
        if return_distance:  # store inertia value
            distances[k - n_samples] = inert

        # update the moments
        moments_1[k] = moments_1[i] + moments_1[j]
        moments_2[k] = moments_2[i] + moments_2[j]

        # update the structure matrix A and the inertia matrix
        coord_col = []
        not_visited.fill(1)
        not_visited[k] = 0
        _hierarchical._get_parents(A[i], coord_col, parent, not_visited)
        _hierarchical._get_parents(A[j], coord_col, parent, not_visited)
        # List comprehension is faster than a for loop
        [A[l].append(k) for l in coord_col]
        A.append(coord_col)
        coord_col = np.array(coord_col, dtype=np.intp, order='C')
        coord_row = np.empty(coord_col.shape, dtype=np.intp, order='C')
        coord_row.fill(k)
        n_additions = len(coord_row)
        ini = np.empty(n_additions, dtype=np.float64, order='C')

        _hierarchical.compute_ward_dist(moments_1, moments_2,
                                        coord_row, coord_col, ini)

        # List comprehension is faster than a for loop
        [heappush(inertia, (ini[idx], k, coord_col[idx]))
            for idx in range(n_additions)]

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples
    # sort children to get consistent output with unstructured version
    children = [c[::-1] for c in children]
    children = np.array(children)  # return numpy array for efficient caching

    if return_distance:
        # 2 is scaling factor to compare w/ unstructured version
        distances = np.sqrt(2. * distances)
        return children, n_components, n_leaves, parent, distances
    else:
        return children, n_components, n_leaves, parent
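
Because this variant can also return the merge distances, its outputs can be reassembled into a scipy-style linkage matrix for plotting. A sketch under the assumption that `ward_tree` is importable from `sklearn.cluster`; the sample-count column has to be rebuilt by hand because the function does not return it:

import numpy as np
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import ward_tree

X = np.random.RandomState(0).rand(15, 4)
children, n_cc, n_leaves, parents, distances = ward_tree(X, return_distance=True)

# Number of original samples under each merged node (4th column of a linkage matrix)
counts = np.zeros(children.shape[0])
for i, merge in enumerate(children):
    count = 0
    for child in merge:
        count += 1 if child < n_leaves else counts[child - n_leaves]
    counts[i] = count

linkage_matrix = np.column_stack([children, distances, counts]).astype(float)
dendrogram(linkage_matrix)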
コード例 #56
0
    if not kwargs.get('no_plot', False):
        for i, d in zip(ddata['icoord'], ddata['dcoord']):
            x = 0.5 * sum(i[1:3])
            y = d[1]
            plt.plot(x, y, 'ro')
            plt.annotate("%.3g" % y, (x, y),
                         xytext=(0, -8),
                         textcoords='offset points',
                         va='top',
                         ha='center')

    return ddata


#Ward's method
wx = ward(wi)
print('------------------------------------------')
print('Macierz odległości: ')
print('------------------------------------------')
print(wi)
print('------------------------------------------')
print('Macierz po wykonaniu klastrowania: ')
print('------------------------------------------')
print(wx)
z = hierarchy.linkage(wx, 'ward')
ddata = augmented_dendrogram(z, color_threshold=5, truncate_mode='lastp')
hierarchy.dendrogram(z,
                     leaf_rotation=90,
                     leaf_font_size=8,
                     labels=data.index,
                     color_threshold=10)
コード例 #57
0
def repub_debate():
	if len(sys.argv) < 2:
		print ("Run: python repub_debate.py < Input csv> ")
		sys.exit(1)

	data = pd.read_csv(sys.argv[1])
	print(data)

	#data = data [~data.Speaker.isin(['MALE','SANTELLI','(UNKNOWN)','UNIDENTIFIED MALE','HARMAN', 'HARWOOD','CRAMER','EPPERSON','QUICK','QUINTANILLA'])]
	#Filter list for 4th republican debate
	data = data [~data.Speaker.isin(['MALE','BAKER','(UNKNOWN)','UNIDENTIFIED MALE','CAVUTO', 'BARTIROMO'])]
	
	print (('Unique Speakers: ', sorted(list(data.Speaker.unique()))))
	#Count the number of words each speaker spoke
	def countWords(speaker):
		speakerData = data[data.Speaker == speaker]
		allText = ""
		for index, row in speakerData.iterrows():
			allText += str(row['Text'])+" "

		words_all = len(allText.split())
		print('Total words:', speaker, ':', words_all)
		
	for name in data.Speaker.unique():
		countWords(name);
		
	def generatewordcloud(speaker, inputImageFileName, outputImageFileName):

		speakerData = data[data.Speaker == speaker]
		allText = ""
		for index, row in speakerData.iterrows():
			allText += str(row['Text'])+" "
	
		#print (allText)
		ImageFile.LOAD_TRUNCATED_IMAGES = True

		img = Image.open(inputImageFileName)
		img = img.resize((980,1080), Image.ANTIALIAS)

		speakerArray = np.array(img)
		sl = STOPWORDS | stopwordshearing
		
		wc = WordCloud(background_color="white", max_words=500, mask=speakerArray, stopwords=sl)
		wc.generate(allText)
		# create coloring from image
		image_colors = ImageColorGenerator(speakerArray)
		wc.recolor(color_func=image_colors)
		wc.to_file(outputImageFileName)
#Commenting out generating word clouds as I am testing something else now
#	 generatewordcloud('KASICH', "images/kasich.png", "images/wc_kasich.png");
#	 generatewordcloud("HUCKABEE", "images/huckabee.png", "images/wc_huckabee.png");
#	 generatewordcloud("BUSH", "images/bush.png", "images/wc_bush.png");
#	 generatewordcloud("RUBIO", "images/rubio.png", "images/wc_rubio.png");
#	 generatewordcloud("TRUMP", "images/trump.png", "images/wc_trump.png");
#	 generatewordcloud("CARSON", "images/carson.png", "images/wc_carson.png");
#	 generatewordcloud("FIORINA", "images/fiorina.png", "images/wc_fiorina.png");
#	 generatewordcloud("CRUZ", "images/cruz.png", "images/wc_cruz.png");
#	 generatewordcloud("CHRISTIE", "images/christie.png", "images/wc_christie.png");
#	 generatewordcloud("PAUL", "images/paul.png", "images/wc_paul.png");
	def generateoverallwordcloud(inputImageFileName, outputImageFileName):

		allText = ""
		for index, row in data.iterrows():
			allText += str(row['Text'])+" "
	
		#print (allText)
		ImageFile.LOAD_TRUNCATED_IMAGES = True

		img = Image.open(inputImageFileName)
		img = img.resize((980,1080), Image.ANTIALIAS)

		speakerArray = np.array(img)
		sl = STOPWORDS | stopwordshearing
		
		wc = WordCloud(background_color="white", max_words=500, mask=speakerArray, stopwords=sl)
		wc.generate(allText)
		# create coloring from image
		image_colors = ImageColorGenerator(speakerArray)
		wc.recolor(color_func=image_colors)
		wc.to_file(outputImageFileName)
	#generateoverallwordcloud("images/RepublicanLogo.png", "images/wc_rep_debate3.png");
	
	#Count the number of words by each party member
	def getWords(speaker):
		global stopwordshearing
		speakerData = data[data.Speaker == speaker]
		allText = ""
		for index, row in speakerData.iterrows():
			#s.translate(table, string.punctuation)
			allText += str(row['Text']).lower().translate(table)+" "
		allText = allText.replace("e-mail","email")
		allText = allText.replace("e- mail","email")
		allText = allText.replace("op-ed","oped")
		sl = STOPWORDS | stopwordshearing
		wc = WordCloud(background_color="white", max_words=2000,  stopwords=sl,
				random_state=42)
		
		wc.generate(allText)
		wcdf = pd.DataFrame(wc.words_)
		wcdf.columns = ["word",speaker]
		return wcdf
	#Count the number of words in the entire transcript
	def getTotalWords():
		global stopwordshearing
		speakerData = data
		allText = ""
		for index, row in speakerData.iterrows():
			#s.translate(table, string.punctuation)
			allText += str(row['Text']).lower().translate(table)+" "
		allText = allText.replace("e-mail","email")
		allText = allText.replace("e- mail","email")
		allText = allText.replace("op-ed","oped")
		sl = STOPWORDS | stopwordshearing
		wc = WordCloud(background_color="white", max_words=2000,  stopwords=sl,
				random_state=42)
		
		wc.generate(allText)
		wcdf = pd.DataFrame(wc.words_)
		wcdf.columns = ["word","Total"]
		return wcdf
	# Separate dataframes by Republican and Democrat's word frequencies
	df_dict ={}
	i=1
	for name in data.Speaker.unique():
		df_dict[name] = getWords(name)
		#print df_dict[name].head()
		if i == 1:
			rdwc = df_dict[name]
		else:
			rdwc = pd.merge(rdwc, df_dict[name], on = "word", how='outer')
		i += 1
	df_dict["Total"] = getTotalWords()
	rdwc = pd.merge(rdwc,df_dict["Total"], on = "word", how='outer')
	print (rdwc.head())
	rdwc=rdwc.fillna(0)
	rdwc.to_csv("wordfreq.csv")
	def getAllText(speaker):
		global stopwordshearing
		speakerData = data[data.Speaker == speaker]
		allText = ""
		for index, row in speakerData.iterrows():
			#s.translate(table, string.punctuation)
			allText += str(row['Text']).lower().translate(table)+" "
		allText = allText.replace("e-mail","email")
		allText = allText.replace("e- mail","email")
		allText = allText.replace("op-ed","oped")
		
		return allText
	#Calculate using CountVectorizer and also compute the cosine similarities
	df_list =[]
	speaker_list=[]
	i=1
	for name in data.Speaker.unique():
		df_list.append(getAllText(name))
		speaker_list.append(name)
	#print(df_dict)
	vectorizer = CountVectorizer(input='content',stop_words=stop_words)
	dtm = vectorizer.fit_transform(df_list)
	vocab = vectorizer.get_feature_names()
	dtm = dtm.toarray()
	vocab = np.array(vocab)
	dist = 1 - cosine_similarity(dtm)
	print(np.round(dist, 2))  # rounded view of the full cosine-distance matrix
	print(dist[0,1])
	print(dist[0,2])
	mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
	pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
	xs, ys = pos[:, 0], pos[:, 1]
	for x, y, name in zip(xs, ys, speaker_list):
		color = 'orange' if "CLINTON" in name else 'skyblue'
		plt.scatter(x, y, c=color)
		plt.text(x, y, name)
	plt.show()
	mds = MDS(n_components=3, dissimilarity="precomputed", random_state=1)
	pos = mds.fit_transform(dist)
	from mpl_toolkits.mplot3d import Axes3D
	fig = plt.figure()
	ax = fig.add_subplot(111, projection='3d')
	ax.scatter(pos[:, 0], pos[:, 1], pos[:, 2])
	for x, y, z, s in zip(pos[:, 0], pos[:, 1], pos[:, 2], speaker_list):
		ax.text(x, y, z, s)
	plt.show()
	
	from scipy.cluster.hierarchy import ward, dendrogram
	linkage_matrix = ward(dist)
	names = speaker_list
	dendrogram(linkage_matrix, labels=names)
	plt.tight_layout() 
	plt.show()
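
# The core of the pipeline above (bag-of-words -> cosine distance -> Ward
# dendrogram), reduced to a self-contained sketch on toy documents; the speaker
# names and texts here are placeholders, not taken from the debate transcripts.
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram

toy_docs = {
    'SPEAKER_A': 'taxes jobs economy growth jobs trade',
    'SPEAKER_B': 'immigration border security enforcement',
    'SPEAKER_C': 'taxes economy trade jobs growth deficit',
}
toy_dtm = CountVectorizer(stop_words='english').fit_transform(toy_docs.values())
toy_dist = 1 - cosine_similarity(toy_dtm)
dendrogram(ward(toy_dist), labels=list(toy_docs.keys()))
plt.tight_layout()
plt.show()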
コード例 #58
0
ファイル: cluster.py プロジェクト: lkngin/Malaya
def cluster_dendogram(
    corpus: List[str],
    vectorizer,
    titles: List[str] = None,
    stemming: Callable = sastrawi,
    stop_words: List[str] = None,
    cleaning: Callable = simple_textcleaning,
    random_samples: float = 0.3,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    Plot a hierarchical dendrogram of similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class.
    titles: List[str], (default=None)
        list of titles, must be the same length as corpus.
    stemming: function, (default=sastrawi)
        function to stem the corpus.
    stop_words: List[str], (default=None)
        list of stop words to remove. If None, default is malaya.texts._text_functions.STOPWORDS
    cleaning: function, (default=simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%.
    ngram: Tuple[int, int], (default=(1,3))
        n-gram sizes to train on the corpus.
    figsize: Tuple[int, int], (default=(17, 9))
        figure size of the plot.
    batch_size: int, (default=20)
        number of strings per batch for vectorization and attention. Only useful with a transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}
    """
    if not callable(stemming) and stemming is not None:
        raise ValueError('stemming must be a callable type or None')
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be the same as corpus')

    if not hasattr(vectorizer, 'vectorize') and not hasattr(vectorizer, 'fit'):
        raise ValueError('vectorizer must have a `fit` or `vectorize` method')
    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram

        sns.set()
    except ImportError:
        raise Exception(
            'matplotlib and seaborn not installed. Please install them and try again.'
        )
    if stop_words is None:
        stop_words = STOPWORDS

    corpus = random.sample(corpus, k = int(random_samples * len(corpus)))

    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    if stemming:
        for i in range(len(corpus)):
            corpus[i] = stemming(corpus[i])
    text_clean = []
    for text in corpus:
        text_clean.append(
            ' '.join([word for word in text.split() if word not in stop_words])
        )

    if hasattr(vectorizer, 'fit'):
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for i in range(0, len(text_clean), batch_size):
            index = min(i + batch_size, len(text_clean))
            transformed_text_clean.append(
                vectorizer.vectorize(text_clean[i:index])
            )
            attentions.extend(vectorizer.attention(text_clean[i:index]))
        transformed_text_clean = np.concatenate(
            transformed_text_clean, axis = 0
        )

    dist = 1 - cosine_similarity(transformed_text_clean)
    linkage_matrix = ward(dist)
    if not titles:
        titles = []
        for i in range(transformed_text_clean.shape[0]):

            if hasattr(vectorizer, 'fit'):
                indices = np.argsort(
                    np.array(transformed_text_clean[i].todense())[0]
                )[::-1]
                titles.append(
                    ' '.join([features[i] for i in indices[: ngram[1]]])
                )
            else:
                attentions[i].sort(key = lambda x: x[1])
                titles.append(
                    ' '.join([i[0] for i in attentions[i][-ngram[1] :]])
                )
    plt.figure(figsize = figsize)
    ax = dendrogram(linkage_matrix, orientation = 'right', labels = titles)
    plt.tick_params(
        axis = 'x',
        which = 'both',
        bottom = 'off',
        top = 'off',
        labelbottom = 'off',
    )
    plt.tight_layout()
    plt.show()
    return {'linkage_matrix': linkage_matrix, 'titles': titles}
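
# A hedged usage sketch for cluster_dendogram above. The toy corpus and the
# choice of TfidfVectorizer are illustrative; the defaults (Malay stopwords,
# simple_textcleaning) assume the Malaya dependencies are installed, and
# stemming is disabled here to keep the example minimal.
from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = [
    'kerajaan umum bajet baharu untuk ekonomi',
    'pasukan bola sepak menang perlawanan akhir',
    'bajet ekonomi memberi kesan kepada rakyat',
    'penyokong bola sepak meraikan kemenangan',
    'menteri kewangan bentang bajet di parlimen',
    'jurulatih puji pemain selepas perlawanan',
]
result = cluster_dendogram(
    toy_corpus, vectorizer=TfidfVectorizer(), stemming=None, random_samples=0.9
)
print(result['titles'])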
コード例 #59
0
ファイル: knock98.py プロジェクト: tmu-nlp/100knock2016
import numpy as np
from scipy.cluster.hierarchy import dendrogram, ward
from matplotlib.pyplot import show
from gensim.models import word2vec


model = word2vec.Word2Vec.load("knock90_word2vec")
country_list = list()
vector_list = list()
for country in open('country_list.txt'):
    country = country.strip('\n')
    if country in model:
        country_list.append(country)
        vector_list.append(model[country])
features = np.array(vector_list)
clustering = ward(features)
dendrogram(clustering, labels=country_list, orientation='left', leaf_font_size=10)
show()
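
# Follow-up sketch (not part of knock98): the same Ward linkage can also be cut
# into a fixed number of flat clusters; `clustering` and `country_list` are the
# variables built above.
from scipy.cluster.hierarchy import fcluster

labels = fcluster(clustering, t=5, criterion='maxclust')
for country, label in sorted(zip(country_list, labels), key=lambda x: x[1]):
    print(label, country)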
コード例 #60
0
    for line in file:
        # remove linebreak which is the last character of the string
        currentPlace = line[:-1]

        # add item to the list
        features.append(currentPlace)



dist = 1 - cosine_similarity(X_train_vectorised)


import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram

linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
'''
fig, ax = plt.subplots(figsize=(100, 200)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=title);
plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')
plt.tight_layout() #show plot with tight layout
'''

c = list(range(2, 14))
clusters = [create_clusters(cl) for cl in c]
# Logistic regression