Example #1
    def _run_interface(self, runtime):        
        fname = self.inputs.volume
        # load data: keep rows 8 through the second-to-last line
        datafile = open(fname, 'rb')
        data = [i.strip().split() for i in datafile.readlines()]
        stringmatrix = data[8:-1]
        datafile.close()

        if self.inputs.hemi == 'lh': chosenvertices = lhvertices
        if self.inputs.hemi == 'rh': chosenvertices = rhvertices
        corrmatrix = np.zeros((len(chosenvertices),len(chosenvertices)))
        for x, vertex in enumerate(chosenvertices):
            for i in range(len(chosenvertices)):  # xrange in the Python 2 original
                corrmatrix[x][i] = abs(float(stringmatrix[vertex][i]))
        if self.inputs.cluster_type == 'spectral':
            labels = spectral(corrmatrix, n_clusters=self.inputs.n_clusters, eigen_solver='arpack')  # 'mode' was renamed eigen_solver in newer scikit-learn
        if self.inputs.cluster_type == 'hiercluster':
            labels = Ward(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        if self.inputs.cluster_type == 'kmeans':
            labels = km(n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        if self.inputs.cluster_type == 'dbscan':
            labels = DBSCAN(eps=np.average(corrmatrix)+np.std(corrmatrix)).fit_predict(corrmatrix)
        sxfmout = self.inputs.sxfmout
        img = nb.load(sxfmout)

        outarray = -np.ones(shape=img.shape[0])
        for j, cluster in enumerate(labels):
            outarray[chosenvertices[j]] = cluster+1

        new_img = nb.Nifti1Image(outarray, img.affine, img.header)  # newer nibabel: .affine/.header replace get_affine()/get_header()
        _, base, _ = split_filename(fname)
        nb.save(new_img, os.path.abspath(base + '_clustered.nii'))

        return runtime
Example #2
    def _run_interface(self, runtime):
        #load data
        data = nb.load(self.inputs.in_File).get_data()
        corrmatrix = np.squeeze(data)
        if self.inputs.cluster_type == 'spectral':
            positivecorrs = np.where(
                corrmatrix > 0, corrmatrix,
                0)  #threshold at 0 (spectral uses non-negative values)
            newmatrix = np.asarray(
                positivecorrs,
                dtype=np.double)  #spectral expects dtype=double values
            labels = spectral(newmatrix,
                              n_clusters=self.inputs.n_clusters,
                              eigen_solver='arpack',
                              assign_labels='discretize')
        if self.inputs.cluster_type == 'hiercluster':
            labels = Ward(
                n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        if self.inputs.cluster_type == 'kmeans':
            labels = km(
                n_clusters=self.inputs.n_clusters).fit_predict(corrmatrix)
        if self.inputs.cluster_type == 'dbscan':
            labels = DBSCAN(eps=self.inputs.epsilon).fit_predict(corrmatrix)

        new_img = nb.Nifti1Image(labels + 1,
                                 None)  #+1 because cluster labels start at 0
        _, base, _ = split_filename(self.inputs.in_File)
        nb.save(
            new_img,
            os.path.abspath(base + '_' + str(self.inputs.n_clusters) + '_' +
                            self.inputs.cluster_type + '_' + self.inputs.hemi +
                            '.nii'))

        return runtime
Example #3
 def run(self, src):
   x = self.table(src)
   k = self.settings.k
   if k == "auto":
     raise RuntimeError("Have to implement auto parser")
   cl = km(n_clusters=k)
   y = cl.fit_predict(x)
   clusters = {}
   for x_i, y_i in zip(x, y):
     clusters[y_i] = clusters.get(y_i, []) + [x_i]
   return clusters
Example #4
def retrain(clusters):
    pickle_file = open("train_data.pickle", "rb")
    arr = pickle.load(pickle_file)
    pickle_file.close()

    clt = km(n_clusters = clusters)
    clt.fit(arr)

    pickle_file = open("clt.pickle", "wb")
    pickle.dump(clt, pickle_file)
    pickle_file.close()
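A minimal usage sketch for the snippet above (assumptions: train_data.pickle already holds a 2-D feature array, and km is the KMeans alias used throughout these examples):

import pickle
import numpy as np

retrain(5)  # hypothetical call: fit 5 clusters and write clt.pickle

with open("clt.pickle", "rb") as f:
    clt = pickle.load(f)

# predict cluster indices for new points of the same feature width
new_points = np.random.rand(3, clt.cluster_centers_.shape[1])
print(clt.predict(new_points))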
Example #5
 def __nk(self, nk=None):
     if self.Xm is None:
         self.nk = None
     elif nk is None:
         #self.nk = np.random.randint(0,self.k, self.n)
         self.nk = km(n_clusters=self.k).fit_predict(self.Xm)
     else:
         try:
             self.nk = nk
             self.nk.shape = (self.n, )
         except ValueError:
             print "nk must be of the same lenght as Xm"
     return self.nk
Example #6
def cluster(nc):
    global list_u
    global dict_u
    clist = []
    for i in range(len(list_u)):
        clist.append(dict_u[list_u[i]])
    result = km(n_clusters=nc,
                max_iter=300, n_init=40, init='k-means++').fit_predict(
                    np.array(clist))  # cluster universities by their subject profiles into nc clusters, max 300 iterations

    ny = [[] for i in range(nc)]  # initialize one list per cluster
    for i in range(top_k):
        ny[result[i]].append(list_u[i])  # append each university name to its cluster's list

    for i in range(nc):  # print the university names in each cluster
        print(ny[i])
Example #7
 def genlist():
     print("fetching candidate replacements for " + self.pred)
     cdatasource = eval(self.classname).pred_candidates
     if not cdatasource.has(self.lnoun):
         print("can't find typical preds for ", self.noun)
         return None
     cands_max = cdatasource.get(self.lnoun).most_common()
     cands = set()
     syns = self.get_syns()
     syns.sort(key=lambda x: self.modifies_noun(x), reverse=True)
     strong_syns = list()
     mat = list()
     good = list()
     for cand in cands_max:
         c = cand[0]
         if c in vecs and abst.has(c) and abst.get(c) > abst.get(
                 self.pred):
             if c != self.pred and c in syns:
                 strong_syns.append(c)
             mat.append(vecs.get(c))
             good.append(cand)
     if len(strong_syns) > 0:
         self.strong_syns = strong_syns
     mat = np.array(mat)
     if len(cands_max) > 100 and False:  # 'and False' intentionally disables this clustering branch
         k = km(n_clusters=round(len(cands_max) / 50),
                random_state=0).fit(mat)
         hotclust = k.predict(vecs.get(self.pred).reshape(1, -1))
         cands = [
             cand for cand in good
             if k.predict(vecs.get(cand[0]).reshape(1, -1)) == hotclust
         ]
         cands.sort(key=lambda x: x[1])
         #cands = squeeze(cands,15)
     else:
         cands = good
     cands = [cand[0] for cand in cands[:50]]
     if len(strong_syns) > 1:
         cands = set(cands[:5]).union(set(strong_syns))
         cands.discard(self.pred)
         ret = list(cands)
     else:
         cands = set(cands[:15]).union(set(syns[:5]))
         cands.discard(self.pred)
         ret = squeeze(list(cands), 5)
     return ret
Example #8
def input_generator(file_names, n_clust, n_sub, cluster_method, sub_length):
    '''
    Creates a matrix from the gaze data. Gaze data from each file is clustered to n_clust points using either HR
    (hierarchical) or KM (k-means) clustering. These n_clust points are flattened to 2 * n_clust values, which are
    then divided into n_sub subsequences of length sub_length. The columns of the matrix correspond to these gaze
    subsequences, each flattened into 2 * sub_length values.
    :param file_names: Gaze file names as a list of strings.
    :param n_clust: Number of clusters as an integer.
    :param n_sub: Number of subsequences per image as an integer.
    :param cluster_method: Clustering method used as a String. Can be either 'HR' or 'KM'
    :param sub_length: Integer length of the sub sequence
    :return: Input matrix with all the clustered gaze points organized into sub sequences.
    '''
    mat = []
    for image in file_names:
        gaze = []
        with open(args.gaze_path + image, 'r') as f:
            reader = csv.reader(f, delimiter=',')
            count = 0
            for row in reader:
                gaze.append((int(row[0]), int(row[1]), count * 110))
                count += 1
            gaze = np.array(gaze)
            if cluster_method == 'HR':
                cluster_labels = hc(n_clusters=n_clust).fit_predict(gaze)
            elif cluster_method == 'KM':
                cluster_labels = km(n_clusters=n_clust).fit_predict(gaze)
            else:
                # the original bare try/except never fired for a bad method
                # string, since nothing inside the try raised an exception
                raise ValueError(
                    'Choose between "HR" or "KM" as the clustering method')
            result = {
                i: gaze[np.where(cluster_labels == i)]
                for i in range(n_clust)
            }
            centres = []
            for cluster in result:
                cluster_points = np.array(result[cluster])
                cluster_centre = np.mean(cluster_points, axis=0)
                centres.append(int(cluster_centre[0]))
                centres.append(int(cluster_centre[1]))
            # for i in range(0, len(centres) - (2 * sub_length), 2 * ((n_clust - sub_length)/ n_sub)):
            for i in range(0, len(centres), (2 * n_clust) // n_sub):  # integer step for range()
                mat.append(centres[i:i + (2 * sub_length)])
    return np.transpose(np.array(mat))
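A hedged usage sketch for input_generator; the file names are hypothetical, and it assumes args.gaze_path points at CSVs of x,y gaze rows and that hc and km are the clustering aliases the function relies on:

# Hypothetical call: 40 gaze clusters per file, 8 subsequences of length 5.
# With these values the step (2 * 40) // 8 = 10 equals 2 * sub_length, so
# every column is a full subsequence.
mat = input_generator(file_names=['subj01.csv', 'subj02.csv'],
                      n_clust=40,
                      n_sub=8,
                      cluster_method='KM',  # or 'HR'
                      sub_length=5)
print(mat.shape)  # 2 * sub_length rows, one column per subsequence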
Example #11
    def fit(self, original_image, segmented_image):
        self.original_image = original_image
        self.segmented_image = segmented_image
        self.groups = np.unique(
            self.segmented_image[self.segmented_image != -1])
        self.flat_segments = np.reshape(self.segmented_image, (-1))
        self.flat_image = np.reshape(self.original_image, (-1, 3))

        # Extracting objects (pixel groups) from the image
        objects = []
        for group in self.groups:
            obj = self.flat_image[self.flat_segments == group]
            objects.append(obj)

        # Constructing features (max, min, mean intensity) per object
        features = np.zeros((len(objects), 3))
        for i, obj in enumerate(objects):
            max_val = np.max(obj)
            min_val = np.min(obj)
            mean_val = np.mean(obj)
            features[i] = [max_val, min_val, mean_val]

        self.group_counts = np.array(self.n_clusters)

        kmeans = km(n_clusters=self.n_clusters, random_state=0).fit(features)
        labels = kmeans.labels_

        labelled_image = np.zeros(self.flat_segments.shape[0])

        for i, group in enumerate(self.groups):
            labelled_image[self.flat_segments == group] = labels[i]

        labelled_image = np.reshape(labelled_image, self.segmented_image.shape)

        if self.visualise:
            cv.imshow("Clustered image", labelled_image)
            cv.waitKey(0)
            cv.destroyWindow("Clustered image")

        return labelled_image
Example #12
def kmeans(
        feature_matrix: pd.DataFrame, k: int = 2,
        feature_columns: list = []):
    """Wrap sklearn KMeans.

    Args:
        feature_matrix (pd.DataFrame): preprocessed feature matrix
        k (int, optional): number of clusters. Defaults to 2.
        feature_columns (list, optional): feature columns to cluster on. Defaults to [].

    Returns:
        sklearn.cluster.KMeans: the fitted sklearn KMeans clustering object
    """
    if not feature_columns:
        feature_columns = feature_matrix.columns
    # everything except uid serves as a feature
    feature_matrix = feature_matrix[feature_columns]
    # convert to an np array for model input
    x = np.array(feature_matrix)
    # run KMeans clustering
    km_model = km(n_clusters=k, random_state=0).fit(x)
    return km_model
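A short usage sketch for the wrapper above; the toy DataFrame and its column names are assumptions:

import pandas as pd

# Hypothetical feature matrix: a uid column plus two numeric features.
demo = pd.DataFrame({'uid': [1, 2, 3, 4],
                     'f1': [0.1, 0.2, 0.9, 1.0],
                     'f2': [1.0, 0.9, 0.1, 0.0]})
model = kmeans(demo, k=2, feature_columns=['f1', 'f2'])
print(model.labels_)  # one cluster index per row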
Example #13
def silhoutte(name):
    df = pd.read_csv("../processing/" + name + "_dist.csv.virus")
    xs = list(df['x'])
    ys = list(df['y'])
    xs = [x - min(xs) for x in xs]
    ys = [x - min(ys) for x in ys]
    X = np.matrix(list(zip(xs, ys)))  # zip() must be materialized in Python 3
    stat = open('kstatistics.csv', 'w')
    ncluster = []
    distortion_set = []
    silh = []
    for nc in range(2, 13):
        kmeans = km(n_clusters=nc).fit(X)
        cluster_labels = kmeans.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        distortion = 0
        distortion = (sum(
            np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) /
                      X.shape[0])
        ncluster.append(nc)
        distortion_set.append(distortion)
        silh.append(silhouette_avg)

        # print kmeans.cluster_centers_, sum(np.min(cdist(X,
        # kmeans.cluster_centers_, 'euclidean'), axis=1))/13

        print(nc, distortion, silhouette_avg)
        print(nc, distortion, silhouette_avg, file=stat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(ncluster, distortion_set, '-o')
    ax.set_xlabel('No.of clusters')
    ax.set_ylabel('Distortion')
    ax.set_title("Selecting K with Elbow method", fontsize=10)
    plt.show(block=False)
    ''' fig1 = plt.figure()
Example #14
        a = exp_X_cent_dist / tf.reduce_sum(
            exp_X_cent_dist, axis=2, keep_dims=True)
        print a
        return a

    def fit(self, X_train):
        c, l = self.sess.run(self.network, {self.X: X_train})
        return c, l


from sklearn.cluster import KMeans as km

if __name__ == "__main__":
    nb_samples = 10000
    E = 2
    nb_clusters = 2

    X, y = make_blobs(n_samples=nb_samples, centers=nb_clusters, n_features=E)

    X_ = X[np.newaxis, :]
    y = y[np.newaxis, :]
    print y
    kmean = KMeans(nb_clusters)
    kmean.init()
    centroids, labels = kmean.fit(X_)
    print centroids
    print np.sum(labels)
    print y

    kmeans = km(n_clusters=2, random_state=0).fit(X)
    print kmeans.cluster_centers_
Example #15
#normalization
def fun(i):
    x = ((i - i.min()) / (i.max() - i.min()))
    return (x)


df_norm = fun(df.iloc[:, 1:])
df_norm.describe()

# In[157]:

#elbow curve
wss = []
k = list(range(10, 100, 5))
for i in k:
    kmeans = km(n_clusters=i)
    kmeans.fit(df_norm)
    wss.append(kmeans.inertia_)
wss

# In[158]:

plt.plot(k, wss, 'ro-')
plt.xlabel('number of clusters')
plt.ylabel('total with in ss')

# In[159]:

model = km(n_clusters=40)
model.fit(df_norm)
model.labels_
Example #16
    print('Computing using: ' + method + ' breed method')
    start = datetime.now()
    GENERATION = copy.deepcopy(init_pop)
    for i in range(gens):
        print('Generation no: ' + str(i + 1))
        GENERATION.select()
        survivors.append(GENERATION.population)
        top_scores.append((min(GENERATION.score())))
        fittest.append(GENERATION.population[GENERATION.sorted_scores[0]])
        GENERATION.mutate(0.001)
        GENERATION.breed(method=method)
    GENERATION.population = fittest
    GENERATION.sorted_scores = np.argsort(GENERATION.score())
    fit_rank = GENERATION.sorted_scores
    alpha = fittest[fit_rank[0]]
    ga_means = km(n_clusters, alpha, 1).fit(X)  # KMeans(n_clusters, init=alpha, n_init=1): seed k-means with the fittest GA individual
    cluster_list.append(ga_means)
    end = datetime.now()
    comp_duration.append(end - start)
    Fittest.append(alpha)
    #Survivors.append(survivors)

for i in range(len(cluster_list)):
    Distances.append(galuster.sum_distances(cluster_list[i], X))
    plt.figure(i)
    galuster.lolipop_plot(cluster_list[i], X)

#Iterate GA operations over number of generations

#ga_start = datetime.now()
#
Example #17
labels = mat["data_labels"]
frqs = [['Actual/Predict', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Recall']
        ] + [[i] + [0 for j in range(10)] for i in range(10)] + [['Precision']]

# saving original pics
# for i in range(2000):
# 	scipy.misc.imsave("Output/2.2c/Original_"+str(n_cluster)+"/"+str(i)+".bmp",data[i].reshape(28,28))

# modeling and predicting labels
pca = PCA(n_components=.9)
data = pca.fit_transform(data)

print data[0]
sys.exit()
model = km(n_clusters=n_cluster,
           max_iter=2000,
           n_init=100,
           init='k-means++',
           tol=.00001,
           n_jobs=4)
model.fit(np.array(data))
print 1
p_label = model.fit_predict(data)
print 2

# finding mapping
for i in range(2000):
    result[p_label[i]].append(labels[i][0])
mapping = [max(set(i), key=i.count) for i in result]

# saving pics after applying PCA
for i in range(2000):
    scipy.misc.imsave("../Output/2.2c/After_PCA_" + str(n_cluster) + "/" + str(i) + ".bmp",
                      data[i][:81].reshape(9, 9))

# saving results
Example #18
def find_spread(X,cluster_size):
    X_kmeans = km(cluster_size, random_state = 0).fit(X)
    X_kmeans_list = {i: X[np.where(X_kmeans.labels_ == i)] for i in range(cluster_size)}
    X_spread = list(np.diag(np.diag(np.cov(X_kmeans_list[i].T))) for i in range(cluster_size))
    return X_spread
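A quick sketch of find_spread on synthetic blobs (make_blobs and its parameters are assumptions, not part of the original):

import numpy as np
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=0)
spreads = find_spread(X_demo, cluster_size=3)
for s in spreads:
    print(np.diag(s))  # per-cluster feature variances (diagonal of the covariance)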
Example #19
ga_start = datetime.now()

for i in range(gens):
    print('Generation no: ' + str(i + 1))
    pop.select()
    #survivors.append(pop.population)
    top_scores.append((min(pop.score())))
    fittest.append(pop.population[pop.sorted_scores[0]])
    pop.mutate(0.001)
    pop.breed(method='hybrid')

#Cluster the data using the fittest seed
init_pop.population = fittest
init_pop.sorted_scores = np.argsort(init_pop.score())
fit_rank = init_pop.sorted_scores
ga_means = km(n_clusters, fittest[fit_rank[0]], 1).fit(X)  # KMeans(n_clusters, init=<fittest GA seed>, n_init=1)

ga_end = datetime.now()
comp_duration.append(ga_end - ga_start)

cluster_list.append(ga_means)
sum_of_dist.append(galuster.sum_distances(ga_means, X))

#Plot GA cluster membership
plt.figure(0)
galuster.lolipop_plot(ga_means, X)

for i in range(no_kmeans):
    print('Starting kmeans algorithm no: ' + str(i + 1))
    start = datetime.now()  #Record starting time
    kmeans = km(n_clusters, n_init=n_seed).fit(X)  #compute kmeans
Example #20
    return clusters

def organizeEnumeratedDictionary(dictionary):
	newDict = {}
	for i in range(0, len(dictionary)):
		newDict[i]=dictionary[i]
	return newDict

def emojiCodeToEmoji(clusterDict, emojiDict):
	for i in clusterDict:
		for x in range(0, len(clusterDict[i])):
			clusterDict[i][x] = emojiDict[clusterDict[i][x]]
	return clusterDict


tsvToEmojiDict = arrayToDict(getFile('emoji_lookup.tsv'))#dictionary to translate emojis over
emojiDataFrame = pd.read_csv(StringIO(codecs.open('emojis.txt', 'r', encoding='utf8', errors='ignore').read()), sep='\s+')#creates pandas system for holding data
dimensions = emojiDataFrame[emojiDataFrame.columns[1:]].to_numpy()  # feature columns as an array for clustering (as_matrix() was removed in pandas 1.0)
#			auto random state
cluster = km(n_clusters = 100, max_iter=10000000)#kmeans clustering
#cluster = ap(max_iter=1000000)#Affinity propogation clustering
#cluster = b(n_clusters=200)#birch clustering
cluster.fit(dimensions)
codedCluster = addEmojiCode(dimensions, cluster, getFile('emoji_lookup.tsv'))
organizedClusters = organizeEnumeratedDictionary(codedCluster)
emojiClusters = emojiCodeToEmoji(organizedClusters, tsvToEmojiDict)
for z in emojiClusters:
	print("Clusters" + str(z))
	print(emojiClusters[z])
	print("\n")
"""

import pandas as pd
from sklearn.cluster import KMeans as km
import numpy
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score as ss

data = pd.read_csv("sdm.csv", sep=";", header=None)
data.columns = ["depth", "param_1", "param_2"]

data_pr = data[["param_1", "param_2"]]

plt.scatter(data_pr.param_1, data_pr.param_2)

kmeans = km(init='k-means++', n_clusters=3, random_state=0).fit(data_pr.to_numpy())  # as_matrix() was removed in pandas 1.0
# data_pr['labels'] =pd.Series(kmeans.labels_)
# data_pr.plot.scatter(x='b',y='c',c='labels', colormap='viridis')
data_pr1 = data_pr.copy()
scores = []

for k in range(2, 10):
    kmeans = km(init='k-means++', n_clusters=k, random_state=0).fit(data_pr.to_numpy())
    data_pr1['labels'] = pd.Series(kmeans.labels_)
    print(len(kmeans.labels_))
    data_pr1.plot.scatter(x='param_1', y='param_2', c='labels', colormap='viridis')
    scores.append(ss(data_pr1[['param_1', 'param_2']], labels=data_pr1['labels']))

print(data_pr1)
n = [i for i in range(2, 10)]
plt.figure()
Example #22
 def run(self, clusters=3):
     self.y = km(n_clusters=clusters).fit_predict(self.X)
Example #23
data.head()

#plotting values
f1 = data['duration'].values
f2 = data['imdb_score'].values
X = np.array(list(zip(f1, f2)))
print("This is X. The zipped array")
print(X)

#Finding optimal k
wcss = []

for i in range(1, 11):
    kmeans = km(n_clusters=i,
                init='k-means++',
                max_iter=300,
                n_init=10,
                random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

#Plotting the results onto a line graph, allowing us to observe 'The elbow'
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  #within cluster sum of squares
plt.show()

k = 4
kmeans = km(n_clusters=k)
KMmodel = kmeans.fit(X)
Example #24
for line in lines:
    if line.strip() == "":
        break
    name = line.split(":")[0].strip()
    location = line.split(":")[1].strip().split(",")
    top_yelp.append([name, float(location[0].strip()), float(location[1].strip())])

top_yelp_lat_long = []
for row in top_yelp:
    top_yelp_lat_long.append([row[1], row[2]])

t = 21
lat_long = []
for row in all_data[t]:
    lat_long.append([row[3], row[4]])

for k in [240]:
    kmeans = km(k, max_iter=1000, n_init=50, init="k-means++")
    kmeans.fit(lat_long)


pred = kmeans.predict(top_yelp_lat_long)

pred_cluster_centers = [kmeans.cluster_centers_[i] for i in pred]

error = [vincenty(top_yelp_lat_long[i], pred_cluster_centers[i]).miles for i in range(len(top_yelp_lat_long))]

print "For t = {t_val}:".format(t_val=t)
print "Min Error\t\tMax Error\tAvg Error".format()
print "{min_e}\t{max_e}\t{avg_e}".format(min_e=min(error), max_e=max(error), avg_e=sum(error) / len(error))
Example #25
def runROIDetector(p, k, min_dist, threshold, gammae, nue):
    k = k.astype(np.int64)
    p = p.astype(np.int64)
    threshold = threshold.astype(np.int64)
    resize = .3

    filesHtrain = []
    filesUtrain = []

    filesUtest = []
    filesHtest = []

    filesUCV = []
    filesHCV = []

    unhealthyTestPatient = [
        f for f in listdir('testROI/unhealthy')
        if isdir(join('testROI/unhealthy', f))
    ]
    healthyTestPatient = [
        f for f in listdir('testROI/healthy')
        if isdir(join('testROI/healthy', f))
    ]
    unhealthyTrainPatient = [
        f for f in listdir('trainROI/unhealthy')
        if isdir(join('trainROI/unhealthy', f))
    ]
    healthyTrainPatient = [
        f for f in listdir('trainROI/healthy')
        if isdir(join('trainROI/healthy', f))
    ]
    unhealthyCVPatient = [
        f for f in listdir('trainROI/unlabel')
        if isdir(join('trainROI/unlabel', f))
    ]
    healthyCVPatient = healthyTrainPatient[0:8]
    healthyTrainPatient = healthyTrainPatient[8:len(healthyTrainPatient) - 1]

    for i in healthyTrainPatient:
        dirs = listdir('trainROI/healthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesHtrain.append(('trainROI/healthy/' + i + '/' + j))

    for i in unhealthyTrainPatient:
        dirs = listdir('trainROI/unhealthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesUtrain.append(('trainROI/unhealthy/' + i + '/' + j))

    for i in unhealthyCVPatient:
        dirs = listdir('trainROI/unlabel/' + i)
        for j in dirs:
            if "._" not in j:
                filesUCV.append(('trainROI/unlabel/' + i + '/' + j))
    for i in healthyTestPatient:
        dirs = listdir('testROI/healthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesHtest.append(('testROI/healthy/' + i + '/' + j))

    for i in unhealthyTestPatient:
        dirs = listdir('testROI/unhealthy/' + i)
        for j in dirs:
            if "._" not in j:
                filesUtest.append(('testROI/unhealthy/' + i + '/' + j))

    init_words = []
    words = []
    final_words = []
    healthyPatientDict = {}
    # Part One:----------Extract initial words from training positive set----------
    for image in (filesUtrain):
        #        read image from Unhealthy class (These are the seed examples)
        img = imread(image)
        [h, w, d] = img.shape
        img = rescale(img, resize)
        [h, w, d] = img.shape

        #        Trim the image to a bucketable size
        F = img[0:int(p * math.floor(h / p)), 0:int(p * math.floor(w / p)), :]

        #        Extract HOG Features
        fd = hog(F[:, :, 1],
                 orientations=8,
                 pixels_per_cell=(p, p),
                 cells_per_block=(1, 1))

        #        Reorganize features so that each index matches a sample
        features = np.reshape(fd, ((F.shape[1] * F.shape[0]) // (p * p), 8))  # integer shape for reshape

        #        Cluster the features
        KM = km(n_clusters=k).fit(features)

        #        Add the cluster centers to the initial dictionary
        init_words.extend(KM.cluster_centers_)

    init_words = np.asarray(init_words)
    occurencVec = {}
    ### Part Two:----------------------Filter out "bad words"------------------------
    for patient in (healthyTrainPatient):
        healthyPatientDict[patient] = {}
        for idx in range(len(init_words)):
            temp = healthyPatientDict[patient]
            temp[idx] = -1
            occurencVec[idx] = 0
            healthyPatientDict[patient] = temp
    for patient in (healthyTrainPatient):
        imageList = listdir('trainROI/healthy/' + patient)
        HOGvec = []
        count = 0
        for image in imageList:
            if "._" not in image:
                #             read image from Healthy class
                img = imread('trainROI/healthy/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape

                #                 Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)),
                        0:int(p * math.floor(w / p)), :]

                #               Extract HOG Features
                fd = hog(F[:, :, 1],
                         orientations=8,
                         pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))

                #              Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) //
                                           (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1
#       For each of the words in the initial dictionary, calculate the
#       L2-distance between the features of the current image. If the distance
#       is too small too many times, remove the word from the dictionary.

        initWordsIdx = 0
        for rows in (init_words):
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(rows - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                    temp = occurencVec[initWordsIdx]
                    occurencVec[initWordsIdx] = temp + 1
                iters = iters + 1
            final_words.append(rows)
            temp = healthyPatientDict[patient]
            temp[initWordsIdx] = num_of_matches
            initWordsIdx = initWordsIdx + 1

    averages = []
    for count in range(len(healthyPatientDict[healthyTrainPatient[0]])):
        numerator = 0
        for patient in healthyTrainPatient:
            temp = healthyPatientDict[patient]
            numerator = numerator + temp[count]
        average = numerator / len(healthyPatientDict)
        averages.append(average)
        idxs = np.where(np.asarray(averages) < threshold)
    idxs = idxs[0]
    featureMatrix = np.zeros(shape=(len(healthyPatientDict), len(idxs)))
    i = 0
    for patient in healthyPatientDict:
        j = 0
        for idx in idxs:
            temp = healthyPatientDict[patient]
            featureMatrix[i, j] = temp[idx]
            j = j + 1
        i = i + 1
    validationMatrix = np.zeros(shape=(len(unhealthyCVPatient) +
                                       len(healthyCVPatient), len(idxs)))
    i = 0
    for patient in (healthyCVPatient):
        imageList = listdir('trainROI/healthy/' + patient)
        HOGvec = []
        count = 0
        j = 0
        for image in imageList:
            if "._" not in image:
                #             read image from Healthy class
                img = imread('trainROI/healthy/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape

                #                 Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)),
                        0:int(p * math.floor(w / p)), :]

                #               Extract HOG Features
                fd = hog(F[:, :, 1],
                         orientations=8,
                         pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))

                #              Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) //
                                           (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1
#       For each of the words in the initial dictionary, calculate the
#       L2-distance between the features of the current image. If the distance
#       is too small too many times, remove the word from the dictionary.

        initWordsIdx = 0
        for num in range(len(idxs)):
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(init_words[num] - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                iters = iters + 1
            validationMatrix[i, j] = num_of_matches
            j = j + 1
        i = i + 1
    for patient in (unhealthyCVPatient):
        imageList = listdir('trainROI/unlabel/' + patient)
        HOGvec = []
        count = 0
        j = 0
        for image in imageList:
            if "._" not in image:
                #             read image from Healthy class
                img = imread('trainROI/unlabel/' + patient + '/' + image)
                [h, w, d] = img.shape
                img = rescale(img, resize, anti_aliasing=True)
                [h, w, d] = img.shape

                #                 Trim the image to a bucketable size
                F = img[0:int(p * math.floor(h / p)),
                        0:int(p * math.floor(w / p)), :]

                #               Extract HOG Features
                fd = hog(F[:, :, 1],
                         orientations=8,
                         pixels_per_cell=(p, p),
                         cells_per_block=(1, 1))

                #              Reorganize features so that each index matches a sample
                features = np.reshape(fd, ((F.shape[1] * F.shape[0]) //
                                           (p * p), 8))
                if count == 0:
                    HOGvec = features
                else:
                    HOGvec = np.vstack((HOGvec, features))
                count = count + 1
#       For each of the words in the initial dictionary, calculate the
#       L2-distance between the features of the current image. If the distance
#       is too small too many times, remove the word from the dictionary.

        initWordsIdx = 0
        for num in range(len(idxs)):
            num_of_matches = 0
            iters = 0
            for n in HOGvec:
                iters = iters + 1
                r = np.linalg.norm(init_words[num] - n)
                if r < min_dist:
                    num_of_matches = num_of_matches + 1
                iters = iters + 1
            validationMatrix[i, j] = num_of_matches
            j = j + 1
        i = i + 1

    normalizedXtrain = normalize(featureMatrix)
    normalizedXCV = normalize(validationMatrix)

    clf = svm.OneClassSVM(nu=nue, kernel="rbf", gamma=gammae)
    clf.fit(normalizedXtrain)
    y_CV = np.ones(shape=(len(unhealthyCVPatient) + len(healthyCVPatient), 1))
    y_CV[len(healthyCVPatient):len(unhealthyCVPatient) +
         len(healthyCVPatient)] = -1
    y_pred_train = clf.predict(normalizedXtrain)
    y_pred_CV = clf.predict(normalizedXCV)

    return [y_pred_train, y_pred_CV, y_CV]
Example #26
</script>
  <script async defer
      src="https://maps.googleapis.com/maps/api/js?key=AIzaSyDu4tAkj9-8cwEPTamK812YSbPnZ6xq9D8&signed_in=true&libraries=visualization&callback=initMap">
  </script>
</body>
</html>

"""

all_data = pickle.load(open("../data/all_data_new.p", "rb"))


t = 0
lat_long = []
for row in all_data[t]:
    lat_long.append([row[3], row[4]])

kmeans = km(190, max_iter=1000, n_init=50, init='k-means++')
kmeans.fit(lat_long)


t = 0
lat_lon_values = ""
for c in kmeans.cluster_centers_:
    lat_lon_values += "new google.maps.LatLng(" + str(c[0]) + ", " + str(c[1]) + "),\n"
lat_lon_values = lat_lon_values[:-2]
file_ = open('../outputs/Google_Heatmap_{t_val}.html'.format(t_val=t), 'w')
file_.write(html_front + lat_lon_values + html_back)
file_.close()
Example #27
print("How do I look?")
print(features.head())


# ###Normalizing the data and clustering


cols_to_norm = ['Duration','distance_start_stop', 'day_of_week', 'hours'] 
features[cols_to_norm] = features[cols_to_norm].apply(lambda x: (x - x.mean()) / (x.max() - x.min()))
print("Normalized")
print(features.head(2))


cluster_num = 11
model = km(n_clusters = cluster_num, n_init=5, max_iter=20)
model.fit_transform(features)
print("Model created")


features['labels'] = model.labels_
print("Got some labels.")
print(features.head(2))


#sampling my data to run the silhouette score
sample = features.sample(4000)


silhouette_score(sample.drop('labels', axis=1).values, sample['labels'].values)  # score the features only, not the label column
Example #28
def choose_center(X):
    X_kmeans = km(n_clusters = 10, random_state = 0).fit(X)
    X_centers = X_kmeans.cluster_centers_
    return X_centers
Example #29
import pandas as pd
from sklearn.cluster import KMeans as km
import numpy as np
import seaborn as sns
# read the data
df = pd.read_csv("Final-data.txt")
# choose k and build the model
k = int(input("k:"))
a = km(n_clusters=k).fit(df)

# Performance
TCSS = np.sum((df.values - np.sum(df.values, axis=0) / len(df))**2)
WCSS = np.zeros(k)
for index, i in enumerate(a.labels_):
    WCSS[i] += np.sum((df.values[index] - a.cluster_centers_[i])**2)**0.5

BCSS = TCSS - np.sum(WCSS)
dist = []
for i in a.cluster_centers_:
    for j in a.cluster_centers_:
        dist.append(np.sum((i - j)**2)**0.5)

DunnIndex = np.min(WCSS) / np.max(dist)
#####################################################
# write the results
c = np.zeros(k)
f = open("sonuc.txt", "w")
for i in range(len(a.labels_)):
    f.write("Record " + str(i) + ":\t" + "Cluster " + str(a.labels_[i]) + "\n")
    c[a.labels_[i]] += 1
Example #30
b0 = []
b1 = []
for i in range(len(a)):
    if i % 2 == 0:
        b0.append(a[i])
    else:
        b1.append(a[i])
a0 = [float(x) for x in b0]
a1 = [float(x) for x in b1]
df = pd.DataFrame()
df['x'] = a0
df['y'] = a1
y_true = [0 for i in range(500)]
for i in range(len(a0) - 500):
    y_true.append(1)
model = km(n_clusters=3)
y = model.fit_predict(df)
plt.title("KMeans")
plt.scatter(df[y == 0]['x'], df[y == 0]['y'])
plt.scatter(df[y == 1]['x'], df[y == 1]['y'])
plt.show()
print("Purity score for KMeans: ", purity_score(y_true, y))

clustering = AgglomerativeClustering().fit(df)
y = clustering.labels_
plt.title("Agglo_Clustering")
plt.scatter(df[y == 0]['x'], df[y == 0]['y'])
plt.scatter(df[y == 1]['x'], df[y == 1]['y'])
plt.show()
print("Purity score for Agglomerative Clustering: ", purity_score(y_true, y))
Example #31
def Prototyping(X, numP):
    from sklearn.cluster import KMeans as km
    kmeans = km(init='k-means++', n_clusters=numP)
    kmeans.fit(X)
    centers = kmeans.cluster_centers_
    return centers
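A usage sketch for Prototyping (the blob data and parameters are assumptions):

from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=500, centers=5, random_state=1)
prototypes = Prototyping(X_demo, numP=5)
print(prototypes.shape)  # (numP, n_features): one prototype per cluster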
Example #32
def scale(array):
    minimum = array.min()
    maximum = array.max()
    scaled_array = np.array([])
    for i in range(0, len(array)):
        scaled_array = np.append(scaled_array,
                                 (array[i] - minimum) / (maximum - minimum))

    return scaled_array


# In[]:

ofc_scaled = scale(df_full['ofc'])
mi_scaled = scale(df_full['transform'])
points = np.column_stack((ofc_scaled, mi_scaled))
#metrics = [df_full['ofc'], df_full['mi']]
#points = pd.concat(metrics, axis = 1)
from sklearn.cluster import KMeans as km
kmeans = km(n_clusters=3)
# fit kmeans object to data
kmeans.fit(points)
# print location of clusters learned by kmeans object
print(kmeans.cluster_centers_)
# save new clusters for chart
y_km = kmeans.fit_predict(points)

#plt.scatter(points[y_km ==0,0], points[y_km == 0,1], s=100, c='red')
#plt.scatter(points[y_km ==1,0], points[y_km == 1,1], s=100, c='black')
Example #33
from sklearn.cluster import KMeans as km
from sklearn.metrics import silhouette_score
import sys
from scipy.spatial.distance import cdist
import numpy as np


df = pd.read_csv("data_dist.csv.virus")
xs = list(df['x'])
ys = list(df['y'])
xs = [x - min(xs) for x in xs]  # plain lists don't support element-wise subtraction
ys = [y - min(ys) for y in ys]
X = np.matrix(zip(xs, ys))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
stat = open('kstatistics.csv', 'w')
for nc in range(2, 13):
    kmeans = km(n_clusters=nc, random_state=10)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    #print("For n_clusters =", no_cluster, "The average silhouette_score is :", silhouette_avg)
    labels = kmeans.labels_
    dist = kmeans.transform(X)
    distortion = 0
    distortion = (
        sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    # print kmeans.cluster_centers_, sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/13

    print nc, distortion, silhouette_avg
    print >>stat, nc, distortion, silhouette_avg
Example #35

valores = data.values
escal = pre.MinMaxScaler()
x_esc = escal.fit_transform(valores)
x_normalizado = pd.DataFrame(x_esc)

pca = PCA(n_components=2)

reduced = pd.DataFrame(pca.fit_transform(x_normalizado))

reduced['x'] = reduced[0]
reduced['y'] = reduced[1]


from sklearn.cluster import KMeans as km

rede = km(n_clusters=4)

rede.fit(reduced)

lista = np.array([rede.labels_, nomes.to_numpy()])

reduced['cluster'] = rede.labels_.tolist()
reduced['nomes'] = nomes


import matplotlib.pyplot as plt

plt.scatter(reduced['x'], reduced['y'], c=reduced['cluster'], s=150)
[plt.text(reduced['x'][i], reduced['y'][i], nomes.to_numpy()[i]) for i in range(len(nomes))]
Example #36
    cm.yaxis.set_ticklabels(cm.yaxis.get_ticklabels(), rotation=90)
    cm.xaxis.set_ticklabels(cm.xaxis.get_ticklabels(), rotation=0)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def convertCluster2Label(cluster_labels, original_labels, labels2convert):
    converted_labels = np.full(labels2convert.size, -1)
    for i in np.unique(cluster_labels):
        temp_original_labels = original_labels[cluster_labels == i]
        temp_label = np.bincount(temp_original_labels).argmax()
        converted_labels[labels2convert == i] = temp_label
    return converted_labels


iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
X_train, X_test, y_train, y_true = train_test_split(X, y)
kmeans = km(n_clusters=3, n_init=2020)
kmeans.fit(X_train)
values = kmeans.cluster_centers_.squeeze()
trained_labels = kmeans.labels_
labels_predict = kmeans.predict(X_test)
print(labels_predict)
y_predict = convertCluster2Label(trained_labels, y_train, labels_predict)
print(y_predict)

confusionM(y_true, y_predict, target_names)