Example #1
def get_domi_color_new_image(image, n_clusters=2):
    '''
    INPUT:
        image: numpy array
        n_clusters: integer

    OUTPUT:
        domi_color: numpy array
    '''
    
    if len(image.shape) == 3:
        image = transform.resize(image, (300,300,3))
    else:
        return -1

    # Flatten the image matrix:
    nrow, ncol, depth = image.shape 
    lst_of_pixels = [image[irow][icol] for irow in range(nrow) for icol in range(ncol)]

    # Clustering the colors of each pixel:
    kmean = KMeans(n_clusters=n_clusters)
    kmean.fit_transform(lst_of_pixels)
    domi_colors = kmean.cluster_centers_

    # Get the dominant color of the furniture (darker than the background):
    if np.mean(domi_colors[0]) < np.mean(domi_colors[1]):
        domi_color = domi_colors[0]
    else:
        domi_color = domi_colors[1]
    return domi_color
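A minimal usage sketch for Example #1, with the imports the snippet relies on made explicit (numpy, skimage.transform and scikit-learn are assumptions about the original module) and a small synthetic image:

import numpy as np
from skimage import transform
from sklearn.cluster import KMeans

# Synthetic 100x100 RGB image: a dark square on a light background.
image = np.ones((100, 100, 3))
image[30:70, 30:70] = 0.2

domi_color = get_domi_color_new_image(image, n_clusters=2)
print(domi_color)  # expected to be close to [0.2, 0.2, 0.2]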
Example #2
def mfcc_clustering(file_name, n_clusters):
    """
    From Prem
    :return:
    """

    clusterer = KMeans(n_clusters=n_clusters)

    print(file_name)
    mix, sr = librosa.load(file_name)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])
    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    save_mfcc_img(file_name[:-4] + "_mfcc.png", np.flipud(cluster_comps))
    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_
    # print(labels)
    sources = []

    for cluster_index in range(n_clusters):
        indices = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, indices], mix_stft)
        t = librosa.istft(template)
        sources.append(t)

    return np.array(sources)
Example #3
def run_kmeans(vector=None, links=None, iters=500, clusters=8):
    if links is None:
        links = []  # avoid a mutable default argument
    km = KMeans(n_clusters=clusters, max_iter=iters)  # the keyword is max_iter, not max_iters
    km.fit_transform(vector)
    grouped = defaultdict(list)
    for i in range(len(links)):
        grouped[km.labels_[i]].append(links[i])
    for x in grouped:
        print(x, grouped[x])
    return km.labels_
Example #4
 def get_kmean_clusters(self,X):
     '''
     Returns labels of kmeans clustering
     INPUTS: X = feature matrix as 2d numpy float array
     OUTPUTS: KMeans cluster labels as a 1d numpy array of integers
     '''
     kmeans = KMeans(5)
     kmeans.fit_transform(X)
     return kmeans.labels_ 
Example #5
def wrapper_scikit(K):
    pics_t = np.empty((pics.shape[0],np.power(pics.shape[1],2)))
    for i in range(pics_t.shape[0]):
        pics_t[i] = pics[i].flatten()
    time1 = time.time()
    kmean = KMeans(init='random', n_clusters=K)
    kmean.fit_transform(pics_t)
    time2 = time.time()
    return (time2-time1)*1000.
Example #6
def findElbow(features, n = 10):
    error = []
    for i in range(n):
        km = KMeans(n_clusters = i + 1)
        km.fit_transform(features)
        error.append(kmeansError(features, km))
    plt.figure(figsize=(10,10))
    plt.plot(range(1,n + 1),error,'k',linewidth=10)
    plt.plot(range(1,n + 1),error,'ko',markersize=25)
    plt.show()
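The kmeansError helper called above is not part of the snippet. A hypothetical stand-in, assuming the intended error is the within-cluster sum of squares that scikit-learn exposes as inertia_ on a fitted model:

def kmeansError(features, km):
    # Hypothetical helper: within-cluster sum of squared distances from the fit.
    return km.inertia_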
Example #7
def get_kmean_model(X, true_k, n_init=10, verbose=False):
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                n_init=n_init, verbose=verbose)
    km.fit_transform(X)
    return km
Example #8
def train_model(texts, points, num_classses, model_dir, text_encoding='utf-8'):
	""" Given an iterable of (text, lat, lon) items, cluster the points into #num_classes and use
	them as labels, then extract unigram features, train a classifier and save it in models/model_name
	for future use. 

	Args:
	texts -- an iterable (e.g. a list) of texts e.g. ['this is the first text', 'this is the second text'].
	points -- an iterable (e.g. a list) of tuples in the form of (lat, lon) where coordinates are of type float e.g. [(1.2343, -10.239834), (5.634534, -12.47563)].
	num_classes -- the number of desired clusters/labels/classes of the model.
	model_dir -- the name of the directory within models/ where the model will be saved.
	"""
	
	if os.path.exists(model_dir):
		logging.error("Model directory " + model_dir + " already exists, please try another address.")
		sys.exit(-1)
	else:
		os.mkdir(model_dir)
	
	from sklearn.cluster import KMeans
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model.stochastic_gradient import SGDClassifier
	
	kmeans = KMeans(n_clusters=num_classses, random_state=0)
	points_arr = numpy.array(points)
	kmeans.fit_transform(points_arr)
	cluster_centers = kmeans.cluster_centers_
	sample_clusters = kmeans.labels_
	label_coordinate = {}
	for i in range(cluster_centers.shape[0]):
		lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
		label_coordinate[i] = (lat, lon)
	
	logging.info('extracting features from text...')
	vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english', ngram_range=(1,1), max_df=0.5, min_df=0, binary=True, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)
	X_train = vectorizer.fit_transform(texts)
	Y_train = sample_clusters
	vectorizer.stop_words_ = None
	logging.info('the number of samples is %d and the number of features is %d' % (X_train.shape[0], X_train.shape[1]))
	
	logging.info('training the classifier...')
	logging.warn('Note that alpha (regularisation strength) should be tuned based on the performance on validation data.')
	clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5, l1_ratio=0.9, fit_intercept=True, n_iter=5, n_jobs=2, random_state=0, learning_rate="optimal")
	clf.fit(X_train, Y_train)
	clf.coef_ = csr_matrix(clf.coef_)
	
	logging.info('retrieving address of the given points using geopy (requires internet access).')
	coordinate_address = retrieve_location_from_coordinates(label_coordinate.values())

	logging.info('dumping the the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in ' + model_dir)
	dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
Example #9
def kmeans(embedding,n_components, mask):
    import numpy as np
    from sklearn.cluster import KMeans
    
    all_vertex=range(embedding.shape[0])
    masked_embedding = np.delete(embedding, mask, 0)
    cortex=np.delete(all_vertex, mask)
    
    est = KMeans(n_clusters=n_components, n_jobs=-2, init='k-means++', n_init=300)
    est.fit_transform(masked_embedding)
    labels = est.labels_
    kmeans_results = labels.astype(float)  # np.float was removed in recent NumPy
    kmeans_recort = recort(len(all_vertex), kmeans_results, cortex, 1)
    return kmeans_recort
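The recort helper is external to this snippet. A hypothetical reconstruction, assuming its job is to scatter the masked cluster labels back onto the full vertex list, with the last argument added so that masked vertices keep the value 0:

def recort(n_vertices, data, cortex, increase):
    # Hypothetical stand-in: place `data` at the cortex indices, shifted by
    # `increase`, and leave the masked vertices at zero.
    out = np.zeros(n_vertices)
    out[cortex] = data + increase
    return out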
Example #10
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     # Note: the original reused the fitted KMeans here; GaussianMixture has no
     # transform, so this sketch uses the component responsibilities instead.
     gmm.fit(X_train_scl)
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #11
def decompose_map(map1, method, r=40, out='inter'):
    map1.reset_solution()
    if method == 'EIG':
        map1.decompose('EIG', dim_num=r)
    elif method == 'PCA':
        map1.decompose('PCA', dim_num=r)
    elif method == 'ICE':
        map1.decompose('ICE', dim_num=r)
    elif method == 'K-means':
        from k_means_pdist import kmeanssample
        DIST = -np.array(map1.contact_map) ## simi to dist
        centres, xtoc, dist = kmeanssample(DIST, np.eye(DIST.shape[0]), r, nsample=0, delta=0.001, maxiter=20, verbose=0)
        map1.contact_group = -np.matrix(dist) ## dist to simi
    elif method == '3D-K-means':
        km = KMeans(n_clusters=r)
        dfile = 'pdb.txt'
        pb, vx = map1.get_locations(dfile, st=1, ch=0, po=1, nm=2, add=0)
        pb, vy = map1.get_locations(dfile, st=1, ch=0, po=1, nm=3, add=0)
        pb, vz = map1.get_locations(dfile, st=1, ch=0, po=1, nm=4, add=0)
        X = np.zeros((map1.contact_map.shape[0], 3))
        C = np.zeros(map1.contact_map.shape[0])
        for i,x,y,z in zip(pb,vx,vy,vz):
            X[i,0] = x
            X[i,1] = y
            X[i,2] = z
            C[i] += 1
        C[C==0] = 1
        X /= C[:,np.newaxis]
        map1.contact_group = -np.matrix(km.fit_transform(X))
    elif method == 'NMF':
        map1.decompose('NND', dim_num=r)
        map1.decompose('NMF-Gaussian', dim_num=r)
        map1.contact_group = np.dot(map1.contact_group, map1.group_map)
    elif method == 'BNMF':
        map1.decompose('NND', dim_num=r)
        map1.decompose('NMF-PoissonManifoldEqual', dim_num=r, par_lam=0)
        map1.contact_group = np.dot(map1.contact_group, map1.group_map)
    elif method == 'Random':
        n = map1.contact_map.shape[0]
        map1.contact_group = np.zeros((n,r))
        from math import ceil
        size = int(ceil(n/float(r)))
        for i in range(n):
            map1.contact_group[i, i // size] = 1
    elif method == 'Armatus':
        from run_armatus import Armatus
        map1.save()
        map2 = Armatus('../tools/armatus2.1/armatus', name=map1.name)
        map2.load()
        map2.decompose()
        map1.contact_group = map2.contact_group
    elif method == 'TAD':
        from run_domaincall import DomainCall
        map1.save()
        map2 = DomainCall('../tools/domaincall/', name=map1.name)
        map2.load()
        map2.decompose()
        map1.contact_group = map2.contact_group
    else:
        raise ValueError('Unknown method name ' + method)
Example #12
def run(lines,vectorizerCls):

    print(TIMENOW(),'VECTORIZE','-'*42)      
    vectorizer=vectorizerCls(stop_words=['le','de','la','les','je','un','une','des','est','et','il','elle','du','ai','au',])
    data =vectorizer.fit_transform(lines)
    num_samples, num_features = data.shape
    print("#samples: %d, #features: %d" % (num_samples, num_features)) #samples: 5, #features: 25 #samples: 2, #features: 37
    print(TIMENOW(),'KMEANS','-'*42)      
    km   =KMeans(n_clusters=n_clusters)
    res  =km.fit_transform(data)
    labels = km.labels_
    labels_shape = km.labels_.shape
    print ("labels : ", labels)
    print ("labels_shape : ", labels_shape)

    print(TIMENOW(),'DONE','-'*42)  
        
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    result = dict()
    for i in range(n_clusters):
        result[i]=list()
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :25]:
            print(' %s' % terms[ind], end='\n')
            result[i].append(terms[ind])
        print()    
    return result
Example #13
def KinKmeans(var, nk=False, tol=1e-4, n_init=100):
    '''
    Uses pseudo-F to estimate the best number of K in K-Means
    From MJCarvalho GapStatistics

    :param numpy var: Numpy array with input data
    :param int nk: Initial number of K
    :param float tol: Tolerance for K-Means
    :param int n_init: Number of initializations for K-Means

    :return: Estimated number of K and the array of f statistics
    '''

    from sklearn.cluster import KMeans

    Nd = np.size(var, axis=0)
    S = np.zeros(Nd)
    f = np.zeros(Nd)
    alpha = np.zeros(Nd)

    if not nk:
        term = 3
    else:
        term = nk

    kink = [0]
    i = 0
    while len(kink) <= term:
        ## Kmeans
        kmeans = KMeans(init='k-means++', n_clusters=i+1,
                        n_init=n_init, tol=tol)

        T = kmeans.fit_transform(var, y=None)
        I = np.nansum(T**2, axis=0)
        S[i] = np.nansum(I, axis=0)
        ## Det. Alpha
        if i == 1:
            alpha[i] = 1.0 - (3.0/(4.0*Nd))
        elif i > 1:
            alpha[i] = alpha[i-1] + (1-alpha[i-1])/6.0
        ## Det. f(k)
        if i == 0:
            f[i] = 1
        else:
            f[i] = S[i] / (alpha[i] * S[i-1])

        if not nk:
            kink = np.arange(len(f))[
                np.r_[True, f[1:] < f[:-1]] &
                np.r_[f[:-1] <= f[1:], True] |
                np.r_[True, f[1:] <= f[:-1]] &
                np.r_[f[:-1] < f[1:], True]
            ]

        else:
            kink.append(0)
        i += 1

    return kink[1], f
Example #14
def clusterGoalies(df, idx, numOfClusters):
	model = KMeans(n_clusters=numOfClusters, n_init=20)
	distMat = model.fit_transform(df)
	resultList = [[] for i in range(numOfClusters)]
	for i, rowList in enumerate(distMat):
		minIndex = min(enumerate(rowList), key = lambda x: x[1])[0]
		resultList[minIndex].append(idx[i])
	return resultList
Example #15
def make_cluster(df):
    cluster_df = pd.DataFrame()
    clusters = KMeans(n_clusters=4)
    distance_matrix = clusters.fit_transform(df)  # use the passed-in frame, not a global
    cluster_df["cluster"] = clusters.labels_
    # Finding the euclidean distance from the point to its cluster center
    cluster_df["dist"] = [min(x) for x in distance_matrix]
    return cluster_df, clusters.cluster_centers_
 def vectorize(self, term_docs, n_clusters = 8):
     self.n_clusters = n_clusters
     tf = TfidfVectorizer()
     X = tf.fit_transform(term_docs)
     km = KMeans(n_clusters = n_clusters)
     x = km.fit_transform(X)
     self.labels = km.labels_
     return km.labels_
def kcluster(dataframe, n=3, n_clusters=5):
    X_centered = preprocessing.scale(dataframe.fillna(0))
    pca = decomposition.PCA(n_components=n)
    X_pca = pca.fit_transform(X_centered)
    kpy.plot_k_sse(X_pca)
    k = KMeans(n_clusters=n_clusters)
    km = k.fit_transform(X_pca)
    plt.hist(k.labels_)
    return pca, X_pca, k, km
def make_clustering(data_frame, number_of_clusters):
    # initializing KMeans object, computing clustering and transforming X to cluster-distance space
    k_means_model = KMeans(n_clusters=number_of_clusters)
    distances = k_means_model.fit_transform(data_frame.iloc[:, 2:])

    # add each unit's cluster label and its distance to every cluster centre to the data frame
    data_frame["cluster"] = k_means_model.labels_
    for i in range(number_of_clusters):
        data_frame["dist " + str(i) + " cluster"] = distances[:, i]

    return data_frame
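A short usage sketch for make_clustering, assuming (as the iloc[:, 2:] slice suggests) that the first two columns are identifiers and the rest are numeric features; the column names below are made up:

import pandas as pd
from sklearn.cluster import KMeans

df = pd.DataFrame({'id': [1, 2, 3, 4],
                   'name': ['a', 'b', 'c', 'd'],
                   'f1': [0.1, 0.2, 5.1, 5.3],
                   'f2': [1.0, 1.1, 4.9, 5.0]})
out = make_clustering(df, number_of_clusters=2)
print(out[['cluster', 'dist 0 cluster', 'dist 1 cluster']])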
  def runKMeansSKLearn(X, k = None):
    kmeans = KMeans(n_clusters=k, n_jobs=-1)
    clusters = kmeans.fit_predict(X).tolist()
    # transform (not fit_transform) so the distances come from the same fit as the labels
    cluster_distance = kmeans.transform(X).tolist()
    cluster_centers = kmeans.cluster_centers_

    coords = []
    for cluster in clusters:
      coord = [cluster_centers[cluster,0], cluster_centers[cluster, 1]]
      coords.append(coord)

    return [None, coords]
    def getWordCentroidMap(self, model, num_clusters):
        start = time.time()

        word_vectors = model.syn0
        print "begin to clustering to gaining code book"
        kmeans_clustering = KMeans(n_clusters = num_clusters)
        kmeans_clustering.fit_transform(word_vectors)
        idx = kmeans_clustering.labels_  # map each word to its cluster index, not to a distance row

        end = time.time()
        elapsed= end - start
        print "Time takes for K means clustering: {} seconds".format(elapsed)
        return dict(zip(model.index2word, idx))
Example #21
def cluster_points(points, number_of_clusters):
    '''This function should take a list of points (in two dimensions) and return a list of clusters,
    each of which is a list of points. For example, if you passed in [(0, 0), (-0.1, 0.1), (2,3), (2.1, 3)] 
    with number_of_clusters set to 2, it should return [[(0, 0), (-0.1, 0.1)], [(2,3), (2.1, 3)]].'''

    model = KMeans(n_clusters=number_of_clusters)
    distMat = model.fit_transform(points)
    resultList = [[] for i in range(number_of_clusters)]
    for i, rowList in enumerate(distMat):
        minIndex = min(enumerate(rowList), key = lambda x: x[1])[0]
        resultList[minIndex].append(points[i])
    return resultList
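The argmin-over-rows loop above should reproduce what KMeans already stores in labels_ (each sample's nearest centre), so an equivalent, shorter sketch can group the points with fit_predict directly:

from collections import defaultdict
from sklearn.cluster import KMeans

def cluster_points_short(points, number_of_clusters):
    model = KMeans(n_clusters=number_of_clusters)
    labels = model.fit_predict(points)
    groups = defaultdict(list)
    for point, label in zip(points, labels):
        groups[label].append(point)
    return list(groups.values())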
Example #22
def Analysis(vector, K=2):
    arr = (np.array(vector))

    # Standardize the data: zero mean and unit variance per feature
    sc = StandardScaler()
    x = sc.fit_transform(arr)

    # Project onto the first two principal components
    pca = PCA(n_components=2)
    components = (pca.fit_transform(x))
    # Applying kmeans algorithm for finding centroids

    kmeans = KMeans(n_clusters=K, n_jobs=-1)
    kmeans.fit_transform(components)
    print("labels: ", kmeans.labels_)
    centers = kmeans.cluster_centers_

    # labels are assigned by the algorithm; with 2 clusters the labels are 0 or 1
    labels = kmeans.labels_
    colors = ["r.", "g.", "b.", "y.", "c."]
    colors = colors[:K + 1]

    for i in range(len(components)):
        plt.plot(components[i][0],
                 components[i][1],
                 colors[labels[i]],
                 markersize=10)

    plt.scatter(centers[:, 0],
                centers[:, 1],
                marker="x",
                s=150,
                linewidths=10,
                zorder=15)
    plt.xlabel("1st Principle Component")
    plt.ylabel("2nd Principle Component")
    title = "Styles Clusters"
    plt.title(title)
    plt.savefig("Results" + ".png")
    plt.show()
Example #23
def main():
    listCorpus, listSize = readFile()
    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    # Corpus with example sentences
    corpus = [
        'A man is eating food.', 'A man is eating a piece of bread.',
        'A man is eating pasta.', 'The girl is carrying a baby.',
        'The baby is carried by the woman', 'A man is riding a horse.',
        'A man is riding a white horse on an enclosed ground.',
        'A monkey is playing drums.',
        'Someone in a gorilla costume is playing a set of drums.',
        'A cheetah is running behind its prey.',
        'A cheetah chases prey on across a field.'
    ]
    # listCorpus=corpus
    # listSize=[1,1,1,1,1,1,1,1,1,1,1]
    corpus_embeddings = embedder.encode(listCorpus)

    num_clusters = 150
    clustering_model = KMeans(n_clusters=num_clusters)
    cluster_dist = clustering_model.fit_transform(corpus_embeddings)
    cluster_dist = cluster_dist.min(1)
    cluster_assignment = clustering_model.labels_
    final_assignment = -1 * np.ones(len(listCorpus))
    keywords_list = []
    for i in range(0, num_clusters):
        theta = (cluster_dist * (cluster_assignment == i))
        if len(np.nonzero(theta)[0]) == 0:
            continue

        idx = np.where(theta == np.min(theta[np.nonzero(theta)]))
        final_assignment[cluster_assignment == i] = idx[0][0]
        keywords_list.append(listCorpus[idx[0][0]])

    final_assignment1 = [listCorpus[int(i)] for i in final_assignment]

    start = 0
    line_no = 0
    # with open('citi_file_cluster.csv', 'w', newline='') as write_file:
    #     for i in range(0, len(listSize)):
    #         writer = csv.writer(write_file)
    #         col1 = "/".join(listCorpus[start:start + listSize[line_no]])
    #         col2 = "/".join(final_assignment1[start:start + listSize[line_no]])
    #         col3 = "/".join(list(set(final_assignment1[start:start + listSize[line_no]])))
    #         start += listSize[line_no]
    #         writer.writerow([col1, col2, col3])
    #         line_no += 1
    #
    with open('keywords_list.csv', 'w', newline='') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(keywords_list)
def do_nmf_and_clustering(input_file, n_clusters):
    """

    :return:
    """
    clusterer = KMeans(n_clusters=n_clusters)

    mix, sr = librosa.load(input_file)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])
    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_
    sources = []

    for cluster_index in range(n_clusters):
        indices = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, indices], mix_stft)
        t = librosa.istft(template)
        sources.append(t)

    return np.array(sources), cluster_comps
def kmeans_():

    # use features for clustering
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=N, init='k-means++')
    #features = np.reshape(x_train, newshape=(features.shape[0], -1))
    km_trans = km.fit_transform(x_train)
    pred = km.predict(x_train)
    print(pred.shape)
    print('acc=', met.acc(y_train, pred), 'nmi=', met.nmi(y_train,
                                                          pred), 'ari=',
          met.ari(y_train, pred))
    return km_trans, pred
Example #26
def CQ_ABC(img, K):
    # init
    n_solutions = 100
    solutions = np.random.randint(0, 255, size=(n_solutions, K, 3))
    MCN = 100
    pixels = np.reshape(img, newshape=(img.shape[0] * img.shape[1], 3))
    fitness_array = np.zeros(shape=n_solutions)
    for n in range(MCN):
        # employed bee
        for solution in solutions:
            k_means = KMeans(n_clusters=K, init=solution, max_iter=5)
            # mapping and evaluation
            mapped = k_means.fit_transform(pixels)
    def vectorize(self, term_docs, n_clusters = 8):

        self.n_clusters = n_clusters
        tf = TfidfVectorizer()
        X = tf.fit_transform(term_docs)

        km = KMeans(n_clusters = n_clusters)

        artist_distance = km.fit_transform(X)
        #ipdb.set_trace()
        self.labels = km.labels_
        self.km = km
        return km.labels_, artist_distance
Example #28
def clustering(atributes, amount_centroides):
    centroides = atributes[np.random.choice(atributes.shape[0],
                                            amount_centroides,
                                            replace=False)]
    kmeans = KMeans(n_clusters=amount_centroides,
                    init=centroides,
                    max_iter=500,
                    n_init=1,
                    random_state=0)
    distances = kmeans.fit_transform(atributes)
    centros = kmeans.cluster_centers_

    return kmeans, distances, centros
def add_kmeans(clusters_list, group, X, results_df):
    for clusters in clusters_list:
        km = KMeans(n_clusters=clusters,
                    random_state=0,
                    n_init=10,
                    algorithm='auto',
                    n_jobs=-1)
        y_km_transform = km.fit_transform(X)
        y_km_labels = km.labels_

        results_df[group + ' ' + str(clusters) + ' K-Means'] = y_km_labels

    return results_df
Example #30
    def Semi_supervised_learning(self):
        from sklearn.datasets import load_digits
        X_digits, y_digits = load_digits(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X_digits,
                                                            y_digits,
                                                            test_size=0.33,
                                                            random_state=42)

        from sklearn.linear_model import LogisticRegression
        n_labeled = 50

        log_reg = LogisticRegression(random_state=42)
        print(X_train.shape)
        log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
        print(log_reg.score(X_test, y_test))

        kmeans = KMeans(n_clusters=n_labeled, random_state=42)
        X_digits_dist = kmeans.fit_transform(X_train)
        print(X_digits_dist.shape)
        # get the index of the minimum along each column (the sample closest to each centroid)
        representative_digits_idx = np.argmin(X_digits_dist, axis=0)
        X_representative_digits = X_train[representative_digits_idx]

        plt.figure(figsize=(8, 2))
        for index, X_representative_digit in enumerate(
                X_representative_digits):
            plt.subplot(n_labeled // 10, 10, index + 1)
            plt.imshow(X_representative_digit.reshape(8, 8),
                       cmap="binary",
                       interpolation="bilinear")
            plt.axis('off')

        # plt.show()

        # train only on the samples that are closest to each k-means centroid
        log_reg = LogisticRegression(random_state=42)
        log_reg.fit(X_train[representative_digits_idx],
                    y_train[representative_digits_idx])
        print(log_reg.score(X_test, y_test))

        # label every training sample according to its k-means cluster
        y_representative_digits = np.array([
            4, 8, 0, 6, 8, 3, 7, 7, 9, 2, 5, 5, 8, 5, 2, 1, 2, 9, 6, 1, 1, 6,
            9, 0, 8, 3, 0, 7, 4, 1, 6, 5, 2, 4, 1, 8, 6, 3, 9, 2, 4, 2, 9, 4,
            7, 6, 2, 3, 1, 1
        ])
        y_train_propagated = np.empty(len(X_train), dtype=np.int32)
        for i in range(50):
            y_train_propagated[kmeans.labels_ ==
                               i] = y_representative_digits[i]
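The snippet stops after propagating the representative labels. A natural next step, sketched here with the same variable names but not part of the original, is to retrain on the propagated labels and score again:

# Sketch only: retrain on the fully propagated training set.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train_propagated)
print(log_reg.score(X_test, y_test))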
Example #31
def do_kmeans_analysis(data_frame: DataFrame,
                       clusters_number: int) -> KMeansAnalysisResult:
    k_means: KMeans = None

    if clusters_number > 0:
        k_means = KMeans(n_clusters=clusters_number)
    else:
        k_means = KMeans()

    labels_mapping = normalise_data_frame(data_frame.iloc[:, :3])
    columns = labels_mapping.columns

    k_means.fit_transform(labels_mapping)

    labels_mapping['Label'] = MinMaxScaler().fit_transform(
        k_means.labels_.reshape(-1, 1))

    centroids = DataFrame(data=k_means.cluster_centers_, columns=columns)

    return KMeansAnalysisResult(labels_mapping=labels_mapping,
                                labels_mapping_labels=get_columns_labels(
                                    data_frame, columns),
                                centroids=centroids)
    def kmeans_cleaning(self, use_cache, cache):
        if self.USE_KMEANS and not use_cache:
            tfidf_info, tdmatrix, _ = self.makeTermDocMatrix(self.X_train)
            # print(tdmatrix)
            kmeans = KMeans(self.N_CLUSTERS, n_jobs=-1)
            X_new = kmeans.fit_transform(
                tdmatrix
            )  # hope they don't mess with indices of training data.
            corressponding_dists_with_indices_not_messed_hopefully = [
                X_new[(i, x)] for i, x in enumerate(kmeans.labels_)
            ]
            print("len of list",
                  len(corressponding_dists_with_indices_not_messed_hopefully))
            print("max threshold,",max(corressponding_dists_with_indices_not_messed_hopefully),"\n",\
             "min threshold:",
             min(corressponding_dists_with_indices_not_messed_hopefully))
            # plt.hist(corressponding_dists_with_indices_not_messed_hopefully)
            # plt.show()
            self.cleanedId = (np.where(
                np.array(corressponding_dists_with_indices_not_messed_hopefully
                         ) < self.THRESHOLD)[0]).tolist()

            for cl in self.classes:
                self.docsId[cl] = set(self.docsId[cl]).intersection(
                    self.cleanedId)  # updating docsId

            # need to update X_train, y_train also.

            # print(type(cleanedId))
            print("Number of new documents:", len(self.cleanedId))
            # print(self.cleanedId[:5])
            self.X_train = (np.array(self.X_train)[self.cleanedId]).tolist()
            self.y_train = (np.array(self.y_train)[self.cleanedId]).tolist()

        elif self.USE_KMEANS and use_cache:
            self.X_train, self.y_train, self.cleanedId = cache

        self.docs = {
            cl: [self.idToDoc[x] for x in self.docsId[cl]]
            for cl in self.classes
        }

        self.LEN_OF_CLASS = {}
        for cl in self.classes:
            self.LEN_OF_CLASS[cl] = len(self.docs[cl])

        self.TOTAL_DOCS = sum(len(self.docs[cl]) for cl in self.classes)

        # to be collected outside or may be inside.
        return self.X_train, self.y_train, self.cleanedId
def evaluate_components(A,Yr,psx):

    #%% clustering components
    Ys=Yr
    psx = cse.pre_processing.get_noise_fft(Ys,get_spectrum=True);
    
    #[sn,psx] = get_noise_fft(Ys,options);
    #P.sn = sn(:);
    #fprintf('  done \n');
    psdx = np.sqrt(psx[:,3:]);
    X = psdx[:,1:np.minimum(np.shape(psdx)[1],1500)];
    #P.psdx = X;
    X = X-np.mean(X,axis=1)[:,np.newaxis]#     bsxfun(@minus,X,mean(X,2));     % center
    X = X/(+1e-5+np.std(X,axis=1)[:,np.newaxis])
    
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA,NMF
    from sklearn.mixture import GMM
    pc=PCA(n_components=5)
    nmf=NMF(n_components=2)
    nmr=nmf.fit_transform(X)
    
    cp=pc.fit_transform(X)
    gmm=GMM(n_components=2)
    
    Cx1=gmm.fit_predict(cp)
    
    L=gmm.predict_proba(cp)
    
    km=KMeans(n_clusters=2)
    Cx=km.fit_transform(X)
    Cx=km.fit_transform(cp)
    Cx=km.cluster_centers_
    L=km.labels_
    ind=np.argmin(np.mean(Cx[:,-49:],axis=1))
    active_pixels = (L==ind)
    centroids = Cx;
def clusterKMeans(indexer, function, clusters, seeds):
    # First, set up the data correctly
    # Import the vectorization as done by the indexing.
    normalized = indexer.get_normalized_paper_values("paper_text", function)

    # Use the results found in the indexing as the vector.
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(normalized.values())

    # Cluster documents
    model = KMeans(n_clusters=clusters, init='k-means++', n_init=seeds)
    model.fit_transform(X)

    # Print top terms per cluster clusters, getting and sorting centroids and terms
    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    # Print top terms per cluster clusters, actually printing them
    for i in range(clusters):
        print("Cluster %d:" % i, '\n')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], )
        print('\n')

    # Getting and printing the clusters and amount of points in them
    labels, counts = np.unique(model.labels_[model.labels_ >= 0],
                               return_counts=True)
    for i in range(clusters):
        print('Cluster %d has %d points in it.' % (labels[i], counts[i]))

    # Computing and printing silhouette score
    sil_coeff = silhouette_score(X, model.labels_, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(
        clusters, sil_coeff))

    return X, model
Example #35
def tfIdf_Kmeans(texts, clusters):
    """ Transform texts to Tf-Idf coordinates and cluster texts using K-Means """
    
    print "def tfIdf_Kmeans(texts, clusters):"
#     vectorizer = TfidfVectorizer(tokenizer=process_text,
#                                  stop_words=stopwords.words('portuguese'),
#                                  max_df=0.5,
#                                  min_df=0.1,
#                                  lowercase=True)
    #experimento 1
    vectorizer = TfidfVectorizer()
    
    #experimento 2
#     vectorizer = TfidfVectorizer(max_df=0.6,
#                                  min_df=0.3)
    
    #experimento 3
#     vectorizer = TfidfVectorizer(max_df=0.6,
#                                  min_df=0.3)
 
    tfidf_model = vectorizer.fit_transform(texts)
#     km_model = MiniBatchKMeans(n_clusters=clusters)
    #Valor ideal, após experimentos = 100000    
    km_model = KMeans(n_clusters=clusters, n_init=100000)
    
    
    #VALOR PARA TESTE!
    #km_model = KMeans(n_clusters=clusters, n_init=1)
    
    km_model.fit_transform(tfidf_model)
    
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
 
    return clustering
Example #36
def GP_SE(dealer):
    df = data_dropped[data_dropped['dealer'] == dealer]
    df.sort_values(['year', 'month'], inplace=True)
    # If a column is all nan , it would drop the feature so , the dimension wont be matched.
    try:

        df.iloc[:, 1:] = SimpleImputer(missing_values=np.nan,
                                       strategy='median').fit_transform(
                                           np.array(df.iloc[:, 1:]))

    except:
        return [dealer]

    low_whisker = df.iloc[:, 1:5].quantile(0.01)
    mask_1 = (df.iloc[:, 1:5] < low_whisker)
    df.iloc[:, 1:5] = np.where(mask_1, low_whisker, df.iloc[:, 1:5])

    high_whisker = df.iloc[:, 1:5].quantile(0.99)
    mask_2 = (df.iloc[:, 1:5] > high_whisker)
    df.iloc[:, 1:5] = np.where(mask_2, high_whisker, df.iloc[:, 1:5])
    df['GPNV'] = df['R27'] / df['U27']
    df['SE'] = df['MV1'] / df['EXP1']
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    corr = df[['GPNV', 'SE']].corr().iloc[0, 1]
    X_GPNV = MinMaxScaler().fit_transform(np.array(df['GPNV']).reshape(-1, 1))
    y_SE = MinMaxScaler().fit_transform(np.array(df['SE']).reshape(-1, 1))
    df['GPNV_scaled'] = X_GPNV
    df['SE_scaled'] = y_SE
    X_2 = PolynomialFeatures(degree=2).fit_transform(X_GPNV)
    regress_2 = linear_model.LinearRegression()
    regress_2.fit(X_2, y_SE)
    X_3 = PolynomialFeatures(degree=3).fit_transform(X_GPNV)
    regress_3 = linear_model.LinearRegression()
    regress_3.fit(X_3, y_SE)
    k = KMeans(1)
    radius = max(k.fit_transform(df[['GPNV_scaled', 'SE_scaled']]))
    centroid = k.cluster_centers_[0]  # single cluster, so take its (x, y) centre
    collector = [
        dealer, df.shape[0], df['SE'].mean(), df['GPNV'].mean(), corr, radius,
        centroid[0], centroid[1]
    ]

    collector.extend(list(regress_2.coef_[0, :]))
    collector.append(regress_2.score(X_2, np.array(df['SE'])))
    collector.extend(list(regress_3.coef_[0, :]))
    collector.append(regress_3.score(X_3, np.array(df['SE'])))

    return df, collector
Example #37
 def begin(self, inarray):
     inarray = self.make2d(inarray)
     inarray = normalize(inarray)
     inarray = self.make2d(inarray)
     for i in range(2, 12):
         kd = KMeans(n_clusters=i)
         temparray = inarray
         result = kd.fit_transform(temparray)
         params = kd.get_params()
         print('\n\n\n======================================\n\n\n')
         print("variance = {0}".format(result.var()))
         print(params)
         fIO.FileIO().saveWork((result, params, kd),
                               'kmeansfit_fulldata_{}'.format(i), 2)  # include i so each cluster count gets its own file
     input("press any key to exit...")
Example #38
def run_Kmeans(X_train,
               X_test,
               y_train,
               y_test,
               experiment_number,
               dataset_name,
               neighbors=None):
    if neighbors:
        algorithm = KMeans(random_state=0, n_clusters=neighbors)
    else:
        algorithm = KMeans(random_state=0)

    # add conversions
    transformed_X_train = algorithm.fit_transform(X_train)
    transformed_X_test = algorithm.transform(X_test)  # transform only, so train and test share one fit
    df = pd.DataFrame()
    title = "Kmeans"

    confidence = algorithm.score(X_test, y_test)
    inertia = algorithm.inertia_
    if neighbors and neighbors >= 2:
        df['label'] = pd.Series([i[0] for i in y_train.tolist()])
        df['comp-one'] = transformed_X_train[:, 0]
        df['comp-two'] = transformed_X_train[:, 1]
        transformed_df = pd.DataFrame(transformed_X_train)
        c = transformed_df.corr().abs()
        s = c.unstack()
        so = s.sort_values(kind="quicksort")
        so = so[so != 1]
        max_cross_section = so.idxmax()
        min_cross_section = so.idxmin()
        for cross_section in [max_cross_section, min_cross_section]:
            plot_cross_section(transformed_X_train, cross_section, title,
                               neighbors, experiment_number, dataset_name)

    return confidence, inertia, transformed_X_train, transformed_X_test, df
Example #40
class KMeansClustering(AnomalyModel):
    def __init__(self, anomaly_dict, settings, features):
        super().__init__(anomaly_dict, settings, features)

        self.kmeans = KMeans(n_clusters=settings.n_clusters,
                             init=settings.init,
                             n_init=settings.n_init,
                             max_iter=settings.max_iter,
                             tol=settings.tol,
                             verbose=settings.verbose)
#            algortihm = 'full')

    def fit_Kmeans(self):
        self.kmeans.fit_transform(self.X.transpose())
        #print(self.kmeans.labels_)

    def predict(self, x):
        x_dict = x.get_feature_dict()
        return self.kmeans.predict(
            np.array([x_dict[feature]
                      for feature in self.features]).reshape(1, -1))

    def send_labels(self):
        return self.kmeans.labels_
def add_clusters_to_data_kmeans():
    x_train = data.DATA['fashion']['base']['x_train']
    x_test = data.DATA['fashion']['base']['x_test']

    # KMeans (k = 4) on train
    kmeans = KMeans(n_clusters=4, random_state=SEED)
    x_train_transformed = kmeans.fit_transform(x_train)
    x_train_new = pd.concat(
        [x_train, pd.DataFrame(x_train_transformed)], axis=1)

    scaler_train = StandardScaler()
    x_train_new_scaled = scaler_train.fit_transform(x_train_new)
    pd.DataFrame(x_train_new_scaled).to_csv(
        f'{DATA_FOLDER}/fashion_aug_kmeans_x_train.csv')

    # KMeans (k = 4) on test
    kmeans = KMeans(n_clusters=4, random_state=SEED)
    x_test_transformed = kmeans.fit_transform(x_test, )
    x_test_new = pd.concat([x_test, pd.DataFrame(x_test_transformed)], axis=1)

    scaler_test = StandardScaler()
    x_test_new_scaled = scaler_test.fit_transform(x_test_new)
    pd.DataFrame(x_test_new_scaled).to_csv(
        f'{DATA_FOLDER}/fashion_aug_kmeans_x_test.csv')
Example #42
def simple_k_means(X: pd.DataFrame,
                   n_clusters=3,
                   score_metric='euclidean') -> Dict:
    model = KMeans(n_clusters=n_clusters)
    clusters = model.fit_transform(X)

    labels = model.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    print(labels)
    print('Estimated number of clusters:', n_clusters)

    # There are many methods of deciding a score of a cluster model. Here is one example:
    score = metrics.silhouette_score(X, model.labels_, metric=score_metric)
    return dict(model=model, score=score, clusters=clusters, labels=labels)
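A minimal usage sketch for simple_k_means on a tiny synthetic DataFrame (the column names are arbitrary); the returned dict bundles the fitted model, the silhouette score, the distance matrix and the labels:

import pandas as pd

X = pd.DataFrame({'x': [0.0, 0.1, 5.0, 5.1, 10.0, 10.2],
                  'y': [0.0, 0.2, 5.0, 4.9, 10.1, 9.9]})
result = simple_k_means(X, n_clusters=3)
print(result['score'])           # silhouette score in [-1, 1]
print(result['clusters'].shape)  # (6, 3): distance of each row to each centre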
	def create_tensor(self, tuples, k):

		m, l, k = self.lengths['user'], self.lengths['image'], self.lengths['location']
		
		tensor = np.zeros((m,l,k))

		for i in range(0, len(tuples)-1):
			tensor[tuples[i][0]][tuples[i][1]][tuples[i][2]] += 1

		factors = parafac(tensor, rank=k, init='random', tol=10e-2)

		k = []
		for j in range(len(factors)):
			wcss = []
			slope = 1;
			slopeA = [1, 1, 1, 1, 1];
			i = 1
			val = 1;
			while True:
				kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
				kmeans.fit(factors[j])
				wcss.append(kmeans.inertia_)
				if i > 5:
					slope, intercept = np.polyfit(range(1, i + 1), wcss, 1);
					slopeA.append(slope);
					term1 = abs(slopeA[i - 1])
					term2 = abs(slopeA[i - 2])
					val = (term1 - term2) / term2;
					print(i);
				if len(factors[j]) <= i:
					break;
				if abs(val) < 0.05:
					break
				i += 1;
			k.append(i)

		# plt.plot(range(1, 10), wcss)
		# plt.title('the elbow method')
		# plt.xlabel('Number of clusters')
		# plt.show();
		print(k);
		# print(elapsed_time);
		final = []
		for x in range(len(k)):
			kmeans = KMeans(n_clusters=k[x], init='k-means++', max_iter=300, n_init=10, random_state=0)
			final.append(kmeans.fit_transform(factors[x]))  # use the loop index, not the leftover j
		# print(final[0][0], "\n\n\n", final[1][0])
		print(final)
Example #44
def analyze(n_preview=10):
    global vectorizer, km
    # Encode:
    logger.info('Encoding...')
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=common.n_features,
                                 min_df=2,
                                 stop_words='english')
    common.X = vectorizer.fit_transform(common.doc_texts)
    common.save_pickle(vectorizer, 'vectorizer.pickle')
    common.vocab = np.array(vectorizer.get_feature_names())
    logger.info(f'X: {common.X.shape}')
    common.save_encoded_vocab()

    logger.info('Clustering...')
    # km = MiniBatchKMeans(n_clusters=common.n_topics, init=init_centroids(), init_size=1000, batch_size=1000,
    #                      verbose=0, random_state=common.random_seed)
    # km = MiniBatchKMeans(n_clusters=common.n_topics, verbose=1, random_state=1)
    km = KMeans(n_clusters=common.n_topics,
                init=init_centroids(),
                max_iter=3,
                verbose=1,
                random_state=2)

    # Analyze:
    common.doc_topics = km.fit_transform(common.X)  # the smaller, the closer
    common.doc_topics_reduced = np.argmin(common.doc_topics, axis=1)
    common.topics = km.cluster_centers_
    common.save_pickle(km, 'km.pickle')
    logger.info(f'doc_topics: {common.doc_topics.shape}')
    logger.info(f'topics: {common.topics.shape}')
    print()

    print('----------------')
    for i, topic_dist in enumerate(common.topics):
        top_words = common.vocab[np.argsort(topic_dist)[-10:][::-1]]
        print(f"Topic {i}: {' '.join(top_words)}")

    print()
    print('----------------')

    for i in range(n_preview):
        print(
            f'Article {i} (topic: {common.doc_topics_reduced[i]}), {common.doc_titles[i]}'
        )

    print()
    common.save_analyze_result()
Example #45
    def k_means(self):  # returns the k cluster centroids
        model = KMeans()
        visualizer = KElbowVisualizer(model, metric='calinski_harabasz', k=(3, 100))

        visualizer.fit( self.reduced_new_lst )
        #visualizer.show()
        K = visualizer.elbow_value_

        if K is None:
            K = 50
        print('K= ',K)

        model = KMeans(init="k-means++", n_clusters=K, random_state=0)
        xys = model.fit_transform(self.reduced_new_lst)
        y_kmeans = model.predict(self.reduced_new_lst)
        #print(xys)

        word_vector = self.embedding_model.wv
        keys = word_vector.vocab.keys()

        xs = xys[:, 0]
        ys = xys[:, 1]
        #self.plot_2d_graph(keys, xs, ys)

        # the block below just lays the data out as a DataFrame for display
        pd_reduced_new_lst = pd.DataFrame(self.reduced_new_lst)
        keys = [k for k in keys]
        pd_keys = pd.DataFrame(keys)
        pd_keys = pd_keys.rename(columns={0: "keyword"})
        df = pd.concat([pd_reduced_new_lst, pd_keys], axis=1)
        #print(df)

        plt.figure()
        plt.scatter(xs, ys, c=y_kmeans, s=50, cmap='viridis')
        words = df['keyword']
        for i, word in enumerate(words):
            plt.annotate(word, xy=(xs[i], ys[i]))

        centers = model.cluster_centers_
        #plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
        pd_centers = pd.DataFrame(centers)
        #print(pd_centers)

        new = pd.concat([pd_centers, pd_keys], axis=1, join='inner')
        print(new)
        #plt.show()

        return new
 def cluster(self, dataloader, clusters):
     names, features = self.pca(dataloader, dataloader, 1024)
     algo = KMeans(clusters,
                   algorithm="elkan",
                   init="random",
                   random_state=42)
     dist = algo.fit_transform(features)
     dist = dist.argmin(axis=1)
     for i in range(clusters):
         os.mkdir(
             f"{'/'.join(dataloader.dataset.path.split('/')[:-1])}/{i}")
     for idx, i in enumerate(dist):
         shutil.copy2(
             names[idx],
             f"{'/'.join(dataloader.dataset.path.split('/')[:-1])}/{i}/{names[idx].split('/')[-1]}"
         )
 def create_sampling_distribution(self, base_learner, data, fold_results):
     k_means = KMeans(n_clusters=self.configs.active_items_per_iteration *
                      2)
     I = data.is_train.nonzero()[0]
     X_cluster_space = k_means.fit_transform(data.x[I])
     #cluster_inds = k_means.fit_predict(data.x[I])
     centroid_inds = self.get_cluster_centroids(X_cluster_space)
     permuted_inds = np.random.permutation(centroid_inds)
     centroid_pairs = np.reshape(permuted_inds, (permuted_inds.size // 2, 2))  # integer division for a valid shape
     for ind, (idx1, idx2) in enumerate(centroid_pairs):
         if data.true_y[idx1] <= data.true_y[idx2]:
             continue
         centroid_pairs[ind] = centroid_pairs[ind, ::-1]
     d = np.zeros(centroid_pairs.shape[0])
     d[:] = 1
     d = d / d.sum()
     return d, centroid_pairs
def kmean_distance(filename, group):
    k = KMeans(n_clusters = group, tol=0.000000001, init='random')
    rowname = filename[:,0]    
    filename = filename[:,1:]
    distance = k.fit_transform(filename)  ## distance from each point to every group centre
    g = k.labels_  ## group labels taken from the same fit (avoids fitting twice)
    g = np.column_stack((rowname, g, np.zeros((len(filename),)) )) ## combine raw data and    
    for nrow in range(len(g)):
        id = int(g[nrow,1]) ##catch the group id 
        d = distance[nrow,id] ## get the distance with point's own group center 
        g[nrow,2] = d ## combine
    ##g_8 is the result
    cnt = Counter(g[:,1])
    cnt = sorted(cnt.items(),key = itemgetter(0))
    print "total group: %s" % (group)
    print "cnt of each group %s" % (cnt)
    return g
Example #49
def main():
    args = parse_args()

    df = build_dataframe()
    df = df.sample(n=args.n_datapoints, random_state=args.seed)
    model = build_model(classes=None,
                        input_shape=(args.res, args.res, 3),
                        base_weights=args.model)
    image_shape = (args.res, args.res)

    df["feature_vector"] = df[["image_path"]].apply(
        lambda x: compute_features(model, x[0], image_shape), axis=1)
    kmeans = KMeans(n_clusters=args.n_clusters)
    distances = kmeans.fit_transform(list(df["feature_vector"]))
    df["distance"] = distances.min(axis=1)
    df["cluster"] = kmeans.labels_
    df.to_json(args.save_path)
Example #50
def week9(csv, x_1, y_1, x_2, y_2, x_3, y_3):
    data = pd.read_csv(csv, delimiter=',', index_col='Object')
    coords = data.drop('Cluster', axis=1)
    centroid = np.array([[x_1, y_1], [x_2, y_2], [x_3, y_3]])
    kmeans = KMeans(n_clusters=3, init=centroid, max_iter=100, n_init=1)

    model = kmeans.fit(coords)
    answers = model.labels_.tolist()
    dist = kmeans.fit_transform(coords)

    my_claster = []

    for i in range(len(dist)):
        if answers[i] == 0:
            my_claster.append(dist[i][0].tolist())

    return answers, round(np.mean(my_claster), 3)
    def k_means(self, clusters):
        '''K-means'''
        self.algorithm = "kmeans"
        kmeans = KMeans(n_clusters=clusters)

        self.X_dist_matrix = kmeans.fit_transform(self.X)
        self.labels = kmeans.labels_

        labels = ["X", "Y"]

        self.num_clusters = clusters
        self.df = pd.DataFrame(data=self.X, columns=labels)
        self.labels_df = pd.DataFrame(data=self.labels)
        self.df['labels'] = self.labels_df

        #self.X_labeled = np.append(self.X, self.labels, axis=1)
        print("Kmeans complete")
def k_means(fileName, dimensions):
    f = open(fileName, 'r')
    fw = open(fileName + '.km', 'w')
    data = []
    video_info = []
    while 1:
        features = split_line_into_tokens(f.readline())
        if not features:
            break
        data.append(features[3:])
        video_info.append(features[:3])
    kmeans = KMeans(dimensions)
    transformedData = kmeans.fit_transform(data).tolist()
    index = 0
    for row in transformedData:
        finalfeatures = video_info[index] + row
        fw.write("; ".join(map(lambda x: str(x), finalfeatures)) + "\n")
        index = index + 1
Example #53
def main():
    # Create a database connection
    connection = sqlite3.connect("wildfires.sqlite")
    df = pd.read_sql_query("SELECT LATITUDE,LONGITUDE FROM 'Fires'",
                           connection)
    attributes = ['LATITUDE', 'LONGITUDE']

    #df = df.drop(['LATITUDE','LONGITUDE'], axis=1)
    #df['LATITUDE'] = df['LATITUDE'].fillna(df['LATITUDE'].median())
    #df['LONGITUDE'] = df['LONGITUDE'].fillna(df['LONGITUDE'].median())

    data_attributes = df[attributes]
    kmeans_model = KMeans(n_clusters=2, random_state=1)
    distances = kmeans_model.fit_transform(data_attributes)
    labels = kmeans_model.labels_
    plt.scatter(distances[:, 0], distances[:, 1], c=labels)
    plt.title('K-means')
    plt.show()
Example #54
def CreateDataset(X, Xtest, datasets = []):
    for dataset in datasets:
        if   dataset == 'text':
            X, Xtest = TextTransform(X, Xtest)
        elif dataset == 'log':
            X, Xtest = np.log10(X + 1), np.log10(Xtest + 1)
        elif dataset == 'original':
            pass
        elif dataset == 'kmeans':
            clf = KMeans(n_clusters = 200, n_init = 40, max_iter = 300, verbose
                    = 1, n_jobs = -1)
            n_train = len(X)  # remember the split point before stacking
            X_all = np.vstack([X, Xtest])
            XX = clf.fit_transform(X_all)
            X = XX[:n_train]
            Xtest = XX[n_train:]
        else:
            logging.warning("Datasets must be one of: text, original, log")
        SaveDataset(dataset, X, Xtest)
Example #55
def clustering(X,
               clusters=4,
               max_iter=100,
               slow=False,
               init_size=2000,
               batch_size=2000,
               cluster_type="kmeans"):
    """
    Takes a tf-idf matrix and clusters it.

    Parameters:
        X: A sparse tf-idf matrix
        clusters: Integer. Number of clusters to be used
        max_iter: Integer. How many iterations to go before stopping
        slow: bool. Not used anymore. Use cluster_type instead
        init_size: Integer. If using the mini-kmeans batch, this determines the initialize size
        batch_size: Integer. Used for mini-kmeans batch
        cluster_type: String. Takes "kmeans" or "agg" currently. If the name isn't recognized you fall back to
        mini-batch k-means

    Return:
        clustered_distances: a numpy array with one row per document and one column per cluster, giving each
            document's distance to every cluster centre
        cluster_labels: a list with the cluster name for each doc.
    """
    time1 = time()
    if cluster_type == "kmeans":
        km = KMeans(n_clusters=clusters, max_iter=max_iter)
    elif cluster_type == "agg":
        cluster = AgglomerativeClustering(n_clusters=clusters,
                                          affinity="cosine",
                                          linkage="average")
        cluster.fit(X)
        print('Agglomerative clustering done in {}s'.format(time() - time1))
        return [], cluster.labels_
    else:
        km = MiniBatchKMeans(n_clusters=clusters,
                             max_iter=max_iter,
                             init_size=init_size,
                             batch_size=batch_size)
    clustered_distances = km.fit_transform(X)
    cluster_labels = km.labels_
    print('KMeans clustering done in {}s'.format(time() - time1))
    return clustered_distances, cluster_labels
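A usage sketch for the clustering helper above, assuming a small tf-idf matrix built with TfidfVectorizer; the function's own imports (KMeans, MiniBatchKMeans, AgglomerativeClustering, time) are taken as given:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "dogs and cats are pets",
        "stock markets fell sharply",
        "investors sold shares today"]
X = TfidfVectorizer().fit_transform(docs)
distances, labels = clustering(X, clusters=2, cluster_type="kmeans")
print(labels)  # one cluster id per document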
Example #56
def km_mlp(X, y):

    start = time.time()

    kmeans = KMeans(n_clusters=2)
    X_km = kmeans.fit_transform(X)

    train_sizes = [50, 100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000]
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    estimator = MLPClassifier(hidden_layer_sizes=(40,),
                              max_iter=10000,
                              activation='relu',
                              solver='adam',
                              random_state=0)

    title = 'Learning curve for KM + MLP Classifier on Wave Data'

    print("Plotting", title)

    train_sizes, train_scores, valid_scores = learning_curve(estimator=estimator, X=X_km, y=y,
                                                             train_sizes=train_sizes,
                                                             cv=cv, scoring='neg_mean_squared_error')

    train_scores_mean = -train_scores.mean(axis=1)
    valid_scores_mean = -valid_scores.mean(axis=1)

    end = time.time()

    total = end - start

    print("TOTAL TIME TAKEN : ", total)

    plt.style.use('seaborn')
    plt.plot(train_sizes, train_scores_mean, marker='.', label='Training error')
    plt.plot(train_sizes, valid_scores_mean, marker='.', label='Validation error')
    plt.ylabel('MSE', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title(title, fontsize=16, y=1.03)
    plt.legend()
    # plt.ylim(0, )
    plt.savefig('KM_MLP_LC_2.png')
    plt.clf()
    return total
Example #57
def graph_clustering(A_matrix,method,n_clusters,ratio=None,graph_num=None,plotting=True,Mean=False):
    if(graph_num==None):
        graph_num = random.randint(1,len(A_matrix))-1
    if(Mean):
        graph_num = 0; A_matrix = np.mean(A_matrix,axis=0,keepdims=True)
    n = A_matrix.shape[1]
    if(method=='kmeans'):
        #kmeans on first n vectors with nonzero eigenvalues
        _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False)
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(vecs[:,1:n_clusters].reshape(-1,n_clusters-1))
        if(ratio==None):
            return kmeans.labels_
        num = np.sum(kmeans.labels_)
        ind = 0 if num>(n//2) else 1
        prob = (kmeans.fit_transform(vecs[:,1:n_clusters].reshape(-1,n_clusters-1)))
        thresh = np.quantile(prob[:,ind], ratio)
        return (prob[:,ind] >= thresh)
    elif(method=='Spectral_clustering'):
        adjacency_matrix = A_matrix[graph_num].reshape(n,n)
        sc = SpectralClustering(n_clusters, affinity='precomputed', n_init=100,
                                 assign_labels='discretize')
        Class = sc.fit_predict(adjacency_matrix)
        if(plotting):
            Ab_matrix = A_binarize(A_matrix)
            G = nx.Graph(Ab_matrix[graph_num])
            plt.figure(); nx.draw(G, node_size=200, pos=nx.spring_layout(G)); plt.show()
            plt.figure(); nx.draw(G, node_color=Class, node_size=200, pos=nx.spring_layout(G)); plt.show()
        return Class
    elif(method=='Affinity_propagation'):
        _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False)
        clustering = AffinityPropagation().fit(vecs[:,1:n_clusters])
    elif(method=='Agglomerative_clustering'):
        _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False)
        clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(vecs[:,1:n_clusters].reshape(-1,n_clusters-1))
    elif(method=='Graclus'):
        sA = sparse.csr_matrix(A_matrix[graph_num])
        edge_index, edge_weight = g_utils.from_scipy_sparse_matrix(sA)
        cluster = graclus_cluster(edge_index[0], edge_index[1], edge_weight)
        return cluster.numpy()
    else:
        raise Exception("non-existing clustering method")
    return clustering.labels_
    colors = datainfo.colors

    f = h5py.File(datainfo.dpath + datainfo.name + ext + '.hdf5', 'r')

    for s, nclusters in zip(datainfo.sensors, datainfo.clusters):
        print s
        ldata = []
        for dfiles in datainfo.datafiles:
            d = f[dfiles + '/' + s + '/' + 'PeaksResamplePCA']
            dataf = d[()]
            ldata.append(dataf)

        data = ldata[0] #np.concatenate(ldata)

        km = KMeans(n_clusters=nclusters, n_jobs=-1)
        km.fit_transform(data)
        lsignals = []
        cnt = Counter(list(km.labels_))

        lmax = []
        for i in range(km.n_clusters):
            lmax.append((i,np.max(km.cluster_centers_[i])))
        lmax = sorted(lmax, key=itemgetter(1))

        print lmax
        print data.shape

        lhisto = []
        for dataf, ndata in zip(ldata, datainfo.datafiles):
            histo = np.zeros(nclusters)
            for i in range(dataf.shape[0]):
	def clustering(self):
		kmeans = KMeans(n_clusters=26)
		kmeans.fit_transform(self.train)
		return kmeans
 def kmeans(embedding, n_components):
     est = KMeans(n_clusters=n_components, n_jobs=-1, init='k-means++', n_init=300)
     est.fit_transform(embedding)
     labels = est.labels_
     data = labels.astype(float)  # np.float was removed in recent NumPy
     return data