Example #1
import numpy as np
import faiss  # only needed for the "faiss" implementation
from sklearn.cluster import KMeans


def cluster_features(features, n_clusters, implementation, faiss_gpu=False, max_iter=3000, random_state=None):
    if implementation == "sklearn":
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, max_iter=max_iter)
        kmeans.fit(features)
        print(f"Loss: {kmeans.inertia_}")
        return kmeans.cluster_centers_, kmeans.labels_
    elif implementation == "faiss":
        # faiss expects a C-contiguous float32 array
        features = np.ascontiguousarray(features, dtype=np.float32)
        kmeans = faiss.Kmeans(features.shape[1], n_clusters, niter=max_iter, gpu=faiss_gpu)
        kmeans.train(features)
        # assign every sample to its nearest centroid
        _, I = kmeans.index.search(features, 1)
        return kmeans.centroids, I.reshape(I.shape[0])
    else:
        raise ValueError(f"No such kmeans implementation {implementation!r} available.")
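A minimal usage sketch with synthetic data (array sizes and parameter values below are chosen here for illustration, not taken from the source):

import numpy as np

# 1 000 random 64-dimensional feature vectors; float32 also keeps the faiss path happy.
features = np.random.rand(1000, 64).astype(np.float32)

centroids, labels = cluster_features(features, n_clusters=10, implementation="sklearn")
print(centroids.shape, labels.shape)  # (10, 64) and (1000,)

# The faiss path returns the same shapes; pass faiss_gpu=True to run the clustering on GPU.
# centroids, labels = cluster_features(features, n_clusters=10, implementation="faiss")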
Example #3
def _fit(self, num_iters=10):
    """Run sklearn KMeans num_iters times and collect timing and inertia."""
    scores = []
    start = time.time()
    for i in range(num_iters):
        print('Starting sklearn KMeans: %d' % i)
        sklearn_kmeans = SklearnKMeans(
            n_clusters=self.num_clusters,
            init='k-means++',
            max_iter=50,
            n_init=1,
            tol=1e-4,
            random_state=i * 42)
        # sklearn's KMeans is fitted with .fit(), not .train()
        sklearn_kmeans.fit(self.points)
        scores.append(sklearn_kmeans.inertia_)
    self._report(num_iters, start, time.time(), scores)
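The method above relies on attributes of its enclosing benchmark object that the snippet does not show. A hypothetical minimal harness (all names below are invented for illustration, assuming SklearnKMeans aliases sklearn.cluster.KMeans):

import time
import numpy as np
from sklearn.cluster import KMeans as SklearnKMeans


class KMeansBenchmark:
    """Hypothetical host class for the _fit method shown above."""

    def __init__(self, points, num_clusters):
        self.points = points              # (n_samples, n_features) array
        self.num_clusters = num_clusters

    def _report(self, num_iters, start, end, scores):
        # Average wall time per run and the best (lowest) inertia observed.
        print('%d runs, %.3fs per run, best inertia %.3f'
              % (num_iters, (end - start) / num_iters, min(scores)))

    # ... the _fit method shown above would be defined here ...


# bench = KMeansBenchmark(np.random.rand(10000, 2), num_clusters=5)
# bench._fit(num_iters=3)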
Example #4
def standard_spark_kmeans(data, k, max_iter, random_state):
    from time import time
    from math import sqrt
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.clustering import KMeans

    t1 = time()
    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]' % 10)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(data)

    # Build the model (cluster the data).
    # Note: 'runs' is deprecated and ignored by newer Spark versions.
    clusters = KMeans.train(data, k, maxIterations=max_iter, runs=10,
                            initializationMode="random", seed=random_state, epsilon=1e-4)

    # Evaluate the clustering by computing the Within Set Sum of Squared Errors.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print(time() - t1)
    print(WSSSE)
    return clusters, WSSSE
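A usage sketch, assuming pyspark is installed and no SparkContext is already running in the process (the function creates its own); the data and parameter values are illustrative:

import numpy as np

# 1 000 random 3-dimensional points as a plain Python list of arrays.
points = [np.array(row) for row in np.random.rand(1000, 3)]

# Prints the elapsed time and the WSSSE, and returns the fitted model and WSSSE.
standard_spark_kmeans(points, k=4, max_iter=20, random_state=42)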
Example #5
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

max_epochs = 100
vec_size = 20
alpha = 0.025

# tagged_data is assumed to be a list of gensim TaggedDocument objects prepared
# earlier; the size/iter arguments used here follow the gensim 3.x API
# (renamed to vector_size/epochs in gensim 4).
model = Doc2Vec(size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

model.build_vocab(tagged_data)
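# For reference, tagged_data is assumed to have been built roughly like this
# (hypothetical preprocessing, not shown in the source):
#
#   from gensim.models.doc2vec import TaggedDocument
#   docs = ["I love machine learning", "Sports news of the day"]
#   tagged_data = [TaggedDocument(words=word_tokenize(d.lower()), tags=[str(i)])
#                  for i, d in enumerate(docs)]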

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("d2v.model")
# Infer the vector of a document that was not in the training data
test_data = word_tokenize("Sports".lower())
v1 = model.infer_vector(test_data)
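The inferred vector can then be matched against the training documents through the document-vector index; a sketch using the gensim 3.x attribute name (in gensim 4 this lives under model.dv):

# The five training documents most similar to the inferred vector.
similar_docs = model.docvecs.most_similar(positive=[v1], topn=5)
print(similar_docs)  # list of (tag, cosine similarity) pairs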
Example #6
import numpy as np
from pandas import DataFrame
from sklearn.decomposition import PCA
from pyspark.mllib.clustering import KMeans

# scale_data, sc (the SparkContext) and raw_data_total are assumed to come from
# earlier steps of the pipeline that this excerpt does not show.
scale_data = scale_data.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0

pca = PCA(n_components=20)
pca.fit(scale_data)

# Use PCA to reduce the dimensionality
transform_data = pca.transform(scale_data)
data_array = transform_data

DataFrame(transform_data).to_csv('temp.csv')

# Use K-means to cluster the reduced data
# Note: 'runs' is deprecated and ignored by newer Spark versions.
model = KMeans.train(sc.parallelize(data_array),
                     6,
                     maxIterations=50,
                     runs=30,
                     initializationMode="random")

data_cluster = np.array([])
for k in range(0, data_array.shape[0]):
    clusters = model.predict(data_array[k])
    data_cluster = np.append(data_cluster, clusters)
    print('data of row', k, 'is cluster:', clusters)

file_path = raw_data_total['file_path']

cluster_dataframe = DataFrame({
    'file_path': file_path,
    'cluster': data_cluster
})
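A possible follow-up, assuming the goal is to persist the assignment and inspect cluster sizes (the output file name is chosen here for illustration):

# Save the file-to-cluster mapping and show how many files landed in each cluster.
cluster_dataframe.to_csv('file_clusters.csv', index=False)
print(cluster_dataframe.groupby('cluster').size())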