Example no. 1
    def produce_clusterings(self):
        # Run k-means on the representativity data for the configured
        # number of clusters.
        km = kmeans(self.representativities, self.graphlets_per_graph,
                    self.nb_clusters)
        km.to_string()
        labels = km.compute()

        # Create the output folder for this number of clusters if needed
        # (makedirs with exist_ok=True tolerates an existing folder).
        folder = self.Results + '/Clusterings/%s_classes' % self.nb_clusters
        makedirs(folder, exist_ok=True)

        km.write_results(folder)

        # Plot a Kiviat (radar) chart of the clustering results.
        radar = kiviat(folder)
        radar.plot_kiviat()
Example no. 2
import numpy as np
from scipy.stats import multivariate_normal
# kMeans (the accompanying k-means module) and random_points are assumed to be
# importable alongside this function.


def kmix(xs, k, tolerance=0.01, max_iter=100, verbose=True, init='random'):
    n, p = xs.shape  # n data points, each of dimension p

    if init == 'kmeans':
        if verbose:
            print('Initializing points with K-Means++.')
        mus = list(kMeans.kmeans(xs, k, plus=True, verbose=verbose).keys())
    else:
        if verbose:
            print('Initializing points randomly.')
        mus = random_points(xs, k)

    sigmas = [np.eye(p)] * k  # start every covariance at the identity matrix
    pis = [1 / k] * k         # uniform mixing weights

    ll_old = 0
    for iteration in range(max_iter):
        if verbose:
            print('Iteration {} | loglikelihood {}'.format(iteration, ll_old))
        '''
        E-step: pij is a k x n array of responsibilities, where pij[j, i] is
        the probability that point i belongs to cluster j. The division
        pij /= pij.sum(0) normalizes each column so the responsibilities for
        every point sum to one.
        '''
        pij = np.zeros((k, n))
        for j in range(k):
            for i in range(n):
                pij[j, i] = pis[j] * multivariate_normal.pdf(
                    xs[i], mus[j], sigmas[j])
        pij /= pij.sum(0)
        '''
        M-step for the mixing weights: pis[j] is the sum of responsibilities
        assigned to cluster j, divided by the total number of points n.
        '''
        pis = np.zeros(k)
        for j in range(k):
            for i in range(n):
                pis[j] += pij[j, i]
        pis /= n
        '''
        M-step for the means: mus[j] is the responsibility-weighted sum of the
        points, divided by the total responsibility of cluster j.
        '''
        mus = np.zeros((k, p))
        for j in range(k):
            for i in range(n):
                mus[j] += pij[j, i] * xs[i]
            mus[j] /= pij[j, :].sum()
        '''
        M-step for the covariances: sigmas[j] is the responsibility-weighted
        sum of outer products (x_i - mu_j)(x_i - mu_j)^T, divided by the total
        responsibility of cluster j.
        '''
        sigmas = np.zeros((k, p, p))
        for j in range(k):
            for i in range(n):
                ys = np.reshape(xs[i] - mus[j], (p, 1))
                sigmas[j] += pij[j, i] * np.dot(ys, ys.T)
            sigmas[j] /= pij[j, :].sum()
        '''
        Log-likelihood: for every point, sum the weighted component densities
        over the clusters, then accumulate the log of that sum.
        '''
        ll_new = 0.0
        for i in range(n):
            s = 0
            for j in range(k):
                s += pis[j] * multivariate_normal.pdf(xs[i], mus[j], sigmas[j])
            ll_new += np.log(s)

        if np.abs(ll_new - ll_old) <= tolerance:
            break
        ll_old = ll_new

    # Hard-assign each point to the component with the highest weighted
    # density; return a dict mapping each mean (as a tuple) to its members.
    return_dict = {}
    for i in range(n):
        max_tracker = []
        for j in range(k):
            max_tracker.append(
                pis[j] * multivariate_normal(mus[j], sigmas[j]).pdf(xs[i]))
        index = max_tracker.index(max(max_tracker))
        key = tuple(mus[index])
        if key not in return_dict:
            return_dict[key] = [xs[i]]
        else:
            return_dict[key].append(xs[i])

    return return_dict
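
A minimal way to exercise kmix on synthetic data could look like the sketch below; the blob centers, the seed and the cluster count are illustrative values, not part of the original code, and the default init='random' path relies on the module's random_points helper.

# Hypothetical smoke test: three well-separated 2-D Gaussian blobs.
rng = np.random.default_rng(0)
blobs = np.vstack([rng.normal(loc=c, scale=0.3, size=(100, 2))
                   for c in ([0, 0], [4, 4], [8, 0])])
clusters = kmix(blobs, 3, verbose=False)
for center, points in clusters.items():
    print(np.round(center, 2), len(points))  # one fitted mean per line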
Example no. 3
__author__ = 'I322233'
from numpy import *
import time
import matplotlib.pyplot as plt
import kMeans as km

## step 1: load data
print("step 1: load data...")
dataSet = []
## read the file: one point per line, two space-separated floats
with open('kmeans_test_set.txt') as fileIn:
    for line in fileIn:
        lineArr = line.strip().split(' ')
        dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print "step 2: clustering..."

## change array to matrix
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = km.kmeans(dataSet, k)

## step 3: show the result
print "step 3: show the result..."
km.showCluster(dataSet, k, centroids, clusterAssment)
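
If kmeans_test_set.txt is not at hand, a compatible input file (one point per line, two space-separated floats) could be generated with a small helper like the one below; the file name and the number of points are assumptions for illustration only.

# Write 200 random 2-D points in the format the loader above expects.
import numpy as np
points = np.random.randn(200, 2) * 2
with open('kmeans_test_set.txt', 'w') as out:
    for x, y in points:
        out.write('%f %f\n' % (x, y))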
Example no. 5
	plt.clf()

def graph_it(data, display = True, file_name = None, verbose = True):
	'''
	data should be a dictionary with cluster centers as keys and arrays of
	member points as values.
	file_name, if passed, will write the graph to that output file.
	display, if True, will show the graph upon completion.
	'''
	p = len(data[list(data.keys())[0]][0])
	if p == 2:
		return graph2d(data, display, file_name, verbose)
	elif p == 3:
		graph3d(data, display, file_name, verbose)

#Examples
graph_it(kMeans.kmeans(toydata, 3, plus = False))
# graph_it(kMixture.kmix(toydata, 3, init = 'kmeans'), file_name = 'lol.png')
# graph_it(kMeans.kmeans(np.random.rand(1000,2), 8, plus = True))


# toydata = pd.read_csv("Data/3Ddata.txt", sep=r"\s+", header = None)
# print(toydata)
# toydata = np.array([toydata.ix[:,0],toydata.ix[:,1], toydata.ix[:,2]]).transpose()
# graph3d(kMeans.kmeans(toydata, 3, plus = True))

def clusterer(n, clusters):
	per = n // clusters
	final = np.array([0, 0])
	for i in range(clusters):
		center = np.random.rand(1, 2)
		dists = np.random.rand(per, 1)
		# assumed completion: scatter `per` points around the center at the
		# drawn distances, in random directions, and stack them up
		angles = np.random.rand(per, 1) * 2 * np.pi
		points = center + dists * np.hstack((np.cos(angles), np.sin(angles)))
		final = np.vstack((final, points))
	return final[1:]
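
Assuming the completion of clusterer sketched above, its output could be fed to graph_it through kMeans in the same way as the toydata example earlier in this snippet:

# Hypothetical: 300 synthetic points in 3 blobs, clustered and then plotted.
synthetic = clusterer(300, 3)
graph_it(kMeans.kmeans(synthetic, 3, plus=True))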
Example no. 6
def main():

    modelName = "Word2VectforNLPTraining"
    model = Word2Vec.load(modelName)

    # model.init_sims(replace=True)

    word_vectors = model.syn0
    # print(word_vectors[0])
    num_clusters = int(word_vectors.shape[0] / 5)
    # print("number of clusters: {}".format(num_clusters))
    # input("Press enter to continue:")
    print("Clustering...")
    startTime = time.time()
    cluster_index = kMeans.kmeans(num_clusters, word_vectors)
    endTime = time.time()

    print("Time taken for clustering: {} seconds".format(endTime - startTime))


    # create a word/index dictionary, mapping each vocabulary word to a cluster number
    # zip(): make an iterator that aggregates elements from each of the iterables
    index_word_map = dict(zip(model.index2word, cluster_index))

    def create_bag_of_centroids(reviewData):
        """
        assign each word in the review to a centroid
        this returns a numpy array with the dimension as num_clusters
        each will be served as one feature for classification
        :param reviewData:
        :return:
        """
        featureVector = np.zeros(num_clusters, dtype=float)
        for word in reviewData:
            if word in index_word_map:
                index = index_word_map[word]
                featureVector[index] += 1
        return featureVector

    train = pd.read_csv("/path/labeledTrainData.tsv",
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv("/path/testData.tsv",
                       header=0, delimiter="\t", quoting=3)

    trainingDataFV = np.zeros((train["review"].size, num_clusters), dtype=float)
    testDataFV = np.zeros((test["review"].size, num_clusters), dtype=float)

    print("Processing training data...")
    counter = 0
    cleaned_training_data = processData.clean_data(train)
    for review in cleaned_training_data:
        trainingDataFV[counter] = create_bag_of_centroids(review)
        counter += 1

    print("Processing test data...")
    counter = 0
    cleaned_test_data = processData.clean_data(test)
    for review in cleaned_test_data:
        testDataFV[counter] = create_bag_of_centroids(review)
        counter += 1

    n_estimators = 100
    result = randomForestClassifier.rfClassifer(n_estimators, trainingDataFV,
                                                train["sentiment"], testDataFV)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("Doc2Vec_Clustering.csv", index=False, quoting=3)
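
To make the bag-of-centroids construction concrete, a standalone toy version behaves like this; the word-to-cluster mapping and the sample review are made up for illustration and are not taken from the trained model.

# Illustrative only: a 3-cluster vocabulary and a short tokenized "review".
import numpy as np

toy_index_word_map = {"good": 0, "bad": 1, "movie": 2, "plot": 2}
num_toy_clusters = 3

def toy_bag_of_centroids(review):
    fv = np.zeros(num_toy_clusters)
    for word in review:
        if word in toy_index_word_map:
            fv[toy_index_word_map[word]] += 1
    return fv

print(toy_bag_of_centroids(["good", "movie", "plot", "unknown"]))  # [1. 0. 2.]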
Example no. 7
    # plot each cluster center as a black point
    for key in data.keys():
        ax.scatter(key[0], key[1], key[2], marker='o', c='black')

    if file_name is not None:
        fig.savefig(file_name)  #, bbox_inches='tight')
    if display:
        plt.show()
    plt.clf()


def graph_it(data, display=True, file_name=None, verbose=True):
    '''
    data should be a dictionary with cluster centers as keys and arrays of
    member points as values.
    file_name, if passed, will write the graph to that output file.
    display, if True, will show the graph upon completion.
    '''
    p = len(data[list(data.keys())[0]][0])
    if p == 2:
        return graph2d(data, display, file_name, verbose)
    elif p == 3:
        graph3d(data, display, file_name, verbose)


#Examples
graph_it(kMeans.kmeans(toydata, 3, plus=True))
# graph_it(kMixture.kmix(toydata, 3, init = 'kmeans'), file_name = 'lol.png')
# graph_it(kMeans.kmeans(np.random.rand(1000,3), 4, plus = True))
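
graph2d is referenced by graph_it but not shown in these snippets; a minimal sketch under the same dict-of-centers-to-points format (and assuming numpy and matplotlib are imported as np and plt) might look like this, though the real implementation may differ.

def graph2d_sketch(data, display=True, file_name=None, verbose=True):
    # Scatter every cluster's points, then mark its center in black.
    fig, ax = plt.subplots()
    for center, points in data.items():
        pts = np.asarray(points)
        ax.scatter(pts[:, 0], pts[:, 1], s=10)
        ax.scatter(center[0], center[1], marker='o', c='black')
    if file_name is not None:
        fig.savefig(file_name)
    if display:
        plt.show()
    plt.clf()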
Example no. 8
    def test_kmeans(self):
        # This is hard to test deterministically because the initial
        # centroids are chosen at random.
        dataset = load_dataset("testSet.txt")
        centroids, cluster_assignment = kmeans(dataset, 4)
        print(centroids)
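
One way to make such a test reproducible, assuming kmeans draws its initial centroids from numpy's global random state and that this method lives in the same unittest.TestCase with numpy imported as np, is to seed the generator first; the sketch below is an assumption, not part of the original suite.

    # Sketch: seed numpy's RNG so the random initial centroids are repeatable.
    def test_kmeans_reproducible(self):
        np.random.seed(42)
        dataset = load_dataset("testSet.txt")
        centroids, cluster_assignment = kmeans(dataset, 4)
        self.assertEqual(np.asarray(centroids).shape[0], 4)  # k centroids back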