Example #1
    def test_diff_true2(self):
        old_mean = np.array([[1, 1], [1, 1]])
        new_mean = np.array([[0, 0], [0, 0]])
        I = kmeans.Kmeans()
        I.mean = new_mean
        expr = I.diff(old_mean)
        self.assertIs(expr, False)
Example #2
    def test_calc_mean(self):
        I = kmeans.Kmeans()
        I.features = np.array([[10, 20], [14, 24], [100, 120], [110, 120]])
        I.label = [0, 0, 1, 1]
        ary = I.calc_mean()
        expr = np.array([[12, 22], [105, 120]])
        self.assertTrue(np.all(ary == expr))
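The assertion above pins down what calc_mean must do: average the feature rows that share each label. A minimal sketch of such a method, assuming features is an (n, d) array and label assigns one cluster index per row (the actual implementation is not shown):

import numpy as np

def calc_mean(self):
    # Average the feature rows belonging to each cluster label.
    labels = np.asarray(self.label)
    return np.array([self.features[labels == k].mean(axis=0)
                     for k in np.unique(labels)])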
Example #3
    def test_diff_false(self):
        old_mean = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        new_mean = np.array([[1, 2, 3], [4, 7, 6], [7, 8, 9]])
        I = kmeans.Kmeans()
        I.mean = new_mean
        expr = I.diff(old_mean)
        self.assertIs(expr, False)
Example #4
    def test_diff_false2(self):
        old_mean = np.array([[0.00000001, 2], [4, 5]])
        new_mean = np.array([[0.0000001, 2], [4, 5]])
        I = kmeans.Kmeans()
        I.mean = new_mean
        expr = I.diff(old_mean)
        self.assertIs(expr, True)
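Taken together, the three diff tests (Examples #1, #3, and #4) are consistent with diff returning True when the new means match the old ones to within a small tolerance, i.e. the iteration has converged, and False while they still differ; the test names do not all match the asserted values, so this reading is inferred from the assertions. A minimal sketch under that assumption, with an assumed tolerance of 1e-6:

import numpy as np

def diff(self, old_mean, tol=1e-6):
    # True once the cluster means have stopped moving (converged).
    # bool(...) guarantees the built-in singleton that assertIs expects.
    return bool(np.allclose(self.mean, old_mean, atol=tol))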
Example #5
    def kmeans_operation(self):
        kmeans_instance = kmeans.Kmeans()
        self.rep_warp, self.weights = kmeans_instance.RunKmeans(self.X, self.Clusters)
        # Pick the representative warp belonging to the heaviest cluster.
        self.warp_index = 0
        weight = 0
        for i in range(len(self.rep_warp)):
            if self.weights[i] > weight:
                weight = self.weights[i]
                self.warp_index = self.rep_warp[i]
        print(self.warp_index)
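The selection loop above is just an argmax over the cluster weights. Assuming weights is index-aligned with rep_warp and contains at least one positive entry, the same pick can be written with numpy (the helper name is mine):

import numpy as np

def pick_heaviest_warp(rep_warp, weights):
    # Representative warp of the cluster with the largest weight.
    return rep_warp[int(np.argmax(weights))]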
Example #6
    def test_experiment1(self):

        k = 10
        trainfile = 'optdigits.train'
        testfile = 'optdigits.test'

        km = kmeans.Kmeans(k=k, trainfile=trainfile, testfile=testfile)
        km.train()
        accuracy, confusion_matrix = km.evaluate(data=km.test_data)
        km.report(test_accuracy=accuracy, confusion_matrix=confusion_matrix)

        assert accuracy > 0.70
Example #7
    def Exercise1(self):
        data = [[1.9, 7.3], [3.4, 7.5], [2.5, 6.8], [1.5, 6.5], [3.5, 6.4],
                [2.2, 5.8], [3.4, 5.2], [3.6, 4], [5, 3.2], [4.5, 2.4],
                [6, 2.6], [1.9, 3], [1, 2.7], [1.9, 2.4], [0.8, 2], [1.6, 1.8],
                [1, 1]]
        _data = np.asarray(_matrix.AddColumn(
            _matrix.Copy(data), -1))  # create new column for classification
        x1, y1 = _matrix.DivideXY(_data)
        plot.Kmeans(x1, np.int_(y1), [[]], 'Plot pure data')

        newData, centroids = kmeans.Kmeans(data, 3)
        x, y = _matrix.DivideXY(newData)
        plot.Kmeans(x, np.int_(y), centroids, 'Plot Kmeans')
Example #8
def kmeans_segmentation(expression, get_clustering_features,
                        recognition_model):
    strokes = list(expression.strokes.values())
    cluster_features = get_clustering_features(strokes)
    n = len(strokes)
    confidences = {}
    max_confidence = 0
    max_segmentation = []
    for k in range(n, n // 4, -1):
        segmenter = kmeans.Kmeans(k)
        test_k = KMeans(k)
        test = test_k.fit_predict(cluster_features)
        # clusters = segmenter.kmeans(cluster_features)
        clusters = {}
        for i, index in enumerate(test):
            arr = clusters.get(index, None)
            if arr is None:
                clusters[index] = [i]
            else:
                arr.append(i)
        cluster_confidences = []
        cluster_symbols = []
        for key in clusters.keys():
            cluster = clusters[key]
            name = ''.join(list(map(str, cluster)))
            s, c = confidences.get(name, (None, None))
            if c is None:
                # needs to be changed for the max probability score
                symbol_strokes = [strokes[i] for i in cluster]
                s = Symbol(name, symbol_strokes)
                feature_vector = digitClassifier.features([s.to_digit()])
                c, classification = recognition_model.classify_conf(
                    feature_vector)
                confidences[name] = (s, c)
                s.classification = recognition_model.int_to_class[
                    classification] if classification < len(
                        recognition_model.int_to_class) else 'junk'
                if s.classification == ',':
                    s.classification = 'COMMA'
                elif s.classification == 'junk':
                    c /= 10
                s.weight = c
            cluster_confidences.append(c)
            cluster_symbols.append(s)
        segmentation_confidence = objective_function(cluster_confidences)
        if segmentation_confidence > max_confidence:
            max_confidence = segmentation_confidence
            max_segmentation = cluster_symbols
    expression.symbols = max_segmentation
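objective_function is not shown in this snippet; it reduces the per-cluster symbol confidences to one segmentation score. A purely illustrative stand-in is the geometric mean, which favors segmentations with no weak symbol:

import math

def objective_function(confidences):
    # Geometric mean of per-symbol confidences (illustrative stand-in).
    if not confidences:
        return 0.0
    log_sum = sum(math.log(max(c, 1e-12)) for c in confidences)
    return math.exp(log_sum / len(confidences))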
Example #9
    def dataset_test(self, path, datatype="float"):
        print(">>>>>> " + path + " <<<<<<")
        readmatrix = _matrix.ReadCsv(path, ";", datatype)

        _data = np.asarray(
            _matrix.AddColumn(_matrix.Copy(readmatrix),
                              -1))  # create new column for classification
        x1, y1 = _matrix.DivideXY(_data)
        plot.Kmeans(x1, np.int_(y1), [[]], path + ' - Plot pure data')

        newData, centroids = kmeans.Kmeans(readmatrix, 3)
        x, y = _matrix.DivideXY(newData)
        plot.Kmeans(x, np.int_(y), centroids, path + ' - Plot Kmeans')

        print("\n--------------------------------------------------")
Example #10
    def Exercise3(self):
        iris = datasets.load_iris()
        x = iris.data

        # reduce dimensions with PCA
        resultPca = PCA(n_components=2)
        resultPca.fit(x)
        PcaData = resultPca.transform(x)

        _data = np.asarray(_matrix.AddColumn(
            _matrix.Copy(PcaData), -1))  # create new column for classification
        x1, y1 = _matrix.DivideXY(_data)
        plot.Kmeans(x1, np.int_(y1), [[]], 'Iris - Plot pure data')

        newData, centroids = kmeans.Kmeans(PcaData, 3)
        x, y = _matrix.DivideXY(newData)
        plot.Kmeans(x, np.int_(y), centroids, 'Iris - Plot Kmeans')
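For reference, the same reduce-then-cluster pipeline in plain scikit-learn; the variable names below are mine and stand in for the snippet's _matrix and plot helpers rather than reproducing them:

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X = load_iris().data
X2 = PCA(n_components=2).fit_transform(X)  # project onto two principal components

km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X2)
labels, centroids = km.labels_, km.cluster_centers_  # cluster labels and centers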
Example #11
def main():
    """
    Main call to start the program
    """

    # Dataset
    file_name = 'frogs.csv'
    file_path = '../data/'
    full_path = file_path + file_name
    data = utils.csv_to_arrays(full_path)

    # Parameters
    k = 3
    mode = 'hammerly'
    iterations = 1000
    tests = 10
    model = kmeans.Kmeans(data, k, mode, iterations)

    # Bookkeeping variables
    total_iterations = 0
    total_dist = 0
    total_time = 0

    print("-----" + mode + "-----")

    for i in range(tests):

        start = time.time()
        iters, clusters = model.cluster()
        stop = time.time()
        error = model.average_error(clusters)

        run_time = stop - start
        total_time += run_time

        #print ("Stopped after " + str(iters) +
        #    " iterations, with average distance of: " + str(error))
        model.reset()

        total_dist += error
        total_iterations += iters

    print("average iterations: " + str(total_iterations / tests))
    print("average distance: " + str(total_dist / tests))
    print("average time: " + str(total_time / tests))
Example #12
    def do_my_Kmeans(self, args, initCentroid=None):
        km = mykm.Kmeans(args, initCentroid=initCentroid)


        print("fitting the model (my own k-means) ...")
        start_time = time.time()
        start_clock = time.clock()
        km.fit(self.X)
        print("--- time: %s seconds\t clock: %s seconds ---" % (time.time() - start_time, time.clock() - start_clock))


        km.plot_sse_chart('my_kmeans_sse', args.save_path)

        print("acc: ", self.accuracy(self.Y, km.labels_))
        print("===== K-means cluster center (my own k-means) =====")
        print(km.cluster_centers_)
        print("===================================================\n\n\n\n")
        
        return km
Example #13
def main():
    t1 = time.monotonic()

    model = gensim.models.doc2vec.Doc2Vec.load(
        '../obj/doc2vec/abstracts_etd_doc2vec_5000_docs')

    doc_vectors, keys = extract_mapped_doc2vecs(model)

    km_obj = kmeans.Kmeans(doc_list=keys,
                           n_clusters=10,
                           init='k-means++',
                           n_init=3,
                           n_jobs=5,
                           random_state=42,
                           verbose=1,
                           algorithm='full')

    km_obj.fit(doc_vectors)

    km_obj.save('abstracts_etd_doc2vec_5000_docs_kmeans.sav')

    print("Time taken {}s".format(timedelta(time.monotonic() - t1)))
Example #14
    def main(self):

        # create synthetic dataset
        synthesizer = sd.SyntheticData(5, 5, 10, 6)
        pts = synthesizer.point_assignments  # synthetic data pts
        # TODO: might attempt to plot clusters if time permits.
        #print(pts.head(30))

        # k-means setup for synthetic dataset
        print('Running k-means with synthetic dataset...')
        kmeans = km.Kmeans(5, pts, False, True)
        kmeans_clusters = kmeans.getClusters(3)  # max of 3 iterations
        print('Finished k-means with synthetic dataset')
        print('Calculating average silhouette coefficient...')
        kmeans_silhouette = assessment.calculate_silhouette(kmeans_clusters)
        print('Average k-means silhouette coefficient = {0}\n'.format(
            kmeans_silhouette))

        # DB Scan setup for synthetic dataset
        print('Running DB Scan with synthetic dataset...')
        epsilon = 2
        minPts = 2
        dbScan = db.DbScan(pts, False, True)
        df, dex = dbScan.getDataframe()
        m = dbScan.createPairwiseDistanceMatrix(df, dex)
        lessThanEpsilonDict = dbScan.createLessThanEpsilonDict(df, m, epsilon)
        dbScan_clusters = dbScan.clusterAssignment(df, lessThanEpsilonDict,
                                                   minPts)
        print('Finished DB Scan with synthetic dataset')
        print('Calculating average silhouette coefficient...')
        dbScan_silhouette = assessment.calculate_silhouette(dbScan_clusters)
        print('Average DB Scan silhouette coefficient = {0}\n'.format(
            dbScan_silhouette))

        # k-means and DB scan for classification datasets from UCI repository
        for dataset in self.alldataset:  # for each dataset call each algorithm
            print('current dataset ::: {0} \n'.format(dataset))
            data = self.alldataset.get(dataset)
            isClassification = self.IsClassificationDict.get(dataset)

            print('Running k-means on dataset ::: {0}... \n'.format(dataset))
            k = 5
            kmeans = km.Kmeans(k, data, isClassification, False)
            kmeans_clusters = kmeans.getClusters(3)  #max of 3 iterations
            print('Finished kmeans for {0} dataset\n'.format(dataset))
            print('Calculating purity...')
            kmeans_purity = assessment.calculate_purity(kmeans_clusters)
            print('k-means purity on dataset ::: {0} = {1} \n'.format(
                dataset, kmeans_purity))

            print('Running DB Scan on dataset ::: {0}... \n'.format(dataset))
            epsilon = 2
            minPts = 2
            dbScan = db.DbScan(data, isClassification, False)
            df, dex = dbScan.getDataframe()
            m = dbScan.createPairwiseDistanceMatrix(df, dex)
            lessThanEpsilonDict = dbScan.createLessThanEpsilonDict(
                df, m, epsilon)
            dbScan_clusters = dbScan.clusterAssignment(df, lessThanEpsilonDict,
                                                       minPts)
            print('Finished DB Scan for {0} dataset\n'.format(dataset))
            print('Calculating purity...')
            dbScan_purity = assessment.calculate_purity(dbScan_clusters)
            print('DB Scan purity on dataset ::: {0} = {1} \n'.format(
                dataset, dbScan_purity))
Example #15
import numpy as np
import pandas as pd
import random
import sys
import math
import kmeans

ginf = pd.read_csv('./tagged_data_01_2017.csv', delimiter=';')

cols = list(ginf.columns)
ginf[cols[1:]] = ginf[cols[1:]].fillna(0).astype('float32')

# ginf.info()
ginf.set_index('time', inplace=True)
ginf.index = pd.to_datetime(ginf.index, format='%d/%m/%y %H:%M')
data_ind = ginf.filter(regex="^.*indoor...*$")
data_ind_means = data_ind.groupby(
    data_ind.index.day).agg(lambda x: np.nanmean(x[x < 100]))

data_ind_means_numpy = data_ind_means.to_numpy()  # .as_matrix() was removed in pandas 1.0
data_ind_means_num = np.reshape(data_ind_means_numpy, -1)

df = pd.DataFrame(data_ind_means_num)
df.info()

# K-Means Configuration
GINF_CONFIG = {'diff_cols': [0], 'diff_labels': ['Indoor_Air_Temp']}

config = kmeans.KmeansConfig(GINF_CONFIG['diff_cols'],
                             GINF_CONFIG['diff_labels'], 0)
kmeansObj = kmeans.Kmeans(config, 4, df)
kmeansObj.cluster()
Example #16
            ydata = data[i][:, j] / np.max(data[i][:, j])
            popt = scipy.optimize.fmin(lssq, (1.0, 1), disp=0)
            params[i, j, 2:4] = np.array(popt)

    with open('../data/params.pkl', 'wb') as f:
        pickle.dump(params, f)
    with open('../data/churns.pkl', 'wb') as f:
        pickle.dump(churns, f)

else:
    with open('../data/params.pkl', 'rb') as f:
        params = pickle.load(f)

if not os.path.exists('../data/labels.pkl'):
    kmeans.Kmeans(input=params,
                  range_n_clusters=[2, 3, 4, 5],
                  output='../data/labels.pkl')
    #kmeans.cluster4()
with open('../data/labels.pkl', 'rb') as f:
    labellist = pickle.load(f)
    centerlist = pickle.load(f)

for dim in range(len(labellist)):  # xrange is Python 2-only
    labels = labellist[dim]
    centers = centerlist[dim]
    unique = np.unique(labels)
    colorshape = ['ro', 'gv', 'bs', 'k*', 'm8', 'y.']

    #draw model parameter plots
    xs = [[] for i in range(len(unique))]
    ys = [[] for i in range(len(unique))]
Example #17
from __future__ import division
import numpy as np
import pandas as pd
import random
import sys
import math
import kmeans

ginf = pd.read_csv('./tagged_data_01_2017.csv', delimiter=';')

cols = list(ginf.columns)
ginf[cols[1:]] = ginf[cols[1:]].fillna(0).astype('float32')

ginf.info()

# K-Means Configuration
GINF_CONFIG = {
    'diff_cols': [1, 2, 4, 5, 6],
    'diff_labels': [
        'Indoor_Air_Temp_1',
        'Indoor_Air_Temp_2',
        'Outdoor_Air_Temp_1',
        'Supply_Water_Temp_1',
        'Return_Water_Temp_1'
    ]
}

config = kmeans.KmeansConfig(GINF_CONFIG['diff_cols'], GINF_CONFIG['diff_labels'], 1)
kmeansObj = kmeans.Kmeans(config, 10, ginf)
kmeansObj.cluster()
Example #18
count = 0
results_set = []
for item in items:
    image_string = ""
    image_url = item["link"]

    try:
        # with urllib.request.urlopen(image_url) as url:
        req = Request(image_url, headers={'User-Agent': 'Mozilla/5.0'})
        image_string = urlopen(req).read()
        image_file = BytesIO(image_string)
        img = Image.open(image_file)
        filename = "%s.%s" % (count, img.format)
        img.save(filename)

        k = kmeans.Kmeans()
        results = k.run(img)
        print("Results are:")
        print(results)
        k.saveCentroidColours(str(count), img.format)

        for result in results:
            print(kmeans.rgb_to_hex(result))
            results_set.append(result)
            print("image", count, "has color:", result)

    except Exception:
        print("Unexpected error:", sys.exc_info()[0])

    count += 1
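kmeans.rgb_to_hex is not shown here; presumably it formats an (r, g, b) centroid colour as a hex string. A minimal sketch of such a helper (purely illustrative, not the library's code):

def rgb_to_hex(rgb):
    # Format an (r, g, b) triple with 0-255 channels as "#rrggbb".
    r, g, b = (int(round(c)) for c in rgb)
    return "#{:02x}{:02x}{:02x}".format(r, g, b)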
Example #19
import numpy as np
import kmeans
import matplotlib.pyplot as plt

# Generate random data around centroids_initial
dimensions = 2
amount_of_clusters = 4
amount_of_data_in_cluster = 50

centroids_initial = np.empty([amount_of_clusters, dimensions])
points = np.empty([amount_of_clusters * amount_of_data_in_cluster, dimensions])

for i in range(amount_of_clusters):
    centroids_initial[i, :] = np.random.uniform(low=-10,
                                                high=10,
                                                size=(dimensions, ))

    # Scatter this cluster's points uniformly around its initial centroid.
    start = i * amount_of_data_in_cluster
    end = (i + 1) * amount_of_data_in_cluster
    points[start:end, :] = centroids_initial[i, :] + np.random.uniform(
        low=-1, high=1, size=(amount_of_data_in_cluster, dimensions))

centroids, gama, iters = kmeans.Kmeans(points, amount_of_clusters, 10, 1000, 10)

plt.scatter(points[:, 0], points[:, 1], s=1)
plt.scatter(centroids_initial[:, 0], centroids_initial[:, 1], s=10)
plt.scatter(centroids[:, 0], centroids[:, 1], s=30, c="red")
plt.show()