コード例 #1
0
def main():
    """Cluster the preprocessed pitch data and export rows plus centroids.

    Reads ``data_Noah_Preprocessing.csv``, clusters every column except
    ``pitch_type`` with ``k_means``, reports the result through
    ``kMeansAccuracy`` and writes the labelled rows followed by the final
    centroids to ``data_Noah_Cluster.csv``.
    """
    # Number of clusters.
    k = 3

    data = pd.read_csv("data_Noah_Preprocessing.csv", encoding='utf-8')
    # The pitch label is kept out of the clustering input.
    data = data.drop(['pitch_type'], axis=1)
    result, finalCentroids = k_means(data.values, n_clusters=k)

    # Attach the k-means assignment to a fresh copy of the full data set.
    cluster = pd.DataFrame(data=result, columns=['cluster#'])
    outData = pd.read_csv("data_Noah_Preprocessing.csv", encoding='utf-8')
    outData = appendCol(outData, cluster)
    outData = outData.sort_values(by=['cluster#', 'pitch_type'])
    kMeansAccuracy(outData, k)

    centroids = pd.DataFrame(data=finalCentroids,
                             columns=['x', 'y', 'speed', 'az'])
    # Placeholder label/cluster values so the centroid rows can be told
    # apart from data rows in the visualization.
    dummy = pd.DataFrame(data={'pitch_type': ['GG'] * k, 'cluster#': [k] * k})
    centroids = appendCol(centroids, dummy)
    # Marker sizes for the visualization: '4' for data rows, '10' for the
    # k centroid rows appended after them.
    dummy2 = pd.DataFrame(
        data={'sizedummy': ['4'] * data.shape[0] + ['10'] * k})
    # DataFrame.append was removed in pandas 2.0; pd.concat with the same
    # options produces the same stacked frame.
    outData = pd.concat([outData, centroids], sort=False, ignore_index=True)
    outData = appendCol(outData, dummy2)

    outData.to_csv("data_Noah_Cluster.csv", index_label=False, index=False)
コード例 #2
0
def spectral_cluster(data):
    """Cluster *data* by running k-means on a spectral embedding.

    The Laplacian from ``laplace_matrix`` is eigen-decomposed, its
    eigenvectors are ordered by ascending eigenvalue, and row ``i`` of the
    first ``k`` eigenvectors becomes the embedded point for sample ``i``.

    ``k`` is not a parameter; it is read from the enclosing (module) scope.

    Returns:
        Whatever ``k_means`` returns for the embedded points.
    """
    # Build the Laplacian matrix.
    lm = laplace_matrix(data)
    # Eigenvalues and eigenvectors of the Laplacian.
    eg_values, eg_vectors = linalg.eig(lm)
    # Eigenvectors are the COLUMNS of eg_vectors, so sorting by eigenvalue
    # means reordering columns. (Slicing with ``[:idx]`` is not a reorder
    # and raises TypeError for an index array.)
    idx = eg_values.argsort()
    eg_vectors = eg_vectors[:, idx]

    m = len(data)
    # eg_data is the matrix U from the tutorial: row i holds the entries of
    # the first k eigenvectors for sample i.
    eg_data = [[eg_vectors[i][j] for j in range(k)] for i in range(m)]
    return k_means(eg_data)
コード例 #3
0
def do_experiment(train_data, test_data):
    """Run one clustering experiment and measure the bit error rate.

        INPUT:
            train_data: (data_raw, data) used to find clusters and the
                        cluster -> symbol mapping
            test_data: (data_raw, data) used to evaluate the mapping
        OUTPUT:
            (distortion, means, assign_train, jump,
             num_clusters_predicted, ber)
    """
    train_raw, train_x = train_data

    # STEP 1: find clusters with the jump method
    clusterer = k_means(MAX_K)
    jump, runs = clusterer.jump_method(train_x, NUM_ITERATIONS, True, MAX_K)
    # index of the largest jump is taken as the most likely k
    num_clusters_predicted = np.argmax(jump)

    # results of the run that used the most likely k
    distortion, means, assign_train = runs[num_clusters_predicted]

    # STEP 2: find mapping
    # lookup table: XOR difference -> number of set bits (bit errors)
    error_values = np.array(
        [bin(value).count('1') for value in range(MAX_K)])

    # modes[c] is the integer symbol for cluster c: the mode of the raw
    # training symbols assigned to that cluster
    modes = np.zeros(means.size, dtype=np.int8)
    for cluster in range(num_clusters_predicted):
        members = train_raw[assign_train == cluster]
        modes[cluster] = mode(members)[0][0]

    # STEP 3: evaluate BER
    test_raw, test_x = test_data
    # each test sample goes to the mean with the smallest absolute distance
    distances = np.abs(test_x[:, None] - means[None, :])
    assign = np.argmin(distances, axis=1)
    x_recon = modes[assign]

    # bitwise difference, then count differing bits through the lookup table
    diff = x_recon ^ test_raw
    bit_errors = np.sum(error_values[diff])
    ber = bit_errors / (NUM_SAMPLES * BITS_PER_SYMBOL)
    return distortion, means, assign_train, jump, num_clusters_predicted, ber
コード例 #4
0
ファイル: main.py プロジェクト: kirawrath/RP
def main():
	filename='iris2d.data'
	if len(sys.argv) > 1:
		filename = sys.argv[1]
	else:
		print 'Assuming filename as \'iris2d.data\''
	interval=[0]*2

	print 'Do you want to choose a \'k\' or an interval of \'k\'s?'
	choice = raw_input('type \'k\' or \'i\': ')
	if choice == 'k':
		k = int(raw_input('Type a value of k: '))
		interval[0]=interval[1]=k
	elif choice == 'i':
		interval = raw_input('Type the two numbers of the interval: ')
		interval = map(lambda a: int(a), interval.split())
		assert len(interval) == 2
	else:
		print 'Assuming k=3'
		interval[0]=interval[1]=3
		
	
	print 'f0,f,d1,d2'
	dots=None
	for k in range(interval[0], interval[1]+1):
		dots = parse_file(filename)
		print 'k =',k
		result = k_means(k, dots)
		print result
		if result[0] > result[1]:
			print 'f0 is greater than the f from the table,',
			print 'so we can reject the hypothesis that there is no group (it\'s all OK).'
		else:
			print 'f0 is smaller than the f from the table, your K is likely to be inappropriated.'

	if len(dots[0].pos) == 2:
		choice = raw_input('Display last result with IBL1? [y/N] ')
		if choice == 'y':
			use_ibl(dots)
コード例 #5
0
import numpy as np
import pandas as pd
import os
import sys
from k_means import *

if __name__ == '__main__':
    # Print floats with two decimals and without scientific notation.
    np.set_printoptions(suppress=True, precision=2)
    # Relative paths below are resolved inside the data directory.
    os.chdir('data')
    LARGER_DISTANCE = sys.maxsize
    # Read the data set from file.
    df = pd.read_csv("train_numbers.csv")
    # Fill missing values with the mean.
    column_means = df.mean()
    filled = df.fillna(column_means)
    DATA_SET = filled.values
    # Size of the data set.
    DATA_LEN = len(DATA_SET)
    # Start k-means.
    k_means(5, 0, DATA_SET, DATA_LEN, LARGER_DISTANCE)
コード例 #6
0
    # Case-insensitive search for the testing-file flag in the current item.
    ind = temp.upper().find('-TESTING=')
    if not ind == -1:
        # NOTE(review): the slice drops the first 10 characters although
        # '-TESTING=' is 9 characters long and the match position `ind` is
        # ignored -- presumably the flag is written with one extra leading
        # character (e.g. '--testing='); confirm.
        testingFilename = str(temp[10:])
        continue
'''
Reading the training data
'''
# The context manager closes the file even when parsing raises.
with open(trainingFilename, 'r') as f:
    trainingdata = json.load(f)
traininglabels = array(trainingdata['labels'])
trainingpoints = array(trainingdata['points'])

# get the vectors using k-means
vectors = k_means(trainingpoints, N, True)
'''
***************************** PART 1 *****************************
'''

# preparing the data for matplot
if printPlot:
    import matplotlib.pyplot as plt

    # first and second coordinate of every training point
    x = []
    y = []

    for point in trainingpoints:
        x.append(point[0])
        y.append(point[1])
コード例 #7
0
ファイル: k_means_main.py プロジェクト: anushka018/k-means
from image_utils import *
from k_means import *

if __name__ == "__main__":
    # Ask for the source image, the number of colours and the output name.
    source_name = input(
        "please enter a image filename to run the k-means clustering algorithm on:"
    )
    color_count = int(
        input(
            "please enter a value for k, the number of colors you would like to cluster the image into:"
        ))
    target_name = input(
        "what would you like to name the file that contains the clustered image? (end name with .ppm)"
    )

    image = read_ppm(source_name)
    result = k_means(image, color_count)

    # result[0] is indexed by cluster and supplies the replacement value;
    # result[1] is indexed like the image and holds each pixel's cluster.
    for cluster in range(len(result[0])):
        for row in range(len(result[1])):
            for col in range(len(result[1][0])):
                if result[1][row][col] != cluster:
                    continue
                image[row][col] = result[0][cluster]

    save_ppm(target_name, image)
コード例 #8
0
import numpy as np
import k_means
from k_means import k_means
# 1000 random 2-D points, scaled by 100.
x = np.random.rand(1000, 2) * 100

# Fit and visualize once per centroid-initialization option, in this order.
for init_option in ('random', 'initial_centroids_from_points'):
    kmeans = k_means(number_of_clusters=5,
                     number_of_iteration=10,
                     init=init_option)
    points_with_centroid_index, centroids = kmeans.fit(x)
    kmeans.visualize()
コード例 #9
0
# Angad Cheema cheem011
# Project 1, run_k_means module
# function for taking a file from the user and running k means
from k_means import *
'''
takes file name, k and output file name as input
applies k means function, then assigns each pixel to the average color of its respective cluster
saves the new image to the output file
'''

# Collect the run parameters from the user.
source_name = input("Input image file name: ")
color_count = int(input("Input number of colors: "))
target_name = input("Input output file name: ")

image = read_ppm(source_name)
# k-means yields the cluster means and a per-pixel cluster index
means_list, assignments = k_means(image, color_count)

width, height = get_width_height(image)
for i in range(width):
    for j in range(height):
        # replace the pixel with the mean of the cluster it belongs to
        cluster = assignments[i][j]
        image[i][j] = means_list[cluster]

save_ppm(target_name, image)
print("Process completed, image saved to " + target_name)
コード例 #10
0
# Driver fragment: generate points, build an MST over the leading sample of
# them, run k-means on all points, then start a nesting check of the sample
# clusters' convex hulls against the cluster centroids. The check loop is cut
# off at the end of this excerpt.
gen = data_gen(center, N, K)  # data generator
dr = data_drawer(K)
ch = convex_hull()
m = MST(100, K)
# Alternative data sources kept for experimentation (disabled):
#mass = gen.normal_gen()
#mass = gen.nested_data()
mass = gen.get_random_data()  # all points
#mass = gen.get_real_random_data()
#mass = gen.strip_data()
sample = mass[:sample_N]  # first sample_N points form the sample
# A second generator object is used only as a holder for the sample so that
# its get_edges() works on the sample instead of on all points.
smpl = data_gen(center, sample_N, K)
smpl.mass = sample
s_edges = smpl.get_edges()
s_edges.sort()  # sorted in place before being handed to the MST builder
mst = m.get_MST(s_edges)
km_obj = k_means(mass, K)  # k-means over all points, not just the sample
#mass = gen.strip_data()
#convex = ch.convex_hull(result[0])
#dr.draw_MST(mst, mass)
#dr.draw_data(mass)

sample_clusters = m.get_clusters(
    sample)  # get clusters from hierarchical clustering
centroids = m.get_centroids()  # get center of clusters
res = 0

# nesting check: one convex hull per sample cluster, tested per centroid
for k in range(K):
    convex = ch.convex_hull(sample_clusters[k])
    for i in centroids:
        if len(convex) != 0:
コード例 #11
0
ファイル: main.py プロジェクト: niesmo/school

'''
Reading the training data
'''
# The context manager closes the file even when parsing raises.
with open(trainingFilename, 'r') as f:
  trainingdata = json.load(f)
traininglabels = array(trainingdata['labels'])
trainingpoints = array(trainingdata['points'])

# get the vectors using k-means
vectors = k_means(trainingpoints, N, True)

'''
***************************** PART 1 *****************************
'''

# preparing the data for matplot
if printPlot:
  import matplotlib.pyplot as plt

  # first and second coordinate of every training point
  x = []
  y = []

  for point in trainingpoints:
    x.append(point[0])
    y.append(point[1])
コード例 #12
0
#Ahmed Hassasn
from image_utils import *
from k_means import *

if __name__ == "__main__":
    # Ask for the input image, the k value and the output file.
    source_name = input(
        "What is the file name of the image you'd like to change?: ")
    cluster_count = int(input("What is the K value you would like to use?: "))
    target_name = input(
        "What is the file name of the image you'd like to output to?: ")

    image = read_ppm(source_name)
    # The dimensions are fetched here but not used further below.
    width, height = get_width_height(image)

    # Whatever k_means returns is written out as the new image.
    clustered = k_means(image, cluster_count)
    save_ppm(target_name, clustered)
コード例 #13
0
ファイル: console.py プロジェクト: yys123456/Algorithm
 def k_means(self, dataList, k, iteration):
     """Run k-means on ``dataList`` and return ``(converted output, image)``.

     The ``k_means`` called in the body is the module-level name, not this
     method: a bare name inside a method is not looked up on the class.
     """
     model = k_means(k, iteration, dataList)
     raw_output = model.getOutPut()
     converted = tranDF2list(raw_output)
     image = model.getImg()
     return converted, image
コード例 #14
0
def main(degree=4,
         cache_data=False,
         use_cached_data=False,
         use_all_data=True,
         plot=True,
         windowed=True):
    """Cluster windowed data with two layers of k-means and group symbols.

    Layer 1 clusters, separately for each coefficient, the absolute values
    of the vectors returned by ``poly_fit``; for each coefficient the k in
    ``[k_min, k_max)`` with the lowest final cost is kept. Each window's
    layer-1 labels are then concatenated as one-hot vectors and layer 2
    clusters those vectors with the same k sweep. Cost curves are saved
    under ``plots/``.

    Args:
        degree: Not used by this function.
        cache_data: Forwarded to ``get_fetched_data``.
        use_cached_data: Forwarded to ``get_fetched_data``.
        use_all_data: Forwarded to ``get_fetched_data``.
        plot: Not used by this function.
        windowed: Not used by this function.

    Returns:
        A numpy array with one entry per non-empty layer-2 cluster; each
        entry lists the symbols (``data[i][1]``) of the windows assigned to
        that cluster.
    """
    total_start = time.time()
    fetched_data = get_fetched_data(cache_data=cache_data,
                                    use_cached_data=use_cached_data,
                                    use_all_data=use_all_data)

    # Slice data into windows
    data, window_size = scraper.slice_windows(fetched_data)
    fft_data, z_list = poly_fit(data, window_size)

    # window index -> symbol (second field of each window record)
    symbol_dict = {i: data[i][1] for i in range(len(data))}

    # Regroup as [coefficient, window, value].
    # NOTE(review): assumes at most six coefficient rows per window -- TODO
    # confirm against poly_fit.
    total_freqs_data = np.zeros((6, len(data), fft_data[0].shape[1]))
    for v_ind, fft_vec in enumerate(fft_data):
        for c_ind, coeff_vec in enumerate(fft_vec):
            total_freqs_data[c_ind, v_ind] = np.abs(coeff_vec)

    k_min = 1000
    k_max = 1050
    label_list = []
    k_list = []

    # ---- Layer 1: one clustering per coefficient -------------------------
    for c_ind, freq_data in enumerate(total_freqs_data):
        cost_k_list = []
        for k in range(k_min, k_max):
            if k % 5 == 0:
                print("k is {0} of {1} for coefficient {2}".format(
                    k - k_min, k_max - k_min, c_ind))
            clusters, label, cost_list = k_means(freq_data, k)
            cost_k_list.append(cost_list[-1])
        # cost_k_list[i] was measured with k = k_min + i, so the best k is
        # the argmin shifted by k_min (no extra +1).
        opt_k = int(np.argmin(cost_k_list)) + k_min

        plt.plot(range(k_min, k_max), cost_k_list, 'g^')
        plt.plot(opt_k, min(cost_k_list), 'rD')

        plt.title('Cost vs Number of Clusters')
        plt.savefig('plots/kmeans_{0}_ks.png'.format(c_ind), format='png')
        plt.close()

        # Final clustering of this coefficient with the selected k.
        X = freq_data
        clusters, label, cost_list = k_means(X, opt_k)
        label_list.append(np.array(label))
        k_list.append(opt_k)

    # ---- One-hot encode the layer-1 labels of every window ---------------
    label_list = np.array(label_list)
    total_width = sum(k_list)
    data_list = []
    for d_ind in range(label_list.shape[1]):
        datum = label_list[:, d_ind]
        out_vec = np.zeros((total_width))
        offset = 0
        # Each coefficient owns a segment of width k_list[c]; set the slot
        # of the window's label inside that segment.
        for cluster_label, segment_width in zip(datum, k_list):
            out_vec[offset + cluster_label] = 1
            offset += segment_width
        data_list.append(out_vec)

    # ---- Layer 2: cluster the one-hot vectors -----------------------------
    data_list = np.array(data_list)
    cost_k_list = []
    k_min = 1000
    k_max = 1050
    for k in range(k_min, k_max):
        if k % 50 == 0:
            print("on {0} of {1}".format(k, k_max))
        clusters, label, cost_list = k_means(data_list, k)
        cost_k_list.append(cost_list[-1])
    # Same index-to-k mapping as in layer 1.
    opt_k = int(np.argmin(cost_k_list)) + k_min

    # Final layer-2 clustering runs on the vectors the sweep was done on
    # (not on X, the last coefficient's layer-1 input).
    clusters, label, cost_list = k_means(data_list, opt_k)

    plt.plot(range(k_min, k_max), cost_k_list, 'g^')
    plt.plot(opt_k, min(cost_k_list), 'rD')

    plt.title('Cost vs Number of Clusters')
    plt.savefig('plots/kmeans_layer2_ks.png', format='png')
    plt.close()

    # ---- Group the symbols by layer-2 cluster ------------------------------
    label_symbols = []
    total = 0
    for i in range(opt_k):
        ind_arr = np.where(label == i)[0]
        if len(ind_arr) > 0:
            cluster_syms = [symbol_dict[ind] for ind in ind_arr]
            label_symbols.append(cluster_syms)
            total += len(cluster_syms)
    print(total)
    try:
        label_symbols = np.array(label_symbols)
    except ValueError:
        # Clusters of different sizes form a ragged list, which NumPy >= 1.24
        # refuses to convert implicitly; build the object array that older
        # versions produced.
        label_symbols = np.array(label_symbols, dtype=object)
    print(label_symbols)
    print("================================\n Total elapsed time: {}".format(
        time.time() - total_start))
    return label_symbols
コード例 #15
0
    return parameters


if __name__ == '__main__':
    # Run configuration; read below by key.
    param = getParameters()

    # Menu (printed in French): 1 = load the iris data, 2 = run k-means on
    # randomly generated data, 3 = try the elbow method on random data.
    print("################# kmeans ######################")
    print("1 - charger les données iris")
    print("2 - appliquer kmeans sur des données générées aléatoirement")
    print("3 - tester la methode Elbow sur des données aleatoires")
    # int() raises ValueError when the answer is not a number.
    mode = int(input("choisissez le mode: "))

    if mode == 1:
        # Mode 1: k-means on the iris data set, flagged with iris=True.
        points = read_iris_data()
        k_means(points,
                param['numberOfClusters'],
                param['maxNumberOfRepetitions'],
                iris=True)
        # The message points the user to the iris_centers and iris_results
        # files for the outcome.
        print(
            "Verifiez les fichiers iris_centers et iris_results pour voir les resultats de l'appel."
        )

    elif mode == 2:
        # Mode 2: k-means on points generated from the configured count,
        # dimension and min/max random bounds.
        points = generatepoints(param['numberOfDatas'], param['dataDimension'],
                                param['minRandomNumber'],
                                param['maxRandomNumber'])
        k_means(points, param['numberOfClusters'],
                param['maxNumberOfRepetitions'])
        # The message points the user to the centroids and results files.
        print(
            "Verifiez les fichiers centroids et results pour voir les resultats de l'appel"
        )
コード例 #16
0
ファイル: route.py プロジェクト: joelcarlss/2DV515-A2
    def get(self):
        """Cluster the blog data and return the named result under 'res'."""
        blog_data, blog_names = get_data()
        clusters = k_means(blog_data)
        # Combine the clustering result with the blog names.
        return {'res': elements_for_names(clusters, blog_names)}
コード例 #17
0
from image_utils import *
from k_means import *

# inputs for the image and k
source_name = input("Enter image file > ")
color_count = int(input('Enter how many colors > '))

print('...processing...')
image = read_ppm(source_name)

# run k-means; its return value is used as the new image
clustered_image = k_means(image, color_count)
print('New modified image generated')

# write the result to the fixed output file "newImage.ppm"
save_ppm("newImage.ppm", clustered_image)
print('New modified image saved')