Example #1
 def __init__(self, n_clusters=40):
     self.n_clusters = n_clusters
     self.kmeans_obj = KMeans(n_clusters=n_clusters)
     self.kmeans_ret = None
     self.descriptor_vstack = None
     self.mega_histogram = None
     self.clf = GaussianNB()
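
Example #1 shows only the constructor of a bag-of-visual-words helper. For context, here is a minimal usage sketch of how such fields typically interact; the descriptor array, the 50-descriptors-per-image split, and the labels are toy assumptions, not the original project's data.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB

n_clusters = 40
descriptors = np.random.rand(500, 128)      # stand-in for stacked local descriptors
image_labels = np.random.randint(0, 3, 10)  # stand-in class labels for 10 images

kmeans = KMeans(n_clusters=n_clusters, n_init=10).fit(descriptors)

# One histogram of visual words per image (50 descriptors per image here).
mega_histogram = np.array([
    np.bincount(kmeans.predict(descriptors[i * 50:(i + 1) * 50]), minlength=n_clusters)
    for i in range(10)
])
clf = GaussianNB().fit(mega_histogram, image_labels)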
Example #2
def bestKElbow(X, options):
    K = 2
    finish = 10
    kmeans_list = []
    fits = []
    porcentage = []
    tolerance = 10  # 10% tolerance
    finished = False
    best_k = 0

    # k-means with k=2
    kmeans = km.KMeans(X, K, options)
    kmeans_list.append(kmeans)
    fits.append(kmeans.fitting())

    K += 1

    # k-means with k=3
    kmeans = km.KMeans(X, K, options)
    kmeans_list.append(kmeans)
    fits.append(kmeans.fitting())
    K += 1

    # Store the difference and assign it a percentage of 100%
    porcentage.append((fits[0] - fits[1], 100))

    while K <= finish and not finished:
        best_k = 0
        kmeans = km.KMeans(X, K, options)
        kmeans_list.append(kmeans)
        fits.append(kmeans.fitting())
        K += 1

        # subtract the latest fit from the previous one (the 2 most recent k-means runs)
        resta = fits[-2] - fits[-1]
        # compute the percentage relative to the first difference
        first_resta = porcentage[0][0]
        if first_resta == 0:
            first_resta = 0.000000000001
        porcentage.append((resta, resta * 100 / first_resta))

        # if the last 2 percentages are below the tolerance, take the k preceding those 2
        if (porcentage[-2][1] < tolerance and porcentage[-1][1] < tolerance):
            # debug: left the loop via the tolerance criterion
            finished = True
            # index arithmetic: k starts at 2 and we ran 2 k's past the elbow
            best_k = K - 2 - 2 - 1

    # If the tolerance criterion never triggered, find the largest percentage drop and pick its second K
    if best_k == 0:
        max_porcentage = porcentage[1][1]
        porcentage_pos = 1
        for pos, x in enumerate(porcentage[2:], start=2):  # start=2 keeps pos aligned with the full list
            if x[1] > max_porcentage:
                max_porcentage = x[1]
                porcentage_pos = pos

        best_k = porcentage_pos

    return kmeans_list[best_k]
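
The same elbow idea in a minimal sketch using scikit-learn's inertia_ (within-cluster SSE); the random data and the 10% threshold are illustrative assumptions, not the project's fitting() measure.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(300, 2)
inertias = [KMeans(n_clusters=k, n_init=10).fit(X).inertia_ for k in range(2, 11)]
drops = -np.diff(inertias)           # improvement contributed by each extra cluster
percent = 100 * drops / drops[0]     # relative to the first drop, as above
below = np.nonzero(percent < 10)[0]  # ks whose relative gain falls under 10%
if len(below):
    best_k = 2 + int(below[0])            # first k whose extra gain is negligible
else:
    best_k = 2 + int(np.argmax(percent))  # fall back to the largest relative drop
print(best_k)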
Example #3
def evaluate_adjusted_rand_score(data, threshold, random):
    full_data = Parser.parse_data(only_answers=False)
    labels_true = Parser.get_true_party_assignment(full_data)
    number_of_parties = 11
    assignment = KMeans.kmeans(data, number_of_parties, threshold, random)
    labels_pred = KMeans.get_centroid_labels(data, assignment)
    return adjusted_rand_score(labels_true, labels_pred)
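
adjusted_rand_score above is scikit-learn's sklearn.metrics.adjusted_rand_score; a tiny self-contained check with toy labels (not the parser's real party data):

from sklearn.metrics import adjusted_rand_score

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 2, 2]  # same grouping, permuted label names
print(adjusted_rand_score(labels_true, labels_pred))  # 1.0: ARI ignores label names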
Example #4
def testGapStatistic():
    count = 10
    # X = np.array(np.loadtxt("data/kmeans1.txt", delimiter = "\t"));
    # X = np.mat(np.load("/media/WindowsE/Data/PARS/JNLH/YuanLiaoDaiShui/trainSet/train_all_75.npy"))[:, 68];#57
    # X = np.mat(np.load("/media/WindowsD/WorkSpace/data.npy"))[:, 0];
    X = np.load("/media/WindowsE/Data/PARS/JNLH/ReasonAnalysis/30/2020-08-01/data.npy")[:, 1].reshape(-1, 1)

    plt.figure(1, (12, 8))
    plt.get_current_fig_manager().window.showMaximized()
    plt.hist(X, bins = 1000)
    plt.show(block=True)

    selector1 = KMeans.GapStatistic(20, True)
    selector2 = KMeans.ElbowMethod(True)
    index, distance, center = KMeans.KMeans.optimalK(X, 5, count, KMeans.CombinationOptimalKSelector([selector1, selector2]))

    # plt.figure(1, (12, 8));
    # plt.get_current_fig_manager().window.showMaximized();
    #
    # colors = ["g", "b", "y", "k", "r", "m", "c"];
    # markers = ["*", "+", "D", "s", "h", "v", "d"];
    #
    # for i in range(0, len(center)):
    #     plt.scatter(X[index == i, 0], X[index == i, 1], color = colors[i], marker = markers[i]);
    #     plt.scatter(center[i, 0], center[i, 1], color = ["r"], marker = "x");
    #
    # plt.show(block=True);

    plt.figure(1, (12, 8))
    plt.get_current_fig_manager().window.showMaximized()
    plt.hist(X, bins=1000)
    for i in range(len(center)):
        plt.axvline(center[i, 0], color = "r")
    plt.show(block=True)
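
The test above overlays 1-D cluster centers on a histogram. The same picture in a sketch with scikit-learn and synthetic data, since the original .npy paths are local to the author's machine:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X = np.concatenate([np.random.normal(0, 1, 500),
                    np.random.normal(6, 1, 500)]).reshape(-1, 1)
centers = KMeans(n_clusters=2, n_init=10).fit(X).cluster_centers_

plt.hist(X, bins=100)
for c in centers:
    plt.axvline(c[0], color="r")  # mark each 1-D cluster center
plt.show(block=True)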
Example #5
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """

    #########################################################
    ##  YOU MUST ADAPT THE CODE IN THIS FUNCTIONS TO:
    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    ##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    ##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    #########################################################

    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'ColorNaming'.lower():
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'RGB'.lower():
        pass
        # im = np.reshape(im, (-1, im.shape[2]))
    elif options['colorspace'].lower() == 'Lab'.lower():
        im = color.rgb2lab(im)
    elif options['colorspace'].lower() == 'HSV'.lower():
        im = color.rgb2hsv(im)

    ##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

    ##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'RGB'.lower():
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    elif options['colorspace'].lower() == 'Lab'.lower():
        kmeans.centroids = kmeans.centroids[:, newaxis, :]
        kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255.0
        kmeans.centroids = np.reshape(
            kmeans.centroids,
            (kmeans.centroids.shape[0], kmeans.centroids.shape[2]))
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    elif options['colorspace'].lower() == 'HSV'.lower():
        kmeans.centroids = kmeans.centroids[:, newaxis, :]
        kmeans.centroids = color.hsv2rgb(kmeans.centroids)
        kmeans.centroids = np.reshape(
            kmeans.centroids,
            (kmeans.centroids.shape[0], kmeans.centroids.shape[2]))
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
Example #6
    def KMeans(self, X):
        kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

        kmeans.labels_                    # cluster index assigned to each training sample
        kmeans.predict([[0, 0], [4, 4]])  # assign new points to the nearest centroid
        kmeans.cluster_centers_           # coordinates of the fitted centroids

        return kmeans
Example #7
    def test_bikmeans(self):
        data_mat = mat(KMeans.loadDataSet("testSet2.txt"))
        k = 3
        centList, clusterAssment = KMeans.biKmeans(data_mat, k)
        print("\n centList == %s" % (centList))
        print("\n clusterAssment == %s" % (clusterAssment))

        # plot
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # use locals() to define the per-cluster lists dynamically
        for i in arange(k):
            locals()['xCluster' + str(i)] = []
            locals()['yCluster' + str(i)] = []

        for i in range(shape(clusterAssment)[0]):
            locals()['xCluster' + str(int(clusterAssment[i, 0]))].append(
                data_mat[i, 0])
            locals()['yCluster' + str(int(clusterAssment[i, 0]))].append(
                data_mat[i, 1])
        print("\n xCluster0 == %s" % (locals()['xCluster' + str(0)]))
        print("\n yCluster0 == %s" % (locals()['yCluster' + str(0)]))

        for i in arange(k):
            ax.scatter(locals()['xCluster' + str(i)],
                       locals()['yCluster' + str(i)],
                       s=30,
                       c='orange',
                       marker='s')

        # for i in range(shape(clusterAssment)[0]):
        #     if clusterAssment[i, 0] == 0:
        #         xCluster0.append(data_mat[i, 0])
        #         yCluster0.append(data_mat[i, 1])
        #     elif clusterAssment[i, 0] == 1:
        #         xCluster1.append(data_mat[i, 0])
        #         yCluster1.append(data_mat[i, 1])
        #     elif clusterAssment[i, 0] == 2:
        #         xCluster2.append(data_mat[i, 0])
        #         yCluster2.append(data_mat[i, 1])
        #     elif clusterAssment[i, 0] == 3:
        #         xCluster3.append(data_mat[i, 0])
        #         yCluster3.append(data_mat[i, 1])
        # ax.scatter(xCluster0, yCluster0, s=30, c='orange', marker='s')
        # ax.scatter(xCluster1, yCluster1, s=30, c='red', marker='p')
        # ax.scatter(xCluster2, yCluster2, s=30, c='blue', marker='*')
        # ax.scatter(xCluster3, yCluster3, s=30, c='black', marker='d')

        # plot the centroids
        xcord2 = centList[:, 0].A
        ycord2 = centList[:, 1].A
        ax.scatter(xcord2, ycord2, s=100, c='red', marker='+')
        plt.show()
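
Writing new keys into locals() happens to work in CPython but is fragile; a plain dict gives the same dynamic grouping. A minimal sketch with toy points and assignments (not the test's real dataset):

import numpy as np

data = np.array([[0.0, 0.1], [0.2, 0.3], [5.0, 5.1], [5.2, 5.3]])
assignment = np.array([0, 0, 1, 1])         # cluster index per point

clusters = {i: ([], []) for i in range(2)}  # cluster index -> (xs, ys)
for (x, y), c in zip(data, assignment):
    clusters[c][0].append(x)
    clusters[c][1].append(y)
print(clusters[0])  # ([0.0, 0.2], [0.1, 0.3])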
Example #8
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """

    #########################################################
    ##  YOU MUST ADAPT THE CODE IN THIS FUNCTIONS TO:
    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    ##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    ##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    #########################################################

    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'] == 'ColorNaming':
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'] == 'Lab':
        im = color.rgb2lab(im)
    # RGB needs no conversion here

    img = rescale(im, 1, preserve_range=True)  # rescale, otherwise it blows up
    img = np.reshape(img, (-1, img.shape[2]))

    ##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        kmeans = km.KMeans(img, 0, options)  # use the reshaped image, matching the else branch
        kmeans.bestK()
    else:
        kmeans = km.KMeans(img, options['K'], options)
        kmeans.run()

##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'] != 'ColorNaming':
        kmeans.centroids = np.reshape(
            kmeans.centroids,
            (-1, 1, kmeans.centroids.shape[1]
             ))  # another reshape; without it the test fails with Lab or RGB
        if options['colorspace'] == 'Lab':
            kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255  # the extra step
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
        kmeans.centroids = np.reshape(kmeans.centroids,
                                      (-1, kmeans.centroids.shape[2]))

#########################################################
##  THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
#########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
Example #9
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """

    #########################################################
    ##  YOU MUST ADAPT THE CODE IN THIS FUNCTIONS TO:
    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    ##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    ##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    #########################################################
    #im = im.astype('uint8')
    ##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'ColorNaming'.lower():
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'RGB'.lower():
        pass  #im = color.convert_colorspace(im, 'RGB', options['colorspace'])
    elif options['colorspace'].lower() == 'Lab'.lower():
        im = im.astype('float64')
        im = color.rgb2lab(im / 255)
    elif options['colorspace'].lower() == 'HSV'.lower():
        im = color.rgb2hsv(im.astype('uint8'))

##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()
    else:
        kmeans = km.KMeans(im, options['K'], options)
        kmeans.run()

##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'RGB'.lower():
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    elif options['colorspace'].lower() == 'Lab'.lower():
        kmeans.centroids = color.lab2rgb([kmeans.centroids])[0] * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'HSV'.lower():
        kmeans.centroids = color.hsv2rgb([kmeans.centroids])[0]
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

#########################################################
##  THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
#########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
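
Note that skimage's rgb2lab expects RGB floats in [0, 1], which is why the Lab branch above divides by 255 and the inverse multiplies back. A quick round-trip sketch on a single pixel (assuming scikit-image is installed):

import numpy as np
from skimage import color

rgb = np.array([[[255.0, 0.0, 0.0]]]) / 255.0  # one red pixel scaled to [0, 1]
lab = color.rgb2lab(rgb)
back = color.lab2rgb(lab) * 255.0              # back to the 0-255 range
print(lab[0, 0], back[0, 0])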
Example #10
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write(
            'This directory holds a cache of reconciliation graph for the TreeLife data set'
        )
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        f = open(cache_location, 'w+')
        f.write(repr(DictGraph))
        f.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    DictGraph = eval(f.read())
    f.close()

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [
        ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps
    ]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for i in xrange(1, max_k + 1):
        print 'k = %d' % i
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
Example #11
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """
    """
    Aquesta és la funció principal en la que es basa tot el projecte. 
    Aqui s'ha de transformar la imatge donada a un espai de tres dimensions per
    tal que es pugui treballar amb el nostre algorisme K-Means.
    Després es troben les etiquetes de color a partir dels centroids i es determina
    el percentatge d'èxit que ha tingut la nostra execució.
    """

    ##  1- Convert the given color space so it works with our algorithm.
    if options['colorspace'].lower() == 'ColorNaming'.lower():
        im = cn.ImColorNamingTSELabDescriptor(im)

    elif options['colorspace'].lower() == 'RGB'.lower():
        pass

    elif options['colorspace'].lower() == 'Lab'.lower():
        im = color.rgb2lab(im)

##  2- Check the K parameter to decide whether or not to run bestK.
##      We make this call if and only if K=0; if K=1, bestK is run as well.

    if options['K'] < 2:
        kmeans = km.KMeans(im, 0, options)
    else:
        kmeans = km.KMeans(im, options['K'], options)

    kmeans.run()

    ##  3- Get the color labels of our image.
    if options['colorspace'].lower() == 'Lab'.lower():
        kmeans.centroids = color.lab2rgb([kmeans.centroids])[0] * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

    elif options['colorspace'].lower() == 'RGB'.lower():
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)

#########################################################
##  THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
#########################################################
    colors, which = getLabels(kmeans, options)
    return colors, which, kmeans
Example #12
 def __init__(self, n_cluster, data, use_kmeans=True, w=0.5, c1=0.8, c2=0.6):
     index = np.random.choice(list(range(len(data))), n_cluster)
     self.centroids = data[index].copy()
     if use_kmeans:
         kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
         kmeans.fit(data)
         self.centroids = kmeans.centroid.copy()
     self.best_position = self.centroids.copy()
     self.best_score = quantization_error(self.centroids, self._predict(data), data)
     self.best_sse = calc_sse(self.centroids, self._predict(data), data)
     self.velocity = np.zeros_like(self.centroids)
     self._w = w
     self._c1 = c1
     self._c2 = c2
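
quantization_error and calc_sse are project helpers; the sketch below shows an assumed definition of the within-cluster SSE that such a helper typically computes, for illustration only.

import numpy as np

def calc_sse(centroids, labels, data):
    # Sum of squared distances from each point to its assigned centroid.
    return sum(np.sum((data[labels == i] - c) ** 2)
               for i, c in enumerate(centroids))

data = np.array([[0.0, 0.0], [1.0, 1.0], [9.0, 9.0]])
centroids = np.array([[0.5, 0.5], [9.0, 9.0]])
labels = np.array([0, 0, 1])
print(calc_sse(centroids, labels, data))  # 1.0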
Example #13
    def test_rand_cent1(self):
        data_mat = mat(KMeans.loadDataSet("testSet.txt"))
        min0 = min(data_mat[:, 0])
        max0 = max(data_mat[:, 0])
        print("\n min0 == %s max0 == %s" % (min0, max0))
        min1 = min(data_mat[:, 1])
        max1 = max(data_mat[:, 1])
        print("\n min1 == %s max1 == %s" % (min1, max1))

        cent = KMeans.randCent(data_mat, 2)
        print("\n cent == %s" % (cent))

        dist_eclud = KMeans.distEclud(data_mat[0], data_mat[1])
        print("\n dist_eclud == %s" % (dist_eclud))
Example #14
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """

#########################################################
##  YOU MUST ADAPT THE CODE IN THIS FUNCTIONS TO:
##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
#########################################################

##  1- CHANGE THE IMAGE TO THE CORRESPONDING COLOR SPACE FOR KMEANS
    if options['colorspace'].lower() == 'ColorNaming'.lower():  
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'].lower() == 'RGB'.lower():        
        pass  # already in RGB
    elif options['colorspace'].lower() == 'Lab'.lower():        
        im = color.rgb2lab(im)
        im = np.array(im).reshape((-1,3))


##  2- APPLY KMEANS ACCORDING TO 'OPTIONS' PARAMETER
    if options['K'] < 2:  # find the best K
        kmeans = km.KMeans(im, 0, options)
        kmeans.bestK()  # bestK according to the fitting score
    else:
        kmeans = km.KMeans(im, options['K'], options) 
        kmeans.run()

##  3- GET THE NAME LABELS DETECTED ON THE 11 DIMENSIONAL SPACE
    if options['colorspace'].lower() == 'RGB'.lower():        
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(kmeans.centroids)
    elif options['colorspace'].lower() == 'Lab'.lower():
        pass
        

#########################################################
##  THE FOLLOWING 2 END LINES SHOULD BE KEPT UNMODIFIED
#########################################################
    colors, which = getLabels(kmeans, options)   
    return colors, which, kmeans
Example #15
    def test1(self):
        print "TEST 1:----------------------------------------------------------------"
        features = np.array([[1.9, 2.3],
                          [1.5, 2.5],
                          [0.8, 0.6],
                          [0.4, 1.8],
                          [0.1, 0.1],
                          [0.2, 1.8],
                          [2.0, 0.5],
                          [0.3, 1.5],
                          [1.0, 1.0]])
        whitened = whiten(features)
        book = np.array((whitened[0], whitened[2]))
        numpy_result = kmeans(whitened, book)[0]
        print numpy_result
        print ""

        features2 = np.array([[1.9, 2.3,0],
                             [1.5, 2.5,0],
                             [0.8, 0.6,0],
                             [0.4, 1.8,0],
                             [0.1, 0.1,0],
                             [0.2, 1.8,0],
                             [2.0, 0.5,0],
                             [0.3, 1.5,0],
                             [1.0, 1.0,0]])
        whitened2 = whiten(features2)
        book2 = [whitened[0], whitened[2]]
        our_result = np.array(KMeans.k_means2(whitened2.tolist(), 2, book2).centroids)[:, :-1]
        print our_result
Example #16
    def fit(self, data):

        # step1 construct weight matrix for every point
        #weight = kneighbors_graph(data, n_neighbors = self.n_neighbors_,mode='distance', include_self = False)
        weight = kneighbors_graph(data, n_neighbors = self.n_neighbors_,mode='connectivity', include_self = False)
        weight = 0.5 * (weight + weight.T)
        self.weight_ = weight.toarray()
        self.degree_ = np.diag(np.sum(self.weight_, axis = 0).ravel()) 

        # step2 construct Laplacian matrix for every point, and normalize
        self.laplacians_ = self.degree_ - self.weight_
        #unit_arrary = np.ones([data.shape[0],data.shape[0]],dtype=np.float64)
        #with np.errstate(divide='ignore'): 
        #    degree_nor = unit_arrary/np.sqrt(self.degree_) 
        #    degree_nor[self.degree_ == 0] = 0
        degree_nor=np.sqrt(np.linalg.inv(self.degree_))
        self.laplacians_ = np.dot(degree_nor, self.laplacians_)  
        self.laplacians_ = np.dot(self.laplacians_, degree_nor)#normalize

        #step3 compute the k smallest eigenvalues and their eigenvectors, then normalize
        eigen_values, eigen_vector  = np.linalg.eigh(self.laplacians_)
        sort_index = eigen_values.argsort()
        eigen_vector = eigen_vector[:,sort_index]
        self.eigen_vector_ = np.asarray([eigen_vector[:,i] for i in range(self.n_clusters_)]).T
        #self.eigen_vector_ /= np.sqrt(np.sum(self.eigen_vector_**2, axis = 1)).reshape(data.shape[0], 1 )
        self.eigen_vector_ /= np.linalg.norm(self.eigen_vector_, axis=1).reshape(data.shape[0], 1 )
        
        #step4  kmeans with eigenvectors 
        spectral_kmeans = KMeans.K_Means(n_clusters=self.n_clusters_)
        spectral_kmeans.fit(self.eigen_vector_)
        spectral_label = spectral_kmeans.predict(self.eigen_vector_)
        self.label_ = spectral_label
        self.fitted = True
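
The same normalized spectral clustering recipe (kNN graph, normalized Laplacian, smallest eigenvectors, k-means) in a compact sketch; scikit-learn's KMeans stands in for the project's K_Means class, and the data is synthetic.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph

X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 5])
W = kneighbors_graph(X, n_neighbors=5, mode='connectivity').toarray()
W = 0.5 * (W + W.T)                            # symmetrize the adjacency
D = np.diag(W.sum(axis=0))
D_inv_sqrt = np.sqrt(np.linalg.inv(D))
L_sym = D_inv_sqrt @ (D - W) @ D_inv_sqrt      # normalized Laplacian

vals, vecs = np.linalg.eigh(L_sym)
U = vecs[:, np.argsort(vals)[:2]]              # two smallest eigenvectors
U /= np.linalg.norm(U, axis=1, keepdims=True)  # row-normalize, as in fit() above
print(KMeans(n_clusters=2, n_init=10).fit_predict(U))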
Example #17
def clusterClubs(numClust=5):
    datList = []
    for line in open('places.txt').readlines():
        lineArr = line.split('\t')
        datList.append([float(lineArr[4]), float(lineArr[3])])
    datMat = np.mat(datList)
    # cluster with the bisecting k-means algorithm
    myCentroids, clustAssing = KMeans.biKmeans(datMat,
                                               numClust,
                                               distMeas=distSLC)
    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[np.nonzero(clustAssing[:, 0].A == i)[0], :]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0], \
                    ptsInCurrCluster[:, 1].flatten().A[0], \
                    marker = markerStyle, s=90)
    for i in range(numClust):
        ax1.scatter(myCentroids[i].tolist()[0][0],
                    myCentroids[i].tolist()[0][1],
                    s=300,
                    c='k',
                    marker='+',
                    alpha=.5)
    plt.show()
Example #18
def processImage(im, options):
    """@brief   Finds the colors present on the input image
    
    @param  im      LIST    input image
    @param  options DICTIONARY  dictionary with options
    
    @return colors  LIST    colors of centroids of kmeans object
    @return indexes LIST    indexes of centroids with the same label
    @return kmeans  KMeans  object of the class KMeans
    """

    if options['colorspace'].lower() == 'ColorNaming'.lower():
        im = cn.ImColorNamingTSELabDescriptor(im)

    elif options['colorspace'].lower() == 'RGB'.lower():
        pass
    elif options['colorspace'].lower() == 'Lab'.lower():
        im = im.astype('float64')
        im = color.rgb2lab(im / 255)

    kmeansAlgorithm = km.KMeans(im, options['K'], options)
    kmeansAlgorithm.run()

    if options['colorspace'].lower() == 'RGB'.lower():
        kmeansAlgorithm.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeansAlgorithm.centroids)

    elif options['colorspace'].lower() == 'Lab'.lower():
        kmeansAlgorithm.centroids = color.lab2rgb([kmeansAlgorithm.centroids
                                                   ])[0] * 255
        kmeansAlgorithm.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeansAlgorithm.centroids)

    colors_obt, which_obt = getLabels(kmeansAlgorithm, options)
    return colors_obt, which_obt, kmeansAlgorithm
Example #19
def runExperimentsForKMeans(numClusters, numExperiments, useTFIDF=True):
    maxSim = 0
    experiment = 0
    for i in range(numExperiments):
        print "\nExperiment " + str(i) + " for " + str(
            numClusters) + " clusters"
        [clusterAssignment,
         similarity] = KMeans.findClusterAssignment(numClusters, useTFIDF)
        outputFileName = "output/KMeans" + str(numClusters) + "-" + str(i)
        KMeans.printDocumentClusters(clusterAssignment, outputFileName)
        if maxSim < similarity:
            maxSim = similarity
            experiment = i
    print "Max Similarity over " + str(
        numExperiments) + " Experiments: " + str(maxSim)
    return maxSim, experiment
Example #20
def main():
    try:
        graphyboi = GraphController()
        graphyboi.update()
        kmeansyboi = KMeans(graphyboi.coords)
    except Exception:  # coords may be empty or missing
        print("Nothing in coords")
Example #21
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    if not os.path.isfile(cache_location):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)

        f = open(cache_location, 'w+')
        f.write(repr(DictGraph))
        f.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    DictGraph = eval(f.read())
    f.close()

    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for i in xrange(1, max_k + 1):
        print 'k = %d' % i
        KMeans.k_means(graph, 10, i, 0, representatives[:i])
Example #22
    def test_kmeans(self):
        data_mat = mat(KMeans.loadDataSet("testSet.txt"))
        centroids, clusterAssment = KMeans.kMeans(data_mat, 4)
        print("\n centroids == \n %s" % (centroids))
        print("\n clusterAssment == \n %s" % (clusterAssment))

        # plot
        fig = plt.figure()
        ax = fig.add_subplot(111)
        xcord = data_mat[:, 0].A
        ycord = data_mat[:, 1].A

        xCluster0 = []
        yCluster0 = []
        xCluster1 = []
        yCluster1 = []
        xCluster2 = []
        yCluster2 = []
        xCluster3 = []
        yCluster3 = []

        for i in range(shape(clusterAssment)[0]):
            if clusterAssment[i, 0] == 0:
                xCluster0.append(data_mat[i, 0])
                yCluster0.append(data_mat[i, 1])
            elif clusterAssment[i, 0] == 1:
                xCluster1.append(data_mat[i, 0])
                yCluster1.append(data_mat[i, 1])
            elif clusterAssment[i, 0] == 2:
                xCluster2.append(data_mat[i, 0])
                yCluster2.append(data_mat[i, 1])
            elif clusterAssment[i, 0] == 3:
                xCluster3.append(data_mat[i, 0])
                yCluster3.append(data_mat[i, 1])
        ax.scatter(xCluster0, yCluster0, s=30, c='orange', marker='s')
        ax.scatter(xCluster1, yCluster1, s=30, c='red', marker='p')
        ax.scatter(xCluster2, yCluster2, s=30, c='blue', marker='*')
        ax.scatter(xCluster3, yCluster3, s=30, c='black', marker='d')

        # plot the raw data
        # ax.scatter(xcord, ycord, s=10, c='orange', marker='s')

        # plot the centroids
        xcord2 = centroids[:, 0].A
        ycord2 = centroids[:, 1].A
        ax.scatter(xcord2, ycord2, s=100, c='red', marker='+')
        plt.show()
Example #23
def Cluster(cluster_zone):
    print ("step 2: Building Dataset..." ) 
    dataSet = [] 
    for i in g_macro_node_list:
        temp = []
        temp.append(float(i.x_coord))
        temp.append(float(i.y_coord))
        dataSet.append(temp)

    dataSet = mat(dataSet)

    print ("step 3: Clustering..." ) 
    centroids, clusterAssment = KMeans.kmeans(dataSet, cluster_zone)     
    
    print ("step 3: show the result...")
    KMeans.showCluster(dataSet, cluster_zone, centroids, clusterAssment)
    return centroids, clusterAssment, dataSet
Example #24
    def find(self, scores):
        scale = 100
        data = np.mat(scores).T * scale
        indices, distances, center = KMeans(
            lambda X, k: np.mat([X.min(), X.mean(), X.max()]).T).clustering(
                data, 3, 1)
        print("anomaly score centers:{0}".format(center.T))

        checkValue = center[2, 0]
        minCheckValue = self._minCheckValue * scale
        maxCheckValue = self._maxCheckValue * scale
        defaultThreshold = self._defaultThreshold * scale
        minValue = data[(indices == 2).A.flatten(), :].min(0)[0, 0]
        maxValue = data[(indices == 2).A.flatten(), :].max(0)[0, 0]

        if maxValue <= defaultThreshold:
            return defaultThreshold / scale

        if checkValue >= defaultThreshold:
            checkValue = (minValue + checkValue) / 2
        elif checkValue <= minCheckValue:
            checkValue = (checkValue + maxValue) / 2
        if checkValue < minCheckValue:
            checkValue = minCheckValue
        elif checkValue > maxCheckValue:
            checkValue = maxCheckValue
        print("threshold check value: {0}".format(checkValue))

        i = None
        for j in range(0, data.shape[0]):
            if data[j, 0] >= checkValue and i is None:
                i = j

            if data[j, 0] < checkValue and i is not None:
                if j - i > CurvesThresholdFinder.MIN_SAMPLES_NUMBER * 2:
                    x, y = self._fit(data[i:j, 0])

                    if self.__showPlot:
                        plt.figure(1, (16, 10))
                        plt.plot(list(range(0, j - i)),
                                 data[i:j, 0].A.flatten().tolist(),
                                 color="b",
                                 marker="x")
                        if x is not None and y is not None:
                            plt.plot(x, y, color="r")
                        plt.show()

                i = None
        print("threshold all values: {0}".format(self._values))

        threshold = (np.mean(self._values)
                     if len(self._values) > 0 else defaultThreshold) / scale
        print("threshold found: {0}".format(threshold))

        self.__reset()

        return threshold
Example #25
def main():

    data_in = []
    feed_id = []
    print('start reading data')
    #read_json.read_json(config.path_in,data_in,config.stop_word_path,feed_id,config.data_lines)

    #read_json.test_alignment_py('../data/cpp/small.txt','../data/cpp/python_alignment_extraction1.txt','stop_words.utf8',True,True,5,data_in)
    #print('finish reading data')

    #term_id = []
    id_url = []
    #read_liulanqi_data.read_data(config.path_in, data_in, id_url,50)

    read_weishi_data.read_json(config.path_in, data_in, None, feed_id,
                               config.data_lines, None, config.topk)

    if config.mode == 'Training':
        if config.model_name == 'Counter':
            model = Vectorizer.CounterVector(config.model_name)
        elif config.model_name == 'TfIdf':
            model = Vectorizer.TfIdfVector(config.model_name)
            print('finish initializing model')
        elif config.model_name == 'FeatureHasher':
            model = Vectorizer.FeatureHasherVector(config.model_name,
                                                   config.n_features)

        model.feature_transform(data_in)
        print(len(model.vectorizer.vocabulary_))

        model.serilize_model()

        if config.algo_name == 'KMeans':
            algo_instance = KMeans.KMeansClustering(config.algo_name)
            print('start training model')
            algo_instance.fit(model.feature)
            algo_instance.serilize_model()
            algo_instance.output_cluster_info(data_in, model, feed_id)

    else:
        print('loading vectorizer')
        model = BaseModel.BaseModel(config.model_name)
        model.de_serilize_model()
        print('finish loading vector')

        if config.algo_name == 'KMeans':
            algo_instance = Algorithm.Base_Algorithm(config.algo_name)
            algo_instance.de_serilize_model()
            print('finish deserialization')
            features = model.transform(data_in)

            labels = algo_instance.predict(features)
            print(labels)
            #algo_instance.get_centroids()
            #algo_instance.output_cluster_info(data_in, model, feed_id)
            print('finish all')
Example #26
def get_cluster():
    cluster_list = dict()
    for k in range(3, 9):
        k_temp = KMeans.KMeans(k)
        for j in range(2010, 2013):
            print("Fitting, k=", k, "Year=", j)
            k_temp.fit(data2test[j])
            s_key = str(k) + "_" + str(j)
            cluster_list[s_key] = k_temp.cluster

    return cluster_list
Example #27
 def ReDo(self, actions, eegs):
     self.db = []
     if self.data != []:
         for r in range(len(self.data)):
             t = []
             for c in range(len(self.data[r]) - 1):
                 t.append(self.data[r][c])
             self.db.append(t)
         self.k = KMeans(self.db, actions[0], actions[1], actions[2],
                         actions[3])
         self.viewButton.Enable()
Example #28
def processImage(im, options):  # DO NOT TOUCH
    im = rescale(im, 0.25, preserve_range=True)

    if options['colorspace'] == 'ColorNaming':  # DO NOT TOUCH
        im = np.reshape(im, (-1, im.shape[2]))
        im = cn.ImColorNamingTSELabDescriptor(im)
    elif options['colorspace'] == 'RGB':  # DO NOT TOUCH
        im = np.reshape(im, (-1, im.shape[2]))
    elif options['colorspace'] == 'Lab':  # DO NOT TOUCH
        im = cn.RGB2Lab(im)
        im = np.reshape(im, (-1, im.shape[2]))

    if options['K'] < 2:  # find the best K
        fitting = []
        for i in range(2, 15):
            kmeans = km.KMeans(im, i, options)
            fitting.append(kmeans.fitting())
        print "k optima = ", fitting.index(min(fitting)) + 3
        kmeans = km.KMeans(im, fitting.index(min(fitting)) + 3, options)
    else:
        kmeans = km.KMeans(im, options['K'], options)  # DO NOT TOUCH

    if options['colorspace'] == 'RGB':
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeans.centroids)  # centroids converted to color naming
    elif options['colorspace'] == 'Lab':
        kmeans.centroids = np.reshape(kmeans.centroids,
                                      (-1, 1, kmeans.centroids.shape[1]))
        kmeans.centroids = color.lab2rgb(kmeans.centroids) * 255
        kmeans.centroids = cn.ImColorNamingTSELabDescriptor(
            kmeans.centroids)  # centroids converted to color naming
        kmeans.centroids = np.reshape(kmeans.centroids,
                                      (-1, kmeans.centroids.shape[2]))

    #get a representation of kmeans.centroids in terms of the 11 basic colors
    #normalize: each row of the kmeans.centroids representation should sum to 1

    colors, which = getLabels(kmeans, options)  # DO NOT TOUCH
    return colors, which, kmeans  # DO NOT TOUCH
Example #29
 def post(self):
     year = self.get_argument("year")
     taskId = self.get_argument("taskId")
     self.db = MysqlDriver(
         DATABASE.host, DATABASE.username, DATABASE.password,
         DATABASE.dbname)  # connect to MySql database server
     dataset = self.db.getFeatures()
     km = KMeans(dataset, 50, 3, year, taskId)  # initiating the ML algorithm
     MLthread = Thread(target=km.main, args=())
     MLthread.daemon = True
     MLthread.start()
     self.write("Request created")  #send the response of requested created
     self.finish()
Example #30
    def __initialization(self):
        """
        Initialize all parameters by k-means.

        Returns:
            Initial mu, sigma, gamma
        """
        km = KMeans.KMeans(self.k)
        labels = km.fit_predict(self.x, 50)
        mu = np.array(
            [np.average(self.x[labels == i], axis=0) for i in range(self.k)])
        sigma = np.array([np.eye(self.dimension) + 1 for i in range(self.k)])
        gamma = np.array([[-math.log(self.k, math.e)] * self.k] *
                         self.x.shape[0])
        return mu, sigma, gamma
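
The gamma returned above stores log-responsibilities: every entry is -log(k), i.e. a uniform 1/k once exponentiated. A quick check (k=3 assumed):

import math
import numpy as np

k, n = 3, 5
gamma = np.array([[-math.log(k, math.e)] * k] * n)
print(np.exp(gamma).sum(axis=1))  # every row sums to 1.0: uniform responsibilities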
Example #31
 def fit(self, data):
     # Assignment 3
     # start of the masked (to-implement) section
     # step1: initial the attribute of gmm by kmeans
     k_means = KMeans.K_Means(self.n_clusters_)
     k_means.fit(data)
     self.mu_ = np.asarray(k_means.centers_)
     print(self.n_clusters_)
     self.prior_ = np.asarray([1 / self.n_clusters_] *
                              self.n_clusters_).reshape(
                                  self.n_clusters_, 1)
     self.posteriori_ = np.zeros((self.n_clusters_, len(data)))
     self.cov_ = np.asarray([eye(2, 2)] * self.n_clusters_)
     # step2:iteration
     Likelihood_value_before = -inf
     for i in range(self.max_iter_):
         # step3: E-step   generate probability density distribution for every point and normalize
         print("gmm iterator:", i)
         for k in range(self.n_clusters_):
             self.posteriori_[k] = multivariate_normal.pdf(x=data,
                                                           mean=self.mu_[k],
                                                           cov=self.cov_[k])
         self.posteriori_ = np.dot(diag(self.prior_.ravel()),
                                   self.posteriori_)
         self.posteriori_ /= np.sum(self.posteriori_, axis=0)
         #posteriori=np.asarray(self.posteriori_)
         #print(posteriori.shape)
         # step4: M-step   update the parameters of generate probability density distribution for every point int E-step and stop when reached threshold
         self.Nk_ = np.sum(self.posteriori_, axis=1)
         self.mu_ = np.asarray([
             np.dot(self.posteriori_[k], data) / self.Nk_[k]
             for k in range(self.n_clusters_)
         ])
         self.cov_ = np.asarray([
             np.dot((data - self.mu_[k]).T,
                    np.dot(np.diag(self.posteriori_[k].ravel()),
                           data - self.mu_[k])) / self.Nk_[k]
             for k in range(self.n_clusters_)
         ])
         self.prior_ = np.asarray([self.Nk_ / len(data)
                                   ]).reshape(self.n_clusters_, 1)  # mixing weights: Nk / N
         Likelihood_value_after = np.sum(np.log(self.posteriori_))
         print(Likelihood_value_after - Likelihood_value_before)
         if np.abs(Likelihood_value_after - Likelihood_value_before
                   ) < self.tolerance_ * self.n_clusters_:
             break
         Likelihood_value_before = np.copy(Likelihood_value_after)
     self.fitted = True
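
For comparison, scikit-learn's GaussianMixture runs the same EM loop with a k-means initialization built in; a minimal sketch on synthetic two-blob data:

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 4])
gmm = GaussianMixture(n_components=2, init_params='kmeans', max_iter=100).fit(X)
print(gmm.means_)
print(gmm.predict(X[:5]))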
Example #32
def main():

    data_in = []
    feed_id = []
    print('start reading data')
    path = 'E:\\QQ_Browser_data\\ruyizhuan.csv'
    path2 = 'E:\\QQ_Browser_data\\yanxigonglue.csv'
    tv_show.process_data(path, feed_id, data_in)
    tv_show.process_data(path2, feed_id, data_in)

    if config.mode == 'Training':
        if config.model_name == 'Counter':
            model = Vectorizer.CounterVector(config.model_name)
        elif config.model_name == 'TfIdf':
            model = Vectorizer.TfIdfVector(config.model_name)
            print('finish initializing model')
        elif config.model_name == 'FeatureHasher':
            model = Vectorizer.FeatureHasherVector(config.model_name,
                                                   config.n_features)

        model.feature_transform(data_in)
        print(len(model.vectorizer.vocabulary_))

        if config.algo_name == 'KMeans':
            algo_instance = KMeans.KMeansClustering(config.algo_name)
            print('start training model')
            algo_instance.fit(model.feature)
            algo_instance.serilize_model()
            print('finish serilizing model')
            algo_instance.output_cluster_info(data_in, model, feed_id)

    else:
        print('loading vectorizer')
        model = BaseModel.BaseModel(config.model_name)
        model.de_serilize_model()
        print('finish loading vector')

        if config.algo_name == 'KMeans':
            algo_instance = Algorithm.Base_Algorithm(config.algo_name)
            algo_instance.de_serilize_model()
            print('finish deserialization')
            features = model.transform(data_in)

            labels = algo_instance.predict(features)
            print(labels)
            #algo_instance.get_centroids()
            #algo_instance.output_cluster_info(data_in, model, feed_id)
            print('finish all')
Example #33
    def test2(self):
        print "TEST 2:----------------------------------------------------------------"
        rand.seed(777)
        sampler = swr.SampleWithoutReplacement('datasets/adjusted-abalone.csv', .10)
        sampler.z_scale()
        training_set = sampler.get_training_set()
        test_set = sampler.get_test_set()

        indices_selected = list()
        centroids = [None]*4
        for i in range(4):
            while True:
                index_selected = np.random.randint(0, len(training_set))
                if index_selected not in indices_selected:
                    centroids[i] = training_set[index_selected]
                    indices_selected.append(index_selected)
                    break

        numpy_result=kmeans(np.array(training_set)[:, :-1], np.array(centroids)[:, :-1])[0]
        our_result=np.array(KMeans.k_means2(training_set, 4, centroids).centroids)[:, :-1]
        print numpy_result
        print ""
        print our_result
Example #34
def assign_to_cluster(input_set, centroids):
    """
    Assigns every observation in the input_set to a cluster. Clusters are centered around the
    given centroids

    :param input_set: List of observations from the test set to put into a cluster. Has format
                    [list of [observations]]
    :param centroids: List of centroids from running K-means on a training set. Has format
                    [list of centroids]
    :return: The clusters that test set observations have been assigned to. Has format
                    [list of [clusters of[observations of n features]]]
    """
    if len(centroids) < 1:
        raise Exception('No centroids were given.')
    if len(input_set) < 1:
        raise Exception('No input observations were given.')

    # cluster_set = [[] for a in range(len(centroids))]
    cluster_indices = [[] for a in range(len(centroids))]

    for i in range(len(input_set)):
        min_dist = sys.maxint
        min_index = sys.maxint

        for j in range(len(centroids)):
            curr_dist = KMeans.euclidean_distance(input_set[i], centroids[j])
            if curr_dist < min_dist:
                min_dist = curr_dist
                min_index = j
        # if input_set[i] not in cluster_set[min_index]:
            # cluster_set[min_index].append(input_set[i])
        cluster_indices[min_index].append(i)

    # assignment = collections.namedtuple("clusterAssigment", ['clusters', 'indices'])
    # return assignment(cluster_set, cluster_indices)
    return cluster_indices
Example #35
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    print >> sys.stderr, "FILE: ", fileName
    print fileName


    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not(os.path.isfile(cache_location)) or not(os.path.isfile(recon_count_location)):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    ## Only consider running algorithm for reconciliations with more than 
    # threshold MPRs
    if (numRecon < recon_threshold):
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return 
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon



    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)

    print >> sys.stderr, 'Found cluster representatives using point-collecting'

    graph = ReconGraph.ReconGraph(DictGraph)
    setReps = [ReconGraph.dictRecToSetRec(graph, dictRep) for dictRep in dictReps]
    random.seed(0)
    extra_reps = [KMeans.get_template(graph) for i in xrange(max_k)]

    representatives = setReps + extra_reps

    print >> sys.stderr, 'Starting K Means algorithm ... '
    print >> sys.stderr, 'Printing Average and Maximum cluster radius at each step'

    for seed in xrange(5):
        for i in xrange(1, max_k + 1):
            # print 'k = %d' % i
            # KMeans.k_means(graph, 10, i, 0, representatives[:i])
            KMeans.k_means(graph, 10, i, seed, None)
            print
Example #36
 def testReadFilePoints(self):
     points = KMeans.dataset_to_list_points(DATASET)
     self.assertTrue(len(points) > 0)
     self.assertTrue(points[0].dimension == 2)
Example #37
 def testGetNearestCluster(self):
     self.assertEquals(KMeans.get_nearest_cluster(
         [cluster, Cluster([Point(np.array([8, 8]))])], point), 0)
Example #38
	print "Init - Create the first cluster points and plot them..."
	# TODO here is where you change the number of centroids by adding or removing the points.
	# The numbers represent the starting points of each centroid with the following coordinate pair: (x, y)
	clusterPoints = [Point(2, 3), Point(35, 20), Point(40, 40), Point(60, 60), Point(30, 30)]
	centroids = getCentroids(clusterPoints) # just convert the points to centroids for plotting and labeling assistance...
	plotter.plotCentroids(centroids)
	print "Init complete..."
	raw_input('Press enter to continue and to start the algorithm.')

	# Run the algorithm for a fixed number of iterations
	# TODO Maybe we should come up with a better stopping measurement than a fixed count?
	for x in xrange(1, 10):
		# Get labels
		print "Create the labels, this should take some time..."
		# The interesting part is what is going on in the classify method.
		labels = kmeans.classify(trainingX, trainingY, centroids)
		# Plot the labeled data
		print "Plot the labeled data."
		plotter.clear()
		plotter.plotCentroids(centroids)
		plotter.plotLabledData(trainingX, trainingY, labels, centroids)
		raw_input('Press enter to continue')

		# Recalculate the centroids and, so to say, unlabel the data...
		print "Plot the new centroids."
		plotter.clear()
		plotter.plotUnlabledData(trainingX, trainingY)
		centroids = kmeans.reCalculateCentroids(trainingX, trainingY, labels, centroids)
		plotter.plotCentroids(centroids)
		raw_input('Press enter to continue')
Example #39
File: __init__.py  Project: zerozzl/MLStudy
import KMeans
import numpy

'''
dataMat = mat(KMeans.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch10/testSet.txt"))
k = 4
centroids, clustAssing = KMeans.kMeans(dataMat, k)
KMeans.showCluster(dataMat, k, centroids, clustAssing)
'''

'''
dataMat = numpy.mat(KMeans.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch10/testSet2.txt"))
k = 3
centroids, clustAssing = KMeans.bitKmeans(dataMat, k)
KMeans.showCluster(dataMat, k, centroids, clustAssing)
'''

KMeans.clusterClubs("E:/TestDatas/MachineLearningInAction/Ch10/places.txt",
                    "E:/TestDatas/MachineLearningInAction/Ch10/Portland.png")
Example #40
def learnvocabulary(train_set, cluster_num, max_iter):
	start = time.time()
	means = KMeans.mykmeanspp(train_set, cluster_num, max_iter, True)
	print("Kmeans Time: ",time.time()-start)
	return means
Example #41
def run_test(fileName, max_k):
    cache_dir = './cache'
    D = 2.
    T = 3.
    L = 1.

    print >> sys.stderr, "FILE: ", fileName
    print fileName

    host, paras, phi = newickFormatReader.getInput(fileName)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
        f = open('%s/README' % cache_dir, 'w')
        f.write('This directory holds a cache of reconciliation graph for the TreeLife data set')
        f.close()

    cache_location = '%s/%s.graph' % (cache_dir, os.path.split(fileName)[1])
    recon_count_location = '%s/%s.count' % (cache_dir, os.path.split(fileName)[1])
    if not(os.path.isfile(cache_location)) or not(os.path.isfile(recon_count_location)):
        print >> sys.stderr, 'A reconciliation graph has not been built yet for this newick file'
        print >> sys.stderr, 'Doing so now and caching it in {%s}...' % cache_location

        DictGraph, numRecon = DP.DP(host, paras, phi, D, T, L)
        f = open(cache_location, 'w+')
        g = open(recon_count_location, 'w+')
        f.write(repr(DictGraph))
        g.write(str(numRecon))
        f.close()
        g.close()

    print >> sys.stderr, 'Loading reconciliation graph from cache'
    f = open(cache_location)
    g = open(recon_count_location)
    DictGraph = eval(f.read())
    numRecon = float(g.read())
    f.close()
    g.close()

    
    
    ## Only consider running algorithm for reconciliations with more than 
    # threshold MPRs
    if (numRecon < recon_threshold):
        print >> sys.stderr, 'Too few reconciliations: ', numRecon
        return 
    else:
        print >> sys.stderr, 'Reconciliation Count: ', numRecon



    scoresList, dictReps = Greedy.Greedy(DictGraph, paras)
    graph = ReconGraph.ReconGraph(DictGraph)
    representatives = [ReconGraph.dictRecToSetRec(graph, dictReps[0])]

    ## Debug info
    ## Modifies the graph 
    ## Checking for the case when there is an error in likelihood 
    print >> sys.stderr, "== Checking for likelihoods over 1 =="
    found = False 
    for key in DictGraph.keys():
        children = DictGraph[key]
        for child in children[:-1]:
            if child[-1] > 1:
                # Attempt to round to fix large float math errors
                roundedValue = round(child[-1])
                if roundedValue != 1.0:
                    print >> sys.stderr, "ERR FOUND: ", key, child 
                    found = True 
                
    if not(found):
        print >> sys.stderr, "NO ERR(s)"
    print >> sys.stderr, "== End of over 1 checks. =="



    print >> sys.stderr, 'Starting K-centers algorithm ... '
    for i in xrange(2, max_k + 2):
        d, newrep = maximize(graph,representatives)
        if not all(d_i > 0 for d_i in d):
            print >> sys.stderr, "Distance vector contains 0", d 
            break

        print i-1, min(d),
        representatives.append(newrep)
        dist_sum = 0
        n = 10
        for _ in xrange(n):
            reps = [KMeans.get_weighted_template(graph) for _ in xrange(i-1)]
            dist_sum += min_d(maximize(graph,reps))
        print float(dist_sum) / n

    print  >> sys.stderr, "Finished k centers algorithm ..."
Example #42
Summary: 1. Without a seed the random numbers differ on every run, so KMeans is not stable.
		 A stable algorithm returns the same result, i.e. the same centroids, on every run.

		 2. When k is 1, all data points fall into one group and the center sits at (0,0) in every dimension, because
		 	I normalized the dataset before running kmeans.
		 	When k is small there is little or no overlap between groups; when k is large the overlap is serious,
		 	so more clusters can mean more incorrect partitions.
		 	Note: normally the neighboring points of a centroid belong to the same class, but as k grows, the zone
		 	around a centroid may include points of other classes.


'''


for k in range(1,11):
	means, clusters = KMeans.mykmean(X, k, max_iter)
	#print(means)
	data, target, data_clusters, target_clusters = data_recovery_more(clusters, X)
	show_figure_clusters(data_clusters, target_clusters, means)
# end for


'''

Section 2.2

Note: I choose the initial centroid from the run with minimum distortion

Summary: 1. By running many times, we can get a more stable result.

		 2. By looking at the distortion-versus-iteration figure, we can see the distortion fall to a stable value as iteration
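
Point 1 of the first summary (seed the RNG, or keep the best of several restarts) as a minimal sketch; mykmean is the project's own interface, so scikit-learn stands in here.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 2)
runs = [KMeans(n_clusters=3, n_init=1, random_state=s).fit(X) for s in range(10)]
best = min(runs, key=lambda m: m.inertia_)  # keep the minimum-distortion restart
print(best.inertia_)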
Example #43
    test_set = sampler.get_test_set()

    # Tried to see if it made a difference if QR was performed on unscaled datasets
    rand.seed(777)
    sampler2 = swr.SampleWithoutReplacement('datasets/adjusted-abalone.csv', .10)
    unscaled_training_set = sampler2.get_training_set()
    unscaled_test_set = sampler2.get_test_set()

    global_wcss = list()
    global_rmse = list()

    # Run K-means on the data set and output results from it
    for i in [1, 2, 4, 8, 16]:

        # Run K-means on the training set and store the data
        results = KMeans.k_means(training_set, i)
        global_wcss.append(sum(results.wcss))

        # Calculate the mean, sd, and weights of all clusters
        cluster_weights = [None]*i
        cluster_info = list()
        for j in range(len(results.clusters)):
            info = calculate_cluster_values(results, j, unscaled_training_set)
            cluster_info.append(info)
            cluster_weights[j]=list(info.weights)

        # assign all observations in the test set to clusters
        test_clusters = assign_to_cluster(test_set, results.centroids)

        # Now predict y for the test clusters using the weights from training clusters
        cluster_predictions = predict_categories(unscaled_test_set, test_clusters, cluster_weights)
Example #44
# HomePage :  
# Email  :  
#################################################  
  
from numpy import *  
import time  
import matplotlib.pyplot as plt 
import KMeans
   
## step 1: load data  
print ("step 1: load data..." ) 
dataSet = []   # a list whose elements are themselves two-element lists; each inner list is one sample holding its attribute values and class label
# much like a matrix: we end up with an N*2 structure whose rows are the training samples
fileIn = open("D:/xuepython/testSet.txt")  # note the forward slashes
for line in fileIn.readlines(): 
	temp=[]
	lineArr = line.strip().split('\t')  # line.strip() removes the trailing '\n'
	temp.append(float(lineArr[0]))
	temp.append(float(lineArr[1]))
	dataSet.append(temp)
    #dataSet.append([float(lineArr[0]), float(lineArr[1])])  
fileIn.close()  
## step 2: clustering...  
print ("step 2: clustering..."  )
dataSet = mat(dataSet)  # NumPy's mat() converts the list into a matrix
k = 4  
centroids, clusterAssment = KMeans.kmeans(dataSet, k)  # call the kmeans function defined in the KMeans module
  
## step 3: show the result  
print ("step 3: show the result..."  )
KMeans.showCluster(dataSet, k, centroids, clusterAssment)
Example #45
    def strategy(self):
        kmeans = KMeans(self.file_name)
        hospitals = kmeans.k_means()

        sorted_hospitals = sorted(hospitals.keys())
        k = 1
        for hospital in sorted_hospitals:
            sys.stdout.write(
                "Hospital:"
                + str(hospital.id)
                + "|"
                + str(hospital.x)
                + ","
                + str(hospital.y)
                + ","
                + str(hospital.ambu)
                + "|"
            )
            for i in range(hospital.ambu):
                if i != hospital.ambu - 1:
                    sys.stdout.write(str(k) + ",")
                else:
                    sys.stdout.write(str(k) + "\n")
                k += 1
        ambulance_num = k - 1

        k = 1
        print
        for hospital in sorted_hospitals:
            patients = filter(
                lambda x: x.time > 2.3 * (abs(x.x - hospital.x) + abs(x.y - hospital.y)), hospitals[hospital]
            )

            ambu_num = hospital.ambu
            ambulances = []
            for i in range(ambu_num):
                ambulances.append(Ambulance(k, hospital))
                k += 1
            while True:
                pre_num = len(self.saved_patients)
                for i in range(4):
                    for ambulance in ambulances:
                        patients = sorted(patients, key=lambda x: abs(x.x - ambulance.x) + abs(x.y - ambulance.y))
                        for patient in patients:
                            if self.__savable(patient, ambulance, hospital, i):
                                ambulance.patients.append(patient)
                                patients.remove(patient)
                                ambulance.current_time += self.__distance(patient, ambulance) + 1
                                ambulance.x = patient.x
                                ambulance.y = patient.y
                                break
                for ambulance in ambulances:
                    if len(ambulance.patients) == 0:
                        continue
                    sys.stdout.write(
                        "Ambulance:" + str(ambulance.id) + "|" + str(hospital.x) + "," + str(hospital.y) + "|"
                    )
                    ambulance.current_time += self.__distance(ambulance, hospital) + 1
                    first = True
                    for patient in ambulance.patients:

                        if patient.time >= ambulance.current_time:
                            if first:
                                sys.stdout.write(
                                    str(patient.id)
                                    + ","
                                    + str(patient.x)
                                    + ","
                                    + str(patient.y)
                                    + ","
                                    + str(patient.time)
                                )
                                first = False
                            else:
                                sys.stdout.write(
                                    ";"
                                    + str(patient.id)
                                    + ","
                                    + str(patient.x)
                                    + ","
                                    + str(patient.y)
                                    + ","
                                    + str(patient.time)
                                )
                            self.saved_patients.append(patient)
                    sys.stdout.write("|" + str(hospital.x) + "," + str(hospital.y) + "\n")
                    ambulance.patients = []
                    ambulance.x = hospital.x
                    ambulance.y = hospital.y
                if len(self.saved_patients) == pre_num:
                    break