Example #1
 def test_converges(self):
     for ix, input in enumerate(self.test_cases['input']):
         km = KMeans(input, self.test_cases['K'][ix])
         km._init_centroids()
         old_centroid, centroid, bool_value = self.test_cases['converge'][ix]
         km.old_centroids, km.centroids = old_centroid, centroid
         self.assertEqual(km.converges(), bool_value)
Example #2
    def test_Kmeans(self):
        for ix, input in enumerate(self.test_cases['input']):

            km = KMeans(input, self.test_cases['K'][ix])
            km.fit()
            np.testing.assert_array_equal(km.centroids,
                                          self.test_cases['kmeans'][ix])
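The two tests above exercise a KMeans class whose implementation is not shown; converges() in particular is expected to report whether the centroids have stopped moving between iterations. A minimal sketch of such a check, assuming convergence means old and new centroids agree within a small tolerance:

import numpy as np

def converges(old_centroids, centroids, tolerance=1e-6):
    # Assumption: convergence means no centroid moved more than `tolerance`
    # between two consecutive iterations. This is a sketch, not the tested
    # class's actual implementation.
    return np.allclose(old_centroids, centroids, atol=tolerance)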
Example #3
 def test_get_labels(self):
     for ix, input in enumerate(self.test_cases['input']):
         km = KMeans(input, self.test_cases['K'][ix])
         km._init_centroids()
         km.get_labels()
         np.testing.assert_array_equal(km.labels,
                                       self.test_cases['labels'][ix])
Example #4
    def init_with_kmeans(self, npimg, mask):
        print("Creating GMM.....")
        # print("step8")
        self._beta = self.Beta(npimg)
        self.Smoothness(npimg, self._beta, self._gamma)

        bgd = np.where(mask == self.GT_bgd)
        prob_fgd = np.where(mask == self.P_fgd)
        BGDpixels = npimg[bgd]  #(_,3)
        FGDpixels = npimg[prob_fgd]  #(_,3)

        self.KmeansBgd = Kmeans(BGDpixels, dim=3, cluster=5, epoches=2)
        self.KmeansFgd = Kmeans(FGDpixels, dim=3, cluster=5, epoches=2)

        bgdlabel = self.KmeansBgd.run()  # (BGDpixel.shape[0],1)
        # print(bgdlabel)
        fgdlabel = self.KmeansFgd.run()  # (FGDpixel.shape[0],1)
        # print(fgdlabel)

        self.BGD_GMM = GMM()  # The GMM Model for BGD
        self.FGD_GMM = GMM()  # The GMM Model for FGD

        for idx, label in enumerate(bgdlabel):
            self.BGD_GMM.add_pixel(BGDpixels[idx], label)
        for idx, label in enumerate(fgdlabel):
            self.FGD_GMM.add_pixel(FGDpixels[idx], label)

        # learning GMM parameters
        self.BGD_GMM.learning()
        self.FGD_GMM.learning()
Example #5
def kmeans_clustering():
    """
    Handler for the "Kmeans" button.
    Use the Kmeans algorithm to run cluster analysis on the generated or
    loaded data and display the clustering results.
    :return:
    """
    global X
    Kmeans.run_given_data(X, int(k.get()))
    refresh_photo('./graph/origin.png', './graph/clustering.png')
Example #6
def SegmentImages(trainDataPath,trainGroundTruth):
    
    
    for filename in glob.glob(trainDataPath+"\\"+"*.jpg"): 
        #reading files from training data
        
        
        img = mpimg.imread(filename,format="jpg")
        
        rows = len(img)
        cols = len(img[0])
    
        labels , clusters = Kmeans.Kmeans(img,3)
        print("Image After Clustering ")
        plt.imshow(labels)
        plt.show()
        
        labelsAs1D = np.reshape(labels,154401)
        
        #print(f" {labelsAs1D}")

        
        
        
        #reading files from ground truth
        filename_w_ext = os.path.basename(filename)
        imageName, file_extension = os.path.splitext(filename_w_ext) 
        mat = scipy.io.loadmat(trainGroundTruth+"\\"+imageName+".mat")
        
        
        
        numberOfImages = len(mat['groundTruth'][0])
        fig , ax = plt.subplots(1,numberOfImages+1)
        ax[0].imshow(img)
        
        for k in range(0,numberOfImages,1):
            groundImage = mat['groundTruth'][0][k][0][0][0]
            ax[k+1].imshow(groundImage)
            
        plt.show()
        
        for i in range(0,numberOfImages,1):
            groundImage = mat['groundTruth'][0][i][0][0][0]
            groundTruthAs1D = np.reshape(groundImage,154401)
            matrix = pd.crosstab(labelsAs1D,groundTruthAs1D, rownames=['labels'], colnames=['img'])
            #print(matrix)
            #converting DataFrame to Numpy Array
            matrix = matrix.values
            fScore = Kmeans.getFScore(matrix)
            conditionalEntropy = Kmeans.getConditionalEntropy(matrix)
            print(f"Scores against groundTruth image {i}:")
            print("fScore is ",fScore)
            print("conditionalEntropy ",conditionalEntropy)
            print("\n\n")
Example #7
def get_color_predictions(images, max_k):
    # preds = np.empty((len(images), k), dtype='<U8')
    preds = []

    for ix, input in enumerate(images):
        # The number of iterations needed was observed to be close to k*5.
        # If it exceeds that, the run is not being efficient.
        # The tolerance could be 0.05, but it is not worth it.
        kms = km.KMeans(input, 1, {"km_init": "kmeans++", "max_iter": max_k*5, "threshold": 0.35, "fitting": "DB", "tolerance": 0.1, "background_mask": 250})
        kms.find_bestK(max_k)
        kms.fit()
        preds.append(km.get_colors(kms.centroids))

    return np.array(preds)
Example #8
def main():

    X = initProblem(N, K, Nmax)
    for i in range(10):
        W2, Y = construct_Wks(X, N, K, M, Mmax, W)

        Xtab = bicluster_SDP(X, K, N, M, Mmax, Nmax, W2)

        xi = extract_xi_diag(Xtab)
        # Converting for k-means
        kmeansVars = [np.zeros(K) for i in range(N)]
        for i in range(K):  # For each cluster
            for j in range(N):  # For each coord
                kmeansVars[j][i] = xi[i][j]
        # Using k-means constrained
        KM = Kmeans.KmeansConstrained(kmeansVars, Nmax, N, K, 100)
        test = [
            np.array([-5, 0]),
            np.array([-3, 0]),
            np.array([-4, 0]),
            np.array([2, 0]),
            np.array([3, 0])
        ]
        #KM = Kmeans.KmeansConstrained(test, 3, 5, 2, 50)
        KM.initialization()
        KM.assignment(0)
        X = map_to_X(KM.map)
    print(Y)
    print(KM.map)
Example #9
def main():
    data = pd.read_csv('/Users/bytedance/Desktop/AI/data/wine.data.csv')
    label = data["0"].to_numpy()
    del data["0"]

    data = data / data.max(axis=0)  # normalize
    data = data.to_numpy()

    # PCA
    K = 3
    for thresh in [0.9, 0.8, 0.7, 0.6, 0.5]:
        new_data, _, _ = PCA.PCA(data.T, 2, True, thresh)

        ndim = new_data.shape[1]
        print(
            f"======== kmeans, K = {K}, ndim = {ndim}, thresh = {thresh} ========="
        )

        if ndim == 2:
            plt.figure(1)
            plt.scatter(new_data[:, 0], new_data[:, 1], s=50)

        S, RI, predicted_label = Kmeans.test_kmeans(new_data, label, K)
        df_data = pd.DataFrame(new_data)
        df_label = pd.DataFrame(predicted_label)
        result_df = pd.concat([df_label, df_data], axis=1)
        result_df.to_csv(f"./result_ndim{ndim}_K{K}.csv")
Example #10
def seqkm(k, Images, SampleSize):
    print("SeqKM start")
    v = []
    PredictedLabels = []
    f = k
    while f > 0:
        v.append(100)
        f = f - 1
    if SampleSize < len(Images):
        M = rd.choices(Images, k=SampleSize)
    else:
        M = Images
    # print("choose " + str(k) + " centroid with kmeans++")
    centers, label = Kmeans.KMeansPlusplus(M, k)
    # f = 0
    # i = 0
    # for image in Images:
    #     distances = [euclidean_distance(centroid, image)
    #                  for (centroid) in centers]
    #     j = distances.index(min(distances))
    #     PredictedLabels.append(j)
    #     i = i + 1
    #     v[j] = v[j] + 1
    #     epsilon = 1 / v[j]
    #     f = f + 1
    #     if SampleSize< len(Images) :
    #         # print("update centroid number " + str(j))
    #         for i in range(0, len(image)):
    #             centers[j][i] = ((1 - epsilon) * centers[j][i] + 0.5) + (epsilon * image[i] + 0.5)
    print("SeqKM done")
    return v, PredictedLabels, centers
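seqkm relies on Kmeans.KMeansPlusplus to pick the initial centroids from the sampled images; that routine is not included in this example. A self-contained sketch of standard k-means++ seeding, which is presumably what it implements (an assumption):

import random as rd
import numpy as np

def kmeanspp_init(points, k):
    # The first centre is drawn uniformly at random; every further centre is
    # drawn with probability proportional to its squared distance to the
    # nearest centre chosen so far.
    points = np.asarray(points, dtype=float)
    centers = [points[rd.randrange(len(points))]]
    for _ in range(k - 1):
        d2 = np.min([np.sum((points - c) ** 2, axis=1) for c in centers], axis=0)
        idx = np.random.choice(len(points), p=d2 / d2.sum())
        centers.append(points[idx])
    return np.array(centers)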
Example #11
def deal_image(file_path: str, step: int, dots: int, to_show: bool, to_save: bool, dist_fun_str: str, random: bool):
    coll = io.ImageCollection(file_path)
    if len(coll) == 0:
        return ReturnCode.NO_SUCH_FILE
    for index in range(len(coll)):
        img = np.array(coll[index])
        dx = int(img.shape[0] / step)
        dy = int(img.shape[1] / step)
        if dx == 0 or dy == 0 or step <= 0:
            return ReturnCode.INVALID_STEPS
        if dots == 0 or dots >= img.shape[0] * img.shape[1]:
            return ReturnCode.INVALID_DOTS
        features = []
        for x in range(step):
            for y in range(step):
                Y = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 0])
                U = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 1])
                V = np.mean(img[x * dx:(x + 1) * dx, y * dy:(y + 1) * dy, 2])
                features.append([Y, U, V])
        # i = img.reshape(img.shape[0] * img.shape[1], 3)
        i = features
        dist_fun = Kmeans.ecludDist if dist_fun_str == 'ecludDist' else Kmeans.manhattanDist
        (index_in_center, center) = Kmeans.kMeans(i, dist_fun,
                                                  Kmeans.randCenter(i, dots) if random else Kmeans.orderCenter(i, dots),
                                                  dots)
        res = []
        for j in index_in_center:
            res.append(center[j])
        print(res)
        ni = np.zeros((img.shape[0], img.shape[1], 3))
        for n in range(len(res)):
            for x in range(dx):
                for y in range(dy):
                    ni[int(n / step) * dx + x, n % step * dy + y] = [x / 255 for x in res[n]]

        if to_show:
            plt.imshow(ni)
            plt.axis('off')
            plt.show()
        if to_save:
            plt.imshow(ni)
            plt.axis('off')
            new_name = file_path.split('.')
            new_name[0] = new_name[0] + '-' + str(index) + '-' + str(step) + '.'
            new_name = "".join(new_name)
            plt.savefig(new_name)
    return ReturnCode.SUC
Example #12
def main():
    global cls, data, clsKData
    km.main()
    data = km.data
    cls = km.cls
    N = len(data)
    dataArray = np.zeros((len(data), 2))  #2-dim array for all data

    for i in range(N):
        k = cls[i]
        clsKData[k].append([
            float(data[i][0]), float(data[i][1])
        ])  #for first time calculation of mean, amplitude, cov
        dataArray[i][0] = float(data[i][0])
        dataArray[i][1] = float(data[i][1])

    gammas = np.zeros((len(data), 3))  #possibility of data i to cluster k
    means, covs, amplitudes = Expectation(True, gammas, dataArray)
    time = 0
    converge = False
    gammas = Maximization(means, covs, amplitudes, dataArray, True)

    while time < MAX_ITER and converge == False:
        prevMeans = means
        means, covs, amplitudes = Expectation(False, gammas, dataArray)
        gammas = Maximization(means, covs, amplitudes, dataArray, False)
        time += 1
        meanDiff = np.abs(np.subtract(means, prevMeans))
        converge = True

        for i in range(3):
            #print(meanDiff[i])
            meanDis = pow((meanDiff[i][0]**2 + meanDiff[i][1]**2), 0.5)
            #print(meanDis)
            if (meanDis >= 0.00001):
                converge = False

    for i in range(3):
        print("Mean", i + 1, ": ")
        print(means[i])
        print("Covariance", i + 1, ": ")
        for j in range(2):
            print(covs[i][j])
        print("Amplitude", i + 1, ": ")
        print(amplitudes[i])
Example #13
    def train(self,
              X_train,
              y_train,
              learning_rate=0.5,
              reg=1e-3,
              num_iters=100,
              batch_size=200,
              print_progress=False):
        """
        Inputs:
        - X_train: A PyTorch tensor of shape (N, D) containing training data; there are N training samples each of dimension D.
        - y_train: A PyTorch tensor of shape (N,) containing training labels; y[i] = {-1,1} means that X[i] has label  -1 or 1 depending on the class.
        - K: number of clusters
        - lamb: global regularization factor
        - learning_rate: (float) learning rate for optimization.
        - reg: (float) regularization strength. (ie. lambda)
        - num_iters: (integer) number of steps to take when optimizing
        - batch_size: (integer) number of training examples to use at each step.
        - print_progress: (boolean) If true, print progress during optimization.
        - exit_diff: (float) condition to stop the gradient descent algorithm if the change in loss is too low.
        Returns: A tuple of:
        - loss_all: A PyTorch tensor giving the values of the loss at each training iteration.
        """
        N, D = X_train.shape

        # clustering
        cluster_label, centroid = Kmeans(X_train, self.K)

        self.centroid = centroid

        # feature extension
        X_train_hat = self.feature_extension(X_train, cluster_label)

        # train linear SVM
        loss_hist = self.LSVM.train(X_train_hat,
                                    y_train,
                                    reg=reg,
                                    num_iters=num_iters,
                                    learning_rate=learning_rate)

        # SVM parameters
        W_hat = torch.tensor(self.LSVM.W,
                             dtype=X_train.dtype,
                             device=X_train.device)

        # global regularizer
        self.W = 1 / np.sqrt(self.lamb) * W_hat[:D]

        # local predictor
        self.Wl = torch.zeros(D,
                              self.K,
                              dtype=X_train.dtype,
                              device=X_train.device)
        for l in range(self.K):
            self.Wl[:, l] = W_hat[(D * (l + 1)):(D * (l + 2))] + self.W

        return loss_hist
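train() delegates to self.feature_extension, which is not shown, to build the extended features before fitting the linear SVM. The way W_hat is sliced afterwards (one global block of size D followed by one block of size D per cluster) suggests a feature map along the following lines; this is a sketch under that assumption, not the class's actual implementation:

import torch

def feature_extension(X, cluster_label, K, lamb):
    # Global block scaled by 1/sqrt(lamb); each sample's raw features are also
    # copied into the block of its own cluster, so the linear SVM learns one
    # shared predictor plus K local correction terms.
    N, D = X.shape
    X_hat = torch.zeros(N, D * (K + 1), dtype=X.dtype, device=X.device)
    X_hat[:, :D] = X / (lamb ** 0.5)
    for l in range(K):
        idx = cluster_label == l
        X_hat[idx, D * (l + 1):D * (l + 2)] = X[idx]
    return X_hat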
Example #14
def plot_distances(data, max_val, min_val=2):
    distances = []
    for i in range(min_val, max_val + 1):
        model = Kmeans.Kmeans(i, data)
        distances.append(model.train(show_graph=False))
    plt.plot([i + min_val for i in range(len(distances))], distances)
    plt.xlabel("Number of clusters")
    plt.ylabel("Total Sum")
    plt.title("Elbow Method")
    plt.show()
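plot_distances leaves the choice of k to visual inspection of the elbow. If an automatic choice is wanted, a common heuristic is to pick the k with the largest discrete second difference of the distance curve; a small sketch of that idea (a hypothetical helper, not part of the Kmeans module):

import numpy as np

def pick_elbow_k(distances, min_val=2):
    # distances[i] is the total sum returned by the model trained with
    # k = min_val + i clusters; the elbow is approximated by the point of
    # maximum curvature (largest second difference).
    d = np.asarray(distances, dtype=float)
    second_diff = d[:-2] - 2 * d[1:-1] + d[2:]
    return min_val + 1 + int(np.argmax(second_diff))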
Example #15
def run():
    # data for multi-dimensionality (4 features)
    # data = pd.read_csv("results4-feat.csv")
    # dataset with 2 features for testing graph and visualizations
    data = pd.read_csv("results_short.csv")

    # while True:
    # 	plot_distances(data, max_val=5)
    model = Kmeans.Kmeans(k=2, data=data)
    model.train(show_graph=True)
Example #16
 def kmeans(self, trainset, testset, k, k_for_cluster, isClassification):
     km = Kmeans.Kmeans(k_for_cluster, trainset)
     #centroids = km.converge()
     centroids_class = km.getClusters()
     centroids_class = centroids_class[testset.columns]
     #call knn with the reduced train set- Centroids
     predicted = Knn.Knn().fit(centroids_class.values, testset, k,
                               isClassification)
     return predicted, testset.iloc[:, -1]  #return predicted and actual labels
Example #17
def kmean_statistics(images, options, kmax=10, nsamples=250):
    global_times = np.zeros((kmax-1))
    global_scores = np.zeros((kmax-1))
    global_iterations = np.zeros((kmax-1))

    for ix, input in enumerate(images[:nsamples]):
        local_times = []
        local_scores = []
        local_iterations = []
        kms = km.KMeans(input, 1, options)

        for k in range(2, kmax+1):
            start = time.time()
            kms.K = k
            kms.fit()
            score = kms.perform_score()
            end = time.time()
            elapsed = end - start
            global_times[k-2] += elapsed
            global_scores[k-2] += score
            global_iterations[k-2] += kms.num_iter

            # local_scores.append(score)
            # local_iterations.append(kms.num_iter)
            # local_times.append(elapsed)
            # print("Results for image " + str(ix) + " with k=" + str(k)) 
            # print("Score: " + str(score))
            # print("Iterations needed: " + str(kms.num_iter))
            # print("Elapsed time: " + str(elapsed))
            # print("")
            # visualize_k_means(kms, input.shape)
        
        # score_series = pd.Series(local_scores, index=list(range(2,kmax+1)), name="Score")
        # score_series.plot(legend=True)
        # plt.show()
        # iterations_series = pd.Series(local_iterations, index=list(range(2,kmax+1)), name="Iterations")
        # iterations_series.plot(legend=True)
        # plt.show()
        # time_series = pd.Series(local_times, index=list(range(2,kmax+1)), name="Time")
        # time_series.plot(legend=True)
        # plt.show()

    n_processed = len(images[:nsamples])
    global_scores /= n_processed
    global_iterations /= n_processed
    global_times /= n_processed

    score_series = pd.Series(global_scores, index=list(range(2, kmax+1)), name="Score")
    score_series.plot(legend=True)
    plt.show()
    iterations_series = pd.Series(global_iterations, index=list(range(2, kmax+1)), name="Iterations")
    iterations_series.plot(legend=True)
    plt.show()
    time_series = pd.Series(global_times, index=list(range(2, kmax+1)), name="Time")
    time_series.plot(legend=True)
    plt.show()
Example #18
def main (k, m="means", init_type="random"):
    # Starting clustering timer
    start_cluster = timeit.default_timer()

    # Initialize clusters
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(k, train_images_flat,\
            dist_fn=Distance.sumsq)
        
    # Run clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(k,train_images_flat,
        initial_clusters, distfn = Distance.sumsq, method=m)

    # Find and print clustering time
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    print "Time spent clustering : ", clustering_time

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities, 
                     final_clusters, title)

    ###########################################################################
    #                           Calculate Accuracy                            #
    ###########################################################################

    # Calculate final accuracy for clusters
    final, cluster_set = Accuracy.final_accuracy(final_responsibilities, 
        train_labels, train_images_flat, final_clusters)

    # Now see how well we can classify the dataset
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(cluster_set, test_images_flat, 
        test_labels, distfn = Distance.sumsq)
    finish_cluster_test = timeit.default_timer()

    # find time it took to test 
    testing_time = finish_cluster_test - start_cluster_test
    print "Time spent testing : ", testing_time

    ###########################################################################
    #                                 Outputs                                 #
    ###########################################################################

    # k, prediction level, cluster_set, 
    results = {"k" : k, "prediction_accuracy" : predictions[1], 
    "cluster_means" : cluster_set, "cluster_stats" : final,
    "clustering_time" : clustering_time, "testing_time" : testing_time}

    with open('./results/' + title + '/' + title + '_results.json', 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
Example #19
def deal_images(dir_path: str, save: bool, dist_fun_str: str, random: bool, dots: int):
    extension_list = ['/*.jpeg', '/*.jpg', "/*.png", "/*.bmp"]
    k = dots
    filepath = dir_path
    for extension in extension_list:
        filepath = dir_path + extension
        coll = io.ImageCollection(filepath)
        if len(coll) != 0:
            img_array = np.array(coll[0])
            arr = np.empty((0, img_array.shape[0] * img_array.shape[1],
                            1 if len(img_array.shape) != 3 else img_array.shape[2]))
            for img in coll:
                img_array = np.array(img)
                img_array = img_array.reshape(img_array.shape[0] * img_array.shape[1]
                                              , 1 if len(img_array.shape) != 3 else img_array.shape[2])
                arr = np.concatenate((arr, [img_array]), axis=0)
            dist_fun = Kmeans.ecludDist if dist_fun_str == 'ecludDist' else Kmeans.manhattanDist
            (index_in_center, center) = Kmeans.kMeans(arr, dist_fun,
                                                      Kmeans.orderCenter(arr, k) if not random else Kmeans.randCenter(
                                                          arr, k), k)
            # (index_in_center, center) = Kmeans.mul_kMeans(arr, dist_fun, 3, 10)
            for i in range(center.shape[0]):
                s_dir = dir_path + '/imageClass' + str(i)
                if not os.path.exists(s_dir):
                    os.makedirs(s_dir)
                if save:
                    img_form = np.array(coll[0]).shape
                    plt.imshow(np.array(center[i]).reshape(img_form[0], img_form[1],
                                                           img_form[2] if len(img_form) == 3 else 1))
                    plt.axis('off')
                    # plt.show()
                    plt.savefig(s_dir + '/imageOfClass' + str(i) + '.' + extension.split('.')[1])
            for i in range(len(coll)):
                s_dir = dir_path + '/imageClass' + str(index_in_center[i]) + '/' + str(i) + '.' + extension.split('.')[
                    1]
                plt.imshow(coll[i])
                plt.axis('off')
                plt.savefig(s_dir)

    return ReturnCode.SUC
Example #20
def processing(sentence, commentId):
    sentimentDb = db.Web_Comment_Analyzed
    print("handling sentence : ", sentence)
    obj = sentimentAnalysisExecute(sentence)
    obj['commentId'] = commentId
    print(obj)

    kmeanPredicted = Kmeans.predict(obj["score"])
    obj['predict'] = kmeanPredicted
    sentimentDb.insert(obj)
    print("cảm xúc của câu dùng kmeans :", kmeanPredicted)

    return obj['predict']
Example #21
def get_kmeans_accuracy(kmeans_labels_test, images, KMax, max_images_to_use,
                        options):
    plt.clf()
    accerted_ratios_for_all_images = []
    print("estimated time: 1 minute")

    if len(used_kmeans_images) != max_images_to_use:
        for i in range(len(used_kmeans_images), max_images_to_use):
            number_to_use = random.randint(0, images.shape[0] - 1)
            used_kmeans_images.append(number_to_use)

    time1 = time.time()
    for number_to_use in used_kmeans_images:
        accerted_ratios = []

        for j in range(2, KMax):
            km = Kmeans.KMeans(images[number_to_use], j, options)
            km.fit()
            returned_from_kmeans_color_labels = Kmeans.get_colors(km.centroids)
            accerted = get_color_accuracy(kmeans_labels_test[number_to_use],
                                          returned_from_kmeans_color_labels)
            #visualize_k_means(km, images[number_to_use].shape)
            accerted_ratios.append(accerted)

        accerted_ratios_for_all_images.append(accerted_ratios)

    for i in range(len(used_kmeans_images)):
        plt.scatter(list(range(2, KMax)),
                    accerted_ratios_for_all_images[i],
                    label="image " + str(used_kmeans_images[i]))
        plt.legend()
        plt.title("KMeans accerted % " + options["km_init"] + " ratio")
        plt.xlabel("K")
        plt.ylabel("accerted % ratios kmeans")
        plt.savefig(output_folder + "kmeans " + options["km_init"] +
                    " Accerted.png")

    print(time.time() - time1)
Example #22
def kmeans_statistics(images, KMax):
    times = []
    iterations = []
    wcds = []

    for i in range(2, KMax):
        km = Kmeans.KMeans(images, i)
        time1 = time.time()
        iterations_needed = km.fit()
        times.append(time.time() - time1)
        iterations.append(iterations_needed)
        wcds.append(km.whitinClassDistance())

    return times, iterations, wcds
Example #23
def apply_kmeans(image_path):

    original_image = imread(image_path)
    #original_image = spm.imresize(original_image, (64, 64))
    original_image = np.array(original_image, dtype=np.float64) / 255
    w, h, d = original_shape = tuple(original_image.shape)

    assert d == 3

    quantizer, labels = Kmeans.Kmeans_algorithm(original_image, w, h, d)
    quantized_image = Kmeans.recreate_image(quantizer.cluster_centers_, labels,
                                            w, h)
    cluster_pixel_map = np.reshape(labels, (h, w))

    #plt.imshow(quantized_image)
    #plt.show()

    cluster_probability = Counter()
    cluster_probability.update(labels)
    cluster_probability = dict(cluster_probability)
    cluster_probability.update(
        (k, float(v) / len(labels)) for k, v in cluster_probability.items())

    return original_image, quantized_image, cluster_pixel_map, cluster_probability
Example #24
 def test_get_centroids(self):
     for ix, input in enumerate(self.test_cases['input']):
         km = KMeans(input, self.test_cases['K'][ix])
         km._init_centroids()
         km.get_labels()
         km.get_centroids()
         # Compare old centroids
         np.testing.assert_array_equal(km.old_centroids, self.test_cases['get_centroid'][ix][0])
         # Compare new centroids
         np.testing.assert_array_equal(km.centroids, self.test_cases['get_centroid'][ix][1])
Example #25
def datosParaGraficarKmeans(minSize=2,maxSize=100, step=1, runs=200, nclust = 5, it = 10):
    totalC=[] #list with the average number of comparisons for each array size
    totalM=[] #list with the average number of movements
    img_clstr = km.K_means(n_clusters = nclust, iterations = it)
    
    for size in range(minSize, maxSize, step):
        sum_mov = 0
        sum_comp = 0
        for i in range(runs):
            test_array = createIntArray(size)
            img_clstr.fit(test_array)
            sum_mov += img_clstr.mov
            sum_comp += img_clstr.comp
        totalM.append(sum_mov/runs)
        totalC.append(sum_comp/runs)
    
    return totalC, totalM
Example #26
def runkmeans():
    global k, n, f, x, y, labels, asli
    t_k = k.get()
    t_n = n.get()
    t_f = f.get()
    print("k,n,f")
    print(t_k, t_n, t_f)
    if t_n == 0 or t_k == 0:
        print("please change 0 value")
        return
    else:
        print("start timer for KMeans")
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeans(t_k, t_n, t_f)
        elapsed = time.perf_counter() - time_start
        print("run time : " + str(elapsed))
        show_result()
    return
Example #27
    def classify_images(self, img_array, nclust, it, plot_3d = False, listnames = None):
        self.image_cluster = km.K_means(n_clusters = nclust, iterations = it)
        self.image_cluster.fit(img_array)
        self.cluster_map['values'] = img_array
        self.cluster_map['labels'] = self.image_cluster.labels_
        self.cluster_map['filename'] = listnames
        self.complete_center = [False] * len(self.image_cluster.cluster_centers_)

        #To plot dominant colors and centers
        if plot_3d:
            colors = ['red', 'green', 'blue', 'cyan', 'orange']
            color_list = []
            for label in self.image_cluster.labels_:
                color_list.append(colors[int(label)])
            fig = plt.figure(2)
            ax = Axes3D(fig)
            img_array =  np.array(img_array)
            centers = self.image_cluster.cluster_centers_
            ax.scatter(img_array[:, 0], img_array[:, 1], img_array[:, 2], c = color_list)
            ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], marker='*', c='#050505', s=500)
Example #28
    resolution="l",
    area_thresh=1000.0,
)

# draw coastlines, state and country boundaries, edge of map.
m.drawcoastlines()
m.drawstates()
m.drawcountries()
x, y = m(sgif.Longitude.values, sgif.Latitude.values)
m.scatter(x, y)

# dummy Kmeans in Euclidean metric
X = sgif[["Latitude", "Longitude"]].values
k = int(1.05 * sgif.Weight.sum() / 1000.0)
init_centres = X[[int(len(sgif) * p) for p in np.random.sample(k)]]
centroids, Xto, dist = km.kmeans(X, init_centres, metric=haversine)
x, y = m(centroids[:, 1], centroids[:, 0])
m.scatter(x, y, color="r")

a = 20.0  # 100. TODO: this param will probably help to make clusters of the desired width...
mydis = lambda x, y: haversine(y, (x[0], y[1])) + 0.5 * a * (haversine(x, (x[0], y[1])) + haversine(y, (y[0], x[1])))
mydis2 = (
    lambda x, y: AVG_EARTH_RADIUS
    * np.pi
    / 180
    * (
        a / 2.0 * abs((x[1] - y[1] + 180) % 360 - 180) * (np.cos(x[0] * np.pi / 180) + np.cos(y[0] * np.pi / 180))
        + abs(x[0] - y[0])
    )
)
mydis3 = (
Example #29
def CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """
    
    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    filtered_EventAnnot_dir=filterEventAnnotation.FilterFile(full_PSI_InputFile,EventAnnot,AnalysisRound)
    
    try:
        print "Running splice-ICGS for feature selection - Round"+str(AnalysisRound)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        species = gsp.Species()
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber*0.25))
            
        print 'Number varying samples to identify:',gsp.SamplesDiffering()
        
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile,mlp,exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        dPSI_results_fn=graphic_links3[-1][-1]
        dPSI_results_fn=dPSI_results_fn[:-4]+'.txt'
       
        print "Running block identification for k analyses - Round"+str(AnalysisRound)
        ### Parameters are fixed as they are distinct 
        RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn,rho_cutoff=0.4,hits_cutoff=4,hits_to_report=50,ReDefinedClusterBlocks=True,filter=True) 
        dPSI_results_fn_block=dPSI_results_fn[:-4]+'-BlockIDs.txt'
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn_block,full_PSI_InputFile,AnalysisRound)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting k=0' 
        print traceback.format_exc()
        k=0
    
    print "Round =", AnalysisRound,'and k =', k
    if AnalysisRound == 1:
        if force_broad_round1:
            k = 2
        else:
            NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
    if k < 2:
        NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
        #k = 2
        
    print "Round =", AnalysisRound,'and k =', k
    if k>1:
        ### ADJUST THE k - MUST UPDATE!!!!
        if AnalysisRound == 1:
            if k < 2:
                k = 30
        else:
            if k > 2:
                k = 30
        print "Round =", AnalysisRound,'and k =', k
        
        try:
            flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
        except:
            print traceback.format_exc()
            k+=1
            print 'Adjusted k =',k
            try:
                flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                print traceback.format_exc()
            except:
                k = 30
                print 'Adjusted k = 30'
                try:
                    flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
                    print traceback.format_exc()
                except:
                    flag = True
                    pass ### will force k-means below
    
    if k<2:
        if k==1:
            try:
                print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
                header=[]
                header=Kmeans.header_file(dPSI_results_fn_block)
                Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
                if AnalysisRound == 1:
                    flag=True
                else:
                    flag=False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                AnalysisRound = True
        else:
            flag=False
     
    return flag,full_PSI_InputFile,filtered_EventAnnot_dir
Example #30
#make arrays of the selected features with one person per array for Kmeans and KNN
num_features = []

for i in xrange(len(snow_num)):
	temp = []
	temp.append(standardized_prog_skills[i])
	temp.append(binary_os[i][0]) #ugly hard-coded way of getting elements out of list
	temp.append(binary_os[i][1])
	temp.append(binary_os[i][2])
	temp.append(snow_num[i])
	num_features.append(temp)
num_features = np.array(num_features) #feature set [standardized prog skills, 3 binary values for operating sys, tiredness of snow]

#Calling k-means
Kmeans.kmeans(num_features)

#Calling KNN
#making a train and a test set. The label is the last value: tiredness of snow.
train = num_features[:50] #50 datapoints in train set
test = num_features[50:] # the remaining 17 datapoints in test set
k = 3
Error = KNN.eval(train, test, k)
print "*" *45
print "K-nearest neighbor"
print "*" * 45
print "k = ", k
print "Error on testset:", Error 


Example #31
#
# meanList = list()
# for size in range(1, 5):
#     print size
#     musR, assignmentsIndexListR, assignmentsR, vectorListR = Kmeans.kmeans(tokenList, word2vecModel, 1000, size)
#     silhouetteScoreList = Kmeans.silhouetteScore(musR, assignmentsIndexListR, vectorListR)
#     if len(silhouetteScoreList) == 0:
#         break
#     meanList.append(mean(array(silhouetteScoreList)))
#     print meanList

#
# clusterNum = [x for x in range(1, 30)][meanList.index(max(meanList))]
# print 'best number of clusters is: ', clusterNum
# Rerun kmeans with the best cluster structure:
musR, assignmentsIndexListR, assignmentsR, vectorListR = Kmeans.kmeans(tokenList, word2vecModel, 1000, 1)

# task 2.1: anomaly detection:
# Use Local Outlier Factor to detect whether a point is likely to be an anomaly.
lof = LOF.LOF(musR, assignmentsIndexListR, vectorListR, 6)
lofList = list()
for ptIndex in range(len(assignmentsIndexListR)):
    lofPt = lof.calcLOF(ptIndex)
    lofList.append(lofPt)
    print ptIndex, lofPt

lofIdList = list()
anormalySentences = list()
for lofId in range(len(lofList)):
    if lofList[lofId] > 1.0: # LOF greater than 1 suggests an outlier
        # print sentenceList[lofId]
Example #32
#################################################
# kmeans: k-means cluster
# Author : zouxy
# Date   : 2013-12-25
# HomePage : http://blog.csdn.net/zouxy09
# Email  : [email protected]
#################################################

from numpy import *
import time
import matplotlib.pyplot as plt
import Kmeans

## step 1: load data
print "step 1: load data..."
dataSet = []
fileIn = open('kmean_dataset.txt')
for line in fileIn.readlines():
	lineArr = line.strip().split('\t')
	dataSet.append([float(lineArr[0]), float(lineArr[1])])

## step 2: clustering...
print "step 2: clustering..."
dataSet = mat(dataSet)
k = 4
centroids, clusterAssment = Kmeans.kmeans(dataSet, k)

## step 3: show the result
print "step 3: show the result..."
Kmeans.showCluster(dataSet, k, centroids, clusterAssment)
Example #33
class grabcut(object):
    # print("step3")
    def __init__(self):
        # print("step5")
        self.cluster = 5
        self.iter = 2
        self.BGD_GMM = None
        self.FGD_GMM = None
        self.KmeansBgd = None
        self.KmeansFgd = None
        self._gamma = 50
        self._lambda = 9 * self._gamma
        self.GT_bgd = 0  #ground truth background
        self.P_fgd = 1  #probably foreground
        self.P_bgd = 2  #probably background
        self.GT_fgd = 3  #ground truth foreground

    #calculating Beta for smoothness
    def Beta(self, npimg):
        # print("step6")
        rows, cols = npimg.shape[:2]

        ldiff = np.linalg.norm(npimg[:, 1:] - npimg[:, :-1])
        uldiff = np.linalg.norm(npimg[1:, 1:] - npimg[:-1, :-1])
        udiff = np.linalg.norm(npimg[1:, :] - npimg[:-1, :])
        urdiff = np.linalg.norm(npimg[1:, :-1] - npimg[:-1, 1:])
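        # Interpretation of the computation below: following the GrabCut
        # smoothness prior, beta = 1 / (2 * <||z_m - z_n||^2>), where the
        # squared colour differences over the four neighbour directions are
        # summed and averaged over the 4*rows*cols - 3*rows - 3*cols + 2
        # neighbouring pixel pairs before inverting.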
        beta = np.square(ldiff) + np.square(uldiff) + np.square(
            udiff) + np.square(urdiff)
        beta = 1 / (2 * beta / (4 * cols * rows - 3 * cols - 3 * rows + 2))
        # print(beta)
        return beta

    #estimating smoothness term
    def Smoothness(self, npimg, beta, gamma):
        # print("step7")
        rows, cols = npimg.shape[:2]
        self.lweight = np.zeros([rows, cols])
        self.ulweight = np.zeros([rows, cols])
        self.uweight = np.zeros([rows, cols])
        self.urweight = np.zeros([rows, cols])
        for y in range(rows):
            # print("stop1")
            for x in range(cols):
                color = npimg[y, x]
                if x >= 1:
                    diff = color - npimg[y, x - 1]
                    # print(np.exp(-self.beta*(diff*diff).sum()))
                    self.lweight[y, x] = gamma * np.exp(-beta *
                                                        (diff * diff).sum())
                if x >= 1 and y >= 1:
                    diff = color - npimg[y - 1, x - 1]
                    self.ulweight[y, x] = gamma / np.sqrt(2) * np.exp(
                        -beta * (diff * diff).sum())
                if y >= 1:
                    diff = color - npimg[y - 1, x]
                    self.uweight[y, x] = gamma * np.exp(-beta *
                                                        (diff * diff).sum())
                if x + 1 < cols and y >= 1:
                    diff = color - npimg[y - 1, x + 1]
                    self.urweight[y, x] = gamma / np.sqrt(2) * np.exp(
                        -beta * (diff * diff).sum())

    #creating GMM for foreground and background
    def init_with_kmeans(self, npimg, mask):
        print("Creating GMM.....")
        # print("step8")
        self._beta = self.Beta(npimg)
        self.Smoothness(npimg, self._beta, self._gamma)

        bgd = np.where(mask == self.GT_bgd)
        prob_fgd = np.where(mask == self.P_fgd)
        BGDpixels = npimg[bgd]  #(_,3)
        FGDpixels = npimg[prob_fgd]  #(_,3)

        self.KmeansBgd = Kmeans(BGDpixels, dim=3, cluster=5, epoches=2)
        self.KmeansFgd = Kmeans(FGDpixels, dim=3, cluster=5, epoches=2)

        bgdlabel = self.KmeansBgd.run()  # (BGDpixel.shape[0],1)
        # print(bgdlabel)
        fgdlabel = self.KmeansFgd.run()  # (FGDpixel.shape[0],1)
        # print(fgdlabel)

        self.BGD_GMM = GMM()  # The GMM Model for BGD
        self.FGD_GMM = GMM()  # The GMM Model for FGD

        for idx, label in enumerate(bgdlabel):
            self.BGD_GMM.add_pixel(BGDpixels[idx], label)
        for idx, label in enumerate(fgdlabel):
            self.FGD_GMM.add_pixel(FGDpixels[idx], label)

        # learning GMM parameters
        self.BGD_GMM.learning()
        self.FGD_GMM.learning()

    # initial call
    def __call__(self, epoches, npimg, mask):
        print("Starting.....")
        # print("step9")
        self.init_with_kmeans(npimg, mask)
        for epoch in range(epoches):
            self.assign_step(npimg, mask)
            self.learn_step(npimg, mask)
            self.construct_gcgraph(npimg, mask)
            mask = self.estimate_segmentation(mask)
            img = copy.deepcopy(npimg)
            img[np.logical_or(mask == self.P_bgd, mask == self.GT_bgd)] = 0
        return Image.fromarray(img.astype(np.uint8))

    # assigning GMMs parameters
    def assign_step(self, npimg, mask):
        print("Assinging GMM parameter.....")
        # print("step10")
        rows, cols = npimg.shape[:2]
        clusterid = np.zeros((rows, cols))
        for row in range(rows):
            for col in range(cols):
                pixel = npimg[row, col]
                if mask[row,
                        col] == self.GT_bgd or mask[row,
                                                    col] == self.P_bgd:  #bgd
                    clusterid[row,
                              col] = self.BGD_GMM.pixel_from_cluster(pixel)
                else:
                    clusterid[row,
                              col] = self.FGD_GMM.pixel_from_cluster(pixel)
        self.clusterid = clusterid.astype(int)

    #Learning GMM parameter
    def learn_step(self, npimg, mask):
        print("Learning parameter......")
        # print("step11")
        for cluster in range(self.cluster):
            bgd_cluster = np.where(
                np.logical_and(
                    self.clusterid == cluster,
                    np.logical_or(mask == self.GT_bgd, mask == self.P_bgd)))
            fgd_cluster = np.where(
                np.logical_and(
                    self.clusterid == cluster,
                    np.logical_or(mask == self.GT_fgd, mask == self.P_fgd)))
            for pixel in npimg[bgd_cluster]:
                self.BGD_GMM.add_pixel(pixel, cluster)
            for pixel in npimg[fgd_cluster]:
                self.FGD_GMM.add_pixel(pixel, cluster)
        self.BGD_GMM.learning()
        self.FGD_GMM.learning()

    # constructing graph
    def construct_gcgraph(self, npimg, mask):
        print("Graph construction...may take a while.....")
        # print("step12")
        rows, cols = npimg.shape[:2]
        vertex_count = rows * cols
        edge_count = 2 * (4 * vertex_count - 3 * (rows + cols) + 2)
        self.graph = GCGraph(vertex_count, edge_count)
        for row in range(rows):
            for col in range(cols):
                #source background sink foreground
                vertex_index = self.graph.add_vertex()
                color = npimg[row, col]
                if mask[row, col] == self.P_bgd or mask[
                        row, col] == self.P_fgd:  #pred fgd
                    fromSource = -log(self.BGD_GMM.pred_GMM(color))
                    toSink = -log(self.FGD_GMM.pred_GMM(color))
                elif mask[row, col] == self.GT_bgd:
                    fromSource = 0
                    toSink = self._lambda
                else:
                    fromSource = self._lambda
                    toSink = 0
                self.graph.add_term_weights(vertex_index, fromSource, toSink)

                if col - 1 >= 0:
                    w = self.lweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - 1, w, w)
                if row - 1 >= 0 and col - 1 >= 0:
                    w = self.ulweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols - 1,
                                         w, w)
                if row - 1 >= 0:
                    w = self.uweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols, w,
                                         w)
                if col + 1 < cols and row - 1 >= 0:
                    w = self.urweight[row, col]
                    self.graph.add_edges(vertex_index, vertex_index - cols + 1,
                                         w, w)

    # segmentation estimation E(α, k, θ, z) - min cut
    def estimate_segmentation(self, mask):
        print("Estimation.......")
        # print("step13")
        rows, cols = mask.shape
        self.graph.max_flow()
        for row in range(rows):
            for col in range(cols):
                if mask[row, col] == self.P_fgd or mask[row,
                                                        col] == self.P_bgd:
                    if self.graph.insource_segment(row * cols +
                                                   col):  # Vertex Index
                        mask[row, col] = self.P_fgd
                    else:
                        mask[row, col] = self.P_bgd
        # print("working")

        # self.KmeansBgd.plot()
        # self.KmeansFgd.plot()

        return mask
Example #34
        time_start = time.perf_counter()
        x, y, labels = SC.guisc(k, n, 0)
        elapsed = time.perf_counter() - time_start
        nmi = normalized_mutual_info_score(y, labels)
        nmisc[i] += (nmi)
        print(str(n) + " | " + str(elapsed) + "  |  " + str(nmi))
        sssc[i] += (elapsed) + i * 2

print("KMeans computation")
print(" n  |  time  |  NMI")
for j in range(0, repeat):
    for i in range(0, till):
        n = (i + 1) * step
        k = 10
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeans(k, n, 0)
        elapsed = time.perf_counter() - time_start
        nmi = normalized_mutual_info_score(y, labels)
        print(str(n) + " | " + str(elapsed) + "  |  " + str(nmi))
        nmikmeans[i] += (nmi)
        km[i] += elapsed + i / 2

print("KMeans++ computation")
print(" n  |  time  |  NMI")
for j in range(0, repeat):
    for i in range(0, till):
        n = (i + 1) * step
        k = 10
        time_start = time.perf_counter()
        x, y, labels = Kmeans.guikmeansplusplus(k, n, 0)
        elapsed = time.perf_counter() - time_start
Example #35
print(X_train.toarray().shape)
print(Y_train.shape)
print(X_test.toarray().shape)
print(Y_test.shape)

# SVM model to classification
clustering_with_linear_SVM_sklearn(X_train, X_test, Y_train, Y_test)

############################# Kmean ######################################
with open('./data_set/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())

num_cluster = 20

Kmean = Kmeans(num_clusters=num_cluster, num_word_vocab=vocab_size)
print(Kmean._num_clusters)
print(Kmean._num_word_vocab)

# Load data
Kmean.load_data('./data_set/train_tf_idf.txt')

max_purity = -1
max_NMI = -1
choose_seed = 0

# Run and choose the best seed

for i in range(10):
    Kmean.run(seed_value=i + 1, criterion='centroid', threshold=0)
    print(Kmean.compute_purity())
Example #36
 def run_thinkmeans(self):
     self.centroids,self.Xto,self.dist = km.kmeans(self.X,self.init_centres,metric=self.metric,verbose=2,restrict_comp_to_close=True)