def init_with_kmeans(self, npimg, mask):
    """Seed the background and foreground GMMs from K-means cluster labels.

    Stores ``self.Beta(npimg)`` in ``self._beta`` and calls
    ``self.Smoothness(npimg, self._beta, self._gamma)``.  The pixels where
    ``mask == self.GT_bgd`` and where ``mask == self.P_fgd`` are then
    clustered separately (``Kmeans(..., dim=3, cluster=5, epoches=2)``),
    each pixel is added to the matching GMM under its cluster label, and
    both GMMs are fitted with ``learning()``.

    Args:
        npimg: Image array indexed with the mask positions; presumably
            (H, W, 3) so that the selected pixels are (_, 3) -- confirm.
        mask: Array compared element-wise against ``self.GT_bgd`` and
            ``self.P_fgd``.
    """
    print("Creating GMM.....")

    self._beta = self.Beta(npimg)
    self.Smoothness(npimg, self._beta, self._gamma)

    # Pixels selected by mask value.
    background_pixels = npimg[np.where(mask == self.GT_bgd)]
    foreground_pixels = npimg[np.where(mask == self.P_fgd)]

    # Build both clusterers first, then run background before foreground.
    self.KmeansBgd = Kmeans(background_pixels, dim=3, cluster=5, epoches=2)
    self.KmeansFgd = Kmeans(foreground_pixels, dim=3, cluster=5, epoches=2)
    background_labels = self.KmeansBgd.run()
    foreground_labels = self.KmeansFgd.run()

    self.BGD_GMM = GMM()
    self.FGD_GMM = GMM()

    # Feed every pixel to its model under the label K-means gave it.
    assignments = (
        (self.BGD_GMM, background_pixels, background_labels),
        (self.FGD_GMM, foreground_pixels, foreground_labels),
    )
    for model, pixels, labels in assignments:
        for position, component in enumerate(labels):
            model.add_pixel(pixels[position], component)

    # Estimate the GMM parameters from the collected pixels.
    self.BGD_GMM.learning()
    self.FGD_GMM.learning()
def train(self, X_train, y_train, learning_rate=0.5, reg=1e-3,
          num_iters=100, batch_size=200, print_progress=False):
    """Cluster the data, train the linear SVM on extended features, and
    derive the global and per-cluster weights.

    Inputs:
    - X_train: A PyTorch tensor of shape (N, D) containing training data.
    - y_train: Training labels passed through to ``self.LSVM.train``.
    - learning_rate: (float) forwarded to ``self.LSVM.train``.
    - reg: (float) regularization strength, forwarded to ``self.LSVM.train``.
    - num_iters: (integer) forwarded to ``self.LSVM.train``.
    - batch_size: accepted but not used by this method.
    - print_progress: accepted but not used by this method.

    The number of clusters and the global regularization factor come from
    ``self.K`` and ``self.lamb``, not from arguments.

    Side effects: sets ``self.centroid``, ``self.W`` (shape follows
    ``W_hat[:D]``) and ``self.Wl`` (a (D, K) tensor).

    Returns:
    - Whatever ``self.LSVM.train`` returns (presumably the loss history).
    """
    _, D = X_train.shape

    # Clustering: keep the centroids on the instance.
    cluster_label, centroid = Kmeans(X_train, self.K)
    self.centroid = centroid

    # Feature extension followed by the linear SVM fit.
    extended_features = self.feature_extension(X_train, cluster_label)
    loss_hist = self.LSVM.train(
        extended_features,
        y_train,
        reg=reg,
        num_iters=num_iters,
        learning_rate=learning_rate,
    )

    # SVM parameters, moved to the dtype/device of the training data.
    W_hat = torch.tensor(self.LSVM.W, dtype=X_train.dtype, device=X_train.device)

    # Global weights: first D entries scaled by 1 / sqrt(lamb).
    scale = 1 / np.sqrt(self.lamb)
    self.W = scale * W_hat[:D]

    # Local predictors: the (k + 1)-th slice of D entries plus the global weights.
    self.Wl = torch.zeros(D, self.K, dtype=X_train.dtype, device=X_train.device)
    for cluster_idx in range(self.K):
        start = D * (cluster_idx + 1)
        stop = D * (cluster_idx + 2)
        self.Wl[:, cluster_idx] = W_hat[start:stop] + self.W

    return loss_hist
def run():
    """Load ``results_short.csv`` and train a 2-cluster Kmeans model with
    graphing enabled."""
    # Two-feature dataset, used for testing graphs and visualizations
    # (the four-feature alternative is "results4-feat.csv").
    frame = pd.read_csv("results_short.csv")
    Kmeans.Kmeans(k=2, data=frame).train(show_graph=True)
def plot_distances(data, max_val, min_val=2):
    """Plot an elbow curve of the training result against the cluster count.

    For every cluster count from ``min_val`` to ``max_val`` (inclusive) a
    ``Kmeans.Kmeans`` model is trained with graphs disabled, and the value
    returned by ``train`` (labelled "Total Sum" on the plot; presumably the
    total within-cluster distance) is plotted against that count.

    Args:
        data: Dataset passed unchanged to ``Kmeans.Kmeans``.
        max_val: Largest number of clusters to try (inclusive).
        min_val: Smallest number of clusters to try (inclusive).
    """
    # Use the real cluster counts for the x-axis so the plot stays correct
    # when min_val is not 2.
    cluster_counts = list(range(min_val, max_val + 1))
    distances = [
        Kmeans.Kmeans(count, data).train(show_graph=False)
        for count in cluster_counts
    ]

    plt.plot(cluster_counts, distances)
    plt.xlabel("Number of clusters")
    plt.ylabel("Total Sum")
    plt.title("Elbow Method")
    plt.show()
def kmeans(self, trainset, testset, k, k_for_cluster, isClassification):
    """Run KNN on the test set using a K-means-reduced training set.

    Builds ``Kmeans.Kmeans(k_for_cluster, trainset)``, takes its
    ``getClusters()`` result (presumably the centroids) restricted to the
    columns of ``testset``, and passes its values to ``Knn.Knn().fit``.

    Returns:
        A pair ``(predicted, actual)`` where ``actual`` is the last column
        of ``testset``.
    """
    clusterer = Kmeans.Kmeans(k_for_cluster, trainset)
    reduced_train = clusterer.getClusters()[testset.columns]

    # KNN runs against the reduced training set instead of the full one.
    classifier = Knn.Knn()
    predictions = classifier.fit(reduced_train.values, testset, k, isClassification)

    actual = testset.iloc[:, -1]
    return predictions, actual
def SegmentImages(trainDataPath, trainGroundTruth):
    """Cluster every training image and score it against its ground truths.

    For each ``*.jpg`` in ``trainDataPath`` the image is clustered with
    ``Kmeans.Kmeans(img, 3)`` and the label image is shown.  The matching
    ``<image name>.mat`` file is loaded from ``trainGroundTruth``; the
    original image and every ground-truth segmentation are shown side by
    side, and for each ground truth the contingency table of labels versus
    ground truth is scored with ``Kmeans.getFScore`` and
    ``Kmeans.getConditionalEntropy``.  Results are printed.

    Args:
        trainDataPath: Directory containing the ``.jpg`` training images.
        trainGroundTruth: Directory containing the ``.mat`` ground-truth files.
    """
    # os.path.join keeps the lookup portable instead of hard-coding "\\".
    for filename in glob.glob(os.path.join(trainDataPath, "*.jpg")):
        # Read the training image and cluster it.
        img = mpimg.imread(filename, format="jpg")
        labels, _ = Kmeans.Kmeans(img, 3)
        print("Image After Clustering ")
        plt.imshow(labels)
        plt.show()
        # Flatten with -1 so images of any size work (the previous code
        # hard-coded 154401 elements).
        labelsAs1D = np.reshape(labels, -1)

        # Load the ground truth that shares the image's base name.
        imageName, _ = os.path.splitext(os.path.basename(filename))
        mat = scipy.io.loadmat(os.path.join(trainGroundTruth, imageName + ".mat"))
        groundTruths = mat['groundTruth'][0]
        numberOfImages = len(groundTruths)

        # Original image first, then every ground-truth segmentation.
        fig, ax = plt.subplots(1, numberOfImages + 1)
        ax[0].imshow(img)
        for k in range(numberOfImages):
            ax[k + 1].imshow(groundTruths[k][0][0][0])
        plt.show()

        for i in range(numberOfImages):
            groundTruthAs1D = np.reshape(groundTruths[i][0][0][0], -1)
            # Contingency table (labels x ground truth) as a NumPy array.
            matrix = pd.crosstab(labelsAs1D, groundTruthAs1D,
                                 rownames=['labels'], colnames=['img']).values
            fScore = Kmeans.getFScore(matrix)
            conditionalEntropy = Kmeans.getConditionalEntropy(matrix)
            print(f"Scores against groundTruth image {i}:")
            print("fScore is ", fScore)
            print("conditionalEntropy ", conditionalEntropy)
            print("\n\n")
# Report the shapes of the train/test features and labels.
# NOTE(review): X_train / X_test expose .toarray(), so they are presumably
# sparse matrices defined earlier in the script -- confirm.
print(X_train.toarray().shape)
print(Y_train.shape)
print(X_test.toarray().shape)
print(Y_test.shape)

# Classification with the linear SVM helper.
clustering_with_linear_SVM_sklearn(X_train, X_test, Y_train, Y_test)

############################# Kmean ######################################
# The vocabulary size is the number of lines in the words/idf file.
with open('./data_set/words_idfs.txt') as f:
    vocab_size = len(f.read().splitlines())
num_cluster = 20
Kmean = Kmeans(num_clusters=num_cluster, num_word_vocab=vocab_size)
print(Kmean._num_clusters)
print(Kmean._num_word_vocab)

# Load data
Kmean.load_data('./data_set/train_tf_idf.txt')

# NOTE(review): these three trackers are not updated in the visible part of
# the script; presumably code that follows uses them to keep the best seed.
max_purity = -1
max_NMI = -1
choose_seed = 0

# Run K-means with seeds 1..10 and print the purity of each run.
for i in range(10):
    Kmean.run(seed_value=i + 1, criterion='centroid', threshold=0)
    print(Kmean.compute_purity())
def main():
    """Train the clustering tree, classify the test records and report results.

    Command line: ``<script> <train_data_path> <test_data_path>``.  When
    ``sys.argv`` does not hold exactly three items, the default data file
    names below are used.  Progress messages are printed and also written
    to ``LOG_FILE``; per-record predictions and the final summary go to
    ``RESULT_FILE``.

    NOTE(review): the nesting of the ``with redirection(...)`` blocks was
    reconstructed from whitespace-collapsed source -- confirm against the
    original file.
    """
    try:
        _, train_data_path, test_data_path = sys.argv
    except ValueError:
        # Wrong number of arguments: fall back to the default data files.
        train_data_path = 'kddcup.data_10_percent_datatreat'
        test_data_path = 'corrected_datatreat'

    """train"""
    cluster_tree = ClusterTree()
    km = Kmeans(tree=cluster_tree, kid=Kmeans.KMEANS_ID, level=1, num_dimensions=MAX_ATTRIBUTES + 1)
    km.readTrainData(train_data_path)
    k_value = MAX_LABELS
    # Mode 'w' here (later writes use 'a'); presumably this starts a fresh log file.
    with redirection(LOG_FILE, 'w'):
        print("Init K-value = ", k_value)
    km.runKmeans(k_value)
    print(format_msg('*', "Total Clustering process finished !"))
    with redirection(LOG_FILE, 'a'):
        print(format_msg('*', "Total Clustering process finished !"))
    cluster_tree.printLog()

    """test"""
    print(format_msg('*', "Start classify the test records"))
    with redirection(LOG_FILE, 'a'):
        print(format_msg('*', "Start classify the test records"))
    reader = test_reader(test_data_path)
    cfs_matrix = ConfuseMatrix()
    right_rcd_mun = 0  # number of records whose predicted label matched
    test_rcd_mun = 0  # number of records classified so far
    # Mode 'w' here (later writes use 'a'); presumably this starts a fresh result file.
    with redirection(RESULT_FILE, 'w'):
        print(format_msg('*', "Classification result"))
    fmt = "True Label = {} Pre Label = {} Cluster Path = {}"
    for record in reader:
        predict = cluster_tree.findNearestCluster(record)
        if record.label == predict.getClusterNodeLabel():
            right_rcd_mun += 1
        cfs_matrix.update(record.label, predict.getClusterNodeLabel())
        with redirection(RESULT_FILE, 'a'):
            print(fmt.format(LABEL_NAMES[record.label], LABEL_NAMES[predict.getClusterNodeLabel()], predict.strPath))
        test_rcd_mun += 1
        # Progress message every 10000 records, on screen and in the log.
        if test_rcd_mun % 10000 == 0:
            print("{} records have been done ...".format(test_rcd_mun))
            with redirection(LOG_FILE, 'a'):
                print("{} records have been done ...".format(test_rcd_mun))
    print(format_msg('*', "The process of classifying test records finished !"))
    with redirection(LOG_FILE, 'a'):
        print(format_msg('*', "The process of classifying test records finished !"))

    print(format_msg('=', "Classify Result"))
    fmt = "Total test record = {} Right label record = {} Right Rate = {}"
    # NOTE(review): raises ZeroDivisionError when the reader yields no records.
    print(fmt.format(test_rcd_mun, right_rcd_mun, right_rcd_mun / test_rcd_mun))
    with redirection(RESULT_FILE, 'a'):
        print(format_msg('=', "Classify Result"))
        print(fmt.format(test_rcd_mun, right_rcd_mun, right_rcd_mun / test_rcd_mun))
    cfs_matrix.print()
    cfs_matrix.printLog()
import Kmeans


def iris_f(nome_arq):
    """Read the first two comma-separated columns of a file as float pairs.

    Args:
        nome_arq: Path of the text file to read.

    Returns:
        A list of ``[float, float]`` pairs, one per non-blank line.

    Raises:
        ValueError: If one of the first two fields is not a number.
        IndexError: If a non-blank line has fewer than two fields.
    """
    ret_list = []
    # The with-block closes the file even when a line fails to parse.
    with open(nome_arq, 'r') as data:
        for line in data:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            aux = line.split(',')
            ret_list.append([float(aux[0]), float(aux[1])])
    return ret_list


# Cluster the points read from iris.txt into 3 groups and show the result.
lista_pon = iris_f('iris.txt')
clusters = Kmeans.Kmeans(3, lista_pon)
print(clusters)
import numpy as np
import matplotlib.image as mpimg
import Kmeans

try:
    import joblib
except ImportError:
    # Older scikit-learn releases vendored joblib under sklearn.externals;
    # that module was removed from newer releases.
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles the file, so only load model files
# from a trusted source.
classifier = joblib.load('knnModel.pkl')
print(classifier)

img = mpimg.imread("./flower_images/0002.jpg")

# Kmeans.Kmeans(img, 2, 5) must yield exactly three values, which become
# the single feature row handed to the classifier.
newFeatures = np.zeros((1, 3))
newFeatures[0][0], newFeatures[0][1], newFeatures[0][2] = Kmeans.Kmeans(
    img, 2, 5)
print("Image Features : ", newFeatures)

y_pred = classifier.predict(newFeatures)
print("Predicted Flower Class : ", y_pred[0])
from utils import *
from SVM import *
from Kmeans import *

# Test update_centroid_of
member1 = Member(label=1, doc_id=1, r_d=[1, 0])
member2 = Member(label=1, doc_id=1, r_d=[0, 0])
member3 = Member(label=1, doc_id=1, r_d=[0, 1])
member4 = Member(label=1, doc_id=1, r_d=[1, 1])
# Keep the instance under its own name: rebinding `Cluster` to an instance
# would make the later `Cluster()` calls fail.
test_cluster = Cluster()
test_cluster.add_member(member1)
test_cluster.add_member(member2)
test_cluster.add_member(member3)
test_cluster.add_member(member4)
Kmean = Kmeans(num_clusters=3, num_word_vocab=2)
Kmean.update_centroid_of(test_cluster)
print(test_cluster._centroid)  # expected: [0.5, 0.5]

# Test random init
Kmean = Kmeans(num_clusters=3, num_word_vocab=2)
Kmean.random_init(1)
for cluster in Kmean._clusters:
    print(cluster._centroid)

# Test select_cluster_for
Kmean = Kmeans(num_clusters=3, num_word_vocab=2)
Cluster1 = Cluster()
Cluster1._centroid = [0, 0]
Cluster2 = Cluster()
Cluster2._centroid = [2, 0]
crc = np.load('files/pred_crcrate.npy') # crc.sort(axis=0) # crc = np.append(crc, 2 * crc[-1] - crc[0]) ti = np.load('files/pred_timeinterval.npy') # ti.sort(axis=0) # ti = np.append(ti, 2 * ti[-1] - ti[0]) # # np.save('files/pred_crcrate.npy', crc) # np.save('files/pred_timeinterval.npy', ti) # assert False result = d.load_data()[:, d.binary_result].astype(int) disc_result = [] for i in range(10, 50, 10): print 'pid' kmeans_pid = Kmeans(raw_pid, None, k=i) pid = kmeans_pid.calc(None, 20, 2000) for j in range(10, 50, 5): print 'pressure measurement' pm = discrete_plus(pressure_measurement, j, 0.9) for k in range(10, 50, 5): # before = time.time() print 'set point' sp = discrete_plus(setpoint, k, 0.9) data_str = init.signature_all(d, crc, ti, pid, pm, sp) count = True features_normal = [] for r in range(data_str.shape[0]): if result[r] == 0: if data_str[r] not in features_normal:
import Kmeans as km
import numpy as np

if __name__ == "__main__":
    # Six rows of five consecutive integers (1..30); first three rows are
    # labelled 0 and the last three 1.
    data = np.arange(1, 31).reshape(6, 5)
    label = np.repeat([0, 1], 3)

    # NOTE(review): kind=2 / rowsam=True presumably mean "two clusters" and
    # "rows are samples" -- confirm against the Kmeans module.
    kmeans = km.Kmeans(data, kind=2, rowsam=True)

    # Print the clustering result, then its accuracy against the labels.
    res = kmeans.cluster()
    print("聚类结果为 res = ", res)
    acc = kmeans.accuracy(label)
    print("聚类准确度为 acc = ", acc)