f_vectors = []
train_f_vectors = []
test_f_vectors = []
train_f_hist = []
test_f_hist = []
images = []

# Found elbow at 400-500
N_cluster = 500

# These values are based on various validations; details are given in the Report
SIFT_WEIGHT = 0.90
HIST_WEIGHT = 0.10

# The accuracy vs. performance trade-off is handled with the MiniBatchKMeans algorithm
ms = MiniBatchKMeans(n_clusters=N_cluster, max_no_improvement=3, batch_size=20000)
ms.fit(SIFT_data)
sumimages = 0
del SIFT_data

# *************************** Read every training image ***************************
list1 = os.listdir(train_folder)
for filename in list1:
    try:
        img = cv2.imread(train_folder + filename, 0)
        sumimages += 1
        if img is not None:
            sft = cv2.SIFT(300)
            kp, ds = sft.detectAndCompute(img, None)
            if len(ds) > 0:
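# A minimal sketch (assumption, not the author's exact pipeline): once the
# MiniBatchKMeans vocabulary above is trained, each image's SIFT descriptors can
# be quantized into a bag-of-visual-words histogram and blended with an intensity
# histogram using SIFT_WEIGHT / HIST_WEIGHT. The helper below and its
# normalization scheme are hypothetical illustrations, not code from the Report.
import numpy as np

def combine_features(descriptors, gray_img, kmeans, n_clusters=N_cluster):
    # Bag-of-visual-words histogram over the cluster assignments of the SIFT descriptors
    words = kmeans.predict(descriptors)
    bow_hist = np.bincount(words, minlength=n_clusters).astype(float)
    bow_hist /= bow_hist.sum() + 1e-9

    # Simple grayscale intensity histogram as the second feature block
    gray_hist, _ = np.histogram(gray_img.ravel(), bins=64, range=(0, 256))
    gray_hist = gray_hist.astype(float) / (gray_hist.sum() + 1e-9)

    # Weighted concatenation of the two feature blocks
    return np.hstack([SIFT_WEIGHT * bow_hist, HIST_WEIGHT * gray_hist])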
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                 random_state=42, tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
}
show(plot_tfidf)

# ## **K-Means Clustering**
#
# The K-means clustering objective is to minimize the average squared Euclidean
# distance of each document/description from its cluster centroid.

# In[ ]:

from sklearn.cluster import MiniBatchKMeans

num_clusters = 30  # needs to be selected wisely
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++',
                               n_init=1, init_size=1000, batch_size=1000,
                               verbose=0, max_iter=1000)

# In[ ]:

kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

# In[ ]:

sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
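# A minimal sketch (assumption, not part of the original notebook): the objective
# described above is what sklearn exposes as `inertia_`, i.e. the sum of squared
# distances of each sample to its closest centroid. It reuses `kmeans` and
# `kmeans_distances` from the cells above.
import numpy as np

manual_inertia = np.sum(np.min(kmeans_distances, axis=1) ** 2)
print(manual_inertia, kmeans.inertia_)  # the two values should agree closely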
print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() # ############################################################################# # Do the actual clustering if opts.minibatch: km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print()
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_dense_array():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42).fit(X)
    _check_fitted_model(mb_k_means)
def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42).fit(X)
    assert_equal(mb_k_means.init_size, 3 * mb_k_means.batch_size)
    _check_fitted_model(mb_k_means)
# centers = [[50, 100], [100, 200], [50, 50], [150, 150]]
centers = np.random.randint(size=(k, 2), low=20, high=200)
n_clusters = len(centers)

k_means = KMeans(init='k-means++', n_clusters=n_clusters, verbose=True)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)

mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters,
                      batch_size=batch_size, verbose=True)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)

# Plot result
fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06']

# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
def run(argv):
    dir_path = argv[1]
    outfile = argv[2]

    logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

    print('Loading title_StackOverflow.txt ... ')
    sentences = []
    with open(dir_path + 'title_StackOverflow.txt', 'r') as f:
        for line in f:
            line = line.translate(replace_punctuation)
            line = line.decode('utf-8').encode('ascii', 'ignore').lower().split()
            words = [word for word in line if word not in stoplist]
            sentences.append(' '.join(words))

    print("Extracting features from the training dataset using a sparse vectorizer")
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        min_df=2,
        # stop_words=stoplist,
        use_idf=True,
        sublinear_tf=True)
    X = vectorizer.fit_transform(sentences)
    print("n_samples: %d, n_features: %d" % X.shape)

    print("Performing dimensionality reduction using LSA")
    svd = TruncatedSVD(20)  # TODO
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print("KMeans Clustering ... ")
    MINI = True
    true_k = 35  # TODO
    if MINI:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=1)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                    n_init=1, verbose=1)
    km.fit(X)

    print("Top terms per cluster:")
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

    labels = km.labels_
    predictions = np.zeros(5000000)
    with open(dir_path + 'check_index.csv') as f:
        incsv = csv.reader(f)
        next(incsv)
        for row in incsv:
            if labels[int(row[1])] == labels[int(row[2])]:
                predictions[int(row[0])] = 1

    with open(outfile, 'wb') as f:
        f.write('ID,Ans\n')
        for i in xrange(predictions.shape[0]):
            f.write('%d,%d\n' % (i, predictions[i]))
     decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8,
                                      n_iter=100, chunk_size=3,
                                      random_state=rng),
     True),

    ('MiniBatchDictionaryLearning',
     decomposition.MiniBatchDictionaryLearning(n_atoms=15, alpha=0.1,
                                               n_iter=50, chunk_size=3,
                                               random_state=rng),
     True),

    ('Cluster centers - MiniBatchKMeans',
     MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20,
                     max_iter=50, random_state=rng),
     True)]

###############################################################################
# Plot a sample of the input data

plot_gallery("First centered Olivetti faces", faces_centered[:n_components])

###############################################################################
# Do the estimation and plot it

for name, estimator, center in estimators:
    print "Extracting the top %d %s..." % (n_components, name)
    t0 = time()
    data = faces
# Measure runtime
start_time = time.time()

n_colors = int(sys.argv[1])

# Read the image in BGR format
img = cv2.imread(pic_path)
# BGR -> RGB
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Normalize the colors so that they are between 0 and 1, and reshape the data
# into a typical scikit-learn input
X = (img / 255.0).reshape(-1, 3)

# KMeans model
model = MiniBatchKMeans(n_clusters=n_colors)
classf = model.fit_predict(X)

# The new colors are the centers of the clusters.
# Basically we create a new image where the true input color is replaced by
# the color of the closest cluster.
new_colors = model.cluster_centers_
new_image = new_colors[classf].reshape(img.shape)
new_image = (new_image * 255).astype(np.uint8)

# Save the new image
plot = plt.imshow(new_image)
# Remove axes and whitespace surrounding the image
plt.axis('off')
plt.savefig("compressed_" + pic_path.split("/")[-1], bbox_inches=0)
        if birch_model.n_clusters is None:
            ax.scatter(this_centroid[0], this_centroid[1], marker='+',
                       c='k', s=25)
    ax.set_ylim([-25, 25])
    ax.set_xlim([-25, 25])
    ax.set_autoscaley_on(False)
    ax.set_title('Birch %s' % info)

# Compute clustering with MiniBatchKMeans.
mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0)
t0 = time()
mbk.fit(X)
t_mini_batch = time() - t0
print("Time taken to run MiniBatchKMeans %0.2f seconds" % t_mini_batch)
mbk_means_labels_unique = np.unique(mbk.labels_)

ax = fig.add_subplot(1, 3, 3)
for this_centroid, k, col in zip(mbk.cluster_centers_, range(n_clusters),
                                 colors_):
    mask = mbk.labels_ == k
    ax.scatter(X[mask, 0], X[mask, 1],
print 'done in', time.time() - t, 'seconds'

dm.save_pipeline_result(features_rotated, 'featuresRotated', 'npy')

try:
    centroids = dm.load_pipeline_result('centroids', 'npy', section=100)
    n_texton = len(centroids)
except:
    t = time.time()
    print 'quantize feature vectors ...',

    n_texton = 100

    from sklearn.cluster import MiniBatchKMeans
    kmeans = MiniBatchKMeans(n_clusters=n_texton, batch_size=1000)
    kmeans.fit(features_rotated[::10])
    centroids = kmeans.cluster_centers_

    cluster_assignments = fclusterdata(centroids, 1.15, method="complete",
                                       criterion="inconsistent")
    centroids = np.array([centroids[cluster_assignments == i].mean(axis=0)
                          for i in set(cluster_assignments)])
    n_texton = len(centroids)
    print n_texton, 'reduced textons'

    print 'done in', time.time() - t, 'seconds'

    del kmeans
from gensim.models import doc2vec, word2vec
from collections import namedtuple
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import numpy as np

train = pd.read_csv("data/train.csv")
train_data = np.append(train.Context.values, train.Utterance.values)

taggedMessage = namedtuple('TaggedMessage', 'words tags')
documents = []

# Preprocess messages
for i, message in enumerate(train_data):
    # Split into lists of words
    words = message.split()
    tags = [i]
    x = taggedMessage(words, tags)
    documents.append(taggedMessage(words, tags))

d2v2 = doc2vec.Doc2Vec(documents, size=200, workers=4, iter=20)

X = []
for v in d2v2.docvecs:
    X.append(v)
X = np.array(X)

# Cluster messages using k-means
kmeans = MiniBatchKMeans().fit(X)
print 'successfully clustered'
def cluster_2_vec(
    dataset_df,
    id_df,
    downsample_ratio,
    model_name,
    k_means_method='k-means++',
    num_cluster=20,
):
    """
    :param dataset_df:
    :param id_df:
    :param k_means_method: k-means++ or random
    :param num_cluster:
    :return:
    """
    from sklearn.cluster import KMeans, MiniBatchKMeans

    dataset_x = np.asarray(dataset_df.ix[:, 1:len(dataset_df.columns) - 1].as_matrix(),
                           dtype='float32')
    dataset_x = normalize(dataset_x, axis=1)

    print '...... kmeans ......'
    mbk = MiniBatchKMeans(init=k_means_method, n_clusters=num_cluster,
                          batch_size=1000, n_init=10, max_no_improvement=10,
                          verbose=0)
    mbk.fit(dataset_x)
    dataset_df['cluster'] = mbk.labels_

    print '...... to feature vectors ......'
    train_person_ids = set(id_df['个人编码'])
    feature_vecs = []
    feature_labels = []
    for person_id in tqdm(train_person_ids):
        person_pd = dataset_df[dataset_df['个人编码'] == person_id]
        person_vec = np.zeros(num_cluster)
        person_label = np.asarray(person_pd.iloc[0][len(person_pd.columns) - 2],
                                  dtype='float32')
        # down sampling
        if int(person_label) == 0:
            if np.random.rand() <= downsample_ratio:
                for c in range(num_cluster):
                    person_vec[c] = np.sum(person_pd['cluster'] == c)
                feature_vecs.append(person_vec)
                feature_labels.append(_one_hot_encoder(int(person_label)))
        else:
            for c in range(num_cluster):
                person_vec[c] = np.sum(person_pd['cluster'] == c)
            feature_vecs.append(person_vec)
            feature_labels.append(_one_hot_encoder(int(person_label)))

    _shuffle_data(feature_vecs, feature_labels)
    feature_vecs = np.asarray(feature_vecs, dtype='float32')
    feature_labels = np.asarray(feature_labels, dtype='float32')
    print 'positive sample:%d, negative sample:%d' % (np.sum(
        feature_labels[:, 1] == 1), np.sum(feature_labels[:, 0] == 1))
    return feature_vecs, feature_labels, mbk
def qmrf_regions(data, edges, nbow=20, lamda=1, sampling='random',
                 nsamples=10000, label_potential='l1', unary_sq=True,
                 online=True, gamma=None, max_iter=5, truncated=False,
                 rng=42, verbose=True, return_centers=False,
                 return_edge_costs=True):
    with Timer('Colors'):
        if nbow == 'birch':
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow, verbose=verbose,
                                  random_state=rng, batch_size=100,
                                  max_iter=100, max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                idx = np.random.choice(data.shape[0], nsamples, replace=False)
            else:
                n = np.sqrt(nsamples)
                ratio = image.shape[0] / float(image.shape[1])
                ny = int(n * ratio)
                nx = int(n / ratio)
                y = np.linspace(0, image.shape[0], ny, endpoint=False) + (image.shape[0] // ny // 2)
                x = np.linspace(0, image.shape[1], nx, endpoint=False) + (image.shape[1] // nx // 2)
                xx, yy = np.meshgrid(x, y)
                idx = np.round(yy * image.shape[1] + xx).astype(int).flatten()
            clf.fit(data[idx])
            dist = clf.transform(data)

        if nbow == 'birch':
            centers = clf.subcluster_centers_
        else:
            centers = clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]
        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], centers.shape[0]), np.float32)
            for i in range(centers.shape[0]):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)
        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(centers[:, None, :] - centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...])**2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        elif label_potential == 'color':
            label_cost = np.zeros((centers.shape[0], centers.shape[0]), np.float32)
            for i in range(centers.shape[0]):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])
        if truncated:
            label_cost = np.maximum(1, label_cost)
        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" %
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" %
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        diff = ((data[edges[:, 0]] - data[edges[:, 1]])**2).sum(axis=1)
        if gamma is not None and type(gamma) in [int, float]:
            edge_costs = np.exp(-gamma * diff).astype(np.float32)
        elif gamma == 'auto':
            edge_costs = np.exp(-diff.mean() * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. + colordiff(data[edges[:, 0]], data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs, label_cost)

    if return_centers:
        return labels, label_cost, centers

    return labels, label_cost
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html
from sklearn.cluster import MiniBatchKMeans

#kmeans = MiniBatchKMeans(n_clusters=2,
#                         random_state=0,
#                         batch_size=100,
#                         max_iter=10).fit(X)
#print(kmeans.cluster_centers_)
#print(kmeans.labels_)
#print(kmeans.inertia_)

filename = "mm.data"
x_mm = np.memmap(filename, dtype='float32', mode='write', shape=x_train.shape)
x_mm[:] = x_train

minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10,
                                   random_state=42, verbose=2)
minibatch_kmeans.fit(x_mm)

#n_components = 2
#ipca = IncrementalPCA(n_components=n_components, batch_size=10)
#x_train_ipca = ipca.fit_transform(x_train)
#
#pca = PCA(n_components=n_components)
#x_train_pca = pca.fit_transform(X)
#
#colors = ['navy', 'turquoise', 'darkorange']
#
#for x_train_transformed, title in [(x_train_ipca, "Incremental PCA"), (x_train_pca, "PCA")]:
#    plt.figure(figsize=(8, 8))
#    for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
#        plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
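# A minimal sketch (assumption, not part of the original script): instead of
# memory-mapping the whole array, MiniBatchKMeans can also be trained
# incrementally with partial_fit, feeding one chunk at a time. The number of
# chunks is illustrative, and each chunk is assumed to contain at least
# n_clusters samples so the first call can initialize the centers.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

streaming_kmeans = MiniBatchKMeans(n_clusters=10, random_state=42)
for chunk in np.array_split(x_train, 100):   # 100 mini-batches over the data
    streaming_kmeans.partial_fit(chunk)
print(streaming_kmeans.cluster_centers_.shape)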
    )
    #plot_PCA(anomaly[numerical_features], anomaly.KMcluster, chunk_labels)
    #print (total_FW)
    return (outputDF)

# In[46]:

kmodel = MiniBatchKMeans(n_clusters=4, random_state=10, init='k-means++')
result = run_train(
    total,
    eps_=1,
    min_samples_=4,
    E=3.0,
    kmodel=kmodel,
    chunk_size=500,
    tau1=0.5,
    tau2=1.0,
    rd_threshold=0.5,
    alpha=0.5,
    fw=True
)
def test_sparse_mb_k_means_callable_init():
    def test_init(X, k, random_state):
        return centers

    mb_k_means = MiniBatchKMeans(init=test_init, random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
# Create a Zeek log reader
print('Opening Data File: {:s}'.format(args.zeek_log))
reader = zeek_log_reader.ZeekLogReader(args.zeek_log, tail=True)

# Create a Zeek IDS log live simulator
print('Opening Data File: {:s}'.format(args.zeek_log))
reader = live_simulator.LiveSimulator(args.zeek_log, eps=10)  # 10 events per second

# Create a Dataframe Cache
df_cache = dataframe_cache.DataFrameCache(max_cache_time=600)  # 10 minute cache

# Streaming Clustering Class
batch_kmeans = MiniBatchKMeans(n_clusters=5, verbose=True)

# Use the ZeekThon DataframeToMatrix class
to_matrix = dataframe_to_matrix.DataFrameToMatrix()

# Add each new row into the cache
time_delta = 10
timer = time.time() + time_delta
FIRST_TIME = True
for row in reader.readrows():
    df_cache.add_row(row)

    # Every time_delta seconds grab the dataframe from the cache
    if time.time() > timer:
        timer = time.time() + time_delta
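# A minimal sketch (assumption, not the original script's code) of what the
# truncated loop body above might do once the timer fires: convert the cached
# rows to a numeric matrix and update the streaming model incrementally.
# `cached_df` is a placeholder for whatever df_cache returns, and the
# fit_transform call assumes the DataFrameToMatrix interface used in the
# zat/ZeekThon examples.
def update_clusters(cached_df, to_matrix, batch_kmeans):
    zeek_matrix = to_matrix.fit_transform(cached_df)   # encode/normalize the dataframe
    batch_kmeans.partial_fit(zeek_matrix)              # incremental MiniBatchKMeans update
    return batch_kmeans.cluster_centers_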
def main():
    im = Image.open("a.jpg")
    pix = im.load()
    print im.size
    w = im.size[0]
    h = im.size[1]

    lst = []
    k = 0
    for i in range(0, w):
        for j in range(0, h):
            lst.append(pix[i, j])
            k = k + 1
            #print pix[i,j]," "
        #print "\n"
    print lst

    n = len(lst)
    #print len(lst)
    #d={}
    arr = np.ones(n)
    arr_list = np.array(arr).tolist()
    print arr_list
    #pixels = np.array(pix).tolist()
    #print pixels

    temp = collections.defaultdict(list)
    for p, ele in zip(lst, arr_list):
        temp[p].append(ele)
    print temp

    d = {}
    s = 0
    for key, value in temp.iteritems():
        s = add_value(value)
        d[key] = s
    print d

    d_list = []
    for key, value in d.iteritems():
        tmp = [key, value]
        d_list.append(tmp)

    d_pixel_list = []
    for key in d.iteritems():
        temp2 = [key]
        d_pixel_list.append(temp2)

    b = np.asarray(d_pixel_list)
    c = np.asarray(lst)
    #print 'd1_list'
    #print d1_list
    a = np.asarray(d_list)
    #print 'array'
    #print
    print 'a'
    print a
    print 'b'
    print b
    print 'c'
    print c
    #np.savetxt("C:\Users\Ashish Yadav\Desktop\ALL\ML\ML_Project\File.txt", a, delimiter=",", fmt='%s')

    cluster = MiniBatchKMeans(n_clusters=10)
    y = cluster.fit_predict(c)
    print 'c_new'
    print c
    print 'y'
    print y
    #print 'cluster'
    #print cluster
    np.savetxt("C:\Users\Ashish Yadav\Desktop\ALL\ML\ML_Project\Cluster.txt", y, delimiter=",", fmt='%s')
def one_kmeans(self):
    self.kmeans = MiniBatchKMeans(n_clusters=self.n_clusters,
                                  batch_size=self.batch_size,
                                  verbose=True).fit(self.data)
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
    print path
    for video in os.listdir(path):
        if video == '.DS_Store':
            continue
        print label + ": " + video
        videoPath = path + "/" + video
        videoFeatures = util.readVideoData(videoPath, 100)
        #stackOfAllFeatures.append(videoFeatures)
        print "VideoFeatures array dims:" + str(videoFeatures.shape)     # Steve add: dimensions of array
        print "StackOfAll array dims: " + str(stackOfAllFeatures.shape)  # Steve add: dimensions of array
        stackOfAllFeatures = np.vstack((stackOfAllFeatures, videoFeatures))

temp = stackOfAllFeatures[1:]

# Perform K-Means: 2500 centroids
# kmeans = MiniBatchKMeans(init="k-means++", n_clusters=2500, n_init=10, verbose=0)
kmeans = MiniBatchKMeans(init="k-means++", n_clusters=2500, n_init=10, verbose=1)  # Steve: set it verbose
kmeans.fit(temp)
vocabulary = kmeans.cluster_centers_

# Save vocabulary
file = open("Data/voc.pkl", "w")
pickle.dump(vocabulary, file)
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)
    exit(1)

mfcc_csv_file = sys.argv[1]
output_folder = sys.argv[2]

# Set a fixed random seed so we can reproduce the results
np.random.seed(11775)

print "Loading MFCC CSV file"
data = np.loadtxt(mfcc_csv_file, delimiter=";")
print "data.shape: " + str(data.shape)

for cluster_num in range(50, 501, 50):
    print "cluster_num: " + str(cluster_num)
    # kmeans = KMeans(n_clusters=cluster_num, init="k-means++", n_init=10)
    kmeans = MiniBatchKMeans(n_clusters=cluster_num, batch_size=10000,
                             init="k-means++", n_init=3)
    kmeans.fit(data)
    # print kmeans
    print "K-means inertia: " + str(kmeans.inertia_) + "\n"
    # print "k_means.cluster_centers_.shape: " + str(kmeans.cluster_centers_.shape)

    # Output the K-means model
    output_filename = output_folder + "/kmeans." + str(cluster_num) + ".model"
    output_file = open(output_filename, "wb")
    cPickle.dump(kmeans, output_file)
    output_file.close()

print "K-means model trained and output successfully!"
def test_minibatch_k_means_invalid_init():
    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)
# This simple script trains a KMeans model on a collection of csv files that should
# contain, row by row, the vectors corresponding to the SIFT features extracted from
# an image. The model is then used to find the "k" most representative features,
# which in turn are used to categorize every feature extracted from a given image.
# This allows the creation of a histogram where the relative occurrence of each of
# these features is represented by the value of each bin.

import dask.dataframe as dd
import joblib
from sklearn.cluster import MiniBatchKMeans

data_path = 'data'

# Number of clusters, i.e. the number of visual words of interest.
k = 500
model_name = 'new_KMeans_' + str(k) + '.pkl'
print(model_name)

# Use dask to lazily load all the data at once, and thus avoid overloading the memory.
df = dd.read_csv(data_path + '/*/*.csv')

# Create and train the model, using dask's lazy loading of data.
model = MiniBatchKMeans(n_clusters=k, verbose=True)
model.fit(df.values)

# Export the trained model, so it can be used in a later stage.
print("exporting")
joblib.dump(model, model_name)
print("done")
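# A minimal sketch (assumption, not part of the original script): how the exported
# model could later be used to turn one image's SIFT descriptors into the visual-word
# histogram described in the comment above. `image_descriptors` is a hypothetical
# (n_descriptors, 128) array loaded elsewhere.
import numpy as np
import joblib

model = joblib.load('new_KMeans_500.pkl')
words = model.predict(image_descriptors)                     # nearest visual word per descriptor
histogram = np.bincount(words, minlength=model.n_clusters)   # occurrence count per visual word
histogram = histogram / histogram.sum()                      # normalize so images are comparable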
# print(labels)
# hueValuePlot.plot_hue_and_value(list_of_hue, list_of_value, list_of_rgb, list_of_saturation, filename, labels)
#
f = []
for (dirpath, dirnames, filenames) in walk('images'):
    f.extend(filenames)
    break

counter = 0
for each in f:
    # name = each
    print("processing image", counter)
    counter += 1
    filename = each
    img = Image.open('images/' + filename).convert('RGB')
    c_rgb = img.getcolors()
    c_rgb = hueValuePlot.create_dict_of_color_list(c_rgb)
    img = img.convert('HSV')
    c = img.getcolors()
    list_of_hue, list_of_value, list_of_rgb, list_of_saturation = hueValuePlot.get_hue_value_from_list(c, c_rgb)

    h_list = np.array(list_of_hue)
    s_list = np.array(list_of_saturation)
    v_list = np.array(list_of_value)
    training_array = np.dstack((h_list, s_list, v_list))
    training_array = training_array.reshape(-1, 3)
    # print(training_array.shape, training_array)

    clt = MiniBatchKMeans(n_clusters=5)
    labels = clt.fit_predict(training_array)
    # print(labels)
    hueValuePlot.plot_hue_and_value(list_of_hue, list_of_value, list_of_rgb, list_of_saturation, filename, labels)
def run(train_pyramid_descriptors, D, test_pyramid_descriptors, feat_des_options):
    train_images_filenames = cPickle.load(open('train_images_filenames.dat', 'rb'))
    test_images_filenames = cPickle.load(open('test_images_filenames.dat', 'rb'))
    train_labels = cPickle.load(open('train_labels.dat', 'rb'))
    test_labels = cPickle.load(open('test_labels.dat', 'rb'))

    k = feat_des_options['k']
    codebook = MiniBatchKMeans(n_clusters=k,
                               verbose=False,
                               batch_size=k * 20,
                               compute_labels=False,
                               reassignment_ratio=10**-4,
                               random_state=42)
    codebook.fit(D)

    visual_words_pyramid = np.zeros((len(train_pyramid_descriptors),
                                     k * len(train_pyramid_descriptors[0])),
                                    dtype=np.float32)
    for i in range(len(train_pyramid_descriptors)):
        visual_words_pyramid[i, :] = spatial_pyramid_histograms(
            train_pyramid_descriptors[i], codebook, k)

    knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knn.fit(visual_words_pyramid, train_labels)

    # logreg = LogisticRegression(random_state=0, max_iter=300).fit(visual_words_pyramid, train_labels)
    # scores = cross_validate(logreg, visual_words_pyramid, train_labels,
    #                         scoring=['precision_macro', 'recall_macro', 'f1_macro'],
    #                         cv=5, return_estimator=True)
    scores = cross_validate(
        knn, visual_words_pyramid, train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8, return_estimator=True)
    cross_val_accuracy = scores['test_accuracy'].mean()
    cross_val_precision = scores['test_precision_macro'].mean()
    cross_val_recall = scores['test_recall_macro'].mean()
    cross_val_f1 = scores['test_f1_macro'].mean()
    # print("%0.2f precision with a std dev of %0.2f" % (cross_val_precision, scores['test_precision_macro'].std()))
    # print("%0.2f recall with a std dev of %0.2f" % (cross_val_recall, scores['test_recall_macro'].std()))
    # print("%0.2f F1-score with a std dev of %0.2f" % (cross_val_f1, scores['test_f1_macro'].std()))

    visual_words_test = np.zeros((len(test_images_filenames),
                                  visual_words_pyramid.shape[1]),
                                 dtype=np.float32)
    for i in range(len(test_images_filenames)):
        visual_words_test[i, :] = spatial_pyramid_histograms(
            test_pyramid_descriptors[i], codebook, k)

    test_accuracy = 100 * knn.score(visual_words_test, test_labels)
    # print("Test accuracy: %0.2f" % (test_accuracy))
    test_prediction = knn.predict(visual_words_test)
    # test_prediction = logreg.predict(visual_words_test)
    test_precision, test_recall, test_fscore, _ = precision_recall_fscore_support(
        test_labels, test_prediction, average='macro')
    # print("%0.2f precision" % (test_precision))
    # print("%0.2f recall" % (test_recall))
    # print("%0.2f F1-score" % (test_fscore))

    # pca = PCA(n_components=64)
    pca = PCA(n_components=feat_des_options['pca_perc'], svd_solver='full')
    VWpca = pca.fit_transform(visual_words_pyramid)
    knnpca = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnpca.fit(VWpca, train_labels)
    vwtestpca = pca.transform(visual_words_test)
    pca_test_accuracy = 100 * knnpca.score(vwtestpca, test_labels)
    # print("PCA Test accuracy: %0.2f" % (pca_test_accuracy))

    scores_pca = cross_validate(
        knnpca, visual_words_pyramid, train_labels,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        cv=8, return_estimator=True)
    cross_val_accuracy_pca = scores_pca['test_accuracy'].mean()
    cross_val_precision_pca = scores_pca['test_precision_macro'].mean()
    cross_val_recall_pca = scores_pca['test_recall_macro'].mean()
    cross_val_f1_pca = scores_pca['test_f1_macro'].mean()

    lda = LinearDiscriminantAnalysis(n_components=7)
    VWlda = lda.fit_transform(visual_words_pyramid, train_labels)
    knnlda = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='euclidean')
    knnlda.fit(VWlda, train_labels)
    vwtestlda = lda.transform(visual_words_test)
    lda_test_accuracy = 100 * knnlda.score(vwtestlda, test_labels)
    # print("LDA Test accuracy: %0.2f" % (lda_test_accuracy))

    return [
        cross_val_accuracy, cross_val_precision, cross_val_recall, cross_val_f1,
        test_precision, test_recall, test_fscore, test_accuracy,
        pca_test_accuracy, cross_val_accuracy_pca, cross_val_precision_pca,
        cross_val_recall_pca, cross_val_f1_pca, lda_test_accuracy
    ]