def execute_task5(request):
    num_layers = int(request.POST.get('number_of_layers'))
    num_hashes = int(request.POST.get('number_of_hashes_per_layer'))
    lsh = LSH(k=num_hashes, l=num_layers)
    dbconnection = DatabaseConnection()

    # Reuse cached HOG features if a pickle already exists.
    all_image_hog_features = read_from_pickle('all_img_features_LSH.pickle')
    if all_image_hog_features is None:
        all_image_hog_features = dbconnection.get_object_feature_matrix_from_db(
            tablename='histogram_of_gradients')
        save_to_pickle(all_image_hog_features, 'all_img_features_LSH.pickle')

    # SVD on HOG features (cached for the same reason).
    svd_obj = read_from_pickle('svd_hog_lsh.pickle')
    if svd_obj is not None:
        transformed_data = svd_obj['data_matrix']
        vt = svd_obj['vt']
    else:
        svd = SingularValueDecomposition()
        transformed_data, vt = svd.get_transformed_data_copy(
            all_image_hog_features['data_matrix'], 400)
        save_to_pickle(
            {"data_matrix": transformed_data,
             "images": all_image_hog_features['images'],
             "vt": vt},
            'svd_hog_lsh.pickle')

    # index_of_query_image = (all_image_hog_features['images']).index(query_image)
    # image_vector = transformed_data[index_of_query_image]
    lsh.generate_representation_for_all_layers(transformed_data,
                                               all_image_hog_features['images'])
    save_to_pickle(lsh, 'lsh_model')
    return render(request, 'task5a_output.html')
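# The LSH class itself is not shown in this excerpt. For reference, here is a
# minimal sketch of the standard random-hyperplane LSH scheme such a class
# typically implements; the class name SimpleLSH, its layout, and all
# parameters are illustrative assumptions, not the project's actual API.
import numpy as np

class SimpleLSH:
    """Illustrative random-hyperplane LSH: l layers of k signed projections."""

    def __init__(self, k, l, dim, seed=0):
        rng = np.random.default_rng(seed)
        # One (k x dim) hyperplane matrix per layer.
        self.planes = [rng.standard_normal((k, dim)) for _ in range(l)]
        self.tables = [dict() for _ in range(l)]

    def _hash(self, layer, vector):
        # The sign of each projection contributes one bit of the bucket key.
        bits = (self.planes[layer] @ vector) > 0
        return bits.tobytes()

    def index(self, vectors, names):
        for name, vec in zip(names, vectors):
            for layer, table in enumerate(self.tables):
                table.setdefault(self._hash(layer, vec), []).append(name)

    def candidates(self, vector):
        # Union of the query's buckets across all layers.
        out = set()
        for layer, table in enumerate(self.tables):
            out.update(table.get(self._hash(layer, vector), []))
        return out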
def collect_global_stats():
    path = '../data/dates/'
    # Lazy per-folder streams of queries (record index 2) and their normalised
    # counterparts (record index 3); each map object can be consumed only once.
    files = [
        map(lambda x: x[2], read_from_pickle(f'{path}{folder}/processed'))
        for folder in os.listdir(path)
    ]
    files_n = [
        map(lambda x: x[3], read_from_pickle(f'{path}{folder}/processed'))
        for folder in os.listdir(path)
    ]

    hoq = HistogramOfQueries('../data/global_stats/hoq')
    hot = HistogramOfTokens('../data/global_stats/hot')
    for query in tqdm(itertools.chain(*files)):
        hoq.add_doc(query)
        hot.add_doc(query)
    hoq.save()
    hot.save()

    hoq = HistogramOfQueries('../data/global_stats/hoq_n')
    hot = HistogramOfTokens('../data/global_stats/hot_n')
    for query in tqdm(itertools.chain(*files_n)):
        hoq.add_doc(query)
        hot.add_doc(query)
    hoq.save()
    hot.save()
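# HistogramOfQueries and HistogramOfTokens are defined elsewhere. A minimal
# sketch of what such a frequency accumulator might look like, assuming a
# Counter-backed store and a JSON save; the name QueryHistogram and the file
# layout are illustrative, not the project's actual classes.
import json
from collections import Counter

class QueryHistogram:
    """Illustrative frequency accumulator in the spirit of HistogramOfQueries."""

    def __init__(self, out_path):
        self.out_path = out_path
        self.counts = Counter()

    def add_doc(self, doc):
        self.counts[doc] += 1

    def save(self):
        with open(self.out_path, 'w') as f:
            json.dump(self.counts.most_common(), f)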
def load_data(self):
    print('loading {}-{} features'.format(self.dataset_name, self.cnn_name))
    self.train_data_ids = utils.read_file_to_list(self.train_data_ids_path)
    self.val_data_ids = utils.read_file_to_list(self.val_data_ids_path)
    self.test_data_ids = utils.read_file_to_list(self.test_data_ids_path)
    utils.shuffle_array(self.train_data_ids)
    utils.shuffle_array(self.val_data_ids)
    utils.shuffle_array(self.test_data_ids)

    self.train_data_ids = self.train_data_ids[:1]  # ONLY FOR DEBUG - REMOVE
    self.val_data_ids = self.val_data_ids[:1]      # ONLY FOR DEBUG - REMOVE
    self.test_data_ids = self.test_data_ids[:1]    # ONLY FOR DEBUG - REMOVE

    self.train_caps = utils.read_from_json(self.train_caps_path)
    self.val_caps = utils.read_from_json(self.val_caps_path)
    self.test_caps = utils.read_from_json(self.test_caps_path)
    self.vocab = utils.read_from_json(self.vocab_path)
    self.reverse_vocab = utils.read_from_pickle(self.reverse_vocab_path)
    self.vocab_size = len(self.vocab)

    # Feature dimensionality depends on the CNN backbone.
    if self.cnn_name in ['ResNet50', 'ResNet152', 'InceptionV3']:
        self.ctx_dim = 2048
    elif self.cnn_name in ['MURALI']:
        self.ctx_dim = 1024
    elif self.cnn_name in ['VGG19']:
        self.ctx_dim = 512
    else:
        raise NotImplementedError()

    self.train_ids = self.get_vid_ids(self.train_data_ids)
    self.val_ids = self.get_vid_ids(self.val_data_ids)
    self.test_ids = self.get_vid_ids(self.test_data_ids)
    self.kf_train = utils.generate_minibatch_idx(len(self.train_data_ids), self.mb_size_train)
    self.kf_val = utils.generate_minibatch_idx(len(self.val_data_ids), self.mb_size_test)  # TODO - verify test or val
    self.kf_test = utils.generate_minibatch_idx(len(self.test_data_ids), self.mb_size_test)
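# utils.generate_minibatch_idx is assumed to chunk range(n) into shuffled
# index lists of at most minibatch_size each. A plausible sketch under that
# assumption, not the repo's actual implementation:
import random

def generate_minibatch_idx(dataset_size, minibatch_size):
    """Split [0, dataset_size) into shuffled chunks of at most minibatch_size."""
    indices = list(range(dataset_size))
    random.shuffle(indices)
    return [indices[i:i + minibatch_size]
            for i in range(0, dataset_size, minibatch_size)]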
def get_DTC_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
    q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items,
                                          irrel_items=irl_items)
    X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items,
                                                    irl_items=irl_items)

    # Train the decision-tree classifier on the labelled feedback.
    dtl = decision_tree_learning.DecisionTreeLearning()
    dtl.fit(X=X_train, y=Y_train)

    # Classify the larger candidate set retrieved via the LSH index.
    test_dataset = read_from_pickle('test_dataset.pickle')
    X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
    Y_pred = dtl.predict(u=X_test)

    relevant_pred_img_names = [imageNames[i] for i in range(len(Y_pred))
                               if Y_pred[i] == 1]
    # If fewer than m images were predicted relevant, pad the list with
    # images predicted irrelevant until it reaches m.
    if len(relevant_pred_img_names) < m:
        irr_image_names = [imageNames[i] for i in range(len(Y_pred))
                           if Y_pred[i] == -1]
        relevant_pred_img_names.extend(
            irr_image_names[:m - len(relevant_pred_img_names)])

    new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
        image_ids=relevant_pred_img_names)
    new_rank_list = get_most_m_similar_images(
        data_with_images=new_obj_feature_matrix,
        query_image_feature_vector=q_new, m=m)
    return new_rank_list
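# compute_new_query_vector is not shown; it presumably performs Rocchio-style
# query refinement. A minimal sketch of the standard Rocchio update, with the
# weights alpha, beta, gamma as illustrative defaults (not the project's
# actual values):
import numpy as np

def rocchio_update(q_old, relevant_vecs, irrelevant_vecs,
                   alpha=1.0, beta=0.75, gamma=0.15):
    """Rocchio update: move the query toward the centroid of relevant
    vectors and away from the centroid of irrelevant ones."""
    q_new = alpha * np.asarray(q_old, dtype=float)
    if len(relevant_vecs):
        q_new += beta * np.mean(relevant_vecs, axis=0)
    if len(irrelevant_vecs):
        q_new -= gamma * np.mean(irrelevant_vecs, axis=0)
    return q_new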
def get_hsv_std_values():
    """
    Import and return the stored HSV value object from
    '/pickle_files/hsv.pickle'. If no such file exists, return an HSV
    object with default values.

    :return: the unpickled HSV object, or a default-constructed HSV
    """
    hsv_std_values = utils.read_from_pickle(HSV_PICKLE_PATH)
    if not hsv_std_values:
        return HSV()
    return hsv_std_values
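# Several snippets here (execute_task5, execute_task6, process_feedback, and
# this one) rely on read_from_pickle/save_to_pickle helpers that return None
# when the file is missing. A plausible sketch assuming plain pickle files on
# disk; the actual helpers may differ:
import os
import pickle

def save_to_pickle(obj, file_path):
    """Serialise obj to file_path with pickle."""
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)

def read_from_pickle(file_path):
    """Return the unpickled object, or None if the file does not exist."""
    if not os.path.exists(file_path):
        return None
    with open(file_path, 'rb') as f:
        return pickle.load(f)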
def get_PPR_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
    q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items,
                                          irrel_items=irl_items)

    # Rebuild the feature matrix for the candidate set saved by task 6.
    topology_images = read_from_pickle('test_dataset.pickle')
    candidate_names = get_image_names_from_tuples(topology_images)
    db_conn = DatabaseConnection()
    data_image_dict = db_conn.HOG_descriptor_from_image_ids(candidate_names)
    data_matrix = data_image_dict['data_matrix']
    image_names = data_image_dict['images']

    svd_obj = SingularValueDecomposition()
    svd_image_data = svd_obj.get_transformed_data(data_matrix, 8)  # change this for 11K images

    # Personalized PageRank over the image-similarity graph, seeded with the
    # relevant/irrelevant feedback.
    pg_obj = PageRank()
    image_similarity_matrix = pg_obj.get_image_similarity_matrix_for_top_k_images(6, svd_image_data)
    seed_vector = pg_obj.get_seed_vector(rel_items, image_names, irl_items)
    pie = pg_obj.get_page_rank_eigen_vector(image_similarity_matrix, seed_vector)
    new_rank_list = pg_obj.get_top_K_images_based_on_scores(pie, image_names, m)
    return new_rank_list
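# get_page_rank_eigen_vector is assumed to run personalized PageRank, i.e.
# power-iterate pi = (1 - alpha) * s + alpha * M @ pi with restart vector s.
# A minimal sketch under that assumption; alpha, the tolerance, and the
# nonnegative-seed assumption are all illustrative:
import numpy as np

def personalized_page_rank(similarity_matrix, seed_vector,
                           alpha=0.85, tol=1e-8, max_iter=1000):
    """Power iteration for pi = (1 - alpha) * s + alpha * M @ pi, where M is
    the column-normalised similarity matrix and s the seed vector."""
    M = np.asarray(similarity_matrix, dtype=float)
    M = M / np.maximum(M.sum(axis=0, keepdims=True), 1e-12)  # column-stochastic
    s = np.asarray(seed_vector, dtype=float)
    s = s / s.sum()
    pi = s.copy()
    for _ in range(max_iter):
        pi_next = (1 - alpha) * s + alpha * (M @ pi)
        if np.linalg.norm(pi_next - pi, 1) < tol:
            return pi_next
        pi = pi_next
    return pi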
def __init__(self, cur_dir, dataset_path, executable_path):
    """
    Args:
        cur_dir: Working directory (for LKH-3 files)
        dataset_path: Path to graph data
        executable_path: Path to LKH-3 executable (LKH file)
    """
    print('This class was written and tested for Unix systems only')
    self.platform = sys.platform
    self.dir = cur_dir
    print('Creating directory ', self.dir)
    os.makedirs(self.dir, exist_ok=True)
    print('Loading validation dataset ', dataset_path)
    self.val_data = read_from_pickle(dataset_path, return_tf_data_set=False)

    self.problem_files = []
    self.tour_files = []
    self.params_files = []
    self.executable = executable_path
    self.depot_list = []
    self.loc_list = []
    self.demands_list = []
    self.tour_list = []

    # Params for LKH-3
    self.runs = 1
    self.seed = 1234
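# The class accumulates problem/parameter/tour file paths that the LKH-3
# executable consumes. For reference, a minimal sketch of writing an LKH-3
# parameter file for one instance; the keys (PROBLEM_FILE, RUNS, SEED,
# TOUR_FILE) follow LKH's documented parameter format, while the helper name
# and paths are illustrative:
def write_lkh_params(params_path, problem_path, tour_path, runs=1, seed=1234):
    """Write a minimal LKH-3 parameter file pointing at one problem instance."""
    lines = [
        f'PROBLEM_FILE = {problem_path}',
        f'RUNS = {runs}',
        f'SEED = {seed}',
        f'TOUR_FILE = {tour_path}',  # where LKH writes the best tour found
    ]
    with open(params_path, 'w') as f:
        f.write('\n'.join(lines) + '\n')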
def process_folders(path, folders, tfidf_dict=None):
    batch_size = 100

    def train_model(path, cmodel, get_data_and_query):
        print(f'training model {path}')
        reader = read_from_pickle(f'{path}/processed')
        for coll in iter_by_batch(reader, batch_size):
            unzipped = list(zip(*coll))
            data, _ = get_data_and_query(unzipped)
            online_clustering(data, cmodel)

    def predict_with_model(path, name, cmodel, get_data_and_query):
        print(f'predicting with {name}')
        create_dir(f'{path}/{name}')

        # Pass 1: label every batch and stream (label, query, vector)
        # triples to disk.
        with open(f'{path}/{name}/label_query_vector', 'wb') as label_query_vector:
            reader = read_from_pickle(f'{path}/processed')
            for coll in iter_by_batch(reader, batch_size):
                print(f'iterating ------ {path}')
                unzipped = list(zip(*coll))
                data, queries = get_data_and_query(unzipped)
                for i in zip(cmodel.predict(data), queries, data):
                    pickle.dump(i, label_query_vector)

        # Pass 2: split queries and vectors into one file per cluster label.
        create_dir(f'{path}/cluster_{name}')
        create_dir(f'{path}/cluster_{name}_dump')
        for label, query, vector in read_from_pickle(
                f'{path}/{name}/label_query_vector'):
            with open(f'{path}/cluster_{name}/{label}', 'a') as f:
                f.write(f'{query}\n')
            with open(f'{path}/cluster_{name}_dump/{label}', 'ab') as fh:
                pickle.dump(vector, fh)

        # Pass 3: score the clustering.
        coll = list(read_from_pickle(f'{path}/{name}/label_query_vector'))
        unzipped = list(zip(*coll))
        labels = unzipped[0]
        vectors = unzipped[2]
        with open(f'{path}/{name}/silhouette', 'w') as f:
            f.write(str(silhouette_score(vectors, labels)))

    for folder in folders:
        cmodels = [('w2v', Birch(n_clusters=300),
                    lambda uz: (np.array(uz[0]), uz[2])),
                   ('w2v_n', Birch(n_clusters=300),
                    lambda uz: (np.array(uz[1]), uz[3])),
                   ('tfidf', Birch(n_clusters=300),
                    lambda uz: (get_tfidf_rep(uz[2], tfidf_dict), uz[2])),
                   ('tfidf_n', Birch(n_clusters=300),
                    lambda uz: (get_tfidf_rep(uz[3], tfidf_dict), uz[3]))]
        labels = []
        for name, cmodel, get_data in cmodels:
            train_model(f'{path}{folder}', cmodel, get_data)
            predict_with_model(f'{path}{folder}', name, cmodel, get_data)
            labels.append([
                label for label, _, _ in read_from_pickle(
                    f'{path}{folder}/{name}/label_query_vector')
            ])

        # Compare every pair of labelings with the adjusted Rand index.
        names = [name for name, _, _ in cmodels]
        with open(f'{path}{folder}/cluster_similarity', 'w') as sim_file:
            for (i, a), (j, b) in itertools.combinations(enumerate(names), 2):
                score = adjusted_rand_score(labels[i], labels[j])
                sim_file.write(f'{a}/{b} {score}\n')
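# The clustering pipeline above leans on a few helpers that are not shown:
# create_dir, iter_by_batch, and a streaming read_from_pickle that yields
# every record pickle.dump-ed into a file (a different helper from the
# cache-style read_from_pickle sketched earlier). Plausible sketches under
# those assumptions:
import os
import pickle
import itertools

def create_dir(path):
    """Create path (and parents) if it does not already exist."""
    os.makedirs(path, exist_ok=True)

def read_from_pickle(file_path):
    """Yield every object that was pickle.dump-ed into file_path."""
    with open(file_path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

def iter_by_batch(iterable, batch_size):
    """Yield lists of up to batch_size consecutive items."""
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, batch_size))
        if not batch:
            return
        yield batch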
def execute_task6(request):
    query_image = request.POST.get('query_image')
    most_similar_images = int(request.POST.get('most_similar_images'))
    query_image_folder_name = request.POST.get('query_image_folder_name')
    relevance_feedback = request.POST.get('relevance_feedback')
    lsh = read_from_pickle('lsh_model')
    db_connection = DatabaseConnection()

    # Reuse cached HOG features if a pickle already exists.
    all_image_hog_features = read_from_pickle('all_img_features_LSH.pickle')
    if all_image_hog_features is None:
        all_image_hog_features = db_connection.get_object_feature_matrix_from_db(
            tablename='histogram_of_gradients')
        save_to_pickle(all_image_hog_features, 'all_img_features_LSH.pickle')

    # SVD on HOG features (cached for the same reason).
    svd_obj = read_from_pickle('svd_hog_lsh.pickle')
    if svd_obj is not None:
        transformed_data = svd_obj['data_matrix']
        vt = svd_obj['vt']
    else:
        svd = SingularValueDecomposition()
        transformed_data, vt = svd.get_transformed_data_copy(
            all_image_hog_features['data_matrix'], 400)
        save_to_pickle(
            {"data_matrix": transformed_data,
             "images": all_image_hog_features['images'],
             "vt": vt},
            'svd_hog_lsh.pickle')

    # Fetch the query vector (from the folder-specific table if one was
    # given) and project it into the same SVD space as the indexed data.
    if query_image_folder_name != '':
        table_name = convert_folder_path_to_table_name(
            query_image_folder_name, 'histogram_of_gradients')
        image_vector = db_connection.get_feature_data_for_image(table_name, query_image)
    else:
        image_vector = db_connection.get_feature_data_for_image(
            'histogram_of_gradients', query_image)
    image_vector = np.asarray(image_vector.flatten())
    image_vector = np.dot(image_vector.astype(float), np.transpose(vt))

    new_obj = {
        'data_matrix': transformed_data,
        'images': all_image_hog_features['images'],
    }
    (sorted_k_values, result_stats) = lsh.find_ksimilar_images(
        k=most_similar_images,
        image_vector=image_vector,
        all_image_hog_features=new_obj)

    # Retrieve a larger candidate set to serve as test data for the
    # relevance-feedback step.
    extra = 10 if relevance_feedback == "Probabilistic" else 200
    (test_dataset, result_stats) = lsh.find_ksimilar_images(
        k=extra + most_similar_images,
        image_vector=image_vector,
        all_image_hog_features=new_obj)
    save_to_pickle(test_dataset, 'test_dataset.pickle')

    print(sorted_k_values[:most_similar_images])
    return render(
        request, 'visualize_images.html', {
            'images': sorted_k_values[:most_similar_images],
            'from_task': 'task5',
            'rel_type': relevance_feedback,
            'q': query_image,
            't': most_similar_images,
            'num_total': result_stats['total'],
            'num_unique': result_stats['unique'],
        })
def process_feedback(request):
    rf = RelevanceFeedback()
    relevant = request.POST.get("relevant[]")
    irrelevant = request.POST.get("irrelevant[]")
    rel_type = json.loads(request.POST.get("rel_type"))
    m = int(request.POST.get("t"))
    q_name = json.loads(request.POST.get("q"))

    # obj_feature_matrix = rf.database_connection.get_object_feature_matrix_from_db('histogram_of_gradients')
    obj_similar_thousand_names = read_from_pickle('test_dataset.pickle')
    obj_similar_thousand_names = [x[0] for x in obj_similar_thousand_names]
    obj_feature_matrix = rf.database_connection.HOG_descriptor_from_image_ids(
        image_ids=obj_similar_thousand_names)
    data_matrix = obj_feature_matrix['data_matrix']

    new_rank_list = []
    relevant = json.loads(relevant)
    irrelevant = json.loads(irrelevant)
    q = rf.database_connection.get_feature_data_for_image(
        'histogram_of_gradients', q_name)
    # Vt = rf.get_Vt(obj_feature_matrix=obj_feature_matrix)

    if rel_type == 'Probabilistic':
        n_i = rf.calculate_n_i(D_matrix=data_matrix)
        new_rank_list = rf.calculate_feedback_prob_similarity(
            D_matrix=data_matrix,
            images=obj_feature_matrix['images'],
            relevant_items=relevant,
            n_i=n_i)
        new_rank_list = new_rank_list[:m]
    elif rel_type == 'Support Vector Machine':
        new_rank_list = rf.get_SVM_based_feedback(
            q=q, rel_items=relevant, irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix, m=m)
    elif rel_type == 'Decision Tree Classifier':
        new_rank_list = rf.get_DTC_based_feedback(
            q=q, rel_items=relevant, irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix, m=m)
    elif rel_type == 'Personalized Page Rank':
        new_rank_list = rf.get_PPR_based_feedback(
            q=q, rel_items=relevant, irl_items=irrelevant,
            obj_feature_matrix=obj_feature_matrix, m=m)
    else:
        new_rank_list.append((
            'Please select a relevance feedback type and start again from task 5',
            '0'))

    return render(
        request, 'visualize_images.html', {
            'images': new_rank_list,
            'from_task': 'task6',
            'rel_type': rel_type,
            'q': q_name,
            't': m,
        })
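# calculate_feedback_prob_similarity is not shown; it presumably implements
# the standard binary-independence (Robertson/Sparck Jones) relevance
# feedback score, where each binary feature i gets the weight
# log[p_i (1 - u_i) / (u_i (1 - p_i))], with p_i estimated from the
# marked-relevant set and u_i from the whole candidate set (n_i being the
# per-feature document counts). A sketch under that assumption, with the
# 0.5 smoothing terms as illustrative defaults:
import numpy as np

def prob_feedback_scores(D, relevant_rows, n_i):
    """Binary-independence relevance feedback.

    D             : (N, d) binarised object-feature matrix
    relevant_rows : indices of the images marked relevant
    n_i           : per-feature counts over all N objects (D.sum(axis=0))
    """
    N = D.shape[0]
    R = len(relevant_rows)
    r_i = D[relevant_rows].sum(axis=0)  # relevant images containing feature i
    # Smoothed estimates of P(feature | relevant) and P(feature | collection).
    p_i = (r_i + 0.5) / (R + 1.0)
    u_i = (n_i - r_i + 0.5) / (N - R + 1.0)
    weights = np.log(p_i * (1 - u_i) / (u_i * (1 - p_i)))
    return D @ weights  # one score per image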