import glob
import math
import os
from multiprocessing import Process
from pathlib import Path

import numpy as np

# Project-local imports. The module paths below are assumptions inferred from
# how the names are used in this file; adjust them to the actual package layout.
from database_connection import DatabaseConnection
from feature_extraction import SIFT, HistogramOfGradients, LocalBinaryPattern
from dimension_reduction import singular_value_decomposition
from dimension_reduction.singular_value_decomposition import SingularValueDecomposition
from page_rank import PageRank
import support_vector_machine
import decision_tree_learning
from utils import get_most_m_similar_images, read_from_pickle, get_image_names_from_tuples


class DataPreProcessor:
    """
    1) Read each image from the data directory and put its metadata in a database table.
    2) Pass each image to every feature model to get a vector of dimension (1 x m).
    3) Pass this vector to every dimensionality reduction technique to get latent semantics.
    4) Store the latent semantics of each image, from each model and technique,
       together with its metadata in the database.
    """

    def __init__(self):
        self.root_path = str(Path(os.getcwd()).parent / "Data")
        self.DATABASE_IMAGES_PATH = self.root_path + "/images"
        self.CLASSIFICATION_IMAGES_PATH = self.root_path + "/phase3_sample_data"
        self.database_connection = DatabaseConnection()
        self.process_metadata()
        self.process_classification_metadata()

        # feature_models = []
        # feature_models.append("histogram_of_gradients")
        # feature_models.append("sift")
        # # feature_models.append("histogram_of_gradients_30")
        #
        # processes = []
        # for i, feature in enumerate(feature_models):
        #     processes.append(Process(target=self.perform_feature_model, args=(feature,)))
        #     processes[i].start()
        #
        # for i in range(len(processes)):
        #     processes[i].join()

        # Classification-specific preprocessing.
        charas = ["Labelled", "Unlabelled"]
        # sets = ["Set1", "Set2", "Set3", "Set4"]
        sets = ["Set1", "Set2", "Set3"]
        number_of_clusters = ['250', '300']
        paths = []
        feature_models = []
        cluster_counts = []  # kept in step with feature_models (the original reused the leaked loop variable)
        for chara_ in charas:
            for set_ in sets:
                for cluster_count in number_of_clusters:
                    paths.append(self.CLASSIFICATION_IMAGES_PATH + "/" + chara_ + "/" + set_)
                    # feature_models.append("histogram_of_gradients" + "_" + chara_ + "_" + set_)
                    # feature_models.append("local_binary_pattern" + "_" + chara_ + "_" + set_)
                    feature_models.append("sift" + "_" + chara_ + "_" + set_ + "_" + cluster_count)
                    cluster_counts.append(cluster_count)

        # Pass the callable and its arguments to Process instead of calling it eagerly:
        # the original invoked the method inside the Process(...) constructor, which ran
        # every job serially and handed Process a None target.
        processes = []
        for i, feature in enumerate(feature_models):
            process = Process(target=self.perform_classification_feature_model,
                              args=(feature, paths[i], cluster_counts[i]))
            processes.append(process)
            process.start()

        for process in processes:
            process.join()

    # Read the metadata of every input image and store it in the database.
    def process_metadata(self):
        csv_file_path = self.root_path + "/HandInfo.csv"
        connection = self.database_connection.get_db_connection()
        cursor = connection.cursor()
        cursor.execute("DROP TABLE IF EXISTS metadata;")
        # Create the metadata table
        cursor.execute("""CREATE TABLE IF NOT EXISTS metadata(
                              id INT NOT NULL,
                              age INT NOT NULL,
                              gender TEXT NOT NULL,
                              skinColor TEXT NOT NULL,
                              accessories INT NOT NULL,
                              nailPolish INT NOT NULL,
                              aspectOfHand TEXT NOT NULL,
                              imageName TEXT NOT NULL,
                              irregularities INT NOT NULL,
                              PRIMARY KEY (imageName)
                          );""")
        # The file is opened client-side (instead of a server-side COPY) to avoid
        # permission errors on Linux.
        with open(csv_file_path, 'r') as f:
            next(f)  # skip the CSV header row
            cursor.copy_from(f, 'metadata', sep=',', null='')
        # cursor.execute("""copy metadata from '{}' csv header;""".format(csv_file_path))
        connection.commit()

    def process_classification_metadata(self):
        # metadata_files = ['labelled_set1.csv', 'labelled_set2.csv', 'unlabelled_set1.csv', 'unlabelled_set2.csv']
        data_folder = self.root_path + "/phase3_sample_data"
        extension = 'csv'
        # Glob with an absolute pattern rather than os.chdir(), so the working
        # directory of the whole process is not silently changed.
        metadata_files = [os.path.basename(x)
                          for x in glob.glob(os.path.join(data_folder, '*.{}'.format(extension)))]
        connection = self.database_connection.get_db_connection()
        cursor = connection.cursor()
        for metadata_file in metadata_files:
            table_name = "metadata_" + metadata_file.split('.')[0]
            metadata_file_path = data_folder + "/" + metadata_file
            cursor.execute("DROP TABLE IF EXISTS " + table_name + ";")
            cursor.execute("""CREATE TABLE IF NOT EXISTS """ + table_name + """(
                                  some_number INT,
                                  id INT NOT NULL,
                                  age INT NOT NULL,
                                  gender TEXT NOT NULL,
                                  skinColor TEXT NOT NULL,
                                  accessories INT NOT NULL,
                                  nailPolish INT NOT NULL,
                                  aspectOfHand TEXT,
                                  imageName TEXT NOT NULL,
                                  irregularities INT NOT NULL,
                                  PRIMARY KEY (imageName)
                              );""")
            # The file is opened client-side to avoid permission errors on Linux.
            with open(metadata_file_path, 'r') as f:
                next(f)  # skip the CSV header row
                cursor.copy_from(f, table_name, sep=',', null='')
            # cursor.execute("""copy """ + table_name + """ from '{}' csv header;""".format(metadata_file_path))
            connection.commit()

    def perform_feature_model(self, feature):
        if feature == 'sift':
            sift = SIFT(self.DATABASE_IMAGES_PATH)
            sift.read_and_clusterize(num_cluster=150)
            feature_vectors = sift.calculate_centroids_histogram()
        else:
            histogram_of_gradients = HistogramOfGradients(self.DATABASE_IMAGES_PATH)
            feature_vectors = histogram_of_gradients.get_image_vectors()
        self.database_connection.create_feature_model_table(feature)
        self.database_connection.insert_feature_data(feature, feature_vectors)

    def perform_classification_feature_model(self, feature, path, cluster_count):
        self.database_connection.create_feature_model_table(feature)
        if "histogram_of_gradients" in feature:
            histogram_of_gradients = HistogramOfGradients(path)
            feature_vectors = histogram_of_gradients.get_image_vectors()
        elif "local_binary_pattern" in feature:
            local_binary_pattern = LocalBinaryPattern(path)
            feature_vectors = local_binary_pattern.get_image_vectors()
        elif "sift" in feature:
            sift = SIFT(path)
            sift.read_and_clusterize(num_cluster=int(cluster_count))
            feature_vectors = sift.calculate_centroids_histogram()
        else:
            # Guard against an unknown feature name leaving feature_vectors unbound.
            raise ValueError("Unknown feature model: " + feature)
        self.database_connection.insert_feature_data(feature, feature_vectors)
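
# Usage sketch (not part of the original module): constructing DataPreProcessor
# runs the whole pipeline, so a driver only needs to instantiate it. This assumes
# the Data/ directory layout described in the class docstring and a PostgreSQL
# instance reachable through DatabaseConnection.
def run_preprocessing():
    """Hypothetical driver that builds every classification feature table."""
    DataPreProcessor()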
class RelevanceFeedback:
    def __init__(self):
        self.database_connection = DatabaseConnection()
        self.conn = self.database_connection.get_db_connection()
        print('Initiating RelevanceFeedback....')

    def compute_new_query_vector(self, q_old, relevant_items, irrel_items, alpha=0.3, beta=0.65, gamma=0.05):
        # Rocchio update: q_new = alpha * q_old + beta * avg(relevant) - gamma * avg(irrelevant)
        print('Computing new query vector.....')
        avg_rel_vec = np.zeros(q_old.shape)
        avg_irl_vec = np.zeros(q_old.shape)
        # Aggregate relevant items
        for item in relevant_items:
            vector = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            avg_rel_vec = avg_rel_vec + vector
        # Aggregate irrelevant items
        for item in irrel_items:
            vector = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            avg_irl_vec = avg_irl_vec + vector
        if len(relevant_items) != 0:
            avg_rel_vec = avg_rel_vec / len(relevant_items)
        if len(irrel_items) != 0:
            avg_irl_vec = avg_irl_vec / len(irrel_items)
        q_new = alpha * q_old + beta * avg_rel_vec - gamma * avg_irl_vec
        return q_new

    def get_user_feedback(self, init_rank_list, q_name, caller='misc'):
        print('Taking user feedback now...')
        rel_items = []
        irl_items = []
        # The probabilistic caller ('prb') wraps its rank list in an extra list;
        # unwrap it so both callers share one loop (the original duplicated it).
        rank_list = init_rank_list[0] if caller == 'prb' else init_rank_list
        for item in rank_list:
            if item[0] == q_name:
                continue
            print(f'Is image {item[0]} relevant ? (y/n)')
            # '==' instead of 'is': identity comparison against a string literal
            # is implementation-dependent and usually False for input() results.
            if input() == 'y':
                rel_items.append(item[0])
            else:
                irl_items.append(item[0])
        return rel_items, irl_items

    def get_SVM_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items, irl_items=irl_items)

        # Train the SVM classifier on the labelled feedback
        svm = support_vector_machine.SupportVectorMachine()
        svm.fit(X=X_train, y=Y_train)

        # Get more candidate data from the LSH indexes
        test_dataset = read_from_pickle('test_dataset.pickle')
        X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
        Y_pred = svm.predict(u=X_test)

        relevant_pred_img_names = [imageNames[i] for i in range(0, len(Y_pred)) if Y_pred[i] == 1]
        length_relevant_images = len(relevant_pred_img_names)
        if length_relevant_images < m:
            # Pad with images predicted irrelevant so that m results can still be returned
            irr_image_names = [imageNames[i] for i in range(0, m - length_relevant_images) if Y_pred[i] == -1]
            relevant_pred_img_names.extend(irr_image_names)
        new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
            image_ids=relevant_pred_img_names)
        new_rank_list = get_most_m_similar_images(data_with_images=new_obj_feature_matrix,
                                                  query_image_feature_vector=q_new, m=m)
        return new_rank_list

    def get_DTC_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        X_train, Y_train = self.create_X_Y_as_np_matrix(rel_items=rel_items, irl_items=irl_items)

        # Train the decision-tree classifier on the labelled feedback
        dtl = decision_tree_learning.DecisionTreeLearning()
        dtl.fit(X=X_train, y=Y_train)

        # Get more candidate data from the LSH indexes
        test_dataset = read_from_pickle('test_dataset.pickle')
        X_test, imageNames = self.create_X_test_as_np_matrix(test_dataset=test_dataset)
        Y_pred = dtl.predict(u=X_test)

        relevant_pred_img_names = [imageNames[i] for i in range(0, len(Y_pred)) if Y_pred[i] == 1]
        length_relevant_images = len(relevant_pred_img_names)
        if length_relevant_images < m:
            # Pad with images predicted irrelevant so that m results can still be returned
            irr_image_names = [imageNames[i] for i in range(0, m - length_relevant_images) if Y_pred[i] == -1]
            relevant_pred_img_names.extend(irr_image_names)
        new_obj_feature_matrix = self.database_connection.HOG_descriptor_from_image_ids(
            image_ids=relevant_pred_img_names)
        new_rank_list = get_most_m_similar_images(data_with_images=new_obj_feature_matrix,
                                                  query_image_feature_vector=q_new, m=m)
        return new_rank_list

    def get_PPR_based_feedback(self, q, rel_items, irl_items, obj_feature_matrix, m):
        q_new = self.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        topology_images = read_from_pickle('test_dataset.pickle')
        image_names = get_image_names_from_tuples(topology_images)
        db_conn = DatabaseConnection()
        data_image_dict = db_conn.HOG_descriptor_from_image_ids(image_names)
        data_matrix = data_image_dict['data_matrix']
        image_names = data_image_dict['images']
        svd_obj = SingularValueDecomposition()
        svd_image_data = svd_obj.get_transformed_data(data_matrix, 8)  # change this for 11K images
        pg_obj = PageRank()
        image_similarity_matrix = pg_obj.get_image_similarity_matrix_for_top_k_images(6, svd_image_data)
        seed_vector = pg_obj.get_seed_vector(rel_items, image_names, irl_items)
        pie = pg_obj.get_page_rank_eigen_vector(image_similarity_matrix, seed_vector)
        new_rank_list = pg_obj.get_top_K_images_based_on_scores(pie, image_names, m)
        return new_rank_list

    def get_init_ranking(self, obj_feature_matrix, q):
        # Used by SVM, DTC and PPR; the probabilistic variant starts from
        # calculate_initial_prob_similarity instead.
        svd = singular_value_decomposition.SingularValueDecomposition()
        data_matrix = obj_feature_matrix['data_matrix']
        U, S, Vt = svd.get_latent_semantics(data_matrix=data_matrix, n_components=25)
        init_rank_list = get_most_m_similar_images(data_with_images=obj_feature_matrix,
                                                   query_image_feature_vector=q, Vt=Vt, m=5)
        return init_rank_list, Vt
        # Typical follow-up, kept from the original as a reference:
        # rel_items, irl_items = rf.get_user_feedback(init_rank_list=init_rank_list, q_name=q_name)
        # q_new = rf.compute_new_query_vector(q_old=q, relevant_items=rel_items, irrel_items=irl_items)
        # new_rank_list = get_most_m_similar_images(data_with_images=obj_feature_matrix, query_image_feature_vector=q_new, Vt=Vt, m=5)

    def get_Vt(self, obj_feature_matrix):
        # Used by SVM, DTC and PPR; the probabilistic variant starts from
        # calculate_initial_prob_similarity instead.
        svd = singular_value_decomposition.SingularValueDecomposition()
        data_matrix = obj_feature_matrix['data_matrix']
        U, S, Vt = svd.get_latent_semantics(data_matrix=data_matrix, n_components=25)
        return Vt

    def get_probabilistic_relevance_feedback(self, D_matrix, images, q_name, m):
        n_i = self.calculate_n_i(D_matrix=D_matrix)
        init_scores = self.calculate_initial_prob_similarity(D_matrix=D_matrix, images=images, n_i=n_i)
        rel_items, irl_items = self.get_user_feedback(init_rank_list=[init_scores[:m]], q_name=q_name, caller='prb')
        new_rank_list = self.calculate_feedback_prob_similarity(D_matrix=D_matrix, images=images,
                                                                relevant_items=rel_items, n_i=n_i)
        return new_rank_list[:m]

    def calculate_feedback_prob_similarity(self, D_matrix, images, relevant_items, n_i):
        # Robertson/Sparck Jones weighting: with N images, R of them relevant,
        # n_i images having feature i and r_i relevant images having feature i,
        # feature i contributes
        #   log2( ((r_i + 0.5) / (R - r_i + 1)) / ((n_i - r_i + 0.5) / (N - R - n_i + r_i + 1)) )
        N = D_matrix.shape[0]
        R = len(relevant_items)
        n_i = n_i[0]
        r_i = self.calculate_r_i(D_matrix=D_matrix, images=images, relevant_items=relevant_items)
        r_i = r_i[0]
        feedback_scores = {}
        j = 0
        for d in D_matrix:
            sim_score = 0
            for i in range(0, len(n_i)):
                numerator = (r_i[i] + 0.5) / (R + 1 - r_i[i])
                denominator = (n_i[i] - r_i[i] + 0.5) / (N - R + 1 - n_i[i] + r_i[i])
                sim_score = sim_score + d[i] * math.log2(numerator / denominator)
            feedback_scores[images[j]] = sim_score
            j += 1
        feedback_scores = sorted(feedback_scores.items(), key=lambda k: k[1], reverse=True)
        return feedback_scores

    def calculate_initial_prob_similarity(self, D_matrix, images, n_i):
        # With no relevance information yet, the weight reduces to an IDF-like
        # term: log2((N - n_i + 0.5) / (n_i + 0.5)).
        N = D_matrix.shape[0]
        n_i = n_i[0]
        init_scores = {}
        j = 0
        for d in D_matrix:
            sim_score = 0
            for i in range(0, len(n_i)):
                sim_score = sim_score + d[i] * math.log2((N - n_i[i] + 0.5) / (n_i[i] + 0.5))
            init_scores[images[j]] = sim_score
            j += 1
        init_scores = sorted(init_scores.items(), key=lambda k: k[1], reverse=True)
        return init_scores

    def calculate_r_i(self, D_matrix, images, relevant_items):
        # r_i: per-feature count of relevant images with a non-zero entry.
        r_i = np.zeros((1, D_matrix.shape[1]))
        i = 0
        for row in D_matrix:
            temp = [1 if row[x] > 0 and images[i] in relevant_items else 0 for x in range(0, len(row))]
            r_i = r_i + np.array(temp)
            i += 1
        return r_i

    def calculate_n_i(self, D_matrix):
        # n_i: per-feature count of images with a non-zero entry.
        n_i = np.zeros((1, D_matrix.shape[1]))
        for row in D_matrix:
            temp = [1 if row[x] > 0 else 0 for x in range(0, len(row))]
            n_i = n_i + np.array(temp)
        return n_i

    def create_X_Y_as_np_matrix(self, rel_items, irl_items):
        X = []
        Y = []
        # Relevant items are labelled +1
        for item in rel_items:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            X.append(fv.reshape(fv.shape[1]))  # flatten the (1, m) vector returned by the DB layer
            Y.append(1)
        # Irrelevant items are labelled -1
        for item in irl_items:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item)
            X.append(fv.reshape(fv.shape[1]))
            Y.append(-1)
        return np.array(X), np.array(Y)

    def create_X_test_as_np_matrix(self, test_dataset):
        X = []
        imageNames = []
        # Collect the feature vector of every candidate image
        for item in test_dataset:
            fv = self.database_connection.get_feature_data_for_image('histogram_of_gradients', item[0])
            X.append(fv.reshape(fv.shape[1]))
            imageNames.append(item[0])
        return np.array(X), imageNames
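
# Usage sketch (a minimal illustration, not part of the original module). The
# query image name is hypothetical, and the candidate pool is simply the pickled
# test set that the feedback methods above already rely on.
if __name__ == "__main__":
    rf = RelevanceFeedback()
    db = DatabaseConnection()

    q_name = "Hand_0000002.jpg"  # hypothetical query image
    q = db.get_feature_data_for_image('histogram_of_gradients', q_name)

    # Candidate pool: the HOG descriptors of the pickled test set.
    test_dataset = read_from_pickle('test_dataset.pickle')
    image_names = get_image_names_from_tuples(test_dataset)
    obj_feature_matrix = db.HOG_descriptor_from_image_ids(image_names)

    # One round of SVM-based feedback: rank, ask the user, re-rank.
    init_rank_list, Vt = rf.get_init_ranking(obj_feature_matrix=obj_feature_matrix, q=q)
    rel_items, irl_items = rf.get_user_feedback(init_rank_list=init_rank_list, q_name=q_name)
    new_rank_list = rf.get_SVM_based_feedback(q=q, rel_items=rel_items, irl_items=irl_items,
                                              obj_feature_matrix=obj_feature_matrix, m=5)
    print(new_rank_list)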