def generate_img_img_adj_matrix(self):
    """Load the task-6 image features and min-max normalise them in place.

    Side effects:
        self.img_feature_matrix -- mapping of image id to its feature vector
            as returned by ``DataExtractor.prepare_dataset_for_task6``, with
            every vector replaced by its ``MinMaxScaler``-scaled row.
        self.img_ids -- list of the image ids, in the mapping's key order.

    NOTE(review): the method name suggests an image-image adjacency matrix
    is built and pickled; no such step is present in this body -- confirm
    whether that happens elsewhere.
    """
    print("getting and normalizing data...")
    data_extractor = DataExtractor()
    loc_mapping = data_extractor.location_mapping()
    self.img_feature_matrix = data_extractor.prepare_dataset_for_task6(
        loc_mapping)

    self.img_ids = list(self.img_feature_matrix.keys())
    # Fit and scale the whole matrix in one call instead of one
    # scaler.transform() call per image; rows come back in key order.
    scaled_rows = MinMaxScaler().fit_transform(
        list(self.img_feature_matrix.values()))
    for img, scaled_feature in zip(self.img_ids, scaled_rows):
        self.img_feature_matrix[img] = scaled_feature
class PreProcessor:
    """Builds one min-max normalised feature vector per image by
    concatenating the visual descriptor models of every location."""

    def __init__(self):
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()
        self.location_names = list(self.mapping.values())
        self.reference_model = 'CM3x3'
        self.model_list = self.init_model_list()
        self.reference_df = pd.DataFrame()
        self.df_list = self.init_df_list()
        self.data_dict = dict()
        self.minmax_scaler = MinMaxScaler()

    def init_model_list(self):
        """
        Method Explanation:
            . Returns every model name other than the reference model
              (CM3x3) used for preprocessing.
        """
        return ['CN', 'CSD', 'LBP3x3', 'HOG', 'GLRLM']

    def init_df_list(self):
        """
        Method Explanation:
            . Returns one empty dataframe per entry of self.model_list.
        """
        return [pd.DataFrame() for _ in self.model_list]

    def compute_first_index_lesser_than_one(self, S):
        """
        Method Explanation:
            . Returns (index + 1) of the first value in S that is lesser
              than 1.
            . When no value is lesser than 1 every value is kept, so len(S)
              is returned (previously this fell through and returned None).
        Input(s):
            S -- vector of singular values from SVD (S in U, S, Vt).
        Output(s):
            The number of leading values of S to keep.
        """
        for index, eigen_value in enumerate(S):
            if eigen_value < 1:
                return index + 1
        return len(S)

    def compute_latent_semantics(self, feature_matrix):
        """
        Method Explanation:
            . Runs SVD on feature_matrix and returns U[:, :k] . S[:k, :k],
              where k comes from compute_first_index_lesser_than_one(S).
        Input(s):
            feature_matrix -- 2-D array-like accepted by numpy.linalg.svd.
        Output:
            numpy array with one row per input row and k columns.
        """
        print('Finding latent semantics of the data...')
        U, S, Vt = np.linalg.svd(feature_matrix)
        print('Removing eigen vectors with eigen value less than 1...')
        k = self.compute_first_index_lesser_than_one(S)
        S = np.diag(S)
        print('Preprocessing done...')
        return np.dot(U[:, :k], S[:k, :k])

    def _load_normalized_model_df(self, model, existing_df):
        """Append every location's CSV for `model` to `existing_df`, drop
        duplicate image ids (column 0, first kept) and min-max scale the
        remaining columns. Returns the resulting dataframe."""
        frames = [] if existing_df.empty else [existing_df]
        for location in self.location_names:
            frames.append(
                pd.read_csv("../dataset/visual_descriptors/" + location +
                            " " + model + ".csv",
                            header=None))
        # One concat instead of repeated DataFrame.append (removed in
        # pandas 2.0 and quadratic when used in a loop).
        combined = pd.concat(frames,
                             ignore_index=True) if frames else existing_df
        # .copy() so the column assignment below never writes into a view.
        combined = combined.drop_duplicates(subset=[0], keep='first').copy()
        # Column 0 is the image id; every other column is normalised.
        columns_to_normalize = np.arange(1, combined.shape[1], 1)
        combined[columns_to_normalize] = self.minmax_scaler.fit_transform(
            combined[columns_to_normalize])
        return combined

    def preprocess_MinMaxScaler(self):
        """
        Method Explanation:
            . Loads the reference model for every location, normalises it
              and seeds self.data_dict with image id -> feature list.
            . For every other model, loads and normalises it the same way
              and extends the feature list of each image id that is already
              present in self.data_dict.
            . Min-max scales the concatenated matrix once more and stores
              each row back under its image id.
        Output:
            self.data_dict -- image id -> scaled feature row.
        """
        print('\nPreprocessing the data...')
        self.data_dict.clear()

        self.reference_df = self._load_normalized_model_df(
            self.reference_model, self.reference_df)
        self.data_dict = self.reference_df.set_index(0).T.to_dict('list')

        for index, model in enumerate(self.model_list):
            self.df_list[index] = self._load_normalized_model_df(
                model, self.df_list[index])
            model_features = self.df_list[index].set_index(0).T.to_dict(
                'list')
            for key, features in model_features.items():
                # Only image ids known from the reference model are kept.
                if key in self.data_dict:
                    self.data_dict[key].extend(features)

        # Apply MinMax scaling to the feature concatenated data matrix too.
        the_feature_matrix = self.minmax_scaler.fit_transform(
            np.asarray(list(self.data_dict.values())))

        # Rows of the matrix follow the dict's key order.
        for key, row in zip(list(self.data_dict), the_feature_matrix):
            self.data_dict[key] = row

        return self.data_dict
class Task2(object):
    """
    This module is responsible for finding similarity between entities based on
    the latent semantics computed by PCA/LDA/SVD on original vector space
    (user/location/image)
    """

    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()

    def calculate_similarity(self, input_vector, k_semantics_map, entity_type):
        """
        Method: Returns a list of (entity_id, score) tuples holding the cosine
        similarity (via Util.cosine_similarity) between input_vector and every
        vector in k_semantics_map. Image ids are converted to int.
        """
        similarity_data = []
        for key, value in k_semantics_map.items():
            result = self.ut.cosine_similarity(input_vector, value)
            if entity_type == constants.IMAGE_TEXT:
                key = int(key)
            similarity_data.append((key, result))
        return similarity_data

    def get_k_semantics_map(self, entity_data, k_semantics):
        """
        Method: Returns a dict linking each entity id of
        entity_data.data.master_dict (in key order) with the matching row of
        k_semantics.
        entity_data : object exposing data.master_dict
        k_semantics : reduced feature space, one row per entity
        """
        entity_ids = list(entity_data.data.master_dict.keys())
        return dict(zip(entity_ids, k_semantics))

    def top_5(self, similarity_data):
        """
        Method: Prints the 5 entries of similarity_data with the highest score
        similarity_data: list of (entity_id, score) tuples
        """
        print(sorted(similarity_data, key=lambda x: x[1], reverse=True)[:5])

    def dim_reduce_SVD(self, document_term_matrix, k, pca=False):
        """
        Method: Returns (U, S, Vt) where Vt holds the k components found by
        TruncatedSVD (or by PCA on the standardised data when pca is True),
        S is the diagonal matrix of the k singular values and
        U = document_term_matrix @ Vt.T.
        document_term_matrix : Input data matrix
        k : number of hidden concepts (converted with int())
        """
        document_term_matrix = self.ut.convert_list_to_numpyarray(
            document_term_matrix)
        if pca:
            input_std = StandardScaler().fit_transform(document_term_matrix)
            model = PCA(n_components=int(k))
            model.fit(input_std)
        else:
            model = TruncatedSVD(n_components=int(k))
            model.fit(document_term_matrix)
        Vt = model.components_
        # singular_values_ is a 1-D array of k values; build the diagonal.
        S = np.diag(model.singular_values_)
        # NOTE(review): for PCA the components are fitted on standardised
        # data but the projection below uses the raw matrix -- kept as in
        # the original, confirm this is intended.
        U = document_term_matrix @ Vt.T
        return U, S, Vt

    def dim_reduce_LDA(self, document_term_matrix, k):
        """
        Method: Fits LDA with k topics and returns
        (document_topic @ topic_topic, topic_topic, term_topic) where
        term_topic is lda.components_ and
        topic_topic = term_topic @ term_topic.T.
        document_term_matrix : Input data matrix
        k : number of hidden concepts (converted with int())
        """
        lda = LatentDirichletAllocation(n_components=int(k),
                                        max_iter=5,
                                        learning_method='online',
                                        random_state=0)
        document_topic_matrix = lda.fit_transform(document_term_matrix)
        term_topic_matrix = lda.components_
        topic_topic_matrix = term_topic_matrix @ term_topic_matrix.T
        transformed_document_topic_matrix = (document_topic_matrix
                                             @ topic_topic_matrix)
        return (transformed_document_topic_matrix, topic_topic_matrix,
                term_topic_matrix)

    def get_projected_query_vector(self, input_vector, v_matrix, sigma_matrix):
        """
        Method: Returns input_vector.T @ v_matrix.T @ inverse(sigma_matrix)
        input_vector : query vector from the original data
        v_matrix : concepts x features matrix
        sigma_matrix : square core matrix; numpy raises LinAlgError when it
                       is singular
        """
        return input_vector.T @ v_matrix.T @ np.linalg.inv(sigma_matrix)

    def get_document_term_matrix(self, entity_data):
        """
        Method : Builds the master matrix of entity_data from its global tag
        set, using entity_data's own helpers.
        entity_data : entity data object
        """
        global_tag_set = entity_data.get_global_tag_set()
        global_tag_dict = entity_data.convert_dict_from_set(global_tag_set)
        return entity_data.create_master_matrix(global_tag_dict)

    def get_similar_entities(self,
                             user_term_matrix,
                             image_term_matrix,
                             location_term_matrix,
                             user_S_matrix,
                             user_vt_matrix,
                             image_S_matrix,
                             image_vt_matrix,
                             location_S_matrix,
                             location_vt_matrix,
                             user_id=None,
                             image_id=None,
                             location_id=None):
        """
        Method: Prints the top 5 similar users, images and locations for the
        given entity. Within the entity's own space its stored latent vector
        is compared directly; for the other two spaces the entity's original
        term vector is projected with get_projected_query_vector first.
        Exactly the first truthy id among user_id, image_id, location_id is
        used. Reads self.*_semantics_map and self.*_index set by runner().
        *_term_matrix : document term matrices
        *_S_matrix : core matrices of the decompositions
        *_vt_matrix : concepts x features matrices of the decompositions
        Raises ValueError when no id is given.
        """
        if user_id:
            query_type = constants.USER_TEXT
            latent_query = self.user_semantics_map[user_id]
            original_query = self.ut.convert_list_to_numpyarray(
                user_term_matrix[self.user_index])
        elif image_id:
            query_type = constants.IMAGE_TEXT
            latent_query = self.image_semantics_map[image_id]
            original_query = self.ut.convert_list_to_numpyarray(
                image_term_matrix[self.image_index])
        elif location_id:
            query_type = constants.LOCATION_TEXT
            latent_query = self.location_semantics_map[
                self.mapping[location_id]]
            original_query = self.ut.convert_list_to_numpyarray(
                location_term_matrix[self.location_index])
        else:
            raise ValueError(
                "One of user_id, image_id or location_id must be provided")

        spaces = [
            (constants.USER_TEXT, self.user_semantics_map, user_vt_matrix,
             user_S_matrix),
            (constants.IMAGE_TEXT, self.image_semantics_map, image_vt_matrix,
             image_S_matrix),
            (constants.LOCATION_TEXT, self.location_semantics_map,
             location_vt_matrix, location_S_matrix),
        ]
        rankings = []
        for entity_type, semantics_map, vt_matrix, s_matrix in spaces:
            if entity_type == query_type:
                # Same space: the stored latent vector needs no projection.
                query = latent_query
            else:
                query = self.get_projected_query_vector(
                    original_query, vt_matrix, s_matrix)
            rankings.append(
                self.calculate_similarity(query, semantics_map, entity_type))
        similar_users, similar_images, similar_locations = rankings

        print("Top 5 related users are")
        self.top_5(similar_users)
        print("Top 5 related images are")
        self.top_5(similar_images)
        print("Top 5 related locations are")
        self.top_5(similar_locations)

    def get_all_latent_semantics_map(self, user_data, image_data,
                                     location_data, user_u_matrix,
                                     image_u_matrix, location_u_matrix):
        """
        Method: Returns the (user, image, location) semantics maps built with
        get_k_semantics_map.
        """
        user_semantics_map = self.get_k_semantics_map(user_data, user_u_matrix)
        image_semantics_map = self.get_k_semantics_map(image_data,
                                                       image_u_matrix)
        location_semantics_map = self.get_k_semantics_map(
            location_data, location_u_matrix)
        return user_semantics_map, image_semantics_map, location_semantics_map

    def _load_entity(self, entity_type):
        """Load the Task1 data of entity_type; return it with its transposed
        document term matrix."""
        entity_data = Task1()
        entity_data.load_data_per_entity(entity_type)
        term_matrix = self.ut.convert_list_to_numpyarray(
            self.get_document_term_matrix(entity_data)).T
        return entity_data, term_matrix

    @staticmethod
    def _position_of(entity_data, entity_id, error_message):
        """Return the position of entity_id among the master_dict keys, or
        raise ValueError(error_message) when it is absent."""
        try:
            return list(entity_data.data.master_dict.keys()).index(entity_id)
        except ValueError as err:
            raise ValueError(error_message) from err

    def runner(self):
        """
        Method: Prompts for k, the algorithm (SVD, PCA or LDA), the entity
        type and the entity id, decomposes the user, image and location term
        matrices and prints the top 5 similar users, images and locations.
        Raises ValueError for an unknown entity choice or entity id; an
        unknown location id raises KeyError from the location mapping.
        An unsupported algorithm name prints a message and does nothing.
        """
        k = input("Enter the value of k :")
        algo_choice = input("Enter the Algorithm: ")
        entity_index = int(
            input("Choose the entity id \t1) User \t2)Image \t3)Location.: "))

        user_id, image_id, location_id = None, None, None
        if entity_index == 1:
            self.entity_type = constants.USER_TEXT
            user_id = input("Enter the user id: ")
        elif entity_index == 2:
            self.entity_type = constants.IMAGE_TEXT
            image_id = input("Enter the image id: ")
        elif entity_index == 3:
            self.entity_type = constants.LOCATION_TEXT
            location_id = input("Enter the location id: ")
        else:
            # Previously this surfaced later as an AttributeError or reused
            # a stale entity_type from an earlier run.
            raise ValueError("Entity choice must be 1, 2 or 3")

        # Document term matrices for users, images and locations from task1.
        user_data, user_term_matrix = self._load_entity(constants.USER_TEXT)
        image_data, image_term_matrix = self._load_entity(
            constants.IMAGE_TEXT)
        location_data, location_term_matrix = self._load_entity(
            constants.LOCATION_TEXT)

        if self.entity_type == constants.USER_TEXT:
            self.user_index = self._position_of(
                user_data, user_id, constants.USER_ID_KEY_ERROR)
        elif self.entity_type == constants.IMAGE_TEXT:
            self.image_index = self._position_of(
                image_data, image_id, constants.IMAGE_ID_KEY_ERROR)
        elif self.entity_type == constants.LOCATION_TEXT:
            input_location = self.mapping[location_id]
            self.location_index = self._position_of(
                location_data, input_location,
                constants.LOCATION_ID_KEY_ERROR)

        if algo_choice == 'SVD' or algo_choice == 'PCA':
            use_pca = algo_choice == 'PCA'

            def decompose(term_matrix):
                return self.dim_reduce_SVD(term_matrix, k, use_pca)
        elif algo_choice == 'LDA':

            def decompose(term_matrix):
                return self.dim_reduce_LDA(term_matrix, k)
        else:
            print("Unsupported algorithm: " + str(algo_choice))
            return

        user_u_matrix, user_S_matrix, user_vt_matrix = decompose(
            user_term_matrix)
        image_u_matrix, image_S_matrix, image_vt_matrix = decompose(
            image_term_matrix)
        location_u_matrix, location_S_matrix, location_vt_matrix = decompose(
            location_term_matrix)

        (self.user_semantics_map, self.image_semantics_map,
         self.location_semantics_map) = self.get_all_latent_semantics_map(
             user_data, image_data, location_data, user_u_matrix,
             image_u_matrix, location_u_matrix)

        self.get_similar_entities(user_term_matrix, image_term_matrix,
                                  location_term_matrix, user_S_matrix,
                                  user_vt_matrix, image_S_matrix,
                                  image_vt_matrix, location_S_matrix,
                                  location_vt_matrix, user_id, image_id,
                                  location_id)
class Task6:
    """Builds a location-location similarity matrix from the textual
    descriptors and prints its top-k latent semantics."""

    # Position within each 4-token group -> (score name, converter).
    # Position 0 is the quoted term itself.
    _SCORE_FIELDS = {1: ("TF", int), 2: ("DF", int), 3: ("TFIDF", float)}

    def __init__(self):
        """
        Method Explanation:
            Intializes all the variables for the analysis task.
        """
        self.util = Util()
        self.data_extractor = DataExtractor()
        self.location_id_to_title_map = self.data_extractor.location_mapping()
        self.location_title_to_id_map = (
            self.data_extractor.location_title_to_id_mapping())
        # List of location ids
        self.location_list = list(self.location_title_to_id_map.values())
        self.LOCATION_COUNT = len(self.location_list)  # constant
        # Count of unique terms; also the index given to the next new term
        self.global_term_dictionary_current_index = 0
        # term -> index
        self.global_term_dictionary = dict()
        # index -> term
        self.global_term_index_dictionary = dict()
        # location id -> {term -> {"TF", "DF", "TFIDF"}}
        self.location_dictionary = dict()
        # location-location similarity
        self.similarity_matrix = numpy.zeros(
            (self.LOCATION_COUNT, self.LOCATION_COUNT))

    def _add_location_terms(self, location_id, words):
        """Register the (term, TF, DF, TFIDF) token groups in `words` for
        `location_id` and add unseen terms to the global vocabulary."""
        current_word = None
        for index, word in enumerate(words):
            position = index % 4
            if position == 0:  # the term
                current_word = word.strip('\"')
                # Membership test, not truthiness: the first term has index
                # 0 and must not be registered again on every occurrence.
                if current_word not in self.global_term_dictionary:
                    new_index = self.global_term_dictionary_current_index
                    self.global_term_dictionary[current_word] = new_index
                    self.global_term_index_dictionary[new_index] = current_word
                    self.global_term_dictionary_current_index += 1
                location_terms = self.location_dictionary.setdefault(
                    location_id, {})
                if not location_terms.get(current_word):
                    location_terms[current_word] = {
                        "TF": 0,
                        "DF": 0,
                        "TFIDF": 0
                    }
            else:
                score_name, convert = self._SCORE_FIELDS[position]
                self.location_dictionary[location_id][current_word][
                    score_name] = convert(word)

    def construct_vocabulary(self):
        """
        Method Explanation:
            . Reads devset_textTermsPerPOI.txt line by line.
            . The tokens before the first token containing a double quote
              form the location title (joined with "_"); the rest are
              groups of term, TF, DF, TFIDF.
            . Fills the global term vocabulary and the per-location term
              dictionary. Blank lines are skipped.
        """
        with open(constants.TEXT_DESCRIPTORS_DIR_PATH +
                  "devset_textTermsPerPOI.txt",
                  encoding="utf-8") as f:
            for line in f:
                words = line.split()
                if not words:
                    continue
                title_parts = []
                # `words and` guards lines that carry a title but no terms.
                while words and "\"" not in words[0]:
                    title_parts.append(words.pop(0))
                location_title = ("_").join(title_parts)
                location_id = self.location_title_to_id_map[location_title]
                self._add_location_terms(location_id, words)

    def _build_model_vector(self, location_id, model):
        """Return the vocabulary-sized list of `model` scores for the terms
        of `location_id` (0 for terms the location does not have)."""
        vector = [0] * self.global_term_dictionary_current_index
        for term, scores in self.location_dictionary[location_id].items():
            if term == model:
                # This key holds a cached vector (see
                # construct_similarity_matrix), not a term.
                continue
            vector[self.global_term_dictionary[term]] = scores[model]
        return vector

    def construct_similarity_matrix(self, model):
        """
        Method Explanation:
            . Fills self.similarity_matrix[query - 1][target - 1] for every
              query id in self.location_list and every target id in
              self.location_dictionary.
            . A location compared with itself gets 1; otherwise the value is
              Util.cosine_similarity of the two locations' model vectors.
            . As before, a target's vector is also stored under
              self.location_dictionary[target][model].
        Inputs:
            <model> - Has three possible values -- TF, DF, TFIDF.
        """
        # Build each location's vector once instead of once per pair.
        model_vectors = {
            location_id: self._build_model_vector(location_id, model)
            for location_id in self.location_dictionary
        }

        for query_location_id in self.location_list:
            query_model_vector = model_vectors[query_location_id]
            for target_location_id in self.location_dictionary:
                if target_location_id == query_location_id:
                    self.similarity_matrix[query_location_id -
                                           1][target_location_id - 1] = 1
                    continue
                target_model_vector = model_vectors[target_location_id]
                self.location_dictionary[target_location_id][
                    model] = target_model_vector
                self.similarity_matrix[query_location_id - 1][
                    target_location_id - 1] = self.util.cosine_similarity(
                        query_model_vector, target_model_vector)

    def print_k_latent_semantics(self, k):
        """
        Method Explanation:
            . Applies SVD on the similarity matrix, projects the matrix on
              the first k columns of U and prints, for each of the k latent
              semantics, the location name-weight pairs sorted in
              decreasing order of weight.
        Input:
            . <k> number of latent semantics to print
        """
        U, _, _ = numpy.linalg.svd(self.similarity_matrix)
        # One row per latent semantic, one column per location.
        concept_mapping = self.similarity_matrix.dot(U[:, :k]).transpose()

        print("")
        for semantic_number, weights in enumerate(concept_mapping, start=1):
            pairs = [{
                "Location Name": self.location_id_to_title_map[str(index + 1)],
                "Weight": weight
            } for index, weight in enumerate(weights)]
            pairs.sort(key=itemgetter("Weight"), reverse=True)

            print("Latent Semantic: ", semantic_number)
            for pair in pairs:
                print("\tLocation Name: ", pair["Location Name"],
                      " | Weight: ", pair["Weight"])
            print("")

    def runner(self):
        """Prompt for k, build the TFIDF similarity matrix and print the k
        latent semantics."""
        k = input("Enter the k value: ")
        k = int(k)
        the_model = "TFIDF"
        self.construct_vocabulary()
        self.construct_similarity_matrix(the_model)
        self.print_k_latent_semantics(k)
class Task3(object):
    """Image-image and image-location similarity on latent semantics."""

    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()

    def calculate_similarity(self, k_semantics, image_position,
                             array_of_all_images, array_location_vector):
        """
        Method: Prints the 5 images and the 5 locations most similar to the
        input image.
        k_semantics : latent vectors, one row per image
        image_position : row of the input image in k_semantics
        array_of_all_images : image ids, aligned with the rows of k_semantics
        array_location_vector : location name -> (start_index, end_index),
            both inclusive, of that location's rows
        A location is scored by the highest image score inside its range.
        Raises ValueError when a location's range selects no scores.
        """
        vector_of_input_image = k_semantics[image_position]
        similarity_score_images = self.ut.get_similarity_scores(
            k_semantics, vector_of_input_image)

        # Image ids paired with their score against the input image.
        image_and_score = [[image, score] for image, score in zip(
            array_of_all_images, similarity_score_images)]
        sorted_sim_vector = sorted(image_and_score,
                                   key=lambda x: x[1],
                                   reverse=True)
        print("5 most similar images with matching score is :")
        print(sorted_sim_vector[:5])

        # The mapping does not change per location, so fetch and invert it
        # once. A name without an id yields None instead of silently
        # reusing the previous location's id.
        mapping = self.data_extractor.location_mapping()
        location_name_to_id = {
            location_name: loc_id
            for loc_id, location_name in mapping.items()
        }

        loc_img_score = []
        for key, bounds in array_location_vector.items():
            start_index, end_index = bounds[0], bounds[1]
            top_value = max(similarity_score_images[start_index:end_index + 1])
            loc_img_score.append(
                [location_name_to_id.get(key), key, top_value])

        top_locations = sorted(loc_img_score,
                               key=lambda x: x[2],
                               reverse=True)[:5]
        print("5 most similar locations with matching score is :")
        print(top_locations)

    def runner(self):
        """
        Method: Prompts for the model, k, image id and algorithm (SVD, PCA or
        LDA), reduces the dataset, prints the latent semantics and the most
        similar images and locations. Any exception is reported with
        constants.GENERIC_EXCEPTION_MESSAGE instead of being propagated.
        """
        try:
            model = input("Enter the model : ")
            k = input("Enter the value of k :")
            image_id = input("Enter image ID : ")
            array_of_all_images, image_input_array, image_position, \
            array_location_vector = self.data_extractor.prepare_dataset_for_task3(model, image_id)
            algo_choice = input("Enter the Algorithm: ")
            algorithms = {
                "SVD": self.ut.dim_reduce_SVD,
                "PCA": self.ut.dim_reduce_PCA,
                "LDA": self.ut.dim_reduce_LDA
            }
            reduce_dimensions = algorithms.get(algo_choice)
            if reduce_dimensions is None:
                # Clearer than the "NoneType is not callable" TypeError.
                raise ValueError("Unsupported algorithm: " + str(algo_choice))
            k_semantics = reduce_dimensions(image_input_array, k)
            print(k_semantics)
            self.calculate_similarity(k_semantics, image_position,
                                      array_of_all_images,
                                      array_location_vector)
        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
class Task4(object):
    """Location-location similarity on latent semantics of one model."""

    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()

    def _compute_location_similarity(self, arr, location_list_indices,
                                     mapping, location_id):
        """Return {location: score} of every location against the input
        location.

        For each image of the input location the best similarity
        1 / (1 + euclidean distance) over the target location's images is
        taken; the score is the average of these best values.
        """
        query_bounds = location_list_indices[mapping[location_id]]
        # Start at the input location's own first row. runner() places the
        # input location first, where this equals the previous fixed 0.
        query_rows = range(query_bounds[0], query_bounds[1])

        location_similarity = {}
        for location, bounds in location_list_indices.items():
            target_rows = range(bounds[0], bounds[1])
            best_matches = [
                max(1 / (1 + spatial.distance.euclidean(arr[i], arr[j]))
                    for j in target_rows) for i in query_rows
            ]
            location_similarity[location] = sum(best_matches) / len(
                best_matches)
        return location_similarity

    def calculate_location_similarity(self, arr, location_list_indices,
                                      mapping, location_id):
        """
        Method: Prints the 5 locations most similar to the input location as
        (location, score) pairs in decreasing order of score.
        arr : reduced dataset, one row per image
        location_list_indices : location -> (start, end) row range in arr,
            end exclusive
        mapping : location id -> location key used in location_list_indices
        location_id : id of the reference location
        Raises ValueError when a target location has no rows and
        ZeroDivisionError when the input location has no rows.
        """
        location_similarity = self._compute_location_similarity(
            arr, location_list_indices, mapping, location_id)
        print(
            sorted(location_similarity.items(),
                   key=lambda x: x[1],
                   reverse=True)[:5])

    def runner(self):
        """
        Method: Prompts for the location id, model, k and algorithm (SVD, PCA
        or LDA), reduces the images of all locations (input location first),
        prints the latent semantics of the input location and the 5 most
        similar locations. An unknown location id prints
        constants.LOCATION_ID_KEY_ERROR; any other exception is reported with
        constants.GENERIC_EXCEPTION_MESSAGE.
        """
        try:
            # create the location_id-locationName mapping
            mapping = self.data_extractor.location_mapping()

            location_id = input("Enter the location id:")
            # Lookup kept for validation only: an unknown id raises KeyError
            # here, before the remaining prompts.
            mapping[location_id]
            model = input("Enter the model: ")
            k = input("Enter value of k: ")
            algo_choice = input("Enter the Algorithm: ")

            # create the list of all files of the given model
            file_list = self.data_extractor.create_dataset(
                mapping, model, location_id)

            # all location images in one list, input location first
            input_image_list, location_list_indices, input_location_index = \
                self.data_extractor.append_givenloc_to_list(
                    mapping, model, location_id, file_list)

            input_image_arr = self.ut.convert_list_to_numpyarray(
                input_image_list)

            algorithms = {
                "SVD": self.ut.dim_reduce_SVD,
                "PCA": self.ut.dim_reduce_PCA,
                "LDA": self.ut.dim_reduce_LDA
            }
            reduce_dimensions = algorithms.get(algo_choice)
            if reduce_dimensions is None:
                # Clearer than the "NoneType is not callable" TypeError.
                raise ValueError("Unsupported algorithm: " + str(algo_choice))

            # get the k latent semantics
            k_semantics = reduce_dimensions(input_image_arr, k)
            print(k_semantics[0:input_location_index])

            self.calculate_location_similarity(k_semantics,
                                               location_list_indices, mapping,
                                               location_id)
        except KeyError:
            print(constants.LOCATION_ID_KEY_ERROR)
        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
class Task1():
    """Build an image-image similarity graph and keep the k best edges per image.

    Files written below constants.VISUALIZATIONS_DIR_PATH:
      entire_graph_file.txt       -- every (image1, image2, score) edge
      reduced_graph_file_<k>.txt  -- the k best edges of every image
      visualisation_graph_file.txt-- the first 10 * k reduced edges
    The image id -> index mapping is pickled below
    constants.DUMPED_OBJECTS_DIR_PATH and the neighbour listing is written to
    constants.TASK1_OUTPUT_FILE.
    """

    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()

    def _reduced_graph_path(self, k):
        """ Method: path of the reduced (top k) edge list file for the given k"""
        return (constants.VISUALIZATIONS_DIR_PATH + "reduced_graph_file_" +
                str(k) + ".txt")

    def generate_imgximg_edgelist(self, image_list1, image_list2,
                                  image_feature_map, k):
        """
        Method: generate_imgximg_edgelist writes the image to image similarity
        of every pair (image_list1 x image_list2) as an edge list, appends the
        k best edges of every image of image_list1 to the reduced graph file
        and pickles the image id -> position mapping of image_list1.

        image_list1, image_list2: image ids.
        image_feature_map: key => image id, value => feature vector.
        k: number of edges to keep per image.
        """
        # top_k() appends to the reduced graph file, so start from an empty
        # file; otherwise rows of an earlier run with the same k are mixed in.
        with open(self._reduced_graph_path(k), "w"):
            pass

        image_id_mapping = {}
        with open(constants.VISUALIZATIONS_DIR_PATH + "entire_graph_file.txt",
                  "w") as imgximg_edgelist_file:
            for index1, image1 in enumerate(image_list1):
                features_image1 = image_feature_map[image1]
                local_img_img_sim_list = []
                for image2 in image_list2:
                    # Distance 0 maps to score 1; larger distances tend to 0.
                    score = 1 / (1 + self.calculate_similarity(
                        features_image1, image_feature_map[image2]))
                    imgximg_edgelist_file.write(
                        str(image1) + " " + str(image2) + " " + str(score) +
                        "\n")
                    local_img_img_sim_list.append((image1, image2, score))
                self.top_k(local_img_img_sim_list, k)
                image_id_mapping[image1] = index1

        # Written only after the whole edge list succeeded, so a failure above
        # does not leave an empty pickle behind.
        with open(constants.DUMPED_OBJECTS_DIR_PATH + "image_id_mapping.pickle",
                  "wb") as image_id_mapping_file:
            pickle.dump(["Image_id mapping:", image_id_mapping],
                        image_id_mapping_file)

    def calculate_similarity(self, features_image1, features_image2):
        """ Method: image-image euclidean distance between two feature vectors"""
        return self.ut.compute_euclidean_distance(np.array(features_image1),
                                                  np.array(features_image2))

    def top_k(self, graph_list, k):
        """
        Method: top_k appends the k best edges of graph_list to the reduced
        graph file of this k. Edges are (image1, image2, score) tuples, ordered
        by decreasing score and then by image2 and image1 to break ties.
        """
        best_edges = sorted(graph_list,
                            key=lambda x: (-x[2], x[1], x[0]))[0:k]
        with open(self._reduced_graph_path(k), "a+") as reduced_graph_file:
            for edge in best_edges:
                reduced_graph_file.write(
                    str(edge[0]) + " " + str(edge[1]) + " " + str(edge[2]) +
                    "\n")

    def create_graph(self, k):
        """
        Method: create_graph copies the first 10 * k reduced edges to the
        visualisation file, writes the neighbour ids of the remaining edges to
        the task 1 output file (a "####" line after every k ids), then loads
        the visualisation file as a directed weighted graph, draws it and
        returns it.
        """
        reduced_graph_path = self._reduced_graph_path(k)
        k = int(k)
        visualisation_path = (constants.VISUALIZATIONS_DIR_PATH +
                              "visualisation_graph_file.txt")

        with open(reduced_graph_path, "r") as reduced_graph_file, \
                open(visualisation_path, "w") as visualise_graph_file, \
                open(constants.TASK1_OUTPUT_FILE, "w") as task1_output_file:
            for _ in range(10 * k):
                visualise_graph_file.write(reduced_graph_file.readline())

            # NOTE(review): iteration continues after the lines consumed
            # above, so the first 10 * k edges never reach the output file --
            # confirm this is intended.
            ids_in_group = 0
            for line in reduced_graph_file:
                # Second token of "image1 image2 score" is the neighbour id.
                task1_output_file.write(line.split(" ")[1] + "\n")
                ids_in_group += 1
                # Separator after every k ids; no id is replaced by it.
                if ids_in_group >= k:
                    task1_output_file.write("####\n")
                    ids_in_group = 0

        # Both output files are closed (flushed) before the graph is read.
        g = nx.read_edgelist(visualisation_path,
                             nodetype=int,
                             data=(('weight', float),),
                             create_using=nx.DiGraph())
        print("graph created")
        nx.draw(g, with_labels=True)
        plt.show()
        return g

    def runner(self):
        """
        Method: runner implemented for all the tasks, takes user input, and
        prints desired results. Any failure is reported on stdout with the
        generic exception message.
        """
        try:
            k = int(input("Enter the value of k:\t"))
            image_feature_map = self.data_extractor.prepare_dataset_for_task1(
                self.mapping)
            image_list = list(image_feature_map.keys())
            self.generate_imgximg_edgelist(image_list, image_list,
                                           image_feature_map, k)
            self.create_graph(k)
        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
def __init__(self):
    """Store the model names, the location names and a Task1 helper."""
    self.models = constants.MODELS
    # Only the location names (the values of the id -> name mapping) are kept.
    location_names = DataExtractor().location_mapping().values()
    self.locations = list(location_names)
    self.task1 = Task1()
class Task5(object):
    """Rank locations by similarity to a query location using latent semantics.

    The dataset prepared by the data extractor is reduced to ``k`` latent
    semantics (SVD, PCA or LDA). The reduced data is cached as a pickle below
    constants.DUMPED_OBJECTS_DIR_PATH, one file per (k, algorithm).
    """

    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()

    def _cache_path(self, algo_choice, k):
        """ Method: path of the pickle holding the reduced data for (k, algorithm)"""
        return (constants.DUMPED_OBJECTS_DIR_PATH + "task5_k" + str(k) +
                algo_choice + ".pickle")

    def _load_cached_semantics(self, algo_choice, k):
        """
        Method: _load_cached_semantics returns the cached
        (k_semantics, location_indices_map) pair, or None when the cache file
        is missing, unreadable, truncated or not a valid pickle.
        """
        try:
            with open(self._cache_path(algo_choice, k), "rb") as cache_file:
                # NOTE: pickle executes code on load; only read cache files
                # written by this program.
                return pickle.load(cache_file)
        except (OSError, EOFError, pickle.UnpicklingError):
            return None

    def calculate_location_similarity(self, k_semantics, location_indices_map,
                                      algo_choice, input_location):
        """
        Method: calculate_location_similarity computes the similarity score of
        the input location with respect to every location in self.mapping on
        the reduced dataset and prints the five best ones. The low dimensional
        dataset has no reference to visual descriptor models.

        k_semantics: low dimensional dataset (total number of images X k)
        location_indices_map: stores key => location, value => indices in
            k_semantics
        algo_choice: unused; kept so a different similarity metric per
            algorithm can be dispatched on it later
        input_location: reference location
        """
        query_bounds = location_indices_map[input_location]
        query_data = k_semantics[query_bounds[0]:query_bounds[1]]

        location_location_similarity_map = OrderedDict()
        for location in self.mapping.values():
            bounds = location_indices_map[location]
            location_location_similarity_map[location] = (
                self.distance_based_similarity_computation(
                    query_data, k_semantics[bounds[0]:bounds[1]]))

        self.top_5(location_location_similarity_map)

    def distance_based_similarity_computation(self, location1_data,
                                              location2_data):
        """
        Method: distance_based_similarity_computation takes, for every image
        of location1_data, the highest score returned by
        Util.get_similarity_scores against location2_data and returns the
        average of these best scores. Returns 0.0 when either location has no
        images instead of raising.

        location1_data: low dimensional data of the input location
            (number of images in location 1 X k)
        location2_data: low dimensional data of the other location
            (number of images in location 2 X k)
        """
        best_scores = [
            # default=0.0 covers an empty score list (no images to compare).
            max(self.ut.get_similarity_scores(location2_data, row),
                default=0.0) for row in location1_data
        ]
        if not best_scores:
            return 0.0
        return sum(best_scores) / len(best_scores)

    def top_5(self, location_location_similarity_map):
        """
        Method: top_5 prints the five (location, score) pairs with the highest
        score, in decreasing order of score.

        location_location_similarity_map: stores similarity score between
            input and other locations in dataset.
        """
        print(
            sorted(location_location_similarity_map.items(),
                   key=lambda x: x[1],
                   reverse=True)[:5])

    def print_latent_semantics_for_input_location(self, k_semantics,
                                                  input_location,
                                                  location_indices_map):
        """
        Method: print_latent_semantics_for_input_location prints the rows of
        k_semantics that belong to the location input by the user.

        k_semantics: low dimensional dataset (total number of images X k)
        input_location: user input
        location_indices_map: stores key => location, value => indices in
            k_semantics
        """
        location_indices = location_indices_map[input_location]
        print(k_semantics[location_indices[0]:location_indices[1]])

    def compute_similarity_wrapper(self, k_semantics, input_location,
                                   location_indices_map, algo_choice):
        """ Method: prints the latent semantics of the input location, then the five most similar locations"""
        self.print_latent_semantics_for_input_location(k_semantics,
                                                       input_location,
                                                       location_indices_map)
        self.calculate_location_similarity(k_semantics, location_indices_map,
                                           algo_choice, input_location)

    def fetch_k_semantics(self, algo_choice, location_id, k):
        """
        Method: fetch_k_semantics reduces the task 5 dataset to k latent
        semantics with the chosen algorithm, caches the result and prints the
        similarity results for the given location.

        Raises KeyError for an unknown location id and ValueError for an
        unsupported algorithm.
        """
        input_location = self.mapping[location_id]

        algorithms = {
            "SVD": self.ut.dim_reduce_SVD,
            "PCA": self.ut.dim_reduce_PCA,
            "LDA": self.ut.dim_reduce_LDA
        }
        reduce_dimensions = algorithms.get(algo_choice)
        if reduce_dimensions is None:
            raise ValueError("Unsupported algorithm '" + algo_choice +
                             "', expected one of: " + ", ".join(algorithms))

        # model_feature_length_map is unused but retained in case a later
        # change needs it.
        data, location_indices_map, model_feature_length_map = \
            self.data_extractor.prepare_dataset_for_task5(self.mapping, k)
        matrix = np.array(list(data.values()))
        k_semantics = reduce_dimensions(matrix, k)

        # The cache file is opened only once there is something to store, so
        # a failure above cannot leave an empty, unloadable pickle behind.
        with open(self._cache_path(algo_choice, k), "wb") as task5_pkl_file:
            pickle.dump((k_semantics, location_indices_map), task5_pkl_file)

        self.compute_similarity_wrapper(k_semantics, input_location,
                                        location_indices_map, algo_choice)

    def runner(self):
        """
        Method: runner implemented for all the tasks, takes user input, runs
        the dimensionality reduction algorithm, prints latent semantics for
        the input location and the five most similar locations.

        For 2 < k < 9 a cached reduction is reused when one can be loaded;
        otherwise the reduction is computed (and cached). Errors are reported
        on stdout.
        """
        location_id = input("Enter the location id:")
        k = input("Enter value of k: ")
        algo_choice = input("Enter the Algorithm: ")

        try:
            input_location = self.mapping[location_id]

            cached = None
            if 2 < int(k) < 9:
                cached = self._load_cached_semantics(algo_choice, k)

            if cached is None:
                # Computed inside the try so failures are reported like any
                # other error of this runner.
                self.fetch_k_semantics(algo_choice, location_id, k)
            else:
                k_semantics = cached[0]
                location_indices_map = cached[1]
                self.compute_similarity_wrapper(k_semantics, input_location,
                                                location_indices_map,
                                                algo_choice)
        except KeyError:
            print(constants.LOCATION_ID_KEY_ERROR)
        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
def runner(self):
    """Print the top-k latent semantics of a location-location similarity matrix.

    Steps performed below:
      1. Read k from the user.
      2. For every query location and every model file of that location,
         compute a distance to every other location (via
         self.get_the_distance_value) and sort the locations by distance.
      3. For every query location, sum the rank (position in the sorted list)
         of each target location over all models; this rank sum is the
         distance score stored in the matrix.
      4. Convert the scores to similarities with
         (MAX_SCORE - score) / MAX_SCORE.
      5. Run SVD on the similarity matrix and, for each of the first k rows
         of Vt, print the location name / weight pairs in decreasing order of
         weight.

    Location ids returned by location_title_to_id_mapping() are used as
    1-based integer matrix indices, while location_mapping() is looked up
    with string ids.
    """
    k = input('Enter the k value: ')
    k = int(k)

    util = Util()
    data_extractor = DataExtractor()

    location_id_to_title_map = data_extractor.location_mapping()
    location_title_to_id_map = data_extractor.location_title_to_id_mapping()

    location_list = list(location_id_to_title_map.values())

    LOCATION_COUNT = len(location_list)  # constant
    MODEL_COUNT = len(constants.MODELS)
    # Largest possible rank sum: rank (LOCATION_COUNT - 1) in every model.
    # NOTE(review): assumes every location has one file per entry of
    # constants.MODELS -- confirm. MAX_SCORE is 0 (division by zero further
    # down) when there is a single location or no model.
    MAX_SCORE = (LOCATION_COUNT - 1) * MODEL_COUNT
    FILE_PATH_PREFIX = constants.PROCESSED_VISUAL_DESCRIPTORS_DIR_PATH  # '../dataset/visual_descriptors/processed/' # constant

    # Per query location and model: list of target locations with distances.
    # {
    #   1: {'CM': [{'location_id': 1, 'distance': 0}, {'location_id':2, 'distance': 0.45}, ...], 'CN': [...], ... },
    #   2: {'CM': [...], 'CN': [...], ...},
    #   ... ,
    #   <query_location>: {
    #     <model>: [{'location_id': <location_id>, 'distance': <distance>}, {'location_id': <location_id>, 'distance': <distance>}],
    #     <model>: [...],
    #     ...
    #   }
    # }
    global_location_distance_data_dict = {}

    # Per query location: accumulated rank sum of every target location.
    # {
    #   1: {1: 0, 2: 0.54, 3: 0.43, ...},
    #   2: { 1: 0.45, 2: 0, ...},
    #   ... ,
    #   <query_location>: { <target_location>: <distance>, <target_location>: <distance>, ...}
    # }
    location_wise_distance_data_dict = {}
    similarity_matrix = numpy.zeros((LOCATION_COUNT, LOCATION_COUNT))

    print('Starting...')

    # Go over every location as a potential query location
    for query_location in location_list:
        query_location_files = data_extractor.get_all_files_prefixed_with(
            query_location)
        query_location_id = location_title_to_id_map[query_location]

        if not global_location_distance_data_dict.get(query_location_id):
            global_location_distance_data_dict[query_location_id] = {}
        if not location_wise_distance_data_dict.get(query_location_id):
            location_wise_distance_data_dict[query_location_id] = {}

        print('Query Location: ', query_location)

        # Go over every model file in the query location
        for query_model_file in query_location_files:
            # File names have the form "<location> <model>.csv".
            query_model_name_with_csv = query_model_file.split(" ")[
                1]  # CM.csv, CN.csv, <modelName>.csv, ...
            query_model = query_model_name_with_csv.split(".")[
                0]  # CM, CN, CN3x3, <modelName>, ...
            query_file_path = FILE_PATH_PREFIX + query_model_file

            query_model_df = pd.read_csv(query_file_path, header=None)
            # Drop the first column before comparing.
            # NOTE(review): presumably the image id column -- confirm.
            del query_model_df[0]
            query_model_df = query_model_df.reset_index(drop=True)
            query_model_df_row_count = query_model_df.shape[0]

            if not global_location_distance_data_dict.get(
                    query_location_id).get(query_model):
                global_location_distance_data_dict[query_location_id][
                    query_model] = []

            print('\tQuery Model: ', query_model)

            # Go over every location as a potential target location for which
            # we will compute the distance from the query location
            for target_location in location_list:
                target_location_id = location_title_to_id_map[
                    target_location]

                # If query location == target location, distance = 0
                if query_location == target_location:
                    # NOTE(review): this local is not read afterwards; the
                    # literal 0 is what gets stored.
                    distance = 0
                    global_location_distance_data_dict[query_location_id][
                        query_model].append({
                            'location_id': target_location_id,
                            'distance': 0
                        })
                else:
                    # Find the corresponding model file of the query location
                    # in the target location
                    target_model_file_path = FILE_PATH_PREFIX + target_location + " " + query_model + ".csv"
                    target_model_df = pd.read_csv(target_model_file_path,
                                                  header=None)
                    # NOTE(review): the copy and the two counts below are
                    # never read in this function.
                    target_model_df_copy = target_model_df.copy()
                    del target_model_df[0]
                    target_model_df = target_model_df.reset_index(
                        drop=True)
                    target_model_df_row_count = target_model_df.shape[0]
                    target_model_df_column_count = target_model_df.shape[1]

                    # Calculate the distance between the query location's
                    # model file and the target location's corresponding
                    # model file
                    distance = self.get_the_distance_value(
                        query_model_df, target_model_df,
                        query_model_df_row_count, query_model, util)

                    global_location_distance_data_dict[query_location_id][
                        query_model].append({
                            'location_id': target_location_id,
                            'distance': distance
                        })

                # Set distance temporarily as 0 in the
                # location_wise_distance_data_dict for this location
                if not location_wise_distance_data_dict.get(
                        query_location_id).get(target_location_id):
                    location_wise_distance_data_dict[query_location_id][
                        target_location_id] = 0

            # At this stage, we have gone over every target location with the
            # corresponding model file from the query location.
            # Sort the model based location list by distance from the query
            # location; the position in this list is the rank used below.
            sorted_list = sorted(
                global_location_distance_data_dict[query_location_id]
                [query_model],
                key=lambda k: k['distance'])
            global_location_distance_data_dict[query_location_id][
                query_model].clear()
            global_location_distance_data_dict[query_location_id][
                query_model] = sorted_list
            # Repeat the loop, do it for every model file of the query location

        location_data_dict = global_location_distance_data_dict[
            query_location_id]

        # Compute the ranking of similar locations for the query location:
        # add each location's rank in every model's sorted list.
        for curr_model, distance_list in location_data_dict.items():
            for index, curr_location_distance_data in enumerate(
                    distance_list):
                curr_location_id = curr_location_distance_data[
                    'location_id']
                curr_val = location_wise_distance_data_dict[
                    query_location_id][curr_location_id]
                location_wise_distance_data_dict[query_location_id][
                    curr_location_id] = curr_val + index

        # Location ids are 1-based, matrix indices 0-based.
        for l_id, dist in location_wise_distance_data_dict[
                query_location_id].items():
            similarity_matrix[query_location_id - 1][l_id - 1] = dist  # Add this to similarity matrix

    print(similarity_matrix)

    # Generate CSVs of the current similarity matrix (given by distances derived from the ranks of individual models)
    # df = pd.DataFrame(similarity_matrix)
    # loc_list = []
    # for i in range(1,31):
    #     loc_list.append(location_id_to_title_map[str(i)])
    # # Generate the distance matrix as CSV
    # df.to_csv('./generated_data/distance_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False)
    # df.to_csv('./generated_data/distance_matrix_vd_minmax_descriptive.csv', encoding='utf-8', header=loc_list, index=loc_list)

    # Convert distance score to similarity score.
    # NOTE: this is an alias, not a copy -- similarity_matrix itself is
    # overwritten by the conversion below.
    converted_similarity_matrix = similarity_matrix
    for row in range(len(converted_similarity_matrix)):
        for col in range(len(converted_similarity_matrix[0])):
            # In the dev set case, it scales distance score that ranges from
            # 0-290 in the computation to a similarity score ranging from 0-1
            converted_similarity_matrix[row][col] = (
                (float)(MAX_SCORE - converted_similarity_matrix[row][col]) /
                MAX_SCORE)

    # Generate the similarity matrix as CSV if needed
    # df = pd.DataFrame(converted_similarity_matrix)
    # df.to_csv('./generated_data/similarity_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False)
    # df.to_csv('./generated_data/similarity_matrix_vd_descriptive.csv', encoding='utf-8')

    # Apply SVD on the data
    U, S, Vt = numpy.linalg.svd(converted_similarity_matrix)

    # Keyed by the 1-based latent semantic number:
    # {
    #   <latent_semantic>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...],
    #   <latent_semantic>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...],
    #   ...
    # }
    semantic_data_dict = {}

    # Each of the first k rows of Vt is one latent semantic.
    for arr_index, arr in enumerate(Vt[:k, :]):
        if not semantic_data_dict.get(arr_index + 1):
            semantic_data_dict[arr_index + 1] = []

        for index, element in enumerate(arr):
            # Column index + 1 is the location id; the id -> title map is
            # keyed by strings.
            semantic_data_dict[arr_index + 1].append({
                'Location Name': location_id_to_title_map[str(index + 1)],
                'Weight': element
            })

        # Sort the list based on the weight attribute
        sorted_list = sorted(semantic_data_dict[arr_index + 1],
                             key=itemgetter('Weight'),
                             reverse=True)
        semantic_data_dict[arr_index + 1].clear()
        semantic_data_dict[arr_index + 1] = sorted_list

        # Print the latent semantic as location name-weight pairs sorted in
        # decreasing order of weights
        print('Latent Semantic: ', arr_index + 1)
        for idx, data in enumerate(sorted_list):
            print('\tLocation Name: ',
                  semantic_data_dict[arr_index + 1][idx]['Location Name'],
                  '| Weight: ',
                  semantic_data_dict[arr_index + 1][idx]['Weight'])