def generate_img_img_adj_matrix(self):
    """ Method: generate image-image similarity matrix and stash in a pickle file"""
    print("getting and normalizing data...")
    data_extractor = DataExtractor()
    loc_mapping = data_extractor.location_mapping()
    self.img_feature_matrix = data_extractor.prepare_dataset_for_task6(
        loc_mapping)
    scaler = MinMaxScaler()
    scaler.fit(list(self.img_feature_matrix.values()))
    for img, feature in self.img_feature_matrix.items():
        self.img_feature_matrix[img] = scaler.transform([feature])[0]
    self.img_ids = list(self.img_feature_matrix.keys())
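
# A minimal, self-contained sketch of the normalization pattern used above,
# with a toy feature map standing in for DataExtractor's output (the image
# IDs and feature values here are hypothetical):
from sklearn.preprocessing import MinMaxScaler

img_feature_matrix = {
    "img_1": [0.2, 5.0, 10.0],
    "img_2": [0.8, 3.0, 40.0],
    "img_3": [0.5, 1.0, 25.0],
}

scaler = MinMaxScaler()
scaler.fit(list(img_feature_matrix.values()))  # learn per-column min/max
for img, feature in img_feature_matrix.items():
    # transform() expects a 2-D array, hence the wrap and the [0] unwrap
    img_feature_matrix[img] = scaler.transform([feature])[0]

print(img_feature_matrix["img_1"])  # every column now lies in [0, 1]
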
class PreProcessor:
    def __init__(self):
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()
        self.location_names = list(self.mapping.values())
        self.reference_model = 'CM3x3'
        self.model_list = self.init_model_list()
        self.reference_df = pd.DataFrame()
        self.df_list = self.init_df_list()
        self.data_dict = dict()
        self.minmax_scaler = MinMaxScaler()

    def init_model_list(self):
        """
		Method Explanation:
			. Initializes the model_list as every model name other than the reference model for preprocessing.
		"""
        models = ['CN', 'CSD', 'LBP3x3', 'HOG',
                  'GLRLM']  # along with reference model CM3x3

        return models

    def init_df_list(self):
        """
		Method Explanation:
			. Initializes the df_list with one empty dataframe per model.
		"""
        to_return = list()
        for model in self.model_list:
            to_return.append(pd.DataFrame())
        return to_return

    def compute_first_index_lesser_than_one(self, S):
        """
		Method Explanation:
			. Computes the first index in S with a value less than 1, where S is the vector of
			singular values from SVD sorted in decreasing order of weights.
		Input(s):
			S -- Vector of singular values (S in U, S, Vt) sorted in decreasing order of weights.
		Output(s):
			The number of singular values to consider for concept mapping, given by the first
			index in S with a value less than 1 (or all of them if none falls below 1).
		"""
        for index, singular_value in enumerate(S):
            if singular_value < 1:
                return index + 1
        return len(S)  # no singular value falls below 1: keep all concepts

    def compute_latent_semantics(self, feature_matrix):
        """
		Method Explanation:
			. Returns the latent semantic representation of the feature_matrix with 'k' concepts.
			. 'k' -- number of concepts -- index of the first singular value that is less than
			   1 in S, represented as a vector in decreasing order of weights.
		Input(s):
			feature_matrix -- the list of all features of all image IDs on which SVD is performed.
		Output:
			The concept mapping of the feature_matrix in 'k' dimensions/concepts.
		"""

        print('Finding latent semantics of the data...')
        U, S, Vt = np.linalg.svd(feature_matrix)

        print('Dropping concepts with singular value less than 1...')
        k = self.compute_first_index_lesser_than_one(S)
        S = np.diag(S)

        print('Preprocessing done...')
        return np.dot(U[:, :k], S[:k, :k])
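
# A quick standalone check of the rule above on a toy matrix (values are
# illustrative, not project data): k is the first index in S whose singular
# value drops below 1, and the concept mapping is U[:, :k] @ S[:k, :k].
import numpy as np

toy_matrix = np.array([[4.0, 0.0, 1.0],
                       [0.0, 3.0, 1.0],
                       [1.0, 1.0, 0.5]])
U, S, Vt = np.linalg.svd(toy_matrix)
print(S)  # singular values in decreasing order

k = next((i + 1 for i, s in enumerate(S) if s < 1), len(S))
concepts = U[:, :k] @ np.diag(S)[:k, :k]
print(concepts.shape)  # (3, k): each row is an object in concept space
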

    def preprocess_MinMaxScaler(self):
        """
		Method Explanation:
			. Refer to the top of the file for the algorithm for preprocessing.
			. Uses the MinMaxScaling for the normalization of data between 0 and 1.
		"""
        print('\nPreprocessing the data...')
        self.data_dict.clear()
        # print('Current model being processed: ', self.reference_model, '...')

        for location in self.location_names:
            current_df = pd.read_csv("../dataset/visual_descriptors/" +
                                     location + " " + self.reference_model +
                                     ".csv",
                                     header=None)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
            self.reference_df = pd.concat([self.reference_df, current_df],
                                          ignore_index=True)

        self.reference_df = self.reference_df.drop_duplicates(
            subset=[0], keep='first'
        )  # drop duplicate image ID rows and keep the first one.
        columns_to_normalize = np.arange(
            1, self.reference_df.shape[1], 1
        )  # the column indices to which MinMax normalization will be applied.
        self.reference_df[
            columns_to_normalize] = self.minmax_scaler.fit_transform(
                self.reference_df[columns_to_normalize]
            )  # MinMax normalization

        self.data_dict = self.reference_df.set_index(0).T.to_dict(
            'list')  # Filling the data dict

        temp_dict = dict()
        for index, model in enumerate(self.model_list):
            # print('Current model being processed: ', model, '...')
            for location in self.location_names:
                # print('\tLocation being processed: ', location, '...')
                current_df = pd.read_csv("../dataset/visual_descriptors/" +
                                         location + " " + model + ".csv",
                                         header=None)
                # Append the current location's rows to the current model's DF
                # (pd.concat replaces the removed DataFrame.append)
                self.df_list[index] = pd.concat(
                    [self.df_list[index], current_df], ignore_index=True)
            model_df = self.df_list[index]

            model_df = model_df.drop_duplicates(
                subset=[0], keep='first'
            )  # drop duplicate image ID rows and keep the first one.
            columns_to_normalize = np.arange(
                1, model_df.shape[1], 1
            )  # the column indices to which MinMax normalization will be applied.
            model_df[columns_to_normalize] = self.minmax_scaler.fit_transform(
                model_df[columns_to_normalize])
            self.df_list[index] = model_df

            temp_dict = self.df_list[index].set_index(0).T.to_dict('list')

            for key, val in temp_dict.items():
                if key in self.data_dict:
                    # extend only if the image id is already present in the data dict
                    self.data_dict[key].extend(val)

            temp_dict.clear()  # clear the temp dictionary for the next iteration

        # Apply MinMax scaling to the feature concatenated data matrix as well
        the_feature_matrix = self.minmax_scaler.fit_transform(
            np.asarray(list(self.data_dict.values())))

        # Repopulate the data_dict with the rescaled rows
        for index_counter, key in enumerate(self.data_dict.keys()):
            self.data_dict[key] = the_feature_matrix[index_counter]

        return self.data_dict
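
# Usage sketch for the class above, assuming the CSVs under
# ../dataset/visual_descriptors/ are in place (paths as hard-coded in
# preprocess_MinMaxScaler):
pre = PreProcessor()
data_dict = pre.preprocess_MinMaxScaler()  # image_id -> scaled, concatenated features
feature_matrix = np.asarray(list(data_dict.values()))
latent = pre.compute_latent_semantics(feature_matrix)
print(latent.shape)  # (number of images, k)
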
class Task2(object):
    """
	This module is responsible for finding similarity between entities based on the latent semantics
	computed by PCA/LDA/SVD on original vector space (user/location/image)
	"""
    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()

    def calculate_similarity(self, input_vector, k_semantics_map, entity_type):
        '''
		Method : calculate_similarity computes the similarity scores between the
		given input vector and the latent semantics vector of every entity of the
		given entity type
		'''
        similarity_data = []

        for key, value in k_semantics_map.items():
            result = self.ut.cosine_similarity(input_vector, value)
            if entity_type == constants.IMAGE_TEXT:
                key = int(key)
            similarity_data.append((key, result))

        return similarity_data

    def get_k_semantics_map(self, entity_data, k_semantics):
        """
		Method : Returns the k semantics map while linking the entity id from original data 
		with new latent semantics
		entity_data : user_term matrix or image_term matrix or location_term matrix
		k_semantics : new reduced feature space after projecting the original data points with
		dimension o x k
		"""
        entity_ids = list(entity_data.data.master_dict.keys())
        k_semantics_map = {}
        for entity_id, value in zip(entity_ids, k_semantics):
            k_semantics_map[entity_id] = value

        return k_semantics_map

    def top_5(self, similarity_data):
        """
		Method: Prints the top5 similar entities (users/images/locations) with respect to input
		entity
		similarity_data: List of objects containing entity_id and the similarity score between
		respective entity
		"""
        print(sorted(similarity_data, key=lambda x: x[1], reverse=True)[:5])

    def dim_reduce_SVD(self, document_term_matrix, k, pca=False):
        """
		Method: Returns the left factor, Sigma and right factor matrix using SVD and PCA if pca flag
		is True
		document_term_matrix : Input data matrix
		k  : number of hidden concepts
		"""
        document_term_matrix = self.ut.convert_list_to_numpyarray(
            document_term_matrix)
        if pca:
            input_std = StandardScaler().fit_transform(document_term_matrix)
            pca = PCA(n_components=int(k))

            object_concept_matrix = pca.fit_transform(input_std)
            Vt = pca.components_

            # sigma comes back as a flat array of k singular values, so
            # construct a diagonal matrix from it
            S = np.diag(pca.singular_values_)

            U = document_term_matrix @ Vt.T
        else:
            svd = TruncatedSVD(n_components=int(k))
            svd.fit(document_term_matrix)

            object_concept_matrix = svd.transform(document_term_matrix)
            Vt = svd.components_

            # sigma comes back as a flat array of k singular values, so
            # construct a diagonal matrix from it
            S = np.diag(svd.singular_values_)

            U = document_term_matrix @ Vt.T

        return U, S, Vt
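
# A standalone sketch of the TruncatedSVD branch above on toy data, showing
# the shapes that come back (matrix and names here are illustrative):
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
toy_document_term_matrix = rng.random((6, 8))  # 6 documents, 8 terms

svd = TruncatedSVD(n_components=3)
svd.fit(toy_document_term_matrix)
Vt = svd.components_                   # (3, 8): term-to-concept mapping
S = np.diag(svd.singular_values_)      # (3, 3): concept strengths
U = toy_document_term_matrix @ Vt.T    # (6, 3): documents in concept space
print(U.shape, S.shape, Vt.shape)
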

    def dim_reduce_LDA(self, document_term_matrix, k):
        """
		Method: Returns the left factor, Sigma and right factor matrix using LDA
		document_term_matrix : Input data matrix
		k  : number of hidden concepts
		"""
        lda = LatentDirichletAllocation(n_components=int(k),
                                        max_iter=5,
                                        learning_method='online',
                                        random_state=0)
        document_topic_matrix = lda.fit_transform(document_term_matrix)

        term_topic_matrix = lda.components_

        #Getting the topic vs topic matrix (LDA's analogue of the core matrix)
        topic_topic_matrix = term_topic_matrix @ term_topic_matrix.T

        #Projection of original objects along hidden topics
        transformed_document_topic_matrix = document_topic_matrix @ topic_topic_matrix

        return transformed_document_topic_matrix, topic_topic_matrix, term_topic_matrix

    def get_projected_query_vector(self, input_vector, v_matrix, sigma_matrix):
        """
		Method:  Returns the projected query vector onto given latent semantic space
		input_vector : input_query from original data matrix form
		v_matrix : feature vs k concepts matrix
		sigma_matrix : core_matrix
		"""
        projected_query_vector = []

        projected_query_vector = input_vector.T @ v_matrix.T @ np.linalg.inv(
            sigma_matrix)

        return projected_query_vector
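
# The projection above is the standard q' = q^T V S^(-1). A standalone numpy
# check on toy data that projecting a row of the original matrix reproduces
# its row in U:
import numpy as np

A = np.array([[2.0, 0.0, 1.0],
              [0.0, 1.0, 3.0],
              [1.0, 2.0, 0.0],
              [2.0, 1.0, 1.0]])
U, s, Vt = np.linalg.svd(A, full_matrices=False)
S = np.diag(s)

query = A[2]  # take an existing row as the "query"
projected = query.T @ Vt.T @ np.linalg.inv(S)
print(np.allclose(projected, U[2]))  # True: the query lands on its own U row
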

    def get_document_term_matrix(self, entity_data):
        """
		Method : Get the document term matrix for the given entity using the global dictionary of 
		terms.
		entity_data : entity_data matrix
		"""
        global_tag_set = entity_data.get_global_tag_set()
        global_tag_dict = entity_data.convert_dict_from_set(global_tag_set)
        master_matrix = entity_data.create_master_matrix(global_tag_dict)

        return master_matrix

    def get_similar_entities(self,
                             user_term_matrix,
                             image_term_matrix,
                             location_term_matrix,
                             user_S_matrix,
                             user_vt_matrix,
                             image_S_matrix,
                             image_vt_matrix,
                             location_S_matrix,
                             location_vt_matrix,
                             user_id=None,
                             image_id=None,
                             location_id=None):
        """
		Method: Get the similar users, images and locations by projecting the given query vector onto 
		other latent semantic space for all user, image and location entity
		
		user_term_matrix: user document_term matrix
		image_term_matrix: image document_term matrix
		location_term_matrix: location document_term matrix
		user_S_matrix :  sigma matrix after decomposing user data matrix
		image_S_matrix :  sigma matrix after decomposing image data matrix
		location_S_matrix :  sigma matrix after decomposing location data matrix

		user_vt_matrix :  feature vs concepts matrix after decomposing user data matrix
		image_vt_matrix :  feature vs concepts matrix after decomposing image data matrix
		location_vt_matrix :  feature vs concepts matrix after decomposing location data matrix
		"""

        if user_id:
            #If the input is user_id
            user_input_vector = self.user_semantics_map[user_id]

            #For similar user id we can directly use the user U matrix without projecting
            similar_users = self.calculate_similarity(user_input_vector,
                                                      self.user_semantics_map,
                                                      constants.USER_TEXT)

            original_user_input_vector = self.ut.convert_list_to_numpyarray(
                user_term_matrix[self.user_index])

            user_projected_query_vector_image = self.get_projected_query_vector(
                original_user_input_vector, image_vt_matrix, image_S_matrix)
            user_projected_query_vector_location = self.get_projected_query_vector(
                original_user_input_vector, location_vt_matrix,
                location_S_matrix)

            similar_images = self.calculate_similarity(
                user_projected_query_vector_image, self.image_semantics_map,
                constants.IMAGE_TEXT)
            similar_locations = self.calculate_similarity(
                user_projected_query_vector_location,
                self.location_semantics_map, constants.LOCATION_TEXT)

        elif image_id:
            #Given image id, computing the top 5 related images ,users and locations
            image_input_vector = self.image_semantics_map[image_id]

            #For similar user id we can directly use the user U matrix without projecting
            similar_images = self.calculate_similarity(
                image_input_vector, self.image_semantics_map,
                constants.IMAGE_TEXT)

            original_image_input_vector = self.ut.convert_list_to_numpyarray(
                image_term_matrix[self.image_index])

            image_projected_query_vector_user = self.get_projected_query_vector(
                original_image_input_vector, user_vt_matrix, user_S_matrix)
            image_projected_query_vector_location = self.get_projected_query_vector(
                original_image_input_vector, location_vt_matrix,
                location_S_matrix)

            similar_users = self.calculate_similarity(
                image_projected_query_vector_user, self.user_semantics_map,
                constants.USER_TEXT)
            similar_locations = self.calculate_similarity(
                image_projected_query_vector_location,
                self.location_semantics_map, constants.LOCATION_TEXT)

        elif location_id:
            #Given location id, computing the top 5 related locations,users and images
            location_input_vector = self.location_semantics_map[
                self.mapping[location_id]]

            #For similar user id we can directly use the user U matrix without projecting
            similar_locations = self.calculate_similarity(
                location_input_vector, self.location_semantics_map,
                constants.LOCATION_TEXT)
            #self.top_5(similar_locations)

            original_location_input_vector = self.ut.convert_list_to_numpyarray(
                location_term_matrix[self.location_index])

            location_projected_query_vector_image = self.get_projected_query_vector(
                original_location_input_vector, image_vt_matrix,
                image_S_matrix)
            location_projected_query_vector_user = self.get_projected_query_vector(
                original_location_input_vector, user_vt_matrix, user_S_matrix)

            similar_images = self.calculate_similarity(
                location_projected_query_vector_image,
                self.image_semantics_map, constants.IMAGE_TEXT)
            similar_users = self.calculate_similarity(
                location_projected_query_vector_user, self.user_semantics_map,
                constants.USER_TEXT)

        print("Top 5 related users are")
        self.top_5(similar_users)
        print("Top 5 related images are")
        self.top_5(similar_images)
        print("Top 5 related locations are")
        self.top_5(similar_locations)

    def get_all_latent_semantics_map(self, user_data, image_data,
                                     location_data, user_u_matrix,
                                     image_u_matrix, location_u_matrix):

        user_semantics_map = self.get_k_semantics_map(user_data, user_u_matrix)
        image_semantics_map = self.get_k_semantics_map(image_data,
                                                       image_u_matrix)
        location_semantics_map = self.get_k_semantics_map(
            location_data, location_u_matrix)

        return user_semantics_map, image_semantics_map, location_semantics_map

    def runner(self):
        """
		Method: runner implemented for all the tasks, 
		takes user input for type of entity from list of User,Image and Location
		and respective entity_id
		Displays the top 5 entities with respect to input entity 
		using the latent semantics obtained from task1 for respective entity vector space
		"""
        k = input("Enter the value of k :")

        algo_choice = input("Enter the Algorithm: ")

        entity_index = int(
            input("Choose the entity:\t1) User\t2) Image\t3) Location: "))

        user_id, image_id, location_id = None, None, None

        if entity_index == 1:
            self.entity_type = constants.USER_TEXT
            user_id = input("Enter the user id: ")

        elif entity_index == 2:
            self.entity_type = constants.IMAGE_TEXT
            image_id = input("Enter the image id: ")

        elif entity_index == 3:
            self.entity_type = constants.LOCATION_TEXT
            location_id = input("Enter the location id: ")
        """
		Get the document term matrix for users,images and locations from task1
		"""
        user_data = Task1()
        user_data.load_data_per_entity(constants.USER_TEXT)
        user_term_matrix = self.get_document_term_matrix(user_data)
        user_term_matrix = self.ut.convert_list_to_numpyarray(
            user_term_matrix).T

        image_data = Task1()
        image_data.load_data_per_entity(constants.IMAGE_TEXT)
        image_term_matrix = self.get_document_term_matrix(image_data)
        image_term_matrix = self.ut.convert_list_to_numpyarray(
            image_term_matrix).T

        location_data = Task1()
        location_data.load_data_per_entity(constants.LOCATION_TEXT)
        location_term_matrix = self.get_document_term_matrix(location_data)
        location_term_matrix = self.ut.convert_list_to_numpyarray(
            location_term_matrix).T

        if self.entity_type == constants.USER_TEXT:
            try:
                self.user_index = list(
                    user_data.data.master_dict.keys()).index(user_id)
            except ValueError:
                raise ValueError(constants.USER_ID_KEY_ERROR)
        elif self.entity_type == constants.IMAGE_TEXT:
            try:
                self.image_index = list(
                    image_data.data.master_dict.keys()).index(image_id)
            except ValueError:
                raise ValueError(constants.IMAGE_ID_KEY_ERROR)
        elif self.entity_type == constants.LOCATION_TEXT:
            try:
                input_location = self.mapping[location_id]
                self.location_index = list(
                    location_data.data.master_dict.keys()).index(
                        input_location)
            except ValueError:
                raise ValueError(constants.LOCATION_ID_KEY_ERROR)

        if algo_choice == 'SVD' or algo_choice == 'PCA':
            pca = (algo_choice == 'PCA')
            """
			Decompose the original document term matrix into U,S and Vt using SVD
			For PCA we pass pca flag to indicate the passing of covariance matrix in the SVD method.
			"""
            user_u_matrix, user_S_matrix, user_vt_matrix = self.dim_reduce_SVD(
                user_term_matrix, k, pca)
            image_u_matrix, image_S_matrix, image_vt_matrix = self.dim_reduce_SVD(
                image_term_matrix, k, pca)
            location_u_matrix, location_S_matrix, location_vt_matrix = self.dim_reduce_SVD(
                location_term_matrix, k, pca)
            """
			Get the latent semantics for users, images and locations
			"""
            user_semantics_map, image_semantics_map,location_semantics_map = \
              self.get_all_latent_semantics_map(user_data,image_data,location_data,
               user_u_matrix,image_u_matrix,location_u_matrix)

            self.user_semantics_map = user_semantics_map
            self.image_semantics_map = image_semantics_map
            self.location_semantics_map = location_semantics_map
            """
			Get the similar cross entities given a entity id. eg userid -> similar users, images,
			and locations,  imageid -> similar images, locations and users.
			"""
            self.get_similar_entities(user_term_matrix, image_term_matrix,
                                      location_term_matrix, user_S_matrix,
                                      user_vt_matrix, image_S_matrix,
                                      image_vt_matrix, location_S_matrix,
                                      location_vt_matrix, user_id, image_id,
                                      location_id)

        elif algo_choice == 'LDA':
            """
			Decompose the original document term matrix into the LDA analogues of U, S and Vt
			"""
            user_u_matrix, user_S_matrix, user_vt_matrix = self.dim_reduce_LDA(
                user_term_matrix, k)
            image_u_matrix, image_S_matrix, image_vt_matrix = self.dim_reduce_LDA(
                image_term_matrix, k)
            location_u_matrix, location_S_matrix, location_vt_matrix = self.dim_reduce_LDA(
                location_term_matrix, k)

            user_semantics_map, image_semantics_map,location_semantics_map = \
              self.get_all_latent_semantics_map(user_data,image_data,location_data,
               user_u_matrix,image_u_matrix,location_u_matrix)

            self.user_semantics_map = user_semantics_map
            self.image_semantics_map = image_semantics_map
            self.location_semantics_map = location_semantics_map

            self.get_similar_entities(user_term_matrix, image_term_matrix,
                                      location_term_matrix, user_S_matrix,
                                      user_vt_matrix, image_S_matrix,
                                      image_vt_matrix, location_S_matrix,
                                      location_vt_matrix, user_id, image_id,
                                      location_id)

        #print("Seconds",time.time() - start)
class Task6:
    def __init__(self):
        """
		Method Explanation:
			Initializes all the variables for the analysis task.
		"""
        self.util = Util()
        self.data_extractor = DataExtractor()

        self.location_id_to_title_map = self.data_extractor.location_mapping()
        self.location_title_to_id_map = self.data_extractor.location_title_to_id_mapping(
        )

        self.location_list = list(
            self.location_title_to_id_map.values())  # List of location ids
        self.LOCATION_COUNT = len(self.location_list)  # constant

        self.global_term_dictionary_current_index = 0  # To store the count of unique terms and indexing a given term in the global dictionary
        self.global_term_dictionary = dict(
        )  # To store the global list of terms as keys and their indices as values
        self.global_term_index_dictionary = dict(
        )  # To store the global list of terms referenced via the indices as the keys and terms as the values
        self.location_dictionary = dict(
        )  # To store the terms of a particular location and their corresponding attributes
        self.similarity_matrix = numpy.zeros(
            (self.LOCATION_COUNT,
             self.LOCATION_COUNT))  # To capture location-location similarity

    def construct_vocabulary(self):
        """
		Method Explanation:
			. Constructs a global term vocabulary.
			. Constructs a location based term vocabulary.
		"""
        with open(constants.TEXT_DESCRIPTORS_DIR_PATH +
                  "devset_textTermsPerPOI.txt",
                  encoding="utf-8") as f:
            lines = [line.rstrip("\n") for line in f]
            for line in lines:
                words = line.split()

                temp_list_for_title = []
                # extract location title
                while "\"" not in words[0]:
                    temp_list_for_title.append(words.pop(0))
                location_title = ("_").join(temp_list_for_title)
                location_id = self.location_title_to_id_map[location_title]

                # Build the term vocabulary and also the dictionary for terms corresponding to the locations and their scores
                for index, word in enumerate(words):
                    index_mod4 = index % 4

                    if index_mod4 == 0:  # the term
                        current_word = word.strip('\"')
                        if not self.global_term_dictionary.get(current_word):
                            self.global_term_dictionary[
                                current_word] = self.global_term_dictionary_current_index
                            self.global_term_index_dictionary[
                                self.
                                global_term_dictionary_current_index] = current_word
                            self.global_term_dictionary_current_index += 1
                        if not self.location_dictionary.get(location_id):
                            self.location_dictionary[location_id] = {}
                        if not self.location_dictionary.get(location_id).get(
                                current_word):
                            self.location_dictionary[location_id][
                                current_word] = {
                                    "TF": 0,
                                    "DF": 0,
                                    "TFIDF": 0
                                }
                    elif index_mod4 == 1:  # TF
                        self.location_dictionary[location_id][current_word][
                            "TF"] = int(word)
                    elif index_mod4 == 2:  # DF
                        self.location_dictionary[location_id][current_word][
                            "DF"] = int(word)
                    elif index_mod4 == 3:  # TFIDF
                        self.location_dictionary[location_id][current_word][
                            "TFIDF"] = float(word)

    def construct_similarity_matrix(self, model):
        """
		Method Explanation:
			. Goes over every location as a potential query location, compares its textual descriptors with every other location as a
			  potential target location.
			. The comparison is based on the Cosine Similarity scores of one of the model vectors (TF/DF/TFIDF) defined by the <model> parameter.
		Inputs:
			<model> - Has three possible values -- TF, DF, TFIDF. Corresponds to which model score to consider for computing the Cosine Similarity
			          between the textual descriptors.
		"""
        the_model = model
        # Go over every location as a potential query location
        for query_location_id in self.location_list:
            query_model_vector = [0] * self.global_term_dictionary_current_index

            # Construct the query model vector (<the_model> values of each term in the query location)
            for current_term_id_key, current_term_id_value in self.location_dictionary[
                    query_location_id].items():
                if current_term_id_key == the_model:
                    continue
                current_term_index = self.global_term_dictionary[
                    current_term_id_key]
                query_model_vector[
                    current_term_index] = self.location_dictionary[
                        query_location_id][current_term_id_key][the_model]

            # Go over every location as a potential target location
            for target_location_id, target_location_id_data in self.location_dictionary.items(
            ):
                # If query location is the same as target location, similarity = 1
                if target_location_id == query_location_id:
                    self.similarity_matrix[query_location_id -
                                           1][target_location_id - 1] = 1
                    continue
                else:
                    if not self.location_dictionary.get(
                            target_location_id).get(the_model):
                        self.location_dictionary[target_location_id][
                            the_model] = [
                                0
                            ] * self.global_term_dictionary_current_index

                    # Build the target model vector comprising of the_model scores of the target location
                    for current_term_key, current_term_value in self.location_dictionary[
                            target_location_id].items():
                        if current_term_key == the_model:
                            continue
                        current_term_index = self.global_term_dictionary[
                            current_term_key]
                        self.location_dictionary[target_location_id][
                            the_model][
                                current_term_index] = self.location_dictionary[
                                    target_location_id][current_term_key][
                                        the_model]

                    # Compute the Cosine Similarity between the query model vector and target model vector
                    cosine_similarity_value = self.util.cosine_similarity(
                        query_model_vector,
                        self.location_dictionary[target_location_id]
                        [the_model])
                    self.similarity_matrix[query_location_id -
                                           1][target_location_id -
                                              1] = cosine_similarity_value

    def print_k_latent_semantics(self, k):
        """
		Method Explanation:
			. Applies a Singular Value Decomposition on the similarity matrix and prints the first k latent semantics determined by the k parameter.
			. The output is in the form of location-weight pairs for each semantic sorted in the decreasing order of weights.
		Input:
			. <k> for considering only the k latent semantics post SVD
		"""
        U, S, Vt = numpy.linalg.svd(self.similarity_matrix)

        # Get the concept mapping
        concept_mapping = self.similarity_matrix.dot(U[:, :k])
        concept_mapping = concept_mapping.transpose()

        # {
        #  <location_id>: [{"Location Name": <>, "Weight": <>}, {"Location Name": <>, "Weight": <>}, ...],
        #  <location_id>: [{"Location Name": <>, "Weight": <>}, {"Location Name": <>, "Weight": <>}, ...],
        #  ...
        # }
        semantic_data_dict = {}
        print("")
        for arr_index, arr in enumerate(concept_mapping):
            current_key = arr_index + 1
            if not semantic_data_dict.get(current_key):
                semantic_data_dict[current_key] = []

            for index, element in enumerate(arr):
                semantic_data_dict[current_key].append({
                    "Location Name":
                    self.location_id_to_title_map[str(index + 1)],
                    "Weight":
                    element
                })

            # Sort the latent semantic based on the weight of the feature
            sorted_list = sorted(semantic_data_dict[current_key],
                                 key=itemgetter("Weight"),
                                 reverse=True)
            semantic_data_dict[current_key].clear()
            semantic_data_dict[current_key] = sorted_list

            # Print location name-weight pairs sorted in decreasing order of weights
            print("Latent Semantic: ", current_key)
            for idx, data in enumerate(sorted_list):
                print("\tLocation Name: ",
                      semantic_data_dict[current_key][idx]["Location Name"],
                      " | Weight: ",
                      semantic_data_dict[current_key][idx]["Weight"])
            print("")

    def runner(self):
        k = input("Enter the k value: ")
        k = int(k)
        the_model = "TFIDF"
        self.construct_vocabulary()
        self.construct_similarity_matrix(the_model)
        self.print_k_latent_semantics(k)
class Task3(object):
	def __init__(self):
		self.ut = Util()
		self.data_extractor = DataExtractor()

	def calculate_similarity(self, k_semantics, image_position, array_of_all_images, array_location_vector):
		""" Method: image-image and image-location similarity"""

		vector_of_input_image = k_semantics[image_position]

		similarity_score_images = self.ut.get_similarity_scores(k_semantics,vector_of_input_image)

		#Storing all the image IDs and its score with given input image ID
		image_and_score = []
		for i in range(len(array_of_all_images)):
			image_and_score.append([array_of_all_images[i],similarity_score_images[i]])

		#Sorting on the basis of score and printing top 5 images across all locations
		sorted_sim_vector = sorted(image_and_score, key=lambda x: x[1], reverse=True) #sorting the similarity vector
		print("5 most similar images with matching scores are:")
		print(sorted_sim_vector[:5])

		""" The start index and end index for a location is used, the image to image scores
			for that location is sorted and the top value is stored for representing that location.
			The top values of all locations are sorted and the top 5 locations are printed. """

		loc_img_score = []
		mapping = self.data_extractor.location_mapping()
		for key in array_location_vector:
			start_index = array_location_vector[key][0]
			end_index = array_location_vector[key][1]
			top_value = max(similarity_score_images[start_index:end_index + 1])
			for loc_id, location_name in mapping.items():
				if key == location_name:
					location_id = loc_id
			loc_img_score.append([location_id, key, top_value])

		#Sorting on basis of score and printing top 5 locations
		top_locations = sorted(loc_img_score, key=lambda x: x[2], reverse=True)[:5]
		print("5 most similar locations with matching scores are:")
		print(top_locations)

	def runner(self):
		"""
		Method: runner implemented for all the tasks, takes user input, runs dimensionality reduction algorithm, prints
		latent semantics and computes image-image and image-location similarity using the latent semantics.
		"""
		try:
			model = input("Enter the model : ")
			k = input("Enter the value of k :")
			image_id = input("Enter image ID : ")

			array_of_all_images, image_input_array, image_position, \
			array_location_vector = self.data_extractor.prepare_dataset_for_task3(model, image_id)

			algo_choice = input("Enter the Algorithm: ")

			algorithms = { "SVD": self.ut.dim_reduce_SVD, "PCA": self.ut.dim_reduce_PCA , "LDA": self.ut.dim_reduce_LDA}

			k_semantics = algorithms.get(algo_choice)(image_input_array, k)
			print(k_semantics)

			self.calculate_similarity(k_semantics, image_position, array_of_all_images, array_location_vector)

		except Exception as e:
			print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) + "::" + str(e.args))
class Task4(object):
    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()

    def calculate_location_similarity(self, arr, location_list_indices,
                                      mapping, location_id):
        """
		Method: calculate_location_similarity computes similarity score for the reduced location-location dataset.
		Given an input location, we need to find out similarity score of this location with respect to other locations.
		Computes similarity based on euclidean distance. For each comparison of an image in location 1 with all other
		images in location 2, we find out the most similar images. Finally, we return the average for these most similar
		images in location 2 with respect to location 1.
		Note that the low dimensional dataset will not have reference to visual descriptor models.
		k_semantics: low dimensional dataset to be used for similarity computation (total number of images X k)
		location_indices_map: stores key => location, value => indices in k_semantics
		algo_choice: (can be used in case we want to use a different similarity metric for each of the algorithms)
		input_location: reference location
		"""

        location_similarity = {}
        for location in location_list_indices.keys():
            imgximg_exhaustive_sim = []
            imgximg_similarity = []

            for i in range(0, location_list_indices[mapping[location_id]][1]):
                for j in range(location_list_indices[location][0],
                               location_list_indices[location][1]):
                    similarity = spatial.distance.euclidean(arr[i], arr[j])
                    similarity = 1 / (1 + similarity)
                    imgximg_exhaustive_sim.append(similarity)
                imgximg_similarity.append(max(imgximg_exhaustive_sim))
                imgximg_exhaustive_sim = []

            location_similarity.update(
                {location: sum(imgximg_similarity) / len(imgximg_similarity)})

        print(
            sorted(location_similarity.items(),
                   key=lambda x: x[1],
                   reverse=True)[:5])

    def runner(self):
        """
		Method: runner implemented for all the tasks, takes user input, runs dimensionality reduction algorithm, prints
		latent semantics for input location and computes similarity between two locations for a given model using the
		latent semantics.
		"""
        try:
            #create the location_id-locationName mapping
            mapping = self.data_extractor.location_mapping()

            #take the input from user
            location_id = input("Enter the location id:")
            location = mapping[location_id]
            model = input("Enter the model: ")
            k = input("Enter value of k: ")
            algo_choice = input("Enter the Algorithm: ")

            #create the list of all files of the given model
            file_list = self.data_extractor.create_dataset(
                mapping, model, location_id)

            #append all the location images to a list with the first location being the input
            input_image_list, location_list_indices, input_location_index = self.data_extractor.append_givenloc_to_list(\
                             mapping, model,location_id, file_list)

            #convert list to numpy array
            input_image_arr = self.ut.convert_list_to_numpyarray(
                input_image_list)

            #select algorithm
            algorithms = {
                "SVD": self.ut.dim_reduce_SVD,
                "PCA": self.ut.dim_reduce_PCA,
                "LDA": self.ut.dim_reduce_LDA
            }

            #get the k latent semantics
            k_semantics = algorithms.get(algo_choice)(input_image_arr, k)

            print(k_semantics[0:input_location_index])

            self.calculate_location_similarity(k_semantics,
                                               location_list_indices, mapping,
                                               location_id)

        except KeyError:
            print(constants.LOCATION_ID_KEY_ERROR)

        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
class Task1():
	def __init__(self):
		self.ut = Util()
		self.data_extractor = DataExtractor()
		self.mapping = self.data_extractor.location_mapping()

	def generate_imgximg_edgelist(self, image_list1, image_list2, image_feature_map, k):
		""" Method: generate_imgximg_edgelist returns image to image similarity in form of an edge list """
		imgximg_edgelist_file = open(constants.VISUALIZATIONS_DIR_PATH + "entire_graph_file.txt", "w")
		image_id_mapping_file = open(constants.DUMPED_OBJECTS_DIR_PATH + "image_id_mapping.pickle", "wb")
		image_id_mapping = {}

		for index1 in range(0, len(image_list1)):
			local_img_img_sim_list = []
			for index2 in range(0, len(image_list2)):
				image1 = image_list1[index1]
				image2 = image_list2[index2]
				features_image1 = image_feature_map[image1]
				features_image2 = image_feature_map[image2]
				score = 1 / (1 + self.calculate_similarity(features_image1, features_image2))
				imgximg_edgelist_file.write(str(image1) + " " + str(image2) + " " + str(score) + "\n")
				local_img_img_sim_list.append((image1, image2, score))

			self.top_k(local_img_img_sim_list, k)
			image_id_mapping[image1] = index1

		imgximg_edgelist_file.close()
		pickle.dump(["Image_id mapping:", image_id_mapping], image_id_mapping_file)
		image_id_mapping_file.close()

	def calculate_similarity(self, features_image1, features_image2):
		""" Method: image-image similarity computation"""
		return self.ut.compute_euclidean_distance(np.array(features_image1), np.array(features_image2))

	def top_k(self, graph_list, k):
		reduced_graph_file = open(constants.VISUALIZATIONS_DIR_PATH + "reduced_graph_file_" + str(k) + ".txt", "a+")

		top_k = sorted(graph_list, key=lambda x: (-x[2], x[1], x[0]))[0:k]
		for edge in top_k:
			reduced_graph_file.write(str(edge[0]) + " " + str(edge[1]) + " " + str(edge[2]) + "\n")
		reduced_graph_file.close()

	def create_graph(self, k):
		reduced_graph_file = open(constants.VISUALIZATIONS_DIR_PATH + "reduced_graph_file_" + str(k) + ".txt", "r")
		visualise_graph_file = open(constants.VISUALIZATIONS_DIR_PATH + "visualisation_graph_file.txt", "w")
		task1_output_file = open(constants.TASK1_OUTPUT_FILE, "w")

		visualise_len = 10 * int(k)

		for _ in range(visualise_len):
			visualise_graph_file.write(reduced_graph_file.readline())

		count = 0
		for line in reduced_graph_file:
			image_id = line.split(" ")
			count += 1
			if count <= k:
				task1_output_file.write(image_id[1] + "\n")
			else:
				count = 0
				task1_output_file.write("####\n")

		task1_output_file.close()
		visualise_graph_file.close()
		reduced_graph_file.close()

		g = nx.read_edgelist(constants.VISUALIZATIONS_DIR_PATH + "visualisation_graph_file.txt", nodetype=int, \
							data=(('weight',float),), create_using=nx.DiGraph())
		print("graph created")
		nx.draw(g, with_labels=True)
		plt.show()
		return g


	def runner(self):
		"""
		Method: runner implemented for all the tasks, takes user input, and prints desired results.
		"""
		try:
			k = int(input("Enter the value of k:\t"))
			image_feature_map = self.data_extractor.prepare_dataset_for_task1(self.mapping)
			image_list = list(image_feature_map.keys())
			self.generate_imgximg_edgelist(image_list, image_list, image_feature_map, k)
			self.create_graph(k)
		except Exception as e:
			print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) + "::" + str(e.args))
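
# The edge weight written by generate_imgximg_edgelist above is the
# distance-to-similarity transform score = 1 / (1 + d), which maps d = 0 to
# 1.0 and decays toward 0 as distance grows. A tiny standalone example:
import numpy as np

def edge_weight(features_a, features_b):
	# euclidean distance folded into a (0, 1] similarity score
	d = np.linalg.norm(np.array(features_a) - np.array(features_b))
	return 1 / (1 + d)

print(edge_weight([1.0, 2.0], [1.0, 2.0]))  # 1.0 (identical feature vectors)
print(edge_weight([1.0, 2.0], [4.0, 6.0]))  # 1 / (1 + 5) ~= 0.167
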
    def __init__(self):
        self.models = constants.MODELS
        data_extractor = DataExtractor()
        mapping = data_extractor.location_mapping()
        self.locations = list(mapping.values())
        self.task1 = Task1()
class Task5(object):
    def __init__(self):
        self.ut = Util()
        self.data_extractor = DataExtractor()
        self.mapping = self.data_extractor.location_mapping()

    def calculate_location_similarity(self, k_semantics, location_indices_map,
                                      algo_choice, input_location):
        """
		Method: calculate_location_similarity computes similarity score for the reduced location-location dataset.
		Given an input location, we need to find out similarity score of this location with respect to other locations.
		Note that the low dimensional dataset will not have reference to visual descriptor models.
		k_semantics: low dimensional dataset to be used for similarity computation (total number of images X k)
		location_indices_map: stores key => location, value => indices in k_semantics
		algo_choice: (can be used in case we want to use a different similarity metric for each of the algorithms)
		input_location: reference location
		"""
        """
		TODO: If we want to use different similarity metrics for the three models, following switcher can be used.
		similarity_computation = { "SVD": self.distance_based_similarity_computation,
								"PCA": self.distance_based_similarity_computation }
		"""

        locations = list(self.mapping.values())
        location_location_similarity_map = OrderedDict({})
        location1_indices = location_indices_map[input_location]
        location1_data = k_semantics[location1_indices[0]:location1_indices[1]]

        for location_index2 in range(0, len(locations)):
            location2_indices = location_indices_map[
                locations[location_index2]]
            location2_data = k_semantics[
                location2_indices[0]:location2_indices[1]]
            # similarity_score = similarity_computation.get(algo_choice)(location1_data, location2_data)
            similarity_score = self.distance_based_similarity_computation(
                location1_data, location2_data)
            location_location_similarity_map[
                locations[location_index2]] = similarity_score

        self.top_5(location_location_similarity_map)

    def distance_based_similarity_computation(self, location1_data,
                                              location2_data):
        """
		Method: distance_based_similarity_computation computes similarity based on euclidean distance.
		For each comparison of an image in location1_data with all other images in location2_data, we find out the most
		similar images. Finally, we return the average for these most similar images in location2_data with respect to
		location1_data.
		location1_data: Low dimensional dataset for input location (number of images in location 1 X k)
		location2_data: Low dimensional dataset for other locations (number of images in location 2 X k)
		"""

        image_image_similarity = []
        for iterator1 in location1_data:
            local_img_img_similarity = self.ut.get_similarity_scores(
                location2_data, iterator1)
            image_image_similarity.append(max(local_img_img_similarity))

        return sum(image_image_similarity) / len(image_image_similarity)
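
# A compact standalone sketch of this max-match average with plain numpy
# (toy arrays; get_similarity_scores is assumed to return one similarity per
# row of its first argument, here modeled as 1 / (1 + euclidean distance)):
import numpy as np

def location_similarity(location1_data, location2_data):
    scores = []
    for img in location1_data:
        # similarity of this image to every image in the other location
        dists = np.linalg.norm(location2_data - img, axis=1)
        scores.append(np.max(1 / (1 + dists)))  # keep only the best match
    return float(np.mean(scores))

loc1 = np.array([[0.1, 0.9], [0.4, 0.4]])
loc2 = np.array([[0.1, 0.8], [0.9, 0.1]])
print(location_similarity(loc1, loc2))
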

    def top_5(self, location_location_similarity_map):
        """
		Method: top_5 prints the top 5 most similar locations with respect to the input location.
		location_location_similarity_map: stores similarity score between input and other locations in dataset.
		"""

        print(
            sorted(location_location_similarity_map.items(),
                   key=lambda x: x[1],
                   reverse=True)[:5])

    def print_latent_semantics_for_input_location(self, k_semantics,
                                                  input_location,
                                                  location_indices_map):
        """
		Method: print_latent_semantics_for_input_location prints k latent semantics for the location input by user
		k_semantics: low dimensional location-location dataset (total number of images X k)
		input_location: user input
		location_indices_map:  stores key => location, value => indices in k_semantics
		"""

        location_indices = location_indices_map[input_location]
        print(k_semantics[location_indices[0]:location_indices[1]])

    def compute_similarity_wrapper(self, k_semantics, input_location,
                                   location_indices_map, algo_choice):
        self.print_latent_semantics_for_input_location(k_semantics,
                                                       input_location,
                                                       location_indices_map)
        self.calculate_location_similarity(k_semantics, location_indices_map,
                                           algo_choice, input_location)

    def fetch_k_semantics(self, algo_choice, location_id, k):
        task5_pkl_file = open(
            constants.DUMPED_OBJECTS_DIR_PATH + "task5_k" + str(k) +
            algo_choice + ".pickle", "wb")
        input_location = self.mapping[location_id]
        data, location_indices_map, model_feature_length_map = self.data_extractor.prepare_dataset_for_task5\
                           (self.mapping, k)
        # model_feature_length_map is unused but if any code change is required, this will be handy so will retain this.

        matrix = np.array(list(data.values()))
        algorithms = {
            "SVD": self.ut.dim_reduce_SVD,
            "PCA": self.ut.dim_reduce_PCA,
            "LDA": self.ut.dim_reduce_LDA
        }

        k_semantics = algorithms.get(algo_choice)(matrix, k)
        pickle.dump((k_semantics, location_indices_map), task5_pkl_file)
        task5_pkl_file.close()

        self.compute_similarity_wrapper(k_semantics, input_location,
                                        location_indices_map, algo_choice)

    def runner(self):
        """
		Method: runner implemented for all the tasks, takes user input, runs dimensionality reduction algorithm, prints
		latent semantics for input location and computes similarity between two locations using the latent semantics.
		"""

        #take input from user
        location_id = input("Enter the location id:")
        k = input("Enter value of k: ")
        algo_choice = input("Enter the Algorithm: ")

        try:
            input_location = self.mapping[location_id]
            if int(k) > 2 and int(k) < 9:
                task5_read_pkl_file = open(
                    constants.DUMPED_OBJECTS_DIR_PATH + "task5_k" + str(k) +
                    algo_choice + ".pickle", "rb")
                objects = pickle.load(task5_read_pkl_file)
                k_semantics = objects[0]
                location_indices_map = objects[1]

                self.compute_similarity_wrapper(k_semantics, input_location,
                                                location_indices_map,
                                                algo_choice)
            else:
                self.fetch_k_semantics(algo_choice, location_id, k)

        except (OSError, IOError) as e:
            self.fetch_k_semantics(algo_choice, location_id, k)

        except KeyError:
            print(constants.LOCATION_ID_KEY_ERROR)

        except Exception as e:
            print(constants.GENERIC_EXCEPTION_MESSAGE + "," + str(type(e)) +
                  "::" + str(e.args))
    def runner(self):
        k = input('Enter the k value: ')
        k = int(k)

        util = Util()
        data_extractor = DataExtractor()

        location_id_to_title_map = data_extractor.location_mapping()
        location_title_to_id_map = data_extractor.location_title_to_id_mapping(
        )

        location_list = list(location_id_to_title_map.values())

        LOCATION_COUNT = len(location_list)  # constant
        MODEL_COUNT = len(constants.MODELS)
        MAX_SCORE = (LOCATION_COUNT - 1) * MODEL_COUNT

        FILE_PATH_PREFIX = constants.PROCESSED_VISUAL_DESCRIPTORS_DIR_PATH  # '../dataset/visual_descriptors/processed/' # constant
        # {
        # 	1: {'CM': [{'location_id': 1, 'distance': 0}, {'location_id':2, 'distance': 0.45}, ...], 'CN': [...], ... },
        # 	2: {'CM': [...], 'CN': [...], ...},
        #   ... ,
        #   <query_location>: {
        # 						<model>: [{'location_id': <location_id>, 'distance': <distance>}, {'location_id': <location_id>, 'distance': <distance>}],
        # 					   	<model>: [...],
        # 					   	...
        # 					  }
        # }
        global_location_distance_data_dict = {}
        # {
        # 1: {1: 0, 2: 0.54, 3: 0.43, ...},
        # 2: { 1: 0.45, 2: 0, ...},
        # ... ,
        # <query_location>: { <target_location>: <distance>, <target_location>: <distance>, ...}
        # }
        location_wise_distance_data_dict = {}
        similarity_matrix = numpy.zeros((LOCATION_COUNT, LOCATION_COUNT))
        print('Starting...')

        # Go over every location as a potential query location
        for query_location in location_list:
            query_location_files = data_extractor.get_all_files_prefixed_with(
                query_location)
            query_location_id = location_title_to_id_map[query_location]

            if not global_location_distance_data_dict.get(query_location_id):
                global_location_distance_data_dict[query_location_id] = {}
            if not location_wise_distance_data_dict.get(query_location_id):
                location_wise_distance_data_dict[query_location_id] = {}
            print('Query Location: ', query_location)

            # Go over every model file in the query location
            for query_model_file in query_location_files:
                query_model_name_with_csv = query_model_file.split(" ")[
                    1]  # CM.csv, CN.csv, <modelName>.csv, ...
                query_model = query_model_name_with_csv.split(".")[
                    0]  # CM, CN, CN3x3, <modelName>, ...
                query_file_path = FILE_PATH_PREFIX + query_model_file
                query_model_df = pd.read_csv(query_file_path, header=None)
                del query_model_df[0]
                query_model_df = query_model_df.reset_index(drop=True)
                query_model_df_row_count = query_model_df.shape[0]

                if not global_location_distance_data_dict.get(
                        query_location_id).get(query_model):
                    global_location_distance_data_dict[query_location_id][
                        query_model] = []
                print('\tQuery Model: ', query_model)

                # Go over every location as a potential target location for which we will compute the distance to from the query location
                for target_location in location_list:
                    target_location_id = location_title_to_id_map[
                        target_location]
                    # If query location == target location, distance = 0
                    if query_location == target_location:
                        distance = 0
                        global_location_distance_data_dict[query_location_id][
                            query_model].append({
                                'location_id': target_location_id,
                                'distance': 0
                            })
                    else:
                        # Find the corresponding model file of the query location in the target location
                        target_model_file_path = FILE_PATH_PREFIX + target_location + " " + query_model + ".csv"
                        target_model_df = pd.read_csv(target_model_file_path,
                                                      header=None)
                        del target_model_df[0]
                        target_model_df = target_model_df.reset_index(
                            drop=True)

                        # Calculate the distance between the query location's model file and the target location's corresponding model file
                        distance = self.get_the_distance_value(
                            query_model_df, target_model_df,
                            query_model_df_row_count, query_model, util)

                        global_location_distance_data_dict[query_location_id][
                            query_model].append({
                                'location_id': target_location_id,
                                'distance': distance
                            })

                    # Set distance temporarily as 0 in the location_wise_distance_data_dict for this location
                    if not location_wise_distance_data_dict.get(
                            query_location_id).get(target_location_id):
                        location_wise_distance_data_dict[query_location_id][
                            target_location_id] = 0

                # At this state, we have gone over every target location with the corresponding model file from the query location.
                # Sort the model based location list of distances based on distance from the location
                sorted_list = sorted(
                    global_location_distance_data_dict[query_location_id]
                    [query_model],
                    key=lambda k: k['distance'])
                global_location_distance_data_dict[query_location_id][
                    query_model].clear()
                global_location_distance_data_dict[query_location_id][
                    query_model] = sorted_list
                # Repeat the loop, do it for every model file of the query location

            location_data_dict = global_location_distance_data_dict[
                query_location_id]

            # Compute the ranking of similar locations for the query location
            for curr_model, distance_list in location_data_dict.items():
                for index, curr_location_distance_data in enumerate(
                        distance_list):
                    curr_location_id = curr_location_distance_data[
                        'location_id']
                    curr_val = location_wise_distance_data_dict[
                        query_location_id][curr_location_id]
                    location_wise_distance_data_dict[query_location_id][
                        curr_location_id] = curr_val + index
            for l_id, dist in location_wise_distance_data_dict[
                    query_location_id].items():
                similarity_matrix[query_location_id - 1][l_id - 1] = dist
            # Add this to similarity matrix

        print(similarity_matrix)

        # Generate CSVs of the current similarity matrix (given by distances derived from the ranks of individual models)

        # df = pd.DataFrame(similarity_matrix)
        # loc_list = []
        # for i in range(1,31):
        # 	loc_list.append(location_id_to_title_map[str(i)])

        # # Generate the distance datrix as CSV
        # df.to_csv('./generated_data/distance_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False)
        # df.to_csv('./generated_data/distance_matrix_vd_minmax_descriptive.csv', encoding='utf-8', header=loc_list, index=loc_list)

        # Convert distance scores to similarity scores (work on a copy so the
        # original distance matrix is preserved)
        converted_similarity_matrix = similarity_matrix.copy()
        for row in range(len(converted_similarity_matrix)):
            for col in range(len(converted_similarity_matrix[0])):
                # In the dev set case, this scales a distance score ranging from 0-290 to a similarity score ranging from 0-1
                converted_similarity_matrix[row][col] = (
                    float(MAX_SCORE - converted_similarity_matrix[row][col]) /
                    MAX_SCORE)
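
        # Worked example: the 0-290 distance range quoted above implies
        # MODEL_COUNT = 10 for the dev set's 30 locations
        # (MAX_SCORE = 29 * 10 = 290), so a rank-sum distance of 58 maps to a
        # similarity of (290 - 58) / 290 = 0.8.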

        # Generate the similarity matrix as CSV if needed
        # df = pd.DataFrame(converted_similarity_matrix)
        # df.to_csv('./generated_data/similarity_matrix_vd_minmax.csv', encoding='utf-8', header=None, index=False)
        # df.to_csv('./generated_data/similarity_matrix_vd_descriptive.csv', encoding='utf-8')

        # Apply SVD on the data
        U, S, Vt = numpy.linalg.svd(converted_similarity_matrix)

        # {
        #  <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...],
        #  <location_id>: [{'Location Name': <>, 'Weight': <>}, {'Location Name': <>, 'Weight': <>}, ...],
        #  ...
        # }
        semantic_data_dict = {}
        for arr_index, arr in enumerate(Vt[:k, :]):
            if not semantic_data_dict.get(arr_index + 1):
                semantic_data_dict[arr_index + 1] = []

            for index, element in enumerate(arr):
                semantic_data_dict[arr_index + 1].append({
                    'Location Name':
                    location_id_to_title_map[str(index + 1)],
                    'Weight':
                    element
                })

            # Sort the list based on the weight attribute
            sorted_list = sorted(semantic_data_dict[arr_index + 1],
                                 key=itemgetter('Weight'),
                                 reverse=True)
            semantic_data_dict[arr_index + 1].clear()
            semantic_data_dict[arr_index + 1] = sorted_list

            # Print the latent semantic as location name-weight pairs sorted in decreasing order of weights
            print('Latent Semantic: ', arr_index + 1)
            for idx, data in enumerate(sorted_list):
                print('\tLocation Name: ',
                      semantic_data_dict[arr_index + 1][idx]['Location Name'],
                      '| Weight: ',
                      semantic_data_dict[arr_index + 1][idx]['Weight'])