def test_find_subsets4(self):
    """Size-4 subsets of a four-element superset: exactly one, of length 4."""
    # Four coordinates on the diagonal, each tagged with its own index as keyword.
    superset = [KeywordCoordinate(n, n, [str(n)]) for n in range(4)]

    subsets = mt.find_subsets(superset, 4)

    self.assertEqual(len(subsets), 1)
    for candidate in subsets:
        self.assertEqual(len(candidate), 4)
def get_all_subsets(self, data):
    """
    Calculates all the possible subsets for the given data that pass the
    semantic filter. Takes the set maximum length for subsets into account.

    Subsets of every size from 1 up to ``min(len(data), self.max_subset_size)``
    are generated with ``find_subsets``. A subset is kept when, for every
    query keyword, the highest similarity found among the subset's POIs is
    at least ``self.SEMANTIC_THRESHOLD``.

    Side effects: prints progress to stdout, stores the number of kept
    subsets in ``self.number_of_subsets_after_semantic_filtering`` and the
    elapsed wall-clock seconds in ``self.time_4_build_subsets``.

    :param data: The data
    :return: A list of all possible subsets that passed the semantic filter
    """
    start_time = time.time()
    max_length = min(len(data), self.max_subset_size)
    list_of_subsets = []

    for subset_size in range(1, max_length + 1):
        print('***** Longitud ', subset_size)
        for subset in find_subsets(data, subset_size):
            keywords = self.query.keywords
            keyword_count = len(keywords)
            # Best similarity seen so far for each query keyword; starts at 0
            # so only strictly positive similarities can raise it.
            best_scores = [0] * keyword_count
            for poi in subset:
                # NOTE(review): assumes semantic_similarity returns a value
                # indexable by query-keyword position - confirm against its
                # definition.
                sims = semantic_similarity(
                    keywords, poi.name, self.df_poi_queries_similarities)
                for i in range(keyword_count):
                    if sims[i] > best_scores[i]:
                        best_scores[i] = sims[i]
            # Every query keyword must be covered by at least one POI of the
            # subset. With no query keywords this is vacuously true.
            if all(score >= self.SEMANTIC_THRESHOLD for score in best_scores):
                list_of_subsets.append(subset)

    self.number_of_subsets_after_semantic_filtering = len(list_of_subsets)
    print('# subsets: ', len(list_of_subsets))
    finish_time = time.time()
    self.time_4_build_subsets = finish_time - start_time
    return list_of_subsets