def test_precalculated_max_inter_dataset(self):
     """Solver results must be identical with and without a precalculated
     max inter-dataset distance dictionary on the cost function."""
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     pois = [KeywordCoordinate(1, 1, ['family', 'food', 'outdoor']),
             KeywordCoordinate(3, 3, ['food', 'family']),
             KeywordCoordinate(2, 2, ['outdoor'])]
     cost_function = Type1(euclidean_distance,
                           combined_cosine_similarity,
                           0.3,
                           0.3,
                           0.4,
                           disable_thresholds=True)
     solver = NaiveSolver(query, pois, cost_function)
     precalculated = solver.get_max_inter_dataset_distance()
     baseline = solver.solve()
     cost_function.precalculated_inter_dataset_dict = precalculated
     with_precalc = solver.solve()
     # Both result lists must agree entry by entry: cost, then every
     # KeywordCoordinate of the winning subset.
     for (cost, subset), (cost_pre, subset_pre) in zip(baseline,
                                                       with_precalc):
         self.assertAlmostEqual(cost, cost_pre, delta=0.01)
         for kwc, kwc_pre in zip(list(subset), list(subset_pre)):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
 def test_complex_precalculations(self):
     """Supplying all three precalculated dictionaries (query-dataset,
     inter-dataset, keyword similarity) must leave results unchanged."""
     query = KeywordCoordinate(5, 6, ['culture'])
     pois = [KeywordCoordinate(2, 1, ['family', 'rest', 'indoor']),
             KeywordCoordinate(0, 2, ['science', 'culture', 'history']),
             KeywordCoordinate(0, 0, ['food', 'outdoor', 'sports'])]
     cost_function = Type1(euclidean_distance,
                           combined_cosine_similarity,
                           0.3,
                           0.3,
                           0.4,
                           disable_thresholds=True)
     solver = NaiveSolver(query, pois, cost_function, result_length=100)
     baseline = solver.solve()
     cost_function.precalculated_query_dataset_dict = \
         solver.get_query_dataset_distance()
     cost_function.precalculated_inter_dataset_dict = \
         solver.get_inter_dataset_distance()
     cost_function.precalculated_keyword_similarity_dict = \
         solver.get_keyword_similarity()
     with_precalc = solver.solve()
     # Compare both runs entry by entry.
     for (cost, subset), (cost_pre, subset_pre) in zip(baseline,
                                                       with_precalc):
         self.assertAlmostEqual(cost, cost_pre, delta=0.01)
         for kwc, kwc_pre in zip(list(subset), list(subset_pre)):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
Example #3
0
 def test_get_min_inter_dataset_distance(self):
     """get_min_inter_dataset_distance must map every non-empty subset of
     the data to its minimum pairwise distance (7 subsets for 3 POIs)."""
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
     kwc2 = KeywordCoordinate(2, 2, ['food'])
     kwc3 = KeywordCoordinate(3, 3, ['outdoor'])
     cf = CostFunction(euclidean_distance, combined_cosine_similarity, 0.3,
                       0.3, 0.4)
     solver = Solver(query, [kwc1, kwc2, kwc3], cf, normalize=False)
     result = solver.get_min_inter_dataset_distance()
     # Singletons have distance 0; pairs/triples have the smallest
     # pairwise Euclidean distance (sqrt(2) ~= 1.41, 2*sqrt(2) ~= 2.83).
     expected = {
         frozenset([kwc1]): 0.0,
         frozenset([kwc2]): 0.0,
         frozenset([kwc3]): 0.0,
         frozenset([kwc1, kwc2]): 1.41,
         frozenset([kwc1, kwc3]): 2.83,
         frozenset([kwc2, kwc3]): 1.41,
         frozenset([kwc1, kwc2, kwc3]): 1.41,
     }
     self.assertEqual(len(result), len(expected))
     for subset, distance in expected.items():
         self.assertAlmostEqual(result.get(subset), distance, delta=0.01)
Example #4
0
 def test_threshold6(self):
     """When a threshold is violated the Type3 cost must be infinite."""
     cost_function = Type3(euclidean_distance, separated_cosine_similarity,
                           0.25, 0.25, 0.5, math.inf, math.inf, 0.4)
     query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
     data = [KeywordCoordinate(0, 0, ['keyword1']),
             KeywordCoordinate(0, 0, ['keyword2'])]
     self.assertAlmostEqual(cost_function.solve(query, data), math.inf,
                            delta=0.01)
 def test_find_subsets4(self):
     """Choosing 4 elements from a 4-element superset yields exactly one
     subset, which contains all 4 elements."""
     superset = [KeywordCoordinate(value, value, [str(value)])
                 for value in range(4)]
     subsets = mt.find_subsets(superset, 4)
     self.assertEqual(len(subsets), 1)
     for subset in subsets:
         self.assertEqual(len(subset), 4)
Example #6
0
 def test_threshold4(self):
     """With a tight 0.1 threshold the Type1 cost must be infinite even
     for near-perfect keyword matches."""
     cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                           0.0, 0.3, 0.7, math.inf, 0.1, math.inf)
     query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
     data = [KeywordCoordinate(0.1, 0.1,
                               ['keyword1', 'keyword2', 'keyword3']),
             KeywordCoordinate(0.2, 0.2,
                               ['keyword1', 'keyword2', 'keyword3'])]
     self.assertAlmostEqual(cost_function.solve(query, data), math.inf,
                            delta=0.01)
 def test_create_combined_keyword_vector1(self):
     """The combined vector must contain only input keywords and hold
     each distinct keyword once (kw1..kw4 -> length 4)."""
     first_keywords = ['kw1', 'kw2', 'kw3']
     second_keywords = ['kw4', 'kw2', 'kw3']
     query = KeywordCoordinate(0, 0, first_keywords)
     pois = [KeywordCoordinate(0, 0, second_keywords),
             KeywordCoordinate(0, 0, second_keywords)]
     combined = mt.create_combined_keyword_vector(query, pois)
     allowed = first_keywords + second_keywords
     for keyword in combined:
         self.assertTrue(keyword in allowed)
     self.assertEqual(len(combined), 4)
Example #8
0
 def test_denormalize(self):
     """Denormalizing with x in [0, 3] and y in [0, 5] must scale the
     normalized coordinates back into those physical ranges."""
     irrelevant_cost = 0.0
     normalized = [KeywordCoordinate(0.0, 0.0, ['family']),
                   KeywordCoordinate(1.0, 0.4, ['food']),
                   KeywordCoordinate(0.33, 1.0, ['outdoor'])]
     result = mt.denormalize_result_data([(irrelevant_cost, normalized)],
                                         3, 0, 5, 0)
     expected_coordinates = [(0.0, 0.0), (3.0, 2.0), (1.0, 5.0)]
     for kwc, (expected_x, expected_y) in zip(result[0][1],
                                              expected_coordinates):
         self.assertAlmostEqual(kwc.coordinates.x, expected_x, delta=0.02)
         self.assertAlmostEqual(kwc.coordinates.y, expected_y, delta=0.02)
 def test_solve(self):
     """NaiveSolver with a Type1 cost function must find the single best
     result with a cost of ~0.42 for this fixture."""
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     pois = [KeywordCoordinate(1, 1, ['family', 'food', 'outdoor']),
             KeywordCoordinate(3, 3, ['food']),
             KeywordCoordinate(2, 2, ['outdoor'])]
     cost_function = Type1(euclidean_distance,
                           separated_cosine_similarity,
                           0.3,
                           0.3,
                           0.4,
                           disable_thresholds=True)
     solver = NaiveSolver(query, pois, cost_function, normalize=False,
                          result_length=1)
     result = solver.solve()
     self.assertAlmostEqual(result[0][0], 0.42, delta=0.01)
Example #10
0
 def generate(self, data_size: int) -> dataset_type:
     """
     Generates a dataset with a given size.

     Each entry gets uniformly random coordinates within the configured
     physical bounds and a random number of distinct keywords drawn from
     self.possible_keywords.
     :param data_size: The size of the dataset
     :return: The dataset
     """
     logger = logging.getLogger(__name__)
     logger.debug('generating dataset of size {}'.format(data_size))
     dataset: dataset_type = []
     for _ in range(data_size):
         # Copy so keywords are drawn without replacement per entry
         # while self.possible_keywords itself stays intact.
         remaining_keywords = self.possible_keywords.copy()
         chosen_keywords: keyword_dataset_type = []
         x = random.randint(self.physical_min_x, self.physical_max_x)
         y = random.randint(self.physical_min_y, self.physical_max_y)
         keyword_count = random.randint(self.keywords_min,
                                        self.keywords_max)
         for _ in range(keyword_count):
             try:
                 keyword = random.choice(remaining_keywords)
             except IndexError:
                 # Fewer distinct keywords available than requested.
                 break
             remaining_keywords.remove(keyword)
             chosen_keywords.append(keyword)
         dataset.append(KeywordCoordinate(x, y, chosen_keywords))
     logger.debug('generated dataset {}'.format(
         dataset_comprehension(dataset)))
     return dataset
 def test_instantiation(self):
     """A KeywordCoordinate must expose exactly the coordinates and the
     keyword list it was constructed with."""
     keywords = ['keyword 1', 'kw2', '3']
     kwc = KeywordCoordinate(3, 8, keywords)
     self.assertEqual(kwc.coordinates.x, 3)
     self.assertEqual(kwc.coordinates.y, 8)
     self.assertEqual(kwc.keywords, keywords)
Example #12
0
 def test_instantiation(self):
     """The Solver must expose its query, data, cost function and
     configuration exactly as passed in."""
     keyword_lists = [['family', 'food', 'outdoor'], ['food'], ['outdoor']]
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     data = [KeywordCoordinate(1, 1, keyword_lists[0]),
             KeywordCoordinate(2, 2, keyword_lists[1]),
             KeywordCoordinate(3, 3, keyword_lists[2])]
     cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     so = Solver(query, data, cf, normalize=False, result_length=10)
     self.assertAlmostEqual(so.query.coordinates.x, 0, delta=0.01)
     self.assertAlmostEqual(so.query.coordinates.y, 0, delta=0.01)
     self.assertListEqual(so.data, data)
     # Entry i sits at (i+1, i+1) and carries keyword_lists[i].
     for position, expected_keywords in enumerate(keyword_lists):
         entry = so.data[position]
         self.assertAlmostEqual(entry.coordinates.x, position + 1,
                                delta=0.01)
         self.assertAlmostEqual(entry.coordinates.y, position + 1,
                                delta=0.01)
         self.assertListEqual(entry.keywords, expected_keywords)
         for keyword, expected in zip(entry.keywords, expected_keywords):
             self.assertEqual(keyword, expected)
     # Kept as in the original test: comparing the __get__ descriptors
     # checks the stored metric callables — presumably equivalent to a
     # direct function comparison; confirm before simplifying.
     self.assertEqual(euclidean_distance.__get__,
                      so.cost_function.distance_metric.__get__)
     self.assertEqual(separated_cosine_similarity.__get__,
                      so.cost_function.similarity_metric.__get__)
     self.assertAlmostEqual(so.cost_function.alpha, 0.3, delta=0.01)
     self.assertAlmostEqual(so.cost_function.beta, 0.3, delta=0.01)
     self.assertAlmostEqual(so.cost_function.omega, 0.4, delta=0.01)
     self.assertEqual(so.normalize_data, False)
     self.assertEqual(so.result_length, 10)
     for attribute in ('denormalize_max_x', 'denormalize_min_x',
                       'denormalize_max_y', 'denormalize_min_y'):
         self.assertAlmostEqual(getattr(so, attribute), 0.0, delta=0.01)
 def test_get_maximum_keyword_distance4(self):
     """Identical keyword sets everywhere must yield a maximum keyword
     distance of zero."""
     shared_keywords = ['food', 'fun', 'outdoor']
     origin = 0  # coordinates are irrelevant for keyword distance
     query = KeywordCoordinate(origin, origin, list(shared_keywords))
     dataset: dataset_type = [
         KeywordCoordinate(origin, origin, list(shared_keywords))
         for _ in range(3)
     ]
     cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_maximum_keyword_distance(query, dataset),
                            0.0, delta=0.01)
Example #14
0
 def test_normalize_data(self):
     """normalize_data must map coordinates into [0, 1] and report the
     original min/max bounds of the data."""
     query = KeywordCoordinate(2, 1, ['family', 'food', 'outdoor'])
     data = [KeywordCoordinate(0, 0, ['family']),
             KeywordCoordinate(3, 2, ['food']),
             KeywordCoordinate(1, 5, ['outdoor'])]
     norm_query, norm_data, max_x, min_x, max_y, min_y = mt.normalize_data(
         query, data)
     self.assertAlmostEqual(norm_query.coordinates.x, 0.66, delta=0.01)
     self.assertAlmostEqual(norm_query.coordinates.y, 0.20, delta=0.01)
     expected_coordinates = [(0.0, 0.0), (1.0, 0.4), (0.33, 1.0)]
     for kwc, (expected_x, expected_y) in zip(norm_data,
                                              expected_coordinates):
         self.assertAlmostEqual(kwc.coordinates.x, expected_x, delta=0.01)
         self.assertAlmostEqual(kwc.coordinates.y, expected_y, delta=0.01)
     self.assertEqual(max_x, 3)
     self.assertEqual(min_x, 0)
     self.assertEqual(max_y, 5)
     self.assertEqual(min_y, 0)
Example #15
0
 def test_write_and_read_data(self):
     """Round-trip a small dataset through write_pickle/load_pickle and
     verify every coordinate and keyword list survives, then clean up
     the pickle file."""
     kwc1 = KeywordCoordinate(1, 1, ['1'])
     kwc2 = KeywordCoordinate(2, 2, ['2'])
     kwc3 = KeywordCoordinate(3, 3, ['3'])
     data = [kwc1, kwc2, kwc3]
     file_name = 'test/test.pickle'
     write_pickle(data, file_name, True)
     loaded_result = load_pickle(file_name)
     self.assertEqual(len(loaded_result), 3)
     for index in range(len(loaded_result)):
         self.assertAlmostEqual(loaded_result[index].coordinates.x,
                                data[index].coordinates.x)
         self.assertAlmostEqual(loaded_result[index].coordinates.y,
                                data[index].coordinates.y)
         self.assertListEqual(loaded_result[index].keywords,
                              data[index].keywords)
     # NOTE(review): '../../../' is concatenated without a leading path
     # separator, so '..' is glued onto the last component of
     # dirname(__file__) before abspath normalization — presumably this
     # still resolves to the project-root-relative file that
     # write_pickle created; confirm against write_pickle's path logic.
     os.remove(
         os.path.abspath(
             os.path.dirname(os.path.abspath(__file__)) + '../../../' +
             file_name))
 def test_precalculated_word2vec(self):
     """Precalculated dictionaries must not change the results of a
     word2vec-based Type3 solver run."""
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     pois = [KeywordCoordinate(1, 1, ['family', 'food', 'outdoor']),
             KeywordCoordinate(3, 3, ['food', 'family']),
             KeywordCoordinate(2, 2, ['outdoor'])]
     model = calculate_model_subset(query, pois, load_word2vec_model())
     cost_function = Type3(euclidean_distance,
                           word2vec_cosine_similarity,
                           0.3,
                           0.3,
                           0.4,
                           disable_thresholds=True,
                           model=model)
     solver = NaiveSolver(query, pois, cost_function)
     baseline = solver.solve()
     cost_function.precalculated_query_dataset_dict = \
         solver.get_query_dataset_distance()
     cost_function.precalculated_inter_dataset_dict = \
         solver.get_inter_dataset_distance()
     cost_function.precalculated_keyword_similarity_dict = \
         solver.get_keyword_similarity()
     with_precalc = solver.solve()
     # Compare both runs entry by entry.
     for (cost, subset), (cost_pre, subset_pre) in zip(baseline,
                                                       with_precalc):
         self.assertAlmostEqual(cost, cost_pre, delta=0.01)
         for kwc, kwc_pre in zip(list(subset), list(subset_pre)):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
 def test_get_minimum_for_query4(self):
     """Manhattan distance from the query (0,0) to the closest POI (8,8)
     is 8 + 8 = 16."""
     no_keywords = ['']
     query = KeywordCoordinate(0, 0, no_keywords)
     dataset: dataset_type = [KeywordCoordinate(value, value, no_keywords)
                              for value in (8, 9, 13, 24, 35)]
     cf = CostFunction(manhattan_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_minimum_for_query(query, dataset), 16.0,
                            delta=0.01)
 def test_get_minimum_for_dataset2(self):
     """Adjacent diagonal points (n,n) and (n+1,n+1) are a Manhattan
     distance of 2 apart, which is the dataset minimum here."""
     no_keywords = ['']
     dataset: dataset_type = [KeywordCoordinate(value, value, no_keywords)
                              for value in range(5, 11)]
     cf = CostFunction(manhattan_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_minimum_for_dataset(dataset), 2.0,
                            delta=0.01)
 def test_get_maximum_for_dataset3(self):
     """The largest pairwise Euclidean gap is (6,6) to (35,35):
     29 * sqrt(2) ~= 41.01."""
     no_keywords = ['']
     dataset: dataset_type = [KeywordCoordinate(value, value, no_keywords)
                              for value in (6, 8, 9, 13, 24, 35)]
     cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_maximum_for_dataset(dataset), 41.01,
                            delta=0.01)
 def test_get_maximum_for_dataset1(self):
     """The largest pairwise Euclidean gap is (0,0) to (5,5):
     5 * sqrt(2) ~= 7.07."""
     no_keywords = ['']
     dataset: dataset_type = [KeywordCoordinate(value, value, no_keywords)
                              for value in range(6)]
     cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_maximum_for_dataset(dataset), 7.07,
                            delta=0.01)
 def test_get_minimum_for_dataset3(self):
     """The smallest pairwise Euclidean gap is (13,13) to (20,20):
     7 * sqrt(2) ~= 9.9."""
     no_keywords = ['']
     dataset: dataset_type = [
         KeywordCoordinate(value, value, no_keywords)
         for value in (0, 13, 20, 800, 9000, 10000)
     ]
     cf = CostFunction(euclidean_distance, separated_cosine_similarity, 0.3,
                       0.3, 0.4)
     self.assertAlmostEqual(cf.get_minimum_for_dataset(dataset), 9.9,
                            delta=0.01)
Example #22
0
 def test_solve4(self):
     """Type3.solve with weights (1, 0, 0) and Manhattan distance must
     return 2.0 for this fixture."""
     cost_function = Type3(manhattan_distance,
                           separated_cosine_similarity,
                           1,
                           0,
                           0,
                           disable_thresholds=True)
     query = KeywordCoordinate(0, 0, ['food', 'fun', 'outdoor', 'family'])
     data = [KeywordCoordinate(1, 1, ['food', 'fun', 'outdoor']),
             KeywordCoordinate(2, 2, ['food', 'fun']),
             KeywordCoordinate(3, 3, ['food']),
             KeywordCoordinate(4, 4, ['food']),
             KeywordCoordinate(5, 5, ['food'])]
     self.assertAlmostEqual(cost_function.solve(query, data), 2.0,
                            delta=0.01)
Example #23
0
 def test_solve9(self):
     """Type1.solve with weights (0, 0, 1) must return 1.0 when only
     some of the POIs match the query keywords."""
     cost_function = Type1(euclidean_distance,
                           separated_cosine_similarity,
                           0,
                           0,
                           1,
                           disable_thresholds=True)
     matching = ['food', 'fun', 'outdoor', 'family']
     not_matching = ['this_is_not_a_match']
     query = KeywordCoordinate(0, 0, list(matching))
     data = [KeywordCoordinate(1, 1, list(matching)),
             KeywordCoordinate(2, 2, list(matching)),
             KeywordCoordinate(3, 3, not_matching),
             KeywordCoordinate(4, 4, not_matching),
             KeywordCoordinate(5, 5, not_matching)]
     self.assertAlmostEqual(cost_function.solve(query, data), 1.0,
                            delta=0.01)
Example #24
0
def load_csv(file_name: str,
             x_coordinate_index: int,
             y_coordinate_index: int,
             keywords_index: int,
             keywords_delimiter: str = ' ',
             max_read_length: int = -1,
             delimiter: str = ',',
             newline: str = '',
             quotechar: str = '"',
             path_relative_to_project_root: bool = True,
             query_load: bool = False) -> dataset_type:
    """
    Loads a csv file into a dataset of KeywordCoordinates.

    Two very different code paths share this entry point:
    - query_load=True: the file is parsed row by row with the csv module
      and every row becomes a KeywordCoordinate named 'Query point'.
    - query_load=False: the file is read with pandas (';'-separated,
      ignoring the delimiter argument), keywords are ranked per POI with
      TF-IDF, and as a side effect the spaCy-encoded keywords are written
      to files/poi_keywords_encoded.csv.

    :param file_name: The file name of the csv file. The file is usually in the project folder. Otherwise use the path_relative_to_project_root flag.
    :param x_coordinate_index: The index of the x coordinates
    :param y_coordinate_index: The index of the y coordinates
    :param keywords_index: The index of the keywords
    :param keywords_delimiter: The delimiter of the keywords
    :param max_read_length: The maximum number of lines to read
    :param delimiter: The csv cell delimiter
    :param newline: The newline delimiter
    :param quotechar: The quotechar symbol
    :param path_relative_to_project_root: The flag if the file name is relative to the project folder
    :param query_load: If True, parse as a plain query csv; otherwise run the pandas/TF-IDF POI pipeline
    :return: The dataset of the csv
    """
    dataset: dataset_type = []
    if query_load:
        max_read_length -= 1  # because the length doesn't start counting at 0
        # NOTE(review): max_read_length is adjusted here and on failures
        # below but never checked inside the reader loop, so the whole
        # file is always read — confirm whether the limit should be
        # enforced.
        if path_relative_to_project_root:
            file_path = os.path.abspath(
                os.path.dirname(os.path.abspath(__file__)) + '/../../files/' +
                file_name)
        else:
            file_path = file_name
        with open(file_path, mode='rt', newline=newline,
                  encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile,
                                delimiter=delimiter,
                                quotechar=quotechar)
            for row in reader:
                try:
                    # print(row[x_coordinate_index])
                    current_coordinate_x = float(row[x_coordinate_index])
                    # print(current_coordinate_x)
                    current_coordinate_y = float(row[y_coordinate_index])
                    # print(current_coordinate_y)
                # Rows with non-numeric coordinates (e.g. a header) are
                # skipped.  NOTE(review): the bare except also swallows
                # IndexError and everything else — consider narrowing to
                # (ValueError, IndexError, TypeError).
                except:
                    print('----- Failure -----')
                    if max_read_length > 0:
                        max_read_length += 1
                    continue
                raw_keyword_list = row[keywords_index].split(
                    keywords_delimiter)
                current_POI_name = 'Query point'  # Query has no POI name

                # Drop empty tokens produced by repeated delimiters or
                # surrounding whitespace.
                current_keywords: keyword_dataset_type = []
                for keyword in raw_keyword_list:
                    stripped_keyword = keyword.strip()
                    if len(stripped_keyword) > 0:
                        current_keywords.append(stripped_keyword)
                # NOTE(review): KeywordCoordinate is called here with a
                # leading name argument, while other call sites pass only
                # (x, y, keywords) — confirm the constructor signature.
                current_keyword_coordinate = KeywordCoordinate(
                    current_POI_name, current_coordinate_x,
                    current_coordinate_y, current_keywords)
                dataset.append(current_keyword_coordinate)

    else:
        # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and
        # removed in 2.0 (replaced by on_bad_lines) — verify the pinned
        # pandas version.
        df = pd.read_csv(os.path.abspath(
            os.path.dirname(os.path.abspath(__file__)) + '/../../files/' +
            file_name),
                         delimiter=';',
                         error_bad_lines=False,
                         encoding="utf-8")
        # print(df)
        # Calculates topN keywords using TF-IDF
        # Removes rows with NaN values
        df.dropna(inplace=True)
        reviews = df['keywords_all']

        # Collapse each POI's reviews into one string for vectorizing.
        df['keyword lists IDF'] = reviews.apply(lambda x: reviews2OneString(x))

        #remove POIs with no reviews or NaN values
        df = df[df['keyword lists IDF'].str.len() != 0]
        #df.dropna(inplace=True)

        # nlp = spacy.load('en_core_web_lg')
        nlp = en_core_web_lg.load()
        df['keyword lists IDF'] = df['keyword lists IDF'].apply(
            lambda x: pre_process(x, nlp))

        docs = df['keyword lists IDF'].tolist()

        #print(df['keyword lists IDF'][0])
        # Let's compute IDF
        #1. Create a vocabulary of words,
        #2. Ignore words that appear in 85% of documents,
        #3. Eliminate stop words
        cv = CountVectorizer(max_df=0.85, stop_words='english')
        word_count_vector = cv.fit_transform(docs)

        # print(np.shape(word_count_vector))

        # Let's compute IDF (test = IDF dataset)
        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        tfidf_transformer.fit(word_count_vector)

        # Computing TF-IDF and Extracting Keywords
        # Get the whole vocabulary (all reviews for all POIs) in a list
        docs = df['keyword lists IDF'].tolist()

        # NOTE(review): get_feature_names() was removed in scikit-learn
        # 1.2 in favour of get_feature_names_out() — verify the pinned
        # scikit-learn version.
        feature_names = cv.get_feature_names()

        # NOTE(review): the top-N extraction runs over the raw 'reviews'
        # series captured before the empty-review filter; the assignment
        # relies on pandas index alignment to drop filtered rows —
        # confirm this is intended rather than using the preprocessed
        # 'keyword lists IDF' column.
        df['Top-Keywords-TFIDF'] = reviews.apply(lambda x: get_topN_keywords(
            x, 10, tfidf_transformer, cv, feature_names))

        #print('list --> ', df['Top-Keywords-TFIDF'])

        df_poi_keywords = pd.DataFrame(
            columns=['poi_name', 'nlp_keywords_encoded'])

        for index, row in df.iterrows():
            current_POI_name = row.get('name')
            # Whatch out if we want to use coordinates for something else
            try:
                current_coordinate_x = float(row.get('lat'))
                current_coordinate_y = float(row.get('lng'))
            # POIs with missing/non-numeric lat/lng are skipped.
            # NOTE(review): bare except — consider narrowing.
            except:
                print("ERROR --> Coordinates")
                continue
            #current_keywords: keyword_dataset_type = (df['Top-Keywords-TFIDF'][i])
            #print(type(row['Top-Keywords-TFIDF']))
            current_keywords = []

            # Top-Keywords-TFIDF maps keyword -> score; keep only the
            # keyword strings.
            for k in row['Top-Keywords-TFIDF'].keys():
                current_keywords.append(k)

            # print('+++++ current_keywords --> ', current_keywords)

            # current_keywords: keyword_dataset_type = []
            # for keyword in raw_keyword_list:
            #     stripped_keyword = keyword.strip()
            #     if len(stripped_keyword) > 0:
            #         current_keywords.append(stripped_keyword)
            # current_keyword_coordinate = KeywordCoordinate(current_POI_name, current_coordinate_x, current_coordinate_y, current_keywords)
            element_string = ''
            for kw in current_keywords:
                element_string = element_string + ' ' + kw

            nlp_element = nlp(element_string)
            # print('NLP element: ', nlp_element)
            new_row = {
                'poi_name': current_POI_name,
                'nlp_keywords_encoded': nlp_element
            }
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # collecting rows in a list and building the frame once would
            # also avoid quadratic copying.
            df_poi_keywords = df_poi_keywords.append(new_row,
                                                     ignore_index=True)
            current_keyword_coordinate = KeywordCoordinate(
                current_POI_name, current_coordinate_x, current_coordinate_y,
                current_keywords)
            dataset.append(current_keyword_coordinate)

        # print(np.shape(df_poi_keywords))

        # Side effect: persist the spaCy-encoded keywords per POI.
        df_poi_keywords.set_index('poi_name', inplace=True)
        df_poi_keywords.to_csv(os.path.dirname(os.path.abspath(__file__)) +
                               '/../../files/' + 'poi_keywords_encoded.csv',
                               encoding='utf-8')

        print(np.shape(dataset))
    return dataset