def main(argv):
    start_time = time.time()
    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # Alternative cost functions, kept for reference:
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_function = Type1(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True
    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    # This precalculates keyword similarities (not distances), so name it accordingly.
    precalculated_query_dataset_keyword_similarities = solver.get_keyword_similarity()
    write_pickle(precalculated_query_dataset_keyword_similarities, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
def test_precalculated_max_inter_dataset(self):
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
    kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    cf = Type1(euclidean_distance, combined_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True)
    ns = NaiveSolver(query, data, cf)
    pre_id = ns.get_max_inter_dataset_distance()
    result = ns.solve()
    cf.precalculated_inter_dataset_dict = pre_id
    result_pre = ns.solve()
    for index in range(len(result)):
        self.assertAlmostEqual(result[index][0], result_pre[index][0], delta=0.01)
        key_list = list(result[index][1])
        key_list_pre = list(result_pre[index][1])
        for list_index in range(len(key_list)):
            self.assertAlmostEqual(key_list[list_index].coordinates.x,
                                   key_list_pre[list_index].coordinates.x)
            self.assertAlmostEqual(key_list[list_index].coordinates.y,
                                   key_list_pre[list_index].coordinates.y)
            self.assertListEqual(key_list[list_index].keywords,
                                 key_list_pre[list_index].keywords)
def test_complex_precalculations(self):
    query = KeywordCoordinate(5, 6, ['culture'])
    kwc1 = KeywordCoordinate(2, 1, ['family', 'rest', 'indoor'])
    kwc2 = KeywordCoordinate(0, 2, ['science', 'culture', 'history'])
    kwc3 = KeywordCoordinate(0, 0, ['food', 'outdoor', 'sports'])
    data = [kwc1, kwc2, kwc3]
    cf = Type1(euclidean_distance, combined_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True)
    ns = NaiveSolver(query, data, cf, result_length=100)
    result = ns.solve()
    pre_qd = ns.get_query_dataset_distance()
    pre_id = ns.get_inter_dataset_distance()
    pre_ks = ns.get_keyword_similarity()
    cf.precalculated_query_dataset_dict = pre_qd
    cf.precalculated_inter_dataset_dict = pre_id
    cf.precalculated_keyword_similarity_dict = pre_ks
    result_pre = ns.solve()
    for index in range(len(result)):
        self.assertAlmostEqual(result[index][0], result_pre[index][0], delta=0.01)
        key_list = list(result[index][1])
        key_list_pre = list(result_pre[index][1])
        for list_index in range(len(key_list)):
            self.assertAlmostEqual(key_list[list_index].coordinates.x,
                                   key_list_pre[list_index].coordinates.x)
            self.assertAlmostEqual(key_list[list_index].coordinates.y,
                                   key_list_pre[list_index].coordinates.y)
            self.assertListEqual(key_list[list_index].keywords,
                                 key_list_pre[list_index].keywords)
def test_threshold6(self):
    # The data cannot fully match the query keywords, so the keyword similarity
    # threshold (0.7th positional argument, here 0.4) is violated and the cost
    # function returns infinity.
    t1 = Type1(euclidean_distance, separated_cosine_similarity, 0.25, 0.25, 0.5,
               math.inf, math.inf, 0.4)
    query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
    kwc1 = KeywordCoordinate(0, 0, ['keyword1'])
    kwc2 = KeywordCoordinate(0, 0, ['keyword2'])
    data = [kwc1, kwc2]
    result = t1.solve(query, data)
    self.assertAlmostEqual(result, math.inf, delta=0.01)
def test_general(self):
    ev = Evaluator()
    possible_keywords = ['family', 'food', 'outdoor', 'rest', 'indoor',
                         'sports', 'science', 'culture', 'history']
    dg = DataGenerator(possible_keywords)
    gen_query = dg.generate(1)[0]
    gen_data = dg.generate(10)
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.33, 0.33, 0.33,
                disable_thresholds=True)
    cf2 = Type2(euclidean_distance, combined_cosine_similarity, 0.33, 0.33, 0.33,
                disable_thresholds=True)
    cf3 = Type3(euclidean_distance, combined_cosine_similarity, 0.33, 0.33, 0.33,
                disable_thresholds=True)
    ns1 = NaiveSolver(gen_query, gen_data, cf1, result_length=10, max_subset_size=6)
    ns2 = NaiveSolver(gen_query, gen_data, cf2, result_length=10, max_subset_size=6)
    ns3 = NaiveSolver(gen_query, gen_data, cf3, result_length=10, max_subset_size=6)
    ev.add_solver(ns1)
    ev.add_solver(ns2)
    ev.add_solver(ns3)
    ev.evaluate()
    results = ev.get_results()
    self.assertEqual(len(results), 3)
    for solver_result in results:
        self.assertEqual(len(solver_result), 2)
        self.assertEqual(len(solver_result[0]), 10)
def test_solve(self):
    query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
    kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
    kwc2 = KeywordCoordinate(3, 3, ['food'])
    kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
    data = [kwc1, kwc2, kwc3]
    cf = Type1(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True)
    ns = NaiveSolver(query, data, cf, normalize=False, result_length=1)
    result = ns.solve()
    self.assertAlmostEqual(result[0][0], 0.42, delta=0.01)
def test_instantiation(self):
    t1 = Type1(euclidean_distance, separated_cosine_similarity, 0.3, 0.3, 0.4,
               0.5, 0.6, 0.7, False)
    self.assertEqual(euclidean_distance.__get__, t1.distance_metric.__get__)
    self.assertEqual(separated_cosine_similarity.__get__, t1.similarity_metric.__get__)
    self.assertAlmostEqual(t1.alpha, 0.3, delta=0.01)
    self.assertAlmostEqual(t1.beta, 0.3, delta=0.01)
    self.assertAlmostEqual(t1.omega, 0.4, delta=0.01)
    self.assertAlmostEqual(t1.query_distance_threshold, 0.5, delta=0.01)
    self.assertAlmostEqual(t1.dataset_distance_threshold, 0.6, delta=0.01)
    self.assertAlmostEqual(t1.keyword_similarity_threshold, 0.7, delta=0.01)
    self.assertEqual(t1.disable_thresholds, False)
def main(argv):
    start_time = time.time()
    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2  # changed from 3
    # Alternative cost functions, kept for reference:
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_start_time = time.time()
    cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    print('Cost function initialization --> ', time.time() - cost_start_time)
    file_allow_overwrite = True
    # Code
    load_start_time = time.time()
    data = load_pickle(file_name_data)
    print('Load data pickle --> ', time.time() - load_start_time)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(file_name_query)
    solver_start_time = time.time()
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    print('Solver initialization --> ', time.time() - solver_start_time)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    # Use dedicated timers per phase so this reports the total elapsed time
    # since the top of main, instead of since the last timer reset.
    print("--- %s seconds ---" % (time.time() - start_time))
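The driver scripts above time each phase with repeated start/finish/print pairs. A minimal sketch of a context-manager helper that factors out this pattern follows; the helper name `timed` is hypothetical and not part of the original codebase.

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print the elapsed wall-clock time for the wrapped block.
    t0 = time.time()
    yield
    print(label, '--> ', time.time() - t0)

# Example:
# with timed('Solver initialization'):
#     solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)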
def test_solve4(self):
    # With weights (alpha, beta, omega) = (0, 1, 0), only the inter-dataset
    # distance contributes; the largest Manhattan distance in the data,
    # between (1, 1) and (5, 5), is 8.
    t1 = Type1(manhattan_distance, separated_cosine_similarity, 0, 1, 0,
               disable_thresholds=True)
    keywords_query = ['food', 'fun', 'outdoor', 'family']
    keywords_kwc1 = ['food', 'fun', 'outdoor']
    keywords_kwc2 = ['food', 'fun']
    keywords_kwc3 = ['food']
    query = KeywordCoordinate(0, 0, keywords_query)
    kwc1 = KeywordCoordinate(1, 1, keywords_kwc1)
    kwc2 = KeywordCoordinate(2, 2, keywords_kwc2)
    kwc3 = KeywordCoordinate(3, 3, keywords_kwc3)
    kwc4 = KeywordCoordinate(4, 4, keywords_kwc3)
    kwc5 = KeywordCoordinate(5, 5, keywords_kwc3)
    data = [kwc1, kwc2, kwc3, kwc4, kwc5]
    result = t1.solve(query, data)
    self.assertAlmostEqual(result, 8.0, delta=0.01)
def test_complex_generated_word2vec_precalculations(self):
    possible_keywords = ['family', 'food', 'outdoor', 'rest', 'indoor',
                         'sports', 'science', 'culture', 'history']
    dg = DataGenerator(possible_keywords)
    query: KeywordCoordinate = dg.generate(1)[0]
    data = dg.generate(5)
    model = calculate_model_subset(query, data, load_word2vec_model())
    cf = Type1(euclidean_distance, word2vec_cosine_similarity, 0.3, 0.3, 0.4,
               disable_thresholds=True, model=model)
    ns = NaiveSolver(query, data, cf, result_length=100)
    result = ns.solve()
    pre_qd = ns.get_query_dataset_distance()
    pre_id = ns.get_inter_dataset_distance()
    pre_ks = ns.get_keyword_similarity()
    cf.precalculated_query_dataset_dict = pre_qd
    cf.precalculated_inter_dataset_dict = pre_id
    cf.precalculated_keyword_similarity_dict = pre_ks
    result_pre = ns.solve()
    for index in range(len(result)):
        self.assertAlmostEqual(result[index][0], result_pre[index][0], delta=0.01)
        key_list = list(result[index][1])
        key_list_pre = list(result_pre[index][1])
        for list_index in range(len(key_list)):
            self.assertAlmostEqual(key_list[list_index].coordinates.x,
                                   key_list_pre[list_index].coordinates.x)
            self.assertAlmostEqual(key_list[list_index].coordinates.y,
                                   key_list_pre[list_index].coordinates.y)
            self.assertListEqual(key_list[list_index].keywords,
                                 key_list_pre[list_index].keywords)
def main(argv):
    start_time = time.time()
    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()
    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')
    # print('Data:', dataset_comprehension(data))

    # Optional: filter the dataset by a user radius around the query.
    # dataAux = sorted(data, key=lambda x: geographic_distance(x.coordinates, query.coordinates))
    # distances = [geographic_distance(x.coordinates, query.coordinates) >= RADIUS for x in dataAux]
    # print('------ Distances: ', distances)

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')
    # **** Only needed for word2vec model executions
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')
    # ****

    # Define the CostFunctions. For all possible parameters refer to the documentation.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                model=word2vec_model)
    cf3 = Type1(
        euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance, word2vec_cosine_similarity, 0, 0, 1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance, word2vec_cosine_similarity, 0.1, 0.1, 0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # map_name = 'London_mini'

    # Choose which Solvers to use. For all possible parameters refer to the documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(query, data, cf2, result_length=5, max_subset_size=3,
                      max_number_of_concurrent_processes=max_number_of_processes,
                      _map=map_name)
    # ns2 = NaiveSolver(query, data, cf5, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)
    # ns3 = NaiveSolver(query, data, cf3, result_length=5, max_subset_size=6,
    #                   max_number_of_concurrent_processes=max_number_of_processes)
    # ns4 = NaiveSolver(query, data, cf6, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)

    # Add Solvers to Evaluator
    ev.add_solver(ns1)
    # ev.add_solver(ns2)
    # ev.add_solver(ns4)

    # Only for debugging: calculate and print the physical distance between each
    # dataset item and the query location.
    # distances = [geographic_distance(x.coordinates, query.coordinates) for x in data]
    # print('------ Distances: ', distances)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()
    write_csv(map_name, results, timings)
    print('*** Solution -', solution_list_comprehension(results))
    # print('*** Timing -', timing_list_comprehension(timings))

    # Plot the query point and the five best solutions on a Google map.
    keywords = []
    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y, 14)
    colors = ['red', 'blue', 'green', 'purple', 'orange']
    # The third index is the rank of the solution (best: 0, second best: 1, ...).
    for i in range(5):
        lats = []
        lons = []
        for kwc in results[0][0][i][1]:
            lats.append(kwc.coordinates.x)
            lons.append(kwc.coordinates.y)
            keywords.append(kwc.keywords)
        for j in range(len(lats)):
            gmap.marker(lats[j], lons[j], color=colors[i])
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)
    # Alternative plotting calls, kept for reference:
    # initialLat = [query.coordinates.x]
    # initialLon = [query.coordinates.y]
    # gmap.scatter(initialLat, initialLon, '#00FF00', size=70, marker=False)
    # gmap.scatter(lats, lons, '#FF0000', size=50, marker=False)
    # gmap.plot(lats, lons, 'cornflowerblue', edge_width=3.0)
    # gmap.polygon(lats, lons, color='cornflowerblue', edge_width=10)
    # gmap.scatter(lats, lons, color='#3B0B39', size=40, marker=False)

    # Your Google API key:
    # gmap.apikey = "API_Key"

    # Save the map to HTML.
    gmap.marker(query.coordinates.x, query.coordinates.y, color='cornflowerblue',
                title='Query point')
    gmap.draw(r"graphic_results.html")
    print("--- %s seconds ---" % (time.time() - start_time))
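The driver scripts in this section define main(argv) but no entry point appears in these excerpts. A conventional guard, assuming the map name (or, for the precalculation scripts, the max subset size) arrives on the command line, would be:

if __name__ == '__main__':
    import sys
    # Forward the command-line arguments (excluding the script name) to main,
    # so argv[0] inside main is the first user-supplied argument.
    main(sys.argv[1:])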
# ************************
in_start_time = time.time()
# Config
# file_name_data = 'dataset.pickle'
# query_file_name = 'query.pickle'
# file_name_word2vec_model = 'model.pickle'
target_file_name = 'precalculated_inter_dataset_distances.pickle'
# max_subset_size = 3
max_subset_size = number_of_pois_in_solution
# cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
# cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
#                       model=load_word2vec_model(file_name_word2vec_model))
in_cost_start_time = time.time()
cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                      model=word2vec_model)
finish_time = time.time()
print('Cost function initialization --> ', finish_time - in_cost_start_time)
file_allow_overwrite = True
# Code
# start_time = time.time()
# data = load_pickle(file_name_data)
# finish_time = time.time()
# print('Load data pickle --> ', finish_time - start_time)
# query = KeywordCoordinate("", 0, 0, ['0'])
# query = load_pickle(query_file_name)
solver_start_time = time.time()
solver = NaiveSolver(query, data, cost_function,
                     max_subset_size=max_subset_size, update=False)
data = solver.data