def main(argv):
    """Precalculate the solver's keyword similarities and pickle them.

    Builds a ``Type1`` cost function that uses the word2vec model loaded from
    ``model.pickle``, loads the dataset and query pickles, asks a
    ``NaiveSolver`` for its keyword similarities and writes them to
    ``precalculated_query_dataset_keyword_similarities_word2vec.pickle``.
    The elapsed wall-clock time is printed at the end.

    Args:
        argv: Command-line arguments; currently unused because the maximum
            subset size is hard-coded below.
    """
    started_at = time.time()

    # Config
    dataset_path = 'dataset.pickle'
    query_path = 'query.pickle'
    word2vec_model_path = 'model.pickle'
    output_path = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    subset_size_limit = 2  # could be read from the command line: int(argv[0])
    word2vec_model = load_word2vec_model(word2vec_model_path)
    cost_function = Type1(euclidean_distance, word2vec_cosine_similarity,
                          0.2, 0.1, 0.7, model=word2vec_model)
    overwrite_existing = True

    # Code
    dataset = load_pickle(dataset_path)
    query_point = load_pickle(query_path)
    solver = NaiveSolver(query_point, dataset, cost_function,
                         max_subset_size=subset_size_limit)
    keyword_similarities = solver.get_keyword_similarity()
    write_pickle(keyword_similarities, output_path,
                 file_allow_overwrite=overwrite_existing)

    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)
 def test_precalculated_max_inter_dataset(self):
     """Solving with a precalculated inter-dataset value must not change the result.

     The solver is run once without precalculated data and once after the
     value returned by ``get_max_inter_dataset_distance()`` has been attached
     to the cost function; costs, coordinates and keywords of both result
     lists have to match.
     """
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
     kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
     kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
     data = [kwc1, kwc2, kwc3]
     cf = Type1(euclidean_distance,
                combined_cosine_similarity,
                0.3,
                0.3,
                0.4,
                disable_thresholds=True)
     ns = NaiveSolver(query, data, cf)
     pre_id = ns.get_max_inter_dataset_distance()
     result = ns.solve()
     # NOTE(review): the *max* inter-dataset distance is stored in the
     # attribute named precalculated_inter_dataset_dict - confirm intended.
     cf.precalculated_inter_dataset_dict = pre_id
     result_pre = ns.solve()
     # Compare lengths first: the pairwise loops below would otherwise let
     # surplus entries in the precalculated result pass unnoticed.
     self.assertEqual(len(result), len(result_pre))
     for entry, entry_pre in zip(result, result_pre):
         self.assertAlmostEqual(entry[0], entry_pre[0], delta=0.01)
         key_list = list(entry[1])
         key_list_pre = list(entry_pre[1])
         self.assertEqual(len(key_list), len(key_list_pre))
         for kwc, kwc_pre in zip(key_list, key_list_pre):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
 def test_complex_precalculations(self):
     """Solving with all three precalculated dictionaries must not change the result.

     The solver is run once without precalculated data and once after the
     query-dataset distances, inter-dataset distances and keyword
     similarities obtained from the solver have been attached to the cost
     function; costs, coordinates and keywords of both result lists have to
     match.
     """
     query = KeywordCoordinate(5, 6, ['culture'])
     kwc1 = KeywordCoordinate(2, 1, ['family', 'rest', 'indoor'])
     kwc2 = KeywordCoordinate(0, 2, ['science', 'culture', 'history'])
     kwc3 = KeywordCoordinate(0, 0, ['food', 'outdoor', 'sports'])
     data = [kwc1, kwc2, kwc3]
     cf = Type1(euclidean_distance,
                combined_cosine_similarity,
                0.3,
                0.3,
                0.4,
                disable_thresholds=True)
     ns = NaiveSolver(query, data, cf, result_length=100)
     result = ns.solve()
     cf.precalculated_query_dataset_dict = ns.get_query_dataset_distance()
     cf.precalculated_inter_dataset_dict = ns.get_inter_dataset_distance()
     cf.precalculated_keyword_similarity_dict = ns.get_keyword_similarity()
     result_pre = ns.solve()
     # Compare lengths first: the pairwise loops below would otherwise let
     # surplus entries in the precalculated result pass unnoticed.
     self.assertEqual(len(result), len(result_pre))
     for entry, entry_pre in zip(result, result_pre):
         self.assertAlmostEqual(entry[0], entry_pre[0], delta=0.01)
         key_list = list(entry[1])
         key_list_pre = list(entry_pre[1])
         self.assertEqual(len(key_list), len(key_list_pre))
         for kwc, kwc_pre in zip(key_list, key_list_pre):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
Beispiel #4
0
 def test_threshold6(self):
     """``Type1.solve`` returns infinity for this query/data combination.

     The cost function is built with positional threshold arguments
     ``math.inf, math.inf, 0.4``; both data points sit on the query location
     and each carries a single one of the query's three keywords.
     """
     cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                           0.25, 0.25, 0.5,
                           math.inf, math.inf, 0.4)
     query = KeywordCoordinate(0, 0, ['keyword1', 'keyword2', 'keyword3'])
     data = [
         KeywordCoordinate(0, 0, ['keyword1']),
         KeywordCoordinate(0, 0, ['keyword2']),
     ]
     cost = cost_function.solve(query, data)
     self.assertAlmostEqual(cost, math.inf, delta=0.01)
Beispiel #5
0
 def test_general(self):
     """Run three solvers through the ``Evaluator`` on generated data.

     One ``NaiveSolver`` is created per cost function type (``Type1``,
     ``Type2``, ``Type3``), each asked for 10 results. The evaluator must
     return three entries, each of length 2, whose first element contains
     10 items.
     """
     ev = Evaluator()
     keyword_pool = [
         'family', 'food', 'outdoor', 'rest', 'indoor', 'sports', 'science',
         'culture', 'history'
     ]
     generator = DataGenerator(keyword_pool)
     gen_query = generator.generate(1)[0]
     gen_data = generator.generate(10)

     # Build all cost functions first, then all solvers, then register them,
     # so objects are created in the same order as before.
     cost_functions = [
         cost_type(euclidean_distance,
                   combined_cosine_similarity,
                   0.33,
                   0.33,
                   0.33,
                   disable_thresholds=True)
         for cost_type in (Type1, Type2, Type3)
     ]
     solvers = [
         NaiveSolver(gen_query,
                     gen_data,
                     cost_function,
                     result_length=10,
                     max_subset_size=6)
         for cost_function in cost_functions
     ]
     for solver in solvers:
         ev.add_solver(solver)

     ev.evaluate()
     results = ev.get_results()
     self.assertEqual(len(results), 3)
     for position in range(3):
         self.assertEqual(len(results[position]), 2)
     for position in range(3):
         self.assertEqual(len(results[position][0]), 10)
 def test_solve(self):
     """The single best solution of the non-normalizing solver costs about 0.42."""
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     data = [
         KeywordCoordinate(1, 1, ['family', 'food', 'outdoor']),
         KeywordCoordinate(3, 3, ['food']),
         KeywordCoordinate(2, 2, ['outdoor']),
     ]
     cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                           0.3, 0.3, 0.4,
                           disable_thresholds=True)
     solver = NaiveSolver(query, data, cost_function,
                          normalize=False, result_length=1)
     solutions = solver.solve()
     best_cost = solutions[0][0]
     self.assertAlmostEqual(best_cost, 0.42, delta=0.01)
Beispiel #7
0
 def test_instantiation(self):
     """Positional constructor arguments of ``Type1`` end up in its attributes."""
     cost_function = Type1(euclidean_distance, separated_cosine_similarity,
                           0.3, 0.3, 0.4,
                           0.5, 0.6, 0.7,
                           False)

     # The two metrics are compared through their ``__get__`` wrappers.
     self.assertEqual(euclidean_distance.__get__,
                      cost_function.distance_metric.__get__)
     self.assertEqual(separated_cosine_similarity.__get__,
                      cost_function.similarity_metric.__get__)

     # Numeric attributes, checked in the order they were passed in.
     expected_numbers = (
         ('alpha', 0.3),
         ('beta', 0.3),
         ('omega', 0.4),
         ('query_distance_threshold', 0.5),
         ('dataset_distance_threshold', 0.6),
         ('keyword_similarity_threshold', 0.7),
     )
     for attribute, expected in expected_numbers:
         self.assertAlmostEqual(getattr(cost_function, attribute),
                                expected,
                                delta=0.01)

     self.assertEqual(cost_function.disable_thresholds, False)
def main(argv):
    """Precalculate the solver's inter-dataset distances and pickle them.

    Builds a ``Type1`` cost function (handing it the word2vec model loaded
    from ``model.pickle``), loads the dataset and query pickles, asks a
    ``NaiveSolver`` for its inter-dataset distances and writes them to
    ``precalculated_inter_dataset_distances.pickle``. The duration of the
    individual steps and the total run time are printed.

    Args:
        argv: Command-line arguments; currently unused because the maximum
            subset size is hard-coded below.
    """
    # Kept separate from the per-step timer so the final line really reports
    # the whole run instead of only the time since the last step started.
    total_start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    max_subset_size = 2  # could be read from the command line: int(argv[0])
    file_allow_overwrite = True

    step_start_time = time.time()
    cost_function = Type1(euclidean_distance,
                          combined_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    print('Cost function initialization --> ', time.time() - step_start_time)

    # Code
    step_start_time = time.time()
    data = load_pickle(file_name_data)
    print('Load data pickle --> ', time.time() - step_start_time)

    query = load_pickle(query_file_name)

    step_start_time = time.time()
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    print('Solver initialization --> ', time.time() - step_start_time)

    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - total_start_time))
Beispiel #9
0
 def test_solve4(self):
     """``Type1.solve`` with manhattan distance and weights ``0, 1, 0`` yields 8.0.

     The five data points lie on the diagonal from (1, 1) to (5, 5); the
     last three share the same single-keyword list.
     """
     cost_function = Type1(manhattan_distance, separated_cosine_similarity,
                           0, 1, 0,
                           disable_thresholds=True)
     query = KeywordCoordinate(0, 0, ['food', 'fun', 'outdoor', 'family'])
     # One list object is reused for the last three points.
     food_only = ['food']
     data = [
         KeywordCoordinate(1, 1, ['food', 'fun', 'outdoor']),
         KeywordCoordinate(2, 2, ['food', 'fun']),
         KeywordCoordinate(3, 3, food_only),
         KeywordCoordinate(4, 4, food_only),
         KeywordCoordinate(5, 5, food_only),
     ]
     cost = cost_function.solve(query, data)
     self.assertAlmostEqual(cost, 8.0, delta=0.01)
 def test_complex_generated_word2vec_precalculations(self):
     """Precalculated dictionaries must not change word2vec-based results.

     A query and five data points are generated, a model subset is computed
     for them from the loaded word2vec model, and the solver is run once
     without and once with the precalculated query-dataset distances,
     inter-dataset distances and keyword similarities. Costs, coordinates
     and keywords of both result lists have to match.
     """
     possible_keywords = [
         'family', 'food', 'outdoor', 'rest', 'indoor', 'sports', 'science',
         'culture', 'history'
     ]
     dg = DataGenerator(possible_keywords)
     query: KeywordCoordinate = dg.generate(1)[0]
     data = dg.generate(5)
     model = calculate_model_subset(query, data, load_word2vec_model())
     cf = Type1(euclidean_distance,
                word2vec_cosine_similarity,
                0.3,
                0.3,
                0.4,
                disable_thresholds=True,
                model=model)
     ns = NaiveSolver(query, data, cf, result_length=100)
     result = ns.solve()
     cf.precalculated_query_dataset_dict = ns.get_query_dataset_distance()
     cf.precalculated_inter_dataset_dict = ns.get_inter_dataset_distance()
     cf.precalculated_keyword_similarity_dict = ns.get_keyword_similarity()
     result_pre = ns.solve()
     # Compare lengths first: the pairwise loops below would otherwise let
     # surplus entries in the precalculated result pass unnoticed.
     self.assertEqual(len(result), len(result_pre))
     for entry, entry_pre in zip(result, result_pre):
         self.assertAlmostEqual(entry[0], entry_pre[0], delta=0.01)
         key_list = list(entry[1])
         key_list_pre = list(entry_pre[1])
         self.assertEqual(len(key_list), len(key_list_pre))
         for kwc, kwc_pre in zip(key_list, key_list_pre):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
Beispiel #11
0
def main(argv):
    """Run the evaluator on the pickled query/dataset and plot the solutions.

    Loads the query, the dataset, the precalculated distance/similarity
    pickles and the word2vec model, runs a ``NaiveSolver`` through the
    ``Evaluator``, writes results and timings via ``write_csv`` and draws the
    returned solutions on a Google map saved as ``graphic_results.html``.
    The elapsed wall-clock time is printed at the end.

    Args:
        argv: Command-line arguments; ``argv[0]`` is the map name handed to
            the solver and to ``write_csv``.
    """
    start_time = time.time()

    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()

    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')

    # Only needed for executions that use the word2vec model
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')

    # Define the CostFunctions. For all possible parameters refer to the
    # documentation. Only cf2 is wired into a solver below; the others are
    # kept as ready-made alternatives.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance,
                word2vec_cosine_similarity,
                0.2,
                0.1,
                0.7,
                model=word2vec_model)
    cf3 = Type1(
        euclidean_distance,
        combined_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance,
        word2vec_cosine_similarity,
        0,
        0,
        1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.1,
        0.1,
        0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # Choose which Solvers to use. For all possible parameters refer to the
    # documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(
        query,
        data,
        cf2,
        result_length=5,
        max_subset_size=3,
        max_number_of_concurrent_processes=max_number_of_processes,
        _map=map_name)
    # Further solvers can be created the same way with cf3, cf5 or cf6 and
    # registered with ev.add_solver(...).

    # Add Solvers to Evaluator
    ev.add_solver(ns1)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()

    write_csv(map_name, results, timings)

    print('*** Solution -', solution_list_comprehension(results))

    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y,
                                   14)

    # One marker color per plotted solution.
    colors = ['red', 'blue', 'green', 'purple', 'orange']

    # results[0][0] is indexed by solution rank (best: 0, second best: 1, ...)
    # and element [1] of each solution holds its points. Pairing with the
    # colors plots at most len(colors) solutions and, unlike a fixed
    # range(5), does not fail when the solver returns fewer of them.
    ranked_solutions = results[0][0]
    for color, solution in zip(colors, ranked_solutions):
        lats = []
        lons = []
        for kwc in solution[1]:
            lats.append(kwc.coordinates.x)
            lons.append(kwc.coordinates.y)
        for lat, lon in zip(lats, lons):
            gmap.marker(lat, lon, color=color)
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)

    gmap.marker(query.coordinates.x,
                query.coordinates.y,
                color='cornflowerblue',
                title='Query point')
    # Save the map as HTML
    gmap.draw(r"graphic_results.html")

    print("--- %s seconds ---" % (time.time() - start_time))
Beispiel #12
0
     #************************
     
     in_start_time = time.time()
     # Config
     # file_name_data = 'dataset.pickle'
     # query_file_name = 'query.pickle'
     # file_name_word2vec_model = 'model.pickle'
     target_file_name = 'precalculated_inter_dataset_distances.pickle'
     # max_subset_size = 3 # Changed
     max_subset_size = number_of_pois_in_solution
     # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
     # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
     #                       model=load_word2vec_model(file_name_word2vec_model))
     in_cost_start_time = time.time()
     cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                 model=word2vec_model
                 )
     finish_time = time.time()
     print('Cost function initialization --> ', finish_time-in_cost_start_time)
     file_allow_overwrite = True
 
     # Code
     # start_time = time.time()
     # data = load_pickle(file_name_data)
     # finish_time = time.time()
     # print('Load data pickle --> ', finish_time-start_time)
     # query = KeywordCoordinate("", 0, 0, ['0'])
     # query = load_pickle(query_file_name)
     solver_start_time = time.time()
     solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size, update = False)
     data = solver.data