def main(argv):
    """Precalculate the query/dataset keyword similarities and pickle them.

    Loads the dataset, the query and a word2vec model from pickle files in
    the working directory, builds a ``Type1`` cost function on top of
    ``word2vec_cosine_similarity`` and asks a ``NaiveSolver`` for its keyword
    similarity table, which is then written to disk.

    Args:
        argv: Command-line arguments. Currently unused; the subset size is
            hard-coded below instead of being read from ``argv[0]``.
    """
    started_at = time.time()

    # Config
    dataset_path = 'dataset.pickle'
    query_path = 'query.pickle'
    word2vec_model_path = 'model.pickle'
    output_path = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    subset_limit = 2  # alternatively: int(argv[0])
    overwrite_allowed = True

    # The three numeric arguments are passed positionally to Type1 after the
    # distance and similarity functions (presumably weighting factors --
    # confirm against the Type1 documentation).
    # Type2 / Type3 accept the same argument layout if another cost function
    # is wanted.
    weights = (0.2, 0.1, 0.7)
    embedding_model = load_word2vec_model(word2vec_model_path)
    cost_function = Type1(euclidean_distance,
                          word2vec_cosine_similarity,
                          *weights,
                          model=embedding_model)

    # Code
    dataset = load_pickle(dataset_path)
    query_point = load_pickle(query_path)
    solver = NaiveSolver(query_point,
                         dataset,
                         cost_function,
                         max_subset_size=subset_limit)
    keyword_similarities = solver.get_keyword_similarity()
    write_pickle(keyword_similarities,
                 output_path,
                 file_allow_overwrite=overwrite_allowed)

    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)
def main(argv):
    """Precalculate the inter-dataset distances and pickle them.

    Loads the dataset, the query and a word2vec model from pickle files in
    the working directory, builds a ``Type1`` cost function, creates a
    ``NaiveSolver`` and writes the result of
    ``solver.get_inter_dataset_distance()`` to disk. The duration of each
    setup phase is printed, followed by the total run time.

    Args:
        argv: Command-line arguments. Currently unused; the subset size is
            hard-coded below instead of being read from ``argv[0]``.
    """
    # Total run time. Kept separate from the per-phase timer below so the
    # final report covers the whole run instead of only the last phase.
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    max_subset_size = 2  # alternatively: int(argv[0])
    file_allow_overwrite = True

    # Type2 with word2vec_cosine_similarity accepts the same argument layout
    # if another cost function is wanted.
    phase_start_time = time.time()
    cost_function = Type1(euclidean_distance,
                          combined_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    print('Cost function initialization --> ', time.time() - phase_start_time)

    # Code
    phase_start_time = time.time()
    data = load_pickle(file_name_data)
    print('Load data pickle --> ', time.time() - phase_start_time)

    query = load_pickle(query_file_name)

    phase_start_time = time.time()
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    print('Solver initialization --> ', time.time() - phase_start_time)

    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
def main(argv):
    """Shrink the word2vec model to a subset for the query/dataset and pickle it.

    Loads the full model, the query and the dataset, passes them to
    ``calculate_model_subset`` and writes the returned model to
    ``word2vec_model.pickle``.

    Args:
        argv: Command-line arguments. Unused.
    """
    began = time.time()

    # Config
    # The input files should be in the root directory of the project.
    full_model_path = 'model.pickle'
    shrunk_model_path = 'word2vec_model.pickle'
    query_path = 'query.pickle'
    dataset_path = 'dataset.pickle'
    overwrite_allowed = True

    # Code - you shouldn't have to make any changes to this
    full_model = load_word2vec_model(full_model_path)
    query_point = load_pickle(query_path)
    dataset = load_pickle(dataset_path)
    write_pickle(calculate_model_subset(query_point, dataset, full_model),
                 shrunk_model_path,
                 file_allow_overwrite=overwrite_allowed)

    elapsed = time.time() - began
    print("--- %s seconds ---" % elapsed)
Ejemplo n.º 4
0
 def test_complex_generated_word2vec_precalculations(self):
     """Solving with precalculated tables must match solving without them.

     Generates a random query and five random data points, solves once with
     a plain ``Type1`` word2vec cost function, then assigns the solver's own
     precalculated tables to the cost function and solves again. Both runs
     must yield the same number of results, with matching costs (within
     0.01) and matching subset members.
     """
     possible_keywords = [
         'family', 'food', 'outdoor', 'rest', 'indoor', 'sports', 'science',
         'culture', 'history'
     ]
     dg = DataGenerator(possible_keywords)
     query: KeywordCoordinate = dg.generate(1)[0]
     data = dg.generate(5)
     model = calculate_model_subset(query, data, load_word2vec_model())
     cf = Type1(euclidean_distance,
                word2vec_cosine_similarity,
                0.3,
                0.3,
                0.4,
                disable_thresholds=True,
                model=model)
     ns = NaiveSolver(query, data, cf, result_length=100)
     result = ns.solve()

     # Switch the cost function over to the precalculated tables and solve
     # the same problem again.
     pre_qd = ns.get_query_dataset_distance()
     pre_id = ns.get_inter_dataset_distance()
     pre_ks = ns.get_keyword_similarity()
     cf.precalculated_query_dataset_dict = pre_qd
     cf.precalculated_inter_dataset_dict = pre_id
     cf.precalculated_keyword_similarity_dict = pre_ks
     result_pre = ns.solve()

     # Without this check, surplus entries in result_pre would go unnoticed.
     self.assertEqual(len(result), len(result_pre))
     for entry, entry_pre in zip(result, result_pre):
         self.assertAlmostEqual(entry[0], entry_pre[0], delta=0.01)
         key_list = list(entry[1])
         key_list_pre = list(entry_pre[1])
         self.assertEqual(len(key_list), len(key_list_pre))
         for kwc, kwc_pre in zip(key_list, key_list_pre):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
Ejemplo n.º 5
0
 def test_precalculated_word2vec(self):
     """Solving with precalculated tables must match solving without them.

     Uses a fixed query and three fixed data points with a ``Type3``
     word2vec cost function. After a first solve, the solver's own
     precalculated tables are assigned to the cost function and the problem
     is solved again. Both runs must yield the same number of results, with
     matching costs (within 0.01) and matching subset members.
     """
     query = KeywordCoordinate(0, 0, ['family', 'food', 'outdoor'])
     kwc1 = KeywordCoordinate(1, 1, ['family', 'food', 'outdoor'])
     kwc2 = KeywordCoordinate(3, 3, ['food', 'family'])
     kwc3 = KeywordCoordinate(2, 2, ['outdoor'])
     data = [kwc1, kwc2, kwc3]
     model = calculate_model_subset(query, data, load_word2vec_model())
     cf = Type3(euclidean_distance,
                word2vec_cosine_similarity,
                0.3,
                0.3,
                0.4,
                disable_thresholds=True,
                model=model)
     ns = NaiveSolver(query, data, cf)
     result = ns.solve()

     # Switch the cost function over to the precalculated tables and solve
     # the same problem again.
     pre_qd = ns.get_query_dataset_distance()
     pre_id = ns.get_inter_dataset_distance()
     pre_ks = ns.get_keyword_similarity()
     cf.precalculated_query_dataset_dict = pre_qd
     cf.precalculated_inter_dataset_dict = pre_id
     cf.precalculated_keyword_similarity_dict = pre_ks
     result_pre = ns.solve()

     # Without this check, surplus entries in result_pre would go unnoticed.
     self.assertEqual(len(result), len(result_pre))
     for entry, entry_pre in zip(result, result_pre):
         self.assertAlmostEqual(entry[0], entry_pre[0], delta=0.01)
         key_list = list(entry[1])
         key_list_pre = list(entry_pre[1])
         self.assertEqual(len(key_list), len(key_list_pre))
         for kwc, kwc_pre in zip(key_list, key_list_pre):
             self.assertAlmostEqual(kwc.coordinates.x,
                                    kwc_pre.coordinates.x)
             self.assertAlmostEqual(kwc.coordinates.y,
                                    kwc_pre.coordinates.y)
             self.assertListEqual(kwc.keywords, kwc_pre.keywords)
Ejemplo n.º 6
0
def main(argv):
    """Run the evaluator on the pickled query/dataset and plot the solutions.

    Loads the query, the dataset, the precalculated tables and the word2vec
    model from pickle files in the working directory, builds several cost
    functions, evaluates a ``NaiveSolver``, writes the results and timings
    via ``write_csv`` and draws the returned solutions on a Google map saved
    as ``graphic_results.html``.

    Args:
        argv: Command-line arguments. ``argv[0]`` is the map name, which is
            passed to the solver (``_map``) and to ``write_csv``.

    Raises:
        IndexError: If ``argv`` is empty.
    """
    start_time = time.time()

    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()

    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')
    # print('Data:', dataset_comprehension(data))

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')

    # **** ONLY FOR word2vec model executions
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')
    # ****

    # Define the CostFunctions. For all possible parameters refer to the documentation.
    # Only cf2 is wired to a solver below; the others are kept as ready-made
    # alternatives for the commented-out solvers.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance,
                word2vec_cosine_similarity,
                0.2,
                0.1,
                0.7,
                model=word2vec_model)
    cf3 = Type1(
        euclidean_distance,
        combined_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance,
        word2vec_cosine_similarity,
        0,
        0,
        1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.1,
        0.1,
        0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # map_name = 'London_mini'
    # Choose which Solvers to use. For all possible parameters refer to the documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(
        query,
        data,
        cf2,
        result_length=5,
        max_subset_size=3,
        max_number_of_concurrent_processes=max_number_of_processes,
        _map=map_name)
    # ns2 = NaiveSolver(query, data, cf5, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)
    # ns3 = NaiveSolver(query, data, cf3, result_length=5, max_subset_size=6,
    #                   max_number_of_concurrent_processes=max_number_of_processes)
    # ns4 = NaiveSolver(query, data, cf6, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)

    # Add Solvers to Evaluator
    ev.add_solver(ns1)
    # ev.add_solver(ns2)
    # ev.add_solver(ns4)

    # Only for debug: physical distances between the dataset items and the query location
    # distances = [geographic_distance(x.coordinates, query.coordinates) for x in data]
    # print('------ Distances: ', distances)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()

    write_csv(map_name, results, timings)

    print('*** Solution -', solution_list_comprehension(results))
    # print('*** Timing -', timing_list_comprehension(timings))

    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y,
                                   14)

    # One marker color per plotted solution.
    colors = ['red', 'blue', 'green', 'purple', 'orange']

    # Third dimension is the order of solution (Best: 0, Second best: 1...)
    solutions = results[0][0]
    # zip() stops at the shorter sequence, so a solver that returns fewer
    # solutions than there are colors no longer raises IndexError.
    for color, solution in zip(colors, solutions):
        points = [(kwc.coordinates.x, kwc.coordinates.y)
                  for kwc in solution[1]]
        lats = [lat for lat, _ in points]
        lons = [lon for _, lon in points]
        for lat, lon in points:
            gmap.marker(lat, lon, color=color)
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)

    # Your Google API key, if required:
    # gmap.apikey = "API_Key"
    gmap.marker(query.coordinates.x,
                query.coordinates.y,
                color='cornflowerblue',
                title='Query point')
    # Save the map to HTML.
    gmap.draw(r"graphic_results.html")

    print("--- %s seconds ---" % (time.time() - start_time))
Ejemplo n.º 7
0
    # Code
    print('Loading CSV', csv_file_name)
    data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index, y_coordinate_index=y_index,
                    keywords_index=keyword_index, keywords_delimiter=keyword_delimiter, delimiter=csv_delimiter,
                    quotechar=csv_quotechar, max_read_length=max_read_length)
    if len(data) > 0:
        print('Example Datapoint:', data[0].coordinates.x, data[0].coordinates.y, data[0].keywords)
        write_pickle(data=data, file_name=data_target_name, file_allow_overwrite=file_allow_overwrite)
    else:
        print('Could not load any data.')


    print("--- %s seconds ---" % (time.time() - start_time))

    word2vec_model_name = 'model.pickle'
    word2vec_model = load_word2vec_model(word2vec_model_name)
        
    for i in range(13, 79):
        print('++++++++++++++++++')
        print('Query --> ', i)
        print('++++++++++++++++++')
        
        iteration_start_time = time.time()
        
        # ***********************
        # preprocess_csv_query.py
        #************************
        
        in_start_time = time.time()
    
        # Config