def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_function = Type1(euclidean_distance,
                          word2vec_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True

    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    precalculated_query_dataset_distances = solver.get_keyword_similarity()
    write_pickle(precalculated_query_dataset_distances,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
Code Example #2
 def test_pickle(self):
     possible_keywords = ['1', '2', '3', '4', '5']
     keywords_min = 6
     keywords_max = 10
     physical_min_x = 0.0
     physical_max_x = 50.0
     physical_min_y = 0.0
     physical_max_y = 50.0
     dg = DataGenerator(possible_keywords=possible_keywords,
                        keywords_min=keywords_min,
                        keywords_max=keywords_max,
                        physical_min_x=physical_min_x,
                        physical_max_x=physical_max_x,
                        physical_min_y=physical_min_y,
                        physical_max_y=physical_max_y)
     result_length = 5
     file_name = 'test/test.pickle'
     generated_result = dg.generate_pickle(result_length, file_name, True)
     loaded_result = load_pickle(file_name)
     self.assertEqual(len(loaded_result), result_length)
     for index in range(len(loaded_result)):
         self.assertAlmostEqual(loaded_result[index].coordinates.x,
                                generated_result[index].coordinates.x)
         self.assertAlmostEqual(loaded_result[index].coordinates.y,
                                generated_result[index].coordinates.y)
         self.assertListEqual(loaded_result[index].keywords,
                              generated_result[index].keywords)
     # Clean up the generated test file.
     os.remove(
         os.path.abspath(
             os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          '../../../', file_name)))
Code Example #3
def main(argv):
    total_start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = 3  # Changed
    max_subset_size = 2  # int(argv[0])
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    start_time = time.time()
    cost_function = Type1(euclidean_distance,
                          combined_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    finish_time = time.time()
    print('Cost function initialization --> ', finish_time - start_time)
    file_allow_overwrite = True

    # Code
    start_time = time.time()
    data = load_pickle(file_name_data)
    finish_time = time.time()
    print('Load data pickle --> ', finish_time - start_time)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(query_file_name)
    start_time = time.time()
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    finish_time = time.time()
    print('Solver initialization --> ', finish_time - start_time)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
Code Example #4
 def __init__(self, distance_metric: distance_function_type,
              similarity_metric: similarity_function_type, alpha: float, beta: float, omega: float,
              query_distance_threshold: float = 0.7, dataset_distance_threshold: float = 0.7,
              keyword_similarity_threshold: float = 0.7, disable_thresholds: bool = False, model=None,
              precalculated_query_dataset_dict: precalculated_dict_type = None,
              precalculated_inter_dataset_dict: precalculated_dict_type = None,
              precalculated_keyword_similarity_dict: precalculated_dict_type = None):
     """
     Constructs a new CostFunction object. The CostFunction class should never be directly instantiated. Instead use a class that inherits from the CostFunction class and implements the solve() method.
     :param distance_metric: The distance metric to calculate coordinate distances between KeywordCoordinates.
     :param similarity_metric: The similarity metric to calculate the similarity between keyword lists of KeywordCoordinates.
     :param alpha: The scaling parameter for the query-dataset distance.
     :param beta: The scaling parameter for the inter-dataset distance.
     :param omega: The scaling parameter for the keyword list similarity.
     :param query_distance_threshold: The threshold for the query-dataset distance.
     :param dataset_distance_threshold: The threshold for the inter-dataset distance.
     :param keyword_similarity_threshold: The threshold for the keyword list similarity.
     :param disable_thresholds: Whether to honor any threshold values.
     :param model: The word2vec model. This can be passed to the CostFunction instead of reading it from disk to improve performance.
     :param precalculated_query_dataset_dict: A dictionary with precalculated query-dataset values for a given frozen subset.
     :param precalculated_inter_dataset_dict: A dictionary with precalculated inter-dataset values for a given frozen subset.
     :param precalculated_keyword_similarity_dict: A dictionary with precalculated keyword similarity values for a given frozen subset.
     """
     self.distance_metric: distance_function_type = distance_metric
     self.similarity_metric: similarity_function_type = similarity_metric
     self.alpha = alpha
     self.beta = beta
     self.omega = omega
     self.query_distance_threshold = query_distance_threshold
     self.dataset_distance_threshold = dataset_distance_threshold
     self.keyword_similarity_threshold = keyword_similarity_threshold
     self.disable_thresholds = disable_thresholds
     self.precalculated_query_dataset_dict = precalculated_query_dataset_dict
     self.precalculated_inter_dataset_dict = precalculated_inter_dataset_dict
     self.precalculated_keyword_similarity_dict = precalculated_keyword_similarity_dict
     logger = logging.getLogger(__name__)
     if self.similarity_metric.__name__ == 'word2vec_cosine_similarity':
         try:
             if model is None:
                 model_path = os.path.abspath(os.path.abspath(os.path.dirname(__file__)) + '/../../files/model.pickle')
                 logger.debug('loading model from path {}'.format(model_path))
                 self.model = load_pickle(model_path)
             else:
                 logger.debug('loading model {} from parameter'.format(model))
                 self.model = model
                 # Sanity check: the model is expected to behave like a dict of
                 # keyword vectors, so pop an item and put it back.
                 key, value = self.model.popitem()
                 self.model[key] = value
                 # if type(value) != np.ndarray:
                 #     logger.error('Model seems to be corrupt.')
                 #     raise ValueError('Model seems to be corrupt.')
         except Exception as error:
             logger.error('Could not load model: {}'.format(error))
             raise ValueError('Could not load model') from error
     logger.debug('created with distance metric {}, similarity metric {}, alpha {}, beta {} and omega {}'.format(
         self.distance_metric.__name__, self.similarity_metric.__name__, self.alpha, self.beta, self.omega))
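The docstring above says CostFunction should never be instantiated directly; instead, a subclass implements solve(). A minimal sketch of such a subclass follows. The solve() signature and the weighted-sum formula are assumptions for illustration (the weights alpha, beta and omega sum to 1.0 in the examples), not the repository's actual Type1/Type2/Type3 implementations:

class WeightedSumCostFunction(CostFunction):
    # Hypothetical subclass for illustration only; the repository's
    # Type1/Type2/Type3 classes define the real solve() logic.
    def solve(self, query, subset):
        # Assumed combination of the three components named in the docstring:
        # query-dataset distance, inter-dataset distance and keyword similarity.
        query_distance = max(self.distance_metric(query.coordinates, kwc.coordinates)
                             for kwc in subset)
        inter_dataset_distance = max(self.distance_metric(a.coordinates, b.coordinates)
                                     for a in subset for b in subset)
        keyword_cost = max(self.similarity_metric(query.keywords, kwc.keywords)
                           for kwc in subset)
        return (self.alpha * query_distance +
                self.beta * inter_dataset_distance +
                self.omega * keyword_cost)

This treats the similarity metric as a cost (lower is better); if it returned a true similarity, the last term would use 1 - similarity instead.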
Code Example #5
def main(argv):
    start_time = time.time()
    
    # Config
    # These files should be in the root directory of the project.
    word2vec_model_name = 'model.pickle'
    model_pickle_file_name = 'word2vec_model.pickle'
    query_file_name = 'query.pickle'
    data_file_name = 'dataset.pickle'
    file_allow_overwrite = True

    # Code - you shouldn't have to make any changes to this
    model = load_word2vec_model(word2vec_model_name)
    query = load_pickle(query_file_name)
    data = load_pickle(data_file_name)
    shrunk_model = calculate_model_subset(query, data, model)
    write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
Code Example #6
 def test_write_and_read_data(self):
     kwc1 = KeywordCoordinate(1, 1, ['1'])
     kwc2 = KeywordCoordinate(2, 2, ['2'])
     kwc3 = KeywordCoordinate(3, 3, ['3'])
     data = [kwc1, kwc2, kwc3]
     file_name = 'test/test.pickle'
     write_pickle(data, file_name, True)
     loaded_result = load_pickle(file_name)
     self.assertEqual(len(loaded_result), 3)
     for index in range(len(loaded_result)):
         self.assertAlmostEqual(loaded_result[index].coordinates.x,
                                data[index].coordinates.x)
         self.assertAlmostEqual(loaded_result[index].coordinates.y,
                                data[index].coordinates.y)
         self.assertListEqual(loaded_result[index].keywords,
                              data[index].keywords)
     # Clean up the generated test file.
     os.remove(
         os.path.abspath(
             os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          '../../../', file_name)))
Code Example #7
def main(argv):
    start_time = time.time()

    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()

    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')
    # print('Data:', dataset_comprehension(data))

    # Let's filter out by user radius
    # dataAux = sorted(data, key=lambda x: geographic_distance(x.coordinates, query.coordinates))
    # distances = [geographic_distance(x.coordinates, query.coordinates) >= RADIUS for x in dataAux]
    # print('------ Distances: ', distances)
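    # A corrected version of the radius filter sketched above, left disabled
    # like the original. Assumptions: RADIUS is a user-defined constant in the
    # same units as geographic_distance. Note that >= in the commented line
    # above keeps only points *outside* the radius; restricting the dataset to
    # the user radius needs <=.
    # data = [kwc for kwc in data
    #         if geographic_distance(kwc.coordinates, query.coordinates) <= RADIUS]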

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')

    # **** ONLY FOR word2vec model executions
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')
    # ****

    # Define the CostFunctions. For all possible parameters refer to the documentation.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance,
                word2vec_cosine_similarity,
                0.2,
                0.1,
                0.7,
                model=word2vec_model)
    cf3 = Type1(
        euclidean_distance,
        combined_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance,
        word2vec_cosine_similarity,
        0,
        0,
        1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.1,
        0.1,
        0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance,
        word2vec_cosine_similarity,
        0.2,
        0.1,
        0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=
        precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # map_name = 'London_mini'
    # Choose which Solvers to use. For all possible parameters refer to the documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(
        query,
        data,
        cf2,
        result_length=5,
        max_subset_size=3,
        max_number_of_concurrent_processes=max_number_of_processes,
        _map=map_name)
    # ns2 = NaiveSolver(query, data, cf5, result_length=5, max_subset_size=3,
    # max_number_of_concurrent_processes=max_number_of_processes, _map = map_name)
    # ns3 = NaiveSolver(query, data, cf3, result_length=5, max_subset_size=6,
    #                   max_number_of_concurrent_processes=max_number_of_processes)
    # ns4 = NaiveSolver(query, data, cf6, result_length=5, max_subset_size=3,
    # max_number_of_concurrent_processes=max_number_of_processes, _map = map_name)

    # Add Solvers to Evaluator
    ev.add_solver(ns1)
    # ev.add_solver(ns2)
    # ev.add_solver(ns4)

    # Only for debugging: calculate and print the physical distance between
    # each dataset item and the query location.
    # distances = [geographic_distance(x.coordinates, query.coordinates) for x in data]
    # print('------ Distances: ', distances)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()

    write_csv(map_name, results, timings)

    print('*** Solution -', solution_list_comprehension(results))
    # print('*** Timing -', timing_list_comprehension(timings))

    # initialLat and initialLon are only referenced by the commented-out
    # plotting calls below; keywords is collected in the loop but not used.
    initialLat = []
    initialLon = []
    keywords = []

    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y, 14)

    colors = ['red', 'blue', 'green', 'purple', 'orange']

    # Third dimension is the order of solution (Best: 0, Second best: 1...)
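    # (Assumed layout: results[solver][run][rank] is a (cost, subset) pair, so
    #  results[0][0][i][1] is the i-th best subset of KeywordCoordinates.)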
    for i in range(5):
        lats = []
        lons = []
        for kwc in results[0][0][i][1]:
            lats.append(kwc.coordinates.x)
            lons.append(kwc.coordinates.y)
            keywords.append(kwc.keywords)
        for j in range(len(lats)):
            gmap.marker(lats[j], lons[j], color=colors[i])
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)

        # initialLat.append(query.coordinates.x)
        # initialLon.append(query.coordinates.y)

        # gmap.scatter(initialLat, initialLon, '#00FF00', size = 70, marker = False)
        # gmap.scatter(lats, lons, '#FF0000',size = 50, marker = False )

        # gmap.plot(lats, lons, 'cornflowerblue', edge_width = 3.0)
        # gmap.polygon(lats, lons, color='cornflowerblue', edge_width=10)

        # gmap.scatter(lats, lons, color='#3B0B39', size=40, marker=False)

        # Your Google API key
        # gmap.apikey = "API_Key"

    # gmap.scatter(lats, lons, '#FF0000', size=40, marker=True)
    gmap.marker(query.coordinates.x,
                query.coordinates.y,
                color='cornflowerblue',
                title='Query point')
    # Save the plot to an HTML file.
    gmap.draw(r"graphic_results.html")

    print("--- %s seconds ---" % (time.time() - start_time))
Code Example #8
sys.path.append("..")

in_start_time = time.time()

# Config
# These files should be in the root directory of the project.
model_pickle_file_name = 'word2vec_model.pickle'
query_file_name = 'query.pickle'
data_file_name = 'dataset.pickle'
file_allow_overwrite = True

# Code - you shouldn't have to make any changes to this
# word2vec_model is assumed to have been loaded earlier in this combined
# script (see the other examples, which use load_word2vec_model).
query = load_pickle(query_file_name)
data = load_pickle(data_file_name)
shrunk_model = calculate_model_subset(query, data, word2vec_model)
write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)

print("--- %s seconds ---" % (time.time() - in_start_time))

# ***********************
# precalculate_inter_dataset_distances.py
# ***********************

in_start_time = time.time()
# Config
# file_name_data = 'dataset.pickle'
# query_file_name = 'query.pickle'
# file_name_word2vec_model = 'model.pickle'
target_file_name = 'precalculated_inter_dataset_distances.pickle'