def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    cost_function = Type1(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True

    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    precalculated_query_dataset_distances = solver.get_keyword_similarity()
    write_pickle(precalculated_query_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
def test_pickle(self):
    possible_keywords = ['1', '2', '3', '4', '5']
    keywords_min = 6
    keywords_max = 10
    physical_min_x = 0.0
    physical_max_x = 50.0
    physical_min_y = 0.0
    physical_max_y = 50.0
    dg = DataGenerator(possible_keywords=possible_keywords, keywords_min=keywords_min,
                       keywords_max=keywords_max, physical_min_x=physical_min_x,
                       physical_max_x=physical_max_x, physical_min_y=physical_min_y,
                       physical_max_y=physical_max_y)
    result_length = 5
    file_name = 'test/test.pickle'
    generated_result = dg.generate_pickle(result_length, file_name, True)
    loaded_result = load_pickle(file_name)
    self.assertEqual(len(loaded_result), result_length)
    for index in range(len(loaded_result)):
        self.assertAlmostEqual(loaded_result[index].coordinates.x,
                               generated_result[index].coordinates.x)
        self.assertAlmostEqual(loaded_result[index].coordinates.y,
                               generated_result[index].coordinates.y)
        self.assertListEqual(loaded_result[index].keywords, generated_result[index].keywords)
    os.remove(
        os.path.abspath(
            os.path.dirname(os.path.abspath(__file__)) + '../../../' + file_name))
def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = 3  # Changed
    max_subset_size = 2  # int(argv[0])
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    step_start_time = time.time()
    cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    step_finish_time = time.time()
    print('Cost function initialization --> ', step_finish_time - step_start_time)
    file_allow_overwrite = True

    # Code
    step_start_time = time.time()
    data = load_pickle(file_name_data)
    step_finish_time = time.time()
    print('Load data pickle --> ', step_finish_time - step_start_time)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(query_file_name)
    step_start_time = time.time()
    solver = NaiveSolver(query, data, cost_function, max_subset_size=max_subset_size)
    step_finish_time = time.time()
    print('Solver initialization --> ', step_finish_time - step_start_time)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances, target_file_name,
                 file_allow_overwrite=file_allow_overwrite)
    # Total runtime for the whole script.
    print("--- %s seconds ---" % (time.time() - start_time))
def __init__(self, distance_metric: distance_function_type, similarity_metric: similarity_function_type,
             alpha: float, beta: float, omega: float, query_distance_threshold: float = 0.7,
             dataset_distance_threshold: float = 0.7, keyword_similarity_threshold: float = 0.7,
             disable_thresholds: bool = False, model=None,
             precalculated_query_dataset_dict: precalculated_dict_type = None,
             precalculated_inter_dataset_dict: precalculated_dict_type = None,
             precalculated_keyword_similarity_dict: precalculated_dict_type = None):
    """
    Constructs a new CostFunction object. The CostFunction class should never be instantiated directly.
    Instead, use a class that inherits from CostFunction and implements the solve() method.
    :param distance_metric: The distance metric used to calculate coordinate distances between KeywordCoordinates.
    :param similarity_metric: The similarity metric used to calculate the similarity between keyword lists of KeywordCoordinates.
    :param alpha: The scaling parameter for the query-dataset distance.
    :param beta: The scaling parameter for the inter-dataset distance.
    :param omega: The scaling parameter for the keyword-list similarity.
    :param query_distance_threshold: The threshold for the query-dataset distance.
    :param dataset_distance_threshold: The threshold for the inter-dataset distance.
    :param keyword_similarity_threshold: The threshold for the keyword-list similarity.
    :param disable_thresholds: Whether to ignore all threshold values.
    :param model: The word2vec model. It can be passed to the CostFunction instead of being read from disk, to improve performance.
    :param precalculated_query_dataset_dict: A dictionary with precalculated query-dataset values for a given frozen subset.
    :param precalculated_inter_dataset_dict: A dictionary with precalculated inter-dataset values for a given frozen subset.
    :param precalculated_keyword_similarity_dict: A dictionary with precalculated keyword-similarity values for a given frozen subset.
    """
    self.distance_metric: distance_function_type = distance_metric
    self.similarity_metric: similarity_function_type = similarity_metric
    self.alpha = alpha
    self.beta = beta
    self.omega = omega
    self.query_distance_threshold = query_distance_threshold
    self.dataset_distance_threshold = dataset_distance_threshold
    self.keyword_similarity_threshold = keyword_similarity_threshold
    self.disable_thresholds = disable_thresholds
    self.precalculated_query_dataset_dict = precalculated_query_dataset_dict
    self.precalculated_inter_dataset_dict = precalculated_inter_dataset_dict
    self.precalculated_keyword_similarity_dict = precalculated_keyword_similarity_dict
    logger = logging.getLogger(__name__)
    if self.similarity_metric.__name__ == 'word2vec_cosine_similarity':
        try:
            if model is None:
                model_path = os.path.abspath(
                    os.path.abspath(os.path.dirname(__file__)) + '/../../files/model.pickle')
                logger.debug('loading model from path {}'.format(model_path))
                self.model = load_pickle(model_path)
            else:
                logger.debug('loading model {} from parameter'.format(model))
                self.model = model
            # Basic sanity check: the model must contain at least one entry.
            key, value = self.model.popitem()
            self.model[key] = value
            # if type(value) != np.ndarray:
            #     logger.error('Model seems to be corrupt.')
            #     raise ValueError('Model seems to be corrupt.')
        except Exception:
            logger.error('Could not load model')
            raise ValueError('Could not load model')
    logger.debug('created with distance metric {}, similarity metric {}, alpha {}, beta {} and omega {}'.format(
        self.distance_metric.__name__, self.similarity_metric.__name__, self.alpha, self.beta, self.omega))
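# Usage sketch (illustration only, not part of the original module). CostFunction is abstract,
# so in practice a concrete subclass such as Type1 is instantiated, as in the evaluation script.
# The pickle file names below, and the availability of Type1, euclidean_distance,
# combined_cosine_similarity and load_pickle in the calling scope, are assumptions taken from
# the other scripts in this repository.
def _example_build_type1_cost_function():
    # Precalculated dictionaries are optional; passing them avoids recomputing distances
    # and keyword similarities for every candidate subset.
    inter_dataset = load_pickle('precalculated_inter_dataset_distances.pickle')
    query_dataset = load_pickle('precalculated_query_dataset_distances.pickle')
    keyword_similarity = load_pickle('precalculated_query_dataset_keyword_similarities.pickle')
    return Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
                 precalculated_query_dataset_dict=query_dataset,
                 precalculated_inter_dataset_dict=inter_dataset,
                 precalculated_keyword_similarity_dict=keyword_similarity)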
def main(argv):
    start_time = time.time()

    # Config
    # These files should be in the root directory of the project.
    word2vec_model_name = 'model.pickle'
    model_pickle_file_name = 'word2vec_model.pickle'
    query_file_name = 'query.pickle'
    data_file_name = 'dataset.pickle'
    file_allow_overwrite = True

    # Code - you shouldn't have to make any changes to this
    model = load_word2vec_model(word2vec_model_name)
    query = load_pickle(query_file_name)
    data = load_pickle(data_file_name)
    shrunk_model = calculate_model_subset(query, data, model)
    write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)
    print("--- %s seconds ---" % (time.time() - start_time))
def test_write_and_read_data(self):
    kwc1 = KeywordCoordinate(1, 1, ['1'])
    kwc2 = KeywordCoordinate(2, 2, ['2'])
    kwc3 = KeywordCoordinate(3, 3, ['3'])
    data = [kwc1, kwc2, kwc3]
    file_name = 'test/test.pickle'
    write_pickle(data, file_name, True)
    loaded_result = load_pickle(file_name)
    self.assertEqual(len(loaded_result), 3)
    for index in range(len(loaded_result)):
        self.assertAlmostEqual(loaded_result[index].coordinates.x, data[index].coordinates.x)
        self.assertAlmostEqual(loaded_result[index].coordinates.y, data[index].coordinates.y)
        self.assertListEqual(loaded_result[index].keywords, data[index].keywords)
    os.remove(
        os.path.abspath(
            os.path.dirname(os.path.abspath(__file__)) + '../../../' + file_name))
def main(argv):
    start_time = time.time()

    # Evaluator, instantiate it first for logging purposes
    ev = Evaluator()
    query: KeywordCoordinate = load_pickle('query.pickle')
    print('Query:', query)
    data: dataset_type = load_pickle('dataset.pickle')
    # print('Data:', dataset_comprehension(data))

    # Let's filter out by user radius
    # dataAux = sorted(data, key=lambda x: geographic_distance(x.coordinates, query.coordinates))
    # distances = [geographic_distance(x.coordinates, query.coordinates) >= RADIUS for x in dataAux]
    # print('------ Distances: ', distances)

    # Load precalculated values and models
    precalculated_inter_dataset_distances = load_pickle(
        'precalculated_inter_dataset_distances.pickle')
    precalculated_query_dataset_distances = load_pickle(
        'precalculated_query_dataset_distances.pickle')
    precalculated_query_dataset_keyword_similarities = load_pickle(
        'precalculated_query_dataset_keyword_similarities.pickle')
    # **** ONLY FOR word2vec model executions
    precalculated_query_dataset_keyword_similarities_word2vec = load_pickle(
        'precalculated_query_dataset_keyword_similarities_word2vec.pickle')
    word2vec_model = load_word2vec_model('word2vec_model.pickle')
    # ****

    # Define the CostFunctions. For all possible parameters refer to the documentation.
    cf1 = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    cf2 = Type2(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7, model=word2vec_model)
    cf3 = Type1(
        euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities)
    cf4 = Type2(
        euclidean_distance, word2vec_cosine_similarity, 0, 0, 1.0,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf5 = Type3(
        euclidean_distance, word2vec_cosine_similarity, 0.1, 0.1, 0.8,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities_word2vec,
        model=word2vec_model)
    cf6 = Type1(
        euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
        precalculated_inter_dataset_dict=precalculated_inter_dataset_distances,
        precalculated_query_dataset_dict=precalculated_query_dataset_distances,
        precalculated_keyword_similarity_dict=precalculated_query_dataset_keyword_similarities,
        model=word2vec_model)

    map_name = argv[0]
    # map_name = 'London_mini'

    # Choose which Solvers to use. For all possible parameters refer to the documentation.
    max_number_of_processes = mp.cpu_count()
    ns1 = NaiveSolver(
        query, data, cf2, result_length=5, max_subset_size=3,
        max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)
    # ns2 = NaiveSolver(query, data, cf5, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)
    # ns3 = NaiveSolver(query, data, cf3, result_length=5, max_subset_size=6,
    #                   max_number_of_concurrent_processes=max_number_of_processes)
    # ns4 = NaiveSolver(query, data, cf6, result_length=5, max_subset_size=3,
    #                   max_number_of_concurrent_processes=max_number_of_processes, _map=map_name)

    # Add Solvers to Evaluator
    ev.add_solver(ns1)
    # ev.add_solver(ns2)
    # ev.add_solver(ns4)

    # Only for debugging: calculate and print physical distances between items in the dataset
    # and the query location.
    # distances = [geographic_distance(x.coordinates, query.coordinates) for x in data]
    # print('------ Distances: ', distances)

    # Run Evaluator and fetch results
    ev.evaluate()
    results = ev.get_results()
    timings = ev.get_timings()
    write_csv(map_name, results, timings)
    print('*** Solution -', solution_list_comprehension(results))
    # print('*** Timing -', timing_list_comprehension(timings))

    initialLat = []
    initialLon = []
    keywords = []
    gmap = gmplot.GoogleMapPlotter(query.coordinates.x, query.coordinates.y, 14)
    colors = ['red', 'blue', 'green', 'purple', 'orange']
    # Third dimension is the order of the solution (best: 0, second best: 1, ...)
    for i in range(5):
        lats = []
        lons = []
        for kwc in results[0][0][i][1]:
            lats.append(kwc.coordinates.x)
            lons.append(kwc.coordinates.y)
            keywords.append(kwc.keywords)
        for j in range(len(lats)):
            gmap.marker(lats[j], lons[j], color=colors[i])
        gmap.polygon(lats, lons, color='cornflowerblue', edge_width=7)
    # initialLat.append(query.coordinates.x)
    # initialLon.append(query.coordinates.y)
    # gmap.scatter(initialLat, initialLon, '#00FF00', size=70, marker=False)
    # gmap.scatter(lats, lons, '#FF0000', size=50, marker=False)
    # gmap.plot(lats, lons, 'cornflowerblue', edge_width=3.0)
    # gmap.polygon(lats, lons, color='cornflowerblue', edge_width=10)
    # gmap.scatter(lats, lons, color='#3B0B39', size=40, marker=False)

    # Your Google API key
    # gmap.apikey = "API_Key"

    # Save it to HTML
    # gmap.scatter(lats, lons, '#FF0000', size=40, marker=True)
    gmap.marker(query.coordinates.x, query.coordinates.y, color='cornflowerblue', title='Query point')
    gmap.draw(r"graphic_results.html")
    print("--- %s seconds ---" % (time.time() - start_time))
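# Illustration (assumption, inferred from the indexing results[0][0][i][1] above; not part of
# the original script): get_results() appears to return, per solver, a ranked list of
# (cost, subset) pairs ordered from best to worst. The helper name below is hypothetical.
def _example_unpack_best_solution(results):
    # Best-ranked result of the first solver: its cost and its subset of KeywordCoordinates.
    best_cost, best_subset = results[0][0][0]
    for kwc in best_subset:
        print(best_cost, kwc.coordinates.x, kwc.coordinates.y, kwc.keywords)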
sys.path.append("..")
in_start_time = time.time()

# Config
# These files should be in the root directory of the project.
model_pickle_file_name = 'word2vec_model.pickle'
query_file_name = 'query.pickle'
data_file_name = 'dataset.pickle'
file_allow_overwrite = True

# Code - you shouldn't have to make any changes to this
# Note: `query` and `word2vec_model` are expected to be defined by an earlier section of this
# combined script.
# query = load_pickle(query_file_name)
data = load_pickle(data_file_name)
shrunk_model = calculate_model_subset(query, data, word2vec_model)
write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)
print("--- %s seconds ---" % (time.time() - in_start_time))

# ***********************
# precalculate_inter_dataset_distances.py
# ***********************
in_start_time = time.time()

# Config
# file_name_data = 'dataset.pickle'
# query_file_name = 'query.pickle'
# file_name_word2vec_model = 'model.pickle'
target_file_name = 'precalculated_inter_dataset_distances.pickle'