# Example 1
def main(argv):
    start_time = time.time()

    # Config
    file_name_data = 'dataset.pickle'
    file_name_query = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_query_dataset_keyword_similarities_word2vec.pickle'
    # max_subset_size = int(argv[0])
    max_subset_size = 2
    # cost_function = Type3(euclidean_distance, word2vec_cosine_similarity, 0.2, 0.1, 0.7,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
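    # Note: the three weights (0.2, 0.1, 0.7) sum to 1. They presumably balance
    # the cost components of Type1 (an assumption; their exact semantics are
    # defined by the cost-function classes, which are not shown here).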
    cost_function = Type1(euclidean_distance,
                          word2vec_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    file_allow_overwrite = True

    # Code
    data = load_pickle(file_name_data)
    query = load_pickle(file_name_query)
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    precalculated_query_dataset_keyword_similarities = solver.get_keyword_similarity()
    write_pickle(precalculated_query_dataset_keyword_similarities,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
# Example 2
def main(argv):
    start_time = time.time()
    # Config
    # csv_file_name = 'London.csv'
    csv_file_name = argv[0]
    print(csv_file_name)
    data_target_name = 'dataset.pickle'
    x_index = 5  # column indices are 0-based
    y_index = 6
    keyword_index = 9
    max_read_length = -1   # -1 to disable
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    print('Loading CSV', csv_file_name)
    data = load_csv(file_name=csv_file_name, x_coordinate_index=x_index, y_coordinate_index=y_index,
                    keywords_index=keyword_index, keywords_delimiter=keyword_delimiter, delimiter=csv_delimiter,
                    quotechar=csv_quotechar, max_read_length=max_read_length)
    if len(data) > 0:
        print('Example Datapoint:', data[0].coordinates.x, data[0].coordinates.y, data[0].keywords)
        write_pickle(data=data, file_name=data_target_name, file_allow_overwrite=file_allow_overwrite)
    else:
        print('Could not load any data.')

    print("--- %s seconds ---" % (time.time() - start_time))
# Example 3
def main(argv):
    start_time = time.time()

    # Config
    csv_file_name = 'user_queries.csv'
    data_target_name = 'query.pickle'
    x_index = 0
    y_index = 1
    keyword_index = 2
    # query_index = 16 # Let's take a query from the query file
    query_index = int(argv[0])
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    if query_index <= 0:
        print('The query row has to be positive.')
        return
    print('Loading CSV', csv_file_name)
    data = load_csv(file_name=csv_file_name,
                    x_coordinate_index=x_index,
                    y_coordinate_index=y_index,
                    keywords_index=keyword_index,
                    keywords_delimiter=keyword_delimiter,
                    delimiter=csv_delimiter,
                    quotechar=csv_quotechar,
                    max_read_length=query_index + 1,
                    query_load=True)
    if len(data) == 0:
        print('Could not load any data.')
        return
    print('Query Datapoint:', data[query_index - 1].coordinates.x,
          data[query_index - 1].coordinates.y,
          data[query_index - 1].keywords)
    write_pickle(data=data[query_index - 1],
                 file_name=data_target_name,
                 file_allow_overwrite=file_allow_overwrite)

    nlp = en_core_web_lg.load()
    df_poi_encoded = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) +
                                 '/../files/' + 'poi_keywords_encoded.csv',
                                 index_col='poi_name',
                                 encoding='utf-8')
    # print(df_poi_encoded)
    # For each query keyword, add a column of spaCy similarities between the
    # POI's encoded keywords and that keyword.
    for kw in data[query_index - 1].keywords:
        df_poi_encoded[kw] = df_poi_encoded.apply(
            lambda row: nlp(row['nlp_keywords_encoded']).similarity(nlp(kw)),
            axis=1)

    df_poi_encoded.to_csv(os.path.dirname(os.path.abspath(__file__)) +
                          '/../files/' + 'poi_queries_similarities.csv',
                          encoding='utf-8')

    print("--- %s seconds ---" % (time.time() - start_time))
# Example 4
def main(argv):

    start_time = time.time()
    # Config
    file_name_data = 'dataset.pickle'
    query_file_name = 'query.pickle'
    file_name_word2vec_model = 'model.pickle'
    target_file_name = 'precalculated_inter_dataset_distances.pickle'
    # max_subset_size = 3 # Changed
    max_subset_size = 2  #int(argv[0])
    # cost_function = Type1(euclidean_distance, combined_cosine_similarity, 0.2, 0.1, 0.7)
    # cost_function = Type2(euclidean_distance, word2vec_cosine_similarity, 0, 0, 1,
    #                       model=load_word2vec_model(file_name_word2vec_model))
    stage_start_time = time.time()  # per-stage timer; start_time keeps the overall start
    cost_function = Type1(euclidean_distance,
                          combined_cosine_similarity,
                          0.2,
                          0.1,
                          0.7,
                          model=load_word2vec_model(file_name_word2vec_model))
    finish_time = time.time()
    print('Cost function initialization --> ', finish_time - stage_start_time)
    file_allow_overwrite = True

    # Code
    stage_start_time = time.time()
    data = load_pickle(file_name_data)
    finish_time = time.time()
    print('Load data pickle --> ', finish_time - stage_start_time)
    # query = KeywordCoordinate("", 0, 0, ['0'])
    query = load_pickle(query_file_name)
    stage_start_time = time.time()
    solver = NaiveSolver(query,
                         data,
                         cost_function,
                         max_subset_size=max_subset_size)
    finish_time = time.time()
    print('Solver initialization --> ', finish_time - stage_start_time)
    precalculated_inter_dataset_distances = solver.get_inter_dataset_distance()
    write_pickle(precalculated_inter_dataset_distances,
                 target_file_name,
                 file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
# Example 5
def main(argv):
    start_time = time.time()
    
    # Config
    # Both files should be in the root directory of the project.
    word2vec_model_name = 'model.pickle'
    model_pickle_file_name = 'word2vec_model.pickle'
    query_file_name = 'query.pickle'
    data_file_name = 'dataset.pickle'
    file_allow_overwrite = True

    # Code - you shouldn't have to make any changes to this
    model = load_word2vec_model(word2vec_model_name)
    query = load_pickle(query_file_name)
    data = load_pickle(data_file_name)
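    # Assumption: calculate_model_subset keeps only the vectors for keywords
    # that occur in the query and dataset, which is what shrinks the model.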
    shrunk_model = calculate_model_subset(query, data, model)
    write_pickle(shrunk_model, model_pickle_file_name, file_allow_overwrite=file_allow_overwrite)

    print("--- %s seconds ---" % (time.time() - start_time))
# Example 6
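    # Round-trip test: write_pickle followed by load_pickle should preserve
    # coordinates and keywords; the test file is removed afterwards.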
    def test_write_and_read_data(self):
        kwc1 = KeywordCoordinate(1, 1, ['1'])
        kwc2 = KeywordCoordinate(2, 2, ['2'])
        kwc3 = KeywordCoordinate(3, 3, ['3'])
        data = [kwc1, kwc2, kwc3]
        file_name = 'test/test.pickle'
        write_pickle(data, file_name, True)
        loaded_result = load_pickle(file_name)
        self.assertEqual(len(loaded_result), 3)
        for index in range(len(loaded_result)):
            self.assertAlmostEqual(loaded_result[index].coordinates.x,
                                   data[index].coordinates.x)
            self.assertAlmostEqual(loaded_result[index].coordinates.y,
                                   data[index].coordinates.y)
            self.assertListEqual(loaded_result[index].keywords,
                                 data[index].keywords)
        os.remove(
            os.path.abspath(
                os.path.dirname(os.path.abspath(__file__)) + '/../../../' +
                file_name))
# Example 7
    def generate_pickle(self,
                        data_size: int,
                        file_name: str,
                        file_allow_overwrite: bool = False,
                        file_only_overwrite_dot_pickle_files: bool = True,
                        pickle_protocol_version: int = 4) -> dataset_type:
        """
        Generates a new dataset, writes it to disk as a pickle and returns the generated dataset.
        :param data_size: The size of the dataset to generate
        :param file_name: The name of the file
        :param file_allow_overwrite: If files are allowed to be overwritten
        :param file_only_overwrite_dot_pickle_files: If the name of the file has to end with .pickle
        :param pickle_protocol_version: The protocol version of the pickle format
        :return: The generated data which has been written to disk
        """
        logger = logging.getLogger(__name__)
        logger.debug('generating dataset of size {}'.format(data_size))
        data = self.generate(data_size)
        logger.debug('generated dataset {}'.format(
            dataset_comprehension(data)))
        write_pickle(data, file_name, file_allow_overwrite,
                     file_only_overwrite_dot_pickle_files,
                     pickle_protocol_version)
        return data
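
    # Usage sketch (hypothetical values; DataGenerator is constructed with a
    # keyword list, as in the generator example at the end of this section):
    #
    #     dg = DataGenerator(['food', 'culture'])
    #     data = dg.generate_pickle(100, 'dataset.pickle',
    #                               file_allow_overwrite=True)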
# Example 8
    word2vec_model_name = 'model.pickle'
    word2vec_model = load_word2vec_model(word2vec_model_name)
        
    for i in range(13, 79):
        print('++++++++++++++++++')
        print('Query --> ', i)
        print('++++++++++++++++++')
        
        iteration_start_time = time.time()
# Example 9
import os

#import word2vec
#from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import sys
sys.path.append("..")
from src.utils.data_handler import write_pickle

if __name__ == '__main__':

    # Config
    # Both files should be in the root directory of the project.
    #word2vec_model_name = 'model.bin'
    word2vec_model_name = 'model_test2.bin'
    model_pickle_file_name = 'word2vec_model.pickle'
    word2vec_model_path = os.path.abspath(
        os.path.abspath(os.path.dirname(__file__)) + '/../files/' +
        word2vec_model_name)

    # Code - you shouldn't have to make any changes to this
    # gensim 3.x API: KeyedVectors exposes .vocab and .get_vector directly.
    keyed_vectors = KeyedVectors.load(word2vec_model_path, mmap='r')
    print(len(keyed_vectors.vocab))
    result_dict = dict()
    for word in keyed_vectors.vocab:
        result_dict[word] = keyed_vectors.get_vector(word)

    write_pickle(result_dict, model_pickle_file_name)
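
# Sketch of how the pickled {word: vector} dict might be consumed downstream
# (hypothetical helper; the project's actual similarity code is not shown here):
import numpy as np

def cosine_similarity(vector_a, vector_b):
    # Standard cosine similarity between two dense word vectors.
    return float(np.dot(vector_a, vector_b) /
                 (np.linalg.norm(vector_a) * np.linalg.norm(vector_b)))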
# Example 10
from src.utils.data_generator import DataGenerator
from src.utils.data_handler import write_pickle

if __name__ == '__main__':
    # Config
    data_target_name = 'query.pickle'
    possible_keywords = [
        'family', 'food', 'outdoor', 'rest', 'indoor', 'sports', 'science',
        'culture', 'history'
    ]
    file_allow_overwrite = False

    # Code
    dg = DataGenerator(possible_keywords)
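    # generate(1) yields a dataset with a single entry, which serves as the
    # query point (written to query.pickle below).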
    generated_query = dg.generate(1)
    write_pickle(generated_query,
                 file_name=data_target_name,
                 file_allow_overwrite=file_allow_overwrite)