Code Example #1
    def __init__(self,
                 query: KeywordCoordinate,
                 data: dataset_type,
                 cost_function: CostFunction,
                 normalize: bool = True,
                 result_length: int = 10,
                 max_subset_size: int = math.inf,
                 max_number_of_concurrent_processes: int = 5,
                 rebalance_subsets: bool = True,
                 _map: str = 'London',
                 update: bool = True):
        """
        Constructs a new NaiveSolver object.
        :param query: The query to solve for
        :param data: The dataset to solve for
        :param cost_function: The cost function used to determine subset costs
        :param normalize: Whether the data should be normalized before being processed. The data is denormalized before being returned.
        :param result_length: The size of the results (top-N)
        :param max_subset_size: The maximum size of any subset used to calculate the solution
        :param max_number_of_concurrent_processes: The maximum number of processes used to calculate the solution
        :param rebalance_subsets: Whether the passed subsets should be rearranged to better distribute the workload among the processes
        :param _map: The name of the map the dataset belongs to
        :param update: Whether the dataset should be reduced to promising POIs before solving
        """
        logger = logging.getLogger(__name__)
        logger.debug(
            'creating with query {}, data {}, cost function {}, normalization {} and result length {}'
            .format(query, dataset_comprehension(data), cost_function,
                    normalize, result_length))
        super().__init__(query,
                         data,
                         cost_function,
                         normalize,
                         result_length,
                         max_subset_size,
                         max_number_of_concurrent_processes,
                         rebalance_subsets,
                         _map=_map,
                         update_dataset=update)

        start_time = time.time()
        # A query keyword of '0' marks a distance-precalculation run, which skips
        # the input preprocessing.
        if self.query.keywords[0] != '0':
            self.list_of_subsets, self.normalised_query = self.preprocess_input()
        finish_time = time.time()
        print('NaiveSolver initialization - preprocess_input() --> ',
              finish_time - start_time)

        logger.debug(
            'created with query {}, data {}, cost function {}, normalization {} and result length {}'
            .format(self.query, dataset_comprehension(self.data),
                    self.cost_function, self.normalize_data,
                    self.result_length))
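A hypothetical instantiation of this solver; my_query, my_dataset and cost_fn are placeholders for a KeywordCoordinate, a dataset_type list and a CostFunction subclass such as Type3 (a usage sketch, not code from the project):

solver = NaiveSolver(query=my_query,
                     data=my_dataset,
                     cost_function=cost_fn,
                     result_length=5,
                     max_subset_size=3)
results = solver.solve()  # a list of (cost, subset) tuples, see solve() below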
Code Example #2
 def get_minimum_for_query(self, query: KeywordCoordinate, dataset: dataset_type) -> float:
     """
     Calculates the minimum query-dataset distance cost.
     :param query: The query
     :param dataset: The dataset
     :return: Minimum query-dataset distance cost
     """
     logger = logging.getLogger(__name__)
     logger.debug(
         'finding minimum distance for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))
     if self.precalculated_query_dataset_dict is not None:
         # logger.debug('querying precalculated set')
         precalculated_result = self.precalculated_query_dataset_dict.get(frozenset(dataset))
         if precalculated_result is not None:
             # logger.debug('found precalculated value {}'.format(precalculated_result))
             return precalculated_result
         else:
             logger.debug(
                 'could not find the minimum precalculated query-dataset value in the given precalculated set. This suggests an erroneous or a wrong dict has been passed into the CostFunction.')
     else:
         logger.debug('No precalculated query-dataset dict found')
      current_minimum = math.inf
      for element in dataset:
          current_value = self.distance_metric(query.coordinates, element.coordinates)
          if current_value < current_minimum:
              current_minimum = current_value
     logger.debug('found minimum distance for query and dataset of {}'.format(current_minimum))
     return current_minimum
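The brute-force fallback at the end is equivalent to a single min() over the dataset; a minimal sketch, assuming a non-empty dataset:

current_minimum = min(self.distance_metric(query.coordinates, element.coordinates)
                      for element in dataset)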
Code Example #3
 def get_minimum_for_dataset(self, dataset: dataset_type, denormalized_dataset: dataset_type = None) -> float:
     """
     Calculates the minimum inter-dataset distance cost.
     :param dataset: The dataset.
      :param denormalized_dataset: The denormalized dataset. This is used to match precalculated values.
     :return: Minimum inter-dataset distance cost.
     """
     logger = logging.getLogger(__name__)
     logger.debug('finding minimum distance for dataset {}'.format(dataset_comprehension(dataset)))
     if self.precalculated_inter_dataset_dict is not None:
         logger.debug('querying precalculated set')
         if denormalized_dataset is not None:
             dataset_key = denormalized_dataset
         else:
             dataset_key = dataset
         precalculated_result = self.precalculated_inter_dataset_dict.get(frozenset(dataset_key))
         if precalculated_result is not None:
             logger.debug('found precalculated value {}'.format(precalculated_result))
             return precalculated_result
         else:
             logger.debug('could not find the minimum precalculated inter-dataset value in the given precalculated set. This suggests an erroneous or a wrong dict has been passed into the CostFunction.')
      else:
          logger.debug('no precalculated inter-dataset dict found')
      if len(dataset) <= 1:
          logger.debug('dataset of size <= 1: returning inter-dataset distance of 0.0')
          return 0.0
      current_minimum: float = math.inf
      for index1 in range(len(dataset)):
          for index2 in range(index1 + 1, len(dataset)):
              current_value = self.distance_metric(dataset[index1].coordinates,
                                                   dataset[index2].coordinates)
             if current_value < current_minimum:
                 current_minimum = current_value
     logger.debug('found minimum distance for dataset of {}'.format(current_minimum))
     return current_minimum
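The pairwise loop visits each unordered pair of elements exactly once; an equivalent sketch using itertools.combinations, where default=0.0 plays the role of the size-one early return:

import itertools

current_minimum = min((self.distance_metric(a.coordinates, b.coordinates)
                       for a, b in itertools.combinations(dataset, 2)),
                      default=0.0)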
Code Example #4
 def generate(self, data_size: int) -> dataset_type:
     """
     Generates a dataset with a given size.
     :param data_size: The size of the dataset
     :return: The dataset
     """
     logger = logging.getLogger(__name__)
     logger.debug('generating dataset of size {}'.format(data_size))
     dataset: dataset_type = []
     for data_counter in range(data_size):
         possible_keywords_copy = self.possible_keywords.copy()
         current_keywords: keyword_dataset_type = []
         current_x = random.randint(self.physical_min_x,
                                    self.physical_max_x)
         current_y = random.randint(self.physical_min_y,
                                    self.physical_max_y)
         number_of_keywords = random.randint(self.keywords_min,
                                             self.keywords_max)
         for kw_counter in range(number_of_keywords):
             try:
                 current_keyword = random.choice(possible_keywords_copy)
             except IndexError:
                 break
             possible_keywords_copy.remove(current_keyword)
             current_keywords.append(current_keyword)
         new_entry = KeywordCoordinate(current_x, current_y,
                                       current_keywords)
         dataset.append(new_entry)
     logger.debug('generated dataset {}'.format(
         dataset_comprehension(dataset)))
     return dataset
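A standalone sketch of the same sampling logic, with illustrative bounds and keyword pool (none of these values come from the project); random.sample draws keywords without replacement, matching the choice-and-remove loop above:

import random

possible_keywords = ['food', 'park', 'museum']
physical_min_x, physical_max_x = 0, 100
physical_min_y, physical_max_y = 0, 100
keywords_min, keywords_max = 1, 3

def generate_point():
    # Clamp the draw so we never request more keywords than the pool holds.
    n = random.randint(keywords_min, min(keywords_max, len(possible_keywords)))
    keywords = random.sample(possible_keywords, n)
    x = random.randint(physical_min_x, physical_max_x)
    y = random.randint(physical_min_y, physical_max_y)
    return (x, y, keywords)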
Code Example #5
 def get_maximum_keyword_distance(self, query: KeywordCoordinate, dataset: dataset_type) -> float:
     """
     Calculates the maximum keyword distance.
     :param query: The query
     :param dataset: The dataset
     :return: Maximum distance between the keywords
     """
     logger = logging.getLogger(__name__)
     logger.debug(
         'finding maximum similarity for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))
     if self.precalculated_keyword_similarity_dict is not None:
         # logger.debug('querying precalculated set')
         precalculated_result = self.precalculated_keyword_similarity_dict.get(frozenset(dataset))
         if precalculated_result is not None:
             # logger.debug('found precalculated value {}'.format(precalculated_result))
             return precalculated_result
         else:
             logger.debug(
                 'could not find the maximum precalculated keyword similarity value in the given precalculated set. This suggests an erroneous or a wrong dict has been passed into the CostFunction.')
     else:
         logger.debug('No precalculated keyword-similarity dict found')
     current_maximum = 0
     combination = False
     latentfactors = False
     
     if self.similarity_metric.__name__ == 'combined_cosine_similarity':
         combined_keyword_vector: keyword_dataset_type = create_combined_keyword_vector(query, dataset)
         combination = True
     elif self.similarity_metric.__name__ == 'word2vec_cosine_similarity':
         latentfactors = True
     for element in dataset:
         if combination:
             current_value = self.similarity_metric(query.keywords, element.keywords, combined_keyword_vector)
          elif latentfactors:
              # Average the per-keyword similarity cost of the query against
              # the element's keywords using the word2vec model.
              results = []
              for kw_inquery in query.keywords:
                  results.append(self.similarity_metric([kw_inquery], element.keywords, self.model))
              current_value = mean(results)
         else:
             current_value = self.similarity_metric(query.keywords, element.keywords)
         if current_value > current_maximum:
             current_maximum = current_value
      logger.debug('found maximum similarity cost for query and dataset of {}'.format(current_maximum))
      return current_maximum
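The word2vec branch averages a per-keyword cost rather than comparing whole keyword lists at once; a minimal sketch of that pattern with a stand-in similarity function (sim is an assumption, not the project's metric):

from statistics import mean

def keyword_cost(query_keywords, element_keywords, sim):
    # sim takes two keyword lists and returns a float cost,
    # e.g. 1 - cosine similarity of their embeddings.
    return mean(sim([kw], element_keywords) for kw in query_keywords)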
Code Example #6
    def solve(self) -> solution_list:
        """
        Implements the solution algorithm.
        :return: A list with tuples. Every tuple contains a cost and the corresponding subset of KeywordCoordinates.
        """
        start_time = time.time()
        logger = logging.getLogger(__name__)
        logger.info(
            'solving for query {} and dataset {} using cost function {} and result length {}'
            .format(self.query, dataset_comprehension(self.data),
                    self.cost_function, self.result_length))

        result_list: solution_list = []

        # UNCOMMENT FOR MULTIPROCESSING (DOES NOT WORK IN WINDOWS 10)
        # with concurrent.futures.ProcessPoolExecutor(
        #         max_workers=self.max_number_of_concurrent_processes) as executor:
        #     future_list = []
        #     for subsets in list_of_split_subsets:
        #         future = executor.submit(self.get_cost_for_subset, query, subsets)
        #         future_list.append(future)
        #     for future in future_list:
        #         for solution in future.result():
        #             result_list.append(solution)

        # ONE PROCESSOR VERSION

        for subset in self.list_of_subsets:  # use list_of_split_subsets if multiprocessing is enabled
            solution, query_distance, inter_distance, semantic_distance = self.get_cost_for_subset(
                self.normalised_query, subset)
            self.query_distance_list.append(query_distance)
            self.inter_distance_list.append(inter_distance)
            self.semantic_distance_list.append(semantic_distance)
            self.cost_list.append(solution)
            result_list.append((solution, subset))

        # MULTIPROCESSOR VERSION
        # for future in future_list:
        #     for solution in future.result():
        #         result_list.append(solution)
        #########################

        result_list.sort(key=lambda x: x[0])
        result_list = result_list[:self.result_length]
        denormalized_result_list = denormalize_result_data(
            result_list, self.denormalize_max_x, self.denormalize_min_x,
            self.denormalize_max_y, self.denormalize_min_y)
        logger.info('solved for {} with length {}'.format(
            result_list_comprehension(denormalized_result_list),
            self.result_length))
        finish_time = time.time()
        self.total_time = finish_time - start_time
        return denormalized_result_list
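As a side note, the sort-and-slice top-N step could equivalently use heapq.nsmallest, which avoids fully sorting large candidate lists; a sketch of the drop-in replacement for those two lines:

import heapq

result_list = heapq.nsmallest(self.result_length, result_list, key=lambda x: x[0])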
Code Example #7
def normalize_data(
    query: KeywordCoordinate, dataset: dataset_type
) -> typing.Tuple[KeywordCoordinate, typing.List[KeywordCoordinate], float,
                  float, float, float]:
    """
    Calculates the normalized query, dataset and parameters to undo this normalization.
    :param query: The query
    :param dataset: The dataset
    :return: A tuple containing: the normalized query, the normalized dataset, and the denormalization parameters max_x, min_x, max_y and min_y.
    """
    logger = logging.getLogger(__name__ + '.normalize_data')
    logger.debug('calculation for query {} and dataset {}'.format(
        query, dataset_comprehension(dataset)))
    data = copy.deepcopy(dataset)

    # Change by Ramon (2020-09-03): append a deep copy of the query so it is
    # normalized together with the dataset (data contains only
    # KeywordCoordinates).
    data.append(copy.deepcopy(query))
    list_of_x = []
    list_of_y = []

    for kwc in data:
        list_of_x.append(kwc.coordinates.x)
        list_of_y.append(kwc.coordinates.y)
    min_x = min(list_of_x)
    min_y = min(list_of_y)
    max_x = max(list_of_x)
    max_y = max(list_of_y)
    for index in range(len(data)):
        new_x = (data[index].coordinates.x - min_x) / (max_x - min_x)
        new_y = (data[index].coordinates.y - min_y) / (max_y - min_y)
        data[index].coordinates.x = new_x
        data[index].coordinates.y = new_y
    logger.debug('calculated query {} and dataset {}'.format(
        data[-1], dataset_comprehension(data[:-1])))
    return (data[-1], data[:-1], max_x, min_x, max_y, min_y)
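A tiny worked example of the min-max scaling above, using plain coordinate tuples instead of KeywordCoordinate objects (illustration only):

points = [(0.0, 10.0), (5.0, 20.0), (10.0, 30.0)]
xs = [p[0] for p in points]
ys = [p[1] for p in points]
min_x, max_x = min(xs), max(xs)
min_y, max_y = min(ys), max(ys)
normalized = [((x - min_x) / (max_x - min_x), (y - min_y) / (max_y - min_y))
              for x, y in points]
# normalized == [(0.0, 0.0), (0.5, 0.5), (1.0, 1.0)]
# Denormalizing inverts the mapping: x = nx * (max_x - min_x) + min_x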
Code Example #8
def find_subsets(input_set: dataset_type, subset_size: int):
    """
    Calculates all the subsets of a given size from an input dataset.
    :param input_set: The input dataset
    :param subset_size: The subset size
    :return: A set of all the subsets
    """
    logger = logging.getLogger(__name__ + '.find_subsets')
    logger.debug('finding all subsets of length {} in set {}'.format(subset_size, dataset_comprehension(input_set)))
    if subset_size > len(input_set):
        # Requested size exceeds the dataset: fall back to the single empty
        # subset (combinations of size 0).
        solution = set(itertools.combinations(input_set, 0))
    else:
        solution = set(itertools.combinations(input_set, subset_size))
    logger.debug('found {}'.format(sets_of_set_comprehension(solution)))
    return solution
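A hypothetical call, using plain strings in place of KeywordCoordinates for brevity:

subsets = find_subsets(['a', 'b', 'c'], 2)
# subsets == {('a', 'b'), ('a', 'c'), ('b', 'c')}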
Code Example #9
def create_combined_keyword_vector(query: KeywordCoordinate, dataset: dataset_type) -> keyword_dataset_type:
    """
    Creates a combined keyword vector of the query and the dataset. This vector contains all the keywords that appear in either the query or dataset.
    :param query: The query
    :param dataset: The dataset
    :return: A list with all the unique keywords in the query and dataset
    """
    logger = logging.getLogger(__name__ + '.create_combined_keyword_vector')
    logger.debug('calculating for query {} and dataset {}'.format(query, dataset_comprehension(dataset)))
    result_keyword_list: keyword_dataset_type = []
    for string in query.keywords:
        result_keyword_list.append(string)
    for kwc in dataset:
        for string in kwc.keywords:
            result_keyword_list.append(string)
    result = list(set(result_keyword_list))
    return result
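The two loops plus the final set() amount to a set union; an equivalent one-line sketch (the order of the resulting list is unspecified in both versions):

result = list(set(query.keywords).union(*(kwc.keywords for kwc in dataset)))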
Code Example #10
 def solve(self,
           query: KeywordCoordinate,
           dataset: dataset_type,
           denormalized_dataset: dataset_type = None):
     """
     Solves the Type4 cost function.
     :param query: The query
     :param dataset: The dataset
      :param denormalized_dataset: The denormalized dataset. This is used to match precalculated values.
      :return: The cost and its components (query distance, inter-dataset distance, keyword similarity) for the given query and dataset
     """
     logger = logging.getLogger(__name__)
     logger.debug('solving for query {} and dataset {}'.format(
         query, dataset_comprehension(dataset)))
     # TODO does this type of threshold filtering make sense for the unified function?
     query_distance = self.get_maximum_for_query(query, dataset)
     logger.debug('solved query distance for {}'.format(query_distance))
     if denormalized_dataset is not None:
         dataset_distance = self.get_maximum_for_dataset(
             dataset, denormalized_dataset)
     else:
         dataset_distance = self.get_maximum_for_dataset(dataset)
     logger.debug('solved dataset distance for {}'.format(dataset_distance))
     keyword_similarity = self.get_maximum_keyword_distance(query, dataset)
     logger.debug(
         'solved keyword similarity for {}'.format(keyword_similarity))
     # if (not self.disable_thresholds and (
     #         query_distance > self.query_distance_threshold or dataset_distance > self.dataset_distance_threshold or keyword_similarity > self.keyword_similarity_threshold)):
     #     logger.debug(
     #         'One of the thresholds was not met. Query threshold: {}, dataset threshold: {}, keyword threshold {}'.format(
     #             self.query_distance_threshold, self.dataset_distance_threshold, self.keyword_similarity_threshold))
     #     return math.inf, 0, 0, 0
     # else:
      # Unified cost: a phi_1-norm of the query-to-element distances plus the
      # scaled inter-dataset and keyword-similarity terms, each raised to phi_2.
      a: float = 0.0
     for element in dataset:
         a += self.distance_metric(query.coordinates,
                                   element.coordinates)**self.phi_1
     a = a**(1 / self.phi_1)
     a = (self.alpha * a)**self.phi_2
     b: float = (self.beta * dataset_distance)**self.phi_2
     c: float = ((self.omega *
                  keyword_similarity)**self.phi_2)**(1 / self.phi_2)
     solution = a + b + c
     logger.debug('solved with a cost of {}'.format(solution))
     return solution, query_distance, dataset_distance, keyword_similarity
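A standalone restatement of the arithmetic above, assuming non-negative inputs (function and parameter names are illustrative, not the project's API):

def unified_cost(query_element_distances, inter_distance, keyword_similarity,
                 alpha, beta, omega, phi_1, phi_2):
    # phi_1-norm of the query-to-element distances, scaled and raised to phi_2.
    a = sum(d ** phi_1 for d in query_element_distances) ** (1 / phi_1)
    a = (alpha * a) ** phi_2
    b = (beta * inter_distance) ** phi_2
    # ((x ** phi_2) ** (1 / phi_2)) == x for x >= 0, so this term reduces to
    # omega * keyword_similarity, mirroring the original code.
    c = ((omega * keyword_similarity) ** phi_2) ** (1 / phi_2)
    return a + b + c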
Code Example #11
File: type3.py  Project: rhermosoUZ/Re-CoSKQ
    def solve(self,
              query: KeywordCoordinate,
              dataset: dataset_type,
              denormalized_dataset: dataset_type = None):
        """
        Solves the Type3 cost function.
        :param query: The query
        :param dataset: The dataset
        :param denormalized_dataset: The denormalized dataset. This is used to match precalculated values.
        :return: The cost and its components (query distance, inter-dataset distance, keyword similarity) for the given query and dataset
        """
        logger = logging.getLogger(__name__)
        logger.debug('solving for query {} and dataset {}'.format(
            query, dataset_comprehension(dataset)))
        query_distance = self.get_minimum_for_query(query, dataset)
        logger.debug('solved query distance for {}'.format(query_distance))
        if denormalized_dataset is not None:
            dataset_distance = self.get_maximum_for_dataset(
                dataset, denormalized_dataset)
        else:
            dataset_distance = self.get_maximum_for_dataset(dataset)
        logger.debug('solved dataset distance for {}'.format(dataset_distance))
        keyword_similarity = self.get_maximum_keyword_distance(query, dataset)
        logger.debug(
            'solved keyword similarity for {}'.format(keyword_similarity))


        # if (not self.disable_thresholds and (
        #         query_distance > self.query_distance_threshold or dataset_distance > self.dataset_distance_threshold or keyword_similarity > self.keyword_similarity_threshold)):
        #     logger.debug(
        #         'One of the thresholds was not met. Query threshold: {}, dataset threshold: {}, keyword threshold {}'.format(
        #             self.query_distance_threshold, self.dataset_distance_threshold, self.keyword_similarity_threshold))
        #     return math.inf, 0, 0, 0
        # else:
        solution = self.alpha * query_distance + self.beta * dataset_distance + self.omega * keyword_similarity
        logger.debug('solved with a cost of {}'.format(solution))
        return solution, query_distance, dataset_distance, keyword_similarity
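The Type3 cost is a plain weighted sum of the three components; a worked example with illustrative weights and distances:

alpha, beta, omega = 0.3, 0.3, 0.4
query_distance, dataset_distance, keyword_similarity = 0.2, 0.5, 0.1
cost = alpha * query_distance + beta * dataset_distance + omega * keyword_similarity
# cost == 0.06 + 0.15 + 0.04 == 0.25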
Code Example #12
 def get_maximum_for_dataset(self, dataset: dataset_type, denormalized_dataset: dataset_type = None) -> float:
     """
     Calculates the maximum inter-dataset distance cost.
     :param dataset: The dataset.
      :param denormalized_dataset: The denormalized dataset. This is used to match precalculated values.
     :return: Maximum inter-dataset distance cost.
     """
     logger = logging.getLogger(__name__)
     logger.debug('finding maximum distance for dataset {}'.format(dataset_comprehension(dataset)))
      # The precalculated inter-dataset lookup is disabled in this variant;
      # see get_minimum_for_dataset for the equivalent (active) lookup logic.
     current_maximum: float = 0.0
      for index1 in range(len(dataset)):
          for index2 in range(index1 + 1, len(dataset)):
              current_value = self.distance_metric(dataset[index1].coordinates,
                                                   dataset[index2].coordinates)
             if current_value > current_maximum:
                 current_maximum = current_value
     logger.debug('found maximum distance for dataset of {}'.format(current_maximum))
     return current_maximum
Code Example #13
 def generate_pickle(self,
                     data_size: int,
                     file_name: str,
                     file_allow_overwrite: bool = False,
                     file_only_overwrite_dot_pickle_files: bool = True,
                     pickle_protocol_version: int = 4) -> dataset_type:
     """
     Generates a new dataset, writes it as pickle and returns the generated dataset.
      :param data_size: The size of the dataset
     :param file_name: The name of the file
     :param file_allow_overwrite: If files are allowed to be overwritten
     :param file_only_overwrite_dot_pickle_files: If the name of the file has to end with .pickle
     :param pickle_protocol_version: The protocol version of the pickle format
     :return: The generated data which has been written to disk
     """
     logger = logging.getLogger(__name__)
     logger.debug('generating dataset of size {}'.format(data_size))
     data = self.generate(data_size)
     logger.debug('generated dataset {}'.format(
         dataset_comprehension(data)))
     write_pickle(data, file_name, file_allow_overwrite,
                  file_only_overwrite_dot_pickle_files,
                  pickle_protocol_version)
     return data
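A hypothetical call, assuming an already-configured generator instance (the file name is illustrative):

data = generator.generate_pickle(100, 'dataset.pickle', file_allow_overwrite=True)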
Code Example #14
 def __str__(self):
     return '{}(query: {}, dataset: {}, cost function: {}, result length {})'.format(
         type(self).__name__, self.query, dataset_comprehension(self.data),
         self.cost_function, self.result_length)
Code Example #15
    def __init__(self,
                 query: KeywordCoordinate,
                 data: dataset_type,
                 cost_function: CostFunction,
                 normalize: bool = True,
                 result_length: int = 10,
                 max_subset_size: int = math.inf,
                 max_number_of_concurrent_processes: int = mp.cpu_count(),
                 rebalance_subsets: bool = True,
                 RADIUS: float = 2000.0,
                 semantic_filtering: bool = True,
                 _map: str = 'London',
                 update_dataset: bool = True):
        """
        Constructs a new Solver object. The Solver class should never be directly instantiated. Instead use a class that inherits from the Solver class and implements the solve() method.
        :param query: The query to solve for
        :param data: The dataset to solve for
        :param cost_function: The cost function used to determine subset costs
        :param normalize: Whether the data should be normalized before being processed. The data is denormalized before being returned.
        :param result_length: The size of the results (top-N)
        :param max_subset_size: The maximum size of any subset used to calculate the solution
        :param max_number_of_concurrent_processes: The maximum number of processes used to calculate the solution
        :param rebalance_subsets: Whether the passed subsets should be rearranged to better distribute the workload among the processes
        :param RADIUS: The radius used for radius filtering of subsets
        :param semantic_filtering: Whether subsets should additionally be filtered by semantic similarity
        :param _map: The name of the map the dataset belongs to
        :param update_dataset: Whether the dataset should be reduced to promising POIs before solving
        """
        self.query: KeywordCoordinate = query
        self.data: dataset_type = data
        self.cost_function: CostFunction = cost_function
        self.result_length = result_length
        self.normalize_data = normalize
        self.denormalize_max_x: float = 0.0
        self.denormalize_min_x: float = 0.0
        self.denormalize_max_y: float = 0.0
        self.denormalize_min_y: float = 0.0
        self.max_subset_size = max_subset_size
        self.max_number_of_concurrent_processes = max_number_of_concurrent_processes
        self.rebalance_subsets = rebalance_subsets
        self.RADIUS: float = RADIUS
        self.semantic_filtering = semantic_filtering
        self.SEMANTIC_THRESHOLD: float = 0.5

        self.map = _map
        # Binomial coefficient C(len(data), max_subset_size). This appears to
        # assume max_subset_size is an integer no larger than len(data); the
        # default of math.inf would raise in math.factorial.
        self.max_number_of_subsets = math.factorial(len(
            self.data)) / (math.factorial(max_subset_size) *
                           math.factorial(len(self.data) - max_subset_size))
        self.number_of_subsets_after_radius_filtering: int = 0
        self.number_of_subsets_after_semantic_filtering: int = 0
        self.cost_list = []
        self.query_distance_list = []
        self.inter_distance_list = []
        self.semantic_distance_list = []
        # self.query_distance: float = 0.0
        # self.intra_distance: float = 0.0
        # self.semantic_distance: float = 0.0
        self.time_4_radius_filtering: float = 0.0
        self.time_4_build_subsets: float = 0.0
        self.total_time: float = 0.0

        self.df_poi_queries_similarities = pd.read_csv(
            os.path.dirname(os.path.abspath(__file__)) + '/../../files/' +
            'poi_queries_similarities.csv',
            index_col='poi_name',
            encoding='utf-8')
        if update_dataset:
            self.data = self.get_promising_poi_data(
                self.data, self.df_poi_queries_similarities)

        logging.getLogger(__name__).debug(
            'created with query {}, data {}, cost function {}, normalization {} and result length {}'
            .format(self.query, dataset_comprehension(self.data),
                    self.cost_function, self.normalize_data,
                    self.result_length))
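On Python 3.8+, the subset count computed above is available directly as an integer via math.comb; a sketch of the equivalence:

import math

n, k = 20, 3
assert math.comb(n, k) == math.factorial(n) // (math.factorial(k) * math.factorial(n - k))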