Code example #1
0
 def test_find_subsets4(self):
     """find_subsets must yield exactly one subset of size 4 from a 4-element superset."""
     points = [
         KeywordCoordinate(n, n, [str(n)]) for n in range(4)
     ]
     found = mt.find_subsets(points, 4)
     self.assertEqual(len(found), 1)
     for candidate in found:
         self.assertEqual(len(candidate), 4)
Code example #2
0
    def get_all_subsets(self, data):
        """
        Calculate all possible subsets of the given data that pass the
        semantic filter, honoring the configured maximum subset size
        (``self.max_subset_size``).

        A subset is kept only if, for every query keyword, at least one POI
        in the subset reaches a similarity of ``self.SEMANTIC_THRESHOLD``
        (similarities come from ``semantic_similarity`` over
        ``self.df_poi_queries_similarities``).

        Side effects: updates
        ``self.number_of_subsets_after_semantic_filtering`` and
        ``self.time_4_build_subsets``; prints progress to stdout.

        :param data: The data (iterable of POIs with a ``name`` attribute)
        :return: A list of all subsets that pass the semantic filter
        """
        start_time = time.time()
        max_length = min(len(data), self.max_subset_size)
        list_of_subsets = []
        # Hoisted out of the loops: invariant for the whole call.
        num_keywords = len(self.query.keywords)

        # `size` replaces the redundant counter `j` (it always equaled
        # index + 1 in the original).
        for size in range(1, max_length + 1):
            print('***** Longitud ', size)
            for subset in find_subsets(data, size):
                # Best similarity achieved by any POI in this subset, per
                # query keyword.  (Renamed from `max`, which shadowed the
                # builtin.)
                best_sims = [0] * num_keywords

                for poi in subset:
                    sims = semantic_similarity(
                        self.query.keywords, poi.name,
                        self.df_poi_queries_similarities)
                    for i in range(num_keywords):
                        if sims[i] > best_sims[i]:
                            best_sims[i] = sims[i]

                # Keep the subset only if every keyword is covered at or
                # above the threshold (replaces the manual boolean fold).
                if all(s >= self.SEMANTIC_THRESHOLD for s in best_sims):
                    list_of_subsets.append(subset)

            self.number_of_subsets_after_semantic_filtering = len(
                list_of_subsets)
            print('# subsets: ', len(list_of_subsets))

        finish_time = time.time()
        self.time_4_build_subsets = finish_time - start_time

        return list_of_subsets