def __init__(self,
                 dictionary_loader: DictionaryLoader,
                 language="en",
                 filters: List[AnnotationFilter] = None):
        """
        Set up the normalization regex, the label indexes, the trie and the
        spaCy pipeline used by this recognizer.

        Parameters
        ----------
            dictionary_loader: DictionaryLoader
                The dictionary loader that will provide the dictionary contents
            language: str
                The language of the text that will be processed; selects which
                spaCy model is loaded. Only ``"en"`` and ``"fr"`` are handled.
            filters: List[AnnotationFilter]
                A list of filters to apply post recognition

        Raises
        ------
            ValueError
                If ``language`` is neither ``"en"`` nor ``"fr"``.
        """
        super().__init__(dictionary_loader, language=language, filters=filters)
        # Runs of "other"/control (C), mark (M), punctuation (P), symbol (S)
        # and separator (Z) characters are matched as a single unit.
        self.punctuation_remove = regex.compile(
            r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
        self.label_concept_index = {}
        self.label_token_counts = {}
        self.label_lengths = {}
        self.trie = Trie()

        # The model packages are imported lazily so that only the model for
        # the requested language has to be installed.
        if language == 'en':
            import en_core_web_md
            self.spacy = en_core_web_md.load()
        elif language == 'fr':
            # spaCy publishes its French pipelines under the "news" genre
            # (fr_core_news_*); a "fr_core_web_md" package does not exist.
            import fr_core_news_md
            self.spacy = fr_core_news_md.load()
        else:
            raise ValueError(f"Unsupported language: {language}")
Esempio n. 2
0
 def test_word_add_all_set(self):
     # Bulk-insert from a set, then confirm membership of each word and the
     # resulting word count.
     trie = self.trie = Trie()
     trie.add_all({'axe', 'kick'})  # set input
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     for word in ('axe', 'kick'):
         self.assertTrue(word in trie, msg="Word should be in trie")
     self.assertEqual(2, trie.get_word_count(), msg="Word count not equal")
Esempio n. 3
0
 def test_word_add_all_with_number(self):
     # The input tuple mixes two strings with one integer; the assertions
     # expect only the two strings to be counted as words.
     trie = self.trie = Trie()
     mixed_input = ('axe', 'kick', 3)  # tuple with one integer
     trie.add_all(mixed_input)
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     for word in ('axe', 'kick'):
         self.assertTrue(word in trie, msg="Word should be in trie")
     self.assertEqual(2, trie.get_word_count(), msg="Word count not equal")
Esempio n. 4
0
 def test_trie_node_count(self):
     # Two words sharing the prefix 'ash': pins the word count and the value
     # of len(), which the assertion message labels as the number of nodes.
     trie = self.trie = Trie()
     trie.add_all(['ash', 'ashley'])
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     for word in ('ash', 'ashley'):
         self.assertTrue(word in trie, msg="Word should be in trie")
     self.assertEqual(2, trie.get_word_count(), msg="Word count not equal")
     self.assertEqual(7, len(trie), msg="Number of nodes")
Esempio n. 5
0
 def test_trie_wildcard_exception(self):
     # A pattern made of unsupported characters must make search() raise
     # InvalidWildCardExpressionError.
     trie = self.trie = Trie()
     trie.add_all(['ab', 'as', 'ash', 'ashley'])
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     for word in ('ash', 'ashley'):
         self.assertTrue(word in trie, msg="Word should be in trie")
     with self.assertRaises(InvalidWildCardExpressionError):
         trie.search('#$%^a')
Esempio n. 6
0
 def test_trie_question_search(self):
     # The pattern 'a?' is expected to return exactly the two-letter words
     # starting with 'a'; result order is ignored by sorting both sides.
     trie = self.trie = Trie()
     trie.add_all(['ab', 'as', 'ash', 'ashley'])
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     for word in ('ash', 'ashley'):
         self.assertTrue(word in trie, msg="Word should be in trie")
     actual = sorted(trie.search('a?'))
     expected = sorted(['ab', 'as'])
     self.assertEqual(actual, expected, msg='The lists should be equal')
Esempio n. 7
0
class TestTrieExactWordSearch(unittest.TestCase):
    """Exact-word membership checks against a small two-word trie."""

    def test_word_in_trie(self):
        # A word that was inserted is reported by the `in` operator.
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        self.assertTrue('ash' in trie, msg="Word should be in trie")

    def test_word_not_int_trie(self):
        # A word that was never inserted is not reported by `in`.
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        self.assertFalse('salary' in trie, msg="Word should not be in trie")
Esempio n. 8
0
 def test_trie_prefix_search(self):
     # 'ash' is only a prefix here, never a stored word: membership must be
     # False while contains_prefix/search_with_prefix still see it.
     trie = self.trie = Trie()
     trie.add_all(['ashlame', 'ashley', 'askoiu', 'ashlo'])
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     self.assertFalse('ash' in trie, msg="Word should not be in trie")
     self.assertTrue('ashley' in trie, msg="Word should be in trie")
     self.assertEqual(4, trie.get_word_count(), msg="Word count not equal")
     self.assertTrue(trie.contains_prefix('ash'),
                     msg="Prefix should be present in Trie")
     actual = sorted(trie.search_with_prefix('ash'))
     expected = sorted(['ashlame', 'ashley', 'ashlo'])
     self.assertEqual(actual, expected, msg='The lists should be equal')
Esempio n. 9
0
 def test_trie_node_prefix_not_exists(self):
     # Prefixes that no stored word starts with must be rejected by
     # contains_prefix(). 'sh' occurs inside the stored words but not at
     # their start, so it must be rejected as well.
     self.trie = Trie()
     self.trie.add_all(['ash', 'ashley'])
     self.assertIsInstance(self.trie, Trie,
                           "Object should be of type `lexpy.trie.Trie`")
     self.assertTrue('ash' in self.trie, "Word should be in trie")
     self.assertTrue('ashley' in self.trie, "Word should be in trie")
     self.assertEqual(2, self.trie.get_word_count(), "Word count not equal")
     # The failure message now states the expectation actually being
     # asserted (absence of the prefix) instead of its opposite.
     for prefix in ('xmas', 'xor', 'sh'):
         self.assertFalse(self.trie.contains_prefix(prefix),
                          "Prefix should not be present in Trie")
Esempio n. 10
0
    def test_word_add_all_gen(self):
        # add_all() is fed a generator object rather than a concrete
        # collection; all three yielded words must end up in the trie.
        def gen_words():
            yield from ['ash', 'ashley', 'simpson']

        trie = self.trie = Trie()
        trie.add_all(gen_words())  # generator input
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley', 'simpson'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(3, trie.get_word_count(),
                         msg="Word count not equal")
Esempio n. 11
0
def __build_tries(dataset_id: int, user_id: int) -> DatasetDictionary:
    """Build the level and column-name lookup tries for one dataset.

    The unique values per column are fetched with
    ``get_column_unique_values``; their condensed/lemmatized form fills the
    level trie, while the mapping's keys fill the column-name trie. The
    returned ``DatasetDictionary`` also carries the combined word count of
    both tries.
    """
    unique_values_by_column = get_column_unique_values(dataset_id, user_id)
    lemmatized_levels = __condense_and_lemmatize(unique_values_by_column)

    def trie_of(words):
        # Small local factory: a fresh trie populated from `words`.
        filled = Trie()
        filled.add_all(words)
        return filled

    level_trie = trie_of(lemmatized_levels)
    colname_trie = trie_of(list(unique_values_by_column.keys()))

    colname_words = colname_trie.get_word_count()
    level_words = level_trie.get_word_count()

    return DatasetDictionary(dataset_id, colname_trie, level_trie,
                             colname_words + level_words)
Esempio n. 12
0
class TestWordCount(unittest.TestCase):
    """Checks on ``get_word_count()`` for populated and empty tries."""

    def test_word_count_greater_than_zero(self):
        # Three distinct words in, three words counted.
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley', 'ashes'])
        self.assertGreater(
            trie.get_word_count(), 0,
            msg="The number of words should be greater than 0")
        self.assertEqual(3, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_count_zero(self):
        # Adding an empty list leaves the word count at zero.
        trie = self.trie = Trie()
        trie.add_all([])
        self.assertEqual(0, trie.get_word_count(),
                         msg="Word count not equal")
Esempio n. 13
0
 def test_without_count(self):
     # 'ashes' is inserted twice but must be listed once; the comparison is
     # order-sensitive (assertListEqual).
     word_trie = Trie()
     word_trie.add_all(['ash', 'ashley', 'ashes', 'ashes'])
     self.assertListEqual(['ash', 'ashley', 'ashes'],
                          word_trie.search('a*'))
class TrieApproxRecognizer(ConceptRecognizer):
    """Concept recognizer backed by a trie of normalized concept labels.

    Labels and input text go through the same normalization (runs of
    control, mark, punctuation, symbol and separator characters become one
    space, then lower-casing). Input text is then scanned one token window
    at a time with the trie's ``search_within_distance`` lookup.
    """

    def __init__(self,
                 dictionary_loader: DictionaryLoader,
                 language="en",
                 filters: List[AnnotationFilter] = None):
        """
        Parameters
        ----------
            dictionary_loader: DictionaryLoader
                The dictionary loader that will provide the dictionary contents
            language: str
                The language of the text that will be processed; selects which
                spaCy model is loaded. Only ``"en"`` and ``"fr"`` are handled.
            filters: List[AnnotationFilter]
                A list of filters to apply post recognition

        Raises
        ------
            ValueError
                If ``language`` is neither ``"en"`` nor ``"fr"``.
        """
        super().__init__(dictionary_loader, language=language, filters=filters)
        self.punctuation_remove = regex.compile(
            r'[\p{C}|\p{M}|\p{P}|\p{S}|\p{Z}]+', regex.UNICODE)
        # All three indexes are keyed by the normalized label text.
        self.label_concept_index = {}
        self.label_token_counts = {}
        self.label_lengths = {}
        self.trie = Trie()

        # The model packages are imported lazily so that only the model for
        # the requested language has to be installed.
        if language == 'en':
            import en_core_web_md
            self.spacy = en_core_web_md.load()
        elif language == 'fr':
            # spaCy publishes its French pipelines under the "news" genre
            # (fr_core_news_*); a "fr_core_web_md" package does not exist.
            import fr_core_news_md
            self.spacy = fr_core_news_md.load()
        else:
            raise ValueError(f"Unsupported language: {language}")

    def _load_concept_labels(self, concept_id, labels):
        """Index every label of one concept.

        For each label the normalized text is added to the trie and recorded
        in the three per-label indexes: the ``"<concept_id>:::<position>"``
        key, the token count, and the character length. A normalized label
        seen again overwrites its earlier index entries.
        """
        for label_index, label in enumerate(labels):
            normalized = self.punctuation_remove.sub(" ", label).replace(
                "-", " ").lower()
            tokens, _ = span_tokenize(self.spacy, normalized)
            self.label_concept_index[normalized] = (
                f"{concept_id}:::{label_index}")
            self.label_token_counts[normalized] = len(tokens)
            self.label_lengths[normalized] = len(normalized)
            self.trie.add(normalized, count=1)

    def match_mentions(
        self, input_text
    ) -> Tuple[List[Tuple[int, int]], List[str], Set[Annotation]]:
        """Scan ``input_text`` for approximate occurrences of indexed labels.

        For every start token the window is grown one token at a time for as
        long as the trie returns at least one candidate within distance 2 of
        the window's text.

        NOTE(review): matches are not recorded yet. The method only prints
        the number of candidates per window and always returns three empty
        collections.
        """
        normalized_text = self.punctuation_remove.sub(" ", input_text).replace(
            "-", " ").lower()
        tokens, spans = span_tokenize(self.spacy, normalized_text)
        token_count = len(tokens)

        for start_index in range(token_count):
            window_start = spans[start_index][0]
            for end_index in range(start_index, token_count):
                # The window ends at the last token it covers. The previous
                # code indexed `spans` with the window length alone, which
                # ignored the start token and produced empty or wrong
                # substrings for every start position after the first.
                sub_string = normalized_text[window_start:spans[end_index][1]]
                found = self.trie.search_within_distance(sub_string, dist=2)
                if not found:
                    break
                # NOTE(review): debugging output kept from the original;
                # this is where a match should be registered.
                print(len(found))

        return [], [], set()
Esempio n. 15
0
 def test_word_add(self):
     # Single-word insertion through add(), then a membership check.
     trie = self.trie = Trie()
     trie.add('axe')
     self.assertIsInstance(
         trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
     self.assertTrue('axe' in trie, msg="Word should be in trie")
Esempio n. 16
0
class TesTrieWordInsert(unittest.TestCase):
    """Insertion tests: ``add()`` and ``add_all()`` with various inputs."""

    def test_word_add(self):
        # Single-word insertion through add().
        trie = self.trie = Trie()
        trie.add('axe')
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        self.assertTrue('axe' in trie, msg="Word should be in trie")

    def test_word_add_all_list(self):
        trie = self.trie = Trie()
        trie.add_all(['axe', 'kick'])  # list input
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(2, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_add_all_set(self):
        trie = self.trie = Trie()
        trie.add_all({'axe', 'kick'})  # set input
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(2, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_add_all_tuple(self):
        trie = self.trie = Trie()
        trie.add_all(('axe', 'kick'))  # tuple input
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(2, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_add_all_with_number(self):
        # The integer in the tuple is expected not to count as a word.
        trie = self.trie = Trie()
        trie.add_all(('axe', 'kick', 3))  # tuple with one integer
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('axe', 'kick'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(2, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_add_all_gen(self):
        def gen_words():
            yield from ['ash', 'ashley', 'simpson']

        trie = self.trie = Trie()
        trie.add_all(gen_words())  # generator input
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley', 'simpson'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(3, trie.get_word_count(),
                         msg="Word count not equal")

    def test_word_add_all_file_path(self):
        # `small_dataset` is defined outside this class; the original
        # comment describes it as a file, and eight words are expected.
        trie = self.trie = Trie()
        trie.add_all(small_dataset)  # from a file
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley', 'simpson'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        self.assertEqual(8, trie.get_word_count(),
                         msg="Word count not equal")
Esempio n. 17
0
 def test_word_not_int_trie(self):
     # A word that was never inserted must not be reported by `in`.
     trie = self.trie = Trie()
     trie.add_all(['ash', 'ashley'])
     self.assertFalse('salary' in trie, msg="Word should not be in trie")
Esempio n. 18
0
 def test_word_in_trie(self):
     # A word that was inserted must be reported by `in`.
     trie = self.trie = Trie()
     trie.add_all(['ash', 'ashley'])
     self.assertTrue('ash' in trie, msg="Word should be in trie")
Esempio n. 19
0
 def test_word_count_zero(self):
     # Adding an empty list leaves the word count at zero.
     trie = self.trie = Trie()
     trie.add_all([])
     self.assertEqual(0, trie.get_word_count(), msg="Word count not equal")
Esempio n. 20
0
class TestWildCardSearch(unittest.TestCase):
    """Wildcard ``search()`` tests using the ``*`` and ``?`` patterns.

    Result lists are compared after sorting, so the order in which
    ``search()`` returns words does not matter here.
    """

    def test_trie_asterisk_search(self):
        trie = self.trie = Trie()
        trie.add_all(['ash', 'ashley'])
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        actual = sorted(trie.search('a*'))
        expected = sorted(['ash', 'ashley'])
        self.assertEqual(actual, expected, msg='The lists should be equal')

    def test_trie_question_search(self):
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        actual = sorted(trie.search('a?'))
        expected = sorted(['ab', 'as'])
        self.assertEqual(actual, expected, msg='The lists should be equal')

    def test_trie_wildcard_search(self):
        # A pattern with repeated '*' followed by '?' is expected to match
        # every stored word.
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        actual = sorted(trie.search('*a******?'))
        expected = sorted(['ab', 'as', 'ash', 'ashley'])
        self.assertEqual(actual, expected, msg='The lists should be equal')

    def test_trie_wildcard_exception(self):
        # A pattern made of unsupported characters must raise.
        trie = self.trie = Trie()
        trie.add_all(['ab', 'as', 'ash', 'ashley'])
        self.assertIsInstance(
            trie, Trie, msg="Object should be of type `lexpy.trie.Trie`")
        for word in ('ash', 'ashley'):
            self.assertTrue(word in trie, msg="Word should be in trie")
        with self.assertRaises(InvalidWildCardExpressionError):
            trie.search('#$%^a')
Esempio n. 21
0
 def test_with_count(self):
     # With with_count=True each result is a (word, count) pair; 'ashes'
     # was inserted twice and is expected with a count of 2.
     word_trie = Trie()
     word_trie.add_all(['ash', 'ashley', 'ashes', 'ashes'])
     self.assertListEqual([('ash', 1), ('ashley', 1), ('ashes', 2)],
                          word_trie.search('a*', with_count=True))