def test_tokenizer(self):
    """Pin the output of ``compositions`` (and ``strjoin`` on top of it).

    Every case lists the requested number of parts, the input sequence and
    the exact splits expected, in the order they are generated.
    """
    self.maxDiff = None

    # (number of parts, input string, expected splits)
    string_cases = [
        (1, 'ab', [
            ('ab',),
        ]),
        (2, 'ab', [
            ('a', 'b'),
        ]),
        (2, 'abc', [
            ('a', 'bc'),
            ('ab', 'c'),
        ]),
        (2, 'abcd', [
            ('a', 'bcd'),
            ('ab', 'cd'),
            ('abc', 'd'),
        ]),
        (3, 'abcd', [
            ('a', 'b', 'cd'),
            ('a', 'bc', 'd'),
            ('ab', 'c', 'd'),
        ]),
        (3, 'abcde', [
            ('a', 'b', 'cde'),
            ('a', 'bc', 'de'),
            ('a', 'bcd', 'e'),
            ('ab', 'c', 'de'),
            ('ab', 'cd', 'e'),
            ('abc', 'd', 'e'),
        ]),
        (2, 'abcdefghijklmn', [
            ('a', 'bcdefghijklmn'),
            ('ab', 'cdefghijklmn'),
            ('abc', 'defghijklmn'),
            ('abcd', 'efghijklmn'),
            ('abcde', 'fghijklmn'),
            ('abcdef', 'ghijklmn'),
            ('abcdefg', 'hijklmn'),
            ('abcdefgh', 'ijklmn'),
            ('abcdefghi', 'jklmn'),
            ('abcdefghij', 'klmn'),
            ('abcdefghijk', 'lmn'),
            ('abcdefghijkl', 'mn'),
            ('abcdefghijklm', 'n'),
        ]),
        (9, 'abcdefghij', [
            ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'ij'),
            ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'hi', 'j'),
            ('a', 'b', 'c', 'd', 'e', 'f', 'gh', 'i', 'j'),
            ('a', 'b', 'c', 'd', 'e', 'fg', 'h', 'i', 'j'),
            ('a', 'b', 'c', 'd', 'ef', 'g', 'h', 'i', 'j'),
            ('a', 'b', 'c', 'de', 'f', 'g', 'h', 'i', 'j'),
            ('a', 'b', 'cd', 'e', 'f', 'g', 'h', 'i', 'j'),
            ('a', 'bc', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
            ('ab', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
        ]),
        # More parts requested than there are items: nothing is produced.
        (2, 'a', []),
    ]
    for n_parts, text, expected in string_cases:
        self.assertEqual(list(compositions(n_parts, text)), expected)

    # (number of parts, expected space-joined splits) for a list of words.
    word_cases = [
        (2, [
            ('swedbank', 'lizingas uab'),
            ('swedbank lizingas', 'uab'),
        ]),
        (1, [
            ('swedbank lizingas uab',),
        ]),
    ]
    for n_parts, expected in word_cases:
        # A fresh word list per call, exactly as the literal calls would pass.
        joined = strjoin(compositions(n_parts, ['swedbank', 'lizingas', 'uab']))
        self.assertEqual(list(joined), expected)
def pattern_finder(self, patterns, value, stack=None):
    """Search for all possible matches for the given patterns and value.

    How this works. For example, you have these indexes defined:

        company-type/
          aliases.txt:
            UAB
            Uždaroji akcinė bendrovė
            IĮ
            Individuali įmonė
          choices.txt:
            1,UAB
            2,IĮ

        company/
          aliases.txt:
            {company} {company-type}
            {company}
            Programuotojų artelė
            Programmers of Vilnius
          choices.txt:
            1,Programuotojų artelė

    Then you call:

        pattern_finder([('company-type', ()), ('company', ())],
                       'uždaroji akcinė bendrovė programmers of vilnius')

    The finder collects all possible compositions of
    'uždaroji akcinė bendrovė programmers of vilnius' for two patterns:

        ('uždaroji', 'akcinė bendrovė programmers of vilnius')
        ('uždaroji akcinė', 'bendrovė programmers of vilnius')
        ('uždaroji akcinė bendrovė', 'programmers of vilnius')
        ('uždaroji akcinė bendrovė programmers', 'of vilnius')
        ('uždaroji akcinė bendrovė programmers of', 'vilnius')

    Then, for each combination, possible pattern choices are collected,
    first for raw strings that match. In this case there are no raw
    strings, only two patterns, so nothing happens here.

    Next, all possible choices for the given patterns are collected by
    searching the index named in each pattern for the value taken from the
    generated composition. In this case only 'uždaroji akcinė bendrovė'
    gives 'uab' and 'programmers of vilnius' gives 'programuotojų artelė';
    all other composition values return no results:

        [['uab'], ['programuotojų artelė']]

    And finally all possible combinations of those choices are generated:

        {'company-type': 'uab', 'company': 'programuotojų artelė'}

    Note that the code yields each result as a list of ``(pattern, item)``
    pairs (one pair per tuple pattern, raw strings left out) rather than as
    a dict; the mapping above only illustrates the pairing.

    Arguments:

    - patterns: list whose entries are either raw strings or
      ``(name, flags)`` tuples, example: [('bank', ()), 'bankas']

    - value: normalized value (see norm), example: 'dnb bankas'. It is
      passed to ``compositions`` as is.

    - stack: optional set of ``(name, token)`` pairs. A pair already in the
      set is not looked up again, and the set extended with the current
      pair is handed to ``self.find``. Presumably this guards against
      endless recursion between ``find`` and ``pattern_finder`` -- confirm
      against ``find``.

    Returns generator with all possible values.

    """
    n_patterns = len(patterns)
    # One list of candidates per pattern position. It is created once and
    # never reset, so candidates found for different compositions end up
    # in the same lists.
    # NOTE(review): confirm that mixing candidates across compositions is
    # intended.
    choices = [[] for i in range(n_patterns)]
    stack = stack or set()

    for comb in strjoin(compositions(n_patterns, value)):
        skip = False

        # First check all raw strings; if at least one raw string does not
        # match, skip this composition. Matches recorded before the first
        # mismatch stay in ``choices``.
        for i, (token, pattern) in enumerate(zip(comb, patterns)):
            if isinstance(pattern, str):
                pattern = pattern.strip()
                if token == pattern:
                    choices[i].append(token)
                else:
                    skip = True
                    break

        if skip:
            continue

        # Find all indexes: look every tuple pattern up in its index.
        for i, (token, pattern) in enumerate(zip(comb, patterns)):
            if isinstance(pattern, tuple):
                appended = False
                name, flags = pattern
                token = self.handle_flags(token, flags)
                # Skip pairs that are already on the lookup stack.
                if (name, token) not in stack:
                    for item in self.find(name, token, stack | {(name, token)}):
                        choices[i].append(item)
                        appended = True
                # Nothing found for this position: stop looking at the rest
                # of this composition.
                if not appended:
                    break

    # Finally generate all possible combinations from found indexes and
    # matching raw strings; only tuple patterns are kept in the output.
    for option in itertools.product(*choices):
        yield [(k, v) for k, v in zip(patterns, option) if isinstance(k, tuple)]
def test_tokenizer(self):
    """Verify ``compositions`` splits and their ``strjoin``-ed variants.

    Each check names the number of parts, the input sequence and then the
    expected splits in the order they must be generated.
    """
    self.maxDiff = None

    def expect(n_parts, sequence, *splits):
        # Raw output of compositions must equal the listed splits.
        produced = list(compositions(n_parts, sequence))
        self.assertEqual(produced, list(splits))

    def expect_joined(n_parts, sequence, *splits):
        # Same comparison, after passing the splits through strjoin.
        produced = list(strjoin(compositions(n_parts, sequence)))
        self.assertEqual(produced, list(splits))

    expect(1, 'ab', ('ab',))
    expect(2, 'ab', ('a', 'b'))
    expect(
        2, 'abc',
        ('a', 'bc'),
        ('ab', 'c'),
    )
    expect(
        2, 'abcd',
        ('a', 'bcd'),
        ('ab', 'cd'),
        ('abc', 'd'),
    )
    expect(
        3, 'abcd',
        ('a', 'b', 'cd'),
        ('a', 'bc', 'd'),
        ('ab', 'c', 'd'),
    )
    expect(
        3, 'abcde',
        ('a', 'b', 'cde'),
        ('a', 'bc', 'de'),
        ('a', 'bcd', 'e'),
        ('ab', 'c', 'de'),
        ('ab', 'cd', 'e'),
        ('abc', 'd', 'e'),
    )
    expect(
        2, 'abcdefghijklmn',
        ('a', 'bcdefghijklmn'),
        ('ab', 'cdefghijklmn'),
        ('abc', 'defghijklmn'),
        ('abcd', 'efghijklmn'),
        ('abcde', 'fghijklmn'),
        ('abcdef', 'ghijklmn'),
        ('abcdefg', 'hijklmn'),
        ('abcdefgh', 'ijklmn'),
        ('abcdefghi', 'jklmn'),
        ('abcdefghij', 'klmn'),
        ('abcdefghijk', 'lmn'),
        ('abcdefghijkl', 'mn'),
        ('abcdefghijklm', 'n'),
    )
    expect(
        9, 'abcdefghij',
        ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'ij'),
        ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'hi', 'j'),
        ('a', 'b', 'c', 'd', 'e', 'f', 'gh', 'i', 'j'),
        ('a', 'b', 'c', 'd', 'e', 'fg', 'h', 'i', 'j'),
        ('a', 'b', 'c', 'd', 'ef', 'g', 'h', 'i', 'j'),
        ('a', 'b', 'c', 'de', 'f', 'g', 'h', 'i', 'j'),
        ('a', 'b', 'cd', 'e', 'f', 'g', 'h', 'i', 'j'),
        ('a', 'bc', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
        ('ab', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
    )
    # Asking for more parts than there are items gives no splits at all.
    expect(2, 'a')

    expect_joined(
        2, ['swedbank', 'lizingas', 'uab'],
        ('swedbank', 'lizingas uab'),
        ('swedbank lizingas', 'uab'),
    )
    expect_joined(
        1, ['swedbank', 'lizingas', 'uab'],
        ('swedbank lizingas uab',),
    )