Beispiel #1
0
    def test_tokenizer(self):
        # Table-driven check of compositions(): each row is
        # (number of parts, input sequence, expected list of splits).
        # A comment is used instead of a docstring so that the test runner's
        # verbose output keeps showing the method name.
        self.maxDiff = None

        character_cases = [
            (1, 'ab', [
                ('ab',),
            ]),
            (2, 'ab', [
                ('a', 'b'),
            ]),
            (2, 'abc', [
                ('a', 'bc'),
                ('ab', 'c'),
            ]),
            (2, 'abcd', [
                ('a', 'bcd'),
                ('ab', 'cd'),
                ('abc', 'd'),
            ]),
            (3, 'abcd', [
                ('a', 'b', 'cd'),
                ('a', 'bc', 'd'),
                ('ab', 'c', 'd'),
            ]),
            (3, 'abcde', [
                ('a', 'b', 'cde'),
                ('a', 'bc', 'de'),
                ('a', 'bcd', 'e'),
                ('ab', 'c', 'de'),
                ('ab', 'cd', 'e'),
                ('abc', 'd', 'e'),
            ]),
            (2, 'abcdefghijklmn', [
                ('a', 'bcdefghijklmn'),
                ('ab', 'cdefghijklmn'),
                ('abc', 'defghijklmn'),
                ('abcd', 'efghijklmn'),
                ('abcde', 'fghijklmn'),
                ('abcdef', 'ghijklmn'),
                ('abcdefg', 'hijklmn'),
                ('abcdefgh', 'ijklmn'),
                ('abcdefghi', 'jklmn'),
                ('abcdefghij', 'klmn'),
                ('abcdefghijk', 'lmn'),
                ('abcdefghijkl', 'mn'),
                ('abcdefghijklm', 'n'),
            ]),
            (9, 'abcdefghij', [
                ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'ij'),
                ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'hi', 'j'),
                ('a', 'b', 'c', 'd', 'e', 'f', 'gh', 'i', 'j'),
                ('a', 'b', 'c', 'd', 'e', 'fg', 'h', 'i', 'j'),
                ('a', 'b', 'c', 'd', 'ef', 'g', 'h', 'i', 'j'),
                ('a', 'b', 'c', 'de', 'f', 'g', 'h', 'i', 'j'),
                ('a', 'b', 'cd', 'e', 'f', 'g', 'h', 'i', 'j'),
                ('a', 'bc', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
                ('ab', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
            ]),
            # Asking for more parts than there are elements gives no splits.
            (2, 'a', []),
        ]
        for parts, text, expected in character_cases:
            self.assertEqual(list(compositions(parts, text)), expected)

        # With a list of words as input, strjoin() turns every part of a
        # split back into a single space-separated string.
        word_cases = [
            (2, ['swedbank', 'lizingas', 'uab'], [
                ('swedbank', 'lizingas uab'),
                ('swedbank lizingas', 'uab'),
            ]),
            (1, ['swedbank', 'lizingas', 'uab'], [
                ('swedbank lizingas uab',),
            ]),
        ]
        for parts, words, expected in word_cases:
            self.assertEqual(list(strjoin(compositions(parts, words))), expected)
Beispiel #2
0
    def pattern_finder(self, patterns, value, stack=None):
        """Yield every way `value` can be matched against `patterns`.

        `value` is cut into exactly len(patterns) consecutive pieces in every possible way (compositions + strjoin)
        and each piece is compared with the pattern standing at the same position:

        - a str pattern is a raw string: the piece has to be equal to the stripped pattern, otherwise the whole
          split is rejected;
        - a (name, flags) tuple pattern refers to an index: the piece goes through self.handle_flags and is then
          looked up with self.find(name, piece, ...).

        Everything that matched is accumulated per pattern position over all splits, and at the end the Cartesian
        product of these per-position lists is yielded.

        Example (taken from the original documentation; the index files themselves are not part of this method).
        With these indexes defined:

            company-type/
                aliases.txt:
                    UAB
                        Uždaroji akcinė bendrovė
                    IĮ
                        Individuali įmonė
                choices.txt:
                    1,UAB
                    2,IĮ
            company/
                aliases.txt:
                    {company}
                        {company-type} {company}
                    Programuotojų artelė
                        Programmers of Vilnius
                choices.txt:
                    1,Programuotojų artelė

        the call

            pattern_finder([('company-type', ()), ('company', ())], 'uždaroji akcinė bendrovė programmers of vilnius')

        goes through all splits of the value into two pieces:

            ('uždaroji', 'akcinė bendrovė programmers of vilnius')
            ('uždaroji akcinė', 'bendrovė programmers of vilnius')
            ('uždaroji akcinė bendrovė', 'programmers of vilnius')
            ('uždaroji akcinė bendrovė programmers', 'of vilnius')
            ('uždaroji akcinė bendrovė programmers of', 'vilnius')

        There are no raw string patterns here, so the raw string step does nothing. Looking the pieces up in the
        indexes gives 'uab' for 'uždaroji akcinė bendrovė' and 'programuotojų artelė' for 'programmers of vilnius';
        no other piece returns anything. The collected choices are therefore

            [['uab'], ['programuotojų artelė']]

        and the only combination yielded is

            [(('company-type', ()), 'uab'), (('company', ()), 'programuotojų artelė')]

        Arguments:
        - patterns: list of raw strings and (name, flags) tuples, example: [('bank', ()), 'bankas']
        - value: normalized value (see norm) that is handed to compositions() as is; documented by the original
          author as a str, example: 'dnb bankas'
        - stack: set of (name, token) pairs; a pair that is already in it is not looked up again (presumably this
          stops endless recursion through self.find - TODO confirm against find)

        Returns a generator of lists of (pattern, found value) pairs, one pair per tuple pattern; raw string
        patterns are left out of the yielded lists.
        """
        total = len(patterns)
        found = [[] for _ in range(total)]
        if not stack:
            stack = set()

        for split in strjoin(compositions(total, value)):
            # Raw strings first: a single mismatch rejects this split. Pieces appended before the mismatch are
            # not rolled back.
            for slot, (piece, spec) in enumerate(zip(split, patterns)):
                if not isinstance(spec, str):
                    continue
                if piece == spec.strip():
                    found[slot].append(piece)
                else:
                    break
            else:
                # All raw strings matched, now resolve index patterns from left to right.
                for slot, (piece, spec) in enumerate(zip(split, patterns)):
                    if not isinstance(spec, tuple):
                        continue
                    index_name, flags = spec
                    piece = self.handle_flags(piece, flags)
                    visit = (index_name, piece)
                    if visit in stack:
                        # This lookup is already in progress, it contributes nothing for this split.
                        break
                    before = len(found[slot])
                    found[slot].extend(self.find(index_name, piece, stack | {visit}))
                    if len(found[slot]) == before:
                        # Nothing found for this piece, the remaining pieces of the split are not looked up.
                        break

        # Combine whatever was collected for each position into complete results.
        for combo in itertools.product(*found):
            yield [pair for pair in zip(patterns, combo) if isinstance(pair[0], tuple)]
    def test_tokenizer(self):
        # Checks the exact splits (and their order) produced by compositions(),
        # first on plain character strings, then on word lists joined back
        # with strjoin(). A comment is used instead of a docstring so that the
        # test runner's verbose output keeps showing the method name.
        self.maxDiff = None

        def splits(parts, sequence):
            # Materialize the splits so they can be compared with a list.
            return list(compositions(parts, sequence))

        def joined_splits(parts, words):
            # Same as splits(), but every part is passed through strjoin().
            return list(strjoin(compositions(parts, words)))

        self.assertEqual(splits(1, 'ab'), [('ab',)])
        self.assertEqual(splits(2, 'ab'), [('a', 'b')])
        self.assertEqual(splits(2, 'abc'), [('a', 'bc'), ('ab', 'c')])
        self.assertEqual(
            splits(2, 'abcd'),
            [('a', 'bcd'), ('ab', 'cd'), ('abc', 'd')],
        )
        self.assertEqual(
            splits(3, 'abcd'),
            [('a', 'b', 'cd'), ('a', 'bc', 'd'), ('ab', 'c', 'd')],
        )
        self.assertEqual(
            splits(3, 'abcde'),
            [
                ('a', 'b', 'cde'),
                ('a', 'bc', 'de'),
                ('a', 'bcd', 'e'),
                ('ab', 'c', 'de'),
                ('ab', 'cd', 'e'),
                ('abc', 'd', 'e'),
            ],
        )
        self.assertEqual(
            splits(2, 'abcdefghijklmn'),
            [
                ('a', 'bcdefghijklmn'),
                ('ab', 'cdefghijklmn'),
                ('abc', 'defghijklmn'),
                ('abcd', 'efghijklmn'),
                ('abcde', 'fghijklmn'),
                ('abcdef', 'ghijklmn'),
                ('abcdefg', 'hijklmn'),
                ('abcdefgh', 'ijklmn'),
                ('abcdefghi', 'jklmn'),
                ('abcdefghij', 'klmn'),
                ('abcdefghijk', 'lmn'),
                ('abcdefghijkl', 'mn'),
                ('abcdefghijklm', 'n'),
            ],
        )
        self.assertEqual(
            splits(9, 'abcdefghij'),
            [
                ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'ij'),
                ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'hi', 'j'),
                ('a', 'b', 'c', 'd', 'e', 'f', 'gh', 'i', 'j'),
                ('a', 'b', 'c', 'd', 'e', 'fg', 'h', 'i', 'j'),
                ('a', 'b', 'c', 'd', 'ef', 'g', 'h', 'i', 'j'),
                ('a', 'b', 'c', 'de', 'f', 'g', 'h', 'i', 'j'),
                ('a', 'b', 'cd', 'e', 'f', 'g', 'h', 'i', 'j'),
                ('a', 'bc', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
                ('ab', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'),
            ],
        )

        # More parts requested than elements available: no splits at all.
        self.assertEqual(splits(2, 'a'), [])

        self.assertEqual(
            joined_splits(2, ['swedbank', 'lizingas', 'uab']),
            [
                ('swedbank', 'lizingas uab'),
                ('swedbank lizingas', 'uab'),
            ],
        )
        self.assertEqual(
            joined_splits(1, ['swedbank', 'lizingas', 'uab']),
            [('swedbank lizingas uab',)],
        )