Beispiel #1
0
 def test_string_boundaries(self):
     # See http://bugs.python.org/issue10713
     self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
                      "abc")
     # There's a word boundary at the start of a string.
     self.assertTrue(re.match(r"\b", "abc"))
     # A non-empty string includes a non-boundary zero-length match.
     self.assertTrue(re.search(r"\B", "abc"))
     # There is no non-boundary match at the start of a string.
     self.assertFalse(re.match(r"\B", "abc"))
     # However, an empty string contains no word boundaries, and also no
     # non-boundaries.
     self.assertEqual(re.search(r"\B", ""), None)
     # This one is questionable and different from the perlre behaviour,
     # but describes current behavior.
     self.assertEqual(re.search(r"\b", ""), None)
     # A single word-character string has two boundaries, but no
     # non-boundary gaps.
     self.assertEqual(len(re.findall(r"\b", "a")), 2)
     self.assertEqual(len(re.findall(r"\B", "a")), 0)
     # If there are no words, there are no boundaries
     self.assertEqual(len(re.findall(r"\b", " ")), 0)
     self.assertEqual(len(re.findall(r"\b", "   ")), 0)
     # Can match around the whitespace.
     self.assertEqual(len(re.findall(r"\B", " ")), 2)
Beispiel #2
0
 def test_search_star_plus(self):
     self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
     self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
     self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
     self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
     self.assertEqual(re.search('x', 'aaa'), None)
     self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
     self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
     self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
     self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
     self.assertEqual(re.match('a+', 'xxx'), None)
Beispiel #3
0
 def test_large_search(self, size):
     # Issue #10182: indices were 32-bit-truncated.
     s = 'a' * size
     m = re.search('$', s)
     self.assertIsNotNone(m)
     self.assertEqual(m.start(), size)
     self.assertEqual(m.end(), size)
Beispiel #4
0
    def has_disallowed_pairs(self):
        """Report whether the first removed prefix matches a disallowed pattern.

        Nothing is checked unless both ``removed['derivational_prefix']`` and
        ``removed['derivational_suffix']`` are non-empty.  On a hit, the match
        object itself is stored in ``removed['derivational_suffix']`` and
        ``True`` is returned; otherwise the result is ``False``.
        """
        removed = self.removed
        if removed['derivational_prefix'] == '':
            return False
        if removed['derivational_suffix'] == '':
            return False

        # NOTE(review): the patterns read like prefix+suffix combinations but
        # are only ever matched against the first removed prefix -- confirm.
        first_prefix = removed['derivational_prefix'][0]
        disallowed = ("^be[^r]i$", "^(k|s)e(i|kan)$",
                      "^(di|me|te)[^krwylp]an$")

        for candidate in disallowed:
            hit = pcre.search(candidate, first_prefix)
            if hit:
                removed['derivational_suffix'] = hit
                return True

        return False
Beispiel #5
0
 def test_bug_418626(self):
     # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
     # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
     # pattern '*?' on a long string.
     self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
     self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
                      20003)
     self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
     # non-simple '*?' still used to hit the recursion limit, before the
     # non-recursive scheme was implemented.
     self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Beispiel #6
0
    def delete_derivational_suffix(self, word):
        """Strip a trailing ``i``, ``an`` or ``kan`` from ``word``.

        When a suffix is found it is recorded in
        ``removed['derivational_suffix']`` and the shortened word is passed to
        ``lookup``.  A truthy lookup result is returned as-is; otherwise the
        shortened word is returned.  Words without such a suffix come back
        unchanged.
        """
        suffix_re = "(i|k?an)$"
        found = pcre.search(suffix_re, word)

        if not found:
            return word

        stem = pcre.sub(suffix_re, '', word)
        self.removed['derivational_suffix'] = found.group(0)

        entry = self.lookup(stem)
        return entry if entry else stem
Beispiel #7
0
    def check_rule_precedence(self, word):
        """Return True when ``word`` matches one of four affix patterns.

        Each pattern captures a named group ``word``; a match only counts
        when that captured text is not ``'ngalam'``.  ``self.alpha`` is
        interpolated into the first three patterns.
        """
        stem_re = self.alpha
        rules = (
            "^be(?<word>{})([^k]an|lah|kah)$".format(stem_re),
            "^(me|di|pe|te)(?<word>{})(i)$".format(stem_re),
            "^(k|s)e(?<word>{})(i|kan)$".format(stem_re),
            "^([pm]e[nm]|di[tmp])(?<word>ah|ak|er|el)an$",
        )

        # Patterns are tried lazily, in order; the first qualifying match wins.
        hits = (pcre.search(rule, word) for rule in rules)
        return any(hit and hit.group('word') != 'ngalam' for hit in hits)
Beispiel #8
0
    def delete_inflectional_suffix(self, word):
        """Strip a particle and then a possessive pronoun from ``word``.

        Each suffix that is found is recorded in ``self.removed`` under its
        kind and the shortened word is passed to ``lookup``; the first truthy
        lookup result is returned immediately.  If no lookup succeeds, the
        word with every found suffix stripped is returned.
        """
        suffix_rules = (
            ('particle', "([klt]ah|pun)$"),
            ('possessive_pronoun', "([km]u|nya)$"),
        )
        stem = word

        for kind, suffix_re in suffix_rules:
            found = pcre.search(suffix_re, stem)
            if not found:
                continue

            stem = pcre.sub(suffix_re, '', stem)
            self.removed[kind] = found.group(0)

            entry = self.lookup(stem)
            if entry:
                return entry

        return stem
Beispiel #9
0
 def test_search_coverage(self):
     self.assertEqual(re.search("\s(b)", " b").group(1), "b")
     self.assertEqual(re.search("a\s", "a ").group(0), "a ")
Beispiel #10
0
 def test_not_literal(self):
     self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
     self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
Beispiel #11
0
 def test_special_escapes(self):
     self.assertEqual(re.search(r"\b(b.)\b",
                                "abcd abc bcd bx").group(1), "bx")
     self.assertEqual(re.search(r"\B(b.)\B",
                                "abc bcd bc abxd").group(1), "bx")
     self.assertEqual(re.search(r"\b(b.)\b",
                                "abcd abc bcd bx", re.LOCALE).group(1), "bx")
     self.assertEqual(re.search(r"\B(b.)\B",
                                "abc bcd bc abxd", re.LOCALE).group(1), "bx")
     self.assertEqual(re.search(r"\b(b.)\b",
                                "abcd abc bcd bx", re.UNICODE).group(1), "bx")
     self.assertEqual(re.search(r"\B(b.)\B",
                                "abc bcd bc abxd", re.UNICODE).group(1), "bx")
     self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
     self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
     self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
     self.assertEqual(re.search(r"\b(b.)\b",
                                u"abcd abc bcd bx").group(1), "bx")
     self.assertEqual(re.search(r"\B(b.)\B",
                                u"abc bcd bc abxd").group(1), "bx")
     self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
     self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
     self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None)
     self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                "1aa! a").group(0), "1aa! a")
     self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                "1aa! a", re.LOCALE).group(0), "1aa! a")
     self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                "1aa! a", re.UNICODE).group(0), "1aa! a")
Beispiel #12
0
    def lookup(self, word):
        """Look ``word`` up in the ``dictionary`` table.

        A ``lemma LIKE ...`` query is assembled from the word (and, for
        reduplicated or longer words, from derived variants), executed on a
        cursor from ``self.database``, and ``self.total_lookup`` is
        incremented.

        Args:
            word: The candidate word to search for.

        Returns:
            ``False`` when ``word`` is shorter than three characters; the
            first fetched row (also stored in ``self.found``) when the query
            yields one; otherwise ``None``.
        """
        if len(word) < 3:
            return False

        check = word
        check2 = ''

        # check repeating words like main-main
        match = pcre.search("^([a-z]+)-([a-z]+)$", check)
        if match and match.group(1) == match.group(2):
            check = match.group(1)
            check2 = word

        if len(word) <= 6:
            query_string = "'{}'".format(check)
        else:
            syllabel = "([bcdfghjklmnpqrstvwxyz]|sy)?([aiueo])(?U)([bcdfghjklmnpqrstvwxyz]|ng)?"
            # NOTE(review): the original author was unsure this pattern is
            # correct -- confirm against the reference implementation.
            reg = "^(?<first>aneka|({}{}))(?<second>{}{}(?U)({})*)$".format(
                syllabel, syllabel, syllabel, syllabel, syllabel)

            match = pcre.search(reg, word)

            if match:
                # Also try the word split into two space-separated parts.
                query_string = "'" + match.group('first') + ' ' + match.group(
                    'second') + "' OR lemma LIKE '{}'".format(check)
            else:
                query_string = "'{}'".format(check)

        if check2 != '':
            query_string += " OR lemma LIKE '{}'".format(check2)

        if pcre.search(
                '[aiueo]$', word
        ) and self.removed['derivational_suffix'] == 'kan' and len(word) > 3:
            query_string += " OR lemma LIKE '{}k' ORDER BY pos DESC".format(
                check)

        query = self.database.cursor()
        # NOTE(review): the statement is built by string interpolation, so a
        # word containing a quote breaks (or alters) the query.  Switch to
        # bound parameters once the driver's placeholder style is confirmed.
        txt = "SELECT * FROM dictionary WHERE lemma LIKE {} LIMIT 1".format(
            query_string)

        # Best effort: a failing query is reported, not raised.  Catch
        # Exception rather than everything so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            query.execute(txt)
        except Exception:
            print('error happened')

        self.total_lookup += 1

        try:
            row = query.fetchall()
        except Exception:
            row = ""
            print('empty data')

        if row:
            self.found = row[0]
            return self.found

        return None
Beispiel #13
0
    def delete_derivational_prefix(self, word):
        """Strip one derivational prefix from ``word`` and record what was done.

        Two prefix families are recognised: "plain" (``di``, ``ke``, ``se``),
        which are simply removed, and "complex" (``be``, ``me``, ``pe``,
        ``te``), which go through the numbered rules 1-35 below.

        Side effects on success: the prefix is appended to
        ``self.removed['derivational_prefix']``, entries are written to
        ``self.recoding_tracker`` and ``self.complex_prefix_tracker``, and
        ``self.lookup`` is called on the stripped word (its return value is
        ignored here).

        Args:
            word: The word to strip a prefix from.

        Returns:
            The stripped word, or the word unchanged when it is shorter than
            four characters, has no recognised prefix, the prefix was already
            removed earlier, a precedence check rejects it, or no rule of the
            prefix family applies.
        """
        # presumably regex fragments for vowels / consonants / letters --
        # verify against where these attributes are defined.
        vowel = self.vowel
        consonant = self.consonant
        alpha = self.alpha
        result = word
        prefix_type = ""
        prefix = ""

        patterns = {'plain': "^(di|(k|s)e)", 'complex': "^(b|m|p|t)e"}

        if len(result) < 4:
            return result

        # Every path inside the ``if match`` branch returns, so only the first
        # pattern that matches is ever processed.
        for key, pattern in patterns.items():
            match = re.match(pattern, result)

            if match:
                # True for a plain prefix, False for a complex one.
                prefix_type = (key == 'plain')
                prefix = match[0]

                # Never remove the same prefix twice.
                if self.removed[
                        'derivational_prefix'] != '' and prefix in self.removed[
                            'derivational_prefix']:
                    return result

                self.recoding_tracker[match[0]] = ''

                if prefix_type:
                    array = self.removed['derivational_prefix']

                    # NOTE(review): ``array[0] != 'be'`` is always true once
                    # ``array[0] == 'di'`` holds -- confirm the intended
                    # grouping of this condition.
                    if prefix == 'ke' and array != '' and (
                            array[0] == 'di'
                            and not (pcre.search('(tawa|tahu)', result))
                            and array[0] != 'be'):
                        return result

                    result = pcre.sub(pattern, '', result)

                    self.complex_prefix_tracker[prefix] = {prefix: ''}

                else:
                    # Maps the removed text to the text put back in its place;
                    # stays "" only if no rule below assigns it.
                    modification = ""

                    #  'be-' prefix rules
                    #   total rule = 5

                    if prefix == 'be':

                        # With an earlier prefix on record, 'be-' may only
                        # follow one of the listed previously removed forms.
                        if self.removed['derivational_prefix'] != '':

                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # first key recorded in the tracker
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp not in ['mem', 'pem', 'di', 'ke']:
                                return result

                        # rule 1

                        if pcre.search("^ber{}".format(vowel), result):

                            result = pcre.sub("^ber", '', result)
                            modification = {"ber": ''}
                            self.recoding_tracker[prefix] = {'be': ''}

                        # rule 2
                        elif pcre.search(
                                "^ber[bcdfghjklmnpqstvwxyz][a-z](?!er)",
                                result):

                            result = pcre.sub("^ber", '', result)
                            modification = {'ber': ""}

                        # rule 3
                        elif pcre.search(
                                "^ber[bcdfghjklmnpqstvwxyz][a-z]er{}".format(
                                    vowel), result):

                            result = pcre.sub("^ber", '', result)
                            modification = {'ber': ""}

                        # rule 4
                        elif pcre.search("^belajar$", result):

                            result = pcre.sub("^bel", '', result)
                            modification = {'bel': ""}

                        # rule 5
                        elif pcre.search(
                                "^be[bcdfghjkmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub("^be", '', result)
                            modification = {'be': ""}

                        # unsuccessful: undo the tracker entry made above and
                        # hand back the original word
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # te- prefix rules
                    # total rule : 5

                    elif prefix == 'te':

                        if self.removed['derivational_prefix'] != '':
                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # first key recorded in the tracker
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp != 'ke' and pp in [
                                    'me', 'men', 'pen'
                            ] and not (pcre.search('tawa', result)):
                                return result

                        # rule 6
                        if pcre.search("^ter{}".format(vowel), result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}
                            self.recoding_tracker[prefix] = {'te': ''}

                        # rule 7

                        elif pcre.search(
                                "^ter[bcdfghjklmnpqstvwxyz]er{}".format(vowel),
                                result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # rule 8
                        elif pcre.search("^ter{}(?!er)".format(consonant),
                                         result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # rule 9
                        elif pcre.search(
                                "^te[bcdfghjklmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub('^te', '', result)
                            modification = {'te': ''}

                        # rule 10
                        elif pcre.search(
                                "^ter[bcdfghjklmnpqstvwxyz]er{}".format(
                                    consonant), result):

                            result = pcre.sub('^ter', '', result)
                            modification = {'ter': ''}

                        # unsuccessful: undo the tracker entry made above and
                        # hand back the original word

                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # me- prefix rules
                    # total rule = 10

                    elif prefix == 'me':

                        # 'me-' is only stripped as the first prefix.
                        if self.removed['derivational_prefix'] != '':
                            return result

                        # rule 11
                        if pcre.search('^me[lrwy]{}'.format(vowel), result):

                            result = pcre.sub('^me', '', result)
                            modification = {'me': ''}

                        # rule 12

                        elif pcre.search('^mem[bfv]', result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # rule 13
                        elif pcre.search('^mempe', result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # rule 14
                        elif pcre.search("^mem(r?)[aiueo]", result):
                            match = pcre.search("^mem(r?)[aiueo]", result)
                            result = pcre.sub('^me', '', result)
                            modification = {'me{}'.format(match.group(1)): ''}
                            self.recoding_tracker[prefix] = {'mem': 'p'}

                        # rule 15

                        elif pcre.search('^men[cdsjz]', result):

                            result = pcre.sub('^men', '', result)
                            modification = {'men': ''}

                        # rule 16

                        elif pcre.search('^men{}'.format(vowel), result):

                            # 'men' is replaced by 't' rather than dropped.
                            result = pcre.sub('^men', 't', result)
                            modification = {'men': 't'}
                            self.recoding_tracker[prefix] = {'me': ''}

                        # rule 17

                        elif pcre.search('^meng[ghqk]', result):

                            result = pcre.sub('^meng', '', result)
                            modification = {'meng': ''}

                        # rule 18

                        elif pcre.search('^meng({})'.format(vowel), result):
                            match = pcre.search('^meng({})'.format(vowel),
                                                result)
                            result = pcre.sub('^meng', '', result)
                            modification = {'meng': ''}

                            self.recoding_tracker[prefix] = {'meng1': 'k'}
                            self.recoding_tracker[prefix]['menge'] = ''

                        # rule 19
                        elif pcre.search('^meny{}'.format(vowel), result):

                            result = pcre.sub('^me', '', result)
                            modification = {'me': ''}
                            self.recoding_tracker[prefix] = {'meny': 's'}

                        # rule 20
                        elif pcre.search('^memp[abcdfghijklmnopqrstuvwxyz]',
                                         result):

                            result = pcre.sub('^mem', '', result)
                            modification = {'mem': ''}

                        # unsuccessful: undo the tracker entry made above and
                        # hand back the original word
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    # pe- prefix rules
                    # total rule = 15

                    elif prefix == 'pe':

                        if self.removed['derivational_prefix'] != '':
                            array_key = list(
                                self.complex_prefix_tracker.keys())[
                                    0]  # first key recorded in the tracker
                            array = self.complex_prefix_tracker[array_key]

                            added_key = list(array.keys())[0]
                            added = array[added_key]
                            pp = added_key

                            if pp not in ['di', 'ber', 'mem', 'se', 'ke']:
                                return result

                        # rule 21
                        if pcre.search('^pe[wy]{}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 22
                        elif pcre.search('^per{}'.format(vowel), result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}
                            self.recoding_tracker[prefix] = {'pe': ''}

                        # rule 23
                        elif pcre.search(
                                '^per[bcdfghjklmnpqstvwxyz][a-z](?!er)',
                                result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}

                        # rule 24
                        elif pcre.search(
                                '^per[bcdfghjklmnpqstvwxyz][a-z]er{}'.format(
                                    vowel), result):

                            result = pcre.sub('^per', '', result)
                            modification = {'per': ''}

                        # rule 25
                        elif pcre.search('^pem[bfv]', result):

                            result = pcre.sub('^pem', '', result)
                            modification = {'pem': ''}

                        # rule 26
                        elif pcre.search('^pem(r?){}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}
                            self.recoding_tracker[prefix] = {'pem': 'p'}

                        # rule 27
                        elif pcre.search('^pen[cdjz]', result):

                            result = pcre.sub('^pen', '', result)
                            modification = {'pen': ''}

                        # rule 28
                        elif pcre.search('^pen{}'.format(vowel), result):

                            # 'pen' is replaced by 't' rather than dropped.
                            result = pcre.sub('^pen', 't', result)
                            modification = {'pen': 't'}
                            self.recoding_tracker[prefix] = {'pe': ''}

                        # rule 29
                        elif pcre.search('^peng{}'.format(consonant), result):

                            result = pcre.sub('^peng', '', result)
                            modification = {'peng': ''}

                        # rule 30
                        elif pcre.search('^peng({})'.format(vowel), result):
                            match = pcre.search('^peng({})'.format(vowel),
                                                result)
                            result = pcre.sub('^peng', '', result)
                            modification = {'peng': ''}

                            self.recoding_tracker[prefix] = {'peng1': 'k'}
                            self.recoding_tracker[prefix]['penge'] = ''

                        # rule 31
                        elif pcre.search('^peny{}'.format(vowel), result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}
                            self.recoding_tracker[prefix] = {'peny': 's'}

                        # rule 32
                        elif pcre.search('^pel{}'.format(vowel), result):

                            # 'pelajar' is the one word that loses 'pel'
                            # instead of 'pe'.
                            if (result == 'pelajar'):
                                result = pcre.sub('^pel', '', result)
                                modification = {'pel': ''}
                            else:
                                result = pcre.sub("^pe", "", result)
                                modification = {'pe': ''}

                        # rule 33
                        elif pcre.search(
                                '^pe[bcdfghjkpqstvxz]er{}'.format(vowel),
                                result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 34
                        elif pcre.search('^pe[bcdfghjkpqstvxz](?!er)', result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # rule 35
                        elif pcre.search(
                                '^pe[bcdfghjkpqstvxz]er{}'.format(consonant),
                                result):

                            result = pcre.sub('^pe', '', result)
                            modification = {'pe': ''}

                        # unsuccessful: undo the tracker entry made above and
                        # hand back the original word
                        else:
                            del self.recoding_tracker[prefix]
                            return word

                    if modification != "":
                        self.complex_prefix_tracker[prefix] = modification
                    else:
                        return result

                # The removed-prefix record starts out as '' and becomes a
                # list on first use.
                if self.removed['derivational_prefix'] == '':
                    self.removed['derivational_prefix'] = []

                self.removed['derivational_prefix'].append(prefix)
                # Called for its side effects only; the return value is unused.
                self.lookup(result)
                return result

        return result
Beispiel #14
0
    def search(self, vhost, https=False, port=None):
        """Select the server that would answer a request for ``vhost``.

        Servers able to serve the resolved IP/port are tried in five passes:
        exact names, longest wildcard starting with an asterisk (``*.name``
        or the ``.name`` shorthand), longest wildcard ending with an asterisk
        (``name.*``), first matching regular expression (names starting with
        ``~``), and finally the default server.  Wildcards only match on a
        dot border, so ``*.example.org`` matches ``www.example.org`` but
        neither ``example.org`` nor ``badexample.org``.

        Args:
            vhost: Host name to look up.
            https: Whether the request is HTTPS; also picks the default port.
            port: Port to match; defaults to 443 for HTTPS, 80 otherwise.

        Returns:
            The selected server object, or ``None`` when nothing matches.

        Raises:
            ValueError: If ``vhost`` resolves to more than one IP address.
        """
        ips = self.resolv(vhost)
        if len(ips) > 1:
            # Raising a plain string is a TypeError in itself; raise a real
            # exception that carries the message.
            raise ValueError("Vhost on multiple IPS not supported.")

        if port is None:
            port = 443 if https else 80

        # NOTE(review): an empty result from resolv() raises IndexError here
        # -- confirm whether resolv can return no addresses.
        ip = ips[0]
        logging.debug("Pre-select vhost that can serve IP <%s> on %s", ip,
                      ("HTTPS" if https else 'HTTP'))
        candidates = [
            srv for srv in self.servers if srv.can_serve(ip, https, port)
        ]

        def matches_leading_wildcard(name):
            # "*.example.org" covers sub-domains only; ".example.org" covers
            # the bare domain as well.
            if name.startswith("*."):
                suffix = name[2:]
                return bool(suffix) and vhost.endswith("." + suffix)
            if name.startswith("."):
                suffix = name[1:]
                return bool(suffix) and (vhost == suffix
                                         or vhost.endswith("." + suffix))
            return False

        def matches_trailing_wildcard(name):
            # "www.example.*" must be followed by a dot in the vhost.
            if not name.endswith(".*"):
                return False
            stem = name[:-2]
            return bool(stem) and vhost.startswith(stem + ".")

        def longest_wildcard(name_matches):
            # Name length is measured in dots; max() keeps the first of
            # equally long names, i.e. configuration order.
            hits = [(srv, srvname.count('.'))
                    for srv in candidates
                    for srvname in srv.server_names
                    if name_matches(srvname)]
            if not hits:
                return None
            return max(hits, key=lambda hit: hit[1])[0]

        logging.debug("1st pass: exact names")
        for srv in candidates:
            if vhost in srv.server_names:
                return srv

        logging.debug(
            "2nd pass: longest wildcard name starting with an asterisk")
        selected = longest_wildcard(matches_leading_wildcard)
        if selected is not None:
            return selected

        logging.debug(
            "3rd pass: longest wildcard name ending with an asterisk")
        selected = longest_wildcard(matches_trailing_wildcard)
        if selected is not None:
            return selected

        logging.debug(
            "4th pass: first matching regular expression (in order of appearance in a configuration file)"
        )
        for srv in candidates:
            for srvname in srv.server_names:
                if not srvname.startswith('~'):
                    continue
                expression = srvname[1:]
                try:
                    revhost = pcre.search(expression, vhost)
                except Exception:
                    # A broken expression must not abort the whole lookup.
                    logging.debug("FAILED to compile PCRE '%s'", expression)
                    continue
                if revhost:
                    return srv

        logging.debug("5th pass: fallback to default vhost")
        for srv in candidates:
            if srv.is_default_server_name() or srv.is_default_server(ip, port):
                return srv

        return None