def test_counting_scanner_state_has_right_result(self):
    """Run a CountingScanner over a sample text and check counter 0.

    The scanner is built from the pattern ``[a-z]+`` with ``\\s`` as the
    separator; the final state is expected to report 3 for index 0.
    """
    word_fsm = pire.Lexer("[a-z]+").Parse()
    whitespace_fsm = pire.Lexer(r"\s").Parse()
    scanner = pire.CountingScanner(pattern=word_fsm, sep=whitespace_fsm)

    text = "abc def, abc def ghi, abc"
    final_state = scanner.InitState().Begin().Run(text).End()

    assert final_state.Result(0) == 3
def test_glued_counting_scanner_state_has_right_results(self):
    """Glue two CountingScanners and check each one's counter separately.

    Both scanners share the same ``.*`` separator FSM; after running the
    glued scanner, index 0 (letters) must be 4 and index 1 (digits) must be 2.
    """
    separator_fsm = pire.Lexer(".*").Parse()

    def make_counter(pattern):
        # Every counter reuses the single separator FSM parsed above.
        return pire.CountingScanner(pire.Lexer(pattern).Parse(), separator_fsm)

    letters_counter = make_counter("[a-z]+")
    digits_counter = make_counter("[0-9]+")

    state = letters_counter.GluedWith(digits_counter).InitState()
    state.Begin().Run("abc defg 123 jklmn 4567 opqrst").End()

    assert state.Result(0) == 4
    assert state.Result(1) == 2
def test_fsm_supports_fluent_inplace_operations(self, scanner_class, parse_scanner):
    """Assemble an Fsm via fluent/in-place calls and compare it to a parsed regexp.

    The Fsm built step by step below is expected to behave like the scanner
    parsed from ``((a.)*(b.*)|(d{3}))&~(.*c)`` on every sample string.
    """
    fsm_a = pire.Fsm().Append("a").AppendDot()

    fsm_b = pire.Fsm()
    fsm_b.Append("b")  # deliberately not chained: exercises the plain in-place call

    fsm_d = pire.Fsm().Append("d")
    fsm_d *= 3

    fsm_c = pire.Lexer("c").Parse()

    # The statements below are order-dependent: each one updates `combined`.
    combined = fsm_a.Iterate()
    combined += fsm_b.AppendAnything()
    combined |= fsm_d
    combined &= fsm_c.PrependAnything().Complement()

    reference = parse_scanner("((a.)*(b.*)|(d{3}))&~(.*c)", "a")
    samples = [
        "ddd", "dddc", "a-b--c", "a-a-b--",
        "bdddc", "bddd", "", "b", "bc", "c",
    ]
    check_equivalence(reference, scanner_class(combined), samples)
def test_lexer_glues_similar_glyphs(self):
    """Check that look-alike Cyrillic and Latin spellings accept each other.

    Each spelling is compiled with ``UTF8 | GLUE_SIMILAR_GLYPHS``; the
    resulting scanner must accept both the Latin string and the UTF-8
    encoded look-alike.
    """
    lookalike = u"rеgехр"  # 'е', 'х' and 'р' are cyrillic
    latin = "regexp"  # all latin1
    flags = pire.UTF8 | pire.GLUE_SIMILAR_GLYPHS

    for spelling in (lookalike, latin):
        fsm = pire.Lexer(spelling, flags).Parse()
        check_scanner(
            fsm.Compile(),
            accepts=[latin, lookalike.encode("utf8")],
        )
def test_scanner_finds_prefixes_and_suffixes(self, scanner_class):
    """Check Longest/Shortest Prefix/Suffix offsets around the ``-->`` marker.

    Three scanners are built from the ``-->`` Fsm: one prefixed with the
    complement of ``MakeFalse()``, one prefixed with the complement of
    ``Surrounded()``, and one from the reversed Fsm used for suffix queries.
    """
    arrow = pire.Lexer("-->").Parse()
    anywhere = scanner_class(~pire.Fsm.MakeFalse() + arrow)
    first_only = scanner_class(~arrow.Surrounded() + arrow)
    backwards = scanner_class(arrow.Reverse())

    text = "1234567890 --> middle --> end"
    # In `text` the first "-->" spans [11, 14) and the second spans [22, 25).
    first_start, first_end = 11, 14
    second_start, second_end = 22, 25

    # Longest matches.
    assert first_only.LongestPrefix(text) == first_end
    assert backwards.LongestSuffix(text[:first_end]) == first_start
    assert anywhere.LongestPrefix(text) == second_end
    assert backwards.LongestSuffix(text[:second_end]) == second_start

    # Shortest matches: both prefix scanners are expected to stop at the first arrow.
    assert first_only.ShortestPrefix(text) == first_end
    assert backwards.ShortestSuffix(text[:first_end]) == first_start
    assert anywhere.ShortestPrefix(text) == first_end
    assert backwards.ShortestSuffix(text[:first_end]) == first_start
def test_fsm_supports_nonmodifying_operations(self, scanner_class, parse_scanner):
    """Combine Fsms with operators and compare against a parsed regexp.

    The operator expression below is expected to behave like the scanner
    parsed from ``((ab*)|(.*c.*)|(d{4}))&~e`` on every sample string.
    """
    fsm_a, fsm_b, fsm_c, fsm_d, fsm_e = (
        pire.Lexer(letter).Parse() for letter in "abcde"
    )

    # Built in steps, preserving the evaluation order of the one-line form.
    alternatives = (fsm_a + fsm_b.Iterated()) | fsm_c.Surrounded()
    alternatives = alternatives | (2 * (fsm_d * 2))
    expression = alternatives & ~fsm_e

    reference = parse_scanner("((ab*)|(.*c.*)|(d{4}))&~e", "a")
    samples = [
        "a", "abbbb", "c", "--c-", "dddd",
        "--", "e", "-ee-", "",
    ]
    check_equivalence(reference, scanner_class(expression), samples)
def test_gluing_too_many_scanners_raises(self, scanner_class):
    """Gluing a long series of scanners together must raise OverflowError.

    Scanner classes listed in SCANNERS_WITHOUT_GLUE are not exercised.
    Each pattern is anchored with ``^`` and suffixed with ``.*`` before it
    is compiled and glued onto the accumulated scanner.
    """
    if scanner_class in SCANNERS_WITHOUT_GLUE:
        return

    # Raw strings keep regexp escapes such as ``\?`` intact without relying
    # on Python passing unknown escape sequences through unchanged.
    # Every entry ends with a comma so that adjacent literals are never
    # implicitly concatenated into a single pattern.
    many_patterns = [
        r'/product/',
        r'/catalog/',
        r'/?(\?.*)?$',
        r'/.*/a',
        r'/.*/b',
        r'/.*/c',
        r'/.*/d',
        r'/.*/e',
        r'/.*/f',
        r'/.*/g',
        r'/.*/1234567891011',
        r'/.*/qwertyuiopasdfgh',
        r'/.*/do_it_yourself/',
        r'/.*/doityourself/',
    ]

    with pytest.raises(OverflowError):
        scanner = scanner_class()
        for pattern in many_patterns:
            new_scanner = pire.Lexer("^" + pattern + ".*").Parse().Compile(scanner_class)
            scanner = scanner.GluedWith(new_scanner)
        # Reached only if no glue step raised; pytest.raises then fails the
        # test anyway, and these checks describe the glued scanner's state.
        assert not scanner.Empty()
        assert scanner.RegexpsCount() == len(many_patterns)
def test_capturing_trivial(self):
    """
    This is the "Trivial" test from tests/capture_ut.cpp.

    A case-insensitive pattern with capturing group 1 is surrounded and
    compiled into a CapturingScanner; the captured bounds are then used to
    slice the quoted id back out of the input text.
    """
    lexer = pire.Lexer(r"""google_id\s*=\s*['"]([a-z0-9]+)['"]\s*;""")
    fsm = lexer.AddOptions(pire.I).AddCapturing(1).Parse()
    scanner = fsm.Surround().Compile(pire.CapturingScanner)

    def capture(text):
        # Run a fresh state over `text` and return whatever was captured.
        return scanner.InitState().Begin().Run(text).End().Captured()

    matching_texts = (
        "google_id = 'abcde';",
        "var google_id = 'abcde'; eval(google_id);",
    )
    for text in matching_texts:
        captured = capture(text)
        assert captured
        # The test shifts both captured bounds down by one before slicing.
        begin, end = captured[0] - 1, captured[1] - 1
        assert text[begin:end] == "abcde"

    # "!=" instead of "=" must not match, so nothing is captured.
    assert capture("google_id != 'abcde';") is None
def scanner_factory(pattern, options=""):
    """Parse `pattern` with `options` via pire.Lexer and wrap the Fsm in scanner_class.

    `scanner_class` is taken from the enclosing scope.
    """
    return scanner_class(pire.Lexer(pattern, options).Parse())
def test_lexer_raises_on_parsing_invalid_regexp(self):
    """Parsing the unterminated character class ``[ab`` must raise ValueError."""
    # The Lexer is constructed outside the check, so only Parse() is
    # allowed to raise.
    unterminated = pire.Lexer("[ab")
    pytest.raises(ValueError, unterminated.Parse)
def test_lexer_cannot_be_constructed_with_wrong_argument(self):
    """Constructing a Lexer from an integer must raise TypeError."""
    not_a_pattern = 42
    with pytest.raises(TypeError):
        pire.Lexer(not_a_pattern)
def test_lexer_default_constructible(self):
    """A Lexer built with no arguments still parses into exactly a pire.Fsm."""
    parsed = pire.Lexer().Parse()
    assert type(parsed) == pire.Fsm