def testLogLine(self):
    """Check an Apache-style log line against the logline BNF grammar."""
    repository = {
        'space': String(' '),
        'integer': RegularExpression("^[0123456789]*$"),
        # Raw string so "\." stays a regex escape without triggering
        # Python's invalid-escape DeprecationWarning (same pattern bytes).
        'ipv4': RegularExpression(
            r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"),
        # NOTE(review): "[A-z]" also matches the punctuation between 'Z'
        # and 'a' ("[\]^_`"); "[A-Za-z]" was probably intended -- TODO
        # confirm before changing the pattern.
        'characters': RegularExpression("^[A-z]+$"),
    }
    grammar = load_bnf_file("pydsl/contrib/grammar/logline.bnf", repository)
    checker = checker_factory(grammar)
    original_string = "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GET\" 1 1 \"referer\" \"useragent\""
    tokenized = lex(grammar.alphabet, ascii_encoding, original_string,
                    force_lexer="general")
    # The tokenized valid line passes; a line with an unknown verb fails.
    self.assertTrue(checker.check(tokenized))
    self.assertFalse(checker.check(
        "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GOT\" 1 1 \"referer\" \"useragent\""))
Exemple #2
0
    def nextToken(self):
        """Yield the token sequence with the fewest tokens covering the input.

        Runs the extract algorithm: for every grammar definition in the
        alphabet, record each [left, right) slice of self.string that the
        definition's checker accepts, then pick the complete covering
        sequence with the fewest tokens.

        Raises:
            Exception: if no slice matched, or no sequence covers the input.
        """
        tree = PositionResultList()  # This is the extract algorithm
        for gd in self.alphabet:
            checker = checker_factory(gd)
            for left in range(0, len(self.string)):
                # Cap the candidate slice length at the grammar's maxsize,
                # when the definition declares one.
                if getattr(gd, 'maxsize', None):
                    max_right = left + gd.maxsize + 1
                else:
                    max_right = len(self.string) + 1
                for right in range(left + 1, min(max_right, len(self.string) + 1)):
                    piece = self.string[left:right]  # renamed from 'slice' (shadowed builtin)
                    if checker.check(piece):
                        tree.append(left, right, piece, gd, check_position=False)
        if not tree:
            raise Exception("Nothing consumed")

        # Keep only the sequences that consume the entire input string.
        right_length_seq = []
        for x in tree.valid_sequences():
            my_list = list(x)
            if my_list[-1]['right'] == len(self.string):
                right_length_seq.append(my_list)
        if not right_length_seq:
            raise Exception("No sequence found for input %s alphabet %s" % (self.string, self.alphabet))
        for y in sorted(right_length_seq, key=len)[0]:  # Always gets the match with less tokens
            yield Token(y['content'], y.get('gd'))
Exemple #3
0
 def testCheck(self):
     """Test checker instantiation and call"""
     from pydsl.grammar.definition import JsonSchema
     from pydsl.check import JsonSchemaChecker
     schema = {
         "type": "object",
         "required": ["foo"],
         "properties": {
             "foo": {"enum": [1, 3]},
             "bar": {"format": "number_three"}  # Ignored by jsonschema
         }
     }
     # Without format checkers, only the enum/required constraints apply.
     checker = JsonSchemaChecker(JsonSchema(schema))
     self.assertFalse(checker.check("a"))
     for good in (1, 3):
         self.assertTrue(checker.check({"foo": good}))
     self.assertFalse(checker.check({"foo": 2}))
     bad_doc = [1, {"foo": 2, "bar": {"baz": [1]}}, "quux"]
     self.assertFalse(checker.check(bad_doc))
     self.assertRaises(Exception, checker.check, bad_doc, raise_exceptions=True)
     # With an explicit format checker the "number_three" format is enforced.
     format_checkers = {"number_three": checker_factory(String("3"))}
     checker = JsonSchemaChecker(JsonSchema(schema), format_checkers)  # Adds a format checker
     self.assertFalse(checker.check({"foo": 1, "bar": "123456"}))
     self.assertTrue(checker.check({"foo": 1, "bar": "3"}))
Exemple #4
0
 def testCheck(self):
     """Test checker instantiation and call"""
     from pydsl.grammar.definition import JsonSchema
     from pydsl.check import JsonSchemaChecker
     schema = {
         "type": "object",
         "required": ["foo"],
         "properties": {
             "foo": {"enum": [1, 3]},
             # The custom format is inert unless a format checker is given.
             "bar": {"format": "number_three"},
         },
     }
     plain_checker = JsonSchemaChecker(JsonSchema(schema))
     self.assertFalse(plain_checker.check("a"))
     self.assertTrue(plain_checker.check({"foo": 1}))
     self.assertFalse(plain_checker.check({"foo": 2}))
     self.assertTrue(plain_checker.check({"foo": 3}))
     nested = [1, {"foo": 2, "bar": {"baz": [1]}}, "quux"]
     self.assertFalse(plain_checker.check(nested))
     self.assertRaises(Exception, plain_checker.check, nested,
                       raise_exceptions=True)
     # Supplying a format checker makes "number_three" meaningful.
     formats = {"number_three": checker_factory(String("3"))}
     format_checker = JsonSchemaChecker(JsonSchema(schema), formats)
     self.assertFalse(format_checker.check({"foo": 1, "bar": "123456"}))
     self.assertTrue(format_checker.check({"foo": 1, "bar": "3"}))
Exemple #5
0
    def nextToken(self, include_gd=False):
        """Yield the fewest-token sequence that covers the whole input
        string (the extract algorithm). When include_gd is true, each
        Token carries the grammar definition that matched it."""
        tree = PositionResultList()
        # Phase 1: collect every (start, end, grammar) slice accepted by
        # some grammar definition's checker.
        matches = []
        for grammar in self.alphabet:
            accepts = checker_factory(grammar)
            for start in range(len(self.string)):
                for end in range(start + 1, len(self.string) + 1):
                    if accepts.check(self.string[start:end]):
                        matches.append((start, end, grammar))
        if not matches:
            raise Exception("Nothing consumed")
        # Phase 2: feed the collected slices into the position tree.
        for start, end, grammar in matches:
            tree.append(start, end, self.string[start:end], grammar,
                        check_position=False)

        # Keep only sequences that reach the end of the input.
        complete = [seq for seq in tree.valid_sequences()
                    if seq[-1]['right'] == len(self.string)]
        if not complete:
            raise Exception("No sequence found for input %s alphabet %s" % (self.string, self.alphabet))
        # The shortest complete sequence wins.
        for item in min(complete, key=len):
            yield Token(item['content'], item.get('gd') if include_gd else None)
Exemple #6
0
    def nextToken(self):
        """Yield the token sequence with the fewest tokens covering the input.

        The extract algorithm: for each grammar definition in the alphabet,
        record every [left, right) slice its checker accepts, then choose
        the complete covering sequence with the fewest tokens.

        Raises:
            Exception: if no slice matched, or no sequence covers the input.
        """
        tree = PositionResultList()  # This is the extract algorithm
        for gd in self.alphabet:
            checker = checker_factory(gd)
            for left in range(0, len(self.string)):
                # Cap the candidate slice length at the grammar's maxsize.
                if getattr(gd, 'maxsize', None):
                    max_right = left + gd.maxsize + 1
                else:
                    max_right = len(self.string) + 1
                for right in range(left + 1,
                                   min(max_right,
                                       len(self.string) + 1)):
                    piece = self.string[left:right]  # renamed from 'slice' (shadowed builtin)
                    if checker.check(piece):
                        tree.append(left,
                                    right,
                                    piece,
                                    gd,
                                    check_position=False)
        if not tree:
            raise Exception("Nothing consumed")

        # Keep only sequences that consume the entire input.
        right_length_seq = []
        for x in tree.valid_sequences():
            my_list = list(x)
            if my_list[-1]['right'] == len(self.string):
                right_length_seq.append(my_list)
        if not right_length_seq:
            raise Exception("No sequence found for input %s alphabet %s" %
                            (self.string, self.alphabet))
        for y in sorted(right_length_seq,
                        key=len)[0]:  # Always gets the match with less tokens
            yield Token(y['content'], y.get('gd'))
Exemple #7
0
 def testEcho(self):
     """A PythonTranslator wrapping an identity function echoes its input.

     Removed the unused `cstring` checker (and its imports): the local was
     never referenced after assignment.
     """
     from pydsl.translator import translate, PythonTranslator

     def function(my_input):
         return my_input

     pt = PythonTranslator(function)
     self.assertEqual(translate(pt, {'my_input': "1234"}), "1234")
Exemple #8
0
 def testDate(self):
     """Parse and translate dates with a parsley grammar file."""
     from pydsl.file.parsley import load_parsley_grammar_file
     # DayOfMonth loaded as checker
     repository = {
         'DayOfMonth': load_python_file('pydsl/contrib/grammar/DayOfMonth.py'),
     }
     grammar = load_parsley_grammar_file(
         "pydsl/contrib/grammar/Date.parsley", "expr", repository)
     check = checker_factory(grammar)
     translate = translator_factory(grammar)
     self.assertTrue(check("2/4/12"))
     self.assertEqual(translate("2/4/12"), (2, 4, 12))
     # Day 40 is rejected by the DayOfMonth checker.
     self.assertRaises(parsley.ParseError, translate, "40/4/12")
Exemple #9
0
 def __init__(self, rules, root_rule="expr", repository=None):
     """Build a parsley grammar from rule text.

     Any Grammar instance in the repository is wrapped as a checker
     before being exposed to parsley's generated grammar.
     """
     import parsley
     Grammar.__init__(self)
     bindings = dict(repository or {})
     for name in bindings:
         value = bindings[name]
         if isinstance(value, Grammar):
             bindings[name] = checker_factory(value)
     self.grammar = parsley.makeGrammar(rules, bindings)
     self.root_rule = root_rule
Exemple #10
0
 def testChecker(self):
     """Alphabet checker accepts tokenized matches and rejects the rest."""
     checker = checker_factory(Alphabet([self.integer, self.date]))
     # Integers pass whether given as a string or as a character list.
     self.assertTrue(checker.check("1234"))
     self.assertTrue(checker.check(list("1234")))
     # A raw date string is not tokenized, so it must fail...
     self.assertFalse(checker.check("11/11/1991"))  # Non tokenized input
     self.assertFalse(checker.check(list("11/11/1991")))  # Non tokenized input
     # ...but the pre-tokenized form is accepted.
     self.assertTrue(checker.check(["11", "/", "11", "/", "1991"]))  # tokenized input
     self.assertFalse(checker.check("bcdf"))
     self.assertFalse(checker.check(list("bcdf")))
Exemple #11
0
 def testChecker(self):
     """Choice checker accepts tokenized matches and rejects the rest."""
     checker = checker_factory(Choice([self.integer, self.date]))
     self.assertTrue(checker.check("1234"))
     self.assertTrue(checker.check(list("1234")))
     self.assertFalse(checker.check("11/11/1991"))  # Non tokenized input
     self.assertFalse(checker.check(list("11/11/1991")))  # Non tokenized input
     # The same date passes once split into Tokens.
     tokens = [Token(part, ascii_encoding)
               for part in ["11", "/", "11", "/", "1991"]]
     self.assertTrue(checker.check(tokens))
     self.assertFalse(checker.check("bcdf"))
     self.assertFalse(checker.check(list("bcdf")))
Exemple #12
0
 def testLogLine(self):
     """Check an Apache-style log line against the logline BNF grammar."""
     repository = {
         'space': String(' '),
         'integer': RegularExpression("^[0123456789]*$"),
         # Raw string keeps "\." as a regex escape without triggering
         # Python's invalid-escape DeprecationWarning (same pattern bytes).
         'ipv4': RegularExpression(r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"),
         # NOTE(review): "[A-z]" also matches "[\]^_`" between 'Z' and 'a';
         # "[A-Za-z]" was probably intended -- TODO confirm.
         'characters': RegularExpression("^[A-z]+$"),
     }
     grammar = load_bnf_file("pydsl/contrib/grammar/logline.bnf", repository)
     checker = checker_factory(grammar)
     original_string = "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GET\" 1 1 \"referer\" \"useragent\""
     tokenized = lex(grammar.alphabet, ascii_encoding, original_string, force_lexer="general")
     self.assertTrue(checker.check(tokenized))
     self.assertFalse(checker.check("1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GOT\" 1 1 \"referer\" \"useragent\""))
Exemple #13
0
 def testChecker(self):
     """Choice-of-grammars checker: tokenized input passes, raw fails."""
     checker = checker_factory(Choice([self.integer, self.date]))
     self.assertTrue(checker.check("1234"))
     self.assertTrue(checker.check(list("1234")))
     self.assertFalse(checker.check("11/11/1991"))  # Non tokenized input
     self.assertFalse(checker.check(list("11/11/1991")))  # Non tokenized input
     # Pre-tokenized dates are accepted.
     parts = ["11", "/", "11", "/", "1991"]
     self.assertTrue(checker.check([Token(p, ascii_encoding) for p in parts]))
     self.assertFalse(checker.check("bcdf"))
     self.assertFalse(checker.check(list("bcdf")))
Exemple #14
0
 def nextToken(self):
     """Consume and return exactly one single-character Token.

     The character at the current index must be accepted by exactly one
     grammar definition in the alphabet.

     Raises:
         Exception: if no definition matches, or more than one does.

     Removed the unused locals `best_right` / `best_gd` (never read).
     """
     current = self.string[self.index:self.index + 1]
     matches = [gd for gd in self.alphabet
                if checker_factory(gd).check(current)]
     if not matches:
         raise Exception("Unmatched Token")
     elif len(matches) > 1:
         raise Exception("Too many matches")
     result = Token(current, matches[0])
     self.index += 1
     return result
Exemple #15
0
 def nextToken(self):
     """Consume and return one single-character Token.

     Exactly one grammar definition in the alphabet must accept the
     character at the current index; zero or multiple matches raise.

     Removed the unused locals `best_right` / `best_gd` (never read).
     """
     current = self.string[self.index:self.index + 1]
     matches = []
     for gd in self.alphabet:
         if checker_factory(gd).check(current):
             matches.append(gd)
     if not matches:
         raise Exception("Unmatched Token")
     elif len(matches) > 1:
         raise Exception("Too many matches")
     result = Token(current, matches[0])
     self.index += 1
     return result
Exemple #16
0
 def testSecondLevelGrammar(self):
     """A grammar built over another grammar's alphabet checks and lexes."""
     a, b, c = String("a"), String("b"), String("c")
     x, y, z = String("x"), String("y"), String("z")
     first_level = Choice([a, b, c])
     first_levelb = Choice([x, y, z])
     second_level = Sequence([a, b], base_alphabet=first_level)
     from pydsl.check import checker_factory
     checker = checker_factory(second_level)
     self.assertTrue(checker([a, b]))
     # Lexing "ab" over the combined alphabet yields first-level tokens.
     second_level_alphabet = Choice([first_level, first_levelb])
     lexer = lexer_factory(second_level_alphabet,
                           base=first_level + first_levelb)
     expected = [Token("a", first_level), Token("b", first_level)]
     self.assertListEqual(lexer("ab"), expected)
Exemple #17
0
 def nextToken(self, include_gd=False):
     """Consume the longest match starting at self.index.

     Tries every grammar definition in the alphabet and keeps the one that
     matches the longest prefix of the remaining input. Returns the matched
     substring, or (substring, grammar) when include_gd is true.

     Raises:
         Exception: if nothing matches at the current position.
     """
     best_right = 0
     best_gd = None
     left = self.index
     for gd in self.alphabet:
         checker = checker_factory(gd)
         for right in range(left + 1, len(self.string) + 1):
             if checker.check(self.string[left:right]):  # TODO: Use match
                 if right > best_right:
                     best_right = right
                     best_gd = gd
     if not best_gd:
         raise Exception("Nothing consumed")
     if include_gd:
         result = self.string[left:best_right], best_gd
     else:
         result = self.string[left:best_right]
     # BUG FIX: advance past the *best* match. The original assigned the
     # loop variable `right` (last probe of the last grammar tried, and
     # unbound when the loop never ran) instead of best_right.
     self.index = best_right
     return result
Exemple #18
0
 def testSecondLevelGrammar(self):
     """A sequence over a Choice alphabet both checks and lexes."""
     a, b, c = String("a"), String("b"), String("c")
     x, y, z = String("x"), String("y"), String("z")
     lower = Choice([a, b, c])
     other = Choice([x, y, z])
     second_level = Sequence([a, b], base_alphabet=lower)
     from pydsl.check import checker_factory
     self.assertTrue(checker_factory(second_level)([a, b]))
     # Lexing over the combined alphabet produces lower-level tokens.
     combined = Choice([lower, other])
     lexer = lexer_factory(combined, base=lower + other)
     self.assertListEqual(lexer("ab"),
                          [Token("a", lower), Token("b", lower)])
Exemple #19
0
def extract(grammar, inputdata, fixed_start=False, return_first=False):
    """
    Receives a sequence and a grammar,
    returns a list of PositionTokens with all of the parts of the sequence
    that are recognized by the grammar.

    fixed_start: only consider matches beginning at position 0.
    return_first: return the first PositionToken found instead of a list.
    """
    if not inputdata:
        return []
    checker = checker_factory(grammar)

    # Work on raw content when the input is already tokenized.
    if isinstance(inputdata[0], (Token, PositionToken)):
        inputdata = [x.content for x in inputdata]

    totallen = len(inputdata)
    try:
        maxl = grammar.maxsize or totallen
    except NotImplementedError:
        maxl = totallen
    #minl = grammar.minsize #FIXME: It won't work with incompatible alphabets
    minl = 1  # dead try/except removed: a literal assignment cannot raise
    if fixed_start:
        max_start = 1
    else:
        max_start = totallen
    result = []
    for i in range(max_start):
        for j in range(i + minl, min(i + maxl, totallen) + 1):
            if checker.check(inputdata[i:j]):
                this_pt = PositionToken(inputdata[i:j], None, i, j)
                if return_first:
                    return this_pt
                result.append(this_pt)
    return result
Exemple #20
0
def extract(grammar, inputdata, fixed_start=False, return_first=False):
    """
    Receives a sequence and a grammar,
    returns a list of PositionTokens with all of the parts of the sequence
    that are recognized by the grammar.

    fixed_start: only consider matches beginning at position 0.
    return_first: return the first PositionToken found instead of a list.
    """
    if not inputdata:
        return []
    checker = checker_factory(grammar)

    # Removed unused local import (pydsl.grammar.PEG.Choice was never used).
    totallen = len(inputdata)
    try:
        maxl = grammar.maxsize or totallen
    except NotImplementedError:
        maxl = totallen
    #minl = grammar.minsize #FIXME: It won't work with incompatible alphabets
    minl = 1  # dead try/except removed: a literal assignment cannot raise
    if fixed_start:
        max_start = 1
    else:
        max_start = totallen
    result = []
    for i in range(max_start):
        for j in range(i + minl, min(i + maxl, totallen) + 1):
            piece = inputdata[i:j]  # renamed from 'slice' (shadowed builtin)
            if checker.check(piece):
                this_pt = PositionToken(piece, grammar, i, j)
                if return_first:
                    return this_pt
                result.append(this_pt)
    return result