def nextToken(self, include_gd=False):
    """Tokenize ``self.string`` using the extract algorithm.

    Every grammar definition in the alphabet is checked against every
    substring; the alternatives that pass are loaded into a position
    tree, and the shortest valid sequence covering the whole input is
    yielded as a stream of Tokens.

    :param include_gd: when True each Token carries the grammar
        definition that matched it; otherwise its gd slot is None.
    :raises Exception: if no substring matches, or no sequence of
        matches covers the full input.
    """
    tree = PositionResultList()  # This is the extract algorithm
    valid_alternatives = []
    for gd in self.alphabet:
        checker = checker_factory(gd)
        for left in range(len(self.string)):
            for right in range(left + 1, len(self.string) + 1):
                if checker.check(self.string[left:right]):
                    valid_alternatives.append((left, right, gd))
    if not valid_alternatives:
        raise Exception("Nothing consumed")
    for left, right, gd in valid_alternatives:
        tree.append(left, right, self.string[left:right], gd, check_position=False)
    right_length_seq = [x for x in tree.valid_sequences()
                        if x[-1]['right'] == len(self.string)]
    if not right_length_seq:
        raise Exception("No sequence found for input %s alphabet %s" % (self.string, self.alphabet))
    # min() picks the same first-shortest sequence that sorted(...)[0]
    # did, in O(n) instead of O(n log n) — always the match with fewer tokens.
    for y in min(right_length_seq, key=len):
        if include_gd:
            yield Token(y['content'], y.get('gd'))
        else:
            yield Token(y['content'], None)
def _load_checker(originaldic):
    """Convert a {"channelname": type} mapping into {"channelname": checker instance}.

    :param originaldic: mapping of channel name -> grammar definition spec.
    :returns: dict with the same keys, each value replaced by a checker.
    """
    from pydsl.Check import checker_factory  # hoisted: was re-imported on every iteration
    result = {}
    for key in originaldic:
        # FIXME: load is no longer available
        result[key] = checker_factory(load(str(originaldic[key])))
    return result
def nextToken(self, include_gd=False):
    """Yield Tokens for ``self.string`` (extract algorithm).

    Collects every (start, end, gd) slice accepted by any checker in
    the alphabet, feeds them into a position tree, and yields the
    shortest sequence that spans the whole string.
    """
    length = len(self.string)
    matches = []
    for gd in self.alphabet:
        checker = checker_factory(gd)
        for start in range(length):
            for end in range(start + 1, length + 1):
                if checker.check(self.string[start:end]):
                    matches.append((start, end, gd))
    if not matches:
        raise Exception("Nothing consumed")
    tree = PositionResultList()  # This is the extract algorithm
    for start, end, gd in matches:
        tree.append(start, end, self.string[start:end], gd, check_position=False)
    covering = [seq for seq in tree.valid_sequences() if seq[-1]['right'] == length]
    if not covering:
        raise Exception("No sequence found for input %s alphabet %s" % (self.string, self.alphabet))
    # Always gets the match with less tokens
    for entry in sorted(covering, key=len)[0]:
        gd_value = entry.get('gd') if include_gd else None
        yield Token(entry['content'], gd_value)
def extract(grammar, inputdata, fixed_start=False):
    """
    Receives a sequence and a grammar, returns a list of PositionTokens
    with all of the parts of the sequence that are recognized by the grammar.

    :param grammar: grammar definition used to build a checker.
    :param inputdata: string, or sequence of Token/PositionToken (their
        ``content`` is extracted before checking).
    :param fixed_start: when True, only matches starting at index 0 count.
    """
    if not inputdata:
        return []
    checker = checker_factory(grammar)
    if isinstance(inputdata[0], (Token, PositionToken)):
        inputdata = [x.content for x in inputdata]
    totallen = len(inputdata)
    try:
        maxl = grammar.maxsize or totallen
    except NotImplementedError:
        maxl = totallen
    # minl = grammar.minsize  # FIXME: It won't work with incompatible alphabets
    # (the dead try/except around this constant assignment was removed:
    # `minl = 1` can never raise NotImplementedError)
    minl = 1
    max_start = 1 if fixed_start else totallen
    result = []
    for i in range(max_start):
        for j in range(i + minl, min(i + maxl, totallen) + 1):
            if checker.check(inputdata[i:j]):
                result.append(PositionToken(inputdata[i:j], None, i, j))
    return result
def testLogLine(self):
    """Check the logline BNF grammar against a lexed valid line and a raw invalid one."""
    repository = {
        'space': RegularExpression("^ $"),
        'integer': RegularExpression("^[0123456789]*$"),
        # raw string: "\." inside a plain literal is an invalid escape
        # sequence (DeprecationWarning, future SyntaxError); value unchanged
        'ipv4': RegularExpression(
            r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"),
        # NOTE(review): [A-z] also matches the chars between Z and a
        # ([, \, ], ^, _, `) — possibly [A-Za-z] was intended; confirm
        'characters': RegularExpression("^[A-z]+$")
    }
    grammar = load_bnf_file("pydsl/contrib/grammar/logline.bnf", repository)
    checker = checker_factory(grammar)
    original_string = "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GET\" 1 1 \"referer\" \"useragent\""
    tokenized = [
        x.content
        for x in lex(grammar.alphabet, ascii_encoding, original_string)
    ]
    self.assertTrue(checker.check(tokenized))
    self.assertFalse(
        checker.check(
            "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GOT\" 1 1 \"referer\" \"useragent\""
        ))
def __init__(self, rules, root_rule, repository=None):
    """Build a parsley grammar from rule text.

    :param rules: parsley grammar source.
    :param root_rule: name of the entry rule.
    :param repository: optional mapping of names available to the rules;
        Grammar values are replaced by checker instances.
    """
    import parsley
    Grammar.__init__(self)
    # was `repository={}`: a mutable default argument is shared
    # across calls and could be mutated by one caller for all others
    repository = repository or {}
    repo = {}
    for k, v in repository.items():
        # The original `(v, checker_factory(v))[isinstance(v, Grammar)]`
        # eagerly called checker_factory even for non-Grammar values;
        # the conditional only builds a checker when needed.
        repo[k] = checker_factory(v) if isinstance(v, Grammar) else v
    self.grammar = parsley.makeGrammar(rules, repo)
    self.root_rule = root_rule
def __init__(self, rules, root_rule, repository=None):
    """Build a parsley grammar from rule text.

    Grammar-valued repository entries are converted to checkers;
    everything else is passed through untouched.
    """
    import parsley
    Grammar.__init__(self)
    # Fix: mutable default argument ({} shared across all calls)
    repo = {}
    for k, v in (repository or {}).items():
        if isinstance(v, Grammar):
            # only build a checker when v is a Grammar — the original
            # tuple-index trick called checker_factory unconditionally
            v = checker_factory(v)
        repo[k] = v
    self.grammar = parsley.makeGrammar(rules, repo)
    self.root_rule = root_rule
def testFileLoader(self):
    """Load the Date.parsley grammar: checker accepts, translator parses, bad day raises."""
    #DayOfMonth loaded as checker
    repository = {'DayOfMonth': load_python_file('pydsl/contrib/grammar/DayOfMonth.py')}
    grammar = load_parsley_grammar_file("pydsl/contrib/grammar/Date.parsley", "expr", repository)
    accepts = checker_factory(grammar)
    to_tuple = translator_factory(grammar)
    self.assertTrue(accepts("2/4/12"))
    self.assertEqual(to_tuple("2/4/12"), (2, 4, 12))
    self.assertRaises(parsley.ParseError, to_tuple, "40/4/12")
def testEcho(self):
    """A PythonTranslator wrapping an identity function echoes its input."""
    from pydsl.Translator import translate, PythonTranslator
    from pydsl.Grammar.Definition import RegularExpression
    from pydsl.Check import checker_factory
    match_anything = checker_factory(RegularExpression('.*'))

    def echo(my_input):
        return my_input

    translator = PythonTranslator({'my_input': match_anything},
                                  {'output': match_anything},
                                  echo)
    self.assertEqual(translate(translator, {'my_input': "1234"}), "1234")
def __init__(self, rules, root_rule="expr", repository=None):
    """Build a parsley grammar from rule text.

    Grammar-valued repository entries are swapped for checker instances
    before the rules are compiled.
    """
    import parsley
    Grammar.__init__(self)
    repo = dict(repository or {})
    grammar_keys = [name for name, value in repo.items()
                    if isinstance(value, Grammar)]
    for name in grammar_keys:
        repo[name] = checker_factory(repo[name])
    self.grammar = parsley.makeGrammar(rules, repo)
    self.root_rule = root_rule
def testLogLine(self):
    """Check the logline BNF grammar against a valid and an invalid log line."""
    # raw string for ipv4: "\." in a plain literal is an invalid escape
    # sequence (DeprecationWarning, future SyntaxError); value unchanged
    repository = {'space': RegularExpression("^ $"),
                  'integer': RegularExpression("^[0123456789]*$"),
                  'ipv4': RegularExpression(r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"),
                  'characters': RegularExpression("^[A-z]+$")}  # NOTE(review): [A-z] also matches [\]^_` — confirm intent
    grammar = load_bnf_file("pydsl/contrib/grammar/logline.bnf", repository)
    checker = checker_factory(grammar)
    self.assertTrue(checker.check("1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GET\" 1 1 \"referer\" \"useragent\""))
    self.assertFalse(checker.check("1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GOT\" 1 1 \"referer\" \"useragent\""))
def testChecker(self):
    """GrammarCollection alphabet: plain strings vs tokenized lists."""
    collection = GrammarCollection([self.integer, self.date])
    checker = checker_factory(collection)
    self.assertTrue(checker.check("1234"))
    self.assertTrue(checker.check(list("1234")))
    self.assertFalse(checker.check("11/11/1991"))  # Non tokenized input
    self.assertFalse(checker.check(list("11/11/1991")))  # Non tokenized input
    self.assertTrue(checker.check(["11", "/", "11", "/", "1991"]))  # tokenized input
    self.assertFalse(checker.check("bcdf"))
    self.assertFalse(checker.check(list("bcdf")))
def testLogLine(self):
    """Lex a log line with the grammar's alphabet, then check the token list."""
    # raw string for ipv4: "\." in a plain literal is an invalid escape
    # sequence (DeprecationWarning, future SyntaxError); value unchanged
    repository = {'space': RegularExpression("^ $"),
                  'integer': RegularExpression("^[0123456789]*$"),
                  'ipv4': RegularExpression(r"^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$"),
                  'characters': RegularExpression("^[A-z]+$")}  # NOTE(review): [A-z] also matches [\]^_` — confirm intent
    grammar = load_bnf_file("pydsl/contrib/grammar/logline.bnf", repository)
    checker = checker_factory(grammar)
    original_string = "1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GET\" 1 1 \"referer\" \"useragent\""
    tokenized = [x.content for x in lex(grammar.alphabet, ascii_encoding, original_string)]
    self.assertTrue(checker.check(tokenized))
    self.assertFalse(checker.check("1.2.3.4 - - [1/1/2003:11:11:11 +2] \"GOT\" 1 1 \"referer\" \"useragent\""))
def testFileLoader(self):
    """Date.parsley grammar loaded from file: check, translate, and reject a bad day."""
    import parsley
    from pydsl.File.Parsley import load_parsley_grammar_file
    #DayOfMonth loaded as checker
    repo = {'DayOfMonth': load_python_file('pydsl/contrib/grammar/DayOfMonth.py')}
    grammar = load_parsley_grammar_file(
        "pydsl/contrib/grammar/Date.parsley", "expr", repo)
    accepts = checker_factory(grammar)
    to_tuple = translator_factory(grammar)
    self.assertTrue(accepts("2/4/12"))
    self.assertEqual(to_tuple("2/4/12"), (2, 4, 12))
    self.assertRaises(parsley.ParseError, to_tuple, "40/4/12")
def testSecondLevelGrammar(self):
    """A Sequence over a Choice base alphabet, then lexing with a combined alphabet."""
    from pydsl.Check import checker_factory
    a, b, c = String("a"), String("b"), String("c")
    x, y, z = String("x"), String("y"), String("z")
    first_level = Choice([a, b, c])
    first_levelb = Choice([x, y, z])
    second_level = Sequence([a, b], base_alphabet=first_level)
    checker = checker_factory(second_level)
    self.assertTrue(checker([a, b]))
    second_level_alphabet = Choice([first_level, first_levelb])
    lexer = lexer_factory(second_level_alphabet, base=first_level + first_levelb)
    self.assertListEqual(lexer("ab"), [("a", first_level), ("b", first_level)])
def testSecondLevelGrammar(self):
    """Check a Sequence built over a Choice alphabet and lex through two alphabets."""
    a = String("a")
    b = String("b")
    c = String("c")
    x = String("x")
    y = String("y")
    z = String("z")
    lower_alphabet = Choice([a, b, c])
    lower_alphabet_b = Choice([x, y, z])
    upper_grammar = Sequence([a, b], base_alphabet=lower_alphabet)
    from pydsl.Check import checker_factory
    accepts = checker_factory(upper_grammar)
    self.assertTrue(accepts([a, b]))
    combined = Choice([lower_alphabet, lower_alphabet_b])
    lexer = lexer_factory(combined, base=lower_alphabet + lower_alphabet_b)
    expected = [("a", lower_alphabet), ("b", lower_alphabet)]
    self.assertListEqual(lexer("ab"), expected)
def nextToken(self, include_gd=False):
    """Return the longest prefix of ``self.string`` (from ``self.index``)
    accepted by any grammar definition in the alphabet (maximal munch),
    and advance ``self.index`` past the consumed text.

    :param include_gd: when True return a (text, gd) pair, otherwise
        just the matched text.
    :raises Exception: if no grammar definition accepts any prefix.
    """
    best_right = 0
    best_gd = None
    left = self.index
    for gd in self.alphabet:
        checker = checker_factory(gd)
        for right in range(left + 1, len(self.string) + 1):
            if checker.check(self.string[left:right]):  # TODO: Use match
                if right > best_right:
                    best_right = right
                    best_gd = gd
    if not best_gd:
        raise Exception("Nothing consumed")
    if include_gd:
        result = self.string[self.index:best_right], best_gd
    else:
        result = self.string[self.index:best_right]
    # BUG FIX: advance past the best match. The original did
    # `self.index = right`, reusing the loop variable, which always
    # ends at len(self.string) and so skipped unconsumed input.
    self.index = best_right
    return result
def nextToken(self, include_gd=False):
    """Maximal-munch scan: find the alphabet element whose match from
    ``self.index`` extends furthest right, return the matched text
    (optionally with its grammar definition) and advance the index.

    :raises Exception: if nothing in the alphabet matches any prefix.
    """
    best_right = 0
    best_gd = None
    left = self.index
    for gd in self.alphabet:
        checker = checker_factory(gd)
        for right in range(left + 1, len(self.string) + 1):
            if checker.check(self.string[left:right]):  # TODO: Use match
                if right > best_right:
                    best_right = right
                    best_gd = gd
    if not best_gd:
        raise Exception("Nothing consumed")
    if include_gd:
        result = self.string[self.index:best_right], best_gd
    else:
        result = self.string[self.index:best_right]
    # BUG FIX: was `self.index = right` — the stale loop variable always
    # equals len(self.string) here, jumping the lexer to end-of-input
    # regardless of how much was actually matched.
    self.index = best_right
    return result
def extract(grammar, inputdata, fixed_start=False, return_first=False):
    """
    Receives a sequence and a grammar, returns a list of PositionTokens
    with all of the parts of the sequence that are recognized by the grammar.

    :param fixed_start: when True, only matches starting at index 0 count.
    :param return_first: stop and return as soon as one match is found.
        NOTE(review): in that case a single PositionToken is returned,
        not a list — callers must handle both shapes; kept as-is for
        backward compatibility.
    """
    if not inputdata:
        return []
    checker = checker_factory(grammar)
    if isinstance(inputdata[0], (Token, PositionToken)):
        inputdata = [x.content for x in inputdata]
    totallen = len(inputdata)
    try:
        maxl = grammar.maxsize or totallen
    except NotImplementedError:
        maxl = totallen
    # minl = grammar.minsize  # FIXME: It won't work with incompatible alphabets
    # (dead try/except removed: `minl = 1` cannot raise NotImplementedError)
    minl = 1
    max_start = 1 if fixed_start else totallen
    result = []
    for i in range(max_start):
        for j in range(i + minl, min(i + maxl, totallen) + 1):
            if checker.check(inputdata[i:j]):
                this_pt = PositionToken(inputdata[i:j], None, i, j)
                if return_first:
                    return this_pt
                result.append(this_pt)
    return result
def extract(grammar, inputdata, fixed_start=False):
    """Extract every slice of the input data that belongs to the Grammar Definition.

    :param grammar: grammar definition used to build a checker.
    :param inputdata: sliceable sequence to scan.
    :param fixed_start: when True, only matches starting at index 0 count.
    :returns: list of (start, end, slice) tuples for every accepted slice.
    """
    # Empty-input guard added for consistency with the other extract()
    # variants; the scan below would also yield [] but this skips the
    # checker construction entirely.
    if not inputdata:
        return []
    checker = checker_factory(grammar)
    totallen = len(inputdata)
    try:
        maxl = grammar.maxsize or totallen
    except NotImplementedError:
        maxl = totallen
    # minl = grammar.minsize  # FIXME: It won't work with incompatible alphabets
    # (dead try/except removed: `minl = 1` cannot raise NotImplementedError)
    minl = 1
    max_start = 1 if fixed_start else totallen
    result = []
    for i in range(max_start):
        for j in range(i + minl, min(i + maxl, totallen) + 1):
            if checker.check(inputdata[i:j]):
                result.append((i, j, inputdata[i:j]))
    return result
def testChecker(self):
    """The alphabet's checker accepts integers and dates, rejects letters."""
    checker = checker_factory(self.alphabet)
    for accepted in ("1234", "11/11/1991"):
        self.assertTrue(checker.check(accepted))
    self.assertFalse(checker.check("bcdf"))