def typographify(input):
    """Convert smart-ASCII markup in *input* to HTML-ish tags.

    Run from parent directory.
    >>> import os
    >>> os.chdir('typographify')
    >>> print typographify(open('typographify.txt').read())
    <strong>strong</strong> <em>words</em> parsed 17 chars of 17

    https://pypi.python.org/pypi/SimpleParse/ Version 2.2
    http://www.ibm.com/developerworks/linux/library/l-simple/index.html
    """
    parser = Parser(open('typographify.def').read(), 'para')
    taglist = parser.parse(input)
    # Collect fragments and join once at the end -- avoids the quadratic
    # cost of repeated `text +=` on large inputs.
    pieces = []
    for tag, beg, end, parts in taglist[1]:
        if tag == 'plain':
            pieces.append(input[beg:end])
        elif tag == 'markup':
            markup = parts[0]
            mtag, mbeg, mend = markup[:3]
            start, stop = codes.get(mtag, ('<!-- unknown -->', '<!-- / -->'))
            # mbeg+1 / mend-1 strip the single-character markup delimiters.
            pieces.append(start + input[mbeg + 1:mend - 1] + stop)
    pieces.append('parsed %s chars of %s' % (taglist[-1], len(input)))
    return ''.join(pieces)
def parse(self, *args, **kwargs):
    """Parse VRML source, resolving Inline nodes and dumping prototype classes.

    Delegates to Parser.parse, then (1) flattens the node tree breadth-first,
    (2) parses the file referenced by each not-yet-populated ``Inline`` node
    and grafts its children in place, and (3) writes a generated module of
    prototype classes to /tmp/robotviewer_protos.py as a debugging aid.
    Returns the (success, taglist, next) triple from the underlying parser.
    """
    res = Parser.parse(self, *args, **kwargs)
    # Breadth-first flatten: the list grows while we index through it.
    l = [r for r in res[1] if isinstance(r, Node)]
    count = 0
    while count < len(l):
        l += l[count].children
        count += 1
    for e in l:
        if e.__class__.__name__ == "Inline":
            # Inline file path is taken relative to this parser's root_path.
            url = os.path.join(self.root_path, e.url)
            if e.children[:]:
                continue  # already resolved earlier
            logger.info("Parse inlined vrml {0}".format(url))
            e.children = Parser.parse(self, open(url).read())[1]
            for child in e.children:
                child._parent = e
    # Generate Python source mirroring the known prototypes (debug aid only).
    code = "from parser import Node\n\n"
    for name, prototype in self.prototypes.items():
        obj = prototype()
        # Keep only public, non-callable data attributes; 'children' is
        # structural and therefore excluded.
        attrs = [(key, getattr(obj, key)) for key in dir(obj)
                 if not (key.startswith("_")
                         or callable(getattr(obj, key))
                         or key == "children")]
        code += "class {0}({1}):\n".format(name, "object")
        code += " def __init__(self):\n"
        for key, value in attrs:
            code += " self.{0} = {1} #{2}\n".format(key, repr(value), prototype.ftype(key))
        code += "\n"
    f = open("/tmp/robotviewer_protos.py",'w')
    f.write(code)
    f.close()
    logger.debug("internally generated foloowing classes:\n{0}".format(code))
    return res[0], res[1], res[2]
def testTermSharing( self ):
    """A terminal production referenced twice must compile to one shared object."""
    decl = """ a := b,b >b<:= d d:= 'this'"""
    table = Parser(decl, "a").buildTagger()
    left, right = table
    assert left is right, """Not sharing the same tuple for b and c instances"""
def testBasic(self):
    """Every string production parses its yestable fully and rejects its notable."""
    proc = dispatchprocessor.DispatchProcessor()
    setattr(proc, "string", strings.StringInterpreter())
    for production, yestable, notable in parseTests:
        parser = Parser("x := %s" % production, "x")
        interpreted = production == "string"
        for sample in yestable:
            if interpreted:
                success, results, next = parser.parse(sample, processor=proc)
            else:
                success, results, next = parser.parse(sample)
            outcome = (success, results, next)
            assert success and (next == len(sample)), """Did not parse string %s as a %s result=%s""" % (
                repr(sample),
                production,
                outcome,
            )
            assert results, """Didn't get any results for string %s as a %s result=%s""" % (
                repr(sample),
                production,
                outcome,
            )
            if interpreted:
                # The interpreter should produce the same value Python itself does.
                expected = eval(sample, {}, {})
                assert results[0] == expected, (
                    """Got different interpreted value for data %s, we got %s, expected %s"""
                    % (repr(sample), repr(results[0]), repr(expected))
                )
        for sample in notable:
            success, results, next = parser.parse(sample)
            assert not success, """Parsed %s of %s as a %s result=%s""" % (
                repr(sample),
                production,
                (success, results, next),
            )
def main(): oparser = get_parser() opts, args = oparser.parse_args() parser = Parser(open(opts.grammar).read(), opts.root) success, tags, next = parser.parse( open(opts.input).read() ) print tags
def debugparser(self, filename):
    """Parse *filename* with this instance's declaration, pretty-print the
    raw parse tree, and exit.

    Debug aid only: never returns (calls exit(0)).
    """
    # Context manager closes the handle even if parsing fails; also avoid
    # shadowing the 'file' builtin and this method's own name.
    with open(filename) as handle:
        source = handle.read()
    parser = Parser (self.declaration)
    import pprint
    info("started debug parsing")
    pprint.pprint(parser.parse(source))
    info("completed debug parsing")
    exit(0)
def testTZ( self ):
    """All known timezone names must parse and interpret to their mapped offsets."""
    # Sorted order also checks that items don't match shorter prefixes of
    # other names.
    names = sorted(timezone_names.timezone_mapping.keys())
    decl = Parser("""this := (timezone_name, ' '?)+""", 'this')
    proc = dispatchprocessor.DispatchProcessor()
    proc.timezone_name = timezone_names.TimeZoneNameInterpreter()
    text = ' '.join(names)
    success, result, next = decl.parse( text, processor = proc )
    assert success, """Unable to complete parsing the timezone names, stopped parsing at char %s %s"""%(next, text[next:])
    expected = list(map( timezone_names.timezone_mapping.get, names))
    assert result == expected, """Got different results for interpretation than expected (expected first, recieved second)\n%s\n%s"""%(expected, result)
def __init__(self, filename):
    """Parse *filename* and copy each top-level config entry onto this object."""
    with open(filename) as config_file:
        content = config_file.read()
    success, tree, nextChar = Parser(declaration).parse(
        content, processor=ConfigProcessor(self))
    if not success:
        raise Exception
    # tree[0] maps option name -> value as produced by ConfigProcessor.
    for key, value in tree[0].iteritems():
        setattr(self, key, value)
def testBasic( self ):
    """Comment productions accept every yestable sample and reject every notable one."""
    for production, yestable, notable in parseTests:
        parser = Parser( "x := %s"%production, 'x')
        for sample in yestable:
            success, results, next = parser.parse( sample)
            report = (success, results, next)
            assert success and (next == len(sample)), """Did not parse comment %s as a %s result=%s"""%( repr(sample), production, report)
            assert results, """Didn't get any results for comment %s as a %s result=%s"""%( repr(sample), production, report)
        for sample in notable:
            success, results, next = parser.parse( sample)
            assert not success, """Parsed %s of %s as a %s result=%s"""%( next, repr(sample), production, results )
def testTermCompression( self ):
    """Test that unreported productions are compressed

    Term compression is basically an inlining of terminal expressions into
    the calling table. At the moment the terminal expressions are all
    duplicated, which may balloon the size of the grammar, not sure if this
    will be an actual problem. As written, this optimization should provide
    a significant speed up, but there may the even more of a speed up if we
    allow for sharing the terminal tuples as well.

    This: a:=b <b>:= -c* c:='this'
    Should eventually compress to this: a := -'this'*
    """
    failures = []
    # Each pair is (verbose grammar, expected-equivalent compressed grammar);
    # both must build structurally identical tag tables.
    for first, second in [
        ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
        ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
        ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
        ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
        # The following will never work, so eventually may raise
        # an error or at least give a warning!
        ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
        ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
        # This is requiring group-compression, which isn't yet written
        ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
        ("""a := (table1 / table2 / any_line)* <any_line> := ANY*, EOL <ANY> := -EOL <EOL> := '\n' table1 := 'a' table2 := 'b' """,
         """a := (table1 / table2 / (-'\n'*, '\n'))* table1 := 'a' table2 := 'b' """),
        ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
    ]:
        pFirst = Parser( first, "a")
        pSecond = Parser( second, "a")
        tFirst = pFirst.buildTagger()
        tSecond = pSecond.buildTagger()
        # rcmp does a recursive structural comparison of the two tag tables.
        if not rcmp( tFirst , tSecond):
            tFirstRepr = pprint.pformat(tFirst)
            tSecondRepr = pprint.pformat(tSecond)
            failures.append( """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""%locals())
    if failures:
        raise ValueError( "\n".join(failures))
def __init__(self):
    """Build two parsers over one arithmetic-expression grammar.

    ``fun_parser`` accepts a full call like ``name(arg, ...)``;
    ``exp_parser`` accepts a bare expression with +,-,*,/ and parentheses.
    """
    declaration = r''' fun := fun_name,'(',exp_list,')' fun_name := [a-zA-Z0-9_-]+ exp_list := exp,(',',exp)* exp := pm_exp pm_exp := md_exp,('+'/'-',md_exp)* md_exp := bracket_exp,('*'/'/',bracket_exp)* bracket_exp := ('(',exp,')')/str_var/number str_var := [a-zA-Z],[a-zA-Z0-9_-\\.]* '''
    self.fun_parser = Parser( declaration, "fun" )
    self.exp_parser = Parser( declaration, "exp" )
def testBasic( self ):
    """Each production parses its yestable samples to the expected value and
    rejects every notable sample."""
    for production, processor, yestable, notable in _data:
        p = Parser( "x := %s"%production, 'x')
        proc = dispatchprocessor.DispatchProcessor()
        setattr(proc, production, processor())
        for data, length, value in yestable:
            success, results, next = p.parse( data, processor = proc)
            assert next == length, """Did not parse string %s of %s as a %s result=%s"""%( repr(data[:length]), repr(data), production, (success, results, next))
            assert results[0] == value, """Didn't get expected value from processing value %s, expected %s, got %s"""%( data[:length], value, results[0])
        for data in notable:
            success, results, next = p.parse( data)
            # BUG FIX: the old failure message sliced with the stale 'length'
            # left over from the yestable loop (NameError when yestable is
            # empty, misleading text otherwise); report the whole sample.
            assert not success, """Parsed %s as a %s result=%s"""%( repr(data), production, (success, results, next))
def __init__(self, grammar = None, root_node = "vrmlScene"):
    """Create a VRML parser; defaults to the bundled vrml.sbnf grammar and
    preloads the standard node prototypes."""
    here = os.path.abspath(os.path.dirname(__file__))
    if not grammar:
        grammar = open(os.path.join(here, "vrml.sbnf")).read()
    Parser.__init__(self, grammar, root_node)
    logging.debug("created parser instance")
    self.root_path = ""
    self.prototypes = {}
    # Parsing the spec file registers the VRML 2.0 standard nodes.
    spec_data = open(os.path.join(here, 'standard_nodes.wrl')).read()
    self.parse(spec_data)
    logging.debug("Parsed vrml2.0 specs")
def main():
    """Load a TNP airspace file (named on the command line) into the freenav DB.

    Options: -h print usage and exit, -n set the (currently unused here)
    navigation flag. Exits non-zero on usage errors or parse failure.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn')
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    # Get any options
    navFlag = False
    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit()
        if o == '-n':
            navFlag = True

    # Get the input filename (exactly one positional argument expected)
    if len(args) != 1:
        usage()
        sys.exit(2)
    else:
        filename = args[0]

    # Initialise data base; old airspace rows are dropped before reload
    db = freenav.freedb.Freedb()
    db.delete_airspace()

    # Initialise parser; projection parameters come from the database
    parser = Parser(tnp.TNP_DECL, 'tnp_file')
    p = db.get_projection()
    proj = freenav.projection.Lambert(p['parallel1'], p['parallel2'],
                                      p['latitude'], p['longitude'])
    output_processor = AirProcessor(db, proj)
    tnp_processor = tnp.TnpProcessor(output_processor)

    # Read data and parse
    airdata = open(filename).read()
    success, parse_result, next_char = parser.parse(airdata,
                                                    processor=tnp_processor)

    # Report any syntax errors (line number derived from the parse offset)
    if not (success and next_char==len(airdata)):
        print "%s: Syntax error at (or near) line %d" % \
            (filename, len(airdata[:next_char].splitlines())+1)
        sys.exit(1)

    # Create indices and tidy up
    db.commit()
    db.vacuum()
def _testSet( self, set, singleName, multiName ):
    """Test multi-line definitions"""
    decl = """single := %s multiple := %s"""%( singleName, multiName )
    parser = Parser(decl)
    # Characters outside the set must be rejected by both productions.
    outside = string.translate( fulltrans, fulltrans, set )
    for char in set:
        success, children, next = parser.parse( char, singleName)
        assert success and (next == 1), """Parser for %s couldn't parse %s"""%( singleName, char )
    for char in outside:
        for production in (singleName, multiName):
            success, children, next = parser.parse( char, production)
            assert (not success) and (next == 0), """Parser for %s parsed %s"""%( production, char )
    success, children, next = parser.parse( set, multiName)
    assert success and (next == len(set)), """Parser for %s couldn't parse full set of chars, failed at %s"""%( multiName, set[next:] )
def testISODate( self ):
    """Test the parsing of ISO date and time formats"""
    # (input string, expected mx.DateTime value). The explicit "+00:00" case
    # subtracts tzOffset -- presumably the interpreter converts UTC to local
    # time; confirm against MxInterpreter's behavior.
    values = [
        ("2002-02-03", DateTime.DateTime( 2002, 2,3)),
        ("2002-02",DateTime.DateTime( 2002, 2)),
        ("2002",DateTime.DateTime( 2002)),
        ("2002-02-03T04:15", DateTime.DateTime( 2002, 2,3, 4,15)),
        ("2002-02-03T04:15:16", DateTime.DateTime( 2002, 2,3, 4,15, 16)),
        ("2002-02-03T04:15:16+00:00", DateTime.DateTime( 2002, 2,3, 4,15, 16)-tzOffset),
    ]
    p = Parser ("d:= ISO_date_time", "d")
    proc = iso_date.MxInterpreter()
    for to_parse, date in values:
        success, children, next = p.parse( to_parse, processor=proc)
        assert success, """Unable to parse any of the string %s with the ISO date-time parser"""% (to_parse)
        assert next == len(to_parse),"""Did not finish parsing string %s with the ISO date-time parser, remainder was %s, found was %s"""%( to_parse, to_parse [next:],children)
        assert children [0] == date,"""Returned different date for string %s than expected, got %s, expected %s"""% (to_parse,children [0], date)
def convert(input, definition = 'compilation_unit'):
    """ Example of converting syntax from ActionScript to C#.
    >>> print(convert('import com.finegamedesign.anagram.Model;', 'import_definition'))
    using /*<com>*/Finegamedesign.Anagram/*<Model>*/;
    Related to grammar unit testing specification (gUnit)
    https://theantlrguy.atlassian.net/wiki/display/ANTLR3/gUnit+-+Grammar+Unit+Testing
    """
    source_lang = cfg['source']
    target_lang = cfg['to']
    parser = Parser(grammars[source_lang], definition)
    input = may_import(None, input, definition, target_lang)
    taglist = parser.parse(input)
    # Re-wrap as one top-level tag spanning the whole input, since the raw
    # result uses a different shape for the outermost level.
    taglist = [(definition, 0, taglist[-1], taglist[1])]
    text = _recurse_tags(taglist, input, source_lang, target_lang)
    text = may_import(taglist, text, definition, target_lang)
    return may_format(definition, text)
def parse(self, txt, *args, **kwargs):
    """Parse *txt*; return (True, value) on a complete parse, else (False, 0.0)."""
    if txt == "2+2":
        # Easter egg.
        return (True, 5)
    try:
        success, children, next = Parser.parse(self, txt, *args, **kwargs)
    except ParserSyntaxError:
        return (False, 0.0)
    if success and next == len(txt):
        return (True, children[0])
    return (False, 0.0)
class GeneratorAPI1:
    """Compatibility shim for SimpleParse 1.0 era applications.

    Only ``parserbyname`` was ever public API; the rest of the old
    generator machinery now lives in simpleparsegrammar.py.
    """

    def __init__( self, production, prebuilt=() ):
        # Imported lazily to avoid import-time cycles.
        from simpleparse.parser import Parser
        self.parser = Parser( production, prebuilts=prebuilt )

    def parserbyname( self, name ):
        """Return the tag-table for production *name*."""
        return self.parser.buildTagger( name )
class Compiler:
    """Compile a textual command into an executable Python code object."""

    def __init__(self):
        self.parser = Parser(grammar)
        self.translator = SyntaxTreeProcessor()

    def compile(self, command):
        """Strip all whitespace from *command*, parse and translate it, and
        return a code object suitable for exec()."""
        # Raw string: bare '\s' is an invalid escape sequence in Python 3
        # (DeprecationWarning, then SyntaxError).
        cmd = re.sub(r'\s', '', command)
        (success, children, nextchar) = self.parser.parse(cmd)
        result = self.translator((success, children, nextchar), cmd)
        python_src = result[1][0]
        return compile(python_src, '', 'exec')
def __init__(self, *args):
    """Build a legal-reference parser for the reference kinds requested in
    *args* (LAGRUM, KORTALAGRUM, FORARBETEN class flags)."""
    scriptDir = os.getcwd()
    self.graph = Graph()
    # Extra law metadata shipped alongside the code as N3 triples.
    n3File = Util.relpath(scriptDir + '/etc/sfs-extra.n3')
    self.graph.load(n3File, format='n3')
    self.roots = []
    self.uriFormatter = {}
    self.decl = ''
    self.namedLaws = {}
    self.loadEbnf(scriptDir + '/etc/base.ebnf')
    self.args = args
    if self.LAGRUM in args:
        prods = self.loadEbnf(scriptDir + '/etc/lagrum.ebnf')
        for p in prods:
            self.uriFormatter[p] = self.sfsFormatUri
        self.namedLaws.update(self.getRelationship(RDFS.label))
        self.roots.append('sfsrefs')
        self.roots.append('sfsref')
    if self.KORTALAGRUM in args:
        # TODO: Fix korta lagrum also
        pass
    if self.FORARBETEN in args:
        prods = self.loadEbnf(scriptDir + '/etc/forarbeten.ebnf')
        for p in prods:
            self.uriFormatter[p] = self.forarbeteFormatUri
        self.roots.append('forarbeteref')
    # Root production: any enabled reference kind, falling back to plain text.
    self.decl += 'root ::= (%s/plain)+\n' % '/'.join(self.roots)
    self.parser = Parser(self.decl, 'root')
    self.tagger = self.parser.buildTagger('root')
    self.depth = 0
    # SFS specific settings: parsing state carried between references.
    self.currentLaw = None
    self.currentChapter = None
    self.currentSection = None
    self.currentPiece = None
    self.lastLaw = None
    self.currentNamedLaws = {}
# Sample filter expressions: a mix of well-formed inputs and deliberately
# broken ones (unbalanced parens, wrong operators) for the self-test below.
testdata.append('cn=="Klaus Müller" AND (dob <= dob2) OR (dob == 2000) AND (gender != "M")')
testdata.append(' ( ( ( test == ( test ) ) ) ) ')
testdata.append('peter == "test"')
testdata.append('cn="Klaus Müller" AND (dob < dob2 OR dob = 2000) AND gender != "M"')
testdata.append('test')
testdata.append('(test)')
testdata.append('((test))')
testdata.append('test==test')
testdata.append('()')
testdata.append('(test')
testdata.append('cn="Klaus Müller" AND (dob < 1975 OR dob = 2000) AND gender != "M"')

from simpleparse.parser import Parser
import pprint

parser = Parser(declaration, "FILTER")

if __name__ =="__main__":
    # A sample "fails" when the parser stops before consuming the whole input.
    for entry in testdata:
        res = parser.parse(entry);
        print '-' * 90
        if(res[2] != len(entry)):
            print "FAILED: ", entry
            pprint.pprint(parser.parse(entry))
            print len(entry), res[2]
        else:
            print "OK: ", entry
def testEOFFail( self ):
    """EOF must not match while characters remain after the literal."""
    parser = Parser( """this := 'a',EOF""", 'this')
    success, children, next = parser.parse( 'a ' )
    assert not success, """EOF matched before end of string"""
def parse(self, definition, parserName, testValue, source):
    """Build a parser from *definition* and run *testValue* through
    production *parserName*, dispatching results to *source*."""
    parser = Parser(definition, )
    return parser.parse(testValue, production=parserName, processor=source)
def format_taglist(input, definition):
    """Parse *input* with the configured source grammar; return the
    pretty-printed tag list."""
    grammar = grammars[cfg['source']]
    taglist = Parser(grammar, definition).parse(input)
    return pformat(taglist)
class EBNFSpill(object):
    """Random-sample generator for a SimpleParse tag table.

    Walks a compiled tag table and emits random strings that the grammar
    would accept ("spilling" the EBNF). Recursion in the grammar is handled
    by spawning child EBNFSpill instances with a shared depth budget.
    Python 2 code (has_key, basestring, print statements).
    """
    # Upper bounds for random repetition and recursion depth.
    DEFAULT_MAX_TIMES_CHAR = 35
    DEFAULT_MAX_TIMES_FUNC = 10
    DEFAULT_MAX_SELF_RECURSION = 25
    DEFAULT_MAX_WALK_RECURSION = 100

    def __init__(self,showTags=False,showTagsRecursive=False,recursionLevel=0):
        # recursionLevel tracks how many nested EBNFSpill objects exist above
        # this one; creating one past the budget aborts immediately.
        self._reset()
        self.showTags=showTags
        self.showTagsRecursive=showTagsRecursive
        self.recursionLevelObj=recursionLevel
        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION:
            raise Exception("a")
        pass

    def __del__(self):
        self.recursionLevelObj-=1
        pass

    def validate(self,data):
        """Parse *data* with the current parser (sanity check for output)."""
        return self.parser.parse(data)

    def setDeclaration(self,declaration,production):
        """Compile *declaration* and keep both the parser and its tag table."""
        self.parser = Parser(declaration, production)
        self.table = self.parser.buildTagger(production=production)

    def setTable(self,table,nodes=None):
        """Adopt an existing tag table (and optionally its node registry)."""
        self.table = table
        self.nodes=nodes or self.nodes

    def _reset(self):
        self.nodes = {}
        self.ctx = []  # context (infos like recursion for table2)
        self.recursionLevelWalk=0
        random.seed()

    def setDefaults(self,**kwargs):
        """Override DEFAULT_* limits by keyword; rejects unknown names."""
        valid_defaults = [i for i in dir(self) if i.startswith("DEFAULT_")]
        for k,v in kwargs.iteritems():
            if k in valid_defaults:
                setattr(self,k,v)
            else:
                raise Exception("Not allowed to change %s to %s (valid options: %s)"%(k,v,valid_defaults))

    def getTable(self):
        return self.table

    def getTagName(self,node):
        """Return '<tag>' when tag display is on and the node is named."""
        if self.showTags and node[0]:
            return "<%s>"%node[0]
        return ""

    # --- structural predicates over tag-table entries -----------------
    def checkTypeIterable(self,l):
        # Iterable but not a string.
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring)

    def checkTypeIterableRecursive(self,l):
        # (list, int, ...) tuple shape used by recursive table references.
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring) and isinstance(l,tuple) and isinstance(l[0],list) and isinstance(l[1],int)

    def checkTypeNodeBase(self,l):
        # checks ( None|str, int, *)
        return self.checkTypeIterable(l) and len(l)>=2 and (l[0]==None or isinstance(l[0],basestring)) and isinstance(l[1],int)

    def checkTypeNodeWithChilds(self,l):
        # Base node whose third slot is itself iterable (has children).
        try:
            pass
        except:
            pass
        return self.checkTypeNodeBase(l) and len(l)>=3 and self.checkTypeIterable(l[2])

    def next(self):
        return

    # --- random emission helpers --------------------------------------
    def rndTimesFunc(self,sample_func,args,minlen=0,maxlen=None):
        """Concatenate sample_func(args) a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_FUNC
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample_func(args)
        return out

    def rndTimes(self,sample,minlen=0,maxlen=None):
        """Repeat *sample* a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample
        return out

    def rndSelect(self,haystack,sample_len=1,minlen=0,maxlen=None):
        """Draw random samples from *haystack* a random number of times."""
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out += "".join(random.sample(haystack,sample_len))
        return out

    def eval(self,node):
        """Emit random text for one walked node, keyed on its match opcode."""
        if not node:
            return ""
        if len(node)<3:
            raise Exception( "<3 - %s"%repr(node) )  # this is an error!
        elif node[1]==Tdef.MATCH_RECURSION_EXCEPTION:
            return "<<"
        elif node[1]==Tdef.MATCH_RECURSION:
            # Resolve a recursive reference with a fresh child instance so
            # depth accounting stays per-branch.
            self.recursionLevelObj+=1
            try:
                x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                x.setTable(self.table)
                recr_node=self.nodes[node[2]]
            except:
                return ""
            return self.getTagName(node)+x.generate(recr_node)
        # single words/selections
        elif len(node)==3:
            if node[1]==Tdef.MATCH_WORD or node[1]==Tdef.MATCH_IS:
                return self.getTagName(node)+node[2]
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2],minlen=1,maxlen=1)
            elif node[1]==Tdef.MATCH_TABLE:
                # (xyz,MATCH_TABLE, <table>, 1) == exact 1
                # (xyz,MATCH_TABLE, <table>, 2,1) == *
                return self.getTagName(node)+""
        # mostly recursive ones
        elif len(node)>3:
            if node[1]==Tdef.MATCH_IS or node[1]==Tdef.MATCH_IS:
                # like (none,"MATCH_IS",'c',1,0) - choose zero or xx times
                return self.getTagName(node)+self.rndTimes(node[2])
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2])
            elif node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                self.recursionLevelObj+=1
                try:
                    x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                except:
                    return ""
                x.setTable(self.table)
                return self.getTagName(node)+self.rndTimesFunc(x.generate,(node[2]))
        return self.getTagName(node)

    def generate(self,node=None):
        """Walk the table (or *node*) and concatenate the emitted samples."""
        out = ""
        for n in self.walk(node):
            out+= self.eval(n)
        return out

    def process(self,l):
        """Return the node with its numeric opcode replaced by its Tdef name."""
        if self.checkTypeNodeBase(l):
            return (l[0],Tdef().toName()[l[1]])+l[2:]
        return l

    def _checkRecursion(self,node):
        # Raise when this exact node object was already visited.
        nID = id(node)
        if self.nodes.has_key(nID):
            raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
        return node

    def _trackNode(self,node,nodeID=None):
        # Register call-type base nodes and plain iterables by object id so
        # recursive references can be resolved later.
        nID = nodeID or id(node)
        if self.checkTypeNodeBase(node):
            if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                self.nodes[nID]=node
        elif self.checkTypeIterable(node):
            self.nodes[nID]=node
        return node

    def _pushLevel(self,node):
        # Track entry into a repeatable call node (flag value 2).
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST \
           and len(node)>3 and node[3]==2:
            self.ctx.append(id(node))
        return node

    def _popLevel(self,node):
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST \
           and len(node)>3 and node[3]==2:
            return self.ctx.pop()
        return node

    def walk(self,table=None):
        """Yield nodes of *table* (default: the stored table); resets state."""
        table=table or self.table
        if not table:
            raise Exception("EBNF TagTable not set, please generate [.setDeclaration()] or set one [.setTable()]")
        retn = self._walk(table)
        self._reset()
        return retn

    def _walk(self,l):
        # Depth-first generator over the tag table; recursion is cut off via
        # StopRecursionException and the depth budgets above.
        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION or self.recursionLevelWalk>self.DEFAULT_MAX_WALK_RECURSION:
            raise StopIteration("HMM")
        self.recursionLevelWalk+=1
        try:
            if self.checkTypeNodeWithChilds(l):
                self._checkRecursion(l)
                yield self._trackNode(l)
                self._pushLevel(l)
                for e in self._walk(l[2]):
                    yield e
                self._popLevel(l)
            elif self.checkTypeNodeBase(l):
                self._checkRecursion(l)
                yield self._trackNode(l)
            elif self.checkTypeIterableRecursive(l):
                nID=id(l[0][0])
                # TODO: does not work - recurses too much
                raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
            elif self.checkTypeIterable(l):
                self._checkRecursion(l)
                self._trackNode(l)
                for e in l:
                    self._pushLevel(e)
                    for x in self._walk(e):
                        yield x
                    self._popLevel(e)
            else:
                self._checkRecursion(l)
                print "Elem? - ",l
                yield self._trackNode(l)
        except StopRecursionException, e:
            yield e.getObj()
        self.recursionLevelWalk-=1
seq_group := ts, element_token, (ts, element_token)+, ts element_token := (optional_element / base_element), repetition? repetition := ('*'/'+') optional_element := '[',fo_group,']' >base_element< := (range/string/group/name) <fo_indicator> := '|' name := [a-zA-Z_],[a-zA-Z0-9_]* <ts> := ( ('\n', ?-name) / [ \011]+ / comment )* comment := '#',-'\n'+,'\n' range := string, ts, '...', ts, string """ from simpleparse.parser import Parser from simpleparse.common import strings parser = Parser( declaration ) if __name__ == "__main__": from simpleparse.stt.TextTools import print_tags grammar = open("""py_grammar.txt""").read() success, result, next = parser.parse( grammar, 'declarationset') print( 'success', success, next ) print_tags( grammar, result )
'''make the config tuple, and adds them to config''' global config, text, lastItem, section_name if tag == 'section_name': section_name = text[start:end] elif tag == 'item': lastItem = text[start:end] elif tag == 'value': config.append((lastItem, text[start:end])) def travel(root, func): if root == None: return tag, start, end, children = root func(tag, start, end) if children != None: for item in children: travel(item, func) if __name__ =="__main__": parser = Parser( declaration, "file" ) success, resultTrees, nextChar = parser.parse(text) output = {} for section in resultTrees: config = [] travel(section, config_maker) output[section_name] = config pprint.pprint(output)
print offset * ' ', '->', isFisrtBlock = True elif tag == 'block': print "%s%s" % ('' if isFisrtBlock else (maxTagLen + 3) * ' ' + '| ', text[start:end]) isFisrtBlock = False def travel(root, func): if root == None: return tag, start, end, children = root func(tag, start, end) if children != None: for item in children: travel(item, func) if __name__ =="__main__": inFile = open("2.txt") text = "" for line in inFile.readlines(): text += line + "\n" parser = Parser( declaration, "file" ) success, resultTrees, nextChar = parser.parse(text) #pprint.pprint(resultTrees) for item in resultTrees: travel(item, counter) print maxTagLen for item in resultTrees: travel(item, printer)
plusset := '+',(set/atom), (set/atom) atom := -[+*] >interesting< := (example8/example7/example6/example5/example4/example3/example2/example1) example1 := '*+',(set/atom),(set/atom),'+',(set/atom),(set/atom) example2 := '**',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example3 := 'fsd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example4 := 'm*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example5 := 'a*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example6 := 's*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example7 := 'bdf*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) example8 := 'sd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom) ''' import sys from simpleparse.parser import Parser parser = Parser(declaration, 'set') class Emitter: def process(self, data): #import pprint tree = self.parse(data) #pprint.pprint( tree ) # wrap up the tuple 'cause TextTools uses a different format for the top-level :( tree = ('set', 0, tree[-1], tree[1]) return self.emit(tree) def parse(self, data): self.data = data return parser.parse(data)
value := simple_value / (simple_value, (tb,'#', tb, simple_value)+) >simple_value< := string / number / name alpha_name := [a-zA-Z]+ name := []-[a-z_A-Z!$&+./:;<>?^`|'] , []-[a-z_A-Z0-9!$&+./:;<>?^`|']* number := [0-9]+ / ([[0-9]+, tb, [-]+, tb, [0-9]+) string := ('\"' , quotes_string?, '\"') / ('{' , braces_string?, '}') <braces_string> := (-[{}@]+ / string)+ <quotes_string> := (-[\"{}]+ / ('{', braces_string,'}'))+ <junk> := -[ \t\r\n]+ <tb> := (comment / ws)* <ws> := [ \t\n\r] <comment> := '%' , -[\n]*, '\n' """ ## instantiate SimpleParse parsers parser = Parser(dec, 'bibfile') entry_parser = Parser(dec, 'entry') ## offer a default parse function def Parse(src, processor=None): '''Parse the bibtex string *src*, process with *processor*.''' return parser.parse(src, processor=processor) ## self-test if __name__ == "__main__": import sys, pprint if len(sys.argv) > 1: src = open(sys.argv[1]).read() taglist = Parse(src)
from simpleparse.xml import xml_parser from simpleparse.parser import Parser import unittest, string p = Parser(xml_parser.declaration) class XMLProductionTests(unittest.TestCase): """Tests that XML grammar productions match appropriate values""" ### ProductionTests will be added here by loop below... class ProductionTest: def __init__(self, production, should, shouldnot): self.production = production self.should = should self.shouldnot = shouldnot def __call__(self): """Perform the test""" for item in self.should: success, children, next = p.parse(item, self.production) assert success, """Didn't parse %s as a %s, should have""" % ( repr(item), self.production) assert next == len( item ), """Didn't parse whole of %s as a %s, parsed %s of %s characters, results were:\n%s\nRest was:\n%s""" % ( repr(item), self.production, next, len(item), children, item[next:]) for item in shouldnot: success, children, next = p.parse(item, self.production)
]), ]), ]), ], 21)) def testDeclarationSet2(self): '''Just tries to parse and sees that everything was parsed, doesn't predict the result''' parser = SPGenerator.buildParser("declarationset") result = TextTools.tag(declaration, parser) assert result[-1] == len( declaration ), '''Didn't complete parse of the simpleparse declaration, only got %s chars, should have %s''' % ( result[-1], len(declaration)) recursiveParser = Parser(declaration) class SimpleParseRecursiveTests(SimpleParseGrammarTests): """Test parsing of grammar elements with generated version of simpleparse grammar""" def doBasicTest( self, parserName, testValue, expected, ): result = recursiveParser.parse(testValue, production=parserName) assert result == expected, '''\nexpected:%s\n got:%s\n''' % ( expected, result)
("string_triple_single", """ nondelimiter := -"'''" <delimiter> := "'''" char_no_quote := -[\\\\']+ string_special_escapes := [\\\\abfnrtv'] """), ("string_triple_double", ''' nondelimiter := -'"""' <delimiter> := '"""' char_no_quote := -[\\\\"]+ string_special_escapes := [\\\\abfnrtv"] '''), ] for name, partial in _stringTypeData: _p = Parser(stringDeclaration + partial) c[name] = objectgenerator.LibraryElement( generator=_p._generator, production="str", ) common.share(c) _p = Parser(""" string := string_triple_double/string_triple_single/string_double_quote/string_single_quote """) c["string"] = objectgenerator.LibraryElement( generator=_p._generator, production="string", ) class StringInterpreter(DispatchProcessor):
rulespec1 := match*, ts, target* match := -[\n]* target := '-j'/'--jump', ts, chainname, ts, targetopt* targetopt := -[\n]* arg := literal / -[\\"\\' \t\n]+ negarg := '!', ts, arg literal := ("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"') CHARNOSNGLQUOTE := -[\\']+ CHARNODBLQUOTE := -[\\"]+ ESCAPEDCHAR := '\\',( SPECIALESCAPEDCHAR / OCTALESCAPEDCHAR ) SPECIALESCAPEDCHAR := [\\abfnrtv] OCTALESCAPEDCHAR := [0-7],[0-7]?,[0-7]? """ parser = Parser(iptCommandLine, "tablesection") fw = Firewall() class IptProcessor(dispatchprocessor.DispatchProcessor): def tableline(self, tup, prsbuf): print "###", tup[3][0] self.table = dispatchprocessor.dispatch(self, tup[3][0], prsbuf) def tablename(self, tup, prsbuf): return repr(dispatchprocessor.getString(tup, prsbuf)) def chainline(self, tup, prsbuf): print repr(dispatchprocessor.getString(tup, prsbuf)) def chainname(self, tup, prsbuf):
} # Examples: # https://github.com/blakeembrey/change-case first_letter_case_tags = { 'as': { 'function_identifier': 'lower', 'namespace_identifier': 'lower', }, 'cs': { 'function_identifier': 'upper', 'namespace_identifier': 'upper', } } ebnf_parser = Parser(declaration, 'declarationset') source_keys = [key for source in SOURCES for key in source.keys()] source_keys.sort() ## print(pformat(source_keys)) def reset(): if 'DECLARED_TYPE' in literals and 'DECLARED_TYPE_ORIGINAL' in literals: literals['DECLARED_TYPE'] = literals['DECLARED_TYPE_ORIGINAL'] for direction in ['as', 'cs']: if 'DECLARED_TYPE' in literals[direction] \ and 'DECLARED_TYPE_ORIGINAL' in literals[direction]: literals[direction]['DECLARED_TYPE'] = literals[direction][ 'DECLARED_TYPE_ORIGINAL'] data_types.clear()
def __init__(self,*args):
    """Build the combined EBNF declaration, parser and tagger for the
    reference types requested in *args (constants such as LAGRUM,
    KORTLAGRUM, EGLAGSTIFTNING, FORARBETEN, RATTSFALL, EGRATTSFALL).

    Loads etc/base.ebnf plus one grammar file per requested reference
    type, wires up a URI-formatting callback per production, and
    initialises the SFS-specific parse state.
    """
    # Resolve the directory this module lives in so the etc/ grammar and
    # N3 files can be found relative to it.
    if not os.path.sep in __file__:
        scriptdir = os.getcwd()
    else:
        scriptdir = os.path.dirname(__file__)
    #n3file = os.path.sep.join([scriptdir,"etc","sfs-extra.n3"])
    #n3url = "file://" + n3file.replace("\\","/")
    #print "scriptdir: %s" % scriptdir
    #print "n3file: %s" % n3file
    #print "n3url: %s" % n3url

    # RDF graph holding extra metadata about statutes (labels, alternates).
    self.graph = Graph()
    n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
    # print "loading n3file %s" % n3file
    self.graph.load(n3file, format="n3")
    self.roots = []          # top-level grammar productions, in priority order
    self.uriformatter = {}   # production name -> URI formatting callback
    self.decl = ""           # accumulated EBNF declaration text
    self.namedlaws = {}      # law name -> SFS id
    self.load_ebnf(scriptdir+"/etc/base.ebnf")

    self.args = args
    if self.LAGRUM in args:
        productions = self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
        for p in productions:
            self.uriformatter[p] = self.sfs_format_uri
        self.namedlaws.update(self.get_relations(RDFS.label))
        self.roots.append("sfsrefs")
        self.roots.append("sfsref")

    if self.KORTLAGRUM in args:
        # If we haven't loaded lagrum.ebnf already we must do it now,
        # since kortlagrum.ebnf depends on productions defined there.
        if not self.LAGRUM in args:
            self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")

        productions = self.load_ebnf(scriptdir+"/etc/kortlagrum.ebnf")
        for p in productions:
            self.uriformatter[p] = self.sfs_format_uri
        DCT = Namespace("http://purl.org/dc/terms/")
        d = self.get_relations(DCT['alternate'])
        self.namedlaws.update(d)
        lawlist = [x.encode(SP_CHARSET) for x in d.keys()]
        # Make sure longer law abbreviations come before shorter
        # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L")
        lawlist.sort(cmp=lambda x,y:len(y)-len(x))
        self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
        # Abbreviated references take priority over everything else.
        self.roots.insert(0,"kortlagrumref")

    if self.EGLAGSTIFTNING in args:
        productions = self.load_ebnf(scriptdir+"/etc/eglag.ebnf")
        for p in productions:
            self.uriformatter[p] = self.eglag_format_uri
        self.roots.append("eglagref")
    if self.FORARBETEN in args:
        productions = self.load_ebnf(scriptdir+"/etc/forarbeten.ebnf")
        for p in productions:
            self.uriformatter[p] = self.forarbete_format_uri
        self.roots.append("forarbeteref")
    if self.RATTSFALL in args:
        productions = self.load_ebnf(scriptdir+"/etc/rattsfall.ebnf")
        for p in productions:
            self.uriformatter[p] = self.rattsfall_format_uri
        self.roots.append("rattsfallref")
    if self.EGRATTSFALL in args:
        productions = self.load_ebnf(scriptdir+"/etc/egratt.ebnf")
        for p in productions:
            self.uriformatter[p] = self.egrattsfall_format_uri
        self.roots.append("ecjcaseref")

    # The root production tries each requested reference type, falling
    # back to 'plain' text.
    self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)
    # pprint(productions)
    # print self.decl.decode(SP_CHARSET,'ignore')
    self.parser = Parser(self.decl, "root")
    # Build the tagger table once; parse() feeds it to TextTools.tag directly.
    self.tagger = self.parser.buildTagger("root")
    # print "tagger length: %d" % len(repr(self.tagger))
    self.verbose = False
    self.depth = 0

    # SFS-specific parse state, reset between references.
    self.currentlaw = None
    self.currentchapter = None
    self.currentsection = None
    self.currentpiece = None
    self.lastlaw = None
    self.currentlynamedlaws = {}
"""Demonstrates what happens when your declaration is syntactically incorrect When run as a script, will generate a traceback telling you that the grammar defined here is incorrectly formatted. """ from simpleparse.common import numbers, strings, comments declaration = r'''# note use of raw string when embedding in python code... file := [ \t\n]*, section+ section := '[',identifier,']' ts,'\n', body body := statement* statement := (ts,semicolon_comment)/equality/nullline nullline := ts,'\n' comment := -'\n'* equality := ts, identifier,ts,'=',ts,identified,ts,'\n' identifier := [a-zA-Z], [a-zA-Z0-9_]* identified := string/number/identifier ts := [ \t]* ''' testdata = '''[test1] val=23 ''' if __name__ == "__main__": from simpleparse.parser import Parser parser = Parser(declaration, "file") # will raise ValueError
class LegalRef:
    """Finds references to Swedish legal sources (statutes, preparatory
    works, case law, EU legislation) in plain text.

    parse() returns a list of plain strings interleaved with Link /
    LinkSubject objects whose URIs are produced by the per-production
    formatter callbacks registered in __init__.
    """
    # Maybe these should be 1,2,4,8 etc, so that the caller could ask for
    # LAGRUM | FORESKRIFTER, and so that we could define collections of
    # common combinations (e.g. ALL_LAGSTIFTNING = LAGRUM |
    # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING)
    LAGRUM = 1             # references to statute sections in SFS
    KORTLAGRUM = 2         # SFS references in abbreviated form
    FORESKRIFTER = 3       # references to agencies' statute collections
    EGLAGSTIFTNING = 4     # EC treaties, regulations and directives
    INTLLAGSTIFTNING = 5   # treaties, conventions etc
    FORARBETEN = 6         # government bills, committee reports, etc
    RATTSFALL = 7          # case law from Swedish courts
    MYNDIGHETSBESLUT = 8   # agency decisions (JO, ARN, DI...)
    EGRATTSFALL = 9        # case law from the EC court / court of first instance
    INTLRATTSFALL = 10     # the European Court of Human Rights

    # Decomposes a canonical SFS URI into (baseuri, law, chapter, section,
    # piece, item) capture groups.
    # re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\. |N|)?\d+( s\.\d+|))#?(K(\d+)|)(P(\d+)|)(S(\d+)|)(N(\d+)|)')
    re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    # Pre-/post-processing helpers: escape/descape compound law names and
    # known law-name suffixes so the EBNF grammar can recognise them
    # (see the long comment in parse()).
    re_escape_compound = re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_escape_named = re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)
    re_descape_compound = re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_descape_named = re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    re_xmlcharref = re.compile("&#\d+;")

    def __init__(self,*args):
        """Build the combined EBNF declaration, parser and tagger for the
        reference types requested in *args (the class-level constants)."""
        # Resolve the directory this module lives in so the etc/ grammar
        # and N3 files can be found relative to it.
        if not os.path.sep in __file__:
            scriptdir = os.getcwd()
        else:
            scriptdir = os.path.dirname(__file__)
        #n3file = os.path.sep.join([scriptdir,"etc","sfs-extra.n3"])
        #n3url = "file://" + n3file.replace("\\","/")
        #print "scriptdir: %s" % scriptdir
        #print "n3file: %s" % n3file
        #print "n3url: %s" % n3url

        self.graph = Graph()
        n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
        # print "loading n3file %s" % n3file
        self.graph.load(n3file, format="n3")
        self.roots = []          # top-level grammar productions, in priority order
        self.uriformatter = {}   # production name -> URI formatting callback
        self.decl = ""           # accumulated EBNF declaration text
        self.namedlaws = {}      # law name -> SFS id
        self.load_ebnf(scriptdir+"/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            productions = self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # If we haven't loaded lagrum.ebnf already we must do it now,
            # since kortlagrum.ebnf depends on productions defined there.
            if not self.LAGRUM in args:
                self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")

            productions = self.load_ebnf(scriptdir+"/etc/kortlagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            DCT = Namespace("http://purl.org/dc/terms/")
            d = self.get_relations(DCT['alternate'])
            self.namedlaws.update(d)
            lawlist = [x.encode(SP_CHARSET) for x in d.keys()]
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L")
            lawlist.sort(cmp=lambda x,y:len(y)-len(x))
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            # Abbreviated references take priority over everything else.
            self.roots.insert(0,"kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            productions = self.load_ebnf(scriptdir+"/etc/eglag.ebnf")
            for p in productions:
                self.uriformatter[p] = self.eglag_format_uri
            self.roots.append("eglagref")
        if self.FORARBETEN in args:
            productions = self.load_ebnf(scriptdir+"/etc/forarbeten.ebnf")
            for p in productions:
                self.uriformatter[p] = self.forarbete_format_uri
            self.roots.append("forarbeteref")
        if self.RATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/rattsfall.ebnf")
            for p in productions:
                self.uriformatter[p] = self.rattsfall_format_uri
            self.roots.append("rattsfallref")
        if self.EGRATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/egratt.ebnf")
            for p in productions:
                self.uriformatter[p] = self.egrattsfall_format_uri
            self.roots.append("ecjcaseref")

        # The root production tries each requested reference type and
        # falls back to 'plain' text.
        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)
        # pprint(productions)
        # print self.decl.decode(SP_CHARSET,'ignore')
        self.parser = Parser(self.decl, "root")
        # Build the tagger table once; parse() feeds it to tag() directly.
        self.tagger = self.parser.buildTagger("root")
        # print "tagger length: %d" % len(repr(self.tagger))
        self.verbose = False
        self.depth = 0

        # SFS-specific parse state, reset between references.
        self.currentlaw = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece = None
        self.lastlaw = None
        self.currentlynamedlaws = {}

    def load_ebnf(self,file):
        """Loads the productions from the given file into the EBNF
        declaration in use, and returns all *Ref and *RefID productions."""
        # print "%s: Loading %s" % (id(self), file)
        f = open(file)
        content = f.read()
        self.decl += content
        f.close()
        return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)]

    def get_relations(self, predicate):
        """Return a {object-literal: subject-uri} dict for all triples in
        self.graph with the given predicate."""
        d = {}
        for obj, subj in self.graph.subject_objects(predicate):
            d[unicode(subj)] = unicode(obj)
        return d

    def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9",predicate=None):
        """Parse *indata* and return a list of plain strings interleaved
        with Link/LinkSubject objects for each recognised reference.

        baseuri supplies context for relative references; predicate, if
        given, makes links LinkSubject instances carrying it.
        Raises ParseError if the grammar fails to consume all input.
        """
        if indata == "": return indata # this actually triggered a bug...
        # h = hashlib.sha1()
        # h.update(indata)
        # print "Called with %r (%s) (%s)" % (indata, h.hexdigest(), self.verbose)
        self.predicate = predicate
        self.baseuri = baseuri
        if baseuri:
            # Decompose the base URI so relative references can inherit
            # its law/chapter/section/piece/item parts.
            m = self.re_urisegments.match(baseuri)
            if m:
                self.baseuri_attributes = {'baseuri':m.group(1),
                                           'law':m.group(2),
                                           'chapter':m.group(6),
                                           'section':m.group(8),
                                           'piece':m.group(10),
                                           'item':m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri':baseuri}
        else:
            self.baseuri_attributes = {}

        # It is hard to make the EBNF grammar recognise arbitrary words
        # ending in a given suffix (e.g. 'bokföringslagen' with the
        # suffix 'lagen'). Therefore we preprocess the input string and
        # insert a '|' character before certain suffixes. We also
        # transform 'Radio- och TV-lagen' into 'Radio-_och_TV-lagen'.
        #
        # FIXME: Obviously, this shouldn't be done in a general class,
        # but rather in a subclass or via proxy/adapter

        # if we don't do the unicode conversion and pass
        # BeautifulSoup.NavigableString, the later .encode call fails
        # (since it's not a real unicode string)
        fixedindata = unicode(indata)
        # print "Before: %r" % type(fixedindata)
        if self.LAGRUM in self.args:
            fixedindata = self.re_escape_compound.sub(r'\1_\2_\3\4', fixedindata)
            fixedindata = self.re_escape_named.sub(r'|\1', fixedindata)
        # print "After: %r" % type(fixedindata)

        # SimpleParse has no support for unicode strings, so we convert
        # the input to a byte string. Unfortunately I can't get it all
        # to work with UTF-8, so we use XML character references instead.
        if isinstance(fixedindata,unicode):
            fixedindata = fixedindata.encode(SP_CHARSET,'xmlcharrefreplace')

        # Parse the text with TextTools.tag - not the simplest way to do
        # it, but if you follow the SimpleParse documentation the tagger
        # table gets rebuilt on every call to parse()
        if self.verbose:
            print u"calling tag with '%s'" % (fixedindata.decode(SP_CHARSET))
        # print "tagger length: %d" % len(repr(self.tagger))
        taglist = tag(fixedindata, self.tagger,0,len(fixedindata))

        result = []

        root = NodeTree(taglist,fixedindata)
        for part in root.nodes:
            if part.tag != 'plain' and self.verbose:
                sys.stdout.write(self.prettyprint(part))
            if part.tag in self.roots:
                self.clear_state()
                # self.verbose = False
                result.extend(self.formatter_dispatch(part))
            else:
                assert part.tag == 'plain',"Tag is %s" % part.tag
                result.append(part.text)

            # clear state
            if self.currentlaw != None: self.lastlaw = self.currentlaw
            self.currentlaw = None

        # The whole input must have been consumed, otherwise the grammar
        # failed somewhere.
        if taglist[-1] != len(fixedindata):
            log.error(u'Problem (%d:%d) with %r / %r' % (taglist[-1]-8,taglist[-1]+8,fixedindata,indata))
            raise ParseError, "parsed %s chars of %s (...%s...)" % (taglist[-1], len(indata), indata[(taglist[-1]-2):taglist[-1]+3])

        # Normalize the result, i.e. concatenate adjacent text nodes and
        # remove any '|' characters that we inserted earlier.
        normres = []
        for i in range(len(result)):
            if not self.re_descape_named.search(result[i]):
                node = result[i]
            else:
                if self.LAGRUM in self.args:
                    text = self.re_descape_named.sub(r'\1',result[i])
                    text = self.re_descape_compound.sub(r'\1 \2 \3\4', text)
                if isinstance(result[i], Link):
                    # Since Link objects are immutable we must create a
                    # new one and copy its attributes
                    if hasattr(result[i],'predicate'):
                        node = LinkSubject(text, predicate=result[i].predicate,
                                           uri=result[i].uri)
                    else:
                        node = Link(text,uri=result[i].uri)
                else:
                    node = text
            if (len(normres) > 0
                and not isinstance(normres[-1],Link)
                and not isinstance(node,Link)):
                normres[-1] += node
            else:
                normres.append(node)

        # and finally... undo the xmlcharrefreplace encoding for the
        # plain-text nodes (Links are dealt with later).
        for i in range(len(normres)):
            if isinstance(normres[i], Link):
                # deal with these later
                pass
            else:
                normres[i] = self.re_xmlcharref.sub(self.unescape_xmlcharref, normres[i])
        return normres

    def unescape_xmlcharref(self, m):
        """re.sub callback: turn an '&#NNN;' reference back into its character."""
        # print "Changing %r to a %r" % (m.group(0)[2:-1], unichr(int(m.group(0)[2:-1])))
        return unichr(int(m.group(0)[2:-1]))

    # NOTE: mutable default argument, but `extra` is only read
    # (d.update(extra)), so it is harmless in practice.
    def find_attributes(self,parts,extra={}):
        """recurses through a parse tree and creates a dictionary of
        attributes"""
        d = {}

        self.depth += 1
        if self.verbose: print ". "*self.depth+"find_attributes: starting with %s"%d
        if extra:
            d.update(extra)

        for part in parts:
            current_part_tag = part.tag.lower()
            if current_part_tag.endswith('refid'):
                if ((current_part_tag == 'singlesectionrefid') or
                    (current_part_tag == 'lastsectionrefid')):
                    current_part_tag = 'sectionrefid'
                # strip the 'refid' suffix to get the attribute name
                d[current_part_tag[:-5]] = part.text.strip()
                if self.verbose: print ". "*self.depth+"find_attributes: d is now %s" % d

            if part.nodes:
                d.update(self.find_attributes(part.nodes,d))
        if self.verbose: print ". "*self.depth+"find_attributes: returning %s" % d
        self.depth -= 1

        # Fall back to the current parse state for anything the tree
        # itself did not supply.
        if self.currentlaw     and 'law' not in d    : d['law']     = self.currentlaw
        if self.currentchapter and 'chapter' not in d: d['chapter'] = self.currentchapter
        if self.currentsection and 'section' not in d: d['section'] = self.currentsection
        if self.currentpiece   and 'piece' not in d  : d['piece']   = self.currentpiece

        return d

    def find_node(self,root,nodetag):
        """Returns the first node in the tree that has a tag matching
        nodetag. The search is depth-first"""
        if root.tag == nodetag: # base case
            return root
        else:
            for node in root.nodes:
                x = self.find_node(node,nodetag)
                if x != None: return x
            return None

    def find_nodes(self,root,nodetag):
        """Return all nodes (depth-first) whose tag matches nodetag."""
        if root.tag == nodetag:
            return [root]
        else:
            res = []
            for node in root.nodes:
                res.extend(self.find_nodes(node,nodetag))
            return res

    def flatten_tokentree(self,part,suffix):
        """returns a 'flattened' tokentree ie for the following tree and
        the suffix 'RefID'

           foo->bar->BlahongaRefID
                  ->baz->quux->Blahonga2RefID
                             ->Blahonga3RefID
                  ->Blahonga4RefID

        this should return [BlahongaRefID, Blahonga2RefID, Blahonga3RefID,
        Blahonga4RefID]"""
        l = []
        if part.tag.endswith(suffix): l.append(part)
        if not part.nodes: return l

        for subpart in part.nodes:
            l.extend(self.flatten_tokentree(subpart,suffix))
        return l

    def formatter_dispatch(self,part):
        """Dispatch a parse-tree node to format_<tag> if such a method is
        defined on this class, otherwise to format_tokentree."""
        # print "Verbositiy: %r" % self.verbose
        self.depth += 1
        # Is there a tailor-made formatter?
        if "format_"+part.tag in dir(self):
            formatter = getattr(self,"format_"+part.tag)
            if self.verbose: print (". "*self.depth)+ "formatter_dispatch: format_%s defined, calling it" % part.tag
            res = formatter(part)
            assert res != None, "Custom formatter for %s didn't return anything" % part.tag
        else:
            if self.verbose: print (". "*self.depth)+ "formatter_dispatch: no format_%s, using format_tokentree" % part.tag
            res = self.format_tokentree(part)

        if res == None:
            print (". "*self.depth)+ "something wrong with this:\n" + self.prettyprint(part)
        self.depth -= 1
        return res

    def format_tokentree(self,part):
        """Default formatter: convert every token ending in RefID/Ref into
        a Link object, recurse into everything else."""
        # This is the default formatter. It converts every token that
        # ends with a RefID into a Link object. For grammar
        # productions like SectionPieceRefs, which contain
        # subproductions that also end in RefID, this is not a good
        # function to use - use a custom formatter instead.

        res = []

        if self.verbose: print (". "*self.depth)+ "format_tokentree: called for %s" % part.tag
        # this is like the bottom case, or something
        if (not part.nodes) and (not part.tag.endswith("RefID")):
            res.append(part.text)
        else:
            if part.tag.endswith("RefID"):
                res.append(self.format_generic_link(part))
            elif part.tag.endswith("Ref"):
                res.append(self.format_generic_link(part))
            else:
                for subpart in part.nodes:
                    if self.verbose and part.tag == 'LawRef':
                        print (". "*self.depth) + "format_tokentree: part '%s' is a %s" % (subpart.text, subpart.tag)
                    res.extend(self.formatter_dispatch(subpart))
        if self.verbose: print (". "*self.depth)+ "format_tokentree: returning '%s' for %s" % (res,part.tag)
        return res

    def prettyprint(self,root,indent=0):
        """Render the parse tree as an indented multi-line string (debug aid)."""
        res = u"%s'%s': '%s'\n" % ("  "*indent,root.tag,re.sub(r'\s+', ' ',root.text))
        if root.nodes != None:
            for subpart in root.nodes:
                res += self.prettyprint(subpart,indent+1)
            return res
        else: return u""

    def format_generic_link(self,part,uriformatter=None):
        """Turn a parse-tree node into a Link (or LinkSubject if a predicate
        was given to parse()); fall back to the plain text on failure."""
        try:
            uri = self.uriformatter[part.tag](self.find_attributes([part]))
        except KeyError:
            if uriformatter:
                uri = uriformatter(self.find_attributes([part]))
            else:
                uri = self.sfs_format_uri(self.find_attributes([part]))
        except AttributeError:
            # Normal error from eglag_format_uri
            return part.text
        # NOTE(review): bare except below swallows *all* other errors
        # (including KeyboardInterrupt under Python 2) — consider
        # `except Exception`.
        except:
            exc = sys.exc_info()
            # If something else went wrong, just return the plaintext
            log.warning("(unknown): Unable to format link for text %s (production %s)" % (part.text, part.tag))
            return part.text

        if self.verbose: print (". "*self.depth)+ "format_generic_link: uri is %s" % uri

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context
            return part.text
        elif self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        else:
            return Link(part.text, uri=uri)

    # FIXME: unify this with format_generic_link
    def format_custom_link(self, attributes, text, production):
        """Like format_generic_link, but with explicit attributes and link text."""
        try:
            uri = self.uriformatter[production](attributes)
        except KeyError:
            uri = self.sfs_format_uri(attributes)

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context
            # NOTE(review): 'part' is undefined in this method (looks
            # copy-pasted from format_generic_link) — this branch would
            # raise NameError. Probably should be `return text`.
            return part.text
        elif self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        else:
            return Link(text, uri=uri)

    ################################################################
    # CODE FOR LAGRUM (statute section references)

    def clear_state(self):
        """Reset the per-reference parse state (not lastlaw)."""
        self.currentlaw     = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece   = None

    def normalize_sfsid(self,sfsid):
        """Canonicalise an SFS id string."""
        # sometimes '1736:0123 2' is given as '1736:0123 s. 2' or
        # '1736:0123.2'. This fixes that.
        sfsid = re.sub(r'(\d+:\d+)\.(\d)',r'\1 \2',sfsid)
        #return sfsid.replace('s. ','').replace('s.','') # more advanced
        # normalizations to come...
        return sfsid

    def normalize_lawname(self,lawname):
        """Undo the '|'/'_' escaping and strip a trailing genitive 's'."""
        lawname=lawname.replace('|','').replace('_',' ').lower()
        if lawname.endswith('s'):
            lawname = lawname[:-1]
        return lawname

    def namedlaw_to_sfsid(self,text,normalize=True):
        """Look up the SFS id for a law mentioned by name; return None for
        common words that merely *look* like law names."""
        if normalize:
            text = self.normalize_lawname(text)

        # Ordinary Swedish words that end like law names but are not laws.
        nolaw = [
            u'aktieslagen', u'anordningen', u'anordningen', u'anslagen',
            u'arbetsordningen', u'associationsformen', u'avfallsslagen',
            u'avslagen', u'avvittringsutslagen', u'bergslagen',
            u'beskattningsunderlagen', u'bolagen', u'bolagsordningen',
            u'bolagsordningen', u'dagordningen', u'djurslagen',
            u'dotterbolagen', u'emballagen', u'energislagen',
            u'ersättningsformen', u'ersättningsslagen', u'examensordningen',
            u'finansbolagen', u'finansieringsformen', u'fissionsvederlagen',
            u'flygbolagen', u'fondbolagen', u'förbundsordningen',
            u'föreslagen', u'företrädesordningen', u'förhandlingsordningen',
            u'förlagen', u'förmånsrättsordningen', u'förmögenhetsordningen',
            u'förordningen', u'förslagen', u'försäkringsaktiebolagen',
            u'försäkringsbolagen', u'gravanordningen', u'grundlagen',
            u'handelsplattformen', u'handläggningsordningen',
            u'inkomstslagen', u'inköpssamordningen', u'kapitalunderlagen',
            u'klockslagen', u'kopplingsanordningen', u'låneformen',
            u'mervärdesskatteordningen', u'nummerordningen', u'omslagen',
            u'ordalagen', u'pensionsordningen', u'renhållningsordningen',
            u'representationsreformen', u'rättegångordningen',
            u'rättegångsordningen', u'rättsordningen', u'samordningen',
            u'samordningen', u'skatteordningen', u'skatteslagen',
            u'skatteunderlagen', u'skolformen', u'skyddsanordningen',
            u'slagen', u'solvärmeanordningen', u'storslagen',
            u'studieformen', u'stödformen', u'stödordningen',
            u'stödordningen', u'säkerhetsanordningen', u'talarordningen',
            u'tillslagen', u'tivolianordningen', u'trafikslagen',
            u'transportanordningen', u'transportslagen', u'trädslagen',
            u'turordningen', u'underlagen', u'uniformen',
            u'uppställningsformen', u'utvecklingsbolagen', u'varuslagen',
            u'verksamhetsformen', u'vevanordningen', u'vårdformen',
            u'ägoanordningen', u'ägoslagen', u'ärendeslagen',
            u'åtgärdsförslagen',
        ]
        if text in nolaw:
            return None

        if self.currentlynamedlaws.has_key(text):
            return self.currentlynamedlaws[text]
        elif self.namedlaws.has_key(text):
            return self.namedlaws[text]
        else:
            if self.verbose:
                # print "(unknown): I don't know the ID of named law [%s]" % text
                log.warning("(unknown): I don't know the ID of named law [%s]" % text)
            return None

    def sfs_format_uri(self,attributes):
        """Build a canonical SFS URI (…/sfs/YYYY:NNN#KxPySz…) from the
        attribute dict, resolving missing parts against baseuri_attributes."""
        # Ordinal words -> digits for piece ("stycke") references.
        piecemappings = {u'första' :'1',
                         u'andra'  :'2',
                         u'tredje' :'3',
                         u'fjärde' :'4',
                         u'femte'  :'5',
                         u'sjätte' :'6',
                         u'sjunde' :'7',
                         u'åttonde':'8',
                         u'nionde' :'9'}
        # Attribute name -> single-letter URI fragment prefix.
        keymapping = {'lawref'  :'L',
                      'chapter' :'K',
                      'section' :'P',
                      'piece'   :'S',
                      'item'    :'N',
                      'itemnumeric':'N',
                      'element' :'O',
                      'sentence':'M', # is this ever used?
                      }
        attributeorder = ['law', 'lawref', 'chapter', 'section', 'element', 'piece', 'item', 'itemnumeric','sentence']

        if 'law' in attributes:
            if attributes['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'
        else:
            if 'baseuri' in self.baseuri_attributes:
                res = self.baseuri_attributes['baseuri']
            else:
                res = ''

        resolvetobase = True   # inherit from base URI until the first explicit attribute
        addfragment = False    # insert '#' before the first post-law segment
        justincase = None      # deferred 'S1' when a piece is implied but absent
        for key in attributeorder:
            if attributes.has_key(key):
                resolvetobase = False
                val = attributes[key]
            elif (resolvetobase and self.baseuri_attributes.has_key(key)):
                val = self.baseuri_attributes[key]
            else:
                val = None

            if val:
                if addfragment:
                    res += '#'
                    addfragment = False
                if (key in ['piece', 'itemnumeric', 'sentence'] and val in piecemappings):
                    res += '%s%s' % (keymapping[key],piecemappings[val.lower()])
                else:
                    if key == 'law':
                        val = self.normalize_sfsid(val)
                        val = val.replace(" ", "_")
                        res += val
                        addfragment = True
                    else:
                        if justincase:
                            res += justincase
                            justincase = None
                        val = val.replace(" ", "")
                        val = val.replace("\n", "")
                        val = val.replace("\r", "")
                        res += '%s%s' % (keymapping[key],val)
            else:
                if key == 'piece':
                    justincase = "S1"
        return res

    def format_ChapterSectionRefs(self,root):
        """Link the leading chapter reference, then dispatch the rest."""
        assert(root.tag == 'ChapterSectionRefs')
        assert(len(root.nodes) == 3) # ChapterRef, wc, SectionRefs

        part = root.nodes[0]
        self.currentchapter = part.nodes[0].text.strip()

        if self.currentlaw:
            res = [self.format_custom_link({'law':self.currentlaw,
                                            'chapter':self.currentchapter},
                                           part.text,
                                           part.tag)]
        else:
            res = [self.format_custom_link({'chapter':self.currentchapter},
                                           part.text,
                                           part.tag)]

        res.extend(self.formatter_dispatch(root.nodes[1]))
        res.extend(self.formatter_dispatch(root.nodes[2]))
        self.currentchapter = None
        return res

    def format_ChapterSectionPieceRefs(self,root):
        """Set the chapter context, then dispatch all child nodes."""
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        res = []
        for node in root.nodes:
            res.extend(self.formatter_dispatch(node))
        return res

    def format_LastSectionRef(self, root):
        # the last section ref is a bit different, since we want the
        # ending double section mark to be part of the link text
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3) # LastSectionRefID, wc, DoubleSectionMark
        sectionrefid = root.nodes[0]
        # NOTE(review): sectionid is computed but never used.
        sectionid = sectionrefid.text

        return [self.format_generic_link(root)]

    def format_SectionPieceRefs(self, root):
        """Link '<section> <piece>' as one unit, then dispatch the rest."""
        assert(root.tag == 'SectionPieceRefs')
        self.currentsection = root.nodes[0].nodes[0].text.strip()

        res = [self.format_custom_link(self.find_attributes([root.nodes[2]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].text),
                                       root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatter_dispatch(node))

        self.currentsection = None
        return res

    def format_SectionPieceItemRefs(self,root):
        """Like format_SectionPieceRefs, but also tracks the piece context."""
        assert(root.tag == 'SectionPieceItemRefs')
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        self.currentpiece = root.nodes[2].nodes[0].text.strip()

        res = [self.format_custom_link(self.find_attributes([root.nodes[2]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].text),
                                       root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatter_dispatch(node))

        self.currentsection = None
        self.currentpiece = None
        return res

    # This is a special case for things like '17-29 och 32 §§ i lagen
    # (2004:575)', which picks out the LawRefID first and stores it in
    # .currentlaw, so that find_attributes finds it
    # automagically. Although now it seems to be branching out and be
    # all things to all people.
    def format_ExternalRefs(self,root):
        assert(root.tag == 'ExternalRefs')
        # print "DEBUG: start of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw

        lawrefid_node = self.find_node(root,'LawRefID')
        if lawrefid_node == None:
            # Ok, no explicit LawRefID found, lets see if this is a named law that we have the ID for
            # namedlaw_node = self.find_node(root, 'NamedLawExternalLawRef')
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node == None:
                # As a last chance, this might be a reference back to a previously mentioned law ("...enligt 4 § samma lag")
                samelaw_node = self.find_node(root, 'SameLaw')
                assert(samelaw_node != None)
                if self.lastlaw == None:
                    log.warning(u"(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set")

                self.currentlaw = self.lastlaw
            else:
                # the NamedLaw case
                self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text)
                if self.currentlaw == None:
                    # unknown law name - in this case it's better to
                    # bail out rather than resolving chapter/paragraph
                    # references relative to baseuri (which is almost
                    # certainly wrong)
                    return [root.text]
        else:
            self.currentlaw = lawrefid_node.text
            if self.find_node(root,'NamedLaw'):
                namedlaw = self.normalize_lawname(self.find_node(root,'NamedLaw').text)
                # print "remember that %s is %s!" % (namedlaw, self.currentlaw)
                self.currentlynamedlaws[namedlaw] = self.currentlaw

        #print "DEBUG: middle of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw
        if self.lastlaw is None:
            #print "DEBUG: format_ExternalRefs: setting self.lastlaw to %s" % self.currentlaw
            self.lastlaw = self.currentlaw

        # if the node tree only contains a single reference, it looks
        # better if the entire expression, not just the
        # chapter/section part, is linked. But not if it's a
        # "anonymous" law ('1 § i lagen (1234:234) om blahonga')
        if (len(self.find_nodes(root,'GenericRefs')) == 1 and
            len(self.find_nodes(root,'SectionRefID')) == 1 and
            len(self.find_nodes(root,'AnonymousExternalLaw')) == 0):
            res = [self.format_generic_link(root)]
        else:
            res = self.format_tokentree(root)

        return res

    def format_SectionItemRefs(self,root):
        """Set the section context and format the subtree."""
        assert(root.nodes[0].nodes[0].tag == 'SectionRefID')
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        #res = self.formatter_dispatch(root.nodes[0]) # was formatter_dispatch(self.root)
        res = self.format_tokentree(root)
        self.currentsection = None
        return res

    def format_PieceItemRefs(self,root):
        """Link '<piece> <first item>' as one unit, then dispatch the rest."""
        self.currentpiece = root.nodes[0].nodes[0].text.strip()
        res = [self.format_custom_link(self.find_attributes([root.nodes[2].nodes[0]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].nodes[0].text),
                                       root.tag)]
        for node in root.nodes[2].nodes[1:]:
            res.extend(self.formatter_dispatch(node))

        self.currentpiece = None
        return res

    def format_ChapterSectionRef(self,root):
        """Link a single chapter+section reference as one unit."""
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        return [self.format_generic_link(root)]

    def format_AlternateChapterSectionRefs(self,root):
        """Set the chapter context and format the subtree."""
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        # print "Self.currentchapter is now %s" % self.currentchapter
        res = self.format_tokentree(root)
        self.currentchapter = None
        return res

    def format_ExternalLaw(self,root):
        """A reference to another law clears any chapter context."""
        self.currentchapter = None
        return self.formatter_dispatch(root.nodes[0])

    def format_ChangeRef(self,root):
        """Link a reference to an amending statute (lawref)."""
        # NOTE(review): uses .data while sibling methods use .text on
        # find_node() results — confirm NodeTree nodes expose .data.
        id = self.find_node(root,'LawRefID').data
        return [self.format_custom_link({'lawref':id},
                                        root.text,
                                        root.tag)]

    def format_SFSNr(self,root):
        """An explicit SFS number establishes the base URI context if none was given."""
        if self.baseuri == None:
            # NOTE(review): .data — see format_ChangeRef.
            sfsid = self.find_node(root,'LawRefID').data
            self.baseuri_attributes = {'baseuri':'http://rinfo.lagrummet.se/publ/sfs/'+sfsid+'#'}
        return self.format_tokentree(root)

    def format_NamedExternalLawRef(self,root):
        """Link a law referenced by name, updating law/base-URI context."""
        resetcurrentlaw = False
        #print "format_NamedExternalLawRef: self.currentlaw is %r" % self.currentlaw
        if self.currentlaw == None:
            resetcurrentlaw = True
            lawrefid_node = self.find_node(root,'LawRefID')
            if lawrefid_node == None:
                self.currentlaw = self.namedlaw_to_sfsid(root.text)
            else:
                self.currentlaw = lawrefid_node.text
                namedlaw = self.normalize_lawname(self.find_node(root,'NamedLaw').text)
                # print "remember that %s is %s!" % (namedlaw, self.currentlaw)
                self.currentlynamedlaws[namedlaw] = self.currentlaw
            #print "format_NamedExternalLawRef: self.currentlaw is now %r" % self.currentlaw

        #print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri
        if self.currentlaw == None:
            # if we can't find a ID for this law, better not <link> it
            res = [root.text]
        else:
            res = [self.format_generic_link(root)]

        #print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri
        if self.baseuri == None and self.currentlaw != None:
            #print "format_NamedExternalLawRef: setting baseuri_attributes"
            # use this as the new baseuri_attributes
            m = self.re_urisegments.match(self.currentlaw)
            if m:
                self.baseuri_attributes = {'baseuri':m.group(1),
                                           'law':m.group(2),
                                           'chapter':m.group(6),
                                           'section':m.group(8),
                                           'piece':m.group(10),
                                           'item':m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri':'http://rinfo.lagrummet.se/publ/sfs/'+self.currentlaw+'#'}

        if resetcurrentlaw:
            if self.currentlaw != None: self.lastlaw = self.currentlaw
            self.currentlaw = None

        return res

    ################################################################
    # CODE FOR KORTLAGRUM (abbreviated statute references)

    def format_AbbrevLawNormalRef(self,root):
        """Link e.g. '3 § MBL' by resolving the abbreviation to an SFS id."""
        lawabbr_node = self.find_node(root,'LawAbbreviation')
        self.currentlaw = self.namedlaw_to_sfsid(lawabbr_node.text,normalize=False)
        res = [self.format_generic_link(root)]
        if self.currentlaw != None: self.lastlaw = self.currentlaw
        self.currentlaw = None
        return res

    def format_AbbrevLawShortRef(self,root):
        """Link e.g. '1:2 BrB' (chapter:section + abbreviation)."""
        assert(root.nodes[0].tag == 'LawAbbreviation')
        assert(root.nodes[2].tag == 'ShortChapterSectionRef')
        self.currentlaw = self.namedlaw_to_sfsid(root.nodes[0].text,normalize=False)
        shortsection_node = root.nodes[2]
        assert(shortsection_node.nodes[0].tag == 'ShortChapterRefID')
        assert(shortsection_node.nodes[2].tag == 'ShortSectionRefID')
        self.currentchapter = shortsection_node.nodes[0].text
        self.currentsection = shortsection_node.nodes[2].text

        res = [self.format_generic_link(root)]

        self.currentchapter = None
        self.currentsection = None
        self.currentlaw = None
        return res

    ################################################################
    # CODE FOR FORARBETEN (preparatory works)

    def forarbete_format_uri(self,attributes):
        """Build a URI for a preparatory-work reference (prop/bet/rskr/celex)."""
        # res = self.baseuri_attributes['baseuri']
        res = 'http://rinfo.lagrummet.se/'
        resolvetobase = True
        addfragment = False

        for key,val in attributes.items():
            if key == 'prop':
                res += "publ/prop/%s" % val
            elif key == 'bet':
                res += "ext/bet/%s" % val
            elif key == 'skrivelse':
                res += "ext/rskr/%s" % val
            elif key == 'celex':
                if len(val) == 8: # incorrectly formatted, uses YY instead of YYYY
                    val = val[0]+'19'+val[1:]
                res += "ext/celex/%s" % val

        if 'sidnr' in attributes:
            res += "#s%s" % attributes['sidnr']

        return res

    # NOTE(review): duplicate definition — this overrides the identical
    # format_ChapterSectionRef defined earlier in the class.
    def format_ChapterSectionRef(self,root):
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        return [self.format_generic_link(root)]

    ################################################################
    # CODE FOR EGLAGSTIFTNING (EU legislation)

    def eglag_format_uri(self,attributes):
        """Build a CELEX URI for an EU directive/regulation reference.
        Raises AttributeError when the act type cannot be determined."""
        res = 'http://rinfo.lagrummet.se/ext/celex/'
        if not 'akttyp' in attributes:
            if 'forordning' in attributes:
                attributes['akttyp'] = u'förordning';
            elif 'direktiv' in attributes:
                attributes['akttyp'] = u'direktiv';
        if 'akttyp' not in attributes:
            raise AttributeError("Akttyp saknas")
        # On how CELEX numbers are constructed:
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm
        # On how links to EURLEX should look:
        # http://eur-lex.europa.eu/sv/tools/help_syntax.htm
        # Absolute URI?
        if 'ar' in attributes and 'lopnummer' in attributes:
            sektor = '3'
            rattslig_form = {u'direktiv':'L',
                             u'förordning':'R'}
            if len(attributes['ar']) == 2:
                attributes['ar'] = '19'+attributes['ar']
            res += "%s%s%s%04d" % (sektor,attributes['ar'],
                                   rattslig_form[attributes['akttyp']],
                                   int(attributes['lopnummer']))
        else:
            if not self.baseuri_attributes['baseuri'].startswith(res):
                # FIXME: should we warn about this?
                # print "Relative reference, but base context %s is not a celex context" % self.baseuri_attributes['baseuri']
                return None

        if 'artikel' in attributes:
            res += "#%s" % attributes['artikel']
            if 'underartikel' in attributes:
                res += ".%s" % attributes['underartikel']

        return res

    ################################################################
    # CODE FOR RATTSFALL (case law)

    def rattsfall_format_uri(self,attributes):
        """Build a URI for a Swedish court-case reference (NJA, RH, AD, ...)."""
        # List derived from containers.n3/rattsfallsforteckningar.n3 in
        # the rinfo project's source code - a more ambitious solution
        # would be to read the actual N3 files into an rdflib graph.
        containerid = {u'NJA': '/publ/rattsfall/nja/',
                       u'RH': '/publ/rattsfall/rh/',
                       u'MÖD': '/publ/rattsfall/mod/',
                       u'RÅ': '/publ/rattsfall/ra/',
                       u'HFD': '/publ/rattsfall/hfd/',
                       u'RK': '/publ/rattsfall/rk/',
                       u'MIG': '/publ/rattsfall/mig/',
                       u'AD': '/publ/rattsfall/ad/',
                       u'MD': '/publ/rattsfall/md/',
                       u'FÖD': '/publ/rattsfall/fod/'}

        # res = self.baseuri_attributes['baseuri']
        if 'nja' in attributes:
            attributes['domstol'] = attributes['nja']

        assert 'domstol' in attributes, "No court provided"
        assert attributes['domstol'] in containerid, "%s is an unknown court" % attributes['domstol']
        res = "http://rinfo.lagrummet.se"+containerid[attributes['domstol']]

        if 'lopnr' in attributes and ":" in attributes['lopnr']:
            # NOTE(review): 'lopnr' is undefined here — this would raise
            # NameError; almost certainly intended
            # attributes['lopnr'].split(":", 1).
            (attributes['ar'], attributes['lopnr']) = lopnr.split(":", 1)

        if attributes['domstol'] == u'NJA':
            # FIXME: URIs should be based on publikationsordinal, not
            # pagenumber (which this in effect is) - but this requires
            # a big lookup table/database/graph with
            # pagenumber-to-ordinal-mappings
            res += '%ss%s' % (attributes['ar'], attributes['sidnr'])
        else:
            res += '%s:%s' % (attributes['ar'], attributes['lopnr'])

        return res

    ################################################################
    # CODE FOR EGRÄTTSFALL (EC case law)

    def egrattsfall_format_uri(self,attributes):
        """Build a CELEX URI for an EC-court case reference."""
        descriptormap = {'C':'J', # Judgment of the Court
                         'T':'A', # Judgment of the Court of First Instance
                         'F':'W', # Judgement of the Civil Service Tribunal
                         }
        # FIXME: Change this before the year 2054 (as ECJ will
        # hopefully have fixed their case numbering by then)
        if len(attributes['year']) == 2:
            if int(attributes['year']) < 54:
                year = "20"+attributes['year']
            else:
                year = "19"+attributes['year']
        else:
            year = attributes['year']

        serial = '%04d' % int(attributes['serial'])
        descriptor = descriptormap[attributes['decision']]
        uri = "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial)
        return uri
"""Smoke tests for the dice-roll grammar declared in rollparse."""
from simpleparse.common import numbers
from simpleparse.parser import Parser
import rollparse

parser = Parser(rollparse.declaration)

# Expressions the "roll" production must accept in their entirety.
tests_success = [
    "d6",
    "5d6",
    "5d6 + d8",
    "(5d6 + d8)",
    "6 + (5d6 + d8)",
    "[5d6 + d8] + 6",
    "{3d20} + 10",
]

prod = "roll"

for expr in tests_success:
    ok, tree, consumed = parser.parse(expr, production=prod)
    whole_input_parsed = ok and consumed == len(expr)
    assert whole_input_parsed, """Wasn't able to parse %s as a %s (%s chars parsed of %s), returned value was %s""" % (
        repr(expr), prod, consumed, len(expr), (ok, tree, consumed))

# Expressions the grammar is expected to reject; they are parsed only to
# make sure the parser handles them without blowing up — results are
# deliberately ignored.
tests_fail = [
    "{5d6}+{8d8}",
    "5d",
    "3+",
    "8d8",  # this one should actually work
]

for expr in tests_fail:
    ok, tree, consumed = parser.parse(expr, production=prod)
def CorpusPTBReader(ptb_data_path):
    """Read Penn Treebank .mrg files under *ptb_data_path*, normalise the
    sentences (lowercase, numbers replaced by NUM<len> tokens, PTB trace
    and annotation artifacts stripped) and write them, one per line, to
    total_ptb.txt in the working directory. Prints the sentence count.

    Relies on module-level `grammar`, `Parser` (simpleparse),
    `BracketParseCorpusReader` (nltk) and `re`.
    """
    ptb_sent_file = open("total_ptb.txt", "w")
    file_pattern = r".*/.*\.mrg"
    ptb = BracketParseCorpusReader(ptb_data_path, file_pattern)
    #print (ptb.fileids())
    #print ((ptb.sents()))
    #ptb.sents(fileids= 'brown/cf/cf01.mrg')[0]
    count = 0
    for sent in ptb.sents():
        '''sent = ""
        for word in sent:
            if "\\" in word or "e_s" in word or "n_s" in word:
                continue
            else:
                sent += word + " "
        out = sent[:-1]'''
        # skip very short sentences (< 7 tokens)
        if len(sent) < 7:
            continue
        out = ' '.join(sent)
        out = out.lower()
        # print(len(sent), out)
        # Locate number spans via the simpleparse grammar and replace
        # each with a NUM token carrying the span length.
        parser = Parser(grammar, 'all')
        temp_result = parser.parse(out)
        sub_sent = []
        start_index = 0
        for num_info in temp_result[1]:
            # num_info[1]/num_info[2] are the match's start/end offsets
            sub_sent.append(out[start_index:num_info[1]])
            sub_sent.append("NUM" + (str(num_info[2] - num_info[1])))
            start_index = num_info[2]
        sub_sent.append(out[start_index:])
        final_out = ''.join(sub_sent)
        # Strip PTB artifacts. NOTE(review): this substitution chain is
        # order-dependent — patterns with a trailing space run before the
        # bare variants, and several use `.` wildcards that also consume
        # the following character. Do not reorder.
        final_out = re.sub(r'\*\-NUM\d ', '', final_out)
        # end-of-sentence / annotation markers
        final_out = re.sub(r'e_s ', '', final_out)
        final_out = re.sub(r'n_s ', '', final_out)
        final_out = re.sub(r'e_s', '', final_out)
        final_out = re.sub(r'n_s', '', final_out)
        # escaped punctuation
        final_out = re.sub(r'\\. ', '', final_out)
        final_out = re.sub(r'\\.', '', final_out)
        # trace/star markers
        final_out = re.sub(r'\*. ', '', final_out)
        final_out = re.sub(r'\*.', '', final_out)
        # dashes
        final_out = re.sub(r'-. ', '', final_out)
        final_out = re.sub(r'-.', '', final_out)
        #final_out = re.sub(r'\**.\* ', '', final_out)
        #final_out = re.sub(r'\**.\*', '', final_out)
        final_out = re.sub(r'\*{,3}.\*.. ', '', final_out)
        final_out = re.sub(r'\*{,3}.\*. ', '', final_out)
        final_out = re.sub(r'\*.. ', '', final_out)
        final_out = re.sub(r'\*..', '', final_out)
        final_out = re.sub(r'\* ', '', final_out)
        #final_out = re.sub(r'\*', '', final_out)
        final_out = re.sub(r'- ', '', final_out)
        final_out = re.sub(r'-', '', final_out)
        # collapse doubled semicolons left by the removals above
        final_out = re.sub(r'; ; ', '; ', final_out)
        # drop trailing character (presumably a leftover space — TODO confirm)
        final_out = final_out[:-1]
        ptb_sent_file.write(final_out)
        ptb_sent_file.write("\n")
        #print(final_out)
        count += 1
        #if count == 10000: break
        #if count > 10: break
    ptb_sent_file.close()
    print(count)
# NOTE(review): this chunk is a near-duplicate of the rollparse smoke-test
# script elsewhere in this file, and is cut off mid-script — the final
# `for` loop has no body in this view.
from simpleparse.common import numbers
from simpleparse.parser import Parser
import rollparse

parser = Parser(rollparse.declaration)

# Expressions the "roll" production must accept in their entirety.
tests_success = [
    "d6",
    "5d6",
    "5d6 + d8",
    "(5d6 + d8)",
    "6 + (5d6 + d8)",
    "[5d6 + d8] + 6",
    "{3d20} + 10"
]

prod = "roll"

for test in tests_success:
    success, children, nextcharacter = parser.parse(test, production=prod)
    assert success and nextcharacter==len(test), """Wasn't able to parse %s as a %s (%s chars parsed of %s), returned value was %s"""%(
        repr(test), prod, nextcharacter, len(test),
        (success, children, nextcharacter))

# Expressions expected to be rejected by the grammar.
tests_fail = [
    "{5d6}+{8d8}",
    "5d",
    "3+",
    "8d8" #this one should actually work
]

for test in tests_fail:
a job, a column, and another job, from left to right. In the
second row there are two jobs, from left to right. The column
in the first row has two jobs side by side, then another one
above them.
"""
# NOTE(review): fragment — the enclosing `def` is outside this view.
# This is Python 2 code: `file()`, `except Exception, detail` and the
# string form of `raise` are all Python-2-only syntax.
try:
    fid = file(fname, 'rt')
except Exception, detail:
    raise RuntimeError, "Unable to open layout file: %s\n %s" % (
        fname, str(detail))
data = fid.read()
fid.close()

parser = Parser(declaration, "file")

# Replace all CR's in data with nothing, to convert DOS line endings
# to unix format (all LF's).
data = string.replace(data, '\x0D', '')
tree = parser.parse(data)

# Last element of tree is number of characters parsed
if not tree[0]:
    raise RuntimeError, "Layout file cannot be parsed"
if tree[2] != len(data):
    raise RuntimeError, "Parse error at character %d in layout file" % tree[2]
def parseInput(input):
    """Parse *input* against the module-level `declaration` grammar using
    the "fastg" production and return the resulting child tag-list.

    Raises AssertionError when the text does not match the grammar.
    """
    result = Parser(declaration).parse(input, production="fastg")
    # result is (success_flag, children, chars_consumed)
    assert result[0]
    return result[1]
semesterkuerzel := ("A-M", [0-9]) / ("IK-M", [0-9]) / ("BWI", [0-9]) / "MINF1" / "BMT5" / "BTI1" / -"-"+
>kuerzel< := fachKuerzel, nummer?
prakKuerzel := [A-Z], [A-Z], "P"
verbKuerzel := [A-Z], [A-Z], "J"
labKuerzel := ([A-Z], [A-Z], "L") / (fachKuerzel, " L")
gwKuerzel := [A-Z_-]+, (" ", [a-zA-Z]+)?
oe1 := "I"
oe2 := "II"
fachKuerzel := [a-zA-Z]+
nummer := int
no := int
gruppe := int
alphanumGruppe := [A-Z]
'''

# Parser over the course-code grammar declared above (the opening of the
# `declaration` string is outside this view).
# NOTE(review): simpleparse's Parser takes its start production as the
# second positional argument (conventionally `production=`); confirm that
# a `root=` keyword is actually accepted here.
VeranstaltungParser = Parser(declaration, root="root")


def tryGetFullName(veranstaltung):
    """Parse *veranstaltung* (a course/event identifier string) and return
    the resolved full name produced by the dispatch processor, or "" when
    the text does not match the grammar.
    """
    # Imported locally — presumably to avoid a circular import; confirm.
    from veranstaltungenDispatchProcessor import VeranstaltungDispatchProcessor
    fullName = ""
    success, result, nextcharacter = VeranstaltungParser.parse(
        veranstaltung, processor=VeranstaltungDispatchProcessor())
    if success:
        fullName = result[0]
    return fullName


# NOTE(review): truncated — the body of test() is cut off in this view.
def test():
    from simpleparse import dispatchprocessor
def setDeclaration(self, declaration, production):
    """Install a new grammar on this object.

    Builds a simpleparse Parser for *declaration* rooted at *production*,
    storing it on self.parser, and caches the corresponding tag table on
    self.table.
    """
    new_parser = Parser(declaration, production)
    self.parser = new_parser
    self.table = new_parser.buildTagger(production=production)
# Library of reusable comment productions, keyed by production name.
c = {}

eolcomments = r"""
### comment formats where the comment goes
### from a marker to the end of the line
comment := -'\012'*
<EOL> := ('\r'?,'\n')/EOF

>hash_comment< := '#', comment, EOL
>semicolon_comment< := ';', comment, EOL
>slashslash_comment< := '//', comment, EOL
"""

_p = Parser(eolcomments)

# Expose each end-of-line comment production as a reusable library element.
for name in ["hash_comment", "semicolon_comment", "slashslash_comment"]:
    c[name] = objectgenerator.LibraryElement(
        generator=_p._generator,
        production=name,
    )

ccomments = r"""
### comments in format /* comment */ with no recursion allowed
comment := -"*/"*
>slashbang_comment< := '/*', comment, '*/'
"""

_p = Parser(ccomments)

# NOTE(review): "c_comment" has no production in the ccomments grammar
# above, so both names presumably map to the "slashbang_comment"
# production — the call below is truncated in this view before the
# `production=` argument; confirm against the full source.
for name in ["c_comment", "slashbang_comment"]:
    c[name] = objectgenerator.LibraryElement(
        generator=_p._generator,
This is a panel with two rows. In the first row there is
a job, a column, and another job, from left to right. In the
second row there are two jobs, from left to right. The column
in the first row has two jobs side by side, then another one
above them.
"""
# NOTE(review): fragment — the enclosing `def` is outside this view.
# Python 2 code: `file()`, `except Exception, detail` and string-form
# `raise` are Python-2-only syntax.
try:
    fid = file(fname, 'rt')
except Exception, detail:
    raise RuntimeError, "Unable to open layout file: %s\n %s" % (fname,
                                                                 str(detail))
data = fid.read()
fid.close()

parser = Parser(declaration, "file")

# Replace all CR's in data with nothing, to convert DOS line endings
# to unix format (all LF's).
data = string.replace(data, '\x0D', '')
tree = parser.parse(data)

# Last element of tree is number of characters parsed
if not tree[0]:
    raise RuntimeError, "Layout file cannot be parsed"
if tree[2] != len(data):
    raise RuntimeError, "Parse error at character %d in layout file" % tree[2]

# Accumulator for the parsed row structures (continues past this view).
Rows = []
def __init__(self, filedef=transit_file_def, verbosity=1):
    """Initialize this parser with the transit-file grammar.

    filedef   -- simpleparse grammar declaration; defaults to the
                 module-level transit_file_def.
    verbosity -- verbosity level, stored and forwarded to the
                 TransitFileProcessor used for parse callbacks.
    """
    # Old-style explicit base-class call (pre-`super()` idiom); order
    # matters: the base Parser is set up before the processor is built.
    Parser.__init__(self, filedef)
    self.verbosity = verbosity
    self.tfp = TransitFileProcessor(self.verbosity)