def testTermCompression(self):
    """Verify that unreported productions are folded into their callers.

    Term compression inlines terminal expressions into the table that
    calls them.  For now every inlined terminal expression is a separate
    copy, which may inflate the grammar (unclear whether that is a real
    problem).  The inlining alone should give a significant speed-up;
    sharing the terminal tuples might give even more.

    Example -- this grammar:
        a:=b <b>:= -c* c:='this'
    should eventually compress to:
        a := -'this'*

    Every pair below is compiled from production "a" and the two tagging
    tables are compared with ``rcmp``.  All mismatches are gathered and
    reported together.

    Raises:
        ValueError: if at least one pair produced differing tables.
    """
    grammar_pairs = (
        ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
        ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
        ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
        ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
        # The following will never work, so eventually may raise
        # an error or at least give a warning!
        ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
        ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
        # This is requiring group-compression, which isn't yet written
        ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
        ("""a := (table1 / table2 / any_line)* <any_line> := ANY*, EOL <ANY> := -EOL <EOL> := '\n' table1 := 'a' table2 := 'b' """, """a := (table1 / table2 / (-'\n'*, '\n'))* table1 := 'a' table2 := 'b' """),
        ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
    )
    report = """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""

    mismatches = []
    for compressed_src, flat_src in grammar_pairs:
        # Build both parsers before asking either for its table.
        parsers = [Parser(text, "a") for text in (compressed_src, flat_src)]
        compressed_table, flat_table = [p.buildTagger() for p in parsers]
        if rcmp(compressed_table, flat_table):
            continue
        mismatches.append(report % {
            "first": compressed_src,
            "second": flat_src,
            "tFirstRepr": pprint.pformat(compressed_table),
            "tSecondRepr": pprint.pformat(flat_table),
        })

    if mismatches:
        raise ValueError("\n".join(mismatches))
def testTermCompression(self):
    """Check that unreported productions end up inlined in the caller's table.

    Term compression is essentially the inlining of terminal expressions
    into the calling table.  The terminal expressions are currently all
    duplicated, which may balloon the grammar size (not known to be an
    actual problem).  The optimisation should already be a significant
    speed-up, and sharing the terminal tuples could add more.

    For instance:
        a:=b <b>:= -c* c:='this'
    should eventually compress to:
        a := -'this'*

    Both grammars of each case are compiled with root production "a";
    their tagging tables must compare equal under ``rcmp``.

    Raises:
        ValueError: listing every case whose two tables differ.
    """
    message = """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""
    cases = [
        ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
        ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
        ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
        ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
        # The following will never work, so eventually may raise
        # an error or at least give a warning!
        ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
        ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
        # This is requiring group-compression, which isn't yet written
        ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
        ("""a := (table1 / table2 / any_line)* <any_line> := ANY*, EOL <ANY> := -EOL <EOL> := '\n' table1 := 'a' table2 := 'b' """, """a := (table1 / table2 / (-'\n'*, '\n'))* table1 := 'a' table2 := 'b' """),
        ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
    ]

    problems = []
    for original, expected in cases:
        original_parser = Parser(original, "a")
        expected_parser = Parser(expected, "a")
        original_table = original_parser.buildTagger()
        expected_table = expected_parser.buildTagger()
        if not rcmp(original_table, expected_table):
            details = {
                "first": original,
                "second": expected,
                "tFirstRepr": pprint.pformat(original_table),
                "tSecondRepr": pprint.pformat(expected_table),
            }
            problems.append(message % details)

    if problems:
        raise ValueError("\n".join(problems))
def testTermSharing(self):
    """Verify that repeated uses of a shared terminal production reuse one parser.

    The grammar references ``b`` twice from ``a``; the tagging table built
    for ``a`` must therefore unpack into exactly two entries that are the
    very same object.
    """
    grammar = """ a := b,b >b<:= d d:= 'this'"""
    table = Parser(grammar, "a").buildTagger()
    # Unpacking also enforces that the table has exactly two entries.
    left, right = table
    assert left is right, """Not sharing the same tuple for b and c instances"""
def testTermSharing(self):
    """Check that both references to a shared terminal production are one object.

    ``a`` is defined as ``b,b``; after building the tagger for ``a`` the
    two resulting table entries are compared by identity, not equality.
    """
    source = """ a := b,b >b<:= d d:= 'this'"""
    parser = Parser(source, "a")
    entries = parser.buildTagger()
    # Exactly two entries are expected; anything else fails the unpacking.
    first_entry, second_entry = entries
    assert first_entry is second_entry, """Not sharing the same tuple for b and c instances"""
class GeneratorAPI1:
    """Compatibility shim for applications written against SimpleParse 1.0.

    The old generator API effectively exposed a single useful method,
    ``parserbyname``; everything else was internal (and is now part of
    simpleparsegrammar.py).
    """

    def __init__(self, production, prebuilt=()):
        """Create the wrapped parser for the grammar text *production*.

        *prebuilt* is handed to ``Parser`` as its ``prebuilts`` argument.
        """
        # The parser module is imported when an instance is created,
        # not when this module is loaded.
        from simpleparse.parser import Parser as _Parser
        self.parser = _Parser(production, prebuilts=prebuilt)

    def parserbyname(self, name):
        """Return the tag-table built for the production called *name*."""
        tag_table = self.parser.buildTagger(name)
        return tag_table
class GeneratorAPI1:
    """Stand-in that keeps SimpleParse 1.0 client code working.

    Only ``parserbyname`` was ever of interest to callers; the remaining
    1.0 machinery was internal and now lives in simpleparsegrammar.py.
    """

    def __init__(self, production, prebuilt=()):
        """Wrap a ``Parser`` for *production*, passing *prebuilt* as ``prebuilts``."""
        # Import happens per construction rather than at module import time.
        import simpleparse.parser as _parser_module
        self.parser = _parser_module.Parser(production, prebuilts=prebuilt)

    def parserbyname(self, name):
        """Look up and return the tag-table for production *name*."""
        build = self.parser.buildTagger
        return build(name)
class EBNFSpill(object):
    """Generate random text by walking a tagging table built from an EBNF grammar.

    A table is either built from a grammar declaration (setDeclaration) or
    supplied directly (setTable).  generate() walks the table, evaluates
    every visited node with eval() and concatenates the resulting strings.

    This is Python 2 code (print statement, ``except X, e``, ``iteritems``,
    ``has_key``, ``basestring``).  ``Parser``, ``Tdef``,
    ``StopRecursionException``, ``random`` and ``collections`` are expected
    to be provided by the enclosing module.
    """
    # Default upper bound for repetitions in rndTimes() and rndSelect().
    DEFAULT_MAX_TIMES_CHAR = 35
    # Default upper bound for repetitions in rndTimesFunc().
    DEFAULT_MAX_TIMES_FUNC = 10
    # Limit for recursionLevelObj (checked in __init__ and _walk).
    DEFAULT_MAX_SELF_RECURSION = 25
    # Limit for recursionLevelWalk (checked in _walk).
    DEFAULT_MAX_WALK_RECURSION = 100

    def __init__(self,showTags=False,showTagsRecursive=False,recursionLevel=0):
        """Initialise state and store the display flags.

        showTags          -- when true, getTagName() emits "<name>" markers.
        showTagsRecursive -- value passed as showTags to the child objects
                             created inside eval().
        recursionLevel    -- starting value of recursionLevelObj.

        Raises Exception("a") when recursionLevel is above
        DEFAULT_MAX_SELF_RECURSION; eval() relies on that to stop nesting.
        """
        self._reset()
        self.showTags=showTags
        self.showTagsRecursive=showTagsRecursive
        self.recursionLevelObj=recursionLevel
        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION:
            raise Exception("a")
        #print "INIT",recursionLevel
        pass

    def __del__(self):
        """Decrement this object's own recursionLevelObj on destruction."""
        # NOTE(review): this only changes the dying instance's counter, not
        # the counter of the object that created it - confirm the intent.
        self.recursionLevelObj-=1
        pass

    def validate(self,data):
        """Parse data with self.parser and return the parser's result.

        self.parser is only assigned in setDeclaration().
        """
        return self.parser.parse(data)

    def setDeclaration(self,declaration,production):
        """Build a Parser for declaration and store its table for production."""
        self.parser = Parser(declaration, production)
        self.table = self.parser.buildTagger(production=production)

    def setTable(self,table,nodes=None):
        """Use an existing table; keep the current node registry unless a truthy nodes is given."""
        self.table = table
        self.nodes=nodes or self.nodes

    def _reset(self):
        """Clear the node registry, context stack and walk depth; reseed random."""
        self.nodes = {}
        self.ctx = []                   # context (info such as recursion for table2)
        #self.recursionLevelObj=0
        self.recursionLevelWalk=0
        random.seed()

    def setDefaults(self,**kwargs):
        """Override DEFAULT_* limits on this instance.

        Raises Exception for any keyword that is not an existing attribute
        whose name starts with "DEFAULT_".
        """
        valid_defaults = [i for i in dir(self) if i.startswith("DEFAULT_")]
        for k,v in kwargs.iteritems():
            if k in valid_defaults:
                setattr(self,k,v)
            else:
                raise Exception("Not allowed to change %s to %s (valid options: %s)"%(k,v,valid_defaults))

    def getTable(self):
        """Return the table currently in use."""
        return self.table

    def getTagName(self,node):
        """Return "<name>" for node when showTags is set and node[0] is truthy, else ""."""
        if self.showTags and node[0]:
            return "<%s>"%node[0]
        return ""

    def checkTypeIterable(self,l):
        """True for any iterable that is not a string."""
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring)

    def checkTypeIterableRecursive(self,l):
        """True for a tuple whose first item is a list and whose second item is an int."""
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring) and isinstance(l,tuple) and isinstance(l[0],list) and isinstance(l[1],int)

    def checkTypeNodeBase(self,l):
        """True for a non-string iterable shaped like (None|str, int, ...)."""
        #checks ( None|str, int, *)
        return self.checkTypeIterable(l) and len(l)>=2 and (l[0]==None or isinstance(l[0],basestring)) and isinstance(l[1],int)

    def checkTypeNodeWithChilds(self,l):
        """True for a base node with at least three items whose third item is iterable."""
        #print "check_",str(l)[:50]
        try:
            #print "check_metric",checkTypeNodeBase(l),len(l)>=3 , checkTypeIterable(l[2])
            pass
        except:
            pass
        return self.checkTypeNodeBase(l) and len(l)>=3 and self.checkTypeIterable(l[2])

    def next(self):
        """Placeholder; returns None."""
        return

    def rndTimesFunc(self,sample_func,args,minlen=0,maxlen=None):
        """Concatenate sample_func(args) a random number of times.

        The count is random.randrange(minlen, maxlen+1); a falsy maxlen
        (including 0) falls back to DEFAULT_MAX_TIMES_FUNC.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_FUNC
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample_func(args)
        return out

    def rndTimes(self,sample,minlen=0,maxlen=None):
        """Repeat sample a random number of times.

        The count is random.randrange(minlen, maxlen+1); a falsy maxlen
        (including 0) falls back to DEFAULT_MAX_TIMES_CHAR.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out+=sample
        return out

    def rndSelect(self,haystack,sample_len=1,minlen=0,maxlen=None):
        """Concatenate random samples of sample_len items drawn from haystack.

        The number of samples is random.randrange(minlen, maxlen+1); a
        falsy maxlen (including 0) falls back to DEFAULT_MAX_TIMES_CHAR.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        maxlen+=1
        out = ""
        for i in range(random.randrange(minlen,maxlen)):
            out += "".join(random.sample(haystack,sample_len))
        return out

    def eval(self,node):
        """Turn one walked node into a string.

        Returns "" for a falsy node and raises Exception for nodes with
        fewer than three items.  Otherwise the command code in node[1]
        (compared against Tdef constants) and the node length select how
        the text is produced.  Nodes matching no branch yield only their
        tag name.
        """
        # nodes come in different lengths
        #print node
        #print id(node),node
        #if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION or self.recursionLevelWalk>self.DEFAULT_MAX_WALK_RECURSION:
        #    return "<recursion_exception>"
        if not node:
            return ""
        if len(node)<3:
            raise Exception( "<3 - %s"%repr(node) ) #this is an error!
        elif node[1]==Tdef.MATCH_RECURSION_EXCEPTION:
            return "<<"
        elif node[1]==Tdef.MATCH_RECURSION:
            # create a new EBNFSpill object, and resolve this one?
            #print node[2],self.nodes[node[2]]
            self.recursionLevelObj+=1
            try:
                x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                x.setTable(self.table)
                recr_node=self.nodes[node[2]]
            except:
                # Reached when the child constructor refuses (recursion
                # limit) or node[2] is not in the registry; the bare
                # except also hides any other error raised here.
                return ""
            #
            #print "REKR",node
            #print "REKR2",self.nodes
            #print "<DAMN_RECURSION %s wild=%s>"%(node[2],self.ctx)
            #return "<RECURSION"
            #print "EXCEPT:",node[2],self.nodes
            #return self.rndTimes(x.generate(recr_node['obj']), 0, 3)
            return self.getTagName(node)+x.generate(recr_node)
        # single words/selections
        elif len(node)==3:
            if node[1]==Tdef.MATCH_WORD or node[1]==Tdef.MATCH_IS:
                return self.getTagName(node)+node[2]
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2],minlen=1,maxlen=1)
            elif node[1]==Tdef.MATCH_TABLE:
                # (xyz,MATCH_TABLE, <table>, 1) == exact 1
                # (xyz,MATCH_TABLE, <table>, 2,1) == *
                return self.getTagName(node)+""
                #return "<TABLE: %s>"%node[0]
        # mostly recursive ones
        elif len(node)>3:
            # recursions and stuff
            # NOTE(review): both operands test MATCH_IS; the three-item
            # branch above tests MATCH_WORD or MATCH_IS - possibly a typo,
            # confirm before changing.
            if node[1]==Tdef.MATCH_IS or node[1]==Tdef.MATCH_IS:
                # like (none,"MATCH_IS",'c',1,0) - choose zero or xx times
                return self.getTagName(node)+self.rndTimes(node[2])
            elif node[1]==Tdef.MATCH_ALLIN or node[1]==Tdef.MATCH_ISIN:
                return self.getTagName(node)+self.rndSelect(node[2])
            elif node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                # (xyz,MATCH_TABLE, <table>, 1) == exact 1
                # (xyz,MATCH_TABLE, <table>, 2,1) == *
                self.recursionLevelObj+=1
                try:
                    x = EBNFSpill(showTags=self.showTagsRecursive,recursionLevel=self.recursionLevelObj)
                except:
                    # Child construction raises once the recursion limit
                    # is exceeded; the sub-table then contributes nothing.
                    return ""
                x.setTable(self.table)
                #print "<TABLE: %s | %s || %s || nodeid:%s>"%(node[0:1],node[3],self.ctx,id(node[3]))
                #print node[2]
                #return self.getTagName(node)+""
                # (node[2]) is just node[2], not a one-element tuple.
                return self.getTagName(node)+self.rndTimesFunc(x.generate,(node[2]))
        return self.getTagName(node)

    def generate(self,node=None):
        """Walk node (or the stored table when node is falsy) and join the eval() output of every visited node."""
        out = ""
        for n in self.walk(node):
            #print n
            #print self.recursionLevelObj,self.recursionLevelWalk
            out+= self.eval(n)
        return out

    def process(self,l):
        """Replace the numeric command of a base node with its name from Tdef().toName(); other values pass through."""
        if self.checkTypeNodeBase(l):
            return (l[0],Tdef().toName()[l[1]])+l[2:]
        return l

    def _checkRecursion(self,node):
        """Raise StopRecursionException if id(node) is already registered; otherwise return node."""
        # return boolean if boolean=True
        nID = id(node)
        #print "-->",nID, " NODE ",node
        if self.nodes.has_key(nID):
            raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
        #self.nodeIDs.append(nID)
        #print nID,node
        return node

    def _trackNode(self,node,nodeID=None):
        """Register node in self.nodes under nodeID (or id(node)) and return it.

        Base nodes are only registered when their command lies between
        Tdef.MATCH_CALL and Tdef.MATCH_SUBTABLEINLIST; any other non-string
        iterable is always registered.
        """
        nID = nodeID or id(node)
        #print node
        if self.checkTypeNodeBase(node):
            #print "ISIN1",Tdef.MATCH_CALL,Tdef.MATCH_SUBTABLEINLIST,node[1],node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST
            if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST:
                #print "ISIN2"
                #print "--- add BASE",id(node),node
                self.nodes[nID]=node
        elif self.checkTypeIterable(node):
            #print "--- add LIST",id(node),node
            self.nodes[nID]=node
        return node

    def _pushLevel(self,node):
        """Push id(node) on the context stack for call/sub-table nodes with node[3]==2; return node."""
        # add one level, to check recursion space
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST \
            and len(node)>3 and node[3]==2:
            #print "push__"
            self.ctx.append(id(node))
        return node

    def _popLevel(self,node):
        """Pop the context stack for nodes that _pushLevel would have pushed.

        Returns the popped id in that case, otherwise node itself.
        """
        if node[1]>=Tdef.MATCH_CALL and node[1]<=Tdef.MATCH_SUBTABLEINLIST \
            and len(node)>3 and node[3]==2:
            #print "pop___"
            return self.ctx.pop()
        return node

    def walk(self,table=None):
        """Return a generator over the nodes of table (default: the stored table).

        Raises Exception when neither table nor self.table is set.
        _walk() is a generator function, so the _reset() below runs before
        any node has been visited.
        """
        table=table or self.table
        if not table:
            raise Exception("EBNF TagTable not set, please generate [.setDeclaration()] or set one [.setTable()]") # must not be None; call .setDeclaration() first
        retn = self._walk(table)
        self._reset()
        return retn

    def _walk(self,l):
        """Recursively yield the nodes found in l.

        Iteration ends immediately once either recursion counter exceeds
        its DEFAULT_MAX_* limit.  A StopRecursionException raised while
        handling l is turned into one yielded item, the exception's
        getObj() value.
        """
        # check if (None|basestring, int, ... ) > 2
        #import time
        #time.sleep(0.8)
        #print "BEGIN",str(l)[:50]
        #recursion check
        if self.recursionLevelObj>self.DEFAULT_MAX_SELF_RECURSION or self.recursionLevelWalk>self.DEFAULT_MAX_WALK_RECURSION:
            #print self.recursionLevelWalk
            #print self.recursionLevelObj
            #nID=
            #raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
            #print self.nodes
            #yield l
            #print "StopIter",l
            #print self.recursionLevelObj,self.recursionLevelWalk
            # Python 2 idiom: StopIteration raised in a generator body ends the iteration.
            raise StopIteration("HMM")
            #yield (None,Tdef.MATCH_RECURSION_EXCEPTION,())
            #raise StopRecursionException(("[RECURSION_EXCEPTION_LEVEL_REACHED]",Tdef.MATCH_RECURSION_EXCEPTION,None))
        self.recursionLevelWalk+=1
        #print id(l),len(l),l
        try:
            if self.checkTypeNodeWithChilds(l):
                #print "Childs"
                self._checkRecursion(l)
                yield self._trackNode(l)
                self._pushLevel(l)
                for e in self._walk(l[2]):
                    yield e
                self._popLevel(l)
            elif self.checkTypeNodeBase(l):
                #print "Base"
                self._checkRecursion(l)
                yield self._trackNode(l)
            elif self.checkTypeIterableRecursive(l):
                #print "xxx",l[0][0]
                nID=id(l[0][0])
                #print "IterReck"
                #print '[RECURSION of Node=%s]'%nID
                #TODO: does not work
                #fixme: does not work - recurses too much
                raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
            elif self.checkTypeIterable(l):
                #print "list"
                self._checkRecursion(l)
                self._trackNode(l) # checkTypeIterableRecursive refers to one of these nodes; may need to reparse if this does not work out
                #self._pushLevel(l)
                for e in l:
                    self._pushLevel(e)
                    for x in self._walk(e):
                        yield x #do not check recursion here.. this is not what we want
                    self._popLevel(e)
                #self._popLevel(l)
            else:
                self._checkRecursion(l)
                print "Elem? - ",l
                #print self.checkTypeNodeWithChilds(l),self.checkTypeNodeBase(l),self.checkTypeIterable(l)
                yield self._trackNode(l)
        except StopRecursionException, e:
            #print self.nodes[e.getObj()[2]]
            #print "Except:",e.getObj()
            yield e.getObj()
        self.recursionLevelWalk-=1
class LegalRef: # Kanske detta borde vara 1,2,4,8 osv, så att anroparen kan be om # LAGRUM | FORESKRIFTER, och så vi kan definera samlingar av # vanliga kombinationer (exv ALL_LAGSTIFTNING = LAGRUM | # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING) LAGRUM = 1 # hänvisningar till lagrum i SFS KORTLAGRUM = 2 # SFS-hänvisningar på kortform FORESKRIFTER = 3 # hänvisningar till myndigheters författningssamlingar EGLAGSTIFTNING = 4 # EG-fördrag, förordningar och direktiv INTLLAGSTIFTNING = 5 # Fördrag, traktat etc FORARBETEN = 6 # proppar, betänkanden, etc RATTSFALL = 7 # Rättsfall i svenska domstolar MYNDIGHETSBESLUT = 8 # Myndighetsbeslut (JO, ARN, DI...) EGRATTSFALL = 9 # Rättsfall i EG-domstolen/förstainstansrätten INTLRATTSFALL = 10 # Europadomstolen # re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\. # |N|)?\d+( s\.\d+|))#?(K(\d+)|)(P(\d+)|)(S(\d+)|)(N(\d+)|)') re_urisegments = re.compile( r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)') re_escape_compound = re.compile( r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE) re_escape_named = re.compile( r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE) re_descape_compound = re.compile( r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE) re_descape_named = re.compile( r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)') re_xmlcharref = re.compile("&#\d+;") def __init__(self, *args): if not os.path.sep in __file__: scriptdir = os.getcwd() else: scriptdir = os.path.dirname(__file__) self.graph = Graph() n3file = os.path.relpath(scriptdir + "/../../../res/etc/sfs-extra.n3") # print "loading n3file %s" % n3file self.graph.load(n3file, format="n3") self.roots = [] self.uriformatter = {} self.decl = "" # try to make it unicode clean all the way self.namedlaws = {} self.load_ebnf(scriptdir + "/../../../res/etc/base.ebnf") self.args = args if self.LAGRUM 
in args: productions = self.load_ebnf(scriptdir + "/../../../res/etc/lagrum.ebnf") for p in productions: self.uriformatter[p] = self.sfs_format_uri self.namedlaws.update(self.get_relations(RDFS.label)) self.roots.append("sfsrefs") self.roots.append("sfsref") if self.KORTLAGRUM in args: # om vi inte redan laddat lagrum.ebnf måste vi göra det # nu, eftersom kortlagrum.ebnf beror på produktioner som # definerats där if not self.LAGRUM in args: self.load_ebnf(scriptdir + "/../../../res/etc/lagrum.ebnf") productions = self.load_ebnf( scriptdir + "/../../../res/etc/kortlagrum.ebnf") for p in productions: self.uriformatter[p] = self.sfs_format_uri DCT = Namespace("http://purl.org/dc/terms/") d = self.get_relations(DCT['alternate']) self.namedlaws.update(d) # lawlist = [x.encode(SP_CHARSET) for x in list(d.keys())] lawlist = list(d.keys()) # Make sure longer law abbreviations come before shorter # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L") # lawlist.sort(cmp=lambda x, y: len(y) - len(x)) lawlist.sort(key=len, reverse=True) lawdecl = "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist) self.decl += lawdecl self.roots.insert(0, "kortlagrumref") if self.EGLAGSTIFTNING in args: productions = self.load_ebnf(scriptdir + "/../../../res/etc/eglag.ebnf") for p in productions: self.uriformatter[p] = self.eglag_format_uri self.roots.append("eglagref") if self.FORARBETEN in args: productions = self.load_ebnf( scriptdir + "/../../../res/etc/forarbeten.ebnf") for p in productions: self.uriformatter[p] = self.forarbete_format_uri self.roots.append("forarbeteref") if self.RATTSFALL in args: productions = self.load_ebnf(scriptdir + "/../../../res/etc/rattsfall.ebnf") for p in productions: self.uriformatter[p] = self.rattsfall_format_uri self.roots.append("rattsfallref") if self.EGRATTSFALL in args: productions = self.load_ebnf(scriptdir + "/../../../res/etc/egratt.ebnf") for p in productions: self.uriformatter[p] = self.egrattsfall_format_uri self.roots.append("ecjcaseref") 
rootprod = "root ::= (%s/plain)+\n" % "/".join(self.roots) self.decl += rootprod self.parser = Parser(self.decl.encode(SP_CHARSET), "root") self.tagger = self.parser.buildTagger("root") # util.writefile("tagger.tmp", repr(self.tagger), SP_CHARSET) # print "tagger length: %d" % len(repr(self.tagger)) self.verbose = False self.depth = 0 # SFS-specifik kod self.currentlaw = None self.currentchapter = None self.currentsection = None self.currentpiece = None self.lastlaw = None self.currentlynamedlaws = {} def load_ebnf(self, file): """Laddar in produktionerna i den angivna filen i den EBNF-deklaration som används, samt returnerar alla *Ref och *RefId-produktioner""" # base.ebnf contains 0x1A, ie the EOF character on windows, # therefore we need to read it in binary mode f = open(file, 'rb') # assume our ebnf files use the same charset content = f.read(os.stat(file).st_size).decode(SP_CHARSET) self.decl += content f.close() return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)] def get_relations(self, predicate): d = {} for obj, subj in self.graph.subject_objects(predicate): d[six.text_type(subj)] = six.text_type(obj) return d def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9", predicate=None): assert isinstance(indata, six.text_type) if indata == "": return indata # this actually triggered a bug... # h = hashlib.sha1() # h.update(indata) # print "Called with %r (%s) (%s)" % (indata, h.hexdigest(), self.verbose) self.predicate = predicate self.baseuri = baseuri if baseuri: m = self.re_urisegments.match(baseuri) if m: self.baseuri_attributes = {'baseuri': m.group(1), 'law': m.group(2), 'chapter': m.group(6), 'section': m.group(8), 'piece': m.group(10), 'item': m.group(12)} else: self.baseuri_attributes = {'baseuri': baseuri} else: self.baseuri_attributes = {} # Det är svårt att få EBNF-grammatiken att känna igen # godtyckliga ord som slutar på ett givet suffix (exv # 'bokföringslagen' med suffixet 'lagen'). 
Därför förbehandlar # vi indatasträngen och stoppar in ett '|'-tecken innan vissa # suffix. Vi transformerar även 'Radio- och TV-lagen' till # 'Radio-_och_TV-lagen' # # FIXME: Obviously, this shouldn't be done in a general class, # but rather in a subclas or via proxy/adapter fixedindata = indata # FIXME: Nonsensical if self.LAGRUM in self.args: fixedindata = self.re_escape_compound.sub( r'\1_\2_\3\4', fixedindata) fixedindata = self.re_escape_named.sub(r'|\1', fixedindata) # print "After: %r" % type(fixedindata) # SimpleParse har inget stöd för unicodesträngar, så vi # konverterar intdatat till en bytesträng. Tyvärr får jag inte # det hela att funka med UTF8, så vi kör xml character # references istället fixedindata = fixedindata.encode(SP_CHARSET, 'xmlcharrefreplace') # Parsea texten med TextTools.tag - inte det enklaste sättet # att göra det, men om man gör enligt # Simpleparse-dokumentationen byggs taggertabellen om för # varje anrop till parse() if self.verbose: print(("calling tag with '%s'" % (fixedindata.decode(SP_CHARSET)))) # print "tagger length: %d" % len(repr(self.tagger)) taglist = tag(fixedindata, self.tagger, 0, len(fixedindata)) result = [] root = NodeTree(taglist, fixedindata) for part in root.nodes: if part.tag != 'plain' and self.verbose: sys.stdout.write(self.prettyprint(part)) if part.tag in self.roots: self.clear_state() # self.verbose = False result.extend(self.formatter_dispatch(part)) else: assert part.tag == 'plain', "Tag is %s" % part.tag result.append(part.text) # clear state if self.currentlaw is not None: self.lastlaw = self.currentlaw self.currentlaw = None if taglist[-1] != len(fixedindata): log.error('Problem (%d:%d) with %r / %r' % ( taglist[-1] - 8, taglist[-1] + 8, fixedindata, indata)) raise ParseError( "parsed %s chars of %s (...%s...)" % (taglist[-1], len(indata), indata[(taglist[-1] - 2):taglist[-1] + 3])) # Normalisera resultatet, dvs konkatenera intilliggande # textnoder, och ta bort ev '|'-tecken som vi stoppat in # 
tidigare. normres = [] for i in range(len(result)): if not self.re_descape_named.search(result[i]): node = result[i] else: if self.LAGRUM in self.args: text = self.re_descape_named.sub(r'\1', result[i]) text = self.re_descape_compound.sub(r'\1 \2 \3\4', text) if isinstance(result[i], Link): # Eftersom Link-objekt är immutable måste vi skapa # ett nytt och kopiera dess attribut if hasattr(result[i], 'predicate'): node = LinkSubject(text, predicate=result[i].predicate, uri=result[i].uri) else: node = Link(text, uri=result[i].uri) else: node = text if (len(normres) > 0 and not isinstance(normres[-1], Link) and not isinstance(node, Link)): normres[-1] += node else: normres.append(node) # and finally... for i in range(len(normres)): if isinstance(normres[i], Link): # deal with these later pass else: normres[i] = self.re_xmlcharref.sub( self.unescape_xmlcharref, normres[i]) return normres def unescape_xmlcharref(self, m): # print "Changing %r to a %r" % (m.group(0)[2:-1], unichr(int(m.group(0)[2:-1]))) return chr(int(m.group(0)[2:-1])) def find_attributes(self, parts, extra={}): """recurses through a parse tree and creates a dictionary of attributes""" d = {} self.depth += 1 if self.verbose: print( (". " * self.depth + "find_attributes: starting with %s" % d)) if extra: d.update(extra) for part in parts: current_part_tag = part.tag.lower() if current_part_tag.endswith('refid'): if ((current_part_tag == 'singlesectionrefid') or (current_part_tag == 'lastsectionrefid')): current_part_tag = 'sectionrefid' d[current_part_tag[:-5]] = part.text.strip() if self.verbose: print((". " * self.depth + "find_attributes: d is now %s" % d)) if part.nodes: d.update(self.find_attributes(part.nodes, d)) if self.verbose: print((". 
" * self.depth + "find_attributes: returning %s" % d)) self.depth -= 1 if self.currentlaw and 'law' not in d: d['law'] = self.currentlaw if self.currentchapter and 'chapter' not in d: d['chapter'] = self.currentchapter if self.currentsection and 'section' not in d: d['section'] = self.currentsection if self.currentpiece and 'piece' not in d: d['piece'] = self.currentpiece return d def find_node(self, root, nodetag): """Returns the first node in the tree that has a tag matching nodetag. The search is depth-first""" if root.tag == nodetag: # base case return root else: for node in root.nodes: x = self.find_node(node, nodetag) if x is not None: return x return None def find_nodes(self, root, nodetag): if root.tag == nodetag: return [root] else: res = [] for node in root.nodes: res.extend(self.find_nodes(node, nodetag)) return res def flatten_tokentree(self, part, suffix): """returns a 'flattened' tokentree ie for the following tree and the suffix 'RefID' foo->bar->BlahongaRefID ->baz->quux->Blahonga2RefID ->Blahonga3RefID ->Blahonga4RefID this should return [BlahongaRefID, Blahonga2RefID, Blahonga3RefID, Blahonga4RefID]""" l = [] if part.tag.endswith(suffix): l.append(part) if not part.nodes: return l for subpart in part.nodes: l.extend(self.flatten_tokentree(subpart, suffix)) return l def formatter_dispatch(self, part): # print "Verbositiy: %r" % self.verbose self.depth += 1 # Finns det en skräddarsydd formatterare? if "format_" + part.tag in dir(self): formatter = getattr(self, "format_" + part.tag) if self.verbose: print( ((". " * self.depth) + "formatter_dispatch: format_%s defined, calling it" % part.tag)) res = formatter(part) assert res is not None, "Custom formatter for %s didn't return anything" % part.tag else: if self.verbose: print( ((". " * self.depth) + "formatter_dispatch: no format_%s, using format_tokentree" % part.tag)) res = self.format_tokentree(part) if res is None: print(((". 
" * self.depth) + "something wrong with this:\n" + self.prettyprint(part))) self.depth -= 1 return res def format_tokentree(self, part): # This is the default formatter. It converts every token that # ends with a RefID into a Link object. For grammar # productions like SectionPieceRefs, which contain # subproductions that also end in RefID, this is not a good # function to use - use a custom formatter instead. res = [] if self.verbose: print(((". " * self.depth) + "format_tokentree: called for %s" % part.tag)) # this is like the bottom case, or something if (not part.nodes) and (not part.tag.endswith("RefID")): res.append(part.text) else: if part.tag.endswith("RefID"): res.append(self.format_generic_link(part)) elif part.tag.endswith("Ref"): res.append(self.format_generic_link(part)) else: for subpart in part.nodes: if self.verbose and part.tag == 'LawRef': print( ((". " * self.depth) + "format_tokentree: part '%s' is a %s" % (subpart.text, subpart.tag))) res.extend(self.formatter_dispatch(subpart)) if self.verbose: print( ((". " * self.depth) + "format_tokentree: returning '%s' for %s" % (res, part.tag))) return res def prettyprint(self, root, indent=0): res = "%s'%s': '%s'\n" % ( " " * indent, root.tag, re.sub(r'\s+', ' ', root.text)) if root.nodes is not None: for subpart in root.nodes: res += self.prettyprint(subpart, indent + 1) return res else: return "" def format_generic_link(self, part, uriformatter=None): try: uri = self.uriformatter[part.tag](self.find_attributes([part])) except KeyError: if uriformatter: uri = uriformatter(self.find_attributes([part])) else: uri = self.sfs_format_uri(self.find_attributes([part])) except AttributeError: # Normal error from eglag_format_uri return part.text except: exc = sys.exc_info() # If something else went wrong, just return the plaintext log.warning("(unknown): Unable to format link for text %s (production %s)" % (part.text, part.tag)) return part.text if self.verbose: print(( (". 
" * self.depth) + "format_generic_link: uri is %s" % uri)) if not uri: # the formatting function decided not to return a URI for # some reason (maybe it was a partial/relative reference # without a proper base uri context return part.text elif self.predicate: return LinkSubject(part.text, uri=uri, predicate=self.predicate) else: return Link(part.text, uri=uri) # FIXME: unify this with format_generic_link def format_custom_link(self, attributes, text, production): try: uri = self.uriformatter[production](attributes) except KeyError: uri = self.sfs_format_uri(attributes) if not uri: # the formatting function decided not to return a URI for # some reason (maybe it was a partial/relative reference # without a proper base uri context return part.text elif self.predicate: return LinkSubject(text, uri=uri, predicate=self.predicate) else: return Link(text, uri=uri) # # KOD FÖR LAGRUM def clear_state(self): self.currentlaw = None self.currentchapter = None self.currentsection = None self.currentpiece = None def normalize_sfsid(self, sfsid): # sometimes '1736:0123 2' is given as '1736:0123 s. 2' or # '1736:0123.2'. This fixes that. sfsid = re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsid) # return sfsid.replace('s. ','').replace('s.','') # more advanced normalizations to come... 
return sfsid def normalize_lawname(self, lawname): lawname = lawname.replace('|', '').replace('_', ' ').lower() if lawname.endswith('s'): lawname = lawname[:-1] return lawname def namedlaw_to_sfsid(self, text, normalize=True): if normalize: text = self.normalize_lawname(text) nolaw = [ 'aktieslagen', 'anordningen', 'anordningen', 'anslagen', 'arbetsordningen', 'associationsformen', 'avfallsslagen', 'avslagen', 'avvittringsutslagen', 'bergslagen', 'beskattningsunderlagen', 'bolagen', 'bolagsordningen', 'bolagsordningen', 'dagordningen', 'djurslagen', 'dotterbolagen', 'emballagen', 'energislagen', 'ersättningsformen', 'ersättningsslagen', 'examensordningen', 'finansbolagen', 'finansieringsformen', 'fissionsvederlagen', 'flygbolagen', 'fondbolagen', 'förbundsordningen', 'föreslagen', 'företrädesordningen', 'förhandlingsordningen', 'förlagen', 'förmånsrättsordningen', 'förmögenhetsordningen', 'förordningen', 'förslagen', 'försäkringsaktiebolagen', 'försäkringsbolagen', 'gravanordningen', 'grundlagen', 'handelsplattformen', 'handläggningsordningen', 'inkomstslagen', 'inköpssamordningen', 'kapitalunderlagen', 'klockslagen', 'kopplingsanordningen', 'låneformen', 'mervärdesskatteordningen', 'nummerordningen', 'omslagen', 'ordalagen', 'pensionsordningen', 'renhållningsordningen', 'representationsreformen', 'rättegångordningen', 'rättegångsordningen', 'rättsordningen', 'samordningen', 'samordningen', 'skatteordningen', 'skatteslagen', 'skatteunderlagen', 'skolformen', 'skyddsanordningen', 'slagen', 'solvärmeanordningen', 'storslagen', 'studieformen', 'stödformen', 'stödordningen', 'stödordningen', 'säkerhetsanordningen', 'talarordningen', 'tillslagen', 'tivolianordningen', 'trafikslagen', 'transportanordningen', 'transportslagen', 'trädslagen', 'turordningen', 'underlagen', 'uniformen', 'uppställningsformen', 'utvecklingsbolagen', 'varuslagen', 'verksamhetsformen', 'vevanordningen', 'vårdformen', 'ägoanordningen', 'ägoslagen', 'ärendeslagen', 'åtgärdsförslagen', ] if text in 
nolaw: return None if text in self.currentlynamedlaws: return self.currentlynamedlaws[text] elif text in self.namedlaws: return self.namedlaws[text] else: if self.verbose: # print "(unknown): I don't know the ID of named law [%s]" % text log.warning( "(unknown): I don't know the ID of named law [%s]" % text) return None def sfs_format_uri(self, attributes): piecemappings = {'första': '1', 'andra': '2', 'tredje': '3', 'fjärde': '4', 'femte': '5', 'sjätte': '6', 'sjunde': '7', 'åttonde': '8', 'nionde': '9'} keymapping = {'lawref': 'L', 'chapter': 'K', 'section': 'P', 'piece': 'S', 'item': 'N', 'itemnumeric': 'N', 'element': 'O', 'sentence': 'M', # is this ever used? } attributeorder = ['law', 'lawref', 'chapter', 'section', 'element', 'piece', 'item', 'itemnumeric', 'sentence'] if 'law' in attributes: if attributes['law'].startswith('http://'): res = '' else: res = 'http://rinfo.lagrummet.se/publ/sfs/' else: if 'baseuri' in self.baseuri_attributes: res = self.baseuri_attributes['baseuri'] else: res = '' resolvetobase = True addfragment = False justincase = None for key in attributeorder: if key in attributes: resolvetobase = False val = attributes[key] elif (resolvetobase and key in self.baseuri_attributes): val = self.baseuri_attributes[key] else: val = None if val: if not isinstance(val, six.text_type): val = val.decode(SP_CHARSET) if addfragment: res += '#' addfragment = False if (key in ['piece', 'itemnumeric', 'sentence'] and val in piecemappings): res += '%s%s' % ( keymapping[key], piecemappings[val.lower()]) else: if key == 'law': val = self.normalize_sfsid(val) val = val.replace(" ", "_") res += val addfragment = True else: if justincase: res += justincase justincase = None val = val.replace(" ", "") val = val.replace("\n", "") val = val.replace("\r", "") res += '%s%s' % (keymapping[key], val) else: if key == 'piece': justincase = "S1" return res def format_ChapterSectionRefs(self, root): assert(root.tag == 'ChapterSectionRefs') assert(len(root.nodes) == 3) # 
ChapterRef, wc, SectionRefs part = root.nodes[0] self.currentchapter = part.nodes[0].text.strip() if self.currentlaw: res = [self.format_custom_link({'law': self.currentlaw, 'chapter': self.currentchapter}, part.text, part.tag)] else: res = [self.format_custom_link({'chapter': self.currentchapter}, part.text, part.tag)] res.extend(self.formatter_dispatch(root.nodes[1])) res.extend(self.formatter_dispatch(root.nodes[2])) self.currentchapter = None return res def format_ChapterSectionPieceRefs(self, root): assert(root.nodes[0].nodes[0].tag == 'ChapterRefID') self.currentchapter = root.nodes[0].nodes[0].text.strip() res = [] for node in root.nodes: res.extend(self.formatter_dispatch(node)) return res def format_LastSectionRef(self, root): # the last section ref is a bit different, since we want the # ending double section mark to be part of the link text assert(root.tag == 'LastSectionRef') assert(len(root.nodes) == 3) # LastSectionRefID, wc, DoubleSectionMark sectionrefid = root.nodes[0] sectionid = sectionrefid.text return [self.format_generic_link(root)] def format_SectionPieceRefs(self, root): assert(root.tag == 'SectionPieceRefs') self.currentsection = root.nodes[0].nodes[0].text.strip() res = [self.format_custom_link(self.find_attributes([root.nodes[2]]), "%s %s" % (root.nodes[0] .text, root.nodes[2].text), root.tag)] for node in root.nodes[3:]: res.extend(self.formatter_dispatch(node)) self.currentsection = None return res def format_SectionPieceItemRefs(self, root): assert(root.tag == 'SectionPieceItemRefs') self.currentsection = root.nodes[0].nodes[0].text.strip() self.currentpiece = root.nodes[2].nodes[0].text.strip() res = [self.format_custom_link(self.find_attributes([root.nodes[2]]), "%s %s" % (root.nodes[0] .text, root.nodes[2].text), root.tag)] for node in root.nodes[3:]: res.extend(self.formatter_dispatch(node)) self.currentsection = None self.currentpiece = None return res # This is a special case for things like '17-29 och 32 §§ i lagen # 
(2004:575)', which picks out the LawRefID first and stores it in # .currentlaw, so that find_attributes finds it # automagically. Although now it seems to be branching out and be # all things to all people. def format_ExternalRefs(self, root): assert(root.tag == 'ExternalRefs') # print "DEBUG: start of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw lawrefid_node = self.find_node(root, 'LawRefID') if lawrefid_node is None: # Ok, no explicit LawRefID found, lets see if this is a named law that we have the ID for # namedlaw_node = self.find_node(root, 'NamedLawExternalLawRef') namedlaw_node = self.find_node(root, 'NamedLaw') if namedlaw_node is None: # As a last chance, this might be a reference back to a previously # mentioned law ("...enligt 4 § samma lag") samelaw_node = self.find_node(root, 'SameLaw') assert(samelaw_node is not None) if self.lastlaw is None: log.warning( "(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set") self.currentlaw = self.lastlaw else: # the NamedLaw case self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text) if self.currentlaw is None: # unknow law name - in this case it's better to # bail out rather than resolving chapter/paragraph # references relative to baseuri (which is almost # certainly wrong) return [root.text] else: self.currentlaw = lawrefid_node.text if self.find_node(root, 'NamedLaw'): namedlaw = self.normalize_lawname( self.find_node(root, 'NamedLaw').text) # print "remember that %s is %s!" % (namedlaw, self.currentlaw) self.currentlynamedlaws[namedlaw] = self.currentlaw # print "DEBUG: middle of format_ExternalRefs; self.currentlaw is %s" % self.currentlaw if self.lastlaw is None: # print "DEBUG: format_ExternalRefs: setting self.lastlaw to %s" % self.currentlaw self.lastlaw = self.currentlaw # if the node tree only contains a single reference, it looks # better if the entire expression, not just the # chapter/section part, is linked. 
But not if it's a # "anonymous" law ('1 § i lagen (1234:234) om blahonga') if (len(self.find_nodes(root, 'GenericRefs')) == 1 and len(self.find_nodes(root, 'SectionRefID')) == 1 and len(self.find_nodes(root, 'AnonymousExternalLaw')) == 0): res = [self.format_generic_link(root)] else: res = self.format_tokentree(root) return res def format_SectionItemRefs(self, root): assert(root.nodes[0].nodes[0].tag == 'SectionRefID') self.currentsection = root.nodes[0].nodes[0].text.strip() # res = self.formatter_dispatch(root.nodes[0]) # was formatter_dispatch(self.root) res = self.format_tokentree(root) self.currentsection = None return res def format_PieceItemRefs(self, root): self.currentpiece = root.nodes[0].nodes[0].text.strip() res = [self.format_custom_link( self.find_attributes([root.nodes[2].nodes[0]]), "%s %s" % (root.nodes[0].text, root.nodes[2].nodes[0].text), root.tag)] for node in root.nodes[2].nodes[1:]: res.extend(self.formatter_dispatch(node)) self.currentpiece = None return res def format_ChapterSectionRef(self, root): assert(root.nodes[0].nodes[0].tag == 'ChapterRefID') self.currentchapter = root.nodes[0].nodes[0].text.strip() return [self.format_generic_link(root)] def format_AlternateChapterSectionRefs(self, root): assert(root.nodes[0].nodes[0].tag == 'ChapterRefID') self.currentchapter = root.nodes[0].nodes[0].text.strip() # print "Self.currentchapter is now %s" % self.currentchapter res = self.format_tokentree(root) self.currentchapter = None return res def format_ExternalLaw(self, root): self.currentchapter = None return self.formatter_dispatch(root.nodes[0]) def format_ChangeRef(self, root): id = self.find_node(root, 'LawRefID').data return [self.format_custom_link({'lawref': id}, root.text, root.tag)] def format_SFSNr(self, root): if self.baseuri is None: sfsid = self.find_node(root, 'LawRefID').data baseuri = 'http://rinfo.lagrummet.se/publ/sfs/%s#' % sfsid.decode(SP_CHARSET) self.baseuri_attributes = {'baseuri': baseuri} return 
self.format_tokentree(root) def format_NamedExternalLawRef(self, root): resetcurrentlaw = False # print "format_NamedExternalLawRef: self.currentlaw is %r" % self.currentlaw if self.currentlaw is None: resetcurrentlaw = True lawrefid_node = self.find_node(root, 'LawRefID') if lawrefid_node is None: self.currentlaw = self.namedlaw_to_sfsid(root.text) else: self.currentlaw = lawrefid_node.text namedlaw = self.normalize_lawname( self.find_node(root, 'NamedLaw').text) # print "remember that %s is %s!" % (namedlaw, self.currentlaw) self.currentlynamedlaws[namedlaw] = self.currentlaw # print "format_NamedExternalLawRef: self.currentlaw is now %r" % self.currentlaw # print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri if self.currentlaw is None: # if we can't find a ID for this law, better not <link> it res = [root.text] else: res = [self.format_generic_link(root)] # print "format_NamedExternalLawRef: self.baseuri is %r" % self.baseuri if self.baseuri is None and self.currentlaw is not None: # print "format_NamedExternalLawRef: setting baseuri_attributes" # use this as the new baseuri_attributes m = self.re_urisegments.match(self.currentlaw) if m: self.baseuri_attributes = {'baseuri': m.group(1), 'law': m.group(2), 'chapter': m.group(6), 'section': m.group(8), 'piece': m.group(10), 'item': m.group(12)} else: self.baseuri_attributes = { 'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/' + self.currentlaw + '#'} if resetcurrentlaw: if self.currentlaw is not None: self.lastlaw = self.currentlaw self.currentlaw = None return res # # KOD FÖR KORTLAGRUM def format_AbbrevLawNormalRef(self, root): lawabbr_node = self.find_node(root, 'LawAbbreviation') self.currentlaw = self.namedlaw_to_sfsid( lawabbr_node.text, normalize=False) res = [self.format_generic_link(root)] if self.currentlaw is not None: self.lastlaw = self.currentlaw self.currentlaw = None return res def format_AbbrevLawShortRef(self, root): assert(root.nodes[0].tag == 'LawAbbreviation') 
assert(root.nodes[2].tag == 'ShortChapterSectionRef') self.currentlaw = self.namedlaw_to_sfsid( root.nodes[0].text, normalize=False) shortsection_node = root.nodes[2] assert(shortsection_node.nodes[0].tag == 'ShortChapterRefID') assert(shortsection_node.nodes[2].tag == 'ShortSectionRefID') self.currentchapter = shortsection_node.nodes[0].text self.currentsection = shortsection_node.nodes[2].text res = [self.format_generic_link(root)] self.currentchapter = None self.currentsection = None self.currentlaw = None return res # # KOD FÖR FORARBETEN def forarbete_format_uri(self, attributes): # res = self.baseuri_attributes['baseuri'] res = 'http://rinfo.lagrummet.se/' resolvetobase = True addfragment = False for key, val in list(attributes.items()): if key == 'prop': res += "publ/prop/%s" % val elif key == 'bet': res += "publ/bet/%s" % val elif key == 'skrivelse': res += "publ/rskr/%s" % val elif key == 'celex': if len(val) == 8: # incorrectly formatted, uses YY instead of YYYY val = val[0] + '19' + val[1:] res += "ext/eur-lex/%s" % val if 'sidnr' in attributes: res += "#s%s" % attributes['sidnr'] return res def format_ChapterSectionRef(self, root): assert(root.nodes[0].nodes[0].tag == 'ChapterRefID') self.currentchapter = root.nodes[0].nodes[0].text.strip() return [self.format_generic_link(root)] # # KOD FÖR EGLAGSTIFTNING def eglag_format_uri(self, attributes): res = 'http://rinfo.lagrummet.se/ext/celex/' if not 'akttyp' in attributes: if 'forordning' in attributes: attributes['akttyp'] = 'förordning' elif 'direktiv' in attributes: attributes['akttyp'] = 'direktiv' if 'akttyp' not in attributes: raise AttributeError("Akttyp saknas") # Om hur CELEX-nummer konstrueras # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm # Om hur länkning till EURLEX ska se ut: # http://eur-lex.europa.eu/sv/tools/help_syntax.htm # Absolut URI? 
if 'ar' in attributes and 'lopnummer' in attributes: sektor = '3' rattslig_form = {'direktiv': 'L', 'förordning': 'R'} if len(attributes['ar']) == 2: attributes['ar'] = '19' + attributes['ar'] res += "%s%s%s%04d" % (sektor, attributes['ar'], rattslig_form[attributes['akttyp']], int(attributes['lopnummer'])) else: if not self.baseuri_attributes['baseuri'].startswith(res): # FIXME: should we warn about this? # print "Relative reference, but base context %s is not a celex context" % # self.baseuri_attributes['baseuri'] return None if 'artikel' in attributes: res += "#%s" % attributes['artikel'] if 'underartikel' in attributes: res += ".%s" % attributes['underartikel'] return res # # KOD FÖR RATTSFALL def rattsfall_format_uri(self, attributes): # Listan härledd från containers.n3/rattsfallsforteckningar.n3 i # rinfoprojektets källkod - en ambitiösare lösning vore att läsa # in de faktiska N3-filerna i en rdflib-graf. containerid = {'NJA': '/publ/rattsfall/nja/', 'RH': '/publ/rattsfall/rh/', 'MÖD': '/publ/rattsfall/mod/', 'RÅ': '/publ/rattsfall/ra/', 'RK': '/publ/rattsfall/rk/', 'MIG': '/publ/rattsfall/mig/', 'AD': '/publ/rattsfall/ad/', 'MD': '/publ/rattsfall/md/', 'FÖD': '/publ/rattsfall/fod/'} # res = self.baseuri_attributes['baseuri'] if 'nja' in attributes: attributes['domstol'] = attributes['nja'] assert 'domstol' in attributes, "No court provided" assert attributes[ 'domstol'] in containerid, "%s is an unknown court" % attributes['domstol'] res = "http://rinfo.lagrummet.se" + containerid[attributes['domstol']] if 'lopnr' in attributes and ":" in attributes['lopnr']: (attributes['ar'], attributes['lopnr']) = lopnr.split(":", 1) if attributes['domstol'] == 'NJA': # FIXME: URIs should be based on publikationsordinal, not # pagenumber (which this in effect is) - but this requires # a big lookup table/database/graph with # pagenumber-to-ordinal-mappings res += '%ss%s' % (attributes['ar'], attributes['sidnr']) else: res += '%s:%s' % (attributes['ar'], 
attributes['lopnr']) return res # # KOD FÖR EGRÄTTSFALL def egrattsfall_format_uri(self, attributes): descriptormap = {'C': 'J', # Judgment of the Court 'T': 'A', # Judgment of the Court of First Instance 'F': 'W', # Judgement of the Civil Service Tribunal } # FIXME: Change this before the year 2054 (as ECJ will # hopefully have fixed their case numbering by then) if len(attributes['year']) == 2: if int(attributes['year']) < 54: year = "20" + attributes['year'] else: year = "19" + attributes['year'] else: year = attributes['year'] serial = '%04d' % int(attributes['serial']) descriptor = descriptormap[attributes['decision']] uri = "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial) return uri
class Reference:
    """Find legal references in Swedish text and turn them into links.

    The grammar is assembled from EBNF files under ``etc/`` (relative to the
    current working directory) according to the reference kinds passed to the
    constructor.  ``parse`` tags the text with that grammar and returns a list
    in which recognised references are ``Link``/``LinkSubject`` objects and
    everything else is plain text.

    The ``current*``/``lastLaw`` attributes are scratch state shared by the
    ``format_*`` callbacks while a single reference is being formatted.

    NOTE: this class targets Python 2 (it relies on ``unicode``/``unichr``).
    """

    # Kinds of references that can be requested from the constructor.
    LAGRUM = 1
    KORTALAGRUM = 2
    FORESKRIFTER = 3
    FORARBETEN = 6

    # Splits a base URI into: prefix, law id, chapter, section, piece, item.
    reUriSegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    # "a- och b-lagen" -> "a-_och_b-lagen" so the grammar sees a single token.
    reEscapeCompound = re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    # Inserts '|' before a law-name suffix inside a word ("fooLagen" -> "foo|lagen").
    reEscapeNamed = re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)
    # Inverse operations of the two escapes above.
    reDescapeCompound = re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    reDescapeNamed = re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    reXmlCharref = re.compile(r'&#\d+;')

    # Words that end like a law name but are not laws; namedLawToSfsid never
    # resolves them.  Built once instead of on every call.
    _NO_LAW = frozenset([
        u'aktieslagen', u'anordningen', u'anordningen', u'anslagen',
        u'arbetsordningen', u'associationsformen', u'avfallsslagen',
        u'avslagen', u'avvittringsutslagen', u'bergslagen',
        u'beskattningsunderlagen', u'bolagen', u'bolagsordningen',
        u'bolagsordningen', u'dagordningen', u'djurslagen',
        u'dotterbolagen', u'emballagen', u'energislagen',
        u'ersättningsformen', u'ersättningsslagen', u'examensordningen',
        u'finansbolagen', u'finansieringsformen', u'fissionsvederlagen',
        u'flygbolagen', u'fondbolagen', u'förbundsordningen',
        u'föreslagen', u'företrädesordningen', u'förhandlingsordningen',
        u'förlagen', u'förmånsrättsordningen', u'förmögenhetsordningen',
        u'förordningen', u'förslagen', u'försäkringsaktiebolagen',
        u'försäkringsbolagen', u'gravanordningen', u'grundlagen',
        u'handelsplattformen', u'handläggningsordningen', u'inkomstslagen',
        u'inköpssamordningen', u'kapitalunderlagen', u'klockslagen',
        u'kopplingsanordningen', u'låneformen',
        u'mervärdesskatteordningen', u'nummerordningen', u'omslagen',
        u'ordalagen', u'pensionsordningen', u'renhållningsordningen',
        u'representationsreformen', u'rättegångordningen',
        u'rättegångsordningen', u'rättsordningen', u'samordningen',
        u'samordningen', u'skatteordningen', u'skatteslagen',
        u'skatteunderlagen', u'skolformen', u'skyddsanordningen',
        u'slagen', u'solvärmeanordningen', u'storslagen', u'studieformen',
        u'stödformen', u'stödordningen', u'stödordningen',
        u'säkerhetsanordningen', u'talarordningen', u'tillslagen',
        u'tivolianordningen', u'trafikslagen', u'transportanordningen',
        u'transportslagen', u'trädslagen', u'turordningen', u'underlagen',
        u'uniformen', u'uppställningsformen', u'utvecklingsbolagen',
        u'varuslagen', u'verksamhetsformen', u'vevanordningen',
        u'vårdformen', u'ägoanordningen', u'ägoslagen', u'ärendeslagen',
        u'åtgärdsförslagen',
    ])

    def __init__(self, *args):
        """Build the grammar and tagger for the reference kinds in *args*.

        Reads ``etc/sfs-extra.n3`` and the relevant ``etc/*.ebnf`` files from
        the current working directory.
        """
        scriptDir = os.getcwd()

        self.graph = Graph()
        n3File = Util.relpath(scriptDir + '/etc/sfs-extra.n3')
        self.graph.load(n3File, format='n3')

        self.roots = []
        self.uriFormatter = {}
        self.decl = ''
        self.namedLaws = {}
        self.loadEbnf(scriptDir + '/etc/base.ebnf')

        self.args = args

        if self.LAGRUM in args:
            for production in self.loadEbnf(scriptDir + '/etc/lagrum.ebnf'):
                self.uriFormatter[production] = self.sfsFormatUri
            self.namedLaws.update(self.getRelationship(RDFS.label))
            self.roots.append('sfsrefs')
            self.roots.append('sfsref')

        if self.KORTALAGRUM in args:
            # TODO: Fix korta lagrum also
            pass

        if self.FORARBETEN in args:
            for production in self.loadEbnf(scriptDir + '/etc/forarbeten.ebnf'):
                self.uriFormatter[production] = self.forarbeteFormatUri
            self.roots.append('forarbeteref')

        self.decl += 'root ::= (%s/plain)+\n' % '/'.join(self.roots)
        self.parser = Parser(self.decl, 'root')
        self.tagger = self.parser.buildTagger('root')
        self.depth = 0

        # SFS specific settings
        self.currentLaw = None
        self.currentChapter = None
        self.currentSection = None
        self.currentPiece = None
        self.lastLaw = None
        self.currentNamedLaws = {}

    def loadEbnf(self, file):
        """Append the EBNF syntax in *file* to the grammar declaration.

        Returns the names of all productions ending in ``Ref`` or ``RefID``
        that the file defines.
        """
        # ``with`` closes the handle even if read() raises.
        with open(file) as f:
            syntax = f.read()
        self.decl += syntax
        return [x.group(1)
                for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', syntax)]

    def getRelationship(self, predicate):
        """Return a dict mapping each object of *predicate* to its subject."""
        d = {}
        # subject_objects yields pairs; the second element becomes the key.
        for first, second in self.graph.subject_objects(predicate):
            d[unicode(second)] = unicode(first)
        return d

    def parse(self, indata, baseUri='http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9', predicate=None):
        """Parse *indata* and return a list of text fragments and links.

        *baseUri* is the context that relative references are resolved
        against; *predicate*, when given, makes the produced links
        ``LinkSubject`` objects carrying that predicate.

        Returns *indata* unchanged when it is the empty string.  Raises
        ``ParseError`` when the tagger did not consume the whole input.
        """
        if indata == '':
            return indata

        self.predicate = predicate
        self.baseUri = baseUri
        if baseUri:
            m = self.reUriSegments.match(baseUri)
            if m:
                self.baseUriAttrs = {'baseUri': m.group(1),
                                     'law': m.group(2),
                                     'chapter': m.group(6),
                                     'section': m.group(8),
                                     'piece': m.group(10),
                                     'item': m.group(12)}
            else:
                self.baseUriAttrs = {'baseUri': baseUri}
        else:
            self.baseUriAttrs = {}

        fixedIndata = unicode(indata)
        if self.LAGRUM in self.args:
            # Escape constructs the grammar cannot tokenise directly; they
            # are de-escaped again when the result is normalised below.
            fixedIndata = self.reEscapeCompound.sub(r'\1_\2_\3\4', fixedIndata)
            fixedIndata = self.reEscapeNamed.sub(r'|\1', fixedIndata)

        if isinstance(fixedIndata, unicode):
            fixedIndata = fixedIndata.encode(SP_CHARSET, 'xmlcharrefreplace')

        tagList = tag(fixedIndata, self.tagger, 0, len(fixedIndata))

        res = []
        root = NodeTree(tagList, fixedIndata)
        for n in root.nodes:
            if n.tag in self.roots:
                self.clearState()
                res.extend(self.formatterDispatch(n))
            else:
                assert n.tag == 'plain', 'Tag is %s' % n.tag
                res.append(n.text)

            # Remember the law of this part for "samma lag"-style back
            # references, then reset it for the next part.
            if self.currentLaw is not None:
                self.lastLaw = self.currentLaw
            self.currentLaw = None

        if tagList[-1] != len(fixedIndata):
            # TODO: Add Error
            raise ParseError('Parsed %s chars of %s (...%s...)' % (
                tagList[-1], len(indata),
                indata[(tagList[-1] - 2):tagList[-1] + 3]))

        # Normalize the result, concat and remove '|'
        result = []
        for item in res:
            if not self.reDescapeNamed.search(item):
                node = item
            else:
                # Default to the untouched item so ``text`` is always bound,
                # even when LAGRUM escaping was never applied.
                text = item
                if self.LAGRUM in self.args:
                    text = self.reDescapeNamed.sub(r'\1', item)
                    text = self.reDescapeCompound.sub(r'\1 \2 \3\4', text)
                if isinstance(item, Link):
                    # A Link obj is immutable so we have to
                    # create a new and copy its attrs
                    if hasattr(item, 'predicate'):
                        node = LinkSubject(text, predicate=item.predicate,
                                           uri=item.uri)
                    else:
                        node = Link(text, uri=item.uri)
                else:
                    node = text

            # Merge adjacent plain-text fragments into one.
            if (result and not isinstance(result[-1], Link)
                    and not isinstance(node, Link)):
                result[-1] += node
            else:
                result.append(node)

        # Undo the xmlcharrefreplace encoding in plain-text fragments.
        for i, node in enumerate(result):
            if not isinstance(node, Link):
                result[i] = self.reXmlCharref.sub(self.unescapeXmlCharref,
                                                  node)
        return result

    def unescapeXmlCharref(self, m):
        """Convert a ``&#NNN;`` match into the corresponding character."""
        return unichr(int(m.group(0)[2:-1]))

    def findAttrs(self, parts, extra=None):
        """Creates a dict of attributes through a tree.

        Every node whose tag ends in ``RefID`` contributes one entry keyed by
        the lower-cased tag minus that suffix.  *extra* seeds the result, and
        the current law/chapter/section/piece fill in keys still missing.
        """
        d = {}
        self.depth += 1
        if extra:
            d.update(extra)
        for part in parts:
            currentPartTag = part.tag.lower()
            if currentPartTag.endswith('refid'):
                # NOTE(review): 'singelsectionrefid' looks like a misspelling
                # of 'singlesectionrefid' -- confirm against the EBNF grammar.
                if currentPartTag in ('singelsectionrefid',
                                      'lastsectionrefid'):
                    currentPartTag = 'sectionrefid'
                d[currentPartTag[:-5]] = part.text.strip()
            if part.nodes:
                d.update(self.findAttrs(part.nodes, d))
        self.depth -= 1

        if self.currentLaw and 'law' not in d:
            d['law'] = self.currentLaw
        if self.currentChapter and 'chapter' not in d:
            d['chapter'] = self.currentChapter
        if self.currentSection and 'section' not in d:
            d['section'] = self.currentSection
        if self.currentPiece and 'piece' not in d:
            d['piece'] = self.currentPiece
        return d

    def findNode(self, root, nodeTag):
        """Returns the first node in the tree that has a matching tag, dfs."""
        if root.tag == nodeTag:
            return root
        for node in root.nodes:
            found = self.findNode(node, nodeTag)
            if found is not None:
                return found
        return None

    def findNodes(self, root, nodeTag):
        """Return all nodes tagged *nodeTag*; matches are not descended into."""
        if root.tag == nodeTag:
            return [root]
        res = []
        for node in root.nodes:
            res.extend(self.findNodes(node, nodeTag))
        return res

    def formatterDispatch(self, part):
        """Format *part* with its ``format_<tag>`` method if one exists,
        otherwise with the generic token-tree formatter."""
        self.depth += 1
        if 'format_' + part.tag in dir(self):
            formatter = getattr(self, 'format_' + part.tag)
            res = formatter(part)
            assert res is not None, 'Custom formatter for %s didnt return anythin' % part.tag
        else:
            res = self.formatTokentree(part)
        self.depth -= 1
        return res

    def formatTokentree(self, part):
        """Return the formatted pieces for *part* as a list.

        Leaves become plain text, ``*Ref``/``*RefID`` nodes become links and
        any other node is formatted child by child.
        """
        if (not part.nodes) and (not part.tag.endswith('RefID')):
            return [part.text]
        if part.tag.endswith(('RefID', 'Ref')):
            return [self.formatGenericLink(part)]
        res = []
        for p in part.nodes:
            res.extend(self.formatterDispatch(p))
        return res

    def formatGenericLink(self, part, uriFormatter=None):
        """Return a link for *part*, or its plain text if no URI results.

        The URI formatter registered for the node's tag is used; when none is
        registered, *uriFormatter* (if given) or ``sfsFormatUri`` is used.
        """
        try:
            uri = self.uriFormatter[part.tag](self.findAttrs([part]))
        except KeyError:
            if uriFormatter:
                uri = uriFormatter(self.findAttrs([part]))
            else:
                uri = self.sfsFormatUri(self.findAttrs([part]))
        except Exception:
            # Best effort: a reference we cannot build a URI for is left as
            # plain text rather than aborting the whole parse.
            return part.text

        if not uri:
            return part.text
        elif self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        else:
            return Link(part.text, uri=uri)

    def formatCustomLink(self, attrs, text, production):
        """Return a link with the given *text* built from explicit *attrs*.

        *production* selects the URI formatter (falling back to
        ``sfsFormatUri``).  When no URI can be built, *text* is returned.
        """
        try:
            uri = self.uriFormatter[production](attrs)
        except KeyError:
            uri = self.sfsFormatUri(attrs)

        if not uri:
            # The original returned ``part.text``, but no ``part`` exists here.
            return text
        elif self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        else:
            return Link(text, uri=uri)

    def clearState(self):
        """Forget the law/chapter/section/piece of the previous reference."""
        self.currentLaw = None
        self.currentChapter = None
        self.currentSection = None
        self.currentPiece = None

    def normalizeSfsId(self, sfsId):
        """Turn ``NNNN:NN.N`` into ``NNNN:NN N`` (dot becomes a space)."""
        sfsId = re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsId)
        return sfsId

    def normalizeLawName(self, lawName):
        """Strip escape characters, lower-case and drop a trailing ``s``."""
        lawName = lawName.replace('|', '').replace('_', ' ').lower()
        if lawName.endswith('s'):
            lawName = lawName[:-1]
        return lawName

    def namedLawToSfsid(self, text, normalize=True):
        """Return the id registered for the law named *text*, or None.

        Names remembered during the current parse take precedence over the
        names loaded at construction time.
        """
        if normalize:
            text = self.normalizeLawName(text)

        if text in self._NO_LAW:
            return None
        if text in self.currentNamedLaws:
            return self.currentNamedLaws[text]
        elif text in self.namedLaws:
            return self.namedLaws[text]
        else:
            return None

    def sfsFormatUri(self, attrs):
        """Build an SFS URI from the reference attributes in *attrs*.

        Attributes missing from *attrs* are taken from the base URI, but only
        until the first attribute that *attrs* does provide.
        """
        pieceMap = {u'första': '1',
                    u'andra': '2',
                    u'tredje': '3',
                    u'fjärde': '4',
                    u'femte': '5',
                    u'sjätte': '6',
                    u'sjunde': '7',
                    u'åttonde': '8',
                    u'nionde': '9'}
        keyMap = {u'lawref': 'L',
                  u'chapter': 'K',
                  u'section': 'P',
                  u'piece': 'S',
                  u'item': 'N',
                  u'itemnumeric': 'N',
                  u'element': 'O',
                  u'sentence': 'M'}
        attrOrder = ['law', 'lawref', 'chapter', 'section', 'element',
                     'piece', 'item', 'itemnumeric', 'sentence']

        if 'law' in attrs:
            if attrs['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'
        else:
            if 'baseUri' in self.baseUriAttrs:
                res = self.baseUriAttrs['baseUri']
            else:
                res = ''

        resolveBase = True
        addFragment = False
        justInCase = None
        for key in attrOrder:
            if key in attrs:
                resolveBase = False
                val = attrs[key]
            elif resolveBase and key in self.baseUriAttrs:
                val = self.baseUriAttrs[key]
            else:
                val = None

            if val:
                if addFragment:
                    res += '#'
                    addFragment = False
                # Look up the lower-cased ordinal, matching the key that is
                # used to index pieceMap (the original tested the raw value).
                if (key in ['piece', 'itemnumeric', 'sentence']
                        and val.lower() in pieceMap):
                    res += '%s%s' % (keyMap[key], pieceMap[val.lower()])
                else:
                    if key == 'law':
                        val = self.normalizeSfsId(val)
                        val = val.replace(' ', '_')
                        res += val
                        addFragment = True
                    else:
                        if justInCase:
                            res += justInCase
                            justInCase = None
                        val = val.replace(' ', '')
                        val = val.replace('\n', '')
                        val = val.replace('\r', '')
                        res += '%s%s' % (keyMap[key], val)
            else:
                if key == 'piece':
                    # An item without an explicit piece is placed in piece 1.
                    justInCase = 'S1'
        return res

    def format_SFSNr(self, root):
        """Format an SFS number; sets the base URI from it if none is set."""
        if self.baseUri is None:
            sfsId = self.findNode(root, 'LawRefID').data
            self.baseUriAttrs = {'baseUri': 'http://rinfo.lagrummet.se/publ/sfs/' + sfsId + '#'}
        return self.formatTokentree(root)

    def format_ChangeRef(self, root):
        """Link the whole change reference to the law it names."""
        lawRefId = self.findNode(root, 'LawRefID').data
        return [self.formatCustomLink({'lawref': lawRefId},
                                      root.text, root.tag)]

    def format_NamedExternalLawRef(self, root):
        """Format a reference to a law given by name, with or without id.

        A name that comes with an explicit id is remembered so later mentions
        of the same name resolve to it.
        """
        resetCurrentLaw = False
        if self.currentLaw is None:
            resetCurrentLaw = True
            lawRefIdNode = self.findNode(root, 'LawRefID')
            if lawRefIdNode is None:
                self.currentLaw = self.namedLawToSfsid(root.text)
            else:
                self.currentLaw = lawRefIdNode.text
                namedLaw = self.normalizeLawName(
                    self.findNode(root, 'NamedLaw').text)
                self.currentNamedLaws[namedLaw] = self.currentLaw

        if self.currentLaw is None:
            # Without an id for the law there is nothing to link to.
            res = [root.text]
        else:
            res = [self.formatGenericLink(root)]

        if self.baseUri is None and self.currentLaw is not None:
            # Use this law as the new base for relative references.
            m = self.reUriSegments.match(self.currentLaw)
            if m:
                self.baseUriAttrs = {'baseUri': m.group(1),
                                     'law': m.group(2),
                                     'chapter': m.group(6),
                                     'section': m.group(8),
                                     'piece': m.group(10),
                                     'item': m.group(12)}
            else:
                self.baseUriAttrs = {'baseUri': 'http://rinfo.lagrummet.se/publ/sfs/' + self.currentLaw + '#'}

        if resetCurrentLaw:
            if self.currentLaw is not None:
                self.lastLaw = self.currentLaw
            self.currentLaw = None
        return res

    def format_ChapterSectionRef(self, root):
        """Link a chapter+section reference; remembers the chapter."""
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentChapter = root.nodes[0].nodes[0].text.strip()
        return [self.formatGenericLink(root)]

    def format_ChapterSectionPieceRefs(self, root):
        """Format each child with the chapter of the first child as context."""
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentChapter = root.nodes[0].nodes[0].text.strip()
        res = []
        for node in root.nodes:
            res.extend(self.formatterDispatch(node))
        return res

    def format_AlternativeChapterSectionRefs(self, root):
        """Not implemented yet; falls back to the generic formatter.

        The fallback is needed because formatterDispatch asserts that every
        custom formatter returns a value.
        """
        print("TODO: Implement me %s" % root.tag)
        return self.formatTokentree(root)

    def format_LastSectionRef(self, root):
        """Link the whole node, including its third child, as one link."""
        # We want the ending double section mark to be
        # a part of the link
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3)
        return [self.formatGenericLink(root)]

    def format_SectionPieceRefs(self, root):
        """Link section + first piece together, then format the rest."""
        assert(root.tag == 'SectionPieceRefs')
        self.currentSection = root.nodes[0].nodes[0].text.strip()
        res = [self.formatCustomLink(
            self.findAttrs([root.nodes[2]]),
            '%s %s' % (root.nodes[0].text, root.nodes[2].text),
            root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatterDispatch(node))
        self.currentSection = None
        return res

    def format_SectionPieceItemRefs(self, root):
        """Like format_SectionPieceRefs, but also remembers the piece."""
        assert(root.tag == 'SectionPieceItemRefs')
        self.currentSection = root.nodes[0].nodes[0].text.strip()
        self.currentPiece = root.nodes[2].nodes[0].text.strip()
        res = [self.formatCustomLink(
            self.findAttrs([root.nodes[2]]),
            '%s %s' % (root.nodes[0].text, root.nodes[2].text),
            root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatterDispatch(node))
        self.currentSection = None
        self.currentPiece = None
        return res

    def format_SectionItemRefs(self, root):
        """Format the tree with the leading section as context."""
        assert(root.nodes[0].nodes[0].tag == 'SectionRefID')
        self.currentSection = root.nodes[0].nodes[0].text.strip()
        res = self.formatTokentree(root)
        self.currentSection = None
        return res

    def format_PieceItemRefs(self, root):
        """Link piece + first item together, then format remaining items."""
        self.currentPiece = root.nodes[0].nodes[0].text.strip()
        res = [self.formatCustomLink(
            self.findAttrs([root.nodes[2].nodes[0]]),
            '%s %s' % (root.nodes[0].text, root.nodes[2].nodes[0].text),
            root.tag)]
        for node in root.nodes[2].nodes[1:]:
            res.extend(self.formatterDispatch(node))
        self.currentPiece = None
        return res

    def format_ExternalLaw(self, root):
        """Format the first child with the current chapter cleared."""
        self.currentChapter = None
        return self.formatterDispatch(root.nodes[0])

    def format_ExternalRefs(self, root):
        """Format references that point into another, explicitly given law.

        Returns the plain text when the law is given by a name that cannot
        be resolved.
        """
        # Special case for things like '17-29 och 32 §§ i lagen
        # (2004:575)': pick the LawRefID and store it in currentLaw so
        # that findAttrs will find it.
        assert(root.tag == 'ExternalRefs')
        lawRefIdNode = self.findNode(root, 'LawRefID')
        if lawRefIdNode is None:
            namedLawNode = self.findNode(root, 'NamedLaw')
            if namedLawNode is None:
                # Reference back to a previously mentioned law.
                sameLawNode = self.findNode(root, 'SameLaw')
                assert(sameLawNode is not None)
                self.currentLaw = self.lastLaw
            else:
                self.currentLaw = self.namedLawToSfsid(namedLawNode.text)
                if self.currentLaw is None:
                    # Unknown law name, return
                    return [root.text]
        else:
            self.currentLaw = lawRefIdNode.text
            namedLawNode = self.findNode(root, 'NamedLaw')
            if namedLawNode:
                namedLaw = self.normalizeLawName(namedLawNode.text)
                self.currentNamedLaws[namedLaw] = self.currentLaw

        if self.lastLaw is None:
            self.lastLaw = self.currentLaw

        # A single reference is linked as one whole expression, unless the
        # law is an anonymous external law.
        if (len(self.findNodes(root, 'GenericRefs')) == 1
                and len(self.findNodes(root, 'SectionRefID')) == 1
                and len(self.findNodes(root, 'AnonymousExternalLaw')) == 0):
            res = [self.formatGenericLink(root)]
        else:
            res = self.formatTokentree(root)
        return res

    def forarbeteFormatUri(self, attrs):
        """Build the URI for a preparatory-works reference from *attrs*."""
        res = 'http://rinfo.lagrummet.se/'
        for key, val in attrs.items():
            if key == 'prop':
                res += 'publ/prop/%s' % val
            elif key == 'bet':
                res += 'ext/bet/%s' % val
            elif key == 'skrivelse':
                res += 'ext/rskr/%s' % val
            elif key == 'celex':
                if len(val) == 8:
                    # Two-digit year: insert the century after the sector.
                    val = val[0] + '19' + val[1:]
                res += 'ext/celex/%s' % val
        if 'sidnr' in attrs:
            res += '#s%s' % attrs['sidnr']
        return res
class LegalRef:
    """Find legal references in running text and turn them into links.

    An instance is configured with one or more of the reference-type
    constants below.  For each selected type the matching EBNF grammar
    under ``etc/`` is loaded and a URI formatter is registered for its
    ``*Ref`` / ``*RefID`` productions.  ``parse()`` then tags a string
    with the combined grammar and returns a list in which recognised
    references are ``Link`` / ``LinkSubject`` objects and everything
    else is plain text.
    """

    # Maybe these should be 1, 2, 4, 8 etc. so that the caller can ask
    # for LAGRUM | FORESKRIFTER, and so that we can define collections
    # of common combinations (e.g. ALL_LAGSTIFTNING = LAGRUM |
    # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING)
    LAGRUM = 1            # references to provisions in SFS
    KORTLAGRUM = 2        # SFS references in short form
    FORESKRIFTER = 3      # references to agencies' regulation collections
    EGLAGSTIFTNING = 4    # EC treaties, regulations and directives
    INTLLAGSTIFTNING = 5  # treaties etc.
    FORARBETEN = 6        # government bills, committee reports, etc.
    RATTSFALL = 7         # case law from Swedish courts
    MYNDIGHETSBESLUT = 8  # agency decisions (JO, ARN, DI...)
    EGRATTSFALL = 9       # case law from the ECJ / Court of First Instance
    INTLRATTSFALL = 10    # European Court of Human Rights

    # Splits a base URI into: 1 = prefix, 2 = law id, 6 = chapter,
    # 8 = section, 10 = piece, 12 = item (see _baseuri_attributes_from_match).
    re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    re_escape_compound = re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_escape_named = re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)
    re_descape_compound = re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_descape_named = re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    re_xmlcharref = re.compile(r"&#\d+;")

    # Words that end in one of the law-name suffixes but are never the
    # name of a law.  Kept at class level so the collection is built
    # once instead of on every namedlaw_to_sfsid() call.
    _NOLAW = frozenset([
        u'aktieslagen',
        u'anordningen',
        u'anslagen',
        u'arbetsordningen',
        u'associationsformen',
        u'avfallsslagen',
        u'avslagen',
        u'avvittringsutslagen',
        u'bergslagen',
        u'beskattningsunderlagen',
        u'bolagen',
        u'bolagsordningen',
        u'dagordningen',
        u'djurslagen',
        u'dotterbolagen',
        u'emballagen',
        u'energislagen',
        u'ersättningsformen',
        u'ersättningsslagen',
        u'examensordningen',
        u'finansbolagen',
        u'finansieringsformen',
        u'fissionsvederlagen',
        u'flygbolagen',
        u'fondbolagen',
        u'förbundsordningen',
        u'föreslagen',
        u'företrädesordningen',
        u'förhandlingsordningen',
        u'förlagen',
        u'förmånsrättsordningen',
        u'förmögenhetsordningen',
        u'förordningen',
        u'förslagen',
        u'försäkringsaktiebolagen',
        u'försäkringsbolagen',
        u'gravanordningen',
        u'grundlagen',
        u'handelsplattformen',
        u'handläggningsordningen',
        u'inkomstslagen',
        u'inköpssamordningen',
        u'kapitalunderlagen',
        u'klockslagen',
        u'kopplingsanordningen',
        u'låneformen',
        u'mervärdesskatteordningen',
        u'nummerordningen',
        u'omslagen',
        u'ordalagen',
        u'pensionsordningen',
        u'renhållningsordningen',
        u'representationsreformen',
        u'rättegångordningen',
        u'rättegångsordningen',
        u'rättsordningen',
        u'samordningen',
        u'skatteordningen',
        u'skatteslagen',
        u'skatteunderlagen',
        u'skolformen',
        u'skyddsanordningen',
        u'slagen',
        u'solvärmeanordningen',
        u'storslagen',
        u'studieformen',
        u'stödformen',
        u'stödordningen',
        u'säkerhetsanordningen',
        u'talarordningen',
        u'tillslagen',
        u'tivolianordningen',
        u'trafikslagen',
        u'transportanordningen',
        u'transportslagen',
        u'trädslagen',
        u'turordningen',
        u'underlagen',
        u'uniformen',
        u'uppställningsformen',
        u'utvecklingsbolagen',
        u'varuslagen',
        u'verksamhetsformen',
        u'vevanordningen',
        u'vårdformen',
        u'ägoanordningen',
        u'ägoslagen',
        u'ärendeslagen',
        u'åtgärdsförslagen',
    ])

    def __init__(self, *args):
        """Build the combined grammar and tagger.

        ``args`` are the reference-type constants (``LegalRef.LAGRUM``,
        ``LegalRef.RATTSFALL``, ...) that this instance should recognise.
        """
        if os.path.sep not in __file__:
            scriptdir = os.getcwd()
        else:
            scriptdir = os.path.dirname(__file__)

        self.graph = Graph()
        n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
        self.graph.load(n3file, format="n3")
        self.roots = []
        self.uriformatter = {}
        self.decl = ""
        self.namedlaws = {}
        self.load_ebnf(scriptdir + "/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            productions = self.load_ebnf(scriptdir + "/etc/lagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # If lagrum.ebnf has not been loaded yet we must load it
            # now, since kortlagrum.ebnf depends on productions that
            # are defined there.
            if self.LAGRUM not in args:
                self.load_ebnf(scriptdir + "/etc/lagrum.ebnf")
            productions = self.load_ebnf(scriptdir + "/etc/kortlagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            DCT = Namespace("http://purl.org/dc/terms/")
            d = self.get_relations(DCT['alternate'])
            self.namedlaws.update(d)
            lawlist = [x.encode(SP_CHARSET) for x in d]
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L").
            # A reversed key sort is stable, exactly like the cmp-based
            # descending-length sort it replaces.
            lawlist.sort(key=len, reverse=True)
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            self.roots.insert(0, "kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            productions = self.load_ebnf(scriptdir + "/etc/eglag.ebnf")
            for p in productions:
                self.uriformatter[p] = self.eglag_format_uri
            self.roots.append("eglagref")

        if self.FORARBETEN in args:
            productions = self.load_ebnf(scriptdir + "/etc/forarbeten.ebnf")
            for p in productions:
                self.uriformatter[p] = self.forarbete_format_uri
            self.roots.append("forarbeteref")

        if self.RATTSFALL in args:
            productions = self.load_ebnf(scriptdir + "/etc/rattsfall.ebnf")
            for p in productions:
                self.uriformatter[p] = self.rattsfall_format_uri
            self.roots.append("rattsfallref")

        if self.EGRATTSFALL in args:
            productions = self.load_ebnf(scriptdir + "/etc/egratt.ebnf")
            for p in productions:
                self.uriformatter[p] = self.egrattsfall_format_uri
            self.roots.append("ecjcaseref")

        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)

        self.parser = Parser(self.decl, "root")
        self.tagger = self.parser.buildTagger("root")
        self.verbose = False
        self.depth = 0

        # SFS-specific state
        self.currentlaw = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece = None
        self.lastlaw = None
        self.currentlynamedlaws = {}

    def load_ebnf(self, file):
        """Append the productions in ``file`` to the EBNF declaration in
        use, and return the names of all *Ref and *RefID productions
        defined in that file."""
        f = open(file)
        try:
            # try/finally so the handle is released even if read() fails
            content = f.read()
        finally:
            f.close()
        self.decl += content
        return [x.group(1)
                for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)]

    def get_relations(self, predicate):
        """Return a dict built from every pair that
        ``self.graph.subject_objects(predicate)`` yields, keyed on the
        second element of the pair and valued with the first."""
        d = {}
        # NOTE: the loop variables are named the opposite way round from
        # the method that yields them; the resulting mapping is what the
        # rest of the class relies on, so it is kept as is.
        for obj, subj in self.graph.subject_objects(predicate):
            d[unicode(subj)] = unicode(obj)
        return d

    def _baseuri_attributes_from_match(self, m):
        """Map the groups of a ``re_urisegments`` match to an attribute dict."""
        return {'baseuri': m.group(1),
                'law': m.group(2),
                'chapter': m.group(6),
                'section': m.group(8),
                'piece': m.group(10),
                'item': m.group(12)}

    def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9", predicate=None):
        """Tag ``indata`` and return a list of text and link nodes.

        ``baseuri`` gives the context that relative references are
        resolved against; ``predicate``, when set, makes the produced
        links ``LinkSubject`` objects carrying that predicate.  Raises
        ``ParseError`` if the tagger did not consume the whole input.
        """
        if indata == "":
            return indata  # this actually triggered a bug...
        self.predicate = predicate
        self.baseuri = baseuri
        if baseuri:
            m = self.re_urisegments.match(baseuri)
            if m:
                self.baseuri_attributes = self._baseuri_attributes_from_match(m)
            else:
                self.baseuri_attributes = {'baseuri': baseuri}
        else:
            self.baseuri_attributes = {}

        # It is hard to make the EBNF grammar recognise arbitrary words
        # that end with a given suffix (e.g. 'bokföringslagen' with the
        # suffix 'lagen').  Therefore we preprocess the input string and
        # insert a '|' character before certain suffixes.  We also
        # transform 'Radio- och TV-lagen' into 'Radio-_och_TV-lagen'.
        #
        # FIXME: Obviously, this shouldn't be done in a general class,
        # but rather in a subclass or via proxy/adapter

        # if we don't do the unicode conversion and pass
        # BeautifulSoup.NavigableString, the later .encode call fails
        # (since it's not a real unicode string)
        fixedindata = unicode(indata)
        if self.LAGRUM in self.args:
            fixedindata = self.re_escape_compound.sub(r'\1_\2_\3\4', fixedindata)
            fixedindata = self.re_escape_named.sub(r'|\1', fixedindata)

        # SimpleParse has no support for unicode strings, so we convert
        # the input to a byte string.  Unfortunately I can't get it all
        # to work with UTF-8, so we use xml character references instead.
        if isinstance(fixedindata, unicode):
            fixedindata = fixedindata.encode(SP_CHARSET, 'xmlcharrefreplace')

        # Parse the text with TextTools.tag - not the simplest way to do
        # it, but if you follow the SimpleParse documentation the tagger
        # table is rebuilt for every call to parse().
        if self.verbose:
            print(u"calling tag with '%s'" % (fixedindata.decode(SP_CHARSET)))
        taglist = tag(fixedindata, self.tagger, 0, len(fixedindata))

        result = []
        root = NodeTree(taglist, fixedindata)
        for part in root.nodes:
            if part.tag != 'plain' and self.verbose:
                sys.stdout.write(self.prettyprint(part))
            if part.tag in self.roots:
                self.clear_state()
                result.extend(self.formatter_dispatch(part))
            else:
                assert part.tag == 'plain', "Tag is %s" % part.tag
                result.append(part.text)

            # clear state, remembering the most recently seen law
            if self.currentlaw != None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None

        if taglist[-1] != len(fixedindata):
            log.error(u'Problem (%d:%d) with %r / %r' % (taglist[-1]-8, taglist[-1]+8, fixedindata, indata))
            raise ParseError("parsed %s chars of %s (...%s...)" % (taglist[-1], len(indata), indata[(taglist[-1]-2):taglist[-1]+3]))

        # Normalise the result, i.e. concatenate adjacent text nodes and
        # remove any '|' characters that we inserted earlier.
        normres = []
        for item in result:
            if not self.re_descape_named.search(item):
                node = item
            else:
                # Start from the untouched text so that `text` is always
                # bound, also when LAGRUM is not among the selected types.
                text = item
                if self.LAGRUM in self.args:
                    text = self.re_descape_named.sub(r'\1', item)
                    text = self.re_descape_compound.sub(r'\1 \2 \3\4', text)
                if isinstance(item, Link):
                    # Since Link objects are immutable we have to create
                    # a new one and copy its attributes.
                    if hasattr(item, 'predicate'):
                        node = LinkSubject(text, predicate=item.predicate, uri=item.uri)
                    else:
                        node = Link(text, uri=item.uri)
                else:
                    node = text
            if (normres
                    and not isinstance(normres[-1], Link)
                    and not isinstance(node, Link)):
                normres[-1] += node
            else:
                normres.append(node)

        # and finally, turn character references in the text nodes back
        # into real characters (Link nodes are dealt with later)
        for i, node in enumerate(normres):
            if not isinstance(node, Link):
                normres[i] = self.re_xmlcharref.sub(self.unescape_xmlcharref, node)
        return normres

    def unescape_xmlcharref(self, m):
        """Regex callback: turn a matched ``&#NNN;`` into the character NNN."""
        return unichr(int(m.group(0)[2:-1]))

    def find_attributes(self, parts, extra=None):
        """Recurse through a parse tree and create a dictionary of attributes.

        Every node whose tag ends in ``RefID`` contributes an entry keyed
        on the lower-cased tag without that suffix.  ``extra`` is an
        optional dict of initial values.  Law, chapter, section and piece
        from the current state are filled in when the tree did not
        provide them.
        """
        d = {}
        self.depth += 1
        if self.verbose:
            print(". "*self.depth + "find_attributes: starting with %s" % d)
        if extra:
            d.update(extra)

        for part in parts:
            current_part_tag = part.tag.lower()
            if current_part_tag.endswith('refid'):
                if current_part_tag in ('singlesectionrefid', 'lastsectionrefid'):
                    current_part_tag = 'sectionrefid'
                d[current_part_tag[:-5]] = part.text.strip()
                if self.verbose:
                    print(". "*self.depth + "find_attributes: d is now %s" % d)
            if part.nodes:
                d.update(self.find_attributes(part.nodes, d))

        if self.verbose:
            print(". "*self.depth + "find_attributes: returning %s" % d)
        self.depth -= 1

        if self.currentlaw and 'law' not in d:
            d['law'] = self.currentlaw
        if self.currentchapter and 'chapter' not in d:
            d['chapter'] = self.currentchapter
        if self.currentsection and 'section' not in d:
            d['section'] = self.currentsection
        if self.currentpiece and 'piece' not in d:
            d['piece'] = self.currentpiece
        return d

    def find_node(self, root, nodetag):
        """Return the first node in the tree that has a tag matching
        nodetag, or None. The search is depth-first."""
        if root.tag == nodetag:  # base case
            return root
        for node in root.nodes or ():
            x = self.find_node(node, nodetag)
            if x is not None:
                return x
        return None

    def find_nodes(self, root, nodetag):
        """Return all nodes tagged ``nodetag``; the search does not
        descend below a matching node."""
        if root.tag == nodetag:
            return [root]
        res = []
        for node in root.nodes or ():
            res.extend(self.find_nodes(node, nodetag))
        return res

    def flatten_tokentree(self, part, suffix):
        """Return a 'flattened' tokentree, i.e. for the following tree
        and the suffix 'RefID'

           foo->bar->BlahongaRefID
              ->baz->quux->Blahonga2RefID
                         ->Blahonga3RefID
                    ->Blahonga4RefID

        this should return [BlahongaRefID, Blahonga2RefID,
        Blahonga3RefID, Blahonga4RefID]"""
        l = []
        if part.tag.endswith(suffix):
            l.append(part)
        if not part.nodes:
            return l
        for subpart in part.nodes:
            l.extend(self.flatten_tokentree(subpart, suffix))
        return l

    def formatter_dispatch(self, part):
        """Format ``part`` with ``format_<tag>`` if this object has such
        a method, otherwise with ``format_tokentree``."""
        self.depth += 1
        # Is there a tailor-made formatter?
        formatter = getattr(self, "format_" + part.tag, None)
        if formatter is not None:
            if self.verbose:
                print((". "*self.depth) + "formatter_dispatch: format_%s defined, calling it" % part.tag)
            res = formatter(part)
            assert res != None, "Custom formatter for %s didn't return anything" % part.tag
        else:
            if self.verbose:
                print((". "*self.depth) + "formatter_dispatch: no format_%s, using format_tokentree" % part.tag)
            res = self.format_tokentree(part)

        if res is None:
            print((". "*self.depth) + "something wrong with this:\n" + self.prettyprint(part))
        self.depth -= 1
        return res

    def format_tokentree(self, part):
        """Default formatter.

        It converts every token that ends with a RefID into a Link
        object. For grammar productions like SectionPieceRefs, which
        contain subproductions that also end in RefID, this is not a
        good function to use - use a custom formatter instead.
        """
        res = []
        if self.verbose:
            print((". "*self.depth) + "format_tokentree: called for %s" % part.tag)
        # this is like the bottom case, or something
        if (not part.nodes) and (not part.tag.endswith("RefID")):
            res.append(part.text)
        elif part.tag.endswith(("RefID", "Ref")):
            res.append(self.format_generic_link(part))
        else:
            for subpart in part.nodes:
                if self.verbose and part.tag == 'LawRef':
                    print((". "*self.depth) + "format_tokentree: part '%s' is a %s" % (subpart.text, subpart.tag))
                res.extend(self.formatter_dispatch(subpart))
        if self.verbose:
            print((". "*self.depth) + "format_tokentree: returning '%s' for %s" % (res, part.tag))
        return res

    def prettyprint(self, root, indent=0):
        """Return an indented, one-node-per-line dump of the tree."""
        res = u"%s'%s': '%s'\n" % (" "*indent, root.tag, re.sub(r'\s+', ' ', root.text))
        if root.nodes != None:
            for subpart in root.nodes:
                res += self.prettyprint(subpart, indent+1)
            return res
        else:
            # NOTE(review): a node whose .nodes is None yields an empty
            # string, dropping its own line - kept as is; confirm intent.
            return u""

    def format_generic_link(self, part, uriformatter=None):
        """Return a Link/LinkSubject for ``part``, or its plain text when
        no URI can be constructed.

        The URI formatter registered for the production is used; if none
        is registered, ``uriformatter`` (when given) or ``sfs_format_uri``
        is used instead.
        """
        try:
            uri = self.uriformatter[part.tag](self.find_attributes([part]))
        except KeyError:
            if uriformatter:
                uri = uriformatter(self.find_attributes([part]))
            else:
                uri = self.sfs_format_uri(self.find_attributes([part]))
        except AttributeError:
            # Normal error from eglag_format_uri
            return part.text
        except Exception:
            # If something else went wrong, just return the plaintext
            log.warning("(unknown): Unable to format link for text %s (production %s)" % (part.text, part.tag))
            return part.text

        if self.verbose:
            print((". "*self.depth) + "format_generic_link: uri is %s" % uri)
        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context)
            return part.text
        elif self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        else:
            return Link(part.text, uri=uri)

    # FIXME: unify this with format_generic_link
    def format_custom_link(self, attributes, text, production):
        """Like format_generic_link, but for a caller-supplied attribute
        dict and link text instead of a parse-tree node."""
        try:
            uri = self.uriformatter[production](attributes)
        except KeyError:
            uri = self.sfs_format_uri(attributes)

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context) - fall back to the
            # plain link text
            return text
        elif self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        else:
            return Link(text, uri=uri)

    ################################################################
    # CODE FOR LAGRUM
    def clear_state(self):
        """Forget the current law, chapter, section and piece."""
        self.currentlaw = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece = None

    def normalize_sfsid(self, sfsid):
        """Normalise the spelling of an SFS id."""
        # sometimes '1736:0123 2' is given as '1736:0123 s. 2' or
        # '1736:0123.2'. This fixes that.
        sfsid = re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsid)
        # more advanced normalizations to come...
        return sfsid

    def normalize_lawname(self, lawname):
        """Strip the escape characters inserted by parse(), lower-case
        the name and drop one trailing 's'."""
        lawname = lawname.replace('|', '').replace('_', ' ').lower()
        if lawname.endswith('s'):
            lawname = lawname[:-1]
        return lawname

    def namedlaw_to_sfsid(self, text, normalize=True):
        """Return the id registered for the named law ``text``, or None
        if the word is a known non-law or the name is unknown."""
        if normalize:
            text = self.normalize_lawname(text)

        if text in self._NOLAW:
            return None

        if text in self.currentlynamedlaws:
            return self.currentlynamedlaws[text]
        elif text in self.namedlaws:
            return self.namedlaws[text]
        else:
            if self.verbose:
                log.warning("(unknown): I don't know the ID of named law [%s]" % text)
            return None

    def sfs_format_uri(self, attributes):
        """Build an SFS URI from ``attributes``.

        Attributes missing from ``attributes`` are taken from
        ``self.baseuri_attributes`` up to the first attribute that
        ``attributes`` provides itself.
        """
        piecemappings = {u'första': '1',
                         u'andra': '2',
                         u'tredje': '3',
                         u'fjärde': '4',
                         u'femte': '5',
                         u'sjätte': '6',
                         u'sjunde': '7',
                         u'åttonde': '8',
                         u'nionde': '9'}
        keymapping = {'lawref': 'L',
                      'chapter': 'K',
                      'section': 'P',
                      'piece': 'S',
                      'item': 'N',
                      'itemnumeric': 'N',
                      'element': 'O',
                      'sentence': 'M',  # is this ever used?
                      }
        attributeorder = ['law', 'lawref', 'chapter', 'section', 'element',
                          'piece', 'item', 'itemnumeric', 'sentence']

        if 'law' in attributes:
            if attributes['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'
        else:
            if 'baseuri' in self.baseuri_attributes:
                res = self.baseuri_attributes['baseuri']
            else:
                res = ''

        resolvetobase = True
        addfragment = False
        justincase = None
        for key in attributeorder:
            if key in attributes:
                resolvetobase = False
                val = attributes[key]
            elif resolvetobase and key in self.baseuri_attributes:
                val = self.baseuri_attributes[key]
            else:
                val = None

            if val:
                if addfragment:
                    res += '#'
                    addfragment = False
                # Look the ordinal up case-insensitively, consistently
                # with how the mapping is indexed below.
                if (key in ['piece', 'itemnumeric', 'sentence']
                        and val.lower() in piecemappings):
                    res += '%s%s' % (keymapping[key], piecemappings[val.lower()])
                else:
                    if key == 'law':
                        val = self.normalize_sfsid(val)
                        val = val.replace(" ", "_")
                        res += val
                        addfragment = True
                    else:
                        if justincase:
                            res += justincase
                            justincase = None
                        val = val.replace(" ", "")
                        val = val.replace("\n", "")
                        val = val.replace("\r", "")
                        res += '%s%s' % (keymapping[key], val)
            else:
                # No piece given: remember a default first piece, which
                # is only emitted if a later attribute follows.
                if key == 'piece':
                    justincase = "S1"
        return res

    def format_ChapterSectionRefs(self, root):
        assert(root.tag == 'ChapterSectionRefs')
        assert(len(root.nodes) == 3)  # ChapterRef, wc, SectionRefs

        part = root.nodes[0]
        self.currentchapter = part.nodes[0].text.strip()

        if self.currentlaw:
            res = [self.format_custom_link({'law': self.currentlaw,
                                            'chapter': self.currentchapter},
                                           part.text,
                                           part.tag)]
        else:
            res = [self.format_custom_link({'chapter': self.currentchapter},
                                           part.text,
                                           part.tag)]

        res.extend(self.formatter_dispatch(root.nodes[1]))
        res.extend(self.formatter_dispatch(root.nodes[2]))
        self.currentchapter = None
        return res

    def format_ChapterSectionPieceRefs(self, root):
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        res = []
        for node in root.nodes:
            res.extend(self.formatter_dispatch(node))
        return res

    def format_LastSectionRef(self, root):
        # the last section ref is a bit different, since we want the
        # ending double section mark to be part of the link text
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3)  # LastSectionRefID, wc, DoubleSectionMark
        return [self.format_generic_link(root)]

    def format_SectionPieceRefs(self, root):
        assert(root.tag == 'SectionPieceRefs')
        self.currentsection = root.nodes[0].nodes[0].text.strip()

        res = [self.format_custom_link(self.find_attributes([root.nodes[2]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].text),
                                       root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatter_dispatch(node))

        self.currentsection = None
        return res

    def format_SectionPieceItemRefs(self, root):
        assert(root.tag == 'SectionPieceItemRefs')
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        self.currentpiece = root.nodes[2].nodes[0].text.strip()

        res = [self.format_custom_link(self.find_attributes([root.nodes[2]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].text),
                                       root.tag)]
        for node in root.nodes[3:]:
            res.extend(self.formatter_dispatch(node))

        self.currentsection = None
        self.currentpiece = None
        return res

    # This is a special case for things like '17-29 och 32 §§ i lagen
    # (2004:575)', which picks out the LawRefID first and stores it in
    # .currentlaw, so that find_attributes finds it
    # automagically. Although now it seems to be branching out and be
    # all things to all people.
    def format_ExternalRefs(self, root):
        assert(root.tag == 'ExternalRefs')

        lawrefid_node = self.find_node(root, 'LawRefID')
        if lawrefid_node == None:
            # Ok, no explicit LawRefID found, lets see if this is a
            # named law that we have the ID for
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node == None:
                # As a last chance, this might be a reference back to a
                # previously mentioned law ("...enligt 4 § samma lag")
                samelaw_node = self.find_node(root, 'SameLaw')
                assert(samelaw_node != None)
                if self.lastlaw == None:
                    log.warning(u"(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set")
                self.currentlaw = self.lastlaw
            else:
                # the NamedLaw case
                self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text)
                if self.currentlaw == None:
                    # unknown law name - in this case it's better to
                    # bail out rather than resolving chapter/paragraph
                    # references relative to baseuri (which is almost
                    # certainly wrong)
                    return [root.text]
        else:
            self.currentlaw = lawrefid_node.text
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node:
                # remember which id this law name stands for
                namedlaw = self.normalize_lawname(namedlaw_node.text)
                self.currentlynamedlaws[namedlaw] = self.currentlaw

        if self.lastlaw is None:
            self.lastlaw = self.currentlaw

        # if the node tree only contains a single reference, it looks
        # better if the entire expression, not just the
        # chapter/section part, is linked. But not if it's a
        # "anonymous" law ('1 § i lagen (1234:234) om blahonga')
        if (len(self.find_nodes(root, 'GenericRefs')) == 1 and
                len(self.find_nodes(root, 'SectionRefID')) == 1 and
                len(self.find_nodes(root, 'AnonymousExternalLaw')) == 0):
            res = [self.format_generic_link(root)]
        else:
            res = self.format_tokentree(root)
        return res

    def format_SectionItemRefs(self, root):
        assert(root.nodes[0].nodes[0].tag == 'SectionRefID')
        self.currentsection = root.nodes[0].nodes[0].text.strip()
        res = self.format_tokentree(root)
        self.currentsection = None
        return res

    def format_PieceItemRefs(self, root):
        self.currentpiece = root.nodes[0].nodes[0].text.strip()
        res = [self.format_custom_link(self.find_attributes([root.nodes[2].nodes[0]]),
                                       "%s %s" % (root.nodes[0].text, root.nodes[2].nodes[0].text),
                                       root.tag)]
        for node in root.nodes[2].nodes[1:]:
            res.extend(self.formatter_dispatch(node))

        self.currentpiece = None
        return res

    # NOTE: this method used to be defined a second time, with an
    # identical body, further down in the class; only this definition
    # is kept.
    def format_ChapterSectionRef(self, root):
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        return [self.format_generic_link(root)]

    def format_AlternateChapterSectionRefs(self, root):
        assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
        self.currentchapter = root.nodes[0].nodes[0].text.strip()
        res = self.format_tokentree(root)
        self.currentchapter = None
        return res

    def format_ExternalLaw(self, root):
        self.currentchapter = None
        return self.formatter_dispatch(root.nodes[0])

    def format_ChangeRef(self, root):
        lawref = self.find_node(root, 'LawRefID').data
        return [self.format_custom_link({'lawref': lawref},
                                        root.text,
                                        root.tag)]

    def format_SFSNr(self, root):
        if self.baseuri == None:
            sfsid = self.find_node(root, 'LawRefID').data
            self.baseuri_attributes = {'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/'+sfsid+'#'}
        return self.format_tokentree(root)

    def format_NamedExternalLawRef(self, root):
        resetcurrentlaw = False
        if self.currentlaw == None:
            resetcurrentlaw = True
            lawrefid_node = self.find_node(root, 'LawRefID')
            if lawrefid_node == None:
                self.currentlaw = self.namedlaw_to_sfsid(root.text)
            else:
                self.currentlaw = lawrefid_node.text
                # remember which id this law name stands for
                namedlaw = self.normalize_lawname(self.find_node(root, 'NamedLaw').text)
                self.currentlynamedlaws[namedlaw] = self.currentlaw

        if self.currentlaw == None:
            # if we can't find a ID for this law, better not <link> it
            res = [root.text]
        else:
            res = [self.format_generic_link(root)]

        if self.baseuri == None and self.currentlaw != None:
            # use this as the new baseuri_attributes
            m = self.re_urisegments.match(self.currentlaw)
            if m:
                self.baseuri_attributes = self._baseuri_attributes_from_match(m)
            else:
                self.baseuri_attributes = {'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/'+self.currentlaw+'#'}

        if resetcurrentlaw:
            if self.currentlaw != None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None
        return res

    ################################################################
    # CODE FOR KORTLAGRUM
    def format_AbbrevLawNormalRef(self, root):
        lawabbr_node = self.find_node(root, 'LawAbbreviation')
        self.currentlaw = self.namedlaw_to_sfsid(lawabbr_node.text, normalize=False)
        res = [self.format_generic_link(root)]
        if self.currentlaw != None:
            self.lastlaw = self.currentlaw
        self.currentlaw = None
        return res

    def format_AbbrevLawShortRef(self, root):
        assert(root.nodes[0].tag == 'LawAbbreviation')
        assert(root.nodes[2].tag == 'ShortChapterSectionRef')
        self.currentlaw = self.namedlaw_to_sfsid(root.nodes[0].text, normalize=False)
        shortsection_node = root.nodes[2]
        assert(shortsection_node.nodes[0].tag == 'ShortChapterRefID')
        assert(shortsection_node.nodes[2].tag == 'ShortSectionRefID')
        self.currentchapter = shortsection_node.nodes[0].text
        self.currentsection = shortsection_node.nodes[2].text

        res = [self.format_generic_link(root)]

        self.currentchapter = None
        self.currentsection = None
        self.currentlaw = None
        return res

    ################################################################
    # CODE FOR FORARBETEN
    def forarbete_format_uri(self, attributes):
        """Build a URI for a preparatory-works reference from ``attributes``."""
        res = 'http://rinfo.lagrummet.se/'
        for key, val in attributes.items():
            if key == 'prop':
                res += "publ/prop/%s" % val
            elif key == 'bet':
                res += "ext/bet/%s" % val
            elif key == 'skrivelse':
                res += "ext/rskr/%s" % val
            elif key == 'celex':
                if len(val) == 8:
                    # incorrectly formatted, uses YY instead of YYYY
                    val = val[0]+'19'+val[1:]
                res += "ext/celex/%s" % val
        if 'sidnr' in attributes:
            res += "#s%s" % attributes['sidnr']
        return res

    ################################################################
    # CODE FOR EGLAGSTIFTNING
    def eglag_format_uri(self, attributes):
        """Build a CELEX-based URI from ``attributes`` (which is updated
        in place).

        Raises AttributeError when the type of act cannot be determined;
        returns None for a relative reference whose base URI is not a
        celex URI.
        """
        res = 'http://rinfo.lagrummet.se/ext/celex/'
        if 'akttyp' not in attributes:
            if 'forordning' in attributes:
                attributes['akttyp'] = u'förordning'
            elif 'direktiv' in attributes:
                attributes['akttyp'] = u'direktiv'
        if 'akttyp' not in attributes:
            raise AttributeError("Akttyp saknas")
        # On how CELEX numbers are constructed
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm
        # On what links to EURLEX should look like:
        # http://eur-lex.europa.eu/sv/tools/help_syntax.htm
        # Absolute URI?
        if 'ar' in attributes and 'lopnummer' in attributes:
            sektor = '3'
            rattslig_form = {u'direktiv': 'L',
                             u'förordning': 'R'}

            if len(attributes['ar']) == 2:
                attributes['ar'] = '19'+attributes['ar']
            res += "%s%s%s%04d" % (sektor, attributes['ar'],
                                   rattslig_form[attributes['akttyp']],
                                   int(attributes['lopnummer']))
        else:
            if not self.baseuri_attributes['baseuri'].startswith(res):
                # FIXME: should we warn about this?
                return None

        if 'artikel' in attributes:
            res += "#%s" % attributes['artikel']
            if 'underartikel' in attributes:
                res += ".%s" % attributes['underartikel']

        return res

    ################################################################
    # CODE FOR RATTSFALL
    def rattsfall_format_uri(self, attributes):
        """Build a URI for a Swedish court case from ``attributes``
        (which is updated in place)."""
        # The list is derived from
        # containers.n3/rattsfallsforteckningar.n3 in the source code of
        # the rinfo project - a more ambitious solution would be to read
        # the actual N3 files into an rdflib graph.
        containerid = {u'NJA': '/publ/rattsfall/nja/',
                       u'RH': '/publ/rattsfall/rh/',
                       u'MÖD': '/publ/rattsfall/mod/',
                       u'RÅ': '/publ/rattsfall/ra/',
                       u'HFD': '/publ/rattsfall/hfd/',
                       u'RK': '/publ/rattsfall/rk/',
                       u'MIG': '/publ/rattsfall/mig/',
                       u'AD': '/publ/rattsfall/ad/',
                       u'MD': '/publ/rattsfall/md/',
                       u'FÖD': '/publ/rattsfall/fod/'}

        if 'nja' in attributes:
            attributes['domstol'] = attributes['nja']

        assert 'domstol' in attributes, "No court provided"
        assert attributes['domstol'] in containerid, "%s is an unknown court" % attributes['domstol']
        res = "http://rinfo.lagrummet.se"+containerid[attributes['domstol']]

        if 'lopnr' in attributes and ":" in attributes['lopnr']:
            # a serial number given as 'YEAR:NR' carries its own year
            (attributes['ar'], attributes['lopnr']) = attributes['lopnr'].split(":", 1)

        if attributes['domstol'] == u'NJA':
            # FIXME: URIs should be based on publikationsordinal, not
            # pagenumber (which this in effect is) - but this requires
            # a big lookup table/database/graph with
            # pagenumber-to-ordinal-mappings
            res += '%ss%s' % (attributes['ar'], attributes['sidnr'])
        else:
            res += '%s:%s' % (attributes['ar'], attributes['lopnr'])

        return res

    ################################################################
    # CODE FOR EGRATTSFALL
    def egrattsfall_format_uri(self, attributes):
        """Build a CELEX-based URI for an EC court case from ``attributes``."""
        descriptormap = {'C': 'J',  # Judgment of the Court
                         'T': 'A',  # Judgment of the Court of First Instance
                         'F': 'W',  # Judgement of the Civil Service Tribunal
                         }
        # FIXME: Change this before the year 2054 (as ECJ will
        # hopefully have fixed their case numbering by then)
        if len(attributes['year']) == 2:
            if int(attributes['year']) < 54:
                year = "20"+attributes['year']
            else:
                year = "19"+attributes['year']
        else:
            year = attributes['year']

        serial = '%04d' % int(attributes['serial'])
        descriptor = descriptormap[attributes['decision']]
        uri = "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial)
        return uri