Python Parser.buildTagger Beispiele

Programmiersprache: Python

Namespace / Paketname: simpleparse.parser

Klasse / Typ: Parser

Methode / Funktion: buildTagger

Beispiele auf hotexamples.com: 10

Python Parser.buildTagger – 10 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode simpleparse.parser.Parser.buildTagger, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Parser(30)

parse(30)

__init__(4)

buildTagger(4)

Beispiel #1

Datei anzeigen

Datei: test_optimisation.py Projekt: pombreda/RimRoot_Ubuntu_12.04lts_x86_64

    def testTermCompression(self):
        """Test that unreported productions are compressed

		Term compression is basically an inlining of terminal
		expressions into the calling table.  At the moment
		the terminal expressions are all duplicated, which may
		balloon the size of the grammar, not sure if this will
		be an actual problem.  As written, this optimization
		should provide a significant speed up, but there may
		the even more of a speed up if we allow for sharing
		the terminal tuples as well.

		This:
			a:=b <b>:= -c* c:='this'
		Should eventually compress to this:
			a := -'this'*
		"""
        failures = []
        for first, second in [
            ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
            ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
            ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
            ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
                # The following will never work, so eventually may raise
                # an error or at least give a warning!
            ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
            ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
                # This is requiring group-compression, which isn't yet written
            ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
            ("""a    := (table1 / table2 / any_line)*
  <any_line> := ANY*, EOL
  <ANY>      := -EOL
  <EOL>      := '\n'
  table1 := 'a'
  table2 := 'b'
  """, """a    := (table1 / table2 / (-'\n'*, '\n'))*
    table1 := 'a'
  table2 := 'b'
"""),
            ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
        ]:
            pFirst = Parser(first, "a")
            pSecond = Parser(second, "a")
            tFirst = pFirst.buildTagger()
            tSecond = pSecond.buildTagger()
            if not rcmp(tFirst, tSecond):
                tFirstRepr = pprint.pformat(tFirst)
                tSecondRepr = pprint.pformat(tSecond)
                failures.append(
                    """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""
                    % locals())
        if failures:
            raise ValueError("\n".join(failures))

Beispiel #2

Datei anzeigen

Datei: test_optimisation.py Projekt: johnfkraus/pyparseurl

    def testTermCompression( self ):
        """Test that unreported productions are compressed

        Term compression is basically an inlining of terminal
        expressions into the calling table.  At the moment
        the terminal expressions are all duplicated, which may
        balloon the size of the grammar, not sure if this will
        be an actual problem.  As written, this optimization
        should provide a significant speed up, but there may
        the even more of a speed up if we allow for sharing
        the terminal tuples as well.

        This:
            a:=b <b>:= -c* c:='this'
        Should eventually compress to this:
            a := -'this'*
        """
        failures = []
        for first, second in [
            ("""a:=b <b>:= -c* c:='this'""", """a := -'this'*"""),
            ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""),
            ("""a:=b >b<:= c <c>:= 'this'""", """a := 'this'"""),
            ("""a:=b >b<:= c+ <c>:= 'this'""", """a := 'this'+"""),
            # The following will never work, so eventually may raise
            # an error or at least give a warning!
            ("""a:=b,c >b<:= c+ <c>:= 'this'""", """a := 'this'+,'this'"""),
            ("""a:=b/c >b<:= c+ <c>:= 'this'""", """a := 'this'+/'this'"""),
            # This is requiring group-compression, which isn't yet written
            ("""a:=-b/c >b<:= c+ <c>:= 'this'""", """a := -'this'+/'this'"""),
            ("""a    := (table1 / table2 / any_line)*
  <any_line> := ANY*, EOL
  <ANY>      := -EOL
  <EOL>      := '\n'
  table1 := 'a'
  table2 := 'b'
  """, """a    := (table1 / table2 / (-'\n'*, '\n'))*
    table1 := 'a'
  table2 := 'b'
"""),
            ("""a:= b,c <b>:= -c* <c>:= '\n'""", """a := -'\n'*,'\n'"""),
            
        ]:
            pFirst = Parser( first, "a")
            pSecond = Parser( second, "a")
            tFirst = pFirst.buildTagger()
            tSecond = pSecond.buildTagger()
            if not rcmp( tFirst , tSecond):
                tFirstRepr = pprint.pformat(tFirst)
                tSecondRepr = pprint.pformat(tSecond)
                failures.append( """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""%locals())
        if failures:
            raise ValueError( "\n".join(failures))

Beispiel #3

Datei anzeigen

Datei: test_optimisation.py Projekt: johnfkraus/pyparseurl

 def testTermSharing(self):
     """Check that both uses of a shared terminal production are one object."""
     grammar = """ a := b,b >b<:= d d:= 'this'"""
     tagger = Parser(grammar, "a").buildTagger()
     # The table for ``a`` must unpack into exactly two entries; both
     # come from the expanded production ``b``.
     left, right = tagger
     assert left is right, """Not sharing the same tuple for b and c instances"""

Beispiel #4

Datei anzeigen

Datei: test_optimisation.py Projekt: pombreda/RimRoot_Ubuntu_12.04lts_x86_64

 def testTermSharing(self):
     """Shared terminal productions must reuse one and the same tuple."""
     source = """ a := b,b >b<:= d d:= 'this'"""
     parser = Parser(source, "a")
     # Unpacking doubles as a check that the table has exactly two entries.
     entry_one, entry_two = parser.buildTagger()
     assert entry_one is entry_two, """Not sharing the same tuple for b and c instances"""

Beispiel #5

Datei anzeigen

Datei: generator.py Projekt: johnfkraus/pyparseurl

class GeneratorAPI1:
    """Compatibility shim for applications written against SimpleParse 1.0.

    Only ``parserbyname`` was really of interest to callers; everything
    else was internal (and now lives in simpleparsegrammar.py).
    """

    def __init__(self, production, prebuilt=()):
        """Wrap a ``Parser`` built from ``production``.

        ``prebuilt`` is handed to the parser as its ``prebuilts`` argument.
        """
        # Imported lazily, exactly as the original did.
        from simpleparse.parser import Parser
        wrapped = Parser(production, prebuilts=prebuilt)
        self.parser = wrapped

    def parserbyname(self, name):
        """Retrieve a tag-table by production name"""
        tag_table = self.parser.buildTagger(name)
        return tag_table

Beispiel #6

Datei anzeigen

class GeneratorAPI1:
    """Stand-in that keeps SimpleParse 1.0 applications working.

    The old API effectively exposed a single useful method,
    ``parserbyname``; the remainder was internal and is now part of
    simpleparsegrammar.py.
    """

    def __init__(self, production, prebuilt=()):
        """Create the underlying parser for the grammar ``production``."""
        from simpleparse.parser import Parser

        parser_options = {"prebuilts": prebuilt}
        self.parser = Parser(production, **parser_options)

    def parserbyname(self, name):
        """Retrieve a tag-table by production name"""
        build = self.parser.buildTagger
        return build(name)

Beispiel #7

Datei anzeigen

Datei: EBNFSpill.py Projekt: tintinweb/EBNFSpill

class EBNFSpill(object):
    """Generate random sample text by walking a parser tag table.

    The table either comes from ``Parser.buildTagger`` (see
    ``setDeclaration``) or is supplied directly (``setTable``).
    ``generate`` walks the table and concatenates whatever ``eval``
    produces for each visited node.

    The ``DEFAULT_*`` class attributes bound repetition counts and
    recursion depth; ``setDefaults`` overrides them per instance.

    NOTE(review): ``basestring`` and ``collections.Iterable`` are kept as
    in the original, so the type checks still require Python 2 (or a
    compatibility shim supplied by the surrounding module).
    """

    DEFAULT_MAX_TIMES_CHAR = 35       # max repetitions in rndTimes / rndSelect
    DEFAULT_MAX_TIMES_FUNC = 10       # max repetitions in rndTimesFunc
    DEFAULT_MAX_SELF_RECURSION = 25   # max value of recursionLevelObj
    DEFAULT_MAX_WALK_RECURSION = 100  # max value of recursionLevelWalk

    def __init__(self, showTags=False, showTagsRecursive=False, recursionLevel=0):
        """Create a generator.

        showTags          -- prefix generated text with ``<tagname>`` markers.
        showTagsRecursive -- ``showTags`` value handed to nested instances.
        recursionLevel    -- nesting depth of this instance.

        Raises Exception when ``recursionLevel`` exceeds
        ``DEFAULT_MAX_SELF_RECURSION``.
        """
        self._reset()
        # Defined up front so walk() can report "table not set" instead of
        # failing with AttributeError.
        self.table = None
        self.showTags = showTags
        self.showTagsRecursive = showTagsRecursive
        self.recursionLevelObj = recursionLevel

        if self.recursionLevelObj > self.DEFAULT_MAX_SELF_RECURSION:
            raise Exception("a")

    def __del__(self):
        # NOTE(review): this decrements the counter of the instance being
        # destroyed, not the counter of the instance that created it.
        self.recursionLevelObj -= 1

    def validate(self, data):
        """Return the result of parsing ``data`` with the configured parser."""
        return self.parser.parse(data)

    def setDeclaration(self, declaration, production):
        """Build parser and tag table from an EBNF ``declaration``."""
        self.parser = Parser(declaration, production)
        self.table = self.parser.buildTagger(production=production)

    def setTable(self, table, nodes=None):
        """Use ``table`` directly; ``nodes`` replaces the node registry if truthy."""
        self.table = table
        self.nodes = nodes or self.nodes

    def _reset(self):
        """Clear the per-walk bookkeeping and reseed the random generator."""
        self.nodes = {}
        self.ctx = []       # context (infos like recursion for table2)
        self.recursionLevelWalk = 0
        random.seed()

    def setDefaults(self, **kwargs):
        """Override ``DEFAULT_*`` limits on this instance.

        Raises Exception for any keyword that is not an existing
        ``DEFAULT_*`` attribute.
        """
        valid_defaults = [i for i in dir(self) if i.startswith("DEFAULT_")]
        # items() behaves the same as iteritems() here and also exists on
        # Python 3.
        for k, v in kwargs.items():
            if k in valid_defaults:
                setattr(self, k, v)
            else:
                raise Exception("Not allowed to change %s to %s (valid options: %s)"%(k,v,valid_defaults))

    def getTable(self):
        """Return the current tag table (None if none was set)."""
        return self.table

    def getTagName(self, node):
        """Return ``<name>`` for a named node when showTags is on, else ''."""
        if self.showTags and node[0]:
            return "<%s>" % node[0]
        return ""

    def checkTypeIterable(self, l):
        """True for any iterable that is not a string."""
        return isinstance(l, collections.Iterable) and not isinstance(l, basestring)

    def checkTypeIterableRecursive(self, l):
        """True for a tuple shaped like ``(list, int, ...)``."""
        return (self.checkTypeIterable(l) and isinstance(l, tuple)
                and isinstance(l[0], list) and isinstance(l[1], int))

    def checkTypeNodeBase(self, l):
        """True for a sequence shaped like ``(None|str, int, ...)``."""
        return (self.checkTypeIterable(l) and len(l) >= 2
                and (l[0] is None or isinstance(l[0], basestring))
                and isinstance(l[1], int))

    def checkTypeNodeWithChilds(self, l):
        """True for a base node whose third element is itself iterable."""
        return self.checkTypeNodeBase(l) and len(l) >= 3 and self.checkTypeIterable(l[2])

    def next(self):
        return

    def rndTimesFunc(self, sample_func, args, minlen=0, maxlen=None):
        """Concatenate ``sample_func(args)`` a random number of times.

        The count is drawn from ``minlen..maxlen`` inclusive; a falsy
        ``maxlen`` means ``DEFAULT_MAX_TIMES_FUNC``.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_FUNC
        count = random.randrange(minlen, maxlen + 1)
        return "".join(sample_func(args) for _ in range(count))

    def rndTimes(self, sample, minlen=0, maxlen=None):
        """Repeat ``sample`` a random number of times (``minlen..maxlen``).

        A falsy ``maxlen`` means ``DEFAULT_MAX_TIMES_CHAR``.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        count = random.randrange(minlen, maxlen + 1)
        return "".join([sample] * count)

    def rndSelect(self, haystack, sample_len=1, minlen=0, maxlen=None):
        """Concatenate a random number of random samples from ``haystack``.

        Each round contributes ``sample_len`` distinct positions of
        ``haystack``; a falsy ``maxlen`` means ``DEFAULT_MAX_TIMES_CHAR``.
        """
        maxlen = maxlen or self.DEFAULT_MAX_TIMES_CHAR
        count = random.randrange(minlen, maxlen + 1)
        return "".join("".join(random.sample(haystack, sample_len))
                       for _ in range(count))

    def _isSubtableNode(self, node):
        """True when the node's command lies in MATCH_CALL..MATCH_SUBTABLEINLIST."""
        return Tdef.MATCH_CALL <= node[1] <= Tdef.MATCH_SUBTABLEINLIST

    def eval(self, node):
        """Return the text generated for a single walked ``node``.

        An empty node yields ''.  Raises Exception for a non-empty node
        with fewer than three elements.
        """
        if not node:
            return ""

        if len(node) < 3:
            raise Exception("<3 - %s" % repr(node))          # this is an error!

        command = node[1]

        if command == Tdef.MATCH_RECURSION_EXCEPTION:
            return "<<"

        if command == Tdef.MATCH_RECURSION:
            # Expand the referenced node with a fresh, deeper instance.
            self.recursionLevelObj += 1
            try:
                x = EBNFSpill(showTags=self.showTagsRecursive,
                              recursionLevel=self.recursionLevelObj)
                x.setTable(self.table)
                recr_node = self.nodes[node[2]]
            except Exception:
                # Depth limit reached or node not tracked: contribute nothing.
                return ""
            return self.getTagName(node) + x.generate(recr_node)

        if len(node) == 3:
            # single words/selections
            if command == Tdef.MATCH_WORD or command == Tdef.MATCH_IS:
                return self.getTagName(node) + node[2]
            if command == Tdef.MATCH_ALLIN or command == Tdef.MATCH_ISIN:
                return self.getTagName(node) + self.rndSelect(node[2], minlen=1, maxlen=1)
            # MATCH_TABLE and anything unrecognised produce only the tag.
            return self.getTagName(node)

        # len(node) > 3: mostly repeating / recursive commands
        # NOTE(review): the original tested MATCH_IS twice here; MATCH_WORD
        # (as in the three-element branch) may have been intended.  Behaviour
        # is left unchanged until that is confirmed.
        if command == Tdef.MATCH_IS:
            # like (None, MATCH_IS, 'c', 1, 0) - repeat zero or more times
            return self.getTagName(node) + self.rndTimes(node[2])
        if command == Tdef.MATCH_ALLIN or command == Tdef.MATCH_ISIN:
            return self.getTagName(node) + self.rndSelect(node[2])
        if self._isSubtableNode(node):
            # (xyz, MATCH_TABLE, <table>, 1)    == exactly once
            # (xyz, MATCH_TABLE, <table>, 2, 1) == *
            self.recursionLevelObj += 1
            try:
                x = EBNFSpill(showTags=self.showTagsRecursive,
                              recursionLevel=self.recursionLevelObj)
            except Exception:
                # Depth limit reached: contribute nothing.
                return ""
            x.setTable(self.table)
            return self.getTagName(node) + self.rndTimesFunc(x.generate, (node[2]))

        return self.getTagName(node)

    def generate(self, node=None):
        """Walk ``node`` (or the whole table) and return the generated text."""
        return "".join(self.eval(n) for n in self.walk(node))

    def process(self, l):
        """Replace a base node's numeric command with its name; else return ``l``."""
        if self.checkTypeNodeBase(l):
            return (l[0], Tdef().toName()[l[1]]) + l[2:]
        return l

    def _checkRecursion(self, node):
        """Return ``node``; raise StopRecursionException if it was already tracked."""
        nID = id(node)
        if nID in self.nodes:
            raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))
        return node

    def _trackNode(self, node, nodeID=None):
        """Register sub-table nodes and plain iterables in ``self.nodes``."""
        nID = nodeID or id(node)
        if self.checkTypeNodeBase(node):
            if self._isSubtableNode(node):
                self.nodes[nID] = node
        elif self.checkTypeIterable(node):
            self.nodes[nID] = node
        return node

    def _pushLevel(self, node):
        """Push id(node) on the context stack for sub-table nodes with node[3] == 2."""
        if self._isSubtableNode(node) and len(node) > 3 and node[3] == 2:
            self.ctx.append(id(node))
        return node

    def _popLevel(self, node):
        """Undo ``_pushLevel``: pop the context entry, otherwise return ``node``."""
        if self._isSubtableNode(node) and len(node) > 3 and node[3] == 2:
            return self.ctx.pop()
        return node

    def walk(self, table=None):
        """Return a generator over the nodes of ``table`` (default: own table).

        Raises Exception when no table is available.
        """
        table = table or self.table
        if not table:
            # must not be None, please .setDeclaration() first!
            raise Exception("EBNF TagTable not set, please generate [.setDeclaration()] or set one [.setTable()]")

        retn = self._walk(table)
        # _walk is a generator, so nothing has run yet: this clears the
        # bookkeeping *before* the caller starts iterating.
        self._reset()
        return retn

    def _walk(self, l):
        """Recursively yield the nodes below ``l``, stopping at the depth limits."""
        if (self.recursionLevelObj > self.DEFAULT_MAX_SELF_RECURSION
                or self.recursionLevelWalk > self.DEFAULT_MAX_WALK_RECURSION):
            # A plain return ends the generator.  Raising StopIteration here
            # (as the original did) is turned into RuntimeError by PEP 479.
            return
        self.recursionLevelWalk += 1

        try:
            if self.checkTypeNodeWithChilds(l):
                self._checkRecursion(l)
                yield self._trackNode(l)
                self._pushLevel(l)
                for e in self._walk(l[2]):
                    yield e
                self._popLevel(l)

            elif self.checkTypeNodeBase(l):
                self._checkRecursion(l)
                yield self._trackNode(l)

            elif self.checkTypeIterableRecursive(l):
                nID = id(l[0][0])
                # TODO: does not work
                # FIXME: does not work - recurses too much
                raise StopRecursionException(('[RECURSION of Node=%s]'%nID,Tdef.MATCH_RECURSION,nID))

            elif self.checkTypeIterable(l):
                self._checkRecursion(l)
                # checkTypeIterableRecursive refers to one of these nodes
                self._trackNode(l)
                for e in l:
                    self._pushLevel(e)
                    for x in self._walk(e):
                        yield x             # do not check recursion here
                    self._popLevel(e)
            else:
                self._checkRecursion(l)
                # Same output as the original ``print "Elem? - ",l``.
                print("Elem? -  %s" % (l,))
                yield self._trackNode(l)

        except StopRecursionException as e:
            # Hand the recursion marker to the consumer instead of failing.
            yield e.getObj()

        self.recursionLevelWalk -= 1

Beispiel #8

Datei anzeigen

Datei: legalref.py Projekt: h4ck3rm1k3/ferenda

class LegalRef:
    # Maybe these should be 1, 2, 4, 8 etc., so that the caller can ask
    # for LAGRUM | FORESKRIFTER, and so that we can define collections
    # of common combinations (e.g. ALL_LAGSTIFTNING = LAGRUM |
    # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING)
    LAGRUM = 1             # references to statute provisions in SFS
    KORTLAGRUM = 2         # SFS references in short form
    FORESKRIFTER = 3       # references to government agencies' regulation collections
    EGLAGSTIFTNING = 4     # EC treaties, regulations and directives
    INTLLAGSTIFTNING = 5   # treaties, conventions etc.
    FORARBETEN = 6         # preparatory works (government bills, committee reports, etc.)
    RATTSFALL = 7          # case law from Swedish courts
    MYNDIGHETSBESLUT = 8   # agency decisions (JO, ARN, DI...)
    EGRATTSFALL = 9        # case law from the EC Court of Justice / Court of First Instance
    INTLRATTSFALL = 10     # the European Court ("Europadomstolen")

    # Earlier version of the pattern below, kept for reference:
    # re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.
    # |N|)?\d+( s\.\d+|))#?(K(\d+)|)(P(\d+)|)(S(\d+)|)(N(\d+)|)')

    # Splits a base URI into base, law id and the optional K/P/S/N fragment
    # parts (groups 1, 2, 6, 8, 10 and 12 are the ones read by parse()).
    re_urisegments = re.compile(
        r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    # "X- och Y-lagen" style compounds; parse() joins them with underscores.
    re_escape_compound = re.compile(
        r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    # Known suffixes inside a longer word; parse() inserts '|' before them.
    re_escape_named = re.compile(
        r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)

    # Inverse patterns used to undo the two escapes above.
    re_descape_compound = re.compile(
        r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_descape_named = re.compile(
        r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence (deprecated, and a SyntaxWarning on recent Pythons).  The
    # compiled pattern is unchanged.
    re_xmlcharref = re.compile(r"&#\d+;")

    def __init__(self, *args):
        """Assemble the EBNF grammar and tagger for the requested reference kinds.

        ``args`` holds the class-level constants (``LAGRUM``,
        ``KORTLAGRUM``, ...) that select which grammar files are loaded
        and which root productions the combined grammar accepts.
        """
        if os.path.sep in __file__:
            scriptdir = os.path.dirname(__file__)
        else:
            scriptdir = os.getcwd()

        def register(ebnf_path, formatter_attr):
            # Load one grammar file and route every *Ref/*RefID production
            # it defines to the named URI formatter method.
            for production in self.load_ebnf(scriptdir + ebnf_path):
                self.uriformatter[production] = getattr(self, formatter_attr)

        self.graph = Graph()
        n3file = os.path.relpath(scriptdir + "/../../../res/etc/sfs-extra.n3")
        self.graph.load(n3file, format="n3")
        self.roots = []
        self.uriformatter = {}
        self.decl = ""  # try to make it unicode clean all the way
        self.namedlaws = {}
        self.load_ebnf(scriptdir + "/../../../res/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            register("/../../../res/etc/lagrum.ebnf", "sfs_format_uri")
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # If lagrum.ebnf has not been loaded yet it must be loaded
            # now, because kortlagrum.ebnf depends on productions that
            # are defined there.
            if self.LAGRUM not in args:
                self.load_ebnf(scriptdir + "/../../../res/etc/lagrum.ebnf")

            register("/../../../res/etc/kortlagrum.ebnf", "sfs_format_uri")
            DCT = Namespace("http://purl.org/dc/terms/")
            abbreviations = self.get_relations(DCT['alternate'])
            self.namedlaws.update(abbreviations)
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L")
            lawlist = sorted(abbreviations.keys(), key=len, reverse=True)
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            self.roots.insert(0, "kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            register("/../../../res/etc/eglag.ebnf", "eglag_format_uri")
            self.roots.append("eglagref")
        if self.FORARBETEN in args:
            register("/../../../res/etc/forarbeten.ebnf", "forarbete_format_uri")
            self.roots.append("forarbeteref")
        if self.RATTSFALL in args:
            register("/../../../res/etc/rattsfall.ebnf", "rattsfall_format_uri")
            self.roots.append("rattsfallref")
        if self.EGRATTSFALL in args:
            register("/../../../res/etc/egratt.ebnf", "egrattsfall_format_uri")
            self.roots.append("ecjcaseref")

        # The root production accepts any selected reference kind or
        # plain text, one or more times.
        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)

        self.parser = Parser(self.decl.encode(SP_CHARSET), "root")
        self.tagger = self.parser.buildTagger("root")
        self.verbose = False
        self.depth = 0

        # SFS-specific state
        self.currentlaw = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece = None
        self.lastlaw = None
        self.currentlynamedlaws = {}

    def load_ebnf(self, file):
        """Append the productions in ``file`` to the EBNF declaration.

        The file is decoded with ``SP_CHARSET`` and added to
        ``self.decl``.  Returns the names of all ``*Ref`` and ``*RefID``
        productions defined in the file.
        """
        # base.ebnf contains 0x1A, i.e. the EOF character on Windows,
        # therefore the file has to be read in binary mode.  The ``with``
        # block closes the handle even if reading or decoding fails.
        with open(file, 'rb') as f:
            # assume our ebnf files use the same charset
            content = f.read().decode(SP_CHARSET)
        self.decl += content
        return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)]

    def get_relations(self, predicate):
        """Return a dict built from the graph's pairs for ``predicate``.

        For every pair yielded by ``self.graph.subject_objects`` the
        second element becomes the key and the first element the value,
        both converted with ``six.text_type``.  A later pair with the
        same key replaces an earlier one.
        """
        pairs = self.graph.subject_objects(predicate)
        return dict(
            (six.text_type(second), six.text_type(first))
            for first, second in pairs)

    def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9", predicate=None):
        """Tag the references in ``indata`` and return text mixed with links.

        indata    -- text to analyse; must be a ``six.text_type``.
        baseuri   -- URI the text belongs to; split into
                     ``self.baseuri_attributes`` when it matches
                     ``re_urisegments``.
        predicate -- stored on ``self.predicate``.

        Returns a list of plain strings and ``Link`` objects in which
        adjacent plain strings have been merged.  An empty ``indata`` is
        returned unchanged.

        Raises ParseError when the tagger did not consume all the input.
        """
        assert isinstance(indata, six.text_type)
        if indata == "":
            return indata  # this actually triggered a bug...
        self.predicate = predicate
        self.baseuri = baseuri
        if baseuri:
            m = self.re_urisegments.match(baseuri)
            if m:
                self.baseuri_attributes = {'baseuri': m.group(1),
                                           'law': m.group(2),
                                           'chapter': m.group(6),
                                           'section': m.group(8),
                                           'piece': m.group(10),
                                           'item': m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri': baseuri}
        else:
            self.baseuri_attributes = {}
        # It is hard to make the EBNF grammar recognise arbitrary words
        # that end with a given suffix (e.g. 'bokföringslagen' with the
        # suffix 'lagen'). Therefore the input string is preprocessed
        # and a '|' character is inserted before certain suffixes.
        # 'Radio- och TV-lagen' is also transformed into
        # 'Radio-_och_TV-lagen'.
        #
        # FIXME: Obviously, this shouldn't be done in a general class,
        # but rather in a subclass or via proxy/adapter

        fixedindata = indata  # FIXME: Nonsensical
        if self.LAGRUM in self.args:
            fixedindata = self.re_escape_compound.sub(
                r'\1_\2_\3\4', fixedindata)
            fixedindata = self.re_escape_named.sub(r'|\1', fixedindata)

        # SimpleParse has no support for unicode strings, so the input
        # is converted to a byte string. Unfortunately this could not be
        # made to work with UTF-8, so XML character references are used
        # instead.
        fixedindata = fixedindata.encode(SP_CHARSET, 'xmlcharrefreplace')

        # Parse the text with TextTools.tag - not the simplest way to do
        # it, but following the SimpleParse documentation would rebuild
        # the tagger table on every call to parse().
        if self.verbose:
            print(("calling tag with '%s'" % (fixedindata.decode(SP_CHARSET))))
        taglist = tag(fixedindata, self.tagger, 0, len(fixedindata))

        result = []

        root = NodeTree(taglist, fixedindata)
        for part in root.nodes:
            if part.tag != 'plain' and self.verbose:
                sys.stdout.write(self.prettyprint(part))
            if part.tag in self.roots:
                self.clear_state()
                result.extend(self.formatter_dispatch(part))
            else:
                assert part.tag == 'plain', "Tag is %s" % part.tag
                result.append(part.text)

            # clear state
            if self.currentlaw is not None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None

        if taglist[-1] != len(fixedindata):
            log.error('Problem (%d:%d) with %r / %r' % (
                taglist[-1] - 8, taglist[-1] + 8, fixedindata, indata))

            raise ParseError(
                "parsed %s chars of %s (...%s...)" % (taglist[-1], len(indata),
                                                      indata[(taglist[-1] - 2):taglist[-1] + 3]))

        # Normalise the result, i.e. concatenate adjacent text nodes and
        # remove any '|' characters that were inserted earlier.
        normres = []
        for item in result:
            if not self.re_descape_named.search(item):
                node = item
            else:
                # Start from the unmodified item so ``text`` is always
                # bound.  The original left it undefined (or stale from
                # a previous item) when LAGRUM was not requested.
                text = item
                if self.LAGRUM in self.args:
                    text = self.re_descape_named.sub(r'\1', item)
                    text = self.re_descape_compound.sub(r'\1 \2 \3\4', text)
                if isinstance(item, Link):
                    # Since Link objects are immutable a new one has to
                    # be created and the attributes copied over.
                    if hasattr(item, 'predicate'):
                        node = LinkSubject(text, predicate=item.predicate,
                                           uri=item.uri)
                    else:
                        node = Link(text, uri=item.uri)
                else:
                    node = text
            if (len(normres) > 0
                and not isinstance(normres[-1], Link)
                    and not isinstance(node, Link)):
                normres[-1] += node
            else:
                normres.append(node)

        # and finally turn XML character references in the plain text
        # back into characters
        for i, node in enumerate(normres):
            if isinstance(node, Link):
                # deal with these later
                continue
            normres[i] = self.re_xmlcharref.sub(
                self.unescape_xmlcharref, node)
        return normres

    def unescape_xmlcharref(self, m):
        # print "Changing %r to a %r" % (m.group(0)[2:-1], unichr(int(m.group(0)[2:-1])))
        return chr(int(m.group(0)[2:-1]))

    def find_attributes(self, parts, extra=None):
        """Recursively collect reference attributes from a parse tree.

        Every node whose lower-cased tag ends in ``refid`` contributes one
        entry: the key is the tag with that suffix removed and the value is
        the node's stripped text. ``singlesectionrefid`` and
        ``lastsectionrefid`` are both stored under ``section``.

        After the walk, any of ``law``/``chapter``/``section``/``piece``
        still missing is filled in from the corresponding
        ``self.current*`` attribute when that attribute is truthy.

        :param parts: iterable of nodes exposing ``tag``, ``text`` and
            ``nodes``
        :param extra: optional mapping of attributes to start from
        :returns: a new dict of attribute name -> value
        """
        # ``extra`` defaults to None instead of a shared mutable ``{}``.
        d = {}

        self.depth += 1
        if self.verbose:
            print(
                (". " * self.depth + "find_attributes: starting with %s" % d))
        if extra:
            d.update(extra)

        for part in parts:
            tag = part.tag.lower()
            if tag.endswith('refid'):
                if tag in ('singlesectionrefid', 'lastsectionrefid'):
                    tag = 'sectionrefid'
                # Strip the trailing "refid" (5 characters) to get the key.
                d[tag[:-5]] = part.text.strip()
                if self.verbose:
                    print((". " * self.depth +
                          "find_attributes: d is now %s" % d))

            if part.nodes:
                d.update(self.find_attributes(part.nodes, d))
        if self.verbose:
            print((". " * self.depth + "find_attributes: returning %s" % d))
        self.depth -= 1

        # Fall back on the formatter's current context for anything the
        # tree itself did not specify.
        context = (('law', self.currentlaw),
                   ('chapter', self.currentchapter),
                   ('section', self.currentsection),
                   ('piece', self.currentpiece))
        for key, current in context:
            if current and key not in d:
                d[key] = current

        return d

    def find_node(self, root, nodetag):
        """Return the first node (depth-first) whose tag equals nodetag.

        ``root`` itself is considered first, then each child subtree in
        order. Returns ``None`` when nothing matches.
        """
        if root.tag == nodetag:
            return root
        # Lazily search each child subtree and stop at the first hit.
        hits = (self.find_node(child, nodetag) for child in root.nodes)
        return next((hit for hit in hits if hit is not None), None)

    def find_nodes(self, root, nodetag):
        """Return a list of all nodes tagged nodetag, in depth-first order.

        A matching node is returned as-is and its own descendants are not
        searched any further.
        """
        if root.tag == nodetag:
            return [root]
        return [match
                for child in root.nodes
                for match in self.find_nodes(child, nodetag)]

    def flatten_tokentree(self, part, suffix):
        """Return, as a flat list, every node whose tag ends with suffix.

        The tree is walked depth-first starting at ``part`` (which is
        itself included when it matches). For example, with the suffix
        'RefID' the tree

           foo->bar->BlahongaRefID
              ->baz->quux->Blahonga2RefID
                         ->Blahonga3RefID
              ->Blahonga4RefID

        yields [BlahongaRefID, Blahonga2RefID, Blahonga3RefID,
        Blahonga4RefID].
        """
        collected = [part] if part.tag.endswith(suffix) else []
        # A missing or empty child list simply contributes nothing.
        for child in part.nodes or ():
            collected.extend(self.flatten_tokentree(child, suffix))
        return collected

    def formatter_dispatch(self, part):
        """Format one node using ``format_<tag>`` if this object has it.

        When no method named ``format_`` + ``part.tag`` exists,
        ``format_tokentree`` is used instead. ``self.depth`` is raised for
        the duration of the call and is used to indent verbose output.

        :returns: whatever the chosen formatter returned
        """
        self.depth += 1
        method_name = "format_" + part.tag
        # Is there a tailor-made formatter for this production?
        has_custom = method_name in dir(self)
        if has_custom:
            custom = getattr(self, method_name)
            if self.verbose:
                print(
                    ((". " * self.depth) + "formatter_dispatch: format_%s defined, calling it" % part.tag))
            res = custom(part)
            assert res is not None, "Custom formatter for %s didn't return anything" % part.tag
        else:
            if self.verbose:
                print(
                    ((". " * self.depth) + "formatter_dispatch: no format_%s, using format_tokentree" % part.tag))
            res = self.format_tokentree(part)

        if res is None:
            # Only reachable through the default formatter; dump the subtree.
            print(((". " * self.depth) +
                  "something wrong with this:\n" + self.prettyprint(part)))
        self.depth -= 1
        return res

    def format_tokentree(self, part):
        """Default formatter: turn a token (sub)tree into a list of items.

        * A node without children whose tag does not end in ``RefID``
          yields its plain text.
        * Otherwise, a node whose tag ends in ``RefID`` or ``Ref`` yields
          the result of ``format_generic_link`` for the whole node.
        * Any other node yields the concatenated results of
          ``formatter_dispatch`` for each child.

        For productions like SectionPieceRefs, which contain
        subproductions that also end in RefID, this is not a good function
        to use - use a custom formatter instead.
        """
        if self.verbose:
            print(((". " * self.depth) +
                  "format_tokentree: called for %s" % part.tag))

        is_refid = part.tag.endswith("RefID")
        if not part.nodes and not is_refid:
            # Leaf token that is not an identifier: keep the raw text.
            res = [part.text]
        elif is_refid or part.tag.endswith("Ref"):
            res = [self.format_generic_link(part)]
        else:
            res = []
            for child in part.nodes:
                # NOTE(review): tags ending in "Ref" are handled by the
                # branch above, so this 'LawRef' condition cannot be true.
                if self.verbose and part.tag == 'LawRef':
                    print(
                        ((". " * self.depth) + "format_tokentree: part '%s' is a %s" % (child.text, child.tag)))
                res.extend(self.formatter_dispatch(child))

        if self.verbose:
            print(
                ((". " * self.depth) + "format_tokentree: returning '%s' for %s" % (res, part.tag)))
        return res

    def prettyprint(self, root, indent=0):
        """Return an indented, one-line-per-node dump of a token tree.

        Each line has the form ``'<tag>': '<text>'`` with runs of
        whitespace in the text collapsed to a single space; children are
        indented four spaces deeper than their parent.

        Nodes whose ``nodes`` attribute is ``None`` used to produce an
        empty string, which silently dropped every such leaf from the
        dump. They are now rendered like any other node.

        :param root: node exposing ``tag``, ``text`` and ``nodes``
        :param indent: nesting level of ``root``
        :returns: the dump as a single string ending in a newline
        """
        line = "%s'%s': '%s'\n" % (
            "    " * indent, root.tag, re.sub(r'\s+', ' ', root.text))
        children = root.nodes or ()
        return line + "".join(
            self.prettyprint(child, indent + 1) for child in children)

    def format_generic_link(self, part, uriformatter=None):
        """Turn a parse-tree node into a link object, or plain text.

        The URI is computed from ``self.find_attributes([part])`` by the
        formatter registered for ``part.tag`` in ``self.uriformatter``. If
        that raises ``KeyError`` (no formatter registered, or a key lookup
        failing inside it), the ``uriformatter`` argument is used when
        given, otherwise ``self.sfs_format_uri``.

        :param part: node exposing ``tag`` and ``text``
        :param uriformatter: optional fallback callable taking the
            attribute dict and returning a URI
        :returns: ``part.text`` when no URI could be produced, otherwise a
            ``LinkSubject`` (when ``self.predicate`` is set) or a ``Link``
        """
        try:
            uri = self.uriformatter[part.tag](self.find_attributes([part]))
        except KeyError:
            # Errors raised by the fallback propagate to the caller; the
            # sibling handlers below only guard the first attempt.
            fallback = uriformatter if uriformatter else self.sfs_format_uri
            uri = fallback(self.find_attributes([part]))
        except AttributeError:
            # Normal error from eglag_format_uri
            return part.text
        except Exception:
            # Something else went wrong: degrade to plain text. A bare
            # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
            log.warning(
                "(unknown): Unable to format link for text %s (production %s)",
                part.text, part.tag)
            return part.text

        if self.verbose:
            print((
                (". " * self.depth) + "format_generic_link: uri is %s" % uri))
        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context)
            return part.text
        if self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        return Link(part.text, uri=uri)

    # FIXME: unify this with format_generic_link
    def format_custom_link(self, attributes, text, production):
        """Build a link for ``text`` from an explicit attribute dict.

        The URI comes from the formatter registered for ``production`` in
        ``self.uriformatter``; on ``KeyError`` it falls back to
        ``self.sfs_format_uri``.

        :param attributes: attribute dict handed to the URI formatter
        :param text: the link text
        :param production: name of the grammar production, used to pick
            the URI formatter
        :returns: ``text`` unchanged when no URI was produced, otherwise a
            ``LinkSubject`` (when ``self.predicate`` is set) or a ``Link``
        """
        try:
            uri = self.uriformatter[production](attributes)
        except KeyError:
            uri = self.sfs_format_uri(attributes)

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context). This used to return
            # ``part.text``, but no ``part`` exists in this method.
            return text
        if self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        return Link(text, uri=uri)

    #
    # KOD FÖR LAGRUM
    def clear_state(self):
        """Forget the current law, chapter, section and piece."""
        self.currentlaw = self.currentchapter = None
        self.currentsection = self.currentpiece = None

    def normalize_sfsid(self, sfsid):
        """Normalize the separator between an SFS number and its suffix.

        A dot directly between ``<digits>:<digits>`` and a following digit
        is replaced by a space, so '1736:0123.2' becomes '1736:0123 2'.
        The 's. ' variant ('1736:0123 s. 2') is currently left untouched.
        """
        return re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsid)

    def normalize_lawname(self, lawname):
        """Return a canonical lookup key for a law name.

        Pipe characters are removed, underscores become spaces, the name
        is lower-cased and one trailing 's' is dropped.
        """
        cleaned = lawname.replace('|', '')
        cleaned = cleaned.replace('_', ' ')
        cleaned = cleaned.lower()
        return cleaned[:-1] if cleaned.endswith('s') else cleaned

    def namedlaw_to_sfsid(self, text, normalize=True):
        """Look up the ID registered for a named law.

        :param text: the law name as found in the text
        :param normalize: when true, ``text`` is first passed through
            ``self.normalize_lawname``
        :returns: the value stored for the name in
            ``self.currentlynamedlaws`` or, failing that, in
            ``self.namedlaws``; ``None`` when the name is unknown or is
            one of the listed words that are never law names
        """
        if normalize:
            text = self.normalize_lawname(text)

        # Words ending in -lagen/-ordningen/-formen etc. that are never
        # names of laws.
        not_law_names = (
            'aktieslagen', 'anordningen', 'anslagen', 'arbetsordningen',
            'associationsformen', 'avfallsslagen', 'avslagen',
            'avvittringsutslagen', 'bergslagen', 'beskattningsunderlagen',
            'bolagen', 'bolagsordningen', 'dagordningen', 'djurslagen',
            'dotterbolagen', 'emballagen', 'energislagen',
            'ersättningsformen', 'ersättningsslagen', 'examensordningen',
            'finansbolagen', 'finansieringsformen', 'fissionsvederlagen',
            'flygbolagen', 'fondbolagen', 'förbundsordningen', 'föreslagen',
            'företrädesordningen', 'förhandlingsordningen', 'förlagen',
            'förmånsrättsordningen', 'förmögenhetsordningen', 'förordningen',
            'förslagen', 'försäkringsaktiebolagen', 'försäkringsbolagen',
            'gravanordningen', 'grundlagen', 'handelsplattformen',
            'handläggningsordningen', 'inkomstslagen', 'inköpssamordningen',
            'kapitalunderlagen', 'klockslagen', 'kopplingsanordningen',
            'låneformen', 'mervärdesskatteordningen', 'nummerordningen',
            'omslagen', 'ordalagen', 'pensionsordningen',
            'renhållningsordningen', 'representationsreformen',
            'rättegångordningen', 'rättegångsordningen', 'rättsordningen',
            'samordningen', 'skatteordningen', 'skatteslagen',
            'skatteunderlagen', 'skolformen', 'skyddsanordningen', 'slagen',
            'solvärmeanordningen', 'storslagen', 'studieformen', 'stödformen',
            'stödordningen', 'säkerhetsanordningen', 'talarordningen',
            'tillslagen', 'tivolianordningen', 'trafikslagen',
            'transportanordningen', 'transportslagen', 'trädslagen',
            'turordningen', 'underlagen', 'uniformen', 'uppställningsformen',
            'utvecklingsbolagen', 'varuslagen', 'verksamhetsformen',
            'vevanordningen', 'vårdformen', 'ägoanordningen', 'ägoslagen',
            'ärendeslagen', 'åtgärdsförslagen',
        )
        if text in not_law_names:
            return None

        # Names introduced in the current document take precedence over
        # the general registry.
        for registry in (self.currentlynamedlaws, self.namedlaws):
            if text in registry:
                return registry[text]

        if self.verbose:
            log.warning(
                "(unknown): I don't know the ID of named law [%s]" % text)
        return None

    def sfs_format_uri(self, attributes):
        """Build a URI string from a dict of reference attributes.

        The URI starts from a base (see below) and is extended with one
        component per attribute, in the fixed order given by
        ``attributeorder``. Each component other than ``law`` is a
        one-letter prefix from ``keymapping`` followed by the value.

        Until the first key that is present in ``attributes`` is seen,
        missing keys are taken from ``self.baseuri_attributes``; after
        that, missing keys are skipped.

        :param attributes: dict mapping attribute names (``law``,
            ``chapter``, ``section``, ...) to values
        :returns: the URI as a string
        """
        # Swedish ordinal words -> their digit.
        piecemappings = {'första': '1',
                         'andra': '2',
                         'tredje': '3',
                         'fjärde': '4',
                         'femte': '5',
                         'sjätte': '6',
                         'sjunde': '7',
                         'åttonde': '8',
                         'nionde': '9'}
        # One-letter prefix used in the URI for each attribute.
        keymapping = {'lawref': 'L',
                      'chapter': 'K',
                      'section': 'P',
                      'piece': 'S',
                      'item': 'N',
                      'itemnumeric': 'N',
                      'element': 'O',
                      'sentence': 'M',  # is this ever used?
                      }
        # Order in which components are appended to the URI.
        attributeorder = ['law', 'lawref', 'chapter', 'section',
                          'element', 'piece', 'item', 'itemnumeric', 'sentence']

        # Choose the base: nothing if the law value already is an http://
        # URI, the SFS base if a law is given, otherwise the 'baseuri'
        # entry of self.baseuri_attributes (or '' when there is none).
        if 'law' in attributes:
            if attributes['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'

        else:
            if 'baseuri' in self.baseuri_attributes:
                res = self.baseuri_attributes['baseuri']
            else:
                res = ''
        resolvetobase = True   # still allowed to fill gaps from the base
        addfragment = False    # a '#' is due before the next component
        justincase = None      # pending default piece component
        for key in attributeorder:
            if key in attributes:
                # From here on, gaps are no longer filled from the base.
                resolvetobase = False
                val = attributes[key]
            elif (resolvetobase and key in self.baseuri_attributes):
                val = self.baseuri_attributes[key]
            else:
                val = None

            if val:
                if not isinstance(val, six.text_type):
                    val = val.decode(SP_CHARSET)
                if addfragment:
                    res += '#'
                    addfragment = False
                # NOTE(review): membership is tested on ``val`` as-is but
                # the lookup uses ``val.lower()``; a capitalized ordinal
                # therefore takes the else branch - confirm intent.
                if (key in ['piece', 'itemnumeric', 'sentence'] and val in piecemappings):
                    res += '%s%s' % (
                        keymapping[key], piecemappings[val.lower()])
                else:
                    if key == 'law':
                        val = self.normalize_sfsid(val)
                        val = val.replace(" ", "_")
                        res += val
                        # Everything after the law goes in the fragment.
                        addfragment = True
                    else:
                        # Emit the pending default piece ("S1") before
                        # this component, then forget it.
                        if justincase:
                            res += justincase
                            justincase = None
                        # Strip spaces and line breaks from the value.
                        val = val.replace(" ", "")
                        val = val.replace("\n", "")
                        val = val.replace("\r", "")
                        res += '%s%s' % (keymapping[key], val)
            else:
                # No piece given: remember "S1" in case a later component
                # follows.
                if key == 'piece':
                    justincase = "S1"
        return res

    def format_ChapterSectionRefs(self, root):
        """Format a chapter reference followed by section references.

        The chapter reference is linked on its own (including the current
        law when one is set); the two remaining children are dispatched
        with ``self.currentchapter`` set, after which it is cleared.
        """
        assert(root.tag == 'ChapterSectionRefs')
        assert(len(root.nodes) == 3)  # ChapterRef, wc, SectionRefs

        chapter_ref = root.nodes[0]
        self.currentchapter = chapter_ref.nodes[0].text.strip()

        if self.currentlaw:
            link_attrs = {'law': self.currentlaw,
                          'chapter': self.currentchapter}
        else:
            link_attrs = {'chapter': self.currentchapter}
        res = [self.format_custom_link(link_attrs,
                                       chapter_ref.text,
                                       chapter_ref.tag)]

        for position in (1, 2):
            res.extend(self.formatter_dispatch(root.nodes[position]))
        self.currentchapter = None
        return res

    def format_ChapterSectionPieceRefs(self, root):
        """Record the chapter, then dispatch every child of root.

        ``self.currentchapter`` is set from the first grandchild and is
        left set when this method returns.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()
        return [item
                for node in root.nodes
                for item in self.formatter_dispatch(node)]

    def format_LastSectionRef(self, root):
        """Link the whole LastSectionRef node as a single reference.

        The last section ref is a bit different, since we want the ending
        double section mark to be part of the link text, so the complete
        node (not just its ID child) is handed to ``format_generic_link``.

        :returns: a one-element list with the formatted link
        """
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3)  # LastSectionRefID, wc, DoubleSectionMark
        # Two locals (the ID node and its text) were assigned here and
        # never used; they have been dropped.
        return [self.format_generic_link(root)]

    def format_SectionPieceRefs(self, root):
        """Format a section reference followed by piece references.

        The first and third children are merged into one link whose text
        is "<section text> <piece text>"; children from the fourth onwards
        are dispatched individually. ``self.currentsection`` is set during
        the call and cleared before returning.
        """
        assert(root.tag == 'SectionPieceRefs')
        section_ref = root.nodes[0]
        self.currentsection = section_ref.nodes[0].text.strip()

        first_piece = root.nodes[2]
        link_attrs = self.find_attributes([first_piece])
        link_text = "%s %s" % (section_ref.text, first_piece.text)
        res = [self.format_custom_link(link_attrs, link_text, root.tag)]
        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = None
        return res

    def format_SectionPieceItemRefs(self, root):
        """Format a section + piece reference followed by item references.

        The first and third children are merged into one link whose text
        is "<section text> <piece text>"; children from the fourth onwards
        are dispatched individually. ``self.currentsection`` and
        ``self.currentpiece`` are set during the call and cleared before
        returning.
        """
        assert(root.tag == 'SectionPieceItemRefs')
        section_ref = root.nodes[0]
        piece_ref = root.nodes[2]
        self.currentsection = section_ref.nodes[0].text.strip()
        self.currentpiece = piece_ref.nodes[0].text.strip()

        link_attrs = self.find_attributes([piece_ref])
        link_text = "%s %s" % (section_ref.text, piece_ref.text)
        res = [self.format_custom_link(link_attrs, link_text, root.tag)]

        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = self.currentpiece = None
        return res

    # This is a special case for things like '17-29 och 32 §§ i lagen
    # (2004:575)', which picks out the LawRefID first and stores it in
    # .currentlaw, so that find_attributes finds it
    # automagically. Although now it seems to be branching out and be
    # all things to all people.
    def format_ExternalRefs(self, root):
        """Format references that point into another, identified law.

        The law is determined, in order of preference, from an explicit
        LawRefID node, from a NamedLaw node (looked up with
        ``namedlaw_to_sfsid``) or from ``self.lastlaw`` (the SameLaw
        case), and stored in ``self.currentlaw`` so that
        ``find_attributes`` picks it up.

        :returns: ``[root.text]`` when a named law cannot be resolved;
            a single link covering the whole expression when the tree
            holds exactly one reference to a non-anonymous law; otherwise
            the result of ``format_tokentree``
        """
        assert(root.tag == 'ExternalRefs')

        lawrefid_node = self.find_node(root, 'LawRefID')
        if lawrefid_node is not None:
            self.currentlaw = lawrefid_node.text
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node:
                # Remember which ID this name stands for.
                lawname = self.normalize_lawname(namedlaw_node.text)
                self.currentlynamedlaws[lawname] = self.currentlaw
        else:
            # No explicit LawRefID; maybe a named law we know the ID of.
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node is not None:
                self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text)
                if self.currentlaw is None:
                    # Unknown law name - better to bail out than to
                    # resolve chapter/paragraph references relative to
                    # baseuri (which is almost certainly wrong).
                    return [root.text]
            else:
                # Last chance: a reference back to a previously mentioned
                # law ("...enligt 4 § samma lag").
                samelaw_node = self.find_node(root, 'SameLaw')
                assert(samelaw_node is not None)
                if self.lastlaw is None:
                    log.warning(
                        "(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set")
                self.currentlaw = self.lastlaw

        if self.lastlaw is None:
            self.lastlaw = self.currentlaw

        # If the tree only contains a single reference, it looks better
        # when the entire expression, not just the chapter/section part,
        # is linked - but not for an "anonymous" law
        # ('1 § i lagen (1234:234) om blahonga').
        single_reference = (
            len(self.find_nodes(root, 'GenericRefs')) == 1 and
            len(self.find_nodes(root, 'SectionRefID')) == 1 and
            len(self.find_nodes(root, 'AnonymousExternalLaw')) == 0)
        if single_reference:
            return [self.format_generic_link(root)]
        return self.format_tokentree(root)

    def format_SectionItemRefs(self, root):
        """Format the tree with ``self.currentsection`` temporarily set.

        The section is taken from the first grandchild, which must be a
        SectionRefID node, and is cleared again before returning.
        """
        section_id = root.nodes[0].nodes[0]
        assert(section_id.tag == 'SectionRefID')
        self.currentsection = section_id.text.strip()
        formatted = self.format_tokentree(root)
        self.currentsection = None
        return formatted

    def format_PieceItemRefs(self, root):
        """Format a piece reference followed by item references.

        The piece (first child) and the first item (first child of the
        third child) are merged into one link whose text is
        "<piece text> <item text>"; the remaining items are dispatched
        individually. ``self.currentpiece`` is set during the call and
        cleared before returning.
        """
        piece_ref = root.nodes[0]
        self.currentpiece = piece_ref.nodes[0].text.strip()

        first_item = root.nodes[2].nodes[0]
        link_attrs = self.find_attributes([first_item])
        link_text = "%s %s" % (piece_ref.text, first_item.text)
        res = [self.format_custom_link(link_attrs, link_text, root.tag)]
        for later_item in root.nodes[2].nodes[1:]:
            res.extend(self.formatter_dispatch(later_item))

        self.currentpiece = None
        return res

    def format_ChapterSectionRef(self, root):
        """Record the chapter and link the whole node as one reference.

        ``self.currentchapter`` is set from the first grandchild (which
        must be a ChapterRefID node) and is left set on return.

        NOTE(review): a method with this same name is defined again
        further down in this file; if both live in the same class, the
        later definition is the one that takes effect.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()
        link = self.format_generic_link(root)
        return [link]

    def format_AlternateChapterSectionRefs(self, root):
        """Format the tree with ``self.currentchapter`` temporarily set.

        The chapter is taken from the first grandchild, which must be a
        ChapterRefID node, and is cleared again before returning.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()
        formatted = self.format_tokentree(root)
        self.currentchapter = None
        return formatted

    def format_ExternalLaw(self, root):
        """Clear the current chapter, then dispatch root's first child."""
        self.currentchapter = None
        first_child = root.nodes[0]
        return self.formatter_dispatch(first_child)

    def format_ChangeRef(self, root):
        """Link the whole node to the law named by its LawRefID child.

        The ``data`` attribute of the first LawRefID node is passed to
        ``format_custom_link`` as the ``lawref`` attribute.
        """
        lawref_node = self.find_node(root, 'LawRefID')
        link_attrs = {'lawref': lawref_node.data}
        return [self.format_custom_link(link_attrs, root.text, root.tag)]

    def format_SFSNr(self, root):
        """Format an SFS number, deriving a base URI when none is set.

        When ``self.baseuri`` is None, ``self.baseuri_attributes`` is
        replaced by a dict holding only a ``baseuri`` built from the
        decoded ``data`` of the first LawRefID node. The tree is then
        formatted with ``format_tokentree``.
        """
        if self.baseuri is None:
            lawref_node = self.find_node(root, 'LawRefID')
            sfsid = lawref_node.data
            self.baseuri_attributes = {
                'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/%s#' % sfsid.decode(SP_CHARSET)}

        return self.format_tokentree(root)

    def format_NamedExternalLawRef(self, root):
        """Format a reference to a law that is mentioned by name.

        If no law is current, one is determined from the LawRefID child
        (also remembering the name -> ID pair) or, without such a child,
        by looking up ``root.text`` with ``namedlaw_to_sfsid``; in that
        case the law is moved to ``self.lastlaw`` and cleared again
        before returning.

        When ``self.baseuri`` is None and a law is known,
        ``self.baseuri_attributes`` is rebuilt from that law.

        :returns: ``[root.text]`` when no ID could be found for the law,
            otherwise a one-element list with the formatted link
        """
        law_set_here = self.currentlaw is None
        if law_set_here:
            lawrefid_node = self.find_node(root, 'LawRefID')
            if lawrefid_node is not None:
                self.currentlaw = lawrefid_node.text
                lawname = self.normalize_lawname(
                    self.find_node(root, 'NamedLaw').text)
                self.currentlynamedlaws[lawname] = self.currentlaw
            else:
                self.currentlaw = self.namedlaw_to_sfsid(root.text)

        # If we can't find an ID for this law, better not link it.
        if self.currentlaw is None:
            res = [root.text]
        else:
            res = [self.format_generic_link(root)]

        if self.baseuri is None and self.currentlaw is not None:
            # Use this law as the new baseuri_attributes.
            m = self.re_urisegments.match(self.currentlaw)
            if m:
                segment_keys = ('baseuri', 'law', 'chapter',
                                'section', 'piece', 'item')
                self.baseuri_attributes = dict(
                    zip(segment_keys, m.group(1, 2, 6, 8, 10, 12)))
            else:
                self.baseuri_attributes = {
                    'baseuri': 'http://rinfo.lagrummet.se/publ/sfs/' + self.currentlaw + '#'}

        if law_set_here:
            if self.currentlaw is not None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None
        return res

    #
    # KOD FÖR KORTLAGRUM
    def format_AbbrevLawNormalRef(self, root):
        """Link a reference that names its law by abbreviation.

        The abbreviation is looked up without normalization and is the
        current law while the link is built; afterwards it becomes
        ``self.lastlaw`` (when resolved) and the current law is cleared.
        """
        abbreviation = self.find_node(root, 'LawAbbreviation').text
        self.currentlaw = self.namedlaw_to_sfsid(abbreviation, normalize=False)
        link = self.format_generic_link(root)
        if self.currentlaw is not None:
            self.lastlaw = self.currentlaw
        self.currentlaw = None
        return [link]

    def format_AbbrevLawShortRef(self, root):
        """Link a short "<abbreviation> <chapter>:<section>" reference.

        The law (looked up without normalization), chapter and section
        are current only while the link is built; all three are cleared
        before returning.
        """
        abbreviation_node = root.nodes[0]
        assert(abbreviation_node.tag == 'LawAbbreviation')
        assert(root.nodes[2].tag == 'ShortChapterSectionRef')
        self.currentlaw = self.namedlaw_to_sfsid(
            abbreviation_node.text, normalize=False)

        short_ref = root.nodes[2]
        chapter_node = short_ref.nodes[0]
        assert(chapter_node.tag == 'ShortChapterRefID')
        section_node = short_ref.nodes[2]
        assert(section_node.tag == 'ShortSectionRefID')
        self.currentchapter = chapter_node.text
        self.currentsection = section_node.text

        link = self.format_generic_link(root)

        self.currentchapter = self.currentsection = self.currentlaw = None
        return [link]

    #
    # KOD FÖR FORARBETEN
    def forarbete_format_uri(self, attributes):
        """Build a URI for a reference to preparatory works.

        Each of the keys ``prop``, ``bet``, ``skrivelse`` and ``celex``
        found in ``attributes`` appends its path prefix followed by the
        value, in the dict's iteration order; other keys are ignored. An
        8-character celex value has '19' inserted after its first
        character. A ``sidnr`` entry is appended last as ``#s<sidnr>``.

        :param attributes: dict of reference attributes
        :returns: the URI as a string
        """
        # Path prefix under the base for every attribute we recognise.
        path_prefix = {'prop': 'publ/prop/',
                       'bet': 'publ/bet/',
                       'skrivelse': 'publ/rskr/',
                       'celex': 'ext/eur-lex/'}

        res = 'http://rinfo.lagrummet.se/'
        for key, val in attributes.items():
            if key not in path_prefix:
                continue
            if key == 'celex' and len(val) == 8:
                # incorrectly formatted, uses YY instead of YYYY
                val = val[0] + '19' + val[1:]
            res += "%s%s" % (path_prefix[key], val)
        if 'sidnr' in attributes:
            res += "#s%s" % attributes['sidnr']

        return res

    def format_ChapterSectionRef(self, root):
        """Record the chapter and link the whole node as one reference.

        ``self.currentchapter`` is set from the first grandchild (which
        must be a ChapterRefID node) and is left set on return.

        NOTE(review): a method with this same name is also defined
        earlier in this file; if both live in the same class, this later
        definition is the one that takes effect.
        """
        chapter_node = root.nodes[0].nodes[0]
        assert(chapter_node.tag == 'ChapterRefID')
        self.currentchapter = chapter_node.text.strip()
        return [self.format_generic_link(root)]

    #
    # KOD FÖR EGLAGSTIFTNING
    def eglag_format_uri(self, attributes):
        """Build a CELEX-based URI for a reference to EC legislation.

        Note that ``attributes`` is modified in place: ``akttyp`` is
        filled in from a ``forordning``/``direktiv`` key when missing,
        and a two-digit ``ar`` is expanded with a leading '19'.

        Background on how CELEX numbers are constructed:
        https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm
        https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm
        and on how links to EUR-Lex should look:
        http://eur-lex.europa.eu/sv/tools/help_syntax.htm

        :param attributes: dict of reference attributes
        :returns: the URI, or ``None`` for a relative reference when the
            current base URI does not start with the CELEX prefix
        :raises AttributeError: when the type of act cannot be determined
        """
        res = 'http://rinfo.lagrummet.se/ext/celex/'
        if 'akttyp' not in attributes:
            # Infer the type of act from whichever marker key is present.
            for marker, akttyp in (('forordning', 'förordning'),
                                   ('direktiv', 'direktiv')):
                if marker in attributes:
                    attributes['akttyp'] = akttyp
                    break

        if 'akttyp' not in attributes:
            raise AttributeError("Akttyp saknas")

        # Absolute URI?
        if 'ar' in attributes and 'lopnummer' in attributes:
            sektor = '3'
            rattslig_form = {'direktiv': 'L',
                             'förordning': 'R'}

            if len(attributes['ar']) == 2:
                attributes['ar'] = '19' + attributes['ar']
            res += "%s%s%s%04d" % (sektor, attributes['ar'],
                                   rattslig_form[attributes['akttyp']],
                                   int(attributes['lopnummer']))
        elif not self.baseuri_attributes['baseuri'].startswith(res):
            # FIXME: should we warn about this? (Relative reference, but
            # the base context is not a celex context.)
            return None

        if 'artikel' in attributes:
            res += "#%s" % attributes['artikel']
            if 'underartikel' in attributes:
                res += ".%s" % attributes['underartikel']

        return res

    #
    # KOD FÖR RATTSFALL
    def rattsfall_format_uri(self, attributes):
        """Build a URI for a reference to a court case.

        Note that ``attributes`` is modified in place: ``domstol`` is
        copied from ``nja`` when that key is present, and a ``lopnr`` of
        the form ``<year>:<number>`` is split into ``ar`` and ``lopnr``.

        NJA cases get ``<ar>s<sidnr>`` appended to the court's path; all
        other courts get ``<ar>:<lopnr>``.

        :param attributes: dict of reference attributes
        :returns: the URI as a string
        :raises AssertionError: when no court is given or it is unknown
        :raises KeyError: when a required attribute is missing
        """
        # List derived from containers.n3/rattsfallsforteckningar.n3 in
        # the rinfo project's source code - a more ambitious solution
        # would be to read the actual N3 files into an rdflib graph.
        containerid = {'NJA': '/publ/rattsfall/nja/',
                       'RH': '/publ/rattsfall/rh/',
                       'MÖD': '/publ/rattsfall/mod/',
                       'RÅ': '/publ/rattsfall/ra/',
                       'RK': '/publ/rattsfall/rk/',
                       'MIG': '/publ/rattsfall/mig/',
                       'AD': '/publ/rattsfall/ad/',
                       'MD': '/publ/rattsfall/md/',
                       'FÖD': '/publ/rattsfall/fod/'}

        if 'nja' in attributes:
            attributes['domstol'] = attributes['nja']

        assert 'domstol' in attributes, "No court provided"
        assert attributes[
            'domstol'] in containerid, "%s is an unknown court" % attributes['domstol']
        res = "http://rinfo.lagrummet.se" + containerid[attributes['domstol']]

        if 'lopnr' in attributes and ":" in attributes['lopnr']:
            # Split "<year>:<number>". This used to call ``lopnr.split``
            # on a name that does not exist, raising NameError.
            (attributes['ar'], attributes['lopnr']) = \
                attributes['lopnr'].split(":", 1)

        if attributes['domstol'] == 'NJA':
            # FIXME: URIs should be based on publikationsordinal, not
            # pagenumber (which this in effect is) - but this requires
            # a big lookup table/database/graph with
            # pagenumber-to-ordinal-mappings
            res += '%ss%s' % (attributes['ar'], attributes['sidnr'])
        else:
            res += '%s:%s' % (attributes['ar'], attributes['lopnr'])

        return res

    #
    # KOD FÖR EGRÄTTSFALL
    def egrattsfall_format_uri(self, attributes):
        """Build a CELEX-style URI for a reference to an EC court case.

        A two-digit year below 54 is taken to be 20YY, any other
        two-digit year 19YY; longer years are used unchanged. The serial
        number is zero-padded to four digits.

        :param attributes: dict with ``year``, ``serial`` and
            ``decision`` ('C', 'T' or 'F')
        :returns: the URI as a string
        """
        descriptormap = {'C': 'J',  # Judgment of the Court
                         'T': 'A',  # Judgment of the Court of First Instance
                         'F': 'W',  # Judgement of the Civil Service Tribunal
                         }
        # FIXME: Change this before the year 2054 (as ECJ will
        # hopefully have fixed their case numbering by then)
        year = attributes['year']
        if len(year) == 2:
            century = "20" if int(year) < 54 else "19"
            year = century + year

        serial = '%04d' % int(attributes['serial'])
        descriptor = descriptormap[attributes['decision']]
        return "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial)

Beispiel #9

Datei anzeigen

Datei: Reference.py Projekt: Sup3rgnu/lawParse

class Reference:
	"""Parses text for legal references; parse() returns a list of plain strings and Link objects."""
	# Reference types. Pass any of these to __init__ to select the grammars to load.
	LAGRUM 			= 1
	KORTALAGRUM 	= 2  # accepted by __init__, which so far does nothing for it
	FORESKRIFTER 	= 3
	FORARBETEN 		= 6

	# Splits a base URI into its prefix (group 1), law id (group 2) and the
	# optional K/P/S/N fragment parts (groups 6, 8, 10 and 12), as read by parse().
	reUriSegments 		= re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
	# Used by parse() before tagging: compound law names get their spaces replaced
	# by underscores, and a '|' is inserted in front of the listed word suffixes.
	reEscapeCompound 	= re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
	reEscapeNamed 		= re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)

	# Used by parse() after tagging to undo the two escapes above.
	reDescapeCompound 	= re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
	reDescapeNamed 		= re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
	# Numeric XML character references, which parse() turns back into characters.
	reXmlCharref		= re.compile('&#\d+;')

	def __init__(self, *args):
		"""Load the grammars selected by ``args`` and build the tagger.

		``args`` holds reference type constants (LAGRUM, FORARBETEN, ...).
		The grammar files and the extra n3 data are read from ``etc/`` below
		the current working directory.
		"""
		etcDir = os.getcwd() + '/etc/'

		self.graph = Graph()
		self.graph.load(Util.relpath(etcDir + 'sfs-extra.n3'), format='n3')

		self.roots = []
		self.uriFormatter = {}
		self.decl = ''
		self.namedLaws = {}
		self.loadEbnf(etcDir + 'base.ebnf')
		self.args = args

		if self.LAGRUM in args:
			for production in self.loadEbnf(etcDir + 'lagrum.ebnf'):
				self.uriFormatter[production] = self.sfsFormatUri
			self.namedLaws.update(self.getRelationship(RDFS.label))
			self.roots.extend(['sfsrefs', 'sfsref'])

		# TODO: Fix korta lagrum also (KORTALAGRUM currently has no effect)

		if self.FORARBETEN in args:
			for production in self.loadEbnf(etcDir + 'forarbeten.ebnf'):
				self.uriFormatter[production] = self.forarbeteFormatUri
			self.roots.append('forarbeteref')

		# The root production tries every loaded reference type and falls
		# back to plain text.
		self.decl += 'root ::= (%s/plain)+\n' % '/'.join(self.roots)
		self.parser = Parser(self.decl, 'root')
		self.tagger = self.parser.buildTagger('root')
		self.depth = 0

		# SFS specific settings
		self.lastLaw = None
		self.currentNamedLaws = {}
		self.currentLaw = None
		self.currentChapter = None
		self.currentSection = None
		self.currentPiece = None

	def loadEbnf(self, file):
		"""Loads the syntax from a given EBNF file"""
		f = open(file)
		syntax = f.read()
		self.decl += syntax
		f.close()
		return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', syntax)]

	def getRelationship(self, predicate):
		d = {}
		for obj, subj in self.graph.subject_objects(predicate):
			d[unicode(subj)] = unicode(obj)
		return d

	def parse(self, indata, baseUri='http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9',predicate=None):
		"""Find the legal references in ``indata``.

		Returns a list where recognised references are Link (or LinkSubject)
		objects and everything else is text; adjacent text pieces are joined.
		An empty ``indata`` is returned unchanged.

		baseUri   -- URI that incomplete references are resolved against
		predicate -- stored on the instance; when set, links are created as
		             LinkSubject objects carrying this predicate

		Raises ParseError when the tagger did not consume the whole input.
		"""
		if indata == '':
			return indata
		self.predicate = predicate
		self.baseUri = baseUri
		if baseUri:
			m = self.reUriSegments.match(baseUri)
			if m:
				self.baseUriAttrs = {'baseUri'	: m.group(1),
									 'law'		: m.group(2),
									 'chapter'	: m.group(6),
									 'section'	: m.group(8),
									 'piece'	: m.group(10),
									 'item'		: m.group(12)}
			else:
				self.baseUriAttrs = {'baseUri':baseUri}
		else:
			self.baseUriAttrs = {}

		fixedIndata = unicode(indata)

		# Law names are only escaped (and later unescaped) when SFS
		# references were requested.
		escapeLaws = self.LAGRUM in self.args
		if escapeLaws:
			fixedIndata = self.reEscapeCompound.sub(r'\1_\2_\3\4', fixedIndata)
			fixedIndata = self.reEscapeNamed.sub(r'|\1', fixedIndata)

		# The tagger is fed a byte string; characters outside SP_CHARSET
		# become XML character references that are decoded again below.
		if isinstance(fixedIndata, unicode):
			fixedIndata = fixedIndata.encode(SP_CHARSET, 'xmlcharrefreplace')

		tagList = tag(fixedIndata, self.tagger,0,len(fixedIndata))
		res = []
		root = NodeTree(tagList, fixedIndata)
		for n in root.nodes:
			if n.tag in self.roots:
				self.clearState()
				res.extend(self.formatterDispatch(n))
			else:
				assert n.tag == 'plain', 'Tag is %s' % n.tag
				res.append(n.text)

			# Remember the law of this reference for "same law" references.
			if self.currentLaw is not None:
				self.lastLaw = self.currentLaw
			self.currentLaw = None

		if tagList[-1] != len(fixedIndata):
			#TODO: Add Error
			raise ParseError('Parsed %s chars of %s (...%s...)' % (tagList[-1], len(indata), indata[(tagList[-1]-2):tagList[-1]+3]))

		# Normalize the result, concat and remove '|'
		result = []
		for item in res:
			# Without LAGRUM nothing was escaped, so such items are passed
			# through untouched (this used to hit an unbound local variable).
			if escapeLaws and self.reDescapeNamed.search(item):
				text = self.reDescapeNamed.sub(r'\1', item)
				text = self.reDescapeCompound.sub(r'\1 \2 \3\4', text)
				if not isinstance(item, Link):
					node = text
				elif hasattr(item, 'predicate'):
					# A Link obj is immutable so we have to
					# create a new and copy its attrs
					node = LinkSubject(text, predicate=item.predicate, uri=item.uri)
				else:
					node = Link(text, uri=item.uri)
			else:
				node = item

			if (result
				and not isinstance(result[-1], Link)
				and not isinstance(node, Link)):
				result[-1] += node
			else:
				result.append(node)

		# Decode the character references in the text pieces; links are left as is.
		for i, node in enumerate(result):
			if not isinstance(node, Link):
				result[i] = self.reXmlCharref.sub(self.unescapeXmlCharref, node)

		return result

	def unescapeXmlCharref(self, m):
		return unichr(int(m.group(0)[2:-1]))

	def findAttrs(self, parts, extra={}):
		"""Creates a dict of attributes through a tree"""
		d = {}
		self.depth += 1
		if extra:
			d.update(extra)
		for part in parts:
			currentPartTag = part.tag.lower()
			if currentPartTag.endswith('refid'):
				if ((currentPartTag == 'singelsectionrefid') or 
					(currentPartTag == 'lastsectionrefid')):
					currentPartTag = 'sectionrefid'
				d[currentPartTag[:-5]] = part.text.strip()

			if part.nodes:
				d.update(self.findAttrs(part.nodes, d))

		self.depth -= 1

		if self.currentLaw and 'law' not in d:
			d['law'] = self.currentLaw
		if self.currentChapter and 'chapter' not in d:
			d['chapter'] = self.currentChapter
		if self.currentSection and 'section' not in d:
			d['section'] = self.currentSection
		if self.currentPiece and 'piece' not in d:
			d['piece'] = self.currentPiece

		return d

	def findNode(self, root, nodeTag):
		"""Returns the first node in the tree that has a matching tag, dfs."""
		if root.tag == nodeTag:
			return root
		else:
			for node in root.nodes:
				x = self.findNode(node, nodeTag)
				if x != None:
					return x
			return None

	def findNodes(self, root, nodeTag):
		if root.tag == nodeTag:
			return [root]
		else:
			res = []
			for node in root.nodes:
				res.extend(self.findNodes(node, nodeTag))
			return res

	def formatterDispatch(self, part):
		self.depth += 1
		if 'format_' + part.tag in dir(self):
			formatter = getattr(self,'format_'+part.tag)
			res = formatter(part)
			assert res != None, 'Custom formatter for %s didnt return anythin' % part.tag
		else:
			res = self.formatTokentree(part)

		self.depth -= 1
		return res

	def formatTokentree(self, part):
		res = []
		if (not part.nodes) and (not part.tag.endswith('RefID')):
			res.append(part.text)
		else:
			if part.tag.endswith('RefID'):
				res.append(self.formatGenericLink(part))
			elif part.tag.endswith('Ref'):
				res.append(self.formatGenericLink(part))
			else:
				for p in part.nodes:
					res.extend(self.formatterDispatch(p))
		return res

	def formatGenericLink(self, part, uriFormatter=None):
		try:
			uri = self.uriFormatter[part.tag](self.findAttrs([part]))			
		except KeyError:
			if uriFormatter:
				uri = uriFormatter(self.findAttrs([part]))	
			else:
				uri = self.sfsFormatUri(self.findAttrs([part]))
		except AttributeError:
			return part.text
		except:
			exc = sys.exc_info()
			return part.text

		if not uri:
			return part.text
		elif self.predicate:
			return LinkSubject(part.text, uri=uri, predicate=self.predicate)
		else:
			return Link(part.text, uri=uri)

	def formatCustomLink(self, attrs, text, production):
		try:
			uri = self.uriFormatter[production](attrs)
		except KeyError:
			uri = self.sfsFormatUri(attrs)

		if not uri:
			return part.text
		elif self.predicate:
			return LinkSubject(text, uri=uri, predicate=self.predicate)
		else:
			return Link(text, uri=uri)

	def clearState(self):
		self.currentLaw 	= None
		self.currentChapter	= None
		self.currentSection	= None
		self.currentPiece	= None

	def normalizeSfsId(self, sfsId):
		"""Replace the dot between ``NNNN:NNN`` and a following digit with a space."""
		return re.sub(r'(\d+:\d+)\.(\d)', r'\1 \2', sfsId)

	def normalizeLawName(self, lawName):
		"""Return the lookup form of a law name.

		Removes '|', turns '_' into spaces, lower-cases the name and drops a
		single trailing 's'.
		"""
		cleaned = lawName.replace('|', '').replace('_', ' ').lower()
		if not cleaned.endswith('s'):
			return cleaned
		return cleaned[:-1]

	def namedLawToSfsid(self, text, normalize=True):
		if normalize:
			text = self.normalizeLawName(text)

		noLaw = [
			u'aktieslagen',
			u'anordningen',
			u'anordningen',
			u'anslagen',
			u'arbetsordningen',
			u'associationsformen',
			u'avfallsslagen',
			u'avslagen',
			u'avvittringsutslagen',
			u'bergslagen',
			u'beskattningsunderlagen',
			u'bolagen',
			u'bolagsordningen',
			u'bolagsordningen',
			u'dagordningen',
			u'djurslagen',
			u'dotterbolagen',
			u'emballagen',
			u'energislagen',
			u'ersättningsformen',
			u'ersättningsslagen',
			u'examensordningen',
			u'finansbolagen',
			u'finansieringsformen',
			u'fissionsvederlagen',
			u'flygbolagen',
			u'fondbolagen',
			u'förbundsordningen',
			u'föreslagen',
			u'företrädesordningen',
			u'förhandlingsordningen',
			u'förlagen',
			u'förmånsrättsordningen',
			u'förmögenhetsordningen',
			u'förordningen',
			u'förslagen',
			u'försäkringsaktiebolagen',
			u'försäkringsbolagen',
			u'gravanordningen',
			u'grundlagen',
			u'handelsplattformen',
			u'handläggningsordningen',
			u'inkomstslagen',
			u'inköpssamordningen',
			u'kapitalunderlagen',
			u'klockslagen',
			u'kopplingsanordningen',
			u'låneformen',
			u'mervärdesskatteordningen',
			u'nummerordningen',
			u'omslagen',
			u'ordalagen',
			u'pensionsordningen',
			u'renhållningsordningen',
			u'representationsreformen',
			u'rättegångordningen',
			u'rättegångsordningen',
			u'rättsordningen',
			u'samordningen',
			u'samordningen',
			u'skatteordningen',
			u'skatteslagen',
			u'skatteunderlagen',
			u'skolformen',
			u'skyddsanordningen',
			u'slagen',
			u'solvärmeanordningen',
			u'storslagen',
			u'studieformen',
			u'stödformen',
			u'stödordningen',
			u'stödordningen',
			u'säkerhetsanordningen',
			u'talarordningen',
			u'tillslagen',
			u'tivolianordningen',
			u'trafikslagen',
			u'transportanordningen',
			u'transportslagen',
			u'trädslagen',
			u'turordningen',
			u'underlagen',
			u'uniformen',
			u'uppställningsformen',
			u'utvecklingsbolagen',
			u'varuslagen',
			u'verksamhetsformen',
			u'vevanordningen',
			u'vårdformen',
			u'ägoanordningen',
			u'ägoslagen',
			u'ärendeslagen',
			u'åtgärdsförslagen']
			
		if text in noLaw:
			return None
		if self.currentNamedLaws.has_key(text):
			return self.currentNamedLaws[text]
		elif self.namedLaws.has_key(text):
			return self.namedLaws[text]
		else:
			return None

	def sfsFormatUri(self, attrs):
		pieceMap = {u'första'	:'1',
				  u'andra'	:'2',
				  u'tredje'	:'3',
				  u'fjärde'	:'4',
				  u'femte'	:'5',
				  u'sjätte'	:'6',
				  u'sjunde'	:'7',
				  u'åttonde':'8',
				  u'nionde'	:'9'}
		
		keyMap = {u'lawref'	:'L',
				  u'chapter':'K',
				  u'section':'P',
				  u'piece'	:'S',
				  u'item'	:'N',
				  u'itemnumeric': 'N',
				  u'element':'O',
				  u'sentence': 'M'}
		
		attrOrder = ['law', 'lawref', 'chapter', 'section', 'element', 'piece', 'item', 'itemnumeric', 'sentence']

		if 'law' in attrs:
			if attrs['law'].startswith('http://'):
				res = ''
			else:
				res = 'http://rinfo.lagrummet.se/publ/sfs/'
		else:
			if 'baseUri' in self.baseUriAttrs:
				res = self.baseUriAttrs['baseUri']
			else:
				res = ''

		resolveBase = True
		addFragment = False
		justInCase 	= None

		for key in attrOrder:
			if attrs.has_key(key):
				resolveBase = False
				val = attrs[key]
			elif (resolveBase and self.baseUriAttrs.has_key(key)):
				val = self.baseUriAttrs[key]
			else:
				val = None

			if val:
				if addFragment:
					res += '#'
					addFragment = False
				if (key in ['piece', 'itemnumeric', 'sentence'] and val in pieceMap):
					res += '%s%s' % (keyMap[key], pieceMap[val.lower()])
				else:
					if key == 'law':
						val = self.normalizeSfsId(val)
						val = val.replace(' ', '_')
						res += val
						addFragment = True
					else:
						if justInCase:
							res += justInCase
							justInCase = None
						val = val.replace(' ', '')
						val = val.replace('\n', '')
						val = val.replace('\r', '')
						res += '%s%s' % (keyMap[key], val)
			else:
				if key == 'piece':
					justInCase = 'S1'
		return res								

	def format_SFSNr(self, root):
		if self.baseUri == None:
			sfsId = self.findNode(root, 'LawRefID').data
			self.baseUriAttrs = {'baseUri':'http://rinfo.lagrummet.se/publ/sfs/'+sfsId+'#'}
		return self.formatTokentree(root)

	def format_ChangeRef(self, root):
		"""Link the whole change reference to the law named by its LawRefID node."""
		lawId = self.findNode(root, 'LawRefID').data
		linkAttrs = {'lawref': lawId}
		return [self.formatCustomLink(linkAttrs, root.text, root.tag)]

	def format_NamedExternalLawRef(self, root):
		resetCurrentLaw = False
		if self.currentLaw == None:
			resetCurrentLaw = True
			lawRefIdNode = self.findNode(root, 'LawRefID')
			if lawRefIdNode == None:
				self.currentLaw = self.namedLawToSfsid(root.text)
			else:
				self.currentLaw = lawRefIdNode.text
				namedLaw = self.normalizeLawName(self.findNode(root, 'NamedLaw').text)
				self.currentNamedLaws[namedLaw] = self.currentLaw

		if self.currentLaw == None:
			res = [root.text]
		else:
			res = [self.formatGenericLink(root)]

		if self.baseUri == None and self.currentLaw != None:
			m = self.reUriSegments.match(self.currentLaw)
			if m:
				self.baseUriAttrs = {'baseUri' : m.group(1),
									 'law': m.group(2),
									 'chapter': m.group(6),
									 'section': m.group(8),
									 'piece': m.group(10),
									 'item': m.group(12)} 
			else:
				self.baseUriAttrs = {'baseUri': 'http://rinfo.lagrummet.se/publ/sfs/' + self.currentLaw + '#'}
		if resetCurrentLaw:
			if self.currentLaw != None:
				self.lastLaw = self.currentLaw
			self.currentLaw = None

		return res

	def format_ChapterSectionRef(self, root):
		assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
		self.currentChapter = root.nodes[0].nodes[0].text.strip()
		return [self.formatGenericLink(root)]

	def format_ChapterSectionPieceRefs(self, root):
		assert(root.nodes[0].nodes[0].tag == 'ChapterRefID')
		self.currentChapter = root.nodes[0].nodes[0].text.strip()
		res = []
		for node in root.nodes:
			res.extend(self.formatterDispatch(node))

		return res 

	def format_AlternativeChapterSectionRefs(self, root):
		"""Return the matched text unlinked.

		TODO: Implement me. Formatters must return a list (formatterDispatch
		asserts on None), so until links are built here the text of the
		reference is passed through unchanged.
		"""
		return [root.text]

	def format_LastSectionRef(self, root):
		# We want the ending double section mark to be 
		# a part of the link
		assert(root.tag == 'LastSectionRef')
		assert(len(root.nodes) == 3)
		sectionRefId = root.nodes[0]
		sectionId = sectionRefId.text

		return [self.formatGenericLink(root)]

	def format_SectionPieceRefs(self, root):
		"""Format a section followed by one or more piece references.

		The section and the first piece (children 0 and 2) are joined into one
		link; the children from index 3 on are formatted on their own while
		the section is still current.
		"""
		assert(root.tag == 'SectionPieceRefs')
		sectionNode = root.nodes[0]
		self.currentSection = sectionNode.nodes[0].text.strip()

		firstPiece = root.nodes[2]
		linkText = '%s %s' % (sectionNode.text, firstPiece.text)
		pieces = [self.formatCustomLink(self.findAttrs([firstPiece]), linkText, root.tag)]
		for trailing in root.nodes[3:]:
			pieces += self.formatterDispatch(trailing)

		self.currentSection = None
		return pieces

	def format_SectionPieceItemRefs(self, root):
		"""Format a section and piece followed by item references.

		The section and the piece (children 0 and 2) are joined into one link;
		the children from index 3 on are formatted on their own while both
		the section and the piece are still current.
		"""
		assert(root.tag == 'SectionPieceItemRefs')
		sectionNode = root.nodes[0]
		pieceNode = root.nodes[2]
		self.currentSection = sectionNode.nodes[0].text.strip()
		self.currentPiece = pieceNode.nodes[0].text.strip()

		linkText = '%s %s' % (sectionNode.text, pieceNode.text)
		pieces = [self.formatCustomLink(self.findAttrs([pieceNode]), linkText, root.tag)]
		for trailing in root.nodes[3:]:
			pieces += self.formatterDispatch(trailing)

		self.currentSection = None
		self.currentPiece = None
		return pieces

	def format_SectionItemRefs(self, root):
		assert(root.nodes[0].nodes[0].tag == 'SectionRefID')
		self.currentSection = root.nodes[0].nodes[0].text.strip()
		res = self.formatTokentree(root)
		self.currentSection = None
		
		return res

	def format_PieceItemRefs(self, root):
		"""Format a piece followed by one or more item references.

		The piece (child 0) and the first item (first grandchild of child 2)
		are joined into one link; the remaining grandchildren are formatted on
		their own while the piece is still current.
		"""
		pieceNode = root.nodes[0]
		self.currentPiece = pieceNode.nodes[0].text.strip()

		itemNodes = root.nodes[2].nodes
		firstItem = itemNodes[0]
		linkText = '%s %s' % (pieceNode.text, firstItem.text)
		pieces = [self.formatCustomLink(self.findAttrs([firstItem]), linkText, root.tag)]
		for trailing in itemNodes[1:]:
			pieces += self.formatterDispatch(trailing)

		self.currentPiece = None
		return pieces

	def format_ExternalLaw(self, root):
		self.currentChapter = None
		return self.formatterDispatch(root.nodes[0])

	def format_ExternalRefs(self, root):
		# Special case for things like '17-29 och 32 §§ i lagen
		# (2004:575)' by picking the LawRefID and store it in 
		# currentLaw do findAttrs will find it.  
		assert(root.tag == 'ExternalRefs')

		lawRefIdNode = self.findNode(root, 'LawRefID')
		if lawRefIdNode == None:
			namedLawNode = self.findNode(root, 'NamedLaw')
			if namedLawNode == None:
				sameLawNode = self.findNode(root, 'SameLaw')
				assert(sameLawNode != None)
				self.currentLaw = self.lastLaw
			else:
				self.currentLaw = self.namedLawToSfsid(namedLawNode.text)
				if self.currentLaw == None:
					# Unknown law name, return
					return [root.text]
		else:
			self.currentLaw = lawRefIdNode.text
			if self.findNode(root, 'NamedLaw'):
				namedLaw = self.normalizeLawName(self.findNode(root, 'NamedLaw').text)
				self.currentNamedLaws[namedLaw] = self.currentLaw

		if self.lastLaw is None:
			self.lastLaw = self.currentLaw

		if (len(self.findNodes(root, 'GenericRefs')) == 1 and 
			len(self.findNodes(root, 'SectionRefID')) == 1 and
			len(self.findNodes(root, 'AnonymousExternalLaw')) == 0):
			res = [self.formatGenericLink(root)]
		else:
			res = self.formatTokentree(root)

		return res

	def forarbeteFormatUri(self, attrs):
		"""Build the URI of a reference to preparatory works.

		attrs -- dict that may hold the keys prop, bet, skrivelse and celex
		         (each adds a path to the URI) and sidnr (adds a ``#s<nr>``
		         fragment); other keys are ignored

		Returns the URI as a string.
		"""
		res = 'http://rinfo.lagrummet.se/'
		# A fixed order makes the result independent of the dict's own order.
		pathFormats = (('prop', 'publ/prop/%s'),
					   ('bet', 'ext/bet/%s'),
					   ('skrivelse', 'ext/rskr/%s'),
					   ('celex', 'ext/celex/%s'))
		for key, pathFormat in pathFormats:
			if key not in attrs:
				continue
			val = attrs[key]
			if key == 'celex' and len(val) == 8:
				# Eight-character numbers get '19' inserted after the first character.
				val = val[0] + '19' + val[1:]
			res += pathFormat % val
		if 'sidnr' in attrs:
			res += '#s%s' % attrs['sidnr']

		return res

Beispiel #10

Datei anzeigen

class LegalRef:
    # Reference types; pass any of them to __init__ to select the grammars.
    #
    # Perhaps these should be 1, 2, 4, 8 etc., so that the caller can ask
    # for LAGRUM | FORESKRIFTER, and so that we can define collections of
    # common combinations (e.g. ALL_LAGSTIFTNING = LAGRUM |
    # KORTLAGRUM | FORESKRIFTER | EGLAGSTIFTNING)
    LAGRUM = 1             # references to statute provisions in SFS
    KORTLAGRUM = 2         # SFS references in short form
    FORESKRIFTER = 3       # references to agencies' statute collections
    EGLAGSTIFTNING = 4     # EC treaties, regulations and directives
    INTLLAGSTIFTNING = 5   # treaties, conventions etc
    FORARBETEN = 6         # government bills, committee reports, etc
    RATTSFALL = 7          # case law from Swedish courts
    MYNDIGHETSBESLUT = 8   # agency decisions (JO, ARN, DI...)
    EGRATTSFALL = 9        # case law from the ECJ / Court of First Instance
    INTLRATTSFALL = 10     # European Court of Human Rights

    
    # Splits a base URI into its prefix (group 1), law id (group 2) and the
    # optional K/P/S/N fragment parts (groups 6, 8, 10 and 12), as read by parse().
    # re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\. |N|)?\d+( s\.\d+|))#?(K(\d+)|)(P(\d+)|)(S(\d+)|)(N(\d+)|)')
    re_urisegments = re.compile(r'([\w]+://[^/]+/[^\d]*)(\d+:(bih\.[_ ]|N|)?\d+([_ ]s\.\d+|))#?(K([a-z0-9]+)|)(P([a-z0-9]+)|)(S(\d+)|)(N(\d+)|)')
    # Used by parse() before tagging: compound law names get their spaces
    # replaced by underscores, and a '|' is put in front of the listed suffixes.
    re_escape_compound = re.compile(r'\b(\w+-) (och) (\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_escape_named = re.compile(r'\B(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)\b', re.UNICODE)

    # Used by parse() after tagging to undo the two escapes above.
    re_descape_compound = re.compile(r'\b(\w+-)_(och)_(\w+-?)(lagen|förordningen)\b', re.UNICODE)
    re_descape_named = re.compile(r'\|(lagens?|balkens?|förordningens?|formens?|ordningens?|kungörelsens?|stadgans?)')
    # Numeric XML character references, which parse() turns back into characters.
    re_xmlcharref = re.compile("&#\d+;")
    def __init__(self,*args):
        """Load the grammars selected by ``args`` and build the tagger.

        ``args`` holds reference type constants (LAGRUM, KORTLAGRUM, ...).
        Grammar files and the extra n3 data are read from the ``etc``
        directory next to this module, or below the current working
        directory when ``__file__`` has no directory part.
        """
        if os.path.sep not in __file__:
            scriptdir = os.getcwd()
        else:
            scriptdir = os.path.dirname(__file__)

        self.graph = Graph()
        n3file = Util.relpath(scriptdir + "/etc/sfs-extra.n3")
        self.graph.load(n3file, format="n3")
        self.roots = []
        self.uriformatter = {}
        self.decl = ""
        self.namedlaws = {}
        self.load_ebnf(scriptdir+"/etc/base.ebnf")

        self.args = args
        if self.LAGRUM in args:
            productions = self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            self.namedlaws.update(self.get_relations(RDFS.label))
            self.roots.append("sfsrefs")
            self.roots.append("sfsref")

        if self.KORTLAGRUM in args:
            # If lagrum.ebnf has not been loaded yet we must do it now,
            # since kortlagrum.ebnf depends on productions defined there
            if self.LAGRUM not in args:
                self.load_ebnf(scriptdir+"/etc/lagrum.ebnf")

            productions = self.load_ebnf(scriptdir+"/etc/kortlagrum.ebnf")
            for p in productions:
                self.uriformatter[p] = self.sfs_format_uri
            DCT = Namespace("http://purl.org/dc/terms/")
            d = self.get_relations(DCT['alternate'])
            self.namedlaws.update(d)
            lawlist = [x.encode(SP_CHARSET) for x in d.keys()]
            # Make sure longer law abbreviations come before shorter
            # ones (so that we don't mistake "3 § MBL" for "3 § MB"+"L").
            # The sort is stable, so equally long names keep their order.
            lawlist.sort(key=len, reverse=True)
            self.decl += "LawAbbreviation ::= ('%s')\n" % "'/'".join(lawlist)
            self.roots.insert(0,"kortlagrumref")

        if self.EGLAGSTIFTNING in args:
            productions = self.load_ebnf(scriptdir+"/etc/eglag.ebnf")
            for p in productions:
                self.uriformatter[p] = self.eglag_format_uri
            self.roots.append("eglagref")
        if self.FORARBETEN in args:
            productions = self.load_ebnf(scriptdir+"/etc/forarbeten.ebnf")
            for p in productions:
                self.uriformatter[p] = self.forarbete_format_uri
            self.roots.append("forarbeteref")
        if self.RATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/rattsfall.ebnf")
            for p in productions:
                self.uriformatter[p] = self.rattsfall_format_uri
            self.roots.append("rattsfallref")
        if self.EGRATTSFALL in args:
            productions = self.load_ebnf(scriptdir+"/etc/egratt.ebnf")
            for p in productions:
                self.uriformatter[p] = self.egrattsfall_format_uri
            self.roots.append("ecjcaseref")

        # The root production tries every loaded reference type and falls
        # back to plain text.
        self.decl += "root ::= (%s/plain)+\n" % "/".join(self.roots)

        self.parser = Parser(self.decl, "root")
        self.tagger = self.parser.buildTagger("root")
        self.verbose = False
        self.depth = 0

        # SFS-specific state
        self.currentlaw     = None
        self.currentchapter = None
        self.currentsection = None
        self.currentpiece   = None
        self.lastlaw        = None
        self.currentlynamedlaws = {}
        
    def load_ebnf(self,file):
        """Laddar in produktionerna i den angivna filen i den
        EBNF-deklaration som används, samt returnerar alla
        *Ref och *RefId-produktioner"""
        # print "%s: Loading %s" % (id(self), file)
        f = open(file)
        content = f.read()
        self.decl += content
        f.close()
        return [x.group(1) for x in re.finditer(r'(\w+(Ref|RefID))\s*::=', content)]

    def get_relations(self, predicate):
        d = {}
        for obj, subj in self.graph.subject_objects(predicate):
            d[unicode(subj)] = unicode(obj)
        return d


    def parse(self, indata, baseuri="http://rinfo.lagrummet.se/publ/sfs/9999:999#K9P9S9P9",predicate=None):
        """Find the legal references in ``indata``.

        Returns a list where recognised references are Link (or LinkSubject)
        objects and everything else is text; adjacent text pieces are
        joined. An empty ``indata`` is returned unchanged.

        baseuri   -- URI that incomplete references are resolved against
        predicate -- stored on the instance for the link formatters

        Raises ParseError when the tagger did not consume the whole input.
        """
        if indata == "":
            return indata  # this actually triggered a bug...
        self.predicate = predicate
        self.baseuri = baseuri
        if baseuri:
            m = self.re_urisegments.match(baseuri)
            if m:
                self.baseuri_attributes = {'baseuri':m.group(1),
                                           'law':m.group(2),
                                           'chapter':m.group(6),
                                           'section':m.group(8),
                                           'piece':m.group(10),
                                           'item':m.group(12)}
            else:
                self.baseuri_attributes = {'baseuri':baseuri}
        else:
            self.baseuri_attributes = {}
        # It is hard to make the EBNF grammar recognise arbitrary words
        # that end in a given suffix (e.g. 'bokföringslagen' with the
        # suffix 'lagen'). We therefore preprocess the input string and
        # insert a '|' character before certain suffixes. We also
        # transform 'Radio- och TV-lagen' into 'Radio-_och_TV-lagen'
        #
        # FIXME: Obviously, this shouldn't be done in a general class,
        # but rather in a subclas or via proxy/adapter
        # if we don't do the unicode conversion and pass
        # BeautifulSoup.NavigableString, the later .encode call fails
        # (since it's not a real unicode string)
        fixedindata = unicode(indata)

        escape_laws = self.LAGRUM in self.args
        if escape_laws:
            fixedindata = self.re_escape_compound.sub(r'\1_\2_\3\4', fixedindata)
            fixedindata = self.re_escape_named.sub(r'|\1', fixedindata)

        # SimpleParse has no support for unicode strings, so the input is
        # converted to a byte string. Unfortunately this could not be made
        # to work with UTF8, so XML character references are used instead
        if isinstance(fixedindata,unicode):
            fixedindata = fixedindata.encode(SP_CHARSET,'xmlcharrefreplace')

        # Parse the text with TextTools.tag - not the simplest way of
        # doing it, but following the Simpleparse documentation rebuilds
        # the tagger table on every call to parse()
        if self.verbose:
            print(u"calling tag with '%s'" % (fixedindata.decode(SP_CHARSET)))
        taglist = tag(fixedindata, self.tagger,0,len(fixedindata))
        result = []

        root = NodeTree(taglist,fixedindata)
        for part in root.nodes:
            if part.tag != 'plain' and self.verbose:
                sys.stdout.write(self.prettyprint(part))
            if part.tag in self.roots:
                self.clear_state()
                result.extend(self.formatter_dispatch(part))
            else:
                assert part.tag == 'plain',"Tag is %s" % part.tag
                result.append(part.text)

            # clear state, but remember the law for "same law" references
            if self.currentlaw is not None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None

        if taglist[-1] != len(fixedindata):
            log.error(u'Problem (%d:%d) with %r / %r' % (taglist[-1]-8,taglist[-1]+8,fixedindata,indata))

            raise ParseError("parsed %s chars of %s (...%s...)" % (
                taglist[-1], len(indata), indata[(taglist[-1]-2):taglist[-1]+3]))

        # Normalise the result, i.e. concatenate adjacent text nodes and
        # remove any '|' characters inserted earlier.
        normres = []
        for item in result:
            # Without LAGRUM nothing was escaped, so such items are passed
            # through untouched (this used to hit an unbound local variable).
            if escape_laws and self.re_descape_named.search(item):
                text = self.re_descape_named.sub(r'\1', item)
                text = self.re_descape_compound.sub(r'\1 \2 \3\4', text)
                if not isinstance(item, Link):
                    node = text
                elif hasattr(item, 'predicate'):
                    # Since Link objects are immutable we have to create
                    # a new one and copy its attributes
                    node = LinkSubject(text, predicate=item.predicate,
                                       uri=item.uri)
                else:
                    node = Link(text, uri=item.uri)
            else:
                node = item

            if (normres
                and not isinstance(normres[-1],Link)
                and not isinstance(node,Link)):
                normres[-1] += node
            else:
                normres.append(node)

        # and finally decode the character references in the text pieces;
        # links are dealt with later
        for i, node in enumerate(normres):
            if not isinstance(node, Link):
                normres[i] = self.re_xmlcharref.sub(self.unescape_xmlcharref, node)
        return normres

    def unescape_xmlcharref(self, m):
        # print "Changing %r to a %r" % (m.group(0)[2:-1], unichr(int(m.group(0)[2:-1])))
        return unichr(int(m.group(0)[2:-1]))

    def find_attributes(self,parts,extra={}):
        """recurses through a parse tree and creates a dictionary of
        attributes"""
        d = {}
        
        self.depth += 1
        if self.verbose: print ". "*self.depth+"find_attributes: starting with %s"%d
        if extra:
            d.update(extra)
            
        for part in parts:
            current_part_tag = part.tag.lower()
            if current_part_tag.endswith('refid'):
                if ((current_part_tag == 'singlesectionrefid') or
                    (current_part_tag == 'lastsectionrefid')):
                    current_part_tag = 'sectionrefid'
                d[current_part_tag[:-5]] = part.text.strip()
                if self.verbose: print ". "*self.depth+"find_attributes: d is now %s" % d
                
            if part.nodes:
                d.update(self.find_attributes(part.nodes,d))
        if self.verbose: print ". "*self.depth+"find_attributes: returning %s" % d
        self.depth -= 1

        if self.currentlaw     and 'law' not in d    : d['law']     = self.currentlaw
        if self.currentchapter and 'chapter' not in d: d['chapter'] = self.currentchapter
        if self.currentsection and 'section' not in d: d['section'] = self.currentsection
        if self.currentpiece   and 'piece' not in d  : d['piece']   = self.currentpiece

        return d


    def find_node(self,root,nodetag):
        """Returns the first node in the tree that has a tag matching nodetag. The search is depth-first"""
        if root.tag == nodetag: # base case
            return root
        else:
            for node in root.nodes:
                x = self.find_node(node,nodetag)
                if x != None: return x
            return None

    def find_nodes(self,root,nodetag):
        if root.tag == nodetag:
            return [root]
        else:
            res = []
            for node in root.nodes:
                res.extend(self.find_nodes(node,nodetag))
            return res
                

    def flatten_tokentree(self, part, suffix):
        """Collect, in depth-first order, every node whose tag ends with suffix.

        For example, given this tree and the suffix 'RefID'

           foo->bar->BlahongaRefID
              ->baz->quux->Blahonga2RefID
                         ->Blahonga3RefID
              ->Blahonga4RefID

        the result is
        [BlahongaRefID, Blahonga2RefID, Blahonga3RefID, Blahonga4RefID].
        A matching node is included and its children are still searched.
        """
        collected = [part] if part.tag.endswith(suffix) else []
        # An empty or missing child list simply contributes nothing.
        for child in part.nodes or []:
            collected.extend(self.flatten_tokentree(child, suffix))
        return collected

    def formatter_dispatch(self, part):
        """Format one parse-tree node with the most specific formatter available.

        If this object has a method named "format_<tag>" it is called with
        the node; otherwise format_tokentree is used. self.depth (used to
        indent the verbose trace) is raised for the duration of the call and
        is restored even when the formatter raises.

        part -- node object exposing .tag

        Returns whatever the chosen formatter returned.
        Raises AssertionError if a custom formatter returns None.
        """
        self.depth += 1
        try:
            handler_name = "format_" + part.tag
            # Is there a custom-made formatter?
            if handler_name in dir(self):
                formatter = getattr(self, handler_name)
                if self.verbose:
                    print((". " * self.depth) + "formatter_dispatch: format_%s defined, calling it" % part.tag)
                res = formatter(part)
                assert res is not None, "Custom formatter for %s didn't return anything" % part.tag
            else:
                if self.verbose:
                    print((". " * self.depth) + "formatter_dispatch: no format_%s, using format_tokentree" % part.tag)
                res = self.format_tokentree(part)

            if res is None:
                # Diagnostic dump of the offending subtree.
                print((". " * self.depth) + "something wrong with this:\n" + self.prettyprint(part))
        finally:
            # Keep the trace indentation balanced even if a formatter failed.
            self.depth -= 1
        return res
        
    def format_tokentree(self, part):
        """Default formatter: turn a node (sub)tree into a flat result list.

        A childless node whose tag does not end in "RefID" contributes its
        text. A node whose tag ends in "RefID" or "Ref" is turned into a
        single link via format_generic_link. Any other node is expanded by
        dispatching each child through formatter_dispatch.

        For grammar productions like SectionPieceRefs, which contain
        subproductions that also end in RefID, this is not a good function
        to use - use a custom formatter instead.
        """
        if self.verbose:
            print((". " * self.depth) + "format_tokentree: called for %s" % part.tag)

        tag = part.tag
        is_refid = tag.endswith("RefID")
        if not part.nodes and not is_refid:
            # bottom case: plain text token
            res = [part.text]
        elif is_refid or tag.endswith("Ref"):
            res = [self.format_generic_link(part)]
        else:
            res = []
            for subpart in part.nodes:
                if self.verbose and tag == 'LawRef':
                    print((". " * self.depth) + "format_tokentree: part '%s' is a %s" % (subpart.text, subpart.tag))
                res.extend(self.formatter_dispatch(subpart))

        if self.verbose:
            print((". " * self.depth) + "format_tokentree: returning '%s' for %s" % (res, part.tag))
        return res
    

    def prettyprint(self, root, indent=0):
        """Return an indented, multi-line unicode dump of a node tree.

        Each node produces one line of the form  '<tag>': '<text>'  with
        runs of whitespace in the text collapsed to a single space; children
        are indented four more spaces than their parent.

        root   -- node object exposing .tag, .text and .nodes
        indent -- nesting level of root (number of four-space steps)

        A node whose .nodes is None is treated as a leaf and still gets its
        own line.
        """
        line = u"%s'%s': '%s'\n" % ("    " * indent, root.tag, re.sub(r'\s+', ' ', root.text))
        children = root.nodes or ()
        return line + u"".join(self.prettyprint(child, indent + 1) for child in children)


    def format_generic_link(self, part, uriformatter=None):
        """Turn a node into a Link/LinkSubject, or fall back to its plain text.

        The URI is produced by the formatter registered in self.uriformatter
        under the node's tag. If that lookup raises KeyError, the optional
        uriformatter argument is used, and failing that self.sfs_format_uri.
        In every case the formatter receives self.find_attributes([part]).

        part         -- node object exposing .tag and .text
        uriformatter -- optional fallback callable taking the attribute dict

        Returns part.text when no URI could be produced (the formatter
        raised, or returned a false value); otherwise a LinkSubject when
        self.predicate is set, else a Link.
        """
        try:
            uri = self.uriformatter[part.tag](self.find_attributes([part]))
        except KeyError:
            # No formatter registered for this production. Note that this
            # branch is also taken if the registered formatter itself raises
            # KeyError, since the call is inside the same try.
            if uriformatter:
                uri = uriformatter(self.find_attributes([part]))
            else:
                uri = self.sfs_format_uri(self.find_attributes([part]))
        except AttributeError:
            # Normal error from eglag_format_uri
            return part.text
        except Exception:
            # If something else went wrong, just return the plaintext.
            # Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            log.warning("(unknown): Unable to format link for text %s (production %s)" % (part.text, part.tag))
            return part.text

        if self.verbose:
            print((". " * self.depth) + "format_generic_link: uri is %s" % uri)
        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context)
            return part.text
        elif self.predicate:
            return LinkSubject(part.text, uri=uri, predicate=self.predicate)
        else:
            return Link(part.text, uri=uri)
        
    # FIXME: unify this with format_generic_link
    def format_custom_link(self, attributes, text, production):
        """Build a link from an explicit attribute dict and link text.

        attributes -- dict handed to the URI formatter
        text       -- the text the link should carry
        production -- key used to select a formatter from self.uriformatter;
                      on KeyError self.sfs_format_uri is used instead

        Returns text unchanged when the formatter yields no URI; otherwise a
        LinkSubject when self.predicate is set, else a Link.
        """
        # FIXME: unify this with format_generic_link
        try:
            uri = self.uriformatter[production](attributes)
        except KeyError:
            uri = self.sfs_format_uri(attributes)

        if not uri:
            # the formatting function decided not to return a URI for
            # some reason (maybe it was a partial/relative reference
            # without a proper base uri context)
            return text
        elif self.predicate:
            return LinkSubject(text, uri=uri, predicate=self.predicate)
        else:
            return Link(text, uri=uri)


    ################################################################
    # KOD FÖR LAGRUM
    def clear_state(self):
        """Reset the current law/chapter/section/piece context to None."""
        self.currentlaw = self.currentchapter = None
        self.currentsection = self.currentpiece = None

    def normalize_sfsid(self, sfsid):
        """Return sfsid with a dotted suffix rewritten to a space-separated one.

        '1736:0123.2' becomes '1736:0123 2'. The 's. ' spelling
        ('1736:0123 s. 2') is currently left untouched; more advanced
        normalizations may be added later.
        """
        dotted_suffix = r'(\d+:\d+)\.(\d)'
        return re.sub(dotted_suffix, r'\1 \2', sfsid)
        
    def normalize_lawname(self, lawname):
        """Return the lookup form of a law name.

        Removes '|' characters, turns '_' into a space, lower-cases the
        result and strips one trailing 's' if present.
        """
        cleaned = lawname.replace('|', '').replace('_', ' ').lower()
        return cleaned[:-1] if cleaned.endswith('s') else cleaned
        
    def namedlaw_to_sfsid(self, text, normalize=True):
        """Look up the ID registered for a named law.

        text      -- the law name as found in the text
        normalize -- when true, text is first passed through
                     self.normalize_lawname

        Words in the stop list below are never treated as law names.
        Otherwise self.currentlynamedlaws is consulted first, then
        self.namedlaws.

        Returns the registered ID, or None when the word is in the stop list
        or unknown (a warning is logged for unknown names when self.verbose
        is set).
        """
        if normalize:
            text = self.normalize_lawname(text)

        # Words that must never be resolved as law names.
        nolaw = frozenset((
            u'aktieslagen',
            u'anordningen',
            u'anslagen',
            u'arbetsordningen',
            u'associationsformen',
            u'avfallsslagen',
            u'avslagen',
            u'avvittringsutslagen',
            u'bergslagen',
            u'beskattningsunderlagen',
            u'bolagen',
            u'bolagsordningen',
            u'dagordningen',
            u'djurslagen',
            u'dotterbolagen',
            u'emballagen',
            u'energislagen',
            u'ersättningsformen',
            u'ersättningsslagen',
            u'examensordningen',
            u'finansbolagen',
            u'finansieringsformen',
            u'fissionsvederlagen',
            u'flygbolagen',
            u'fondbolagen',
            u'förbundsordningen',
            u'föreslagen',
            u'företrädesordningen',
            u'förhandlingsordningen',
            u'förlagen',
            u'förmånsrättsordningen',
            u'förmögenhetsordningen',
            u'förordningen',
            u'förslagen',
            u'försäkringsaktiebolagen',
            u'försäkringsbolagen',
            u'gravanordningen',
            u'grundlagen',
            u'handelsplattformen',
            u'handläggningsordningen',
            u'inkomstslagen',
            u'inköpssamordningen',
            u'kapitalunderlagen',
            u'klockslagen',
            u'kopplingsanordningen',
            u'låneformen',
            u'mervärdesskatteordningen',
            u'nummerordningen',
            u'omslagen',
            u'ordalagen',
            u'pensionsordningen',
            u'renhållningsordningen',
            u'representationsreformen',
            u'rättegångordningen',
            u'rättegångsordningen',
            u'rättsordningen',
            u'samordningen',
            u'skatteordningen',
            u'skatteslagen',
            u'skatteunderlagen',
            u'skolformen',
            u'skyddsanordningen',
            u'slagen',
            u'solvärmeanordningen',
            u'storslagen',
            u'studieformen',
            u'stödformen',
            u'stödordningen',
            u'säkerhetsanordningen',
            u'talarordningen',
            u'tillslagen',
            u'tivolianordningen',
            u'trafikslagen',
            u'transportanordningen',
            u'transportslagen',
            u'trädslagen',
            u'turordningen',
            u'underlagen',
            u'uniformen',
            u'uppställningsformen',
            u'utvecklingsbolagen',
            u'varuslagen',
            u'verksamhetsformen',
            u'vevanordningen',
            u'vårdformen',
            u'ägoanordningen',
            u'ägoslagen',
            u'ärendeslagen',
            u'åtgärdsförslagen',
        ))
        if text in nolaw:
            return None

        # Laws named earlier in the current text take precedence over the
        # general table.
        if text in self.currentlynamedlaws:
            return self.currentlynamedlaws[text]
        if text in self.namedlaws:
            return self.namedlaws[text]

        if self.verbose:
            log.warning("(unknown): I don't know the ID of named law [%s]" % text)
        return None

    def sfs_format_uri(self, attributes):
        """Build a URI string from a dict of reference attributes.

        attributes -- dict that may contain 'law', 'lawref', 'chapter',
                      'section', 'element', 'piece', 'item', 'itemnumeric'
                      and 'sentence'

        When 'law' is given the URI starts with the rinfo SFS prefix (or
        with nothing if the law value already starts with 'http://');
        otherwise it starts with self.baseuri_attributes['baseuri'] if
        present. Attributes are then appended in a fixed order, each with a
        one-letter prefix. Until the first attribute supplied by the caller
        is reached, missing values are filled in from
        self.baseuri_attributes.

        Returns the resulting string (possibly empty).
        """
        piecemappings = {u'första' :'1',
                         u'andra'  :'2',
                         u'tredje' :'3',
                         u'fjärde' :'4',
                         u'femte'  :'5',
                         u'sjätte' :'6',
                         u'sjunde' :'7',
                         u'åttonde':'8',
                         u'nionde' :'9'}
        keymapping = {'lawref'  :'L',
                      'chapter' :'K',
                      'section' :'P',
                      'piece'   :'S',
                      'item'    :'N',
                      'itemnumeric':'N',
                      'element' :'O',
                      'sentence':'M', # is this ever used?
                      }
        attributeorder = ['law', 'lawref', 'chapter', 'section', 'element', 'piece', 'item', 'itemnumeric','sentence']

        if 'law' in attributes:
            if attributes['law'].startswith('http://'):
                res = ''
            else:
                res = 'http://rinfo.lagrummet.se/publ/sfs/'
        else:
            res = self.baseuri_attributes.get('baseuri', '')

        resolvetobase = True   # still allowed to inherit from the base URI context
        addfragment = False    # a '#' is pending after the law id
        justincase = None      # default piece to insert if something follows a missing piece
        for key in attributeorder:
            if key in attributes:
                resolvetobase = False
                val = attributes[key]
            elif resolvetobase and key in self.baseuri_attributes:
                val = self.baseuri_attributes[key]
            else:
                val = None

            if not val:
                if key == 'piece':
                    justincase = "S1"
                continue

            if addfragment:
                res += '#'
                addfragment = False

            # Ordinal words are matched case-insensitively, in line with the
            # lower-cased lookup.
            if key in ('piece', 'itemnumeric', 'sentence') and val.lower() in piecemappings:
                res += '%s%s' % (keymapping[key], piecemappings[val.lower()])
            elif key == 'law':
                val = self.normalize_sfsid(val)
                val = val.replace(" ", "_")
                res += val
                addfragment = True
            else:
                if justincase:
                    res += justincase
                    justincase = None
                val = val.replace(" ", "")
                val = val.replace("\n", "")
                val = val.replace("\r", "")
                res += '%s%s' % (keymapping[key], val)
        return res
        
    def format_ChapterSectionRefs(self, root):
        """Format a ChapterSectionRefs node (ChapterRef, wc, SectionRefs).

        The chapter id is stored in self.currentchapter while the node is
        processed. The ChapterRef child becomes a custom link (including the
        current law when one is set); the two remaining children are
        dispatched normally. self.currentchapter is cleared afterwards.

        Returns the list of formatted parts.
        """
        assert(root.tag == 'ChapterSectionRefs')
        assert(len(root.nodes) == 3) # ChapterRef, wc, SectionRefs

        chapter_ref = root.nodes[0]
        self.currentchapter = chapter_ref.nodes[0].text.strip()

        link_attributes = {'chapter': self.currentchapter}
        if self.currentlaw:
            link_attributes['law'] = self.currentlaw
        res = [self.format_custom_link(link_attributes,
                                       chapter_ref.text,
                                       chapter_ref.tag)]

        for index in (1, 2):
            res.extend(self.formatter_dispatch(root.nodes[index]))
        self.currentchapter = None
        return res

    def format_ChapterSectionPieceRefs(self, root):
        """Format a node whose first grandchild is a ChapterRefID.

        Stores the chapter id in self.currentchapter and dispatches every
        child in order. Note that self.currentchapter is left set when this
        method returns.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()

        res = []
        for child in root.nodes:
            res += self.formatter_dispatch(child)
        return res

    def format_LastSectionRef(self, root):
        """Format a LastSectionRef node as one link covering the whole node.

        The last section ref is a bit different, since we want the ending
        double section mark to be part of the link text.

        Returns a one-element list holding the result of
        format_generic_link(root).
        """
        assert(root.tag == 'LastSectionRef')
        assert(len(root.nodes) == 3) # LastSectionRefID, wc, DoubleSectionMark

        return [self.format_generic_link(root)]


    def format_SectionPieceRefs(self, root):
        """Format a SectionPieceRefs node.

        The section id (first grandchild) is kept in self.currentsection
        while the node is processed. Children 0 and 2 are merged into a
        single custom link whose attributes come from child 2; children from
        index 3 onwards are dispatched normally. self.currentsection is
        cleared afterwards.
        """
        assert(root.tag == 'SectionPieceRefs')
        section_ref = root.nodes[0]
        self.currentsection = section_ref.nodes[0].text.strip()

        piece_ref = root.nodes[2]
        link_attributes = self.find_attributes([piece_ref])
        link_text = "%s %s" % (section_ref.text, piece_ref.text)
        res = [self.format_custom_link(link_attributes, link_text, root.tag)]

        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = None
        return res

    def format_SectionPieceItemRefs(self, root):
        """Format a SectionPieceItemRefs node.

        The section id (from child 0) and piece id (from child 2) are kept in
        self.currentsection / self.currentpiece while the node is processed.
        Children 0 and 2 are merged into a single custom link whose
        attributes come from child 2; children from index 3 onwards are
        dispatched normally. Both state attributes are cleared afterwards.
        """
        assert(root.tag == 'SectionPieceItemRefs')
        section_ref = root.nodes[0]
        self.currentsection = section_ref.nodes[0].text.strip()
        piece_ref = root.nodes[2]
        self.currentpiece = piece_ref.nodes[0].text.strip()

        link_attributes = self.find_attributes([piece_ref])
        link_text = "%s %s" % (section_ref.text, piece_ref.text)
        res = [self.format_custom_link(link_attributes, link_text, root.tag)]

        for trailing in root.nodes[3:]:
            res.extend(self.formatter_dispatch(trailing))

        self.currentsection = None
        self.currentpiece = None
        return res
        

    # This is a special case for things like '17-29 och 32 §§ i lagen
    # (2004:575)', which picks out the LawRefID first and stores it in
    # .currentlaw, so that find_attributes finds it
    # automagically. Although now it seems to be branching out and be
    # all things to all people.
    def format_ExternalRefs(self, root):
        """Format an ExternalRefs node, establishing self.currentlaw first.

        The law is determined, in order of preference, from
          1. a LawRefID node (if a NamedLaw node is also present, the name
             is remembered in self.currentlynamedlaws),
          2. a NamedLaw node resolved through namedlaw_to_sfsid,
          3. a SameLaw node, which reuses self.lastlaw.
        If a NamedLaw cannot be resolved the node's plain text is returned
        in a one-element list. self.lastlaw is set to the current law only
        if it was not already set.

        Returns a one-element list with a single link when the tree holds
        exactly one GenericRefs, exactly one SectionRefID and no
        AnonymousExternalLaw; otherwise the result of format_tokentree.
        """
        assert(root.tag == 'ExternalRefs')

        lawrefid_node = self.find_node(root, 'LawRefID')
        if lawrefid_node is not None:
            self.currentlaw = lawrefid_node.text
            named = self.find_node(root, 'NamedLaw')
            if named:
                # remember which ID this name stands for
                self.currentlynamedlaws[self.normalize_lawname(named.text)] = self.currentlaw
        else:
            # Ok, no explicit LawRefID found, lets see if this is a named
            # law that we have the ID for
            namedlaw_node = self.find_node(root, 'NamedLaw')
            if namedlaw_node is not None:
                self.currentlaw = self.namedlaw_to_sfsid(namedlaw_node.text)
                if self.currentlaw is None:
                    # unknown law name - in this case it's better to bail
                    # out rather than resolving chapter/paragraph references
                    # relative to baseuri (which is almost certainly wrong)
                    return [root.text]
            else:
                # As a last chance, this might be a reference back to a
                # previously mentioned law ("...enligt 4 § samma lag")
                samelaw_node = self.find_node(root, 'SameLaw')
                assert(samelaw_node != None)
                if self.lastlaw is None:
                    log.warning(u"(unknown): found reference to \"{samma,nämnda} {lag,förordning}\", but self.lastlaw is not set")
                self.currentlaw = self.lastlaw

        if self.lastlaw is None:
            self.lastlaw = self.currentlaw

        # if the node tree only contains a single reference, it looks
        # better if the entire expression, not just the chapter/section
        # part, is linked. But not if it's a "anonymous" law
        # ('1 § i lagen (1234:234) om blahonga')
        single_reference = (len(self.find_nodes(root, 'GenericRefs')) == 1 and
                            len(self.find_nodes(root, 'SectionRefID')) == 1 and
                            len(self.find_nodes(root, 'AnonymousExternalLaw')) == 0)
        if single_reference:
            return [self.format_generic_link(root)]
        return self.format_tokentree(root)

    def format_SectionItemRefs(self, root):
        """Format a node whose first grandchild is a SectionRefID.

        The section id is held in self.currentsection while the whole node
        is run through format_tokentree, and cleared afterwards.
        """
        section_id = root.nodes[0].nodes[0]
        assert(section_id.tag == 'SectionRefID')
        self.currentsection = section_id.text.strip()
        formatted = self.format_tokentree(root)
        self.currentsection = None
        return formatted

    def format_PieceItemRefs(self, root):
        """Format a piece reference followed by item references.

        The piece id (first grandchild) is held in self.currentpiece while
        the node is processed. Child 0 and the first child of child 2 are
        merged into one custom link; the remaining children of child 2 are
        dispatched normally. self.currentpiece is cleared afterwards.
        """
        piece_ref = root.nodes[0]
        self.currentpiece = piece_ref.nodes[0].text.strip()

        first_item = root.nodes[2].nodes[0]
        link_attributes = self.find_attributes([first_item])
        link_text = "%s %s" % (piece_ref.text, first_item.text)
        res = [self.format_custom_link(link_attributes, link_text, root.tag)]

        for later_item in root.nodes[2].nodes[1:]:
            res.extend(self.formatter_dispatch(later_item))

        self.currentpiece = None
        return res

    def format_ChapterSectionRef(self, root):
        """Format a chapter+section reference as one link over the whole node.

        Stores the chapter id (first grandchild) in self.currentchapter,
        which is left set on return.

        NOTE(review): a method with this same name and an identical body is
        defined again further down in this class; the later definition is
        the one in effect.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()
        return [self.format_generic_link(root)]

    def format_AlternateChapterSectionRefs(self, root):
        """Format a node whose first grandchild is a ChapterRefID.

        The chapter id is held in self.currentchapter while the whole node
        is run through format_tokentree, and cleared afterwards.
        """
        chapter_id = root.nodes[0].nodes[0]
        assert(chapter_id.tag == 'ChapterRefID')
        self.currentchapter = chapter_id.text.strip()
        formatted = self.format_tokentree(root)
        self.currentchapter = None
        return formatted

        
        
    def format_ExternalLaw(self, root):
        """Clear the current chapter and format the node's first child."""
        self.currentchapter = None
        first_child = root.nodes[0]
        return self.formatter_dispatch(first_child)

    def format_ChangeRef(self, root):
        """Format a ChangeRef node as one custom link keyed on its LawRefID.

        NOTE(review): the id is read from the LawRefID node's .data
        attribute, whereas most other formatters in this class read .text -
        confirm that this is intended.
        """
        lawref_node = self.find_node(root, 'LawRefID')
        link_attributes = {'lawref': lawref_node.data}
        return [self.format_custom_link(link_attributes, root.text, root.tag)]

    def format_SFSNr(self, root):
        """Format an SFSNr node, deriving a base URI from it if none is set.

        When self.baseuri is None, self.baseuri_attributes is replaced by a
        dict holding only a 'baseuri' built from the node's LawRefID. The
        node is then formatted with format_tokentree.
        """
        if self.baseuri is None:
            lawref_node = self.find_node(root, 'LawRefID')
            base = 'http://rinfo.lagrummet.se/publ/sfs/' + lawref_node.data + '#'
            self.baseuri_attributes = {'baseuri': base}
        return self.format_tokentree(root)


    def format_NamedExternalLawRef(self, root):
        """Format a reference to a named (external) law.

        If no current law is set, one is established for the duration of
        this call: from the node's LawRefID when present (the accompanying
        NamedLaw name is then remembered in self.currentlynamedlaws),
        otherwise by resolving the node text with namedlaw_to_sfsid. A law
        established here is moved to self.lastlaw and cleared again before
        returning.

        When self.baseuri is None and a law is known, self.baseuri_attributes
        is replaced: by the segments matched by self.re_urisegments when the
        law matches that pattern, otherwise by a 'baseuri' built from the
        law id.

        Returns a one-element list: the plain node text if the law is
        unknown, else the result of format_generic_link.
        """
        resetcurrentlaw = self.currentlaw is None
        if resetcurrentlaw:
            lawrefid_node = self.find_node(root, 'LawRefID')
            if lawrefid_node is None:
                self.currentlaw = self.namedlaw_to_sfsid(root.text)
            else:
                self.currentlaw = lawrefid_node.text
                namedlaw = self.normalize_lawname(self.find_node(root, 'NamedLaw').text)
                # remember which ID this name stands for
                self.currentlynamedlaws[namedlaw] = self.currentlaw

        if self.currentlaw is None:
            # if we can't find a ID for this law, better not <link> it
            res = [root.text]
        else:
            res = [self.format_generic_link(root)]

        if self.baseuri is None and self.currentlaw is not None:
            # use this as the new baseuri_attributes
            m = self.re_urisegments.match(self.currentlaw)
            if m:
                segment_groups = (('baseuri', 1),
                                  ('law', 2),
                                  ('chapter', 6),
                                  ('section', 8),
                                  ('piece', 10),
                                  ('item', 12))
                self.baseuri_attributes = dict((name, m.group(index))
                                               for name, index in segment_groups)
            else:
                self.baseuri_attributes = {'baseuri':'http://rinfo.lagrummet.se/publ/sfs/'+self.currentlaw+'#'}

        if resetcurrentlaw:
            if self.currentlaw is not None:
                self.lastlaw = self.currentlaw
            self.currentlaw = None
        return res

    ################################################################
    # KOD FÖR KORTLAGRUM
    def format_AbbrevLawNormalRef(self, root):
        """Format a reference that names its law by abbreviation.

        The LawAbbreviation node's text is resolved (without normalization)
        and used as self.currentlaw while the whole node is linked. A
        resolved law is remembered in self.lastlaw; self.currentlaw is
        cleared before returning.
        """
        abbreviation = self.find_node(root, 'LawAbbreviation').text
        self.currentlaw = self.namedlaw_to_sfsid(abbreviation, normalize=False)
        res = [self.format_generic_link(root)]
        if self.currentlaw is not None:
            self.lastlaw = self.currentlaw
        self.currentlaw = None
        return res

    def format_AbbrevLawShortRef(self, root):
        """Format a short-form reference: law abbreviation + chapter:section.

        Child 0 must be a LawAbbreviation and child 2 a
        ShortChapterSectionRef whose children 0 and 2 are the chapter and
        section ids. Law, chapter and section are held in the current-state
        attributes while the whole node is linked, then cleared.
        """
        abbrev_node = root.nodes[0]
        assert(abbrev_node.tag == 'LawAbbreviation')
        shortsection_node = root.nodes[2]
        assert(shortsection_node.tag == 'ShortChapterSectionRef')
        self.currentlaw = self.namedlaw_to_sfsid(abbrev_node.text, normalize=False)

        chapter_id = shortsection_node.nodes[0]
        assert(chapter_id.tag == 'ShortChapterRefID')
        section_id = shortsection_node.nodes[2]
        assert(section_id.tag == 'ShortSectionRefID')
        self.currentchapter = chapter_id.text
        self.currentsection = section_id.text

        res = [self.format_generic_link(root)]

        self.currentlaw = None
        self.currentchapter = None
        self.currentsection = None
        return res

    
    ################################################################
    # KOD FÖR FORARBETEN
    def forarbete_format_uri(self, attributes):
        """Build a rinfo URI for a preparatory-work style reference.

        attributes -- dict; the keys 'prop', 'bet', 'skrivelse' and 'celex'
                      each append a path segment containing their value, and
                      'sidnr' appends a '#s<sidnr>' fragment. Other keys are
                      ignored.

        An eight-character celex value is taken to use a two-digit year and
        gets '19' inserted after its first character.

        Returns the URI string.
        """
        path_templates = {'prop': "publ/prop/%s",
                          'bet': "ext/bet/%s",
                          'skrivelse': "ext/rskr/%s",
                          'celex': "ext/celex/%s"}
        res = 'http://rinfo.lagrummet.se/'

        for key, val in attributes.items():
            if key not in path_templates:
                continue
            if key == 'celex' and len(val) == 8:
                # incorrectly formatted, uses YY instead of YYYY
                val = val[0] + '19' + val[1:]
            res += path_templates[key] % val
        if 'sidnr' in attributes:
            res += "#s%s" % attributes['sidnr']

        return res

    def format_ChapterSectionRef(self, root):
        """Format a chapter+section reference as one link over the whole node.

        Stores the chapter id (first grandchild) in self.currentchapter,
        which is left set on return.

        NOTE(review): this repeats, with an identical body, a method of the
        same name defined earlier in this class and overrides it.
        """
        chapter_ref = root.nodes[0]
        assert(chapter_ref.nodes[0].tag == 'ChapterRefID')
        chapter_text = chapter_ref.nodes[0].text
        self.currentchapter = chapter_text.strip()
        link = self.format_generic_link(root)
        return [link]

    ################################################################
    # KOD FÖR EGLAGSTIFTNING
    def eglag_format_uri(self, attributes):
        """Build a CELEX-based URI for a reference to EC legislation.

        attributes -- dict; recognised keys are 'akttyp' (or, as a
                      substitute, the presence of 'forordning' or
                      'direktiv'), 'ar', 'lopnummer', 'artikel' and
                      'underartikel'. The dict is not modified.

        With both 'ar' and 'lopnummer' an absolute URI is built (a two-digit
        year is prefixed with '19'). Without them the reference is relative:
        None is returned unless self.baseuri_attributes['baseuri'] starts
        with the celex prefix.

        Returns the URI string, or None for an unresolvable relative
        reference.
        Raises AttributeError when the act type cannot be determined.
        """
        # Work on a copy so that the caller's dict is left untouched.
        attributes = dict(attributes)
        res = 'http://rinfo.lagrummet.se/ext/celex/'
        if 'akttyp' not in attributes:
            if 'forordning' in attributes:
                attributes['akttyp'] = u'förordning'
            elif 'direktiv' in attributes:
                attributes['akttyp'] = u'direktiv'

        if 'akttyp' not in attributes:
            raise AttributeError("Akttyp saknas")
        # On how CELEX numbers are constructed:
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celn.htm
        # https://www.infotorg.sema.se/infotorg/itweb/handbook/rb/hlp_celf.htm
        # On what links to EUR-Lex should look like:
        # http://eur-lex.europa.eu/sv/tools/help_syntax.htm
        # Absolute URI?
        if 'ar' in attributes and 'lopnummer' in attributes:
            sektor = '3'
            rattslig_form = {u'direktiv':'L',
                             u'förordning':'R'}

            if len(attributes['ar']) == 2:
                attributes['ar'] = '19'+attributes['ar']
            res += "%s%s%s%04d" % (sektor,attributes['ar'],
                                   rattslig_form[attributes['akttyp']],
                                   int(attributes['lopnummer']))
        else:
            if not self.baseuri_attributes['baseuri'].startswith(res):
                # FIXME: should we warn about this?
                return None
            # NOTE(review): in this relative case the result is built on the
            # bare celex prefix, not on the base URI itself - confirm that
            # this is intended.

        if 'artikel' in attributes:
            res += "#%s" % attributes['artikel']
            if 'underartikel' in attributes:
                res += ".%s" % attributes['underartikel']

        return res


    ################################################################
    # KOD FÖR RATTSFALL
    def rattsfall_format_uri(self, attributes):
        """Build a rinfo URI for a reference to a court case.

        attributes -- dict; 'domstol' (or 'nja', which is used as the court
                      when present) selects the publication, and must be one
                      of the keys of the table below. For NJA the URI ends
                      in '<ar>s<sidnr>'; for all other courts in
                      '<ar>:<lopnr>'. A 'lopnr' of the form 'YYYY:N' is split
                      into 'ar' and 'lopnr'. The dict is not modified.

        Returns the URI string.
        Raises AssertionError when the court is missing or unknown.
        """
        # List derived from containers.n3/rattsfallsforteckningar.n3 in the
        # rinfo project's source code - a more ambitious solution would be
        # to read the actual N3 files into an rdflib graph.
        containerid = {u'NJA': '/publ/rattsfall/nja/',
                       u'RH': '/publ/rattsfall/rh/',
                       u'MÖD': '/publ/rattsfall/mod/',
                       u'RÅ': '/publ/rattsfall/ra/',
                       u'HFD': '/publ/rattsfall/hfd/',
                       u'RK': '/publ/rattsfall/rk/',
                       u'MIG': '/publ/rattsfall/mig/',
                       u'AD': '/publ/rattsfall/ad/',
                       u'MD': '/publ/rattsfall/md/',
                       u'FÖD': '/publ/rattsfall/fod/'}

        # Work on a copy so that the caller's dict is left untouched.
        attributes = dict(attributes)
        if 'nja' in attributes:
            attributes['domstol'] = attributes['nja']

        assert 'domstol' in attributes, "No court provided"
        assert attributes['domstol'] in containerid, "%s is an unknown court" % attributes['domstol']
        res = "http://rinfo.lagrummet.se"+containerid[attributes['domstol']]

        if 'lopnr' in attributes and ":" in attributes['lopnr']:
            (attributes['ar'], attributes['lopnr']) = attributes['lopnr'].split(":", 1)

        if attributes['domstol'] == u'NJA':
            # FIXME: URIs should be based on publikationsordinal, not
            # pagenumber (which this in effect is) - but this requires
            # a big lookup table/database/graph with
            # pagenumber-to-ordinal-mappings
            res += '%ss%s' % (attributes['ar'], attributes['sidnr'])
        else:
            res += '%s:%s' % (attributes['ar'], attributes['lopnr'])

        return res

    ################################################################
    # KOD FÖR EGRÄTTSFALL
    def egrattsfall_format_uri(self, attributes):
        """Build a CELEX-style URI for a reference to an EC court case.

        attributes -- dict with 'year' (two or four digits), 'serial' and
                      'decision' (one of 'C', 'T', 'F')

        A two-digit year below 54 is placed in the 2000s, any other
        two-digit year in the 1900s. The serial is zero-padded to four
        digits.

        Returns the URI string.
        """
        descriptormap = {'C':'J', # Judgment of the Court
                         'T':'A', # Judgment of the Court of First Instance
                         'F':'W', # Judgement of the Civil Service Tribunal
                         }
        # FIXME: Change this before the year 2054 (as ECJ will
        # hopefully have fixed their case numbering by then)
        year = attributes['year']
        if len(year) == 2:
            century = "20" if int(year) < 54 else "19"
            year = century + year

        serial = '%04d' % int(attributes['serial'])
        descriptor = descriptormap[attributes['decision']]
        return "http://lagen.nu/ext/celex/6%s%s%s" % (year, descriptor, serial)