Example #1
 def parse(self, doc):
     # FIXME: don't create these if they already exist
     self.lagrum_parser = LegalRef(LegalRef.LAGRUM)
     self.rattsfall_parser = LegalRef(LegalRef.RATTSFALL)
     docfile = self.store.downloaded_path(doc.basefile)
     intermediatefile = self.store.intermediate_path(doc.basefile)
     r = WordReader()
     intermediatefile, filetype = r.read(docfile, intermediatefile)
     with codecs.open(intermediatefile, encoding="utf-8") as fp:
         patchedtext, patchdesc = self.patch_if_needed(doc.basefile,
                                                       fp.read())
     # The second step is to mangle the crappy XML produced by
     # antiword (docbook) or Word 2007 (OOXML) into a nice pair of
     # structures. rawhead is a simple dict that we'll later transform
     # into an rdflib Graph. rawbody is a list of plaintext strings, each
     # representing a paragraph.
     #
     # long-term FIXME: WordReader should expose a unified
     # interface for handling both kinds of word files so that we
     # wouldn't need both parse_ooxml() and
     # parse_antiword_docbook(). This might require some other tool
     # than antiword for old .doc files, as this throws away a LOT
     # of info.
     if filetype == "docx":
         rawhead, rawbody = self.parse_ooxml(patchedtext, doc.basefile)
     else:
         rawhead, rawbody = self.parse_antiword_docbook(patchedtext, doc.basefile)
     doc.uri = self.polish_metadata(rawhead, doc)
     if patchdesc:
         doc.meta.add((URIRef(doc.uri),
                       self.ns['ferenda'].patchdescription,
                       patchdesc))
     doc.body = self.format_body(rawbody)  # FIXME: Write a FSMParser
                                           # to detect high-level
                                           # structure of the document
Example #2
    def __init__(self, alias):
        # setup
        self.alias = alias
        parsetype = alias.split("/")[1]
        self.parser = LegalRef({
            'SFS': LegalRef.LAGRUM,
            'Short': LegalRef.KORTLAGRUM,
            'DV': LegalRef.RATTSFALL,
            'Regpubl': LegalRef.FORARBETEN,
            'EGLag': LegalRef.EULAGSTIFTNING,
            'ECJ': LegalRef.EURATTSFALL
        }[parsetype])

        # this particular test method is set up to use lagen.nu style
        # URIs because the canonical URIs are significantly different.
        dirname = os.path.dirname(__file__)
        basedir = dirname + "/../"
        space = basedir + "lagen/nu/res/uri/swedishlegalsource.space.ttl"
        slugs = basedir + "lagen/nu/res/uri/swedishlegalsource.slugs.ttl"
        extra = [
            basedir + "lagen/nu/res/extra/swedishlegalsource.ttl",
            basedir + "lagen/nu/res/extra/sfs.ttl"
        ]
        cfg = Graph().parse(space, format="turtle").parse(slugs,
                                                          format="turtle")
        self.metadata = Graph()
        for ttl in extra:
            self.metadata.parse(ttl, format="turtle")
        COIN = Namespace("http://purl.org/court/def/2009/coin#")
        # select correct URI for the URISpace definition by
        # finding a single coin:URISpace object
        spaceuri = cfg.value(predicate=RDF.type, object=COIN.URISpace)
        self.minter = URIMinter(cfg, spaceuri)
Example #3
    def __init__(self, alias):
        # setup
        self.alias = alias
        parsetype = alias.split("/")[1]
        self.parser = LegalRef({'SFS': LegalRef.LAGRUM,
                                'Short': LegalRef.KORTLAGRUM,
                                'DV': LegalRef.RATTSFALL,
                                'Regpubl': LegalRef.FORARBETEN,
                                'EGLag': LegalRef.EULAGSTIFTNING,
                                'ECJ': LegalRef.EURATTSFALL}[parsetype])

        # this particular test method is set up to use lagen.nu style
        # URIs because the canonical URIs are significantly different.
        dirname = os.path.dirname(__file__)
        basedir = dirname + "/../"
        space = basedir + "lagen/nu/res/uri/swedishlegalsource.space.ttl"
        slugs = basedir + "lagen/nu/res/uri/swedishlegalsource.slugs.ttl"
        extra = [basedir + "lagen/nu/res/extra/swedishlegalsource.ttl",
                 basedir + "lagen/nu/res/extra/sfs.ttl"]
        cfg = Graph().parse(space,
                            format="turtle").parse(slugs, format="turtle")
        self.metadata = Graph()
        for ttl in extra:
            self.metadata.parse(ttl, format="turtle")
        COIN = Namespace("http://purl.org/court/def/2009/coin#")
        # select correct URI for the URISpace definition by
        # finding a single coin:URISpace object
        spaceuri = cfg.value(predicate=RDF.type, object=COIN.URISpace)
        self.minter = URIMinter(cfg, spaceuri)
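
Examples #2 and #3 construct the same LegalRef parser and URIMinter. The snippet below is a minimal, hypothetical usage sketch: the method name and the predicate value are assumptions, but the parse() call, the predicate keyword, and the Link objects carrying a .uri attribute all follow the later examples in this list.

    def uris_in_text(self, text):
        # LegalRef.parse() returns a mix of plain strings and Link
        # elements; only the Link elements represent a recognized
        # reference and expose its URI via the .uri attribute.
        nodes = self.parser.parse(text, predicate="dcterms:references")
        return [node.uri for node in nodes if isinstance(node, Link)]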
Example #4
 def __init__(self, repos, inifile=None, **kwargs):
     super(WSGIApp, self).__init__(repos, inifile, **kwargs)
     sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0]
     self.parser = SwedishCitationParser(
         LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                  LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT),
         sfsrepo.minter,
         sfsrepo.commondata,
         allow_relative=True)
     graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"),
                           format="turtle")
     self.lagforkortningar = [
         str(o) for s, o in graph.subject_objects(DCTERMS.alternate)
     ]
     self.paragraflag = []
     for s, o in graph.subject_objects(DCTERMS.alternate):
         basefile = sfsrepo.basefile_from_uri(str(s))
         distilledpath = sfsrepo.store.distilled_path(basefile)
         firstpara_uri = str(s) + "#P1"
         needle = '<rpubl:Paragraf rdf:about="%s">' % firstpara_uri
         if os.path.exists(distilledpath) and needle in util.readfile(
                 distilledpath):
             self.paragraflag.append(str(o).lower())
     self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)]
     self.lagforkortningar_regex = "|".join(
         sorted(self.lagforkortningar, key=len, reverse=True))
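
The alternation built on the last two lines is sorted longest-first so that a longer abbreviation is preferred over any abbreviation that happens to be a prefix of it. A purely hypothetical illustration of how such a pattern could be applied (the word-boundary anchoring and the sample text are assumptions, and whether "BrB" is actually among the loaded abbreviations depends on the contents of extra/sfs.ttl):

     # Compile the alternation once and scan a piece of text for the
     # first statute abbreviation it contains.
     abbrev_re = re.compile(r"\b(?:%s)\b" % self.lagforkortningar_regex)
     m = abbrev_re.search("jfr 3 kap. 1 § BrB")
     if m:
         found = m.group(0)  # the longest abbreviation matching at that position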
Example #5
    def parse_document_from_soup(self, soup, doc):
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)

        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(
                        node.string, predicate="dcterms:references")
                    doc.body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % doc.basefile)
            doc.body.append(Paragraph(["(No fulltext available)"]))
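
The comment about the non-breaking hyphen describes a normalization that, by itself, amounts to nothing more than the following (a standalone illustration, not code taken from the source):

    >>> "C\u2011197/09".replace("\u2011", "-")
    'C-197/09'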
Example #6
 def parser(self):
     p = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN,
                  LegalRef.RATTSFALL)
     # self.commondata needs to include extra/sfs.ttl
     # somehow. This is probably not the best way.
     with self.resourceloader.open("extra/sfs.ttl") as fp:
         self.commondata.parse(data=fp.read(), format="turtle")
     # actually, to mint URIs for rattsfall we need the
     # skos:altLabel for the rpubl:Rattsfallspublikation -- so we
     # need everything
     with self.resourceloader.open("extra/swedishlegalsource.ttl") as fp:
         self.commondata.parse(data=fp.read(), format="turtle")
     return SwedishCitationParser(p,
                                  self.minter,
                                  self.commondata,
                                  allow_relative=True)
Example #7
    def parse_document_from_soup(self, soup, doc):
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)

        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(node.string,
                                                 predicate="dct:references")
                    doc.body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % doc.basefile)
            doc.body.append(Paragraph(["(No fulltext available)"]))
Example #8
class DV(SwedishLegalSource):
    alias = "dv"
    downloaded_suffix = ".zip"
    rdf_type = RPUBL.Rattsfallsreferat
    documentstore_class = DVStore
    namespaces = ('rdf',  # always needed
                  'dct',  # title, identifier, etc
                  'xsd',  # datatypes
                  'owl',  # : sameAs
                  ('rpubl', 'http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#')
                  )
    DCT = Namespace(util.ns['dct'])

    def get_default_options(self):
        opts = super(DV, self).get_default_options()
        opts['ftpuser'] = None
        opts['ftppassword'] = None
        return opts

    # FIXME: store.list_basefiles_for("parse") must be fixed to handle two
    # different suffixes. Maybe store.downloaded_path() as well, so that
    # it returns .docx if a .docx file indeed exists, and .doc otherwise.
    # But this case (where documents can be in two (or more) formats depending
    # on age) isn't uncommon; maybe DocumentStore should support it natively
    # (like with optional suffix parameter to download_path)?

    def download(self):
        # recurse =~ download everything, which we do if force is
        # specified OR if we've never downloaded before
        recurse = False

        if self.config.force or not self.config.lastdownload:
            recurse = True

        self.downloadcount = 0  # number of files extracted from zip files
                               # (not number of zip files)
        try:
            if self.config.ftpuser:
                self.download_ftp("", recurse,
                                  self.config.ftpuser,
                                  self.config.ftppassword)
            else:
                self.download_www("", recurse)
        except MaxDownloadsReached:  # ok we're done!
            pass

    def download_ftp(self, dirname, recurse, user, password, connection=None):
        self.log.debug('Listing contents of %s' % dirname)
        lines = []
        if not connection:
            connection = FTP('ftp.dom.se')
            connection.login(user, password)

        connection.cwd(dirname)
        connection.retrlines('LIST', lines.append)

        for line in lines:
            parts = line.split()
            filename = parts[-1].strip()
            if line.startswith('d') and recurse:
                self.download_ftp(filename, recurse, user, password,
                                  connection)
            elif line.startswith('-'):
                basefile = os.path.splitext(filename)[0]
                if dirname:
                    basefile = dirname + "/" + basefile
                localpath = self.store.downloaded_path(basefile)
                if os.path.exists(localpath) and not self.config.force:
                    pass  # we already got this
                else:
                    util.ensure_dir(localpath)
                    self.log.debug('Fetching %s to %s' % (filename,
                                                          localpath))
                    connection.retrbinary('RETR %s' % filename,
                                          # FIXME: retrbinary calls .close()?
                                          open(localpath, 'wb').write)
                    self.process_zipfile(localpath)
        connection.cwd('/')

    def download_www(self, dirname, recurse):
        url = 'https://lagen.nu/dv/downloaded/%s' % dirname
        self.log.debug('Listing contents of %s' % url)
        resp = requests.get(url)
        iterlinks = lxml.html.document_fromstring(resp.text).iterlinks()
        for element, attribute, link, pos in iterlinks:
            if link.startswith("/"):
                continue
            elif link.endswith("/") and recurse:
                self.download_www(link, recurse)
            elif link.endswith(".zip"):
                basefile = os.path.splitext(link)[0]
                if dirname:
                    basefile = dirname + basefile

                localpath = self.store.downloaded_path(basefile)
                if os.path.exists(localpath) and not self.config.force:
                    pass  # we already got this
                else:
                    absolute_url = urljoin(url, link)
                    self.log.debug('Fetching %s to %s' % (link, localpath))
                    resp = requests.get(absolute_url)
                    with self.store.open_downloaded(basefile, "wb") as fp:
                        fp.write(resp.content)
                    self.process_zipfile(localpath)

    # eg. HDO_T3467-96.doc or HDO_T3467-96_1.doc
    re_malnr = re.compile(r'([^_]*)_([^_\.]*)_?(\d*)(\.docx?)')
    # eg. HDO_T3467-96_BYTUT_2010-03-17.doc or
    #     HDO_T3467-96_BYTUT_2010-03-17_1.doc
    re_bytut_malnr = re.compile(
        r'([^_]*)_([^_\.]*)_BYTUT_\d+-\d+-\d+_?(\d*)(\.docx?)')
    re_tabort_malnr = re.compile(
        r'([^_]*)_([^_\.]*)_TABORT_\d+-\d+-\d+_?(\d*)(\.docx?)')
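    # Illustrative note (not part of the original source): for a name like
    # "HDO_T3467-96_1.docx", re_malnr.match(name).groups() yields
    # ("HDO", "T3467-96", "1", ".docx"), i.e. (court, case number,
    # referat ordinal, suffix); the ordinal group is an empty string for
    # names like "HDO_T3467-96.doc".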

    def process_zipfile(self, zipfilename):
        removed = replaced = created = untouched = 0
        zipf = zipfile.ZipFile(zipfilename, "r")
        for bname in zipf.namelist():
            if not isinstance(bname, str):  # py2
                # Files in the zip file are encoded using codepage 437
                name = bname.decode('cp437')
            else:
                name = bname
            if "_notis_" in name:
                continue
            name = os.path.split(name)[1]
            if 'BYTUT' in name:
                m = self.re_bytut_malnr.match(name)
            elif 'TABORT' in name:
                m = self.re_tabort_malnr.match(name)
            else:
                m = self.re_malnr.match(name)
            if m:
                (court, malnr, referatnr, suffix) = (
                    m.group(1), m.group(2), m.group(3), m.group(4))
                assert ((suffix == ".doc") or (suffix == ".docx")
                        ), "Unknown suffix %s in %r" % (suffix, name)
                if referatnr:
                    basefile = "%s/%s_%s" % (court, malnr, referatnr)
                else:
                    basefile = "%s/%s" % (court, malnr)

                outfile = self.store.path(basefile, 'downloaded', suffix)

                if "TABORT" in name:
                    self.log.info("%s: Removing" % basefile)
                    if not os.path.exists(outfile):
                        self.log.warning("%s: %s doesn't exist" % (basefile,
                                                                   outfile))
                    else:
                        os.unlink(outfile)
                    removed += 1
                else:
                    if "BYTUT" in name:
                        self.log.info("%s: Replacing with new" % basefile)
                        if not os.path.exists(outfile):
                            self.log.warning("%s: %s doesn't exist" %
                                             (basefile, outfile))
                        replaced += 1
                    else:
                        self.log.info("%s: Unpacking" % basefile)
                        if os.path.exists(outfile):
                            untouched += 1
                            continue
                        else:
                            created += 1
                    data = zipf.read(bname)

                    with self.store.open(basefile, "downloaded", suffix, "wb") as fp:
                        fp.write(data)

                    # Make the unzipped files have correct timestamp
                    zi = zipf.getinfo(bname)
                    dt = datetime(*zi.date_time)
                    ts = mktime(dt.timetuple())
                    os.utime(outfile, (ts, ts))

                    self.downloadcount += 1
                    if self.config.downloadmax and self.downloadcount >= self.config.downloadmax:
                        raise MaxDownloadsReached()
            else:
                self.log.warning('Kunde inte tolka filnamnet %r i %s' %
                                (name, os.path.relpath(zipfilename)))
        self.log.debug('Processade %s, skapade %s,  bytte ut %s, tog bort %s, lät bli %s filer' %
                       (os.path.relpath(zipfilename), created, replaced, removed, untouched))

    re_NJAref = re.compile(r'(NJA \d{4} s\. \d+) \(alt. (NJA \d{4}:\d+)\)')
    re_delimSplit = re.compile("[;,] ?").split

    labels = {'Rubrik': DCT.description,
              'Domstol': DCT['creator'],  # convert to an authority record
              'Målnummer': RPUBL['malnummer'],
              'Domsnummer': RPUBL['domsnummer'],
              'Diarienummer': RPUBL['diarienummer'],
              'Avdelning': RPUBL['domstolsavdelning'],
              'Referat': DCT['identifier'],
              'Avgörandedatum': RPUBL['avgorandedatum'],  # convert to xsd:date
              }

    # Metadata that can contain zero or more entries.
    # Litteratur/Sökord have no counterpart in the RPUBL vocabulary
    multilabels = {'Lagrum': RPUBL['lagrum'],
                   'Rättsfall': RPUBL['rattsfallshanvisning'],
                   # dct:references would be better, but those should not have literal values
                   'Litteratur': DCT['relation'],
                   'Sökord': DCT['subject']
                   }

    # The list is derived from containers.n3/rattsfallsforteckningar.n3 in
    # the rinfo project's source code - a more ambitious solution would be
    # to read the actual N3 files into an rdflib graph.
    publikationsuri = {'NJA': 'http://rinfo.lagrummet.se/ref/rff/nja',
                       'RH': 'http://rinfo.lagrummet.se/ref/rff/rh',
                       'MÖD': 'http://rinfo.lagrummet.se/ref/rff/mod',
                       'RÅ': 'http://rinfo.lagrummet.se/ref/rff/ra',
                       'RK': 'http://rinfo.lagrummet.se/ref/rff/rk',
                       'MIG': 'http://rinfo.lagrummet.se/ref/rff/mig',
                       'AD': 'http://rinfo.lagrummet.se/ref/rff/ad',
                       'MD': 'http://rinfo.lagrummet.se/ref/rff/md',
                       'FÖD': 'http://rinfo.lagrummet.se/ref/rff/fod'}

    domstolsforkortningar = {'ADO': 'http://lagen.nu/org/2008/arbetsdomstolen',
                             'HDO': 'http://lagen.nu/org/2008/hogsta-domstolen',
                             'HGO': 'http://lagen.nu/org/2008/gota-hovratt',
                             'HNN': 'http://lagen.nu/org/2008/hovratten-for-nedre-norrland',
                             'HON': 'http://lagen.nu/org/2008/hovratten-for-ovre-norrland',
                             'HSB': 'http://lagen.nu/org/2008/hovratten-over-skane-och-blekinge',
                             'HSV': 'http://lagen.nu/org/2008/svea-hovratt',
                             'HVS': 'http://lagen.nu/org/2008/hovratten-for-vastra-sverige',
                             'MDO': 'http://lagen.nu/org/2008/marknadsdomstolen',
                             'MIG': 'http://lagen.nu/org/2008/migrationsoverdomstolen',
                             'MÖD': 'http://lagen.nu/org/2008/miljooverdomstolen',
                             'REG': 'http://lagen.nu/org/2008/regeringsratten',
                             'KST': 'http://lagen.nu/org/2008/kammarratten-i-stockholm'}

    # This is information you can get from RDL, but we hardcode it for
    # now.
    slugs = {'Arbetsdomstolen': 'ad',
             'Domstolsverket': 'dv',
             'Göta hovrätt': 'hgo',
             'Högsta domstolen': 'hd',
             'Högsta förvaltningsdomstolen': 'hfd',
             'Hovrätten för Nedre Norrland': 'hnn',
             'Hovrätten för Övre Norrland': 'hon',
             'Hovrätten för Västra Sverige': 'hvs',
             'Hovrätten över Skåne och Blekinge': 'hsb',
             'Justitiekanslern': 'jk',
             'Kammarrätten i Göteborg': 'kgg',
             'Kammarrätten i Jönköping': 'kjo',
             'Kammarrätten i Stockholm': 'kst',
             'Kammarrätten i Sundsvall': 'ksu',
             'Marknadsdomstolen': 'md',
             'Migrationsöverdomstolen': 'mig',
             'Miljööverdomstolen': 'mod',
             'Patentbesvärsrätten': 'pbr',
             'Rättshjälpsnämnden': 'rhn',
             'Regeringsrätten': 'regr',
             'Statens ansvarsnämnd': 'san',
             'Svea hovrätt': 'hsv'}

    @managedparsing
    def parse(self, doc):
        # FIXME: don't create these if they already exist
        self.lagrum_parser = LegalRef(LegalRef.LAGRUM)
        self.rattsfall_parser = LegalRef(LegalRef.RATTSFALL)
        docfile = self.store.downloaded_path(doc.basefile)
        intermediatefile = self.store.intermediate_path(doc.basefile)
        r = WordReader()
        intermediatefile, filetype = r.read(docfile, intermediatefile)
        with codecs.open(intermediatefile, encoding="utf-8") as fp:
            patchedtext, patchdesc = self.patch_if_needed(doc.basefile,
                                                          fp.read())
        # The second step is to mangle the crappy XML produced by
        # antiword (docbook) or Word 2007 (OOXML) into a nice pair of
        # structures. rawhead is a simple dict that we'll later transform
        # into an rdflib Graph. rawbody is a list of plaintext strings, each
        # representing a paragraph.
        #
        # long-term FIXME: WordReader should expose a unified
        # interface for handling both kinds of word files so that we
        # wouldn't need both parse_ooxml() and
        # parse_antiword_docbook(). This might require some other tool
        # than antiword for old .doc files, as this throws away a LOT
        # of info.
        if filetype == "docx":
            rawhead, rawbody = self.parse_ooxml(patchedtext, doc.basefile)
        else:
            rawhead, rawbody = self.parse_antiword_docbook(patchedtext, doc.basefile)
        doc.uri = self.polish_metadata(rawhead, doc)
        if patchdesc:
            doc.meta.add((URIRef(doc.uri),
                          self.ns['ferenda'].patchdescription,
                          patchdesc))
        doc.body = self.format_body(rawbody)  # FIXME: Write a
                                             # FSMParser to detect
                                             # high-level structure of
                                             # the document

    def parse_ooxml(self, text, basefile):
        soup = BeautifulSoup(text)
        for instrtext in soup.find_all("w:instrtext"):
            instrtext.decompose()
        head = {}

        # At the top of every verdict is the name of the court ("Högsta
        # domstolen") followed by the report number ("NJA 1987
        # s. 113").
        firstfield = soup.find("w:t")
        # Sometimes the court name is split across two
        # w:r elements. Best to use all of the text in the
        # parent w:tc cell
        firstfield = firstfield.find_parent("w:tc")
        head['Domstol'] = firstfield.get_text(strip=True)

        nextfield = firstfield.find_next("w:tc")
        head['Referat'] = nextfield.get_text(strip=True)

        # Find the remaining simple metadata fields in the page header
        for key in self.labels:
            node = soup.find(text=re.compile(key + ':'))
            if not node:
                # Sometimes these text fields are broken up
                # (eg "<w:t>Avgörand</w:t>...<w:t>a</w:t>...<w:t>tum</w:t>")
                # Use (ridiculous) fallback method
                nodes = soup.find_all('w:statustext', attrs={'w:val': key})
                if nodes:
                    node = nodes[-1]
                else:
                    self.log.warning("%s: Couldn't find field %r" % (basefile, key))
                    continue

            txt = node.find_next("w:t").find_parent("w:p").get_text(strip=True)
            if txt:  # skip fields with empty string values
                head[key] = txt

        # Find the compound metadata in the page header
        for key in ["Lagrum", "Rättsfall"]:
            node = soup.find(text=re.compile(key + ':'))
            if node:
                textnodes = node.find_parent('w:tc').find_next_sibling('w:tc')
                if not textnodes:
                    continue
                items = []
                for textnode in textnodes.find_all('w:t'):
                    t = textnode.get_text(strip=True)
                    if t:
                        items.append(t)
                if items:
                    head[key] = items

        # The main text body of the verdict
        body = []
        for p in soup.find(text=re.compile('EFERAT')).find_parent('w:tr').find_next_sibling('w:tr').find_all('w:p'):
            ptext = ''
            for e in p.findAll("w:t"):
                ptext += e.string
            body.append(ptext)

        # Finally, some more metadata in the footer
        if soup.find(text=re.compile(r'Sökord:')):
            head['Sökord'] = soup.find(
                text=re.compile(r'Sökord:')).find_next('w:t').get_text(strip=True)

        if soup.find(text=re.compile(r'^\s*Litteratur:\s*$')):
            n = soup.find(text=re.compile(r'^\s*Litteratur:\s*$'))
            head['Litteratur'] = n.findNext('w:t').get_text(strip=True)
        return head, body

    def parse_antiword_docbook(self, text, basefile):
        soup = BeautifulSoup(text)
        head = {}
        header_elements = soup.find("para")
        header_text = ''
        for el in header_elements.contents:
            if hasattr(el, 'name') and el.name == "informaltable":
                break
            else:
                header_text += el.string

        # At the top of every verdict is the name of the court ("Högsta
        # domstolen") followed by the report number ("NJA 1987
        # s. 113"). Depending on the Word document, though, the XML
        # structure varies. Usually the information is found in a
        # pipe-separated paragraph:

        parts = [x.strip() for x in header_text.split("|")]
        if len(parts) > 1:
            head['Domstol'] = parts[0]
            head['Referat'] = parts[1]
        else:
            # alternatively, they are on the first row of an informaltable
            row = soup.find("informaltable").tgroup.tbody.row.findAll('entry')
            head['Domstol'] = row[0].get_text(strip=True)
            head['Referat'] = row[1].get_text(strip=True)

        # Find the remaining simple metadata fields in the page header
        for key in self.labels:
            node = soup.find(text=re.compile(key + ':'))
            if node:
                txt = node.find_parent('entry').find_next_sibling('entry').get_text(strip=True)
                if txt:
                    head[key] = txt

        # Find the compound metadata in the page header
        for key in ["Lagrum", "Rättsfall"]:
            node = soup.find(text=re.compile(key + ':'))
            if node:
                head[key] = []
                textchunk = node.find_parent(
                    'entry').find_next_sibling('entry').string
                for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]:
                    if line:
                        head[key].append(line)

        body = []
        for p in soup.find(text=re.compile('REFERAT')).find_parent('tgroup').find_next_sibling('tgroup').find('entry').get_text(strip=True).split("\n\n"):
            body.append(p)

        # Find the compound metadata in the page footer
        head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent(
            'entry').next_sibling.next_sibling.get_text(strip=True)

        if soup.find(text=re.compile(r'^\s*Litteratur:\s*$')):
            n = soup.find(text=re.compile(r'^\s*Litteratur:\s*$')).find_parent(
                'entry').next_sibling.next_sibling.get_text(strip=True)
            head['Litteratur'] = n
        return head, body

    def polish_metadata(self, head, doc):
        basefile_regex = re.compile(r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

        def basefile_to_referat(basefile):
            templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'MD': 'MD %(year)s:%(ordinal)s'}
            m = basefile_regex.match(basefile)
            if m:
                return templ[m.group("type")] % (m.groupdict())

        def ref_to_uri(ref):
            # FIXME: We'd like to retire legalref and replace it with
            # pyparsing grammars.
            nodes = self.rattsfall_parser.parse(ref)
            uri = nodes[0].uri
            return localize_uri(uri)

        def dom_to_uri(domstol, malnr, avg):
            baseuri = self.config.url
            slug = self.slugs[domstol]
            return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

        def localize_uri(uri):
            if "publ/rattsfall" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                                   self.config.url + "res/dv")
            elif "publ/sfs/" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                                   self.config.url + "res/sfs")

        def split_nja(value):
            # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
            return [x[:-1] for x in value.split("(")]

        def sokord_uri(value):
            return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

        # 0. create Referat key if not present
        if "Referat" not in head:
            # For some courts (MD, AD, MOD?, MIG?) this is possible
            head["Referat"] = basefile_to_referat(doc.basefile)

        # 1. mint uris and create the two Describers we'll use
        refuri = ref_to_uri(head["Referat"])
        refdesc = Describer(doc.meta, refuri)
        domuri = dom_to_uri(head["Domstol"],
                            head["Målnummer"],
                            head["Avgörandedatum"])
        domdesc = Describer(doc.meta, domuri)

        # 2. convert all strings in head to proper RDF
        for label, value in head.items():
            if label == "Rubrik":
                value = util.normalize_space(value)
                refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
                domdesc.value(self.ns['dct'].title, value, lang="sv")

            elif label == "Domstol":
                domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
            elif label == "Målnummer":
                domdesc.rel(self.ns['rpubl'].malnummer, value)
            elif label == "Domsnummer":
                domdesc.rel(self.ns['rpubl'].domsnummer, value)
            elif label == "Diarienummer":
                domdesc.rel(self.ns['rpubl'].diarienummer, value)
            elif label == "Avdelning":
                domdesc.rel(self.ns['rpubl'].avdelning, value)
            elif label == "Referat":

                for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                    'arsutgava': r'(\d{4})',
                                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                    'sidnummer': r's.? ?(\d+)'}.items():
                    m = re.search(regex, value)
                    if m:
                        if pred == 'rattsfallspublikation':
                            # "NJA" -> "http://lcaolhost:8000/coll/dv/nja"
                            uri = self.config.url + "coll/dv/" + m.group(1).lower()
                            refdesc.rel(self.ns['rpubl'][pred], uri)
                        else:
                            refdesc.value(self.ns['rpubl'][pred], m.group(1))

                if value.startswith("NJA"):
                    realvalue, extra = split_nja(value)
                    ordinal = extra.split(" ")[1]
                    refdesc.value(self.ns['dct'].bibliographicCitation,
                                  extra)
                    refdesc.rel(self.ns['owl'].sameAs,
                                self.config.url + "res/dv/nja/" + ordinal)
                    refdesc.value(self.ns['dct'].identifier, realvalue)
                else:
                    refdesc.value(self.ns['dct'].identifier, value)

            elif label == "Avgörandedatum":
                with util.c_locale():
                    d = datetime.strptime(value, '%Y-%m-%d')
                domdesc.value(self.ns['rpubl'].avgorandedatum, d)

            elif label == "Lagrum":
                for i in value:  # better be list not string
                    for node in self.lagrum_parser.parse(i):
                        if isinstance(node, Link):

                            domdesc.rel(self.ns['rpubl'].lagrum,
                                        localize_uri(node.uri))
            elif label == "Rättsfall":
                for i in value:
                    for node in self.rattsfall_parser.parse(i):
                        if isinstance(node, Link):
                            domdesc.rel(self.ns['rpubl'].rattsfall,
                                        localize_uri(node.uri))
            elif label == "Litteratur":
                for i in value.split(";"):
                    domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
            elif label == "Sökord":
                for s in self.re_delimSplit(value):
                    s = util.normalize_space(s)
                    if not s:
                        continue
                    # terms longer than 72 chars are not legitimate
                    # terms, but more likely descriptions. If a term has a - in
                    # it, it's probably a separator between a term and a
                    # description
                    while len(s) >= 72 and " - " in s:
                        h, s = s.split(" - ", 1)
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                    if len(s) < 72:
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

        # 3. mint some owl:sameAs URIs
        refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
        domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

        # 4. Add some same-for-everyone properties
        refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
        refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
        domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
        refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
        # 5. assert that we have everything we need

        # 6. done!
        return refuri

    def format_body(self, paras):
        return Body([Paragraph([x]) for x in paras])

    # FIXME: port to list_basefiles_for("parse")
    def ParseAll(self):
        self._do_for_all(intermediate_dir, '.doc', self.Parse)
        self._do_for_all(intermediate_dir, '.docx', self.Parse)

# FIXME: convert to a CONSTRUCT query, save as res/sparql/dv-annotations.rq
# Or maybe the default template should take a list of predicates, defaulting
# to dct:references, but which we could substitute rpubl:rattsfallshanvisning
#    annotation_query = """
# PREFIX dct:<http://purl.org/dc/terms/>
# PREFIX rpub:<http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#>
#
# SELECT ?uri ?id ?desc
# WHERE {
#      ?uri dct:description ?desc .
#      ?uri dct:identifier ?id .
#      ?uri rpubl:rattsfallshanvisning <%s>
#}
#""" % uri
#

    # FIXME: port to relate_all_setup / _teardown
    def GenerateMapAll(self):
        mapfile = os.path.sep.join(
            [self.baseDir, 'dv', 'generated', 'uri.map'])
        util.robust_remove(mapfile + ".new")

        parsed_dir = os.path.sep.join([self.baseDir, 'dv', 'parsed'])
        self._do_for_all(parsed_dir, '.xht2', self.GenerateMap)
        util.robustRename(mapfile + ".new", mapfile)

    def GenerateMap(self, basefile):
        start = time()
        infile = os.path.relpath(self._xmlFileName(basefile))
        head = codecs.open(infile, encoding='utf-8').read(1024)
        m = self.re_xmlbase(head)
        if m:
            uri = "http://rinfo.lagrummet.se/publ/rattsfall/%s" % m.group(1)
            mapfile = self.store.path('generated', 'uri.map', '.new')
            util.ensure_dir(mapfile)
            f = codecs.open(mapfile, 'a', encoding='iso-8859-1')
            f.write("%s\t%s\n" % (m.group(1), basefile))
            f.close()
            self.log.info("%s ok" % basefile)
            return
        else:
            self.log.warning("could not find xml:base in %s" % infile)

    # gonna need this for news_criteria()
    pubs = {'http://rinfo.lagrummet.se/ref/rff/nja': 'Högsta domstolen',
            'http://rinfo.lagrummet.se/ref/rff/rh': 'Hovrätterna',
            'http://rinfo.lagrummet.se/ref/rff/rk': 'Kammarrätterna',
            'http://rinfo.lagrummet.se/ref/rff/ra': 'Regeringsrätten',
            'http://rinfo.lagrummet.se/ref/rff/hfd': 'Högsta förvaltningsdomstolen',
            'http://rinfo.lagrummet.se/ref/rff/ad': 'Arbetsdomstolen',
            'http://rinfo.lagrummet.se/ref/rff/fod': 'Försäkringsöverdomstolen',
            'http://rinfo.lagrummet.se/ref/rff/md': 'Marknadsdomstolen',
            'http://rinfo.lagrummet.se/ref/rff/mig': 'Migrationsöverdomstolen',
            'http://rinfo.lagrummet.se/ref/rff/mod': 'Miljööverdomstolen'
            }
Example #9
class EURLexCaselaw(EURLex):
    alias = "eurlexcaselaw"
    # only select judgments and AG opinions
    # expertquery_template = "SELECT CELLAR_ID, TI_DISPLAY, DN, DD WHERE (FM_CODED = JUDG OR FM_CODED = OPIN_AG) ORDER BY DD ASC"
    expertquery_template = "(FM_CODED = JUDG OR FM_CODED = OPIN_AG)"
    contenttype = "text/html"  # legal cases OUGHT to be available as
    # xhtml, and the "branch notice"
    # indicates that they are, but in
    # reality they're not.
    downloaded_suffix = ".html"
    celexfilter = re.compile(r"(6\d{4}[A-Z]{2}\d{4})$").match

    def parse_metadata_from_soup(self, soup, doc):
        # AVAILABLE METADATA IN CASES
        #
        # For now, we create an unofficial eurlex vocab with namespace http://lagen.nu/eurlex#
        # - celex number (first h1) :celex (:celexnum?)
        #
        # - [Title and reference]
        #   - decision type and date "Judgment of the Court (Third Chamber) of 17 December 2009."
        #      :courtdecision (as opposed to :commissiondecision)
        #   - :party (or parties) "M v Agence européenne des médicaments (EMEA)."
        #   - :referingcourt "Reference for a preliminary ruling: Administrativen sad Sofia-grad - Bulgaria."
        #   - :legalissue - short description and/or(?) keywords (not always present, eg 62009J0403), hyphen sep:
        #     - "Review of the judgment in Case T-12/08 P"
        #     - "Whether the state of the proceedings permits final judgment to be given"
        #     - "Fair hearing"
        #     - "Rule that the parties should be heard"
        #     - "Whether the unity or consistency of Community law is affected."
        #   - :casenum Case number + unknown letters:
        #     - "Case C-197/09 RX-II."
        #     - "Joined cases T-117/03 to T-119/03 and T-171/03."
        #   - :casereporter Case reporter cite "European Court reports 2009 Page 00000"
        # - [Text]
        #   - :availablelang - Available languages ("bg", "es", "cs", "da" ....)
        # - :authenticlang - Authentic language ("fr" or "French")
        # - [Dates]
        #   - :decisiondate - Date of document (decision/judgement)
        #   - :applicationdate - Date of application
        # - [Classifications] (different from description/keywords above)
        #   - :subjectmatter Subject Matter, comma sep:
        #     - "Staff regulations and employment conditions - EC"
        #     - "Provisions governing the Institutions"
        #   - :directorycode - Case Law Directory Code (where is the full code list?), NL sep:
        #      - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #      - "B-20.05 EEC/EC / Acts of the institutions / Statement of the reasons on which a measure is based"
        #      - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #      - "B-09.04 EEC/EC / State aid / Review of aid by the Commission - Rules of procedure"
        # - [Miscellaneous information]
        #   - dcterms:author Author: "Court of Justice of the European Communities"
        #   - :form Form: "Judgement"
        # - [Procedure]
        #   - :proceduretype - Type of procedure, comma sep:
        #     - "Staff cases"
        #     - "Action for damages"
        #     - "Appeal"
        #     - "REEX=OB"
        #   - :applicant - Applicant: "Official"
        #   - :defendant - Defendant: "EMEA, Institutions"
        #   - :observation - Observations: "Italy, Poland, Member States, European Parliament, Council, Commission, Institutions"
        #   - :judgerapporteur - Judge-Rapporteur: "von Danwitz"
        #   - :advocategeneral - Advocate General: "Mazák"
        # - [Relationships between documents]
        #   - :treaty Treaty: "European Communities"
        #   - :caseaffecting Case affecting, NL-sep:
        #     - "Interprets [CELEXNO + pinpoint]"
        #     - "Declares void 61995A0091"
        #     - "Confirms 31996D0666"
        #   - :"Instruments cited in case law" (celex numbers with pinpoint locations?), nl-sep
        #     - "12001C/PRO/02-A61"
        #     - "12001C/PRO/02-NA13P1"
        #     - "31991Q0530-A114"
        #     - "62007K0023"
        #     - "62008A0012"
        #
        # convenience functions -- should not be needed now that we have Describer
        # def add_literal(predicate, literal):
        #     g.add((URIRef(uri),
        #            voc[predicate],
        #            Literal(literal, lang=lang)))
        #
        # def add_celex_object(predicate, celexno):
        #     g.add((URIRef(uri),
        #            voc[predicate],
        #            URIRef("http://lagen.nu/ext/celex/%s" % celexno)))
        #
        # def get_predicate(predicate):
        #     predicates = list(g.objects(URIRef(uri), voc[predicate]))
        #     return predicates != []
        #
        # These are a series of refinements for the "Affecting"
        # relationship. "Cites" doesn't have these (or similar), but
        # "is affected by" has (the inverse properties)
        affects_predicates = {
            "Interprets": "interprets",
            "Interprets the judgment": "interpretsJudgment",
            "Declares void": "declaresVoid",
            "Confirms": "confirms",
            "Declares valid (incidentally)": "declaresValidIncidentally",
            "Declares valid (by a preliminary ruling)":
            "declaresValidByPreliminaryRuling",
            "Incidentally declares invalid": "declaresInvalidIncidentally",
            "Declares invalid (by a preliminary ruling)":
            "declaresInvalidByPreliminaryRuling",
            "Amends": "amends",
            "Failure concerning": "failureConcerning"
        }

        isaffected_predicates = {
            "Interpreted by": "interpretedBy",
            "Confirmed by": "confirmedBy",
            "Declared void by": "declaredVoidBy",
            "Annulment requested by": "annulmentRequestedBy"
        }

        # 1. Express metadata about our document as a RDF graph
        desc = Describer(self.meta, self.uri)
        g = Graph()
        # :celex - first <h1>
        celexnum = soup.h1.get_text(strip=True)
        if celexnum == "No documents matching criteria.":
            raise errors.DocumentRemovedError(
                "No documents matching criteria " + celexnum)
        elif "no_data_found" in celexnum:
            self.log.warning("%s: No data found (try re-downloading)!" %
                             doc.basefile)
            raise errors.DocumentRemovedError("No data found!")

        assert celexnum == doc.basefile, "Celex number in file (%s) differs from filename (%s)" % (
            celexnum, doc.basefile)
        doc.lang = soup.html['lang']

        m = self.re_celexno.match(celexnum)
        # FIXME: this list is outdated!
        rdftype = {
            'J': voc['Judgment'],
            'A': voc['JudgmentFirstInstance'],
            'W': voc['JudgmentCivilService'],
            'O': voc['Order'],
            'B': voc['OrderCivilService']
        }[m.group(3)]

        desc.rdftype(rdftype)
        desc.value(self.ns['eurlex'].celexnum, celexnum)

        # The first section, following <h2>Title and reference</h2>
        # contains :courtdecision, :party (one or two items),
        # :referingcourt (optional), :legalissue (list of strings),
        # :casenum, :casereporter. Since some are optional, we do a
        # little heuristics to find out what we're looking at at any
        # given moment.
        for section in soup.findAll(["h1", "h2"]):
            if section.name == "h1" and section.a and section.a.string == "Text":
                break
            if section.string == "Title and reference":
                for para in section.findNextSiblings("p"):
                    if not para.string:
                        continue
                    string = para.string.strip()

                    # optional: do sanity checks to see if this really is a :courtdecision
                    if not get_predicate('courtdecision'):
                        add_literal('courtdecision', string)
                    elif not get_predicate('party'):
                        # this will be one or two items. Are they position dependent?
                        for party in string.split(" v "):
                            add_literal('party', party)
                    elif (not get_predicate('referingcourt') and
                          (string.startswith(
                              "Reference for a preliminary ruling") or
                           string.startswith("Preliminary ruling requested"))):
                        add_literal('referingcourt', string)
                    elif (not get_predicate('casenum')
                          and (string.lower().startswith("case ")
                               or string.lower().startswith("joined cases "))):
                        add_literal('casenum', string)
                    elif para.em:  # :casereporter is enclosed in an em
                        for row in para.findAll(text=True):
                            add_literal('casereporter', row.strip())
                    elif get_predicate('legalissue'):
                        # fixme: Split this up somehow
                        add_literal('legalissue', string)
            elif section.string == "Relationship between documents":
                for item in section.findNextSibling("ul").findAll("li"):
                    predicate = None
                    subpredicate = None
                    for node in item.childGenerator():
                        if not hasattr(node, "name"):
                            nodetext = node.strip()
                            if re.match(r"([ABCDEFGIJKLNPRST]+\d*)+$",
                                        nodetext):
                                continue
                            if re.match(r"\d[\d\-]*[ABC]?$", nodetext):
                                continue
                            if predicate == "affects" and nodetext:
                                if nodetext in affects_predicates:
                                    subpredicate = affects_predicates[nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as a affects predicate"
                                        % nodetext)
                            elif predicate == "isaffected" and nodetext:
                                if nodetext in isaffected_predicates:
                                    subpredicate = isaffected_predicates[
                                        nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as a isaffected predicate"
                                        % nodetext)

                        elif node.name == "strong":
                            subpredicate = None
                            if node.string == "Treaty:":
                                predicate = "treaty"
                            elif node.string == "Affected by case:":
                                predicate = "isaffected"
                            elif node.string == "Case affecting:":
                                predicate = "affects"
                            elif node.string == "Instruments cited in case law:":
                                predicate = "cites"
                            else:
                                self.log.warning(
                                    "Don't know how to handle key '%s'" %
                                    node.string)
                        elif node.name == "a" and predicate:
                            p = predicate
                            if subpredicate:
                                p = subpredicate
                            # FIXME: If the
                            # predicate is "cites", the celex number
                            # may have extra crap
                            # (eg. "31968R0259(01)-N2A1L6") indicating
                            # pinpoint location. Transform these to a
                            # fragment identifier.
                            add_celex_object(p, node.string.strip())

    def parse_document_from_soup(self, soup, doc):
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)

        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(
                        node.string, predicate="dcterms:references")
                    doc.body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % doc.basefile)
            doc.body.append(Paragraph(["(No fulltext available)"]))
Example #10
    def parse_from_soup(self, soup):
        # Step 1: Find out basic metadata
        rubrik = soup.first("title").string
        beslutsdatum = soup.first(
            "meta", {'name': 'SG_Beslutsdatum'})['content']

        beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date()
        diarienummer = soup.first(
            "meta", {'name': 'SG_Dokumentbet'})['content']
        arendetyp = soup.first("meta", {'name': 'Subject'})['content']
        # the keywords for a document are contained in a meta tag
        # formatted like:
        #    <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\nåklagare">
        #
        # Transform this into an array like:
        #    [u'http://lagen.nu/concept/Hets_mot_folkgrupp',
        #     u'http://lagen.nu/concept/Meddelarfrihet',
        #     u'http://lagen.nu/concept/Åklagare']
        nyckelord = soup.first("meta", {'name': 'Keywords'})['content']
        begrepp = ['http://lagen.nu/concept/%s' % util.ucfirst(
            x).strip().replace(" ", "_") for x in nyckelord.split("\n")]

        # Step 2: Using the metadata, construct the canonical URI for this document
        uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT,
                                  'myndighet': 'jk',
                                  'dnr': diarienummer})
        # self.log.debug("URI: %s" % uri)

        # Step 3: Create a RDF graph of all our metadata (so far)
        g = Graph()
        g.bind('dct', self.ns['dct'])
        g.bind('rinfo', self.ns['rinfo'])
        g.bind('rinfoex', self.ns['rinfoex'])
        g.bind('xsd', util.ns['xsd'])
        g.add((
            URIRef(uri), self.ns['dct']['title'], Literal(rubrik, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'],
              Literal(beslutsdatum, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['diarienummer'],
              Literal(diarienummer, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'],
              Literal(arendetyp, lang="sv")))
        for s in begrepp:
            g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s)))

        g.add((URIRef(uri), self.ns['dct']['identifier'], Literal(
            "JK %s" % diarienummer, lang="sv")))
        g.add((URIRef(uri), RDF.type, self.rdf_type))

        # Step 4: Process the actual text of the document
        self.parser = LegalRef(LegalRef.LAGRUM,
                               LegalRef.KORTLAGRUM,
                               LegalRef.RATTSFALL,
                               LegalRef.FORARBETEN)

        # newer documents have a semantic structure with h1 and h2
        # elements. Older have elements like <p class="Rubrik_1">. Try
        # to determine which one we're dealing with?
        tag = soup.find('a', {'name': "Start"})
        if tag:
            # self.log.debug("Using new-style document structure")
            elements = tag.parent.findAllNext()
        else:
            # self.log.debug("Using old-style document structure")
            elements = soup.findAll("p")
        # self.log.debug("Found %d elements" % len(elements))
        from collections import deque
        elements = deque(elements)
        body = self.make_sektion(elements, "Referat av beslut")

        # Step 5: Combine the metadata and the document, and return it
        doc = {'meta': g,
               'body': body,
               'lang': 'sv',
               'uri': uri}
        return doc
Example #11
class JK(SwedishLegalSource):
    alias = "jk"

    start_url = "http://www.jk.se/Beslut.aspx?query=&type=all&dateFrom=1998-01-01&dateTo=2100-01-01&dnr="
    document_url_regex = r"http://www.jk.se/Beslut/(?P<kategori>[\w\-]+)/(?P<basefile>\d+\-\d+\-\d+).aspx"

    @recordlastdownload
    def download(self, basefile=None):
        for basefile, url in self.download_get_basefiles(self.start_url):
            self.download_single(basefile, url)

    @downloadmax
    def download_get_basefiles(self, start_url):
        document_url_regex = re.compile(r"(?P<basefile>\d+\-\d+\-\d+).aspx")
        done = False
        url = start_url
        pagecount = 1
        while not done:
            self.log.info("Getting page #%s" % pagecount)
            soup = BeautifulSoup(requests.get(url).text)
            for link in soup.find_all("a", href=document_url_regex):
                basefile = document_url_regex.search(link["href"]).group("basefile")
                yield basefile, urljoin(url, link["href"])

            next = soup.find("img", src="/common/images/navigation-pil-grey.png").find_parent("a")
            if next:
                url = urljoin(url, next["href"])
                pagecount += 1
            else:
                done = True

    def parse_from_soup(self, soup):
        # Step 1: Find out basic metadata
        rubrik = soup.first("title").string
        beslutsdatum = soup.first(
            "meta", {'name': 'SG_Beslutsdatum'})['content']

        beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date()
        diarienummer = soup.first(
            "meta", {'name': 'SG_Dokumentbet'})['content']
        arendetyp = soup.first("meta", {'name': 'Subject'})['content']
        # the keywords for a document are contained in a meta tag
        # formatted like:
        #    <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\nåklagare">
        #
        # Transform this into an array like:
        #    [u'http://lagen.nu/concept/Hets_mot_folkgrupp',
        #     u'http://lagen.nu/concept/Meddelarfrihet',
        #     u'http://lagen.nu/concept/Åklagare']
        nyckelord = soup.first("meta", {'name': 'Keywords'})['content']
        begrepp = ['http://lagen.nu/concept/%s' % util.ucfirst(
            x).strip().replace(" ", "_") for x in nyckelord.split("\n")]

        # Step 2: Using the metadata, construct the canonical URI for this document
        uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT,
                                  'myndighet': 'jk',
                                  'dnr': diarienummer})
        # self.log.debug("URI: %s" % uri)

        # Step 3: Create a RDF graph of all our metadata (so far)
        g = Graph()
        g.bind('dct', self.ns['dct'])
        g.bind('rinfo', self.ns['rinfo'])
        g.bind('rinfoex', self.ns['rinfoex'])
        g.bind('xsd', util.ns['xsd'])
        g.add((
            URIRef(uri), self.ns['dct']['title'], Literal(rubrik, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'],
              Literal(beslutsdatum, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['diarienummer'],
              Literal(diarienummer, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'],
              Literal(arendetyp, lang="sv")))
        for s in begrepp:
            g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s)))

        g.add((URIRef(uri), self.ns['dct']['identifier'], Literal(
            "JK %s" % diarienummer, lang="sv")))
        g.add((URIRef(uri), RDF.type, self.rdf_type))

        # Step 4: Process the actual text of the document
        self.parser = LegalRef(LegalRef.LAGRUM,
                               LegalRef.KORTLAGRUM,
                               LegalRef.RATTSFALL,
                               LegalRef.FORARBETEN)

        # Newer documents have a semantic structure with h1 and h2
        # elements; older ones have elements like <p class="Rubrik_1">.
        # Try to determine which one we're dealing with.
        tag = soup.find('a', {'name': "Start"})
        if tag:
            # self.log.debug("Using new-style document structure")
            elements = tag.parent.findAllNext()
        else:
            # self.log.debug("Using old-style document structure")
            elements = soup.findAll("p")
        # self.log.debug("Found %d elements" % len(elements))
        from collections import deque
        elements = deque(elements)
        body = self.make_sektion(elements, "Referat av beslut")

        # Step 5: Combine the metadata and the document, and return it
        doc = {'meta': g,
               'body': body,
               'lang': 'sv',
               'uri': uri}
        return doc

    def make_sektion(self, elements, heading, level=0):
        sekt = Sektion(**{"rubrik": heading,
                          "niva": level})
        self.log.debug(
            "%sCreated sektion(%d): '%s'" % ("  " * level, level, heading))
        baseuri = None
        while True:
            try:
                p = elements.popleft()
            except IndexError:
                return sekt
            text = p.get_text(strip=True)
            # self.log.debug("%sp.name: %s, p['class']: %s, 'class' in p.attrs: %s" % ("  "*level,p.name,p['class'], (u'class' in p.attrs[0])))
            new_level = None
            if p.name == "h1":
                new_level = 1
            elif p.name == "h2":
                new_level = 2
            elif p.name == "h3":
                new_level = 3
            elif ((p.name == "p") and
                  (len(p.attrs) > 0) and
                  ('class' in p.attrs[0]) and
                  (p['class'].startswith("Rubrik_"))):
                # self.log.debug("%sp.class: %s" % ("  "*level,p['class']))
                new_level = int(p['class'][7:])

            if new_level:
                if new_level > level:
                    sekt.append(self.make_sektion(elements, text, new_level))
                else:
                    elements.appendleft(p)
                    return sekt
            else:
                if text:
                    nodes = self.parser.parse(text,
                                              baseuri=baseuri,
                                              predicate="dct:references")
                    for node in nodes:
                        # Use possible SFS references as the
                        # baseuri for subsequent paragraphs
                        if isinstance(node, Link) and node.uri.startswith("http://rinfo.lagrummet.se/publ/sfs/"):
                            baseuri = node.uri

                    stycke = Stycke(nodes)
                    # self.log.debug("%sCreated stycke: '%s'" % ("  "*level,stycke))
                    sekt.append(stycke)
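
The parser configured in "Step 4" above is the same one that make_sektion runs every paragraph through. A minimal standalone sketch of that call, using the older lagen.nu-style API shown in this example (the sample text is hypothetical):

from ferenda.sources.legal.se.legalref import LegalRef

parser = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                  LegalRef.RATTSFALL, LegalRef.FORARBETEN)
# parse() returns a mix of plain strings and Link objects; a Link whose URI
# points to an SFS document can then be reused as baseuri for relative
# references in later paragraphs, exactly as make_sektion does above.
nodes = parser.parse("Enligt 7 § förvaltningslagen (1986:223) ...",
                     baseuri=None, predicate="dct:references")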
Ejemplo n.º 12
0
 def parametric_test(self, datafile):
     p = LegalRef(LegalRef.EGRATTSFALL)
     return self._test_parser(datafile, p)
Ejemplo n.º 13
0
class LegalRefTest(object):
    def __init__(self, alias):
        # setup
        self.alias = alias
        parsetype = alias.split("/")[1]
        self.parser = LegalRef({'SFS': LegalRef.LAGRUM,
                                'Short': LegalRef.KORTLAGRUM,
                                'DV': LegalRef.RATTSFALL,
                                'Regpubl': LegalRef.FORARBETEN,
                                'EGLag': LegalRef.EULAGSTIFTNING,
                                'ECJ': LegalRef.EURATTSFALL}[parsetype])

        # this particular test method is set up to use lagen.nu style
        # URIs because the canonical URIs are significantly different.
        dirname = os.path.dirname(__file__)
        basedir = dirname + "/../"
        space = basedir + "lagen/nu/res/uri/swedishlegalsource.space.ttl"
        slugs = basedir + "lagen/nu/res/uri/swedishlegalsource.slugs.ttl"
        extra = [basedir + "lagen/nu/res/extra/swedishlegalsource.ttl",
                 basedir + "lagen/nu/res/extra/sfs.ttl"]
        cfg = Graph().parse(space,
                            format="turtle").parse(slugs, format="turtle")
        self.metadata = Graph()
        for ttl in extra:
            self.metadata.parse(ttl, format="turtle")
        COIN = Namespace("http://purl.org/court/def/2009/coin#")
        # select correct URI for the URISpace definition by
        # finding a single coin:URISpace object
        spaceuri = cfg.value(predicate=RDF.type, object=COIN.URISpace)
        self.minter = URIMinter(cfg, spaceuri)
       
    def createtest(self, basefile, basedir):
        # FIXME: This is mostly a cut'n paste of integrationLegalRef._test_parser
        testfile = os.path.dirname(__file__) + "/../test/files/" + self.alias + "/" + basefile + ".txt"
        encoding = 'windows-1252'
        with codecs.open(testfile,encoding=encoding) as fp:
            testdata = fp.read()
        parts = re.split('\r?\n\r?\n',testdata,1)
        testdata = parts[0]
        test_paras = re.split('\r?\n---\r?\n',testdata)

        # first: run it five times with timeit to get a good average exec time
        elapsed = timeit(functools.partial(self.run_with_timeit, test_paras), number=5, globals=globals())
        # then: run it a sixth time to get at the return value
        body = self.run_with_timeit(test_paras)
        return elapsed, extractrefs(body)

    def run_with_timeit(self, test_paras):
        body = []
        for para in test_paras:
            if para.startswith("RESET:"):
                self.parser.currentlynamedlaws.clear()
            if para.startswith("NOBASE:"):
                baseuri_attributes = {}
            else:
                baseuri_attributes = {'law': '9999:999'}
            nodes = self.parser.parse(para, self.minter, self.metadata,
                                      baseuri_attributes)
            body.append(nodes)
        return body
    timetest = createtest
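
A hedged usage sketch for the harness above (the alias and fixture name are hypothetical; createtest expects a test/files/<alias>/<basefile>.txt fixture relative to this module):

test = LegalRefTest("integration/SFS")   # the part after "/" selects LegalRef.LAGRUM
elapsed, refs = test.createtest("sample", ".")
print("5 timed runs took %.3f s" % elapsed)  # refs holds the extracted references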
Ejemplo n.º 14
0
 def lagrum_parser(self):
     return SwedishCitationParser(LegalRef(LegalRef.LAGRUM,
                                           LegalRef.EULAGSTIFTNING),
                                  self.minter,
                                  self.commondata,
                                  allow_relative=True)
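
A hedged sketch of how such a citation parser is typically driven, assuming the same parse_recursive entry point used in the LNMediaWiki example further down (doc is hypothetical here):

citparser = self.lagrum_parser()
# parse_recursive walks the element tree and replaces recognised legal
# references with Link elements in place:
citparser.parse_recursive(doc.body, predicate=None)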
Ejemplo n.º 15
0
class LegalRefTest(object):
    def __init__(self, alias):
        # setup
        self.alias = alias
        parsetype = alias.split("/")[1]
        self.parser = LegalRef({
            'SFS': LegalRef.LAGRUM,
            'Short': LegalRef.KORTLAGRUM,
            'DV': LegalRef.RATTSFALL,
            'Regpubl': LegalRef.FORARBETEN,
            'EGLag': LegalRef.EULAGSTIFTNING,
            'ECJ': LegalRef.EURATTSFALL
        }[parsetype])

        # this particular test method is set up to use lagen.nu style
        # URIs because the canonical URIs are significantly different.
        dirname = os.path.dirname(__file__)
        basedir = dirname + "/../"
        space = basedir + "lagen/nu/res/uri/swedishlegalsource.space.ttl"
        slugs = basedir + "lagen/nu/res/uri/swedishlegalsource.slugs.ttl"
        extra = [
            basedir + "lagen/nu/res/extra/swedishlegalsource.ttl",
            basedir + "lagen/nu/res/extra/sfs.ttl"
        ]
        cfg = Graph().parse(space, format="turtle").parse(slugs,
                                                          format="turtle")
        self.metadata = Graph()
        for ttl in extra:
            self.metadata.parse(ttl, format="turtle")
        COIN = Namespace("http://purl.org/court/def/2009/coin#")
        # select correct URI for the URISpace definition by
        # finding a single coin:URISpace object
        spaceuri = cfg.value(predicate=RDF.type, object=COIN.URISpace)
        self.minter = URIMinter(cfg, spaceuri)

    def createtest(self, basefile, basedir):
        # FIXME: This is mostly a cut'n paste of integrationLegalRef._test_parser
        testfile = os.path.dirname(
            __file__
        ) + "/../test/files/" + self.alias + "/" + basefile + ".txt"
        encoding = 'windows-1252'
        with codecs.open(testfile, encoding=encoding) as fp:
            testdata = fp.read()
        parts = re.split('\r?\n\r?\n', testdata, 1)
        testdata = parts[0]
        test_paras = re.split('\r?\n---\r?\n', testdata)

        # first: run it five times with timeit to get a good average exec time
        elapsed = timeit(functools.partial(self.run_with_timeit, test_paras),
                         number=5,
                         globals=globals())
        # then: run it a sixth time to get at the return value
        body = self.run_with_timeit(test_paras)
        return elapsed, extractrefs(body)

    def run_with_timeit(self, test_paras):
        body = []
        for para in test_paras:
            if para.startswith("RESET:"):
                self.parser.currentlynamedlaws.clear()
            if para.startswith("NOBASE:"):
                baseuri_attributes = {}
            else:
                baseuri_attributes = {'law': '9999:999'}
            nodes = self.parser.parse(para, self.minter, self.metadata,
                                      baseuri_attributes)
            body.append(nodes)
        return body

    timetest = createtest
Ejemplo n.º 16
0
class LNMediaWiki(MediaWiki):
    namespaces = SwedishLegalSource.namespaces

    from ferenda.sources.legal.se.legalref import LegalRef

    p = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN,
                 LegalRef.RATTSFALL)

    keyword_class = LNKeyword

    lang = "sv"

    def __init__(self, config=None, **kwargs):
        super(LNMediaWiki, self).__init__(config, **kwargs)
        if self.config._parent and hasattr(self.config._parent, "sfs"):
            self.sfsrepo = SFS(self.config._parent.sfs)
        else:
            self.sfsrepo = SFS()

    def get_wikisettings(self):
        settings = LNSettings(lang=self.lang)
        # NOTE: The settings object (the make_url method) only needs
        # access to the canonical_uri method.
        settings.make_sfs_url = self.sfsrepo.canonical_uri
        settings.make_keyword_url = self.keywordrepo.canonical_uri
        return settings

    def get_wikisemantics(self, parser, settings):
        return LNSemantics(parser, settings)

    def canonical_uri(self, basefile):
        if basefile.startswith("SFS/") or basefile.startswith("SFS:"):
            # "SFS/1998:204" -> "1998:204"
            return self.sfsrepo.canonical_uri(basefile[4:])
        else:
            return super(LNMediaWiki, self).canonical_uri(basefile)

    def postprocess(self, doc, xhtmltree):
        # if SFS mode:
        # create a div for root content
        # find all headers, create div for everything there
        if doc.basefile.startswith("SFS/") or doc.basefile.startswith("SFS:"):
            self.postprocess_commentary(doc, xhtmltree)
            toplevel_property = False
        else:
            toplevel_property = True
        body = super(LNMediaWiki,
                     self).postprocess(doc,
                                       xhtmltree,
                                       toplevel_property=toplevel_property)
        citparser = SwedishCitationParser(self.p, self.config.url)
        citparser.parse_recursive(body, predicate=None)
        return body

    def postprocess_commentary(self, doc, xhtmltree):
        uri = doc.uri
        body = xhtmltree.getchildren()[0]
        newbody = etree.Element("body")

        curruri = uri
        currdiv = etree.SubElement(newbody, "div")
        currdiv.set("about", curruri)
        currdiv.set("property", "dcterms:description")
        currdiv.set("datatype", "rdf:XMLLiteral")
        containerdiv = etree.SubElement(currdiv, "div")
        for child in body.getchildren():
            if child.tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
                # remove that <span> element that Semantics._h_el adds for us
                assert child[
                    0].tag == "span", "Header subelement was %s not span" % child[
                        0].tag
                child.text = child[0].text
                child.remove(child[0])
                if child.text:
                    if isinstance(child.text, bytes):
                        txt = child.text.decode("utf-8")
                    else:
                        txt = child.text
                    nodes = self.p.parse(txt, curruri)
                    curruri = nodes[0].uri
                # body.remove(child)
                newbody.append(child)
                currdiv = etree.SubElement(newbody, "div")
                currdiv.set("about", curruri)
                currdiv.set("property", "dcterms:description")
                currdiv.set("datatype", "rdf:XMLLiteral")
                # create a containerdiv under currdiv for reasons
                containerdiv = etree.SubElement(currdiv, "div")
            else:
                # body.remove(child)
                currdiv[0].append(child)
        xhtmltree.remove(body)
        xhtmltree.append(newbody)
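
For orientation, a rough sketch of the structure postprocess_commentary aims to build (attribute names are taken from the code above; the header text and URI are hypothetical):

# <body>
#   <h1>3 §</h1>        <- header text kept, the inner <span> removed
#   <div about="http://rinfo.lagrummet.se/publ/sfs/1986:223#P3"
#        property="dcterms:description" datatype="rdf:XMLLiteral">
#     <div>
#       ...the commentary elements that followed the header...
#     </div>
#   </div>
#   ...one such div per header...
# </body>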
Ejemplo n.º 17
0
 def parametric_test(self, datafile):
     p = LegalRef(LegalRef.LAGRUM)
     return self._test_parser(datafile, p)
Ejemplo n.º 18
0
 def parametric_test(self, datafile):
     p = LegalRef(LegalRef.MYNDIGHETSBESLUT)
     # p.verbose = True
     return self._test_parser(datafile, p)
Ejemplo n.º 19
0
 def parametric_test(self, datafile):
     p = LegalRef(LegalRef.FORARBETEN)
     return self._test_parser(datafile, p)
Ejemplo n.º 20
0
class EurlexCaselaw(DocumentRepository):

    """Handles all case law from the European Court of Justice (ECJ)."""
    alias = "ecj"  # European Court of Justice

    start_url = "http://eur-lex.europa.eu/JURISIndex.do"
    document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:%(basefile)s:EN:NOT"
    source_encoding = "utf-8"

    namespaces = ('rdf',
                  'dct',
                  ('eurlex', 'http://lagen.nu/eurlex#'))

    # This regexp is specific to caselaw (the leading '6' is for the
    # caselaw area).
    re_celexno = re.compile('(6)(\d{4})(\w\w?)(\d{4})(\(\d{2}\)|)')
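    # For a case-law celex number such as "61995A0091" (one of the numbers
    # cited in the metadata comments below), the regexp yields the groups
    # ('6', '1995', 'A', '0091', ''): sector, year, document type letter(s),
    # sequence number and an optional "(NN)" suffix.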

    def download(self, basefile=None):
        if basefile:
            self.download_single(basefile)
        if not self.config.force and 'startyear' in self.config:
            startyear = self.config.startyear
        else:
            startyear = 1954  # The first verdicts were published in this year
        for year in range(startyear, datetime.date.today().year + 1):
            # We use self.configfile directly rather than
            # self.moduleconfig, since the latter cannot be persisted
            # across sessions (as it is a subset of a composite
            # between the config file and command line options)
            self.config.startyear = year
            self.config.write()
            # FIXME: URL parameters may have changed -- this seems to produce
            # every case from `year` up to today
            list_url = "http://eur-lex.europa.eu/Result.do?T1=V6&T2=%d&T3=&RechType=RECH_naturel" % year
            self.log.debug("Searching for %d" % year)
            res = requests.get(list_url)
            pagecnt = 0
            done = False
            while not done:
                pagecnt += 1
                self.log.debug("Result page #%s" % pagecnt)
                # Don't parse using BeautifulSoup etc -- just search the whole damn text blob
                celexnos = self.re_celexno.findall(res.text)
                # FIXME: support for config.downloadmax
                for celexno in itertools.chain(celexnos):
                    # the number will be split up in components - concatenate
                    celexno = "".join(celexno)
                    # only download actual judgements and orders
                    # FIXME: the below is outdated -- now "TA" and "CN" (amongst others?) are used

                    # J: Judgment of the Court
                    # A: Judgment of the Court of First Instance
                    # W: Judgement of the Civil Service Tribunal
                    # T: (old) Judgement of the Court
                    # B: Order of the CFI
                    # O: Order of the ECJ
                    if ('J' in celexno or 'A' in celexno
                        or 'W' in celexno or 'T' in celexno
                            or 'B' in celexno or 'O' in celexno):
                        if self.download_single(celexno):
                            self.log.info("Downloaded %s" % celexno)
                        else:
                            self.log.info("Skipped %s" % celexno)
                    else:
                        pass
                        #self.log.debug("Not downloading doc %s" % celexno)

                # see if there are any "next" pages
                nextlinks = lxml.html.fromstring(res.text).xpath("//a[text()='>']")
                if nextlinks:
                    res = requests.get(nextlinks[0].get("href"))
                else:
                    self.log.info('No next page link found, we must be done')
                    done = True

    def parse_metadata_from_soup(self, soup, doc):
        # AVAILABLE METADATA IN CASES
        #
        # For now, we create a nonofficial eurlex vocab with namespace http://lagen.nu/eurlex#
        # - celex number (first h1) :celex (:celexnum?)
        #
        # - [Title and reference]
        #   - decision type and date "Judgment of the Court (Third Chamber) of 17 December 2009."
        #      :courtdecision (as opposed to :commissiondecision)
        #   - :party (or parties) "M v Agence européenne des médicaments (EMEA)."
        #   - :referingcourt "Reference for a preliminary ruling: Administrativen sad Sofia-grad - Bulgaria."
        #   - :legalissue - short description and/or(?) keywords (not always present, eg 62009J0403), hyphen sep:
        #     - "Review of the judgment in Case T-12/08 P"
        #     - "Whether the state of the proceedings permits final judgment to be given"
        #     - "Fair hearing"
        #     - "Rule that the parties should be heard"
        #     - "Whether the unity or consistency of Community law is affected."
        #   - :casenum Case number + unknown letters:
        #     - "Case C-197/09 RX-II."
        #     - "Joined cases T-117/03 to T-119/03 and T-171/03."
        #   - :casereporter Case reporter cite "European Court reports 2009 Page 00000"
        # - [Text]
        #   - :availablelang - Available languages ("bg", "es", "cs", "da" ....)
        # - :authenticlang - Authentic language ("fr" or "French")
        # - [Dates]
        #   - :decisiondate - Date of document (decision/judgement)
        #   - :applicationdate - Date of application
        # - [Classifications] (different from description/keywords above)
        #   - :subjectmatter Subject Matter, comma sep:
        #     - "Staff regulations and employment conditions - EC"
        #     - "Provisions governing the Institutions"
        #   - :directorycode - Case Law Directory Code (where is the full code list?), NL sep:
        #      - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #      - "B-20.05 EEC/EC / Acts of the institutions / Statement of the reasons on which a measure is based"
        #      - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid"
        #      - "B-09.04 EEC/EC / State aid / Review of aid by the Commission - Rules of procedure"
        # - [Miscellaneous information]
        #   - dct:author Author: "Court of Justice of the European Communities"
        #   - :form Form: "Judgement"
        # - [Procedure]
        #   - :proceduretype - Type of procedure, comma sep:
        #     - "Staff cases"
        #     - "Action for damages"
        #     - "Appeal"
        #     - "REEX=OB"
        #   - :applicant - Applicant: "Official"
        #   - :defendant - Defendant: "EMEA, Institutions"
        #   - :observation - Observations: "Italy, Poland, Member States, European Parliament, Council, Commission, Institutions"
        #   - :judgerapporteur - Judge-Rapporteur: "von Danwitz"
        #   - :advocategeneral - Advocate General: "Mazák"
        # - [Relationships between documents]
        #   - :treaty Treaty: "European Communities"
        #   - :caseaffecting Case affecting, NL-sep:
        #     - "Interprets [CELEXNO + pinpoint]"
        #     - "Declares void 61995A0091"
        #     - "Confirms 31996D0666"
        #   - :"Instruments cited in case law" (celex numbers with pinpoint locations?), nl-sep
        #     - "12001C/PRO/02-A61"
        #     - "12001C/PRO/02-NA13P1"
        #     - "31991Q0530-A114"
        #     - "62007K0023"
        #     - "62008A0012"
        #
        # convenience functions -- should not be needed now that we have Describer
        # def add_literal(predicate, literal):
        #     g.add((URIRef(uri),
        #            voc[predicate],
        #            Literal(literal, lang=lang)))
        #
        # def add_celex_object(predicate, celexno):
        #     g.add((URIRef(uri),
        #            voc[predicate],
        #            URIRef("http://lagen.nu/ext/celex/%s" % celexno)))
        #
        # def get_predicate(predicate):
        #     predicates = list(g.objects(URIRef(uri), voc[predicate]))
        #     return predicates != []
        #
        # These are a series of refinements for the "Affecting"
        # relationship. "Cites" doesn't have these (or similar), but
        # "is affected by" has (the inverse properties)
        affects_predicates = {"Interprets": "interprets",
                              "Interprets the judgment":
                              "interpretsJudgment",
                              "Declares void": "declaresVoid",
                              "Confirms": "confirms",
                              "Declares valid (incidentally)":
                              "declaresValidIncidentally",
                              "Declares valid (by a preliminary ruling)":
                              "declaresValidByPreliminaryRuling",
                              "Incidentally declares invalid":
                              "declaresInvalidIncidentally",
                              "Declares invalid (by a preliminary ruling)":
                              "declaresInvalidByPreliminaryRuling",
                              "Amends": "amends",
                              "Failure concerning": "failureConcerning"}

        isaffected_predicates = {"Interpreted by": "interpretedBy",
                                 "Confirmed by": "confirmedBy",
                                 "Declared void by": "declaredVoidBy",
                                 "Annulment requested by":
                                 "annulmentRequestedBy"}

        # 1. Express metadata about our document as a RDF graph
        desc = Describer(self.meta, self.uri)
        g = Graph()
        # :celex - first <h1>
        celexnum = soup.h1.get_text(strip=True)
        if celexnum == "No documents matching criteria.":
            raise errors.DocumentRemovedError("No documents matching criteria " + celexnum)
        elif "no_data_found" in celexnum:
            self.log.warning(
                "%s: No data found (try re-downloading)!" % doc.basefile)
            raise errors.DocumentRemovedError("No data found!")

        assert celexnum == doc.basefile, "Celex number in file (%s) differs from filename (%s)" % (
            celexnum, doc.basefile)
        doc.lang = soup.html['lang']

        m = self.re_celexno.match(celexnum)
        # FIXME: this list is outdated!
        rdftype = {'J': voc['Judgment'],
                   'A': voc['JudgmentFirstInstance'],
                   'W': voc['JudgmentCivilService'],
                   'O': voc['Order'],
                   'B': voc['OrderCivilService']}[m.group(3)]

        desc.rdftype(rdftype)
        desc.value(self.ns['eurlex'].celexnum, celexnum)

        # The first section, following <h2>Title and reference</h2>,
        # contains :courtdecision, :party (one or two items),
        # :referingcourt (optional), :legalissue (list of strings),
        # :casenum and :casereporter. Since some are optional, we apply
        # a few heuristics to figure out what we're looking at at any
        # given moment.
        for section in soup.findAll(["h1", "h2"]):
            if section.name == "h1" and section.a and section.a.string == "Text":
                break
            if section.string == "Title and reference":
                for para in section.findNextSiblings("p"):
                    if not para.string:
                        continue
                    string = para.string.strip()

                    # optional: do sanitychecks to see if this really is a :courtdecision
                    if not get_predicate('courtdecision'):
                        add_literal('courtdecision', string)
                    elif not get_predicate('party'):
                        # this will be one or two items. Are they position dependent?
                        for party in string.split(" v "):
                            add_literal('party', party)
                    elif (not get_predicate('referingcourt') and
                          (string.startswith("Reference for a preliminary ruling") or
                           string.startswith("Preliminary ruling requested"))):
                        add_literal('referingcourt', string)
                    elif (not get_predicate('casenum') and
                          (string.lower().startswith("case ") or
                           string.lower().startswith("joined cases "))):
                        add_literal('casenum', string)
                    elif para.em:  # :casereporter is enclosed in an em
                        for row in para.findAll(text=True):
                            add_literal('casereporter', row.strip())
                    elif get_predicate('legalissue'):
                        # fixme: Split this up somehow
                        add_literal('legalissue', string)
            elif section.string == "Relationship between documents":
                for item in section.findNextSibling("ul").findAll("li"):
                    predicate = None
                    subpredicate = None
                    for node in item.childGenerator():
                        if not hasattr(node, "name"):
                            nodetext = node.strip()
                            if re.match("([ABCDEFGIJKLNPRST]+\d*)+$", nodetext):
                                continue
                            if re.match("\d[\d\-]*[ABC]?$", nodetext):
                                continue
                            if predicate == "affects" and nodetext:
                                if nodetext in affects_predicates:
                                    subpredicate = affects_predicates[nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as an affects predicate" % nodetext)
                            elif predicate == "isaffected" and nodetext:
                                if nodetext in isaffected_predicates:
                                    subpredicate = isaffected_predicates[
                                        nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as an isaffected predicate" % nodetext)

                        elif node.name == "strong":
                            subpredicate = None
                            if node.string == "Treaty:":
                                predicate = "treaty"
                            elif node.string == "Affected by case:":
                                predicate = "isaffected"
                            elif node.string == "Case affecting:":
                                predicate = "affects"
                            elif node.string == "Instruments cited in case law:":
                                predicate = "cites"
                            else:
                                self.log.warning("Don't know how to handle key '%s'" % node.string)
                        elif node.name == "a" and predicate:
                            p = predicate
                            if subpredicate:
                                p = subpredicate
                            # FIXME: If the
                            # predicate is "cites", the celex number
                            # may have extra crap
                            # (eg. "31968R0259(01)-N2A1L6") indicating
                            # pinpoint location. Transform these to a
                            # fragment identifier.
                            add_celex_object(p, node.string.strip())

    def parse_document_from_soup(self, soup, doc):
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)

        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(node.string,
                                                 predicate="dct:references")
                    doc.body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % celexnum)
            doc.body.append(Paragraph(["(No fulltext available)"]))
Ejemplo n.º 21
0
 def parametric_test(self, datafile):
     p = LegalRef(LegalRef.EULAGSTIFTNING)
     return self._test_parser(datafile, p)
Ejemplo n.º 22
0
 def forarbete_parser(self):
     return SwedishCitationParser(LegalRef(LegalRef.FORARBETEN),
                                  self.minter, self.commondata)
Ejemplo n.º 23
0
    def parse_from_textreader(self, reader, basefile):
        tracelog = logging.getLogger("%s.tracelog" % self.alias)

        doc = self.make_document(basefile)
        g = doc.meta

        # 1.2: Load known entities and their URIs (we have to add some
        # that are not yet in the official resource lists)
        resource_list_file = self.store.path("resourcelist", "intermediate", ".rdf")
        if not os.path.exists(resource_list_file):
            self.download_resource_lists("http://service.lagrummet.se/var/common", resource_list_file)
        resources = Graph()
        resources.parse(resource_list_file, format="xml")

        # 1.3: Define regexps for the data we search for.
        fwdtests = {
            "dct:issn": ["^ISSN (\d+\-\d+)$"],
            "dct:title": ["((?:Föreskrifter|[\w ]+s (?:föreskrifter|allmänna råd)).*?)\n\n"],
            "dct:identifier": ["^([A-ZÅÄÖ-]+FS\s\s?\d{4}:\d+)$"],
            "rpubl:utkomFranTryck": ["Utkom från\strycket\s+den\s(\d+ \w+ \d{4})"],
            "rpubl:omtryckAv": ["^(Omtryck)$"],
            "rpubl:genomforDirektiv": ["Celex (3\d{2,4}\w\d{4})"],
            "rpubl:beslutsdatum": ["(?:har beslutats|beslutade|beslutat) den (\d+ \w+ \d{4})"],
            "rpubl:beslutadAv": [
                "\n([A-ZÅÄÖ][\w ]+?)\d? (?:meddelar|lämnar|föreskriver)",
                "\s(?:meddelar|föreskriver) ([A-ZÅÄÖ][\w ]+?)\d?\s",
            ],
            "rpubl:bemyndigande": [
                " ?(?:meddelar|föreskriver|Föreskrifterna meddelas|Föreskrifterna upphävs)\d?,? (?:följande |)med stöd av\s(.*?) ?(?:att|efter\ssamråd|dels|följande|i fråga om|och lämnar allmänna råd|och beslutar följande allmänna råd|\.\n)",
                "^Med stöd av (.*)\s(?:meddelar|föreskriver)",
            ],
        }
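        # As an illustration (the exact page text is hypothetical): a line
        # consisting solely of "FFFS 2007:1" would be captured by the
        # dct:identifier pattern, and "Utkom från trycket den 5 januari 2007"
        # by the rpubl:utkomFranTryck pattern.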

        # 2: Find metadata properties

        # 2.1 Find some of the properties on the first page (or the
        # 2nd, or 3rd... continue past TOC pages, cover pages etc
        # until the "real" first page is found) NB: FFFS 2007:1 has
        # ten (10) TOC pages!
        pagecnt = 0
        for page in reader.getiterator(reader.readpage):
            # replace single newlines with spaces, but keep double
            # newlines
            # page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
            pagecnt += 1
            props = {}
            for (prop, tests) in list(fwdtests.items()):
                if prop in props:
                    continue
                for test in tests:
                    m = re.search(test, page, re.MULTILINE | re.DOTALL | re.UNICODE)
                    if m:
                        props[prop] = util.normalize_space(m.group(1))
            # Single required property. If we find this, we're done
            if "rpubl:beslutsdatum" in props:
                break
            self.log.warning("%s: Couldn't find required props on page %s" % (basefile, pagecnt))

        # 2.2 Find some of the properties on the last 'real' page (not
        # counting appendicies)
        reader.seek(0)
        pagesrev = reversed(list(reader.getiterator(reader.readpage)))
        # The language used to express these two properties differs
        # quite a lot -- more than is reasonable to capture in a
        # single regex. We therefore define a set of possible
        # expressions and try them in turn.
        revtests = {
            "rpubl:ikrafttradandedatum": [
                "(?:Denna författning|Dessa föreskrifter|Dessa allmänna råd|Dessa föreskrifter och allmänna råd)\d* träder i ?kraft den (\d+ \w+ \d{4})",
                "Dessa föreskrifter träder i kraft, (?:.*), i övrigt den (\d+ \w+ \d{4})",
                "ska(?:ll|)\supphöra att gälla (?:den |)(\d+ \w+ \d{4}|denna dag|vid utgången av \w+ \d{4})",
                "träder i kraft den dag då författningen enligt uppgift på den (utkom från trycket)",
            ],
            "rpubl:upphaver": [
                "träder i kraft den (?:\d+ \w+ \d{4}), då(.*)ska upphöra att gälla",
                "ska(?:ll|)\supphöra att gälla vid utgången av \w+ \d{4}, nämligen(.*?)\n\n",
                "att (.*) skall upphöra att gälla (denna dag|vid utgången av \w+ \d{4})",
            ],
        }

        cnt = 0
        for page in pagesrev:
            cnt += 1
            # Normalize the whitespace in each paragraph so that a
            # linebreak in the middle of the natural language
            # expression doesn't break our regexes.
            page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])

            for (prop, tests) in list(revtests.items()):
                if prop in props:
                    continue
                for test in tests:
                    # Not re.DOTALL -- we've normalized whitespace and
                    # don't want to match across paragraphs
                    m = re.search(test, page, re.MULTILINE | re.UNICODE)
                    if m:
                        props[prop] = util.normalize_space(m.group(1))
                        # print u"%s: '%s' resulted in match '%s' at page %s from end" %
                        # (prop,test,props[prop], cnt)

            # Single required property. If we find this, we're done
            if "rpubl:ikrafttradandedatum" in props:
                break

        # 3: Clean up data - converting strings to Literals or
        # URIRefs, find legal references, etc
        if "dct:identifier" in props:
            (publication, year, ordinal) = re.split("[ :]", props["dct:identifier"])
            # FIXME: Read resources graph instead
            fs = resources.value(predicate=self.ns["skos"].altLabel, object=Literal(publication, lang="sv"))
            props["rpubl:forfattningssamling"] = fs
            publ = resources.value(subject=fs, predicate=self.ns["dct"].publisher)
            props["dct:publisher"] = publ

            props["rpubl:arsutgava"] = Literal(year)  # conversion to int, date not needed
            props["rpubl:lopnummer"] = Literal(ordinal)
            props["dct:identifier"] = Literal(props["dct:identifier"])

            # Now we can mint the uri (should be done through LegalURI)
            uri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                props["rpubl:forfattningssamling"].split("/")[-1],
                props["rpubl:arsutgava"],
                props["rpubl:lopnummer"],
            )
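            # e.g. (hypothetical) a dct:identifier of "FFFS 2007:1" would
            # yield something like http://rinfo.lagrummet.se/publ/<fs-slug>/2007:1,
            # where <fs-slug> is the last segment of the rpubl:forfattningssamling
            # resource URI looked up above.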
            self.log.debug("URI: %s" % uri)
        else:
            self.log.error("Couldn't find dct:identifier, cannot create URI, giving up")
            return None

        tracelog.info("Cleaning rpubl:beslutadAv")
        if "rpubl:beslutadAv" in props:
            agency = resources.value(
                predicate=self.ns["foaf"].name, object=Literal(props["rpubl:beslutadAv"], lang="sv")
            )
            if agency:
                props["rpubl:beslutadAv"] = agency
            else:
                self.log.warning("Cannot find URI for rpubl:beslutadAv value %r" % props["rpubl:beslutadAv"])
                del props["rpubl:beslutadAv"]

        tracelog.info("Cleaning dct:issn")
        if "dct:issn" in props:
            props["dct:issn"] = Literal(props["dct:issn"])

        tracelog.info("Cleaning dct:title")

        # common false positive
        if "dct:title" in props and "denna f\xf6rfattning har beslutats den" in props["dct:title"]:
            del props["dct:title"]

        if "dct:title" in props:
            tracelog.info("Inspecting dct:title %r" % props["dct:title"])
            # sometimes the title isn't separated with two newlines from the rest of the text
            if "\nbeslutade den " in props["dct:title"]:
                props["dct:title"] = props["dct:title"].split("\nbeslutade den ")[0]
            props["dct:title"] = Literal(util.normalize_space(props["dct:title"]), lang="sv")

            if re.search("^(Föreskrifter|[\w ]+s föreskrifter) om ändring i ", props["dct:title"], re.UNICODE):
                tracelog.info("Finding rpubl:andrar in dct:title")
                orig = re.search("([A-ZÅÄÖ-]+FS \d{4}:\d+)", props["dct:title"]).group(0)
                (publication, year, ordinal) = re.split("[ :]", orig)
                origuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                    self.rpubl_uri_transform(publication),
                    year,
                    ordinal,
                )
                props["rpubl:andrar"] = URIRef(origuri)
                if "rpubl:omtryckAv" in props:
                    props["rpubl:omtryckAv"] = URIRef(origuri)
            if (
                re.search("^(Föreskrifter|[\w ]+s föreskrifter) om upphävande av", props["dct:title"], re.UNICODE)
                and not "rpubl:upphaver" in props
            ):
                tracelog.info("Finding rpubl:upphaver in dct:title")
                props["rpubl:upphaver"] = six.text_type(props["dct:title"])  # cleaned below

        tracelog.info("Cleaning date properties")
        for prop in ("rpubl:utkomFranTryck", "rpubl:beslutsdatum", "rpubl:ikrafttradandedatum"):
            if prop in props:
                if props[prop] == "denna dag" and prop == "rpubl:ikrafttradandedatum":
                    props[prop] = props["rpubl:beslutsdatum"]
                elif props[prop] == "utkom från trycket" and prop == "rpubl:ikrafttradandedatum":
                    props[prop] = props["rpubl:utkomFranTryck"]
                else:
                    props[prop] = Literal(self.parse_swedish_date(props[prop].lower()))

        tracelog.info("Cleaning rpubl:genomforDirektiv")
        if "rpubl:genomforDirektiv" in props:
            props["rpubl:genomforDirektiv"] = URIRef(
                "http://rinfo.lagrummet.se/ext/eur-lex/%s" % props["rpubl:genomforDirektiv"]
            )

        tracelog.info("Cleaning rpubl:bemyndigande")
        has_bemyndiganden = False

        if "rpubl:bemyndigande" in props:
            # SimpleParse can't handle unicode endash sign, transform
            # into regular ascii hyphen
            props["rpubl:bemyndigande"] = props["rpubl:bemyndigande"].replace("\u2013", "-")
            parser = LegalRef(LegalRef.LAGRUM)
            result = parser.parse(props["rpubl:bemyndigande"])
            bemyndigande_uris = [x.uri for x in result if hasattr(x, "uri")]

            # some of these uris need to be filtered away due to
            # over-matching by parser.parse
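            # e.g. (hypothetical) if parse() matched both
            # http://rinfo.lagrummet.se/publ/sfs/1986:223 and
            # http://rinfo.lagrummet.se/publ/sfs/1986:223#K2P7, the shorter
            # prefix URI is dropped here and only the more specific one kept.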
            filtered_bemyndigande_uris = []
            for bem_uri in bemyndigande_uris:
                keep = True
                for compare in bemyndigande_uris:
                    if len(compare) > len(bem_uri) and compare.startswith(bem_uri):
                        keep = False
                if keep:
                    filtered_bemyndigande_uris.append(bem_uri)

            for bem_uri in filtered_bemyndigande_uris:
                g.add((URIRef(uri), self.ns["rpubl"]["bemyndigande"], URIRef(bem_uri)))
                has_bemyndiganden = True
            del props["rpubl:bemyndigande"]

        tracelog.info("Cleaning rpubl:upphaver")
        if "rpubl:upphaver" in props:
            for upph in re.findall("([A-ZÅÄÖ-]+FS \d{4}:\d+)", util.normalize_space(props["rpubl:upphaver"])):
                (publication, year, ordinal) = re.split("[ :]", upph)
                upphuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (publication.lower(), year, ordinal)
                g.add((URIRef(uri), self.ns["rpubl"]["upphaver"], URIRef(upphuri)))
            del props["rpubl:upphaver"]

        tracelog.info("Deciding rdf:type")
        if "dct:title" in props and "allmänna råd" in props["dct:title"] and not "föreskrifter" in props["dct:title"]:
            props["rdf:type"] = self.ns["rpubl"]["AllmannaRad"]
        else:
            props["rdf:type"] = self.ns["rpubl"]["Myndighetsforeskrift"]

        # 3.5: Check to see that we have all properties that we expect
        # (should maybe be done elsewhere later?)
        tracelog.info("Checking required properties")
        for prop in (
            "dct:identifier",
            "dct:title",
            "rpubl:arsutgava",
            "dct:publisher",
            "rpubl:beslutadAv",
            "rpubl:beslutsdatum",
            "rpubl:forfattningssamling",
            "rpubl:ikrafttradandedatum",
            "rpubl:lopnummer",
            "rpubl:utkomFranTryck",
        ):
            if not prop in props:
                self.log.warning("%s: Failed to find %s" % (basefile, prop))

        tracelog.info("Checking rpubl:bemyndigande")
        if props["rdf:type"] == self.ns["rpubl"]["Myndighetsforeskrift"]:
            if not has_bemyndiganden:
                self.log.warning("%s: Failed to find rpubl:bemyndigande" % (basefile))

        # 4: Add the cleaned data to a RDFLib Graph
        # (maybe we should do that as early as possible?)
        tracelog.info("Adding items to rdflib.Graph")
        for (prop, value) in list(props.items()):
            (prefix, term) = prop.split(":", 1)
            p = self.ns[prefix][term]
            if not (isinstance(value, URIRef) or isinstance(value, Literal)):
                self.log.warning("%s: %s is a %s, not a URIRef or Literal" % (basefile, prop, type(value)))
            g.add((URIRef(uri), p, value))

        # 5: Create data for the body, removing various control characters
        # TODO: Use pdftohtml to create a nice viewable HTML
        # version instead of this plaintext stuff
        reader.seek(0)
        body = []

        # A fairly involved way of filtering out all control
        # characters from a string
        import unicodedata

        if six.PY3:
            all_chars = (chr(i) for i in range(0x10000))
        else:
            all_chars = (unichr(i) for i in range(0x10000))
        control_chars = "".join(c for c in all_chars if unicodedata.category(c) == "Cc")
        # tab and newline are technically Control characters in
        # unicode, but we want to keep them.
        control_chars = control_chars.replace("\t", "").replace("\n", "")

        control_char_re = re.compile("[%s]" % re.escape(control_chars))
        for page in reader.getiterator(reader.readpage):
            text = xml_escape(control_char_re.sub("", page))
            body.append("<pre>%s</pre>\n\n" % text)

        # 6: Done!
        #
        doc.body = body
        doc.lang = "sv"
        doc.uri = uri
        return doc