def __init__(self, repos, inifile=None, **kwargs):
    super(WSGIApp, self).__init__(repos, inifile, **kwargs)
    sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0]
    self.parser = SwedishCitationParser(
        LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                 LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT),
        sfsrepo.minter,
        sfsrepo.commondata,
        allow_relative=True)
    graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"),
                          format="turtle")
    self.lagforkortningar = [str(o) for s, o in
                             graph.subject_objects(DCTERMS.alternate)]
    self.paragraflag = []
    for s, o in graph.subject_objects(DCTERMS.alternate):
        basefile = sfsrepo.basefile_from_uri(str(s))
        distilledpath = sfsrepo.store.distilled_path(basefile)
        firstpara_uri = str(s) + "#P1"
        needle = '<rpubl:Paragraf rdf:about="%s">' % firstpara_uri
        if os.path.exists(distilledpath) and needle in util.readfile(distilledpath):
            self.paragraflag.append(str(o).lower())
    self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)]
    self.lagforkortningar_regex = "|".join(
        sorted(self.lagforkortningar, key=len, reverse=True))
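# A minimal sketch (with hypothetical abbreviations, not taken from sfs.ttl)
# of why lagforkortningar_regex above joins the abbreviations longest-first:
# re's alternation is first-match-wins, so a short abbreviation that is a
# prefix of a longer one would otherwise shadow it.
import re

forkortningar = ["RF", "RFL"]
naive = "|".join(forkortningar)                                         # 'RF|RFL'
longest_first = "|".join(sorted(forkortningar, key=len, reverse=True))  # 'RFL|RF'
print(re.search(naive, "enligt RFL").group())          # RF  (too short)
print(re.search(longest_first, "enligt RFL").group())  # RFL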
def parser(self):
    p = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                 LegalRef.FORARBETEN, LegalRef.RATTSFALL)
    # self.commondata needs to include extra/sfs.ttl
    # somehow. This is probably not the best way.
    with self.resourceloader.open("extra/sfs.ttl") as fp:
        self.commondata.parse(data=fp.read(), format="turtle")
    # actually, to mint URIs for rattsfall we need the
    # skos:altLabel for the rpubl:Rattsfallspublikation -- so we
    # need everything
    with self.resourceloader.open("extra/swedishlegalsource.ttl") as fp:
        self.commondata.parse(data=fp.read(), format="turtle")
    return SwedishCitationParser(p, self.minter, self.commondata,
                                 allow_relative=True)
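# A minimal rdflib sketch (with a hypothetical publication URI) of why
# extra/swedishlegalsource.ttl is loaded above: minting URIs for rättsfall
# needs the skos:altLabel of each rpubl:Rattsfallspublikation resource.
from rdflib import Graph, Namespace, URIRef

SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
ttl = """
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
<http://example.org/publ/nja> skos:altLabel "NJA" .
"""
g = Graph().parse(data=ttl, format="turtle")
print(g.value(URIRef("http://example.org/publ/nja"), SKOS.altLabel))  # NJA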
class DV(SwedishLegalSource): alias = "dv" downloaded_suffix = ".zip" rdf_type = RPUBL.Rattsfallsreferat documentstore_class = DVStore namespaces = ('rdf', # always needed 'dct', # title, identifier, etc 'xsd', # datatypes 'owl', # : sameAs ('rpubl', 'http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#') ) DCT = Namespace(util.ns['dct']) def get_default_options(self): opts = super(DV, self).get_default_options() opts['ftpuser'] = None opts['ftppassword'] = None return opts # FIXME: store.list_basefiles_for("parse") must be fixed to handle two # different suffixes. Maybe store.downloaded_path() as well, so that # it returns .docx if a .docx file indeed exists, and .doc otherwise. # But this case (where documents can be in two (or more) formats depending # on age isn't uncommon, maybe DocumentStore should support it natively # (like with optional suffix parameter to download_path)? def download(self): # recurse =~ download everything, which we do if force is # specified OR if we've never downloaded before recurse = False if self.config.force or not self.config.lastdownload: recurse = True self.downloadcount = 0 # number of files extracted from zip files # (not number of zip files) try: if self.config.ftpuser: self.download_ftp("", recurse, self.config.ftpuser, self.config.ftppassword) else: self.download_www("", recurse) except MaxDownloadsReached: # ok we're done! pass def download_ftp(self, dirname, recurse, user, password, connection=None): self.log.debug('Listing contents of %s' % dirname) lines = [] if not connection: connection = FTP('ftp.dom.se') connection.login(user, password) connection.cwd(dirname) connection.retrlines('LIST', lines.append) for line in lines: parts = line.split() filename = parts[-1].strip() if line.startswith('d') and recurse: self.download(filename, recurse) elif line.startswith('-'): basefile = os.path.splitext(filename)[0] if dirname: basefile = dirname + "/" + basefile localpath = self.store.downloaded_path(basefile) if os.path.exists(localpath) and not self.config.force: pass # we already got this else: util.ensure_dir(localpath) self.log.debug('Fetching %s to %s' % (filename, localpath)) connection.retrbinary('RETR %s' % filename, # FIXME: retrbinary calls .close()? open(localpath, 'wb').write) self.process_zipfile(localpath) connection.cwd('/') def download_www(self, dirname, recurse): url = 'https://lagen.nu/dv/downloaded/%s' % dirname self.log.debug('Listing contents of %s' % url) resp = requests.get(url) iterlinks = lxml.html.document_fromstring(resp.text).iterlinks() for element, attribute, link, pos in iterlinks: if link.startswith("/"): continue elif link.endswith("/") and recurse: self.download_www(link, recurse) elif link.endswith(".zip"): basefile = os.path.splitext(link)[0] if dirname: basefile = dirname + basefile localpath = self.store.downloaded_path(basefile) if os.path.exists(localpath) and not self.config.force: pass # we already got this else: absolute_url = urljoin(url, link) self.log.debug('Fetching %s to %s' % (link, localpath)) resp = requests.get(absolute_url) with self.store.open_downloaded(basefile, "wb") as fp: fp.write(resp.content) self.process_zipfile(localpath) # eg. HDO_T3467-96.doc or HDO_T3467-96_1.doc re_malnr = re.compile(r'([^_]*)_([^_\.]*)_?(\d*)(\.docx?)') # eg. 
HDO_T3467-96_BYTUT_2010-03-17.doc or # HDO_T3467-96_BYTUT_2010-03-17_1.doc re_bytut_malnr = re.compile( r'([^_]*)_([^_\.]*)_BYTUT_\d+-\d+-\d+_?(\d*)(\.docx?)') re_tabort_malnr = re.compile( r'([^_]*)_([^_\.]*)_TABORT_\d+-\d+-\d+_?(\d*)(\.docx?)') def process_zipfile(self, zipfilename): removed = replaced = created = untouched = 0 zipf = zipfile.ZipFile(zipfilename, "r") for bname in zipf.namelist(): if not isinstance(bname, str): # py2 # Files in the zip file are encoded using codepage 437 name = bname.decode('cp437') else: name = bname if "_notis_" in name: continue name = os.path.split(name)[1] if 'BYTUT' in name: m = self.re_bytut_malnr.match(name) elif 'TABORT' in name: m = self.re_tabort_malnr.match(name) else: m = self.re_malnr.match(name) if m: (court, malnr, referatnr, suffix) = ( m.group(1), m.group(2), m.group(3), m.group(4)) assert ((suffix == ".doc") or (suffix == ".docx") ), "Unknown suffix %s in %r" % (suffix, name) if referatnr: basefile = "%s/%s_%s" % (court, malnr, referatnr) else: basefile = "%s/%s" % (court, malnr) outfile = self.store.path(basefile, 'downloaded', suffix) if "TABORT" in name: self.log.info("%s: Removing" % basefile) if not os.path.exists(outfile): self.log.warning("%s: %s doesn't exist" % (basefile, outfile)) else: os.unlink(outfile) removed += 1 else: if "BYTUT" in name: self.log.info("%s: Replacing with new" % basefile) if not os.path.exists(outfile): self.log.warning("%s: %s doesn't exist" % (basefile, outfile)) replaced += 1 else: self.log.info("%s: Unpacking" % basefile) if os.path.exists(outfile): untouched += 1 continue else: created += 1 data = zipf.read(bname) with self.store.open(basefile, "downloaded", suffix, "wb") as fp: fp.write(data) # Make the unzipped files have correct timestamp zi = zipf.getinfo(bname) dt = datetime(*zi.date_time) ts = mktime(dt.timetuple()) os.utime(outfile, (ts, ts)) self.downloadcount += 1 if self.config.downloadmax and self.downloadcount >= self.config.downloadmax: raise MaxDownloadsReached() else: self.log.warning('Kunde inte tolka filnamnet %r i %s' % (name, os.path.relpath(zipfilename))) self.log.debug('Processade %s, skapade %s, bytte ut %s, tog bort %s, lät bli %s filer' % (os.path.relpath(zipfilename), created, replaced, removed, untouched)) re_NJAref = re.compile(r'(NJA \d{4} s\. \d+) \(alt. (NJA \d{4}:\d+)\)') re_delimSplit = re.compile("[;,] ?").split labels = {'Rubrik': DCT.description, 'Domstol': DCT['creator'], # konvertera till auktoritetspost 'Målnummer': RPUBL['malnummer'], 'Domsnummer': RPUBL['domsnummer'], 'Diarienummer': RPUBL['diarienummer'], 'Avdelning': RPUBL['domstolsavdelning'], 'Referat': DCT['identifier'], 'Avgörandedatum': RPUBL['avgorandedatum'], # konvertera till xsd:date } # Metadata som kan innehålla noll eller flera poster. # Litteratur/sökord har ingen motsvarighet i RPUBL-vokabulären multilabels = {'Lagrum': RPUBL['lagrum'], 'Rättsfall': RPUBL['rattsfallshanvisning'], # dct:references vore bättre, men sådana ska inte ha literalvärden 'Litteratur': DCT['relation'], 'Sökord': DCT['subject'] } # Listan härledd från containers.n3/rattsfallsforteckningar.n3 i # rinfoprojektets källkod - en ambitiösare lösning vore att # läsa in de faktiska N3-filerna i en rdflib-graf. 
publikationsuri = {'NJA': 'http://rinfo.lagrummet.se/ref/rff/nja', 'RH': 'http://rinfo.lagrummet.se/ref/rff/rh', 'MÖD': 'http://rinfo.lagrummet.se/ref/rff/mod', 'RÅ': 'http://rinfo.lagrummet.se/ref/rff/ra', 'RK': 'http://rinfo.lagrummet.se/ref/rff/rk', 'MIG': 'http://rinfo.lagrummet.se/ref/rff/mig', 'AD': 'http://rinfo.lagrummet.se/ref/rff/ad', 'MD': 'http://rinfo.lagrummet.se/ref/rff/md', 'FÖD': 'http://rinfo.lagrummet.se/ref/rff/fod'} domstolsforkortningar = {'ADO': 'http://lagen.nu/org/2008/arbetsdomstolen', 'HDO': 'http://lagen.nu/org/2008/hogsta-domstolen', 'HGO': 'http://lagen.nu/org/2008/gota-hovratt', 'HNN': 'http://lagen.nu/org/2008/hovratten-for-nedre-norrland', 'HON': 'http://lagen.nu/org/2008/hovratten-for-ovre-norrland', 'HSB': 'http://lagen.nu/org/2008/hovratten-over-skane-och-blekinge', 'HSV': 'http://lagen.nu/org/2008/svea-hovratt', 'HVS': 'http://lagen.nu/org/2008/hovratten-for-vastra-sverige', 'MDO': 'http://lagen.nu/org/2008/marknadsdomstolen', 'MIG': 'http://lagen.nu/org/2008/migrationsoverdomstolen', 'MÖD': 'http://lagen.nu/org/2008/miljooverdomstolen', 'REG': 'http://lagen.nu/org/2008/regeringsratten', 'KST': 'http://lagen.nu/org/2008/kammarratten-i-stockholm'} # This is information you can get from RDL, but we hardcode it for # now. slugs = {'Arbetsdomstolen': 'ad', 'Domstolsverket': 'dv', 'Göta hovrätt': 'hgo', 'Högsta domstolen': 'hd', 'Högsta förvaltningsdomstolen': 'hfd', 'Hovrätten för Nedre Norrland': 'hnn', 'Hovrätten för Övre Norrland': 'hon', 'Hovrätten för Västra Sverige': 'hvs', 'Hovrätten över Skåne och Blekinge': 'hsb', 'Justitiekanslern': 'jk', 'Kammarrätten i Göteborg': 'kgg', 'Kammarrätten i Jönköping': 'kjo', 'Kammarrätten i Stockholm': 'kst', 'Kammarrätten i Sundsvall': 'ksu', 'Marknadsdomstolen': 'md', 'Migrationsöverdomstolen': 'mig', 'Miljööverdomstolen': 'mod', 'Patentbesvärsrätten': 'pbr', 'Rättshjälpsnämnden': 'rhn', 'regr': 'Regeringsrätten', 'Statens ansvarsnämnd': 'san', 'Svea hovrätt': 'hsv'} @managedparsing def parse(self, doc): # FIXME: don't create these if they already exists self.lagrum_parser = LegalRef(LegalRef.LAGRUM) self.rattsfall_parser = LegalRef(LegalRef.RATTSFALL) docfile = self.store.downloaded_path(doc.basefile) intermediatefile = self.store.intermediate_path(doc.basefile) r = WordReader() intermediatefile, filetype = r.read(docfile, intermediatefile) with codecs.open(intermediatefile, encoding="utf-8") as fp: patchedtext, patchdesc = self.patch_if_needed(doc.basefile, fp.read()) # The second step is to mangle the crappy XML produced by # antiword (docbook) or Word 2007 (OOXML) into a nice pair of # structures. rawhead is a simple dict that we'll later transform # into a rdflib Graph. rawbody is a list of plaintext strings, each # representing a paragraph. # # long-term FIXME: WordReader should expose a unified # interface for handling both kinds of word files so that we # wouldn't need both parse_ooxml() and # parse_antiword_docbook(). This might require some other tool # than antiword for old .doc files, as this throws away a LOT # of info. 
if filetype == "docx": rawhead, rawbody = self.parse_ooxml(patchedtext, doc.basefile) else: rawhead, rawbody = self.parse_antiword_docbook(patchedtext, doc.basefile) doc.uri = self.polish_metadata(rawhead, doc) if patchdesc: doc.meta.add((URIRef(doc.uri), self.ns['ferenda'].patchdescription, patchdesc)) doc.body = self.format_body(rawbody) # FIXME: Write a # FSMParser to detect # high-level structure of # the document def parse_ooxml(self, text, basefile): soup = BeautifulSoup(text) for instrtext in soup.find_all("w:instrtext"): instrtext.decompose() head = {} # Högst uppe på varje domslut står domstolsnamnet ("Högsta # domstolen") följt av referatnumret ("NJA 1987 # s. 113"). firstfield = soup.find("w:t") # Ibland ärdomstolsnamnet uppsplittat på två # w:r-element. Bäst att gå på all text i # föräldra-w:tc-cellen firstfield = firstfield.find_parent("w:tc") head['Domstol'] = firstfield.get_text(strip=True) nextfield = firstfield.find_next("w:tc") head['Referat'] = nextfield.get_text(strip=True) # Hitta övriga enkla metadatafält i sidhuvudet for key in self.labels: node = soup.find(text=re.compile(key + ':')) if not node: # Sometimes these text fields are broken up # (eg "<w:t>Avgörand</w:t>...<w:t>a</w:t>...<w:t>tum</w:t>") # Use (ridiculous) fallback method nodes = soup.find_all('w:statustext', attrs={'w:val': key}) if nodes: node = nodes[-1] else: self.log.warning("%s: Couldn't find field %r" % (basefile, key)) continue txt = node.find_next("w:t").find_parent("w:p").get_text(strip=True) if txt: # skippa fält med tomma strängen-värden head[key] = txt # Hitta sammansatta metadata i sidhuvudet for key in ["Lagrum", "Rättsfall"]: node = soup.find(text=re.compile(key + ':')) if node: textnodes = node.find_parent('w:tc').find_next_sibling('w:tc') if not textnodes: continue items = [] for textnode in textnodes.find_all('w:t'): t = textnode.get_text(strip=True) if t: items.append(t) if items: head[key] = items # The main text body of the verdict body = [] for p in soup.find(text=re.compile('EFERAT')).find_parent('w:tr').find_next_sibling('w:tr').find_all('w:p'): ptext = '' for e in p.findAll("w:t"): ptext += e.string body.append(ptext) # Finally, some more metadata in the footer if soup.find(text=re.compile(r'Sökord:')): head['Sökord'] = soup.find( text=re.compile(r'Sökord:')).find_next('w:t').get_text(strip=True) if soup.find(text=re.compile('^\s*Litteratur:\s*$')): n = soup.find(text=re.compile('^\s*Litteratur:\s*$')) head['Litteratur'] = n.findNext('w:t').get_text(strip=True) return head, body def parse_antiword_docbook(self, text, basefile): soup = BeautifulSoup(text) head = {} header_elements = soup.find("para") header_text = '' for el in header_elements.contents: if hasattr(el, 'name') and el.name == "informaltable": break else: header_text += el.string # Högst uppe på varje domslut står domstolsnamnet ("Högsta # domstolen") följt av referatnumret ("NJA 1987 # s. 113"). Beroende på worddokumentet ser dock XML-strukturen # olika ut. 
Det vanliga är att informationen finns i en # pipeseparerad paragraf: parts = [x.strip() for x in header_text.split("|")] if len(parts) > 1: head['Domstol'] = parts[0] head['Referat'] = parts[1] else: # alternativ står de på första raden i en informaltable row = soup.find("informaltable").tgroup.tbody.row.findAll('entry') head['Domstol'] = row[0].get_text(strip=True) head['Referat'] = row[1].get_text(strip=True) # Hitta övriga enkla metadatafält i sidhuvudet for key in self.labels: node = soup.find(text=re.compile(key + ':')) if node: txt = node.find_parent('entry').find_next_sibling('entry').get_text(strip=True) if txt: head[key] = txt # Hitta sammansatta metadata i sidhuvudet for key in ["Lagrum", "Rättsfall"]: node = soup.find(text=re.compile(key + ':')) if node: head[key] = [] textchunk = node.find_parent( 'entry').find_next_sibling('entry').string for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]: if line: head[key].append(line) body = [] for p in soup.find(text=re.compile('REFERAT')).find_parent('tgroup').find_next_sibling('tgroup').find('entry').get_text(strip=True).split("\n\n"): body.append(p) # Hitta sammansatta metadata i sidfoten head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent( 'entry').next_sibling.next_sibling.get_text(strip=True) if soup.find(text=re.compile('^\s*Litteratur:\s*$')): n = soup.find(text=re.compile('^\s*Litteratur:\s*$')).find_parent( 'entry').next_sibling.next_sibling.get_text(strip=True) head['Litteratur'] = n return head, body def polish_metadata(self, head, doc): basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)') def basefile_to_referat(basefile): templ = {'ADO': 'AD %(year)s nr %(ordinal)s', 'MD': 'MD %(year)s:%(ordinal)s'} m = basefile_regex.match(basefile) if m: return templ[m.group("type")] % (m.groupdict()) def ref_to_uri(ref): # FIXME: We'd like to retire legalref and replace it with # pyparsing grammars. nodes = self.rattsfall_parser.parse(ref) uri = nodes[0].uri return localize_uri(uri) def dom_to_uri(domstol, malnr, avg): baseuri = self.config.url slug = self.slugs[domstol] return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals() def localize_uri(uri): if "publ/rattsfall" in uri: return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall", self.config.url + "res/dv") elif "publ/sfs/" in uri: return uri.replace("http://rinfo.lagrummet.se/publ/sfs", self.config.url + "res/sfs") def split_nja(value): # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86") return [x[:-1] for x in value.split("(")] def sokord_uri(value): return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_') # 0. create Referat key if not present if "Referat" not in head: # For some courts (MD, AD, MOD?, MIG?) this is possible head["Referat"] = basefile_to_referat(doc.basefile) # 1. mint uris and create the two Describers we'll use refuri = ref_to_uri(head["Referat"]) refdesc = Describer(doc.meta, refuri) domuri = dom_to_uri(head["Domstol"], head["Målnummer"], head["Avgörandedatum"]) domdesc = Describer(doc.meta, domuri) # 2. 
convert all strings in head to proper RDF for label, value in head.items(): if label == "Rubrik": value = util.normalize_space(value) refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv") domdesc.value(self.ns['dct'].title, value, lang="sv") elif label == "Domstol": domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value)) elif label == "Målnummer": domdesc.rel(self.ns['rpubl'].malnummer, value) elif label == "Domsnummer": domdesc.rel(self.ns['rpubl'].domsnummer, value) elif label == "Diarienummer": domdesc.rel(self.ns['rpubl'].diarienummer, value) elif label == "Avdelning": domdesc.rel(self.ns['rpubl'].avdelning, value) elif label == "Referat": for pred, regex in {'rattsfallspublikation': r'([^ ]+)', 'arsutgava': r'(\d{4})', 'lopnummer': r'\d{4}(?:\:| nr )(\d+)', 'sidnummer': r's.? ?(\d+)'}.items(): m = re.search(regex, value) if m: if pred == 'rattsfallspublikation': # "NJA" -> "http://lcaolhost:8000/coll/dv/nja" uri = self.config.url + "coll/dv/" + m.group(1).lower() refdesc.rel(self.ns['rpubl'][pred], uri) else: refdesc.value(self.ns['rpubl'][pred], m.group(1)) if value.startswith("NJA"): realvalue, extra = split_nja(value) ordinal = extra.split(" ")[1] refdesc.value(self.ns['dct'].bibliographicCitation, extra) refdesc.rel(self.ns['owl'].sameAs, self.config.url + "res/dv/nja/" + ordinal) refdesc.value(self.ns['dct'].identifier, realvalue) else: refdesc.value(self.ns['dct'].identifier, value) elif label == "Avgörandedatum": with util.c_locale(): d = datetime.strptime(value, '%Y-%m-%d') domdesc.value(self.ns['rpubl'].avgorandedatum, d) elif label == "Lagrum": for i in value: # better be list not string for node in self.lagrum_parser.parse(i): if isinstance(node, Link): domdesc.rel(self.ns['rpubl'].lagrum, localize_uri(node.uri)) elif label == "Rättsfall": for i in value: for node in self.rattsfall_parser.parse(i): if isinstance(node, Link): domdesc.rel(self.ns['rpubl'].rattsfall, localize_uri(node.uri)) elif label == "Litteratur": for i in value.split(";"): domdesc.value(self.ns['dct'].relation, util.normalize_space(i)) elif label == "Sökord": for s in self.re_delimSplit(value): s = util.normalize_space(s) if not s: continue # terms longer than 72 chars are not legitimate # terms. more likely descriptions. If a term has a - in # it, it's probably a separator between a term and a # description while len(s) >= 72 and " - " in s: h, s = s.split(" - ", 1) domdesc.rel(self.ns['dct'].subject, sokord_uri(h)) if len(s) < 72: domdesc.rel(self.ns['dct'].subject, sokord_uri(s)) # 3. mint some owl:sameAs URIs refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri)) domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri)) # 4. Add some same-for-everyone properties refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket')) refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat) domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande) refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri) # 5. assert that we have everything we need # 6. done! 
return refuri def format_body(self, paras): return Body([Paragraph([x]) for x in paras]) # FIXME: port to list_basefiles_for("parse") def ParseAll(self): self._do_for_all(intermediate_dir, '.doc', self.Parse) self._do_for_all(intermediate_dir, '.docx', self.Parse) # FIXME: convert to a CONSTRUCT query, save as res/sparql/dv-annotations.rq # Or maybe the default template should take a list of predicates, defaulting # to dct:references, but which we could substitute rpubl:rattsfallshanvisning # annotation_query = """ # PREFIX dct:<http://purl.org/dc/terms/> # PREFIX rpub:<http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#> # # SELECT ?uri ?id ?desc # WHERE { # ?uri dct:description ?desc . # ?uri dct:identifier ?id . # ?uri rpubl:rattsfallshanvisning <%s> #} #""" % uri # # FIXME: port to relate_all_setup / _teardown def GenerateMapAll(self): mapfile = os.path.sep.join( [self.baseDir, 'dv', 'generated', 'uri.map']) util.robust_remove(mapfile + ".new") parsed_dir = os.path.sep.join([self.baseDir, 'dv', 'parsed']) self._do_for_all(parsed_dir, '.xht2', self.GenerateMap) util.robustRename(mapfile + ".new", mapfile) def GenerateMap(self, basefile): start = time() infile = os.path.relpath(self._xmlFileName(basefile)) head = codecs.open(infile, encoding='utf-8').read(1024) m = self.re_xmlbase(head) if m: uri = "http://rinfo.lagrummet.se/publ/rattsfall/%s" % m.group(1) mapfile = self.store.path('generated', 'uri.map', '.new') util.ensure_dir(mapfile) f = codecs.open(mapfile, 'a', encoding='iso-8859-1') f.write("%s\t%s\n" % (m.group(1), basefile)) f.close() self.log.info("%s ok" % basefile) return else: self.log.warning("could not find xml:base in %s" % infile) # gonna need this for news_criteria() pubs = {'http://rinfo.lagrummet.se/ref/rff/nja': 'Högsta domstolen', 'http://rinfo.lagrummet.se/ref/rff/rh': 'Hovrätterna', 'http://rinfo.lagrummet.se/ref/rff/rk': 'Kammarrätterna', 'http://rinfo.lagrummet.se/ref/rff/ra': 'Regeringsrätten', 'http://rinfo.lagrummet.se/ref/rff/hfd': 'Högsta förvaltningsdomstolen', 'http://rinfo.lagrummet.se/ref/rff/ad': 'Arbetsdomstolen', 'http://rinfo.lagrummet.se/ref/rff/fod': 'Försäkringsöverdomstolen', 'http://rinfo.lagrummet.se/ref/rff/md': 'Marknadsdomstolen', 'http://rinfo.lagrummet.se/ref/rff/mig': 'Migrationsöverdomstolen', 'http://rinfo.lagrummet.se/ref/rff/mod': 'Miljööverdomstolen' }
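# Two standalone sketches of the regexes used in the DV class above. First,
# how re_malnr carves a zip member name into court code, case number,
# optional referat ordinal and suffix, and how process_zipfile builds the
# basefile from them (filenames taken from the examples in the comments):
import re

re_malnr = re.compile(r'([^_]*)_([^_\.]*)_?(\d*)(\.docx?)')
for name in ("HDO_T3467-96.doc", "HDO_T3467-96_1.docx"):
    court, malnr, referatnr, suffix = re_malnr.match(name).groups()
    if referatnr:
        basefile = "%s/%s_%s" % (court, malnr, referatnr)
    else:
        basefile = "%s/%s" % (court, malnr)
    print(name, "->", basefile)
# HDO_T3467-96.doc -> HDO/T3467-96
# HDO_T3467-96_1.docx -> HDO/T3467-96_1

# Second, how the per-predicate regexes in polish_metadata pick apart a
# referat string; note that this plain "NJA ... s ..." form yields no
# lopnummer match:
value = "NJA 2008 s 567"
found = {}
for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                    'arsutgava': r'(\d{4})',
                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                    'sidnummer': r's.? ?(\d+)'}.items():
    m = re.search(regex, value)
    if m:
        found[pred] = m.group(1)
print(found)
# {'rattsfallspublikation': 'NJA', 'arsutgava': '2008', 'sidnummer': '567'}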
class EURLexCaselaw(EURLex): alias = "eurlexcaselaw" # only select judgments and AG opinions # expertquery_template = "SELECT CELLAR_ID, TI_DISPLAY, DN, DD WHERE (FM_CODED = JUDG OR FM_CODED = OPIN_AG) ORDER BY DD ASC" expertquery_template = "(FM_CODED = JUDG OR FM_CODED = OPIN_AG)" contenttype = "text/html" # legal cases OUGHT to be available as # xhtml, and the "branch notice" # indicates that they are, but in # reality they're not. downloaded_suffix = ".html" celexfilter = re.compile("(6\d{4}[A-Z]{2}\d{4})$").match def parse_metadata_from_soup(self, soup, doc): # AVAILABLE METADATA IN CASES # # For now, we create a nonofficial eurlex vocab with namespace http://lagen.nu/eurlex# # - celex number (first h1) :celex (:celexnum?) # # - [Title and reference] # - decision type and date "Judgment of the Court (Third Chamber) of 17 December 2009." # :courtdecision (as opposed to :commissiondecision) # - :party (or parties) "M v Agence européenne des médicaments (EMEA)." # - :referingcourt "Reference for a preliminary ruling: Administrativen sad Sofia-grad - Bulgaria." # - :legalissue - short description and/or(?) keywords (not always present, eg 62009J0403), hyphen sep: # - "Review of the judgment in Case T-12/08 P" # - "Whether the state of the proceedings permits final judgment to be given" # - "Fair hearing" # - "Rule that the parties should be heard" # - "Whether the unity or consistency of Community law is affected." # - :casenum Case number + unknown letters: # - "Case C-197/09 RX-II." # - "Joined cases T-117/03 to T-119/03 and T-171/03." # - :casereporter Case reporter cite "European Court reports 2009 Page 00000" # - [Text] # - :availablelang - Available languages ("bg", "es", "cs", "da" ....) # - :authenticlang - Authentic language ("fr" or "French") # - [Dates] # - :decisiondate - Date of document (decision/judgement) # - :applicationdate - Date of application # - [Classifications] (different from description/keywords above) # - :subjectmatter Subject Matter, comma sep: # - "Staff regulations and employment conditions - EC" # - "Provisions governing the Institutions" # - :directorycode - Case Law Directory Code (where is the full code list?), NL sep: # - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid" # - "B-20.05 EEC/EC / Acts of the institutions / Statement of the reasons on which a measure is based" # - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid" # - "B-09.04 EEC/EC / State aid / Review of aid by the Commission - Rules of procedure" # - [Miscellaneous information] # - dcterms:author Author: "Court of Justice of the European Communities" # - :form Form: "Judgement" # - [Procedure] # - :proceduretype - Type of procedure, comma sep: # - "Staff cases" # - "Action for damages" # - "Appeal" # - "REEX=OB" # - :applicant - Applicant: "Official" # - :defendant - Defendant: "EMEA, Institutions" # - :observation - Observations: "Italy, Poland, Member States, European Parliament, Council, Commission, Institutions" # - :judgerapporteur - Judge-Rapporteur: "von Danwitz" # - :advocategeneral - Advocate General: "Mazák" # - [Relationships between documents] # - :treaty Treaty: "European Communities" # - :caseaffecting Case affecting, NL-sep: # - "Interprets [CELEXNO + pinpoint]" # - "Declares void 61995A0091" # - "Confirms 31996D0666" # - :"Instruments cited in case law" (celex numbers with pinpoint locations?), nl-sep # - "12001C/PRO/02-A61" # - "12001C/PRO/02-NA13P1" # - "31991Q0530-A114" # - "62007K0023" # - "62008A0012" # # convenience functions -- 
should not be needed now that we have Describer # def add_literal(predicate, literal): # g.add((URIRef(uri), # voc[predicate], # Literal(literal, lang=lang))) # # def add_celex_object(predicate, celexno): # g.add((URIRef(uri), # voc[predicate], # URIRef("http://lagen.nu/ext/celex/%s" % celexno))) # # def get_predicate(predicate): # predicates = list(g.objects(URIRef(uri), voc[predicate])) # return predicates != [] # # These are a series of refinments for the "Affecting" # relationship. "Cites" doesn't have these (or similar), but # "is affected by" has (the inverse properties) affects_predicates = { "Interprets": "interprets", "Interprets the judgment": "interpretsJudgment", "Declares void": "declaresVoid", "Confirms": "confirms", "Declares valid (incidentally)": "declaresValidIncidentally", "Declares valid (by a preliminary ruling)": "declaresValidByPreliminaryRuling", "Incidentally declares invalid": "declaresInvalidIncidentally", "Declares invalid (by a preliminary ruling)": "declaresInvalidByPreliminaryRuling", "Amends": "amends", "Failure concerning": "failureConcerning" } isaffected_predicates = { "Interpreted by": "interpretedBy", "Confirmed by": "confirmedBy", "Declared void by": "declaredVoidBy", "Annulment requested by": "annulmentRequestedBy" } # 1. Express metadata about our document as a RDF graph desc = Describer(self.meta, self.uri) g = Graph() # :celex - first <h1> celexnum = soup.h1.get_text(strip=True) if celexnum == "No documents matching criteria.": raise errors.DocumentRemovedError( "No documents matching criteria " + celexnum) elif "no_data_found" in celexnum: self.log.warning("%s: No data found (try re-downloading)!" % basefile) raise errors.DocumentRemovedError("No data found!") assert celexnum == doc.basefile, "Celex number in file (%s) differ from filename (%s)" % ( celexnum, basefile) doc.lang = soup.html['lang'] m = self.re_celexno.match(celexnum) # FIXME: this list is outdated! rdftype = { 'J': voc['Judgment'], 'A': voc['JudgmentFirstInstance'], 'W': voc['JudgmentCivilService'], 'O': voc['Order'], 'B': voc['OrderCivilService'] }[m.group(3)] desc.rdftype(rdftype) desc.value(self.ns['eurlex'].celexnum, celexnum) # The first section, following <h2>Title and reference</h2> # contains :courtdecision, :party (one or two items), # :referingcourt (optional), :legalissue (list of strings), # :casenum, :casereporter. Since some are optional, we do a # little heuristics to find out what we're looking at at any # given moment. for section in soup.findAll(["h1", "h2"]): if section.name == "h1" and section.a and section.a.string == "Text": break if section.string == "Title and reference": for para in section.findNextSiblings("p"): if not para.string: continue string = para.string.strip() # optional: do sanitychecks to see if this really is a :courtdecision if not get_predicate('courtdecision'): add_literal('courtdecision', string) elif not get_predicate('party'): # this will be one or two items. Are they position dependent? 
for party in string.split(" v "): add_literal('party', party) elif (not get_predicate('referingcourt') and (string.startswith( "Reference for a preliminary ruling") or string.startswith("Preliminary ruling requested"))): add_literal('referingcourt', string) elif (not get_predicate('casenum') and (string.lower().startswith("case ") or string.lower().startswith("joined cases "))): add_literal('casenum', string) elif para.em: # :casereporter is enclosed in an em for row in para.findAll(text=True): add_literal('casereporter', row.strip()) elif get_predicate('legalissue'): # fixme: Split this up somehow add_literal('legalissue', string) elif section.string == "Relationship between documents": for item in section.findNextSibling("ul").findAll("li"): predicate = None subpredicate = None for node in item.childGenerator(): if not hasattr(node, "name"): nodetext = node.strip() if re.match("([ABCDEFGIJKLNPRST]+\d*)+$", nodetext): continue if re.match("\d[\d\-]*[ABC]?$", nodetext): continue if predicate == "affects" and nodetext: if nodetext in affects_predicates: subpredicate = affects_predicates[nodetext] else: self.log.warning( "Can't express '%s' as a affects predicate" % nodetext) elif predicate == "isaffected" and nodetext: if nodetext in isaffected_predicates: subpredicate = isaffected_predicates[ nodetext] else: self.log.warning( "Can't express '%s' as a isaffected predicate" % nodetext) elif node.name == "strong": subpredicate = None if node.string == "Treaty:": predicate = "treaty" elif node.string == "Affected by case:": predicate = "isaffected" elif node.string == "Case affecting:": predicate = "affects" elif node.string == "Instruments cited in case law:": predicate = "cites" else: self.log.warning( "Don't know how to handle key '%s'" % node.string) elif node.name == "a" and predicate: p = predicate if subpredicate: p = subpredicate # FIXME: If the # predicate is "cites", the celex number # may have extra crap # (eg. "31968R0259(01)-N2A1L6") indicating # pinpoint location. Transform these to a # fragment identifier. add_celex_object(p, node.string.strip()) def parse_document_from_soup(self, soup, doc): # Process text and create DOM self.parser = LegalRef(LegalRef.EGRATTSFALL) textdiv = soup.find("div", "texte") if textdiv: for node in textdiv.childGenerator(): if node.string: # Here we should start analyzing for things like # "C-197/09". Note that the Eurlex data does not use # the ordinary hyphen like above, but rather # 'NON-BREAKING HYPHEN' (U+2011) - LegaRef will mangle # this to an ordinary hyphen. subnodes = self.parser.parse( node.string, predicate="dcterms:references") doc.body.append(Paragraph(subnodes)) else: self.log.warning("%s: No fulltext available!" % celexnum) doc.body.append(Paragraph(["(No fulltext available)"]))
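# A tiny standalone illustration of the hyphen caveat noted in
# parse_document_from_soup above: Eurlex case numbers use U+2011
# (NON-BREAKING HYPHEN), so a naive comparison against "C-197/09" fails until
# the string is normalized (LegalRef performs an equivalent normalization).
text = "Case C\u2011197/09 RX-II"
print("C-197/09" in text)                         # False
print("C-197/09" in text.replace("\u2011", "-"))  # True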
class JK(SwedishLegalSource): alias = "jk" start_url = "http://www.jk.se/Beslut.aspx?query=&type=all&dateFrom=1998-01-01&dateTo=2100-01-01&dnr=" document_url_regex = "http://www.jk.se/Beslut/(?P<kategori>[\w\-]+)/(?P<basefile>\d+\-\d+\-\d+).aspx" @recordlastdownload def download(self, basefile=None): for basefile, url in self.download_get_basefiles(self.start_url): self.download_single(basefile, url) @downloadmax def download_get_basefiles(self, start_url): document_url_regex = re.compile("(?P<basefile>\d+\-\d+\-\d+).aspx") done = False url = start_url pagecount = 1 while not done: self.log.info("Getting page #%s" % pagecount) soup = BeautifulSoup(requests.get(url).text) for link in soup.find_all("a", href=document_url_regex): basefile = document_url_regex.search(link["href"]).group("basefile") yield basefile, urljoin(url, link["href"]) next = soup.find("img", src="/common/images/navigation-pil-grey.png").find_parent("a") if next: url = urljoin(url, next["href"]) pagecount += 1 else: done = True def parse_from_soup(self, soup): # Step 1: Find out basic metadata rubrik = soup.first("title").string beslutsdatum = soup.first( "meta", {'name': 'SG_Beslutsdatum'})['content'] beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date() diarienummer = soup.first( "meta", {'name': 'SG_Dokumentbet'})['content'] arendetyp = soup.first("meta", {'name': 'Subject'})['content'] # the keywords for a documents is contained in a metatag # formatted like: # <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\åklagare"> # # Transform this into an array like: # [u'http://lagen.nu/concept/Hets_mot_folkgrupp', # u'http://lagen.nu/concept/Meddelarfrihet', # u'http://lagen.nu/concept/Åklagare'] nyckelord = soup.first("meta", {'name': 'Keywords'})['content'] begrepp = ['http://lagen.nu/concept/%s' % util.ucfirst( x).strip().replace(" ", "_") for x in nyckelord.split("\n")] # Step 2: Using the metadata, construct the canonical URI for this document uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT, 'myndighet': 'jk', 'dnr': diarienummer}) # self.log.debug("URI: %s" % uri) # Step 3: Create a RDF graph of all our metadata (so far) g = Graph() g.bind('dct', self.ns['dct']) g.bind('rinfo', self.ns['rinfo']) g.bind('rinfoex', self.ns['rinfoex']) g.bind('xsd', util.ns['xsd']) g.add(( URIRef(uri), self.ns['dct']['title'], Literal(rubrik, lang="sv"))) g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'], Literal(beslutsdatum, lang="sv"))) g.add((URIRef(uri), self.ns['rinfo']['diarienummer'], Literal(diarienummer, lang="sv"))) g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'], Literal(arendetyp, lang="sv"))) for s in begrepp: g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s))) g.add((URIRef(uri), self.ns['dct']['identifier'], Literal( "JK %s" % diarienummer, lang="sv"))) g.add((URIRef(uri), RDF.type, self.rdf_type)) # Step 4: Process the actual text of the document self.parser = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.RATTSFALL, LegalRef.FORARBETEN) # newer documents have a semantic structure with h1 and h2 # elements. Older have elements like <p class="Rubrik_1">. Try # to determine which one we're dealing with? 
tag = soup.find('a', {'name': "Start"}) if tag: # self.log.debug("Using new-style document structure") elements = tag.parent.findAllNext() else: # self.log.debug("Using old-style document structure") elements = soup.findAll("p") # self.log.debug("Found %d elements" % len(elements)) from collections import deque elements = deque(elements) body = self.make_sektion(elements, "Referat av beslut") # Step 5: Combine the metadata and the document, and return it doc = {'meta': g, 'body': body, 'lang': 'sv', 'uri': uri} return doc def make_sektion(self, elements, heading, level=0): sekt = Sektion(**{"rubrik": heading, "niva": level}) self.log.debug( "%sCreated sektion(%d): '%s'" % (" " * level, level, heading)) baseuri = None while True: try: p = elements.popleft() except IndexError: return sekt text = p.get_text(strip=True) # self.log.debug("%sp.name: %s, p['class']: %s, 'class' in p.attrs: %s" % (" "*level,p.name,p['class'], (u'class' in p.attrs[0]))) new_level = None if p.name == "h1": new_level = 1 elif p.name == "h2": new_level = 2 elif p.name == "h3": new_level = 3 elif ((p.name == "p") and (len(p.attrs) > 0) and ('class' in p.attrs[0]) and (p['class'].startswith("Rubrik_"))): # self.log.debug("%sp.class: %s" % (" "*level,p['class'])) new_level = int(p['class'][7:]) if new_level: if new_level > level: sekt.append(self.make_sektion(elements, text, new_level)) else: elements.appendleft(p) return sekt else: if text: nodes = self.parser.parse(text, baseuri=baseuri, predicate="dct:references") for node in nodes: # Use possible SFS references as the the # baseuri for subsequent paragraphs if isinstance(node, Link) and node.uri.startswith("http://rinfo.lagrummet.se/publ/sfs/"): baseuri = node.uri stycke = Stycke(nodes) # self.log.debug("%sCreated stycke: '%s'" % (" "*level,stycke)) sekt.append(stycke)
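# A standalone sketch of the keyword-to-concept-URI mapping that
# parse_from_soup performs on the Keywords meta tag. util.ucfirst is assumed
# here to simply uppercase the first character, so it is inlined below.
nyckelord = "hets mot folkgrupp\nmeddelarfrihet\nåklagare"
begrepp = ['http://lagen.nu/concept/%s' %
           (x.strip()[:1].upper() + x.strip()[1:]).replace(" ", "_")
           for x in nyckelord.split("\n")]
print(begrepp)
# ['http://lagen.nu/concept/Hets_mot_folkgrupp',
#  'http://lagen.nu/concept/Meddelarfrihet',
#  'http://lagen.nu/concept/Åklagare']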
def parametric_test(self, datafile):
    p = LegalRef(LegalRef.EGRATTSFALL)
    return self._test_parser(datafile, p)
class LegalRefTest(object):

    def __init__(self, alias):
        # setup
        self.alias = alias
        parsetype = alias.split("/")[1]
        self.parser = LegalRef({'SFS': LegalRef.LAGRUM,
                                'Short': LegalRef.KORTLAGRUM,
                                'DV': LegalRef.RATTSFALL,
                                'Regpubl': LegalRef.FORARBETEN,
                                'EGLag': LegalRef.EULAGSTIFTNING,
                                'ECJ': LegalRef.EURATTSFALL}[parsetype])
        # this particular test method is set up to use lagen.nu style
        # URIs because the canonical URIs are significantly different.
        dirname = os.path.dirname(__file__)
        basedir = dirname + "/../"
        space = basedir + "lagen/nu/res/uri/swedishlegalsource.space.ttl"
        slugs = basedir + "lagen/nu/res/uri/swedishlegalsource.slugs.ttl"
        extra = [basedir + "lagen/nu/res/extra/swedishlegalsource.ttl",
                 basedir + "lagen/nu/res/extra/sfs.ttl"]
        cfg = Graph().parse(space, format="turtle").parse(slugs, format="turtle")
        self.metadata = Graph()
        for ttl in extra:
            self.metadata.parse(ttl, format="turtle")
        COIN = Namespace("http://purl.org/court/def/2009/coin#")
        # select correct URI for the URISpace definition by
        # finding a single coin:URISpace object
        spaceuri = cfg.value(predicate=RDF.type, object=COIN.URISpace)
        self.minter = URIMinter(cfg, spaceuri)

    def createtest(self, basefile, basedir):
        # FIXME: This is mostly a cut'n paste of integrationLegalRef._test_parser
        testfile = (os.path.dirname(__file__) + "/../test/files/" +
                    self.alias + "/" + basefile + ".txt")
        encoding = 'windows-1252'
        with codecs.open(testfile, encoding=encoding) as fp:
            testdata = fp.read()
        parts = re.split('\r?\n\r?\n', testdata, 1)
        testdata = parts[0]
        test_paras = re.split('\r?\n---\r?\n', testdata)
        # first: run it five times with timeit to get a good average exec time
        elapsed = timeit(functools.partial(self.run_with_timeit, test_paras),
                         number=5, globals=globals())
        # then: run it a sixth time to get at the return value
        body = self.run_with_timeit(test_paras)
        return elapsed, extractrefs(body)

    def run_with_timeit(self, test_paras):
        body = []
        for para in test_paras:
            if para.startswith("RESET:"):
                self.parser.currentlynamedlaws.clear()
            if para.startswith("NOBASE:"):
                baseuri_attributes = {}
            else:
                baseuri_attributes = {'law': '9999:999'}
            nodes = self.parser.parse(para, self.minter, self.metadata,
                                      baseuri_attributes)
            body.append(nodes)
        return body

    timetest = createtest
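# For reference, the timeit/functools.partial pattern used by createtest, in
# isolation: a zero-argument callable passed to timeit.timeit is run `number`
# times and the total elapsed time in seconds is returned. The parse function
# and sample text below are stand-ins, not part of the test suite.
import functools
from timeit import timeit

def parse_para(text):
    return text.upper()   # stand-in for self.parser.parse(...)

elapsed = timeit(functools.partial(parse_para, "14 § avtalslagen"), number=5)
print("five runs took %.6f seconds" % elapsed)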
def lagrum_parser(self):
    return SwedishCitationParser(LegalRef(LegalRef.LAGRUM,
                                          LegalRef.EULAGSTIFTNING),
                                 self.minter,
                                 self.commondata,
                                 allow_relative=True)
class LNMediaWiki(MediaWiki):
    namespaces = SwedishLegalSource.namespaces

    from ferenda.sources.legal.se.legalref import LegalRef
    p = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                 LegalRef.FORARBETEN, LegalRef.RATTSFALL)

    keyword_class = LNKeyword
    lang = "sv"

    def __init__(self, config=None, **kwargs):
        super(LNMediaWiki, self).__init__(config, **kwargs)
        if self.config._parent and hasattr(self.config._parent, "sfs"):
            self.sfsrepo = SFS(self.config._parent.sfs)
        else:
            self.sfsrepo = SFS()

    def get_wikisettings(self):
        settings = LNSettings(lang=self.lang)
        # NOTE: The settings object (the make_url method) only needs
        # access to the canonical_uri method.
        settings.make_sfs_url = self.sfsrepo.canonical_uri
        settings.make_keyword_url = self.keywordrepo.canonical_uri
        return settings

    def get_wikisemantics(self, parser, settings):
        return LNSemantics(parser, settings)

    def canonical_uri(self, basefile):
        if basefile.startswith("SFS/") or basefile.startswith("SFS:"):
            # "SFS/1998:204" -> "1998:204"
            return self.sfsrepo.canonical_uri(basefile[4:])
        else:
            return super(LNMediaWiki, self).canonical_uri(basefile)

    def postprocess(self, doc, xhtmltree):
        # if SFS mode:
        # create a div for root content
        # find all headers, create div for everything there
        if doc.basefile.startswith("SFS/") or doc.basefile.startswith("SFS:"):
            self.postprocess_commentary(doc, xhtmltree)
            toplevel_property = False
        else:
            toplevel_property = True
        body = super(LNMediaWiki, self).postprocess(
            doc, xhtmltree, toplevel_property=toplevel_property)
        citparser = SwedishCitationParser(self.p, self.config.url)
        citparser.parse_recursive(body, predicate=None)
        return body

    def postprocess_commentary(self, doc, xhtmltree):
        uri = doc.uri
        body = xhtmltree.getchildren()[0]
        newbody = etree.Element("body")

        curruri = uri
        currdiv = etree.SubElement(newbody, "div")
        currdiv.set("about", curruri)
        currdiv.set("property", "dcterms:description")
        currdiv.set("datatype", "rdf:XMLLiteral")
        containerdiv = etree.SubElement(currdiv, "div")
        for child in body.getchildren():
            if child.tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
                # remove that <span> element that Semantics._h_el adds for us
                assert child[0].tag == "span", \
                    "Header subelement was %s not span" % child[0].tag
                child.text = child[0].text
                child.remove(child[0])
                if child.text:
                    if isinstance(child.text, bytes):
                        txt = child.text.decode("utf-8")
                    else:
                        txt = child.text
                    nodes = self.p.parse(txt, curruri)
                    curruri = nodes[0].uri
                # body.remove(child)
                newbody.append(child)
                currdiv = etree.SubElement(newbody, "div")
                currdiv.set("about", curruri)
                currdiv.set("property", "dcterms:description")
                currdiv.set("datatype", "rdf:XMLLiteral")
                # create a containerdiv under currdiv for reasons
                containerdiv = etree.SubElement(currdiv, "div")
            else:
                # body.remove(child)
                currdiv[0].append(child)
        xhtmltree.remove(body)
        xhtmltree.append(newbody)
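# For orientation, postprocess_commentary rewrites the wiki body into roughly
# this shape (URIs are illustrative): one RDFa-annotated div per heading, each
# wrapping the commentary that follows that heading, with the heading text run
# through LegalRef to resolve the law-section URI it refers to.
#
#   <body>
#     <div about="[doc.uri]" property="dcterms:description"
#          datatype="rdf:XMLLiteral"><div>...intro text...</div></div>
#     <h2>1 §</h2>
#     <div about="[uri resolved from '1 §']" property="dcterms:description"
#          datatype="rdf:XMLLiteral"><div>...commentary on 1 §...</div></div>
#     ...
#   </body>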
def parametric_test(self, datafile):
    p = LegalRef(LegalRef.LAGRUM)
    return self._test_parser(datafile, p)
def parametric_test(self, datafile):
    p = LegalRef(LegalRef.MYNDIGHETSBESLUT)
    # p.verbose = True
    return self._test_parser(datafile, p)
def parametric_test(self, datafile):
    p = LegalRef(LegalRef.FORARBETEN)
    return self._test_parser(datafile, p)
class EurlexCaselaw(DocumentRepository): """Handles all case law from the European Court of Justice (ECJ).""" alias = "ecj" # European Court of Justice start_url = "http://eur-lex.europa.eu/JURISIndex.do" document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:%(basefile)s:EN:NOT" source_encoding = "utf-8" namespaces = ('rdf', 'dct', ('eurlex', 'http://lagen.nu/eurlex#')) # This regexp is specific to caselaw (the leading '6' is for the # caselaw area). re_celexno = re.compile('(6)(\d{4})(\w\w?)(\d{4})(\(\d{2}\)|)') def download(self, basefile=None): if basefile: self.download_single(basefile) if not self.config.force and 'startyear' in self.config: startyear = self.config.startyear else: startyear = 1954 # The first verdicts were published in this year for year in range(startyear, datetime.date.today().year + 1): # We use self.configfile directly rather than # self.moduleconfig, since the latter cannot be persisted # across sessions (as it is a subset of a composite # between the config file and command line options) self.config.startyear = year self.config.write() # FIXME: URL parameters may have changed -- this seem to produce every # case from year up till today list_url = "http://eur-lex.europa.eu/Result.do?T1=V6&T2=%d&T3=&RechType=RECH_naturel" % year self.log.debug("Searching for %d" % year) res = request.get(list_url) pagecnt = 0 done = False while not done: pagecnt += 1 self.log.debug("Result page #%s" % pagecnt) # Don't parse using BeautifulSoup etc -- just search the whole damn text blob celexnos = self.re_celexno.findall(res.text) # FIXME: support for config.downloadmax for celexno in itertools.chain(celexnos): # the number will be split up in components - concatenate celexno = "".join(celexno) # only download actual judgements and orders # FIXME: the below is outdated -- now "TA" and "CN" (amongst others?) are used # J: Judgment of the Court # A: Judgment of the Court of First Instance # W: Judgement of the Civil Service Tribunal # T: (old) Judgement of the Court # B: Order of the CFI # O: Order of the ECJ if ('J' in celexno or 'A' in celexno or 'W' in celexno or 'T' in celexno or 'B' in celexno or 'O' in celexno): if self.download_single(celexno, usecache=usecache): self.log.info("Downloaded %s" % celexno) else: self.log.info("Skipped %s" % celexno) else: pass #self.log.debug("Not downloading doc %s" % celexno) # see if there are any "next" pages url = lxml.html.parse(res.text).find("a", text=">").get('href', None) if url: res = request.get(url) else: self.log.info('No next page link found, we must be done') done = True def parse_metadata_from_soup(self, soup, doc): # AVAILABLE METADATA IN CASES # # For now, we create a nonofficial eurlex vocab with namespace http://lagen.nu/eurlex# # - celex number (first h1) :celex (:celexnum?) # # - [Title and reference] # - decision type and date "Judgment of the Court (Third Chamber) of 17 December 2009." # :courtdecision (as opposed to :commissiondecision) # - :party (or parties) "M v Agence européenne des médicaments (EMEA)." # - :referingcourt "Reference for a preliminary ruling: Administrativen sad Sofia-grad - Bulgaria." # - :legalissue - short description and/or(?) keywords (not always present, eg 62009J0403), hyphen sep: # - "Review of the judgment in Case T-12/08 P" # - "Whether the state of the proceedings permits final judgment to be given" # - "Fair hearing" # - "Rule that the parties should be heard" # - "Whether the unity or consistency of Community law is affected." 
# - :casenum Case number + unknown letters: # - "Case C-197/09 RX-II." # - "Joined cases T-117/03 to T-119/03 and T-171/03." # - :casereporter Case reporter cite "European Court reports 2009 Page 00000" # - [Text] # - :availablelang - Available languages ("bg", "es", "cs", "da" ....) # - :authenticlang - Authentic language ("fr" or "French") # - [Dates] # - :decisiondate - Date of document (decision/judgement) # - :applicationdate - Date of application # - [Classifications] (different from description/keywords above) # - :subjectmatter Subject Matter, comma sep: # - "Staff regulations and employment conditions - EC" # - "Provisions governing the Institutions" # - :directorycode - Case Law Directory Code (where is the full code list?), NL sep: # - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid" # - "B-20.05 EEC/EC / Acts of the institutions / Statement of the reasons on which a measure is based" # - "B-09.03 EEC/EC / State aid / Exceptions to the prohibition of aid" # - "B-09.04 EEC/EC / State aid / Review of aid by the Commission - Rules of procedure" # - [Miscellaneous information] # - dct:author Author: "Court of Justice of the European Communities" # - :form Form: "Judgement" # - [Procedure] # - :proceduretype - Type of procedure, comma sep: # - "Staff cases" # - "Action for damages" # - "Appeal" # - "REEX=OB" # - :applicant - Applicant: "Official" # - :defendant - Defendant: "EMEA, Institutions" # - :observation - Observations: "Italy, Poland, Member States, European Parliament, Council, Commission, Institutions" # - :judgerapporteur - Judge-Rapporteur: "von Danwitz" # - :advocategeneral - Advocate General: "Mazák" # - [Relationships between documents] # - :treaty Treaty: "European Communities" # - :caseaffecting Case affecting, NL-sep: # - "Interprets [CELEXNO + pinpoint]" # - "Declares void 61995A0091" # - "Confirms 31996D0666" # - :"Instruments cited in case law" (celex numbers with pinpoint locations?), nl-sep # - "12001C/PRO/02-A61" # - "12001C/PRO/02-NA13P1" # - "31991Q0530-A114" # - "62007K0023" # - "62008A0012" # # convenience functions -- should not be needed now that we have Describer # def add_literal(predicate, literal): # g.add((URIRef(uri), # voc[predicate], # Literal(literal, lang=lang))) # # def add_celex_object(predicate, celexno): # g.add((URIRef(uri), # voc[predicate], # URIRef("http://lagen.nu/ext/celex/%s" % celexno))) # # def get_predicate(predicate): # predicates = list(g.objects(URIRef(uri), voc[predicate])) # return predicates != [] # # These are a series of refinments for the "Affecting" # relationship. "Cites" doesn't have these (or similar), but # "is affected by" has (the inverse properties) affects_predicates = {"Interprets": "interprets", "Interprets the judgment": "interpretsJudgment", "Declares void": "declaresVoid", "Confirms": "confirms", "Declares valid (incidentally)": "declaresValidIncidentally", "Declares valid (by a preliminary ruling)": "declaresValidByPreliminaryRuling", "Incidentally declares invalid": "declaresInvalidIncidentally", "Declares invalid (by a preliminary ruling)": "declaresInvalidByPreliminaryRuling", "Amends": "amends", "Failure concerning": "failureConcerning"} isaffected_predicates = {"Interpreted by": "interpretedBy", "Confirmed by": "confirmedBy", "Declared void by": "declaredVoidBy", "Annulment requested by": "annulmentRequestedBy"} # 1. 
        # 1. Express metadata about our document as an RDF graph
        g = doc.meta
        uri = doc.uri
        voc = self.ns['eurlex']
        desc = Describer(g, uri)

        # :celex - first <h1>
        celexnum = soup.h1.get_text(strip=True)
        if celexnum == "No documents matching criteria.":
            raise errors.DocumentRemovedError("No documents matching criteria " +
                                              celexnum)
        elif "no_data_found" in celexnum:
            self.log.warning("%s: No data found (try re-downloading)!" %
                             doc.basefile)
            raise errors.DocumentRemovedError("No data found!")
        assert celexnum == doc.basefile, \
            "Celex number in file (%s) differs from filename (%s)" % (
                celexnum, doc.basefile)
        doc.lang = soup.html['lang']
        m = self.re_celexno.match(celexnum)
        # FIXME: this mapping is outdated!
        rdftype = {'J': voc['Judgment'],
                   'A': voc['JudgmentFirstInstance'],
                   'W': voc['JudgmentCivilService'],
                   'O': voc['Order'],
                   'B': voc['OrderCivilService']}[m.group(3)]
        desc.rdftype(rdftype)
        desc.value(self.ns['eurlex'].celexnum, celexnum)

        # The first section, following <h2>Title and reference</h2>,
        # contains :courtdecision, :party (one or two items),
        # :referingcourt (optional), :legalissue (list of strings),
        # :casenum and :casereporter. Since some are optional, we use a
        # little heuristics to find out what we're looking at at any
        # given moment.
        for section in soup.findAll(["h1", "h2"]):
            if section.name == "h1" and section.a and section.a.string == "Text":
                break
            if section.string == "Title and reference":
                for para in section.findNextSiblings("p"):
                    if not para.string:
                        continue
                    string = para.string.strip()
                    # optional: do sanity checks to see if this really
                    # is a :courtdecision
                    if not get_predicate('courtdecision'):
                        add_literal('courtdecision', string)
                    elif not get_predicate('party'):
                        # this will be one or two items. Are they
                        # position dependent?
                        for party in string.split(" v "):
                            add_literal('party', party)
                    elif (not get_predicate('referingcourt') and
                          (string.startswith("Reference for a preliminary ruling") or
                           string.startswith("Preliminary ruling requested"))):
                        add_literal('referingcourt', string)
                    elif (not get_predicate('casenum') and
                          (string.lower().startswith("case ") or
                           string.lower().startswith("joined cases "))):
                        add_literal('casenum', string)
                    elif para.em:
                        # :casereporter is enclosed in an <em>
                        for row in para.findAll(text=True):
                            add_literal('casereporter', row.strip())
                    else:
                        # anything else is treated as (part of) the
                        # :legalissue description
                        # FIXME: split this up somehow
                        add_literal('legalissue', string)
            elif section.string == "Relationship between documents":
                for item in section.findNextSibling("ul").findAll("li"):
                    predicate = None
                    subpredicate = None
                    for node in item.childGenerator():
                        if not hasattr(node, "name"):
                            nodetext = node.strip()
                            if re.match("([ABCDEFGIJKLNPRST]+\d*)+$", nodetext):
                                continue
                            if re.match("\d[\d\-]*[ABC]?$", nodetext):
                                continue
                            if predicate == "affects" and nodetext:
                                if nodetext in affects_predicates:
                                    subpredicate = affects_predicates[nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as an affects predicate" % nodetext)
                            elif predicate == "isaffected" and nodetext:
                                if nodetext in isaffected_predicates:
                                    subpredicate = isaffected_predicates[nodetext]
                                else:
                                    self.log.warning(
                                        "Can't express '%s' as an isaffected predicate" % nodetext)
                        elif node.name == "strong":
                            subpredicate = None
                            if node.string == "Treaty:":
                                predicate = "treaty"
                            elif node.string == "Affected by case:":
                                predicate = "isaffected"
                            elif node.string == "Case affecting:":
                                predicate = "affects"
                            elif node.string == "Instruments cited in case law:":
                                predicate = "cites"
                            else:
                                self.log.warning(
                                    "Don't know how to handle key '%s'" % node.string)
                        elif node.name == "a" and predicate:
                            p = predicate
                            if subpredicate:
                                p = subpredicate
                            # FIXME: If the predicate is "cites", the celex
                            # number may have extra crap
                            # (eg. "31968R0259(01)-N2A1L6") indicating a
                            # pinpoint location. Transform these to a
                            # fragment identifier.
                            add_celex_object(p, node.string.strip())

    def parse_document_from_soup(self, soup, doc):
        # Process text and create DOM
        self.parser = LegalRef(LegalRef.EGRATTSFALL)
        textdiv = soup.find("div", "texte")
        if textdiv:
            for node in textdiv.childGenerator():
                if node.string:
                    # Here we should start analyzing for things like
                    # "C-197/09". Note that the Eurlex data does not use
                    # the ordinary hyphen like above, but rather
                    # 'NON-BREAKING HYPHEN' (U+2011) - LegalRef will mangle
                    # this to an ordinary hyphen.
                    subnodes = self.parser.parse(node.string,
                                                 predicate="dct:references")
                    doc.body.append(Paragraph(subnodes))
        else:
            self.log.warning("%s: No fulltext available!" % doc.basefile)
            doc.body.append(Paragraph(["(No fulltext available)"]))
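The re_celexno regexp above does double duty: download() uses it to sift CELEX numbers out of the raw result pages, and parse_metadata_from_soup() uses the captured document-type letter to pick an rdf:type. A minimal, self-contained sketch of how it decomposes a number (the sample CELEX number is made up for illustration):

import re

re_celexno = re.compile(r'(6)(\d{4})(\w\w?)(\d{4})(\(\d{2}\)|)')

m = re_celexno.match("62009J0403")
# sector '6' (caselaw), year '2009', document type 'J' (Judgment of the
# Court), running number '0403', and an empty optional "(NN)" suffix
print(m.groups())  # ('6', '2009', 'J', '0403', '')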
def parametric_test(self, datafile):
    p = LegalRef(LegalRef.EULAGSTIFTNING)
    return self._test_parser(datafile, p)
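For context, a rough usage sketch of the parser that parametric_test() sets up, based only on how LegalRef.parse() is called elsewhere in this file (parse_document_from_soup above, parse_from_textreader below); the citation string is invented for illustration:

p = LegalRef(LegalRef.EULAGSTIFTNING)
nodes = p.parse("rådets direktiv 95/46/EG")
# parse() returns a mixed list of plain strings and link-like objects;
# the latter carry a .uri attribute, which is how parse_from_textreader
# below collects its bemyndigande URIs
uris = [node.uri for node in nodes if hasattr(node, "uri")]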
def forarbete_parser(self):
    return SwedishCitationParser(LegalRef(LegalRef.FORARBETEN),
                                 self.minter,
                                 self.commondata)
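A hypothetical companion factory in the same style, not present in the original source, included only to show how the SwedishCitationParser/LegalRef wiring generalises to other reference types (LegalRef.EGRATTSFALL is the constant used by parse_document_from_soup above):

def eurattsfall_parser(self):
    # Hypothetical example: same construction as forarbete_parser,
    # but for EU case-law references
    return SwedishCitationParser(LegalRef(LegalRef.EGRATTSFALL),
                                 self.minter,
                                 self.commondata)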
def parse_from_textreader(self, reader, basefile):
    tracelog = logging.getLogger("%s.tracelog" % self.alias)
    doc = self.make_document(basefile)
    g = doc.meta

    # 1.2: Load known entities and their URIs (we have to add some
    # that are not yet in the official resource lists)
    resource_list_file = self.store.path("resourcelist", "intermediate", ".rdf")
    if not os.path.exists(resource_list_file):
        self.download_resource_lists("http://service.lagrummet.se/var/common",
                                     resource_list_file)
    resources = Graph()
    resources.parse(resource_list_file, format="xml")

    # 1.3: Define regexps for the data we search for.
    fwdtests = {
        "dct:issn": ["^ISSN (\d+\-\d+)$"],
        "dct:title": ["((?:Föreskrifter|[\w ]+s (?:föreskrifter|allmänna råd)).*?)\n\n"],
        "dct:identifier": ["^([A-ZÅÄÖ-]+FS\s\s?\d{4}:\d+)$"],
        "rpubl:utkomFranTryck": ["Utkom från\strycket\s+den\s(\d+ \w+ \d{4})"],
        "rpubl:omtryckAv": ["^(Omtryck)$"],
        "rpubl:genomforDirektiv": ["Celex (3\d{2,4}\w\d{4})"],
        "rpubl:beslutsdatum": ["(?:har beslutats|beslutade|beslutat) den (\d+ \w+ \d{4})"],
        "rpubl:beslutadAv": [
            "\n([A-ZÅÄÖ][\w ]+?)\d? (?:meddelar|lämnar|föreskriver)",
            "\s(?:meddelar|föreskriver) ([A-ZÅÄÖ][\w ]+?)\d?\s",
        ],
        "rpubl:bemyndigande": [
            " ?(?:meddelar|föreskriver|Föreskrifterna meddelas|Föreskrifterna upphävs)\d?,? (?:följande |)med stöd av\s(.*?) ?(?:att|efter\ssamråd|dels|följande|i fråga om|och lämnar allmänna råd|och beslutar följande allmänna råd|\.\n)",
            "^Med stöd av (.*)\s(?:meddelar|föreskriver)",
        ],
    }

    # 2: Find metadata properties
    #
    # 2.1: Find some of the properties on the first page (or the 2nd,
    # or 3rd... continue past TOC pages, cover pages etc until the
    # "real" first page is found). NB: FFFS 2007:1 has ten (10) TOC
    # pages!
    pagecnt = 0
    for page in reader.getiterator(reader.readpage):
        # replace single newlines with spaces, but keep double newlines
        # page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
        pagecnt += 1
        props = {}
        for (prop, tests) in list(fwdtests.items()):
            if prop in props:
                continue
            for test in tests:
                m = re.search(test, page,
                              re.MULTILINE | re.DOTALL | re.UNICODE)
                if m:
                    props[prop] = util.normalize_space(m.group(1))
        # Single required property. If we find this, we're done
        if "rpubl:beslutsdatum" in props:
            break
        self.log.warning("%s: Couldn't find required props on page %s" %
                         (basefile, pagecnt))

    # 2.2: Find some of the properties on the last 'real' page (not
    # counting appendices)
    reader.seek(0)
    pagesrev = reversed(list(reader.getiterator(reader.readpage)))
    # The language used to express these two properties differs quite
    # a lot, more than what is reasonable to express in a single
    # regex. We therefore define a set of possible expressions and try
    # them in turn.
    revtests = {
        "rpubl:ikrafttradandedatum": [
            "(?:Denna författning|Dessa föreskrifter|Dessa allmänna råd|Dessa föreskrifter och allmänna råd)\d* träder i ?kraft den (\d+ \w+ \d{4})",
            "Dessa föreskrifter träder i kraft, (?:.*), i övrigt den (\d+ \w+ \d{4})",
            "ska(?:ll|)\supphöra att gälla (?:den |)(\d+ \w+ \d{4}|denna dag|vid utgången av \w+ \d{4})",
            "träder i kraft den dag då författningen enligt uppgift på den (utkom från trycket)",
        ],
        "rpubl:upphaver": [
            "träder i kraft den (?:\d+ \w+ \d{4}), då(.*)ska upphöra att gälla",
            "ska(?:ll|)\supphöra att gälla vid utgången av \w+ \d{4}, nämligen(.*?)\n\n",
            "att (.*) skall upphöra att gälla (denna dag|vid utgången av \w+ \d{4})",
        ],
    }
    cnt = 0
    for page in pagesrev:
        cnt += 1
        # Normalize the whitespace in each paragraph so that a
        # linebreak in the middle of the natural language expression
        # doesn't break our regexes.
        page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
        for (prop, tests) in list(revtests.items()):
            if prop in props:
                continue
            for test in tests:
                # Not re.DOTALL -- we've normalized whitespace and
                # don't want to match across paragraphs
                m = re.search(test, page, re.MULTILINE | re.UNICODE)
                if m:
                    props[prop] = util.normalize_space(m.group(1))
                    # print("%s: '%s' resulted in match '%s' at page %s from end" %
                    #       (prop, test, props[prop], cnt))
        # Single required property. If we find this, we're done
        if "rpubl:ikrafttradandedatum" in props:
            break

    # 3: Clean up data - converting strings to Literals or URIRefs,
    # finding legal references, etc.
    if "dct:identifier" in props:
        (publication, year, ordinal) = re.split("[ :]", props["dct:identifier"])
        # FIXME: Read resources graph instead
        fs = resources.value(predicate=self.ns["skos"].altLabel,
                             object=Literal(publication, lang="sv"))
        props["rpubl:forfattningssamling"] = fs
        publ = resources.value(subject=fs,
                               predicate=self.ns["dct"].publisher)
        props["dct:publisher"] = publ
        props["rpubl:arsutgava"] = Literal(year)  # conversion to int/date not needed
        props["rpubl:lopnummer"] = Literal(ordinal)
        props["dct:identifier"] = Literal(props["dct:identifier"])
        # Now we can mint the URI (should be done through LegalURI)
        uri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
            props["rpubl:forfattningssamling"].split("/")[-1],
            props["rpubl:arsutgava"],
            props["rpubl:lopnummer"],
        )
        self.log.debug("URI: %s" % uri)
    else:
        self.log.error("Couldn't find dct:identifier, cannot create URI, giving up")
        return None

    tracelog.info("Cleaning rpubl:beslutadAv")
    if "rpubl:beslutadAv" in props:
        agency = resources.value(
            predicate=self.ns["foaf"].name,
            object=Literal(props["rpubl:beslutadAv"], lang="sv"))
        if agency:
            props["rpubl:beslutadAv"] = agency
        else:
            self.log.warning("Cannot find URI for rpubl:beslutadAv value %r" %
                             props["rpubl:beslutadAv"])
            del props["rpubl:beslutadAv"]

    tracelog.info("Cleaning dct:issn")
    if "dct:issn" in props:
        props["dct:issn"] = Literal(props["dct:issn"])

    tracelog.info("Cleaning dct:title")
    # common false positive
    if "dct:title" in props and "denna författning har beslutats den" in props["dct:title"]:
        del props["dct:title"]
    if "dct:title" in props:
        tracelog.info("Inspecting dct:title %r" % props["dct:title"])
        # sometimes the title isn't separated by two newlines from the
        # rest of the text
        if "\nbeslutade den " in props["dct:title"]:
            props["dct:title"] = props["dct:title"].split("\nbeslutade den ")[0]
        props["dct:title"] = Literal(util.normalize_space(props["dct:title"]),
                                     lang="sv")
        if re.search("^(Föreskrifter|[\w ]+s föreskrifter) om ändring i ",
                     props["dct:title"], re.UNICODE):
            tracelog.info("Finding rpubl:andrar in dct:title")
            orig = re.search("([A-ZÅÄÖ-]+FS \d{4}:\d+)",
                             props["dct:title"]).group(0)
            (publication, year, ordinal) = re.split("[ :]", orig)
            origuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                self.rpubl_uri_transform(publication),
                year,
                ordinal,
            )
            props["rpubl:andrar"] = URIRef(origuri)
            if "rpubl:omtryckAv" in props:
                props["rpubl:omtryckAv"] = URIRef(origuri)
        if (re.search("^(Föreskrifter|[\w ]+s föreskrifter) om upphävande av",
                      props["dct:title"], re.UNICODE) and
                "rpubl:upphaver" not in props):
            tracelog.info("Finding rpubl:upphaver in dct:title")
            props["rpubl:upphaver"] = six.text_type(props["dct:title"])  # cleaned below

    tracelog.info("Cleaning date properties")
    for prop in ("rpubl:utkomFranTryck", "rpubl:beslutsdatum",
                 "rpubl:ikrafttradandedatum"):
        if prop in props:
            if props[prop] == "denna dag" and prop == "rpubl:ikrafttradandedatum":
                props[prop] = props["rpubl:beslutsdatum"]
            elif props[prop] == "utkom från trycket" and prop == "rpubl:ikrafttradandedatum":
                props[prop] = props["rpubl:utkomFranTryck"]
            else:
                props[prop] = Literal(self.parse_swedish_date(props[prop].lower()))

    tracelog.info("Cleaning rpubl:genomforDirektiv")
    if "rpubl:genomforDirektiv" in props:
        props["rpubl:genomforDirektiv"] = URIRef(
            "http://rinfo.lagrummet.se/ext/eur-lex/%s" %
            props["rpubl:genomforDirektiv"])

    tracelog.info("Cleaning rpubl:bemyndigande")
    has_bemyndiganden = False
    if "rpubl:bemyndigande" in props:
        # SimpleParse can't handle the unicode en-dash sign; transform
        # it into a regular ASCII hyphen
        props["rpubl:bemyndigande"] = props["rpubl:bemyndigande"].replace("\u2013", "-")
        parser = LegalRef(LegalRef.LAGRUM)
        result = parser.parse(props["rpubl:bemyndigande"])
        bemyndigande_uris = [x.uri for x in result if hasattr(x, "uri")]

        # some of these uris need to be filtered away due to
        # over-matching by parser.parse
        filtered_bemyndigande_uris = []
        for bem_uri in bemyndigande_uris:
            keep = True
            for compare in bemyndigande_uris:
                if len(compare) > len(bem_uri) and compare.startswith(bem_uri):
                    keep = False
            if keep:
                filtered_bemyndigande_uris.append(bem_uri)

        for bem_uri in filtered_bemyndigande_uris:
            g.add((URIRef(uri),
                   self.ns["rpubl"]["bemyndigande"],
                   URIRef(bem_uri)))
            has_bemyndiganden = True
        del props["rpubl:bemyndigande"]

    tracelog.info("Cleaning rpubl:upphaver")
    if "rpubl:upphaver" in props:
        for upph in re.findall("([A-ZÅÄÖ-]+FS \d{4}:\d+)",
                               util.normalize_space(props["rpubl:upphaver"])):
            (publication, year, ordinal) = re.split("[ :]", upph)
            upphuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                publication.lower(), year, ordinal)
            g.add((URIRef(uri),
                   self.ns["rpubl"]["upphaver"],
                   URIRef(upphuri)))
        del props["rpubl:upphaver"]

    tracelog.info("Deciding rdf:type")
    if ("dct:title" in props and
            "allmänna råd" in props["dct:title"] and
            "föreskrifter" not in props["dct:title"]):
        props["rdf:type"] = self.ns["rpubl"]["AllmannaRad"]
    else:
        props["rdf:type"] = self.ns["rpubl"]["Myndighetsforeskrift"]

    # 3.5: Check to see that we have all properties that we expect
    # (should maybe be done elsewhere later?)
    tracelog.info("Checking required properties")
    for prop in ("dct:identifier", "dct:title", "rpubl:arsutgava",
                 "dct:publisher", "rpubl:beslutadAv", "rpubl:beslutsdatum",
                 "rpubl:forfattningssamling", "rpubl:ikrafttradandedatum",
                 "rpubl:lopnummer", "rpubl:utkomFranTryck"):
        if prop not in props:
            self.log.warning("%s: Failed to find %s" % (basefile, prop))

    tracelog.info("Checking rpubl:bemyndigande")
    if props["rdf:type"] == self.ns["rpubl"]["Myndighetsforeskrift"]:
        if not has_bemyndiganden:
            self.log.warning("%s: Failed to find rpubl:bemyndigande" % basefile)

    # 4: Add the cleaned data to an RDFLib graph
    # (maybe we should do this as early as possible?)
    tracelog.info("Adding items to rdflib.Graph")
    for (prop, value) in list(props.items()):
        (prefix, term) = prop.split(":", 1)
        p = self.ns[prefix][term]
        if not (isinstance(value, URIRef) or isinstance(value, Literal)):
            self.log.warning("%s: %s is a %s, not a URIRef or Literal" %
                             (basefile, prop, type(value)))
        g.add((URIRef(uri), p, value))

    # 5: Create data for the body, removing various control characters
    # TODO: Use pdftohtml to create a nice viewable HTML version
    # instead of this plaintext stuff
    reader.seek(0)
    body = []
    # A fairly involved way of filtering out all control characters
    # from a string
    import unicodedata
    if six.PY3:
        all_chars = (chr(i) for i in range(0x10000))
    else:
        all_chars = (unichr(i) for i in range(0x10000))
    control_chars = "".join(c for c in all_chars
                            if unicodedata.category(c) == "Cc")
    # tab and newline are technically control characters in unicode,
    # but we want to keep them.
    control_chars = control_chars.replace("\t", "").replace("\n", "")
    control_char_re = re.compile("[%s]" % re.escape(control_chars))
    for page in reader.getiterator(reader.readpage):
        text = xml_escape(control_char_re.sub("", page))
        body.append("<pre>%s</pre>\n\n" % text)

    # 5: Done!
    doc.body = body
    doc.lang = "sv"
    doc.uri = uri
    return doc
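To make the fwdtests/revtests mechanism above concrete, here is a standalone sketch of how one of the rpubl:beslutsdatum patterns picks a date out of a page of text; the sample paragraph is invented and not taken from any real författning:

import re

pattern = r"(?:har beslutats|beslutade|beslutat) den (\d+ \w+ \d{4})"
sample = "Dessa föreskrifter har beslutats den 5 mars 2010 och träder i kraft den 1 juli 2010."
m = re.search(pattern, sample, re.MULTILINE | re.DOTALL | re.UNICODE)
print(m.group(1))  # '5 mars 2010', later converted by parse_swedish_date()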