def find_firstpage_metadata(self, firstpage, basefile):
    res = {}
    m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
                  util.normalize_space(firstpage), flags=re.I)
    if not m:
        self.log.warning(
            "%s: Couldn't find title in first %s characters (first page)" %
            (basefile, len(firstpage)))
    else:
        # use group(1), not groups(1) -- the latter returns a tuple,
        # which would end up as the literal title value
        res["dcterms:title"] = m.group(1)
    m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})",
                  util.normalize_space(firstpage), flags=re.I)
    if not m:
        self.log.warning(
            "%s: Couldn't find date in first %s characters (first page)" %
            (basefile, len(firstpage)))
    else:
        try:
            res["dcterms:issued"] = self.parse_swedish_date(
                m.group(1).lower())
        except ValueError as e:
            self.log.warning("%s: Couldn't parse date %s" %
                             (basefile, m.group(1)))
    return res
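# Illustrative sketch (not part of ferenda): how the two regexes above behave
# against a hypothetical first page. The sample text is invented, and
# normalize_space is approximated inline as whitespace collapsing.
import re

def normalize_space(s):
    return " ".join(s.split())

firstpage = """Kungl. Maj:ts proposition till riksdagen med förslag till
lag om ändring i rättegångsbalken; gifven
Stockholms slott den 22 februari 1924."""

text = normalize_space(firstpage)
title = re.search("proposition till riksdagen *,? *(.*?); gif?ven", text, flags=re.I)
issued = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", text, flags=re.I)
print(title.group(1))   # "med förslag till lag om ändring i rättegångsbalken"
print(issued.group(1))  # "22 februari 1924"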
def extract_metadata(self, rawhead, basefile):
    d = self.metadata_from_basefile(basefile)
    if rawhead:  # sometimes there's no headnote.html
        for label, key in {"Ämbetsberättelse": 'dcterms:bibliographicCitation',
                           "Beslutsdatum": 'dcterms:issued',
                           "Diarienummer": 'rpubl:diarienummer'}.items():
            labelnode = rawhead.find(text=re.compile("%s:" % label))
            if labelnode:
                d[key] = util.normalize_space(labelnode.next_sibling.text)
        # this data might contain spurious spaces due to <span
        # class="Definition"> tags -- see eg 3128-2002. Data in
        # the document is preferable
        d["dcterms:title"] = util.normalize_space(rawhead.find("h2").text)
    return d
def download_get_basefiles_page(self, pagetree):
    # feed the lxml tree into beautifulsoup by serializing it to a
    # string -- is there a better way?
    soup = BeautifulSoup(etree.tostring(pagetree))
    for tr in soup.findAll("tr"):
        if ((not tr.find("a")) or
                not re.match(self.basefile_regex, tr.find("a").text)):
            # FIXME: Maybe re.search instead of .match to find
            # "Prop. 2012/13:152"
            continue
        # First, look at desc (third td):
        descnodes = [util.normalize_space(x)
                     for x in tr.find_all("td")[2]
                     if isinstance(x, str)]
        bilaga = None
        if len(descnodes) > 1:
            if descnodes[1].startswith("Bilaga:"):
                bilaga = util.normalize_space(descnodes[0].split(",")[-1])
        desc = "\n".join(descnodes)
        # then, find basefile (second td)
        tds = tr.find_all("td")
        td = tds[1]
        basefile = td.a.text
        assert re.match(self.basefile_regex, basefile)
        basefile = self.sanitize_basefile(basefile)
        url = td.a['href']
        # self.download_single(basefile, refresh=refresh, url=url)
        # and, if present, extra files (in td 4+5)
        extraurls = []
        for td in tr.findAll("td")[3:]:
            extraurls.append(td.a['href'])
        # we slightly abuse the protocol between
        # download_get_basefiles and this generator -- instead of
        # yielding just two strings, we yield two tuples with some
        # extra information that download_single will need.
        yield (basefile, bilaga), (url, extraurls)
    nextpage = None
    for element, attribute, link, pos in pagetree.iterlinks():
        if element.text == "Fler poster":
            nextpage = link
    raise NoMoreLinks(nextpage)
def test_fallback_ocr(self):
    try:
        # actually running tesseract takes ages -- for day-to-day
        # testing we can just as well use the canned hocr.html
        # files that _copy_sample fixes for us.
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                           workdir=self.datadir,
                           images=False)
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                           workdir=self.datadir,
                           images=False)
    self.assertTrue(reader.is_empty())
    reader = PDFReader(filename="test/files/pdfreader/scanned-ecma-99.pdf",
                       workdir=self.datadir,
                       ocr_lang="eng")
    self.assertFalse(reader.is_empty())
    self.assertEqual(2, len(reader))
    self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                     util.normalize_space(str(reader[0][1])))
def sanitize_metadata(self, a, basefile):
    # trim space
    for k in ("dcterms:title", "dcterms:abstract"):
        if k in a:
            a[k] = util.normalize_space(a[k])
    # trim identifier
    a["dcterms:identifier"] = self.sanitize_identifier(
        a["dcterms:identifier"].replace("ID-nummer: ", ""))
    # save for later
    self._identifier = a["dcterms:identifier"]
    # it's rare, but in some cases a document can be published by
    # two different departments (eg dir. 2011:80). Convert string
    # to a list in these cases (SwedishLegalSource.polish_metadata
    # will handle that)
    if "rpubl:departement" in a and ", " in a["rpubl:departement"]:
        a["rpubl:departement"] = a["rpubl:departement"].split(", ")
    # remove empty utgarFran list
    if a["rpubl:utgarFran"]:
        a["rpubl:utgarFran"] = [URIRef(x) for x in a["rpubl:utgarFran"]]
    else:
        del a["rpubl:utgarFran"]
    # FIXME: possibly derive utrSerie from self.document_type?
    if self.rdf_type == RPUBL.Utredningsbetankande:
        altlabel = "SOU" if self.document_type == Regeringen.SOU else "Ds"
        a["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
    return a
def parse_antiword_docbook(self, text, basefile):
    soup = BeautifulSoup(text)
    head = {}
    header_elements = soup.find("para")
    header_text = ''
    for el in header_elements.contents:
        if hasattr(el, 'name') and el.name == "informaltable":
            break
        else:
            header_text += el.string
    # At the top of every decision is the name of the court ("Högsta
    # domstolen") followed by the report number ("NJA 1987
    # s. 113"). Depending on the Word document, the XML structure
    # varies. The common case is that the information is in a
    # pipe-separated paragraph:
    parts = [x.strip() for x in header_text.split("|")]
    if len(parts) > 1:
        head['Domstol'] = parts[0]
        head['Referat'] = parts[1]
    else:
        # otherwise, they are on the first row of an informaltable
        row = soup.find("informaltable").tgroup.tbody.row.findAll('entry')
        head['Domstol'] = row[0].get_text(strip=True)
        head['Referat'] = row[1].get_text(strip=True)
    # Find the other simple metadata fields in the page header
    for key in self.labels:
        node = soup.find(text=re.compile(key + ':'))
        if node:
            txt = node.find_parent('entry').find_next_sibling('entry').get_text(strip=True)
            if txt:
                head[key] = txt
    # Find compound metadata in the page header
    for key in ["Lagrum", "Rättsfall"]:
        node = soup.find(text=re.compile(key + ':'))
        if node:
            head[key] = []
            textchunk = node.find_parent(
                'entry').find_next_sibling('entry').string
            for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]:
                if line:
                    head[key].append(line)
    body = []
    for p in soup.find(text=re.compile('REFERAT')).find_parent(
            'tgroup').find_next_sibling('tgroup').find(
            'entry').get_text(strip=True).split("\n\n"):
        body.append(p)
    # Find compound metadata in the page footer
    head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent(
        'entry').next_sibling.next_sibling.get_text(strip=True)
    if soup.find(text=re.compile('^\s*Litteratur:\s*$')):
        n = soup.find(text=re.compile('^\s*Litteratur:\s*$')).find_parent(
            'entry').next_sibling.next_sibling.get_text(strip=True)
        head['Litteratur'] = n
    return head, body
def sanitize_term(self, term):
    # sanity checking -- not everything can be a legit
    # keyword. Must be under 100 chars and not start with . or /
    term = util.normalize_space(term)
    if (self.term_max_len >= len(term) >= self.term_min_len and
            term[0] not in self.invalid_term_start and
            term[-1] not in self.invalid_term_end):
        return term
def sanitize_metadata(self, attribs, basefile):
    attribs = super(PropTrips, self).sanitize_metadata(attribs, basefile)
    if ('dcterms:title' in attribs and
            'dcterms:identifier' in attribs and
            attribs['dcterms:title'].endswith(attribs['dcterms:identifier'])):
        x = attribs['dcterms:title'][:-len(attribs['dcterms:identifier'])]
        attribs['dcterms:title'] = util.normalize_space(x)
    return attribs
def htmlparser(chunks):
    b = Body()
    for block in chunks:
        tagtype = Preformatted if block.name == "pre" else Paragraph
        t = util.normalize_space(''.join(block.findAll(text=True)))
        block.extract()  # to avoid seeing it again
        if t:
            b.append(tagtype([t]))
    return b
def as_plaintext(self):
    """Returns the plain text of this element, including child elements."""
    res = []
    for subpart in self:
        if isinstance(subpart, str):
            res.append(util.normalize_space(subpart))
        elif (isinstance(subpart, AbstractElement) or
              hasattr(subpart, 'as_plaintext')):
            res.append(subpart.as_plaintext())
    # the rule for concatenating children into a plaintext string is:
    # filter out all empty children, then place single space between the others.
    return " ".join(filter(None, res))
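# Illustrative sketch (not part of ferenda): the concatenation rule described
# in the comment above, using a made-up stand-in for the element classes.
class El(list):
    def as_plaintext(self):
        res = []
        for subpart in self:
            if isinstance(subpart, str):
                res.append(" ".join(subpart.split()))  # assumed normalize_space behaviour
            elif hasattr(subpart, "as_plaintext"):
                res.append(subpart.as_plaintext())
        # empty children are filtered out, the rest joined by single spaces
        return " ".join(filter(None, res))

tree = El(["  First   chunk ", El(["nested", ""]), "", "last"])
print(tree.as_plaintext())  # "First chunk nested last"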
def extract_metadata(self, rawhead, basefile):
    res = self.metadata_from_basefile(basefile)
    # extracting title and other metadata (dep, publication date
    # etc) requires parsing of the body (and subsequent processing
    # in postprocess_doc). For documents marked as metadataonly in
    # options.py, the body is never parsed. Therefore, we do a
    # very limited parsing of the first page here.
    if self.get_parse_options(basefile) == "metadataonly":
        text = util.normalize_space(
            etree.tostring(rawhead, method="text",
                           encoding="utf-8").decode("utf-8"))
        res.update(self.find_firstpage_metadata(text, basefile))
    return res
def sanitize_metadata(self, attribs, basefile):
    # remove trailing "Avgörande 1993-05-03; 92-2571"
    if attribs['dcterms:title'].strip():
        attribs['dcterms:title'] = Literal(
            re.sub("Avgörande \d+-\d+-\d+; \d+-\d+\.?", "",
                   util.normalize_space(attribs['dcterms:title'])),
            lang="sv")
    else:
        # no real content -- delete it and fill the value with
        # stuff from the document later.
        del attribs['dcterms:title']
    return attribs
def _extract_plaintext(self, resource, resources):
    about = resource.get("about")
    if about and "#sid" in about:
        # select all text content contained in the first 2 <p>
        # tags following the pagebreak -- this should typically be
        # enough to show a helpful snippet in the autocomplete box
        nodes = resource.xpath("following::h:p[position() < 2]//text()",
                               namespaces={'h': 'http://www.w3.org/1999/xhtml'})
        plaintext = util.normalize_space(" ".join(nodes))
        if not plaintext:
            plaintext = "(Sid %s saknar text)" % about.split("#sid")[1]
        return plaintext
    else:
        return super(FixedLayoutSource, self)._extract_plaintext(resource,
                                                                 resources)
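# Illustrative sketch (not part of ferenda): the same XPath evaluated with lxml
# over a tiny hand-made XHTML fragment. Note that position() < 2 selects only
# the first following <p>.
from lxml import etree

XHTML = "http://www.w3.org/1999/xhtml"
doc = etree.fromstring(
    '<html xmlns="%s"><body>'
    '<div about="http://example.org/doc#sid2"/>'
    '<p>First paragraph after the pagebreak.</p>'
    '<p>Second paragraph.</p>'
    '</body></html>' % XHTML)
div = doc.find(".//{%s}div" % XHTML)
nodes = div.xpath("following::h:p[position() < 2]//text()",
                  namespaces={'h': XHTML})
print(" ".join(nodes))  # "First paragraph after the pagebreak."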
def postprocess_doc(self, doc):
    if self.get_parse_options(doc.basefile) == "metadataonly":
        return
    # the first thing will be a Sidbrytning; continue scanning
    # text until next sidbrytning
    firstpage = ""
    for thing in doc.body[1:]:
        if isinstance(thing, Sidbrytning):
            break
        elif isinstance(thing, Textbox):
            firstpage += util.normalize_space(str(thing)) + "\n\n"
    metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
    if "dcterms:title" in metadata:
        doc.meta.add((URIRef(doc.uri), DCTERMS.title,
                      Literal(metadata["dcterms:title"], lang=self.lang)))
    if "dcterms:issued" in metadata:
        doc.meta.add((URIRef(doc.uri), DCTERMS.issued,
                      Literal(metadata["dcterms:issued"])))
def parse_from_textreader(self, textreader, doc):
    describer = Describer(doc.meta, doc.uri)
    for p in textreader.getiterator(textreader.readparagraph):
        # print "Handing %r (%s)" % (p[:40], len(doc.body))
        if not p.strip():
            continue
        elif not doc.body and 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
            continue
        elif not doc.body and p.strip().startswith("Dokument:"):
            # We already know this
            continue
        elif not doc.body and p.strip().startswith("Titel:"):
            describer.value(
                self.ns['dct'].title, util.normalize_space(p[7:]))
        else:
            doc.body.append(Preformatted([p]))
def sanitize_metadata(self, a, basefile):
    # trim space
    for k in ("dcterms:title", "dcterms:abstract"):
        if k in a:
            a[k] = util.normalize_space(a[k])
    # trim identifier
    try:
        # The identifier displayed on the HTML page is not always
        # correct -- it might be missing digits (eg "SOU 207:111"
        # instead of "SOU 2017:111"). Try to sanitize it, but if
        # we fail, infer it from our basefile instead.
        a["dcterms:identifier"] = self.sanitize_identifier(
            a["dcterms:identifier"].replace("ID-nummer: ", ""))
    except ValueError as e:
        inferred_identifier = str(self.infer_identifier(basefile))
        self.log.warning(
            "%s: Irregular identifier %s, using inferred identifier %s instead" %
            (basefile, a["dcterms:identifier"], inferred_identifier))
        a["dcterms:identifier"] = inferred_identifier
    # save for later
    self._identifier = a["dcterms:identifier"]
    # it's rare, but in some cases a document can be published by
    # two different departments (eg dir. 2011:80). Convert string
    # to a list in these cases (SwedishLegalSource.polish_metadata
    # will handle that)
    if "rpubl:departement" in a and ", " in a["rpubl:departement"]:
        a["rpubl:departement"] = a["rpubl:departement"].split(", ")
    # remove empty utgarFran list
    if a["rpubl:utgarFran"]:
        a["rpubl:utgarFran"] = [URIRef(x) for x in a["rpubl:utgarFran"]]
    else:
        del a["rpubl:utgarFran"]
    # FIXME: possibly derive utrSerie from self.document_type?
    if self.rdf_type == RPUBL.Utredningsbetankande:
        altlabel = "SOU" if self.document_type == Regeringen.SOU else "Ds"
        a["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
    return a
def test_ocr(self):
    try:
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                           workdir=self.datadir,
                           ocr_lang="swe")
    except errors.ExternalCommandError:
        self._copy_sample()
        reader = PDFReader(filename="test/files/pdfreader/scanned.pdf",
                           workdir=self.datadir,
                           ocr_lang="swe")
    # assert that a hOCR file has been created
    self.assertTrue(
        os.path.exists(self.datadir + os.sep + "scanned.hocr.html"))
    # assert that we have two pages
    self.assertEqual(2, len(reader))
    # assert that first element in the first textbox in the first
    # page corresponds to the first bbox, scaled by the
    # pixel/point scaling factor.
    self.assertEqual("Regeringens ", str(reader[0][0][0]))
    self.assertEqual(47, reader[0][0][0].top)
    self.assertEqual(38, reader[0][0][0].left)
    self.assertEqual(21, reader[0][0][0].height)
    self.assertEqual(118, reader[0][0][0].width)
    # assert that the fifth (formerly third) textbox (which has mostly
    # normal text) is rendered correctly (note that we have a
    # couple of OCR errors).
    # self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i bifogade utdrag ur regeringsprotokollet den 31 oktober l99l.", util.normalize_space(str(reader[0][3])))
    self.assertEqual(
        "Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i",
        util.normalize_space(str(reader[0][5])))
def analyze_baseline_queries(self, analyzed_articles, num_of_keyterms=5):
    basefile = "tfeu"

    # Helper from http://effbot.org/zone/element-lib.htm
    def flatten(elem, include_tail=0):
        text = elem.text or ""
        for e in elem:
            text += flatten(e, 1)
        if include_tail and elem.tail:
            text += elem.tail
        return text

    # step 1: Create a temporary whoosh index in order to find out
    # the most significant words for each article
    # ana = analysis.StandardAnalyzer()
    ana = analysis.StemmingAnalyzer()
    # vectorformat = formats.Frequency(ana)
    schema = fields.Schema(article=fields.ID(unique=True),
                           content=fields.TEXT(analyzer=ana, stored=True))
    st = RamStorage()
    tmpidx = st.create_index(schema)
    w = tmpidx.writer()
    XHT_NS = "{http://www.w3.org/1999/xhtml}"
    tree = ET.parse(self.parsed_path(basefile))
    els = tree.findall("//" + XHT_NS + "div")
    articles = []
    for el in els:
        if 'typeof' in el.attrib and el.attrib['typeof'] == "eurlex:Article":
            text = util.normalize_space(flatten(el))
            article = str(el.attrib['about'])
            articles.append(article)
            w.update_document(article=article, content=text)
    w.commit()
    self.log.info("Indexed %d articles" % len(articles))

    # Step 2: Open the large whoosh index containing the text of
    # all cases. Then, for each article, use the 5 most distinctive terms
    # (filtering away numbers) to create a query against that index
    tempsearch = tmpidx.searcher()
    g = Graph()
    g.bind('celex', 'http://lagen.nu/ext/celex/')
    g.bind('ir', 'http://lagen.nu/informationretrieval#')
    IR = Namespace('http://lagen.nu/informationretrieval#')
    # celex:12008E264 ir:keyterm "blahonga"@en.
    outfile = self.generic_path("keyterms", "analyzed", ".tex")
    util.ensure_dir(outfile)
    fp = open(outfile, "w")
    fp.write("""
\\begin{tabular}{r|%s}
  \\hline
  \\textbf{Art.} & \\multicolumn{%s}{l}{\\textbf{Terms}} \\\\
  \\hline
""" % ("l" * num_of_keyterms, num_of_keyterms))
    for article in analyzed_articles:
        fp.write(str(int(article.split("E")[1])))
        r = tempsearch.search(query.Term("article", article))
        terms = r.key_terms("content", numterms=num_of_keyterms + 1)
        terms = [t[0] for t in terms if not t[0].isdigit()][:num_of_keyterms]
        for term in terms:
            fp.write(" & " + term)
            g.add((URIRef(article),
                   IR["keyterm"],
                   Literal(term, lang="en")))
        self.log.debug("Article %s:%r" % (article, terms))
        fp.write("\\\\\n")
    fp.write("""
  \\hline
\\end{tabular}
""")
    fp.close()

    outfile = self.generic_path("keyterms", "analyzed", ".n3")
    util.ensure_dir(outfile)
    fp = open(outfile, "w")
    fp.write(g.serialize(format="n3"))
    fp.close()
def polish_metadata(self, head, doc):
    basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

    def basefile_to_referat(basefile):
        templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                 'MD': 'MD %(year)s:%(ordinal)s'}
        m = basefile_regex.match(basefile)
        if m:
            return templ[m.group("type")] % (m.groupdict())

    def ref_to_uri(ref):
        # FIXME: We'd like to retire legalref and replace it with
        # pyparsing grammars.
        nodes = self.rattsfall_parser.parse(ref)
        uri = nodes[0].uri
        return localize_uri(uri)

    def dom_to_uri(domstol, malnr, avg):
        baseuri = self.config.url
        slug = self.slugs[domstol]
        return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

    def localize_uri(uri):
        if "publ/rattsfall" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                               self.config.url + "res/dv")
        elif "publ/sfs/" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                               self.config.url + "res/sfs")

    def split_nja(value):
        # "NJA 2008 s 567 (NJA 2008:86)" => ("NJA 2008 s 567", "NJA 2008:86")
        return [x[:-1] for x in value.split("(")]

    def sokord_uri(value):
        return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

    # 0. create Referat key if not present
    if "Referat" not in head:
        # For some courts (MD, AD, MOD?, MIG?) this is possible
        head["Referat"] = basefile_to_referat(doc.basefile)

    # 1. mint uris and create the two Describers we'll use
    refuri = ref_to_uri(head["Referat"])
    refdesc = Describer(doc.meta, refuri)
    domuri = dom_to_uri(head["Domstol"],
                        head["Målnummer"],
                        head["Avgörandedatum"])
    domdesc = Describer(doc.meta, domuri)

    # 2. convert all strings in head to proper RDF
    for label, value in head.items():
        if label == "Rubrik":
            value = util.normalize_space(value)
            refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
            domdesc.value(self.ns['dct'].title, value, lang="sv")
        elif label == "Domstol":
            domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
        elif label == "Målnummer":
            domdesc.rel(self.ns['rpubl'].malnummer, value)
        elif label == "Domsnummer":
            domdesc.rel(self.ns['rpubl'].domsnummer, value)
        elif label == "Diarienummer":
            domdesc.rel(self.ns['rpubl'].diarienummer, value)
        elif label == "Avdelning":
            domdesc.rel(self.ns['rpubl'].avdelning, value)
        elif label == "Referat":
            for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                'arsutgava': r'(\d{4})',
                                'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                'sidnummer': r's.? ?(\d+)'}.items():
                m = re.search(regex, value)
                if m:
                    if pred == 'rattsfallspublikation':
                        # "NJA" -> "http://localhost:8000/coll/dv/nja"
                        uri = self.config.url + "coll/dv/" + m.group(1).lower()
                        refdesc.rel(self.ns['rpubl'][pred], uri)
                    else:
                        refdesc.value(self.ns['rpubl'][pred], m.group(1))
            if value.startswith("NJA"):
                realvalue, extra = split_nja(value)
                ordinal = extra.split(" ")[1]
                refdesc.value(self.ns['dct'].bibliographicCitation, extra)
                refdesc.rel(self.ns['owl'].sameAs,
                            self.config.url + "res/dv/nja/" + ordinal)
                refdesc.value(self.ns['dct'].identifier, realvalue)
            else:
                refdesc.value(self.ns['dct'].identifier, value)
        elif label == "Avgörandedatum":
            with util.c_locale():
                d = datetime.strptime(value, '%Y-%m-%d')
            domdesc.value(self.ns['rpubl'].avgorandedatum, d)
        elif label == "Lagrum":
            for i in value:  # better be list not string
                for node in self.lagrum_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].lagrum,
                                    localize_uri(node.uri))
        elif label == "Rättsfall":
            for i in value:
                for node in self.rattsfall_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].rattsfall,
                                    localize_uri(node.uri))
        elif label == "Litteratur":
            for i in value.split(";"):
                domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
        elif label == "Sökord":
            for s in self.re_delimSplit(value):
                s = util.normalize_space(s)
                if not s:
                    continue
                # terms longer than 72 chars are not legitimate
                # terms. more likely descriptions. If a term has a - in
                # it, it's probably a separator between a term and a
                # description
                while len(s) >= 72 and " - " in s:
                    h, s = s.split(" - ", 1)
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                if len(s) < 72:
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

    # 3. mint some owl:sameAs URIs
    refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
    domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

    # 4. Add some same-for-everyone properties
    refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
    refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
    domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
    refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
    # 5. assert that we have everything we need
    # 6. done!
    return refuri
def header_lines(self, header_chunk):
    header = re.compile("([^:]+):\s*<b>([^<]*)</b>")
    for m in header.finditer(header_chunk):
        yield [util.normalize_space(x) for x in m.groups()]
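# Illustrative sketch (not part of ferenda): what header_lines yields for a
# made-up header fragment; normalize_space is approximated inline.
import re

header_chunk = """Dokument: <b>Prop. 1997/98:44</b>
Titel:   <b>Personuppgiftslag</b>"""

header = re.compile("([^:]+):\s*<b>([^<]*)</b>")
for m in header.finditer(header_chunk):
    print([" ".join(x.split()) for x in m.groups()])
# ['Dokument', 'Prop. 1997/98:44']
# ['Titel', 'Personuppgiftslag']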
def find_definitions(self, element, find_definitions):
    if not isinstance(element, CompoundElement):
        return None
    find_definitions_recursive = find_definitions
    # Find term definitions
    if isinstance(element, Paragraf):
        # check whether the first paragraph contains text that
        # suggests that definitions follow
        # self.log.debug("Testing %r against some regexes" % element[0][0])
        if self.re_definitions(element[0][0]):
            find_definitions = "normal"
        if (self.re_brottsdef(element[0][0]) or
                self.re_brottsdef_alt(element[0][0])):
            find_definitions = "brottsrubricering"
        if self.re_parantesdef(element[0][0]):
            find_definitions = "parantes"
        if self.re_loptextdef(element[0][0]):
            find_definitions = "loptext"
        for p in element:
            if isinstance(p, Stycke):
                # do an extra check in case "I denna paragraf
                # avses med" occurs in the 2nd or later
                # paragraph of a section
                if self.re_definitions(p[0]):
                    find_definitions = "normal"
        find_definitions_recursive = find_definitions
    # Find statute references + definitions
    if isinstance(element, (Stycke, Listelement, Tabellrad)):
        nodes = []
        term = None
        # self.log.debug("handling text %s, find_definitions %s" % (element[0], find_definitions))
        if find_definitions:
            # For Tabellrad, this is a Tabellcell, not a string,
            # but we fix that later
            elementtext = element[0]
            termdelimiter = ":"
            if isinstance(element, Tabellrad):
                # only the first cell can be a definition, and
                # only if it's not the text "Beteckning". So for
                # the remainder of this func, we switch context to
                # not the element itself but rather the first
                # cell.
                element = elementtext
                elementtext = element[0]
                if elementtext != "Beteckning":
                    term = elementtext
                    self.log.debug(
                        '"%s" är nog en definition (1)' % term)
            elif isinstance(element, Stycke):
                # Case 1: "antisladdsystem: ett tekniskt stödsystem"
                # Sometimes, : is not the delimiter between
                # the term and the definition, but even in
                # those cases, : might figure in the
                # definition itself, usually as part of the
                # SFS number. Do some hairy heuristics to find
                # out what delimiter to use
                if find_definitions == "normal":
                    if not self.re_definitions(elementtext):
                        if " - " in elementtext:
                            if (":" in elementtext and
                                    (elementtext.index(":") <
                                     elementtext.index(" - "))):
                                termdelimiter = ":"
                            else:
                                termdelimiter = " - "
                        m = self.re_SearchSfsId(elementtext)
                        if termdelimiter == ":" and m and m.start() < elementtext.index(":"):
                            termdelimiter = " "
                        if termdelimiter in elementtext:
                            term = elementtext.split(termdelimiter)[0]
                            self.log.debug('"%s" är nog en definition (2.1)' % term)
                # case 2: "Den som berövar annan livet, döms
                # för mord till fängelse"
                m = self.re_brottsdef(elementtext)
                if m:
                    term = m.group(2)
                    self.log.debug(
                        '"%s" är nog en definition (2.2)' % term)
                # case 3: "För miljöbrott döms till böter"
                m = self.re_brottsdef_alt(elementtext)
                if m:
                    term = m.group(1)
                    self.log.debug(
                        '"%s" är nog en definition (2.3)' % term)
                # case 4: "Inteckning får på ansökan av
                # fastighetsägaren dödas (dödning)."
                m = self.re_parantesdef(elementtext)
                if m:
                    term = m.group(1)
                    # print("%s: %s" % (basefile, elementtext))
                    self.log.debug(
                        '"%s" är nog en definition (2.4)' % term)
                # case 5: "Med detaljhandel avses i denna lag
                # försäljning av läkemedel"
                m = self.re_loptextdef(elementtext)
                if m:
                    term = m.group(1)
                    self.log.debug(
                        '"%s" är nog en definition (2.5)' % term)
            elif isinstance(element, Listelement):
                for rx in (self.re_Bullet,
                           self.re_DottedNumber,
                           self.re_Bokstavslista):
                    elementtext = rx.sub('', elementtext)
                term = elementtext.split(termdelimiter)[0]
                self.log.debug('"%s" är nog en definition (3)' % term)
            # Longest legitimate term found "Valutaväxling,
            # betalningsöverföring och annan finansiell
            # verksamhet"
            if term and len(term) < 68:
                term = util.normalize_space(term)
                termnode = LinkSubject(term, uri=self._term_to_subject(term),
                                       predicate="dcterms:subject")
                find_definitions_recursive = False
            else:
                term = None
        if term:
            idx = None
            for p in element:
                if isinstance(p, str) and term in p:
                    (head, tail) = p.split(term, 1)
                    nodes = (head, termnode, tail)
                    idx = element.index(p)
            if not idx is None:
                element[idx:idx + 1] = nodes
    return find_definitions_recursive
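# Illustrative sketch (not part of ferenda): the delimiter heuristic from
# "Case 1" above on a couple of invented statute lines. The SFS-number regex
# is an assumption about the shape of self.re_SearchSfsId.
import re

re_SearchSfsId = re.compile(r"\(\d{4}:\d+\)").search

def guess_term(text):
    termdelimiter = ":"
    if " - " in text:
        if ":" in text and text.index(":") < text.index(" - "):
            termdelimiter = ":"
        else:
            termdelimiter = " - "
    m = re_SearchSfsId(text)
    if termdelimiter == ":" and m and m.start() < text.index(":"):
        termdelimiter = " "
    return text.split(termdelimiter)[0] if termdelimiter in text else None

print(guess_term("antisladdsystem: ett tekniskt stödsystem"))          # "antisladdsystem"
print(guess_term("motordrivet fordon - fordon som drivs med motor"))   # "motordrivet fordon"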
def parse_from_textreader(self, reader, basefile):
    tracelog = logging.getLogger("%s.tracelog" % self.alias)
    doc = self.make_document(basefile)
    g = doc.meta

    # 1.2: Load known entities and their URIs (we have to add some
    # that are not yet in the official resource lists)
    resource_list_file = self.store.path("resourcelist", "intermediate", ".rdf")
    if not os.path.exists(resource_list_file):
        self.download_resource_lists("http://service.lagrummet.se/var/common",
                                     resource_list_file)
    resources = Graph()
    resources.parse(resource_list_file, format="xml")

    # 1.3: Define regexps for the data we search for.
    fwdtests = {
        "dct:issn": ["^ISSN (\d+\-\d+)$"],
        "dct:title": ["((?:Föreskrifter|[\w ]+s (?:föreskrifter|allmänna råd)).*?)\n\n"],
        "dct:identifier": ["^([A-ZÅÄÖ-]+FS\s\s?\d{4}:\d+)$"],
        "rpubl:utkomFranTryck": ["Utkom från\strycket\s+den\s(\d+ \w+ \d{4})"],
        "rpubl:omtryckAv": ["^(Omtryck)$"],
        "rpubl:genomforDirektiv": ["Celex (3\d{2,4}\w\d{4})"],
        "rpubl:beslutsdatum": ["(?:har beslutats|beslutade|beslutat) den (\d+ \w+ \d{4})"],
        "rpubl:beslutadAv": [
            "\n([A-ZÅÄÖ][\w ]+?)\d? (?:meddelar|lämnar|föreskriver)",
            "\s(?:meddelar|föreskriver) ([A-ZÅÄÖ][\w ]+?)\d?\s",
        ],
        "rpubl:bemyndigande": [
            " ?(?:meddelar|föreskriver|Föreskrifterna meddelas|Föreskrifterna upphävs)\d?,? (?:följande |)med stöd av\s(.*?) ?(?:att|efter\ssamråd|dels|följande|i fråga om|och lämnar allmänna råd|och beslutar följande allmänna råd|\.\n)",
            "^Med stöd av (.*)\s(?:meddelar|föreskriver)",
        ],
    }

    # 2: Find metadata properties
    #
    # 2.1 Find some of the properties on the first page (or the
    # 2nd, or 3rd... continue past TOC pages, cover pages etc
    # until the "real" first page is found) NB: FFFS 2007:1 has
    # ten (10) TOC pages!
    pagecnt = 0
    for page in reader.getiterator(reader.readpage):
        # replace single newlines with spaces, but keep double
        # newlines
        # page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
        pagecnt += 1
        props = {}
        for (prop, tests) in list(fwdtests.items()):
            if prop in props:
                continue
            for test in tests:
                m = re.search(test, page, re.MULTILINE | re.DOTALL | re.UNICODE)
                if m:
                    props[prop] = util.normalize_space(m.group(1))
        # Single required property. If we find this, we're done
        if "rpubl:beslutsdatum" in props:
            break
        self.log.warning("%s: Couldn't find required props on page %s" %
                         (basefile, pagecnt))

    # 2.2 Find some of the properties on the last 'real' page (not
    # counting appendices)
    reader.seek(0)
    pagesrev = reversed(list(reader.getiterator(reader.readpage)))
    # The language used to express these two properties differs
    # quite a lot, more than what is reasonable to express in a
    # single regex. We therefore define a set of possible
    # expressions and try them in turn.
    revtests = {
        "rpubl:ikrafttradandedatum": [
            "(?:Denna författning|Dessa föreskrifter|Dessa allmänna råd|Dessa föreskrifter och allmänna råd)\d* träder i ?kraft den (\d+ \w+ \d{4})",
            "Dessa föreskrifter träder i kraft, (?:.*), i övrigt den (\d+ \w+ \d{4})",
            "ska(?:ll|)\supphöra att gälla (?:den |)(\d+ \w+ \d{4}|denna dag|vid utgången av \w+ \d{4})",
            "träder i kraft den dag då författningen enligt uppgift på den (utkom från trycket)",
        ],
        "rpubl:upphaver": [
            "träder i kraft den (?:\d+ \w+ \d{4}), då(.*)ska upphöra att gälla",
            "ska(?:ll|)\supphöra att gälla vid utgången av \w+ \d{4}, nämligen(.*?)\n\n",
            "att (.*) skall upphöra att gälla (denna dag|vid utgången av \w+ \d{4})",
        ],
    }
    cnt = 0
    for page in pagesrev:
        cnt += 1
        # Normalize the whitespace in each paragraph so that a
        # linebreak in the middle of the natural language
        # expression doesn't break our regexes.
        page = "\n\n".join([util.normalize_space(x) for x in page.split("\n\n")])
        for (prop, tests) in list(revtests.items()):
            if prop in props:
                continue
            for test in tests:
                # Not re.DOTALL -- we've normalized whitespace and
                # don't want to match across paragraphs
                m = re.search(test, page, re.MULTILINE | re.UNICODE)
                if m:
                    props[prop] = util.normalize_space(m.group(1))
                    # print u"%s: '%s' resulted in match '%s' at page %s from end" %
                    # (prop, test, props[prop], cnt)
        # Single required property. If we find this, we're done
        if "rpubl:ikrafttradandedatum" in props:
            break

    # 3: Clean up data - converting strings to Literals or
    # URIRefs, find legal references, etc
    if "dct:identifier" in props:
        (publication, year, ordinal) = re.split("[ :]", props["dct:identifier"])
        # FIXME: Read resources graph instead
        fs = resources.value(predicate=self.ns["skos"].altLabel,
                             object=Literal(publication, lang="sv"))
        props["rpubl:forfattningssamling"] = fs
        publ = resources.value(subject=fs, predicate=self.ns["dct"].publisher)
        props["dct:publisher"] = publ
        props["rpubl:arsutgava"] = Literal(year)  # conversion to int, date not needed
        props["rpubl:lopnummer"] = Literal(ordinal)
        props["dct:identifier"] = Literal(props["dct:identifier"])
        # Now we can mint the uri (should be done through LegalURI)
        uri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
            props["rpubl:forfattningssamling"].split("/")[-1],
            props["rpubl:arsutgava"],
            props["rpubl:lopnummer"],
        )
        self.log.debug("URI: %s" % uri)
    else:
        self.log.error("Couldn't find dct:identifier, cannot create URI, giving up")
        return None

    tracelog.info("Cleaning rpubl:beslutadAv")
    if "rpubl:beslutadAv" in props:
        agency = resources.value(
            predicate=self.ns["foaf"].name,
            object=Literal(props["rpubl:beslutadAv"], lang="sv")
        )
        if agency:
            props["rpubl:beslutadAv"] = agency
        else:
            self.log.warning("Cannot find URI for rpubl:beslutadAv value %r" %
                             props["rpubl:beslutadAv"])
            del props["rpubl:beslutadAv"]

    tracelog.info("Cleaning dct:issn")
    if "dct:issn" in props:
        props["dct:issn"] = Literal(props["dct:issn"])

    tracelog.info("Cleaning dct:title")
    # common false positive
    if "dct:title" in props and "denna författning har beslutats den" in props["dct:title"]:
        del props["dct:title"]
    if "dct:title" in props:
        tracelog.info("Inspecting dct:title %r" % props["dct:title"])
        # sometimes the title isn't separated with two newlines from the rest of the text
        if "\nbeslutade den " in props["dct:title"]:
            props["dct:title"] = props["dct:title"].split("\nbeslutade den ")[0]
        props["dct:title"] = Literal(util.normalize_space(props["dct:title"]), lang="sv")
        if re.search("^(Föreskrifter|[\w ]+s föreskrifter) om ändring i ",
                     props["dct:title"], re.UNICODE):
            tracelog.info("Finding rpubl:andrar in dct:title")
            orig = re.search("([A-ZÅÄÖ-]+FS \d{4}:\d+)", props["dct:title"]).group(0)
            (publication, year, ordinal) = re.split("[ :]", orig)
            origuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                self.rpubl_uri_transform(publication), year, ordinal,
            )
            props["rpubl:andrar"] = URIRef(origuri)
            if "rpubl:omtryckAv" in props:
                props["rpubl:omtryckAv"] = URIRef(origuri)
        if (re.search("^(Föreskrifter|[\w ]+s föreskrifter) om upphävande av",
                      props["dct:title"], re.UNICODE) and
                not "rpubl:upphaver" in props):
            tracelog.info("Finding rpubl:upphaver in dct:title")
            props["rpubl:upphaver"] = six.text_type(props["dct:title"])  # cleaned below

    tracelog.info("Cleaning date properties")
    for prop in ("rpubl:utkomFranTryck", "rpubl:beslutsdatum", "rpubl:ikrafttradandedatum"):
        if prop in props:
            if props[prop] == "denna dag" and prop == "rpubl:ikrafttradandedatum":
                props[prop] = props["rpubl:beslutsdatum"]
            elif props[prop] == "utkom från trycket" and prop == "rpubl:ikrafttradandedatum":
                props[prop] = props["rpubl:utkomFranTryck"]
            else:
                props[prop] = Literal(self.parse_swedish_date(props[prop].lower()))

    tracelog.info("Cleaning rpubl:genomforDirektiv")
    if "rpubl:genomforDirektiv" in props:
        props["rpubl:genomforDirektiv"] = URIRef(
            "http://rinfo.lagrummet.se/ext/eur-lex/%s" %
            props["rpubl:genomforDirektiv"])

    tracelog.info("Cleaning rpubl:bemyndigande")
    has_bemyndiganden = False
    if "rpubl:bemyndigande" in props:
        # SimpleParse can't handle unicode endash sign, transform
        # into regular ascii hyphen
        props["rpubl:bemyndigande"] = props["rpubl:bemyndigande"].replace("\u2013", "-")
        parser = LegalRef(LegalRef.LAGRUM)
        result = parser.parse(props["rpubl:bemyndigande"])
        bemyndigande_uris = [x.uri for x in result if hasattr(x, "uri")]
        # some of these uris need to be filtered away due to
        # over-matching by parser.parse
        filtered_bemyndigande_uris = []
        for bem_uri in bemyndigande_uris:
            keep = True
            for compare in bemyndigande_uris:
                if len(compare) > len(bem_uri) and compare.startswith(bem_uri):
                    keep = False
            if keep:
                filtered_bemyndigande_uris.append(bem_uri)
        for bem_uri in filtered_bemyndigande_uris:
            g.add((URIRef(uri), self.ns["rpubl"]["bemyndigande"], URIRef(bem_uri)))
            has_bemyndiganden = True
        del props["rpubl:bemyndigande"]

    tracelog.info("Cleaning rpubl:upphaver")
    if "rpubl:upphaver" in props:
        for upph in re.findall("([A-ZÅÄÖ-]+FS \d{4}:\d+)",
                               util.normalize_space(props["rpubl:upphaver"])):
            (publication, year, ordinal) = re.split("[ :]", upph)
            upphuri = "http://rinfo.lagrummet.se/publ/%s/%s:%s" % (
                publication.lower(), year, ordinal)
            g.add((URIRef(uri), self.ns["rpubl"]["upphaver"], URIRef(upphuri)))
        del props["rpubl:upphaver"]

    tracelog.info("Deciding rdf:type")
    if ("dct:title" in props and "allmänna råd" in props["dct:title"] and
            not "föreskrifter" in props["dct:title"]):
        props["rdf:type"] = self.ns["rpubl"]["AllmannaRad"]
    else:
        props["rdf:type"] = self.ns["rpubl"]["Myndighetsforeskrift"]

    # 3.5: Check to see that we have all properties that we expect
    # (should maybe be done elsewhere later?)
    tracelog.info("Checking required properties")
    for prop in ("dct:identifier", "dct:title", "rpubl:arsutgava",
                 "dct:publisher", "rpubl:beslutadAv", "rpubl:beslutsdatum",
                 "rpubl:forfattningssamling", "rpubl:ikrafttradandedatum",
                 "rpubl:lopnummer", "rpubl:utkomFranTryck"):
        if not prop in props:
            self.log.warning("%s: Failed to find %s" % (basefile, prop))
    tracelog.info("Checking rpubl:bemyndigande")
    if props["rdf:type"] == self.ns["rpubl"]["Myndighetsforeskrift"]:
        if not has_bemyndiganden:
            self.log.warning("%s: Failed to find rpubl:bemyndigande" % (basefile))

    # 4: Add the cleaned data to a RDFLib Graph
    # (maybe we should do that as early as possible?)
    tracelog.info("Adding items to rdflib.Graph")
    for (prop, value) in list(props.items()):
        (prefix, term) = prop.split(":", 1)
        p = self.ns[prefix][term]
        if not (isinstance(value, URIRef) or isinstance(value, Literal)):
            self.log.warning("%s: %s is a %s, not a URIRef or Literal" %
                             (basefile, prop, type(value)))
        g.add((URIRef(uri), p, value))

    # 5: Create data for the body, removing various control characters
    # TODO: Use pdftohtml to create a nice viewable HTML
    # version instead of this plaintext stuff
    reader.seek(0)
    body = []
    # A fairly involved way of filtering out all control
    # characters from a string
    import unicodedata
    if six.PY3:
        all_chars = (chr(i) for i in range(0x10000))
    else:
        all_chars = (unichr(i) for i in range(0x10000))
    control_chars = "".join(c for c in all_chars
                            if unicodedata.category(c) == "Cc")
    # tab and newline are technically Control characters in
    # unicode, but we want to keep them.
    control_chars = control_chars.replace("\t", "").replace("\n", "")
    control_char_re = re.compile("[%s]" % re.escape(control_chars))
    for page in reader.getiterator(reader.readpage):
        text = xml_escape(control_char_re.sub("", page))
        body.append("<pre>%s</pre>\n\n" % text)

    # 5: Done!
    # doc.body = body
    doc.lang = "sv"
    doc.uri = uri
    return doc
def parse_from_soup(self, soup, doc):
    for block in soup.findAll(['div', 'p']):
        t = util.normalize_space(''.join(block.findAll(text=True)))
        block.extract()  # to avoid seeing it again
        if t:
            doc.body.append(Paragraph([t]))
def extract_metadata_header(self, reader, basefile):
    re_sfs = re.compile(r'(\d{4}:\d+)\s*$').search
    d = {}
    identifier = None  # set once the 'SFS nr' line has been seen
    for line in reader:
        if ":" in line:
            (key, val) = [util.normalize_space(x)
                          for x in line.split(":", 1)]
            # Simple string literals
            if key == 'Rubrik':
                d["dcterms:title"] = val
            elif key == 'Övrigt':
                d["rdfs:comment"] = val
            elif key == 'SFS nr':
                identifier = "SFS " + val
                # delay actual writing to graph, since we may need to
                # amend this
            # date literals
            elif key == 'Utfärdad':
                d["rpubl:utfardandedatum"] = val[:10]
            elif key == 'Tidsbegränsad':
                # FIXME: Should be done by lagen.nu.SFS
                d["rinfoex:tidsbegransad"] = val[:10]
            elif key == 'Upphävd':
                dat = datetime.strptime(val[:10], '%Y-%m-%d')
                d["rpubl:upphavandedatum"] = val[:10]
                if not self.config.keepexpired and dat < datetime.today():
                    raise UpphavdForfattning(
                        "%s is an expired SFS" % basefile,
                        dummyfile=self.store.parsed_path(basefile))
            # urirefs
            elif key == 'Departement/ myndighet':
                # this is only needed because of SFS 1942:724, which
                # has "Försvarsdepartementet, Socialdepartementet"...
                if "departementet, " in val:
                    val = val.split(", ")[0]
                d["dcterms:creator"] = val
            elif (key == 'Ändring införd' and re_sfs(val)):
                uppdaterad = re_sfs(val).group(1)
                # not sure we need to add this, since parse_metadata
                # catches the same
                d["rpubl:konsolideringsunderlag"] = [
                    URIRef(self.canonical_uri(uppdaterad))
                ]
                if identifier and identifier != "SFS " + uppdaterad:
                    identifier += " i lydelse enligt SFS " + uppdaterad
                d["dcterms:issued"] = uppdaterad
            elif (key == 'Omtryck' and re_sfs(val)):
                d["rinfoex:omtryck"] = self.canonical_uri(re_sfs(val).group(1))
            elif (key == 'Författningen har upphävts genom' and re_sfs(val)):
                s = re_sfs(val).group(1)
                d["rinfoex:upphavdAv"] = self.canonical_uri(s)
            else:
                self.log.warning('%s: Obekant nyckel [\'%s\']' % (basefile, key))
    d["dcterms:identifier"] = identifier
    # FIXME: This is a misuse of the dcterms:issued prop in order
    # to mint the correct URI. We need to remove this somehow afterwards.
    if "dcterms:issued" not in d:
        d["dcterms:issued"] = basefile
    if "dcterms:title" not in d:
        self.log.warning("%s: Rubrik saknas" % basefile)
    return d
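# Illustrative sketch (not part of ferenda): the kind of "key: value" header
# this reads, and the same split/normalize step on an invented fragment.
header = """SFS nr: 1998:204
Rubrik: Personuppgiftslag (1998:204)
Utfärdad: 1998-04-29
Ändring införd: t.o.m. SFS 2010:1969"""

for line in header.split("\n"):
    if ":" in line:
        key, val = [" ".join(x.split()) for x in line.split(":", 1)]
        print(key, "->", val)
# SFS nr -> 1998:204
# Rubrik -> Personuppgiftslag (1998:204)
# Utfärdad -> 1998-04-29
# Ändring införd -> t.o.m. SFS 2010:1969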
def parse(self, doc): # some very simple heuristic rules for determining # what an individual paragraph is def is_heading(p): # If it's on a single line and it isn't indented with spaces # it's probably a heading. if p.count("\n") == 0 and not p.startswith(" "): return True def is_pagebreak(p): # if it contains a form feed character, it represents a page break return "\f" in p # Parsing a document consists mainly of two parts: # 1: First we parse the body of text and store it in doc.body from ferenda.elements import Body, Preformatted, Title, Heading from ferenda import Describer reader = TextReader(self.store.downloaded_path(doc.basefile)) # First paragraph of an RFC is always a header block header = reader.readparagraph() # Preformatted is a ferenda.elements class representing a # block of preformatted text. It is derived from the built-in # list type, and must thus be initialized with an iterable, in # this case a single-element list of strings. (Note: if you # try to initialize it with a string, because strings are # iterables as well, you'll end up with a list where each # character in the string is an element, which is not what you # want). preheader = Preformatted([header]) # Doc.body is a ferenda.elements.Body class, which is also # is derived from list, so it has (amongst others) the append # method. We build our document by adding to this root # element. doc.body.append(preheader) # Second paragraph is always the title, and we don't include # this in the body of the document, since we'll add it to the # medata -- once is enough title = reader.readparagraph() # After that, just iterate over the document and guess what # everything is. TextReader.getiterator is useful for # iterating through a text in other chunks than single lines for para in reader.getiterator(reader.readparagraph): if is_heading(para): # Heading is yet another of these ferenda.elements # classes. doc.body.append(Heading([para])) elif is_pagebreak(para): # Just drop these remnants of a page-and-paper-based past pass else: # If we don't know that it's something else, it's a # preformatted section (the safest bet for RFC text). doc.body.append(Preformatted([para])) # 2: Then we create metadata for the document and store it in # doc.meta (in this case using the convenience # ferenda.Describer class). desc = Describer(doc.meta, doc.uri) # Set the rdf:type of the document desc.rdftype(self.rdf_type) # Set the title we've captured as the dcterms:title of the document and # specify that it is in English desc.value(self.ns['dcterms'].title, util.normalize_space(title), lang="en") # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile) # find and convert the publication date in the header to a datetime # object, and set it as the dcterms:issued date for the document re_date = re.compile( "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})" ).search # This is a context manager that temporarily sets the system # locale to the "C" locale in order to be able to use strptime # with a string on the form "August 2013", even though the # system may use another locale. dt_match = re_date(header) if dt_match: with util.c_locale(): dt = datetime.strptime(re_date(header).group(0), "%B %Y") pubdate = date(dt.year, dt.month, dt.day) # Note that using some python types (cf. 
datetime.date) # results in a datatyped RDF literal, ie in this case # <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date desc.value(self.ns['dcterms'].issued, pubdate) # find any older RFCs that this document updates or obsoletes obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE) updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE) # Find the category of this RFC, store it as dcterms:subject cat_match = re.search("^Category: ([\w ]+?)( |$)", header, re.MULTILINE) if cat_match: desc.value(self.ns['dcterms'].subject, cat_match.group(1)) for predicate, matches in ((self.ns['rfc'].updates, updates), (self.ns['rfc'].obsoletes, obsoletes)): if matches is None: continue # add references between this document and these older rfcs, # using either rfc:updates or rfc:obsoletes for match in matches.group(1).strip().split(", "): uri = self.canonical_uri(match) # Note that this uses our own unofficial # namespace/vocabulary # http://example.org/ontology/rfc/ desc.rel(predicate, uri) # And now we're done. We don't need to return anything as # we've modified the Document object that was passed to # us. The calling code will serialize this modified object to # XHTML and RDF and store it on disk # end parse1 # Now do it again reader.seek(0) reader.readparagraph() reader.readparagraph() doc.body = Body() doc.body.append(preheader) # doc.body.append(Title([util.normalize_space(title)])) # begin parse2 from ferenda.elements import Section, Subsection, Subsubsection # More heuristic rules: Section headers start at the beginning # of a line and are numbered. Subsections and subsubsections # have dotted numbers, optionally with a trailing period, ie # '9.2.' or '11.3.1' def is_section(p): return re.match(r"\d+\.? +[A-Z]", p) def is_subsection(p): return re.match(r"\d+\.\d+\.? +[A-Z]", p) def is_subsubsection(p): return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p) def split_sectionheader(p): # returns a tuple of title, ordinal, identifier ordinal, title = p.split(" ", 1) ordinal = ordinal.strip(".") return title.strip(), ordinal, "RFC %s, section %s" % ( doc.basefile, ordinal) # Use a list as a simple stack to keep track of the nesting # depth of a document. Every time we create a Section, # Subsection or Subsubsection object, we push it onto the # stack (and clear the stack down to the appropriate nesting # depth). Every time we create some other object, we append it # to whatever object is at the top of the stack. As your rules # for representing the nesting of structure become more # complicated, you might want to use the # :class:`~ferenda.FSMParser` class, which lets you define # heuristic rules (recognizers), states and transitions, and # takes care of putting your structure together. 
stack = [doc.body] for para in reader.getiterator(reader.readparagraph): if is_section(para): title, ordinal, identifier = split_sectionheader(para) s = Section(title=title, ordinal=ordinal, identifier=identifier) stack[1:] = [] # clear all but bottom element stack[0].append(s) # add new section to body stack.append(s) # push new section on top of stack elif is_subsection(para): title, ordinal, identifier = split_sectionheader(para) s = Subsection(title=title, ordinal=ordinal, identifier=identifier) stack[2:] = [] # clear all but bottom two elements stack[1].append(s) # add new subsection to current section stack.append(s) elif is_subsubsection(para): title, ordinal, identifier = split_sectionheader(para) s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier) stack[3:] = [] # clear all but bottom three stack[-1].append( s) # add new subsubsection to current subsection stack.append(s) elif is_heading(para): stack[-1].append(Heading([para])) elif is_pagebreak(para): pass else: pre = Preformatted([para]) stack[-1].append(pre) # end parse2 # begin citation1 from pyparsing import Word, CaselessLiteral, nums section_citation = ( CaselessLiteral("section") + Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef") rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef") section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef") # end citation1 # begin citation2 def rfc_uriformatter(parts): uri = "" if 'RFC' in parts: uri += self.canonical_uri(parts['RFC'].lstrip("0")) if 'Sec' in parts: uri += "#S" + parts['Sec'] return uri # end citation2 # begin citation3 from ferenda import CitationParser, URIFormatter citparser = CitationParser(section_rfc_citation, section_citation, rfc_citation) citparser.set_formatter( URIFormatter(("SecRFCRef", rfc_uriformatter), ("SecRef", rfc_uriformatter), ("RFCRef", rfc_uriformatter))) citparser.parse_recursive(doc.body)
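# A quick, standalone way (assuming only pyparsing is installed) to see what
# the citation grammars above actually capture is to scan a sample sentence
# directly, without going through CitationParser/URIFormatter. The sample
# sentence and the cited RFC/section are invented for illustration.
from pyparsing import Word, CaselessLiteral, nums

section_citation = (CaselessLiteral("section") +
                    Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                "]").setResultsName("RFCRef")
section_rfc_citation = (section_citation + "of" +
                        rfc_citation).setResultsName("SecRFCRef")

sample = "The header syntax is defined in section 3.6 of [RFC5322]."
for tokens, start, end in section_rfc_citation.scanString(sample):
    # expected output (roughly): Sec=3.6 RFC=5322
    print("Sec=%s RFC=%s" % (tokens["Sec"], tokens["RFC"]))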
def extract_metadata_register(self, soup, basefile): d = {} rubrik = util.normalize_space(soup.body('table')[2].text) changes = soup.body('table')[3:-2] g = self.make_graph() # used for qname lookup only for table in changes: sfsnr = table.find(text="SFS-nummer:").find_parent( "td").find_next_sibling("td").text.strip() docuri = self.canonical_uri(sfsnr) rowdict = {} parts = sfsnr.split(":") d[docuri] = { "dcterms:publisher": "Regeringskansliet", "rpubl:arsutgava": parts[0], "rpubl:beslutadAv": "Regeringskansliet", "rpubl:forfattningssamling": "SFS", "rpubl:lopnummer": parts[1] } for row in table('tr'): key = row.td.text.strip() if key.endswith(":"): key = key[:-1] # trim ending ":" elif key == '': continue # FIXME: the \xa0 ( ) to space conversion should # maye be part of normalize_space? val = util.normalize_space(row('td')[1].text) if val == "": continue rowdict[key] = val # first change does not contain a "Rubrik" key. Fake it. if 'Rubrik' not in rowdict and rubrik: rowdict['Rubrik'] = rubrik rubrik = None for key, val in rowdict.items(): if key == 'SFS-nummer': (arsutgava, lopnummer) = val.split(":") d[docuri]["dcterms:identifier"] = "SFS " + val d[docuri]["rpubl:arsutgava"] = arsutgava d[docuri]["rpubl:lopnummer"] = lopnummer elif key == 'Ansvarig myndighet': d[docuri]["rpubl:departement"] = val # FIXME: Sanitize this in # sanitize_metadata->sanitize_department, lookup # resource in polish_metadata elif key == 'Rubrik': # Change acts to Balkar never contain the SFS no # of the Balk. if basefile not in val and not val.endswith("balken"): self.log.warning("%s: Base SFS %s not in title %r" % (basefile, basefile, val)) d[docuri]["dcterms:title"] = val d[docuri]["rdf:type"] = self._forfattningstyp(val) elif key == 'Observera': if not self.config.keepexpired: if 'Författningen är upphävd/skall upphävas: ' in val: dateval = datetime.strptime(val[41:51], '%Y-%m-%d') if dateval < datetime.today(): raise UpphavdForfattning( "%s is an expired SFS" % basefile, dummyfile=self.store.parsed_path(basefile)) d[docuri]["rdfs:comment"] = val elif key == 'Ikraft': d[docuri]["rpubl:ikrafttradandedatum"] = val[:10] elif key == 'Omfattning': # First, create rdf statements for every # single modified section we can find for changecat in val.split('; '): if (changecat.startswith('ändr.') or changecat.startswith('ändr ') or changecat.startswith('ändring ')): pred = self.ns['rpubl'].ersatter elif (changecat.startswith('upph.') or changecat.startswith('upp.') or changecat.startswith('utgår')): pred = self.ns['rpubl'].upphaver elif (changecat.startswith('ny') or changecat.startswith('ikrafttr.') or changecat.startswith('ikrafftr.') or changecat.startswith('ikraftr.') or changecat.startswith('ikraftträd.') or changecat.startswith('tillägg')): pred = self.ns['rpubl'].inforsI elif (changecat.startswith('nuvarande') or changecat.startswith('rubr. närmast') or changecat in ('begr. giltighet', 'Omtryck', 'omtryck', 'forts.giltighet', 'forts. giltighet', 'forts. giltighet av vissa best.')): # some of these changecats are renames, eg # "nuvarande 2, 3, 4, 5 §§ betecknas 10, # 11, 12, 13, 14, 15 §§;" or # "rubr. 
närmast efter 1 § sätts närmast # före 10 §" pred = None else: self.log.warning("%s: Okänd omfattningstyp %r" % (basefile, changecat)) pred = None old_currenturl = self.lagrum_parser._currenturl self.lagrum_parser._currenturl = docuri for node in self.lagrum_parser.parse_string( changecat, pred): if hasattr(node, 'predicate'): qname = g.qname(node.predicate) d[docuri][qname] = node.uri self.lagrum_parser._currenturl = old_currenturl # Secondly, preserve the entire text d[docuri]["rpubl:andrar"] = val elif key == 'Förarbeten': for node in self.forarbete_parser.parse_string( val, "rpubl:forarbete"): if hasattr(node, 'uri'): if "rpubl:forarbete" not in d[docuri]: d[docuri]["rpubl:forarbete"] = [] d[docuri]["rpubl:forarbete"].append(node.uri) d[node.uri] = {"dcterms:identifier": str(node)} elif key == 'CELEX-nr': for celex in re.findall('3\d{2,4}[LR]\d{4}', val): b = BNode() cg = Graph() cg.add((b, RPUBL.celexNummer, Literal(celex))) celexuri = self.minter.space.coin_uri(cg.resource(b)) if "rpubl:genomforDirektiv" not in d[docuri]: d[docuri]["rpubl:genomforDirektiv"] = [] d[docuri]["rpubl:genomforDirektiv"].append(celexuri) d[celexuri] = {"rpubl:celexNummer": celex} elif key == 'Tidsbegränsad': d[docuri]["rinfoex:tidsbegransad"] = val[:10] expdate = datetime.strptime(val[:10], '%Y-%m-%d') if expdate < datetime.today(): if not self.config.keepexpired: raise UpphavdForfattning( "%s is expired (time-limited) SFS" % basefile, dummyfile=self.store.parsed_path(basefile)) else: self.log.warning('%s: Obekant nyckel [\'%s\']' % (basefile, key)) utfardandedatum = self._find_utfardandedatum(sfsnr) if utfardandedatum: d[docuri]["rpubl:utfardandedatum"] = utfardandedatum return d
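# A small illustration (not ferenda code) of how the "Omfattning" field is
# classified above: each ';'-separated change category is mapped to a change
# predicate based on its prefix. The sample value is invented, and plain
# strings stand in for the rpubl URIRefs used in the real method.
def sketch_classify_omfattning(val):
    classified = []
    for changecat in val.split("; "):
        if changecat.startswith(("ändr.", "ändr ", "ändring ")):
            pred = "rpubl:ersatter"
        elif changecat.startswith(("upph.", "upp.", "utgår")):
            pred = "rpubl:upphaver"
        elif changecat.startswith(("ny", "ikrafttr.", "tillägg")):
            pred = "rpubl:inforsI"
        else:
            pred = None  # renames, prolonged validity etc. get no predicate
        classified.append((changecat, pred))
    return classified

# sketch_classify_omfattning("upph. 6 §; ändr. 1, 2 §§; ny 6 a §")
# -> [('upph. 6 §', 'rpubl:upphaver'),
#     ('ändr. 1, 2 §§', 'rpubl:ersatter'),
#     ('ny 6 a §', 'rpubl:inforsI')]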
def find_definitions(self, element, find_definitions): if not isinstance(element, CompoundElement): return None find_definitions_recursive = find_definitions # Hitta begreppsdefinitioner if isinstance(element, Paragraf): # kolla om första stycket innehåller en text som # antyder att definitioner följer # self.log.debug("Testing %r against some regexes" % element[0][0]) if self.re_definitions(element[0][0]): find_definitions = "normal" if (self.re_brottsdef(element[0][0]) or self.re_brottsdef_alt(element[0][0])): find_definitions = "brottsrubricering" if self.re_parantesdef(element[0][0]): find_definitions = "parantes" if self.re_loptextdef(element[0][0]): find_definitions = "loptext" for p in element: if isinstance(p, Stycke): # do an extra check in case "I denna paragraf # avses med" occurs in the 2nd or later # paragrapgh of a section if self.re_definitions(p[0]): find_definitions = "normal" find_definitions_recursive = find_definitions # Hitta lagrumshänvisningar + definitioner if isinstance(element, (Stycke, Listelement, Tabellrad)): nodes = [] term = None # self.log.debug("handling text %s, find_definitions %s" % (element[0],find_definitions)) if find_definitions: # For Tabellrad, this is a Tabellcell, not a string, # but we fix that later elementtext = element[0] termdelimiter = ":" if isinstance(element, Tabellrad): # only the first cell can be a definition, and # only if it's not the text "Beteckning". So for # the reminder of this func, we switch context to # not the element itself but rather the first # cell. element = elementtext elementtext = element[0] if elementtext != "Beteckning": term = elementtext self.log.debug('"%s" är nog en definition (1)' % term) elif isinstance(element, Stycke): # Case 1: "antisladdsystem: ett tekniskt stödsystem" # Sometimes, : is not the delimiter between # the term and the definition, but even in # those cases, : might figure in the # definition itself, usually as part of the # SFS number. Do some hairy heuristics to find # out what delimiter to use if find_definitions == "normal": if not self.re_definitions(elementtext): if " - " in elementtext: if (":" in elementtext and (elementtext.index(":") < elementtext.index(" - "))): termdelimiter = ":" else: termdelimiter = " - " m = self.re_SearchSfsId(elementtext) if termdelimiter == ":" and m and m.start( ) < elementtext.index(":"): termdelimiter = " " if termdelimiter in elementtext: term = elementtext.split(termdelimiter)[0] self.log.debug( '"%s" är nog en definition (2.1)' % term) # case 2: "Den som berövar annan livet, döms # för mord till fängelse" m = self.re_brottsdef(elementtext) if m: term = m.group(2) self.log.debug('"%s" är nog en definition (2.2)' % term) # case 3: "För miljöbrott döms till böter" m = self.re_brottsdef_alt(elementtext) if m: term = m.group(1) self.log.debug('"%s" är nog en definition (2.3)' % term) # case 4: "Inteckning får på ansökan av # fastighetsägaren dödas (dödning)." 
m = self.re_parantesdef(elementtext) if m: term = m.group(1) # print("%s: %s" % (basefile, elementtext)) self.log.debug('"%s" är nog en definition (2.4)' % term) # case 5: "Med detaljhandel avses i denna lag # försäljning av läkemedel" m = self.re_loptextdef(elementtext) if m: term = m.group(1) self.log.debug('"%s" är nog en definition (2.5)' % term) elif isinstance(element, Listelement): for rx in (self.re_Bullet, self.re_DottedNumber, self.re_Bokstavslista): elementtext = rx.sub('', elementtext) term = elementtext.split(termdelimiter)[0] self.log.debug('"%s" är nog en definition (3)' % term) # Longest legitimate term found "Valutaväxling, # betalningsöverföring och annan finansiell # verksamhet" if term and len(term) < 68: term = util.normalize_space(term) termnode = LinkSubject(term, uri=self._term_to_subject(term), predicate="dcterms:subject") find_definitions_recursive = False else: term = None if term: idx = None for p in element: if isinstance(p, str) and term in p: (head, tail) = p.split(term, 1) nodes = (head, termnode, tail) idx = element.index(p) if not idx is None: element[idx:idx + 1] = nodes return find_definitions_recursive
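# The term/definition delimiter heuristic above is the trickiest part of
# find_definitions, so here is an isolated sketch of it (using a simplified
# stand-in for self.re_SearchSfsId and invented sample strings): ':' is
# preferred over ' - ' when it comes first, but if the first ':' in the text
# is the one inside an SFS number, a plain space is used as delimiter instead.
import re

re_sfsid = re.compile(r"\(\d{4}:\d+\)").search  # simplified stand-in

def sketch_term_delimiter(text):
    delimiter = ":"
    if " - " in text:
        if ":" in text and text.index(":") < text.index(" - "):
            delimiter = ":"
        else:
            delimiter = " - "
    if delimiter == ":" and ":" in text:
        m = re_sfsid(text)
        if m and m.start() < text.index(":"):
            delimiter = " "
    return text.split(delimiter)[0]

# sketch_term_delimiter("antisladdsystem: ett tekniskt stödsystem")
#   -> 'antisladdsystem'
# sketch_term_delimiter("basdirektivet - Europaparlamentets och rådets direktiv 2002/21/EG")
#   -> 'basdirektivet'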
def test_ocr(self): try: if not os.environ.get("FERENDA_TEST_TESSERACT"): raise errors.ExternalCommandError reader = PDFReader(filename="test/files/pdfreader/scanned.pdf", workdir=self.datadir, ocr_lang="swe") except errors.ExternalCommandError: self._copy_sample() reader = PDFReader(filename="test/files/pdfreader/scanned.pdf", workdir=self.datadir, ocr_lang="swe") # assert that a hOCR file has been created self.assertTrue(os.path.exists(self.datadir + os.sep + "scanned.hocr.html")) # assert that we have two pages self.assertEqual(2, len(reader)) # assert that first element in the first textbox in the first # page corresponds to the first bbox, scaled by the # pixel/point scaling factor. self.assertEqual("Regeringens ", str(reader[0][0][0])) self.assertEqual(47, reader[0][0][0].top) self.assertEqual(38, reader[0][0][0].left) self.assertEqual(21, reader[0][0][0].height) self.assertEqual(118, reader[0][0][0].width) # assert that the <s>third</s>fifth textbox (which has mostly # normal text) is rendered correctly (note that we have a # couple of OCR errors). # self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i bifogade utdrag ur regeringsprotokollet den 31 oktober l99l.", util.normalize_space(str(reader[0][3]))) self.assertEqual("Regeringen föreslår riksdagen att anta de förslag som har tagits. upp i", util.normalize_space(str(reader[0][5])))