def parse(self, doc):
    """Parse the downloaded RFC text for ``doc`` into ``doc.body`` and ``doc.meta``.

    The text is read twice. The first pass builds a flat body of headings
    and preformatted paragraphs and records the metadata found in the
    header block (title, identifier, publication date, category and
    updates/obsoletes relations) on ``doc.meta``. The second pass replaces
    ``doc.body`` with a tree of sections, subsections and subsubsections,
    after which a citation parser is run over the new body.

    :param doc: the document object to fill in; ``doc.basefile`` selects
                the downloaded file, ``doc.uri`` is the subject of the
                metadata statements.
    :returns: nothing -- ``doc`` is modified in place.
    """
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        return p.count("\n") == 0 and not p.startswith(" ")

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer

    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()

    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want).
    preheader = Preformatted([header])

    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)

    # Set the title we've captured as the dct:title of the document and
    # specify that it is in English
    desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

    # Construct the dct:identifier (eg "RFC 6991") for this document
    # from the basefile
    desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)

    # find and convert the publication date in the header to a datetime
    # object, and set it as the dct:issued date for the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|September"
        r"|October|November|December) (\d{4})").search
    dt_match = re_date(header)
    if dt_match:
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        with util.c_locale():
            dt = datetime.strptime(dt_match.group(0), "%B %Y")
        pubdate = dt.date()
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        # <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dct'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dct:subject. The
    # category may consist of several words ("Standards Track"), so
    # it ends at a run of two or more spaces (the gap before the
    # right-hand header column) or at the end of the line -- not at
    # the first single space.
    cat_match = re.search(r"^Category: ([\w ]+?)(?: {2,}|$)", header,
                          re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dct'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes. Picking out the
        # digit runs (instead of splitting on ", ") copes with
        # irregular spacing and trailing commas.
        for match in re.findall(r"\d+", matches.group(1)):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk

    # end parse1
    # Now do it again: rewind, skip the header and title paragraphs
    # and rebuild doc.body from scratch, this time as a section tree.
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        identifier = "RFC %s, section %s" % (doc.basefile, ordinal)
        return title.strip(), ordinal, identifier

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]
    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []  # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)  # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal,
                           identifier=identifier)
            stack[2:] = []  # clear all but bottom two elements
            # Add the new subsection to the current section. After the
            # truncation above the top of the stack *is* that section;
            # if no section has been opened yet it is the body itself,
            # so an unexpected subsection no longer raises IndexError.
            stack[-1].append(s)
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal,
                              identifier=identifier)
            stack[3:] = []  # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (CaselessLiteral("section") +
                        Word(nums + ".").setResultsName("Sec")
                        ).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        # Build a URI from whichever named parts the citation matched:
        # the RFC number (leading zeroes dropped) and/or a section
        # fragment.
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
def parse_header(self, header, desc):
    """Extract metadata from the two-column header block of an RFC.

    The left-hand column holds the publishing organization (first line)
    followed by ``Key: value`` headers; the right-hand column holds author
    names, affiliations and the publication date. Everything recognised
    is recorded through ``desc``.

    :param header: the header block as a single newline-separated string
    :param desc: object whose ``value(predicate, literal)`` and
                 ``rel(predicate, uri)`` methods receive the metadata
    :returns: nothing
    """
    # Split each line into a left-hand and a right-hand cell. The two
    # columns are padded apart with a run of spaces, while single spaces
    # occur inside a cell ("Request for Comments: 1234"). Requiring three
    # or more keeps a stray double space inside a value from being
    # mistaken for the column gap.
    column_gap = re.compile(r" {3,}")
    left = []
    right = []
    for raw_line in header.split("\n"):
        cells = column_gap.split(raw_line, maxsplit=1)
        left.append(cells[0].strip())
        if len(cells) > 1:
            cell = cells[1].strip()
            if cell:  # trailing padding only -- nothing in the column
                right.append(cell)

    # first line of lefthand side is publishing organization (?)
    desc.value(self.ns['dct'].publisher, left[0])

    # following lefthand side are key-value headers
    for line in left[1:]:
        if not line:
            continue
        if ": " not in line:
            self.log.warning("Cannot treat %r as a key-value header" % line)
            continue
        # Split on the first separator only; the value may itself
        # contain ": ".
        key, value = (x.strip() for x in line.split(": ", 1))
        if key == "Request for Comments":
            # make sure we only extract the numeric part --
            # normally value should be numeric, but we've seen
            # "RFC 1006", "#154" and there are doubtless other
            # variants
            value = re.sub(r"\D", "", value)
            if value:  # eg RFC 100
                desc.value(self.ns['dct'].identifier, "RFC %s" % value)
        elif key == "Category":
            desc.value(self.ns['dct'].subject, value)
        elif key == "ISSN":
            desc.value(self.ns['dct'].issn, value)
        elif key in ("Updates", "Obsoletes"):
            if key == "Updates":
                pred = self.ns['rfc'].updates
            else:
                pred = self.ns['rfc'].obsoletes
            for valuepart in value.split(", "):
                rfcmatch = re.search(r'\d+', valuepart)
                if rfcmatch:
                    uri = self.canonical_uri(rfcmatch.group(0))
                    desc.rel(pred, uri)
                else:
                    self.log.warning(
                        "Can't pick out RFC number from line %s" % line)
        elif key in ("BCP", "STD", "FYI"):
            # The sub-series headers map onto properties of the same
            # name in the rfc namespace.
            desc.value(getattr(self.ns['rfc'], key), value)
        else:
            self.log.warning("Unknown header key %s (value %s)" %
                             (key, value))

    # For right hand side, any line beginning with a single letter
    # followed by '. ' is probably a name
    for line in right:
        if re.match(r"[A-Z]\. ", line):
            desc.value(self.ns['dct'].creator, line)
            continue

        issued = None
        if re.match(r"\w+ \d{4}$", line):
            # NOTE: this requires english locale!
            try:
                with util.c_locale():
                    issued = datetime.strptime(line, "%B %Y").date()
            except ValueError:
                # "<word> <four digits>" that is not a month and a
                # year; handle it like any other line below.
                issued = None

        if issued is not None:
            desc.value(self.ns['dct'].issued, issued)
        else:
            # company affiliation - include that separate from
            # personal author identity
            desc.value(self.ns['dct'].rightsHolder, line)
def polish_metadata(self, head, doc):
    """Turn the raw header fields of a court decision into RDF statements.

    Two resources are described in ``doc.meta``: the report ("referat"),
    whose URI is derived from ``head["Referat"]``, and the decision
    itself, whose URI is built from the court, case number and decision
    date. Each recognised label in ``head`` is converted to one or more
    statements about one of the two.

    :param head: mapping from header label (e.g. ``"Rubrik"``,
                 ``"Domstol"``, ``"Målnummer"``) to its value. A
                 ``"Referat"`` entry is added, derived from
                 ``doc.basefile``, if it is missing. ``"Domstol"``,
                 ``"Målnummer"`` and ``"Avgörandedatum"`` must be present.
    :param doc: the document whose ``meta`` graph receives the statements
    :returns: the URI minted for the report
    :raises KeyError: if one of the required labels is missing from
                      ``head``, or if the court has no entry in
                      ``self.slugs``
    """
    basefile_regex = re.compile(
        r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

    # Patterns used to pick the individual parts out of a report
    # citation, keyed by the rpubl property each part is stored under.
    referat_patterns = (
        ('rattsfallspublikation', r'([^ ]+)'),
        ('arsutgava', r'(\d{4})'),
        ('lopnummer', r'\d{4}(?:\:| nr )(\d+)'),
        ('sidnummer', r's.? ?(\d+)'),
    )

    def basefile_to_referat(basefile):
        # Build a citation string from a basefile such as "<type>/<year>-<ordinal>".
        # Returns None when the basefile does not have that shape.
        templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                 'MD': 'MD %(year)s:%(ordinal)s'}
        m = basefile_regex.match(basefile)
        if m:
            return templ[m.group("type")] % (m.groupdict())
        return None

    def ref_to_uri(ref):
        # FIXME: We'd like to retire legalref and replace it with
        # pyparsing grammars.
        nodes = self.rattsfall_parser.parse(ref)
        uri = nodes[0].uri
        return localize_uri(uri)

    def dom_to_uri(domstol, malnr, avg):
        slug = self.slugs[domstol]
        return "%sres/dv/%s/%s/%s" % (self.config.url, slug, malnr, avg)

    def localize_uri(uri):
        # Rewrite a rinfo.lagrummet.se URI to the corresponding URI
        # under self.config.url. Returns None for any other URI.
        if "publ/rattsfall" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                               self.config.url + "res/dv")
        elif "publ/sfs/" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                               self.config.url + "res/sfs")
        return None

    def split_nja(value):
        # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
        # The last character of each half (the space before the
        # parenthesis and the closing parenthesis) is dropped.
        return [x[:-1] for x in value.split("(", 1)]

    def sokord_uri(value):
        return self.config.url + "concept/%s" % util.ucfirst(
            value).replace(' ', '_')

    # 0. create Referat key if not present
    if "Referat" not in head:
        # For some courts (MD, AD, MOD?, MIG?) this is possible
        head["Referat"] = basefile_to_referat(doc.basefile)

    # 1. mint uris and create the two Describers we'll use
    refuri = ref_to_uri(head["Referat"])
    refdesc = Describer(doc.meta, refuri)
    domuri = dom_to_uri(head["Domstol"],
                        head["Målnummer"],
                        head["Avgörandedatum"])
    domdesc = Describer(doc.meta, domuri)

    # 2. convert all strings in head to proper RDF
    for label, value in head.items():
        if label == "Rubrik":
            value = util.normalize_space(value)
            refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
            domdesc.value(self.ns['dct'].title, value, lang="sv")
        elif label == "Domstol":
            domdesc.rel(self.ns['dct'].publisher,
                        self.lookup_resource(value))
        # The next four are plain text values (the case number is even
        # interpolated as text into the decision URI above), so they are
        # stored as literals with .value(); .rel() is reserved for
        # objects that are URIs.
        elif label == "Målnummer":
            domdesc.value(self.ns['rpubl'].malnummer, value)
        elif label == "Domsnummer":
            domdesc.value(self.ns['rpubl'].domsnummer, value)
        elif label == "Diarienummer":
            domdesc.value(self.ns['rpubl'].diarienummer, value)
        elif label == "Avdelning":
            domdesc.value(self.ns['rpubl'].avdelning, value)
        elif label == "Referat":
            for pred, regex in referat_patterns:
                m = re.search(regex, value)
                if not m:
                    continue
                if pred == 'rattsfallspublikation':
                    # "NJA" -> "http://localhost:8000/coll/dv/nja"
                    uri = self.config.url + "coll/dv/" + m.group(1).lower()
                    refdesc.rel(self.ns['rpubl'][pred], uri)
                else:
                    refdesc.value(self.ns['rpubl'][pred], m.group(1))

            # An NJA citation may carry an alternative citation in
            # parentheses. Only try to split it off when it is there;
            # otherwise the whole string is the identifier.
            if value.startswith("NJA") and "(" in value:
                realvalue, extra = split_nja(value)
                ordinal = extra.split(" ")[1]
                refdesc.value(self.ns['dct'].bibliographicCitation, extra)
                refdesc.rel(self.ns['owl'].sameAs,
                            self.config.url + "res/dv/nja/" + ordinal)
                refdesc.value(self.ns['dct'].identifier, realvalue)
            else:
                refdesc.value(self.ns['dct'].identifier, value)
        elif label == "Avgörandedatum":
            with util.c_locale():
                d = datetime.strptime(value, '%Y-%m-%d')
            domdesc.value(self.ns['rpubl'].avgorandedatum, d)
        elif label == "Lagrum":
            for i in value:  # better be list not string
                for node in self.lagrum_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].lagrum,
                                    localize_uri(node.uri))
        elif label == "Rättsfall":
            for i in value:
                for node in self.rattsfall_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].rattsfall,
                                    localize_uri(node.uri))
        elif label == "Litteratur":
            for i in value.split(";"):
                domdesc.value(self.ns['dct'].relation,
                              util.normalize_space(i))
        elif label == "Sökord":
            for s in self.re_delimSplit(value):
                s = util.normalize_space(s)
                if not s:
                    continue
                # terms longer than 72 chars are not legitimate
                # terms. more likely descriptions. If a term has a - in
                # it, it's probably a separator between a term and a
                # description
                while len(s) >= 72 and " - " in s:
                    h, s = s.split(" - ", 1)
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                if len(s) < 72:
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

    # 3. mint some owl:sameAs URIs
    refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
    domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

    # 4. Add some same-for-everyone properties
    refdesc.rel(self.ns['dct'].publisher,
                self.lookup_resource('Domstolsverket'))
    refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
    domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
    refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)

    # 5. assert that we have everything we need

    # 6. done!
    return refuri
def parse(self, doc):
    """Parse the downloaded RFC text for ``doc`` into ``doc.body`` and ``doc.meta``.

    The text is read twice. The first pass builds a flat body of headings
    and preformatted paragraphs and records the metadata found in the
    header block (title, identifier, publication date, category and
    updates/obsoletes relations) on ``doc.meta``. The second pass replaces
    ``doc.body`` with a tree of sections, subsections and subsubsections,
    after which a citation parser is run over the new body.

    :param doc: the document object to fill in; ``doc.basefile`` selects
                the downloaded file, ``doc.uri`` is the subject of the
                metadata statements.
    :returns: nothing -- ``doc`` is modified in place.
    """
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        return p.count("\n") == 0 and not p.startswith(" ")

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer

    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()

    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want).
    preheader = Preformatted([header])

    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)

    # Set the title we've captured as the dcterms:title of the document and
    # specify that it is in English
    desc.value(self.ns['dcterms'].title, util.normalize_space(title),
               lang="en")

    # Construct the dcterms:identifier (eg "RFC 6991") for this document
    # from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

    # find and convert the publication date in the header to a datetime
    # object, and set it as the dcterms:issued date for the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|September"
        r"|October|November|December) (\d{4})").search
    dt_match = re_date(header)
    if dt_match:
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        with util.c_locale():
            dt = datetime.strptime(dt_match.group(0), "%B %Y")
        pubdate = dt.date()
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        # <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject. The
    # category may consist of several words ("Standards Track"), so
    # it ends at a run of two or more spaces (the gap before the
    # right-hand header column) or at the end of the line -- not at
    # the first single space.
    cat_match = re.search(r"^Category: ([\w ]+?)(?: {2,}|$)", header,
                          re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes. Picking out the
        # digit runs (instead of splitting on ", ") copes with
        # irregular spacing and trailing commas.
        for match in re.findall(r"\d+", matches.group(1)):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk

    # end parse1
    # Now do it again: rewind, skip the header and title paragraphs
    # and rebuild doc.body from scratch, this time as a section tree.
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        identifier = "RFC %s, section %s" % (doc.basefile, ordinal)
        return title.strip(), ordinal, identifier

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]
    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []  # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)  # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal,
                           identifier=identifier)
            stack[2:] = []  # clear all but bottom two elements
            # Add the new subsection to the current section. After the
            # truncation above the top of the stack *is* that section;
            # if no section has been opened yet it is the body itself,
            # so an unexpected subsection no longer raises IndexError.
            stack[-1].append(s)
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal,
                              identifier=identifier)
            stack[3:] = []  # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (CaselessLiteral("section") +
                        Word(nums + ".").setResultsName("Sec")
                        ).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        # Build a URI from whichever named parts the citation matched:
        # the RFC number (leading zeroes dropped) and/or a section
        # fragment.
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)