def metadata_from_basefile(self, doc):
    """Describe ``doc.uri`` in ``doc.meta`` using only the basefile.

    The basefile is recorded as the CELEX identifier of the document,
    and the sixth character of the basefile ("R", "L" or "C") selects
    the rdf:type.

    :param doc: document object providing ``meta``, ``uri`` and
                ``basefile``
    :returns: ``doc.meta``
    :raises IndexError: if the basefile has fewer than six characters
    :raises KeyError: if the sixth character is not "R", "L" or "C"
    """
    describer = Describer(doc.meta, doc.uri)
    describer.rel(CDM.resource_legal_id_celex, Literal(doc.basefile))
    # the sixth character of the basefile decides the document type
    type_by_letter = {
        "R": CDM.regulation,
        "L": CDM.directive,
        "C": CDM.decision_cjeu,
    }
    type_letter = doc.basefile[5]
    describer.rel(RDF.type, type_by_letter[type_letter])
    return doc.meta
def parse_metadata_from_soup(self, soup, doc):
    """Extract metadata from a parsed HTML page into ``doc.meta``.

    Sets ``doc.lang`` and describes ``doc.uri`` with rdf:type,
    prov:wasGeneratedBy, dcterms:title, dcterms:identifier,
    dcterms:abstract (when an element with CSS class "abstract" has
    text), dcterms:issued (when a "W3C Recommendation" h2/h3 heading
    carries a parseable date), dcterms:editor and dcterms:publisher.

    :param soup: BeautifulSoup tree of the document
    :param doc: document object providing ``meta``, ``uri`` and
                ``basefile``
    :raises: whatever ``Describer.getvalue`` raises when dcterms:title
             or dcterms:issued is missing (a KeyError, according to the
             original author's note)
    """
    doc.lang = self.lang
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    dcterms = self.ns['dcterms']

    # dcterms:title
    d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
    d.value(dcterms.identifier, doc.basefile)

    # dcterms:abstract
    # NB: the keyword for matching on CSS class is ``class_``; the
    # previous ``_class`` filtered on a literal "_class" attribute and
    # therefore never found anything.
    abstract = soup.find(class_="abstract")
    if abstract:
        # .string is None when the element has mixed content, so use
        # the whitespace-normalised text and skip empty abstracts
        abstract_text = " ".join(abstract.get_text().split())
        if abstract_text:
            d.value(dcterms['abstract'], abstract_text, lang=doc.lang)

    # dcterms:issued
    datehdr = soup.find(lambda x: x.name in ('h2', 'h3') and
                        re.search(r"W3C\s+Recommendation,?\s+", x.text))
    if datehdr:
        datestr = " ".join(datehdr.text.split())
        m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
        if not m:
            self.log.warning("%s: Couldn't parse datestr %s" %
                             (doc.basefile, datestr))
        else:
            datestr = " ".join(m.groups())
            date = None
            # full month name first ("17 December 1996"), then the
            # abbreviated form ("17 Dec 1996")
            for fmt in ("%d %B %Y", "%d %b %Y"):
                try:
                    date = util.strptime(datestr, fmt).date()
                except ValueError:
                    continue
                break
            else:
                self.log.warning("%s: Could not parse datestr %s" %
                                 (doc.basefile, datestr))
            if date:
                d.value(dcterms.issued, date)

    # dcterms:editor
    editors = soup.find("dt", text=re.compile("Editors?:"))
    if editors:
        for editor in editors.find_next_siblings("dd"):
            # drop fragments containing "@" (e-mail addresses) and keep
            # only what precedes the first ", "
            editor_string = " ".join(x for x in editor.stripped_strings
                                     if "@" not in x)
            editor_name = editor_string.split(", ")[0]
            d.value(dcterms.editor, editor_name)

    # dcterms:publisher
    d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

    # assure we got exactly one of each of the required properties
    for required in (dcterms.title, dcterms.issued):
        d.getvalue(required)  # throws KeyError if not found (or more than one)
def make_meta(self, chunk, meta, uri, basefile):
    """Describe ``uri`` in ``meta`` from the header lines of ``chunk``.

    Every non-empty header value whose key is known is converted and
    added to the graph. Unknown keys and values that fail conversion
    are logged as errors and skipped. Afterwards the publisher and an
    owl:sameAs link are added and ``self.infer_triples`` is called.

    :param chunk: text handed to ``self.header_lines``
    :param meta: graph that the Describer writes to
    :param uri: URI of the document being described
    :param basefile: passed on to ``self.infer_triples``
    """
    d = Describer(meta, uri)
    dct = self.ns['dct']
    prov = self.ns['prov']
    owl = self.ns['owl']
    rpubl = RPUBL
    d.rdftype(self.rdf_type)
    d.value(prov.wasGeneratedBy, self.qualified_class_name())

    # One row per recognised header key:
    #   key -> (predicate, converter, adder)
    # * predicate is the RDFLib term, e.g. "Rubrik" -> dct:title
    # * converter turns the plain text into the appropriate value, e.g.
    #   - "Utredning av foo" => Literal("Utredning av foo",lang="sv")
    #   - "1987-02-19" => datetime(1987,2,19)
    #   - "Arbetsdepartementet" => URIRef("http://lagen.nu/terms/arbdep")
    # * adder is the Describer method used to add the value to the
    #   graph, i.e. .value for Literals and .rel for URIRefs
    header_fields = {
        'Dir nr': (dct.identifier,
                   self.sanitize_identifier,
                   d.value),
        'Departement': (rpubl.departement,
                        functools.partial(self.lookup_resource, warn=False),
                        d.rel),
        'Beslut vid regeringssammanträde': (rpubl.beslutsdatum,
                                            self.parse_iso_date,
                                            d.value),
        'Rubrik': (dct.title,
                   self.sanitize_rubrik,
                   d.value),
        'Senast ändrad': (dct.changed,
                          self.parse_iso_date,
                          d.value),
    }

    # header_lines wraps a TextReader in an iterator that parses
    # "key:value\n" lines with support for line continuation, eg
    # "long\nkey:long\nvalue\n"
    for key, val in self.header_lines(chunk):
        if not val:
            continue
        try:
            predicate, convert, add = header_fields[key]
            add(predicate, convert(val))
        except (KeyError, ValueError):
            self.log.error(
                "Couldn't munge value '%s' into a proper object for predicate '%s'"
                % (val, key))

    d.rel(dct.publisher, self.lookup_resource("Regeringskansliet"))
    d.rel(owl.sameAs, self.sameas_uri(uri))
    self.infer_triples(d, basefile)
def polish_metadata(self, head, basefile, infer_nodes=True):
    """Extend the superclass' polished metadata with owl:sameAs links.

    Calls the superclass implementation, optionally records a patch
    description, and then links both the referat and the court decision
    it refers to with URIs minted by
    ``self.sameas_minter.space.coin_uri``.

    :param head: dict of raw metadata; ``_localid``, ``Avgörandedatum``
                 and ``Domstol`` are read here, as is the optional
                 ``rinfoex:patchdescription``
    :param basefile: passed on to the superclass implementation
    :param infer_nodes: not used in this method body
    :returns: the resource returned by the superclass, with the extra
              triples added to its graph
    """
    mint = self.sameas_minter.space.coin_uri
    resource = super(DV, self).polish_metadata(head, basefile)
    referat_uri = resource.identifier

    patch_key = 'rinfoex:patchdescription'
    if patch_key in head:
        resource.add(RINFOEX.patchdescription,
                     Literal(head[patch_key], lang="sv"))

    referat_alias = mint(resource)
    resource.graph.add(
        (URIRef(referat_uri), OWL.sameAs, URIRef(referat_alias)))

    # NB: In theory, we have all the data we need to generate a
    # canonical URI for the dom. In practice, this data does not meet
    # the requirements of our URISpace templates in certain cases (all
    # MD verdicts use rpubl:domsnummer instead of rpubl:malnummer,
    # which is what the template expects). The superclass' definition
    # of polish_metadata gets around this by creating a minimal graph
    # from the plain dict in head and feeding that to coin_uri. So we
    # do the same here, instead of the very simple:
    #
    #    domuri_sameas = coin_uri(resource.value(RPUBL.referatAvDomstolsavgorande))
    #
    # (also, this version handles the uncommon but valid case where
    # one referat concerns multiple dom:s)
    verdict_uri = resource.value(RPUBL.referatAvDomstolsavgorande).identifier
    for malnummer in head['_localid']:
        node = BNode()
        scratch = Graph()
        scratch.bind("rpubl", RPUBL)
        scratch.bind("dcterms", DCTERMS)
        verdict = Describer(scratch, node)
        verdict.rdftype(RPUBL.VagledandeDomstolsavgorande)
        verdict.value(RPUBL.malnummer, malnummer)
        verdict.value(RPUBL.avgorandedatum, head['Avgörandedatum'])
        verdict.rel(DCTERMS.publisher,
                    self.lookup_resource(head["Domstol"]))
        verdict_alias = mint(verdict.graph.resource(node))
        resource.graph.add(
            (URIRef(verdict_uri), OWL.sameAs, URIRef(verdict_alias)))
    return resource
def parse(self, doc):
    """Parse the downloaded plain-text RFC identified by ``doc.basefile``.

    The header block (first paragraph) and the title (second paragraph)
    are used to fill ``doc.meta`` with rdf:type, dct:title,
    dct:identifier, dct:issued, dct:subject and rfc:updates /
    rfc:obsoletes relations. ``doc.body`` is built twice: first as a
    flat list of headings and preformatted blocks (tutorial step
    "parse1"), then replaced by a nested Section / Subsection /
    Subsubsection structure (step "parse2") in which citations are
    finally marked up.

    Nothing is returned; ``doc`` is modified in place.

    :param doc: document object providing ``basefile``, ``uri``,
                ``meta`` and ``body``
    """
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        return "\n" not in p and not p.startswith(" ")

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want).
    preheader = Preformatted([header])
    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)

    # Set the title we've captured as the dct:title of the document and
    # specify that it is in English
    desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

    # Construct the dct:identifier (eg "RFC 6991") for this document
    # from the basefile
    desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)

    # find and convert the publication date in the header to a datetime
    # object, and set it as the dct:issued date for the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
    ).search
    dt_match = re_date(header)
    if dt_match:
        # util.c_locale is a context manager that temporarily sets the
        # system locale to the "C" locale in order to be able to use
        # strptime with a string on the form "August 2013", even
        # though the system may use another locale.
        with util.c_locale():
            dt = datetime.strptime(dt_match.group(0), "%B %Y")
        pubdate = dt.date()
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dct'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dct:subject
    cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dct'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk

    # end parse1
    # Now do it again: rewind, skip header and title, and replace the
    # flat body built above with a nested one
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)

    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []  # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)  # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[2:] = []  # clear all but bottom two elements
            # After truncation the top of the stack is the current
            # section, or the body itself if a subsection shows up
            # before any section (stack[1] would raise IndexError then)
            stack[-1].append(s)
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[3:] = []  # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
def polish_metadata(self, head, doc):
    """Convert the plain metadata in ``head`` to RDF in ``doc.meta``.

    Two resources are described: the referat (case report) and the dom
    (the court decision it reports). URIs for both are minted from
    ``head``; each recognised label in ``head`` is then converted to
    one or more triples on the appropriate resource.

    :param head: dict of label -> value; ``Domstol``, ``Målnummer`` and
                 ``Avgörandedatum`` are required. ``Referat`` is
                 derived from ``doc.basefile`` when missing. ``Lagrum``
                 and ``Rättsfall`` are iterated item by item, so they
                 should be lists of strings.
    :param doc: document object providing ``meta`` and ``basefile``
    :returns: the URI minted for the referat
    """
    basefile_regex = re.compile(r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

    def basefile_to_referat(basefile):
        # "MD/2008-12" => "MD 2008:12". Returns None if the basefile
        # doesn't have the expected form.
        templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                 'MD': 'MD %(year)s:%(ordinal)s'}
        m = basefile_regex.match(basefile)
        if m:
            return templ[m.group("type")] % (m.groupdict())

    def ref_to_uri(ref):
        # FIXME: We'd like to retire legalref and replace it with
        # pyparsing grammars.
        nodes = self.rattsfall_parser.parse(ref)
        uri = nodes[0].uri
        return localize_uri(uri)

    def dom_to_uri(domstol, malnr, avg):
        baseuri = self.config.url
        slug = self.slugs[domstol]
        return "%sres/dv/%s/%s/%s" % (baseuri, slug, malnr, avg)

    def localize_uri(uri):
        # Rewrites rinfo URIs for case law and statutes to local ones.
        # Any other URI yields None.
        if "publ/rattsfall" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                               self.config.url + "res/dv")
        elif "publ/sfs/" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                               self.config.url + "res/sfs")

    def split_nja(value):
        # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
        return [x[:-1] for x in value.split("(")]

    def sokord_uri(value):
        return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

    # 0. create Referat key if not present
    if "Referat" not in head:
        # For some courts (MD, AD, MOD?, MIG?) this is possible
        head["Referat"] = basefile_to_referat(doc.basefile)

    # 1. mint uris and create the two Describers we'll use
    refuri = ref_to_uri(head["Referat"])
    refdesc = Describer(doc.meta, refuri)
    domuri = dom_to_uri(head["Domstol"],
                        head["Målnummer"],
                        head["Avgörandedatum"])
    domdesc = Describer(doc.meta, domuri)

    # 2. convert all strings in head to proper RDF
    for label, value in head.items():
        if label == "Rubrik":
            value = util.normalize_space(value)
            refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
            domdesc.value(self.ns['dct'].title, value, lang="sv")
        elif label == "Domstol":
            domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
        # The next four are plain identifiers/names, not URIs, so they
        # are added as literals with .value (.rel is for URI references)
        elif label == "Målnummer":
            domdesc.value(self.ns['rpubl'].malnummer, value)
        elif label == "Domsnummer":
            domdesc.value(self.ns['rpubl'].domsnummer, value)
        elif label == "Diarienummer":
            domdesc.value(self.ns['rpubl'].diarienummer, value)
        elif label == "Avdelning":
            domdesc.value(self.ns['rpubl'].avdelning, value)
        elif label == "Referat":
            for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                'arsutgava': r'(\d{4})',
                                'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                'sidnummer': r's.? ?(\d+)'}.items():
                m = re.search(regex, value)
                if m:
                    if pred == 'rattsfallspublikation':
                        # "NJA" -> "http://localhost:8000/coll/dv/nja"
                        uri = self.config.url + "coll/dv/" + m.group(1).lower()
                        refdesc.rel(self.ns['rpubl'][pred], uri)
                    else:
                        refdesc.value(self.ns['rpubl'][pred], m.group(1))
            # Only split when the value has the exact form
            # "NJA 2008 s 567 (NJA 2008:86)"; an NJA reference without
            # the parenthesised citation would otherwise fail on
            # unpacking and is used as identifier unchanged instead.
            if value.startswith("NJA") and value.count("(") == 1:
                realvalue, extra = split_nja(value)
                ordinal = extra.split(" ")[1]
                refdesc.value(self.ns['dct'].bibliographicCitation, extra)
                refdesc.rel(self.ns['owl'].sameAs,
                            self.config.url + "res/dv/nja/" + ordinal)
                refdesc.value(self.ns['dct'].identifier, realvalue)
            else:
                refdesc.value(self.ns['dct'].identifier, value)
        elif label == "Avgörandedatum":
            with util.c_locale():
                d = datetime.strptime(value, '%Y-%m-%d')
            domdesc.value(self.ns['rpubl'].avgorandedatum, d)
        elif label == "Lagrum":
            for i in value:  # better be list not string
                for node in self.lagrum_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].lagrum,
                                    localize_uri(node.uri))
        elif label == "Rättsfall":
            for i in value:
                for node in self.rattsfall_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].rattsfall,
                                    localize_uri(node.uri))
        elif label == "Litteratur":
            for i in value.split(";"):
                domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
        elif label == "Sökord":
            for s in self.re_delimSplit(value):
                s = util.normalize_space(s)
                if not s:
                    continue
                # terms longer than 72 chars are not legitimate
                # terms. more likely descriptions. If a term has a - in
                # it, it's probably a separator between a term and a
                # description
                while len(s) >= 72 and " - " in s:
                    h, s = s.split(" - ", 1)
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                if len(s) < 72:
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

    # 3. mint some owl:sameAs URIs
    refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
    domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

    # 4. Add some same-for-everyone properties
    refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
    refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
    domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
    refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)

    # 5. assert that we have everything we need

    # 6. done!
    return refuri
def parse(self, doc):
    """Parse the downloaded plain-text RFC identified by ``doc.basefile``.

    The header block (first paragraph) and the title (second paragraph)
    are used to fill ``doc.meta`` with rdf:type, dcterms:title,
    dcterms:identifier, dcterms:issued, dcterms:subject and
    rfc:updates / rfc:obsoletes relations. ``doc.body`` is built twice:
    first as a flat list of headings and preformatted blocks (tutorial
    step "parse1"), then replaced by a nested Section / Subsection /
    Subsubsection structure (step "parse2") in which citations are
    finally marked up.

    Nothing is returned; ``doc`` is modified in place.

    :param doc: document object providing ``basefile``, ``uri``,
                ``meta`` and ``body``
    """
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        return "\n" not in p and not p.startswith(" ")

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want).
    preheader = Preformatted([header])
    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)

    # Set the title we've captured as the dcterms:title of the document and
    # specify that it is in English
    desc.value(self.ns['dcterms'].title,
               util.normalize_space(title), lang="en")

    # Construct the dcterms:identifier (eg "RFC 6991") for this document
    # from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

    # find and convert the publication date in the header to a datetime
    # object, and set it as the dcterms:issued date for the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
    ).search
    dt_match = re_date(header)
    if dt_match:
        # util.c_locale is a context manager that temporarily sets the
        # system locale to the "C" locale in order to be able to use
        # strptime with a string on the form "August 2013", even
        # though the system may use another locale.
        with util.c_locale():
            dt = datetime.strptime(dt_match.group(0), "%B %Y")
        pubdate = dt.date()
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject
    cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk

    # end parse1
    # Now do it again: rewind, skip header and title, and replace the
    # flat body built above with a nested one
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)

    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (
            doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []  # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)  # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[2:] = []  # clear all but bottom two elements
            # After truncation the top of the stack is the current
            # section, or the body itself if a subsection shows up
            # before any section (stack[1] would raise IndexError then)
            stack[-1].append(s)
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[3:] = []  # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (
        CaselessLiteral("section") +
        Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(
        URIFormatter(("SecRFCRef", rfc_uriformatter),
                     ("SecRef", rfc_uriformatter),
                     ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
def parse_metadata_from_soup(self, soup, doc):
    """Extract metadata from a parsed HTML page into ``doc.meta``.

    Reads title, identifier, the "Utgiven:" / "Avsändare:" entries of
    the definition list, the summary under the "Sammanfattning"
    heading, links to documents in earlier steps of the legislative
    process (rpubl:utgarFran) and the links under the "Relaterat"
    heading (rdfs:seeAlso), then calls ``self.infer_triples``.

    :param soup: BeautifulSoup tree of the page; an element with
                 id="content" containing an h1 and a p.lead is required
    :param doc: document object providing ``meta``, ``uri`` and
                ``basefile``
    :raises KeyError: if ``self.document_type`` is not one of the four
                      known document types
    """
    doc.lang = "sv"
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    sameas = self.sameas_uri(doc.uri)
    if sameas:
        d.rel(self.ns['owl'].sameAs, sameas)

    content = soup.find(id="content")
    title = content.find("h1").string
    d.value(self.ns['dct'].title, title, lang=doc.lang)
    identifier = self.sanitize_identifier(
        content.find("p", "lead").text)  # might need fixing up
    d.value(self.ns['dct'].identifier, identifier)

    definitions = content.find("dl", "definitions")
    if definitions:
        for dt in definitions.find_all("dt"):
            key = dt.get_text(strip=True)
            dd = dt.find_next_sibling("dd")
            if dd is None:
                # a term without a definition carries no value
                continue
            value = dd.get_text(strip=True)
            if key == "Utgiven:":
                try:
                    d.value(self.ns['dct'].published,
                            self.parse_swedish_date(value))
                except ValueError:
                    self.log.warning(
                        "Could not parse %s as swedish date" % value)
            elif key == "Avsändare:":
                if value.endswith("departementet"):
                    d.rel(self.ns['rpubl'].departement,
                          self.lookup_resource(value))
                else:
                    d.rel(self.ns['dct'].publisher,
                          self.lookup_resource(value))

    summary_heading = content.find("h2", text="Sammanfattning")
    if summary_heading:
        sums = summary_heading.find_next_siblings("p")
        # "\n\n" doesn't seem to survive being stuffed in a rdfa
        # content attribute. Replace with simple space.
        summary = " ".join([x.get_text(strip=True) for x in sums])
        d.value(self.ns['dct'].abstract, summary, lang=doc.lang)

    # find related documents
    re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
    # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss,
    # 4=proposition. Assume that relationships between documents
    # are reciprocal (ie if the page for a Kommittedirektiv
    # references a Proposition, the page for that Proposition
    # references the Kommittedirektiv).
    elements = {self.KOMMITTEDIREKTIV: [],
                self.DS: ["legStep1"],
                self.PROPOSITION: ["legStep1", "legStep2"],
                self.SOU: ["legStep1"]}[self.document_type]

    for elementid in elements:
        box = content.find(id=elementid)
        if box is None:
            # the page has no box for this step, so nothing to link
            continue
        for listitem in box.find_all("li"):
            if not listitem.find("span", "info"):
                continue
            infospans = [x.text.strip()
                         for x in listitem.find_all("span", "info")]

            rel_basefile = None
            rel_identifier = None

            # no break: if several spans match, the last one wins
            for infospan in infospans:
                basefile_match = re_basefile.search(infospan)
                if basefile_match:
                    # scrub identifier ("Dir. 2008:50" -> "2008:50" etc)
                    rel_basefile = basefile_match.group()
                    rel_identifier = infospan

            if not rel_basefile:
                self.log.warning(
                    "Couldn't find rel_basefile (elementid #%s) among %r" % (elementid, infospans))
                continue
            if elementid == "legStep1":
                subjUri = self.canonical_uri(
                    rel_basefile, self.KOMMITTEDIREKTIV)
            elif elementid == "legStep2":
                if rel_identifier.startswith("SOU"):
                    subjUri = self.canonical_uri(rel_basefile, self.SOU)
                elif rel_identifier.startswith(("Ds", "DS")):
                    subjUri = self.canonical_uri(rel_basefile, self.DS)
                else:
                    self.log.warning(
                        "Cannot find out what type of document the linked %s is (#%s)" % (rel_identifier, elementid))
                    self.log.warning("Infospans was %r" % infospans)
                    continue
            elif elementid == "legStep3":
                subjUri = self.canonical_uri(
                    rel_basefile, self.PROPOSITION)
            else:
                # unknown step: never reuse a URI from an earlier item
                continue
            d.rel(self.ns['rpubl'].utgarFran, subjUri)

    # find related pages
    related = content.find("h2", text="Relaterat")
    if related:
        for link in related.findParent("div").find_all("a"):
            r = urljoin(
                "http://www.regeringen.se/", link["href"])
            d.rel(RDFS.seeAlso, URIRef(r))
            # with d.rel(RDFS.seeAlso, URIRef(r)):
            #     d.value(RDFS.label, link.get_text(strip=True))

    self.infer_triples(d, doc.basefile)