def parse(self, doc):
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)
    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                 self.generic_path(doc.basefile, 'downloaded', '.docx'),
                 self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                 self.generic_path(doc.basefile, 'downloaded', '.rtf'))
    wordfile = None
    for f in wordfiles:
        if os.path.exists(f):
            wordfile = f

    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if (wordfile and not os.path.exists(pdffile)):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug(
                "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            (ret, stdout, stderr) = util.runcmd(
                cmdline, require_success=True)
        pdffile = intermediate_pdf

    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" %
                       (doc.basefile, downloaded_path, intermediate_path))
        if not os.path.exists(intermediate_path):
            html = codecs.open(
                downloaded_path, encoding="iso-8859-1").read()
            util.writefile(intermediate_path,
                           util.extract_text(html, '<pre>', '</pre>'),
                           encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
def infer_metadata(self, resource, basefile):
    super(InferTimes, self).infer_metadata(resource, basefile)
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
def parse_from_pdfreader(self, pdfreader, doc):
    doc.body = Body([pdfreader])
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    return doc
def setUp(self):
    super(News, self).setUp()
    self.faceted_data = []
    # create a bunch of DocumentEntry objects and save them
    basetime = datetime(2013, 1, 1, 12, 0)
    for basefile in range(25):
        v = {'id': self.repo.canonical_uri(basefile),
             'title': "Doc #%s" % basefile}
        self.faceted_data.append({'uri': v['id'],
                                  'dcterms_title': v['title'],
                                  'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
        de = DocumentEntry()
        de.orig_created = basetime + timedelta(hours=basefile)
        de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
        de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
        de.published = basetime + timedelta(hours=basefile, minutes=30)
        de.updated = basetime + timedelta(hours=basefile, minutes=40)
        de.orig_url = "http://source.example.org/doc/%s" % basefile
        de.title = v['title']
        de.save(self.repo.store.documententry_path(str(basefile)))

        g = rdflib.Graph()
        desc = Describer(g, self.repo.canonical_uri(basefile))
        dcterms = self.repo.ns['dcterms']
        desc.rdftype(self.repo.ns['foaf'].Document)
        desc.value(dcterms.title, "Invalid title")
        util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
        with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
            g.serialize(fp, format="pretty-xml")

        util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
        with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
            fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:dcterms="http://purl.org/dc/terms/"
      xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

        util.ensure_dir(self.repo.store.generated_path(str(basefile)))
        with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
            fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
def parse(self, doc):
    # create a dummy txt
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang))
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    doc.body = Body()  # can be empty, all content in doc.meta
    return True
def parse_document_from_soup(self, soup, doc):
    from ferenda.elements import Page
    from ferenda import Describer
    part = Page(["This is a part of a document"],
                ordinal=42,
                uri="http://example.org/doc#42",
                meta=self.make_graph())
    d = Describer(part.meta, part.uri)
    d.rdftype(self.ns['bibo'].DocumentPart)
    # the dcterms:identifier for a document part is often whatever
    # would be the preferred way to cite that part in another
    # document
    d.value(self.ns['dcterms'].identifier, "Doc:4711, p 42")
    # end part
    from lxml import etree
    return etree.tostring(part.as_xhtml("http://example.org/doc"))
def parse_from_textreader(self, textreader, doc):
    describer = Describer(doc.meta, doc.uri)
    for p in textreader.getiterator(textreader.readparagraph):
        # print "Handling %r (%s)" % (p[:40], len(doc.body))
        if not p.strip():
            continue
        elif not doc.body and 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
            continue
        elif not doc.body and p.strip().startswith("Dokument:"):
            # We already know this
            continue
        elif not doc.body and p.strip().startswith("Titel:"):
            describer.value(
                self.ns['dct'].title, util.normalize_space(p[7:]))
        else:
            doc.body.append(Preformatted([p]))
def make_meta(self, chunk, meta, uri, basefile):
    d = Describer(meta, uri)
    dct = self.ns['dct']
    prov = self.ns['prov']
    owl = self.ns['owl']
    rpubl = RPUBL
    d.rdftype(self.rdf_type)
    d.value(prov.wasGeneratedBy, self.qualified_class_name())
    # predicates maps key strings to corresponding RDFLib terms,
    # e.g. "Rubrik" -> dct:title
    predicates = {'Dir nr': dct.identifier,
                  'Departement': rpubl.departement,
                  'Beslut vid regeringssammanträde': rpubl.beslutsdatum,
                  'Rubrik': dct.title,
                  'Senast ändrad': dct.changed}
    # munger contains a set of tuples where the first item is a
    # method for converting a plain text into the appropriate
    # RDFLib value, e.g:
    # - "Utredning av foo" => Literal("Utredning av foo", lang="sv")
    # - "1987-02-19" => datetime(1987,2,19)
    # - "Arbetsdepartementet" => URIRef("http://lagen.nu/terms/arbdep")
    # The second item is the Describer method that should be used
    # to add the value to the graph, i.e. .value for Literals and
    # .rel for URIRefs
    munger = {'Dir nr': (self.sanitize_identifier, d.value),  # the RDFLib constructor
              'Departement': (functools.partial(self.lookup_resource, warn=False), d.rel),
              'Beslut vid regeringssammanträde': (self.parse_iso_date, d.value),
              'Rubrik': (self.sanitize_rubrik, d.value),
              'Senast ändrad': (self.parse_iso_date, d.value)}
    # header_lines wraps a TextReader in an iterator that parses
    # "key:value\n" lines, with support for line continuation, e.g.
    # "long\nkey:long\nvalue\n"
    for (key, val) in self.header_lines(chunk):
        if not val:
            continue
        try:
            pred = predicates[key]
            (transformer, setter) = munger[key]
            setter(pred, transformer(val))
        except (KeyError, ValueError) as e:
            self.log.error(
                "Couldn't munge value '%s' into a proper object for predicate '%s'" % (val, key))
    d.rel(dct.publisher, self.lookup_resource("Regeringskansliet"))
    d.rel(owl.sameAs, self.sameas_uri(uri))
    self.infer_triples(d, basefile)
def infer_metadata(self, resource, basefile):
    # remove the bogus dcterms:issued thing that we only added to
    # aid URI generation. NB: This is removed in the superclass'
    # postprocess_doc as well, because for this lagen.nu-derived
    # class it needs to be done at this point, but for use of the
    # superclass directly, it needs to be done at some point.
    for o in resource.objects(DCTERMS.issued):
        if not o.datatype:
            resource.remove(DCTERMS.issued, o)
    sameas_uri = self.sameas_minter.space.coin_uri(resource)
    resource.add(OWL.sameAs, URIRef(sameas_uri))
    resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                        OWL.sameAs, resource.identifier))
    # then find each rpubl:konsolideringsunderlag, and create
    # owl:sameAs for them as well
    for subresource in resource.objects(RPUBL.konsolideringsunderlag):
        # sometimes there'll be a rpubl:konsolideringsunderlag to
        # a resource URI but no actual data about that
        # resource. This seems to happen if SFST is updated but
        # SFSR is not. In those cases we can't generate an
        # owl:sameAs URI since we have no other data about the
        # resource.
        if subresource.value(RDF.type):
            uri = self.sameas_minter.space.coin_uri(subresource)
            subresource.add(OWL.sameAs, URIRef(uri))
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
    rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))
    v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
    if v:
        desc.value(DCTERMS.alternate, v)
    v = self.commondata.value(rooturi, RDFS.label, any=True)
    if v:
        # don't include labels if they're essentially the same as
        # dcterms:title (legalref needs it to be able to parse
        # refs to laws that typically don't include SFS numbers,
        # so that's why they're in sfs.ttl)
        basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
        if not v.startswith(basetitle.lower()):
            desc.value(RDFS.label, util.ucfirst(v))
def setUp(self):
    self.graph = Graph()
    self.graph.parse(data="""
    @prefix dcterms: <http://purl.org/dc/terms/> .
    @prefix foaf: <http://xmlns.com/foaf/0.1/> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

    <http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1", "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                        <http://example.org/concept2> .
    """, format="turtle")
    self.desc = Describer(self.graph, "http://example.org/doc")
def parse(self, doc):
    # create a dummy txt
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang))
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    doc.body = Body()  # can be empty, all content in doc.meta
    self.parse_entry_update(doc)
    return True
def polish_metadata(self, head, basefile, infer_nodes=True):
    # where do we get refdesc, domdesc?
    coin_uri = self.sameas_minter.space.coin_uri
    resource = super(DV, self).polish_metadata(head, basefile)
    refuri = resource.identifier
    if 'rinfoex:patchdescription' in head:
        resource.add(RINFOEX.patchdescription,
                     Literal(head['rinfoex:patchdescription'], lang="sv"))
    refuri_sameas = coin_uri(resource)
    resource.graph.add((URIRef(refuri), OWL.sameAs, URIRef(refuri_sameas)))
    # NB: In theory, we have all the data we need to generate a
    # canonical URI for the dom. In practice, this data does not
    # meet the requirements of our URISpace templates in certain
    # cases (all MD verdicts use rpubl:domsnummer instead of
    # rpubl:malnummer, which is what the template expects). The
    # superclass' definition of polish_metadata gets around this
    # by creating a minimal graph from the plain dict in head and
    # feeding that to coin_uri. So we do the same here, instead of
    # the very simple:
    #
    # domuri_sameas = coin_uri(resource.value(RPUBL.referatAvDomstolsavgorande))
    #
    # (also, this version handles the uncommon but valid case
    # where one referat concerns multiple dom:s)
    domuri = resource.value(RPUBL.referatAvDomstolsavgorande).identifier
    for malnummer in head['_localid']:
        bnodetmp = BNode()
        gtmp = Graph()
        gtmp.bind("rpubl", RPUBL)
        gtmp.bind("dcterms", DCTERMS)
        dtmp = Describer(gtmp, bnodetmp)
        dtmp.rdftype(RPUBL.VagledandeDomstolsavgorande)
        dtmp.value(RPUBL.malnummer, malnummer)
        dtmp.value(RPUBL.avgorandedatum, head['Avgörandedatum'])
        dtmp.rel(DCTERMS.publisher, self.lookup_resource(head["Domstol"]))
        rtmp = dtmp.graph.resource(bnodetmp)
        domuri_sameas = coin_uri(rtmp)
        resource.graph.add(
            (URIRef(domuri), OWL.sameAs, URIRef(domuri_sameas)))
    return resource
def decorate_bodyparts(self, part, baseuri):
    if isinstance(part, str):
        return
    if isinstance(part, (Section, Subsection, Subsubsection)):
        # print("Decorating %s %s" % (part.__class__.__name__, part.ordinal))
        part.uri = "%s#S%s" % (baseuri, part.ordinal)
        part.meta = self.make_graph()
        desc = Describer(part.meta, part.uri)
        desc.rdftype(self.ns['bibo'].DocumentPart)
        desc.value(self.ns['dcterms'].title, Literal(part.title, lang="en"))
        desc.value(self.ns['bibo'].chapter, part.ordinal)
        # desc.value(self.ns['dcterms'].isPartOf, part.parent.uri)  # implied
    for subpart in part:
        self.decorate_bodyparts(subpart, baseuri)
def metadata_from_basefile(self, doc):
    desc = Describer(doc.meta, doc.uri)
    desc.rel(CDM.resource_legal_id_celex, Literal(doc.basefile))
    # the sixth letter in the CELEX number indicates the document type
    rdftype = {"R": CDM.regulation,
               "L": CDM.directive,
               "C": CDM.decision_cjeu}[doc.basefile[5]]
    desc.rel(RDF.type, rdftype)
    return doc.meta
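# A small illustrative sketch (not part of the repository code above): it only
# shows which character of a CELEX-style basefile the lookup in
# metadata_from_basefile keys on. The helper name celex_type_letter and the
# sample CELEX number 32016R0679 (a regulation) are examples chosen here, not
# names from the codebase.
def celex_type_letter(basefile):
    # doc.basefile[5] is the sixth character: the document-type letter
    # in a CELEX number (sector digit, four-digit year, type letter, number)
    return basefile[5]

assert celex_type_letter("32016R0679") == "R"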
def infer_metadata(self, resource, basefile):
    # remove the bogus dcterms:issued thing that we only added to
    # aid URI generation. NB: This is removed in the superclass'
    # postprocess_doc as well, because for this lagen.nu-derived
    # class it needs to be done at this point, but for use of the
    # superclass directly, it needs to be done at some point.
    for o in resource.objects(DCTERMS.issued):
        if not o.datatype:
            resource.remove(DCTERMS.issued, o)
    sameas_uri = self.sameas_minter.space.coin_uri(resource)
    resource.add(OWL.sameAs, URIRef(sameas_uri))
    resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                        OWL.sameAs, resource.identifier))
    # then find each rpubl:konsolideringsunderlag, and create
    # owl:sameAs for them as well
    for subresource in resource.objects(RPUBL.konsolideringsunderlag):
        # sometimes there'll be a rpubl:konsolideringsunderlag to
        # a resource URI but no actual data about that
        # resource. This seems to happen if SFST is updated but
        # SFSR is not. In those cases we can't generate an
        # owl:sameAs URI since we have no other data about the
        # resource.
        if subresource.value(RDF.type):
            uri = self.sameas_minter.space.coin_uri(subresource)
            subresource.add(OWL.sameAs, URIRef(uri))
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
    v = self.commondata.value(resource.identifier, DCTERMS.alternate, any=True)
    if v:
        desc.value(DCTERMS.alternate, v)
class TestDescriber(unittest.TestCase):
    def setUp(self):
        self.graph = Graph()
        self.graph.parse(data="""
        @prefix dcterms: <http://purl.org/dc/terms/> .
        @prefix foaf: <http://xmlns.com/foaf/0.1/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        <http://example.org/doc> a foaf:Document;
            dcterms:title "Hello world"@en ;
            dcterms:identifier "ID1", "ID2";
            dcterms:issued "2013-10-11"^^xsd:date;
            dcterms:references <http://example.org/doc2>;
            dcterms:subject <http://example.org/concept1>,
                            <http://example.org/concept2> .
        """, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        self.assertEqual(self.desc.getvalues(DCTERMS.alternate), [])
        self.assertEqual(self.desc.getvalues(DCTERMS.title), ["Hello world"])
        self.assertEqual(set(self.desc.getvalues(DCTERMS.identifier)),
                         set(["ID1", "ID2"]))

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.alternate)
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.identifier)

    def test_getrels(self):
        self.assertEqual(self.desc.getrels(DCTERMS.replaces), [])
        self.assertEqual(self.desc.getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        self.assertEqual(set(self.desc.getrels(DCTERMS.subject)),
                         set(["http://example.org/concept1",
                              "http://example.org/concept2"]))

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.replaces)
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.subject)

    def test_getrdftype(self):
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
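# A minimal, self-contained round-trip sketch (an illustration, not part of
# the test suite above): the same Describer is used first with the write-side
# methods (rdftype/value/rel) seen throughout this collection, and then with
# the read-side accessors that TestDescriber exercises. The function name
# describer_roundtrip_sketch is an example name; only rdflib and
# ferenda.Describer are assumed to be importable.
from rdflib import Graph
from rdflib.namespace import DCTERMS, FOAF

from ferenda import Describer

def describer_roundtrip_sketch():
    graph = Graph()
    desc = Describer(graph, "http://example.org/doc")
    # write side: add triples about http://example.org/doc
    desc.rdftype(FOAF.Document)
    desc.value(DCTERMS.title, "Hello world", lang="en")
    desc.rel(DCTERMS.references, "http://example.org/doc2")
    # read side: the accessors return the same data as plain strings
    assert desc.getrdftype() == str(FOAF.Document)
    assert desc.getvalue(DCTERMS.title) == "Hello world"
    assert desc.getrel(DCTERMS.references) == "http://example.org/doc2"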
def parse(self, doc):
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        if p.count("\n") == 0 and not p.startswith(" "):
            return True

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want.)
    preheader = Preformatted([header])
    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)
    # Set the title we've captured as the dcterms:title of the document and
    # specify that it is in English
    desc.value(self.ns['dcterms'].title, util.normalize_space(title), lang="en")
    # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)
    # find and convert the publication date in the header to a datetime
    # object, and set it as the dcterms:issued date for the document
    re_date = re.compile(
        "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
    # This is a context manager that temporarily sets the system
    # locale to the "C" locale in order to be able to use strptime
    # with a string on the form "August 2013", even though the
    # system may use another locale.
    dt_match = re_date(header)
    if dt_match:
        with util.c_locale():
            dt = datetime.strptime(re_date(header).group(0), "%B %Y")
        pubdate = date(dt.year, dt.month, dt.day)
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        # <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject
    cat_match = re.search("^Category: ([\w ]+?)( |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk
    # end parse1

    # Now do it again
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (
            doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []       # clear all but bottom element
            stack[0].append(s)   # add new section to body
            stack.append(s)      # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[2:] = []       # clear all but bottom two elements
            stack[1].append(s)   # add new subsection to current section
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[3:] = []       # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (
        CaselessLiteral("section") + Word(nums + ".").setResultsName("Sec")
    ).setResultsName("SecRef")
    rfc_citation = (
        "[RFC" + Word(nums).setResultsName("RFC") + "]"
    ).setResultsName("RFCRef")
    section_rfc_citation = (
        section_citation + "of" + rfc_citation
    ).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
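# A brief usage sketch for the citation grammar defined in the "citation1"
# block above (an illustration, not repository code): it re-declares the same
# pyparsing expressions so the snippet is self-contained, and parses one
# sample sentence directly, outside of CitationParser. The sample text and the
# expected result values are assumptions based on the grammar as written.
from pyparsing import CaselessLiteral, Word, nums

section_citation = (
    CaselessLiteral("section") + Word(nums + ".").setResultsName("Sec")
).setResultsName("SecRef")
rfc_citation = (
    "[RFC" + Word(nums).setResultsName("RFC") + "]"
).setResultsName("RFCRef")
section_rfc_citation = (
    section_citation + "of" + rfc_citation
).setResultsName("SecRFCRef")

hit = section_rfc_citation.parseString("section 5.1 of [RFC2822]")
# the named result fields are what rfc_uriformatter above would receive
print(hit["Sec"], hit["RFC"])  # expected: 5.1 2822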
def parse_metadata_from_soup(self, soup, doc):
    doc.lang = self.lang
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    dcterms = self.ns['dcterms']
    # dcterms:title
    d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
    d.value(dcterms.identifier, doc.basefile)
    # dcterms:abstract
    abstract = soup.find(class_="abstract")
    if abstract:
        d.value(dcterms['abstract'], abstract.string, lang=doc.lang)
    # dcterms:published
    datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                        and re.search("W3C\s+Recommendation,?\s+", x.text))
    if datehdr:
        datestr = " ".join(datehdr.text.split())
        m = re.search("(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
        if not m:
            self.log.warning("%s: Couldn't parse datestr %s" %
                             (doc.basefile, datestr))
        else:
            datestr = " ".join(m.groups())
            date = None
            try:
                # 17 December 1996
                date = util.strptime(datestr, "%d %B %Y").date()
            except ValueError:
                try:
                    # 17 Dec 1996
                    date = util.strptime(datestr, "%d %b %Y").date()
                except ValueError:
                    self.log.warning("%s: Could not parse datestr %s" %
                                     (doc.basefile, datestr))
            if date:
                d.value(dcterms.issued, date)
    # dcterms:editor
    editors = soup.find("dt", text=re.compile("Editors?:"))
    if editors:
        for editor in editors.find_next_siblings("dd"):
            editor_string = " ".join(x for x in editor.stripped_strings
                                     if not "@" in x)
            editor_name = editor_string.split(", ")[0]
            d.value(dcterms.editor, editor_name)
    # dcterms:publisher
    d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")
    # assure we got exactly one of each of the required properties
    for required in (dcterms.title, dcterms.issued):
        d.getvalue(required)  # throws KeyError if not found (or more than one)
def polish_metadata(self, head, doc):
    basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

    def basefile_to_referat(basefile):
        templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                 'MD': 'MD %(year)s:%(ordinal)s'}
        m = basefile_regex.match(basefile)
        if m:
            return templ[m.group("type")] % (m.groupdict())

    def ref_to_uri(ref):
        # FIXME: We'd like to retire legalref and replace it with
        # pyparsing grammars.
        nodes = self.rattsfall_parser.parse(ref)
        uri = nodes[0].uri
        return localize_uri(uri)

    def dom_to_uri(domstol, malnr, avg):
        baseuri = self.config.url
        slug = self.slugs[domstol]
        return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

    def localize_uri(uri):
        if "publ/rattsfall" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                               self.config.url + "res/dv")
        elif "publ/sfs/" in uri:
            return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                               self.config.url + "res/sfs")

    def split_nja(value):
        # "NJA 2008 s 567 (NJA 2008:86)" => ("NJA 2008 s 567", "NJA 2008:86")
        return [x[:-1] for x in value.split("(")]

    def sokord_uri(value):
        return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

    # 0. create Referat key if not present
    if "Referat" not in head:
        # For some courts (MD, AD, MOD?, MIG?) this is possible
        head["Referat"] = basefile_to_referat(doc.basefile)

    # 1. mint uris and create the two Describers we'll use
    refuri = ref_to_uri(head["Referat"])
    refdesc = Describer(doc.meta, refuri)
    domuri = dom_to_uri(head["Domstol"],
                        head["Målnummer"],
                        head["Avgörandedatum"])
    domdesc = Describer(doc.meta, domuri)

    # 2. convert all strings in head to proper RDF
    for label, value in head.items():
        if label == "Rubrik":
            value = util.normalize_space(value)
            refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
            domdesc.value(self.ns['dct'].title, value, lang="sv")
        elif label == "Domstol":
            domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
        elif label == "Målnummer":
            domdesc.rel(self.ns['rpubl'].malnummer, value)
        elif label == "Domsnummer":
            domdesc.rel(self.ns['rpubl'].domsnummer, value)
        elif label == "Diarienummer":
            domdesc.rel(self.ns['rpubl'].diarienummer, value)
        elif label == "Avdelning":
            domdesc.rel(self.ns['rpubl'].avdelning, value)
        elif label == "Referat":
            for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                'arsutgava': r'(\d{4})',
                                'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                'sidnummer': r's.? ?(\d+)'}.items():
                m = re.search(regex, value)
                if m:
                    if pred == 'rattsfallspublikation':
                        # "NJA" -> "http://localhost:8000/coll/dv/nja"
                        uri = self.config.url + "coll/dv/" + m.group(1).lower()
                        refdesc.rel(self.ns['rpubl'][pred], uri)
                    else:
                        refdesc.value(self.ns['rpubl'][pred], m.group(1))
            if value.startswith("NJA"):
                realvalue, extra = split_nja(value)
                ordinal = extra.split(" ")[1]
                refdesc.value(self.ns['dct'].bibliographicCitation, extra)
                refdesc.rel(self.ns['owl'].sameAs,
                            self.config.url + "res/dv/nja/" + ordinal)
                refdesc.value(self.ns['dct'].identifier, realvalue)
            else:
                refdesc.value(self.ns['dct'].identifier, value)
        elif label == "Avgörandedatum":
            with util.c_locale():
                d = datetime.strptime(value, '%Y-%m-%d')
            domdesc.value(self.ns['rpubl'].avgorandedatum, d)
        elif label == "Lagrum":
            for i in value:  # better be a list, not a string
                for node in self.lagrum_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].lagrum,
                                    localize_uri(node.uri))
        elif label == "Rättsfall":
            for i in value:
                for node in self.rattsfall_parser.parse(i):
                    if isinstance(node, Link):
                        domdesc.rel(self.ns['rpubl'].rattsfall,
                                    localize_uri(node.uri))
        elif label == "Litteratur":
            for i in value.split(";"):
                domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
        elif label == "Sökord":
            for s in self.re_delimSplit(value):
                s = util.normalize_space(s)
                if not s:
                    continue
                # terms longer than 72 chars are not legitimate
                # terms, more likely descriptions. If a term has a
                # " - " in it, it's probably a separator between a
                # term and a description
                while len(s) >= 72 and " - " in s:
                    h, s = s.split(" - ", 1)
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                if len(s) < 72:
                    domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

    # 3. mint some owl:sameAs URIs
    refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
    domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

    # 4. Add some same-for-everyone properties
    refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
    refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
    domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
    refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
    # 5. assert that we have everything we need
    # 6. done!
    return refuri
def parse(self, doc): """Parse downloaded documents into structured XML and RDF.""" reader = TextReader(self.store.downloaded_path(doc.basefile), linesep=TextReader.UNIX) # Some more preprocessing: Remove the faux-bold formatting # used in some RFCs (using repetitions of characters # interleaved with backspace control sequences). Note: that # is '\b' as in backspace, not r'\b' as in word boundary # docstring = re.sub('.\b','',docstring) cleanparagraphs = (re.sub('.\b', '', x) for x in reader.getiterator(reader.readparagraph)) parser = self.get_parser(doc.basefile) if not self.config.fsmdebug: self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ parser.debug = self.config.fsmdebug doc.body = parser.parse(cleanparagraphs) header = doc.body.pop(0) # body.findByClass(RFCHeader) title = " ".join(doc.body.pop(0).split()) # body.findByClass(DocHeader) for part in doc.body: if isinstance(part, PreambleSection) and part.title == "Table of Contents": doc.body.remove(part) break # create (RDF) metadata for document Note: The provided # basefile may be incorrect -- let whatever is in the header # override realid = self.get_rfc_num(header) if not realid: # eg RFC 100 -- fallback to basefile in that case realid = doc.basefile doc.uri = self.canonical_uri(realid) desc = Describer(doc.meta, doc.uri) desc.rdftype(self.ns['rfc'].RFC) desc.value(self.ns['dct'].title, title, lang="en") self.parse_header(header, desc) if not desc.getvalues(self.ns['dct'].identifier): desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile) doc.lang = "en" # process body - remove the temporary Pagebreak objects, after # having extracted the shortTitle found in them shorttitle = self.cleanup_body(doc.body) if shorttitle and (desc.getvalue(self.ns['dct'].title) != shorttitle): desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en") # process body - add good metadata citparser = self.make_citation_parser() doc.body = citparser.parse_recursive(doc.body) PreambleSection.counter = 0 # self.decorate_bodyparts(doc.body,doc.uri) if self.config.fsmdebug: print(serialize(doc.body))
def selector(entry):
    graph = Graph()
    graph.parse(self.store.distilled_path(entry.basefile))
    desc = Describer(graph, entry.id)
    return desc.getvalue(self.ns['dct'].subject) == category
def parse_metadata_from_soup(self, soup, doc):
    from ferenda import Describer
    from datetime import datetime
    title = "My Document title"
    authors = ["Fred Bloggs", "Joe Shmoe"]
    identifier = "Docno 2013:4711"
    pubdate = datetime(2013, 1, 6, 10, 8, 0)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    d.value(self.ns['dcterms'].title, title, lang=doc.lang)
    d.value(self.ns['dcterms'].identifier, identifier)
    for author in authors:
        d.value(self.ns['dcterms'].author, author)
def parse_metadata_from_soup(self, soup, doc):
    from rdflib import Namespace
    from ferenda import Describer
    from ferenda import util
    import re
    DCT = Namespace("http://purl.org/dc/terms/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    d = Describer(doc.meta, doc.uri)
    d.rdftype(FOAF.Document)
    d.value(DCT.title, soup.find("title").text, lang=doc.lang)
    d.value(DCT.abstract, soup.find(True, "abstract"), lang=doc.lang)
    # find the issued date -- assume it's the first thing that looks
    # like a date on the form "22 August 2013"
    re_date = re.compile(r'(\d+ \w+ \d{4})')
    datenode = soup.find(text=re_date)
    datestr = re_date.search(datenode).group(1)
    d.value(DCT.issued, util.strptime(datestr, "%d %B %Y"))
    editors = soup.find("dt", text=re.compile("Editors?:"))
    for editor in editors.find_next_siblings("dd"):
        editor_name = editor.text.strip().split(", ")[0]
        d.value(DCT.editor, editor_name)
def parse_metadata_from_soup(self, soup, doc):
    from rdflib import Namespace
    from ferenda import Describer
    from ferenda import util
    import re
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    d = Describer(doc.meta, doc.uri)
    d.rdftype(FOAF.Document)
    d.value(DCTERMS.title, soup.find("title").text, lang=doc.lang)
    d.value(DCTERMS.abstract, soup.find(True, "abstract"), lang=doc.lang)
    # find the issued date -- assume it's the first thing that looks
    # like a date on the form "22 August 2013"
    re_date = re.compile(r'(\d+ \w+ \d{4})')
    datenode = soup.find(text=re_date)
    datestr = re_date.search(datenode).group(1)
    d.value(DCTERMS.issued, util.strptime(datestr, "%d %B %Y"))
    editors = soup.find("dt", text=re.compile("Editors?:"))
    for editor in editors.find_next_siblings("dd"):
        editor_name = editor.text.strip().split(", ")[0]
        d.value(DCTERMS.editor, editor_name)
def parse(self, doc): """Parse downloaded documents into structured XML and RDF.""" reader = TextReader(self.store.downloaded_path(doc.basefile), linesep=TextReader.UNIX) # Some more preprocessing: Remove the faux-bold formatting # used in some RFCs (using repetitions of characters # interleaved with backspace control sequences). Note: that # is '\b' as in backspace, not r'\b' as in word boundary # docstring = re.sub('.\b','',docstring) cleanparagraphs = (re.sub('.\b', '', x) for x in reader.getiterator(reader.readparagraph)) parser = self.get_parser(doc.basefile) if not self.config.fsmdebug: self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ parser.debug = self.config.fsmdebug doc.body = parser.parse(cleanparagraphs) header = doc.body.pop(0) # body.findByClass(RFCHeader) title = " ".join( doc.body.pop(0).split()) # body.findByClass(DocHeader) for part in doc.body: if isinstance( part, PreambleSection) and part.title == "Table of Contents": doc.body.remove(part) break # create (RDF) metadata for document Note: The provided # basefile may be incorrect -- let whatever is in the header # override realid = self.get_rfc_num(header) if not realid: # eg RFC 100 -- fallback to basefile in that case realid = doc.basefile doc.uri = self.canonical_uri(realid) desc = Describer(doc.meta, doc.uri) desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name()) desc.value(self.ns['dcterms'].title, title, lang="en") self.parse_header(header, desc) # parse_header might have set .rdftype, but if not: try: desc.getrdftype() except KeyError: desc.rdftype(self.ns['rfc'].RFC) if not desc.getvalues(self.ns['dcterms'].identifier): desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile) doc.lang = "en" # process body - remove the temporary Pagebreak objects, after # having extracted the shortTitle found in them shorttitle = self.cleanup_body(doc.body) if shorttitle and (desc.getvalue(self.ns['dcterms'].title) != shorttitle): desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en") # process body - add good metadata citparser = self.make_citation_parser() doc.body = citparser.parse_recursive(doc.body) PreambleSection.counter = 0 # self.decorate_bodyparts(doc.body,doc.uri) if self.config.fsmdebug: print(serialize(doc.body)) return True
def selector(entry):
    graph = Graph()
    with self.store.open_distilled(entry.basefile) as fp:
        graph.parse(data=fp.read())
    desc = Describer(graph, entry.id)
    return desc.getrel(self.ns['dcterms'].subject) == category
def parse_metadata_from_soup(self, soup, doc):
    from ferenda import Describer
    from datetime import datetime
    title = "My Document title"
    authors = ["Fred Bloggs", "Joe Shmoe"]
    identifier = "Docno 2013:4711"
    pubdate = datetime(2013, 1, 6, 10, 8, 0)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    d.value(self.ns['dcterms'].title, title, lang=doc.lang)
    d.value(self.ns['dcterms'].identifier, identifier)
    for author in authors:
        d.value(self.ns['dcterms'].author, author)
def parse_metadata_from_soup(self, soup, doc): doc.lang = "sv" d = Describer(doc.meta, doc.uri) d.rdftype(self.rdf_type) d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name()) sameas = self.sameas_uri(doc.uri) if sameas: d.rel(self.ns['owl'].sameAs, sameas) content = soup.find(id="content") title = content.find("h1").string d.value(self.ns['dct'].title, title, lang=doc.lang) identifier = self.sanitize_identifier( content.find("p", "lead").text) # might need fixing up d.value(self.ns['dct'].identifier, identifier) definitions = content.find("dl", "definitions") if definitions: for dt in definitions.find_all("dt"): key = dt.get_text(strip=True) value = dt.find_next_sibling("dd").get_text(strip=True) if key == "Utgiven:": try: d.value(self.ns['dct'].published, self.parse_swedish_date(value)) except ValueError as e: self.log.warning( "Could not parse %s as swedish date" % value) elif key == "Avsändare:": if value.endswith("departementet"): d.rel(self.ns['rpubl'].departement, self.lookup_resource(value)) else: d.rel(self.ns['dct'].publisher, self.lookup_resource(value)) if content.find("h2", text="Sammanfattning"): sums = content.find("h2", text="Sammanfattning").find_next_siblings("p") # "\n\n" doesn't seem to survive being stuffed in a rdfa # content attribute. Replace with simple space. summary = " ".join([x.get_text(strip=True) for x in sums]) d.value(self.ns['dct'].abstract, summary, lang=doc.lang) # find related documents re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+') # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss, # 4=proposition. Assume that relationships between documents # are reciprocal (ie if the page for a Kommittedirektiv # references a Proposition, the page for that Proposition # references the Kommittedirektiv. elements = {self.KOMMITTEDIREKTIV: [], self.DS: ["legStep1"], self.PROPOSITION: ["legStep1", "legStep2"], self.SOU: ["legStep1"]}[self.document_type] for elementid in elements: box = content.find(id=elementid) for listitem in box.find_all("li"): if not listitem.find("span", "info"): continue infospans = [x.text.strip( ) for x in listitem.find_all("span", "info")] rel_basefile = None identifier = None for infospan in infospans: if re_basefile.search(infospan): # scrub identifier ("Dir. 2008:50" -> "2008:50" etc) rel_basefile = re_basefile.search(infospan).group() identifier = infospan if not rel_basefile: self.log.warning( "Couldn't find rel_basefile (elementid #%s) among %r" % (elementid, infospans)) continue if elementid == "legStep1": subjUri = self.canonical_uri( rel_basefile, self.KOMMITTEDIREKTIV) elif elementid == "legStep2": if identifier.startswith("SOU"): subjUri = self.canonical_uri(rel_basefile, self.SOU) elif identifier.startswith(("Ds", "DS")): subjUri = self.canonical_uri(rel_basefile, self.DS) else: self.log.warning( "Cannot find out what type of document the linked %s is (#%s)" % (identifier, elementid)) self.log.warning("Infospans was %r" % infospans) continue elif elementid == "legStep3": subjUri = self.canonical_uri( rel_basefile, self.PROPOSITION) d.rel(self.ns['rpubl'].utgarFran, subjUri) # find related pages related = content.find("h2", text="Relaterat") if related: for link in related.findParent("div").find_all("a"): r = urljoin( "http://www.regeringen.se/", link["href"]) d.rel(RDFS.seeAlso, URIRef(r)) # with d.rel(RDFS.seeAlso, URIRef(r)): # d.value(RDFS.label, link.get_text(strip=True)) self.infer_triples(d, doc.basefile)
class TestDescriber(unittest.TestCase):
    def setUp(self):
        self.graph = Graph()
        self.graph.parse(data="""
        @prefix dcterms: <http://purl.org/dc/terms/> .
        @prefix foaf: <http://xmlns.com/foaf/0.1/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        <http://example.org/doc> a foaf:Document;
            dcterms:title "Hello world"@en ;
            dcterms:identifier "ID1", "ID2";
            dcterms:issued "2013-10-11"^^xsd:date;
            dcterms:references <http://example.org/doc2>;
            dcterms:subject <http://example.org/concept1>,
                            <http://example.org/concept2> .
        """, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        self.assertEqual(self.desc.getvalues(DCTERMS.alternate), [])
        self.assertEqual(self.desc.getvalues(DCTERMS.title), ["Hello world"])
        self.assertEqual(set(self.desc.getvalues(DCTERMS.identifier)),
                         set(["ID1", "ID2"]))

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.alternate)
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.identifier)

    def test_getrels(self):
        self.assertEqual(self.desc.getrels(DCTERMS.replaces), [])
        self.assertEqual(self.desc.getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        self.assertEqual(
            set(self.desc.getrels(DCTERMS.subject)),
            set(["http://example.org/concept1",
                 "http://example.org/concept2"]))

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.replaces)
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.subject)

    def test_getrdftype(self):
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
def parse(self, doc):
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        if p.count("\n") == 0 and not p.startswith(" "):
            return True

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want.)
    preheader = Preformatted([header])
    # Doc.body is a ferenda.elements.Body class, which is also
    # derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # metadata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)
    # Set the title we've captured as the dct:title of the document and
    # specify that it is in English
    desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")
    # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
    desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
    # find and convert the publication date in the header to a datetime
    # object, and set it as the dct:issued date for the document
    re_date = re.compile(
        "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
    # This is a context manager that temporarily sets the system
    # locale to the "C" locale in order to be able to use strptime
    # with a string on the form "August 2013", even though the
    # system may use another locale.
    dt_match = re_date(header)
    if dt_match:
        with util.c_locale():
            dt = datetime.strptime(re_date(header).group(0), "%B %Y")
        pubdate = date(dt.year, dt.month, dt.day)
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        # <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dct'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dct:subject
    cat_match = re.search("^Category: ([\w ]+?)( |$)", header, re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dct'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk
    # end parse1

    # Now do it again
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (
            doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []       # clear all but bottom element
            stack[0].append(s)   # add new section to body
            stack.append(s)      # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[2:] = []       # clear all but bottom two elements
            stack[1].append(s)   # add new subsection to current section
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
            stack[3:] = []       # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (
        CaselessLiteral("section") + Word(nums + ".").setResultsName("Sec")
    ).setResultsName("SecRef")
    rfc_citation = (
        "[RFC" + Word(nums).setResultsName("RFC") + "]"
    ).setResultsName("RFCRef")
    section_rfc_citation = (
        section_citation + "of" + rfc_citation
    ).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation,
                               section_citation,
                               rfc_citation)
    citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                         ("SecRef", rfc_uriformatter),
                                         ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
def selector(entry):
    graph = Graph()
    with self.store.open_distilled(entry.basefile) as fp:
        graph.parse(data=fp.read())
    desc = Describer(graph, entry.id)
    return desc.getvalue(self.ns['dct'].subject) == category