def parse_metadata_from_soup(self, soup, doc):
    """Extract metadata from a parsed HTML document into ``doc.meta``.

    Sets ``doc.lang`` from ``self.lang`` and then, through a Describer
    bound to ``doc.uri``, records:

    * the rdf:type (``self.rdf_type``) and prov:wasGeneratedBy
    * dcterms:title, taken from the ``<title>`` element
    * dcterms:identifier, set to ``doc.basefile``
    * dcterms:abstract, when an element with CSS class ``abstract``
      exists and consists of a single string
    * dcterms:issued, parsed from the first ``h2``/``h3`` heading whose
      text matches "W3C Recommendation"
    * dcterms:editor, one value per ``dd`` sibling following the
      "Editor:"/"Editors:" ``dt``
    * dcterms:publisher

    :param soup: the parsed document; used through ``find`` and
                 ``find_next_siblings`` (BeautifulSoup-style API)
    :param doc: document object providing ``basefile``, ``uri`` and
                ``meta``; its ``lang`` attribute is overwritten
    :raises KeyError: from the final ``getvalue`` checks when
                      dcterms:title or dcterms:issued is missing or
                      present more than once
    """
    doc.lang = self.lang
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    dcterms = self.ns['dcterms']

    # dcterms:title
    d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
    d.value(dcterms.identifier, doc.basefile)

    # dcterms:abstract
    # The keyword for matching on CSS class is ``class_``; the previous
    # ``_class`` spelling filtered on a literal "_class" attribute and
    # therefore never matched anything.
    abstract = soup.find(class_="abstract")
    # .string is None when the element has more than one child, in
    # which case there is no single string to record.
    if abstract and abstract.string:
        d.value(dcterms['abstract'], abstract.string, lang=doc.lang)

    # dcterms:published
    datehdr = soup.find(
        lambda tag: tag.name in ('h2', 'h3') and
        re.search(r"W3C\s+Recommendation,?\s+", tag.text))
    if datehdr:
        # collapse all runs of whitespace before matching
        datestr = " ".join(datehdr.text.split())
        m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
        if not m:
            self.log.warning("%s: Couldn't parse datestr %s" %
                             (doc.basefile, datestr))
        else:
            datestr = " ".join(m.groups())
            date = None
            # "17 December 1996" first, then "17 Dec 1996"
            for fmt in ("%d %B %Y", "%d %b %Y"):
                try:
                    date = util.strptime(datestr, fmt).date()
                except ValueError:
                    continue
                break
            if date is None:
                self.log.warning("%s: Could not parse datestr %s" %
                                 (doc.basefile, datestr))
            else:
                d.value(dcterms.issued, date)

    # dcterms:editor
    editors = soup.find("dt", text=re.compile("Editors?:"))
    if editors:
        for editor in editors.find_next_siblings("dd"):
            # drop any string fragment containing "@" (e.g. a mail address)
            editor_string = " ".join(
                x for x in editor.stripped_strings if "@" not in x)
            # keep only the part before the first ", "
            editor_name = editor_string.split(", ")[0]
            d.value(dcterms.editor, editor_name)

    # dcterms:publisher
    d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

    # assure we got exactly one of each of the required properties
    for required in (dcterms.title, dcterms.issued):
        d.getvalue(required)  # throws KeyError if not found (or more than one)
def parse(self, doc):
    """Turn the downloaded text of ``doc.basefile`` into a body and metadata.

    Reads the downloaded file paragraph by paragraph, runs it through
    the parser returned by ``self.get_parser``, and stores the result
    in ``doc.body``. Also sets ``doc.uri`` and ``doc.lang`` and
    describes the document (type, title, identifier, optional short
    title) in ``doc.meta``.
    """
    source_path = self.store.downloaded_path(doc.basefile)
    reader = TextReader(source_path, linesep=TextReader.UNIX)

    # Some RFCs fake bold text by repeating characters interleaved
    # with backspace control characters. Note that '\b' here is the
    # backspace character, not the regex word boundary r'\b'.
    overstrike = re.compile('.\b')
    cleanparagraphs = (overstrike.sub('', paragraph)
                       for paragraph in
                       reader.getiterator(reader.readparagraph))

    parser = self.get_parser(doc.basefile)
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)

    # The first two body parts are consumed here
    header = doc.body.pop(0)       # body.findByClass(RFCHeader)
    title_part = doc.body.pop(0)   # body.findByClass(DocHeader)
    title = " ".join(title_part.split())

    # Drop the first preamble section titled "Table of Contents", if any
    toc = next((part for part in doc.body
                if isinstance(part, PreambleSection) and
                part.title == "Table of Contents"), None)
    if toc is not None:
        doc.body.remove(toc)

    # Create (RDF) metadata for the document. The provided basefile
    # may be incorrect, so whatever the header says takes precedence;
    # fall back to the basefile when the header yields nothing
    # (eg RFC 100).
    realid = self.get_rfc_num(header) or doc.basefile
    doc.uri = self.canonical_uri(realid)

    desc = Describer(doc.meta, doc.uri)
    desc.rdftype(self.ns['rfc'].RFC)
    desc.value(self.ns['dct'].title, title, lang="en")
    self.parse_header(header, desc)
    identifiers = desc.getvalues(self.ns['dct'].identifier)
    if not identifiers:
        desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)

    doc.lang = "en"

    # Process body: remove the temporary Pagebreak objects after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle:
        if desc.getvalue(self.ns['dct'].title) != shorttitle:
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

    # Process body: add good metadata
    citparser = self.make_citation_parser()
    doc.body = citparser.parse_recursive(doc.body)
    PreambleSection.counter = 0
    # self.decorate_bodyparts(doc.body,doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
class TestDescriber(unittest.TestCase):
    """Checks the read accessors of Describer against a small fixed graph."""

    def setUp(self):
        # One document with a single title, issue date and reference,
        # but two identifiers and two subjects, so both the
        # "exactly one" and the "more than one" paths can be exercised.
        turtle = (
            ' @prefix dcterms: <http://purl.org/dc/terms/> .'
            ' @prefix foaf: <http://xmlns.com/foaf/0.1/> .'
            ' @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .'
            ' <http://example.org/doc> a foaf:Document;'
            ' dcterms:title "Hello world"@en ;'
            ' dcterms:identifier "ID1", "ID2";'
            ' dcterms:issued "2013-10-11"^^xsd:date;'
            ' dcterms:references <http://example.org/doc2>;'
            ' dcterms:subject <http://example.org/concept1>,'
            ' <http://example.org/concept2> . '
        )
        self.graph = Graph()
        self.graph.parse(data=turtle, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        # absent property -> empty list; order of multiple values is
        # not asserted, hence the set comparison
        getvalues = self.desc.getvalues
        self.assertEqual(getvalues(DCTERMS.alternate), [])
        self.assertEqual(getvalues(DCTERMS.title), ["Hello world"])
        self.assertEqual(set(getvalues(DCTERMS.identifier)), {"ID1", "ID2"})

    def test_getvalue(self):
        issued = datetime.date(2013, 10, 11)
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued), issued)
        # a missing property and a property with two values both raise
        for prop in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                self.desc.getvalue(prop)

    def test_getrels(self):
        getrels = self.desc.getrels
        self.assertEqual(getrels(DCTERMS.replaces), [])
        self.assertEqual(getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        self.assertEqual(set(getrels(DCTERMS.subject)),
                         {"http://example.org/concept1",
                          "http://example.org/concept2"})

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        # a missing relation and a relation with two targets both raise
        for prop in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                self.desc.getrel(prop)

    def test_getrdftype(self):
        rdftype = self.desc.getrdftype()
        self.assertEqual(rdftype, "http://xmlns.com/foaf/0.1/Document")
class TestDescriber(unittest.TestCase):
    """Unit tests for the get* methods of Describer."""

    def setUp(self):
        # Fixture: a foaf:Document with one title, one issue date and
        # one reference, plus two identifiers and two subjects.
        fixture = (
            ' @prefix dcterms: <http://purl.org/dc/terms/> .'
            ' @prefix foaf: <http://xmlns.com/foaf/0.1/> .'
            ' @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .'
            ' <http://example.org/doc> a foaf:Document;'
            ' dcterms:title "Hello world"@en ;'
            ' dcterms:identifier "ID1", "ID2";'
            ' dcterms:issued "2013-10-11"^^xsd:date;'
            ' dcterms:references <http://example.org/doc2>;'
            ' dcterms:subject <http://example.org/concept1>,'
            ' <http://example.org/concept2> . '
        )
        self.graph = Graph()
        self.graph.parse(data=fixture, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        no_values = self.desc.getvalues(DCTERMS.alternate)
        self.assertEqual(no_values, [])

        titles = self.desc.getvalues(DCTERMS.title)
        self.assertEqual(titles, ["Hello world"])

        # compared as a set: the order of the two values is not asserted
        identifiers = self.desc.getvalues(DCTERMS.identifier)
        self.assertEqual(set(identifiers), {"ID1", "ID2"})

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        # no value at all
        self.assertRaises(KeyError, self.desc.getvalue, DCTERMS.alternate)
        # more than one value
        self.assertRaises(KeyError, self.desc.getvalue, DCTERMS.identifier)

    def test_getrels(self):
        no_rels = self.desc.getrels(DCTERMS.replaces)
        self.assertEqual(no_rels, [])

        references = self.desc.getrels(DCTERMS.references)
        self.assertEqual(references, ["http://example.org/doc2"])

        # compared as a set: the order of the two targets is not asserted
        subjects = self.desc.getrels(DCTERMS.subject)
        self.assertEqual(set(subjects),
                         {"http://example.org/concept1",
                          "http://example.org/concept2"})

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        # no relation at all
        self.assertRaises(KeyError, self.desc.getrel, DCTERMS.replaces)
        # more than one relation
        self.assertRaises(KeyError, self.desc.getrel, DCTERMS.subject)

    def test_getrdftype(self):
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
def selector(entry):
    """Tell whether ``entry``'s dct:subject equals ``category``.

    Parses the distilled file for ``entry.basefile`` into a fresh graph
    and reads the single dct:subject value of the resource
    ``entry.id``. ``self`` and ``category`` are free variables taken
    from the enclosing scope.
    """
    distilled = Graph()
    distilled.parse(self.store.distilled_path(entry.basefile))
    subject = Describer(distilled, entry.id).getvalue(self.ns['dct'].subject)
    return subject == category
def selector(entry):
    """Tell whether ``entry``'s dct:subject equals ``category``.

    Reads the distilled data for ``entry.basefile`` through
    ``self.store.open_distilled``, parses it into a fresh graph and
    compares the single dct:subject value of the resource ``entry.id``
    with ``category``. ``self`` and ``category`` are free variables
    taken from the enclosing scope.
    """
    distilled = Graph()
    with self.store.open_distilled(entry.basefile) as stream:
        distilled.parse(data=stream.read())
    subject = Describer(distilled, entry.id).getvalue(self.ns['dct'].subject)
    return subject == category
def parse(self, doc):
    """Turn the downloaded text of ``doc.basefile`` into a body and metadata.

    Reads the downloaded file paragraph by paragraph, runs it through
    the parser returned by ``self.get_parser``, and stores the result
    in ``doc.body``. Also sets ``doc.uri`` and ``doc.lang`` and
    describes the document (provenance, title, type, identifier,
    optional short title) in ``doc.meta``. Always returns ``True``.
    """
    source_path = self.store.downloaded_path(doc.basefile)
    reader = TextReader(source_path, linesep=TextReader.UNIX)

    # Some RFCs fake bold text by repeating characters interleaved
    # with backspace control characters. Note that '\b' here is the
    # backspace character, not the regex word boundary r'\b'.
    overstrike = re.compile('.\b')
    cleanparagraphs = (overstrike.sub('', paragraph)
                       for paragraph in
                       reader.getiterator(reader.readparagraph))

    parser = self.get_parser(doc.basefile)
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    parser.debug = self.config.fsmdebug
    doc.body = parser.parse(cleanparagraphs)

    # The first two body parts are consumed here
    header = doc.body.pop(0)       # body.findByClass(RFCHeader)
    title_part = doc.body.pop(0)   # body.findByClass(DocHeader)
    title = " ".join(title_part.split())

    # Drop the first preamble section titled "Table of Contents", if any
    toc = next((part for part in doc.body
                if isinstance(part, PreambleSection) and
                part.title == "Table of Contents"), None)
    if toc is not None:
        doc.body.remove(toc)

    # Create (RDF) metadata for the document. The provided basefile
    # may be incorrect, so whatever the header says takes precedence;
    # fall back to the basefile when the header yields nothing
    # (eg RFC 100).
    realid = self.get_rfc_num(header) or doc.basefile
    doc.uri = self.canonical_uri(realid)

    desc = Describer(doc.meta, doc.uri)
    desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    desc.value(self.ns['dcterms'].title, title, lang="en")
    self.parse_header(header, desc)

    # parse_header might have set the rdf type already; only supply
    # the default one when it did not
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(self.ns['rfc'].RFC)

    identifiers = desc.getvalues(self.ns['dcterms'].identifier)
    if not identifiers:
        desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

    doc.lang = "en"

    # Process body: remove the temporary Pagebreak objects after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle:
        if desc.getvalue(self.ns['dcterms'].title) != shorttitle:
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

    # Process body: add good metadata
    citparser = self.make_citation_parser()
    doc.body = citparser.parse_recursive(doc.body)
    PreambleSection.counter = 0
    # self.decorate_bodyparts(doc.body,doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True