def parse_metadata_from_soup(self, soup, doc):
    """Extract metadata (title, identifier, abstract, issue date,
    editors, publisher) from the parsed W3C recommendation page in
    *soup* and record it on ``doc.meta``.

    Raises KeyError (via ``d.getvalue``) if a required property
    (dcterms:title, dcterms:issued) was not found exactly once.
    """
    doc.lang = self.lang
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    dcterms = self.ns['dcterms']
    # dcterms:title
    d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
    d.value(dcterms.identifier, doc.basefile)
    # dcterms:abstract -- NOTE: BeautifulSoup's keyword for matching
    # the CSS class attribute is class_, not _class (which would look
    # for an attribute literally named "_class" and never match)
    abstract = soup.find(class_="abstract")
    if abstract:
        d.value(dcterms['abstract'], abstract.string, lang=doc.lang)
    # dcterms:issued -- find a header like
    # "W3C Recommendation, 17 December 1996" and parse the date
    datehdr = soup.find(lambda x: x.name in ('h2', 'h3') and
                        re.search(r"W3C\s+Recommendation,?\s+", x.text))
    if datehdr:
        datestr = " ".join(datehdr.text.split())
        m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
        if not m:
            self.log.warning("%s: Couldn't parse datestr %s" %
                             (doc.basefile, datestr))
        else:
            datestr = " ".join(m.groups())
            date = None
            # the month may be spelled out ("December", %B) or
            # abbreviated ("Dec", %b)
            for fmt in ("%d %B %Y", "%d %b %Y"):
                try:
                    date = util.strptime(datestr, fmt).date()
                    break
                except ValueError:
                    continue
            if date:
                d.value(dcterms.issued, date)
            else:
                self.log.warning("%s: Could not parse datestr %s" %
                                 (doc.basefile, datestr))
    # dcterms:editor -- each <dd> following the "Editors:" <dt> holds
    # one editor as "Name, affiliation"; email-address fragments
    # (containing "@") are dropped
    editors = soup.find("dt", text=re.compile("Editors?:"))
    if editors:
        for editor in editors.find_next_siblings("dd"):
            editor_string = " ".join(x for x in editor.stripped_strings
                                     if "@" not in x)
            editor_name = editor_string.split(", ")[0]
            d.value(dcterms.editor, editor_name)
    # dcterms:publisher
    d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")
    # assure we got exactly one of each of the required properties
    for required in (dcterms.title, dcterms.issued):
        d.getvalue(required)  # throws KeyError if not found (or more than one)
def parse_metadata_from_soup(self, soup, doc):
    """Populate doc.meta with title, abstract, issue date and editors
    scraped from the parsed HTML document in *soup*."""
    from rdflib import Namespace
    from ferenda import Describer
    from ferenda import util
    import re
    DCT = Namespace("http://purl.org/dc/terms/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    desc = Describer(doc.meta, doc.uri)
    desc.rdftype(FOAF.Document)
    desc.value(DCT.title, soup.find("title").text, lang=doc.lang)
    desc.value(DCT.abstract, soup.find(True, "abstract"), lang=doc.lang)
    # The issued date is assumed to be the first node whose text looks
    # like a date on the form "22 August 2013".
    date_pattern = re.compile(r'(\d+ \w+ \d{4})')
    date_node = soup.find(text=date_pattern)
    issued = util.strptime(date_pattern.search(date_node).group(1),
                           "%d %B %Y")
    desc.value(DCT.issued, issued)
    # Every <dd> sibling after the "Editors:" <dt> names one editor as
    # "Name, affiliation" -- keep just the name.
    editor_label = soup.find("dt", text=re.compile("Editors?:"))
    for dd in editor_label.find_next_siblings("dd"):
        name = dd.text.strip().split(", ")[0]
        desc.value(DCT.editor, name)
def parse_metadata_from_soup(self, soup, doc):
    """Read title, abstract, issue date and editor names out of the
    parsed page *soup* and attach them to doc.meta."""
    from rdflib import Namespace
    from ferenda import Describer
    from ferenda import util
    import re
    DCTERMS = Namespace("http://purl.org/dc/terms/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    d = Describer(doc.meta, doc.uri)
    d.rdftype(FOAF.Document)
    d.value(DCTERMS.title, soup.find("title").text, lang=doc.lang)
    d.value(DCTERMS.abstract, soup.find(True, "abstract"), lang=doc.lang)
    # Take the first text node matching a "22 August 2013"-style date
    # as the publication date.
    re_date = re.compile(r'(\d+ \w+ \d{4})')
    datestr = re_date.search(soup.find(text=re_date)).group(1)
    d.value(DCTERMS.issued, util.strptime(datestr, "%d %B %Y"))
    # Each editor is listed in a <dd> following the "Editors:" <dt>,
    # as "Name, affiliation" -- the part before the first ", " is the
    # name.
    dt = soup.find("dt", text=re.compile("Editors?:"))
    for dd in dt.find_next_siblings("dd"):
        d.value(DCTERMS.editor, dd.text.strip().partition(", ")[0])
def extract_metadata(self, soup, basefile):
    """Return the base metadata dict for *basefile*, augmented with the
    document title and publication date read from *soup*."""
    attribs = self.metadata_from_basefile(basefile)
    dokument = soup.dokument
    attribs["dcterms:title"] = dokument.titel.text
    # <publicerad> carries a full timestamp; only the date part is kept
    published = util.strptime(dokument.publicerad.text,
                              "%Y-%m-%d %H:%M:%S")
    attribs["dcterms:issued"] = published.date()
    return attribs
def extract_metadata(self, soup, basefile):
    """Build the metadata dict for *basefile*: the base attributes plus
    title and issue date taken from the parsed document *soup*."""
    meta = self.metadata_from_basefile(basefile)
    # <publicerad> is a full timestamp; store just the date
    issued = util.strptime(soup.dokument.publicerad.text,
                           "%Y-%m-%d %H:%M:%S").date()
    meta.update({
        "dcterms:title": soup.dokument.titel.text,
        "dcterms:issued": issued,
    })
    return meta