Beispiel #1
0
    def parse_metadata_from_soup(self, soup, doc):
        """Extract metadata from the parsed HTML and record it on ``doc.meta``.

        Sets ``doc.lang`` from ``self.lang`` and then, through a ``Describer``
        rooted at ``doc.uri``, records the rdf:type, prov:wasGeneratedBy,
        dcterms:title, dcterms:identifier and dcterms:publisher, plus --
        when they can be located in ``soup`` -- dcterms:abstract,
        dcterms:issued and dcterms:editor.

        :param soup: the parsed document (presumably a BeautifulSoup tree;
                     only ``find`` and tag attributes are used here).
        :param doc: the document being built; ``doc.lang`` is assigned and
                    ``doc.meta``, ``doc.uri`` and ``doc.basefile`` are read.
        :raises KeyError: from the final ``getvalue`` check, if the metadata
                          does not hold exactly one dcterms:title and exactly
                          one dcterms:issued value.
        """
        doc.lang = self.lang
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        dcterms = self.ns['dcterms']

        # dcterms:title
        d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
        d.value(dcterms.identifier, doc.basefile)

        # dcterms:abstract -- the keyword for filtering on the CSS class is
        # ``class_``; ``_class`` is treated as an ordinary attribute name and
        # would never match an element with class="abstract".
        abstract = soup.find(class_="abstract")
        # .string can be None (e.g. when the element has several children);
        # skip the property then instead of handing None to the describer.
        if abstract is not None and abstract.string is not None:
            d.value(dcterms['abstract'], abstract.string, lang=doc.lang)

        # dcterms:issued -- taken from the h2/h3 heading that mentions
        # "W3C Recommendation" followed by a date.
        datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                            and re.search(r"W3C\s+Recommendation,?\s+", x.text))
        if datehdr:
            # collapse all runs of whitespace before matching
            datestr = " ".join(datehdr.text.split())
            m = re.search(r"(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
            if not m:
                self.log.warning("%s: Couldn't parse datestr %s" %
                                 (doc.basefile, datestr))
            else:
                # normalise to "<day> <month> <year>"
                datestr = " ".join(m.groups())
                date = None
                # full month name first ("17 December 1996"), then the
                # abbreviated form ("17 Dec 1996")
                for fmt in ("%d %B %Y", "%d %b %Y"):
                    try:
                        date = util.strptime(datestr, fmt).date()
                    except ValueError:
                        continue
                    break
                if date is None:
                    self.log.warning("%s: Could not parse datestr %s" %
                                     (doc.basefile, datestr))
                else:
                    d.value(dcterms.issued, date)

        # dcterms:editor -- one value per <dd> following the "Editor(s):" <dt>
        editors = soup.find("dt", text=re.compile("Editors?:"))
        if editors:
            for editor in editors.find_next_siblings("dd"):
                # drop any text fragment that looks like an e-mail address
                editor_string = " ".join(x for x in editor.stripped_strings
                                         if "@" not in x)
                # keep only what precedes the first ", "
                editor_name = editor_string.split(", ")[0]
                d.value(dcterms.editor, editor_name)

        # dcterms:publisher
        d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

        # assure we got exactly one of each of the required properties
        for required in (dcterms.title, dcterms.issued):
            d.getvalue(required)  # throws KeyError if not found (or more than one)
Beispiel #2
0
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF.

        Reads the downloaded text for ``doc.basefile``, runs it through the
        parser returned by ``self.get_parser`` and stores the result in
        ``doc.body``; then fills in ``doc.uri``, ``doc.lang`` and the RDF
        metadata in ``doc.meta``.
        """
        source_path = self.store.downloaded_path(doc.basefile)
        reader = TextReader(source_path, linesep=TextReader.UNIX)

        # Strip the faux-bold formatting that some RFCs use: a character
        # immediately followed by a backspace control character. The '\b'
        # below is a literal backspace, NOT the regex word boundary r'\b'.
        paragraphs = (re.sub('.\b', '', para)
                      for para in reader.getiterator(reader.readparagraph))

        fsm = self.get_parser(doc.basefile)
        # the environment variable can switch debugging on, never off
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        fsm.debug = self.config.fsmdebug
        doc.body = fsm.parse(paragraphs)

        # The first two body elements are taken off the body: the header
        # and the document title (whitespace-normalised).
        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        raw_title = doc.body.pop(0)  # body.findByClass(DocHeader)
        title = " ".join(raw_title.split())

        # Drop the first "Table of Contents" preamble section, if any.
        toc = next((section for section in doc.body
                    if isinstance(section, PreambleSection)
                    and section.title == "Table of Contents"), None)
        if toc is not None:
            doc.body.remove(toc)

        # Create the RDF metadata. The provided basefile may be incorrect,
        # so whatever number the header yields takes precedence; fall back
        # to the basefile when the header gives nothing (eg RFC 100).
        rfc_id = self.get_rfc_num(header) or doc.basefile
        doc.uri = self.canonical_uri(rfc_id)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.ns['rfc'].RFC)
        dct = self.ns['dct']
        desc.value(dct.title, title, lang="en")
        self.parse_header(header, desc)
        if not desc.getvalues(dct.identifier):
            desc.value(dct.identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # Remove the temporary Pagebreak objects from the body, keeping the
        # short title that was found in them.
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            if desc.getvalue(dct.title) != shorttitle:
                desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # Run the citation parser over the whole body.
        doc.body = self.make_citation_parser().parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
Beispiel #3
0
class TestDescriber(unittest.TestCase):
    """Checks the read accessors of Describer against a small fixture graph."""

    def setUp(self):
        # Fixture: one foaf:Document with a single title, two identifiers,
        # one issued date, one reference and two subjects.
        turtle = """
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """
        self.graph = Graph()
        self.graph.parse(data=turtle, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        # absent property -> empty list; present ones -> all their values
        getvalues = self.desc.getvalues
        self.assertEqual(getvalues(DCTERMS.alternate), [])
        self.assertEqual(getvalues(DCTERMS.title), ["Hello world"])
        self.assertEqual(set(getvalues(DCTERMS.identifier)), {"ID1", "ID2"})

    def test_getvalue(self):
        getvalue = self.desc.getvalue
        self.assertEqual(getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(getvalue(DCTERMS.issued), datetime.date(2013, 10, 11))
        # KeyError both when there is no value and when there are several
        for prop in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                getvalue(prop)

    def test_getrels(self):
        getrels = self.desc.getrels
        self.assertEqual(getrels(DCTERMS.replaces), [])
        self.assertEqual(getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        expected_subjects = {"http://example.org/concept1",
                             "http://example.org/concept2"}
        self.assertEqual(set(getrels(DCTERMS.subject)), expected_subjects)

    def test_getrel(self):
        getrel = self.desc.getrel
        self.assertEqual(getrel(DCTERMS.references), "http://example.org/doc2")
        # KeyError both when there is no relation and when there are several
        for prop in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                getrel(prop)

    def test_getrdftype(self):
        rdftype = self.desc.getrdftype()
        self.assertEqual(rdftype, "http://xmlns.com/foaf/0.1/Document")
Beispiel #4
0
class TestDescriber(unittest.TestCase):
    """Unit tests for the getter methods of Describer."""

    def setUp(self):
        # The fixture describes <http://example.org/doc>: one title, two
        # identifiers, one issued date, one reference and two subjects.
        fixture = """
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """
        self.graph = Graph()
        self.graph.parse(data=fixture, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        missing = self.desc.getvalues(DCTERMS.alternate)
        self.assertEqual(missing, [])
        titles = self.desc.getvalues(DCTERMS.title)
        self.assertEqual(titles, ["Hello world"])
        # order of multiple values is not asserted, hence the set comparison
        identifiers = self.desc.getvalues(DCTERMS.identifier)
        self.assertEqual(set(identifiers), {"ID1", "ID2"})

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        # zero values and more than one value must both raise KeyError
        for prop in (DCTERMS.alternate, DCTERMS.identifier):
            with self.assertRaises(KeyError):
                self.desc.getvalue(prop)

    def test_getrels(self):
        missing = self.desc.getrels(DCTERMS.replaces)
        self.assertEqual(missing, [])
        references = self.desc.getrels(DCTERMS.references)
        self.assertEqual(references, ["http://example.org/doc2"])
        # order of multiple relations is not asserted, hence the set comparison
        subjects = self.desc.getrels(DCTERMS.subject)
        self.assertEqual(set(subjects),
                         {"http://example.org/concept1",
                          "http://example.org/concept2"})

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        # zero relations and more than one relation must both raise KeyError
        for prop in (DCTERMS.replaces, DCTERMS.subject):
            with self.assertRaises(KeyError):
                self.desc.getrel(prop)

    def test_getrdftype(self):
        expected = "http://xmlns.com/foaf/0.1/Document"
        self.assertEqual(self.desc.getrdftype(), expected)
Beispiel #5
0
 def selector(entry):
     """Return whether the entry's single dct:subject equals ``category``.

     The subject is read from the graph parsed from the entry's distilled
     file; ``self`` and ``category`` come from the enclosing scope.
     """
     distilled = Graph()
     distilled.parse(self.store.distilled_path(entry.basefile))
     subject = Describer(distilled, entry.id).getvalue(self.ns['dct'].subject)
     return subject == category
Beispiel #6
0
 def selector(entry):
     """Return whether the entry's single dct:subject equals ``category``.

     The distilled data for the entry is read through the store and parsed
     into a fresh graph; ``self`` and ``category`` come from the enclosing
     scope.
     """
     distilled = Graph()
     with self.store.open_distilled(entry.basefile) as stream:
         distilled.parse(data=stream.read())
     subject = Describer(distilled, entry.id).getvalue(self.ns['dct'].subject)
     return subject == category
Beispiel #7
0
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF.

        Reads the downloaded text for ``doc.basefile``, runs it through the
        parser returned by ``self.get_parser`` and stores the result in
        ``doc.body``; then fills in ``doc.uri``, ``doc.lang`` and the RDF
        metadata in ``doc.meta``. Always returns True.
        """
        source_path = self.store.downloaded_path(doc.basefile)
        reader = TextReader(source_path, linesep=TextReader.UNIX)

        # Strip the faux-bold formatting that some RFCs use: a character
        # immediately followed by a backspace control character. The '\b'
        # below is a literal backspace, NOT the regex word boundary r'\b'.
        paragraphs = (re.sub('.\b', '', para)
                      for para in reader.getiterator(reader.readparagraph))

        fsm = self.get_parser(doc.basefile)
        # the environment variable can switch debugging on, never off
        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        fsm.debug = self.config.fsmdebug
        doc.body = fsm.parse(paragraphs)

        # The first two body elements are taken off the body: the header
        # and the document title (whitespace-normalised).
        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        raw_title = doc.body.pop(0)  # body.findByClass(DocHeader)
        title = " ".join(raw_title.split())

        # Drop the first "Table of Contents" preamble section, if any.
        toc = next((section for section in doc.body
                    if isinstance(section, PreambleSection)
                    and section.title == "Table of Contents"), None)
        if toc is not None:
            doc.body.remove(toc)

        # Create the RDF metadata. The provided basefile may be incorrect,
        # so whatever number the header yields takes precedence; fall back
        # to the basefile when the header gives nothing (eg RFC 100).
        rfc_id = self.get_rfc_num(header) or doc.basefile
        doc.uri = self.canonical_uri(rfc_id)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        dcterms = self.ns['dcterms']
        desc.value(dcterms.title, title, lang="en")
        self.parse_header(header, desc)

        # parse_header might have set the rdf:type already; only supply the
        # default when asking for it fails.
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        if not desc.getvalues(dcterms.identifier):
            desc.value(dcterms.identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # Remove the temporary Pagebreak objects from the body, keeping the
        # short title that was found in them.
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle:
            if desc.getvalue(dcterms.title) != shorttitle:
                desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # Run the citation parser over the whole body.
        doc.body = self.make_citation_parser().parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True