Example #1
    def parse(self, doc):
        doc.uri = self.canonical_uri(doc.basefile)
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        self.infer_triples(d, doc.basefile)

        # prefer PDF or Word files over the plaintext-containing HTML files
        # FIXME: PDF or Word files are now stored as attachments

        pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')

        wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                     self.generic_path(doc.basefile, 'downloaded', '.docx'),
                     self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                     self.generic_path(doc.basefile, 'downloaded', '.rtf'))
        wordfile = None
        for f in wordfiles:
            if os.path.exists(f):
                wordfile = f

        # if we lack a .pdf file, use Open/LibreOffice to convert any
        # available Word-type file (.doc, .docx, .wpd or .rtf) to .pdf first
        if wordfile and not os.path.exists(pdffile):
            intermediate_pdf = self.generic_path(
                doc.basefile, "intermediate", ".pdf")
            if not os.path.exists(intermediate_pdf):
                cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (self.config.get('soffice', 'soffice'),
                                                                             os.path.dirname(
                                                                                 intermediate_pdf),
                                                                             wordfile)
                self.log.debug(
                    "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
                (ret, stdout, stderr) = util.runcmd(
                    cmdline, require_success=True)
            pdffile = intermediate_pdf

        if os.path.exists(pdffile):
            self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
            intermediate_dir = os.path.dirname(
                self.generic_path(doc.basefile, 'intermediate', '.foo'))
            self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
            pdfreader = PDFReader()
            pdfreader.read(pdffile, intermediate_dir)
            self.parse_from_pdfreader(pdfreader, doc)
        else:
            downloaded_path = self.downloaded_path(doc.basefile)
            intermediate_path = self.generic_path(
                doc.basefile, 'intermediate', '.txt')
            self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                           downloaded_path, intermediate_path))
            if not os.path.exists(intermediate_path):
                html = codecs.open(
                    downloaded_path, encoding="iso-8859-1").read()
                util.writefile(intermediate_path, util.extract_text(
                    html, '<pre>', '</pre>'), encoding="utf-8")
            textreader = TextReader(intermediate_path, encoding="utf-8")
            self.parse_from_textreader(textreader, doc)
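All of these examples follow the same Describer pattern: bind a describer to a graph and a subject URI, then add triples with rdftype(), value() and rel(). As a point of reference, here is a minimal, self-contained sketch in plain rdflib (the URIs and the class-name literal are made up for illustration) of what the three calls at the top of parse() boil down to:

from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF

PROV = Namespace("http://www.w3.org/ns/prov#")

g = Graph()
doc_uri = URIRef("http://example.org/doc/1")           # stands in for doc.uri
rdf_type = URIRef("http://example.org/ontology#Doc")   # stands in for self.rdf_type
g.add((doc_uri, RDF.type, rdf_type))                   # what d.rdftype(...) adds
g.add((doc_uri, PROV.wasGeneratedBy, Literal("mymodule.MyRepo")))  # what d.value(...) adds
print(g.serialize(format="turtle"))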
Example #2
 def infer_metadata(self, resource, basefile):
     super(InferTimes, self).infer_metadata(resource, basefile)
     desc = Describer(resource.graph, resource.identifier)
     de = DocumentEntry(self.store.documententry_path(basefile))
     if de.orig_updated:
         desc.value(RINFOEX.senastHamtad, de.orig_updated)
     if de.orig_checked:
         desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
Example #3
    def parse_from_pdfreader(self, pdfreader, doc):
        doc.body = Body([pdfreader])

        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())

        return doc
Example #4
    def setUp(self):
        super(News, self).setUp()
        self.faceted_data = []
        # create a bunch of DocumentEntry objects and save them
        basetime = datetime(2013, 1, 1, 12, 0)
        for basefile in range(25):
            v = {'id':self.repo.canonical_uri(basefile),
                 'title':"Doc #%s" % basefile}
            self.faceted_data.append({'uri': v['id'],
                                      'dcterms_title': v['title'],
                                      'rdf_type': 'http://xmlns.com/foaf/0.1/Document'})
            de = DocumentEntry()
            de.orig_created = basetime + timedelta(hours=basefile)
            de.orig_updated = basetime + timedelta(hours=basefile, minutes=10)
            de.orig_checked = basetime + timedelta(hours=basefile, minutes=20)
            de.published    = basetime + timedelta(hours=basefile, minutes=30)
            de.updated      = basetime + timedelta(hours=basefile, minutes=40)
            de.orig_url     = "http://source.example.org/doc/%s" % basefile
            de.title        = v['title']
            de.save(self.repo.store.documententry_path(str(basefile)))

            g = rdflib.Graph()
            desc = Describer(g, self.repo.canonical_uri(basefile))
            dcterms = self.repo.ns['dcterms']
            desc.rdftype(self.repo.ns['foaf'].Document)
            desc.value(dcterms.title, "Invalid title")
            util.ensure_dir(self.repo.store.distilled_path(str(basefile)))
            with open(self.repo.store.distilled_path(str(basefile)), "wb") as fp:
                g.serialize(fp, format="pretty-xml")

            util.ensure_dir(self.repo.store.parsed_path(str(basefile)))
            with open(self.repo.store.parsed_path(str(basefile)), "w") as fp:
                fp.write("""<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN" "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:dcterms="http://purl.org/dc/terms/" xml:lang="en">
  <head about="%(id)s">
    <title>%(title)s</title>
  </head>
  <body about="%(id)s">
    <h1>%(title)s</h1>
  </body>
</html>""" % v)

            util.ensure_dir(self.repo.store.generated_path(str(basefile)))
            with open(self.repo.store.generated_path(str(basefile)), "w") as fp:
                fp.write("""<!DOCTYPE html>
<html>
  <head>
    <title>%(title)s</title>
  </head>
  <body>
    <h1>%(title)s</h1>
  </body>
</html>""" % v)
Example #5
 def parse(self, doc):
     # create a dummy txt
     d = Describer(doc.meta, doc.uri)
     d.rdftype(self.rdf_type)
     d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang))
     d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
     doc.body = Body()  # can be empty, all content in doc.meta
     return True
Example #6
    def parse_document_from_soup(self, soup, doc):
        from ferenda.elements import Page
        from ferenda import Describer
        part = Page(["This is a part of a document"],
                    ordinal=42,
                    uri="http://example.org/doc#42",
                    meta=self.make_graph())
        d = Describer(part.meta, part.uri)
        d.rdftype(self.ns['bibo'].DocumentPart)
        # the dcterms:identifier for a document part is often whatever
        # would be the preferred way to cite that part in another
        # document
        d.value(self.ns['dcterms'].identifier, "Doc:4711, p 42")
# end part
        from lxml import etree
        return etree.tostring(part.as_xhtml("http://example.org/doc"))
Example #7
 def parse_from_textreader(self, textreader, doc):
     describer = Describer(doc.meta, doc.uri)
     for p in textreader.getiterator(textreader.readparagraph):
         # print "Handing %r (%s)" % (p[:40], len(doc.body))
         if not p.strip():
             continue
         elif not doc.body and 'Obs! Dokumenten i denna databas kan vara ofullständiga.' in p:
             continue
         elif not doc.body and p.strip().startswith("Dokument:"):
             # We already know this
             continue
         elif not doc.body and p.strip().startswith("Titel:"):
             describer.value(
                 self.ns['dct'].title, util.normalize_space(p[7:]))
         else:
             doc.body.append(Preformatted([p]))
Example #8
    def make_meta(self, chunk, meta, uri, basefile):
        d = Describer(meta, uri)
        dct = self.ns['dct']
        prov = self.ns['prov']
        owl = self.ns['owl']
        rpubl = RPUBL

        d.rdftype(self.rdf_type)
        d.value(prov.wasGeneratedBy, self.qualified_class_name())

        # predicates maps key strings to corresponding RDFLib terms,
        # e.g. "Rubrik" -> dct:title
        predicates = {'Dir nr': dct.identifier,
                      'Departement': rpubl.departement,
                      'Beslut vid regeringssammanträde':
                      rpubl.beslutsdatum,
                      'Rubrik': dct.title,
                      'Senast ändrad': dct.changed
                      }
        # munger maps the same keys to tuples, where the first item is
        # a method for converting plain text into the appropriate
        # RDFLib value, e.g:
        # - "Utredning av foo" => Literal("Utredning av foo",lang="sv")
        # - "1987-02-19" => datetime(1987,2,19)
        # - "Arbetsdepartementet" => URIRef("http://lagen.nu/terms/arbdep")
        # The second item is the Describer method that
        # should be used to add the value to the graph, i.e. .value
        # for Literals and .rel for URIRefs
        munger = {'Dir nr': (self.sanitize_identifier, d.value),
                  'Departement': (functools.partial(self.lookup_resource, warn=False), d.rel),
                  'Beslut vid regeringssammanträde': (self.parse_iso_date, d.value),
                  'Rubrik': (self.sanitize_rubrik, d.value),
                  'Senast ändrad': (self.parse_iso_date, d.value)
                  }

        # header_lines wraps a TextReader in an iterator that parses
        # "key:value\n" lines with support for line continuation, e.g.
        # "long\nkey:long\nvalue\n"
        for (key, val) in self.header_lines(chunk):
            if not val:
                continue
            try:
                pred = predicates[key]
                (transformer, setter) = munger[key]
                setter(pred, transformer(val))
            except (KeyError, ValueError) as e:
                self.log.error(
                    "Couldn't munge value '%s' into a proper object for predicate '%s'" % (val, key))

        d.rel(dct.publisher, self.lookup_resource("Regeringskansliet"))
        d.rel(owl.sameAs, self.sameas_uri(uri))
        self.infer_triples(d, basefile)
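The predicates/munger pair above is a small table-driven dispatch: one dict resolves a header key to an RDF predicate, the other to a (transformer, setter) tuple. A stripped-down, runnable sketch of the same idea (all names and sample data here are illustrative, with print standing in for d.value/d.rel):

import datetime

def parse_iso_date(s):
    return datetime.datetime.strptime(s, "%Y-%m-%d").date()

def emit(pred, obj):  # stands in for d.value / d.rel
    print(pred, "->", repr(obj))

predicates = {'Rubrik': "dct:title",
              'Beslut vid regeringssammanträde': "rpubl:beslutsdatum"}
munger = {'Rubrik': (str.strip, emit),
          'Beslut vid regeringssammanträde': (parse_iso_date, emit)}

for key, val in [('Rubrik', ' Utredning av foo '),
                 ('Beslut vid regeringssammanträde', '1987-02-19')]:
    pred = predicates[key]
    transformer, setter = munger[key]
    setter(pred, transformer(val))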
Example #9
    def infer_metadata(self, resource, basefile):
        # remove the bogus dcterms:issued thing that we only added to
        # aid URI generation. NB: This is removed in the superclass'
        # postprocess_doc as well, because for this lagen.nu-derived
        # class it needs to be done at this point, but for use of the
        # superclass directly, it needs to be done at some point.
        for o in resource.objects(DCTERMS.issued):
            if not o.datatype:
                resource.remove(DCTERMS.issued, o)
        sameas_uri = self.sameas_minter.space.coin_uri(resource)
        resource.add(OWL.sameAs, URIRef(sameas_uri))
        resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                            OWL.sameAs, resource.identifier))
        # then find each rpubl:konsolideringsunderlag, and create
        # owl:sameAs triples for them as well
        for subresource in resource.objects(RPUBL.konsolideringsunderlag):
            # sometimes there'll be a rpubl:konsolideringsunderlag to
            # a resource URI but no actual data about that
            # resource. This seems to happen if SFST is updated but
            # SFSR is not. In those cases we can't generate a
            # owl:sameAs URI since we have no other data about the
            # resource.
            if subresource.value(RDF.type):
                uri = self.sameas_minter.space.coin_uri(subresource)
                subresource.add(OWL.sameAs, URIRef(uri))
        desc = Describer(resource.graph, resource.identifier)
        de = DocumentEntry(self.store.documententry_path(basefile))
        if de.orig_updated:
            desc.value(RINFOEX.senastHamtad, de.orig_updated)
        if de.orig_checked:
            desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
        rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))

        v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
        if v:
            desc.value(DCTERMS.alternate, v)
        v = self.commondata.value(rooturi, RDFS.label, any=True)
        if v:
            # don't include labels if they're essentially the same as
            # dcterms:title (legalref needs it to be able to parse
            # refs to laws that typically don't include SFS numbers,
            # so that's why they're in sfs.ttl)
            basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
            if not v.startswith(basetitle.lower()):
                desc.value(RDFS.label, util.ucfirst(v))
Example #10
    def setUp(self):
        self.graph = Graph()
        self.graph.parse(data="""
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """,
                         format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")
Example #11
 def parse(self, doc):
     # create a dummy txt
     d = Describer(doc.meta, doc.uri)
     d.rdftype(self.rdf_type)
     d.value(self.ns['dcterms'].title, Literal(doc.basefile, lang=doc.lang))
     d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
     doc.body = Body()  # can be empty, all content in doc.meta
     self.parse_entry_update(doc)
     return True
Example #12
    def polish_metadata(self, head, basefile, infer_nodes=True):

        # where do we get refdesc, domdesc?
        coin_uri = self.sameas_minter.space.coin_uri
        resource = super(DV, self).polish_metadata(head, basefile)
        refuri = resource.identifier
        if 'rinfoex:patchdescription' in head:
            resource.add(RINFOEX.patchdescription,
                         Literal(head['rinfoex:patchdescription'], lang="sv"))
        refuri_sameas = coin_uri(resource)
        resource.graph.add((URIRef(refuri), OWL.sameAs, URIRef(refuri_sameas)))
        # NB: In theory, we have all the data we need to generate a
        # canonical URI for the dom. In practice, this data does not
        # meet the requirements of our URISpace templates in certain
        # cases (all MD verdicts use rpubl:domsnummer instead of
        # rpubl:malnummer, which is what the template expects). The
        # superclass' definition of polish_metadata gets around this
        # by creating a minimal graph from the plain dict in head and
        # feeding that to coin_uri. So we do the same here, instead of
        # the very simple:
        #
        #    domuri_sameas = coin_uri(resource.value(RPUBL.referatAvDomstolsavgorande))
        #
        # (also, this version handles the uncommon but valid case
        # where one referat concerns multiple dom:s)
        domuri = resource.value(RPUBL.referatAvDomstolsavgorande).identifier
        for malnummer in head['_localid']:
            bnodetmp = BNode()
            gtmp = Graph()
            gtmp.bind("rpubl", RPUBL)
            gtmp.bind("dcterms", DCTERMS)
            dtmp = Describer(gtmp, bnodetmp)
            dtmp.rdftype(RPUBL.VagledandeDomstolsavgorande)
            dtmp.value(RPUBL.malnummer, malnummer)
            dtmp.value(RPUBL.avgorandedatum, head['Avgörandedatum'])
            dtmp.rel(DCTERMS.publisher, self.lookup_resource(head["Domstol"]))
            rtmp = dtmp.graph.resource(bnodetmp)
            domuri_sameas = coin_uri(rtmp)
            resource.graph.add(
                (URIRef(domuri), OWL.sameAs, URIRef(domuri_sameas)))
        return resource
Example #13
 def decorate_bodyparts(self, part, baseuri):
     if isinstance(part, str):
         return
     if isinstance(part, (Section, Subsection, Subsubsection)):
         # print("Decorating %s %s" % (part.__class__.__name__,part.ordinal))
         part.uri = "%s#S%s" % (baseuri, part.ordinal)
         part.meta = self.make_graph()
         desc = Describer(part.meta, part.uri)
         desc.rdftype(self.ns['bibo'].DocumentPart)
         desc.value(self.ns['dcterms'].title, Literal(part.title, lang="en"))
         desc.value(self.ns['bibo'].chapter, part.ordinal)
         # desc.value(self.ns['dcterms'].isPartOf, part.parent.uri) # implied
     for subpart in part:
         self.decorate_bodyparts(subpart, baseuri)
Example #16
 def metadata_from_basefile(self, doc):
     desc = Describer(doc.meta, doc.uri)
     desc.rel(CDM.resource_legal_id_celex, Literal(doc.basefile))
      # the sixth character of the CELEX number encodes the document type
     rdftype = {"R": CDM.regulation,
                "L": CDM.directive,
                "C": CDM.decision_cjeu}[doc.basefile[5]]
     desc.rel(RDF.type, rdftype)
     return doc.meta
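For reference, the sixth character of a CELEX number is the letter the dict above keys on. A quick illustration with a well-known CELEX number (plain strings stand in for the CDM terms used above):

basefile = "32016R0679"   # the GDPR; basefile[5] == "R"
rdftype = {"R": "regulation",
           "L": "directive",
           "C": "decision"}[basefile[5]]
print(rdftype)   # -> regulation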
Example #20
 def infer_metadata(self, resource, basefile):
     # remove the bogus dcterms:issued thing that we only added to
     # aid URI generation. NB: This is removed in the superclass'
     # postprocess_doc as well, because for this lagen.nu-derived
     # class it needs to be done at this point, but for use of the
     # superclass directly, it needs to be done at some point.
     for o in resource.objects(DCTERMS.issued):
         if not o.datatype:
             resource.remove(DCTERMS.issued, o)
     sameas_uri = self.sameas_minter.space.coin_uri(resource)
     resource.add(OWL.sameAs, URIRef(sameas_uri))
     resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                         OWL.sameAs, resource.identifier))
      # then find each rpubl:konsolideringsunderlag, and create
      # owl:sameAs triples for them as well
     for subresource in resource.objects(RPUBL.konsolideringsunderlag):
         # sometimes there'll be a rpubl:konsolideringsunderlag to
         # a resource URI but no actual data about that
         # resource. This seems to happen if SFST is updated but
         # SFSR is not. In those cases we can't generate a
         # owl:sameAs URI since we have no other data about the
         # resource.
         if subresource.value(RDF.type):
             uri = self.sameas_minter.space.coin_uri(subresource)
             subresource.add(OWL.sameAs, URIRef(uri))
     desc = Describer(resource.graph, resource.identifier)
     de = DocumentEntry(self.store.documententry_path(basefile))
     if de.orig_updated:
         desc.value(RINFOEX.senastHamtad, de.orig_updated)
     if de.orig_checked:
         desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
     v = self.commondata.value(resource.identifier,
                               DCTERMS.alternate,
                               any=True)
     if v:
         desc.value(DCTERMS.alternate, v)
Example #21
class TestDescriber(unittest.TestCase):
    def setUp(self):
        self.graph = Graph()
        self.graph.parse(data="""
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """, format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        self.assertEqual(self.desc.getvalues(DCTERMS.alternate),
                         [])
        self.assertEqual(self.desc.getvalues(DCTERMS.title),
                         ["Hello world"])
        self.assertEqual(set(self.desc.getvalues(DCTERMS.identifier)),
                         set(["ID1", "ID2"]))

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title),
                         "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013,10,11))
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.alternate)
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.identifier)

    def test_getrels(self):
        self.assertEqual(self.desc.getrels(DCTERMS.replaces),
                         [])
        self.assertEqual(self.desc.getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        self.assertEqual(set(self.desc.getrels(DCTERMS.subject)),
                         set(["http://example.org/concept1",
                              "http://example.org/concept2"]))

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.replaces)
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.subject)
            
    def test_getrdftype(self):
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
Example #22
    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            r"(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string of the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()  # skip the header block we already handled
        reader.readparagraph()  # skip the title, also already handled
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three elements
                stack[-1].append(s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
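The citation grammar defined between the citation1/citation2 markers can be exercised on its own. A minimal sketch (the sample citation string is invented):

from pyparsing import Word, CaselessLiteral, nums

section_citation = (CaselessLiteral("section") +
                    Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                "]").setResultsName("RFCRef")
section_rfc_citation = (section_citation + "of" +
                        rfc_citation).setResultsName("SecRFCRef")

parts = section_rfc_citation.parseString("section 5.3 of [RFC2616]")
print(parts["Sec"], parts["RFC"])   # -> 5.3 2616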
Example #23
    def parse_metadata_from_soup(self, soup, doc):
        doc.lang = self.lang
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        dcterms = self.ns['dcterms']

        # dcterms:title
        d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
        d.value(dcterms.identifier, doc.basefile)
        # dcterms:abstract
        abstract = soup.find(class_="abstract")
        if abstract:
            d.value(dcterms['abstract'], abstract.string, lang=doc.lang)

        # dcterms:published
        datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                            and re.search("W3C\s+Recommendation,?\s+", x.text))
        if datehdr:
            datestr = " ".join(datehdr.text.split())
            m = re.search("(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
            if not m:
                self.log.warning("%s: Couldn't parse datestr %s" %
                                 (doc.basefile, datestr))
            else:
                datestr = " ".join(m.groups())
                date = None
                try:
                    # 17 December 1996
                    date = util.strptime(datestr, "%d %B %Y").date()
                except ValueError:
                    try:
                        # 17 Dec 1996
                        date = util.strptime(datestr, "%d %b %Y").date()
                    except ValueError:
                        self.log.warning("%s: Could not parse datestr %s" %
                                         (doc.basefile, datestr))
                if date:
                    d.value(dcterms.issued, date)

        # dcterms:editor
        editors = soup.find("dt", text=re.compile("Editors?:"))
        if editors:
            for editor in editors.find_next_siblings("dd"):
                editor_string = " ".join(x for x in editor.stripped_strings if not "@" in x)
                editor_name = editor_string.split(", ")[0]
                d.value(dcterms.editor, editor_name)

        # dcterms:publisher
        d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

        # assure we got exactly one of each of the required properties
        for required in (dcterms.title, dcterms.issued):
            d.getvalue(required)  # throws KeyError if not found (or more than one)
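The nested try/except above is a simple format-fallback chain; the same logic as a standalone helper, using datetime.strptime directly (assuming util.strptime is a thin wrapper around it; note that %B/%b month names are locale-dependent):

from datetime import datetime

def parse_w3c_date(datestr):
    # try the long month name first ("17 December 1996"),
    # then the abbreviated one ("17 Dec 1996")
    for fmt in ("%d %B %Y", "%d %b %Y"):
        try:
            return datetime.strptime(datestr, fmt).date()
        except ValueError:
            continue
    return None

print(parse_w3c_date("17 December 1996"))   # -> 1996-12-17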
Example #24
    def polish_metadata(self, head, doc):
        basefile_regex = re.compile(r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

        def basefile_to_referat(basefile):
            templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'MD': 'MD %(year)s:%(ordinal)s'}
            m = basefile_regex.match(basefile)
            if m:
                return templ[m.group("type")] % (m.groupdict())

        def ref_to_uri(ref):
            # FIXME: We'd like to retire legalref and replace it with
            # pyparsing grammars.
            nodes = self.rattsfall_parser.parse(ref)
            uri = nodes[0].uri
            return localize_uri(uri)

        def dom_to_uri(domstol, malnr, avg):
            baseuri = self.config.url
            slug = self.slugs[domstol]
            return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

        def localize_uri(uri):
            if "publ/rattsfall" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                                   self.config.url + "res/dv")
            elif "publ/sfs/" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                                   self.config.url + "res/sfs")

        def split_nja(value):
            # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
            return [x[:-1] for x in value.split("(")]

        def sokord_uri(value):
            return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

        # 0. create Referat key if not present
        if "Referat" not in head:
            # For some courts (MD, AD, MOD?, MIG?) this is possible
            head["Referat"] = basefile_to_referat(doc.basefile)

        # 1. mint uris and create the two Describers we'll use
        refuri = ref_to_uri(head["Referat"])
        refdesc = Describer(doc.meta, refuri)
        domuri = dom_to_uri(head["Domstol"],
                            head["Målnummer"],
                            head["Avgörandedatum"])
        domdesc = Describer(doc.meta, domuri)

        # 2. convert all strings in head to proper RDF
        for label, value in head.items():
            if label == "Rubrik":
                value = util.normalize_space(value)
                refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
                domdesc.value(self.ns['dct'].title, value, lang="sv")

            elif label == "Domstol":
                domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
            elif label == "Målnummer":
                domdesc.rel(self.ns['rpubl'].malnummer, value)
            elif label == "Domsnummer":
                domdesc.rel(self.ns['rpubl'].domsnummer, value)
            elif label == "Diarienummer":
                domdesc.rel(self.ns['rpubl'].diarienummer, value)
            elif label == "Avdelning":
                domdesc.rel(self.ns['rpubl'].avdelning, value)
            elif label == "Referat":

                for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                    'arsutgava': r'(\d{4})',
                                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                    'sidnummer': r's.? ?(\d+)'}.items():
                    m = re.search(regex, value)
                    if m:
                        if pred == 'rattsfallspublikation':
                            # "NJA" -> "http://lcaolhost:8000/coll/dv/nja"
                            uri = self.config.url + "coll/dv/" + m.group(1).lower()
                            refdesc.rel(self.ns['rpubl'][pred], uri)
                        else:
                            refdesc.value(self.ns['rpubl'][pred], m.group(1))

                    if value.startswith("NJA"):
                        realvalue, extra = split_nja(value)
                        ordinal = extra.split(" ")[1]
                        refdesc.value(self.ns['dct'].bibliographicCitation,
                                      extra)
                        refdesc.rel(self.ns['owl'].sameAs,
                                    self.config.url + "res/dv/nja/" + ordinal)
                        refdesc.value(self.ns['dct'].identifier, realvalue)
                    else:
                        refdesc.value(self.ns['dct'].identifier, value)

            elif label == "Avgörandedatum":
                with util.c_locale():
                    d = datetime.strptime(value, '%Y-%m-%d')
                domdesc.value(self.ns['rpubl'].avgorandedatum, d)

            elif label == "Lagrum":
                for i in value:  # value should be a list, not a string
                    for node in self.lagrum_parser.parse(i):
                        if isinstance(node, Link):

                            domdesc.rel(self.ns['rpubl'].lagrum,
                                        localize_uri(node.uri))
            elif label == "Rättsfall":
                for i in value:
                    for node in self.rattsfall_parser.parse(i):
                        if isinstance(node, Link):
                            domdesc.rel(self.ns['rpubl'].rattsfall,
                                        localize_uri(node.uri))
            elif label == "Litteratur":
                for i in value.split(";"):
                    domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
            elif label == "Sökord":
                for s in self.re_delimSplit(value):
                    s = util.normalize_space(s)
                    if not s:
                        continue
                    # terms longer than 72 chars are not legitimate
                    # terms, but more likely descriptions. If a term has
                    # a " - " in it, that's probably a separator between
                    # a term and a description
                    while len(s) >= 72 and " - " in s:
                        h, s = s.split(" - ", 1)
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                    if len(s) < 72:
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

        # 3. mint some owl:sameAs URIs
        refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
        domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

        # 4. Add some same-for-everyone properties
        refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
        refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
        domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
        refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
        # 5. assert that we have everything we need

        # 6. done!
        return refuri
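A quick check of the split_nja helper in isolation, using the citation form from its own comment:

def split_nja(value):
    # "NJA 2008 s 567 (NJA 2008:86)" => ["NJA 2008 s 567", "NJA 2008:86"]
    return [x[:-1] for x in value.split("(")]

print(split_nja("NJA 2008 s 567 (NJA 2008:86)"))
# -> ['NJA 2008 s 567', 'NJA 2008:86']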
Example #25
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x) for x in
                           reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if isinstance(part, PreambleSection) and part.title == "Table of Contents":
                doc.body.remove(part)
                break

        # create (RDF) metadata for the document. Note: the provided
        # basefile may be incorrect -- let whatever is in the header
        # override it
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.rdftype(self.ns['rfc'].RFC)
        desc.value(self.ns['dct'].title, title, lang="en")
        self.parse_header(header, desc)
        if not desc.getvalues(self.ns['dct'].identifier):
            desc.value(self.ns['dct'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dct'].title) != shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
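The faux-bold cleanup is easiest to see on a small input: overstruck text interleaves each character with a backspace plus a repeat of that character, and re.sub('.\b', '', ...) strips the first character of every such pair. A self-contained demonstration (the sample string is made up):

import re

# "Hello" rendered faux-bold: every letter doubled with a backspace in between
s = "H\bHe\bel\bll\blo\bo world"
print(re.sub('.\b', '', s))   # -> Hello world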
Example #26
 def selector(entry):
     graph = Graph()
     graph.parse(self.store.distilled_path(entry.basefile))
     desc = Describer(graph, entry.id)
     return desc.getvalue(self.ns['dct'].subject) == category
Example #27
 def parse_metadata_from_soup(self, soup, doc):
     from ferenda import Describer
     from datetime import datetime
     title = "My Document title"
     authors = ["Fred Bloggs", "Joe Shmoe"]
     identifier = "Docno 2013:4711"
     pubdate = datetime(2013, 1, 6, 10, 8, 0)
     d = Describer(doc.meta, doc.uri)
     d.rdftype(self.rdf_type)
     d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
     d.value(self.ns['dcterms'].title, title, lang=doc.lang)
     d.value(self.ns['dcterms'].identifier, identifier)
     for author in authors:
         d.value(self.ns['dcterms'].author, author)
Example #28
 def parse_metadata_from_soup(self, soup, doc):
     from rdflib import Namespace
     from ferenda import Describer
     from ferenda import util
     import re
     DCT = Namespace("http://purl.org/dc/terms/")
     FOAF = Namespace("http://xmlns.com/foaf/0.1/")
     d = Describer(doc.meta, doc.uri)
     d.rdftype(FOAF.Document)
     d.value(DCT.title, soup.find("title").text, lang=doc.lang)
     d.value(DCT.abstract, soup.find(True, "abstract"), lang=doc.lang)
     # find the issued date -- assume it's the first thing that looks
      # like a date of the form "22 August 2013"
     re_date = re.compile(r'(\d+ \w+ \d{4})')
     datenode = soup.find(text=re_date)
     datestr = re_date.search(datenode).group(1)
     d.value(DCT.issued, util.strptime(datestr, "%d %B %Y"))
     editors = soup.find("dt", text=re.compile("Editors?:"))
     for editor in editors.find_next_siblings("dd"):
         editor_name = editor.text.strip().split(", ")[0]
         d.value(DCT.editor, editor_name)
Example #29
 def parse_metadata_from_soup(self, soup, doc):
     from rdflib import Namespace
     from ferenda import Describer
     from ferenda import util
     import re
     DCTERMS = Namespace("http://purl.org/dc/terms/")
     FOAF = Namespace("http://xmlns.com/foaf/0.1/")
     d = Describer(doc.meta, doc.uri)
     d.rdftype(FOAF.Document)
     d.value(DCTERMS.title, soup.find("title").text, lang=doc.lang)
     d.value(DCTERMS.abstract, soup.find(True, "abstract"), lang=doc.lang)
     # find the issued date -- assume it's the first thing that looks
      # like a date of the form "22 August 2013"
     re_date = re.compile(r'(\d+ \w+ \d{4})')
     datenode = soup.find(text=re_date)
     datestr = re_date.search(datenode).group(1)
     d.value(DCTERMS.issued, util.strptime(datestr, "%d %B %Y"))
     editors = soup.find("dt", text=re.compile("Editors?:"))
     for editor in editors.find_next_siblings("dd"):
         editor_name = editor.text.strip().split(", ")[0]
         d.value(DCTERMS.editor, editor_name)
Example #30
    def parse(self, doc):
        """Parse downloaded documents into structured XML and RDF."""

        reader = TextReader(self.store.downloaded_path(doc.basefile),
                            linesep=TextReader.UNIX)
        # Some more preprocessing: Remove the faux-bold formatting
        # used in some RFCs (using repetitions of characters
        # interleaved with backspace control sequences). Note: that
        # is '\b' as in backspace, not r'\b' as in word boundary
        # docstring = re.sub('.\b','',docstring)
        cleanparagraphs = (re.sub('.\b', '', x)
                           for x in reader.getiterator(reader.readparagraph))

        parser = self.get_parser(doc.basefile)

        if not self.config.fsmdebug:
            self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
        parser.debug = self.config.fsmdebug
        doc.body = parser.parse(cleanparagraphs)

        header = doc.body.pop(0)  # body.findByClass(RFCHeader)
        title = " ".join(
            doc.body.pop(0).split())  # body.findByClass(DocHeader)
        for part in doc.body:
            if (isinstance(part, PreambleSection) and
                    part.title == "Table of Contents"):
                doc.body.remove(part)
                break

        # create (RDF) metadata for the document. Note: the provided
        # basefile may be incorrect -- let whatever is in the header
        # override it
        realid = self.get_rfc_num(header)
        if not realid:  # eg RFC 100 -- fallback to basefile in that case
            realid = doc.basefile
        doc.uri = self.canonical_uri(realid)
        desc = Describer(doc.meta, doc.uri)
        desc.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        desc.value(self.ns['dcterms'].title, title, lang="en")
        self.parse_header(header, desc)
        # parse_header might have set .rdftype, but if not:
        try:
            desc.getrdftype()
        except KeyError:
            desc.rdftype(self.ns['rfc'].RFC)

        if not desc.getvalues(self.ns['dcterms'].identifier):
            desc.value(self.ns['dcterms'].identifier, "RFC %s" % doc.basefile)

        doc.lang = "en"

        # process body - remove the temporary Pagebreak objects, after
        # having extracted the shortTitle found in them
        shorttitle = self.cleanup_body(doc.body)
        if shorttitle and (desc.getvalue(self.ns['dcterms'].title) !=
                           shorttitle):
            desc.value(self.ns['bibo'].shortTitle, shorttitle, lang="en")

        # process body - add good metadata
        citparser = self.make_citation_parser()
        doc.body = citparser.parse_recursive(doc.body)
        PreambleSection.counter = 0
        # self.decorate_bodyparts(doc.body,doc.uri)
        if self.config.fsmdebug:
            print(serialize(doc.body))
        return True
Example #31
 def selector(entry):
     graph = Graph()
     with self.store.open_distilled(entry.basefile) as fp:
         graph.parse(data=fp.read())
     desc = Describer(graph, entry.id)
     return desc.getrel(self.ns['dcterms'].subject) == category
Exemple #33
0
    def infer_metadata(self, resource, basefile):
        # remove the bogus dcterms:issued thing that we only added to
        # aid URI generation. NB: This is removed in the superclass'
        # postprocess_doc as well, because for this lagen.nu-derived
        # class it needs to be done at this point, but for use of the
        # superclass directly, it needs to be done at some point.
        for o in resource.objects(DCTERMS.issued):
            if not o.datatype:
                resource.remove(DCTERMS.issued, o)
        sameas_uri = self.sameas_minter.space.coin_uri(resource)
        resource.add(OWL.sameAs, URIRef(sameas_uri))
        resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                            OWL.sameAs, resource.identifier))
        # then find each rpubl:konsolideringsunderlag, and create
        # owl:sameas for them as well
        for subresource in resource.objects(RPUBL.konsolideringsunderlag):
            # sometimes there'll be a rpubl:konsolideringsunderlag to
            # a resource URI but no actual data about that
            # resource. This seems to happen if SFST is updated but
            # SFSR is not. In those cases we can't generate a
            # owl:sameAs URI since we have no other data about the
            # resource.
            if subresource.value(RDF.type):
                uri = self.sameas_minter.space.coin_uri(subresource)
                subresource.add(OWL.sameAs, URIRef(uri))
        desc = Describer(resource.graph, resource.identifier)
        de = DocumentEntry(self.store.documententry_path(basefile))
        if de.orig_updated:
            desc.value(RINFOEX.senastHamtad, de.orig_updated)
        if de.orig_checked:
            desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
        rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))

        v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
        if v:
            desc.value(DCTERMS.alternate, v)
        v = self.commondata.value(rooturi, RDFS.label, any=True)
        if v:
            # don't include labels if they're essentially the same as
            # dcterms:title (legalref needs them to be able to parse
            # refs to laws that typically don't include SFS numbers,
            # which is why they're in sfs.ttl)
            basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
            if not v.startswith(basetitle.lower()):
                desc.value(RDFS.label, util.ucfirst(v))
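
Schematically, the sameAs wiring above produces a small cluster of triples like the following (all URIs purely illustrative):

    # <consolidated-doc>        owl:sameAs <minted-sameas-uri> .
    # <canonical-uri>           owl:sameAs <consolidated-doc> .
    # <konsolideringsunderlag>  owl:sameAs <its-minted-uri> .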
Example #34
    def parse_metadata_from_soup(self, soup, doc):
        doc.lang = "sv"
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        sameas = self.sameas_uri(doc.uri)
        if sameas:
            d.rel(self.ns['owl'].sameAs, sameas)

        content = soup.find(id="content")
        title = content.find("h1").string
        d.value(self.ns['dct'].title, title, lang=doc.lang)
        identifier = self.sanitize_identifier(
            content.find("p", "lead").text)  # might need fixing up
        d.value(self.ns['dct'].identifier, identifier)

        definitions = content.find("dl", "definitions")
        if definitions:
            for dt in definitions.find_all("dt"):
                key = dt.get_text(strip=True)
                value = dt.find_next_sibling("dd").get_text(strip=True)
                if key == "Utgiven:":
                    try:
                        d.value(self.ns['dct'].published,
                                self.parse_swedish_date(value))
                    except ValueError as e:
                        self.log.warning(
                            "Could not parse %s as swedish date" % value)
                elif key == "Avsändare:":
                    if value.endswith("departementet"):
                        d.rel(self.ns['rpubl'].departement,
                              self.lookup_resource(value))
                    else:
                        d.rel(self.ns['dct'].publisher,
                              self.lookup_resource(value))

        if content.find("h2", text="Sammanfattning"):
            sums = content.find("h2", text="Sammanfattning").find_next_siblings("p")
            # "\n\n" doesn't seem to survive being stuffed in a rdfa
            # content attribute. Replace with simple space.
            summary = " ".join([x.get_text(strip=True) for x in sums])
            d.value(self.ns['dct'].abstract,
                    summary, lang=doc.lang)

        # find related documents
        re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
        # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss,
        # 4=proposition. Assume that relationships between documents
        # are reciprocal (i.e. if the page for a Kommittedirektiv
        # references a Proposition, the page for that Proposition
        # references the Kommittedirektiv).
        elements = {self.KOMMITTEDIREKTIV: [],
                    self.DS: ["legStep1"],
                    self.PROPOSITION: ["legStep1", "legStep2"],
                    self.SOU: ["legStep1"]}[self.document_type]

        for elementid in elements:
            box = content.find(id=elementid)
            for listitem in box.find_all("li"):
                if not listitem.find("span", "info"):
                    continue
                infospans = [x.text.strip()
                             for x in listitem.find_all("span", "info")]

                rel_basefile = None
                identifier = None

                for infospan in infospans:
                    if re_basefile.search(infospan):
                        # scrub identifier ("Dir. 2008:50" -> "2008:50" etc)
                        rel_basefile = re_basefile.search(infospan).group()
                        identifier = infospan

                if not rel_basefile:
                    self.log.warning(
                        "Couldn't find rel_basefile (elementid #%s) among %r" % (elementid, infospans))
                    continue
                if elementid == "legStep1":
                    subjUri = self.canonical_uri(
                        rel_basefile, self.KOMMITTEDIREKTIV)
                elif elementid == "legStep2":
                    if identifier.startswith("SOU"):
                        subjUri = self.canonical_uri(rel_basefile, self.SOU)
                    elif identifier.startswith(("Ds", "DS")):
                        subjUri = self.canonical_uri(rel_basefile, self.DS)
                    else:
                        self.log.warning(
                            "Cannot find out what type of document the linked %s is (#%s)" % (identifier, elementid))
                        self.log.warning("Infospans was %r" % infospans)
                        continue
                elif elementid == "legStep3":
                    subjUri = self.canonical_uri(
                        rel_basefile, self.PROPOSITION)
                d.rel(self.ns['rpubl'].utgarFran, subjUri)

        # find related pages
        related = content.find("h2", text="Relaterat")
        if related:
            for link in related.findParent("div").find_all("a"):
                r = urljoin(
                    "http://www.regeringen.se/", link["href"])
                d.rel(RDFS.seeAlso, URIRef(r))
                # with d.rel(RDFS.seeAlso, URIRef(r)):
                #    d.value(RDFS.label, link.get_text(strip=True))

        self.infer_triples(d, doc.basefile)
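
The re_basefile pattern above accepts both plain ordinals ("2008:50") and session-style ordinals ("2003/04:117"). A quick stand-alone check, with sample strings invented for illustration:

    import re

    re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
    print(re_basefile.search("Dir. 2008:50").group())       # -> "2008:50"
    print(re_basefile.search("Prop. 2003/04:117").group())  # -> "2003/04:117"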
Example #35
import datetime
import unittest

from rdflib import Graph
from rdflib.namespace import DCTERMS
from ferenda import Describer

class TestDescriber(unittest.TestCase):
    def setUp(self):
        self.graph = Graph()
        self.graph.parse(data="""
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.org/doc> a foaf:Document;
        dcterms:title "Hello world"@en ;
        dcterms:identifier "ID1",
                       "ID2";
        dcterms:issued "2013-10-11"^^xsd:date;
        dcterms:references <http://example.org/doc2>;
        dcterms:subject <http://example.org/concept1>,
                    <http://example.org/concept2> .
        """,
                         format="turtle")
        self.desc = Describer(self.graph, "http://example.org/doc")

    def test_getvalues(self):
        self.assertEqual(self.desc.getvalues(DCTERMS.alternate), [])
        self.assertEqual(self.desc.getvalues(DCTERMS.title), ["Hello world"])
        self.assertEqual(set(self.desc.getvalues(DCTERMS.identifier)),
                         set(["ID1", "ID2"]))

    def test_getvalue(self):
        self.assertEqual(self.desc.getvalue(DCTERMS.title), "Hello world")
        self.assertEqual(self.desc.getvalue(DCTERMS.issued),
                         datetime.date(2013, 10, 11))
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.alternate)
        with self.assertRaises(KeyError):
            self.desc.getvalue(DCTERMS.identifier)

    def test_getrels(self):
        self.assertEqual(self.desc.getrels(DCTERMS.replaces), [])
        self.assertEqual(self.desc.getrels(DCTERMS.references),
                         ["http://example.org/doc2"])
        self.assertEqual(
            set(self.desc.getrels(DCTERMS.subject)),
            set(["http://example.org/concept1",
                 "http://example.org/concept2"]))

    def test_getrel(self):
        self.assertEqual(self.desc.getrel(DCTERMS.references),
                         "http://example.org/doc2")
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.replaces)
        with self.assertRaises(KeyError):
            self.desc.getrel(DCTERMS.subject)

    def test_getrdftype(self):
        self.assertEqual(self.desc.getrdftype(),
                         "http://xmlns.com/foaf/0.1/Document")
Example #36
    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # metadata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(dt_match.group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
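            # e.g. "2.1. Scope" -> ("Scope", "2.1", "RFC 6991, section 2.1"),
            # assuming doc.basefile is "6991"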
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2                                   
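
        # A quick sanity check of the grammar and formatter above (the
        # sample citation string is invented for illustration):
        res = section_rfc_citation.parseString("section 5.1 of [RFC4627]")
        # res.Sec == "5.1" and res.RFC == "4627", so rfc_uriformatter(res)
        # yields canonical_uri("4627") + "#S5.1"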

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)
Example #37
def selector(entry):
    graph = Graph()
    with self.store.open_distilled(entry.basefile) as fp:
        graph.parse(data=fp.read())
    desc = Describer(graph, entry.id)
    return desc.getvalue(self.ns['dct'].subject) == category