Python Describer.rel Examples

Programming Language: Python

Namespace/Package Name: ferenda

Class/Type: Describer

Method/Function: rel

Examples at hotexamples.com: 8

Python Describer.rel - 8 examples found. These are the top rated real world Python examples of ferenda.Describer.rel extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

Describer(17)

value(14)

rdftype(11)

rel(4)

getrel(3)

getvalue(3)

getrdftype(2)

getvalues(2)

getrels(1)

Example #1

Show file

File: eurlex.py Project: mavteam/ferenda

 def metadata_from_basefile(self, doc):
     desc = Describer(doc.meta, doc.uri)
     desc.rel(CDM.resource_legal_id_celex, Literal(doc.basefile))
     # the sixth letter in 
     rdftype = {"R": CDM.regulation,
                "L": CDM.directive,
                "C": CDM.decision_cjeu}[doc.basefile[5]]
     desc.rel(RDF.type, rdftype)
     return doc.meta

Example #2

Show file

    def parse_metadata_from_soup(self, soup, doc):
        doc.lang = self.lang
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        dcterms = self.ns['dcterms']

        # dcterms:title
        d.value(dcterms.title, soup.find("title").string, lang=doc.lang)
        d.value(dcterms.identifier, doc.basefile)
        # dcterms:abstract
        abstract = soup.find(_class="abstract")
        if abstract:
            d.value(dcterms['abstract'], abstract.string, lang=doc.lang)

        # dcterms:published
        datehdr = soup.find(lambda x: x.name in ('h2', 'h3')
                            and re.search("W3C\s+Recommendation,?\s+", x.text))
        if datehdr:
            datestr = " ".join(datehdr.text.split())
            m = re.search("(\d+)[ \-](\w+),?[ \-](\d{4})", datestr)
            if not m:
                self.log.warning("%s: Couldn't parse datestr %s" %
                                 (doc.basefile, datestr))
            else:
                datestr = " ".join(m.groups())
                date = None
                try:
                    # 17 December 1996
                    date = util.strptime(datestr, "%d %B %Y").date()
                except ValueError:
                    try:
                        # 17 Dec 1996
                        date = util.strptime(datestr, "%d %b %Y").date()
                    except ValueError:
                        self.log.warning("%s: Could not parse datestr %s" %
                                         (doc.basefile, datestr))
                if date:
                    d.value(dcterms.issued, date)

        # dcterms:editor
        editors = soup.find("dt", text=re.compile("Editors?:"))
        if editors:
            for editor in editors.find_next_siblings("dd"):
                editor_string = " ".join(x for x in editor.stripped_strings if not "@" in x)
                editor_name = editor_string.split(", ")[0]
                d.value(dcterms.editor, editor_name)

        # dcterms:publisher
        d.rel(dcterms.publisher, "http://localhost:8000/ext/w3c")

        # assure we got exactly one of each of the required properties
        for required in (dcterms.title, dcterms.issued):
            d.getvalue(required)  # throws KeyError if not found (or more than one)

Example #3

Show file

File: direktiv.py Project: h4ck3rm1k3/ferenda

    def make_meta(self, chunk, meta, uri, basefile):
        d = Describer(meta, uri)
        dct = self.ns['dct']
        prov = self.ns['prov']
        owl = self.ns['owl']
        rpubl = RPUBL

        d.rdftype(self.rdf_type)
        d.value(prov.wasGeneratedBy, self.qualified_class_name())

        # predicates maps key strings to corresponsing RDFLib terms,
        # e.g. "Rubrik" -> dct:title
        predicates = {'Dir nr': dct.identifier,
                      'Departement': rpubl.departement,
                      'Beslut vid regeringssammanträde':
                      rpubl.beslutsdatum,
                      'Rubrik': dct.title,
                      'Senast ändrad': dct.changed
                      }
        # munger contains a set of tuples where the first item is a
        # method for converting a plain text into the appropriate
        # RDFLib value, e.g:
        # - "Utredning av foo" => Literal("Utredning av foo",lang="sv")
        # - "1987-02-19" => datetime(1987,2,19)
        # - "Arbetsdepartementet" => URIRef("http://lagen.nu/terms/arbdep")
        # The second item is the Describer method that
        # should be used to add the value to the graph, i.e. .value
        # for Literals and .rel for URIRefs
        munger = {'Dir nr': (self.sanitize_identifier, d.value),  # the RDFLib constructor
                  'Departement': (functools.partial(self.lookup_resource, warn=False), d.rel),
                  'Beslut vid regeringssammanträde': (self.parse_iso_date, d.value),
                  'Rubrik': (self.sanitize_rubrik, d.value),
                  'Senast ändrad': (self.parse_iso_date, d.value)
                  }

        # headerlines wraps a TextReader in an iterator that parses
        # "key:value\n" lines with support for line continuation, eg
        # "long\nkey:long\nvalue\n"
        for (key, val) in self.header_lines(chunk):
            if not val:
                continue
            try:
                pred = predicates[key]
                (transformer, setter) = munger[key]
                setter(pred, transformer(val))
            except (KeyError, ValueError) as e:
                self.log.error(
                    "Couldn't munge value '%s' into a proper object for predicate '%s'" % (val, key))

        d.rel(dct.publisher, self.lookup_resource("Regeringskansliet"))
        d.rel(owl.sameAs, self.sameas_uri(uri))
        self.infer_triples(d, basefile)

Example #4

Show file

    def polish_metadata(self, head, basefile, infer_nodes=True):

        # where do we get refdesc, domdesc?
        coin_uri = self.sameas_minter.space.coin_uri
        resource = super(DV, self).polish_metadata(head, basefile)
        refuri = resource.identifier
        if 'rinfoex:patchdescription' in head:
            resource.add(RINFOEX.patchdescription,
                         Literal(head['rinfoex:patchdescription'], lang="sv"))
        refuri_sameas = coin_uri(resource)
        resource.graph.add((URIRef(refuri), OWL.sameAs, URIRef(refuri_sameas)))
        # NB: In theory, we have all the data we need to generate a
        # canonical URI for the dom. In practice, this data does not
        # meet requirements of our URISpace templates in certain cases
        # (all MD verdicts use rpubl:domsnummer instead of
        # rpubl:malnummer, which is what the template expects. The
        # superclass' definition of polish_metadata gets around this
        # by creating a minimal graph from the plain dict in head and
        # feeds that to coin_uri. So we do the same here, instead of
        # the very simple:
        #
        #    domuri_sameas = coin_uri(resource.value(RPUBL.referatAvDomstolsavgorande))
        #
        # (also, this version handles the uncommon but valid case
        # where one referat concerns multiple dom:s)
        domuri = resource.value(RPUBL.referatAvDomstolsavgorande).identifier
        for malnummer in head['_localid']:
            bnodetmp = BNode()
            gtmp = Graph()
            gtmp.bind("rpubl", RPUBL)
            gtmp.bind("dcterms", DCTERMS)
            dtmp = Describer(gtmp, bnodetmp)
            dtmp.rdftype(RPUBL.VagledandeDomstolsavgorande)
            dtmp.value(RPUBL.malnummer, malnummer)
            dtmp.value(RPUBL.avgorandedatum, head['Avgörandedatum'])
            dtmp.rel(DCTERMS.publisher, self.lookup_resource(head["Domstol"]))
            rtmp = dtmp.graph.resource(bnodetmp)
            domuri_sameas = coin_uri(rtmp)
            resource.graph.add(
                (URIRef(domuri), OWL.sameAs, URIRef(domuri_sameas)))
        return resource

Example #5

Show file

File: rfcs.py Project: h4ck3rm1k3/ferenda

    def parse(self, doc):
        # some very simple heuristic rules for determining 
        # what an individual paragraph is
   
        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True
  
        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p
        
        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))
  
        # First paragraph of an RFC is always a header block 
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)
  
        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()
        
        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para])) 

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dct:title of the document and 
        # specify that it is in English
        desc.value(self.ns['dct'].title, util.normalize_space(title), lang="en")

        # Construct the dct:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dct'].identifier, "RFC " + doc.basefile)
  
        # find and convert the publication date in the header to a datetime 
        # object, and set it as the dct:issued date for the document   
        re_date = re.compile("(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})").search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale(): 
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year,dt.month,dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dct:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dct'].issued, pubdate)
  
        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dct:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header, re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dct'].subject, cat_match.group(1))
            
        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs, 
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)
  
        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
        # Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
# begin parse2                                   
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ",1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title, ordinal=ordinal, identifier=identifier)
                stack[1:] = [] # clear all but bottom element
                stack[0].append(s) # add new section to body
                stack.append(s)    # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[2:] = [] # clear all but bottom two elements
                stack[1].append(s) # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title, ordinal=ordinal, identifier=identifier)
                stack[3:] = [] # clear all but bottom three
                stack[-1].append(s) # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2                                   

# begin citation1                                   
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (CaselessLiteral("section") + Word(nums+".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") + "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" + rfc_citation).setResultsName("SecRFCRef")
# end citation1                                   

# begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                 uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                 uri += "#S" + parts['Sec']
            return uri
# end citation2                                   

# begin citation3
        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, 
                                   section_citation,
                                   rfc_citation)
        citparser.set_formatter(URIFormatter(("SecRFCRef", rfc_uriformatter),
                                             ("SecRef", rfc_uriformatter),
                                             ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)

Example #6

Show file

File: dv.py Project: h4ck3rm1k3/ferenda

    def polish_metadata(self, head, doc):
        basefile_regex = re.compile('(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')

        def basefile_to_referat(basefile):
            templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'MD': 'MD %(year)s:%(ordinal)s'}
            m = basefile_regex.match(basefile)
            if m:
                return templ[m.group("type")] % (m.groupdict())

        def ref_to_uri(ref):
            # FIXME: We'd like to retire legalref and replace it with
            # pyparsing grammars.
            nodes = self.rattsfall_parser.parse(ref)
            uri = nodes[0].uri
            return localize_uri(uri)

        def dom_to_uri(domstol, malnr, avg):
            baseuri = self.config.url
            slug = self.slugs[domstol]
            return "%(baseuri)sres/dv/%(slug)s/%(malnr)s/%(avg)s" % locals()

        def localize_uri(uri):
            if "publ/rattsfall" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/rattsfall",
                                   self.config.url + "res/dv")
            elif "publ/sfs/" in uri:
                return uri.replace("http://rinfo.lagrummet.se/publ/sfs",
                                   self.config.url + "res/sfs")

        def split_nja(value):
            # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
            return [x[:-1] for x in value.split("(")]

        def sokord_uri(value):
            return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

        # 0. create Referat key if not present
        if "Referat" not in head:
            # For some courts (MD, AD, MOD?, MIG?) this is possible
            head["Referat"] = basefile_to_referat(doc.basefile)

        # 1. mint uris and create the two Describers we'll use
        refuri = ref_to_uri(head["Referat"])
        refdesc = Describer(doc.meta, refuri)
        domuri = dom_to_uri(head["Domstol"],
                            head["Målnummer"],
                            head["Avgörandedatum"])
        domdesc = Describer(doc.meta, domuri)

        # 2. convert all strings in head to proper RDF
        for label, value in head.items():
            if label == "Rubrik":
                value = util.normalize_space(value)
                refdesc.value(self.ns['rpubl'].referatrubrik, value, lang="sv")
                domdesc.value(self.ns['dct'].title, value, lang="sv")

            elif label == "Domstol":
                domdesc.rel(self.ns['dct'].publisher, self.lookup_resource(value))
            elif label == "Målnummer":
                domdesc.rel(self.ns['rpubl'].malnummer, value)
            elif label == "Domsnummer":
                domdesc.rel(self.ns['rpubl'].domsnummer, value)
            elif label == "Diarienummer":
                domdesc.rel(self.ns['rpubl'].diarienummer, value)
            elif label == "Avdelning":
                domdesc.rel(self.ns['rpubl'].avdelning, value)
            elif label == "Referat":

                for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                    'arsutgava': r'(\d{4})',
                                    'lopnummer': r'\d{4}(?:\:| nr )(\d+)',
                                    'sidnummer': r's.? ?(\d+)'}.items():
                    m = re.search(regex, value)
                    if m:
                        if pred == 'rattsfallspublikation':
                            # "NJA" -> "http://lcaolhost:8000/coll/dv/nja"
                            uri = self.config.url + "coll/dv/" + m.group(1).lower()
                            refdesc.rel(self.ns['rpubl'][pred], uri)
                        else:
                            refdesc.value(self.ns['rpubl'][pred], m.group(1))

                    if value.startswith("NJA"):
                        realvalue, extra = split_nja(value)
                        ordinal = extra.split(" ")[1]
                        refdesc.value(self.ns['dct'].bibliographicCitation,
                                      extra)
                        refdesc.rel(self.ns['owl'].sameAs,
                                    self.config.url + "res/dv/nja/" + ordinal)
                        refdesc.value(self.ns['dct'].identifier, realvalue)
                    else:
                        refdesc.value(self.ns['dct'].identifier, value)

            elif label == "Avgörandedatum":
                with util.c_locale():
                    d = datetime.strptime(value, '%Y-%m-%d')
                domdesc.value(self.ns['rpubl'].avgorandedatum, d)

            elif label == "Lagrum":
                for i in value:  # better be list not string
                    for node in self.lagrum_parser.parse(i):
                        if isinstance(node, Link):

                            domdesc.rel(self.ns['rpubl'].lagrum,
                                        localize_uri(node.uri))
            elif label == "Rättsfall":
                for i in value:
                    for node in self.rattsfall_parser.parse(i):
                        if isinstance(node, Link):
                            domdesc.rel(self.ns['rpubl'].rattsfall,
                                        localize_uri(node.uri))
            elif label == "Litteratur":
                for i in value.split(";"):
                    domdesc.value(self.ns['dct'].relation, util.normalize_space(i))
            elif label == "Sökord":
                for s in self.re_delimSplit(value):
                    s = util.normalize_space(s)
                    if not s:
                        continue
                    # terms longer than 72 chars are not legitimate
                    # terms. more likely descriptions. If a term has a - in
                    # it, it's probably a separator between a term and a
                    # description
                    while len(s) >= 72 and " - " in s:
                        h, s = s.split(" - ", 1)
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(h))
                    if len(s) < 72:
                        domdesc.rel(self.ns['dct'].subject, sokord_uri(s))

        # 3. mint some owl:sameAs URIs
        refdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(refuri))
        domdesc.rel(self.ns['owl'].sameAs, self.sameas_uri(domuri))

        # 4. Add some same-for-everyone properties
        refdesc.rel(self.ns['dct'].publisher, self.lookup_resource('Domstolsverket'))
        refdesc.rdftype(self.ns['rpubl'].Rattsfallsreferat)
        domdesc.rdftype(self.ns['rpubl'].VagledandeDomstolsavgorande)
        refdesc.rel(self.ns['rpubl'].referatAvDomstolsavgorande, domuri)
        # 5. assert that we have everything we need

        # 6. done!
        return refuri

Example #7

Show file

    def parse(self, doc):
        # some very simple heuristic rules for determining
        # what an individual paragraph is

        def is_heading(p):
            # If it's on a single line and it isn't indented with spaces
            # it's probably a heading.
            if p.count("\n") == 0 and not p.startswith(" "):
                return True

        def is_pagebreak(p):
            # if it contains a form feed character, it represents a page break
            return "\f" in p

        # Parsing a document consists mainly of two parts:
        # 1: First we parse the body of text and store it in doc.body
        from ferenda.elements import Body, Preformatted, Title, Heading
        from ferenda import Describer
        reader = TextReader(self.store.downloaded_path(doc.basefile))

        # First paragraph of an RFC is always a header block
        header = reader.readparagraph()
        # Preformatted is a ferenda.elements class representing a
        # block of preformatted text. It is derived from the built-in
        # list type, and must thus be initialized with an iterable, in
        # this case a single-element list of strings. (Note: if you
        # try to initialize it with a string, because strings are
        # iterables as well, you'll end up with a list where each
        # character in the string is an element, which is not what you
        # want).
        preheader = Preformatted([header])
        # Doc.body is a ferenda.elements.Body class, which is also
        # is derived from list, so it has (amongst others) the append
        # method. We build our document by adding to this root
        # element.
        doc.body.append(preheader)

        # Second paragraph is always the title, and we don't include
        # this in the body of the document, since we'll add it to the
        # medata -- once is enough
        title = reader.readparagraph()

        # After that, just iterate over the document and guess what
        # everything is. TextReader.getiterator is useful for
        # iterating through a text in other chunks than single lines
        for para in reader.getiterator(reader.readparagraph):
            if is_heading(para):
                # Heading is yet another of these ferenda.elements
                # classes.
                doc.body.append(Heading([para]))
            elif is_pagebreak(para):
                # Just drop these remnants of a page-and-paper-based past
                pass
            else:
                # If we don't know that it's something else, it's a
                # preformatted section (the safest bet for RFC text).
                doc.body.append(Preformatted([para]))

        # 2: Then we create metadata for the document and store it in
        # doc.meta (in this case using the convenience
        # ferenda.Describer class).

        desc = Describer(doc.meta, doc.uri)

        # Set the rdf:type of the document
        desc.rdftype(self.rdf_type)

        # Set the title we've captured as the dcterms:title of the document and
        # specify that it is in English
        desc.value(self.ns['dcterms'].title,
                   util.normalize_space(title),
                   lang="en")

        # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
        desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

        # find and convert the publication date in the header to a datetime
        # object, and set it as the dcterms:issued date for the document
        re_date = re.compile(
            "(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
        ).search
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        dt_match = re_date(header)
        if dt_match:
            with util.c_locale():
                dt = datetime.strptime(re_date(header).group(0), "%B %Y")
            pubdate = date(dt.year, dt.month, dt.day)
            # Note that using some python types (cf. datetime.date)
            # results in a datatyped RDF literal, ie in this case
            #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
            desc.value(self.ns['dcterms'].issued, pubdate)

        # find any older RFCs that this document updates or obsoletes
        obsoletes = re.search("^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
        updates = re.search("^Updates: ([\d+, ]+)", header, re.MULTILINE)

        # Find the category of this RFC, store it as dcterms:subject
        cat_match = re.search("^Category: ([\w ]+?)(  |$)", header,
                              re.MULTILINE)
        if cat_match:
            desc.value(self.ns['dcterms'].subject, cat_match.group(1))

        for predicate, matches in ((self.ns['rfc'].updates, updates),
                                   (self.ns['rfc'].obsoletes, obsoletes)):
            if matches is None:
                continue
            # add references between this document and these older rfcs,
            # using either rfc:updates or rfc:obsoletes
            for match in matches.group(1).strip().split(", "):
                uri = self.canonical_uri(match)
                # Note that this uses our own unofficial
                # namespace/vocabulary
                # http://example.org/ontology/rfc/
                desc.rel(predicate, uri)

        # And now we're done. We don't need to return anything as
        # we've modified the Document object that was passed to
        # us. The calling code will serialize this modified object to
        # XHTML and RDF and store it on disk

# end parse1
# Now do it again
        reader.seek(0)
        reader.readparagraph()
        reader.readparagraph()
        doc.body = Body()
        doc.body.append(preheader)
        # doc.body.append(Title([util.normalize_space(title)]))
        # begin parse2
        from ferenda.elements import Section, Subsection, Subsubsection

        # More heuristic rules: Section headers start at the beginning
        # of a line and are numbered. Subsections and subsubsections
        # have dotted numbers, optionally with a trailing period, ie
        # '9.2.' or '11.3.1'
        def is_section(p):
            return re.match(r"\d+\.? +[A-Z]", p)

        def is_subsection(p):
            return re.match(r"\d+\.\d+\.? +[A-Z]", p)

        def is_subsubsection(p):
            return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

        def split_sectionheader(p):
            # returns a tuple of title, ordinal, identifier
            ordinal, title = p.split(" ", 1)
            ordinal = ordinal.strip(".")
            return title.strip(), ordinal, "RFC %s, section %s" % (
                doc.basefile, ordinal)

        # Use a list as a simple stack to keep track of the nesting
        # depth of a document. Every time we create a Section,
        # Subsection or Subsubsection object, we push it onto the
        # stack (and clear the stack down to the appropriate nesting
        # depth). Every time we create some other object, we append it
        # to whatever object is at the top of the stack. As your rules
        # for representing the nesting of structure become more
        # complicated, you might want to use the
        # :class:`~ferenda.FSMParser` class, which lets you define
        # heuristic rules (recognizers), states and transitions, and
        # takes care of putting your structure together.
        stack = [doc.body]

        for para in reader.getiterator(reader.readparagraph):
            if is_section(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Section(title=title,
                            ordinal=ordinal,
                            identifier=identifier)
                stack[1:] = []  # clear all but bottom element
                stack[0].append(s)  # add new section to body
                stack.append(s)  # push new section on top of stack
            elif is_subsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsection(title=title,
                               ordinal=ordinal,
                               identifier=identifier)
                stack[2:] = []  # clear all but bottom two elements
                stack[1].append(s)  # add new subsection to current section
                stack.append(s)
            elif is_subsubsection(para):
                title, ordinal, identifier = split_sectionheader(para)
                s = Subsubsection(title=title,
                                  ordinal=ordinal,
                                  identifier=identifier)
                stack[3:] = []  # clear all but bottom three
                stack[-1].append(
                    s)  # add new subsubsection to current subsection
                stack.append(s)
            elif is_heading(para):
                stack[-1].append(Heading([para]))
            elif is_pagebreak(para):
                pass
            else:
                pre = Preformatted([para])
                stack[-1].append(pre)
# end parse2

# begin citation1
        from pyparsing import Word, CaselessLiteral, nums
        section_citation = (
            CaselessLiteral("section") +
            Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
        rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                        "]").setResultsName("RFCRef")
        section_rfc_citation = (section_citation + "of" +
                                rfc_citation).setResultsName("SecRFCRef")

        # end citation1

        # begin citation2
        def rfc_uriformatter(parts):
            uri = ""
            if 'RFC' in parts:
                uri += self.canonical_uri(parts['RFC'].lstrip("0"))
            if 'Sec' in parts:
                uri += "#S" + parts['Sec']
            return uri
# end citation2

# begin citation3

        from ferenda import CitationParser, URIFormatter
        citparser = CitationParser(section_rfc_citation, section_citation,
                                   rfc_citation)
        citparser.set_formatter(
            URIFormatter(("SecRFCRef", rfc_uriformatter),
                         ("SecRef", rfc_uriformatter),
                         ("RFCRef", rfc_uriformatter)))
        citparser.parse_recursive(doc.body)

Example #8

Show file

File: regeringen.py Project: h4ck3rm1k3/ferenda

    def parse_metadata_from_soup(self, soup, doc):
        doc.lang = "sv"
        d = Describer(doc.meta, doc.uri)
        d.rdftype(self.rdf_type)
        d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
        sameas = self.sameas_uri(doc.uri)
        if sameas:
            d.rel(self.ns['owl'].sameAs, sameas)

        content = soup.find(id="content")
        title = content.find("h1").string
        d.value(self.ns['dct'].title, title, lang=doc.lang)
        identifier = self.sanitize_identifier(
            content.find("p", "lead").text)  # might need fixing up
        d.value(self.ns['dct'].identifier, identifier)

        definitions = content.find("dl", "definitions")
        if definitions:
            for dt in definitions.find_all("dt"):
                key = dt.get_text(strip=True)
                value = dt.find_next_sibling("dd").get_text(strip=True)
                if key == "Utgiven:":
                    try:
                        d.value(self.ns['dct'].published,
                                self.parse_swedish_date(value))
                    except ValueError as e:
                        self.log.warning(
                            "Could not parse %s as swedish date" % value)
                elif key == "Avsändare:":
                    if value.endswith("departementet"):
                        d.rel(self.ns['rpubl'].departement,
                              self.lookup_resource(value))
                    else:
                        d.rel(self.ns['dct'].publisher,
                              self.lookup_resource(value))

        if content.find("h2", text="Sammanfattning"):
            sums = content.find("h2", text="Sammanfattning").find_next_siblings("p")
            # "\n\n" doesn't seem to survive being stuffed in a rdfa
            # content attribute. Replace with simple space.
            summary = " ".join([x.get_text(strip=True) for x in sums])
            d.value(self.ns['dct'].abstract,
                    summary, lang=doc.lang)

        # find related documents
        re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
        # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss,
        # 4=proposition. Assume that relationships between documents
        # are reciprocal (ie if the page for a Kommittedirektiv
        # references a Proposition, the page for that Proposition
        # references the Kommittedirektiv.
        elements = {self.KOMMITTEDIREKTIV: [],
                    self.DS: ["legStep1"],
                    self.PROPOSITION: ["legStep1", "legStep2"],
                    self.SOU: ["legStep1"]}[self.document_type]

        for elementid in elements:
            box = content.find(id=elementid)
            for listitem in box.find_all("li"):
                if not listitem.find("span", "info"):
                    continue
                infospans = [x.text.strip(
                ) for x in listitem.find_all("span", "info")]

                rel_basefile = None
                identifier = None

                for infospan in infospans:
                    if re_basefile.search(infospan):
                        # scrub identifier ("Dir. 2008:50" -> "2008:50" etc)
                        rel_basefile = re_basefile.search(infospan).group()
                        identifier = infospan

                if not rel_basefile:
                    self.log.warning(
                        "Couldn't find rel_basefile (elementid #%s) among %r" % (elementid, infospans))
                    continue
                if elementid == "legStep1":
                    subjUri = self.canonical_uri(
                        rel_basefile, self.KOMMITTEDIREKTIV)
                elif elementid == "legStep2":
                    if identifier.startswith("SOU"):
                        subjUri = self.canonical_uri(rel_basefile, self.SOU)
                    elif identifier.startswith(("Ds", "DS")):
                        subjUri = self.canonical_uri(rel_basefile, self.DS)
                    else:
                        self.log.warning(
                            "Cannot find out what type of document the linked %s is (#%s)" % (identifier, elementid))
                        self.log.warning("Infospans was %r" % infospans)
                        continue
                elif elementid == "legStep3":
                    subjUri = self.canonical_uri(
                        rel_basefile, self.PROPOSITION)
                d.rel(self.ns['rpubl'].utgarFran, subjUri)

        # find related pages
        related = content.find("h2", text="Relaterat")
        if related:
            for link in related.findParent("div").find_all("a"):
                r = urljoin(
                    "http://www.regeringen.se/", link["href"])
                d.rel(RDFS.seeAlso, URIRef(r))
                # with d.rel(RDFS.seeAlso, URIRef(r)):
                #    d.value(RDFS.label, link.get_text(strip=True))

        self.infer_triples(d, doc.basefile)