Beispiel #1
0
    def categoryLink(self, m):
        """Turn a category wikilink match into an HTML anchor.

        :param m: regex match object; group 1 holds the category name
                  and, when the pattern defines exactly two groups,
                  group 2 holds the link label.
        :returns: an ``<a>`` element (as a string) pointing at the
                  lagen.nu concept URI derived from group 1.
        """
        concept = util.ucfirst(m.group(1)).replace(' ', '_')
        uri = 'http://lagen.nu/concept/%s' % concept

        if len(m.groups()) != 2:
            # cwl = "Category wikilink": the category name doubles as label
            return '<a class="cwl" href="%s">%s</a>' % (uri, m.group(1))
        # lcwl = "Labeled Category WikiLink": explicit label in group 2
        return '<a class="lcwl" href="%s">%s</a>' % (uri, m.group(2))
Beispiel #2
0
    def capitalizedLink(self, m):
        """Turn a wikilink match into an HTML anchor.

        Targets starting with ``SFS/`` are linked to a lower-cased
        rinfo.lagrummet.se URI; everything else is linked to a
        lagen.nu concept URI.

        :param m: regex match object. Group 1 is the link target. When
                  the pattern defines exactly three groups, groups 2
                  and 3 form the visible text; otherwise groups 1 and 2
                  do.
        :returns: an ``<a>`` element as a string.
        """
        target = m.group(1)
        if target.startswith('SFS/'):
            uri = 'http://rinfo.lagrummet.se/publ/%s' % target.lower()
        else:
            concept = util.ucfirst(target).replace(' ', '_')
            uri = 'http://lagen.nu/concept/%s' % concept

        if len(m.groups()) != 3:
            # wl = plain (unlabeled) WikiLink
            return '<a class="wl" href="%s">%s%s</a>' % (uri, target, m.group(2))
        # lwl = "Labeled WikiLink"
        return '<a class="lwl" href="%s">%s%s</a>' % (uri, m.group(2), m.group(3))
Beispiel #3
0
    def infer_metadata(self, resource, basefile):
        """Add inferred triples about ``basefile`` to ``resource``'s graph.

        The method works purely through side effects on
        ``resource.graph`` and has no explicit return value. In order,
        it:

        1. removes untyped ``dcterms:issued`` values,
        2. adds ``owl:sameAs`` links for the resource itself and for
           each ``rpubl:konsolideringsunderlag`` that has an
           ``rdf:type``,
        3. records the fetch/check timestamps found in the document
           entry, when set,
        4. copies ``dcterms:alternate`` and (conditionally)
           ``rdfs:label`` from ``self.commondata`` for the resource
           that this one consolidates.

        :param resource: the document's metadata node; presumably an
                         rdflib ``Resource`` -- TODO confirm.
        :param basefile: the document's basefile identifier, passed on
                         to ``canonical_uri`` and
                         ``store.documententry_path``.
        """
        # remove the bogus dcterms:issued thing that we only added to
        # aid URI generation. NB: This is removed in the superclass'
        # postprocess_doc as well, because for this lagen.nu-derived
        # class it needs to be done at this point, but for use of the
        # superclass directly, it needs to be done at some point.
        for o in resource.objects(DCTERMS.issued):
            # only values lacking a datatype are treated as bogus
            if not o.datatype:
                resource.remove(DCTERMS.issued, o)
        # NOTE(review): the URI is minted after the removal above, so
        # the minter presumably must not see the bogus value -- confirm.
        sameas_uri = self.sameas_minter.space.coin_uri(resource)
        resource.add(OWL.sameAs, URIRef(sameas_uri))
        # also link the canonical URI for this basefile to the resource
        resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                            OWL.sameAs, resource.identifier))
        # then find each rpubl:konsolideringsunderlag, and create
        # owl:sameas for them as well
        for subresource in resource.objects(RPUBL.konsolideringsunderlag):
            # sometimes there'll be a rpubl:konsolideringsunderlag to
            # a resource URI but no actual data about that
            # resource. This seems to happen if SFST is updated but
            # SFSR is not. In those cases we can't generate a
            # owl:sameAs URI since we have no other data about the
            # resource.
            if subresource.value(RDF.type):
                uri = self.sameas_minter.space.coin_uri(subresource)
                subresource.add(OWL.sameAs, URIRef(uri))
        desc = Describer(resource.graph, resource.identifier)
        de = DocumentEntry(self.store.documententry_path(basefile))
        # timestamps are only recorded when the document entry has them
        if de.orig_updated:
            desc.value(RINFOEX.senastHamtad, de.orig_updated)
        if de.orig_checked:
            desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
        # the resource that this document consolidates; used as the
        # lookup key into self.commondata below
        rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))

        v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
        if v:
            desc.value(DCTERMS.alternate, v)
        v = self.commondata.value(rooturi, RDFS.label, any=True)
        if v:
            # don't include labels if they're essentially the same as
            # dcterms:title. (legalref needs the labels to be able to
            # parse refs to laws that typically don't include SFS
            # numbers, so that's why they're in sfs.ttl.)
            # basetitle is the title text before its first " (".
            basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
            # NOTE(review): only the title is lower-cased here, so the
            # label is presumably stored in lower case -- confirm.
            if not v.startswith(basetitle.lower()):
                desc.value(RDFS.label, util.ucfirst(v))
Beispiel #4
0
    def infer_metadata(self, resource, basefile):
        """Add inferred triples about ``basefile`` to ``resource``'s graph.

        The method works purely through side effects on
        ``resource.graph`` and has no explicit return value. In order,
        it:

        1. removes untyped ``dcterms:issued`` values,
        2. adds ``owl:sameAs`` links for the resource itself and for
           each ``rpubl:konsolideringsunderlag`` that has an
           ``rdf:type``,
        3. records the fetch/check timestamps found in the document
           entry, when set,
        4. copies ``dcterms:alternate`` and (conditionally)
           ``rdfs:label`` from ``self.commondata`` for the resource
           that this one consolidates.

        :param resource: the document's metadata node; presumably an
                         rdflib ``Resource`` -- TODO confirm.
        :param basefile: the document's basefile identifier, passed on
                         to ``canonical_uri`` and
                         ``store.documententry_path``.
        """
        # remove the bogus dcterms:issued thing that we only added to
        # aid URI generation. NB: This is removed in the superclass'
        # postprocess_doc as well, because for this lagen.nu-derived
        # class it needs to be done at this point, but for use of the
        # superclass directly, it needs to be done at some point.
        for o in resource.objects(DCTERMS.issued):
            # only values lacking a datatype are treated as bogus
            if not o.datatype:
                resource.remove(DCTERMS.issued, o)
        # NOTE(review): the URI is minted after the removal above, so
        # the minter presumably must not see the bogus value -- confirm.
        sameas_uri = self.sameas_minter.space.coin_uri(resource)
        resource.add(OWL.sameAs, URIRef(sameas_uri))
        # also link the canonical URI for this basefile to the resource
        resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                            OWL.sameAs, resource.identifier))
        # then find each rpubl:konsolideringsunderlag, and create
        # owl:sameas for them as well
        for subresource in resource.objects(RPUBL.konsolideringsunderlag):
            # sometimes there'll be a rpubl:konsolideringsunderlag to
            # a resource URI but no actual data about that
            # resource. This seems to happen if SFST is updated but
            # SFSR is not. In those cases we can't generate a
            # owl:sameAs URI since we have no other data about the
            # resource.
            if subresource.value(RDF.type):
                uri = self.sameas_minter.space.coin_uri(subresource)
                subresource.add(OWL.sameAs, URIRef(uri))
        desc = Describer(resource.graph, resource.identifier)
        de = DocumentEntry(self.store.documententry_path(basefile))
        # timestamps are only recorded when the document entry has them
        if de.orig_updated:
            desc.value(RINFOEX.senastHamtad, de.orig_updated)
        if de.orig_checked:
            desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
        # the resource that this document consolidates; used as the
        # lookup key into self.commondata below
        rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))

        v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
        if v:
            desc.value(DCTERMS.alternate, v)
        v = self.commondata.value(rooturi, RDFS.label, any=True)
        if v:
            # don't include labels if they're essentially the same as
            # dcterms:title. (legalref needs the labels to be able to
            # parse refs to laws that typically don't include SFS
            # numbers, so that's why they're in sfs.ttl.)
            # basetitle is the title text before its first " (".
            basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
            # NOTE(review): only the title is lower-cased here, so the
            # label is presumably stored in lower case -- confirm.
            if not v.startswith(basetitle.lower()):
                desc.value(RDFS.label, util.ucfirst(v))
Beispiel #5
0
def prop_sanitize_identifier(identifier):
    """Normalize a proposition identifier to the form ``Prop. YYYY/YY:N``.

    Common irregularities are repaired: a lower- or upper-case prefix,
    a missing dot or space after ``Prop``, non-breaking spaces, a
    missing prefix, and a four-digit second year (``2009/2010:87``
    becomes ``2009/10:87``).

    :param identifier: the raw identifier string. A falsy value (empty
                       string or ``None``) is returned unchanged so
                       that the identifier can be inferred later.
    :returns: the normalized identifier wrapped in ``Literal``, or the
              falsy input as-is.
    :raises ValueError: if the repaired identifier still does not match
                        the expected pattern.
    """
    if not identifier:
        return identifier  # allow infer_identifier to do its magic later
    # Normalize non-breaking spaces *before* the prefix checks;
    # otherwise "Prop\xa02009/10:87" is not recognized as having a
    # prefix and ends up as "Prop. Prop 2009/10:87".
    identifier = identifier.replace("\xa0", " ")
    if identifier.startswith("prop"):
        identifier = util.ucfirst(identifier)
    # The repairs below only touch the prefix itself, never a later
    # occurrence of the same text.
    if identifier.startswith("PROP"):
        identifier = "Prop" + identifier[len("PROP"):]
    if identifier.startswith("Prop "):
        identifier = "Prop. " + identifier[len("Prop "):]
    if re.match(r"Prop\.\d{4}", identifier):  # missing space
        identifier = "Prop. " + identifier[len("Prop."):]
    if not identifier.startswith("Prop. "):
        identifier = "Prop. " + identifier
    # identify and correct the not-uncommon "2009/2010:87" pattern
    # (should be 2009/10:87). "1999/2000" is left alone since the
    # validation pattern below accepts "/2000" explicitly.
    m = re.search(r"(\d{4})/(\d{4}):(\d+)$", identifier)
    if m and m.group(2) != "2000" and int(m.group(1)) == int(m.group(2)) - 1:
        # Splice by position instead of str.replace: replace would
        # also rewrite a running number that happens to equal the year
        # (e.g. "2009/2010:2010") into a different, valid-looking one.
        identifier = (identifier[:m.start(2)] + m.group(2)[-2:] +
                      identifier[m.end(2):])
    if not re.match(r"^Prop\. (19|20)\d{2}(|/\d{2}|/2000):(|B ?|U ?)[1-9]\d{0,2}$", identifier):
        raise ValueError("Irregular identifier %s" % identifier)
    return Literal(identifier)
Beispiel #6
0
 def authorLink(self, m):
     """Turn an author wikilink match into an HTML anchor.

     :param m: regex match object; group 1 is the wiki page name and
               group 2 the visible link text.
     :returns: an ``<a class="awl">`` element as a string, pointing at
               the corresponding wiki.lagen.nu page.
     """
     page = util.ucfirst(m.group(1)).replace(' ', '_')
     href = 'http://wiki.lagen.nu/index.php/%s' % page
     label = m.group(2)
     return '<a class="awl" href="%s">%s</a>' % (href, label)
Beispiel #7
0
 def hiddenLink(self, m):
     """Turn a hidden category link match into an empty HTML anchor.

     The anchor has no visible text; it only carries a
     ``rel="dct:subject"`` relation to the concept URI.

     :param m: regex match object; group 1 is the concept name.
     :returns: a self-closing ``<a class="hcwl">`` element as a string.
     """
     concept = util.ucfirst(m.group(1)).replace(' ', '_')
     href = 'http://lagen.nu/concept/%s' % concept
     return '<a class="hcwl" rel="dct:subject" href="%s"/>' % href
Beispiel #8
0
 def sokord_uri(value):
     # Build the concept URI for a keyword: the configured base URL
     # followed by "concept/" and the keyword with its first letter
     # upper-cased and spaces turned into underscores.
     # NOTE(review): ``self`` is not a parameter, so this is presumably
     # a helper nested inside a method and closing over its ``self``.
     base = self.config.url
     concept = util.ucfirst(value).replace(' ', '_')
     return base + "concept/%s" % concept
Beispiel #9
0
    def parse_from_soup(self, soup):
        """Parse a decision document into metadata and a structured body.

        :param soup: the parsed HTML document; must support ``first``,
                     ``find`` and ``findAll`` (presumably a
                     BeautifulSoup 3 object -- TODO confirm).
        :returns: a dict with the keys ``meta`` (an RDF graph of the
                  document metadata), ``body`` (the result of
                  ``self.make_sektion``), ``lang`` (always ``'sv'``)
                  and ``uri`` (the constructed document URI).
        :raises ValueError: if the ``SG_Beslutsdatum`` meta tag is not
                            a ``YYYY-MM-DD`` date.
        """
        # Step 1: Find out basic metadata
        rubrik = soup.first("title").string
        beslutsdatum = soup.first(
            "meta", {'name': 'SG_Beslutsdatum'})['content']

        beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date()
        diarienummer = soup.first(
            "meta", {'name': 'SG_Dokumentbet'})['content']
        arendetyp = soup.first("meta", {'name': 'Subject'})['content']
        # the keywords for a document are contained in a meta tag
        # formatted like:
        #    <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\nåklagare">
        #
        # Transform this into an array like:
        #    [u'http://lagen.nu/concept/Hets_mot_folkgrupp',
        #     u'http://lagen.nu/concept/Meddelarfrihet',
        #     u'http://lagen.nu/concept/Åklagare']
        nyckelord = soup.first("meta", {'name': 'Keywords'})['content']
        # Strip each keyword *before* capitalizing it: otherwise a
        # keyword with leading whitespace keeps its lower-case initial.
        # Blank entries are skipped so that they don't produce a bare
        # "http://lagen.nu/concept/" subject.
        begrepp = ['http://lagen.nu/concept/%s' % util.ucfirst(
            x.strip()).replace(" ", "_")
            for x in nyckelord.split("\n") if x.strip()]

        # Step 2: Using the metadata, construct the canonical URI for this document
        uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT,
                                  'myndighet': 'jk',
                                  'dnr': diarienummer})
        # self.log.debug("URI: %s" % uri)

        # Step 3: Create a RDF graph of all our metadata (so far)
        g = Graph()
        g.bind('dct', self.ns['dct'])
        g.bind('rinfo', self.ns['rinfo'])
        g.bind('rinfoex', self.ns['rinfoex'])
        g.bind('xsd', util.ns['xsd'])
        g.add((
            URIRef(uri), self.ns['dct']['title'], Literal(rubrik, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'],
              Literal(beslutsdatum, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfo']['diarienummer'],
              Literal(diarienummer, lang="sv")))
        g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'],
              Literal(arendetyp, lang="sv")))
        # one dct:subject triple per keyword concept
        for s in begrepp:
            g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s)))

        g.add((URIRef(uri), self.ns['dct']['identifier'], Literal(
            "JK %s" % diarienummer, lang="sv")))
        g.add((URIRef(uri), RDF.type, self.rdf_type))

        # Step 4: Process the actual text of the document
        # The reference parser is stored on the instance; presumably
        # make_sektion uses it -- TODO confirm.
        self.parser = LegalRef(LegalRef.LAGRUM,
                               LegalRef.KORTLAGRUM,
                               LegalRef.RATTSFALL,
                               LegalRef.FORARBETEN)

        # newer documents have a semantic structure with h1 and h2
        # elements. Older have elements like <p class="Rubrik_1">. Try
        # to determine which one we're dealing with?
        tag = soup.find('a', {'name': "Start"})
        if tag:
            # self.log.debug("Using new-style document structure")
            elements = tag.parent.findAllNext()
        else:
            # self.log.debug("Using old-style document structure")
            elements = soup.findAll("p")
        # self.log.debug("Found %d elements" % len(elements))
        from collections import deque
        elements = deque(elements)
        body = self.make_sektion(elements, "Referat av beslut")

        # Step 5: Combine the metadata and the document, and return it
        doc = {'meta': g,
               'body': body,
               'lang': 'sv',
               'uri': uri}
        return doc
Beispiel #10
0
 def keyword_uri(self, keyword):
     """Return the lagen.nu ``begrepp`` URI for ``keyword``.

     The keyword's first letter is upper-cased, then spaces become
     underscores, double quotes become ``%22`` and ``»`` becomes
     ``//``.
     """
     slug = util.ucfirst(keyword)
     # applied in this order, one after the other
     for old, new in ((' ', '_'), ('"', "%22"), ("»", "//")):
         slug = slug.replace(old, new)
     return "https://lagen.nu/begrepp/" + slug