def categoryLink(self, m):
    uri = 'http://lagen.nu/concept/%s' % util.ucfirst(
        m.group(1)).replace(' ', '_')
    if len(m.groups()) == 2:
        # lcwl = "Labeled Category WikiLink"
        return '<a class="lcwl" href="%s">%s</a>' % (uri, m.group(2))
    else:
        # cwl = "Category WikiLink"
        return '<a class="cwl" href="%s">%s</a>' % (uri, m.group(1))

def capitalizedLink(self, m):
    if m.group(1).startswith('SFS/'):
        uri = 'http://rinfo.lagrummet.se/publ/%s' % m.group(1).lower()
    else:
        uri = 'http://lagen.nu/concept/%s' % util.ucfirst(
            m.group(1)).replace(' ', '_')
    if len(m.groups()) == 3:
        # lwl = "Labeled WikiLink"
        return '<a class="lwl" href="%s">%s%s</a>' % (uri, m.group(2), m.group(3))
    else:
        return '<a class="wl" href="%s">%s%s</a>' % (uri, m.group(1), m.group(2))

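# Hedged sketch (not part of the original module): categoryLink and
# capitalizedLink are written as re.sub callbacks. The branching on
# len(m.groups()) suggests that each callback is registered for two separate
# patterns, one unlabeled and one labeled. The regexes below are illustrative
# assumptions only, not the parser's actual wiki-markup rules:
#
#   cat_re    = re.compile(r'\[\[Kategori:([^|\]]+)\]\]')            # 1 group  -> class "cwl"
#   lblcat_re = re.compile(r'\[\[Kategori:([^|\]]+)\|([^\]]+)\]\]')  # 2 groups -> class "lcwl"
#   html = lblcat_re.sub(self.categoryLink, cat_re.sub(self.categoryLink, wikitext))
#
# e.g. "[[Kategori:Avtalsrätt]]" would become
# '<a class="cwl" href="http://lagen.nu/concept/Avtalsrätt">Avtalsrätt</a>'.
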
def infer_metadata(self, resource, basefile):
    # remove the bogus dcterms:issued thing that we only added to
    # aid URI generation. NB: This is removed in the superclass'
    # postprocess_doc as well, because for this lagen.nu-derived
    # class it needs to be done at this point, but for use of the
    # superclass directly, it needs to be done at some point.
    for o in resource.objects(DCTERMS.issued):
        if not o.datatype:
            resource.remove(DCTERMS.issued, o)
    sameas_uri = self.sameas_minter.space.coin_uri(resource)
    resource.add(OWL.sameAs, URIRef(sameas_uri))
    resource.graph.add((URIRef(self.canonical_uri(basefile, True)),
                        OWL.sameAs, resource.identifier))
    # then find each rpubl:konsolideringsunderlag, and create
    # owl:sameAs for them as well
    for subresource in resource.objects(RPUBL.konsolideringsunderlag):
        # sometimes there'll be a rpubl:konsolideringsunderlag to
        # a resource URI but no actual data about that
        # resource. This seems to happen if SFST is updated but
        # SFSR is not. In those cases we can't generate an
        # owl:sameAs URI since we have no other data about the
        # resource.
        if subresource.value(RDF.type):
            uri = self.sameas_minter.space.coin_uri(subresource)
            subresource.add(OWL.sameAs, URIRef(uri))
    desc = Describer(resource.graph, resource.identifier)
    de = DocumentEntry(self.store.documententry_path(basefile))
    if de.orig_updated:
        desc.value(RINFOEX.senastHamtad, de.orig_updated)
    if de.orig_checked:
        desc.value(RINFOEX.senastKontrollerad, de.orig_checked)
    rooturi = URIRef(desc.getrel(RPUBL.konsoliderar))
    v = self.commondata.value(rooturi, DCTERMS.alternate, any=True)
    if v:
        desc.value(DCTERMS.alternate, v)
    v = self.commondata.value(rooturi, RDFS.label, any=True)
    if v:
        # don't include labels if they're essentially the same as
        # dcterms:title (legalref needs it to be able to parse
        # refs to laws that typically don't include SFS numbers,
        # so that's why they're in sfs.ttl)
        basetitle = str(resource.value(DCTERMS.title)).rsplit(" (")[0]
        if not v.startswith(basetitle.lower()):
            desc.value(RDFS.label, util.ucfirst(v))

def prop_sanitize_identifier(identifier):
    if not identifier:
        return identifier  # allow infer_identifier to do its magic later
    if identifier.startswith("prop"):
        identifier = util.ucfirst(identifier)
    if identifier.startswith("PROP"):
        identifier = identifier.replace("PROP", "Prop")
    if identifier.startswith("Prop "):
        identifier = identifier.replace("Prop ", "Prop. ")
    if re.match(r"Prop\.\d{4}", identifier):  # missing space
        identifier = identifier.replace("Prop.", "Prop. ")
    if "\xa0" in identifier:  # non-breaking space
        identifier = identifier.replace("\xa0", " ")
    if not identifier.startswith("Prop. "):
        identifier = "Prop. " + identifier
    # identify and correct the not-uncommon "2009/2010:87" pattern
    # (should be 2009/10:87)
    m = re.search(r"(\d{4})/(\d{4}):(\d+)$", identifier)
    if m and m.group(2) != "2000" and int(m.group(1)) == int(m.group(2)) - 1:
        identifier = identifier.replace(m.group(2), m.group(2)[-2:])
    if not re.match(r"^Prop\. (19|20)\d{2}(|/\d{2}|/2000):(|B ?|U ?)[1-9]\d{0,2}$",
                    identifier):
        raise ValueError("Irregular identifier %s" % identifier)
    return Literal(identifier)

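# Illustrative examples (not in the original source) of how the normalization
# rules above behave; the return value is an rdflib Literal:
#
#   prop_sanitize_identifier("prop 1997/98:44")   -> Literal("Prop. 1997/98:44")
#   prop_sanitize_identifier("Prop.2005/06:173")  -> Literal("Prop. 2005/06:173")
#   prop_sanitize_identifier("2009/2010:87")      -> Literal("Prop. 2009/10:87")
#   prop_sanitize_identifier("Prop. 13/14:1")     -> raises ValueError
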
def authorLink(self, m):
    uri = 'http://wiki.lagen.nu/index.php/%s' % util.ucfirst(
        m.group(1)).replace(' ', '_')
    return '<a class="awl" href="%s">%s</a>' % (uri, m.group(2))

def hiddenLink(self, m):
    uri = 'http://lagen.nu/concept/%s' % util.ucfirst(
        m.group(1)).replace(' ', '_')
    return '<a class="hcwl" rel="dct:subject" href="%s"/>' % uri

def sokord_uri(value):
    # nested helper: relies on 'self' being available from the enclosing scope
    return self.config.url + "concept/%s" % util.ucfirst(value).replace(' ', '_')

def parse_from_soup(self, soup):
    # Step 1: Find out basic metadata
    rubrik = soup.first("title").string
    beslutsdatum = soup.first(
        "meta", {'name': 'SG_Beslutsdatum'})['content']
    beslutsdatum = datetime.strptime(beslutsdatum, "%Y-%m-%d").date()
    diarienummer = soup.first(
        "meta", {'name': 'SG_Dokumentbet'})['content']
    arendetyp = soup.first("meta", {'name': 'Subject'})['content']

    # the keywords for a document are contained in a meta tag
    # formatted like:
    #   <meta name="Keywords" content="hets_mot_folkgrupp\nmeddelarfrihet\nåklagare">
    #
    # Transform this into an array like:
    #   [u'http://lagen.nu/concept/Hets_mot_folkgrupp',
    #    u'http://lagen.nu/concept/Meddelarfrihet',
    #    u'http://lagen.nu/concept/Åklagare']
    nyckelord = soup.first("meta", {'name': 'Keywords'})['content']
    begrepp = ['http://lagen.nu/concept/%s' % util.ucfirst(
        x).strip().replace(" ", "_") for x in nyckelord.split("\n")]

    # Step 2: Using the metadata, construct the canonical URI for this document
    uri = LegalURI.construct({'type': LegalRef.MYNDIGHETSBESLUT,
                              'myndighet': 'jk',
                              'dnr': diarienummer})
    # self.log.debug("URI: %s" % uri)

    # Step 3: Create an RDF graph of all our metadata (so far)
    g = Graph()
    g.bind('dct', self.ns['dct'])
    g.bind('rinfo', self.ns['rinfo'])
    g.bind('rinfoex', self.ns['rinfoex'])
    g.bind('xsd', util.ns['xsd'])
    g.add((URIRef(uri), self.ns['dct']['title'],
           Literal(rubrik, lang="sv")))
    g.add((URIRef(uri), self.ns['rinfo']['beslutsdatum'],
           Literal(beslutsdatum, lang="sv")))
    g.add((URIRef(uri), self.ns['rinfo']['diarienummer'],
           Literal(diarienummer, lang="sv")))
    g.add((URIRef(uri), self.ns['rinfoex']['arendetyp'],
           Literal(arendetyp, lang="sv")))
    for s in begrepp:
        g.add((URIRef(uri), self.ns['dct']['subject'], URIRef(s)))
    g.add((URIRef(uri), self.ns['dct']['identifier'],
           Literal("JK %s" % diarienummer, lang="sv")))
    g.add((URIRef(uri), RDF.type, self.rdf_type))

    # Step 4: Process the actual text of the document
    self.parser = LegalRef(LegalRef.LAGRUM, LegalRef.KORTLAGRUM,
                           LegalRef.RATTSFALL, LegalRef.FORARBETEN)
    # newer documents have a semantic structure with h1 and h2
    # elements. Older ones have elements like <p class="Rubrik_1">.
    # Try to determine which one we're dealing with.
    tag = soup.find('a', {'name': "Start"})
    if tag:
        # self.log.debug("Using new-style document structure")
        elements = tag.parent.findAllNext()
    else:
        # self.log.debug("Using old-style document structure")
        elements = soup.findAll("p")
    # self.log.debug("Found %d elements" % len(elements))
    from collections import deque
    elements = deque(elements)
    body = self.make_sektion(elements, "Referat av beslut")

    # Step 5: Combine the metadata and the document, and return it
    doc = {'meta': g,
           'body': body,
           'lang': 'sv',
           'uri': uri}
    return doc

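# Hedged sketch (not part of the original source) of the dict returned by
# parse_from_soup for a typical JK decision; the exact URI depends on
# LegalURI.construct and the document's diarienummer, and the body structure
# on make_sektion:
#
#   {'uri':  <canonical URI from LegalURI.construct>,
#    'lang': 'sv',
#    'meta': <rdflib Graph with dct:title, rinfo:beslutsdatum, dct:subject, ...>,
#    'body': <nested section structure built by self.make_sektion>}
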
def keyword_uri(self, keyword):
    baseuri = "https://lagen.nu/begrepp/"
    return baseuri + (util.ucfirst(keyword)
                      .replace(' ', '_')
                      .replace('"', "%22")
                      .replace("»", "//"))
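
# Illustrative examples (not in the original source) of the mapping above,
# shown as calls on an instance of the class that defines keyword_uri:
#
#   self.keyword_uri("negativ rättskraft")
#       -> "https://lagen.nu/begrepp/Negativ_rättskraft"
#   self.keyword_uri('prövningstillstånd "PT"')
#       -> "https://lagen.nu/begrepp/Prövningstillstånd_%22PT%22"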