Exemple #1
0
 def downloaded_to_intermediate(self, basefile):
     """Convert the downloaded page for ``basefile`` to an intermediate text file.

     The text between ``<pre>`` and ``</pre>`` in the downloaded file is
     read, HTML entities are decoded, line endings are normalised to
     CRLF, short tags are stripped, and the result is written to
     ``self.store.intermediate_path(basefile)``.

     :param basefile: identifier of the document to process
     :returns: the intermediate file, opened for reading through
               ``codecs.open`` with ``self.source_encoding``; closing it
               is left to the caller
     :raises IckeSFS: if ``basefile`` starts with "N"
     :raises UpphavdForfattning: if ``self.config.keepexpired`` is false
                                 and the text carries an expiry date
                                 that lies in the past
     """
     # Check to see if this might not be a proper SFS at all
     # (from time to time, other agencies publish their stuff
     # in SFS - this seems to be handled by giving those
     # documents a SFS nummer on the form "N1992:31". Filter
     # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # FIXME: This code needs to be rewritten
         # NOTE(review): ``registry``, ``desc`` and ``doc`` are not bound
         # anywhere in this method, and ``t`` stays unbound when this
         # branch is taken, so the code after the handler cannot work on
         # this path. The intended fallback is not visible from here, so
         # the branch is left untouched - confirm and rewrite.
         baseuri = self.canonical_uri(basefile)
         if baseuri in registry:
             title = registry[baseuri].value(URIRef(baseuri),
                                             self.ns['dcterms'].title)
             desc.value(self.ns['dcterms'].title, title)
         desc.rel(self.ns['dcterms'].publisher,
                  self.lookup_resource("Regeringskansliet"))
         desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
         doc.body = Forfattning([Stycke(['Lagtext saknas'], id='S1')])
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning(
                     "%s is an expired SFS" % basefile,
                     dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             # presumably raised by the reader when the marker is not
             # found (TODO confirm); either way, rewind before the real
             # extraction below
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         import html
         unescape = html.unescape
     except (ImportError, AttributeError):
         # this is the old way; also used when an ``html`` module exists
         # but does not provide ``unescape``
         unescape = HTMLParser().unescape
     txt = unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
     # raw string: "\w" in a plain literal is an invalid escape sequence
     re_tags = re.compile(r"</?\w{1,3}>")
     txt = re_tags.sub('', txt)
     # add ending CRLF aids with producing better diffs
     txt += "\r\n"
     intermediate = self.store.intermediate_path(basefile)
     util.writefile(intermediate, txt, encoding=self.source_encoding)
     return codecs.open(intermediate, encoding=self.source_encoding)
Exemple #2
0
    def extract_metadata_header(self, reader, basefile):
        """Collect metadata from the "key: value" lines yielded by ``reader``.

        Each line containing a colon is split at the first colon, both
        halves are whitespace-normalised, and recognised keys are stored
        in the result under these names:

        * 'Rubrik' -> ``dcterms:title``
        * 'Övrigt' -> ``rdfs:comment``
        * 'SFS nr' -> ``dcterms:identifier`` (prefixed with "SFS ", and
          extended with " i lydelse enligt SFS ..." when a later
          'Ändring införd' names a different number)
        * 'Utfärdad' -> ``rpubl:utfardandedatum`` (first 10 characters)
        * 'Tidsbegränsad' -> ``rinfoex:tidsbegransad`` (first 10 characters)
        * 'Upphävd' -> ``rpubl:upphavandedatum`` (first 10 characters)
        * 'Departement/ myndighet' -> ``dcterms:creator``
        * 'Ändring införd' -> ``rpubl:konsolideringsunderlag`` and
          ``dcterms:issued``
        * 'Omtryck' -> ``rinfoex:omtryck``
        * 'Författningen har upphävts genom' -> ``rinfoex:upphavdAv``

        Lines without a colon are skipped. Unknown keys, and the three
        last keys when their value does not end in an SFS number, are
        logged as warnings.

        :param reader: iterable of header lines (strings)
        :param basefile: identifier of the document; used in messages
                         and as fallback for ``dcterms:issued``
        :returns: dict mapping the property names above to their values
        :raises UpphavdForfattning: if an 'Upphävd' date lies in the past
                                    and ``self.config.keepexpired`` is
                                    false
        :raises ValueError: from ``datetime.strptime`` if an 'Upphävd'
                            value does not start with a YYYY-MM-DD date
        """
        re_sfs = re.compile(r'(\d{4}:\d+)\s*$').search
        d = {}
        # Bound before the loop: 'Ändring införd' may be seen before (or
        # without) 'SFS nr', and the identifier is written only after
        # all lines have been read.
        identifier = None
        for line in reader:
            if ":" not in line:
                # Not a "key: value" line. Falling through would either
                # hit unbound names (first line) or re-run the previous
                # line's key and value.
                continue
            key, val = [util.normalize_space(x) for x in line.split(":", 1)]
            # trailing SFS number in the value, if any (used by the
            # three reference-type keys below)
            sfs_match = re_sfs(val)
            # Simple string literals
            if key == 'Rubrik':
                d["dcterms:title"] = val
            elif key == 'Övrigt':
                d["rdfs:comment"] = val
            elif key == 'SFS nr':
                identifier = "SFS " + val
                # delay actual writing to graph, since we may need to
                # amend this

            # date literals
            elif key == 'Utfärdad':
                d["rpubl:utfardandedatum"] = val[:10]
            elif key == 'Tidsbegränsad':
                # FIXME: Should be done by lagen.nu.SFS
                d["rinfoex:tidsbegransad"] = val[:10]
            elif key == 'Upphävd':
                dat = datetime.strptime(val[:10], '%Y-%m-%d')
                d["rpubl:upphavandedatum"] = val[:10]
                if not self.config.keepexpired and dat < datetime.today():
                    raise UpphavdForfattning(
                        "%s is an expired SFS" % basefile,
                        dummyfile=self.store.parsed_path(basefile))

            # urirefs
            elif key == 'Departement/ myndighet':
                # this is only needed because of SFS 1942:724, which
                # has "Försvarsdepartementet, Socialdepartementet"...
                if "departementet, " in val:
                    val = val.split(", ")[0]
                d["dcterms:creator"] = val
            elif key == 'Ändring införd' and sfs_match:
                uppdaterad = sfs_match.group(1)
                # not sure we need to add this, since parse_metadata
                # catches the same
                d["rpubl:konsolideringsunderlag"] = [
                    URIRef(self.canonical_uri(uppdaterad))
                ]
                if identifier and identifier != "SFS " + uppdaterad:
                    identifier += " i lydelse enligt SFS " + uppdaterad
                d["dcterms:issued"] = uppdaterad

            elif key == 'Omtryck' and sfs_match:
                d["rinfoex:omtryck"] = self.canonical_uri(sfs_match.group(1))
            elif key == 'Författningen har upphävts genom' and sfs_match:
                d["rinfoex:upphavdAv"] = self.canonical_uri(
                    sfs_match.group(1))
            else:
                self.log.warning('%s: Obekant nyckel [\'%s\']' %
                                 (basefile, key))

        if identifier is not None:
            d["dcterms:identifier"] = identifier
        else:
            # No 'SFS nr' line was seen; report it the same way a
            # missing title is reported instead of failing on an
            # unbound name.
            self.log.warning("%s: SFS nr saknas" % basefile)

        # FIXME: This is a misuse of the dcterms:issued prop in order
        # to mint the correct URI. We need to remove this somehow afterwards.
        if "dcterms:issued" not in d:
            d["dcterms:issued"] = basefile

        if "dcterms:title" not in d:
            self.log.warning("%s: Rubrik saknas" % basefile)
        return d
Exemple #3
0
    def extract_metadata_register(self, soup, basefile):
        """Extract metadata for every change act listed in the register page.

        The tables of ``soup.body`` are used as follows: the third table
        supplies the title of the base act, and every table from the
        fourth up to (but excluding) the last two describes one act. For
        each such table the "key: value" rows are collected and
        translated to properties in a per-document dict.

        :param soup: parsed register page; must support ``soup.body('table')``
                     and the ``find``/``find_parent``/``find_next_sibling``
                     calls used below (presumably a BeautifulSoup
                     document - confirm against caller)
        :param basefile: identifier of the base act; used in log and
                         exception messages and to sanity-check titles
        :returns: dict keyed by URI. Each act URI maps to a dict of
                  properties; URIs of referenced preparatory works and
                  CELEX documents get small dicts of their own. A
                  'Tidsbegränsad' row additionally sets the top-level
                  key ``rinfoex:tidsbegransad``.
        :raises UpphavdForfattning: if ``self.config.keepexpired`` is
                                    false and an 'Observera' or
                                    'Tidsbegränsad' row carries a date
                                    in the past
        """
        expiry_marker = 'Författningen är upphävd/skall upphävas: '
        d = {}
        tables = soup.body('table')
        rubrik = util.normalize_space(tables[2].text)
        changes = tables[3:-2]
        g = self.make_graph()  # used for qname lookup only
        for table in changes:
            sfsnr = table.find(text="SFS-nummer:").find_parent(
                "td").find_next_sibling("td").text.strip()
            docuri = self.canonical_uri(sfsnr)
            rowdict = {}
            parts = sfsnr.split(":")
            d[docuri] = {
                "dcterms:publisher": "Regeringskansliet",
                "rpubl:arsutgava": parts[0],
                "rpubl:beslutadAv": "Regeringskansliet",
                "rpubl:forfattningssamling": "SFS",
                "rpubl:lopnummer": parts[1]
            }
            for row in table('tr'):
                key = row.td.text.strip()
                if key.endswith(":"):
                    key = key[:-1]  # trim ending ":"
                elif key == '':
                    continue
                # FIXME: the \xa0 (&nbsp;) to space conversion should
                # maye be part of normalize_space?
                val = util.normalize_space(row('td')[1].text)
                if val == "":
                    continue
                rowdict[key] = val
            # first change does not contain a "Rubrik" key. Fake it.
            if 'Rubrik' not in rowdict and rubrik:
                rowdict['Rubrik'] = rubrik
                rubrik = None
            for key, val in rowdict.items():
                if key == 'SFS-nummer':
                    (arsutgava, lopnummer) = val.split(":")
                    d[docuri]["dcterms:identifier"] = "SFS " + val
                    d[docuri]["rpubl:arsutgava"] = arsutgava
                    d[docuri]["rpubl:lopnummer"] = lopnummer

                elif key == 'Ansvarig myndighet':
                    d[docuri]["rpubl:departement"] = val
                    # FIXME: Sanitize this in
                    # sanitize_metadata->sanitize_department, lookup
                    # resource in polish_metadata
                elif key == 'Rubrik':
                    # Change acts to Balkar never contain the SFS no
                    # of the Balk.
                    if basefile not in val and not val.endswith("balken"):
                        self.log.warning("%s: Base SFS %s not in title %r" %
                                         (basefile, basefile, val))
                    d[docuri]["dcterms:title"] = val
                    d[docuri]["rdf:type"] = self._forfattningstyp(val)
                elif key == 'Observera':
                    if not self.config.keepexpired:
                        if expiry_marker in val:
                            # The date is the 10 characters following the
                            # marker, wherever in the text the marker is.
                            start = val.index(expiry_marker) + len(
                                expiry_marker)
                            dateval = datetime.strptime(
                                val[start:start + 10], '%Y-%m-%d')
                            if dateval < datetime.today():
                                raise UpphavdForfattning(
                                    "%s is an expired SFS" % basefile,
                                    dummyfile=self.store.parsed_path(basefile))
                    d[docuri]["rdfs:comment"] = val
                elif key == 'Ikraft':
                    d[docuri]["rpubl:ikrafttradandedatum"] = val[:10]
                elif key == 'Omfattning':
                    # First, create rdf statements for every
                    # single modified section we can find
                    for changecat in val.split('; '):
                        if changecat.startswith(('ändr.', 'ändr ',
                                                 'ändring ')):
                            pred = self.ns['rpubl'].ersatter
                        elif changecat.startswith(('upph.', 'upp.',
                                                   'utgår')):
                            pred = self.ns['rpubl'].upphaver
                        elif changecat.startswith(('ny', 'ikrafttr.',
                                                   'ikrafftr.', 'ikraftr.',
                                                   'ikraftträd.',
                                                   'tillägg')):
                            pred = self.ns['rpubl'].inforsI
                        elif (changecat.startswith(('nuvarande',
                                                    'rubr. närmast'))
                              or changecat
                              in ('begr. giltighet', 'Omtryck', 'omtryck',
                                  'forts.giltighet', 'forts. giltighet',
                                  'forts. giltighet av vissa best.')):
                            # some of these changecats are renames, eg
                            # "nuvarande 2, 3, 4, 5 §§ betecknas 10,
                            # 11, 12, 13, 14, 15 §§;" or
                            # "rubr. närmast efter 1 § sätts närmast
                            # före 10 §"
                            pred = None
                        else:
                            self.log.warning("%s: Okänd omfattningstyp %r" %
                                             (basefile, changecat))
                            pred = None
                        # Point the parser at this act while parsing, and
                        # put the previous value back even if parsing
                        # raises.
                        old_currenturl = self.lagrum_parser._currenturl
                        self.lagrum_parser._currenturl = docuri
                        try:
                            for node in self.lagrum_parser.parse_string(
                                    changecat, pred):
                                if hasattr(node, 'predicate'):
                                    qname = g.qname(node.predicate)
                                    d[docuri][qname] = node.uri
                        finally:
                            self.lagrum_parser._currenturl = old_currenturl
                    # Secondly, preserve the entire text
                    d[docuri]["rpubl:andrar"] = val
                elif key == 'Förarbeten':
                    for node in self.forarbete_parser.parse_string(
                            val, "rpubl:forarbete"):
                        if hasattr(node, 'uri'):
                            if "rpubl:forarbete" not in d[docuri]:
                                d[docuri]["rpubl:forarbete"] = []
                            d[docuri]["rpubl:forarbete"].append(node.uri)
                            d[node.uri] = {"dcterms:identifier": str(node)}
                elif key == 'CELEX-nr':
                    for celex in re.findall(r'3\d{2,4}[LR]\d{4}', val):
                        b = BNode()
                        cg = Graph()
                        cg.add((b, RPUBL.celexNummer, Literal(celex)))
                        celexuri = self.minter.space.coin_uri(cg.resource(b))
                        if "rpubl:genomforDirektiv" not in d[docuri]:
                            d[docuri]["rpubl:genomforDirektiv"] = []
                        d[docuri]["rpubl:genomforDirektiv"].append(celexuri)
                        d[celexuri] = {"rpubl:celexNummer": celex}
                elif key == 'Tidsbegränsad':
                    # NOTE(review): this writes to the top level of ``d``
                    # rather than to ``d[docuri]`` like every other
                    # property here - looks unintended, but callers may
                    # rely on it, so it is kept as is. Confirm.
                    d["rinfoex:tidsbegransad"] = val[:10]
                    expdate = datetime.strptime(val[:10], '%Y-%m-%d')
                    if expdate < datetime.today():
                        if not self.config.keepexpired:
                            raise UpphavdForfattning(
                                "%s is expired (time-limited) SFS" % basefile,
                                dummyfile=self.store.parsed_path(basefile))
                else:
                    # both values must be passed to % as one tuple
                    self.log.warning('%s: Obekant nyckel [\'%s\']' %
                                     (basefile, key))
            utfardandedatum = self._find_utfardandedatum(sfsnr)
            if utfardandedatum:
                d[docuri]["rpubl:utfardandedatum"] = utfardandedatum
        return d