Esempio n. 1
0
 def downloaded_to_intermediate(self, basefile):
     """Convert the downloaded page for ``basefile`` to intermediate text.

     Reads the downloaded file, extracts the text between ``<pre>`` and
     ``</pre>``, unescapes HTML entities, converts line endings to CRLF
     (when the text contains no CRLF at all), strips short tags (one to
     three word characters, e.g. ``<b>``/``</i>``), appends a final CRLF
     and writes the result to ``self.store.intermediate_path(basefile)``.

     :param basefile: SFS number identifying the document
     :returns: the intermediate file, opened for reading through
               ``codecs.open`` with ``self.source_encoding``; the caller
               is responsible for closing it
     :raises IckeSFS: if ``basefile`` starts with ``'N'``
     :raises UpphavdForfattning: if ``self.config.keepexpired`` is false
             and the page carries a revocation date earlier than today
     :raises IOError: if the downloaded file cannot be read
     """
     # Check to see if this might not be a proper SFS at all
     # (from time to time, other agencies publish their stuff
     # in SFS - this seems to be handled by giving those
     # documents a SFS nummer on the form "N1992:31". Filter
     # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # Everything below needs the reader, so there is nothing
         # meaningful to return. The placeholder-building code that used
         # to live here referenced names (registry, desc, doc) that this
         # method never binds, and then fell through to use the unbound
         # reader. Re-raise the original error so the failure is
         # reported for what it is.
         # TODO: if a "Lagtext saknas" placeholder document is still
         # wanted, it has to be produced by the caller.
         raise
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning(
                     "%s is an expired SFS" % basefile,
                     dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             # NOTE(review): presumably raised by the reader when the
             # revocation marker is absent - confirm against TextReader.
             # Either way, rewind before looking for the body text.
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         # Importing the name (rather than the module) makes a html
         # module that lacks unescape take the fallback as well.
         from html import unescape
     except ImportError:
         # this is the old way.
         unescape = HTMLParser().unescape
     txt = unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
     # Raw string: "\w" in a plain literal is an invalid escape sequence.
     txt = re.sub(r"</?\w{1,3}>", '', txt)
     # add ending CRLF aids with producing better diffs
     txt += "\r\n"
     intermediate = self.store.intermediate_path(basefile)
     util.writefile(intermediate, txt, encoding=self.source_encoding)
     return codecs.open(intermediate, encoding=self.source_encoding)
Esempio n. 2
0
 def downloaded_to_intermediate(self, basefile, attachment=None):
     """Convert the downloaded page for ``basefile`` to intermediate text.

     Reads the downloaded file, extracts the text between ``<pre>`` and
     ``</pre>``, unescapes HTML entities, converts line endings to CRLF
     (when the text contains no CRLF at all), strips short tags (one to
     three word characters, e.g. ``<b>``/``</i>``), appends a final CRLF
     and writes the result to ``self.store.intermediate_path(basefile)``.

     :param basefile: SFS number identifying the document
     :param attachment: accepted but not used by this method
     :returns: the intermediate file, opened for reading through
               ``codecs.open`` with ``self.source_encoding``; the caller
               is responsible for closing it
     :raises IckeSFS: if ``basefile`` starts with ``'N'``
     :raises UpphavdForfattning: if ``self.config.keepexpired`` is false
             and the page carries a revocation date earlier than today
     :raises IOError: if the downloaded file cannot be read
     """
     # Check to see if this might not be a proper SFS at all
     # (from time to time, other agencies publish their stuff
     # in SFS - this seems to be handled by giving those
     # documents a SFS nummer on the form "N1992:31". Filter
     # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # Everything below needs the reader, so there is nothing
         # meaningful to return. The placeholder-building code that used
         # to live here referenced names (registry, desc, doc) that this
         # method never binds, and then fell through to use the unbound
         # reader. Re-raise the original error so the failure is
         # reported for what it is.
         # TODO: if a "Lagtext saknas" placeholder document is still
         # wanted, it has to be produced by the caller.
         raise
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning("%s is an expired SFS" % basefile,
                                          dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             # NOTE(review): presumably raised by the reader when the
             # revocation marker is absent - confirm against TextReader.
             # Either way, rewind before looking for the body text.
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         # Importing the name (rather than the module) makes a html
         # module that lacks unescape take the fallback as well.
         from html import unescape
     except ImportError:
         # this is the old way.
         unescape = HTMLParser().unescape
     txt = unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
     # Raw string: "\w" in a plain literal is an invalid escape sequence.
     txt = re.sub(r"</?\w{1,3}>", '', txt)
     # add ending CRLF aids with producing better diffs
     txt += "\r\n"
     intermediate = self.store.intermediate_path(basefile)
     util.writefile(intermediate, txt, encoding=self.source_encoding)
     return codecs.open(intermediate, encoding=self.source_encoding)