Esempio n. 1
0
    def extract_head(self, fp, basefile):
        """Parse the SFSR register and split off the header of the law text.

        The register (the HTML page listing all changes to the statute
        text) is read from the path derived from the downloaded path of
        ``basefile`` and parsed with BeautifulSoup. The first 2048
        characters/bytes of ``fp`` are then read, and ``fp`` is left
        positioned just after the header/body separator (four
        consecutive CRLF pairs).

        :param fp: Open file object for the law text; may yield either
                   text or bytes. Assumed to be positioned at the start
                   of the file, since the final seek is absolute.
        :param basefile: Identifier used to look up the register file.
        :returns: A tuple ``(soup, header_iterator)`` where ``soup`` is
                  the parsed register and ``header_iterator`` iterates
                  over the paragraphs of the text header.
        :raises InteExisterandeSFS: If the register page contains the
                                    "no search hits" message.
        :raises ValueError: If the separator is not found within the
                            first 2048 characters of ``fp``.
        """
        # NB: We should really call self.store.register_path, but that
        # custom func isn't mocked by ferenda.testutil.RepoTester,
        # and downloaded_path is. So we call that one and munge it.
        filename = self.store.downloaded_path(basefile).replace(
            "/downloaded/", "/register/")
        with codecs.open(filename, encoding=self.source_encoding) as rfp:
            soup = bs4.BeautifulSoup(rfp.read(), "lxml")
        # do we really have a registry?
        notfound = soup.find(text="Sökningen gav ingen träff!")
        if notfound:
            raise InteExisterandeSFS(str(notfound))

        separator = "\r\n" * 4
        textheader = fp.read(2048)
        # Depending on whether the fp is opened through standard
        # open() or bz2.BZ2File() in self.parse_open(), it might
        # return bytes or unicode strings. Just roll with it.
        from_bytes = not isinstance(textheader, str)
        if from_bytes:
            # A fixed-size byte read may end in the middle of a
            # multi-byte character. An incremental decoder holds back
            # such an incomplete tail instead of raising, while still
            # rejecting genuinely invalid input.
            decoder = codecs.getincrementaldecoder(self.source_encoding)()
            textheader = decoder.decode(textheader)
        body_start = textheader.index(separator) + len(separator)
        if from_bytes:
            # Byte streams seek in bytes, not characters: convert the
            # character offset by re-encoding the prefix, so non-ASCII
            # header text in a multi-byte encoding doesn't make us land
            # short of the body.
            offset = len(textheader[:body_start].encode(self.source_encoding))
        else:
            # NOTE(review): for text streams the character offset is
            # passed through unchanged, as before -- confirm that this
            # is what the text-mode fp expects.
            offset = body_start
        fp.seek(offset)
        reader = TextReader(string=textheader, linesep=TextReader.DOS)
        subreader = reader.getreader(reader.readchunk, reader.linesep * 4)
        return soup, subreader.getiterator(subreader.readparagraph)
Esempio n. 2
0
    def extract_head(self, fp, basefile):
        """Parse the SFSR register and split off the header of the law text.

        The register (the HTML page listing all changes to the statute
        text) is read from the path derived from the downloaded path of
        ``basefile`` and parsed with BeautifulSoup. The first 2048
        characters/bytes of ``fp`` are then read, and ``fp`` is left
        positioned just after the header/body separator (four
        consecutive CRLF pairs).

        :param fp: Open file object for the law text; may yield either
                   text or bytes. Assumed to be positioned at the start
                   of the file, since the final seek is absolute.
        :param basefile: Identifier used to look up the register file.
        :returns: A tuple ``(soup, header_iterator)`` where ``soup`` is
                  the parsed register and ``header_iterator`` iterates
                  over the paragraphs of the text header.
        :raises InteExisterandeSFS: If the register page contains the
                                    "no search hits" message.
        :raises ValueError: If the separator is not found within the
                            first 2048 characters of ``fp``.
        """
        # NB: We should really call self.store.register_path, but that
        # custom func isn't mocked by ferenda.testutil.RepoTester,
        # and downloaded_path is. So we call that one and munge it.
        filename = self.store.downloaded_path(basefile).replace(
            "/downloaded/", "/register/")
        with codecs.open(filename, encoding=self.source_encoding) as rfp:
            soup = bs4.BeautifulSoup(rfp.read(), "lxml")
        # do we really have a registry?
        notfound = soup.find(text="Sökningen gav ingen träff!")
        if notfound:
            raise InteExisterandeSFS(str(notfound))

        header_separator = "\r\n" * 4
        textheader = fp.read(2048)
        # Depending on whether the fp is opened through standard
        # open() or bz2.BZ2File() in self.parse_open(), it might
        # return bytes or unicode strings. Just roll with it.
        is_binary = not isinstance(textheader, str)
        if is_binary:
            # Reading a fixed number of bytes can split a multi-byte
            # character; the incremental decoder buffers an incomplete
            # trailing sequence rather than raising on it.
            decoder = codecs.getincrementaldecoder(self.source_encoding)()
            textheader = decoder.decode(textheader)
        body_start = (textheader.index(header_separator)
                      + len(header_separator))
        if is_binary:
            # seek() on a byte stream counts bytes, so translate the
            # character offset into the encoded length of the prefix.
            # Otherwise non-ASCII header text in a multi-byte encoding
            # would leave fp positioned before the body.
            seek_to = len(
                textheader[:body_start].encode(self.source_encoding))
        else:
            # NOTE(review): text streams keep the original
            # character-based offset -- confirm against how
            # parse_open() opens text files.
            seek_to = body_start
        fp.seek(seek_to)
        reader = TextReader(string=textheader,
                            linesep=TextReader.DOS)
        subreader = reader.getreader(
            reader.readchunk, reader.linesep * 4)
        return soup, subreader.getiterator(subreader.readparagraph)