def extract_head(self, fp, basefile):
    """Parse out the SFSR register and the header of the law text.

    The SFSR register is the HTML document that contains all
    amendments to the law text. It is read from the path derived
    from ``self.store.downloaded_path(basefile)``. The header is
    taken from the first 2048 units read from ``fp``, up to the first
    run of four consecutive CRLF line endings.

    :param fp: open file object for the law text; ``read()`` may
               return either ``str`` or ``bytes``. It is left
               positioned just after the header separator.
    :param basefile: identifier handed to
                     ``self.store.downloaded_path``
    :returns: a ``(soup, paragraphs)`` tuple, where ``soup`` is the
              parsed register document and ``paragraphs`` is the
              iterator obtained from the header sub-reader's
              ``readparagraph``
    :raises InteExisterandeSFS: if the register document contains
            the "no hits" message
    :raises ValueError: if the header separator is not found in the
            chunk read from ``fp``
    """
    # NB: We should really call self.store.register_path, but that
    # custom func isn't mocked by ferenda.testutil.RepoTester,
    # and downloaded_path is. So we call that one and munge it.
    downloaded = self.store.downloaded_path(basefile)
    register_path = downloaded.replace("/downloaded/", "/register/")
    with codecs.open(register_path,
                     encoding=self.source_encoding) as register_fp:
        register_html = register_fp.read()
        soup = bs4.BeautifulSoup(register_html, "lxml")

    # Do we really have a register? Bail out before touching fp if
    # the search page reports that nothing was found.
    missing = soup.find(text="Sökningen gav ingen träff!")
    if missing:
        raise InteExisterandeSFS(str(missing))

    head = fp.read(2048)
    if not isinstance(head, str):
        # Depending on whether the fp is opened through standard
        # open() or bz2.BZ2File() in self.parse_open(), it might
        # return bytes or unicode strings. This seems to be a
        # problem in BZ2File (or how we use it). Just roll with it.
        head = head.decode(self.source_encoding)

    # Four CRLFs in a row end the header; leave fp positioned right
    # after them.
    # NOTE(review): the offset is computed on the decoded text, so it
    # only equals the byte offset for single-byte encodings - confirm
    # that source_encoding always is one.
    separator = "\r\n" * 4
    body_start = head.index(separator) + len(separator)
    fp.seek(body_start)

    reader = TextReader(string=head, linesep=TextReader.DOS)
    header_reader = reader.getreader(reader.readchunk, reader.linesep * 4)
    paragraphs = header_reader.getiterator(header_reader.readparagraph)
    return soup, paragraphs
def extract_head(self, fp, basefile):
    """Parse out the SFSR register and the header of the law text.

    The SFSR register is the HTML document that contains all
    amendments to the law text. It is read from the path derived
    from ``self.store.downloaded_path(basefile)``. The header is
    taken from the first 2048 units read from ``fp``, up to the first
    run of four consecutive CRLF line endings.

    :param fp: open file object for the law text; ``read()`` may
               return either ``str`` or ``bytes``. It is left
               positioned just after the header separator.
    :param basefile: identifier handed to
                     ``self.store.downloaded_path``
    :returns: a ``(soup, paragraphs)`` tuple, where ``soup`` is the
              parsed register document and ``paragraphs`` is the
              iterator obtained from the header sub-reader's
              ``readparagraph``
    :raises InteExisterandeSFS: if the register document contains
            the "no hits" message
    :raises ValueError: if the header separator is not found in the
            chunk read from ``fp``
    """
    # NB: We should really call self.store.register_path, but that
    # custom func isn't mocked by ferenda.testutil.RepoTester,
    # and downloaded_path is. So we call that one and munge it.
    downloaded = self.store.downloaded_path(basefile)
    register_path = downloaded.replace("/downloaded/", "/register/")
    with codecs.open(register_path,
                     encoding=self.source_encoding) as register_fp:
        register_html = register_fp.read()
        soup = bs4.BeautifulSoup(register_html, "lxml")

    # Do we really have a register? Bail out before touching fp if
    # the search page reports that nothing was found.
    missing = soup.find(text="Sökningen gav ingen träff!")
    if missing:
        raise InteExisterandeSFS(str(missing))

    head = fp.read(2048)
    if not isinstance(head, str):
        # Depending on whether the fp is opened through standard
        # open() or bz2.BZ2File() in self.parse_open(), it might
        # return bytes or unicode strings. This seems to be a
        # problem in BZ2File (or how we use it). Just roll with it.
        head = head.decode(self.source_encoding)

    # Four CRLFs in a row end the header; leave fp positioned right
    # after them.
    # NOTE(review): the offset is computed on the decoded text, so it
    # only equals the byte offset for single-byte encodings - confirm
    # that source_encoding always is one.
    separator = "\r\n" * 4
    body_start = head.index(separator) + len(separator)
    fp.seek(body_start)

    reader = TextReader(string=head, linesep=TextReader.DOS)
    header_reader = reader.getreader(reader.readchunk, reader.linesep * 4)
    paragraphs = header_reader.getiterator(header_reader.readparagraph)
    return soup, paragraphs