def download(self):
    """Split the site news file into one downloaded file per news item.

    The file named by ``self.config.newsfile`` (resolved through
    ``self.resourceloader.filename``) is read as UTF-8. Each line for
    which ``self.re_news_subjectline`` returns a match starts a new
    item; group 1 of the match is parsed as ``YYYY-MM-DD HH:MM:SS`` and
    its integer epoch timestamp is used as the item's basefile. The
    subject line and all following lines up to the next subject line
    are written to a temporary file, which then replaces
    ``self.store.downloaded_path(basefile)`` through
    ``util.replace_if_different``. An info message is logged whenever
    that call returns a true value.

    Lines that appear before the first subject line belong to no item
    and are skipped. A news file without any subject line produces no
    output at all.
    """
    newsfile = self.resourceloader.filename(self.config.newsfile)
    ofp = temppath = path = basefile = None
    with codecs.open(newsfile, encoding="utf-8") as fp:
        for line in fp:
            m = self.re_news_subjectline(line)
            if m:
                if ofp:
                    # A new subject line terminates the previous item.
                    ofp.close()
                    if util.replace_if_different(temppath, path):
                        self.log.info("%s: creating news item" % basefile)
                # The parsed datetime is naive, so timestamp() interprets
                # it in the local timezone.
                d = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                basefile = str(int(d.timestamp()))
                path = self.store.downloaded_path(basefile)
                # Prepare the destination directory before creating the
                # temp file, so a failure here leaves nothing behind.
                util.ensure_dir(path)
                fileno, temppath = tempfile.mkstemp(text=True)
                # The input is decoded as UTF-8; write it back with the
                # same encoding instead of the locale's default.
                ofp = os.fdopen(fileno, "w", encoding="utf-8")
            if ofp is None:
                # Text before the first subject line has no item to go to.
                continue
            ofp.write(line)
    if ofp is None:
        # Empty file, or no subject line found: nothing to finish.
        return
    ofp.close()
    if util.replace_if_different(temppath, path):
        self.log.info("%s: download OK (creating news item)" % basefile)
def test_replace_if_different(self):
    # Walks util.replace_if_different through four scenarios, checking
    # its return value and the resulting state of the files each time.
    src, dst = self.fname, self.fname2
    replace = util.replace_if_different

    # Scenario 1: the destination is missing, so the source is moved there.
    util.writefile(src, "Hello")
    self.assertTrue(replace(src, dst))
    self.assertFalse(os.path.exists(src))
    self.assertTrue(os.path.exists(dst))

    # Scenario 2: the destination holds other content and is overwritten.
    util.writefile(src, "Hello (different)")
    self.assertTrue(replace(src, dst))
    self.assertFalse(os.path.exists(src))
    self.assertEqual("Hello (different)", util.readfile(dst))

    # Scenario 3: both files have the same content; the call reports
    # False and the source is removed.
    util.writefile(src, "Hello (different)")
    self.assertFalse(replace(src, dst))
    self.assertFalse(os.path.exists(src))

    # Scenario 4: the destination differs and an archive path is given;
    # the old destination content ends up in the archive file.
    incoming = self.dname + "/new.txt"
    archived = self.dname + "/archive.txt"
    util.writefile(incoming, "Hello (archiving)")
    self.assertTrue(replace(incoming, dst, archived))
    self.assertFalse(os.path.exists(incoming))
    self.assertEqual("Hello (archiving)", util.readfile(dst))
    self.assertEqual("Hello (different)", util.readfile(archived))
def download(self, basefile=None):
    """Scrape the listing at ``self.start_url`` and download each entry.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Inside the element with id ``mainarea``, every text node equal to
    ``NUMMER`` marks one entry made up of three consecutive sibling
    divs: number, type (label ``TYP``) and title (label ``RUBRIK``).
    Entries whose second or third div does not carry the expected label
    are logged as errors and skipped.

    For each accepted entry the three divs are written as UTF-8 to a
    ``.snippet.html`` file next to the entry's downloaded path (via
    ``util.replace_if_different``), and then ``self.download_single``
    is called for the entry's number.

    :param basefile: Not used by this implementation.
    """
    # Imported here so the method does not rely on the module's import
    # list providing these names.
    import os
    from tempfile import mkstemp

    soup = BeautifulSoup(requests.get(self.start_url).text)
    main = soup.find(id="mainarea")
    for numberlabel in main.findAll(text="NUMMER"):
        numberdiv = numberlabel.findParent("div").parent
        typediv = numberdiv.findNextSibling()
        if typediv.find("div", "FFFSListAreaLeft").get_text(strip=True) != "TYP":
            self.log.error("Expected TYP in div, found %s" % typediv.get_text(strip=True))
            continue
        titlediv = typediv.findNextSibling()
        if titlediv.find("div", "FFFSListAreaLeft").get_text(strip=True) != "RUBRIK":
            self.log.error("Expected RUBRIK in div, found %s" % titlediv.get_text(strip=True))
            continue
        number = numberdiv.find("div", "FFFSListAreaRight").get_text(strip=True)
        snippetfile = self.store.downloaded_path(number).replace(".pdf", ".snippet.html")

        # mkstemp() creates the file atomically, unlike the deprecated
        # and race-prone mktemp(). Note that it creates the file readable
        # and writable only by the current user.
        fd, tmpfile = mkstemp()
        os.close(fd)
        # The with-block closes the file even if a write fails.
        with codecs.open(tmpfile, "w", encoding="utf-8") as fp:
            fp.write(str(numberdiv))
            fp.write(str(typediv))
            fp.write(str(titlediv))
        util.replace_if_different(tmpfile, snippetfile)
        # NOTE(review): `usecache` is not defined anywhere in this method;
        # unless it is a module-level name this raises NameError. Confirm
        # against download_single's signature before changing the call.
        self.download_single(number, usecache)
def crop(self, top=0, left=0, bottom=None, right=None):
    """Removes any :py:class:`ferenda.pdfreader.Textbox` objects that does not fit within the bounding box specified by the parameters.

    The boxes returned by ``self.boundingbox(top, left, bottom, right)``
    are kept and their coordinates are translated so that ``(left, top)``
    becomes the new origin. ``self.width`` and ``self.height`` are set
    to the size of the crop area. If the file ``self.background``
    exists, it is cropped to the same area with ImageMagick's
    ``convert`` and the result replaces the original image.

    :param top: Upper edge of the crop area.
    :param left: Left edge of the crop area.
    :param bottom: Lower edge of the crop area; ``None`` means the
                   current ``self.height``.
    :param right: Right edge of the crop area; ``None`` means the
                  current ``self.width``.
    """
    # The None defaults cannot be used in the arithmetic below, so fall
    # back to the current page extent.
    if bottom is None:
        bottom = self.height
    if right is None:
        right = self.width

    # Collect the surviving boxes before clearing the page, since
    # boundingbox() reads the page's current content.
    newboxes = []
    for box in self.boundingbox(top, left, bottom, right):
        # Every edge moves by the offset of the new origin: horizontal
        # edges by `top`, vertical edges by `left`.
        box.top = box.top - top
        box.left = box.left - left
        box.right = box.right - left
        box.bottom = box.bottom - top
        newboxes.append(box)
    self[:] = []
    self.extend(newboxes)
    self.width = right - left
    self.height = bottom - top

    # Crop the background image to the same area.
    if os.path.exists(self.background):
        # NOTE(review): the path is interpolated unquoted; a path
        # containing spaces may break depending on how util.runcmd
        # executes the command line.
        cmdline = "convert %s -crop %dx%d+%d+%d +repage %s" % (
            self.background,
            self.width,
            self.height,
            left,
            top,
            self.background + ".new",
        )
        util.runcmd(cmdline, require_success=True)
        util.replace_if_different("%s.new" % self.background, self.background)