def tag(text, tagtable, sliceleft, sliceright):
    """Tag ``text`` by delegating to an external tagging script.

    The result is cached on disk next to the pickled tagger, keyed on
    the MD5 checksum of ``text``; the external script is only run when
    no cached result exists.

    :param text: The data to tag. It is fed to ``hashlib.md5`` and
                 written to a file opened in binary mode.
    :param tagtable: Filename of a pickled tagger (not a real tagtable
                     struct).
    :param sliceleft: Not used by this implementation.
    :param sliceright: Not used by this implementation.
    :returns: Whatever object the external script pickled as its result.
    """
    tagger_path = tagtable
    checksum = hashlib.md5(text).hexdigest()
    result_path = "%s-%s.pickle" % (tagger_path, checksum)

    if not os.path.exists(result_path):
        # Dump the text to a file the external script can read.
        text_path = "%s/%s.txt" % (os.path.dirname(tagger_path), checksum)
        with open(text_path, "wb") as text_fp:
            text_fp.write(text)
        # The script loads the pickled tagger and the text file, and
        # saves the tagged text as a pickle.
        cmdline = "%s %s %s %s %s" % (python_exe, tagstring_script,
                                      tagger_path, text_path, checksum)
        util.runcmd(cmdline, require_success=True)

    # Load the tagged result, whether freshly created or cached.
    with open(result_path, "rb") as result_fp:
        return pickle.load(result_fp)
def makeimage(basename, label):
    """Return the path of the label image for ``basename``, creating it
    if needed.

    The image is rendered with ImageMagick's ``convert`` only when the
    target file does not exist yet. ``self`` and ``font`` are taken from
    the enclosing scope.

    :param basename: Name (without suffix) of the image file.
    :param label: Text to render in the image.
    :returns: The relative filename of the image.
    """
    filename = "res/img/sfs/%s.png" % basename
    if os.path.exists(filename):
        return filename

    util.ensure_dir(filename)
    self.log.info("Creating img %s with label %s" % (filename, label))
    template = 'convert -background transparent -fill Grey -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s'
    util.runcmd(template % (font, label, filename))
    return filename
def textreader_from_basefile(self, basefile, encoding):
    """Convert the downloaded PDF for ``basefile`` to text and return a
    reader for it.

    The downloaded file is copied to the intermediate directory, run
    through ``pdftotext`` there, and the copy is then removed.

    :param basefile: The basefile whose downloaded document is converted.
    :param encoding: Encoding passed on to the ``TextReader``.
    :returns: A ``TextReader`` for the intermediate ``.txt`` file.
    """
    store = self.store
    source_pdf = store.downloaded_path(basefile)
    working_pdf = store.path(basefile, "intermediate", ".pdf")
    text_path = store.path(basefile, "intermediate", ".txt")

    util.copy_if_different(source_pdf, working_pdf)
    util.runcmd("pdftotext %s" % working_pdf, require_success=True)
    util.robust_remove(working_pdf)

    return TextReader(text_path,
                      encoding=encoding,
                      linesep=TextReader.UNIX)
def makeimage(basename, label):
    """Make sure a small label image exists for ``basename`` and return
    its path.

    Nothing is rendered if the file is already present; otherwise
    ImageMagick's ``convert`` is invoked. ``self`` and ``font`` come
    from the enclosing scope.

    :param basename: Name (without suffix) of the image file.
    :param label: Text to render in the image.
    :returns: The relative filename of the image.
    """
    filename = "res/img/sfs/%s.png" % basename
    already_rendered = os.path.exists(filename)
    if not already_rendered:
        util.ensure_dir(filename)
        self.log.info("Creating img %s with label %s" % (filename, label))
        convert_args = (font, label, filename)
        cmd = 'convert -background transparent -fill gray50 -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % convert_args
        util.runcmd(cmd)
    return filename
def buildTagger(self, production=None, processor=None):
    """Make sure a pickled tagger exists for ``production`` and return
    its filename.

    The tagger is built by an external script the first time it is
    requested; later calls reuse the pickle file.

    :param production: Name of the production, used in the pickle
                       filename and passed to the build script.
    :param processor: Not used by this implementation.
    :returns: The filename of the pickled tagger (not a tagtable struct).
    """
    md5 = self.declaration_md5
    pickled_tagger = "%s/%s-%s.pickle" % (external_simpleparse_state,
                                          md5, production)
    if os.path.exists(pickled_tagger):
        return pickled_tagger

    # The external script builds the tagtable and dumps it to the
    # pickle file computed above.
    cmdline = "%s %s %s/%s %s" % (python_exe, buildtagger_script,
                                  external_simpleparse_state, md5,
                                  production)
    util.runcmd(cmdline, require_success=True)
    assert os.path.exists(pickled_tagger)
    return pickled_tagger
def word_to_docbook(self, indoc, outdoc):
    """Convert a old Word document (.doc) to a pseudo-docbook file
    through antiword.

    :param indoc: Path to the Word document to convert.
    :param outdoc: Path where the docbook XML is written.
    :raises errors.ExternalCommandError: If antiword exits with a
        non-zero status.
    """
    # Local import keeps this block self-contained: the surrounding
    # file is only known to provide ``mktemp``.
    from tempfile import mkstemp

    indoc = os.path.normpath(indoc)
    wrapper = textwrap.TextWrapper(break_long_words=False, width=72)
    util.ensure_dir(outdoc)
    if " " in indoc:
        indoc = '"%s"' % indoc

    # mkstemp() creates the file atomically, unlike the race-prone
    # mktemp(). The shell redirect below reopens it by name, so the
    # descriptor is closed right away.
    fd, tmpfile = mkstemp()
    os.close(fd)
    try:
        cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
        self.log.debug("Executing %s" % cmd)
        (ret, stdout, stderr) = util.runcmd(cmd)
        if ret != 0:
            self.log.error("Docbook conversion failed: %s" % stderr)
            raise errors.ExternalCommandError(
                "Docbook conversion failed: %s" % stderr.strip())

        tree = ET.parse(tmpfile)
        # getiterator() is gone from newer Pythons; prefer iter() when
        # it is available.
        if hasattr(tree, 'iter'):
            iterator = tree.iter()
        else:
            iterator = tree.getiterator()
        # Wrap long lines in the docbook output.
        for element in iterator:
            if element.text and element.text.strip() != "":
                wrapped = [wrapper.fill(p)
                           for p in element.text.split("\n") if p]
                element.text = "\n\n".join(wrapped).strip()
        tree.write(outdoc, encoding="utf-8")
    finally:
        # Remove the temp file even when conversion or parsing failed.
        os.unlink(tmpfile)
def crop(self, top=0, left=0, bottom=None, right=None):
    """Removes any :py:class:`ferenda.pdfreader.Textbox` objects that
    does not fit within the bounding box specified by the parameters.

    The remaining boxes are translated so that their coordinates are
    relative to the new top left corner, and ``self.width`` /
    ``self.height`` are updated to the size of the cropped area. If a
    background image exists on disk, it is cropped the same way with
    ImageMagick's ``convert``.

    :param top: Top edge of the crop area.
    :param left: Left edge of the crop area.
    :param bottom: Bottom edge of the crop area; ``None`` means the
                   current ``self.height``.
    :param right: Right edge of the crop area; ``None`` means the
                  current ``self.width``.
    """
    # The defaults used to fail in the arithmetic below (None - int).
    # NOTE(review): assumes self.boundingbox treats None as "page edge";
    # confirm.
    if bottom is None:
        bottom = self.height
    if right is None:
        right = self.width

    newboxes = []
    for box in self.boundingbox(top, left, bottom, right):
        # Moving the origin to (left, top) shifts every edge of the box
        # by the same offset. Subtracting ``right``/``bottom`` instead
        # would produce coordinates that are zero or negative.
        box.top = box.top - top
        box.left = box.left - left
        box.right = box.right - left
        box.bottom = box.bottom - top
        newboxes.append(box)
    self[:] = newboxes
    self.width = right - left
    self.height = bottom - top

    # Then crop the background image to the same area.
    if os.path.exists(self.background):
        cropped = self.background + ".new"
        cmdline = "convert %s -crop %dx%d+%d+%d +repage %s" % (
            self.background, self.width, self.height, left, top, cropped)
        util.runcmd(cmdline, require_success=True)
        util.replace_if_different(cropped, self.background)
def parse(self, doc):
    """Parse the downloaded document for ``doc.basefile`` into ``doc``.

    Sets ``doc.uri`` and basic metadata, then picks the best available
    source: a downloaded PDF, a PDF converted from a downloaded
    word processing file (through Open/LibreOffice), or as a last
    resort the plaintext contained in the downloaded HTML file.

    :param doc: The document object to fill in; ``doc.basefile``
                identifies the source files.
    """
    doc.uri = self.canonical_uri(doc.basefile)
    d = Describer(doc.meta, doc.uri)
    d.rdftype(self.rdf_type)
    d.value(self.ns['prov'].wasGeneratedBy, self.qualified_class_name())
    self.infer_triples(d, doc.basefile)

    # prefer PDF or Word files over the plaintext-containing HTML files
    # FIXME: PDF or Word files are now stored as attachments
    pdffile = self.generic_path(doc.basefile, 'downloaded', '.pdf')
    wordfiles = (self.generic_path(doc.basefile, 'downloaded', '.doc'),
                 self.generic_path(doc.basefile, 'downloaded', '.docx'),
                 self.generic_path(doc.basefile, 'downloaded', '.wpd'),
                 self.generic_path(doc.basefile, 'downloaded', '.rtf'))
    wordfile = None
    for f in wordfiles:
        # If several exist, the last one in the tuple wins.
        if os.path.exists(f):
            wordfile = f

    # if we lack a .pdf file, use Open/LibreOffice to convert any
    # .wpd or .doc file to .pdf first
    if wordfile and not os.path.exists(pdffile):
        intermediate_pdf = self.generic_path(
            doc.basefile, "intermediate", ".pdf")
        if not os.path.exists(intermediate_pdf):
            cmdline = "%s --headless -convert-to pdf -outdir '%s' %s" % (
                self.config.get('soffice', 'soffice'),
                os.path.dirname(intermediate_pdf),
                wordfile)
            self.log.debug(
                "%s: Converting to PDF: %s" % (doc.basefile, cmdline))
            util.runcmd(cmdline, require_success=True)
        pdffile = intermediate_pdf

    if os.path.exists(pdffile):
        self.log.debug("%s: Using %s" % (doc.basefile, pdffile))
        intermediate_dir = os.path.dirname(
            self.generic_path(doc.basefile, 'intermediate', '.foo'))
        self.setup_logger('pdfreader', self.config.get('log', 'INFO'))
        pdfreader = PDFReader()
        pdfreader.read(pdffile, intermediate_dir)
        self.parse_from_pdfreader(pdfreader, doc)
    else:
        downloaded_path = self.downloaded_path(doc.basefile)
        intermediate_path = self.generic_path(
            doc.basefile, 'intermediate', '.txt')
        self.log.debug("%s: Using %s (%s)" % (doc.basefile,
                                              downloaded_path,
                                              intermediate_path))
        if not os.path.exists(intermediate_path):
            # Close the downloaded file as soon as it has been read
            # instead of leaving the handle to the garbage collector.
            with codecs.open(downloaded_path,
                             encoding="iso-8859-1") as fp:
                html = fp.read()
            util.writefile(intermediate_path,
                           util.extract_text(html, '<pre>', '</pre>'),
                           encoding="utf-8")
        textreader = TextReader(intermediate_path, encoding="utf-8")
        self.parse_from_textreader(textreader, doc)
def _run_curl(self, options):
    """Perform a HTTP request by running the ``curl`` command line tool.

    :param options: Mapping with a ``method`` key (``GET`` or ``POST``),
                    ``url`` and ``filename``, plus ``accept`` for GET
                    requests or ``contenttype`` for POST requests.
    :returns: Whatever curl wrote to standard output.
    :raises errors.TriplestoreError: If curl exits with a non-zero
        status; the message is curl's standard error output.
    """
    method = options['method']
    if method == 'GET':
        # The response body is saved to options['filename'].
        template = 'curl -o "%(filename)s" --header "Accept:%(accept)s" "%(url)s"'
        cmd = template % options
    elif method == 'POST':
        # The request body is read from options['filename'].
        template = 'curl -X POST --data-binary "@%(filename)s" --header "Content-Type:%(contenttype)s" "%(url)s"'
        cmd = template % options

    result = util.runcmd(cmd)
    (ret, stdout, stderr) = result
    if ret == 0:
        return stdout
    raise errors.TriplestoreError(stderr)
def _run_curl(self, options): if "<" in options["url"]: options["url"] = options["url"].replace("<", "%3C").replace(">", "%3E") if options['method'] == 'GET': cmd = 'curl -o "%(filename)s" --header "Accept:%(accept)s" "%(url)s"' % options elif options['method'] == 'POST': cmd = 'curl -X POST --data-binary "@%(filename)s" --header "Content-Type:%(contenttype)s" "%(url)s"' % options (ret, stdout, stderr) = util.runcmd(cmd) if ret != 0: raise errors.TriplestoreError(stderr) return stdout
def test_open_intermediate_path(self):
    """A file written through open_intermediate with an explicit suffix
    ends up at the expected path with the expected mime type, and can be
    read back without specifying the suffix."""
    store = self.store
    store.intermediate_suffixes = [".html", ".xhtml"]
    with store.open_intermediate("123/a", mode="w", suffix=".xhtml") as out:
        out.write(self.dummytext)

    filename = self.p("intermediate/123/a.xhtml" + self.expected_suffix)
    self.assertTrue(os.path.exists(filename))

    # Ask the file(1) utility what kind of file ended up on disk.
    result = util.runcmd("file -b --mime-type %s" % filename)
    mimetype = result[1]
    self.assertIn(mimetype.strip(), self.expected_mimetype)

    # note, open_intermediate should open the file with the
    # .xhtml suffix automatically
    with store.open_intermediate("123/a") as inp:
        self.assertEqual(self.dummytext, inp.read())
def test_runcmd(self):
    """util.runcmd returns (returncode, stdout, stderr), copes with
    non-ascii filenames and output, and only raises on failure when
    require_success is set."""
    filename = self.dname + os.sep + "räksmörgås.txt"
    util.writefile(filename, "räksmörgås")

    # Successful command: print the file with the platform's tool.
    cmd = "type" if sys.platform == "win32" else "cat"
    (retcode, stdout, stderr) = util.runcmd("%s %s" % (cmd, filename))
    self.assertEqual(0, retcode)
    self.assertEqual("räksmörgås", stdout)
    self.assertEqual("", stderr)

    # Failing command: reported through the return values by default...
    failing = "non-existing-binary foo"
    (retcode, stdout, stderr) = util.runcmd(failing)
    self.assertNotEqual(0, retcode)
    self.assertNotEqual("", stderr)

    # ...and through an exception when success is required.
    with self.assertRaises(errors.ExternalCommandError):
        (retcode, stdout, stderr) = util.runcmd(failing,
                                                require_success=True)
def test_open_binary(self):
    """Binary data written through store.open is stored unchanged and
    can be read back through the same method."""
    store_args = ("basefile", "maindir", ".suffix")
    wanted_filename = self.store.path(*store_args)
    # the smallest possible PNG image
    bindata = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82'

    with self.store.open("basefile", "maindir", ".suffix", "wb") as out:
        out.write(bindata)

    # Ask the file(1) utility what kind of file ended up on disk.
    result = util.runcmd("file -b --mime-type %s" % wanted_filename)
    mimetype = result[1]
    self.assertEqual("image/png", mimetype.strip())

    # make sure that the open method also can be used for reading
    with self.store.open("basefile", "maindir", ".suffix", "rb") as inp:
        self.assertEqual(bindata, inp.read())
def read(self, pdffile, workdir):
    """Initializes a PDFReader object from an existing PDF file. After
    initialization, the PDFReader contains a list of
    :py:class:`~ferenda.pdfreader.Page` objects.

    :param pdffile: The full path to the PDF file
    :param workdir: A directory where intermediate files (particularly
                    background PNG files) are stored
    :returns: The result of parsing the XML file produced by
              ``pdftohtml``.
    """
    self.filename = pdffile
    assert os.path.exists(pdffile), "PDF %s not found" % pdffile

    basename = os.path.basename(pdffile)
    stem = os.path.splitext(basename)[0]
    xmlfile = workdir + os.sep + stem + ".xml"

    if not util.outfile_is_newer([pdffile], xmlfile):
        tmppdffile = workdir + os.sep + basename
        util.copy_if_different(pdffile, tmppdffile)

        def convert(template):
            # Log and run one pdftohtml pass over the working copy.
            cmd = template % tmppdffile
            self.log.debug("Converting: %s" % cmd)
            util.runcmd(cmd, require_success=True)

        # two pass coding: First use -c (complex) to extract
        # background pictures, then use -xml to get easy-to-parse
        # text with bounding boxes.
        convert("pdftohtml -nodrm -c %s")
        # we won't need the html files
        for name in os.listdir(workdir):
            if name.endswith(".html"):
                os.unlink(workdir + os.sep + name)
        convert("pdftohtml -nodrm -xml %s")

    return self._parse_xml(xmlfile)
def download(self):
    """Clone or update the local Mercurial clone of ``self.start_url``.

    The clone lives in ``<datadir>/<alias>/clone``. If it exists it is
    pulled and updated, otherwise it is cloned.

    NOTE(review): the revision handling after the clone/update step
    looks like unfinished pseudo-code (see inline notes) and will not
    work as written.
    """
    # str.join() takes a single iterable of strings; passing the path
    # components as separate arguments raises TypeError.
    hg_clone_parent = os.sep.join((self.config.datadir, self.alias))
    hg_clone_path = os.sep.join((hg_clone_parent, 'clone'))

    if os.path.exists(hg_clone_path):
        self.log.debug("Pulling latest changes")
        util.runcmd("hg pull", cwd=hg_clone_path)
        self.log.debug("Updating local clone")
        util.runcmd("hg update", cwd=hg_clone_path)
    else:
        util.runcmd("hg clone %s clone" % self.start_url,
                    cwd=hg_clone_parent)

    new_last_rev = None
    cmd = "LANGUAGE=C hg log -v"
    # NOTE(review): the output of this command is discarded, and it is
    # not run with cwd=hg_clone_path like the commands above.
    util.runcmd(cmd)
    # NOTE(review): this iterates over the characters of a string
    # literal, not over revisions; the attribute accesses below will
    # fail on the first iteration. Parsing of the log output still
    # needs to be implemented.
    for rev in "LANGUAGE=C hg log -v":
        if not new_last_rev:
            new_last_rev = rev.id
        if rev > self.config.last_rev:
            for f in rev.files:
                # rev.files only contain proper pep files
                # NOTE(review): this only builds a string that is then
                # thrown away (no command is run), and it has five
                # placeholders but only three values.
                "hg cat -r %s > downloaded/%s-r%s.txt" % (
                    f, self.store.downloaded_path(f), rev.id)
        else:
            self.config.last_rev = new_last_rev
            break
def word_to_docbook(self, indoc, outfp):
    """Convert a old Word document (.doc) to a pseudo-docbook file
    through antiword.

    :param indoc: Path to the Word document to convert.
    :param outfp: Destination passed to ``tree.write`` for the docbook
                  XML.
    :raises errors.ExternalCommandError: If antiword exits with a
        non-zero status.
    """
    # Local import keeps this block self-contained: the surrounding
    # file is only known to provide ``mktemp``.
    from tempfile import mkstemp

    indoc = os.path.normpath(indoc)
    wrapper = textwrap.TextWrapper(break_long_words=False, width=72)
    if " " in indoc:
        indoc = '"%s"' % indoc

    # make sure HOME is set even on win32 -- antiword seems to require it?
    if 'HOME' not in os.environ and 'USERPROFILE' in os.environ:
        os.environ['HOME'] = os.environ['USERPROFILE']

    # mkstemp() creates the file atomically, unlike the race-prone
    # mktemp(). The shell redirect below reopens it by name, so the
    # descriptor is closed right away.
    fd, tmpfile = mkstemp()
    os.close(fd)
    try:
        cmd = "antiword -x db %s > %s" % (indoc, tmpfile)
        self.log.debug("Executing %s" % cmd)
        (ret, stdout, stderr) = util.runcmd(cmd)
        if ret != 0:
            self.log.error("Docbook conversion failed: %s" % stderr)
            raise errors.ExternalCommandError(
                "Docbook conversion failed: %s" % stderr.strip())

        # wrap long lines in the docbook output. Maybe should be
        # configurable?
        tree = ET.parse(tmpfile)
        if hasattr(tree, 'iter'):
            iterator = tree.iter()
        else:
            # Python 2.6 way -- results in a PendingDeprecationWarning
            # on newer pythons.
            iterator = tree.getiterator()
        for element in iterator:
            if element.text and element.text.strip() != "":
                wrapped = [wrapper.fill(p)
                           for p in element.text.split("\n") if p]
                element.text = "\n\n".join(wrapped).strip()
        tree.write(outfp, encoding="utf-8")
    finally:
        # Remove the temp file even when conversion or parsing failed.
        os.unlink(tmpfile)
def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
    """Given the parameters, return a function that will, given a
    basefile, produce the proper path to that basefile. If the
    parameters indicate a version of the resource that does not
    exist as a static file on disk (like ".../basefile/data.rdf"),
    returns None

    :param environ: Request environment; ``User-Agent`` and ``Referer``
                    are looked up in it.
    :param basefile: The basefile of the requested resource.
    :param params: Request parameters. The keys ``extended``, ``repo``,
                   ``dir``, ``page``, ``format``, ``attachment``,
                   ``version``, ``diff`` and ``from`` are examined.
    :param contenttype: Requested content type, looked up in
                        ``self._mimemap``.
    :param suffix: Requested file suffix, looked up in
                   ``self._suffixmap``.
    :returns: A path function, or None.
    :raises ValueError: If ``params['repo']`` does not name a subrepo.
    :raises Forbidden: If a page image is requested by a user agent
        matching the configured ``imagerobots`` pattern.
    """
    if "extended" in params:
        # by definition, this means that we don't have a static file on disk
        return None

    # try to lookup pathfunc from contenttype (or possibly suffix, or
    # maybe params)
    if "repo" in params:
        # this must be a CompositeRepository that has the get_instance
        # method
        for cls in self.repo.subrepos:
            if cls.alias == params['repo']:
                repo = self.repo.get_instance(cls)
                break
        else:
            raise ValueError("No '%s' repo is a subrepo of %s" %
                             (params['repo'], self.repo.alias))
    else:
        repo = self.repo

    if "dir" in params:
        method = {
            'downloaded': repo.store.downloaded_path,
            'intermediate': repo.store.intermediate_path,
            'parsed': repo.store.parsed_path
        }[params["dir"]]
        if "page" in params and "format" in params:
            # check if this is a robot we need to ban (we try to
            # ban them through robots.txt but not all are well
            # behaved)
            if getattr(self.repo.config, 'imagerobots', None):
                # A request without a user agent yields None, which
                # re.search cannot handle; treat it as an empty string.
                useragent = environ.get("User-Agent") or ""
                if re.search(self.repo.config.imagerobots, useragent):
                    raise Forbidden()
            baseattach = None
            try:
                if "attachment" in params:
                    sourcefile = method(basefile,
                                        attachment=params["attachment"])
                else:
                    sourcefile = method(basefile)

                # we might run this on a host to where we haven't
                # transferred the downloaded files -- try to
                # re-aquire them now that someone wants to watch
                # them.
                if not os.path.exists(sourcefile):
                    repo.download(basefile)

                # A failed assertion is handled by the except clause
                # below, which renders the message as an image.
                assert params["page"].isdigit(
                ), "%s is not a digit" % params["page"]
                # NOTE(review): "jpg" is accepted here, but the commands
                # below always produce PNG and rely on a ".png" suffix.
                assert params["format"] in ("png", "jpg"), (
                    "%s is not a valid image format" % params["format"])
                baseattach = "page_%s.%s" % (params["page"],
                                             params["format"])
                if "attachment" in params:
                    baseattach = "%s_%s" % (params["attachment"],
                                            baseattach)
                outfile = repo.store.intermediate_path(
                    basefile, attachment=baseattach)
                if not os.path.exists(outfile):
                    # params['page'] is 0-based, pdftoppm is 1-based
                    cmdline = "pdftoppm -f %s -singlefile -png %s %s" % (
                        int(params["page"]) + 1, sourcefile,
                        outfile.replace(".png", ".tmp"))
                    util.runcmd(cmdline, require_success=True)
                    cmdline = "convert %s -trim %s" % (
                        outfile.replace(".png", ".tmp.png"), outfile)
                    util.runcmd(cmdline, require_success=True)
                    os.unlink(outfile.replace(".png", ".tmp.png"))
                    # Record who caused the image to be generated.
                    # NOTE(review): placement inside this block was
                    # inferred; confirm that only newly generated
                    # images should be logged.
                    logfile = (self.repo.config._parent.datadir +
                               os.sep + "ua.log")
                    with open(logfile, "a") as fp:
                        fp.write("%s\t%s\t%s\n" %
                                 (outfile,
                                  environ.get("User-Agent"),
                                  environ.get("Referer")))
            except Exception as e:
                # Render the error as an image so that the client
                # still gets something to display.
                if not baseattach:
                    baseattach = "page_error.png"
                outfile = repo.store.intermediate_path(
                    basefile, attachment=baseattach)
                errormsg = "%s\n%s: %s" % (
                    "".join(traceback.format_tb(sys.exc_info()[2])),
                    e.__class__.__name__, str(e))
                errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
                cmdline = 'convert label:"%s" %s' % (errormsg, outfile)
                util.runcmd(cmdline, require_success=True)
            method = partial(repo.store.intermediate_path,
                             attachment=baseattach)
            # we really don't want to partial() this method again below
            return method
    elif "version" in params:
        method = partial(repo.store.generated_path,
                         version=params["version"])
    elif "diff" in params and params.get("from") != "None":
        return None
    elif contenttype in self._mimemap:
        method = getattr(repo.store, self._mimemap[contenttype])
    elif suffix in self._suffixmap:
        method = getattr(repo.store, self._suffixmap[suffix])
    elif ("attachment" in params and
          mimetypes.guess_extension(contenttype)):
        method = repo.store.generated_path
    else:
        # method = repo.store.generated_path
        return None

    if "attachment" in params:
        method = partial(method, attachment=params["attachment"])
    return method
def query_webservice(self, query, page):
    """Send an expert query to the EUR-Lex SOAP web service.

    The request is sent either through the ``curl`` command line tool
    (if ``self.config.curl`` is set) or through ``self.session``.

    :param query: The expert query; it is escaped before being placed
                  in the SOAP envelope.
    :param page: The result page to request.
    :returns: The response object (a ``FakeResponse`` when curl is used).
    :raises errors.DownloadError: If the service responds with status
        500 or 301, or if curl produced no response headers.
    """
    # this is the only soap template we'll need, so we include it
    # verbatim to avoid having a dependency on a soap module like
    # zeep.
    endpoint = 'https://eur-lex.europa.eu/EURLexWebService'
    # NOTE(review): only the query is escaped; a username or password
    # containing XML special characters would break the envelope.
    envelope = (
        '<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope"> '
        '<soap-env:Header> '
        '<wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd"> '
        '<wsse:UsernameToken> '
        '<wsse:Username>%s</wsse:Username> '
        '<wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">%s</wsse:Password> '
        '</wsse:UsernameToken> '
        '</wsse:Security> '
        '</soap-env:Header> '
        '<soap-env:Body> '
        '<sear:searchRequest xmlns:sear="http://eur-lex.europa.eu/search"> '
        '<sear:expertQuery>%s</sear:expertQuery> '
        '<sear:page>%s</sear:page> '
        '<sear:pageSize>%s</sear:pageSize> '
        '<sear:searchLanguage>%s</sear:searchLanguage> '
        '</sear:searchRequest> '
        '</soap-env:Body> '
        '</soap-env:Envelope> '
    ) % (self.config.username, self.config.password,
         escape(query, quote=False), page, self.pagesize, self.lang)
    headers = {'Content-Type': 'application/soap+xml; charset=utf-8; action="https://eur-lex.europa.eu/EURLexWebService/doQuery"',
               'SOAPAction': 'https://eur-lex.europa.eu/EURLexWebService/doQuery'}

    if self.config.curl:
        headerstr = ""
        for k, v in headers.items():
            assert "'" not in v  # if it is, we need to work on escaping it
            headerstr += " --header '%s: %s'" % (k, v)

        # curl dumps the response headers to this file (-D).
        headerfiledesc, headerfilename = tempfile.mkstemp()
        headerfp = os.fdopen(headerfiledesc)
        try:
            # dump the envelope to a tempfile, which must stay around
            # until curl has read it
            with tempfile.NamedTemporaryFile() as fp:
                fp.write(envelope.encode("utf-8"))
                fp.flush()
                cmd = 'curl -L -X POST -D %s --data-binary "@%s" %s %s' % (
                    headerfilename, fp.name, headerstr, endpoint)
                (ret, stdout, stderr) = util.runcmd(cmd)
            header = headerfp.read()
        finally:
            # Release the header file even if running curl failed.
            headerfp.close()
            util.robust_remove(headerfilename)

        if not header.strip():
            # Without a status line there is nothing to parse; report
            # what curl said instead of failing on the unpacking below.
            raise errors.DownloadError(
                "%s: curl returned no response headers (exit code %s): %s" %
                (endpoint, ret, stderr))
        status, rawheaders = header.split('\n', 1)
        # The status line is "<protocol> <code> [<message>]"; only the
        # code is needed, and the message may be absent.
        code = status.split(" ", 2)[1]
        headers = dict(email.message_from_string(rawheaders).items())
        res = FakeResponse(int(code), stdout, headers)
    else:
        res = util.robust_fetch(self.session.post, endpoint, self.log,
                                raise_for_status=False,
                                data=envelope, headers=headers,
                                timeout=10)

    if res.status_code == 500:
        # The service reports errors as a SOAP fault in the body.
        tree = etree.parse(BytesIO(res.content))
        statuscode = tree.find(
            ".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
        statusmsg = tree.find(
            ".//{http://www.w3.org/2003/05/soap-envelope}Text").text
        raise errors.DownloadError("%s: %s" % (statuscode, statusmsg))
    elif res.status_code == 301:
        # the call to robust_fetch or curl should have followed
        # the redirect, but at this point we'll just have to
        # report the error
        raise errors.DownloadError("%s: was redirected to %s" %
                                   (endpoint, res.headers['Location']))
    return res
def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
    """Given the parameters, return a function that will, given a
    basefile, produce the proper path to that basefile. If the
    parameters indicate a version of the resource that does not
    exist as a static file on disk (like ".../basefile/data.rdf"),
    returns None

    :param environ: Request environment (not used by this
                    implementation).
    :param basefile: The basefile of the requested resource.
    :param params: Request parameters. The keys ``repo``, ``dir``,
                   ``page``, ``format`` and ``attachment`` are examined.
    :param contenttype: Requested content type, looked up in
                        ``self._mimemap``.
    :param suffix: Requested file suffix, looked up in
                   ``self._suffixmap``.
    :returns: A path function, or None.
    :raises ValueError: If ``params['repo']`` does not name a subrepo.
    """
    # try to lookup pathfunc from contenttype (or possibly suffix, or
    # maybe params)
    if "repo" in params:
        # this must be a CompositeRepository that has the get_instance
        # method
        for cls in self.repo.subrepos:
            if cls.alias == params['repo']:
                repo = self.repo.get_instance(cls)
                break
        else:
            # The message must be built from ``params``; a misspelled
            # name here would raise NameError instead of ValueError.
            raise ValueError("No '%s' repo is a subrepo of %s" %
                             (params['repo'], self.repo.alias))
    else:
        repo = self.repo

    if "dir" in params:
        method = {'downloaded': repo.store.downloaded_path,
                  'parsed': repo.store.parsed_path}[params["dir"]]
        if "page" in params and "format" in params:
            baseattach = None
            try:
                if "attachment" in params:
                    sourcefile = method(basefile,
                                        attachment=params["attachment"])
                else:
                    sourcefile = method(basefile)

                # we might run this on a host to where we haven't
                # transferred the downloaded files -- try to
                # re-aquire them now that someone wants to watch
                # them.
                if not os.path.exists(sourcefile):
                    repo.download(basefile)

                # A failed assertion is handled by the except clause
                # below, which renders the message as an image.
                assert params["page"].isdigit(), "%s is not a digit" % params["page"]
                # NOTE(review): "jpg" is accepted here, but the commands
                # below always produce PNG and rely on a ".png" suffix.
                assert params["format"] in ("png", "jpg"), ("%s is not a valid image format" % params["format"])
                baseattach = "page_%s.%s" % (params["page"],
                                             params["format"])
                if "attachment" in params:
                    baseattach = "%s_%s" % (params["attachment"],
                                            baseattach)
                outfile = repo.store.intermediate_path(
                    basefile, attachment=baseattach)
                if not os.path.exists(outfile):
                    # params['page'] is 0-based, pdftoppm is 1-based
                    cmdline = "pdftoppm -f %s -singlefile -png %s %s" % (
                        int(params["page"]) + 1, sourcefile,
                        outfile.replace(".png", ".tmp"))
                    util.runcmd(cmdline, require_success=True)
                    cmdline = "convert %s -trim %s" % (
                        outfile.replace(".png", ".tmp.png"), outfile)
                    util.runcmd(cmdline, require_success=True)
                    os.unlink(outfile.replace(".png", ".tmp.png"))
            except Exception as e:
                # Render the error as an image so that the client
                # still gets something to display.
                if not baseattach:
                    baseattach = "page_error.png"
                outfile = repo.store.intermediate_path(
                    basefile, attachment=baseattach)
                errormsg = str(e).replace("\n", "\\n").replace("'", "\\'")
                cmdline = 'convert label:"%s" %s' % (errormsg, outfile)
                util.runcmd(cmdline, require_success=True)
            method = partial(repo.store.intermediate_path,
                             attachment=baseattach)
            # we really don't want to partial() this method again below
            return method
    elif contenttype in self._mimemap and not basefile.endswith("/data"):
        method = getattr(repo.store, self._mimemap[contenttype])
    elif suffix in self._suffixmap and not basefile.endswith("/data"):
        method = getattr(repo.store, self._suffixmap[suffix])
    elif ("attachment" in params and
          mimetypes.guess_extension(contenttype)):
        method = repo.store.generated_path
    else:
        # method = repo.store.generated_path
        return None

    if "attachment" in params:
        method = partial(method, attachment=params["attachment"])
    return method
def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
    """Given the parameters, return a function that will, given a
    basefile, produce the proper path to that basefile. If the
    parameters indicate a version of the resource that does not
    exist as a static file on disk (like ".../basefile/data.rdf"),
    returns None

    :param environ: Request environment (not used by this
                    implementation).
    :param basefile: The basefile of the requested resource.
    :param params: Request parameters. The keys ``repo``, ``dir``,
                   ``page``, ``format`` and ``attachment`` are examined.
    :param contenttype: Requested content type, looked up in
                        ``self._mimemap``.
    :param suffix: Requested file suffix, looked up in
                   ``self._suffixmap``.
    :returns: A path function, or None.
    :raises ValueError: If ``params['repo']`` does not name a subrepo.
    """
    # Pick the repository whose store will provide the path function.
    if "repo" not in params:
        repo = self.repo
    else:
        # this must be a CompositeRepository that has the get_instance
        # method
        subrepo_cls = next((candidate for candidate in self.repo.subrepos
                            if candidate.alias == params['repo']), None)
        if subrepo_cls is None:
            raise ValueError("No '%s' repo is a subrepo of %s" % (params['repo'], self.repo.alias))
        repo = self.repo.get_instance(subrepo_cls)

    wants_attachment = "attachment" in params

    if "dir" in params:
        dir_methods = {'downloaded': repo.store.downloaded_path,
                       'intermediate': repo.store.intermediate_path,
                       'parsed': repo.store.parsed_path}
        method = dir_methods[params["dir"]]
        if "page" in params and "format" in params:
            baseattach = None
            try:
                source_kwargs = {}
                if wants_attachment:
                    source_kwargs["attachment"] = params["attachment"]
                sourcefile = method(basefile, **source_kwargs)

                # we might run this on a host to where we haven't
                # transferred the downloaded files -- try to
                # re-aquire them now that someone wants to watch
                # them.
                if not os.path.exists(sourcefile):
                    repo.download(basefile)

                # A failed assertion ends up in the except clause
                # below, which renders the message as an image.
                assert params["page"].isdigit(), "%s is not a digit" % params["page"]
                assert params["format"] in ("png", "jpg"), ("%s is not a valid image format" % params["format"])
                baseattach = "page_%s.%s" % (params["page"], params["format"])
                if wants_attachment:
                    baseattach = "%s_%s" % (params["attachment"], baseattach)
                outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                if not os.path.exists(outfile):
                    tmp_prefix = outfile.replace(".png", ".tmp")
                    tmp_png = outfile.replace(".png", ".tmp.png")
                    # params['page'] is 0-based, pdftoppm is 1-based
                    first_page = int(params["page"]) + 1
                    util.runcmd("pdftoppm -f %s -singlefile -png %s %s" % (first_page, sourcefile, tmp_prefix),
                                require_success=True)
                    util.runcmd("convert %s -trim %s" % (tmp_png, outfile),
                                require_success=True)
                    os.unlink(tmp_png)
            except Exception as e:
                # Render the error (with traceback) as an image so that
                # the client still gets something to display.
                if not baseattach:
                    baseattach = "page_error.png"
                outfile = repo.store.intermediate_path(basefile, attachment=baseattach)
                tb_text = "".join(traceback.format_tb(sys.exc_info()[2]))
                errormsg = "%s\n%s: %s" % (tb_text, e.__class__.__name__, str(e))
                errormsg = errormsg.replace("\n", "\\n").replace("'", "\\'")
                util.runcmd('convert label:"%s" %s' % (errormsg, outfile),
                            require_success=True)
            # Returned directly: the attachment is already bound, so
            # it must not be wrapped in partial() again below.
            return partial(repo.store.intermediate_path, attachment=baseattach)
    elif contenttype in self._mimemap and not basefile.endswith("/data"):
        method = getattr(repo.store, self._mimemap[contenttype])
    elif suffix in self._suffixmap and not basefile.endswith("/data"):
        method = getattr(repo.store, self._suffixmap[suffix])
    elif wants_attachment and mimetypes.guess_extension(contenttype):
        method = repo.store.generated_path
    else:
        # Nothing on disk corresponds to this request.
        return None

    if wants_attachment:
        method = partial(method, attachment=params["attachment"])
    return method