def handle_path(self, path):
    """Extract text from the file at ``path`` using ``pdftotext``.

    The tool is asked to write its output into a temporary ``.txt`` file,
    which is then read back as bytes and passed through ``self.decode``.
    The temporary file is closed and deleted whether or not the
    conversion succeeds.
    """
    handle, tmp_name = tempfile.mkstemp(suffix='.txt')
    try:
        run(which("pdftotext"), path, tmp_name)
        with open(tmp_name, "rb") as stream:
            raw = stream.read()
        return self.decode(raw)
    finally:
        # mkstemp hands back an open descriptor; release it before
        # removing the file it refers to.
        os.close(handle)
        os.remove(tmp_name)
def _get_path(path, **kwargs):
    """Convert the document at ``path`` to text, preferring ``antiword``.

    If ``antiword`` is missing, or fails with a "not a Word Document"
    message on stderr, a warning is logged and ``abiword`` is used instead.
    Any other ``ShellError`` from ``antiword`` is re-raised unchanged.
    The tool output is decoded as UTF-8. ``kwargs`` is accepted but unused.
    """
    try:
        return run('antiword', path).decode('utf8')
    except ShellError as err:
        if b'not a Word Document' in err.stderr:
            LOGGER.warning('.doc file unsupported format')
        else:
            raise
    except MissingCommandException:
        LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

    # Fall back to abiword: slower, but supports more formats.
    fallback_cmd = ('abiword', '--to=txt', '--to-name=fd://1', path)
    return run(*fallback_cmd).decode('utf8')
def handle_path(self, path):
    """Extract text from the document at ``path``.

    ``antiword`` is tried first. When it is not installed, or when it
    reports "not a Word Document" on stderr, a warning is logged and
    ``abiword`` is run instead. Other ``ShellError`` failures propagate.
    The raw tool output is passed through ``self.decode``.
    """
    try:
        return self.decode(run('antiword', path))
    except ShellError as err:
        if b'not a Word Document' in err.stderr:
            LOGGER.warning('.doc file unsupported format, trying abiword')
        else:
            raise
    except MissingCommandException:
        LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

    # abiword is slower, but supports more formats.
    abiword_cmd = ('abiword', '--to=txt', '--to-name=fd://1', path)
    return self.decode(run(*abiword_cmd))
def handle_path(self, path):
    """Return the decoded text of the document at ``path``.

    Runs ``antiword`` and, if that tool is absent or rejects the input as
    "not a Word Document", logs a warning and retries with ``abiword``.
    A ``ShellError`` with any other stderr content is re-raised.
    """
    try:
        antiword_output = run('antiword', path)
        return self.decode(antiword_output)
    except ShellError as failure:
        recognised = b'not a Word Document' in failure.stderr
        if not recognised:
            raise
        LOGGER.warning('.doc file unsupported format, trying abiword')
    except MissingCommandException:
        LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

    # Second attempt with abiword: slower, but supports more formats.
    abiword_output = run('abiword', '--to=txt', '--to-name=fd://1', path)
    return self.decode(abiword_output)
def handle_title(self, f):
    """Return the ``Title:`` value reported by ``pdfinfo`` for ``f``.

    Only file paths are handled (``pdfinfo`` is not fed file objects here).
    Returns ``None`` when ``f`` is not a file path or when the decoded
    ``pdfinfo`` output has no line starting with ``Title:``.
    """
    if not is_file_path(f):
        # Doesn't work with file objects.
        return None

    info = self.decode(run("pdfinfo", f))
    for row in info.split("\n"):
        if row.startswith("Title:"):
            _, _, value = row.partition("Title:")
            return value.strip()
    return None
def handle_path(self, path):
    """Convert the file at ``path`` to text with ``unrtf`` and strip it.

    On POSIX the command goes through ``run``; otherwise it is executed
    with ``subprocess.check_output`` while capturing stderr separately,
    and only stdout is handed to ``self.strip``.
    """
    cmd = ['unrtf', '--text', '--nopict', path]
    if not POSIX:
        # On Windows unrtf.exe prints a lot of cruft to stderr.
        # Capturing stderr on its own pipe keeps it out of the result.
        stdout = subprocess.check_output(cmd, stderr=subprocess.PIPE)
        return self.strip(stdout)
    return self.strip(run(*cmd))
def handle_fobj(self, f):
    """Run ``unrtf`` with ``f`` supplied as stdin and strip the output."""
    converted = run('unrtf', '--text', '--nopict', stdin=f)
    return self.strip(converted)
def handle_path(self, path):
    """Run the command built by ``unix_cmd`` for ``path`` and decode it.

    ``self.kwargs`` is forwarded to ``unix_cmd`` as keyword arguments.
    """
    argv = unix_cmd(path, **self.kwargs)
    return self.decode(run(*argv))
def handle_path(self, path):
    """Run the command from ``cmd(path)`` and post-process its output.

    The raw output is decoded with ``self.decode`` and the result is
    passed to ``to_text_with_backend``.
    """
    argv = cmd(path)
    raw = run(*argv)
    decoded = self.decode(raw)
    return to_text_with_backend(decoded)
def _get_path(path, **kwargs):
    """Convert the file at ``path`` with ``unrtf`` and strip the output.

    ``kwargs`` is accepted but unused.
    """
    converted = run('unrtf', '--text', '--nopict', path)
    return _strip(converted)
def handle_fobj(self, f):
    """Run the ``unix_cmd`` command on the file object ``f`` and decode it.

    The command is built with ``'-'`` in place of a path (presumably the
    tool's convention for reading stdin) and ``f`` is passed as stdin.
    """
    argv = unix_cmd('-', **self.kwargs)
    return self.decode(run(*argv, stdin=f))
def _get_file(f, **kwargs):
    """Run the ``_cmd`` command with ``f`` as stdin; return UTF-8 text.

    ``'-'`` is given to ``_cmd`` in place of a path and ``kwargs`` is
    forwarded to it unchanged.
    """
    argv = _cmd('-', **kwargs)
    raw = run(*argv, stdin=f)
    return raw.decode('utf-8')
def _get_file(f, **kwargs):
    """Feed ``f`` to ``unrtf`` on stdin and return the stripped output.

    ``kwargs`` is accepted but unused.
    """
    converted = run('unrtf', '--text', '--nopict', stdin=f)
    return _strip(converted)
def handle_path(self, path):
    """Run ``pstotext`` on ``path`` and return its decoded output."""
    return self.decode(run('pstotext', path))
def _get_file(f, **kwargs):
    """Run ``antiword`` with ``f`` as stdin and return UTF-8 decoded text.

    ``'-'`` is passed as the input argument. ``kwargs`` is accepted but
    unused.
    """
    raw = run('antiword', '-', stdin=f)
    return raw.decode('utf8')
def _get_path(path, **kwargs):
    """Run the command built by ``_cmd`` for ``path``; return UTF-8 text.

    ``kwargs`` is forwarded to ``_cmd`` unchanged.
    """
    argv = _cmd(path, **kwargs)
    raw = run(*argv)
    return raw.decode('utf-8')
def handle_fobj(self, f):
    """Run ``pstotext`` with ``f`` as stdin and return its decoded output.

    ``'-'`` is passed as the input argument.
    """
    return self.decode(run('pstotext', '-', stdin=f))
def _get_path(path, **kwargs):
    """Run ``antiword`` on ``path`` and return its output decoded as UTF-8.

    ``kwargs`` is accepted but unused.
    """
    raw = run('antiword', path)
    return raw.decode('utf8')