Esempio n. 1
0
 def handle_path(self, path):
     """Extract the text of the file at ``path`` using ``pdftotext``.

     A temporary ``.txt`` file is created and the executable located by
     ``which("pdftotext")`` is run with ``path`` and that temporary file
     as its arguments. The temporary file's bytes are then handed to
     ``self.decode`` and the result is returned.

     The temporary file's descriptor is closed and the file is removed
     whether or not the extraction succeeds.
     """
     handle, tmp_path = tempfile.mkstemp(suffix='.txt')
     try:
         run(which("pdftotext"), path, tmp_path)
         with open(tmp_path, "rb") as stream:
             raw = stream.read()
         return self.decode(raw)
     finally:
         # Release the descriptor from mkstemp first, then delete the file.
         os.close(handle)
         os.remove(tmp_path)
Esempio n. 2
0
def _get_path(path, **kwargs):
    """Return the text of the document at ``path`` as a UTF-8 decoded string.

    ``antiword`` is tried first. If it raises ``ShellError`` with
    ``not a Word Document`` in its stderr, or if
    ``MissingCommandException`` is raised, a warning is logged and
    ``abiword`` is run instead. Any other ``ShellError`` is re-raised.

    ``kwargs`` is accepted but not used here.
    """
    try:
        antiword_out = run('antiword', path)
        return antiword_out.decode('utf8')
    except ShellError as err:
        if b'not a Word Document' in err.stderr:
            LOGGER.warning('.doc file unsupported format')
        else:
            raise
    except MissingCommandException:
        LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

    # Fall back to abiword: slower, but it supports more formats.
    abiword_argv = ('abiword', '--to=txt', '--to-name=fd://1', path)
    return run(*abiword_argv).decode('utf8')
Esempio n. 3
0
    def handle_path(self, path):
        """Return the decoded text of the document at ``path``.

        ``antiword`` is tried first and its output passed to
        ``self.decode``. If it raises ``ShellError`` with
        ``not a Word Document`` in its stderr, or if
        ``MissingCommandException`` is raised, a warning is logged and
        ``abiword`` is run instead. Any other ``ShellError`` is re-raised.
        """
        primary_argv = ('antiword', path)
        try:
            return self.decode(run(*primary_argv))
        except ShellError as err:
            if b'not a Word Document' in err.stderr:
                LOGGER.warning('.doc file unsupported format, trying abiword')
            else:
                raise
        except MissingCommandException:
            LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

        # Fall back to abiword: slower, but it supports more formats.
        fallback_argv = ('abiword', '--to=txt', '--to-name=fd://1', path)
        return self.decode(run(*fallback_argv))
Esempio n. 4
0
    def handle_path(self, path):
        """Extract text from the document at ``path``, preferring ``antiword``.

        The output of ``antiword`` is returned through ``self.decode``.
        ``abiword`` is used instead, after logging a warning, in two cases:
        ``antiword`` raised ``ShellError`` with ``not a Word Document`` in
        its stderr, or ``MissingCommandException`` was raised. A
        ``ShellError`` without that marker propagates to the caller.
        """
        try:
            return self.decode(run('antiword', path))
        except ShellError as exc:
            unsupported = b'not a Word Document' in exc.stderr
            if not unsupported:
                raise
            LOGGER.warning('.doc file unsupported format, trying abiword')
        except MissingCommandException:
            LOGGER.warning('CLI tool "antiword" missing, using "abiword"')

        # abiword is slower than antiword but supports more formats.
        abiword_args = ['abiword', '--to=txt', '--to-name=fd://1', path]
        return self.decode(run(*abiword_args))
Esempio n. 5
0
 def handle_title(self, f):
     """Return the ``Title:`` value from ``pdfinfo`` output for ``f``.

     ``pdfinfo`` is only run when ``is_file_path(f)`` is true. Its output
     is decoded with ``self.decode`` and scanned line by line; the text
     after the first line starting with ``Title:`` is returned with
     surrounding whitespace stripped.

     Returns ``None`` when ``f`` is not a file path or no such line exists.
     """
     if not is_file_path(f):
         # Doesn't work with file objs.
         return None

     info = self.decode(run("pdfinfo", f))
     for row in info.split("\n"):
         if row.startswith("Title:"):
             _, _, value = row.partition("Title:")
             return value.strip()
     return None
Esempio n. 6
0
 def handle_path(self, path):
     """Convert the file at ``path`` with ``unrtf`` and return ``self.strip`` of its output.

     When ``POSIX`` is true the command goes through ``run``. Otherwise it
     goes through ``subprocess.check_output`` with stderr sent to a pipe.
     """
     argv = ['unrtf', '--text', '--nopict', path]
     if not POSIX:
         # On Windows unrtf.exe prints a lot of cruft to stderr.
         # Piping stderr keeps it out of the returned output.
         raw = subprocess.check_output(argv, stderr=subprocess.PIPE)
     else:
         raw = run(*argv)
     return self.strip(raw)
Esempio n. 7
0
 def handle_fobj(self, f):
     """Run ``unrtf --text --nopict`` with ``f`` as ``stdin`` and return ``self.strip`` of the output."""
     strip = self.strip
     unrtf_argv = ('unrtf', '--text', '--nopict')
     return strip(run(*unrtf_argv, stdin=f))
Esempio n. 8
0
 def handle_path(self, path):
     """Run the command built by ``unix_cmd`` for ``path`` and decode its output.

     ``self.kwargs`` is forwarded to ``unix_cmd`` as keyword arguments;
     the output of ``run`` is returned through ``self.decode``.
     """
     argv = unix_cmd(path, **self.kwargs)
     raw = run(*argv)
     return self.decode(raw)
Esempio n. 9
0
 def handle_path(self, path):
     """Run the command built by ``cmd(path)``, decode it, and return ``to_text_with_backend`` of the result."""
     decode = self.decode
     argv = cmd(path)
     decoded = decode(run(*argv))
     return to_text_with_backend(decoded)
Esempio n. 10
0
def _get_path(path, **kwargs):
    """Run ``unrtf --text --nopict`` on ``path`` and return ``_strip`` of the output.

    ``kwargs`` is accepted but not used here.
    """
    unrtf_argv = ('unrtf', '--text', '--nopict', path)
    return _strip(run(*unrtf_argv))
Esempio n. 11
0
 def handle_fobj(self, f):
     """Run the command built by ``unix_cmd('-', ...)`` with ``f`` as ``stdin`` and decode its output.

     ``self.kwargs`` is forwarded to ``unix_cmd`` as keyword arguments.
     """
     # NOTE(review): '-' presumably tells the tool to read stdin — confirm.
     argv = unix_cmd('-', **self.kwargs)
     raw = run(*argv, stdin=f)
     return self.decode(raw)
Esempio n. 12
0
def _get_file(f, **kwargs):
    """Run the command built by ``_cmd('-', **kwargs)`` with ``f`` as ``stdin``.

    The output of ``run`` is returned decoded as UTF-8.
    """
    # NOTE(review): '-' presumably tells the tool to read stdin — confirm.
    argv = _cmd('-', **kwargs)
    raw = run(*argv, stdin=f)
    return raw.decode('utf-8')
Esempio n. 13
0
def _get_file(f, **kwargs):
    """Run ``unrtf --text --nopict`` with ``f`` as ``stdin`` and return ``_strip`` of the output.

    ``kwargs`` is accepted but not used here.
    """
    unrtf_argv = ['unrtf', '--text', '--nopict']
    return _strip(run(*unrtf_argv, stdin=f))
Esempio n. 14
0
 def handle_path(self, path):
     """Run ``pstotext`` on ``path`` and return its output through ``self.decode``."""
     pstotext_argv = ('pstotext', path)
     raw = run(*pstotext_argv)
     return self.decode(raw)
Esempio n. 15
0
def _get_file(f, **kwargs):
    """Run ``antiword -`` with ``f`` as ``stdin`` and return the output decoded as UTF-8.

    ``kwargs`` is accepted but not used here.
    """
    # NOTE(review): '-' presumably tells antiword to read stdin — confirm.
    raw = run('antiword', '-', stdin=f)
    return raw.decode('utf8')
Esempio n. 16
0
def _get_path(path, **kwargs):
    """Run the command built by ``_cmd(path, **kwargs)`` and return its output decoded as UTF-8."""
    argv = _cmd(path, **kwargs)
    raw = run(*argv)
    return raw.decode('utf-8')
Esempio n. 17
0
 def handle_path(self, path):
     """Return the ``self.decode``-d output of running ``pstotext`` against ``path``."""
     argv = ['pstotext', path]
     converted = run(*argv)
     return self.decode(converted)
Esempio n. 18
0
 def handle_fobj(self, f):
     """Run ``pstotext -`` with ``f`` as ``stdin`` and return its output through ``self.decode``."""
     # NOTE(review): '-' presumably tells pstotext to read stdin — confirm.
     pstotext_argv = ('pstotext', '-')
     raw = run(*pstotext_argv, stdin=f)
     return self.decode(raw)
Esempio n. 19
0
def _get_path(path, **kwargs):
    """Run ``antiword`` on ``path`` and return its output decoded as UTF-8.

    ``kwargs`` is accepted but not used here.
    """
    raw = run('antiword', path)
    return raw.decode('utf8')
Esempio n. 20
0
 def handle_fobj(self, f):
     """Return the ``self.decode``-d output of ``pstotext -`` run with ``f`` as ``stdin``."""
     # NOTE(review): '-' presumably makes pstotext read stdin — confirm.
     argv = ['pstotext', '-']
     converted = run(*argv, stdin=f)
     return self.decode(converted)