def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with tesseract.

    The tesseract executable is resolved once and cached in the module-level
    `tesseractcmd`. The "tesseractcmd" configuration parameter is used when
    it is set, else the command is looked up with rclexecm.which(). Only a
    value naming an existing file is cached, so a bad value is reported (and
    rejected) on every call instead of being silently accepted on the next.

    Args:
        config: configuration object providing setKeyDir() and
            getConfParam() (presumably an RclConfig -- verify against caller).
        path: path of the candidate document.

    Returns:
        True if tesseract is usable and the extension of `path` is in
        `_okexts`, or is '.pdf' and pdftocairo could be located. False
        otherwise.
    """
    global tesseractcmd, pdftocairocmd

    # Check for tesseract
    if not tesseractcmd:
        config.setKeyDir(os.path.dirname(path))
        # The configured value must be read *before* testing it: testing the
        # (empty) cached global here would make the configuration parameter
        # unreachable.
        candidate = config.getConfParam("tesseractcmd")
        if candidate:
            # It is very tempting to quote this value, esp. on Windows where
            # it will contain whitespace. There is no chance that an actual
            # command line would have quotes, so unquote it.
            candidate = candidate.strip('"')
        else:
            candidate = rclexecm.which("tesseract")
        if not candidate:
            _deb("tesseractcmd not found")
            return False
        if not os.path.isfile(candidate):
            _deb("tesseractcmd parameter [%s] is not a file" % candidate)
            return False
        tesseractcmd = candidate

    # Check input format
    ext = os.path.splitext(path)[1].lower()
    if ext in _okexts:
        return True

    if ext == '.pdf':
        if not pdftocairocmd:
            pdftocairocmd = rclexecm.which("pdftocairo")
            if not pdftocairocmd:
                pdftocairocmd = rclexecm.which("poppler/pdftocairo")
        if pdftocairocmd:
            return True

    return False
def __init__(self, em):
    """Set up the extractor and locate the DjVu command line helpers.

    `djvutxt` is mandatory: if it cannot be found, the HELPERNOTFOUND
    marker line is printed on stdout and the process exits with status 1.
    `djvused` is looked up too, but its absence is not treated as an error
    here.
    """
    super(DJVUExtractor, self).__init__(em)

    txt_helper = rclexecm.which("djvutxt")
    self.djvutxt = txt_helper
    if not txt_helper:
        print("RECFILTERROR HELPERNOTFOUND djvutxt")
        sys.exit(1)

    self.djvused = rclexecm.which("djvused")
def _initextrameta(self):
    """Prepare the extraction of user-selected XMP metadata fields.

    On entry `self.extrameta` is the raw configuration string. On success
    it is replaced by a list of [metaname, rclname] pairs, the lxml module
    is bound to the global `ET`, the regular expressions used on the
    metadata are compiled, and the optional fixup module named by
    `self.extrametafix` is loaded into the global `EMF`.

    If pdfinfo cannot be found, or lxml cannot be imported,
    `self.extrameta` is reset to None (and `self.pdfinfo` too in the lxml
    case) and the method returns early.
    """
    global ET, EMF

    # Outside of Windows, try the plain command name first, then fall back
    # to the bundled poppler location.
    if not _mswindows:
        self.pdfinfo = rclexecm.which("pdfinfo")
    if not self.pdfinfo:
        self.pdfinfo = rclexecm.which("poppler/pdfinfo")
    if not self.pdfinfo:
        self.extrameta = None
        return

    # extrameta is like "metanm|rclnm ...", where |rclnm may be absent
    # (keep the original name). Turn it into a list of pairs.
    pairs = []
    for item in self.extrameta.split():
        names = item.split('|')
        if len(names) == 1:
            names.append(names[0])
        pairs.append(names)
    self.extrameta = pairs

    # lxml is used because it copes better with namespaces than the
    # standard xml package. See http://stackoverflow.com/questions/14853243/
    # parsing-xml-with-namespace-in-python-via-elementtree
    try:
        import lxml.etree as ET
    except Exception as err:
        self.em.rclog("Can't import lxml etree: %s" % err)
        self.extrameta = None
        self.pdfinfo = None
        return

    self.re_head = re.compile(br'<head>', re.IGNORECASE)
    self.re_xmlpacket = re.compile(
        br'<\?xpacket[ ]+begin.*\?>' br'(.*)' br'<\?xpacket[ ]+end',
        flags=re.DOTALL)

    # Optional user-supplied module, loaded from the configured file path.
    EMF = None
    if self.extrametafix:
        try:
            import importlib.util
            modspec = importlib.util.spec_from_file_location(
                'pdfextrametafix', self.extrametafix)
            EMF = importlib.util.module_from_spec(modspec)
            modspec.loader.exec_module(EMF)
        except Exception as err:
            self.em.rclog("Import extrametafix failed: %s" % err)
            EMF = None
def __init__(self, em):
    """Locate the helper commands and read the PDF-related configuration.

    Args:
        em: the execm protocol object, stored as `self.em`.

    If pdftotext cannot be found the method returns early, leaving the
    remaining attributes unset (the original comment states that openfile()
    then fails at once).
    """
    self.currentindex = 0
    self.pdftotext = None
    self.pdfinfo = None
    self.pdftk = None
    self.em = em
    self.tesseract = None

    # Avoid picking up a default version on Windows, we want ours
    if not _mswindows:
        self.pdftotext = rclexecm.which("pdftotext")
    if not self.pdftotext:
        self.pdftotext = rclexecm.which("poppler/pdftotext")
    if not self.pdftotext:
        # No need for anything else. openfile() will return an
        # error at once
        return

    self.config = rclconfig.RclConfig()
    self.confdir = self.config.getConfDir()
    # The user can set a list of meta tags to be extracted from
    # the XMP metadata packet. These are specified as
    # (xmltag,rcltag) pairs
    self.extrameta = self.config.getConfParam("pdfextrameta")
    if self.extrameta:
        self.extrametafix = self.config.getConfParam("pdfextrametafix")
        self._initextrameta()

    # Check if we need to escape portions of text where old
    # versions of pdftotext output raw HTML special characters.
    self.needescape = True
    try:
        version = subprocess.check_output([self.pdftotext, "-v"],
                                          stderr=subprocess.STDOUT)
        # check_output() returns bytes: the separator has to be bytes too,
        # else split() raises TypeError and the check never succeeds.
        # int() accepts the resulting bytes fields.
        major, minor, _rev = version.split()[2].split(b'.')
        # Don't know exactly when this changed but it's fixed in
        # jessie 0.26.5
        if int(major) > 0 or int(minor) >= 26:
            self.needescape = False
    except Exception:
        # Best effort only: if the version cannot be determined, keep the
        # safe default (escaping enabled).
        pass

    # Pdftk is optionally used to extract attachments. This takes
    # a hit on performance even in the absence of any attachments,
    # so it can be disabled in the configuration.
    self.attextractdone = False
    self.attachlist = []
    cf_attach = self.config.getConfParam("pdfattach")
    cf_attach = rclexecm.configparamtrue(cf_attach)
    if cf_attach:
        self.pdftk = rclexecm.which("pdftk")
    if self.pdftk:
        self.maybemaketmpdir()
def openfile(self, params):
    """Record the file to process and make sure the helpers are known.

    `params["filename:"]` is stored as `self.filename` and the document
    index is reset. If `djvutxt` was not located yet it is searched now;
    failing to find it prints the HELPERNOTFOUND marker line and exits with
    status 1. `djvused` is looked up at the same time.

    Returns True.
    """
    self.filename = params["filename:"]
    self.currentindex = 0
    #self.em.rclog("openfile: [%s]" % self.filename)

    if self.djvutxt:
        # Helpers already resolved, nothing more to do.
        return True

    self.djvutxt = rclexecm.which("djvutxt")
    if not self.djvutxt:
        print("RECFILTERROR HELPERNOTFOUND djvutxt")
        sys.exit(1)
    self.djvused = rclexecm.which("djvused")
    return True
def openfile(self, params):
    """Remember the input file name and resolve the DjVu helpers if needed.

    Stores `params["filename:"]` in `self.filename`, resets
    `self.currentindex`, and, when `self.djvutxt` is still unset, looks up
    `djvutxt` (mandatory: the process exits with status 1 after printing
    the HELPERNOTFOUND marker line if it is missing) and `djvused`.

    Always returns True when it returns.
    """
    self.currentindex = 0
    self.filename = params["filename:"]
    #self.em.rclog("openfile: [%s]" % self.filename)

    helpers_missing = not self.djvutxt
    if helpers_missing:
        self.djvutxt = rclexecm.which("djvutxt")
        if not self.djvutxt:
            print("RECFILTERROR HELPERNOTFOUND djvutxt")
            sys.exit(1)
        self.djvused = rclexecm.which("djvused")

    return True
def __init__(self, em):
    """Locate the helper commands and read the PDF-related configuration.

    Args:
        em: the execm protocol object, stored as `self.em`.

    Sets `self.pdftotext`, `self.needescape`, `self.ocrpossible`,
    `self.pdftk` and the attachment bookkeeping attributes. `self.tesseract`
    and `self.pdftoppm` are only set when OCR is enabled by the "pdfocr"
    parameter or by an "ocrpdf" file in the configuration directory.
    """
    self.currentindex = 0
    self.pdftotext = None
    self.em = em

    # Build the configuration object once instead of once per value read.
    config = rclconfig.RclConfig()
    self.confdir = config.getConfDir()
    cf_doocr = config.getConfParam("pdfocr")
    cf_attach = config.getConfParam("pdfattach")

    self.pdftotext = rclexecm.which("pdftotext")
    if not self.pdftotext:
        self.pdftotext = rclexecm.which("poppler/pdftotext")

    # Check if we need to escape portions of text where old
    # versions of pdftotext output raw HTML special characters.
    self.needescape = True
    if self.pdftotext:
        try:
            version = subprocess.check_output([self.pdftotext, "-v"],
                                              stderr=subprocess.STDOUT)
            # check_output() returns bytes on Python 3: use a bytes
            # separator (also valid on Python 2), else split() raises
            # TypeError and the check can never succeed.
            major, minor, _rev = version.split()[2].split(b'.')
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except Exception:
            # Best effort only: if the version cannot be determined, keep
            # the safe default (escaping enabled).
            pass

    # See if we'll try to perform OCR. Need the commands and
    # either the presence of a file in the config dir (historical)
    # or a set config variable.
    self.ocrpossible = False
    if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
        self.tesseract = rclexecm.which("tesseract")
        if self.tesseract:
            self.pdftoppm = rclexecm.which("pdftoppm")
            if self.pdftoppm:
                self.ocrpossible = True
                self.maybemaketmpdir()
    # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

    # Pdftk is optionally used to extract attachments. This takes
    # a hit on performance even in the absence of any attachments,
    # so it can be disabled in the configuration.
    self.attextractdone = False
    self.attachlist = []
    if cf_attach:
        self.pdftk = rclexecm.which("pdftk")
    else:
        self.pdftk = None
    if self.pdftk:
        self.maybemaketmpdir()
def __init__(self, em):
    """Locate pffexport and prepare the base command line.

    Args:
        em: the execm protocol object, stored as `self.em`.

    When pffexport cannot be found, `self.cmd` is left unset; per the
    original note, openfile() then reports the error.
    """
    self.generator = None
    self.em = em

    # Export target: a path which is not supposed to exist.
    self.target = "\\\\?\\c:\\nonexistent" if _mswindows else "/nonexistent"

    # Plain command name first, then the bundled location.
    self.pffexport = (
        rclexecm.which("pffexport")
        or rclexecm.which("pffinstall/mingw32/bin/pffexport")
    )
    if self.pffexport:
        self.cmd = [self.pffexport, "-q", "-t", self.target, "-s"]
    # else: no need for anything else. openfile() will return an
    # error at once
def getCmd(self, fn):
    """Return the (command, postprocessor) pair for an RTF file.

    Only one attempt is made per document: once `self.ntry` is set, or if
    unrtf cannot be found, the "nothing to run" pair ([], None) is
    returned. `fn` is not used.
    """
    if not self.ntry:
        self.ntry = 1
        unrtf = rclexecm.which("unrtf")
        if unrtf:
            return ([unrtf, "--nopict", "--html"], RTFProcessData(self.em))
    return ([], None)
def getCmd(self, fn):
    '''Choose the next command to try, and its output postprocessor.

    State is kept in self.ntry. The first call proposes antiword. The
    second one looks at the actual MIME type of the file and proposes the
    text filter, the rtf filter or wvWare accordingly. Any further call,
    and any case where the needed helper is missing or the type is not
    handled, returns ([], None).'''
    if self.ntry == 0:
        self.ntry = 1
        antiword = rclexecm.which("antiword")
        if not antiword:
            return ([], None)
        return ([antiword, "-t", "-i", "1", "-m", "UTF-8"],
                WordProcessData(self.em))

    if self.ntry != 1:
        return ([], None)

    self.ntry = 2
    # antiword failed. Check for an rtf file, or text and process
    # accordingly. If the doc is actually msword, try wvWare.
    mt = self.mimetype(fn)
    self.em.rclog("rcldoc.py: actual MIME type %s" % mt)

    if mt == "text/plain":
        textfilter = os.path.join(self.execdir, "rcltext.py")
        return ([sys.executable, textfilter], WordPassData(self.em))

    if mt == "text/rtf":
        cmd = [sys.executable, os.path.join(self.execdir, "rclrtf.py"), "-s"]
        self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
        return (cmd, WordPassData(self.em))

    if mt == "application/msword":
        wvware = rclexecm.which("wvWare")
        if wvware:
            return ([wvware, "--nographics", "--charset=utf-8"],
                    WordPassData(self.em))

    return ([], None)
def getCmd(self, fn):
    """Return the command, postprocessor and options for a PPT file.

    A single attempt is made per document. When `self.ntry` is already set,
    or ppt-dump.py cannot be found, ([], None) is returned. Otherwise the
    result is a 3-tuple whose last element asks the executor to ignore the
    exit value. `fn` is not used.
    """
    if not self.ntry:
        self.ntry = 1
        dumper = rclexecm.which("ppt-dump.py")
        if dumper:
            # ppt-dump.py often exits 1 with valid data. Ignore exit value
            argv = [sys.executable, dumper, "--no-struct-output",
                    "--dump-text"]
            return (argv, PPTProcessData(self.em),
                    rclexec1.Executor.opt_ignxval)
    return ([], None)
def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with tesseract.

    The tesseract executable is resolved once and cached in the module-level
    `tesseractcmd`. The "tesseractcmd" configuration parameter is used when
    it is set, else the command is looked up with rclexecm.which(). Only a
    value naming an existing file is cached, so a bad value is reported (and
    rejected) on every call instead of being silently accepted on the next.

    Args:
        config: configuration object providing setKeyDir() and
            getConfParam() (presumably an RclConfig -- verify against caller).
        path: path of the candidate document.

    Returns:
        True if tesseract is usable and the extension of `path` is in
        `_okexts`, or is '.pdf' and pdftoppm could be located. False
        otherwise.
    """
    global tesseractcmd, pdftoppmcmd

    # Check for tesseract
    if not tesseractcmd:
        config.setKeyDir(os.path.dirname(path))
        # The configured value must be read *before* testing it: testing the
        # (empty) cached global here would make the configuration parameter
        # unreachable.
        candidate = config.getConfParam("tesseractcmd")
        if candidate:
            # It is very tempting to quote this value, esp. on Windows where
            # it will contain whitespace. There is no chance that an actual
            # command line would have quotes, so unquote it.
            candidate = candidate.strip('"')
        else:
            candidate = rclexecm.which("tesseract")
        if not candidate:
            _deb("tesseractcmd not found")
            return False
        if not os.path.isfile(candidate):
            _deb("tesseractcmd parameter [%s] is not a file" % candidate)
            return False
        tesseractcmd = candidate

    # Check input format
    ext = os.path.splitext(path)[1].lower()
    if ext in _okexts:
        return True

    if ext == '.pdf':
        # Check for pdftoppm. We could use pdftocairo, which can
        # produce a multi-page pdf and make the rest simpler, but the
        # legacy code used pdftoppm for some reason, and it appears
        # that the newest builds from conda-forge do not include
        # pdftocairo. So stay with pdftoppm.
        if not pdftoppmcmd:
            pdftoppmcmd = rclexecm.which("pdftoppm")
            if not pdftoppmcmd:
                pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
        if pdftoppmcmd:
            return True

    return False
def _initextrameta(self):
    """Prepare the extraction of user-selected XMP metadata fields.

    On entry `self.extrameta` is the raw configuration string. On success
    it is replaced by a list of [metaname, rclname] pairs, the lxml module
    is bound to the global `ET` and the regular expressions used on the
    metadata are compiled.

    If pdfinfo cannot be found, or lxml cannot be imported,
    `self.extrameta` is reset to None (and `self.pdfinfo` too in the lxml
    case) and the method returns early.
    """
    global ET

    # Plain command name first, then the bundled poppler location.
    self.pdfinfo = (rclexecm.which("pdfinfo")
                    or rclexecm.which("poppler/pdfinfo"))
    if not self.pdfinfo:
        self.extrameta = None
        return

    # extrameta is like "samename metanm|rclnm ...": turn it into a list
    # of pairs, duplicating the name when no '|' part is given.
    pairs = []
    for item in self.extrameta.split():
        names = item.split('|')
        if len(names) == 1:
            names.append(names[0])
        pairs.append(names)
    self.extrameta = pairs

    # lxml is used because it copes better with namespaces than the
    # standard xml package. See http://stackoverflow.com/questions/14853243/
    # parsing-xml-with-namespace-in-python-via-elementtree
    try:
        import lxml.etree as ET
    except Exception as err:
        self.em.rclog("Can't import lxml etree: %s" % err)
        self.extrameta = None
        self.pdfinfo = None
        return

    self.re_head = re.compile(r'<head>', re.IGNORECASE)
    self.re_xmlpacket = re.compile(
        r'<\?xpacket[ ]+begin.*\?>' r'(.*)' r'<\?xpacket[ ]+end',
        flags=re.DOTALL)
def getCmd(self, fn):
    '''Pick the command to run next and the matching postprocessor.

    self.ntry counts the attempts: antiword first, then a command chosen
    from the actual MIME type of the file (text filter, rtf filter or
    wvWare). At most two attempts are made; ([], None) means that there is
    nothing (more) to run.'''
    giveup = ([], None)

    if self.ntry == 0:
        self.ntry = 1
        antiword = rclexecm.which("antiword")
        if antiword:
            return ([antiword, "-t", "-i", "1", "-m", "UTF-8"],
                    WordProcessData(self.em))
        return giveup

    if self.ntry == 1:
        self.ntry = 2
        # antiword failed. Check for an rtf file, or text and process
        # accordingly. If the doc is actually msword, try wvWare.
        mt = self.mimetype(fn)
        self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
        if mt == "text/plain":
            script = os.path.join(self.execdir, "rcltext.py")
            return ([sys.executable, script], WordPassData(self.em))
        if mt == "text/rtf":
            script = os.path.join(self.execdir, "rclrtf.py")
            cmd = [sys.executable, script, "-s"]
            self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
            return (cmd, WordPassData(self.em))
        if mt == "application/msword":
            wvware = rclexecm.which("wvWare")
            if wvware:
                return ([wvware, "--nographics", "--charset=utf-8"],
                        WordPassData(self.em))

    return giveup
def getCmd(self, fn):
    """Return what to run for a PPT file: command, postprocessor, options.

    Only the first call for a document (self.ntry unset) proposes a
    command, and only if ppt-dump.py can be found. Every other case yields
    ([], None). `fn` is not used.
    """
    already_tried = bool(self.ntry)
    if already_tried:
        return ([], None)
    self.ntry = 1

    dumper = rclexecm.which("ppt-dump.py")
    if not dumper:
        return ([], None)

    # ppt-dump.py often exits 1 with valid data. Ignore exit value
    argv = [sys.executable, dumper, "--no-struct-output", "--dump-text"]
    return (argv, PPTProcessData(self.em), rclexec1.Executor.opt_ignxval)
def __init__(self, em):
    """Locate the helper commands and read the PDF-related configuration.

    Args:
        em: the execm protocol object, stored as `self.em`.

    Sets `self.pdftotext`, `self.ocrpossible`, `self.pdftk` and the
    attachment bookkeeping attributes. `self.tesseract` and `self.pdftoppm`
    are only set when OCR is enabled by the "pdfocr" parameter or by an
    "ocrpdf" file in the configuration directory.
    """
    self.currentindex = 0
    self.pdftotext = None
    self.em = em

    # Build the configuration object once instead of once per value read.
    config = rclconfig.RclConfig()
    self.confdir = config.getConfDir()
    cf_doocr = config.getConfParam("pdfocr")
    cf_attach = config.getConfParam("pdfattach")

    self.pdftotext = rclexecm.which("pdftotext")
    if not self.pdftotext:
        self.pdftotext = rclexecm.which("poppler/pdftotext")

    # See if we'll try to perform OCR. Need the commands and
    # either the presence of a file in the config dir (historical)
    # or a set config variable.
    self.ocrpossible = False
    if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
        self.tesseract = rclexecm.which("tesseract")
        if self.tesseract:
            self.pdftoppm = rclexecm.which("pdftoppm")
            if self.pdftoppm:
                self.ocrpossible = True
                self.maybemaketmpdir()
    # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

    # Pdftk is optionally used to extract attachments. This takes
    # a hit on performance even in the absence of any attachments,
    # so it can be disabled in the configuration.
    self.attextractdone = False
    self.attachlist = []
    if cf_attach:
        self.pdftk = rclexecm.which("pdftk")
    else:
        self.pdftk = None
    if self.pdftk:
        self.maybemaketmpdir()
def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with the ABBYY command line tool.

    The command is resolved once and cached in the module-level
    `abbyyocrcmd` (with its directory in `abbyyocrdir`): the "abbyyocrcmd"
    configuration parameter is preferred, else "abbyyocr11" is looked up
    with rclexecm.which().

    Args:
        config: configuration object providing setKeyDir() and
            getConfParam().
        path: path of the candidate document.

    Returns:
        True if the command is known and the lowercased extension of `path`
        is in `_okexts`, else False.
    """
    global abbyyocrcmd, abbyyocrdir

    if not abbyyocrcmd:
        config.setKeyDir(os.path.dirname(path))
        abbyyocrcmd = (config.getConfParam("abbyyocrcmd")
                       or rclexecm.which("abbyyocr11"))
        if not abbyyocrcmd:
            return False
        abbyyocrdir = os.path.dirname(abbyyocrcmd)

    # Check input format
    extension = os.path.splitext(path)[1].lower()
    return extension in _okexts
def getCmd(self, fn):
    """Return what to run for an XLS file: command, postprocessor, options.

    Only the first call for a document (self.ntry unset) proposes a
    command. The beginning of the file is sniffed first: if it contains
    'html' or 'HTML', the special "cat" command is returned with a
    postprocessor created with its second argument set to True. Otherwise
    xls-dump.py is used if it can be found, with the option asking the
    executor to ignore the exit value.

    Args:
        fn: path of the file to process.

    Returns:
        A (command, postprocessor) or (command, postprocessor, options)
        tuple; ([], None) when there is nothing (more) to run.
    """
    if self.ntry:
        return ([], None)
    self.ntry = 1

    # Some HTML files masquerade as XLS
    try:
        # Context manager: do not leak the file descriptor, this may be
        # called for many files during the lifetime of the process.
        with open(fn, 'rb') as sniffed:
            data = sniffed.read(512)
        if b'html' in data or b'HTML' in data:
            return ("cat", XLSProcessData(self.em, True))
    except Exception as err:
        # Best effort: log and fall through to the regular helper.
        self.em.rclog("Error reading %s:%s" % (fn, str(err)))

    cmd = rclexecm.which("xls-dump.py")
    if cmd:
        # xls-dump.py often exits 1 with valid data. Ignore exit value
        return ([sys.executable, cmd, "--dump-mode=canonical-xml",
                 "--utf-8", "--catch"],
                XLSProcessData(self.em), rclexec1.Executor.opt_ignxval)
    return ([], None)
return b'\n'.join(self.out) class RTFFilter: def __init__(self, em): self.em = em self.ntry = 0 def reset(self): self.ntry = 0 def getCmd(self, fn): if self.ntry: return ([], None) self.ntry = 1 cmd = rclexecm.which("unrtf") if cmd: return ([cmd, "--nopict", "--html"], RTFProcessData(self.em)) else: return ([], None) if __name__ == '__main__': if not rclexecm.which("unrtf"): print("RECFILTERROR HELPERNOTFOUND unrtf") sys.exit(1) proto = rclexecm.RclExecM() filter = RTFFilter(proto) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)
WordPassData(self.em)) elif mt == "text/rtf": cmd = [sys.executable, os.path.join(self.execdir, "rclrtf.py"), "-s"] self.em.rclog("rcldoc.py: returning cmd %s" % cmd) return (cmd, WordPassData(self.em)) elif mt == "application/msword": cmd = rclexecm.which("wvWare") if cmd: return ([cmd, "--nographics", "--charset=utf-8"], WordPassData(self.em)) else: return ([],None) else: return ([],None) else: return ([],None) if __name__ == '__main__': # Remember where we execute filters from, in case we need to exec another execdir = os.path.dirname(sys.argv[0]) # Check that we have antiword. We could fallback to wvWare, but # this is not what the old filter did. if not rclexecm.which("antiword"): print("RECFILTERROR HELPERNOTFOUND antiword") sys.exit(1) proto = rclexecm.RclExecM() filter = WordFilter(proto, execdir) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)
def _msg(s):
    """Send the message `s` to the rclexecm log."""
    rclexecm.logmsg(s)


# This script is only meant to run on Windows.
sysplat = platform.system()
if sysplat != "Windows":
    _msg("rcluncomp.py: only for Windows")
    sys.exit(1)

# Switch stdout to binary mode. A failure is logged but is not fatal.
try:
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
except Exception as err:
    _msg("setmode binary failed: %s" % err)

# 7z does the actual work: give up if it cannot be found.
sevenz = rclexecm.which("7z")
if not sevenz:
    _msg("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath "
         "in recoll.conf ?")
    sys.exit(2)

# Params: uncompression program, input file name, temp directory.
# We ignore the uncomp program, and always use 7z on Windows
infile, outdir = sys.argv[2], sys.argv[3]
# _msg("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir))

# There is apparently no way to suppress 7z output. Hopefully the
# possible deadlock described by the subprocess module doc can't occur
# here because there is little data printed. AFAIK nothing goes to stderr anyway
sys.executable, os.path.join(self.execdir, "rclrtf.py"), "-s" ] self.em.rclog("rcldoc.py: returning cmd %s" % cmd) return (cmd, WordPassData(self.em)) elif mt == "application/msword": cmd = rclexecm.which("wvWare") if cmd: return ([cmd, "--nographics", "--charset=utf-8"], WordPassData(self.em)) else: return ([], None) else: return ([], None) else: return ([], None) if __name__ == '__main__': # Remember where we execute filters from, in case we need to exec another execdir = os.path.dirname(sys.argv[0]) # Check that we have antiword. We could fallback to wvWare, but # this is not what the old filter did. if not rclexecm.which("antiword"): print("RECFILTERROR HELPERNOTFOUND antiword") sys.exit(1) proto = rclexecm.RclExecM() filter = WordFilter(proto, execdir) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)
self.ntry = 0 def reset(self): self.ntry = 0 pass def getCmd(self, fn): if self.ntry: return ([], None) self.ntry = 1 cmd = rclexecm.which("ppt-dump.py") if cmd: # ppt-dump.py often exits 1 with valid data. Ignore exit value return ( [sys.executable, cmd, "--no-struct-output", "--dump-text"], PPTProcessData(self.em), rclexec1.Executor.opt_ignxval, ) else: return ([], None) if __name__ == "__main__": if not rclexecm.which("ppt-dump.py"): print("RECFILTERROR HELPERNOTFOUND ppt-dump.py") sys.exit(1) proto = rclexecm.RclExecM() filter = PPTFilter(proto) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)
class PPTFilter:
    """Command provider for PowerPoint files, based on ppt-dump.py.

    `ntry` records whether a command was already proposed for the current
    document: a single attempt is made.
    """

    def __init__(self, em):
        self.em = em
        self.ntry = 0

    def reset(self):
        """Forget the previous attempt (start of a new document)."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the command, postprocessor and options for a PPT file.

        ([], None) is returned when a command was already proposed since
        the last reset(), or when ppt-dump.py cannot be found. `fn` is not
        used.
        """
        if not self.ntry:
            self.ntry = 1
            dumper = rclexecm.which("ppt-dump.py")
            if dumper:
                # ppt-dump.py often exits 1 with valid data. Ignore exit
                # value
                argv = [sys.executable, dumper, "--no-struct-output",
                        "--dump-text"]
                return (argv, PPTProcessData(self.em),
                        rclexec1.Executor.opt_ignxval)
        return ([], None)


if __name__ == '__main__':
    # The helper is mandatory: report its absence the way the indexer
    # expects, then give up.
    if not rclexecm.which("ppt-dump.py"):
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    filter = PPTFilter(proto)
    extract = rclexec1.Executor(proto, filter)
    rclexecm.main(proto, extract)
def wrapData(self): return self.out class RTFFilter: def __init__(self, em): self.em = em self.ntry = 0 def reset(self): self.ntry = 0 def getCmd(self, fn): if self.ntry: return ([], None) self.ntry = 1 cmd = rclexecm.which("unrtf") if cmd: return ([cmd, "--nopict", "--html"], RTFProcessData(self.em)) else: return ([], None) if __name__ == '__main__': if not rclexecm.which("unrtf"): print("RECFILTERROR HELPERNOTFOUND unrtf") sys.exit(1) proto = rclexecm.RclExecM() filter = RTFFilter(proto) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)
# Diagnostics go to stderr. Point ftrace to a file instead when debugging:
#ftrace = open("C:/Users/Bill/log-uncomp.txt", "w")
ftrace = sys.stderr

# This script is only meant to run on Windows.
sysplat = platform.system()
if sysplat != "Windows":
    print("rcluncomp.py: only for Windows", file=ftrace)
    sys.exit(1)

# Switch stdout to binary mode. A failure is reported but is not fatal.
try:
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
except Exception as err:
    print("setmode binary failed: %s" % err, file=ftrace)

# 7z does the actual work: give up if it cannot be found.
sevenz = rclexecm.which("7z")
if not sevenz:
    print("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath "
          "in recoll.conf ?", file=ftrace)
    sys.exit(2)

# Params: uncompression program, input file name, temp directory.
# We ignore the uncomp program, and always use 7z on Windows
infile, outdir = sys.argv[2], sys.argv[3]
# print("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir), file = ftrace)

# There is apparently no way to suppress 7z output. Hopefully the
# possible deadlock described by the subprocess module doc can't occur
# here because there is little data printed. AFAIK nothing goes to stderr anyway
if self.ntry: return ([], None) self.ntry = 1 # Some HTML files masquerade as XLS try: data = open(fn, 'rb').read(512) if data.find(b'html') != -1 or data.find(b'HTML') != -1: return ("cat", XLSProcessData(self.em, True)) except Exception as err: self.em.rclog("Error reading %s:%s" % (fn, str(err))) pass cmd = rclexecm.which("xls-dump.py") if cmd: # xls-dump.py often exits 1 with valid data. Ignore exit value # We later treat an empty output as an error return ([sys.executable, cmd, "--dump-mode=canonical-xml", \ "--utf-8", "--catch"], XLSProcessData(self.em), rclexec1.Executor.opt_ignxval) else: return ([], None) if __name__ == '__main__': if not rclexecm.which("xls-dump.py"): print("RECFILTERROR HELPERNOTFOUND ppt-dump.py") sys.exit(1) proto = rclexecm.RclExecM() filter = XLSFilter(proto) extract = rclexec1.Executor(proto, filter) rclexecm.main(proto, extract)