Ejemplo n.º 1
0
def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with tesseract.

    The tesseract command is determined once and cached in the module-level
    `tesseractcmd` variable. The `tesseractcmd` configuration parameter
    (read after calling config.setKeyDir() with the directory of `path`) is
    preferred; when it is not set, rclexecm.which("tesseract") is used.
    PDF input additionally needs pdftocairo, cached in the module-level
    `pdftocairocmd` variable.

    Returns True if the needed commands are available and the lowercased
    file extension is either in `_okexts` or '.pdf', False otherwise.
    """
    # Check for tesseract
    global tesseractcmd
    if not tesseractcmd:
        config.setKeyDir(os.path.dirname(path))
        # The configured value has to be fetched before it can be tested:
        # testing the still-empty global would always select the fallback.
        confvalue = config.getConfParam("tesseractcmd")
        if confvalue:
            # It is very tempting to quote this value, esp. on Windows where it
            # will contain whitespace. There is no chance that an actual
            # command line would have quotes, so unquote it.
            tesseractcmd = confvalue.strip('"')
        else:
            tesseractcmd = rclexecm.which("tesseract")
        if not tesseractcmd:
            _deb("tesseractcmd not found")
            return False
    if not os.path.isfile(tesseractcmd):
        _deb("tesseractcmd parameter [%s] is not a file" % tesseractcmd)
        return False

    # Check input format
    ext = os.path.splitext(path)[1].lower()
    if ext in _okexts:
        return True

    if ext == '.pdf':
        # PDF pages need pdftocairo: look for it once, first by its plain
        # name, then under poppler/.
        global pdftocairocmd
        if not pdftocairocmd:
            pdftocairocmd = rclexecm.which("pdftocairo")
            if not pdftocairocmd:
                pdftocairocmd = rclexecm.which("poppler/pdftocairo")
        if pdftocairocmd:
            return True

    return False
Ejemplo n.º 2
0
 def __init__(self, em):
     """Initialize the base class and locate the DjVu helper commands.

     djvutxt is mandatory: when it can't be found, an error line is
     printed on stdout and the process exits with status 1. djvused is
     looked up too, but its absence is not checked here.
     """
     super(DJVUExtractor, self).__init__(em)
     txtcmd = rclexecm.which("djvutxt")
     self.djvutxt = txtcmd
     if not txtcmd:
         print("RECFILTERROR HELPERNOTFOUND djvutxt")
         sys.exit(1)
     self.djvused = rclexecm.which("djvused")
Ejemplo n.º 3
0
    def _initextrameta(self):
        """Set up what is needed to extract the extra XMP metadata fields.

        Locates pdfinfo, turns self.extrameta (a whitespace-separated
        string of "metanm|rclnm" entries) into a list of lists, imports
        lxml.etree as the global ET, compiles the regular expressions used
        on the data and, if self.extrametafix is set, loads that file as a
        module into the global EMF.

        If pdfinfo or lxml is missing, self.extrameta is reset to None
        (and self.pdfinfo too in the lxml case) and the method returns
        early.
        """
        # On Windows the plain lookup is skipped, only poppler/ is tried
        if not _mswindows:
            self.pdfinfo = rclexecm.which("pdfinfo")
        if not self.pdfinfo:
            self.pdfinfo = rclexecm.which("poppler/pdfinfo")
            if not self.pdfinfo:
                self.extrameta = None
                return

        # extrameta is like "metanm|rclnm ...", where |rclnm maybe absent
        # (keep original name): a lone name is used for both elements.
        pairs = []
        for entry in self.extrameta.split():
            names = entry.split('|')
            pairs.append(names * 2 if len(names) == 1 else names)
        self.extrameta = pairs

        # lxml is used rather than xml.etree because it deals better with
        # namespaces: with xml, we'd have to walk the XML tree first,
        # extracting all xmlns attributes (tried without success). lxml
        # does it partially for us. See
        # http://stackoverflow.com/questions/14853243/
        #    parsing-xml-with-namespace-in-python-via-elementtree
        global ET
        try:
            import lxml.etree as ET
        except Exception as err:
            self.em.rclog("Can't import lxml etree: %s" % err)
            self.extrameta = None
            self.pdfinfo = None
            return

        self.re_head = re.compile(br'<head>', re.IGNORECASE)
        self.re_xmlpacket = re.compile(
            br'<\?xpacket[ 	]+begin.*\?>'
            br'(.*)'
            br'<\?xpacket[ 	]+end',
            flags=re.DOTALL)

        # Optional user-supplied module, loaded from the configured path
        global EMF
        EMF = None
        if not self.extrametafix:
            return
        try:
            import importlib.util
            modspec = importlib.util.spec_from_file_location(
                'pdfextrametafix', self.extrametafix)
            EMF = importlib.util.module_from_spec(modspec)
            modspec.loader.exec_module(EMF)
        except Exception as err:
            self.em.rclog("Import extrametafix failed: %s" % err)
            EMF = None
Ejemplo n.º 4
0
    def __init__(self, em):
        """Locate the helper commands and read the PDF-related configuration.

        em is stored as self.em (it is used for logging elsewhere in the
        class). If pdftotext can't be found the method returns early,
        leaving the other attributes unset: openfile() is expected to
        report the error.
        """
        self.currentindex = 0
        self.pdftotext = None
        self.pdfinfo = None
        self.pdftk = None
        self.em = em
        self.tesseract = None

        # Avoid picking up a default version on Windows, we want ours
        if not _mswindows:
            self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")
            if not self.pdftotext:
                # No need for anything else. openfile() will return an
                # error at once
                return

        self.config = rclconfig.RclConfig()
        self.confdir = self.config.getConfDir()
        # The user can set a list of meta tags to be extracted from
        # the XMP metadata packet. These are specified as
        # (xmltag,rcltag) pairs
        self.extrameta = self.config.getConfParam("pdfextrameta")
        if self.extrameta:
            self.extrametafix = self.config.getConfParam("pdfextrametafix")
            self._initextrameta()

        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        try:
            output = subprocess.check_output([self.pdftotext, "-v"],
                                             stderr=subprocess.STDOUT)
            # check_output() returns bytes under Python 3: decode before
            # splitting on a str separator, else the TypeError would leave
            # needescape set whatever the version.
            if isinstance(output, bytes):
                output = output.decode('utf-8', 'replace')
            # The version is expected as the third word of the output.
            # Only the first two components are needed.
            major, minor = output.split()[2].split('.')[:2]
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except (OSError, subprocess.CalledProcessError, ValueError,
                IndexError):
            # Best effort: can't run pdftotext or can't make sense of its
            # output. Stay on the safe side and keep escaping.
            pass

        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
        self.attextractdone = False
        self.attachlist = []
        cf_attach = self.config.getConfParam("pdfattach")
        cf_attach = rclexecm.configparamtrue(cf_attach)
        if cf_attach:
            self.pdftk = rclexecm.which("pdftk")
        if self.pdftk:
            self.maybemaketmpdir()
Ejemplo n.º 5
0
    def openfile(self, params):
        """Remember the file to process and make sure djvutxt is known.

        params is a mapping, the file name is read from its "filename:"
        entry. If self.djvutxt is not set yet, the helper is looked up:
        failure to find it prints an error line on stdout and exits the
        process with status 1. Returns True.
        """
        self.filename = params["filename:"]
        self.currentindex = 0

        if self.djvutxt:
            return True

        self.djvutxt = rclexecm.which("djvutxt")
        if not self.djvutxt:
            print("RECFILTERROR HELPERNOTFOUND djvutxt")
            sys.exit(1)
        self.djvused = rclexecm.which("djvused")
        return True
Ejemplo n.º 6
0
    def openfile(self, params):
        """Store the name of the file to process and check for djvutxt.

        The name comes from the "filename:" entry of params and the
        current index is reset to 0. When self.djvutxt is still unset,
        djvutxt (mandatory: the process exits with status 1 after printing
        an error line if it is missing) and djvused are looked up.
        Always returns True.
        """
        self.filename = params["filename:"]
        self.currentindex = 0

        helper_known = bool(self.djvutxt)
        if not helper_known:
            found = rclexecm.which("djvutxt")
            self.djvutxt = found
            if not found:
                print("RECFILTERROR HELPERNOTFOUND djvutxt")
                sys.exit(1)
            self.djvused = rclexecm.which("djvused")

        return True
Ejemplo n.º 7
0
    def __init__(self, em):
        """Locate the helper commands and read the PDF-related configuration.

        em is stored as self.em. The configuration supplies the
        configuration directory and the "pdfocr" and "pdfattach"
        parameters, which enable OCR and attachment extraction.
        """
        self.currentindex = 0
        self.pdftotext = None
        self.em = em
        # Always defined, even when OCR is not enabled or not possible
        self.tesseract = None
        self.pdftoppm = None

        # A single configuration object is enough for all lookups
        config = rclconfig.RclConfig()
        self.confdir = config.getConfDir()
        cf_doocr = config.getConfParam("pdfocr")
        cf_attach = config.getConfParam("pdfattach")

        self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")

        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        if self.pdftotext:
            try:
                output = subprocess.check_output([self.pdftotext, "-v"],
                                                 stderr=subprocess.STDOUT)
                # check_output() returns bytes under Python 3: decode
                # before splitting on a str separator.
                if isinstance(output, bytes):
                    output = output.decode('utf-8', 'replace')
                # The version is expected as the third word of the output.
                # Only the first two components are needed.
                major, minor = output.split()[2].split('.')[:2]
                # Don't know exactly when this changed but it's fixed in
                # jessie 0.26.5
                if int(major) > 0 or int(minor) >= 26:
                    self.needescape = False
            except (OSError, subprocess.CalledProcessError, ValueError,
                    IndexError):
                # Best effort: can't run pdftotext or can't make sense of
                # its output. Stay on the safe side and keep escaping.
                pass

        # See if we'll try to perform OCR. Need the commands and
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
            self.tesseract = rclexecm.which("tesseract")
            if self.tesseract:
                self.pdftoppm = rclexecm.which("pdftoppm")
                if self.pdftoppm:
                    self.ocrpossible = True
                    self.maybemaketmpdir()
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
        self.attextractdone = False
        self.attachlist = []
        if cf_attach:
            self.pdftk = rclexecm.which("pdftk")
        else:
            self.pdftk = None
        if self.pdftk:
            self.maybemaketmpdir()
Ejemplo n.º 8
0
 def __init__(self, em):
     """Locate pffexport and prepare the base command line.

     The target path passed to pffexport with -t depends on the
     platform. When pffexport can't be found, self.cmd is left unset and
     the method returns early.
     """
     self.generator = None
     self.em = em
     self.target = "\\\\?\\c:\\nonexistent" if _mswindows else "/nonexistent"

     exporter = rclexecm.which("pffexport")
     if not exporter:
         exporter = rclexecm.which("pffinstall/mingw32/bin/pffexport")
     self.pffexport = exporter
     if not exporter:
         # No need for anything else. openfile() will return an
         # error at once
         return
     self.cmd = [exporter, "-q", "-t", self.target, "-s"]
Ejemplo n.º 9
0
 def getCmd(self, fn):
     """Return the (command, postprocessor) pair for the first attempt.

     The unrtf command is only proposed once: after self.ntry has been
     set, or when unrtf can't be found, ([], None) is returned. fn is
     not used.
     """
     if not self.ntry:
         self.ntry = 1
         unrtf = rclexecm.which("unrtf")
         if unrtf:
             return ([unrtf, "--nopict", "--html"], RTFProcessData(self.em))
     return ([], None)
Ejemplo n.º 10
0
 def getCmd(self, fn):
     """Supply the unrtf command line and its output processor, once.

     Returns ([], None) if a command was already supplied (self.ntry
     set) or if unrtf can't be found. fn is not used.
     """
     already_tried = bool(self.ntry)
     if already_tried:
         return ([], None)
     self.ntry = 1
     helper = rclexecm.which("unrtf")
     if not helper:
         return ([], None)
     argv = [helper, "--nopict", "--html"]
     return (argv, RTFProcessData(self.em))
Ejemplo n.º 11
0
 def getCmd(self, fn):
     '''Return the command to execute and the matching postprocessor,
     depending on how many attempts were already made (self.ntry):
     antiword first, then a command chosen from the MIME type reported
     by self.mimetype(fn). ([], None) means that there is nothing
     (more) to try. At most 2 tries are made.'''
     attempt = self.ntry
     if attempt == 0:
         self.ntry = 1
         antiword = rclexecm.which("antiword")
         if not antiword:
             return ([], None)
         return ([antiword, "-t", "-i", "1", "-m", "UTF-8"],
                 WordProcessData(self.em))

     if attempt != 1:
         return ([], None)

     self.ntry = 2
     # antiword failed. Check for an rtf file, or text and
     # process accordingly. If the doc is actually msword, try
     # wvWare.
     mt = self.mimetype(fn)
     self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
     if mt == "text/plain":
         return ([sys.executable,
                  os.path.join(self.execdir, "rcltext.py")],
                 WordPassData(self.em))
     if mt == "text/rtf":
         cmd = [sys.executable,
                os.path.join(self.execdir, "rclrtf.py"), "-s"]
         self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
         return (cmd, WordPassData(self.em))
     if mt == "application/msword":
         wvware = rclexecm.which("wvWare")
         if wvware:
             return ([wvware, "--nographics", "--charset=utf-8"],
                     WordPassData(self.em))
     return ([], None)
Ejemplo n.º 12
0
 def getCmd(self, fn):
     """Return the ppt-dump.py command line, its output processor and
     the execution option, on the first call only.

     ([], None) is returned when a command was already supplied
     (self.ntry set) or when ppt-dump.py can't be found. fn is not used.
     """
     if not self.ntry:
         self.ntry = 1
         dumper = rclexecm.which("ppt-dump.py")
         if dumper:
             # ppt-dump.py often exits 1 with valid data. Ignore exit value
             argv = [sys.executable, dumper,
                     "--no-struct-output", "--dump-text"]
             return (argv, PPTProcessData(self.em),
                     rclexec1.Executor.opt_ignxval)
     return ([], None)
Ejemplo n.º 13
0
def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with tesseract.

    The tesseract command is determined once and cached in the module-level
    `tesseractcmd` variable. The `tesseractcmd` configuration parameter
    (read after calling config.setKeyDir() with the directory of `path`) is
    preferred; when it is not set, rclexecm.which("tesseract") is used.
    PDF input additionally needs pdftoppm, cached in the module-level
    `pdftoppmcmd` variable.

    Returns True if the needed commands are available and the lowercased
    file extension is either in `_okexts` or '.pdf', False otherwise.
    """
    # Check for tesseract
    global tesseractcmd
    if not tesseractcmd:
        config.setKeyDir(os.path.dirname(path))
        # The configured value has to be fetched before it can be tested:
        # testing the still-empty global would always select the fallback.
        confvalue = config.getConfParam("tesseractcmd")
        if confvalue:
            # It is very tempting to quote this value, esp. on Windows where it
            # will contain whitespace. There is no chance that an actual
            # command line would have quotes, so unquote it.
            tesseractcmd = confvalue.strip('"')
        else:
            tesseractcmd = rclexecm.which("tesseract")
        if not tesseractcmd:
            _deb("tesseractcmd not found")
            return False
    if not os.path.isfile(tesseractcmd):
        _deb("tesseractcmd parameter [%s] is not a file" % tesseractcmd)
        return False

    # Check input format
    ext = os.path.splitext(path)[1].lower()
    if ext in _okexts:
        return True

    if ext == '.pdf':
        # Check for pdftoppm. We could use pdftocairo, which can
        # produce a multi-page pdf and make the rest simpler, but the
        # legacy code used pdftoppm for some reason, and it appears
        # that the newest builds from conda-forge do not include
        # pdftocairo. So stay with pdftoppm.
        global pdftoppmcmd
        if not pdftoppmcmd:
            pdftoppmcmd = rclexecm.which("pdftoppm")
            if not pdftoppmcmd:
                pdftoppmcmd = rclexecm.which("poppler/pdftoppm")
        if pdftoppmcmd:
            return True

    return False
Ejemplo n.º 14
0
    def _initextrameta(self):
        """Set up what is needed to extract the extra XMP metadata fields.

        Locates pdfinfo, turns self.extrameta (a whitespace-separated
        string of "samename" or "metanm|rclnm" entries) into a list of
        lists, imports lxml.etree as the global ET and compiles the
        regular expressions used on the data.

        If pdfinfo or lxml is missing, self.extrameta is reset to None
        (and self.pdfinfo too in the lxml case) and the method returns
        early.
        """
        self.pdfinfo = rclexecm.which("pdfinfo")
        if not self.pdfinfo:
            self.pdfinfo = rclexecm.which("poppler/pdfinfo")
            if not self.pdfinfo:
                self.extrameta = None
                return

        # extrameta is like "samename metanm|rclnm ...": a lone name is
        # used for both elements of the pair.
        pairs = []
        for entry in self.extrameta.split():
            names = entry.split('|')
            pairs.append(names * 2 if len(names) == 1 else names)
        self.extrameta = pairs

        # lxml is used rather than xml.etree because it deals better with
        # namespaces: with xml, we'd have to walk the XML tree first,
        # extracting all xmlns attributes (tried without success). lxml
        # does it partially for us. See
        # http://stackoverflow.com/questions/14853243/
        #    parsing-xml-with-namespace-in-python-via-elementtree
        global ET
        try:
            import lxml.etree as ET
        except Exception as err:
            self.em.rclog("Can't import lxml etree: %s" % err)
            self.extrameta = None
            self.pdfinfo = None
            return

        self.re_head = re.compile(r'<head>', re.IGNORECASE)
        self.re_xmlpacket = re.compile(
            r'<\?xpacket[ 	]+begin.*\?>'
            r'(.*)'
            r'<\?xpacket[ 	]+end',
            flags=re.DOTALL)
Ejemplo n.º 15
0
 def getCmd(self, fn):
     '''Return the command to execute and the matching postprocessor,
     according to our state (self.ntry): antiword is tried first, then
     a command depending on the MIME type reported by
     self.mimetype(fn). ([], None) means that nothing can be tried.
     Do 2 tries at most'''
     if self.ntry == 0:
         self.ntry = 1
         antiword = rclexecm.which("antiword")
         if antiword:
             return ([antiword, "-t", "-i", "1", "-m", "UTF-8"],
                     WordProcessData(self.em))
         return ([], None)

     if self.ntry == 1:
         self.ntry = 2
         # antiword failed. Check for an rtf file, or text and
         # process accordingly. If the doc is actually msword, try
         # wvWare.
         mt = self.mimetype(fn)
         self.em.rclog("rcldoc.py: actual MIME type %s" % mt)
         if mt == "text/plain":
             textscript = os.path.join(self.execdir, "rcltext.py")
             return ([sys.executable, textscript], WordPassData(self.em))
         if mt == "text/rtf":
             rtfscript = os.path.join(self.execdir, "rclrtf.py")
             cmd = [sys.executable, rtfscript, "-s"]
             self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
             return (cmd, WordPassData(self.em))
         if mt == "application/msword":
             wvware = rclexecm.which("wvWare")
             if wvware:
                 return ([wvware, "--nographics", "--charset=utf-8"],
                         WordPassData(self.em))

     return ([], None)
Ejemplo n.º 16
0
 def getCmd(self, fn):
     """Supply the ppt-dump.py command, its output processor and the
     execution option, once.

     Returns ([], None) if a command was already supplied (self.ntry
     set) or if ppt-dump.py can't be found. fn is not used.
     """
     already_tried = bool(self.ntry)
     if already_tried:
         return ([], None)
     self.ntry = 1
     script = rclexecm.which("ppt-dump.py")
     if not script:
         return ([], None)
     # ppt-dump.py often exits 1 with valid data. Ignore exit value
     argv = [sys.executable, script, "--no-struct-output", "--dump-text"]
     return (argv, PPTProcessData(self.em), rclexec1.Executor.opt_ignxval)
Ejemplo n.º 17
0
    def __init__(self, em):
        """Locate the helper commands and read the PDF-related configuration.

        em is stored as self.em. The configuration supplies the
        configuration directory and the "pdfocr" and "pdfattach"
        parameters, which enable OCR and attachment extraction.
        """
        self.currentindex = 0
        self.pdftotext = None
        self.em = em
        # Always defined, even when OCR is not enabled or not possible
        self.tesseract = None
        self.pdftoppm = None

        # A single configuration object is enough for all lookups
        config = rclconfig.RclConfig()
        self.confdir = config.getConfDir()
        cf_doocr = config.getConfParam("pdfocr")
        cf_attach = config.getConfParam("pdfattach")

        self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")

        # See if we'll try to perform OCR. Need the commands and
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
            self.tesseract = rclexecm.which("tesseract")
            if self.tesseract:
                self.pdftoppm = rclexecm.which("pdftoppm")
                if self.pdftoppm:
                    self.ocrpossible = True
                    self.maybemaketmpdir()
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)

        # Pdftk is optionally used to extract attachments. This takes
        # a hit on performance even in the absence of any attachments,
        # so it can be disabled in the configuration.
        self.attextractdone = False
        self.attachlist = []
        if cf_attach:
            self.pdftk = rclexecm.which("pdftk")
        else:
            self.pdftk = None
        if self.pdftk:
            self.maybemaketmpdir()
Ejemplo n.º 18
0
def ocrpossible(config, path):
    """Tell whether `path` can be OCR'ed with the ABBYY command.

    The command is determined once and cached in the module-level
    `abbyyocrcmd` variable: the `abbyyocrcmd` configuration parameter
    (read after calling config.setKeyDir() with the directory of `path`)
    is tried first, then rclexecm.which("abbyyocr11"). The directory of
    the command is stored in the module-level `abbyyocrdir`.

    Returns True if the command is known and the lowercased file
    extension is in `_okexts`, False otherwise.
    """
    global abbyyocrcmd
    global abbyyocrdir
    if not abbyyocrcmd:
        config.setKeyDir(os.path.dirname(path))
        abbyyocrcmd = (config.getConfParam("abbyyocrcmd")
                       or rclexecm.which("abbyyocr11"))
        if not abbyyocrcmd:
            return False
        abbyyocrdir = os.path.dirname(abbyyocrcmd)

    # Check input format
    extension = os.path.splitext(path)[1].lower()
    return extension in _okexts
Ejemplo n.º 19
0
 def getCmd(self, fn):
     """Return the command to execute on fn and the output processor.

     A command is only supplied on the first call: ([], None) is
     returned when self.ntry is already set, or when xls-dump.py can't
     be found. If the first 512 bytes of the file contain 'html' or
     'HTML', ("cat", processor) is returned instead of the xls-dump.py
     command line. An error while reading the file is logged and the
     xls-dump.py command is then used.
     """
     if self.ntry:
         return ([], None)
     self.ntry = 1
     # Some HTML files masquerade as XLS
     try:
         # The with statement makes sure that the file gets closed
         with open(fn, 'rb') as f:
             data = f.read(512)
         if b'html' in data or b'HTML' in data:
             return ("cat", XLSProcessData(self.em, True))
     except Exception as err:
         self.em.rclog("Error reading %s:%s" % (fn, str(err)))
     cmd = rclexecm.which("xls-dump.py")
     if cmd:
         # xls-dump.py often exits 1 with valid data. Ignore exit value
         return ([sys.executable, cmd, "--dump-mode=canonical-xml",
                  "--utf-8", "--catch"],
                 XLSProcessData(self.em), rclexec1.Executor.opt_ignxval)
     else:
         return ([], None)
Ejemplo n.º 20
0
        return b'\n'.join(self.out)


class RTFFilter:
    """Supplies the command used to process an RTF file.

    self.ntry records whether the command was already handed out, so
    that getCmd() only proposes it once between two reset() calls.
    """

    def __init__(self, em):
        self.ntry = 0
        self.em = em

    def reset(self):
        """Allow getCmd() to supply its command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the unrtf command line and its output processor, or
        ([], None) if it was already supplied or unrtf can't be found.
        fn is not used."""
        if not self.ntry:
            self.ntry = 1
            unrtf = rclexecm.which("unrtf")
            if unrtf:
                return ([unrtf, "--nopict", "--html"],
                        RTFProcessData(self.em))
        return ([], None)


if __name__ == '__main__':
    # unrtf is mandatory: report its absence on stdout and give up
    if rclexecm.which("unrtf"):
        proto = rclexecm.RclExecM()
        rtffilter = RTFFilter(proto)
        rclexecm.main(proto, rclexec1.Executor(proto, rtffilter))
    else:
        print("RECFILTERROR HELPERNOTFOUND unrtf")
        sys.exit(1)
Ejemplo n.º 21
0
                       WordPassData(self.em))
            elif mt == "text/rtf":
                cmd = [sys.executable, os.path.join(self.execdir, "rclrtf.py"),
                       "-s"]
                self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
                return (cmd, WordPassData(self.em))
            elif mt == "application/msword":
                cmd = rclexecm.which("wvWare")
                if cmd:
                    return ([cmd, "--nographics", "--charset=utf-8"],
                            WordPassData(self.em))
                else:
                    return ([],None)    
            else:
                return ([],None)
        else:
            return ([],None)

if __name__ == '__main__':
    # Check that we have antiword. We could fallback to wvWare, but
    # this is not what the old filter did.
    if not rclexecm.which("antiword"):
        print("RECFILTERROR HELPERNOTFOUND antiword")
        sys.exit(1)
    # Remember where we execute filters from, in case we need to exec another
    execdir = os.path.dirname(sys.argv[0])
    proto = rclexecm.RclExecM()
    wordfilter = WordFilter(proto, execdir)
    rclexecm.main(proto, rclexec1.Executor(proto, wordfilter))
Ejemplo n.º 22
0
def _msg(s):
    """Log the message s by passing it to rclexecm.logmsg()."""
    rclexecm.logmsg(s)
    

# This script is for Windows only: anywhere else, log a message and exit
# with status 1.
sysplat = platform.system()
if sysplat != "Windows":
    _msg("rcluncomp.py: only for Windows")
    sys.exit(1)

# Switch stdout to binary mode. A failure is logged but is not fatal.
try:
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
except Exception as err:
    _msg("setmode binary failed: %s" % str(err))

# 7z is mandatory: exit with status 2 if it can't be found.
sevenz = rclexecm.which("7z")
if not sevenz:
    _msg("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath " \
          "in recoll.conf ?")
    sys.exit(2)

# Params: uncompression program, input file name, temp directory.
# We ignore the uncomp program, and always use 7z on Windows

# NOTE(review): the argument count is not checked, a short command line
# raises IndexError here.
infile = sys.argv[2]
outdir = sys.argv[3]
# _msg("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir))

# There is apparently no way to suppress 7z output. Hopefully the
# possible deadlock described by the subprocess module doc can't occur
# here because there is little data printed. AFAIK nothing goes to stderr anyway
Ejemplo n.º 23
0
                    sys.executable,
                    os.path.join(self.execdir, "rclrtf.py"), "-s"
                ]
                self.em.rclog("rcldoc.py: returning cmd %s" % cmd)
                return (cmd, WordPassData(self.em))
            elif mt == "application/msword":
                cmd = rclexecm.which("wvWare")
                if cmd:
                    return ([cmd, "--nographics",
                             "--charset=utf-8"], WordPassData(self.em))
                else:
                    return ([], None)
            else:
                return ([], None)
        else:
            return ([], None)


if __name__ == '__main__':
    # Check that we have antiword. We could fallback to wvWare, but
    # this is not what the old filter did.
    have_antiword = rclexecm.which("antiword")
    if not have_antiword:
        print("RECFILTERROR HELPERNOTFOUND antiword")
        sys.exit(1)
    # Remember where we execute filters from, in case we need to exec another
    execdir = os.path.dirname(sys.argv[0])
    proto = rclexecm.RclExecM()
    executor = rclexec1.Executor(proto, WordFilter(proto, execdir))
    rclexecm.main(proto, executor)
Ejemplo n.º 24
0
        self.ntry = 0

    def reset(self):
        """Clear the attempt counter so that getCmd() supplies its
        command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the ppt-dump.py command line, its output processor and
        the execution option, on the first call only.

        ([], None) is returned when a command was already supplied
        (self.ntry set) or when ppt-dump.py can't be found. fn is not
        used.
        """
        if not self.ntry:
            self.ntry = 1
            dumper = rclexecm.which("ppt-dump.py")
            if dumper:
                # ppt-dump.py often exits 1 with valid data. Ignore exit
                # value
                argv = [sys.executable, dumper,
                        "--no-struct-output", "--dump-text"]
                return (argv, PPTProcessData(self.em),
                        rclexec1.Executor.opt_ignxval)
        return ([], None)


if __name__ == "__main__":
    # ppt-dump.py is mandatory: report its absence on stdout and give up
    if rclexecm.which("ppt-dump.py"):
        proto = rclexecm.RclExecM()
        pptfilter = PPTFilter(proto)
        rclexecm.main(proto, rclexec1.Executor(proto, pptfilter))
    else:
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)
Ejemplo n.º 25
0
class PPTFilter:
    """Supplies the command used to process a PowerPoint file.

    self.ntry records whether the command was already handed out, so
    that getCmd() only proposes it once between two reset() calls.
    """

    def __init__(self, em):
        self.ntry = 0
        self.em = em

    def reset(self):
        """Allow getCmd() to supply its command again."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return the ppt-dump.py command line, its output processor and
        the execution option, or ([], None) if they were already
        supplied or ppt-dump.py can't be found. fn is not used."""
        if not self.ntry:
            self.ntry = 1
            dumper = rclexecm.which("ppt-dump.py")
            if dumper:
                # ppt-dump.py often exits 1 with valid data. Ignore exit
                # value
                argv = [sys.executable, dumper,
                        "--no-struct-output", "--dump-text"]
                return (argv, PPTProcessData(self.em),
                        rclexec1.Executor.opt_ignxval)
        return ([], None)

if __name__ == '__main__':
    # Nothing can be done without ppt-dump.py
    have_dumper = rclexecm.which("ppt-dump.py")
    if not have_dumper:
        print("RECFILTERROR HELPERNOTFOUND ppt-dump.py")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    executor = rclexec1.Executor(proto, PPTFilter(proto))
    rclexecm.main(proto, executor)
Ejemplo n.º 26
0
    def wrapData(self):
        """Return self.out as is, without any further processing."""
        return self.out

class RTFFilter:
    """Hands out the unrtf command line used to process an RTF file.

    The command is only proposed once: self.ntry is set by the first
    getCmd() call and cleared by reset().
    """

    def __init__(self, em):
        self.ntry = 0
        self.em = em

    def reset(self):
        """Clear the attempt counter."""
        self.ntry = 0

    def getCmd(self, fn):
        """Return (command line, output processor) on the first call if
        unrtf is found, else ([], None). fn is not used."""
        already_tried = bool(self.ntry)
        if already_tried:
            return ([], None)
        self.ntry = 1
        helper = rclexecm.which("unrtf")
        if not helper:
            return ([], None)
        return ([helper, "--nopict", "--html"], RTFProcessData(self.em))

if __name__ == '__main__':
    # Nothing can be done without unrtf
    have_unrtf = rclexecm.which("unrtf")
    if not have_unrtf:
        print("RECFILTERROR HELPERNOTFOUND unrtf")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    executor = rclexec1.Executor(proto, RTFFilter(proto))
    rclexecm.main(proto, executor)
Ejemplo n.º 27
0
# Destination for the diagnostic messages printed below
ftrace = sys.stderr
#ftrace = open("C:/Users/Bill/log-uncomp.txt", "w")

# This script is for Windows only: anywhere else, print a message and
# exit with status 1.
sysplat = platform.system()
if sysplat != "Windows":
    print("rcluncomp.py: only for Windows", file = ftrace)
    sys.exit(1)

# Switch stdout to binary mode. A failure is reported but is not fatal.
try:
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
except Exception as err:
    print("setmode binary failed: %s" % str(err), file = ftrace)

# 7z is mandatory: exit with status 2 if it can't be found.
sevenz = rclexecm.which("7z")
if not sevenz:
    print("rcluncomp.py: can't find 7z exe. Maybe set recollhelperpath " \
          "in recoll.conf ?", file=ftrace)
    sys.exit(2)

# Params: uncompression program, input file name, temp directory.
# We ignore the uncomp program, and always use 7z on Windows

# NOTE(review): the argument count is not checked, a short command line
# raises IndexError here.
infile = sys.argv[2]
outdir = sys.argv[3]
# print("rcluncomp.py infile [%s], outdir [%s]" % (infile, outdir), file = ftrace)

# There is apparently no way to suppress 7z output. Hopefully the
# possible deadlock described by the subprocess module doc can't occur
# here because there is little data printed. AFAIK nothing goes to stderr anyway
Ejemplo n.º 28
0
        if self.ntry:
            return ([], None)
        self.ntry = 1
        # Some HTML files masquerade as XLS
        try:
            data = open(fn, 'rb').read(512)
            if data.find(b'html') != -1 or data.find(b'HTML') != -1:
                return ("cat", XLSProcessData(self.em, True))
        except Exception as err:
            self.em.rclog("Error reading %s:%s" % (fn, str(err)))
            pass
        cmd = rclexecm.which("xls-dump.py")
        if cmd:
            # xls-dump.py often exits 1 with valid data. Ignore exit value
            # We later treat an empty output as an error
            return ([sys.executable, cmd, "--dump-mode=canonical-xml", \
                     "--utf-8", "--catch"],
                    XLSProcessData(self.em), rclexec1.Executor.opt_ignxval)
        else:
            return ([], None)


if __name__ == '__main__':
    # xls-dump.py is mandatory. The error line names the helper which
    # is actually missing.
    if not rclexecm.which("xls-dump.py"):
        print("RECFILTERROR HELPERNOTFOUND xls-dump.py")
        sys.exit(1)
    proto = rclexecm.RclExecM()
    filter = XLSFilter(proto)
    extract = rclexec1.Executor(proto, filter)
    rclexecm.main(proto, extract)