Esempio n. 1
0
    def html_text(self, fn):
        """Set metadata fields from an HWP file and return its HTML conversion.

        fn is the file name as bytes. The document title is stored in the
        'caption' field, every non-empty pair yielded by metafields() is
        stored under its own name, and the value returned is whatever
        rclexecm.execPythonScript() produces for the hwp5html command.

        A failure to open the file is logged and re-raised. A failure while
        reading the summary information is only logged.
        """
        # The hwp5 module needs a str file name, we are handed bytes.
        fn = fn.decode('utf-8')

        try:
            hwpfile = fs_Hwp5File(fn)
        except Exception as err:
            self.em.rclog("hwpfile open failed: %s" % err)
            raise err

        try:
            caption = hwpfile.summaryinfo.title.strip()
            if caption:
                self.em.setfield(
                    'caption', rclexecm.htmlescape(caption.encode('utf-8')))

            for name, value in metafields(hwpfile.summaryinfo):
                text = "{0}".format(value).strip()
                if not text:
                    continue
                escaped = rclexecm.htmlescape(text.encode('utf-8'))
                self.em.setfield(name.encode('utf-8'), escaped)
        except Exception as err:
            # Metadata is best effort: log and go on with the conversion.
            self.em.rclog("Exception: %s" % err)
        finally:
            hwpfile.close()

        # An earlier version converted to text in-process with the hwp5
        # module, but that apparently mishandled tables, hence the switch to
        # running hwp5html. See the first git version for the old approach.
        command = ["hwp5html", "--html", fn]
        return rclexecm.execPythonScript(command)
Esempio n. 2
0
    def takeLine(self, line):
        """Turn one line of converter output into HTML appended to self.out.

        line is a bytes object. Everything appended to self.out is bytes so
        that the list can later be joined with a bytes separator.

        - Empty lines seen before any data are dropped.
        - The first non-empty line triggers emission of the HTML header.
        - A line made of a single form feed becomes a paragraph break
          with a horizontal rule.
        - When self.patcont matches the line, the part captured by group 2
          of self.patws is kept in self.cont and prepended to the next line,
          and the current line is cut at the start of group 1. If
          self.patws does not match, the whole line is kept for later.
          (Both patterns are defined outside of this method.)
        """
        if not self.gotdata:
            if line == b'':
                return
            # Note the space after "Content-Type": attributes must be
            # separated by whitespace.
            self.out.append(b'<html><head><title></title>'
                            b'<meta http-equiv="Content-Type" '
                            b'content="text/html;charset=UTF-8">'
                            b'</head><body><p>')
            self.gotdata = True

        if self.cont:
            line = self.cont + line
            # Keep the continuation buffer as bytes, like the lines.
            self.cont = b''

        if line == b'\f':
            # Must be bytes: a str element would make a later bytes join fail.
            self.out.append(b'</p><hr><p>')
            return

        if self.patcont.search(line):
            # Break at last whitespace
            match = self.patws.search(line)
            if match:
                self.cont = line[match.start(2):match.end(2)]
                line = line[0:match.start(1)]
            else:
                self.cont = line
                line = b''

        if line:
            self.out.append(rclexecm.htmlescape(line) + b'<br>')
        else:
            self.out.append(b'<br>')
Esempio n. 3
0
 def html_text(self, fn):
     """Return the contents of file fn wrapped in a minimal HTML <pre> page.

     The file is read in binary mode and its contents are passed through
     rclexecm.htmlescape(). The result is a bytes object.
     """
     with open(fn, "rb") as stream:
         escaped = rclexecm.htmlescape(stream.read())
     # No charset is declared, so recoll will have to use its config to
     # guess it.
     prefix = b'<html><head><title></title></head><body><pre>'
     suffix = b'</pre></body></html>'
     return prefix + escaped + suffix
Esempio n. 4
0
    def _fixhtml(self, input):
        #print input
        inheader = False
        inbody = False
        didcs = False
        output = []
        isempty = True
        for line in input.split(b'\n'):
            if re.search(b'</head>', line):
                inheader = False
            if re.search(b'</pre>', line):
                inbody = False
            if inheader:
                if not didcs:
                    output.append(b'<meta http-equiv="Content-Type"' + \
                              b'content="text/html; charset=UTF-8">\n')
                    didcs = True
                if self.needescape:
                    m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
                    if not m:
                        m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
                    if m:
                        line = m.group(1) + rclexecm.htmlescape(m.group(2)) + \
                               m.group(3)

                # Recoll treats "Subject" as a "title" element
                # (based on emails). The PDF "Subject" metadata
                # field is more like an HTML "description"
                line = re.sub(b'name="Subject"', b'name="Description"', line, 1)

            elif inbody:
                s = line[0:1]
                if s != b"\x0c" and s != b"<":
                    isempty = False
                # We used to remove end-of-line hyphenation (and join
                # lines), but but it's not clear that we should do
                # this as pdftotext without the -layout option does it ?
                line = rclexecm.htmlescape(line)

            if re.search(b'<head>', line):
                inheader = True
            if re.search(b'<pre>', line):
                inbody = True

            output.append(line)

        return b'\n'.join(output), isempty
Esempio n. 5
0
    def html_text(self, filename):
        """Build an HTML document (bytes) from the metadata of an image file.

        All EXIF, IPTC and XMP keys reported by pyexiv2 are read, except
        'Exif.Photo.MakerNote' and the keys matched by khexre. Their raw
        values are converted to str.

        The header receives:
        - a <title> made of the values of the keys listed in pyexiv2_titles,
          without duplicates and in the order of that list,
        - a "date" meta tag from the first key of exiv2_dates which is
          present,
        - a "keywords" meta tag from 'Xmp.digiKam.TagsList' if present.

        The body lists every retained key with its HTML-escaped value.
        """
        metadata = pyexiv2.ImageMetadata(filename)
        metadata.read()
        keys = metadata.exif_keys + metadata.iptc_keys + metadata.xmp_keys
        mdic = {}
        for k in keys:
            # we skip numeric keys and undecoded makernote data
            if k != 'Exif.Photo.MakerNote' and not khexre.match(k):
                mdic[k] = str(metadata[k].raw_value)

        # Pieces of the document, joined once at the end.
        parts = [b'<html><head>\n']

        # Collect distinct title values. A list is used rather than a set
        # so that the title is the same from one run to the next.
        ttdata = []
        for k in pyexiv2_titles:
            if k in mdic:
                v = rclexecm.htmlescape(mdic[k])
                if v not in ttdata:
                    ttdata.append(v)
        if ttdata:
            # Strip the brackets and quotes left by str() on list values.
            title = "".join(
                v.replace('[', '').replace(']', '').replace("'", "") + " "
                for v in ttdata)
            parts.append(rclexecm.makebytes("<title>" + title + "</title>\n"))

        for k in exiv2_dates:
            if k in mdic:
                # Recoll wants: %Y-%m-%d %H:%M:%S.
                # We get 2014:06:27 14:58:47
                dt = mdic[k].replace(":", "-", 2)
                parts.append(b'<meta name="date" content="' +
                             rclexecm.makebytes(dt) + b'">\n')
                break

        if 'Xmp.digiKam.TagsList' in mdic:
            tags = mdic['Xmp.digiKam.TagsList']
            parts.append(b'<meta name="keywords" content="' +
                         rclexecm.makebytes(rclexecm.htmlescape(tags)) +
                         b'">\n')

        parts.append(b'</head><body>\n')
        for k, v in mdic.items():
            parts.append(rclexecm.makebytes(
                k + " : " + rclexecm.htmlescape(v) + "<br />\n"))
        parts.append(b'</body></html>')

        return b''.join(parts)
Esempio n. 6
0
 def takeLine(self, line):
     """Append the HTML-escaped line to self.out.

     The first call also emits the HTML header, up to the opening <pre>
     tag, and sets self.gotdata.
     """
     if not self.gotdata:
         header = (b'<html><head>'
                   b'<meta http-equiv="Content-Type" '
                   b'content="text/html;charset=UTF-8">'
                   b'</head><body><pre>')
         self.out.append(header)
         self.gotdata = True
     escaped = rclexecm.htmlescape(line)
     self.out.append(escaped)
Esempio n. 7
0
 def wrapData(self):
     """Return the final document as bytes, built from the collected output.

     When self.ishtml is set, the lines in self.out are joined with
     newlines and returned as they are. Otherwise the lines in
     self.xmldata are parsed as XML with an xlsxmltocsv.XlsXmlHandler, the
     handler output is HTML-escaped and appended to self.out, and the
     closing </pre></body></html> tags are added to the joined result.

     Raises Exception if no data was received (self.gotdata is false).
     """
     if not self.gotdata:
         raise Exception("xls-dump returned no data")
     if self.ishtml:
         return b'\n'.join(self.out)
     handler = xlsxmltocsv.XlsXmlHandler()
     xml.sax.parseString(b'\n'.join(self.xmldata), handler)
     self.out.append(rclexecm.htmlescape(b'\n'.join(handler.output)))
     return b'\n'.join(self.out) + b'</pre></body></html>'
Esempio n. 8
0
def _htmlwrapplain(txt, title=b"", charset=b"utf-8"):
    """Wrap plain text into an HTML page, inside a <pre> element.

    txt is HTML-escaped with rclexecm.htmlescape(). title and charset are
    inserted as they are, into the <title> element and the Content-Type
    meta tag respectively. The defaults of title and charset are bytes and
    the page is built by bytes concatenation.
    """
    header = b'<html>\n<head>\n<title>' + title + b'</title>\n' + \
        b'<meta http-equiv="Content-Type" content="text/html; charset=' + \
        charset + b'">\n' + b'<body>\n<pre>\n'
    footer = b'</pre>\n</body>\n</html>\n'
    return header + rclexecm.htmlescape(txt) + footer
Esempio n. 9
0
    def html_text(self, fn):
        """Return an HTML page (str) holding the text of the DjVu file fn.

        The MIME type is set to text/html. If self.djvused is set, it is run
        to print the document metadata, from which the author and title are
        taken. A failure of that command is logged and ignored. The main
        text is the output of self.djvutxt, and a failure of that command is
        not caught here.
        """
        self.em.setmimetype('text/html')

        # Extract metadata
        rawmeta = b""
        if self.djvused:
            try:
                rawmeta = subprocess.check_output(
                    [self.djvused, fn, "-e", "select 1;print-meta"])
            except Exception as err:
                self.em.rclog("djvused failed: %s" % err)

        # Each metadata line is split on double quotes: the first piece is
        # the field name and the remaining ones are joined with spaces to
        # form the value.
        fields = {"author": "", "title": ""}
        for entry in rawmeta.decode('UTF-8', 'replace').split('\n'):
            pieces = entry.split('"')
            if len(pieces) < 2:
                continue
            name = pieces[0].strip()
            if name in fields:
                fields[name] = ' '.join(pieces[1:])

        # Main text
        body = subprocess.check_output([self.djvutxt, fn])
        body = body.decode('UTF-8', 'replace')

        chunks = [
            '''<html><head>''',
            '''<title>''' + rclexecm.htmlescape(fields["title"]) + '''</title>''',
            '''<meta http-equiv="Content-Type" ''',
            '''content="text/html;charset=UTF-8">''',
        ]
        if fields["author"]:
            chunks.append('''<meta name="author" content="''' +
                          rclexecm.htmlescape(fields["author"]) + '''">''')
        chunks.append('''</head><body><pre>''')
        chunks.append(rclexecm.htmlescape(body))
        chunks.append('''</pre></body></html>''')
        return ''.join(chunks)
Esempio n. 10
0
    def _selfdoc(self):
        '''Extract the text of the pdf document itself (not of an attachment).

        Runs self.pdftotext with HTML metadata output on self.filename and
        passes the result through self._fixhtml(). If the text is found to
        be empty and the "pdfocr" configuration parameter is true, the
        rclocr.py script is run and its escaped output replaces the
        document. Extra metadata and annotations are then processed when
        enabled. Failures in the OCR, metadata and annotation steps are
        logged and the document obtained so far is kept.

        Returns the tuple (True, html, "", eof), where eof is
        RclExecM.eofnext when attachment extraction is done and no
        attachment is left, and RclExecM.noteof otherwise.
        '''
        self.em.setmimetype('text/html')

        nothingleft = self.attextractdone and len(self.attachlist) == 0
        eof = rclexecm.RclExecM.eofnext if nothingleft \
            else rclexecm.RclExecM.noteof

        extractcmd = [self.pdftotext, "-htmlmeta", "-enc", "UTF-8",
                      "-eol", "unix", "-q", self.filename, "-"]
        html, isempty = self._fixhtml(subprocess.check_output(extractcmd))

        if isempty:
            # No text found: try OCR if the configuration for the directory
            # of the file allows it.
            self.config.setKeyDir(os.path.dirname(self.filename))
            if rclexecm.configparamtrue(self.config.getConfParam("pdfocr")):
                try:
                    cmd = [sys.executable, os.path.join(_execdir, "rclocr.py"),
                           self.filename]
                    ocrdata = subprocess.check_output(cmd)
                    html = _htmlprefix + rclexecm.htmlescape(ocrdata) + \
                        _htmlsuffix
                except Exception as e:
                    self.em.rclog("%s failed: %s" % (cmd, e))

        if self.extrameta:
            try:
                html = self._setextrameta(html)
            except Exception as err:
                self.em.rclog("Metadata extraction failed: %s %s" %
                              (err, traceback.format_exc()))

        if havepopplerglib:
            try:
                html = self._process_annotations(html)
            except Exception as err:
                self.em.rclog("Annotation extraction failed: %s %s" %
                              (err, traceback.format_exc()))

        return (True, html, "", eof)
Esempio n. 11
0
 def _metatag(self, nm, val):
     """Return an HTML meta tag, as bytes, with name nm and content val.

     Both arguments go through rclexecm.makebytes(). Only the value is
     HTML-escaped, the name is inserted as it is.
     """
     name = rclexecm.makebytes(nm)
     content = rclexecm.htmlescape(rclexecm.makebytes(val))
     return b'<meta name="' + name + b'" content="' + content + b'">'