コード例 #1
0
ファイル: richtext.py プロジェクト: hjanime/VisTrails
    def compute(self):
        """ compute() -> None
        Read the input file as HTML, converting it from RTF first when the
        "Format" input is 'rtf', and show the markup in a RichTextCellWidget.

        Raises ModuleError when 'rtf' is requested but the pyth library
        cannot be imported, or when the format is neither 'html' nor 'rtf'.
        """
        path = self.get_input("File").name
        fmt = self.get_input("Format")

        with open(path, 'rb') as stream:
            if fmt == 'html':
                markup = stream.read()  # bytes, passed through untouched
            elif fmt != 'rtf':
                raise ModuleError(self, "'%s' format is unknown" % fmt)
            else:
                # pyth is an optional dependency: make sure it is importable
                # before pulling in its reader/writer plugins.
                try:
                    py_import('pyth', {'pip': 'pyth'})
                except ImportError:
                    raise ModuleError(self, "'rtf' format requires the pyth "
                                      "Python library")
                from pyth.plugins.rtf15.reader import Rtf15Reader
                from pyth.plugins.xhtml.writer import XHTMLWriter
                rtf_document = Rtf15Reader.read(stream)
                markup = XHTMLWriter.write(rtf_document).read()  # bytes

        self.displayAndWait(RichTextCellWidget, (markup,))
コード例 #2
0
    def compute(self):
        """ compute() -> None
        Send the contents of the "File" input to the spreadsheet as HTML.

        'html' files are passed through as read; 'rtf' files are converted
        to XHTML with pyth.  ModuleError is raised if pyth is unavailable
        or if the "Format" input has any other value.
        """
        filename = self.get_input("File").name
        text_format = self.get_input("Format")

        with open(filename, 'rb') as source:
            if text_format == 'html':
                html = source.read()  # reads bytes
            elif text_format == 'rtf':
                try:
                    py_import('pyth', {'pip': 'pyth'})
                except ImportError:
                    message = "'rtf' format requires the pyth Python library"
                    raise ModuleError(self, message)

                # Only reached when py_import() succeeded.
                from pyth.plugins.rtf15.reader import Rtf15Reader
                from pyth.plugins.xhtml.writer import XHTMLWriter
                xhtml_stream = XHTMLWriter.write(Rtf15Reader.read(source))
                html = xhtml_stream.read()  # gets bytes
            else:
                raise ModuleError(self, "'%s' format is unknown" % text_format)

        cell_args = (html, )
        self.displayAndWait(RichTextCellWidget, cell_args)
コード例 #3
0
ファイル: test_writexhtml.py プロジェクト: kippr/pyth
 def test_inline_png(self):
     """An RTF with an embedded PNG yields an inline base64 <img> with its size."""
     tests_dir = os.path.abspath(os.path.dirname(__file__))
     rtf_path = os.path.join(tests_dir, "rtfs", "sample-with-image.rtf")
     with open(rtf_path, 'rb') as handle:
         parsed = Rtf15Reader.read(handle)
         xhtml = XHTMLWriter.write(parsed).getvalue()
         # Checked in order; the first missing fragment fails the test.
         for fragment in ('<img src="data:image/png;base64,',
                          'width:50px',
                          'height:50px'):
             self.assertIn(fragment, xhtml)
コード例 #4
0
ファイル: rtf_html.py プロジェクト: joka/plone.transforms
    def transform(self, data, options=None):
        """Convert RTF data to XHTML and wrap it in a TransformResult.

        Returns None when self._validate(data) returns None.  The `options`
        argument is accepted but not used by this method.
        """
        if self._validate(data) is None:
            return None

        # Rtf15Reader wants a file-like object positioned at the start.
        rtf_buffer = cStringIO.StringIO()
        filtered = ''.join(self.filter(data))
        rtf_buffer.write(filtered)
        rtf_buffer.seek(0)

        document = Rtf15Reader.read(rtf_buffer, errors='replace')
        xhtml_stream = XHTMLWriter.write(document)
        markup = xhtml_stream.read()
        xhtml_stream.close()

        return TransformResult(StringIter(markup))
コード例 #5
0
ファイル: parse_remote.py プロジェクト: IgKh/Open-Knesset
def rtf(url):
    '''
    Fetch the RTF file at `url` and (try to) return an XHTML version of it.

    Returns the output of XHTMLWriter, or False if the document could not
    be converted.  Errors raised while downloading are not handled here
    and propagate to the caller.
    '''
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even if the download fails part-way.
        remote.close()

    # The context manager closes the temporary file on every exit path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp)
            return XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Conversion is best-effort, but KeyboardInterrupt and
            # SystemExit must still get through (a bare except hid them).
            return False
コード例 #6
0
 def download_text(self):
     """Download self.link, extract its text and store it in self.text.

     The download is saved in the working directory as
     self.link[33:] + "." + self.typ and deleted again before returning.
     Extraction depends on self.typ:

     * "pdf": pdfplumber, page by page; if that raises, textract with the
       tesseract method and language "rus".
     * "doc": docx2txt; if that raises, the antiword command-line tool.
     * "rtf": Rtf15Reader -> XHTMLWriter -> html2text.

     Any other type leaves the text empty.  Download and extraction errors
     are reported with print() instead of being raised.
     """
     import subprocess  # function-scope: only the antiword fallback needs it

     filename = self.link[33:] + "." + self.typ
     try:
         with requests.get(self.link, stream=True) as r:
             with open(filename, 'wb') as f:
                 shutil.copyfileobj(r.raw, f)
     except Exception:
         print("Error downloading " + self.link)
     text = ""
     if self.typ == "pdf":
         try:
             with pdfplumber.open(filename) as pdf:
                 for page in pdf.pages:
                     text += page.extract_text()
         except Exception:
             # Text-layer extraction failed: fall back to OCR.
             try:
                 text += textract.process(filename,
                                          method="tesseract",
                                          language="rus").decode("utf-8")
             except Exception:
                 print("Error extracting " + filename)
     elif self.typ == "doc":
         try:
             text += docx2txt.process(filename)
         except Exception:
             output = filename[:-3] + "txt"
             try:
                 # Run antiword with an argument list and redirect its
                 # stdout ourselves: the file name is derived from the URL
                 # and must never be interpreted by a shell.
                 with open(output, "w") as out:
                     subprocess.call(["antiword", filename], stdout=out)
                 with open(output) as f:
                     text += f.read()
             except Exception:
                 # Unlike os.system, a missing antiword binary is reported
                 # here rather than silently producing empty text.
                 print("Error extracting " + filename)
             finally:
                 # Do not leave the intermediate file behind on failure.
                 if os.path.exists(output):
                     os.remove(output)
     elif self.typ == "rtf":
         try:
             # Close the input once it has been parsed.
             with open(filename, "rb") as rtf_file:
                 doc = Rtf15Reader.read(rtf_file)
             text += html2text.html2text(
                 XHTMLWriter.write(doc, pretty=True).read().decode("utf-8"))
         except Exception:
             print("Error extracting " + filename)
     if os.path.exists(filename):
         os.remove(filename)
     self.text = text
コード例 #7
0
ファイル: parse_remote.py プロジェクト: uriklar/Open-Knesset
def rtf(url):
    '''
    Fetch the RTF file at `url` and (try to) return an XHTML version of it.

    Returns the output of XHTMLWriter, or False if the document could not
    be converted; in that case the traceback is logged as a warning.
    Errors raised while downloading are not handled here.
    '''
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even if the download fails part-way.
        remote.close()

    # The context manager closes the temporary file on every exit path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp)
            xhtml = XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed by the best-effort conversion.
            xhtml = False
            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback)))

    return xhtml
コード例 #8
0
ファイル: parse_remote.py プロジェクト: yoseft/Open-Knesset
def rtf(url):
    '''
    Fetch the RTF file at `url` and (try to) return an XHTML version of it.

    The RTF is read with errors='ignore'.  Returns the output of
    XHTMLWriter, or False if the document could not be converted; the
    failure is then logged together with its traceback.  Errors raised
    while downloading are not handled here.
    '''
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even if the download fails part-way.
        remote.close()

    # The context manager closes the temporary file on every exit path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp, errors='ignore')
            xhtml = XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed by the best-effort conversion.
            xhtml = False

            logger.exception('Failed reading rtf from {0}'.format(url))

    return xhtml
コード例 #9
0
ファイル: viewer.py プロジェクト: danvk/personal-archive
    def GET(self, day):
        out = StringIO()
        out.write(
            """<html>
<head>
<link rel="stylesheet" href="/static/viewer.css" />
<script src="/static/jquery-1.7.min.js"></script>
</head>
<body>
<div id="oneday">
"""
        )
        data = utils.GetOneDay(datetime.strptime(day, "%Y/%m/%d").date())
        for maker in sorted(data.keys()):
            out.write("<h2>%s</h2>\n" % maker)
            # TODO(danvk): include URL, thumbnail if available.
            out.write("<p>%s</p>\n" % data[maker]["summary"]["summary"].encode("utf8"))

            if "originals" in data[maker]:
                originals = data[maker]["originals"]
                for filename in sorted(originals.keys()):
                    out.write("<h3>%s</h3>\n" % filename)
                    _, ext = os.path.splitext(filename)
                    if ext == ".txt":
                        out.write("<pre>%s</pre>\n" % originals[filename])
                    elif ext == ".html":
                        out.write(originals[filename])
                    elif ext == ".rtf":
                        f = StringIO(originals[filename])
                        doc = Rtf15Reader.read(f)
                        html = XHTMLWriter.write(doc).getvalue()
                        out.write(html)
                    else:
                        out.write('<p>(Unknown format "%s")</p>' % ext)

            out.write("<hr/>\n")

        out.write("</div></body></html>")
        return out.getvalue()
コード例 #10
0
ファイル: viewer.py プロジェクト: danvk/personal-archive
    def GET(self, day):
        out = StringIO()
        out.write("""<html>
<head>
<link rel="stylesheet" href="/static/viewer.css" />
<script src="/static/jquery-1.7.min.js"></script>
</head>
<body>
<div id="oneday">
""")
        data = utils.GetOneDay(datetime.strptime(day, '%Y/%m/%d').date())
        for maker in sorted(data.keys()):
            out.write('<h2>%s</h2>\n' % maker)
            # TODO(danvk): include URL, thumbnail if available.
            out.write('<p>%s</p>\n' %
                      data[maker]['summary']['summary'].encode('utf8'))

            if 'originals' in data[maker]:
                originals = data[maker]['originals']
                for filename in sorted(originals.keys()):
                    out.write('<h3>%s</h3>\n' % filename)
                    _, ext = os.path.splitext(filename)
                    if ext == '.txt':
                        out.write('<pre>%s</pre>\n' % originals[filename])
                    elif ext == '.html':
                        out.write(originals[filename])
                    elif ext == '.rtf':
                        f = StringIO(originals[filename])
                        doc = Rtf15Reader.read(f)
                        html = XHTMLWriter.write(doc).getvalue()
                        out.write(html)
                    else:
                        out.write('<p>(Unknown format "%s")</p>' % ext)

            out.write('<hr/>\n')

        out.write('</div></body></html>')
        return out.getvalue()
コード例 #11
0
ファイル: test_readrtf15.py プロジェクト: Tessmore/pyth
 def testmethod(self):  # the test method to be added
     """Convert <basename>.rtf to XHTML and compare it with the reference file.

     The test is reported as skipped when the reference file cannot be
     opened.  The generated output file is kept only when the comparison
     fails, so it can be inspected.
     """
     inputfilename = os.path.join(rtfinputsdir, basename+".rtf")
     outputfilename = os.path.join(testoutputdir, basename+".html")
     referencefilename = os.path.join(referenceoutputdir, basename+".html")
     #--- obtain reference output or skip test:
     try:
         with open(referencefilename, "rb") as reference_file:
             the_referenceoutput = reference_file.read()
     except OSError:
         # A plain `return` would be counted as a passing test; skipTest()
         # makes the runner report it as skipped.  Assumes self is a
         # unittest.TestCase (it already relies on assertEqual).
         self.skipTest("no %s" % referencefilename)
     #--- read and convert RTF:
     with open(inputfilename, "rb") as rtf_file:
         document = Rtf15Reader.read(rtf_file)
     the_testoutput = XHTMLWriter.write(document, pretty=True).read()
     #--- compute test output:
     # Write the file and read it back so both sides of the comparison are
     # raw file contents.
     write_html_file(outputfilename, the_testoutput, print_msg=False)
     with open(outputfilename, "rb") as output_file:
         the_testoutput = output_file.read()
     #--- check outcome:
     if the_testoutput == the_referenceoutput:
         os.remove(outputfilename)  # assert will succeed, so it is no longer needed
     self.assertEqual(the_testoutput, the_referenceoutput)
コード例 #12
0
ファイル: test_readrtf15.py プロジェクト: tuffnatty/pyth
    def testmethod(self):  # the test method to be added
        """Convert <basename>.rtf with the selected writer and compare the
        written file with the reference output.

        `writer` selects the output format ('html' or 'txt').  The output
        file is deleted only when it matches the reference.
        """
        source_path = os.path.join(rtfinputsdir, basename+".rtf")
        result_path = os.path.join(testoutputdir,
                                   "%s.%s" % (basename, writer))
        #--- load the expected output:
        with open(referencefilename, "rb") as reference_file:
            expected = reference_file.read()
        #--- read and convert RTF:
        with open(source_path, "rb") as rtf_file:
            document = Rtf15Reader.read(rtf_file)
        if writer == 'html':
            rendered = XHTMLWriter.write(document, pretty=True).read()
            write_html_file(result_path, rendered, print_msg=False)
        elif writer == 'txt':
            with open(result_path, "wt") as text_file:
                PlaintextWriter.write(document, text_file)

        #--- read back what was written:
        with open(result_path, "rb") as produced_file:
            actual = produced_file.read()
        #--- check outcome:
        if actual == expected:
            os.remove(result_path)  # assert will succeed, so it is no longer needed
        self.assertEqual(actual, expected)
コード例 #13
0
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from StringIO import StringIO
import json
import sys

# Serve conversion requests until stdin is exhausted: every input line is a
# JSON-encoded RTF string, every output line a JSON object holding either an
# 'html' or an 'error' key.
while True:
    try:
        line = sys.stdin.readline()
    except KeyboardInterrupt:
        break

    if not line:
        # readline() returns '' only at end of file.  Without this check the
        # loop never terminates and prints a JSON decode error forever.
        break

    try:
        rtf = json.loads(line.strip())
        doc = Rtf15Reader.read(StringIO(rtf))
        html = XHTMLWriter.write(doc).read()
        # print(...) / "except ... as" parse on Python 2.6+ and Python 3.
        print(json.dumps({'html': html}))
    except Exception as e:
        print(json.dumps({'error': unicode(e)}))
    sys.stdout.flush()
コード例 #14
0
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

import sys

# Input file: first command-line argument, defaulting to "sample.rtf".
filename = sys.argv[1] if len(sys.argv) > 1 else "sample.rtf"

# Close the input as soon as it has been parsed instead of leaking the handle.
with open(filename, "rb") as rtf_file:
    doc = Rtf15Reader.read(rtf_file)

# Parenthesised so the line parses under both Python 2 and Python 3.
print(XHTMLWriter.write(doc, pretty=True).read())
コード例 #15
0
ファイル: xhtml.py プロジェクト: tuffnatty/pyth
from __future__ import absolute_import
from __future__ import print_function
from pyth.plugins.xhtml.writer import XHTMLWriter
import pythonDoc

# XHTML 1.0 Strict page skeleton; the single %s receives the rendered body.
docTemplate = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>Pyth document</title>
  <style type="text/css">body { font-family: Verdana; }</style>
</head>
<body>
%s
</body>
</html>
"""

if __name__ == "__main__":
    # Build the sample document, render it and wrap it in the page skeleton.
    document = pythonDoc.buildDoc()
    body = XHTMLWriter.write(document, pretty=True).getvalue()
    print(docTemplate % body)
コード例 #16
0
from pyth.plugins.plaintext.writer import PlaintextWriter
from pyth.plugins.xhtml.writer import XHTMLWriter
import re
import sys

from bs4 import UnicodeDammit
from xml.dom.minidom import parseString
import dicttoxml

import os.path  # needed for expanduser(); kept next to its only use

# open() does not expand "~", so resolve the home directory explicitly.
raw_data_folder = os.path.expanduser("~/raw_data/factiva/")


def _read_rtf(name):
    """Parse one RTF file from raw_data_folder and close it afterwards."""
    with open(raw_data_folder + name, 'rb') as rtf_file:
        return Rtf15Reader.read(rtf_file)


doc1 = _read_rtf('Factiva-20141201-0822marchaus1.rtf')
doc2 = _read_rtf('Factiva-20141201-0830marchaus2.rtf')
doc3 = _read_rtf('Factiva-20141201-0834marchaus3.rtf')

xmltext1 = XHTMLWriter.write(doc1, pretty=True).read()
xmltext2 = XHTMLWriter.write(doc2, pretty=True).read()
xmltext3 = XHTMLWriter.write(doc3, pretty=True).read()

xmltext = xmltext1 + "\n\n" + xmltext2 + "\n\n" + xmltext3

# Replace smart quotes and other mis-encoded symbols with plain ASCII.
# Applied in order.  The original list repeated its first entry at the end;
# that second pass could not match anything and has been dropped.
for garbled, plain in (
    ("\xe2\x80\x99\x19", "'"),
    ("\xe2\x80\x9c\x1c", "\""),
    ("\xe2\x80\x9d\x1d", "\""),
    ("\xe2\x80\x94\x14", "-"),
    ("\xc2\xa9\xc2\xa9", " "),
    ("\xe2\x82\xac\xc2\xac", "E"),
    ("\xc2\xa3\xc2\xa3", "L"),
    ("\xe2\x80\x98\x18", "'"),
):
    xmltext = xmltext.replace(garbled, plain)
コード例 #17
0
ファイル: main.py プロジェクト: matthynes/article_word_find
# Collect "word, score" pairs until the user enters ';'.
while input != ';':
    i_list = input.split(', ')
    words += ([i_list[0], i_list[1]],)
    input = raw_input("Another word? ; to quit\n")

scores = {w: float(n) for w, n in words}
total = 0

results = {}

# Score every .rtf file below `root`: the sum of the scores of its words.
for dirs, subdirs, files in os.walk(root):
    for f in files:
        if f.endswith('.rtf'):
            # os.walk() yields bare file names, so join each one with the
            # directory being visited; opening `f` alone only works for
            # files that happen to sit in the current working directory.
            with open(os.path.join(dirs, f), "rb") as rtf_file:
                doc = Rtf15Reader.read(rtf_file)
            total = 0
            text = dehtml(XHTMLWriter.write(doc).read().lower().split())
            for word in text:
                word = re.sub(r'\W+', '', word)
                total += scores.get(word, 0)

            results[f] = total

# Close (and thereby flush) the report before telling the user to read it.
with open("results.txt", "w") as output:
    for key, value in sorted(results.items()):
        output.write(key + "       " + str(value) + "\n")

# Parenthesised so the line parses under both Python 2 and Python 3.
print("Finished! Check results.txt")
raw_input("\nPress enter to close.")
コード例 #18
0
ファイル: xhtml.py プロジェクト: weiconglix/pyth
    <p>
      example<span style="vertical-align: super"> super </span>
      example<span style="vertical-align: sub"> sub </span>
    </p>
    a list
    <ul>
      <li>hello
      test</li>
      <li>bonjour</li>
      <li>guten tag</li>
    </ul>
    <p>
      <a href=http://www.google.com>a link
      </a> single space here.
      <br/>a br tag
    </p>
  </div>
""")

# Stylesheet handed to XHTMLReader together with the sample markup.
css = """
  .important {font-weight: bold}
  p.bold {font-weight: bold}
  .other {font-weight: normal; color: blue}
"""

if __name__ == '__main__':
    # Parse the document and then reconstruct it using the xhtml
    # writer.
    doc = XHTMLReader.read(content, css)
    # Parenthesised: the bare print statement is a SyntaxError on Python 3,
    # while this form behaves the same on Python 2.
    print(XHTMLWriter.write(doc).getvalue())
コード例 #19
0
ファイル: test_writexhtml.py プロジェクト: kippr/pyth
 def test_underline(self):
     """Text carrying the 'underline' property is written inside <u>...</u>."""
     underlined = Text(content=[u'Underlined'], properties={'underline': True})
     document = Document(content=[Paragraph(content=[underlined])])
     markup = XHTMLWriter.write(document).getvalue()
     self.assertIn('<u>Underlined</u>', markup)
コード例 #20
0
ファイル: rtf2html.py プロジェクト: walling/node-unrtf
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from StringIO import StringIO
import json
import sys

# Serve conversion requests until stdin is exhausted: every input line is a
# JSON-encoded RTF string, every output line a JSON object holding either an
# 'html' or an 'error' key.
while True:
	try:
		line = sys.stdin.readline()
	except KeyboardInterrupt:
		break

	if not line:
		# readline() returns '' only at end of file.  Without this check the
		# loop never terminates and prints a JSON decode error forever.
		break

	try:
		rtf = json.loads(line.strip())
		doc = Rtf15Reader.read(StringIO(rtf))
		html = XHTMLWriter.write(doc).read()
		# print(...) / "except ... as" parse on Python 2.6+ and Python 3.
		print(json.dumps({ 'html': html }))
	except Exception as e:
		print(json.dumps({ 'error': unicode(e) }))
	sys.stdout.flush()
コード例 #21
0
ファイル: rtf15.py プロジェクト: tuffnatty/pyth
from __future__ import absolute_import
from __future__ import print_function
import sys
import os.path

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter, write_html_file

# Usage: rtf15 inputfile.rtf [outputdir]
# With one argument the XHTML is printed; with two it is written to
# <outputdir>/<input base name>.html.
numargs = len(sys.argv) - 1

if numargs not in [1, 2]:
    print("usage: rtf15 inputfile.rtf [outputdir]")
else:
    inputfile = sys.argv[1]
    # Close the input as soon as it has been parsed.
    with open(inputfile, "rb") as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    the_output = XHTMLWriter.write(doc, pretty=True).read()
    if numargs == 1:
        # Spaces added around the file name; the banner used to read
        # "RTF file<name>as XHTML".
        print("<!-- ##### RTF file " + inputfile + " as XHTML: -->")
        print(the_output)
    else:
        basename = os.path.basename(inputfile)
        outputdir = sys.argv[2]
        outputfile = os.path.join(outputdir,
                                  os.path.splitext(basename)[0] + ".html")
        write_html_file(outputfile, the_output, print_msg=True)
コード例 #22
0
ファイル: rtf15ToXhtml.py プロジェクト: kippr/pyth
from pyth.plugins.xhtml.writer  import XHTMLWriter
from pyth.plugins.rtf15.reader import Rtf15Reader
import sys

# Input file: first command-line argument, defaulting to the bundled sample.
if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "tests/rtfs/sample-with-image.rtf"

# Close the input as soon as it has been parsed instead of leaking the handle.
with open(filename, "rb") as source:
    doc = Rtf15Reader.read(source)

# Parenthesised so the line parses under both Python 2 and Python 3.
print(XHTMLWriter.write(doc).getvalue())
コード例 #23
0
ファイル: xhtml.py プロジェクト: CongWu/pyth
from pyth.plugins.xhtml.writer import XHTMLWriter
import pythonDoc


# XHTML 1.0 Strict page skeleton; the single %s receives the rendered body.
docTemplate = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>Pyth document</title>
  <style type="text/css">body { font-family: Verdana; }</style>
</head>
<body>
%s
</body>
</html>
'''


if __name__ == "__main__":
    doc = pythonDoc.buildDoc()
    # Parenthesised: the bare print statement is a SyntaxError on Python 3,
    # while this single-argument form behaves the same on Python 2.
    print(docTemplate % XHTMLWriter.write(doc, pretty=True).getvalue())
コード例 #24
0
ファイル: parse.py プロジェクト: jamesonwatts/recall
# Group the CSV rows by their first column: db maps each first-column value
# to the list of sixth-column values of the rows sharing it, in file order.
db = {}
with open('resources/db.csv', 'rU') as csvfile:
    myreader = csv.reader(csvfile)
    for row in myreader:
        key, value = row[0], row[5]
        db.setdefault(key, []).append(value)
            
import os
# Convert every file in resources/rtf to XHTML, split the markup on "</p>"
# and collect the paragraphs of each document, writing a collected document
# to resources/tmp when the next "DOCUMENT" paragraph is reached.
# NOTE(review): the snippet may continue past the last visible line; nothing
# visible here writes the paragraphs collected after the final "DOCUMENT"
# marker - confirm against the full file.
rtfs = os.listdir('resources/rtf')
for rtf in rtfs:
    # Skip the macOS Finder metadata file.
    if rtf != '.DS_Store':
        # NOTE(review): this file handle is never explicitly closed.
        doc = Rtf15Reader.read(open('resources/rtf/'+rtf, "rb"))
        # Key into db: the part of the file name between the first "#" and
        # the following ".".  Raises IndexError for names without "#".
        rid = rtf.split("#")[1].split(".")[0]
        html = XHTMLWriter.write(doc, pretty=True).read()
        # doc_id: first word of the current "DOCUMENT" paragraph (False until
        # one is seen); tmp: paragraphs collected for it; index: number of
        # files written so far for this rtf.
        doc_id = False
        tmp = ""
        index = 0
        for item in html.split("</p>"):
            # A paragraph containing "DOCUMENT" starts a new document.
            if "<p>" in item and "DOCUMENT" in item:
                # Write out the previous document, but only when its id is
                # listed in db for this rid (KeyError if rid is not in db).
                if doc_id in db[rid]:
                    index += 1
                    # NOTE(review): f is not closed in the visible code, so
                    # the data is only flushed when the object is collected.
                    f = open('resources/tmp/'+rid+"_"+str(index)+"_"+doc_id+"_.txt",'w')
                    f.write(tmp)
                    
                # The new id is the text after "<p>" up to the first space.
                doc_id = item[ item.find("<p>")+len("<p>") : ].split(" ")[0]
                tmp = ""
            
            # Paragraphs seen before the first "DOCUMENT" marker are dropped.
            if doc_id:
                tmp += item+"</p>"