def compute(self):
    """compute() -> None

    Read the file given on the "File" input, turn it into HTML bytes
    according to the "Format" input ('html' or 'rtf') and dispatch the
    result to the spreadsheet through a RichTextCellWidget.

    Raises ModuleError when the format is unknown, or when 'rtf' is
    requested but the pyth library cannot be imported.
    """
    path = self.get_input("File").name
    fmt = self.get_input("Format")
    with open(path, 'rb') as stream:
        if fmt == 'rtf':
            try:
                py_import('pyth', {'pip': 'pyth'})
            except ImportError:
                raise ModuleError(self, "'rtf' format requires the pyth "
                                        "Python library")
            # Imported lazily: pyth is only needed for RTF input.
            from pyth.plugins.rtf15.reader import Rtf15Reader
            from pyth.plugins.xhtml.writer import XHTMLWriter
            parsed = Rtf15Reader.read(stream)
            markup = XHTMLWriter.write(parsed).read()  # gets bytes
        elif fmt == 'html':
            markup = stream.read()  # reads bytes
        else:
            raise ModuleError(self, "'%s' format is unknown" % fmt)
    self.displayAndWait(RichTextCellWidget, (markup,))
def compute(self):
    """compute() -> None

    Load the document named by the "File" input, convert it to HTML
    bytes as dictated by the "Format" input and show it in the
    spreadsheet via RichTextCellWidget.

    Raises ModuleError for an unknown format and for 'rtf' input when
    the pyth library is not importable.
    """
    source_name = self.get_input("File").name
    kind = self.get_input("Format")

    with open(source_name, 'rb') as handle:
        if kind == 'html':
            # Already HTML: pass the raw bytes through untouched.
            payload = handle.read()
        elif kind != 'rtf':
            raise ModuleError(self, "'%s' format is unknown" % kind)
        else:
            try:
                py_import('pyth', {'pip': 'pyth'})
            except ImportError:
                raise ModuleError(
                    self,
                    "'rtf' format requires the pyth "
                    "Python library")
            else:
                from pyth.plugins.rtf15.reader import Rtf15Reader
                from pyth.plugins.xhtml.writer import XHTMLWriter

                rtf_document = Rtf15Reader.read(handle)
                xhtml_stream = XHTMLWriter.write(rtf_document)
                payload = xhtml_stream.read()  # bytes

    self.displayAndWait(RichTextCellWidget, (payload, ))
def test_inline_png(self):
    """The image in sample-with-image.rtf is rendered as an inline PNG.

    Checks that the XHTML output contains a base64 ``data:image/png``
    ``<img>`` source together with 50px width and height declarations.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    rtf_path = os.path.join(here, "rtfs", "sample-with-image.rtf")

    with open(rtf_path, 'rb') as handle:
        parsed = Rtf15Reader.read(handle)
        rendered = XHTMLWriter.write(parsed).getvalue()

    expected_fragments = (
        '<img src="data:image/png;base64,',
        'width:50px',
        'height:50px',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, rendered)
def transform(self, data, options=None):
    """Convert RTF ``data`` to XHTML.

    Returns None when ``self._validate(data)`` yields None. Otherwise the
    filtered data is parsed with Rtf15Reader (decoding errors replaced),
    rendered with XHTMLWriter and returned wrapped in a TransformResult.
    ``options`` is accepted but not used.
    """
    if self._validate(data) is None:
        return None

    # Rtf15Reader wants a file-like object positioned at the start.
    buf = cStringIO.StringIO()
    buf.write(''.join(self.filter(data)))
    buf.seek(0)

    document = Rtf15Reader.read(buf, errors='replace')
    rendered = XHTMLWriter.write(document)
    markup = rendered.read()
    rendered.close()

    return TransformResult(StringIter(markup))
def rtf(url):
    """Fetch the RTF document at ``url`` and convert it to XHTML.

    Returns the pretty-printed XHTML produced by XHTMLWriter, or False
    if the document could not be converted. Errors raised while
    downloading are not caught and propagate to the caller.
    """
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even when the read fails.
        remote.close()

    # Rtf15Reader reads from a file-like object, so spool the download
    # into a temporary file; the ``with`` closes it on every path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp)
            return XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Conversion is best effort. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            return False
def download_text(self):
    """Download ``self.link`` and store its extracted text in ``self.text``.

    The download is saved under a name built from ``self.link[33:]`` and
    ``self.typ``, converted to text according to ``self.typ`` ("pdf",
    "doc" or "rtf"; any other value yields no text) and then deleted.
    Every failure is reported with ``print`` and otherwise ignored, so
    ``self.text`` is always assigned -- possibly an empty or partial
    string.
    """
    # Local import: only the antiword fallback needs it.
    import subprocess

    filename = self.link[33:] + "." + self.typ
    try:
        with requests.get(self.link, stream=True) as r:
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    except Exception:
        print("Error downloading " + self.link)

    text = ""
    if self.typ == "pdf":
        try:
            with pdfplumber.open(filename) as pdf:
                for page in pdf.pages:
                    text += page.extract_text()
        except Exception:
            # Text extraction failed: fall back to OCR. The OCR result is
            # appended to whatever pages were already collected.
            try:
                text += textract.process(
                    filename, method="tesseract",
                    language="rus").decode("utf-8")
            except Exception:
                print("Error extracting " + filename)
    elif self.typ == "doc":
        try:
            text += docx2txt.process(filename)
        except Exception:
            # Fall back to the external antiword tool.
            output = filename[:-3] + "txt"
            try:
                try:
                    # Argument list, no shell: the file name is derived
                    # from the link and must not be interpreted by a shell.
                    with open(output, "wb") as out:
                        subprocess.call(["antiword", filename], stdout=out)
                    with open(output) as f:
                        text += f.read()
                finally:
                    if os.path.exists(output):
                        os.remove(output)
            except Exception:
                print("Error extracting " + filename)
    elif self.typ == "rtf":
        try:
            with open(filename, "rb") as rtf_file:
                doc = Rtf15Reader.read(rtf_file)
                xhtml = XHTMLWriter.write(doc, pretty=True).read()
            text += html2text.html2text(xhtml.decode("utf-8"))
        except Exception:
            print("Error extracting " + filename)

    if os.path.exists(filename):
        os.remove(filename)
    self.text = text
def rtf(url):
    """Fetch the RTF document at ``url`` and convert it to XHTML.

    Returns the pretty-printed XHTML produced by XHTMLWriter, or False
    if the document could not be converted; in that case the traceback
    is logged as a warning. Errors raised while downloading are not
    caught and propagate to the caller.
    """
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even when the read fails.
        remote.close()

    # Rtf15Reader reads from a file-like object, so spool the download
    # into a temporary file; the ``with`` closes it on every path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp)
            xhtml = XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            xhtml = False
            # format_exc() renders the exception being handled, i.e. the
            # same text as joining format_exception(*sys.exc_info()).
            logger.warning(traceback.format_exc())
    return xhtml
def rtf(url):
    """Fetch the RTF document at ``url`` and convert it to XHTML.

    The RTF is parsed with ``errors='ignore'``. Returns the
    pretty-printed XHTML produced by XHTMLWriter, or False if the
    document could not be converted; the failure is logged together
    with its traceback. Errors raised while downloading are not caught
    and propagate to the caller.
    """
    remote = urlopen(url)
    try:
        data = remote.read()
    finally:
        # Release the connection even when the read fails.
        remote.close()

    # Rtf15Reader reads from a file-like object, so spool the download
    # into a temporary file; the ``with`` closes it on every path.
    with TemporaryFile() as temp:
        temp.write(data)
        temp.seek(0)
        try:
            doc = Rtf15Reader.read(temp, errors='ignore')
            xhtml = XHTMLWriter.write(doc, pretty=True).read()
        except Exception:
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            xhtml = False
            logger.exception('Failed reading rtf from {0}'.format(url))
    return xhtml
def GET(self, day):
    """Render everything recorded for one day as an HTML page.

    ``day`` is a date string in "%Y/%m/%d" form. For each maker returned
    by ``utils.GetOneDay`` the page shows a heading, the summary text
    and, when present, every original file: ".txt" in a <pre> block,
    ".html" verbatim, ".rtf" converted through pyth, anything else as an
    "Unknown format" note. Returns the page as a string.
    """
    out = StringIO()
    emit = out.write
    emit(
        '<html> '
        '<head> '
        '<link rel="stylesheet" href="/static/viewer.css" /> '
        '<script src="/static/jquery-1.7.min.js"></script> '
        '</head> '
        '<body> '
        '<div id="oneday"> '
    )

    when = datetime.strptime(day, "%Y/%m/%d").date()
    entries = utils.GetOneDay(when)
    for maker in sorted(entries.keys()):
        entry = entries[maker]
        emit("<h2>%s</h2>\n" % maker)
        # TODO(danvk): include URL, thumbnail if available.
        emit("<p>%s</p>\n" % entry["summary"]["summary"].encode("utf8"))
        if "originals" in entry:
            originals = entry["originals"]
            for filename in sorted(originals.keys()):
                emit("<h3>%s</h3>\n" % filename)
                ext = os.path.splitext(filename)[1]
                if ext == ".txt":
                    emit("<pre>%s</pre>\n" % originals[filename])
                elif ext == ".html":
                    emit(originals[filename])
                elif ext == ".rtf":
                    rtf_stream = StringIO(originals[filename])
                    parsed = Rtf15Reader.read(rtf_stream)
                    emit(XHTMLWriter.write(parsed).getvalue())
                else:
                    emit('<p>(Unknown format "%s")</p>' % ext)
        emit("<hr/>\n")

    emit("</div></body></html>")
    return out.getvalue()
def GET(self, day):
    """Build the HTML page for a single day.

    ``day`` is parsed with the '%Y/%m/%d' format and passed (as a date)
    to ``utils.GetOneDay``. Each maker gets a heading, its summary and,
    if it has an 'originals' mapping, one section per original file:
    RTF is converted with pyth, HTML is inserted as-is, plain text goes
    into a <pre> block and other extensions are flagged as unknown.
    Returns the accumulated page text.
    """
    page_head = """<html> <head> <link rel="stylesheet" href="/static/viewer.css" /> <script src="/static/jquery-1.7.min.js"></script> </head> <body> <div id="oneday"> """

    out = StringIO()
    out.write(page_head)

    the_date = datetime.strptime(day, '%Y/%m/%d').date()
    data = utils.GetOneDay(the_date)
    for maker in sorted(data.keys()):
        record = data[maker]
        summary = record['summary']['summary']
        out.write('<h2>%s</h2>\n' % maker)
        # TODO(danvk): include URL, thumbnail if available.
        out.write('<p>%s</p>\n' % summary.encode('utf8'))

        if 'originals' in record:
            originals = record['originals']
            for filename in sorted(originals.keys()):
                out.write('<h3>%s</h3>\n' % filename)
                _, ext = os.path.splitext(filename)
                if ext == '.rtf':
                    doc = Rtf15Reader.read(StringIO(originals[filename]))
                    converted = XHTMLWriter.write(doc).getvalue()
                    out.write(converted)
                elif ext == '.html':
                    out.write(originals[filename])
                elif ext == '.txt':
                    out.write('<pre>%s</pre>\n' % originals[filename])
                else:
                    out.write('<p>(Unknown format "%s")</p>' % ext)

        out.write('<hr/>\n')

    out.write('</div></body></html>')
    return out.getvalue()
def testmethod(self):  # the test method to be added
    """Convert one RTF input to XHTML and compare it with its reference.

    The input, output and reference paths are built from ``basename``
    and the configured directories. The test is reported as skipped
    when the reference file cannot be read. The generated HTML file is
    removed when it matches the reference and left in place otherwise.
    """
    inputfilename = os.path.join(rtfinputsdir, basename+".rtf")
    outputfilename = os.path.join(testoutputdir, basename+".html")
    referencefilename = os.path.join(referenceoutputdir, basename+".html")
    #--- obtain reference output or skip test:
    try:
        with open(referencefilename, "rb") as reference:
            the_referenceoutput = reference.read()
    except OSError:
        # Report a real skip instead of returning, which would be
        # counted as a passing test.
        self.skipTest("no %s" % referencefilename)
    #--- read and convert RTF:
    with open(inputfilename, "rb") as rtf_file:
        document = Rtf15Reader.read(rtf_file)
        the_testoutput = XHTMLWriter.write(document, pretty=True).read()
    #--- compute test output (write the file, then read back what was
    #    actually stored on disk):
    write_html_file(outputfilename, the_testoutput, print_msg=False)
    with open(outputfilename, "rb") as produced:
        the_testoutput = produced.read()
    #--- check outcome:
    if the_testoutput == the_referenceoutput:
        os.remove(outputfilename)  # assert will succeed, so it is no longer needed
    self.assertEqual(the_testoutput, the_referenceoutput)
def testmethod(self):  # the test method to be added
    """Convert one RTF input with the selected writer and compare it
    with the reference output.

    ``writer`` selects the output kind: 'html' goes through XHTMLWriter
    and write_html_file, 'txt' through PlaintextWriter. The generated
    file is deleted when it matches the reference and kept otherwise.
    A missing reference file is not handled here; the error propagates.
    """
    source_path = os.path.join(rtfinputsdir, basename+".rtf")
    result_path = os.path.join(testoutputdir, "%s.%s" % (basename, writer))

    #--- obtain reference output:
    with open(referencefilename, "rb") as reference:
        expected = reference.read()

    #--- read and convert RTF:
    with open(source_path, "rb") as rtf_file:
        document = Rtf15Reader.read(rtf_file)
        if writer == 'html':
            rendered = XHTMLWriter.write(document, pretty=True).read()
            write_html_file(result_path, rendered, print_msg=False)
        elif writer == 'txt':
            with open(result_path, "wt") as text_out:
                PlaintextWriter.write(document, text_out)

    #--- compute test output (read back what was stored on disk):
    with open(result_path, "rb") as produced:
        actual = produced.read()

    #--- check outcome:
    if actual == expected:
        os.remove(result_path)  # assert will succeed, so it is no longer needed
    self.assertEqual(actual, expected)
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from StringIO import StringIO
import json
import sys

# Line-oriented filter: every input line is a JSON-encoded RTF string; for
# each one a JSON object is printed, holding either the converted 'html' or
# an 'error' message. Output is flushed after every line.
while True:
    try:
        line = sys.stdin.readline()
    except KeyboardInterrupt:
        break
    if not line:
        # readline() returns '' only at end of input. Without this check
        # the loop would spin forever printing JSON decode errors.
        break
    try:
        rtf = json.loads(line.strip())
        doc = Rtf15Reader.read(StringIO(rtf))
        html = XHTMLWriter.write(doc).read()
        print(json.dumps({'html': html}))
    except Exception as e:
        # Report the failure to the peer and keep serving further lines.
        print(json.dumps({'error': unicode(e)}))
    sys.stdout.flush()
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
import sys

# Convert the RTF file named on the command line (default: sample.rtf)
# and print the pretty-printed XHTML.
if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "sample.rtf"

# Use a context manager so the input file is closed deterministically.
with open(filename, "rb") as rtf_file:
    doc = Rtf15Reader.read(rtf_file)

print(XHTMLWriter.write(doc, pretty=True).read())
from __future__ import absolute_import
from __future__ import print_function
from pyth.plugins.xhtml.writer import XHTMLWriter
import pythonDoc

# XHTML 1.0 Strict page skeleton; the single %s receives the body markup.
docTemplate = (
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
    '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> '
    '<html xmlns="http://www.w3.org/1999/xhtml"> '
    '<head> '
    '<title>Pyth document</title> '
    '<style type="text/css">body { font-family: Verdana; }</style> '
    '</head> '
    '<body> '
    '%s '
    '</body> '
    '</html> '
)

if __name__ == "__main__":
    # Build the sample document, render it and print the complete page.
    doc = pythonDoc.buildDoc()
    rendered = XHTMLWriter.write(doc, pretty=True)
    print(docTemplate % rendered.getvalue())
from pyth.plugins.plaintext.writer import PlaintextWriter
from pyth.plugins.xhtml.writer import XHTMLWriter
import os
import re
import sys
from bs4 import UnicodeDammit
from xml.dom.minidom import parseString
import dicttoxml

# open() does not expand "~", so resolve the home directory explicitly.
raw_data_folder = os.path.expanduser("~/raw_data/factiva/")


# NOTE(review): Rtf15Reader is not imported by the lines above -- confirm it
# is brought into scope elsewhere in this script.
def _read_rtf(name):
    """Parse one RTF file from ``raw_data_folder``, closing it afterwards."""
    with open(raw_data_folder + name, 'rb') as handle:
        return Rtf15Reader.read(handle)


doc1 = _read_rtf('Factiva-20141201-0822marchaus1.rtf')
doc2 = _read_rtf('Factiva-20141201-0830marchaus2.rtf')
doc3 = _read_rtf('Factiva-20141201-0834marchaus3.rtf')

xmltext1 = XHTMLWriter.write(doc1, pretty=True).read()
xmltext2 = XHTMLWriter.write(doc2, pretty=True).read()
xmltext3 = XHTMLWriter.write(doc3, pretty=True).read()

xmltext = xmltext1 + "\n\n" + xmltext2 + "\n\n" + xmltext3

# Replace smart quotes and other garbled byte sequences with plain ASCII
# stand-ins. Applied in order; none of the replacements can produce a
# sequence matched by another entry, so each pattern is listed once.
_REPLACEMENTS = (
    ("\xe2\x80\x99\x19", "'"),
    ("\xe2\x80\x9c\x1c", "\""),
    ("\xe2\x80\x9d\x1d", "\""),
    ("\xe2\x80\x94\x14", "-"),
    ("\xc2\xa9\xc2\xa9", " "),
    ("\xe2\x82\xac\xc2\xac", "E"),
    ("\xc2\xa3\xc2\xa3", "L"),
    ("\xe2\x80\x98\x18", "'"),
)
for _garbled, _plain in _REPLACEMENTS:
    xmltext = xmltext.replace(_garbled, _plain)
# Keep collecting "word, score" pairs until the user enters ';'.
while input != ';':
    i_list = input.split(', ')
    words += ([i_list[0], i_list[1]],)
    input = raw_input("Another word? ; to quit\n")

scores = {w: float(n) for w, n in words}
total = 0
results = {}

# Score every .rtf file below ``root``: the sum of the scores of its words.
for dirs, subdirs, files in os.walk(root):
    for f in files:
        if f.endswith('.rtf'):
            # os.walk yields bare file names; they must be joined with the
            # directory being visited (its first tuple item, named ``dirs``
            # here) or files outside the working directory cannot be opened.
            with open(os.path.join(dirs, f), "rb") as rtf_file:
                doc = Rtf15Reader.read(rtf_file)
                xhtml = XHTMLWriter.write(doc).read()
            total = 0
            text = dehtml(xhtml.lower().split())
            for word in text:
                # Strip everything that is not a word character.
                word = re.sub(r'\W+', '', word)
                total += scores.get(word, 0)
            results[f] = total

# Write one "<file name> <score>" line per file, sorted by file name; the
# context manager makes sure the report is flushed and closed.
with open("results.txt", "w") as output:
    for key, value in sorted(results.items()):
        output.write(key + " " + str(value) + "\n")

print("Finished! Check results.txt")
raw_input("\nPress enter to close.")
<p> example<span style="vertical-align: super"> super </span> example<span style="vertical-align: sub"> sub </span> </p> a list <ul> <li>hello test</li> <li>bonjour</li> <li>guten tag</li> </ul> <p> <a href=http://www.google.com>a link </a> single space here. <br/>a br tag </p> </div> """) css = """ .important {font-weight: bold} p.bold {font-weight: bold} .other {font-weight: normal; color: blue} """ if __name__ == '__main__': # Parse the document and then reconstruct it using the xhtml # writer. doc = XHTMLReader.read(content, css) print XHTMLWriter.write(doc).getvalue()
def test_underline(self):
    """Text carrying the 'underline' property is rendered inside <u> tags."""
    underlined = Text(content=[u'Underlined'],
                      properties={'underline': True})
    document = Document(content=[Paragraph(content=[underlined])])

    markup = XHTMLWriter.write(document).getvalue()

    self.assertIn('<u>Underlined</u>', markup)
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
from StringIO import StringIO
import json
import sys

# Read JSON-encoded RTF strings from stdin, one per line, and answer each
# with a JSON object containing either 'html' (the conversion result) or
# 'error' (the failure message). stdout is flushed after every answer.
while True:
    try:
        line = sys.stdin.readline()
    except KeyboardInterrupt:
        break
    if not line:
        # An empty string from readline() means end of input; stop here
        # instead of looping forever on JSON decode errors.
        break
    try:
        rtf = json.loads(line.strip())
        doc = Rtf15Reader.read(StringIO(rtf))
        html = XHTMLWriter.write(doc).read()
        print(json.dumps({
            'html': html
        }))
    except Exception as e:
        # Report the failure and keep processing the following lines.
        print(json.dumps({
            'error': unicode(e)
        }))
    sys.stdout.flush()
from __future__ import absolute_import from __future__ import print_function import sys import os.path from pyth.plugins.rtf15.reader import Rtf15Reader from pyth.plugins.xhtml.writer import XHTMLWriter, write_html_file numargs = len(sys.argv) - 1 if numargs not in [1, 2]: print("usage: rtf15 inputfile.rtf [outputdir]") else: inputfile = sys.argv[1] doc = Rtf15Reader.read(open(inputfile, "rb")) the_output = XHTMLWriter.write(doc, pretty=True).read() if numargs == 1: print("<!-- ##### RTF file" + inputfile + "as XHTML: -->") print(the_output) else: basename = os.path.basename(inputfile) outputdir = sys.argv[2] outputfile = os.path.join(outputdir, os.path.splitext(basename)[0] + ".html") write_html_file(outputfile, the_output, print_msg=True)
from pyth.plugins.xhtml.writer import XHTMLWriter
from pyth.plugins.rtf15.reader import Rtf15Reader
import sys

# Convert the RTF file named on the command line (default: the bundled
# sample with an image) and print the resulting XHTML.
if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "tests/rtfs/sample-with-image.rtf"

# Context manager so the input file is closed deterministically.
with open(filename, "rb") as source:
    doc = Rtf15Reader.read(source)

print(XHTMLWriter.write(doc).getvalue())
from pyth.plugins.xhtml.writer import XHTMLWriter
import pythonDoc

# Complete XHTML 1.0 Strict page; the %s placeholder takes the body markup.
docTemplate = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title>Pyth document</title> <style type="text/css">body { font-family: Verdana; }</style> </head> <body> %s </body> </html> '''

if __name__ == "__main__":
    # Render the sample document and print it wrapped in the page template.
    doc = pythonDoc.buildDoc()
    body_markup = XHTMLWriter.write(doc, pretty=True).getvalue()
    print(docTemplate % body_markup)
# Map the first CSV column to the list of values found in the sixth column.
db = {}
with open('resources/db.csv', 'rU') as csvfile:
    myreader = csv.reader(csvfile)
    for row in myreader:
        db.setdefault(row[0], []).append(row[5])

import os

rtfs = os.listdir('resources/rtf')
for rtf in rtfs:
    if rtf != '.DS_Store':
        # Context manager so every input file is closed after conversion.
        with open('resources/rtf/'+rtf, "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
            # The id sits between '#' and the first following '.' of the name.
            rid = rtf.split("#")[1].split(".")[0]
            html = XHTMLWriter.write(doc, pretty=True).read()
        doc_id = False
        tmp = ""
        index = 0
        for item in html.split("</p>"):
            if "<p>" in item and "DOCUMENT" in item:
                # A new DOCUMENT marker: store the text collected for the
                # previous document if its id is listed for this rid.
                if doc_id in db[rid]:
                    index += 1
                    out_name = ('resources/tmp/'+rid+"_"+str(index)+"_"
                                + doc_id+"_.txt")
                    # Close (and thereby flush) each output file right away.
                    with open(out_name, 'w') as f:
                        f.write(tmp)
                # The id is the first space-separated token after "<p>".
                doc_id = item[item.find("<p>")+len("<p>"):].split(" ")[0]
                tmp = ""
            if doc_id:
                tmp += item+"</p>"
        # NOTE(review): the text collected after the last DOCUMENT marker is
        # never written by the code shown here -- confirm this is intended.