def upload(request):
    # user uploads a document -> convert it into a dict of the terms found
    if request.FILES:
        if 'file' in request.FILES:
            result = ''
            f = request.FILES['file']
            fp = 'shake_v3/static/data/' + str(f)
            fp2 = fp[:-3] + 'txt'
            if fp[-3:] == 'pdf':
                with open(fp, 'wb+') as pdff:
                    for chunk in f.chunks():
                        pdff.write(chunk)
                result = pdf_to_txt(fp)
                with open(fp2, 'wb+') as txtf:
                    txtf.write(result)
            elif fp[-3:] == 'rtf':
                with open(fp, 'wb+') as rtff:
                    for line in f:
                        rtff.write(line)
                doc = Rtf15Reader.read(open(fp, 'rb'))
                doctxt = PlaintextWriter.write(doc).getvalue()
                with open(fp2, 'wb+') as txtf:
                    for line in doctxt:
                        txtf.write(line)
                f = str(f)[:-4] + ".txt"
                result = doctxt
            else:
                with open(fp2, 'wb+') as txtf:
                    for line in f:
                        txtf.write(line)
                result = open(fp2, 'r').read()
            response_dict = generate_term_dict(result)
            response_dict['fp'] = 'static/data/' + str(f)
            return HttpResponse(simplejson.dumps(response_dict),
                                mimetype='application/javascript')
    # user indicates terms -> give a grade
    elif request.POST:
        # TODO: implement saving the data
        score = custom_POST_to_score(request)
        if score > 4.5:
            rating = 'A+'
        elif score > 4:
            rating = 'A'
        elif score > 3.5:
            rating = 'B+'
        elif score > 3:
            rating = 'B'
        elif score > 2.5:
            rating = 'C+'
        elif score > 2:
            rating = 'C'
        elif score > 1:
            rating = 'D'
        else:
            rating = 'F'
        return HttpResponse(rating)
    # display the upload part 1
    else:
        score = 0
        return render_to_response('upload.html', {'score': score},
                                  context_instance=RequestContext(request))
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    words = None
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            encoding, file_handle, words = open_with_correct_encoding(file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path, 'rb'))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logger.warning("Couldn't find an extension on the file, so assuming text")
        with codecs.open(file_path, 'r', ENCODING_UTF_8) as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
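Both convert_to_txt variants in this collection call a _get_extension helper that is not shown. A minimal sketch of what it presumably does, assuming it simply lower-cases the os.path.splitext suffix (the implementation is a guess, not the original):

import os

def _get_extension(file_path):
    # Hypothetical reconstruction of the helper used above; assumed to
    # return the lower-cased suffix including the dot, e.g. '.rtf'.
    return os.path.splitext(file_path)[1].lower()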
def get_one_month_from_rtf(url):
    rtf_file = urllib2.urlopen(url)
    rtf_file = StringIO(rtf_file.read())
    doc = Rtf15Reader.read(rtf_file)
    final_data = []
    header = False
    for c in doc.content:
        full_p = c.content.__repr__().lower()
        if "capacity" in full_p and "use cna" in full_p:
            header = True
            continue
        if header:
            row = re.split(r"\t", c.content[0].content[0])
            if len(row) == 7:
                final_data.append(row)
    df = pd.DataFrame(final_data, columns=[
        "prison_name", "baseline_cna", "in_use_cna", "operational_capacity",
        "population", "perc_pop_to_used_cna", "perc_acc_available"
    ])
    df.iloc[:, 1:] = df.iloc[:, 1:].replace("%", "", regex=True).replace(",", "", regex=True)
    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='ignore')
    cols = [c for c in df.columns if "perc" in c]
    df.loc[:, cols] = df.loc[:, cols] / 100
    return df
def rtf_to_text(value):
    if len(value) == 0:
        return value
    rtf_doc = Rtf15Reader.read(BytesIO(value.encode("latin_1")))
    txt_doc = BytesIO()
    PlaintextWriter.write(rtf_doc, txt_doc, encoding="latin_1")
    return txt_doc.getvalue().decode("latin_1")
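A quick round-trip sketch of rtf_to_text; the inline RTF sample below is illustrative, not from the original source:

# Minimal hand-written RTF document (an assumption for illustration only).
sample = r"{\rtf1\ansi\deff0 {\fonttbl{\f0 Arial;}}\pard Hello, world!\par}"
print(rtf_to_text(sample))  # expected plain text: "Hello, world!"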
def rtf(f):
    doc = Rtf15Reader.read(open(f, "rb"))
    result = []
    for element in doc.content:
        for text in element.content:
            result.append("".join(text.content))
    return "".join(result)
def convertRtfToText(path):
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    doc = Rtf15Reader.read(open(path, 'rb'))
    print(PlaintextWriter.write(doc).getvalue())
def GetExternal(version, odl_data, source, class_id):
    external = ""
    for item in version[2]:
        if item[0] == "Attribute" and item[1] == "_Art1_RTF":
            if len(item[2]) == 2:
                if isinstance(source, ZipFile):
                    data = source.open(item[2][0]).read()
                else:
                    file_name = join(source, item[2][0])
                    f = open(file_name, 'rb')
                    data = f.read()
                    f.close()
                data = data.replace("\x0c", "")
            elif len(item[2]) == 1:
                data = item[2][0]
            if data == "":
                return ""
            f = StringIO()
            f.write(data)
            doc = Rtf15Reader.read(f, clean_paragraphs=False)
            external = PlaintextWriter.write(doc).getvalue()
            external = external.replace("\n\n", "\n")
    return ReplaceTextNames(external, version, odl_data, class_id)
def read_recommendations(self, file_name):
    """
    Reads the targeted values from the file "WHO Daily Recommended Values.rtf",
    processes the entries, and creates a dictionary with the nutrient name as
    key and the nutrient value as value.
    :param file_name:
    :return:
    """
    target = dict()
    filtered_col = list()
    doc = Rtf15Reader.read(open(file_name, 'rb'))
    entities = PlaintextWriter.write(doc).getvalue().split('\n\n')
    for item in entities:
        splited = item.split(',')
        name = splited[0].split('(')[0]
        value = splited[1]
        try:
            unit = splited[0].split('(')[1].split(')')[0]
        except IndexError:
            unit = ''
        # target.append({'nutrient': name,
        #                'unit': unit,
        #                'value': value})
        target.update({name: value})
        filtered_col.append(name)
    self.target_values = target
    return target, filtered_col
def main():
    '''
    Purpose::
    Input::
    Output::
    Assumptions::
    '''
    # Get arguments
    args = parse_arguments()
    if args.url:
        url = args.url
    # Get the file and read it into a structure
    try:
        with open(url, 'rb') as rtffile:
            judges = extract_terms(Rtf15Reader.read(rtffile))
            # print PlaintextWriter.write(doc).getvalue()
    except IOError as e:
        print 'An error occurred fetching %s \n %s' % (url, e)
        return 1
    # Print data
    f = open('US_legal_lexicon.txt', 'w')
    # f.write("\n".join(str(i).encode('utf8') for i in judges))
    for i in judges:
        f.write(i.encode('utf8') + '\n')
    f.close()
def compute(self):
    """ compute() -> None
    Dispatch the HTML contents to the spreadsheet
    """
    filename = self.get_input("File").name
    text_format = self.get_input("Format")
    with open(filename, 'rb') as fp:
        if text_format == 'html':
            html = fp.read()  # reads bytes
        elif text_format == 'rtf':
            try:
                py_import('pyth', {'pip': 'pyth'})
            except ImportError:
                raise ModuleError(
                    self,
                    "'rtf' format requires the pyth Python library")
            else:
                from pyth.plugins.rtf15.reader import Rtf15Reader
                from pyth.plugins.xhtml.writer import XHTMLWriter
                doc = Rtf15Reader.read(fp)
                html = XHTMLWriter.write(doc).read()  # gets bytes
        else:
            raise ModuleError(self, "'%s' format is unknown" % text_format)
    self.displayAndWait(RichTextCellWidget, (html,))
def decode_cell(cell):
    '''The cell matched, so handle it'''
    # variable that will hold the converted text
    temp_cell = []
    # pyth checks the rtf syntax before processing, so 'unicode_escape'
    # escapes the '\' so pyth doesn't complain
    cell_encode = re.sub(r'\\u|\\\\u|\\N|\\\\N', ' ', cell)
    cell_encode = cell_encode.decode('unicode_escape')
    cell_encode = filter(lambda x: x in string.printable, cell_encode)
    cell_rtf = Rtf15Reader.read(StringIO(cell_encode))
    # turn the pyth object into readable text
    cell_txt = [x.content for x in cell_rtf.content]
    # iterate and extract the pyth object text into temp_cell
    for line in cell_txt:
        for l in line:
            temp_cell.append(l.content)
    # combine and join the extracted text into one string (for one cell)
    combined = [i for sub in temp_cell for i in sub]
    new_cell = ' '.join(combined)
    # the non-ascii characters in the file were followed by _, so they are
    # removed for cleanliness; comment out the next line to keep the _
    new_cell = re.sub('_', '', new_cell)
    # remove extra whitespace, strip the trailing L, and return the cell
    return ' '.join(new_cell[:-1].split())
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            encoding, file_handle, words = open_with_correct_encoding(file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path, 'rb'))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logger.warning("Couldn't find an extension on the file, so assuming text")
        with open(file_path, 'r') as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
def analyze(committeeFile):
    try:
        doc = Rtf15Reader.read(open(committeeFile, "rb"))
    except:
        print "%s - skipped..." % committeeFile
        errFile = committeeFile.replace(global_options.indir, global_options.errdir)
        shutil.copyfile(committeeFile, errFile)
        return False

    f = open("test.out", 'w')
    f.write(PlaintextWriter.write(doc).getvalue())
    f.close()

    f = open("test.out", 'r')
    participants = find_participants(f.read())
    f.close()

    # Mark whether each participant spoke in the committee
    f = open("test.out", 'r')
    docstring = f.read()
    for line in docstring.splitlines():
        if ":" in line:
            participant = line.split(":")[0]
            for p in participants:
                if participant in p['name']:
                    p['speaker'] = True
                    p['speak_count'] += 1
    f.close()

    fname = committeeFile.replace(global_options.indir, global_options.outdir)
    fname = fname.replace("rtf", "txt")
    file = codecs.open(fname, "w", "utf-8")
    for participant in participants:
        string_builder = []
        for key, val in participant.iteritems():
            if val is not None:
                if type(val) == str:
                    val = val.replace("'", "")
                    val = val.replace('"', '')
                string_builder.append(u"'%s': '%s'" % (key, print_unicode(val)))
        wrt_ln = ', '.join(string_builder) + ',\n'
        try:
            file.write(wrt_ln)
        except UnicodeEncodeError:
            print wrt_ln
    file.close()
    verbose("Generated participants file: " + fname)
    return True
def test_inline_png(self):
    sample_with_image = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "rtfs", "sample-with-image.rtf")
    with open(sample_with_image, 'rb') as rtf:
        doc = Rtf15Reader.read(rtf)
    image = next(node.content[0] for node in doc.content
                 if isinstance(node.content[0], pyth.document.Image))
    expected = {'pngblip': True,
                'picw': '20714', 'picwgoal': '750',
                'pich': '12143', 'pichgoal': '750',
                'picscaley': '100', 'picscalex': '100'}
    self.assertEquals(expected, image.properties)
def test_inline_png(self):
    sample_with_image = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "rtfs", "sample-with-image.rtf")
    with open(sample_with_image, 'rb') as rtf:
        source = Rtf15Reader.read(rtf)
    doc = XHTMLWriter.write(source).getvalue()
    self.assertIn('<img src="data:image/png;base64,', doc)
    self.assertIn('width:50px', doc)
    self.assertIn('height:50px', doc)
def main():
    if len(sys.argv) < 3:
        print("usage: %s <rtf_file_name> <txt_file_name>" % sys.argv[0])
    else:
        doc = Rtf15Reader.read(open(sys.argv[1], "rb"))
        txt_filename = sys.argv[2]
        with open(txt_filename, "w") as of:
            of.write(PlaintextWriter.write(doc).getvalue())
def rtf(f):
    with open(f, "rb") as fh:
        doc = Rtf15Reader.read(fh)
    result = []
    for element in doc.content:
        for text in element.content:
            result.append(''.join(text.content))
    return '\r\n'.join(result)
def load_stickies(path):
    stickies = []
    with open(path) as fd:
        for rtf in parse_sticky_database(fd.read()):
            doc = Rtf15Reader.read(StringIO.StringIO(rtf))
            plaintext = PlaintextWriter.write(doc).getvalue()
            stickies.append(plaintext)
    return stickies
def parse(self, path):
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()
    # File
    else:
        doc = Rtf15Reader.read(open(path, 'rb'))
        sample = Sample(path, None, PlaintextWriter.write(doc).getvalue())
    return sample
def get_rtf_text(path):
    """
    Take the path of an rtf file as an argument and return the text.
    """
    doc = Rtf15Reader.read(open(path, 'rb'))
    return PlaintextWriter.write(doc).getvalue()
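Illustrative usage of get_rtf_text; the path is an assumed example, not from the original source:

text = get_rtf_text("sample.rtf")  # hypothetical input file
print(text)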
def readRtf(self, path):
    try:
        doc = Rtf15Reader.read(open(path, "rb"))
    except:
        self._log("Some screwy rtf shit going on with " + path)
        return "Can't process ur shitty rtf <3 dfbot"
    contents = PlaintextWriter.write(doc).getvalue()
    # print contents
    return contents
def test_read2(self):
    rtf = StringIO("""{\\rtf1\\ansi\\ansicpg1252\\cocoartf1343\\cocoasubrtf160\\cocoascreenfonts1{\\fonttbl\\f0\\fnil\\fcharset222 Thonburi;}
{\\colortbl;\\red255\\green255\\blue255;}
\\pard\\tx560\\tx1120\\tx1680\\tx2240\\tx2800\\tx3360\\tx3920\\tx4480\\tx5040\\tx5600\\tx6160\\tx6720\\pardirnatural\\qc
{\\f0\\fs24 \\cf0 \\'b9\\'e9\\'d3\\'b5\\'a1}""")
    doc = Rtf15Reader.read(rtf)
    text = PlaintextWriter.write(doc).read()
    print text
    self.assertEquals(u"น้ำตก", text.decode('utf8'))
def rtf_to_plain_text(file_name):
    print file_name
    out_file_name = './PlainText/%s.txt' % file_name[:-4]
    fw = open(out_file_name, 'w')
    doc = Rtf15Reader.read(open(file_name, 'rb'))
    res = PlaintextWriter.write(doc).getvalue()
    fw.write(res)
    fw.close()
def get_text(self):
    """
    Return a unicode object from the rtf file.
    """
    loc = self.get_file_loc()
    if loc:
        doc = Rtf15Reader.read(open(loc, "rb"))
        txt = PlaintextWriter.write(doc).getvalue()
        return txt.decode('utf-8')
    else:
        return u""
def clean_rtf(fname):
    doc = Rtf15Reader.read(open(fname, 'rb'))
    plain = PlaintextWriter.write(doc).getvalue()
    lines = plain.split("\n")
    # drop blank lines
    lines = filter(lambda l: len(l) > 0, lines)
    # split each semicolon-delimited line and strip the surrounding quotes
    lines = [line.split(";") for line in lines]
    lines = [[val[1:-1] for val in line] for line in lines]
    return lines
def transform(self, data, options=None):
    if self._validate(data) is None:
        return None
    file = cStringIO.StringIO()
    file.write(''.join(self.filter(data)))
    file.seek(0)
    doc = Rtf15Reader.read(file, errors='replace')
    xhtml = XHTMLWriter.write(doc)
    xhtml_ = xhtml.read()
    xhtml.close()
    return TransformResult(StringIter(xhtml_))
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from an rtf file and save
    it to .txt in dst_dir.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    doc = Rtf15Reader.read(open(file_path, 'rb'))
    txt = PlaintextWriter.write(doc).getvalue()
    txt = unidecode(txt)
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
def test_when_last_item_sublist_item(self):
    """
    With structures like this, both lists were getting dropped:

    Start
    * 1
      * 1.1
    """
    list_bug = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            "rtfs", "list-bug.rtf")
    with open(list_bug, 'rb') as rtf:
        doc = Rtf15Reader.read(rtf)
    text = []
    traverse_text(doc, lambda x: text.append(x))
    self.assertIn('Start', text)
    self.assertIn('1', text)
    self.assertIn('1.1', text)
def getFileText(file_path, html=False, pdf_utf8=False):
    '''
    input: string of file path
    output: either raw string or parsed html text content
    '''
    file_extension = os.path.splitext(file_path)[1]
    if file_extension.lower() != ".py":
        if file_extension.lower() in (".html", ".htm"):
            file_content = open(file_path).read()
            if html:
                try:
                    html_text = lh.fromstring(file_content).text_content()
                except UnicodeDecodeError:
                    try:
                        html_text = lh.fromstring(
                            helpers.convert_encoding(file_content)).text_content()
                    except UnicodeDecodeError:
                        html_text = lh.fromstring(
                            unicode(file_content, errors='ignore')).text_content()
                return html_text
            else:
                return file_content
        if file_extension == ".pdf":
            pdf_content = open(file_path, "rb")
            pdfReader = PyPDF2.PdfFileReader(pdf_content)
            num_pages = pdfReader.getNumPages()
            page_text = ""
            for i in range(0, num_pages):
                pageObj = pdfReader.getPage(i)
                page_text = page_text + " " + pageObj.extractText()
            # Need to check for pdfs that are just scanned images
            if len(page_text) <= num_pages:
                return None
            if pdf_utf8:
                return page_text.encode('utf-8')
            return page_text
        if file_extension == ".rtf":
            doc = Rtf15Reader.read(open(file_path, 'rb'))
            page_text = PlaintextWriter.write(doc).getvalue()
            uni_page_text = page_text.decode('utf-8')
            return uni_page_text
    return None
def test_basic(self):
    """
    Try to read an empty rtf document
    """
    rtf = open('test.rtf', 'rb')
    rtf = rtf.read()
    # Read file content by chunks
    content = StringIO()
    content.write(rtf)
    content.seek(0)
    doc = Rtf15Reader.read(content)
    self.assert_(isinstance(doc.content[0], pyth.document.Paragraph))
    self.assertEqual(doc.content[2].content[0].content[0],
                     u"[` ~ ! @ # $ % ^ & * ( ) - _ = + [ { ] } \ | ; : ' 0x810x67 , < . > / ?]")
def _convert_rtf_to_text(self, password=None):
    input_rtf = self.cvFile
    rtf = Rtf15Reader.read(open(input_rtf, 'rb'))
    outputPath = self.scratchDir
    inputPath = os.getcwd()
    if os.path.exists(input_rtf):
        inputPath = os.path.dirname(input_rtf)
    input_filename = os.path.basename(input_rtf)
    input_parts = input_filename.split(".")
    input_parts.pop()
    randomStr = int(time.time())
    output_filename = (outputPath + os.path.sep + ".".join(input_parts)
                       + str(randomStr) + ".txt")
    self.cvTextFile = output_filename
    fw = open(self.cvTextFile, "w")
    fw.write(PlaintextWriter.write(rtf).getvalue())
    fw.close()
    return 0
def Run(journal_file):
    raw_entries = plistlib.readPlist(journal_file)
    acc = utils.EntryAccumulator(lambda x: x['date'])
    for k, v in raw_entries.iteritems():
        if not v:
            continue
        # 12/29/2001 -> 2001-12-29
        new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
        d = parser.parse(new_k)
        if isinstance(v, plistlib.Data):
            f = StringIO.StringIO(v.data)
            try:
                doc = Rtf15Reader.read(f)
            except ValueError as e:
                print v.data
                raise e
            txt = PlaintextWriter.write(doc).getvalue()
            acc.add({'date': d, 'rtf': v.data, 'text': txt})
        else:
            acc.add({'date': d, 'text': v})

    for day, entries in acc.iteritems():
        assert len(entries) == 1
        entry = entries[0]
        if not entry['text']:
            continue
        summary = utils.SummarizeText(entry['text'])
        utils.WriteSingleSummary(day, maker='osxapp', summary=summary,
                                 dry_run=dry_run)
        if 'rtf' in entry:
            utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'],
                                filename='journal.rtf', dry_run=dry_run)
        else:
            utils.WriteOriginal(day, maker='osxapp',
                                contents=entry['text'].encode('utf8'),
                                filename='journal.txt', dry_run=dry_run)
def loadAllRTFToDB(folderPath):
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('.rtf'):
                continue
            filePath = os.path.join(dirPath, fileName)
            print(filePath)
            try:
                doc = Rtf15Reader.read(open(filePath, 'rb'))
                text = PlaintextWriter.write(doc).getvalue()
            except:
                continue
            lines = [line.strip() for line in text.split('\n') if line]

            # split the plain text into individual articles, delimited by
            # trailing "Document <id>" lines
            articleLinesDict, articleStartIndex = {}, 0
            for i, line in enumerate(lines):
                if line.startswith('Document ') and len(line.split(' ')) == 2:
                    articleId = line.split(' ')[-1]
                    articleLinesDict[articleId] = lines[articleStartIndex:i]
                    articleStartIndex = i + 1

            for articleId, lines in articleLinesDict.iteritems():
                bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
                for i, line in enumerate(lines):
                    line = line.lower()
                    if line.startswith('by '):
                        bylineIndex = i
                    elif line.endswith(' words'):
                        wordCountIndex = i
                    elif line == 'english':
                        textStartIndex = i + 2
                if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
                    print(filePath + ', ' + articleId)
                else:
                    articleDict = {
                        '_id': articleId,
                        'filePath': filePath.split('Marshall_RA/')[-1],
                        'headline': ' '.join(lines[:wordCountIndex]) if bylineIndex == -1 else ' '.join(lines[:bylineIndex]),
                        'byline': '' if bylineIndex == -1 else lines[bylineIndex],
                        'date': parser.parse(lines[wordCountIndex + 1]),
                        'sourceName': lines[wordCountIndex + 2] if lines[wordCountIndex + 2].find(' AM') == -1 and lines[wordCountIndex + 2].find(' PM') == -1 else lines[wordCountIndex + 3],
                        'leadParagraph': '',
                        'tailParagraph': '\n'.join(lines[textStartIndex:]),
                        'sourceCode': '',
                        'industry': [],
                        'region': [],
                        'newsSubject': [],
                        'company': []
                    }
                    db.saveArticle(articleDict)
def rtf(url):
    '''
    Gets the url of the rtf file, and (tries to) return an xhtml version
    of it. Returns False if it couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp)
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False
    temp.close()
    return xhtml
def reviewfile(path, filename):
    document_object = Rtf15Reader.read(open(path + filename, "rb"))
    # string and list objects derived from the transcript
    transcriptstring, transcriptnopunctuation = FileProcessor(document_object)
    # find the legend start and end, which are used to distinguish between
    # the speakers section and the body of the transcript
    legendstart, legendend, legendendmarker = FindLegend(transcriptstring)
    # extract executive, analyst and operator strings used to identify speakers
    executiveslist, analystslist, operator = \
        FindSpeakers(transcriptstring, legendendmarker)
    # next, cut the transcript string into segments by speaker
    commentlist, aggregatecommentlist = \
        OrganizeTranscriptBySpeaker(transcriptstring, executiveslist,
                                    analystslist, operator, filename,
                                    legendend)
    # Analyze transcript
    # orderedworddict = AnalyzeTranscript(transcriptnopunctuation)
    personwordcounts = AnalyzeComments(aggregatecommentlist)
    personobjects = []
    for person in personwordcounts:
        isexecutive = person[0] in executiveslist
        isanalyst = person[0] in analystslist
        personobject = Speaker(person[0], filename, isexecutive, isanalyst,
                               person[1], person[2])
        personobjects.append(personobject)
    return personobjects
def documentToText(path):
    if path.endswith(".doc"):
        cmd = ['antiword', path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return removeNonAscii(stdout)
    elif path.endswith(".docx"):
        return removeNonAscii(doc.process(path))
    elif path.endswith(".txt"):
        inputFile = open(path)
        text = inputFile.read()  # because memory and such
        inputFile.close()
        return removeNonAscii(text)
    elif path.endswith(".pdf"):
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path.endswith(".rtf"):
        text = Rtf15Reader.read(open(path, 'rb'))
        return removeNonAscii(PlaintextWriter.write(text).getvalue())
    return "Returned Nothing."
def download_text(self):
    filename = self.link[33:] + "." + self.typ
    try:
        with requests.get(self.link, stream=True) as r:
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
    except:
        print("Error downloading " + self.link)
    text = ""
    if self.typ == "pdf":
        try:
            with pdfplumber.open(filename) as pdf:
                for page in pdf.pages:
                    text += page.extract_text()
        except:
            try:
                text += textract.process(filename, method="tesseract",
                                         language="rus").decode("utf-8")
            except:
                print("Error extracting " + filename)
    elif self.typ == "doc":
        try:
            text += docx2txt.process(filename)
        except:
            try:
                output = filename[:-3] + "txt"
                os.system("antiword {} > {}".format(filename, output))
                with open(output) as f:
                    text += f.read()
                os.remove(output)
            except:
                print("Error extracting " + filename)
    elif self.typ == "rtf":
        try:
            doc = Rtf15Reader.read(open(filename, "rb"))
            text += html2text.html2text(
                XHTMLWriter.write(doc, pretty=True).read().decode("utf-8"))
        except:
            print("Error extracting " + filename)
    if os.path.exists(filename):
        os.remove(filename)
    self.text = text
def rtf(url):
    '''
    Gets the url of the rtf file, and (tries to) return an xhtml version
    of it. Returns False if it couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp, errors='ignore')
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False
        logger.exception('Failed reading rtf from {0}'.format(url))
    temp.close()
    return xhtml
def rtf(url):
    '''
    Gets the url of the rtf file, and (tries to) return an xhtml version
    of it. Returns False if it couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp)
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        logger.warn(''.join(traceback.format_exception(
            exceptionType, exceptionValue, exceptionTraceback)))
    temp.close()
    return xhtml
def parse(self, downloaded_file):
    logging.info("Parsing AAMS Block list")
    from pyth.plugins.rtf15.reader import Rtf15Reader, Group

    # patch the reader's Group so that malformed ansi escapes append a
    # space instead of raising
    def _handle_ansi_escape(self, code):
        try:
            Group._handle_ansi_escape(self, code)
        except:
            self.content.append(" ")
    Group.handle_ansi_escape = _handle_ansi_escape

    doc = Rtf15Reader.read(downloaded_file)
    siti = doc.content[0].content[3].content[0]
    for sito in siti.split("\n"):
        m = re.search(r"(\d+)(.*)", sito)
        if m:
            url = m.group(2)
            yield {
                "url": url,
            }
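The Group patch above is one way to survive malformed escapes; a lighter-touch alternative, used by other snippets in this collection (the transform and rtf variants), is the reader's errors argument:

# Sketch of the alternative: let pyth replace or skip undecodable bytes
# instead of monkey-patching the Group class.
doc = Rtf15Reader.read(downloaded_file, errors='replace')  # or errors='ignore'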
def handle_files():
    '''
    The main function to start processing the rtf files into csv.
    '''
    file_prefix = "old_committee-meetings-protocols"
    for file_name in glob2.glob(protocol_dir):
        if file_prefix in file_name:
            doc = Rtf15Reader.read(open(file_name, 'rb'))
            data = PlaintextWriter.write(doc).getvalue()
            data = data.split(':')
            for leg in data:
                if "מוזמנים" in leg:
                    index_visitor = data.index(leg) + 1
                    name = {'header': 'מוזמנים', 'body': data[index_visitor]}
                    dir_file = file_name.replace('.rtf', '.csv')
                    with open(dir_file, 'w') as f:
                        w = csv.DictWriter(f, name.keys())
                        w.writeheader()
                        w.writerow(name)
                    break
def _readin_rtf(self):
    doc = Rtf15Reader.read(open(self.fname, 'rb'))
    self.doc_text = []
    self.doc_props = []
    for element in doc.content:
        if hasattr(element, "content"):
            if len(element.content) == 0:
                self.doc_text.append('')  # empty paragraph
                self.doc_props.append([])
            for text in element.content:
                if not isinstance(text, pyth.document.Text):
                    if isinstance(text, pyth.document.ListEntry):
                        continue
                    print("### Unknown paragraph element", text, file=sys.stderr)
                    sys.exit(1)
                self.doc_text.append(text.content[0])
                self.doc_props.append(list(text.properties.keys()))
    self._formatting_fix_ups()
def GET(self, day):
    out = StringIO()
    out.write("""<html>
<head>
  <link rel="stylesheet" href="/static/viewer.css" />
  <script src="/static/jquery-1.7.min.js"></script>
</head>
<body>
<div id="oneday">
""")
    data = utils.GetOneDay(datetime.strptime(day, '%Y/%m/%d').date())
    for maker in sorted(data.keys()):
        out.write('<h2>%s</h2>\n' % maker)
        # TODO(danvk): include URL, thumbnail if available.
        out.write('<p>%s</p>\n' % data[maker]['summary']['summary'].encode('utf8'))
        if 'originals' in data[maker]:
            originals = data[maker]['originals']
            for filename in sorted(originals.keys()):
                out.write('<h3>%s</h3>\n' % filename)
                _, ext = os.path.splitext(filename)
                if ext == '.txt':
                    out.write('<pre>%s</pre>\n' % originals[filename])
                elif ext == '.html':
                    out.write(originals[filename])
                elif ext == '.rtf':
                    f = StringIO(originals[filename])
                    doc = Rtf15Reader.read(f)
                    html = XHTMLWriter.write(doc).getvalue()
                    out.write(html)
                else:
                    out.write('<p>(Unknown format "%s")</p>' % ext)
        out.write('<hr/>\n')
    out.write('</div></body></html>')
    return out.getvalue()
def testmethod(self):  # the test method to be added
    inputfilename = os.path.join(rtfinputsdir, basename + ".rtf")
    outputfilename = os.path.join(testoutputdir, "%s.%s" % (basename, writer))
    # --- obtain reference output or skip test:
    with open(referencefilename, "rb") as input:
        the_referenceoutput = input.read()
    # --- read and convert RTF:
    with open(inputfilename, "rb") as input:
        document = Rtf15Reader.read(input)
    if writer == 'html':
        the_testoutput = XHTMLWriter.write(document, pretty=True).read()
        write_html_file(outputfilename, the_testoutput, print_msg=False)
    elif writer == 'txt':
        with open(outputfilename, "wt") as f:
            PlaintextWriter.write(document, f)
    # --- compute test output:
    with open(outputfilename, "rb") as input:
        the_testoutput = input.read()
    # --- check outcome:
    if the_testoutput == the_referenceoutput:
        os.remove(outputfilename)  # assert will succeed, so it is no longer needed
    self.assertEqual(the_testoutput, the_referenceoutput)
def convertRtfToText(path):
    doc = Rtf15Reader.read(open(path, 'rb'))
    return PlaintextWriter.write(doc).getvalue()
def document_create_index(document, user_id=None):
    import os
    from xlrd import open_workbook
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    import sunburnt

    document = json.loads(document)
    table = s3db.doc_document
    id = document["id"]
    name = document["name"]
    filename = document["filename"]
    filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"),
                                     request.application, filename)
    si = sunburnt.SolrInterface(settings.get_base_solr_url())
    extension = os.path.splitext(filename)[1][1:]

    if extension == "pdf":
        data = os.popen("pdf2txt.py " + filename).read()
    elif extension == "doc":
        data = os.popen("antiword " + filename).read()
    elif extension == "xls":
        wb = open_workbook(filename)
        data = " "
        for s in wb.sheets():
            for row in range(s.nrows):
                values = []
                for col in range(s.ncols):
                    values.append(str(s.cell(row, col).value))
                data = data + ",".join(values) + "\n"
    elif extension == "rtf":
        doct = Rtf15Reader.read(open(filename, 'rb'))
        data = PlaintextWriter.write(doct).getvalue()
    else:
        data = os.popen("strings " + filename).read()

    # The text needs to be in unicode or ascii, with no control characters
    data = str(unicode(data, errors="ignore"))
    data = "".join(c if ord(c) >= 32 else " " for c in data)

    # Put the data according to the Multiple Fields
    # @ToDo: Also, would change this according to requirement of Eden
    document = {
        "id": str(id),         # doc_document.id
        "name": data,          # the data of the file
        "url": filename,       # the encoded file name stored in uploads/
        "filename": name,      # the filename actually uploaded by the user
        "filetype": extension  # x.pdf -> pdf is the extension of the file
    }

    # Add and commit indices
    si.add(document)
    si.commit()

    # After indexing, set has_been_indexed to True in the database
    db(table.id == id).update(has_been_indexed=True)
    db.commit()
def test(self):
    # Just make sure they don't crash, for now
    Rtf15Reader.read(open(path, "rb"))
def rtf(from_file, to_txt, opts):
    doc = Rtf15Reader.read(open(from_file.path, "rb"))
    text = PlaintextWriter.write(doc).getvalue()
    return save_raw_data(to_txt.path, text)
def main():
    y = 0
    contains = False
    for a in range(1, 7):
        list = []
        dev = '{}{}'.format("dev", a)
        root = api.path + '/{}/interviews/'.format(dev)
        dirlist = [item for item in os.listdir(root)
                   if os.path.isfile(os.path.join(root, item))]
        for i in dirlist:
            if i.endswith('.rtf'):
                x = 0
                print root + i
                doc = PlaintextWriter.write(
                    Rtf15Reader.read(open(root + i, 'rb'))).getvalue()
                first_index = doc.find("Clip Transcript") + 21
                second_index = doc.find("Clip Keywords") - 19
                interview_date = doc[first_index - 35:first_index - 25].replace("_", "-")
                clip_transcript = doc[first_index:second_index]
                list.append([interview_date, clip_transcript])
                while x < doc.count("Clip Transcript"):
                    if doc.find("Clip Transcript", second_index) > 0:
                        first_index = doc.find("Clip Transcript", second_index) + 21
                    else:
                        break
                    if doc.find("Clip Keywords", first_index) > 0:
                        second_index = doc.find("Clip Keywords", first_index) - 16
                    else:
                        break
                    interview_date = doc[first_index - 35:first_index - 25].replace("_", "-")
                    clip_transcript = doc[first_index:second_index]
                    print interview_date, clip_transcript
                    for sublist in list:
                        if sublist[0] == interview_date:
                            sublist[1] = sublist[1] + "\n------------------\n" + clip_transcript
                            contains = True
                    if not contains:
                        list.append([interview_date, clip_transcript])
                    contains = False
                    x = x + 1
                print list
        for i in dirlist:
            if i.endswith('.mp3'):
                tag = i[:10].replace("_", "-")
                index = 0
                for sublist in list:
                    if tag.strip() == sublist[0].strip():
                        description = unicode(list[index][1], errors="ignore")
                        contains = True
                    index = index + 1
                if not contains:
                    description = ""
                contains = False
                data_audio = {
                    "description": description,
                    "duration": (MP3(api.path + '/{}/interviews/{}'.format(dev, i))).info.length,
                    "id": "",
                    "interview": api.get('interviews', y + 1),
                    "status": "PRIVATE",
                    "tag": i,
                    "uri": "http://opendata.soccerlab.polymtl.ca/audios/" + i,
                    "author": "",
                    "license": ""
                }
                api.request("Audio", 'audios', data_audio)
                y = y + 1
def parse():
    committeeIds = {}
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match(r'\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            committeeIds[year] = set()
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)

    dictionary = {}
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue
                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}
                with open(os.path.join(rtfDir, fileName), 'rb') as rtfFile:
                    parsedName = re.findall(u'\d+', fileName)
                    date = str('/'.join(parsedName[0:3][::-1]))
                    meetingId = parsedName[-1] if len(parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        continue
                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            res = list(set(re.findall(u'\d+', line))
                                       & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' + line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [date, meetingId]

    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match(r'\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            if year not in dictionary:
                continue
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(os.path.join(csvDir, csvFileName[:-4] + '_out' + csvFileName[-4:]),
                          'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(row + (['', ''] if committeeId not in dictionary[year]
                                               else dictionary[year][committeeId]))
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter
import sys

if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "sample.rtf"

doc = Rtf15Reader.read(open(filename, "rb"))
print XHTMLWriter.write(doc, pretty=True).read()
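The script above prints the generated XHTML; a minimal sketch of saving it to a file instead (the paths are illustrative assumptions), using only the calls already shown in this collection:

doc = Rtf15Reader.read(open("sample.rtf", "rb"))  # assumed input path
with open("sample.html", "w") as out:             # assumed output path
    out.write(XHTMLWriter.write(doc, pretty=True).read())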