def convert_to_txt(file_path):
    """Return the textual contents of *file_path*.

    Dispatches on the file extension: .txt (encoding-detected read),
    .docx, .rtf, or anything else (read as UTF-8 text).
    """
    logger.debug("convert_to_txt: %s" % file_path)
    if not os.path.exists(file_path):
        # Only logs; os.stat below still raises for a missing file,
        # preserving the original behavior.
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        # Removed dead locals ('words = None', 'worked = False') and the
        # unused 'as e' exception binding -- none were ever read.
        try:
            encoding, file_handle, words = open_with_correct_encoding(file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logging.warning("Couldn't find an extension on the file, so assuming text")
        with codecs.open(file_path, 'r', ENCODING_UTF_8) as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
def upload(request):
    """Django view driving the three phases of the upload workflow."""
    if request.FILES:
        # Phase 1: a document was uploaded -> convert it into a term dict.
        if 'file' in request.FILES:
            extracted = ''
            f = request.FILES['file']
            raw_path = 'shake_v3/static/data/' + str(f)
            txt_path = raw_path[:-3] + 'txt'
            if raw_path.endswith('pdf'):
                with open(raw_path, 'wb+') as pdf_out:
                    for chunk in f.chunks():
                        pdf_out.write(chunk)
                extracted = pdf_to_txt(raw_path)
                with open(txt_path, 'wb+') as txt_out:
                    txt_out.write(extracted)
            elif raw_path.endswith('rtf'):
                with open(raw_path, 'wb+') as rtf_out:
                    for line in f:
                        rtf_out.write(line)
                parsed = Rtf15Reader.read(open(raw_path, 'rb'))
                plain = PlaintextWriter.write(parsed).getvalue()
                with open(txt_path, 'wb+') as txt_out:
                    for piece in plain:
                        txt_out.write(piece)
                f = str(f)[:-4] + ".txt"
                extracted = plain
            else:
                with open(txt_path, 'wb+') as txt_out:
                    for line in f:
                        txt_out.write(line)
                extracted = open(txt_path, 'r').read()
            response_dict = generate_term_dict(extracted)
            response_dict['fp'] = 'static/data/' + str(f)
            return HttpResponse(simplejson.dumps(response_dict),
                                mimetype='application/javascript')
    elif request.POST:
        # Phase 2: user indicates terms -> give a grade.
        # TO DO: implement saving the data
        score = custom_POST_to_score(request)
        grade_ladder = [(4.5, 'A+'), (4, 'A'), (3.5, 'B+'), (3, 'B'),
                        (2.5, 'C+'), (2, 'C'), (1, 'D')]
        rating = 'F'
        for threshold, grade in grade_ladder:
            if score > threshold:
                rating = grade
                break
        return HttpResponse(rating)
    else:
        # Phase 3: initial page render.
        score = 0
        return render_to_response('upload.html', {'score': score},
                                  context_instance=RequestContext(request))
def rtf_to_text(value):
    """Convert an RTF string to plain text via a latin-1 round trip."""
    if not value:
        return value
    document = Rtf15Reader.read(BytesIO(value.encode("latin_1")))
    sink = BytesIO()
    PlaintextWriter.write(document, sink, encoding="latin_1")
    return sink.getvalue().decode("latin_1")
def convertRtfToText(path):
    """Print the plain-text rendering of the RTF file at *path*."""
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    # BUG FIX: previously read the hard-coded 'sample.rtf' instead of *path*,
    # and leaked the file handle.
    with open(path) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    print(PlaintextWriter.write(doc).getvalue())
def read_rtf_text(fp, errors='strict', encoding='utf-8'):
    """Read an RTF stream *fp* and return its plain text, decoded with *encoding*."""
    document = CustomRtf15Reader.read(fp, errors=errors)
    for paragraph in document.content:
        # Keep only the runs that paragraph_is_text_like accepts.
        paragraph.content = filter(paragraph_is_text_like, paragraph.content)
    return PlaintextWriter.write(document).read().decode(encoding)
def convert_to_txt(file_path):
    """Return the textual content of *file_path*, dispatching on its extension."""
    logger.debug("convert_to_txt: %s" % file_path)
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", size, file_path)
    extension = _get_extension(file_path)
    if extension == '.txt':
        logger.debug("loading txt file")
        try:
            enc, handle, words = open_with_correct_encoding(file_path)
        except Exception:
            logger.error(
                "Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif extension == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif extension == '.rtf':
        logger.debug("loading rtf file")
        rtf_doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(rtf_doc).getvalue()
    else:
        logging.warning(
            "Couldn't find an extension on the file, so assuming text")
        with open(file_path, 'r') as handle:
            words = handle.read()
    logger.debug("loaded %d chars" % len(words))
    return words
def FileProcessor(document_object):
    """Render *document_object* to text and return (text, text-without-punctuation)."""
    transcriptstring = PlaintextWriter.write(document_object).read()
    # Removes non-text information (very long "words" are layout junk).
    transcriptstring = re.sub(r'\b\w{50,}\b', '', transcriptstring)
    # Removes trailing spaces for each line.
    transcriptstring = \
        ''.join([line.rstrip() + '\n' for line in transcriptstring.splitlines()])
    # Removes disclaimer after the end-of-call marker.
    callendmarker = '\n\n\n\n\n\n\n\n\n\n\n\n\n'
    endcut = transcriptstring.find(callendmarker)
    # BUG FIX: was 'endcut is not -1' -- identity comparison with an int
    # literal is implementation-dependent; use equality.
    if endcut != -1:
        transcriptstring = transcriptstring[:endcut]
    # Decode to unicode, replacing undecodable bytes.
    transcriptstring = transcriptstring.decode('utf-8', 'replace')
    transcriptnopunctuation = deletepunctuation(transcriptstring)
    return transcriptstring, transcriptnopunctuation
def read_recommendations(self, file_name):
    """
    Function reads the targeted values from the file
    "WHO Daily Recommended Values.rtf"
    It process the entries and creates a dictionary with Nutrient name
    as Key and Nutrient Value as value
    :param file_name: path to the RTF file
    :return: (target dict, list of nutrient names)
    """
    target = dict()
    filtered_col = list()
    doc = Rtf15Reader.read(open(file_name))
    # Entries are blank-line separated "Name(unit),value" records.
    entities = PlaintextWriter.write(doc).getvalue().split('\n\n')
    for item in entities:
        splited = item.split(',')
        name = splited[0].split('(')[0]
        value = splited[1]
        try:
            unit = splited[0].split('(')[1].split(')')[0]
        except IndexError:
            # BUG FIX: was a bare 'except:'; only a missing "(unit)"
            # annotation is expected here.
            unit = ''
        target.update({name: value})
        filtered_col.append(name)
    self.target_values = target
    return target, filtered_col
def GetExternal(version, odl_data, source, class_id):
    """Extract the _Art1_RTF attribute of *version* and return it as plain text."""
    external = ""
    for entry in version[2]:
        if entry[0] == "Attribute" \
                and entry[1] == "_Art1_RTF":
            if len(entry[2]) == 2:
                # Two items: the RTF payload lives in an external file.
                if isinstance(source, ZipFile):
                    payload = source.open(entry[2][0]).read()
                else:
                    file_name = join(source, entry[2][0])
                    handle = open(file_name, 'rb')
                    payload = handle.read()
                    handle.close()
                payload = payload.replace("\x0c", "")
            elif len(entry[2]) == 1:
                # One item: the RTF payload is stored inline.
                payload = entry[2][0]
            if payload == "":
                return ""
            buf = StringIO()
            buf.write(payload)
            parsed = Rtf15Reader.read(buf, clean_paragraphs=False)
            external = PlaintextWriter.write(parsed).getvalue()
            external = external.replace("\n\n", "\n")
    return ReplaceTextNames(external, version, odl_data, class_id)
def analyze(committeeFile): try: doc = Rtf15Reader.read(open(committeeFile, "rb")) except: print "%s - skipped..." % committeeFile errFile = committeeFile.replace(global_options.indir, global_options.errdir) shutil.copyfile(committeeFile, errFile) return False #print PlaintextWriter.write(doc).getValue() f = open("test.out", 'w') f.write(PlaintextWriter.write(doc).getvalue()) f.close() f = open("test.out", 'r') participants = find_participants(f.read()) f.close() # Getting the indication whether the participant spoke in the committee f = open("test.out", 'r') docstring = f.read() for line in docstring.splitlines(): name = '' if ":" in line: participant = line.split(":")[0] for p in participants: if participant in p['name']: p['speaker'] = True p['speak_count'] += 1 f.close() fname = committeeFile.replace(global_options.indir, global_options.outdir) fname = fname.replace("rtf", "txt") file = codecs.open(fname, "w", "utf-8") for participant in participants: string_builder = [] for key, val in participant.iteritems(): string = u"'%s': '%s'" if val is not None: if type(val) == str: val = val.replace("'", "") val = val.replace('"', '') string = string % (key, print_unicode(val)) string_builder.append(string) wrt_ln = ', '.join(string_builder) wrt_ln += ',\n' try: file.write(wrt_ln) except UnicodeEncodeError: print wrt_ln file.close() verbose("Generated participants file: " + fname) return True
def load_stickies(path):
    """Parse a stickies database file into a list of plain-text notes."""
    notes = []
    with open(path) as fd:
        raw = fd.read()
        for i, rtf_blob in enumerate(parse_sticky_database(raw)):
            document = Rtf15Reader.read(StringIO.StringIO(rtf_blob))
            notes.append(PlaintextWriter.write(document).getvalue())
    return notes
def main():
    """CLI entry point: convert an RTF file (argv[1]) to plain text (argv[2])."""
    if len(sys.argv) < 3:
        # BUG FIX: the original checked '< 2' and then crashed on argv[2]
        # when exactly one argument was given; it also never interpolated
        # the program name into the usage string.
        print("usage %s <rtf_file_name> <txt_file_name>" % sys.argv[0])
    else:
        with open(os.path.join(sys.argv[1])) as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
        txt_filename = sys.argv[2]
        with open(os.path.join(txt_filename), "w") as of:
            of.write(PlaintextWriter.write(doc).getvalue())
def extract_terms(rtffile):
    """ Get data from rtffile """
    text = PlaintextWriter.write(rtffile).getvalue()
    all_lines = re.split('\n', text)
    # Every 4th line, starting at the first, names a judge.
    return list(itertools.islice(all_lines, 0, None, 4))
def readRtf(self, path):
    """Return the plain-text contents of the RTF file at *path*.

    On any failure, logs the problem and returns a canned error string.
    """
    try:
        # BUG FIX: close the file handle (was leaked) and narrow the bare
        # 'except:' so KeyboardInterrupt/SystemExit are not swallowed.
        with open(path, "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
    except Exception:
        self._log("Some screwy rtf shit going on with " + path)
        return "Can't process ur shitty rtf <3 dfbot"
    contents = PlaintextWriter.write(doc).getvalue()
    return contents
def get_rtf_text(path):
    """Take the path of an rtf file as an argument and return the text."""
    rtf_handle = open(path)
    parsed = Rtf15Reader.read(rtf_handle)
    return PlaintextWriter.write(parsed).getvalue()
def extract_terms(rtffile):
    """ Get data from rtffile """
    rendered = PlaintextWriter.write(rtffile).getvalue()
    # Step through the lines four at a time; each such line is a judge name.
    judges_list = []
    for line in itertools.islice(re.split('\n', rendered), 0, None, 4):
        judges_list.append(line)
    return judges_list
def parse(self, path):
    """Parse the file at *path* into a Sample (directories unsupported)."""
    if os.path.isdir(path):
        # Directory input is not handled yet.
        raise NotImplementedError()
    rtf_doc = Rtf15Reader.read(open(path))
    return Sample(path, None, PlaintextWriter.write(rtf_doc).getvalue())
def test_read2(self):
    # Regression test: Thai text encoded with \'xx escapes under font
    # charset 222 (Thonburi) must decode to the expected UTF-8 string.
    rtf = StringIO("""{\\rtf1\\ansi\\ansicpg1252\\cocoartf1343\\cocoasubrtf160\\cocoascreenfonts1{\\fonttbl\\f0\\fnil\\fcharset222 Thonburi;} {\\colortbl;\\red255\\green255\\blue255;} \\pard\\tx560\\tx1120\\tx1680\\tx2240\\tx2800\\tx3360\\tx3920\\tx4480\\tx5040\\tx5600\\tx6160\\tx6720\\pardirnatural\\qc {\\f0\\fs24 \\cf0 \\'b9\\'e9\\'d3\\'b5\\'a1}""")
    doc = Rtf15Reader.read(rtf)
    text = PlaintextWriter.write(doc).read()
    print text
    self.assertEquals(u"น้ำตก", text.decode('utf8'))
def rtf_to_plain_text(file_name): print file_name out_file_name = './PlainText/%s.txt' % (file_name[:-4]) fw = open(out_file_name, 'w') doc = Rtf15Reader.read(open(file_name, "r")) res = PlaintextWriter.write(doc).getvalue() fw.write(res) fw.close()
def clean_rtf(fname):
    """Parse an RTF table export into rows of unquoted cell strings."""
    document = Rtf15Reader.read(open(fname))
    plain_text = PlaintextWriter.write(document).getvalue()
    # Drop empty lines, split each row on ';', and strip the surrounding
    # quote character from every cell.
    rows = [row for row in plain_text.split("\n") if len(row) > 0]
    rows = [row.split(";") for row in rows]
    return [[cell[1:-1] for cell in row] for row in rows]
def get_text(self):
    """ return a unicode object from the rtf file """
    loc = self.get_file_loc()
    if loc:
        # BUG FIX: close the file handle promptly (was leaked).
        with open(loc, "rb") as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
        txt = PlaintextWriter.write(doc).getvalue()
        return txt.decode('utf-8')
    else:
        return u""
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from a rtf file and save
    to .txt in dst_dir.  Returns 0 on success.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    # BUG FIX: close the source handle promptly (was leaked).
    with open(file_path) as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    txt = PlaintextWriter.write(doc).getvalue()
    txt = unidecode(txt)
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
def getFileText(file_path, html=False, pdf_utf8=False):
    '''
    input: string of file path
    output: either raw string or parsed html text content
    '''
    file_extension = os.path.splitext(file_path)[1]
    lowered = file_extension.lower()
    if lowered != ".py":
        if lowered in (".html", ".htm"):
            file_content = open(file_path).read()
            if not html:
                return file_content
            try:
                parsed_text = lh.fromstring(file_content).text_content()
                return parsed_text
            except UnicodeDecodeError:
                # Retry after re-encoding; last resort ignores bad bytes.
                try:
                    parsed_text = lh.fromstring(
                        helpers.convert_encoding(file_content)).text_content()
                except UnicodeDecodeError:
                    parsed_text = lh.fromstring(
                        unicode(file_content, errors='ignore')).text_content()
                return parsed_text
    # NOTE: the .pdf/.rtf checks below are intentionally case-sensitive,
    # matching the original behavior.
    if file_extension == ".pdf":
        pdf_stream = open(file_path, "rb")
        reader = PyPDF2.PdfFileReader(pdf_stream)
        num_pages = reader.getNumPages()
        page_text = ""
        for page_index in range(num_pages):
            page_text = page_text + " " + reader.getPage(page_index).extractText()
        # Scanned-image PDFs yield essentially no text.
        if len(page_text) <= num_pages:
            return None
        return page_text.encode('utf-8') if pdf_utf8 else page_text
    if file_extension == ".rtf":
        rtf_doc = Rtf15Reader.read(open(file_path))
        raw_text = PlaintextWriter.write(rtf_doc).getvalue()
        return raw_text.decode('utf-8')
    return None
def testmethod(self):
    # the test method to be added: round-trip one RTF input through the
    # selected writer and compare against the stored reference output.
    rtf_path = os.path.join(rtfinputsdir, basename + ".rtf")
    out_path = os.path.join(testoutputdir, "%s.%s" % (basename, writer))
    # obtain reference output or skip test:
    with open(referencefilename, "rb") as src:
        the_referenceoutput = src.read()
    # read and convert RTF:
    with open(rtf_path, "rb") as src:
        document = Rtf15Reader.read(src)
    if writer == 'html':
        the_testoutput = XHTMLWriter.write(document, pretty=True).read()
        write_html_file(out_path, the_testoutput, print_msg=False)
    elif writer == 'txt':
        with open(out_path, "wt") as sink:
            PlaintextWriter.write(document, sink)
    # compute test output (re-read as raw bytes):
    with open(out_path, "rb") as src:
        the_testoutput = src.read()
    # check outcome; only failing outputs are kept on disk for inspection:
    if the_testoutput == the_referenceoutput:
        os.remove(out_path)
    self.assertEqual(the_testoutput, the_referenceoutput)
def Run(journal_file): raw_entries = plistlib.readPlist(journal_file) acc = utils.EntryAccumulator(lambda x: x['date']) for k, v in raw_entries.iteritems(): if not v: continue # 12/29/2001 -> 2001-12-29 new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k) d = parser.parse(new_k) if isinstance(v, plistlib.Data): f = StringIO.StringIO(v.data) try: doc = Rtf15Reader.read(f) except ValueError as e: print v.data raise e txt = PlaintextWriter.write(doc).getvalue() acc.add({'date': d, 'rtf': v.data, 'text': txt}) else: acc.add({'date': d, 'text': v}) for day, entries in acc.iteritems(): assert len(entries) == 1 entry = entries[0] if not entry['text']: continue summary = utils.SummarizeText(entry['text']) utils.WriteSingleSummary(day, maker='osxapp', summary=summary, dry_run=dry_run) if 'rtf' in entry: utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'], filename='journal.rtf', dry_run=dry_run) else: utils.WriteOriginal(day, maker='osxapp', contents=entry['text'].encode('utf8'), filename='journal.txt', dry_run=dry_run)
def _convert_rtf_to_text(self, password=None):
    """Convert self.cvFile (RTF) into a .txt file under self.scratchDir.

    Sets self.cvTextFile to the generated path and returns 0.  The
    *password* parameter is kept for interface parity but is unused.
    """
    input_rtf = self.cvFile
    # BUG FIX: close the input handle promptly (was leaked).
    with open(input_rtf) as rtf_handle:
        rtf = Rtf15Reader.read(rtf_handle)
    outputPath = self.scratchDir
    inputPath = os.getcwd()
    if os.path.exists(input_rtf):
        inputPath = os.path.dirname(input_rtf)
    input_filename = os.path.basename(input_rtf)
    input_parts = input_filename.split(".")
    input_parts.pop()  # drop the .rtf extension
    randomStr = int(time.time())
    # Timestamp suffix keeps repeated conversions from colliding.
    output_filename = (outputPath + os.path.sep + ".".join(input_parts)
                       + str(randomStr) + ".txt")
    self.cvTextFile = output_filename
    with open(self.cvTextFile, "w") as fw:
        fw.write(PlaintextWriter.write(rtf).getvalue())
    return 0
def loadAllRTFToDB(folderPath):
    """Walk *folderPath*, parse every .rtf news export, split it into
    articles on 'Document <id>' separator lines, and store each article
    via DBController.saveArticle."""
    db = DBController()
    for dirPath, dirNames, fileNames in os.walk(folderPath):
        for fileName in fileNames:
            if not fileName.endswith('.rtf'):
                continue
            filePath = os.path.join(dirPath, fileName)
            print(filePath)
            try:
                doc = Rtf15Reader.read(open(filePath))
                text = PlaintextWriter.write(doc).getvalue()
            except:
                # NOTE(review): bare except silently skips unparseable files.
                continue
            lines = [line.strip() for line in text.split('\n') if line]
            # Split the flat line list into per-article chunks; a line of the
            # exact form 'Document <id>' terminates the preceding article.
            articleLinesDict, articleStartIndex = {}, 0
            for i, line in enumerate(lines):
                if line.startswith('Document ') and len(line.split(' ')) == 2:
                    articleId = line.split(' ')[-1]
                    articleLinesDict[articleId] = lines[articleStartIndex : i]
                    articleStartIndex = i + 1
            for articleId, lines in articleLinesDict.iteritems():
                # Locate the byline, the '<n> words' count line, and the start
                # of the body (two lines after the 'english' language marker).
                bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
                for i, line in enumerate(lines):
                    line = line.lower()
                    if line.startswith('by '):
                        bylineIndex = i
                    elif line.endswith(' words'):
                        wordCountIndex = i
                    elif line == 'english':
                        textStartIndex = i + 2
                if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
                    # Malformed article: log its location and skip it.
                    print(filePath + ', ' + articleId)
                else:
                    articleDict = {'_id': articleId,
                                   'filePath' : filePath.split('Marshall_RA/')[-1],
                                   'headline': ' '.join(lines[: wordCountIndex]) if bylineIndex == -1 else ' '.join(lines[: bylineIndex]),
                                   'byline' : '' if bylineIndex == -1 else lines[bylineIndex],
                                   'date' : parser.parse(lines[wordCountIndex + 1]),
                                   # Skip the time-of-day line when present.
                                   'sourceName' : lines[wordCountIndex + 2] if lines[wordCountIndex + 2].find(' AM') == -1 and lines[wordCountIndex + 2].find(' PM') == -1 else lines[wordCountIndex + 3],
                                   'leadParagraph' : '',
                                   'tailParagraph' : '\n'.join(lines[textStartIndex:]),
                                   'sourceCode' : '',
                                   'industry' : [],
                                   'region' : [],
                                   'newsSubject' : [],
                                   'company' : []}
                    db.saveArticle(articleDict)
def documentToText(path):
    """Return ASCII-only text extracted from a document, dispatching on suffix."""
    if path[-4:] == ".doc":
        proc = Popen(['antiword', path], stdout=PIPE)
        out, err = proc.communicate()
        return removeNonAscii(out)
    elif path[-5:] == ".docx":
        return removeNonAscii(doc.process(path))
    elif path[-4:] == ".txt":
        # Read eagerly and close immediately -- because memory and such.
        with open(path) as handle:
            text = handle.read()
        return removeNonAscii(text)
    elif path[-4:] == ".pdf":
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path[-4:] == ".rtf":
        parsed = Rtf15Reader.read(open(path))
        return removeNonAscii(PlaintextWriter.write(parsed).getvalue())
    return "Returned Nothing."
def Run(journal_file): raw_entries = plistlib.readPlist(journal_file) acc = utils.EntryAccumulator(lambda x: x['date']) for k, v in raw_entries.iteritems(): if not v: continue # 12/29/2001 -> 2001-12-29 new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k) d = parser.parse(new_k) if isinstance(v, plistlib.Data): f = StringIO.StringIO(v.data) try: doc = Rtf15Reader.read(f) except ValueError as e: print v.data raise e txt = PlaintextWriter.write(doc).getvalue() acc.add({ 'date': d, 'rtf': v.data, 'text': txt }) else: acc.add({ 'date': d, 'text': v }) for day, entries in acc.iteritems(): assert len(entries) == 1 entry = entries[0] if not entry['text']: continue summary = utils.SummarizeText(entry['text']) utils.WriteSingleSummary(day, maker='osxapp', summary=summary, dry_run=dry_run) if 'rtf' in entry: utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'], filename='journal.rtf', dry_run=dry_run) else: utils.WriteOriginal(day, maker='osxapp', contents=entry['text'].encode('utf8'), filename='journal.txt', dry_run=dry_run)
def handle_files():
    ''' The main function to start processing the rtf files into csv '''
    prefix = "old_committee-meetings-protocols"
    for rtf_path in glob2.glob(protocol_dir):
        if prefix not in rtf_path:
            continue
        parsed = Rtf15Reader.read(open(rtf_path))
        chunks = PlaintextWriter.write(parsed).getvalue()
        chunks = chunks.split(':')
        for chunk in chunks:
            if "מוזמנים" in chunk:
                # The segment after the "invitees" header holds the names.
                body_index = chunks.index(chunk) + 1
                record = {'header': 'מוזמנים', 'body': chunks[body_index]}
                csv_path = rtf_path.replace('.rtf', '.csv')
                with open(csv_path, 'w') as sink:
                    writer = csv.DictWriter(sink, record.keys())
                    writer.writeheader()
                    writer.writerow(record)
                break
def upload_file(request):
    """Django view: accept an .rtf/.txt upload, parse it, and store a Document."""
    error_message = ""
    if request.method == "POST":
        form = UploadForm(request.POST, request.FILES)
        if form.is_valid():
            doc_name = UploadedFile(request.FILES["doc_file"])
            doc_uploaded_date = timezone.now()
            uploaded = request.FILES["doc_file"]
            if get_file_type(doc_name) == ".rtf":
                rtf_doc = Rtf15Reader.read(uploaded)
                parser = LawHtmlParser(PlaintextWriter.write(rtf_doc).read())
            elif get_file_type(doc_name) == ".txt":
                parser = LawHtmlParser(uploaded.read())
            parsed_doc_content = parser.get_parsed_text()
            new_doc = Document(name=doc_name,
                               content=parsed_doc_content,
                               uploaded_date=doc_uploaded_date,
                               file=uploaded)
            new_doc.save()
            return HttpResponseRedirect(reverse("document:list"))
        else:
            error_message = "Please select a file."
    form = UploadForm()
    return render(request, "document/upload.html",
                  {"form": form, "error_message": error_message})
def parseRTFstring(rtfSTRING):
    """Parse an RTF stream and return its plain-text rendering."""
    parsed = Rtf15Reader.read(rtfSTRING)
    rendered = PlaintextWriter.write(parsed)
    return rendered.getvalue()
totalInvalidNamesCount = 0 for committeeFile in committeeFiles: print "====================" print str(progressCount) + " - analyzing " + committeeFile progressCount -= 1 try: doc = Rtf15Reader.read(open(committeeFile, "rb")) except: print committeeFile + " - skipped..." errFile = committeeFile.replace(srcDir, "errFiles") shutil.copyfile(committeeFile, errFile) skippedFilesCount += 1 continue f = open("test.out", "w") f.write(PlaintextWriter.write(doc).getvalue()) f.close() f = open("test.out", "r") docstring = f.read() # print docstring participants = [] agendas = [] agendaIndex = 1 addParticipant = False handleAgenda = False invalidNamesCount = 0 digit_sep = [".", " ", "(", ")"] for line in docstring.splitlines(): # print line
def parse():
    """Cross-reference committee-request IDs found in RTF meeting protocols
    against per-year committee IDs loaded from CSV files, logging each match
    and recording the meeting date/id per request."""
    # Build the set of valid committee IDs per year (<= 2010) from the CSVs.
    committeeIdsPerYear = {}
    for csvFileName in glob.glob(CSV_PATH):
        if not re.match('.+\d{4}\.csv$', csvFileName):
            continue
        year = csvFileName[-8:-4]
        if int(year) > 2010:
            continue
        committeeIdsPerYear[year] = set()
        with open(csvFileName) as csvFile:
            for row in csv.reader(csvFile):
                committeeId = row[8]
                if row[0].strip() == year:
                    if row[6] == '2' and committeeId != '0':
                        committeeIdsPerYear[year].add(committeeId)
                    # Also index the zero-padded "NNN-NN" combination form.
                    committeeIdsPerYear[year].add("%03d-%02d" % (int(row[2]),int(row[1])))
    requestToMeetingMap = {}
    with open('./log.txt', 'w+') as outputFile:
        rtfFiles = glob.glob(RTF_PATH)
        rtfFiles.sort()
        for fileName in rtfFiles:
            if not fileName.endswith('.rtf'):
                continue
            year = fileName[:4]
            if year not in committeeIdsPerYear:
                continue
            if year not in requestToMeetingMap:
                requestToMeetingMap[year] = {}
            hadAnything = False
            with open(fileName) as rtfFile:
                # Meeting date and id are derived from digits in the file name.
                parsedName = re.findall(u'\d+', fileName)
                date, meetingId = str('/'.join(parsedName[0:3][::-1])), parsedName[-1] if len(parsedName) > 3 else '00'
                try:
                    doc = Rtf15Reader.read(rtfFile)
                except Exception, e:
                    print "failed to parse %s: %s" % (fileName,e)
                    continue
                for line in PlaintextWriter.write(doc):
                    line = unicode(line, encoding='utf-8')
                    # Trace short lines containing the Hebrew word "approved".
                    if u'אושר' in line and len(line)<95:
                        print fileName+":"+line
                    for _request in REQUEST_RE.findall(line) + APPROVEDLINE_RE.findall(line):
                        request = _request[0]
                        hadAnything = True
                        ids = set()
                        # Pass 1: "NN-NNN"-style pairs, combined reversed.
                        doubles2 = set(DOUBLES_RE2.findall(request))
                        for d in doubles2:
                            p1,p2 = d
                            combination_id = '%s-%s' % (p2,p1)
                            print "D2 "+combination_id
                            if combination_id in committeeIdsPerYear[year]:
                                ids.add(combination_id)
                        request = DOUBLES_RE2.subn('',request)[0]
                        # Pass 2: pairs combined in original order.
                        doubles3 = set(DOUBLES_RE3.findall(request))
                        for d in doubles3:
                            p1,p2 = d
                            combination_id = '%s-%s' % (p1,p2)
                            print "D3 "+combination_id
                            if combination_id in committeeIdsPerYear[year]:
                                ids.add(combination_id)
                        request = DOUBLES_RE3.subn('',request)[0]
                        # Pass 3: generic pairs -- either a combination id or,
                        # failing that, an inclusive numeric range (< 100 wide).
                        doubles = set(DOUBLES_RE.findall(request))
                        for d in doubles:
                            p1,sep,p2 = d
                            combination_id = '%s-%s' % (p1,p2)
                            if p1.startswith('0') and sep == '-':
                                ids.add(combination_id)
                            else:
                                if combination_id in committeeIdsPerYear[year]:
                                    ids.add(combination_id)
                                else:
                                    p1 = int(p1)
                                    p2 = int(p2)
                                    if abs(p2-p1) < 100:
                                        ids.update(map(str,range(min(p1,p2),max(p1,p2)+1)))
                        request = DOUBLES_RE.subn('',request)[0]
                        singles = SINGLES_RE.findall(request)
                        ids.update(singles)
                        # A bad word anywhere in the request voids all matches.
                        for word in BAD_WORDS:
                            if word in request:
                                ids = set()
                        res = list(ids & committeeIdsPerYear[year])
                        outputFile.write(fileName + ': ' + _request[0].strip().encode('utf-8')+" " +repr(ids)+"->"+repr(res)+"\n")
                        outputFile.flush()
                        for requestId in res:
                            requestToMeetingMap[year][requestId] = [date, meetingId]
            if not hadAnything:
                pass #os.rename(fileName,fileName+".empty")
def unrtf(value):
    """Render the RTF string *value* to plain text."""
    stream = StringIO(value)
    document = Rtf15Reader.read(stream)
    return PlaintextWriter.write(document).getvalue()
def rtf(from_file, to_txt, opts):
    """Convert *from_file* (RTF) to plain text and persist it at *to_txt*."""
    source = open(from_file.path, "rb")
    document = Rtf15Reader.read(source)
    plain = PlaintextWriter.write(document).getvalue()
    return save_raw_data(to_txt.path, plain)
# Script fragment: collect tag/metadata fields for the current file and
# insert a row into the `datainfo` table.  filename, fsize, fext, ftype,
# fcat, meta, dirname and x (a DB cursor) are defined earlier in the script.
ftags = ''
# JSON metadata; single quotes are backslash-escaped for the SQL literal.
fmetd = str(meta.to_json(indent=2)).replace("'", "\\'")
floc = os.path.join(dirname, filename)
modtime = meta["file_timestamps"]["modified"]
print "Name : " + filename
print "Size : " + str(fsize)
print "Location :" + floc
print "Extension : " + fext
print "Type : " + ftype
print "Modified Time : " + modtime
print "category : " + fcat
if fext == "rtf":
    # Extract keyword tags from the rendered RTF body.
    doc = Rtf15Reader.read(open(floc))
    text_article = PlaintextWriter.write(doc).getvalue()
    abc = str(keywords(text_article))
    abc = abc.replace("'", "\\'")
    ftags = abc
if fext == "txt":
    doc = open(floc).read()
    abc = str(keywords(doc))
    abc = abc.replace("'", "\\'")
    ftags = abc
# NOTE(review): this SQL is built by string concatenation and is vulnerable
# to SQL injection via file names/metadata -- switch to a parameterized
# query (cursor placeholders) when touching this code.
query = "INSERT INTO `datainfo`( `name`, `file type`, `extension`, `size`, `category`, `tags`, `metadata`,`location`,`modified_time`) VALUES ('" + filename + "','" + ftype + "','" + fext + "','" + str(
    fsize
) + "','" + fcat + "','" + ftags + "','" + fmetd + "','" + floc + "','" + modtime + "')"
x.execute(query)
def parse():
    """Match committee IDs from ./csv against 'approved' lines in ./rtf
    protocols, then write augmented '_out' CSVs with the meeting date/id
    appended to each matched row."""
    # Build per-year sets of valid committee IDs from the CSV files.
    committeeIds = {}
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            committeeIds[year] = set()
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)
    # Scan the RTF protocols for short "approved" lines mentioning known IDs.
    dictionary = {}
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue
                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}
                with open(os.path.join(rtfDir, fileName)) as rtfFile:
                    # Meeting date/id come from digit groups in the file name.
                    parsedName = re.findall(u'\d+', fileName)
                    date, meetingId = str(
                        '/'.join(parsedName[0:3][::-1])
                    ), parsedName[-1] if len(parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        # Unparseable protocol: skip it silently.
                        continue
                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            # IDs on this line that are valid for the year.
                            res = list(
                                set(re.findall(u'\d+', line)) & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' + line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [
                                        date, meetingId
                                    ]
    # Re-read the CSVs and write '<name>_out.csv' with the match appended.
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            if not year in dictionary:
                continue
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(
                        os.path.join(
                            csvDir,
                            csvFileName[:-4] + '_out' + csvFileName[-4:]),
                        'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(row + (
                            ['', '']
                            if committeeId not in dictionary[year]
                            else dictionary[year][committeeId]))
def document_to_text(file_path):
    """
    Convert document file (.pdf, .doc, .docx, .odt, .rtf) into plain text.

    * Additional dependency 'antiword' and 'odt2txt' command is required
      to run this function.
    * Converting pdf file takes much more time than others

    :rtype : string
    :param file_path: path to the document
    :return: text-converted version of document file contents (None for
             unsupported extensions)
    """
    dir_name, file_name = os.path.split(file_path)

    def convert_pdf_to_txt(path):
        # Extract text from a PDF using pdfminer.
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        fp.close()
        device.close()
        # BUG FIX: local was named 'str', shadowing the builtin.
        text = retstr.getvalue()
        retstr.close()
        return text

    if file_name[-4:] == ".doc":
        cmd = ['antiword', file_path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        if len(stdout) > 0:
            return stdout.decode('ascii', 'ignore')
        else:
            # antiword produced nothing: the file may actually be RTF.
            # BUG FIX: close the handle and narrow the bare 'except:'.
            try:
                with open(file_path) as rtf_file:
                    doc = Rtf15Reader.read(rtf_file)
                return PlaintextWriter.write(doc).getvalue()
            except Exception:
                pass
    elif file_name[-5:] == ".docx":
        document = opendocx(file_path)
        paratextlist = getdocumenttext(document)
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        return '\n\n'.join(newparatextlist)
    elif file_name[-4:] == ".odt":
        cmd = ['odt2txt', file_path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return stdout.decode('ascii', 'ignore')
    elif file_name[-4:] == ".pdf":
        return convert_pdf_to_txt(file_path)
    elif file_name[-4:] == ".rtf":
        # BUG FIX: close the input handle promptly (was leaked).
        with open(file_path) as rtf_file:
            doc = Rtf15Reader.read(rtf_file)
        return PlaintextWriter.write(doc).getvalue()
def readfile(file):
    """Return the text content of a local path or http(s)/ftp URL,
    dispatching on the file extension (.caj/.pdf/.doc/.docx/.htm(l)/.rtf/.txt).

    NOTE(review): the trailing bare 'except: pass' makes this return None on
    ANY failure, silently hiding bugs."""
    try:
        if file.startswith('https://') or file.startswith(
                'http://') or file.startswith('ftp://'):
            data = BytesIO(download(file))
        else:
            data = open(file, 'rb')
        if file.endswith('.caj') or file.endswith('.pdf'):
            # Render PDF text via pdfminer into an in-memory buffer.
            with StringIO() as outfp:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, outfp)
                process_pdf(rsrcmgr, device, data)
                return outfp.getvalue()
        elif file.endswith('.doc'):
            text = ''
            document = olefile.OleFileIO(data)
            wordDocument = document.openstream('WordDocument').read()
            # Parsing the WordDocument Stream
            # See https://msdn.microsoft.com/en-us/library/office/dd904907(v=office.14).aspx
            # And http://b2xtranslator.sourceforge.net/howtos/How_to_retrieve_text_from_a_binary_doc_file.pdf
            # Loading the FIB
            fib = wordDocument[:1472]
            # Loading and Parsing the piece table
            fcClx = int.from_bytes(fib[0x01A2:0x01A5], byteorder='little')
            lcbClx = int.from_bytes(fib[0x01A6:0x01A9], byteorder='little')
            # Bit 9 of the FIB flags selects which table stream holds the CLX.
            tableFlag = ((int.from_bytes(
                fib[0x000A:0x000E], byteorder='little') & 0x0200) == 0x0200)
            tableName = ('0Table', '1Table')[tableFlag]
            table = document.openstream(tableName).read()
            clx = table[fcClx:fcClx + lcbClx]
            pos = 0
            pieceTable = ''
            lcbPieceTable = 0
            while True:
                if clx[pos] == 2:
                    # this entry is the piece table
                    lcbPieceTable = int.from_bytes(clx[pos + 1:pos + 5],
                                                   byteorder='little')
                    pieceTable = clx[pos + 5:pos + 5 + lcbPieceTable]
                    break
                elif clx[pos] == 1:
                    # skip this entry
                    # NOTE(review): on Python 3 clx[pos + 1] is already an int,
                    # so ord() here raises TypeError; the bare except below
                    # hides that failure -- confirm and fix.
                    pos = pos + 1 + 1 + ord(clx[pos + 1])
                else:
                    break
            i = 1
            pieceCount = (lcbPieceTable - 4) / 12
            while i <= pieceCount:
                # Character-position range covered by this piece.
                cpStart = int.from_bytes(pieceTable[i * 4:i * 4 + 4],
                                         byteorder='little')
                cpEnd = int.from_bytes(pieceTable[(i + 1) * 4:(i + 1) * 4 + 4],
                                       byteorder='little')
                offsetPieceDescriptor = int(((pieceCount + 1) * 4) + (i * 8))
                pieceDescriptor = pieceTable[
                    offsetPieceDescriptor:offsetPieceDescriptor + 8]
                fcValue = int.from_bytes(pieceDescriptor[2:6],
                                         byteorder='little')
                # Bit 30 set -> ANSI (cp1252, 1 byte/char); else UTF-16 LE.
                isANSII = (fcValue & 0x40000000) == 0x40000000
                fc = fcValue & 0xBFFFFFFF
                encoding = ('utf-16', 'cp1252')[isANSII]
                cb = cpEnd - cpStart
                cb = (cb * 2, cb)[isANSII]
                text += wordDocument[fc:fc + cb].decode(encoding)
                i += 1
            return text
        elif file.endswith('.docx'):
            text = ''
            document = Document(data)
            text += '\n\n'.join(
                [paragraph.text for paragraph in document.paragraphs])
            for table in document.tables:
                text += _parse_docx_table(table, text)
            return text
        elif file.endswith('.htm') or file.endswith('.html'):
            html = html2text.HTML2Text()
            html.ignore_links = True
            return html.handle(data.read().decode('utf-8'))
        elif file.endswith('.rtf'):
            with BytesIO() as outfp:
                document = Rtf15Reader.read(data)
                return PlaintextWriter.write(document, outfp).getvalue()
        elif file.endswith('.txt'):
            return data.read()
        else:
            raise Exception('Unknown file extension')
    except:
        pass
def convert(path, f, target=None):
    """Convert an RTF chord sheet *f* into a formatted PDF written to *path*.

    The RTF is parsed into title/key/bpm/song metadata, chords are
    optionally transposed to *target* (via the Akkorde.csv lookup table),
    the song is split into named sections (Verse/Chorus/...), and a PDF is
    rendered, shrinking the font size until everything fits on one page.

    NOTE(review): Python 2 code (print statements, dict.has_key, str.decode).
    NOTE(review): when the RTF contains no "key:" line, info_dict['key'] is
    never set, yet createPDF() reads it unconditionally -- confirm inputs
    always carry a key.
    """
    import re  # for regular expressions
    from collections import OrderedDict  # otherwise Verse1, Chorus etc. would always sort alphabetically
    from pandas import read_csv, Index
    # all imports for the PDF generation
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.units import inch
    from reportlab.pdfbase.pdfmetrics import stringWidth
    from reportlab.lib import styles, colors
    from reportlab.platypus import Paragraph
    # RTF reading package
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    # read the RTF
    doc = Rtf15Reader.read(f)
    raw = PlaintextWriter.write(doc).getvalue()
    # Groups: 1=title, 2=key line, 3=bpm line, 4=song body, 6=CCLI number,
    # 7=composer, 8=copyright.  The leading byte escapes cover UTF-8-encoded
    # umlauts in titles.
    pattern = "(^[\xc3\x9f\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xbc\xc3\xb6\xe2\x80\x99,\w\s]+\n+)(key:[\w#]+\n+)?(bpm:[\d]+\n+)?(.+)(CCLI Song # (\d+)\\n+(.+)\\n+\\xc2\\xa9 (.+))"
    match = re.search(pattern, raw, re.DOTALL)
    info_dict = {}
    info_dict['title'] = match.group(1).replace('\n', '')
    if match.group(2):
        info_dict['key'] = match.group(2).replace('\n', '').replace('key:', '')
    else:
        print "No key found"
    if match.group(3):
        info_dict['bpm'] = match.group(3).replace('\n', '').replace('bpm:', '')
    else:
        print "No bpm found"
    info_dict['song'] = match.group(4)
    info_dict['ccli_nr'] = match.group(6)
    info_dict['composer'] = match.group(7).replace('\n', '')
    info_dict['copyright'] = match.group(8)
    # Chord transposition table: one column per key.
    akkorde = read_csv("Akkorde.csv", sep=";")

    def getTransformedKey(source, target, chord):
        # Look up *chord* in the source-key column and return the chord on
        # the same row of the target-key column.
        return (akkorde[target][Index(akkorde[source]).get_loc(chord)])

    def replChords(matchObj):
        # re.sub callback: rewrite one "[chord]" occurrence into the target key.
        return ('[' + getTransformedKey(
            source=info_dict['key'], target=target,
            chord=matchObj.group(1)) + ']')

    def transform():
        # Transpose every bracketed chord in the song body, then record the
        # new key.
        info_dict['song'] = re.sub('\[([\w\d#/]+)\]', replChords,
                                   info_dict['song'])
        info_dict['key'] = target

    #target = request.form['trans_key']
    if (target and target != info_dict['key']):
        transform()
    # load the individual lines from the RTF into a list
    line_list = info_dict.get('song').split('\n\n')
    line_list
    # this pattern matches the section headings VerseX, Chorus etc. (try regexr.com)
    pattern = '^(Verse\s?\d*|Chorus\s?\d*|Instrumental|Bridge|Pre-Chorus|Intro)$'
    song_dict = OrderedDict()  # ordered so sections keep their document order
    in_element = False  # flag: are we currently inside a section?
    element = None  # name of the section we are currently in
    for i in range(len(line_list)):
        if in_element:  # while inside a section, append every following line to its entry
            if not re.search(pattern, line_list[i]):
                song_dict[element].extend([line_list[i]])
        match = re.search(
            pattern, line_list[i]
        )  # no section exists until the first heading matches
        if match:  # a heading opens a new section in the dictionary
            in_element = True
            element = match.group(1)
            song_dict[element] = [
            ]  # each section maps to a fresh list of its lines

    def createPDF(fontSize=13):
        # Render one attempt at the PDF with the given font size.  Returns
        # True when the content overflowed the page, so the caller can retry
        # with a smaller font.
        width, height = A4  # keep for later
        font = 'Helvetica'
        lineHeight = fontSize + .75 * fontSize
        wordSpace = 3
        boarder = inch  # page margin (sic)
        topBoarder = .75 * boarder
        instrSpaces = 5
        chordstyle = styles.ParagraphStyle('chord')
        chordstyle.fontSize = fontSize
        hstyle = styles.ParagraphStyle('heading')
        hstyle.fontSize = fontSize + 1
        tstyle = styles.ParagraphStyle('title')
        tstyle.fontSize = fontSize + 5
        copyrightstyle = styles.ParagraphStyle('copyright')
        copyrightstyle.fontSize = 8
        pattern = '\[([\w\d#/]+)\]'
        y = height - topBoarder - fontSize
        x = boarder
        realWidth = width - 2 * boarder
        c = canvas.Canvas(path + info_dict['title'] + '-' + info_dict['key'] +
                          '.pdf',
                          pagesize=A4)
        c.setFont(font, fontSize - 1)
        # Header block: title, key (right-aligned), bpm and composer.
        P1 = Paragraph("<u><b>" + info_dict['title'] + "</b></u>", tstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, y)
        if info_dict.has_key('key'):
            P1 = Paragraph("<b>" + info_dict['key'] + "</b>", chordstyle)
            P1.wrap(realWidth, height)
            P1.drawOn(
                c, width - boarder -
                stringWidth(info_dict['key'], font, chordstyle.fontSize), y)
        if info_dict.has_key('bpm'):
            c.drawRightString(width - boarder, y - lineHeight,
                              '%s' % info_dict['bpm'])
        P1 = Paragraph(info_dict['composer'], copyrightstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, y - lineHeight)
        c.setFont(font, fontSize)
        y -= hstyle.fontSize + 2 * lineHeight
        for key in song_dict:
            # Section heading.
            P1 = Paragraph("<b><i>" + key + "</i></b>", hstyle)
            P1.wrap(realWidth, height)
            P1.drawOn(c, x, y)
            xOfLast = boarder
            lineCount = 0
            if re.search(pattern, song_dict.get(key)[0]):
                y -= 1.8 * (
                    lineHeight
                )  # gap between heading and first line when chords are present
            else:
                y -= 1.2 * (
                    lineHeight
                )  # gap between heading and first line without chords
            if (key in ["Instrumental", "Intro"]):
                # Instrumental sections: strip the chord brackets and widen
                # the spacing between the chords.
                for line in song_dict.get(key):
                    line = line.replace('[', '').replace(']', '').replace(
                        ' ', ' ' * (instrSpaces))
                    P1 = Paragraph("<b>" + line + "</b>", chordstyle)
                    P1.wrap(realWidth, height)
                    P1.drawOn(c, x, y)
                y -= 1.5 * lineHeight  # gap after each section
            else:
                for line in song_dict.get(key):
                    # Continue on the same output line (at most 2 segments)
                    # while it still fits; otherwise advance y.
                    if ((xOfLast + stringWidth(line, font, fontSize)) <
                        (width - boarder)) and (lineCount < 2):
                        x = xOfLast
                        lineCount += 1
                    elif not re.search(pattern, line):
                        y -= 1 * lineHeight
                    else:
                        y -= 1.5 * lineHeight
                        lineCount = 1
                    line = line.decode('utf-8')
                    last_was_chord = False
                    x_min = 0
                    cursor = 0
                    # Walk the line character by character; bracketed chords
                    # are drawn slightly above the lyric baseline.
                    while cursor < len(line):
                        l = line[cursor]
                        if l == ' ':
                            if last_was_chord:
                                x += last_cord_length
                                last_was_chord = False
                            else:
                                x += wordSpace
                        elif l == '[':
                            end = line.find(']', cursor)
                            chord = line[cursor + 1:end]
                            P1 = Paragraph("<b>" + chord + "</b>", chordstyle)
                            P1.wrap(realWidth, height)
                            if x < x_min:
                                x = x_min
                            P1.drawOn(c, x, y + fontSize + 0.01 * fontSize**2)
                            cursor = end
                            last_was_chord = True
                            last_cord_length = stringWidth(
                                chord, font, fontSize)
                            # Keep the next chord from overlapping this one.
                            x_min = x + last_cord_length + wordSpace * 7
                        else:
                            last_was_chord = False
                            c.drawString(x, y, l)
                            x += stringWidth(l, font, fontSize)
                        cursor += 1
                    xOfLast = x + wordSpace
                    x = boarder
                y -= 1.5 * lineHeight  # gap after each section
        P1 = Paragraph(
            ('© ') + info_dict['copyright'] +
            '<br/>Gebrauch nur zur Nutzung im Rahmen von Veranstaltungen der City Chapel Stuttgart',
            copyrightstyle)
        P1.wrap(realWidth, height)
        P1.drawOn(c, x, boarder - P1.height)  # + lineHeight)
        c.showPage()
        c.save()
        return (y < boarder)

    # Shrink the font in 0.5pt steps until the song fits on the page.
    nochmal = True
    fontSize = 13
    while (nochmal):
        nochmal = createPDF(fontSize)
        fontSize -= .5
def extractstringfromRTF(path, filename):
    """Return the plain-text content of the RTF file at ``path + filename``.

    Bug fix: the previous version called ``.read()`` on the buffer returned
    by ``PlaintextWriter.write``; that buffer's position is at its end after
    writing, so ``read()`` yielded an empty string.  ``getvalue()`` returns
    the full contents regardless of stream position (and is what every other
    caller in this file uses).  The file handle is now also closed
    deterministically via a context manager.
    """
    with open(path + filename, "rb") as rtf_file:
        document_object = Rtf15Reader.read(rtf_file)
    transcriptstring = PlaintextWriter.write(document_object).getvalue()
    return transcriptstring
def parse():
    """Cross-reference committee-request CSV files with meeting RTF minutes.

    Pass 1: collect per-year committee ids from ./csv/*<year>.csv.
    Pass 2: scan ./rtf minutes for short "approved" lines mentioning those
    ids, log hits to ./log.txt and record the meeting date/id per request.
    Pass 3: rewrite each CSV as *_out.csv with the matched columns appended.

    Relies on module-level ``approvedLine`` (regex) and ``headerColumns``.
    NOTE(review): Python 2 code (``unicode``, ``reader.next()``).
    NOTE(review): ``PlaintextWriter.write(doc)`` returns a buffer whose
    position is at its end; iterating it directly may yield no lines unless
    the writer rewinds -- verify against the pyth version in use.
    """
    committeeIds = {}
    # Pass 1: committee ids per year (column 8, rows where column 6 == '2').
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            committeeIds[year] = set()
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)
    dictionary = {}
    # Pass 2: scan the RTF minutes, logging every matched line.
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue
                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}
                with open(os.path.join(rtfDir, fileName)) as rtfFile:
                    # File names carry date parts and (optionally) a meeting
                    # id; the conditional expression applies to meetingId only.
                    parsedName = re.findall(u'\d+', fileName)
                    date, meetingId = str('/'.join(
                        parsedName[0:3][::-1])), parsedName[-1] if len(
                            parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        # Unparseable minutes are skipped silently.
                        continue
                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            # Ids mentioned on this line that are known
                            # committee ids for the year.
                            res = list(
                                set(re.findall(u'\d+', line))
                                & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' +
                                                 line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [
                                        date, meetingId
                                    ]
    # Pass 3: append the matched [date, meetingId] columns to each CSV row.
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue
            year = csvFileName[7:11]
            if not year in dictionary:
                continue
            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(
                        os.path.join(
                            csvDir, csvFileName[:-4] + '_out' +
                            csvFileName[-4:]), 'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    # Header row first, extended with the new column names.
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(
                            row + (['', ''] if committeeId not in
                                   dictionary[year] else
                                   dictionary[year][committeeId]))
def extract_content(self, filepath):
    """Read the RTF document at *filepath* and return its plain text,
    encoded via to_utf8."""
    source = open(filepath, "rb")
    try:
        parsed = Rtf15Reader.read(source)
    finally:
        source.close()
    plain = PlaintextWriter.write(parsed).getvalue()
    return to_utf8(plain)
def convertRtfToText(path):
    """Convert the RTF file at *path* to plain text and return it.

    Fixes: the file handle was previously opened in text mode and never
    closed.  It is now opened in binary mode ('rb', matching how the RTF
    reader is used elsewhere in this file) and closed deterministically via
    a context manager.
    """
    with open(path, 'rb') as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    return PlaintextWriter.write(doc).getvalue()
"""Parse ``test2.rtf`` with the pyth RTF reader and dump both the document
object and its plain-text rendering to stdout."""
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter
import re

rtf_handle = open('test2.rtf', 'rb')
try:
    # clean_paragraphs merges fragmented runs into whole paragraphs.
    document = Rtf15Reader.read(rtf_handle, clean_paragraphs=True)
    print(document)
    print(PlaintextWriter.write(document, newline="\n").getvalue(), end="")
finally:
    rtf_handle.close()
def main():
    """Walk the dev1..dev6 interview folders, extract "Clip Transcript"
    sections from the .rtf files, and upload one Audio record per .mp3 file
    through the ``api`` client.

    NOTE(review): Python 2 code (print statements, ``unicode``).
    NOTE(review): the local name ``list`` shadows the builtin.
    """
    y = 0  # running interview counter used for api.get('interviews', y + 1)
    contains = False  # reusable "found a matching date" flag
    for a in range(1, 7):
        list = []  # [interview_date, clip_transcript] pairs for this dev
        dev = '{}{}'.format("dev", a)
        root = api.path + '/{}/interviews/'.format(dev)
        dirlist = [
            item for item in os.listdir(root)
            if os.path.isfile(os.path.join(root, item))
        ]
        # Pass 1: pull every "Clip Transcript" section out of the RTF files.
        for i in dirlist:
            if i.endswith('.rtf'):
                x = 0
                print root + i
                doc = PlaintextWriter.write(Rtf15Reader.read(
                    open(root + i))).getvalue()
                # The transcript sits between the "Clip Transcript" and
                # "Clip Keywords" markers; the date is embedded 35..25 chars
                # before the transcript start (underscores -> dashes).
                first_index = doc.find("Clip Transcript") + 21
                second_index = doc.find("Clip Keywords") - 19
                interview_date = doc[first_index - 35:first_index -
                                     25].replace("_", "-")
                clip_transcript = doc[first_index:second_index]
                list.append([interview_date, clip_transcript])
                # Remaining occurrences: clips sharing a date are merged
                # into one entry, separated by a dashed line.
                while x < doc.count("Clip Transcript"):
                    if doc.find("Clip Transcript", second_index) > 0:
                        first_index = doc.find("Clip Transcript",
                                               second_index) + 21
                    else:
                        break
                    if doc.find("Clip Keywords", first_index) > 0:
                        second_index = doc.find("Clip Keywords",
                                                first_index) - 16
                    else:
                        break
                    interview_date = doc[first_index - 35:first_index -
                                         25].replace("_", "-")
                    clip_transcript = doc[first_index:second_index]
                    print interview_date, clip_transcript
                    for sublist in list:
                        if sublist[0] == interview_date:
                            sublist[1] = sublist[
                                1] + "\n------------------\n" + clip_transcript
                            contains = True
                    if not contains:
                        list.append([interview_date, clip_transcript])
                    contains = False
                    x = x + 1
                print list
                # print list[0][0], list[0][1]
        # Pass 2: upload each mp3, attaching the transcript whose date
        # matches the first 10 characters of the file name.
        for i in dirlist:
            if i.endswith('.mp3'):
                tag = i[:10].replace("_", "-")
                index = 0
                for sublist in list:
                    if tag.strip() == sublist[0].strip():
                        description = unicode(list[index][1], errors="ignore")
                        contains = True
                    index = index + 1
                if not contains:
                    description = ""
                contains = False
                data_audio = {
                    "description": description,
                    "duration":
                    (MP3(api.path +
                         '/{}/interviews/{}'.format(dev, i))).info.length,
                    "id": "",
                    "interview": api.get('interviews', y + 1),
                    "status": "PRIVATE",
                    "tag": i,
                    "uri":
                    "http://opendata.soccerlab.polymtl.ca/audios/" + i,
                    "author": "",
                    "license": ""
                }
                api.request("Audio", 'audios', data_audio)
                y = y + 1
def parse_rtf(file):
    """Parse an open RTF stream and return its plain text as a list of lines."""
    parsed = Rtf15Reader.read(file)
    plain_text = PlaintextWriter.write(parsed).getvalue()
    return plain_text.split('\n')
"""Build the sample document from pythonDoc and print its plain-text form."""
from __future__ import absolute_import
from __future__ import print_function

from pyth.plugins.plaintext.writer import PlaintextWriter

import pythonDoc

document = pythonDoc.buildDoc()
rendered = PlaintextWriter.write(document).getvalue()
print(rendered)
"""Batch-convert every .rtf file in ``sys.argv[1]`` to a .txt file in
``sys.argv[2]``.

Fixes: the input was previously opened with ``codecs.open`` (a decoded text
stream) although the RTF reader consumes raw bytes; the output name was
derived with ``split("/")[1]``, which breaks for nested input directories;
and neither handle was closed when parsing raised.  Inputs are now opened
in binary mode, the basename is taken with ``os.path``, and both handles
are managed by ``with`` blocks.
"""
import glob
import codecs
import os
import sys
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.plaintext.writer import PlaintextWriter

in_dir, out_dir = sys.argv[1:3]
for in_filename in glob.glob("{}/*.rtf".format(in_dir)):
    # Output name = input basename without its .rtf extension.
    out_filename = os.path.basename(in_filename).rsplit(".", 1)[0]
    # The RTF reader parses raw bytes, so open the input in binary mode.
    with open(in_filename, "rb") as in_file:
        doc = Rtf15Reader.read(in_file)
    with codecs.open("{}/{}.txt".format(out_dir, out_filename),
                     "w") as out_file:
        PlaintextWriter.write(doc, out_file)
def document_create_index(document, user_id=None):
    """Extract text from an uploaded doc_document record and index it in Solr.

    ``document`` is a JSON string carrying ``id``, ``name`` and ``filename``.
    Extraction is dispatched on the file extension; the text is added to the
    Solr index and the record is flagged as indexed in the database.

    NOTE(review): the pdf/doc/else branches build shell commands from the
    raw filename via os.popen -- a filename containing shell metacharacters
    is a command-injection risk; confirm filenames are sanitised upstream.
    NOTE(review): Python 2 code (``unicode``); the rtf branch leaks its
    file handle.
    """
    import os
    from xlrd import open_workbook
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    import sunburnt
    document = json.loads(document)
    table = s3db.doc_document
    id = document["id"]
    name = document["name"]
    filename = document["filename"]
    # Resolve the stored name against the application's uploads folder.
    filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                     request.application, filename)
    si = sunburnt.SolrInterface(settings.get_base_solr_url())
    extension = os.path.splitext(filename)[1][1:]
    # Per-format text extraction.
    if extension == "pdf":
        data = os.popen("pdf2txt.py " + filename).read()
    elif extension == "doc":
        data = os.popen("antiword " + filename).read()
    elif extension == "xls":
        wb = open_workbook(filename)
        data=" "
        for s in wb.sheets():
            for row in range(s.nrows):
                values = []
                for col in range(s.ncols):
                    values.append(str(s.cell(row, col).value))
                data = data + ",".join(values) + "\n"
    elif extension == "rtf":
        doct = Rtf15Reader.read(open(filename))
        data = PlaintextWriter.write(doct).getvalue()
    else:
        data = os.popen("strings " + filename).read()
    # The text needs to be in unicode or ascii, with no contol characters
    data = str(unicode(data, errors="ignore"))
    data = "".join(c if ord(c) >= 32 else " " for c in data)
    # Put the data according to the Multiple Fields
    # @ToDo: Also, would change this according to requirement of Eden
    document = {"id": str(id), # doc_document.id
                "name": data, # the data of the file
                "url": filename, # the encoded file name stored in uploads/
                "filename": name, # the filename actually uploaded by the user
                "filetype": extension # x.pdf -> pdf is the extension of the file
                }
    # Add and commit Indices
    si.add(document)
    si.commit()
    # After Indexing, set the value for has_been_indexed to True in the database
    db(table.id == id).update(has_been_indexed = True)
    db.commit()
def parse(self):
    """Parse the RTF file named in ``self._config['filePath']``, cache its
    plain text on the instance and return it.

    Fixes: the original assigned ``self.__parserData`` but returned
    ``self.__parsedData`` (a typo), which raised AttributeError on every
    call; it also never closed the file handle.  The attribute name is now
    consistent and the file is closed via a context manager.
    """
    with open(self._config['filePath'], "rb") as rtf_file:
        doc = Rtf15Reader.read(rtf_file)
    self.__parsedData = PlaintextWriter.write(doc).getvalue()
    return self.__parsedData
def document_create_index(document, user_id=None):
    """Extract text from an uploaded doc_document record and index it in Solr.

    ``document`` is a JSON string carrying ``id``, ``name`` and ``filename``.
    Extraction is dispatched on the file extension; the text is added to the
    Solr index and the record is flagged as indexed in the database.

    NOTE(review): the pdf/doc/else branches build shell commands from the
    raw filename via os.popen -- a filename containing shell metacharacters
    is a command-injection risk; confirm filenames are sanitised upstream.
    NOTE(review): Python 2 code (``unicode``); the rtf branch leaks its
    file handle.
    """
    import os
    from xlrd import open_workbook
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter
    import sunburnt
    document = json.loads(document)
    table = s3db.doc_document
    id = document["id"]
    name = document["name"]
    filename = document["filename"]
    # Resolve the stored name against the application's uploads folder.
    filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                     request.application, filename)
    si = sunburnt.SolrInterface(settings.get_base_solr_url())
    extension = os.path.splitext(filename)[1][1:]
    # Per-format text extraction.
    if extension == "pdf":
        data = os.popen("pdf2txt.py " + filename).read()
    elif extension == "doc":
        data = os.popen("antiword " + filename).read()
    elif extension == "xls":
        wb = open_workbook(filename)
        data = " "
        for s in wb.sheets():
            for row in range(s.nrows):
                values = []
                for col in range(s.ncols):
                    values.append(str(s.cell(row, col).value))
                data = data + ",".join(values) + "\n"
    elif extension == "rtf":
        doct = Rtf15Reader.read(open(filename))
        data = PlaintextWriter.write(doct).getvalue()
    else:
        data = os.popen("strings " + filename).read()
    # The text needs to be in unicode or ascii, with no contol characters
    data = str(unicode(data, errors="ignore"))
    data = "".join(c if ord(c) >= 32 else " " for c in data)
    # Put the data according to the Multiple Fields
    # @ToDo: Also, would change this according to requirement of Eden
    document = {
        "id": str(id), # doc_document.id
        "name": data, # the data of the file
        "url": filename, # the encoded file name stored in uploads/
        "filename": name, # the filename actually uploaded by the user
        "filetype": extension # x.pdf -> pdf is the extension of the file
        }
    # Add and commit Indices
    si.add(document)
    si.commit()
    # After Indexing, set the value for has_been_indexed to True in the database
    db(table.id == id).update(has_been_indexed=True)
    db.commit()
########################################################################### # Simple program to convert rtf text file to plain text format # Input is a file with the .rtf extension # Output is a plain text file with the same name and the .txt extension # This program uses the package pyth # Install by: # pip install pyth ########################################################################### from pyth.plugins.rtf15.reader import Rtf15Reader from pyth.plugins.plaintext.writer import PlaintextWriter import sys if __name__ == '__main__': if len(sys.argv) != 2: print 'Convert an rtf file to plain text' print 'Usage : %s filename.rtf' % sys.argv[0] else: input_file = open(sys.argv[1],'r') output_file = open(input_file.name.replace('.rtf','') + '.txt','w') doc = Rtf15Reader.read(input_file) output_file.write(PlaintextWriter.write(doc).getvalue()) output_file.close() input_file.close()