def get_mark_down(filename=DEFAULT_FILE):
    with open(filename) as f:
        content = f.read()  # in case a string was passed in
    if Path(filename).suffix == '.rtf':
        content = rtf_to_text(content)
    else:
        print(f"> > > > - - - - - - - - - - < < < < {Path(filename).suffix} > > > >")
    # remove anything inside 'comment' delimiters: //* this is a comment *//
    content = re.sub(r'//\*.*?\*//', '', content, flags=re.MULTILINE | re.DOTALL)
    for line in content.splitlines():
        print(line)
    print("\n\n\n\n\n\n")
    return content
def cleanup_message_body(body: AnyStr, body_type: BodyType, size_threshold: int = 0) -> str:
    # Decode first
    body = decode(body)
    if body_type is BodyType.RTF:
        # Strip formatting
        body = rtf_to_text(body)
    elif body_type is BodyType.HTML:
        # Strip markup
        body = BeautifulSoup(body, "html.parser").get_text()
    # Strip what might be lines of base64-encoded data
    if len(body) > size_threshold:
        body = re.sub(r"^[>\s]*[A-Za-z0-9+/]{76,}\n?", "", body, flags=re.MULTILINE)
    # Strip uuencoded attachments
    if len(body) > size_threshold:
        body = re.sub(r"begin [0-7]{3}.*?end", "", body, flags=re.DOTALL)
    # Strip notes/calendar data
    if len(body) > size_threshold:
        body = re.sub(r"<(OMNI|omni)([^>]*?)>.*?</\1\2>(\s)*", "", body, flags=re.DOTALL)
    return body.strip()
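# A minimal usage sketch for cleanup_message_body, assuming the surrounding
# module's BodyType enum has an RTF member and that its decode() helper passes
# an already-decoded str through unchanged; the sample RTF string below is
# illustrative, not from the source.
sample = r"{\rtf1\ansi Hello \b world\b0 .}"
print(cleanup_message_body(sample, BodyType.RTF))  # expected roughly: "Hello world ."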
def decode(cell):
    # Progress reporting
    global numSuccess
    global numError
    global totalRows
    global numEmpty
    sys.stdout.write(
        f'Successful: {numSuccess}; Errors: {numError}; Empty: {numEmpty}; '
        f'Percent Done: {round((numSuccess + numError + numEmpty) / totalRows * 100, 2)}% \r'
    )
    sys.stdout.flush()
    # Return empty string for empty-ish values
    if not isinstance(cell, str) or cell == '0x00' or pd.isnull(cell):
        numEmpty += 1
        return ''
    try:
        html_reg = re.compile('<.*?>')  # Regex to match HTML tags
        cell = cell[2:] if cell[:2] == '0x' else cell  # Remove the 0x prefix
        cell = bytes.fromhex(cell).decode('latin1', errors='replace')  # Decode from hex
        try:
            cell = rtf_to_text(cell)  # Strip RTF tags
        except TypeError as e:
            # Some 'NoneType' values still slip through; count them as empty, not errors
            errors.write(f'{numEmpty + numError + numSuccess},{str(e).replace(",", " ")}\n')
            numEmpty += 1
            return ''
        cell = re.sub(html_reg, '', cell)  # Strip HTML tags
        cell = cell.strip()  # Remove leading and trailing whitespace
        numSuccess += 1
        return cell
    except Exception as e:
        # Write errors to errors.csv
        numError += 1
        errors.write(f'{numEmpty + numError + numSuccess},{str(e).replace(",", " ")}\n')
        print(e)
        return ''
def get_text(path, coding='utf-8'):
    # the list of text to return
    text_list = list()
    with open(path, encoding=coding) as file:
        for strings in file:
            string = rtf_to_text(strings)
            try:
                lines = string.split('.')
                if lines[0][0] == lines[0][0].lower():
                    # append to the previous sentence, then add the new ones
                    text_list[-1] += lines[0]
                    for elem in lines[1:]:
                        if elem:
                            text_list.append(elem)
                elif lines[0][0] != lines[0][0].lower():
                    # add the new sentences
                    for line in lines:
                        text_list.append(line)
                else:
                    # join a single letter onto the previous sentence
                    text_list[-1] += lines[0]
            except IndexError:
                del lines
    return text_list
def find_content(filepaths):
    print('Reading files within the Repository for content ...')
    documents = []
    for fp in filepaths:
        # Split the extension from the path and normalise it to lowercase.
        ext = os.path.splitext(fp)[-1].lower()
        # Now we can simply use == to check for equality, no need for wildcards.
        if ext == ".pdf":
            document = read_pdf_data(fp)
        elif ext == '.rtf':
            with open(fp, 'r') as file:
                text = file.read()
            document = rtf_to_text(text).replace('\n', ' ').replace('\t', ' ')
        elif ext == '.docx':
            document = getText(fp)
        else:
            # Fall back to the folder's metadata.csv for a description.
            meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
            des = pd.read_csv(meta_path)
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
        documents.append(document)
    return documents
def extract_text(self) -> str:
    txt = self.data.decode(DEFAULT_TEXT_ENCODING)
    # Hack to handle Apple's extensions to the RTF format
    txt = txt.replace("\\\n\\\n", "\\\n\\par\n")
    return rtf_to_text(txt)
def get_mark_down(filename=DEFAULT_DOC_TO_PROCESS):
    print(f"FILE_LOC***\n***\n{filename}\n***\n***\n")
    with open(filename) as f:
        content = f.read()  # in case a string was passed in
    if Path(filename).suffix == '.rtf':
        content = rtf_to_text(content)
    else:
        print(f"> > > > - - - - - - - - - - < < < < {Path(filename)} {filename} {Path(filename).suffix} > > > >")
    # remove anything inside 'comment' delimiters: //* this is a comment *//
    content = re.sub(r'//\*.*?\*//', '', content, flags=re.MULTILINE | re.DOTALL)
    return content
def test_table(self):
    simple_table_rtf = RTF_DIR / 'simple_table.rtf'
    simple_table_txt = TEXT_DIR / 'simple_table.txt'
    with simple_table_rtf.open() as source:
        result = rtf_to_text(source.read())
    with simple_table_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def test_extract_simple_table(self):
    simple_table_rtf = RTF_DIR / "line_break_textedit_mac.rtf"
    simple_table_txt = TXT_DIR / "line_break_textedit_mac.txt"
    with simple_table_rtf.open() as source:
        result = rtf_to_text(source.read())
    with simple_table_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def test_empty(self):
    example_rtf = RTF_DIR / "french.rtf"
    example_txt = TEXT_DIR / "french.txt"
    with example_rtf.open() as source:
        result = rtf_to_text(source.read())
    with example_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def get_sentiment_no_tokenize(self):
    article_sentiments = {}
    for filename in self.filenames:
        with open('articles/' + self.city_name + '/' + filename, 'r') as file:
            # read the whole file once, joining lines
            rtf = file.read().replace('\n', '')
        data = rtf_to_text(rtf)
        words = self.tokenizer.word_tokenize(data)
def test_full_table(self):
    example_rtf = RTF_DIR / "nested_table.rtf"
    example_txt = TEXT_DIR / "nested_table.txt"
    with example_rtf.open() as source:
        result = rtf_to_text(source.read())
    with example_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def get_text_content_of_file(rtf_filepath):
    with open(rtf_filepath, 'r') as f:
        rtf = f.read()
    return rtf_to_text(rtf)  # convert to plain text and return
def parseStickNotes(self):
    output = self.volumeInfo
    result = []
    try:
        bias = datetime.timedelta(hours=-self.bias)
    except TypeError:
        pass
    # check the filesystem type reported in the volume info
    fs_type = output.split(" ")[0]
    if "FAT" in fs_type or "NTFS" in fs_type:
        os.chdir("%s/%s/" % (self.mountDir, output.split(" ")[2]))
        logger.info("Loading every user info!")  # TODO: It should be per user!
        try:
            os.chdir("Users/")
        except FileNotFoundError:
            logger.error("Couldn't find Users folder!")
            return None
        for userDir in os.listdir("."):
            snt_path = "{0}/AppData/Roaming/Microsoft/Sticky Notes/StickyNotes.snt".format(userDir)
            if not os.access(snt_path, os.F_OK | os.R_OK):
                logger.warning("Couldn't find StickyNotes file on %s" % userDir)
                continue
            doc = compoundfiles.CompoundFileReader(snt_path)
            for item in doc:
                if item.isdir:
                    logger.info("Directory name: {0}.".format(item.name))
                    logger.info("Directory last modified time: {0}.".format(item.modified))
                    for sub_item in item:
                        content = doc.open(sub_item).read()
                        logger.info("Entry name: {0}.".format(sub_item.name))
                        if "Rich Text" in magic.from_buffer(content):
                            logger.debug("This is an RTF file. Stripping to plain text.")
                            logger.info("Entry content: {0}.".format(rtf_to_text(content.decode())))
                        else:
                            logger.info("Entry type: {0}.".format(magic.from_buffer(content)))
                            logger.info("Entry content: {0}.".format(content))
                elif item.isfile:
                    logger.info("Entry name: {0}.".format(item.name))
                    logger.info("Entry content: {0}.".format(doc.open(item).read()))
                    logger.info("Entry type: {0}.".format(magic.from_buffer(doc.open(item).read())))
                else:
                    continue
def find_keywords(filepaths):
    print("Finding Keywords...")
    file_keywords = []
    files = []
    documents = []
    for fp in filepaths:
        ext = os.path.splitext(fp)[-1].lower()
        if ext == ".pdf":
            text = ''
            pdfFileObj = open(fp, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num = pdfReader.numPages
            for i in range(num):
                pageObj = pdfReader.getPage(i)
                text += pageObj.extractText()
            pdfFileObj.close()
            if text != '':
                document = summarize(text)
            else:
                # Fall back to the folder's metadata.csv for a description.
                meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
                des = pd.read_csv(meta_path, encoding='unicode_escape')
                try:
                    description = des['Description'][0]
                except KeyError:
                    description = des['Title'][0]
                document = description
            keywords_set = clean_keywords(keywords_from_summary(document))
            files.append(fp)
        elif ext == '.rtf':
            files.append(fp)
            with open(fp, 'r') as file:
                text = file.read()
            document_t = rtf_to_text(text).replace('\n', ' ').replace('\t', ' ')
            keywords_set = clean_keywords(keywords_from_summary(summarize(document_t)))
            document = document_t
        elif ext == '.docx':
            text = getText(fp)
            document = text
            keywords_set = clean_keywords(keywords_from_summary(summarize(text)))
            files.append(fp)
        else:
            files.append(fp)
            meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
            des = pd.read_csv(meta_path, encoding='unicode_escape')
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
            keywords_set = clean_keywords(keywords_from_summary(description))
        file_keywords.append(keywords_set)
        documents.append(document)
    return file_keywords, documents
def test_speiseplan(self):
    example_rtf = RTF_DIR / "Speiseplan_KW_32-33_Eybl.rtf"
    example_txt = TEXT_DIR / "Speiseplan_KW_32-33_Eybl.txt"
    with example_rtf.open() as source:
        result = rtf_to_text(source.read())
    with example_txt.open() as destination:
        self.maxDiff = None
        self.assertEqual(destination.read(), result)
def test_extract_simple_table(self):
    simple_table_rtf = RTF_DIR / "test_line_breaks_google_docs.rtf"
    simple_table_txt = TXT_DIR / "test_line_breaks_google_docs.txt"
    with simple_table_rtf.open() as source:
        result = rtf_to_text(source.read())
    with open("o.txt", "w") as f:
        f.write(result)
    with simple_table_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def parser_rtf(link: str) -> str:
    '''Extract text from .rtf files.'''
    with open(link, encoding='utf-8') as f:
        text = f.read()
    # Cap very large files at 100,000 characters before stripping RTF markup.
    if len(text) > 100_000:
        description = text[:100_000].rstrip()
    else:
        description = text.rstrip()
    text = rtf_to_text(description)
    return text
def test_full_table(self):
    example_rtf = RTF_DIR / "calcium_score.rtf"
    example_txt = TEXT_DIR / "calcium_score.txt"
    with example_rtf.open() as source:
        result = rtf_to_text(source.read())
    with open("foo.text", "w") as f:
        f.write(result)
    with example_txt.open() as destination:
        self.assertEqual(destination.read(), result)
def parse_file(input_dir: str, input_file_name: str, output_dir: str):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    input_file_path = os.path.join(input_dir, input_file_name)
    output_file_path = os.path.join(output_dir, input_file_name.replace('.rtf', '.txt'))
    with open(input_file_path, 'r', encoding='ansi') as in_file:
        text = rtf_to_text(in_file.read())
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.write(text)
def iter_books(zip):
    '''Returns an iterator of book files as strings.'''
    # keep only the per-book files, skipping foreword, appendixes, etc.;
    # the dot is escaped so '.rtf' matches literally
    fps = [
        f.filename for f in zip.filelist
        if re.match(r'nwt_\d{2}_\w+_E\.rtf', f.filename)
    ]
    for fp in sorted(fps):  # zip entries could be in any order
        book_rtf_bytes = zip.read(fp)
        book_rtf_str = book_rtf_bytes.decode('utf-8')
        yield rtf_to_text(book_rtf_str)
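# A minimal usage sketch for iter_books; the archive name 'nwt_E.zip' is a
# hypothetical stand-in for whatever zip holds the per-book RTF files.
import zipfile

with zipfile.ZipFile('nwt_E.zip') as z:
    for book_text in iter_books(z):
        print(book_text[:80])  # first 80 characters of each book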
def process_rtf_to_dataframe(fpath, docid_patt, date_patt, time_patt):
    """
    Returns the document ID, date, time, and article text in a pandas DataFrame

    :param fpath: string, path to .RTF file
    :param docid_patt: string, regex pattern for document ID
    :param date_patt: string, regex pattern for date
    :param time_patt: string, regex pattern for time
    :return: pandas DataFrame
    """
    # read the raw content of the .RTF file
    with open(fpath, encoding="utf8") as f:
        try:
            rtf = f.read()
        except ValueError as err:
            print("Error ({fpath}): {err}".format(fpath=fpath, err=err))
            raise
        except:
            print("Unexpected error ({fpath}): {err}".format(fpath=fpath, err=sys.exc_info()[0]))
            raise
    # strip formatting to get plain text
    text = rtf_to_text(rtf).strip()
    # get the document ID corresponding to each article
    doc_ids = re.findall(re.compile(docid_patt, re.M), text)
    # segmentation - split the text at each document ID, into individual articles
    articles = re.split("|".join(doc_ids), text)
    # the last item in the list after the split should be blank (i.e. ''), so drop it
    articles = articles[:-1]
    if len(articles) != len(doc_ids):
        print(
            'Text is not segmented appropriately, check regex "{docidpatt}": '
            "{n_docid} document ids vs {n_articles} articles".format(
                docidpatt=docid_patt, n_docid=len(doc_ids), n_articles=len(articles)
            )
        )
        return
    # strip blank lines/spaces from the beginning/end of each article
    articles = [a.strip() for a in articles]
    # extract the date from each article
    article_dates = list(map(lambda x: find_in_text(date_patt, x), articles))
    # extract the time from each article
    article_times = list(map(lambda x: find_in_text(time_patt, x), articles))
    # assemble the dataframe
    data = pd.DataFrame(
        zip(doc_ids, article_dates, article_times, articles),
        columns=["document_id", "date", "time", "text"],
    )
    return data
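# A hypothetical call to process_rtf_to_dataframe; the file name and the three
# regex patterns below are illustrative stand-ins, not from the source.
df = process_rtf_to_dataframe(
    "articles.rtf",                  # path to the exported .RTF file
    r"Document \w{10}",              # document ID pattern
    r"\d{1,2} [A-Z][a-z]+ \d{4}",    # date pattern, e.g. "3 March 2021"
    r"\d{2}:\d{2}",                  # time pattern, e.g. "14:05"
)
if df is not None:                   # returns None when segmentation fails
    print(df.head())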
def get_text(rs):
    bb = []
    for r in rs:
        # skip files whose name starts with an underscore
        if fname(r)[0] == '_':
            continue
        print(fnamene(r))
        a = file_to_text(r)
        b = rtf_to_text(a)
        bb.append(b)
    b = '\n\n'.join(bb)
    b = b.lower()
    return b
def butterfly():
    "Count how many times the word 'butterfly' appears in the document"
    document = request.data.decode()  # Make bytes into text
    if request.content_type == 'text/plain':
        text, decoder = document, None
    elif request.content_type == 'text/html':
        # an explicit parser avoids BeautifulSoup's "no parser specified" warning
        soup = BeautifulSoup(document, "html.parser")
        text, decoder = soup.text, "BeautifulSoup"
    elif request.content_type == 'text/rtf':
        text, decoder = rtf_to_text(document), "striprtf"
    else:
        abort(400)  # 400 Bad Request
    count = text.lower().count('butterfly')
    return jsonify({"count": count, "decoder": decoder})
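# A minimal sketch of exercising the endpoint, assuming it is registered on a
# Flask app as app.route('/butterfly', methods=['POST']); the route name and
# the sample RTF body are hypothetical. Flask's built-in test client means no
# server needs to be running.
with app.test_client() as client:
    resp = client.post(
        '/butterfly',
        data=r"{\rtf1\ansi A butterfly, then another butterfly.}",
        content_type='text/rtf',
    )
    print(resp.get_json())  # e.g. {"count": 2, "decoder": "striprtf"}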
def process_answer_key(PATH_ANSKEY):
    # RTF support; the with-block ensures the file handle is closed
    with open(PATH_ANSKEY, "r") as rtf:
        text = rtf_to_text(rtf.read())
    # Separate each answer along with its marks from the answer key
    indexes = list(find_all(text, '(M:'))
    marks = get_marks_from_anskey(text, indexes)
    reference_answers = get_answers_from_anskey(text, indexes)
    return marks, reference_answers
def downloadArticle(articleId):
    global args, exit_flag
    if exit_flag:
        return
    url = "https://www.gopress.be/Public/download-article.php?articleOriginalId={}&format=rtf".format(articleId)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9,nl;q=0.8,fr;q=0.7',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4277.0 Safari/537.36 Edg/87.0.658.0',
        'Cookie': 'PHPSESSID={}'.format(php_sess_id),
    }
    # data=None keeps this a GET request
    req = request.Request(url, None, headers)
    with request.urlopen(req) as response:
        text = response.read()
        decoded = text.decode('utf-8')
        if decoded == 'Action not authorized when user not authenticated':
            print('Not authenticated, please make sure you are logged in and have an arbitrary article open')
            exit_flag = True
            return
        elif decoded.startswith('<html><head><title>Download error</title></head>'):
            log("Following article couldn't be downloaded: {}".format(articleId))
            return
        elif decoded == 'Article not found : Article not found':
            log("Following article couldn't be found: {}".format(articleId))
            return
        filename = buildFilename(rtf_to_text(decoded).split('\n'))
        if filename == 'unknown':
            log("Following article couldn't be parsed: {}".format(articleId))
            return
        path = os.path.join(args.directory, filename)
        if os.path.isfile(path):
            log('Following article was already downloaded: {}'.format(filename))
            return
        with open(path, 'wb') as f:
            f.write(text)
def get_text(path, coding='utf-8'):
    # the list of text to return
    text_list = list()
    transitional_list = list()
    styles_list = list()
    with open(path, encoding=coding) as file:
        for strings in file:
            string = rtf_to_text(strings)
            try:
                lines = string.split('.')
                if lines[0][0] == lines[0][0].lower():
                    # append to the previous sentence, then add the new ones
                    transitional_list[-1] += lines[0]
                    for elem in lines[1:]:
                        if elem:
                            transitional_list.append(elem)
                elif lines[0][0] != lines[0][0].lower():
                    # add the new sentences
                    for line in lines:
                        transitional_list.append(line)
                else:
                    # join a single letter onto the previous sentence
                    transitional_list[-1] += lines[0]
            except IndexError:
                del lines
    # separate styles from text
    all_text = ''
    for line in transitional_list:
        if re.search(';', line):
            styles_list.append(line)
        else:
            line += '.'
            all_text += line
    raw_text_list = all_text.split('. ')  # split the text into sentences
    for string in raw_text_list:
        # split apart the first and last sentences of each paragraph
        lines = string.split('\n.')
        for line in lines:
            text_list.append(line)
    return text_list, styles_list
def get_word_frequency(self, word):
    city_freq_dict = {}
    word_count = 0
    trip = False
    for filename in self.filenames:
        if trip:
            return city_freq_dict
        with open(self.directory + '/' + self.city_name + '/' + filename, 'r') as file:
            rtf = file.read().replace('\n', '')
        data = rtf_to_text(rtf)
        words = self.clean_data(data)
        count = words.count(word)
        word_count += count
    print(self.city_name + ": ", word_count / len(self.filenames))
def get_sentiment_tokenize(self):
    article_sentiments = {}
    trip = False
    for filename in self.filenames:
        if trip:
            return article_sentiments
        with open(self.directory + '/' + self.city_name + '/' + filename, 'r') as file:
            rtf = file.read().replace('\n', '')
        data = rtf_to_text(rtf)
        sentences = self.tokenizer.tokenize(data)
        num_sentences = len(sentences)
        compound_score = 0
        for sentence in sentences:
            scores = self.sid.polarity_scores(sentence)
            compound_score += scores['compound']
        # average compound sentiment per article
        article_sentiments[filename] = compound_score / num_sentences
    return article_sentiments