Example #1
def get_mark_down(filename=DEFAULT_FILE):

    with open(filename) as f:
        content = f.read()

    # in case a string was passed in
    if Path(filename).suffix == '.rtf':
        content = rtf_to_text(content)
    else:
        print(
            f"> > > > - - - - - - - - - - < < < < {Path(filename).suffix} > > > >"
        )

    replacement = ''
    # remove anything inside 'comment' delimiters //* this is a comment *//
    content = re.sub(r'\/\/\*.*?\*\/\/',
                     replacement,
                     content,
                     flags=re.MULTILINE | re.DOTALL)

    for line in iter(content.splitlines()):
        print(line)

    print("\n\n\n\n\n\n")

    return content
Example #2
def cleanup_message_body(body: AnyStr,
                         body_type: BodyType,
                         size_threshold: int = 0) -> str:
    # Decode first
    body = decode(body)

    if body_type is BodyType.RTF:
        # Strip formatting
        body = rtf_to_text(body)

    elif body_type is BodyType.HTML:
        # Strip markup
        body = BeautifulSoup(body, "html.parser").get_text()

    # Strip what might be lines of base64 encoded data
    if len(body) > size_threshold:
        body = re.sub(r"^[>\s]*[A-Za-z0-9+/]{76,}\n?",
                      "",
                      body,
                      flags=re.MULTILINE)

    # Strip uuencoded attachments
    if len(body) > size_threshold:
        body = re.sub(r"begin [0-7]{3}.*?end", "", body, flags=re.DOTALL)

    # Strip notes/calendar data
    if len(body) > size_threshold:
        body = re.sub(r"<(OMNI|omni)([^>]*?)>.*?</\1\2>(\s)*",
                      "",
                      body,
                      flags=re.DOTALL)

    return body.strip()
Example #3
def decode(cell):
    # Progress Calculation
    global numSuccess
    global numError
    global totalRows
    global numEmpty
    sys.stdout.write(f'Successful: {numSuccess}; Errors: {numError}; Empty: {numEmpty}; Percent Done: { round((numSuccess + numError + numEmpty) / totalRows, 4) * 100 }% \r')
    sys.stdout.flush()

    # Return empty string for empty-ish values
    if not isinstance(cell, str) or cell == '0x00' or cell is None or pd.isnull(cell):
        numEmpty += 1
        return ''

    try:
        html_reg = re.compile('<.*?>') # Regex to match HTML tags
        cell = cell[2:] if cell[:2] == '0x' else cell #Remove 0x prefix
        cell = bytes.fromhex(cell).decode('latin1', errors='replace') #Decode from hex
        try:
            cell = rtf_to_text(cell)  # Strip RTF tags
        except TypeError as e: #Some 'NoneType' values still slipping through; counting as empty not errors
            errors.write(f'{numEmpty + numError + numSuccess},{str(e).replace(",", " ")}\n')
            numEmpty += 1
            return ''
        cell = re.sub(html_reg, '', cell) # Strip HTML tags
        cell = cell.strip() # Remove leading and trailing whitespace
        numSuccess += 1
        return cell
    except Exception as e:
        # Write errors to errors.csv
        numError += 1
        errors.write(f'{numEmpty + numError + numSuccess},{str(e).replace(",", " ")}\n')
        print(e)
        return ''
Example #4
def get_text(path, coding='utf-8'):
    # list of text to return
    text_list = list()

    with open(path, encoding=coding) as file:
        for strings in file:
            string = rtf_to_text(strings)
            try:
                lines = string.split('.')
                if lines[0][0] == lines[0][0].lower():  # append to the previous entry and add the new ones
                    text_list[-1] += lines[0]
                    for elem in lines[1:]:
                        if elem:
                            text_list.append(elem)
                elif lines[0][0] != lines[0][0].lower():  # new sentences: keep them all
                    for line in lines:
                        text_list.append(line)
                else:  # single letter: join it to the previous entry
                    text_list[-1] += lines[0]
            except IndexError:
                del lines

    return text_list
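Example #5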
def find_content(filepaths):
    print('Reading files within the Repository for content ...')
    documents = []
    for fp in filepaths:
        # Split the extension from the path and normalise it to lowercase.
        ext = os.path.splitext(fp)[-1].lower()
        # Now we can simply use == to check for equality, no need for wildcards.
        if ext == ".pdf":
            document = read_pdf_data(fp)
        elif ext == '.rtf':
            with open(fp, 'r') as file:
                text = file.read()
                document = rtf_to_text(text).replace('\n',
                                                     ' ').replace('\t', ' ')
        elif ext == '.docx':
            document = getText(fp)
        else:
            # metadata.csv sits in the same directory as the file
            meta_path = os.path.join(os.path.dirname(fp), 'metadata.csv')
            des = pd.read_csv(meta_path)
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
        documents.append(document)
    return documents
Example #6
    def extract_text(self) -> str:
        txt = self.data.decode(DEFAULT_TEXT_ENCODING)

        # Hack to handle Apple's extensions to the RTF format
        txt = txt.replace("\\\n\\\n", "\\\n\\par\n")

        return rtf_to_text(txt)
Example #7
def get_mark_down(filename=DEFAULT_DOC_TO_PROCESS):

    print(f"FILE_LOC***\n***\n{filename}\n***\n***\n")

    with open(filename) as f:
        content = f.read()

    # in case a string was passed in
    if Path(filename).suffix == '.rtf':
        content = rtf_to_text(content)
    else:
        print(
            f"> > > > - - - - - - - - - - < < < < {Path(filename)} {filename} {Path(filename).suffix} > > > >"
        )

    replacement = ''
    # remove anything inside 'comment' delimiters //* this is a comment *//
    content = re.sub(r'\/\/\*.*?\*\/\/',
                     replacement,
                     content,
                     flags=re.MULTILINE | re.DOTALL)

    # debug verify comment removal
    # for line in iter(content.splitlines()):
    #     print(line)
    #
    # print("\n\n\n**8**\n\n\n")

    return content
Example #8
    def test_table(self):
        simple_table_rtf = RTF_DIR / 'simple_table.rtf'
        simple_table_txt = TEXT_DIR / 'simple_table.txt'

        with simple_table_rtf.open() as source:
            result = rtf_to_text(source.read())
        with simple_table_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #9
    def test_extract_simple_table(self):
        simple_table_rtf = RTF_DIR / "line_break_textedit_mac.rtf"
        simple_table_txt = TXT_DIR / "line_break_textedit_mac.txt"

        with simple_table_rtf.open() as source:
            result = rtf_to_text(source.read())
        with simple_table_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #10
    def test_empty(self):
        example_rtf = RTF_DIR / "french.rtf"
        example_txt = TEXT_DIR / "french.txt"

        with example_rtf.open() as source:
            result = rtf_to_text(source.read())
        with example_txt.open() as destination:
            self.assertEqual(destination.read(), result)
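Example #11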
 def get_sentiment_no_tokenize(self):
     article_sentiments = {}
     for filename in self.filenames:
         with open('articles/' + self.city_name +'/' + filename, 'r') as file:
             rtf = file.read().replace('\n', '')  # read once; a second read() would return an empty string
             data = rtf_to_text(rtf)
             words = self.tokenizer.word_tokenize(data)
Example #12
    def test_full_table(self):
        example_rtf = RTF_DIR / "nested_table.rtf"
        example_txt = TEXT_DIR / "nested_table.txt"

        with example_rtf.open() as source:
            result = rtf_to_text(source.read())
        with example_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #13
def get_text_content_of_file(rtf_filepath):

    with open(rtf_filepath, 'r') as f:
        rtf = f.read()
    # print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - S')
    # print(rtf)
    # print('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - E')
    return rtf_to_text(rtf)  # convert to text and return
Example #14
 def parseStickNotes(self):
     output = self.volumeInfo
     result = []
     try:
         bias = datetime.timedelta(hours=-self.bias)
     except TypeError:
         pass
     if "FAT" or "NTFS" in output.split(" ")[0]:
         os.chdir("%s/%s/" % (self.mountDir, output.split(" ")[2]))
         logger.info(
             "Loading every user info!")  # TODO:It should be per user!
         try:
             os.chdir("Users/")
         except FileNotFoundError:
             logger.error("Couldn't find Users folder!")
             return None
         for userDir in os.listdir("."):
             if os.access(
                     "{0}/AppData/Roaming/Microsoft/Sticky Notes/StickyNotes.snt"
                     .format(userDir), os.F_OK | os.R_OK):
                 pass
             else:
                 logger.warning("Couldn't find StickNotes file on %s" %
                                userDir)
                 continue
             doc = compoundfiles.CompoundFileReader(
                 "{0}/AppData/Roaming/Microsoft/Sticky Notes/StickyNotes.snt"
                 .format(userDir))
             for item in doc:
                 if item.isdir:
                     logger.info("Directory name: {0}.".format(item.name))
                     logger.info(
                         "Directory last modified time: {0}.".format(
                             item.modified))
                     for sub_item in item:
                         content = doc.open(sub_item).read()
                         logger.info("Entry name: {0}.".format(
                             sub_item.name))
                         if "Rich Text" in magic.from_buffer(content):
                             logger.debug(
                                 "This is an RTF file.Stripping to normal text."
                             )
                             logger.info("Entry content: {0}.".format(
                                 rtf_to_text(content.decode())))
                         else:
                             logger.info("Entry type: {0}.".format(
                                 magic.from_buffer(
                                     doc.open(sub_item).read())))
                             logger.info(
                                 "Entry content: {0}.".format(content))
                 elif item.isfile:
                     logger.info("Entry name: {0}.".format(item.name))
                     logger.info("Entry content: {0}.".format(
                         doc.open(item).read()))
                     logger.info("Entry type: {0}.".format(
                         magic.from_buffer(doc.open(item).read())))
                 else:
                     continue
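Example #15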
def find_keywords(filepaths):
    print("Finding Keywords...")
    file_keywords = []
    files = []
    documents = []
    for fp in filepaths:
        ext = os.path.splitext(fp)[-1].lower()
        if ext == ".pdf":
            # keywords_set = clean_keywords(read_pdf(fp))
            text = ''
            pdfFileObj = open(fp, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            num = pdfReader.numPages
            for i in range(num):
                pageObj = pdfReader.getPage(i)
                text += pageObj.extractText()
            pdfFileObj.close()
            if text != '':
                document = summarize(text)
            else:
                meta_path = os.path.dirname(fp) + '\metadata.csv'
                des = pd.read_csv(meta_path, encoding='unicode_escape')
                try:
                    description = des['Description'][0]
                except KeyError:
                    description = des['Title'][0]
                document = description  # only fall back to the metadata description when the PDF has no text
            keywords_set = clean_keywords(keywords_from_summary(document))
            files.append(fp)
        elif ext == '.rtf':
            files.append(fp)
            with open(fp, 'r') as file:
                text = file.read()
                document_t = rtf_to_text(text).replace('\n',
                                                       ' ').replace('\t', ' ')
                keywords_set = clean_keywords(
                    keywords_from_summary(summarize(document_t)))
                document = document_t
        elif ext == '.docx':
            text = getText(fp)
            document = text
            keywords_set = clean_keywords(
                keywords_from_summary(summarize(text)))
            files.append(fp)
        else:
            files.append(fp)
            meta_path = os.path.dirname(fp) + '\metadata.csv'
            des = pd.read_csv(meta_path, encoding='unicode_escape')
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
            keywords_set = clean_keywords(keywords_from_summary(description))
        file_keywords.append(keywords_set)
        documents.append(document)
    # print(documents)
    return file_keywords, documents
Example #16
    def test_speiseplan(self):
        example_rtf = RTF_DIR / "Speiseplan_KW_32-33_Eybl.rtf"
        example_txt = TEXT_DIR / "Speiseplan_KW_32-33_Eybl.txt"

        with example_rtf.open() as source:
            result = rtf_to_text(source.read())
        with example_txt.open() as destination:
            self.maxDiff = None
            self.assertEqual(destination.read(), result)
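Example #17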
    def test_extract_simple_table(self):
        simple_table_rtf = RTF_DIR / "test_line_breaks_google_docs.rtf"
        simple_table_txt = TXT_DIR / "test_line_breaks_google_docs.txt"

        with simple_table_rtf.open() as source:
            result = rtf_to_text(source.read())
            with open("o.txt", "w") as f:
                f.write(result)
        with simple_table_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #18
def parser_rtf(link: str) -> str:
    '''Extract text from .rtf files.'''
    with open(link, encoding='utf-8') as f:
        text = f.read()
        if len(text) > 100_000:
            description = text[:100_000].rstrip()
        else:
            description = text.rstrip()
    text = rtf_to_text(description)
    return text
Example #19
    def test_full_table(self):
        example_rtf = RTF_DIR / "calcium_score.rtf"
        example_txt = TEXT_DIR / "calcium_score.txt"

        with example_rtf.open() as source:
            result = rtf_to_text(source.read())
            with (open("foo.text", "w")) as f:
                f.write(result)
        with example_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #20
    def test_full_table(self):
        example_rtf = RTF_DIR / 'calcium_score.rtf'
        example_txt = TEXT_DIR / 'calcium_score.txt'

        with example_rtf.open() as source:
            result = rtf_to_text(source.read())
            print(result)
            with open('foo.text', 'w') as f:
                f.write(result)
        with example_txt.open() as destination:
            self.assertEqual(destination.read(), result)
Example #21
def parse_file(input_dir: str, input_file_name: str, output_dir: str):
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    input_file_path = os.path.join(input_dir, input_file_name)
    output_file_path = os.path.join(output_dir,
                                    input_file_name.replace('.rtf', '.txt'))
    with open(input_file_path, 'r', encoding='ansi') as in_file:
        text = rtf_to_text(in_file.read())

        with open(output_file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(text)
Example #22
def iter_books(zip):
    '''Returns iterator of book files as strings.'''
    # remove foreword, appendixes, etc
    fps = [
        f.filename for f in zip.filelist
        if re.match(r'nwt_[\d]{2}_[\w]+_E\.rtf', f.filename)  # escape the dot so '.rtf' is matched literally
    ]
    for fp in sorted(fps):  # could be in any order
        book_rtf_bytes = zip.read(fp)
        book_rtf_str = book_rtf_bytes.decode('utf-8')
        yield rtf_to_text(book_rtf_str)
Example #23
def process_rtf_to_dataframe(fpath, docid_patt, date_patt, time_patt):
    """
    Returns the document ID, date, time, and article text in a pandas DataFrame
    :param fpath: string, path to .RTF file
    :param docid_patt: string, regex pattern for document ID
    :param date_patt: string, regex pattern for date
    :param time_patt: string, regex pattern for time
    :return: pandas DataFrame
    """
    # read the raw content of the .RTF file

    with open(fpath, encoding="utf8") as f:
        try:
            rtf = f.read()
        except ValueError as err:
            print("Error ({fpath}): {err}".format(fpath=fpath, err=err))
            raise
        except:
            print(
                "Unexpected error ({fpath}): {err}".format(
                    fpath=fpath, err=sys.exc_info()[0]
                )
            )
            raise

    # strip formatting to get plain text
    text = rtf_to_text(rtf).strip()
    # get document ID corresponding to each article
    doc_ids = re.findall(re.compile(docid_patt, re.M), text)
    # segmentation - split the text at each document ID, into individual articles
    articles = re.split("|".join(doc_ids), text)
    # the last item in the list after split operation should be blank (i.e. ''), so it can be dropped
    articles = articles[:-1]
    if len(articles) != len(doc_ids):
        print(
            'Text is not segmented appropriately, check regex "{docidpatt}": '
            "document ids {n_docid}, articles {n_articles}".format(
                n_docid=len(doc_ids), docidpatt=docid_patt, n_articles=len(articles)
            )
        )
        return
    # strip blank lines/spaces from the beginning/end of each article
    articles = [a.strip() for a in articles]
    # extract date from each article
    article_dates = list(map(lambda x: find_in_text(date_patt, x), articles))
    # extract time from each article
    article_times = list(map(lambda x: find_in_text(time_patt, x), articles))
    # assemble dataframe
    data = pd.DataFrame(
        zip(doc_ids, article_dates, article_times, articles),
        columns=["document_id", "date", "time", "text"],
    )
    return data
Example #24
def get_text(rs):
    bb = []
    ctr = 0
    for r in rs:
        if fname(r)[0] == '_':
            continue
        print(fnamene(r))
        a = file_to_text(r)
        b = rtf_to_text(a)
        bb.append(b)
    b = '\n\n'.join(bb)
    b = b.lower()
    return b
Example #25
def butterfly():
    "Count how many times the word 'butterfly' appears in document"
    document = request.data.decode()  # Make bytes into text
    if request.content_type == 'text/plain':
        text, decoder = document, None
    elif request.content_type == 'text/html':
        soup = BeautifulSoup(document, 'html.parser')  # name a parser explicitly to avoid the bs4 warning
        text, decoder = soup.text, "BeautifulSoup"
    elif request.content_type == 'text/rtf':
        text, decoder = rtf_to_text(document), "striprtf"
    else:
        abort(400)  # 400 Bad Request

    count = text.lower().count('butterfly')
    return jsonify({"count": count, "decoder": decoder})
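Example #26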
def process_answer_key(PATH_ANSKEY):

    # RTF support

    with open(PATH_ANSKEY, "r") as rtf:
        text = rtf_to_text(rtf.read())
    # print(text)

    # Separate each answer along with its marks from the answer key

    indexes = list(find_all(text, '(M:'))
    marks = get_marks_from_anskey(text, indexes)
    reference_answers = get_answers_from_anskey(text, indexes)

    return marks, reference_answers
Example #27
def downloadArticle(articleId):
    global args, exit_flag
    if exit_flag:
        return
    url = "https://www.gopress.be/Public/download-article.php?articleOriginalId={}&format=rtf".format(
        articleId)

    headers = {}
    headers[
        'Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    headers['Accept-Language'] = 'en-US,en;q=0.9,nl;q=0.8,fr;q=0.7'
    headers['Sec-Fetch-Dest'] = 'document'
    headers['Sec-Fetch-Mode'] = 'navigate'
    headers['Sec-Fetch-Site'] = 'none'
    headers[
        'User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4277.0 Safari/537.36 Edg/87.0.658.0'
    headers['Cookie'] = 'PHPSESSID={}'.format(php_sess_id)

    req = request.Request(url, {}, headers)
    with request.urlopen(req) as response:
        text = response.read()
        decoded = text.decode('utf-8')
        if decoded == 'Action not authorized when user not authenticated':
            print(
                'Not authenticated, please make sure you are logged in and have an arbitrary article open'
            )
            exit_flag = True
            return
        elif decoded.startswith(
                '<html><head><title>Download error</title></head>'):
            log('Following article couldn\'t be downloaded: {}'.format(
                articleId))
            return
        elif decoded == 'Article not found : Article not found':
            log('Following article couldn\'t be found: {}'.format(articleId))
            return
        filename = buildFilename(rtf_to_text(decoded).split('\n'))
        if filename == 'unknown':
            log('Following article couldn\'t be parsed: {}'.format(articleId))
            return
        path = os.path.join(args.directory, filename)
        if os.path.isfile(path):
            log('Following article was already downloaded: {}'.format(
                filename))
            return
        with open(path, 'wb') as f:
            f.write(text)
    del req
Example #28
def get_text(path, coding='utf-8'):
    # list of text to return
    text_list = list()

    transitional_list = list()
    styles_list = list()

    with open(path, encoding=coding) as file:
        for strings in file:
            string = rtf_to_text(strings)
            try:
                lines = string.split('.')
                if lines[0][0] == lines[0][0].lower():  # append to the previous entry and add the new ones
                    transitional_list[-1] += lines[0]
                    for elem in lines[1:]:
                        if elem:
                            transitional_list.append(elem)
                elif lines[0][0] != lines[0][0].lower():  # new sentences: keep them all
                    for line in lines:
                        transitional_list.append(line)
                else:  # single letter: join it to the previous entry
                    transitional_list[-1] += lines[0]
            except IndexError:
                del lines

        # separate the styles from the text
        all_text = ''

        for line in transitional_list:
            match = re.search(';', line)
            if match:
                styles_list.append(line)
                del line
            else:
                line += '.'
                all_text += line

        raw_text_list = all_text.split('. ')  # split the text into sentences
        for string in raw_text_list:
            lines = string.split(
                '\n.')  # split off the first and last sentences in each paragraph
            for line in lines:
                text_list.append(line)

    return text_list, styles_list
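Example #29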
 def get_word_frequency(self, word):
     city_freq_dict = {}
     word_count = 0
     trip = False
     for filename in self.filenames:
         if trip:
             return city_freq_dict
         with open(self.directory + '/' + self.city_name +'/' + filename, 'r') as file:
             rtf = file.read().replace('\n', '')
             data = rtf_to_text(rtf)
             words = self.clean_data(data)
             count = words.count(word)
             word_count += count
         #city_freq_dict[filename] = count
   
     #print(city_freq_dict)
     print(self.city_name + ": ", word_count/len(self.filenames))
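Example #30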
 def get_sentiment_tokenize(self):
     article_sentiments = {}
     trip = False
     for filename in self.filenames:
         if trip:
             return article_sentiments
         with open(self.directory + '/' + self.city_name +'/' + filename, 'r') as file:
             rtf = file.read().replace('\n', '')
             data = rtf_to_text(rtf)
             sentences = self.tokenizer.tokenize(data)
             num_sentences = len(sentences)
             compound_score = 0
             for sentence in sentences:
                 scores = self.sid.polarity_scores(sentence)
                 compound_score += scores['compound']
             article_sentiments[filename] = compound_score/num_sentences
         
     return article_sentiments