def convert_images_to_string(images, output_file=False):
    """Run OCR over each image and return the text keyed by page number.

    Args:
        images: Iterable of images; each one is passed to
            ``pytesseract.image_to_string``.
        output_file: When truthy, the text of every page is additionally
            written to ``output.txt`` via ``fw.print_string_to_txt``.

    Returns:
        A dict mapping the 1-based position of each image in ``images`` to
        the string returned by ``pytesseract.image_to_string`` for it.
    """
    ocr_dict = {}

    print('converting images to string')
    # enumerate(start=1) replaces the hand-maintained page counter.
    for page_no, img in enumerate(images, start=1):
        ocr_dict[page_no] = pytesseract.image_to_string(img)
        print("page {} converted".format(page_no))
    print('done converting to string')

    if output_file:
        # Assemble the dump with a single join rather than repeated `+=`,
        # which may copy the growing string on every page.
        output_string = "".join(
            str(page_no) + "\n" + text + "\n\n"
            for page_no, text in ocr_dict.items()
        )
        fw.print_string_to_txt(output_string, "output.txt")

    return ocr_dict
Exemple #2
0
def convert_pdf_to_txt(pdf_file_path,use_pickle=True):
    """Convert a PDF to a text file and return the text file's path.

    Args:
        pdf_file_path: Path of the PDF; passed to ``rename_pdf_to_txt`` to
            derive the output path and, when ``use_pickle`` is false, to
            ``convert_pdf_to_images``.
        use_pickle: When true, the page images are loaded from the pickle
            file ``images.p`` instead of being rendered from the PDF.

    Returns:
        The output path produced by ``rename_pdf_to_txt``.
    """
    output_path = rename_pdf_to_txt(pdf_file_path)

    if use_pickle:
        # Pickle data is binary, so the file must be opened with 'rb';
        # the context manager guarantees the handle is closed.
        # NOTE(review): pickle.load can execute arbitrary code - only use
        # this path with an 'images.p' that this project produced itself.
        with open('images.p', 'rb') as filehandler:
            images = pickle.load(filehandler)
    else:
        images = convert_pdf_to_images(pdf_file_path)

    raw_string = pytess.convertImagesToString(images)

    file_writer.print_string_to_txt(raw_string, output_path)

    return output_path
def test_main():
    """Tokenize ``test.test`` and print features of every all-digit token.

    The text is read with ``file_reader.get_string_from_txt``, split with
    ``nltk.word_tokenize``, and the tokens are dumped to ``words.test``
    joined by ``"|\\n"``. Each token consisting solely of digits is recorded
    together with its index and the result of ``ref_numeral_features`` for
    it is printed.
    """
    # only detailed description
    source_text = file_reader.get_string_from_txt('test.test')
    tokens = nltk.word_tokenize(source_text)

    file_writer.print_string_to_txt("|\n".join(tokens), 'words.test')

    digits_only = re.compile(r'\d+', re.M)

    # (token, index) pairs of the numeric tokens found so far.
    ref_numerals = []

    for position, token in enumerate(tokens):
        # Skip anything that is not made up entirely of digits.
        if digits_only.fullmatch(token) is None:
            continue
        ref_numerals.append((token, position))
        print(ref_numeral_features(token))
Exemple #4
0
        for j in range(int(start_para), int(end_para)+1):
            if j not in input_paragraphs:
                input_paragraphs.append(j)

print(input_paragraphs)

#test
#input_paragraphs = [103,104,105,106,107,108,109,29]
input_path = "input/test_reference.txt"

# Look the paragraphs up in ascending numeric order.
input_paragraphs.sort()

reference_string = file_reader.get_string_from_txt(input_path)

reference_split = reference_string.split("\n")

# Matched paragraph lines, in the order of input_paragraphs.
output_string = []

for para in input_paragraphs:
    print(para)
    # Match the bracketed marker for exactly this paragraph number, allowing
    # only leading zeros in front of it. The previous `\d+` prefix required
    # an extra digit and accepted any digit, so paragraph 29 could match
    # "[0129]" and an unpadded "[29]" could never match.
    # NOTE(review): assumes markers are zero-padded like "[0029]" - confirm
    # against the reference files.
    regex = r'\[0*' + str(para) + r'\].+'
    match_obj = re.search(regex, reference_string)
    if match_obj:
        print(match_obj[0])
        output_string.append(match_obj[0])

file_writer.print_string_to_txt("\n\n".join(output_string),"output/analyzed_reference.txt")
    
    
Exemple #5
0
@author: alanyliu
"""

import file_reader
import file_writer
import re

# Normalise the line breaks of a raw text file and write the result out as
# training data.
input_path = "training/2018-09-21 15694060 nonfinal rejection.txt"
text = file_reader.get_string_from_txt(input_path)

print(text)

# Replacement templates are raw strings: "\g" is not a valid escape in a
# normal Python literal. re.sub itself expands "\n" in the template to a
# newline, so the output is the same as before.

# Break the line after every "]." sequence.
text = re.sub(r'(]\.)', r'\g<1>\n', text)
# Collapse whitespace-only gaps between newlines, then runs of newlines.
text = re.sub(r'\n\s+\n', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
# Re-join a line break that falls between a word character or comma and the
# next word character, replacing it with a single space.
text = re.sub(r'(\w|,)(\n)(\w)', r'\g<1> \g<3>', text)
# Replace the whitespace character after every period with a newline.
text = re.sub(r'(\.)\s', r'\g<1>\n', text)

file_writer.print_string_to_txt(text, "training/train.txt")

#punct-features ideas
#next word is punctuation
#last letter of previous word is capitalized
#probably doesn't end with acronym
#false positives, false negatives
#single double quotes

#claim interpretation