def convert_images_to_string(images, output_file=False):
    """Run OCR over each image and return the recognised text per page.

    Args:
        images: Iterable of images; each one is passed unchanged to
            ``pytesseract.image_to_string``.
        output_file: When true, additionally write all pages to
            ``output.txt`` through ``fw.print_string_to_txt``. Each page is
            emitted as its number, a newline, its text and a blank line.

    Returns:
        dict mapping the 1-based position of each image in ``images`` to the
        text returned by pytesseract for it. Empty when ``images`` is empty.
    """
    ocr_dict = {}
    print('converting images to string')
    # enumerate(start=1) keeps page numbers 1-based without a manual counter.
    for page_no, img in enumerate(images, start=1):
        ocr_dict[page_no] = pytesseract.image_to_string(img)
        print("page {} converted".format(page_no))
    print('done converting to string')

    if output_file:
        # Build the dump in one join instead of repeated string concatenation.
        output_string = "".join(
            str(page_no) + "\n" + text + "\n\n"
            for page_no, text in ocr_dict.items()
        )
        fw.print_string_to_txt(output_string, "output.txt")
    return ocr_dict
def convert_pdf_to_txt(pdf_file_path, use_pickle=True):
    """Convert a PDF to a text file and return the text file's path.

    Args:
        pdf_file_path: Path of the PDF; passed to ``rename_pdf_to_txt`` to
            obtain the output path and, when the cache is not used, to
            ``convert_pdf_to_images``.
        use_pickle: When true, the page images are loaded from the pickle
            file ``images.p`` in the current working directory instead of
            being rendered from the PDF.

    Returns:
        The output path produced by ``rename_pdf_to_txt``.

    Raises:
        OSError: If ``use_pickle`` is true and ``images.p`` cannot be opened.
    """
    output_path = rename_pdf_to_txt(pdf_file_path)

    if use_pickle:
        # Pickle data is binary, so the file must be opened in 'rb' mode;
        # the context manager also guarantees the handle is closed.
        # NOTE(security): pickle.load can execute arbitrary code. Only use
        # an images.p that this project produced itself.
        with open('images.p', 'rb') as cache_file:
            images = pickle.load(cache_file)
    else:
        images = convert_pdf_to_images(pdf_file_path)

    # NOTE(review): presumably returns the OCR result as a single string,
    # given that it is written straight to a text file - confirm in pytess.
    raw_string = pytess.convertImagesToString(images)
    file_writer.print_string_to_txt(raw_string, output_path)
    return output_path
def test_main():
    """Tokenise ``test.test`` and print features for every all-digit token.

    The text is read with ``file_reader.get_string_from_txt``, split with
    ``nltk.word_tokenize`` and the tokens are dumped to ``words.test``
    joined by ``"|\\n"``. Each token made up solely of digits is recorded
    together with its token index and ``ref_numeral_features`` of it is
    printed. Nothing is returned; the collected pairs stay local.
    """
    # only detailed description
    source_text = file_reader.get_string_from_txt('test.test')
    tokens = nltk.word_tokenize(source_text)
    file_writer.print_string_to_txt("|\n".join(tokens), 'words.test')

    digits_only = re.compile(r'\d+', re.M)
    ref_numerals = []
    for position, token in enumerate(tokens):
        # Skip anything that is not digits from the first to the last character.
        if not digits_only.fullmatch(token):
            continue
        ref_numerals.append((token, position))
        # NOTE(review): features are printed for matching tokens only - confirm.
        print(ref_numeral_features(token))
# Expand the inclusive range start_para..end_para into input_paragraphs,
# skipping numbers that are already in the list.
# NOTE(review): start_para, end_para and input_paragraphs are expected to be
# defined earlier in the script - confirm.
for j in range(int(start_para), int(end_para)+1):
    if j not in input_paragraphs:
        input_paragraphs.append(j)
print(input_paragraphs)
#test
#input_paragraphs = [103,104,105,106,107,108,109,29]
input_path = "input/test_reference.txt"
# Process the requested paragraphs in ascending numeric order.
input_paragraphs.sort()
reference_string = file_reader.get_string_from_txt(input_path)
#print(reference_string)
# NOTE(review): reference_split is not used by the statements below.
reference_split = reference_string.split("\n")
#print(reference_split)
# Despite its name, output_string is a list of matched lines; it is joined
# with blank lines when written out.
output_string = []
for para in input_paragraphs:
    print(para)
    # Matches "[", one or more digits, the paragraph number, "]" and then the
    # rest of that line ("." does not cross a newline).
    # NOTE(review): the leading \d+ needs at least one digit before the number,
    # so "[29]" itself never matches and "[129]" can match for para 29.
    # Presumably the reference labels are zero-padded like "[0029]" - confirm.
    regex = r'\[\d+' + str(para) + r'\].+'
    # Only the first occurrence in the whole reference text is used.
    match_obj = re.search(regex, reference_string)
    if (match_obj):
        print(match_obj[0])
        output_string.append(match_obj[0])
# Paragraphs without a match are silently left out of the output file.
file_writer.print_string_to_txt("\n\n".join(output_string),"output/analyzed_reference.txt")
@author: alanyliu """ import file_reader import file_writer import re input_path = "training/2018-09-21 15694060 nonfinal rejection.txt" text = file_reader.get_string_from_txt(input_path) print(text) text = re.sub(r'(]\.)', '\g<1>\n', text) text = re.sub(r'\n\s+\n', '\n', text) text = re.sub(r'\n{2,}', '\n', text) #print(re.findall(r'\w\n\w',text)) #print(re.findall(r'(\w)(\n)(\w)',text)) text = re.sub(r'(\w|,)(\n)(\w)', '\g<1> \g<3>', text) text = re.sub(r'(\.)\s', '\g<1>\n', text) file_writer.print_string_to_txt(text, "training/train.txt") #punct-features ideas #next word is punctuation #last letter of previous word is capitalized #probably doesn't end with acronym #false positives, false negatives #single double quotes #claim interpretation