def test_main():
    """Tokenize 'test.test', dump the tokens, and print the feature dict for
    every purely numeric token (candidate reference numerals).

    Side effects: writes the token list to 'words.test' and prints features
    to stdout. Returns None.
    """
    # only detailed description
    raw_text = file_reader.get_string_from_txt('test.test')
    words = nltk.word_tokenize(raw_text)
    file_writer.print_string_to_txt("|\n".join(words), 'words.test')
    regex_all_num = re.compile(r'\d+', re.M)
    ref_numerals = []
    for i, w in enumerate(words):
        # Call fullmatch on the compiled pattern directly — idiomatic, and
        # avoids re-dispatching through the module-level re.fullmatch helper.
        if regex_all_num.fullmatch(w):
            # token is entirely digits; keep it with its position
            ref_numerals.append((w, i))
            print(ref_numeral_features(w))
def create_labeled_set():
    """Read the line-delimited training file and wrap its tokenized
    sentences in a ClassifiedSentenceData instance.

    Returns:
        ClassifiedSentenceData built from one token list per input line.
    """
    raw_text = file_reader.get_string_from_txt('training/train_1.txt')
    # One sentence per line: tokenize each non-stripped line individually.
    sentences = [nltk.word_tokenize(line)
                 for line in raw_text.strip().split("\n")]
    return ClassifiedSentenceData(sentences)
def get_test_string():
    """Return the raw text of the office action currently under test.

    Swap the filename below to point the test at a different document.
    """
    #'2018-09-21 15694060 nonfinal rejection.txt')
    #'2018-09-18 15599191 nonfinal rejection.txt')
    test_file = '2018-09-26 15332415 final rejection.txt'
    return file_reader.get_string_from_txt(test_file)
def get_trained_classifier():
    """Shuffle the feature sets and train a classifier on them.

    Returns:
        The classifier produced by pk_nltk.train_classifier.
    """
    featuresets = get_featuresets()
    # randomize data so the train/test split is not order-dependent
    np.random.shuffle(featuresets)
    classifier = pk_nltk.train_classifier(featuresets)
    return classifier


classifier = get_trained_classifier()
# read app in .txt to string
raw_text = file_reader.get_string_from_txt('1225.txt')
# tokenize raw text
words = nltk.word_tokenize(raw_text)
test_set = set()
new_dataset = []
for w in words:
    # Classify each token exactly once; the original recomputed
    # classifier.classify(ref_numeral_features(w)) up to three times per word.
    label = classifier.classify(ref_numeral_features(w))
    new_dataset.append((w, label))
    if label:
        # print(w)
        pass
def main():
    # Pipeline driver: convert patent drawings to images, then (when the
    # later stages are re-enabled) OCR them and cross-check reference
    # numerals against the application text.
    images = None
    output_files = True
    generate_pickle = True
    """ doesn't work
    #root = tk.Tk()
    global root
    root.title("Convert drawings?")
    frame = tk.Frame(root)
    frame.pack()
    button = tk.Button(frame, text="YES", command=convert_drawings_to_images(output_files, generate_pickle))
    button.pack(side=tk.LEFT)
    slogan = tk.Button(frame, text="NO", command=quit)
    slogan.pack(side=tk.LEFT)
    root.mainloop()
    print("root destroyed")
    """
    #sys.exit()
    images = convert_drawings_to_images(output_files, generate_pickle)
    sys.exit()
    # NOTE(review): everything below is unreachable because of the
    # sys.exit() above — presumably left in place while debugging the
    # drawing-conversion stage; confirm before deleting.
    read_pickle = True
    #ocr_images(images, read_pickle)
    #print(pytesseract.image_to_string("image_ref_numeral_only.jpg"))
    """
    ocr_dict = pytess.convert_images_to_string(images)
    filehandler = open('ocr_dict.p','wb')
    pickle.dump(ocr_dict, filehandler)
    print(ocr_dict)
    """
    #output_path = fr.convert_pdf_to_txt(filepath)
    #print(output_path)
    print("open application txt file")
    #get application txt file
    filepath = fr.get_filepath()
    #get raw text of application
    raw_text = fr.get_string_from_txt(filepath)
    print("open drawings csv")
    filepath = fr.get_filepath()
    csv_dict, ref_numerals_dict = get_drawings_data(filepath)
    print(ref_numerals_dict)
    #sys.exit()
    analyze_drawings_against_application(csv_dict, raw_text)
    find_ref_numerals_in_application(ref_numerals_dict, raw_text)
    # NOTE(review): the triple-quote below opens a string that is NOT closed
    # within this chunk — it appears to comment out code past this point;
    # confirm the matching closing quotes exist downstream.
    """
# NOTE(review): fragment — `range_hyphen_split` and `input_paragraphs` are
# defined upstream, outside this chunk; presumably this expands paragraph
# ranges like "103-109" into individual paragraph numbers. Confirm.
print(range_hyphen_split)
start_para = range_hyphen_split[0].strip()
end_para = range_hyphen_split[1].strip()
# Expand the inclusive range, skipping paragraphs already collected.
for j in range(int(start_para), int(end_para)+1):
    if j not in input_paragraphs:
        input_paragraphs.append(j)
print(input_paragraphs)
#test
#input_paragraphs = [103,104,105,106,107,108,109,29]
input_path = "input/test_reference.txt"
input_paragraphs.sort()
reference_string = file_reader.get_string_from_txt(input_path)
#print(reference_string)
reference_split = reference_string.split("\n")
#print(reference_split)
output_string = []
# Pull the full text of each requested paragraph out of the reference file.
for para in input_paragraphs:
    print(para)
    # NOTE(review): `\d+` requires at least one digit BEFORE the paragraph
    # number, so this matches zero-padded tags like [0103] but not a bare
    # [103] — verify that is the intended behavior.
    regex = r'\[\d+' + str(para) + r'\].+'
    match_obj = re.search(regex, reference_string)
    if (match_obj):
        print(match_obj[0])
        output_string.append(match_obj[0])
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 26 11:13:36 2018

@author: alanyliu
"""
import file_reader
import file_writer
import re

# Preprocess a rejection .txt into one-sentence-per-line training data.
input_path = "training/2018-09-21 15694060 nonfinal rejection.txt"
text = file_reader.get_string_from_txt(input_path)
print(text)
# Replacement templates are raw strings: '\g<1>' inside a plain string is an
# invalid escape sequence (SyntaxWarning in Python 3.12+, error in future
# versions). re.sub's template parser still expands \n to a newline.
# Break after citation-style "]." so each citation ends its line.
text = re.sub(r'(]\.)', r'\g<1>\n', text)
# Collapse whitespace-only lines, then runs of blank lines, to single breaks.
text = re.sub(r'\n\s+\n', '\n', text)
text = re.sub(r'\n{2,}', '\n', text)
#print(re.findall(r'\w\n\w',text))
#print(re.findall(r'(\w)(\n)(\w)',text))
# Re-join lines that were broken mid-sentence (word or comma, newline, word).
text = re.sub(r'(\w|,)(\n)(\w)', r'\g<1> \g<3>', text)
# Finally, split on sentence-ending periods followed by whitespace.
text = re.sub(r'(\.)\s', r'\g<1>\n', text)
file_writer.print_string_to_txt(text, "training/train.txt")

#punct-features ideas
#next word is punctuation
#last letter of previous word is capitalized
#probably doesn't end with acronym
#false positives, false negatives