import os

import tensorflow as tf  # written against the TF 1.x API (tf.Session, tf.train.Saver)

# `hp` (hyperparameters), `Graph` (the model), `load_evaluate_data`,
# `load_vocab`, and `hangul` (jamo composition helpers) are assumed to come
# from elsewhere in the project.


def eval():
    if not os.path.exists('./results'):
        os.makedirs('./results')

    # Load graph
    print("Graph loaded")
    print("Model name: {}".format(hp.modelname))

    # Load data
    print("Testing Data...")
    txt_src_names, idx_src_names, txt_tgt_names, _ = load_evaluate_data(
        eval_mode="test")
    x_w2i, x_i2w, y_w2i, y_i2w = load_vocab()

    g = Graph(is_training=False)
    with g.graph.as_default(), tf.Session() as sess:
        sv = tf.train.Saver()

        # Restore parameters
        print("Parameter Restoring...")
        sv.restore(sess,
                   tf.train.latest_checkpoint(hp.logdir + '/' + hp.modelname))

        # Inference
        count = 0
        with open('./results/' + hp.modelname + '_result.txt', "w",
                  encoding="utf-8") as fout:
            for i in range(0, len(txt_src_names), hp.batch_size):
                batch_txt_src_names = txt_src_names[i:i + hp.batch_size]
                batch_idx_src_names = idx_src_names[i:i + hp.batch_size]
                batch_txt_tgt_names = txt_tgt_names[i:i + hp.batch_size]

                # predicted_ids has shape [batch, time, beam_width].
                batch_predicted_ids = sess.run(g.pred_outputs, {
                    g.x: batch_idx_src_names
                }).predicted_ids[:, :, :]

                for source, target, predicted_ids in zip(
                        batch_txt_src_names, batch_txt_tgt_names,
                        batch_predicted_ids):
                    print(str(count) + '\t' + source + '\t' +
                          hangul.join_jamos(target))
                    count += 1

                    # Decode each beam: map indices back to jamo, cut at the
                    # "E" end token, then compose jamo into hangul syllables.
                    candidates = []
                    predicted_ids = predicted_ids.transpose(1, 0)
                    for pred in predicted_ids:
                        candidate = "".join(y_i2w[idx]
                                            for idx in pred).split("E")[0]
                        candidate = hangul.join_jamos(candidate)
                        candidates.append(candidate)

                    fout.write(source + '\t')
                    fout.write(hangul.join_jamos(target))
                    for candidate in candidates:
                        fout.write('\t')
                        # The file is opened in text mode, so write the string
                        # directly rather than UTF-8-encoded bytes.
                        fout.write(candidate)
                    fout.write('\n')
                    fout.flush()
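# A minimal standalone sketch (not project code) of the beam-decoding loop
# above: per sample, `predicted_ids` has shape [time, beam_width], so the
# transpose yields one row per beam; each row is mapped through the
# index-to-jamo vocab and truncated at the "E" end token. The toy vocab and
# ids below are hypothetical.
import numpy as np

toy_i2w = {0: 'E', 1: 'ㄱ', 2: 'ㅏ', 3: 'ㅁ'}             # hypothetical vocab
sample_ids = np.array([[1, 1], [2, 2], [3, 0], [0, 0]])  # [time=4, beam=2]
for pred in sample_ids.transpose(1, 0):
    print("".join(toy_i2w[idx] for idx in pred).split("E")[0])
# -> ㄱㅏㅁ, then ㄱㅏ; the real code then composes the jamo into syllables
# with hangul.join_jamos before writing them out.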
def page_text_finder(self, report_text):
    """Scan the converted report text page by page and return the page
    that mentions both the company name and the company number."""
    page_text = ''
    text = ''
    found = False
    # The underscore-delimited file name carries the company name in
    # field 3 and the company number in field 4 (minus its first character).
    company_name = self.file_nm.split('_')[3]
    company_num = self.file_nm.split('_')[4][1:]
    company_dict = {'LG상사': 'LG 상사'}

    # To resolve hangul encoding issue: recompose decomposed jamo so the
    # name matches the text extracted from the PDF.
    company_name = hangul.join_jamos(j2hcj(h2j(company_name)))
    if company_name in company_dict:
        company_name = company_dict[company_name]

    for line in report_text.split('\n'):
        if ("page_id" in line and '||Title|| ' + company_name in text
                and company_num in text):
            page_text = text
            found = True
            break
        elif "page_id" in line:
            # A new page starts: discard the buffered text.
            text = ''
        else:
            text += line + '\n'

    return page_text, found, company_name, company_num
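# A hedged illustration (hypothetical file name) of the naming convention
# `page_text_finder` relies on: underscore-delimited fields, with the company
# name in field 3 and the company number, minus its leading character, in
# field 4.
file_nm = 'A_B_C_LG상사_X123456_report'   # hypothetical example
print(file_nm.split('_')[3])              # -> LG상사
print(file_nm.split('_')[4][1:])          # -> 123456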
def save_to_txt(file_nm, file_text):
    root_dir = '/Users/daniel/Desktop/test_2/after_inspec_txt/'
    path = root_dir + file_nm
    path = hangul.join_jamos(j2hcj(h2j(path)))
    print(file_nm)
    with open(path, 'w') as out_file:
        out_file.write(file_text)
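# The h2j -> j2hcj -> join_jamos round-trip used throughout recomposes
# decomposed hangul jamo (as macOS file systems tend to return them) into
# precomposed syllables. A sketch of the same normalization using only the
# standard library, assuming the input is NFD-decomposed:
import unicodedata

decomposed = unicodedata.normalize('NFD', '한글.txt')  # what the OS may hand back
composed = unicodedata.normalize('NFC', decomposed)
assert composed == '한글.txt'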
def convert_pdf_to_txt(self, pdf_file):
    """Convert a PDF file to text.

    Args:
        pdf_file (str): name of the PDF file to convert.

    Returns:
        tuple: (report_text, company_nm, company_num), where report_text
        is None if no matching page was found.
    """
    # `LAParams`, `PDFResourceManager`, `PDFPageInterpreter`, and `PDFPage`
    # come from pdfminer.six; `FinanceConverter` is the project's custom
    # converter; `StringIO` is from the `io` module.
    output_string = StringIO()
    self.file_nm = pdf_file.split(".")[0]
    file_ex = pdf_file.split(".")[1]
    self.pdf_path = self.report_pdf_dir + pdf_file
    self.pdf_path = hangul.join_jamos(j2hcj(h2j(self.pdf_path)))

    laparams = LAParams(line_overlap=.5,
                        char_margin=1.35,
                        line_margin=1.0,
                        word_margin=0.01,
                        boxes_flow=.5,
                        detect_vertical=False,
                        all_texts=False)
    rsrcmgr = PDFResourceManager()
    device = FinanceConverter(rsrcmgr, output_string, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text page by page until the page identifying the company is
    # found. Initialize the results so an empty PDF cannot raise
    # UnboundLocalError on return.
    found = False
    report_text, company_nm, company_num = None, None, None
    with open(self.pdf_path, 'rb') as in_file:
        for page_num, page in enumerate(
                PDFPage.get_pages(in_file, check_extractable=True)):
            interpreter.process_page(page)
            page_text = output_string.getvalue()
            report_text, found, company_nm, company_num = \
                self.page_text_finder(page_text)
            if found:
                break
    if not found:
        report_text = None
    return report_text, company_nm, company_num
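# A minimal sketch of the same pdfminer.six pipeline without the project's
# custom FinanceConverter, handy for checking the LAParams above against a
# file ('sample.pdf' is hypothetical). Assumes pdfminer.six is installed.
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

laparams = LAParams(line_overlap=.5, char_margin=1.35, line_margin=1.0,
                    word_margin=0.01, boxes_flow=.5,
                    detect_vertical=False, all_texts=False)
print(extract_text('sample.pdf', laparams=laparams)[:500])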
def save_to_txt(self, txt):
    output_path = self.output_txt_dir + self.file_nm + '.txt'
    output_path = hangul.join_jamos(j2hcj(h2j(output_path)))
    with open(output_path, 'w') as out_file:
        out_file.write(txt)