def deal_pdf(file_path):
    """Parse every PDF file located directly inside a directory.

    :param file_path: directory to scan for files ending in ``.pdf``.
    :return: a list with one entry per PDF file. Each entry is itself a list
        of the form ``[title, text, table_1, ..., table_n]`` where ``title``
        is the file name without its ``.pdf`` extension, ``text`` is the
        parsed text joined into one string, and the tables (if any) follow.
        A file whose parsing fails still yields ``[title, '']``.
    """
    all_pdf = []
    for file in os.listdir(file_path):
        # Build the full path first: both the directory check and the parser
        # must look at the entry inside ``file_path``, not inside the cwd.
        f_path = os.path.join(file_path, file)
        if os.path.isdir(f_path) or not file.endswith(".pdf"):
            continue

        # Strip only the trailing extension; a ".pdf" elsewhere in the name
        # is part of the title.
        file_title = file[:-len(".pdf")]
        logging.info("开始处理文章: %s", file_title)
        print("开始处理文章: %s" % file_title)

        try:
            # Constructing the parser is inside the try so that one bad file
            # cannot abort the whole batch.
            pdf_texts, pdf_tables = PDFParser(f_path).parser()
        except Exception as e:
            # Best-effort batch: log the failure and keep an empty record.
            logging.warning("处理:*%s*时出错:%s", file_title, e)
            pdf_texts = ''
            pdf_tables = None

        # Assemble as (title, text, tables...), tables placed last.
        parsed_pdf = [file_title, "".join(pdf_texts)]
        if pdf_tables is not None:
            parsed_pdf.extend(pdf_tables)
        all_pdf.append(parsed_pdf)
    return all_pdf
def server():
    """Compile the assets, attach a fresh ``PDFParser`` to ``app`` and return ``app``."""
    print('Compiling assets...')
    compile_assets()
    # The parser is created only after asset compilation has finished.
    parser_instance = PDFParser()
    app.PDFParser = parser_instance
    return app
def _load_report(self, new_report, old_report, **kwargs):
    """Parse the new and the old report and store their index trees.

    ``self.report`` is reset to an empty list and then receives the analysis
    result of ``new_report`` followed by that of ``old_report``.

    :param new_report: the new report, passed to ``PDFParser.parse``.
    :param old_report: the old report, passed to ``PDFParser.parse``.
    :param kwargs: optional ``password_a`` (for the new report) and
        ``password_b`` (for the old report); each defaults to ``""``.
    """
    self.report = []
    # Order matters: the new report is handled first, then the old one.
    sources = (
        (new_report, "password_a"),
        (old_report, "password_b"),
    )
    for document, password_key in sources:
        # A separate parser instance is used for each report.
        parser = PDFParser()
        parsed = parser.parse(document, kwargs.get(password_key, ""))
        index_tree = parser.analyze("Monetary Report", parsed)
        self.report.append(index_tree)