Ejemplo n.º 1
0
def deal_pdf(file_path):
    """
    Batch-process the PDF files located directly inside a directory.

    Every directory entry whose name ends with ``.pdf`` and that is not itself
    a directory is handed to ``PDFParser``. Parsing errors are logged and the
    file is still reported, with an empty text and no tables.

    :param file_path: directory to scan; a trailing path separator is optional.
    :return: a list with one entry per PDF, each entry being a list shaped as
        ``[title, text, table_1, ..., table_n]`` where ``title`` is the file
        name without its ``.pdf`` extension.
    """
    suffix = ".pdf"
    # One entry per PDF: title, joined text, then each table in order.
    all_pdf = []
    for file in os.listdir(file_path):
        # Resolve against the scanned directory: the bare name would otherwise
        # be interpreted relative to the current working directory.
        f_path = os.path.join(file_path, file)
        if os.path.isdir(f_path) or not file.endswith(suffix):
            continue

        # Strip only the trailing extension, keeping any ".pdf" inside the name.
        file_title = file[:-len(suffix)]
        pdf_parser = PDFParser(f_path)
        logging.info("开始处理文章: %s", file_title)
        print("开始处理文章: %s" % file_title)
        try:
            pdf_texts, pdf_tables = pdf_parser.parser()
        except Exception as e:
            # Best effort: keep going with an empty record for this file.
            logging.warning("处理:*%s*时出错:%s", file_title, e)
            pdf_texts = ''
            pdf_tables = None

        # Assemble (title, text, tables...) with the tables placed last.
        parsed_pdf = [file_title, "".join(pdf_texts)]
        if pdf_tables is not None:
            parsed_pdf.extend(pdf_tables)
        all_pdf.append(parsed_pdf)
    return all_pdf
Ejemplo n.º 2
0
def server():
    """Compile the assets, attach a ``PDFParser`` instance to ``app``, return ``app``."""
    print('Compiling assets...')
    compile_assets()

    # The parser instance is stored on the application object under the
    # attribute name ``PDFParser``.
    parser = PDFParser()
    app.PDFParser = parser
    return app
Ejemplo n.º 3
0
    def _load_report(self, new_report, old_report, **kwargs):
        """Parse both reports and store their analysis results in ``self.report``.

        ``self.report`` is reset to an empty list, then receives the result for
        ``new_report`` followed by the result for ``old_report``.

        :param new_report: passed to ``PDFParser.parse`` as the first report.
        :param old_report: passed to ``PDFParser.parse`` as the second report.
        :param kwargs: optional ``password_a`` / ``password_b`` values forwarded
            to ``parse`` for the new / old report; each defaults to ``""``.
        """

        def build_index_tree(source, password_key):
            # A fresh parser is used for each report.
            parser = PDFParser()
            parsed = parser.parse(source, kwargs.get(password_key, ""))
            return parser.analyze("Monetary Report", parsed)

        self.report = []
        self.report.append(build_index_tree(new_report, "password_a"))
        self.report.append(build_index_tree(old_report, "password_b"))