def write_plan_info_tsinghua_2013(store_path, info_path):
    """Parse Tsinghua's 2013 enrollment-plan PDF and write one table per province.

    The PDF lays out a science (理科) table spanning pages 1-2 and a
    liberal-arts (文科) table on page 2; the two are split at the row whose
    first cell is the header "专业名称".

    :param store_path: directory the per-province tables are written into
    :param info_path: path of the source PDF file
    """
    year = "2013"
    pages = read_pdf_to_tables(info_path)
    li_table = pages[0][0]
    # Locate where the liberal-arts table starts on page 2.
    index = 0
    for i_line in range(len(pages[1][0])):
        if pages[1][0][i_line][0] == "专业名称":
            index = i_line
            break
    li_table += pages[1][0][:index]
    wen_table = pages[1][0][index:]
    # Trim the title row and the trailing note row of each table.
    li_table = li_table[1:-1]
    wen_table = wen_table[:-1]
    # Clean the header row: strip embedded newlines and repair province
    # names truncated by the PDF column width.
    table_head = li_table[0]
    for i in range(len(table_head)):
        table_head[i] = table_head[i].replace("\n", "")
        if table_head[i] == "黑龙":
            table_head[i] = "黑龙江"
        if table_head[i] == "内蒙":
            table_head[i] = "内蒙古"
    # Keep the per-province totals rows before filtering them out below.
    pro_line_li = li_table[-1]
    pro_line_wen = wen_table[-1]
    # BUGFIX: the original called list.remove() while iterating the same
    # list, which skips the element following each removal.  Rebuild the
    # lists instead so every header/subtotal row is dropped.
    li_table = [item for item in li_table
                if item[0] != "专业名称" and item[0] != "理科合计"]
    wen_table = [item for item in wen_table
                 if item[0] != "专业名称" and item[0] != "文科合计"]
    # Write one file per province column (columns 0-1 are not provinces).
    for i_pro in range(2, len(table_head)):
        sub_plan_table_name = year + "-" + table_head[i_pro]
        sub_plan_table_head = ["专业", "类别", "人数"]
        sub_plan_table_content = []
        # Science rows.
        for item in li_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "理工", item[i_pro]])
        # Science headcount total for this province.
        sub_plan_table_content.append(["理工", "统计", pro_line_li[i_pro]])
        # Liberal-arts rows.
        for item in wen_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "文史", item[i_pro]])
        # Liberal-arts headcount total for this province.
        sub_plan_table_content.append(["文史", "统计", pro_line_wen[i_pro]])
        write_table(store_path, sub_plan_table_name, sub_plan_table_head,
                    sub_plan_table_content)
def write_plan_info_tsinghua_2014(store_path, info_path):
    """Parse Tsinghua's 2014 enrollment-plan PDF and write one table per province.

    Unlike 2013, the science (理科) table spans pages 1-2 as whole tables and
    the liberal-arts (文科) table is the second table on page 2.

    :param store_path: directory the per-province tables are written into
    :param info_path: path of the source PDF file
    """
    year = "2014"
    pages = read_pdf_to_tables(info_path)
    li_table = pages[0][0] + pages[1][0]
    wen_table = pages[1][1]
    # Clean the science header row of embedded newlines.
    table_head = li_table[0]
    for i in range(len(table_head)):
        table_head[i] = table_head[i].replace("\n", "")
    # Keep the per-province totals rows before filtering them out below.
    pro_line_li = li_table[-1]
    pro_line_wen = wen_table[-1]
    # BUGFIX: the original called list.remove() while iterating the same
    # list, which skips the element following each removal.  Rebuild the
    # lists instead so every header/subtotal row is dropped.
    li_table = [item for item in li_table
                if item[0] != "专业名称" and item[0] != "理科合计"]
    wen_table = [item for item in wen_table
                 if item[0] != "专业名称" and item[0] != "文科合计"]
    # Write one file per province column (column 0 is the major name).
    for i_pro in range(1, len(table_head)):
        sub_plan_table_name = year + "-" + table_head[i_pro]
        sub_plan_table_head = ["专业", "类别", "人数"]
        sub_plan_table_content = []
        # Science rows (major names may carry newlines in this year's PDF).
        for item in li_table:
            if item[i_pro] != "":
                sub_plan_table_content.append(
                    [item[0].replace("\n", ""), "理工", item[i_pro]])
        # Science headcount total for this province.
        sub_plan_table_content.append(["理工", "统计", pro_line_li[i_pro]])
        # Liberal-arts rows.
        for item in wen_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "文史", item[i_pro]])
        # Liberal-arts headcount total for this province.
        sub_plan_table_content.append(["文史", "统计", pro_line_wen[i_pro]])
        write_table(store_path, sub_plan_table_name, sub_plan_table_head,
                    sub_plan_table_content)
def get_plan_info_nju():
    """Parse Nanjing University enrollment-plan PDFs and write per-year tables.

    Earlier pipeline stages (selenium page fetch, PDF download) were run once
    and remain below as disabled code for reproducibility.  This function now
    only parses the already-downloaded PDFs under ``<file_path>/source`` whose
    names follow ``<year>-<province>.pdf``, and writes one table per file via
    ``write_table``.
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://bkzs.nju.edu.cn"
    # Category / storage root.
    file_path = "Information/九校联盟/南京大学/招生计划"
    # Stage 1 (disabled): fetch the hidden province list with selenium.
    # browser = selenium_chrome(main_url+"/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path+"/source/"+"index","w",encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))
    # Stage 2 (disabled): parse the saved index with bs4 and save each
    # province's article page.
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url+url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}",BeautifulSoup(page_source,"lxml").find("p").text)[0]
    #     with open(file_path + "/source/"+year+"-"+pro+".html","w",encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)
    # Stage 3 (disabled): download the PDF linked from each saved page.
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source,"lxml")
    #         for item in page_soup.find_all("div",class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url+pdf_url)
    #             with open(file_path + "/source/"+pdf_name+".pdf","wb")as pdf_file:
    #                 pdf_file.write(pdf_source.content)
    # Stage 4: parse the downloaded PDF files.
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            # File names look like "<year>-<province>.pdf".
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # Flatten every table row on every page into one list.
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # Split into sub-tables: each starts at a header row "科类".
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                    table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # Tag every major of the special-plan sub-tables with the plan
            # name so they stay distinguishable after flattening.
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([
                            line[0],
                            str(line[1]) + "(" + sign + ")",
                            line[2]
                        ])
                else:
                    for line in table:
                        all_lines.append(line)
            table_content = []
            for line in all_lines:
                # BUGFIX: line[1] may be None for rows the PDF extractor
                # could not fill; the original line[1].find(...) raised
                # AttributeError then.  str() makes the checks safe and
                # still catches the "None(...)" values produced above.
                if line[0] == "科类" or line[0] == "总计" \
                        or str(line[1]).find("小计") != -1 \
                        or str(line[1]).find("None") != -1 \
                        or line[2] == "" or line[2] == "0" or line[2] is None:
                    continue
                # Expand the one-character science/arts category markers.
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append(
                    [line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "招生计划已存入文件")
def write_plan_info_sjtu_2015(store_path, info_path):
    """Stub for the SJTU 2015 enrollment plan.

    Currently only echoes the year marker and dumps the raw tables parsed
    from *info_path*; nothing is written to *store_path* yet.
    """
    print("2015")
    print(read_pdf_to_tables(info_path))