def write_plan_info_tsinghua_2013(store_path, info_path):
    """Parse Tsinghua's 2013 enrollment-plan PDF and write one table per province.

    The PDF lays out a science (理科) table spanning pages 1-2 and a
    liberal-arts (文科) table on page 2; the two are split at the row whose
    first cell is the header "专业名称".

    :param store_path: directory the per-province tables are written into
    :param info_path: path of the source PDF file
    """
    year = "2013"
    pages = read_pdf_to_tables(info_path)
    li_table = pages[0][0]
    # Locate where the liberal-arts table starts on page 2.
    index = 0
    for i_line in range(len(pages[1][0])):
        if pages[1][0][i_line][0] == "专业名称":
            index = i_line
            break
    li_table += pages[1][0][:index]
    wen_table = pages[1][0][index:]
    # Trim the title row and the trailing note row of each table.
    li_table = li_table[1:-1]
    wen_table = wen_table[:-1]
    # Clean the header row: strip embedded newlines and repair province
    # names truncated by the PDF column width.
    table_head = li_table[0]
    for i in range(len(table_head)):
        table_head[i] = table_head[i].replace("\n", "")
        if table_head[i] == "黑龙":
            table_head[i] = "黑龙江"
        if table_head[i] == "内蒙":
            table_head[i] = "内蒙古"
    # Keep the per-province totals rows before filtering them out below.
    pro_line_li = li_table[-1]
    pro_line_wen = wen_table[-1]
    # BUGFIX: the original called list.remove() while iterating the same
    # list, which skips the element following each removal.  Rebuild the
    # lists instead so every header/subtotal row is dropped.
    li_table = [item for item in li_table
                if item[0] != "专业名称" and item[0] != "理科合计"]
    wen_table = [item for item in wen_table
                 if item[0] != "专业名称" and item[0] != "文科合计"]
    # Write one file per province column (columns 0-1 are not provinces).
    for i_pro in range(2, len(table_head)):
        sub_plan_table_name = year + "-" + table_head[i_pro]
        sub_plan_table_head = ["专业", "类别", "人数"]
        sub_plan_table_content = []
        # Science rows.
        for item in li_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "理工", item[i_pro]])
        # Science headcount total for this province.
        sub_plan_table_content.append(["理工", "统计", pro_line_li[i_pro]])
        # Liberal-arts rows.
        for item in wen_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "文史", item[i_pro]])
        # Liberal-arts headcount total for this province.
        sub_plan_table_content.append(["文史", "统计", pro_line_wen[i_pro]])
        write_table(store_path, sub_plan_table_name, sub_plan_table_head,
                    sub_plan_table_content)
def write_plan_info_tsinghua_2014(store_path, info_path):
    """Parse Tsinghua's 2014 enrollment-plan PDF and write one table per province.

    Unlike 2013, the science (理科) table spans pages 1-2 as whole tables and
    the liberal-arts (文科) table is the second table on page 2.

    :param store_path: directory the per-province tables are written into
    :param info_path: path of the source PDF file
    """
    year = "2014"
    pages = read_pdf_to_tables(info_path)
    li_table = pages[0][0] + pages[1][0]
    wen_table = pages[1][1]
    # Clean the science header row of embedded newlines.
    table_head = li_table[0]
    for i in range(len(table_head)):
        table_head[i] = table_head[i].replace("\n", "")
    # Keep the per-province totals rows before filtering them out below.
    pro_line_li = li_table[-1]
    pro_line_wen = wen_table[-1]
    # BUGFIX: the original called list.remove() while iterating the same
    # list, which skips the element following each removal.  Rebuild the
    # lists instead so every header/subtotal row is dropped.
    li_table = [item for item in li_table
                if item[0] != "专业名称" and item[0] != "理科合计"]
    wen_table = [item for item in wen_table
                 if item[0] != "专业名称" and item[0] != "文科合计"]
    # Write one file per province column (column 0 is the major name).
    for i_pro in range(1, len(table_head)):
        sub_plan_table_name = year + "-" + table_head[i_pro]
        sub_plan_table_head = ["专业", "类别", "人数"]
        sub_plan_table_content = []
        # Science rows (major names may carry newlines in this year's PDF).
        for item in li_table:
            if item[i_pro] != "":
                sub_plan_table_content.append(
                    [item[0].replace("\n", ""), "理工", item[i_pro]])
        # Science headcount total for this province.
        sub_plan_table_content.append(["理工", "统计", pro_line_li[i_pro]])
        # Liberal-arts rows.
        for item in wen_table:
            if item[i_pro] != "":
                sub_plan_table_content.append([item[0], "文史", item[i_pro]])
        # Liberal-arts headcount total for this province.
        sub_plan_table_content.append(["文史", "统计", pro_line_wen[i_pro]])
        write_table(store_path, sub_plan_table_name, sub_plan_table_head,
                    sub_plan_table_content)
def get_plan_info_nju():
    """Parse Nanjing University enrollment-plan PDFs and write per-year tables.

    Earlier pipeline stages (selenium page fetch, PDF download) were run once
    and remain below as disabled code for reproducibility.  This function now
    only parses the already-downloaded PDFs under ``<file_path>/source`` whose
    names follow ``<year>-<province>.pdf``, and writes one table per file via
    ``write_table``.
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://bkzs.nju.edu.cn"
    # Category / storage root.
    file_path = "Information/九校联盟/南京大学/招生计划"
    # Stage 1 (disabled): fetch the hidden province list with selenium.
    # browser = selenium_chrome(main_url+"/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path+"/source/"+"index","w",encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))
    # Stage 2 (disabled): parse the saved index with bs4 and save each
    # province's article page.
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url+url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}",BeautifulSoup(page_source,"lxml").find("p").text)[0]
    #     with open(file_path + "/source/"+year+"-"+pro+".html","w",encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)
    # Stage 3 (disabled): download the PDF linked from each saved page.
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source,"lxml")
    #         for item in page_soup.find_all("div",class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url+pdf_url)
    #             with open(file_path + "/source/"+pdf_name+".pdf","wb")as pdf_file:
    #                 pdf_file.write(pdf_source.content)
    # Stage 4: parse the downloaded PDF files.
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            # File names look like "<year>-<province>.pdf".
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # Flatten every table row on every page into one list.
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # Split into sub-tables: each starts at a header row "科类".
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                    table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # Tag every major of the special-plan sub-tables with the plan
            # name so they stay distinguishable after flattening.
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([
                            line[0],
                            str(line[1]) + "(" + sign + ")",
                            line[2]
                        ])
                else:
                    for line in table:
                        all_lines.append(line)
            table_content = []
            for line in all_lines:
                # BUGFIX: line[1] may be None for rows the PDF extractor
                # could not fill; the original line[1].find(...) raised
                # AttributeError then.  str() makes the checks safe and
                # still catches the "None(...)" values produced above.
                if line[0] == "科类" or line[0] == "总计" \
                        or str(line[1]).find("小计") != -1 \
                        or str(line[1]).find("None") != -1 \
                        or line[2] == "" or line[2] == "0" or line[2] is None:
                    continue
                # Expand the one-character science/arts category markers.
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append(
                    [line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "招生计划已存入文件")
def write_plan_info_sjtu_2015(store_path, info_path):
    """Stub for the SJTU 2015 enrollment plan.

    Currently only echoes the year marker and dumps the raw tables parsed
    from *info_path*; nothing is written to *store_path* yet.
    """
    print("2015")
    print(read_pdf_to_tables(info_path))