def read_pdf_to_text(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start reading the PDF file!")
    # Print the text of each page, page by page
    source_pdf = open(path, 'rb')
    # Create a PDF resource manager to share resources
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Iterate over the page list and process one page at a time
    for page in PDFPage.get_pages(source_pdf):
        interpreter.process_page(page)
        # Receive the LTPage object of this page. layout holds the objects
        # parsed from the page, typically LTTextBox, LTFigure, LTImage,
        # LTTextBoxHorizontal, etc.; read an object's text attribute to get its text.
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text()
                print(results)
                print("-----------")
    source_pdf.close()  # close the handle; the original leaked it
    function_logger.info("Finished reading text from the PDF file!")
def build_template_by_infos(template_path: str, fields_question_condition: list, fields_question_target: list,
                            template_sentence_questions: list, template_sentence_answers: list):
    """
    Build a question template from the given information.
    :param template_path: path of the template file
    :param fields_question_condition: condition words of the question, e.g. ["school 学校", "year 年份", "major 专业", "district 省份", "classy 类别"]
    :param fields_question_target: target words of the question, e.g. ["numbers 招生人数 招生计划 招多少人 招生计划是多少 招生人数是多少"]
    :param template_sentence_questions: template question sentences
    :param template_sentence_answers: template answer sentences
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start building the question template for %s..." % template_path.split("\\")[-1])
    # The template is stored as a pickle file
    template_dict = {}
    template_dict["fq_conditon"] = fields_question_condition  # key spelling ("conditon") kept so existing readers of the pickle keep working
    template_dict["fq_target"] = fields_question_target
    template_dict["ts_answers"] = template_sentence_answers
    build_question_sentences = []
    # Collect the English field name of each condition word
    template_fields_en = []
    for fq_condition in fields_question_condition:
        template_fields_en.append(fq_condition.split(" ")[0])
    # Build the power set of the English field names; questions for every
    # combination of omitted conditions are generated by substitution
    fields_en_subset = get_subset_binary(template_fields_en)
    function_logger.debug("Number of field subsets: %d" % len(fields_en_subset))
    for i_question in range(len(template_sentence_questions)):
        # Find the target word contained in the current template question
        match_question_target = []
        for fq_target in fields_question_target:
            if fq_target.split(" ")[0] in template_sentence_questions[i_question]:
                match_question_target = fq_target.split(" ")
                # Stop at the first hit: a question is assumed to contain exactly one target word
                break
        # Substitute each asking mode of the matched target word
        target_word = match_question_target[0]
        for question_mode in match_question_target[1:]:
            # Apply every subset of condition fields
            for subset in fields_en_subset:
                sentence = template_sentence_questions[i_question].replace("(" + target_word + ")", question_mode) \
                           + "--" + str(i_question)
                if not subset:
                    # Empty subset: no condition is omitted
                    build_question_sentences.append(sentence)
                elif len(subset) == len(template_fields_en):
                    # Full set: every condition would be omitted, skip
                    continue
                else:
                    # Remove the fields in the subset, then add the sentence
                    for field_en in subset:
                        sentence = sentence.replace("(" + field_en + ")", "")
                    build_question_sentences.append(sentence)
    template_dict["ts_questions"] = build_question_sentences
    with open(template_path, "wb") as p_file:
        pickle.dump(template_dict, p_file)
    function_logger.info("Finished building the question template for %s!" % template_path.split("\\")[-1])
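# Usage sketch for build_template_by_infos (never called by the module). The
# path, the question template, and the answer template are hypothetical; the
# condition/target word lists reuse the examples from the docstring, and the
# "(field)" slot syntax matches what the function replaces and removes.
def _demo_build_template_by_infos():
    build_template_by_infos(
        "QuestionTemplate/admission_plan",  # hypothetical template path
        ["school 学校", "year 年份", "major 专业", "district 省份", "classy 类别"],
        ["numbers 招生人数 招生计划 招多少人 招生计划是多少 招生人数是多少"],
        ["(school)(year)年在(district)的(major)(classy)(numbers)"],  # hypothetical question template
        ["(school)(year)年在(district)的(major)(classy)招生人数为(numbers)人"])  # hypothetical answer template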
def build_university_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the major-name dictionary...")
    source_path = "Information/大学/大学学科(百度百科网页源码).txt"
    with open(source_path, "r", encoding="utf-8") as source_file:
        main_page_source = source_file.read()
    # Earlier bs4 parsing attempt, kept for reference:
    # main_page_soup = BeautifulSoup(main_page_source, "lxml")
    # main_page_soup.prettify()
    # source_major_list = main_page_soup.find_all("div", class_="para")
    # i = 0
    # while source_major_list[i].text.find("01学科门类") == -1:
    #     i += 1
    # else:
    #     cut_index = i
    # source_major_list = source_major_list[cut_index:]
    # for item in source_major_list:
    #     print(item.text)
    # Extract the university major catalog with a regular expression
    result = re.findall(r'\d{4,6}[KT]*\s*[\u4e00-\u9fa5]+', main_page_source)
    # Trim the leading/trailing matches that do not belong to the catalog
    result = result[4:-4]
    # Build the dictionary from the extracted data
    with open(dictionary_path + "/major.txt", "w", encoding="utf-8") as major_dict:
        major_dict.truncate()
        for item in result:
            major_dict.write(re.findall(r'[\u4e00-\u9fa5]+', item)[0] + "\n")
    function_logger.info("Finished building the major-name dictionary!")
def load_table_content(file_path: str):
    """
    Load table content from an Excel workbook and log the distinct values of each column.
    :param file_path: path of the Excel file
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Load the workbook
    function_logger.info("Loading table: %s" % file_path.split("\\")[-1])
    wb = load_workbook(file_path)
    sheet_names = wb.sheetnames
    sheet_first = wb[sheet_names[0]]  # get_sheet_by_name() is deprecated in openpyxl
    table_head = []
    for item in range(1, sheet_first.max_column + 1):
        table_head.append(sheet_first.cell(row=1, column=item).value)
    function_logger.debug("Table head: %s" % str(table_head))
    # Collect the distinct values of every column
    table_attr = {}
    for i_column in range(1, sheet_first.max_column + 1):
        column_name = sheet_first.cell(row=1, column=i_column).value
        column_value = set()
        for i_row in range(2, sheet_first.max_row + 1):
            column_value.add(sheet_first.cell(row=i_row, column=i_column).value)
        table_attr[column_name] = str(list(column_value))
    for key in table_attr:
        function_logger.debug(key)
        value_list = [value.replace("'", "").strip() for value in table_attr[key][1:-1].split(",")]
        value_list.sort()
        function_logger.debug("List length: %d" % len(value_list))
        function_logger.debug(str(value_list))
    function_logger.info("Finished loading table: %s!" % file_path.split("\\")[-1])
def build_mysql_string_by_template_and_keymap(template_question: str, template_question_type: str,
                                              keyword_dict: dict) -> str:
    """
    Build a MySQL statement from the template type and the keyword map.
    :param template_question: template question
    :param template_question_type: type of the template question (also the table name)
    :param keyword_dict: keyword map
    :return: SQL statement
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start building the MySQL statement...")
    search_table = template_question_type
    # Extract the slots from the template question
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # Build the SQL statement, skipping slots with no keyword
    mysql_string = ""
    for i_slot in range(len(slots)):
        key = keyword_dict["search_" + slots[i_slot][1:-1]]
        if key == "":
            continue
        else:
            mysql_string += slots[i_slot][1:-1] + "='" + key + "' and "
    if mysql_string != "":
        # Strip the trailing " and " before closing the statement
        mysql_string = "select * from " + search_table + " where " + mysql_string[:-5] + ";"
    function_logger.info("MySQL statement built!")
    return mysql_string
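# Usage sketch for build_mysql_string_by_template_and_keymap (never called by
# the module; the question and keyword map are hypothetical, following the
# "(slot)" syntax and the "search_<slot>" key prefix used above). The call
# below returns:
# "select * from admission_plan where school='哈尔滨工业大学' and year='2015' and district='河南' and major='软件工程';"
def _demo_build_mysql_string_by_template_and_keymap():
    question = "(school)(year)年在(district)(major)的招生人数"  # hypothetical template question
    keymap = {"search_school": "哈尔滨工业大学", "search_year": "2015",
              "search_district": "河南", "search_major": "软件工程"}
    return build_mysql_string_by_template_and_keymap(question, "admission_plan", keymap)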
def build_mysql_string_by_template(template_question: str, template_question_type: str) -> str:
    """
    Build a MySQL statement skeleton from the template type (slots).
    :param template_question: template question
    :param template_question_type: type of the template question (also the table name)
    :return: the corresponding MySQL statement
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start building the MySQL statement...")
    search_table = template_question_type
    # Extract the slots from the template question
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # Build the SQL statement; each bracketed condition is optional
    mysql_string = "select * from " + search_table + " where "
    for i_slot in range(len(slots)):
        if i_slot == len(slots) - 1:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[i_slot] + "'" + "]"
        else:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[i_slot] + "' and " + "]"
    mysql_string += ";"
    function_logger.info("MySQL statement built!")
    return mysql_string
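# Usage sketch for build_mysql_string_by_template (never called by the module;
# the question is hypothetical). Each slot becomes an optional bracketed
# condition, so the call below returns:
# "select * from admission_score_pro where [school='(school)' and ][year='(year)' and ][district='(district)'];"
def _demo_build_mysql_string_by_template():
    question = "(school)(year)年在(district)的分数线"  # hypothetical template question
    return build_mysql_string_by_template(question, "admission_score_pro")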
def pretreat_crawl_questions():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    file_list = os.listdir(data_dir)
    function_logger.debug("Number of universities: %d" % len(file_list))
    for file in file_list:
        university_name = file[:-9]
        function_logger.debug(university_name)
        function_logger.info("Start reading the FAQ set of %s..." % university_name)
        with open(data_dir + "/" + file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0]
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3]
                    line["answer"] = row[4]
                    fqa_lines.append(line)
            # Drop the header row
            fqa_lines.pop(0)
        function_logger.info("Finished reading the FAQ set of %s!" % university_name)
        function_logger.info("Start writing the FAQ set of %s..." % university_name)
        with open(pickle_dir + "/" + university_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("Finished writing the FAQ set of %s!" % university_name)
    function_logger.info("Data pretreatment finished!")
def plan_doc_to_mysql_table_tuple(file_path, school):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("Inserting file " + file_path)
    file_content = read_file_content(file_path)
    file_name = file_path.split("\\")[-1]
    year = file_name.split("-")[0]
    district = file_name.split("-")[1]
    table_content = []
    for i in range(len(file_content)):
        file_content[i] = file_content[i].strip()
        temp = file_content[i].split("\t")
        table_content.append(temp)
    table_head = table_content[0]
    table_content = table_content[1:]
    # Drop rows without data; filter into a new list instead of calling
    # remove() while iterating, which skips elements
    table_content = [item for item in table_content if item[0] != "无数据"]
    mysql_content = []
    for item in table_content:
        major = item[0]
        classy = item[1]
        numbers = item[2]
        temp = (school, district, year, major, classy, numbers)
        mysql_content.append(temp)
    return mysql_content
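# Input sketch for plan_doc_to_mysql_table_tuple (hypothetical data): the file
# name is expected to start with "<year>-<district>" and the content to be
# tab-separated with a header row, e.g.
#   专业\t类别\t人数
#   软件工程\t理工\t30
# plan_doc_to_mysql_table_tuple(r"...\2015-河南-招生计划", "哈尔滨工业大学")
# would then yield [("哈尔滨工业大学", "河南", "2015", "软件工程", "理工", "30")].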
def read_pdf_to_words(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start reading the PDF file!")
    with pdfplumber.open(path) as pdf:
        all_words = []
        for page in pdf.pages:
            words = page.extract_words()
            all_words.append(words)
    function_logger.info("Finished reading words from the PDF file!")
    return all_words
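# Usage sketch for read_pdf_to_words (never called by the module; the path is
# hypothetical). pdfplumber's extract_words() returns one dict per word with
# its text and coordinates, so the result is a list of such lists, one per page.
def _demo_read_pdf_to_words():
    pages = read_pdf_to_words("Information/九校联盟/南京大学/招生计划/source/2018-江苏.pdf")  # hypothetical file
    for word in pages[0]:
        print(word["text"], word["x0"], word["top"])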
def build_classy_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the category-name dictionary...")
    classy = ["文科", "理科", "文史", "理工"]
    with open(dictionary_path + "/classy.txt", "w", encoding="utf-8") as classy_dict:
        classy_dict.truncate()
        for item in classy:
            classy_dict.write(item + "\n")
    function_logger.info("Finished building the category-name dictionary!")
def build_mysql_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Fetching the major field from the admission tables...")
    # Major names in the admission plan
    plan_sql_string = "SELECT major FROM admission_plan GROUP BY major;"
    myresult = mysql_query_sentence(plan_sql_string)
    function_logger.debug("Majors in admission_plan: %d" % len(myresult))
    # Strip the bracketed suffix (and everything after it) from a major name
    pattern = re.compile(r"[(([].*?[))\]].*")
    plan_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        plan_major_set.add(temp)
    function_logger.debug("Majors in admission_plan (after merging): %d" % len(plan_major_set))
    function_logger.debug(str(sorted(list(plan_major_set), key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # Major names in the admission scores
    score_sql_string = "SELECT major FROM admission_score_major GROUP BY major;"
    myresult = mysql_query_sentence(score_sql_string)
    function_logger.debug("Majors in admission_score_major: %d" % len(myresult))
    score_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        score_major_set.add(temp)
    function_logger.debug("Majors in admission_score_major (after merging): %d" % len(score_major_set))
    function_logger.debug(str(sorted(list(score_major_set), key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # Intersection of the two sets
    function_logger.debug("Intersection of the two sets:")
    major_and_set = plan_major_set.intersection(score_major_set)
    function_logger.debug("Intersection size: %d" % len(major_and_set))
    function_logger.debug(str(sorted(list(major_and_set), key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # Union of the two sets
    function_logger.debug("Union of the two sets:")
    major_or_set = plan_major_set.union(score_major_set)
    function_logger.debug("Union size: %d" % len(major_or_set))
    function_logger.debug(str(sorted(list(major_or_set), key=lambda x: lazy_pinyin(x.lower())[0][0])))
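# Illustration of the merging pattern above (hypothetical major names):
# re.sub(pattern, "", "数学类(中外合作办学)")       -> "数学类"
# re.sub(pattern, "", "工科试验班[计算机类]高校专项") -> "工科试验班"
# i.e. everything from the first (half- or full-width) opening bracket on is dropped,
# which is what lets variants of the same major collapse into one set entry.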
def read_pdf_to_tables(path):
    """
    Read a PDF and return a table list.
    :param path: the pdf path
    :return tables: table list
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start reading the PDF file!")
    with pdfplumber.open(path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("Finished reading tables from the PDF file!")
    return all_tables
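# Usage sketch for read_pdf_to_tables (never called by the module; the path is
# hypothetical). The result nests page -> table -> row -> cell, mirroring
# pdfplumber's extract_tables().
def _demo_read_pdf_to_tables():
    pages = read_pdf_to_tables("Information/九校联盟/南京大学/招生计划/source/2018-江苏.pdf")  # hypothetical file
    for tables in pages:          # one entry per page
        for table in tables:      # one entry per table on the page
            for row in table:     # one entry per row
                print(row)        # list of cell strings (cells may be None)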
def read_pdf_to_tables(file_path):
    """
    Parse the tables in a PDF file.
    :param file_path: path of the PDF file
    :return: list of table data
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start reading the PDF file!")
    with pdfplumber.open(file_path) as pdf:  # the parameter is file_path; the original used the undefined name "path"
        all_tables = []
        # Parse the tables of every page and collect them into one list
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("Finished reading tables from the PDF file!")
    return all_tables
def build_school_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the school-name dictionary...")
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    # Common abbreviations, aligned with the full names above
    c9_j = ["北大", "清华", "复旦", "上交", "浙大", "南大", "中科大", "哈工大", "西交大",
            "北大医学部", "上交医学部", "复旦医学部"]
    with open(dictionary_path + "/school.txt", "w", encoding="utf-8") as school_dict:
        school_dict.truncate()
        for item in c9:
            school_dict.write(item + "\n")
        for item in c9_j:
            school_dict.write(item + "\n")
    function_logger.info("Finished building the school-name dictionary!")
def create_database(db_name: str):
    """
    Create the database if it does not already exist.
    :param db_name: database name
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_without_db()
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    dbs = []
    function_logger.debug("Existing databases:")
    for db in mycursor:
        dbs.append(db[0])
        function_logger.debug(db[0])
    if db_name in dbs:
        function_logger.info("Database " + db_name + " already exists!")
    else:
        mycursor.execute("CREATE DATABASE " + db_name)
        function_logger.info(db_name + " created!")
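# Usage sketch (never called by the module): the database name used by the
# table-creation functions below is "university_admission", so a typical call is:
def _demo_create_database():
    create_database("university_admission")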
def get_plan_info_ustc():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the category page
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div", class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        # Skip the header row and the total/subtotal rows
        table_content = []
        for line in all_lines[1:]:
            if line[0] != "合计" and line[0] != "小计":
                if district == "浙江" or district == "上海":
                    table_content.append([line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info("Admission plan of " + year + district + " saved to file")
def get_undergraduate_university_info():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Main page of the university library
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(main_page_soup.find("li", class_="lip dot").next_sibling.text)
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("Page progress (%d,%d)" % (i_page + 1, int(page_count)))
        function_logger.info("Page url: %s" % page_url)
        browser = selenium_chrome(page_url)
        page_source = browser.find_element_by_class_name("ch-table").get_attribute("innerHTML")
        browser.quit()
        page_soup = BeautifulSoup(page_source, "lxml")
        page_soup.prettify()
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        function_logger.debug(str(head))
        for tr in page_soup.find_all("tr")[1:]:
            info = {}
            td_list = tr.find_all("td")
            info["url"] = "https://gaokao.chsi.com.cn" + td_list[0].find("a")["href"]
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(" ", "").replace("\u2002", " ")
            info[head[6]] = td_list[6].text.strip().replace("\ue664", "有") if td_list[6].text.strip() != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        function_logger.debug(str(info))
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
def create_plan_score_folder_c9():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # C9 universities and their medical schools
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    catalog = ["招生计划", "录取分数"]
    root_path = "Information/九校联盟"
    for university in c9:
        function_logger.info("Creating the folders of %s" % university)
        if not os.path.exists(root_path + "/" + university):
            os.makedirs(root_path + "/" + university)
        for cat in catalog:
            if not os.path.exists(root_path + "/" + university + "/" + cat):
                os.makedirs(root_path + "/" + university + "/" + cat)
            # Create the source folder (stores the raw crawled data)
            if not os.path.exists(root_path + "/" + university + "/" + cat + "/source"):
                os.makedirs(root_path + "/" + university + "/" + cat + "/source")
        function_logger.info("Folders of %s created!" % university)
def insert_all_school_table_admission_plan():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    # Only the schools whose data have been collected so far
    already_get = ["南京大学"]
    for school in already_get:
        mylogger.info("Start inserting the admission-plan data of " + school + "...")
        dir_path = "Information/九校联盟/" + school + "/招生计划"
        file_list = read_all_file_list(dir_path)
        for file in file_list:
            mylogger.info("Building data tuples...")
            mysql_content = plan_doc_to_mysql_table_tuple(file, school)
            mylogger.info("Inserting the tuples into the database...")
            insert_table_admission_plan(mysql_content)
            mylogger.info("Tuples inserted!")
            time.sleep(5)
def create_admission_plan_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_plan" in tables:
        function_logger.info("Table admission_plan already exists!")
        function_logger.info("Dropping table admission_plan...")
        mycursor.execute("DROP TABLE admission_plan;")
    # Admission plan (school, district, year, major, category, numbers)
    mycursor.execute("CREATE TABLE admission_plan("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy VARCHAR(10),"
                     "numbers VARCHAR(10))")
    function_logger.info("Table admission_plan recreated!")
def create_admission_score_pro_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_pro" in tables:
        function_logger.info("Table admission_score_pro already exists!")
        function_logger.info("Dropping table admission_score_pro...")
        mycursor.execute("DROP TABLE admission_score_pro;")
    # Provincial admission lines (school, year, district, batch, category, line)
    mycursor.execute("CREATE TABLE admission_score_pro("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "year INT,"
                     "district VARCHAR(10),"
                     "batch VARCHAR(30),"
                     "classy VARCHAR(10),"
                     "line VARCHAR(30))")
    function_logger.info("Table admission_score_pro created!")
def create_admission_score_major_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_major" in tables:
        function_logger.info("Table admission_score_major already exists!")
        function_logger.info("Dropping table admission_score_major...")
        mycursor.execute("DROP TABLE admission_score_major;")
    # Per-major admission scores (school, district, year, major, category,
    # highest, average, lowest, number admitted)
    mycursor.execute("CREATE TABLE admission_score_major("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy VARCHAR(30),"
                     "highest VARCHAR(10) NULL,"
                     "average VARCHAR(10) NULL,"
                     "lowest VARCHAR(10),"
                     "amount VARCHAR(10) NULL)")
    function_logger.info("Table admission_score_major created!")
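# Example query against the schema above (hypothetical values; the columns are
# exactly those in the CREATE TABLE statement):
# SELECT highest, average, lowest FROM admission_score_major
#     WHERE school='哈尔滨工业大学' AND district='河南' AND year=2015 AND major='软件工程';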
# -*- coding: utf-8 -*-
"""
@File  : HanLPAPI.py
@Author: SangYu
@Date  : 2018/12/27 14:56
@Desc  : API of the HanLP platform
"""
from pyhanlp import *
from Log.Logger import MyLog


# Segmentation (with POS tagging)
def hanlp_nlp_segmentor(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return str(nlp_tokenizer.analyze(sentence)).split(" ")


# Segmentation (without POS tagging)
def hanlp_nlp_segmentor_without_nature(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    word_list = str(nlp_tokenizer.analyze(sentence)).split(" ")
    return [word.split("/")[0] for word in word_list]


if __name__ == "__main__":
    mylogger = MyLog(logger=__name__).getlog()
    mylogger.info("start...")
    print(type(hanlp_nlp_segmentor("2015年哈工大软件工程在河南招多少人?")))
    print(hanlp_nlp_segmentor("一五年哈工大软件工程在河南招多少人?"))
    mylogger.info("end...")
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # URL of the admission-consulting pages
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学", str(53593)]]
    for school in school_urls:
        function_logger.info("Start crawling the admission questions of " + school[0] + "...")
        # Create the CSV that collects this school's questions and write the header
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv", "w", encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of pages
        page_count = main_page_soup.find("li", class_="lip dot").next_sibling.a.string
        # Number of pinned questions
        top_question_count = len(main_page_soup.find("table", class_="ch-table zx-table")
                                 .find_all("span", class_="question_top_txt"))
        # Number of questions per page
        page_question_count = 15
        # Visit each page through its constructed url
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" \
                       + school[1] + ",start-" + str(i_page * page_question_count) + ".dhtml"
            # xls record base index (page questions + pinned questions + header), kept for reference:
            # if i_page == 0:
            #     base_count = 1
            # else:
            #     base_count = i_page * page_question_count + top_question_count + 1
            function_logger.info("Page progress (%d,%d)" % (i_page + 1, int(page_count)))
            function_logger.info("Page url: %s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table", class_="ch-table zx-table").contents
            # Filter into a new list instead of calling remove() while iterating
            tr_list = [item for item in tr_list if item != "\n"]
            records = []
            # Record the pinned Q&As only once, on the first page
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = str(tr_list[i_qa_pair].find("a", class_="question_t_txt").string).strip()
                function_logger.debug("Title: %s" % question_title)
                question_from = str(tr_list[i_qa_pair].find("i", title="提问人").next_sibling.string).strip()
                function_logger.debug("From: %s" % question_from)
                question_time = str(tr_list[i_qa_pair].find("td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("Time: %s" % question_time)
                # A question or answer may be truncated on this page;
                # follow the "[详细]" link to fetch the full text
                question_text_class = tr_list[i_qa_pair + 1].find("div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    question_text = str(turn_page_soup.find("div", class_="question").text).strip()
                function_logger.debug("Question: %s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find("div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace("[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "",
                                         str(turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("Answer: %s" % answer_text)
                records.append([question_title, question_from, question_time, question_text, answer_text])
            with open(file_path + "/" + school[0] + "常用问题集.csv", "a", encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("FAQ set of %s collected!" % school[0])
def frequent_question_normalize(dir_path: str):
    """
    Normalize the FAQ set (csv): clean the question and answer parts.
    :param dir_path: directory path
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Start data normalization...")
    file_list = read_all_file_list(dir_path + "/source")
    for file in file_list:
        function_logger.debug(file)
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("Start reading the FAQ set of %s..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0].replace(" ", "")
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3].replace("\u3000", "").replace("\n", ",").replace(" ", "")
                    line["answer"] = row[4].replace("\ue63c", "").replace("\u3000", "").replace("\n", ",") \
                        .replace(" ", "").lstrip(",")
                    fqa_lines.append(line)
            # Drop the header row
            fqa_lines.pop(0)
        function_logger.info("Finished reading the FAQ set of %s!" % school_name)
        function_logger.info("Start writing the FAQ set of %s..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("Finished writing the FAQ set of %s!" % school_name)
    function_logger.info("Data normalization finished!")
if __name__ == '__main__':
    main_logger = MyLog(logger=__name__).getlog()
    main_logger.info("start...")
    question_set_dir = "../InformationGet/Information/大学/常问问题集"
    # frequent_question_normalize(question_set_dir)
    with open(question_set_dir + "/预处理/pickle/" + "上海交通大学医学院", "rb") as p_file:
        data = pickle.load(p_file)
    for q in data[:10]:
        print(q)
    main_logger.info("end...")
def get_plan_info_xjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # Scraping the individual pages needs heavy post-processing; kept for reference:
    # mylogger.info("Fetching page sources... five pages in total")
    # with open(file_path + "/source/page_url_list", "w", encoding="utf-8") as url_file:
    #     for i in range(1, 6):
    #         main_url = "http://zs.xjtu.edu.cn/lmy.jsp?a43639t=5&a43639p=" + str(i) \
    #                    + "&a43639c=10&urltype=tree.TreeTempUrl&wbtreeid=1005"
    #         main_page_source = requests.get(main_url).text
    #         main_page_soup = BeautifulSoup(main_page_source, "lxml")
    #         main_page_soup.prettify()
    #         for item in main_page_soup.find("div", id="fybt").find("ul").find_all("a"):
    #             url_file.write(str(item) + "\n")
    # mylogger.info("Admission-plan page urls fetched")
    # mylogger.info("Fetching the individual pages")
    # with open(file_path + "/source/page_url_list", "r", encoding="utf-8") as url_file:
    #     url_source = url_file.read()
    # url_soup = BeautifulSoup(url_source, "lxml")
    # url_soup.prettify()
    # for page_url in url_soup.find_all("a"):
    #     print(page_url)
    # Query the official site directly via a form submission.
    # First fetch the years and districts that can be queried.
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="sf").find_all("option")[1:]:
        districts.append(district.string)
    mylogger.debug("Queryable years: " + str(years))
    mylogger.debug("Queryable districts: " + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the query button (button size 54x22)
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for tr in return_soup.find("div", id="fybt").find_all("tr"):
                line = []
                for td in tr:
                    if td.string != "\n":
                        line.append(str(td.string).strip())
                all_lines.append(line)
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            table_content = []
            for line in all_lines[1:-1]:
                classy = line[2]
                if classy == "理":
                    classy = "理工"
                if classy == "文":
                    classy = "文史"
                table_content.append([line[0], classy, line[4]])
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info("Admission plan of " + year + district + " saved to file")
def get_plan_info_nju():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("Fetching page sources...")
    main_url = "http://bkzs.nju.edu.cn"
    file_path = "Information/九校联盟/南京大学/招生计划"
    # Fetch the hidden part of the source with selenium (kept for reference):
    # browser = selenium_chrome(main_url + "/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path + "/source/" + "index", "w", encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))
    # Parse the saved source with bs4 and save each district page:
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url + url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}", BeautifulSoup(page_source, "lxml").find("p").text)[0]
    #     with open(file_path + "/source/" + year + "-" + pro + ".html", "w", encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)
    # Download the pdf files:
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source, "lxml")
    #         for item in page_soup.find_all("div", class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url + pdf_url)
    #             with open(file_path + "/source/" + pdf_name + ".pdf", "wb") as pdf_file:
    #                 pdf_file.write(pdf_source.content)
    # Parse the pdf files
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # Split the flat line list into sub-tables at each header row ("科类")
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                    table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # Propagate each sub-table's batch label into its rows
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([line[0], str(line[1]) + "(" + sign + ")", line[2]])
                else:
                    for line in table:
                        all_lines.append(line)
            table_content = []
            for line in all_lines:
                if line[0] == "科类" or line[0] == "总计" or line[1].find("小计") != -1 or line[1].find("None") != -1 \
                        or line[2] == "" or line[2] == "0" or line[2] is None:
                    continue
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append([line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info("Admission plan of " + year + pro + " saved to file")
def get_plan_info_sjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    main_file_path = "Information/九校联盟/上海交通大学/招生计划"
    # Fetch the list page (kept for reference):
    # main_url = "http://zsb.sjtu.edu.cn/web/jdzsb"
    # url = main_url + "/3810061.htm"
    # page_source = request_url(url)
    # page_source.encoding = page_source.apparent_encoding
    # page_soup = BeautifulSoup(page_source.text, "lxml")
    # page_soup.prettify()
    # Download the files:
    # logger.info("Start downloading files")
    # for item in page_soup.find("ul", class_="infor_right02_cont").find_all("li"):
    #     logger.debug(item.a["title"])
    #     logger.debug(re.findall('\d{4}', item.a["title"]))
    #     year = re.findall('\d{4}', item.a["title"])[0]
    #     logger.debug(item.a["href"])
    #     specific_url = main_url + "/" + item.a["href"]
    #     sub_page_source = request_url(specific_url)
    #     sub_page_source.encoding = "utf-8"
    #     sub_page_soup = BeautifulSoup(sub_page_source.text, "lxml")
    #     sub_page_soup.prettify()
    #     image_index = 0
    #     for sub_item in sub_page_soup.find("div", class_="artical_box").find_all("img"):
    #         file_name = year + str(image_index) + sub_item["src"].split("/")[-1][-4:]
    #         file_url = sub_item["src"]
    #         if file_url[0] == "f":
    #             continue
    #         else:
    #             file_content = request_url("http://zsb.sjtu.edu.cn" + file_url)
    #             with open(main_file_path + "/source/" + file_name, "wb") as img:
    #                 img.write(file_content.content)
    #             image_index += 1
    # logger.info("Files downloaded!")
    # Group the downloaded images by year and convert them to pdf:
    # logger.info("Start parsing files")
    # file_list = read_all_file_list(main_file_path + "/source")
    # file_2015 = []
    # file_2016 = []
    # file_2017 = []
    # file_2018 = []
    # for item in file_list:
    #     if item[-3:] == "jpg" or item[-3:] == "png":
    #         if item.find("2015") != -1:
    #             file_2015.append(item)
    #         elif item.find("2016") != -1:
    #             file_2016.append(item)
    #         elif item.find("2017") != -1:
    #             file_2017.append(item)
    #         elif item.find("2018") != -1:
    #             file_2018.append(item)
    # logger.info("Converting image files to pdf")
    # store_path = os.getcwd() + "/" + main_file_path + "/source"
    # image_to_pdf(file_2015, store_path, "2015.pdf")
    # image_to_pdf(file_2016, store_path, "2016.pdf")
    # image_to_pdf(file_2017, store_path, "2017.pdf")
    # image_to_pdf(file_2018, store_path, "2018.pdf")
    # logger.info("Images converted to pdf!")
    # Re-read the file list and parse the per-year pdfs
    file_list = read_all_file_list(main_file_path + "/source")
    for item in file_list:
        if item[-3:] == "pdf":
            if item.find("2015") != -1:
                # write_plan_info_sjtu_2015(main_file_path, item)
                mylogger.info("2015 data parsed!")
            elif item.find("2016") != -1:
                # write_plan_info_sjtu_2016(main_file_path, item)
                mylogger.info("2016 data parsed!")
            elif item.find("2017") != -1:
                # write_plan_info_sjtu_2017(main_file_path, item)
                mylogger.info("2017 data parsed!")
            elif item.find("2018") != -1:
                write_plan_info_sjtu_2018(main_file_path, item)
                mylogger.info("2018 data parsed!")
def get_plan_info_hit():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("Fetching page source...")
    main_url = "http://zsb.hit.edu.cn/information/plan"
    # Fetch the category page
    main_page_source = requests.get(main_url).text
    main_page_soup = BeautifulSoup(main_page_source, "lxml")
    main_page_soup.prettify()
    # Districts with admission plans
    mylogger.info("Parsing admission districts...")
    province = []
    for item in main_page_soup.find(class_="province").find_all(name='a'):
        province.append(item.string.strip())
    mylogger.debug("HIT admission districts: " + str(province))
    # Years with admission plans
    mylogger.info("Parsing admission years...")
    years = []
    for item in main_page_soup.find_all(class_="year-select"):
        years.append(item.string.strip())
    mylogger.debug("HIT admission years: " + str(years))
    # Extract the data of every district for every year
    mylogger.info("Fetching data for each year and district...")
    for pro in province:
        for year in years:
            mylogger.info("Fetching the admission plan of " + year + pro)
            # Build the query url
            specific_url = main_url + "?" + "year=" + year + "&" + "province=" + pro
            page_source = requests.get(specific_url).text
            page_soup = BeautifulSoup(page_source, "lxml")
            page_soup.prettify()
            # Table name
            table_name = year + "-" + pro
            mylogger.debug("Table name: " + table_name)
            # Table head
            table_head = []
            for item in page_soup.find(class_="info_table").thead.find_all(name="td"):
                table_head.append(item.string.strip())
            mylogger.debug("Table head: " + str(table_head))
            # Table content
            table_content = []
            for item in page_soup.find(class_="info_table").tbody.find_all(name="tr"):
                temp = []
                for sub_item in item.find_all(name="td"):
                    temp.append(sub_item.string.strip())
                table_content.append(temp)
            # Drop rows without data; filter into a new list instead of calling
            # remove() while iterating, which skips elements
            table_content = [item for item in table_content if item[0] != "无数据"]
            mylogger.debug("Table content:")
            for item in table_content:
                mylogger.debug(item)
            # Write the table to a text file
            file_path = "Information/九校联盟/哈尔滨工业大学/招生计划"
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info("Admission plan of " + year + pro + " saved to file")