def plan_doc_to_mysql_table_tuple(file_path, school):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("Inserting file " + file_path)
    file_content = read_file_content(file_path)
    file_name = file_path.split("\\")[-1]
    year = file_name.split("-")[0]
    district = file_name.split("-")[1]
    table_content = []
    for i in range(len(file_content)):
        file_content[i] = file_content[i].strip()
        table_content.append(file_content[i].split("\t"))
    table_head = table_content[0]
    table_content = table_content[1:]
    # Drop the rows without data ("无数据"); filtering with a comprehension
    # instead of remove-while-iterating, which skips adjacent items.
    table_content = [item for item in table_content if item[0] != "无数据"]
    mysql_content = []
    for item in table_content:
        major, classy, numbers = item[0], item[1], item[2]
        mysql_content.append((school, district, year, major, classy, numbers))
    return mysql_content
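# A hypothetical input file "2018-河南" for the parser above (tab-separated,
# header row first) and the tuple it would produce; the values are made up:
#
#   专业	类别	人数
#   软件工程	理工	30
#
#   plan_doc_to_mysql_table_tuple(file, "哈尔滨工业大学")
#       -> [("哈尔滨工业大学", "河南", "2018", "软件工程", "理工", "30")]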
def __init__(self, university_id, page_id, page_count, top_question_count, record_queue):
    Thread.__init__(self)
    self.university_id = university_id
    self.page_id = page_id
    self.page_count = page_count
    self.top_question_count = top_question_count
    self.record_queue = record_queue
    self.thread_logger = MyLog(
        logger="thread" + str(threading.current_thread().ident)).getlog()
def write_plan_info_sjtu_2018(store_path, info_path):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    year = "2018"
    with pdfplumber.open(info_path) as pdf:
        first_page = pdf.pages[0]
        im = first_page.to_image()
        im.draw_rects(first_page.extract_words())
def score_pro_doc_to_mysql_table_tuple(file_path, school):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_content = read_file_content(file_path)
    file_name = file_path.split("\\")[-1]
    year = file_name.split("-")[0]
    table_format = file_name.split("-")[-1]
    table_content = []
    for i in range(len(file_content)):
        file_content[i] = file_content[i].strip().replace("-", "NULL")
        table_content.append(file_content[i].split("\t"))
    table_head = table_content[0]
    table_content = table_content[1:]
    mysql_content = []
    for item in table_content:
        district, batch, classy, line = item[0], item[1], item[2], item[3]
        mysql_content.append((school, year, district, batch, classy, line))
    return mysql_content
def insert_all_school_table_admission_plan():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    already_get = ["南京大学"]
    for school in already_get:
        mylogger.info("Inserting the admission plan data of " + school + "...")
        dir_path = "Information/九校联盟/" + school + "/招生计划"
        file_list = read_all_file_list(dir_path)
        for file in file_list:
            mylogger.info("Building record tuples...")
            mysql_content = plan_doc_to_mysql_table_tuple(file, school)
            mylogger.info("Inserting the tuples into the database...")
            insert_table_admission_plan(mysql_content)
            mylogger.info("Tuples inserted!")
            time.sleep(5)
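# insert_table_admission_plan is defined elsewhere in the repository; a
# minimal sketch of such a helper, assuming the admission_plan schema from
# create_admission_plan_table and the connect_mysql_with_db helper:
def insert_table_admission_plan_sketch(mysql_content):
    mydb = connect_mysql_with_db("university_admission")
    mycursor = mydb.cursor()
    sql = ("INSERT INTO admission_plan"
           "(school, district, year, major, classy, numbers)"
           " VALUES (%s, %s, %s, %s, %s, %s)")
    # executemany sends every tuple built above in one batch
    mycursor.executemany(sql, mysql_content)
    mydb.commit()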
def create_database(db_name: str):
    """
    Create the university_admission database.
    :param db_name: database name
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_without_db()
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    dbs = []
    function_logger.debug("Existing databases:")
    for db in mycursor:
        dbs.append(db[0])
        function_logger.debug(db[0])
    if db_name in dbs:
        function_logger.info("Database " + db_name + " already exists!")
    else:
        mycursor.execute("CREATE DATABASE " + db_name)
        function_logger.info(db_name + " created!")
def get_plan_info_ustc():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the category page.
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div", class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        # Skip the header row and the aggregate rows ("合计"/"小计").
        table_content = []
        for line in all_lines[1:]:
            if line[0] != "合计" and line[0] != "小计":
                if district == "浙江" or district == "上海":
                    table_content.append([line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info("Admission plan of " + year + district + " saved to file")
def build_mysql_string_by_template_and_keymap(template_question: str,
                                              template_question_type: str,
                                              keyword_dict: dict) -> str:
    """
    Build a MySQL statement from a template type and a keyword map.
    :param template_question: template question
    :param template_question_type: template question type
    :param keyword_dict: keyword map
    :return: SQL statement
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the MySQL statement...")
    search_table = template_question_type
    # Extract the slots from the template sentence.
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # Build the SQL statement; empty keywords are skipped.
    mysql_string = ""
    for i_slot in range(len(slots)):
        key = keyword_dict["search_" + slots[i_slot][1:-1]]
        if key == "":
            continue
        mysql_string += slots[i_slot][1:-1] + "='" + key + "' and "
    if mysql_string != "":
        mysql_string = "select * from " + search_table + " where " + mysql_string[:-5] + ";"
    function_logger.info("MySQL statement built!")
    return mysql_string
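# A small walk-through of the slot mechanics above with hypothetical values;
# the table name comes from template_question_type:
#
#   template_question = "(year)年(school)的(major)在(district)招多少人"
#   re.findall(r"[(].*?[)]", template_question)
#       -> ['(year)', '(school)', '(major)', '(district)']
#
#   keyword_dict = {"search_year": "2015", "search_school": "哈尔滨工业大学",
#                   "search_major": "软件工程", "search_district": "河南"}
#   build_mysql_string_by_template_and_keymap(template_question,
#                                             "admission_plan", keyword_dict)
#       -> "select * from admission_plan where year='2015' and
#           school='哈尔滨工业大学' and major='软件工程' and district='河南';"
#
# Note the values are spliced into the SQL string directly; parameterized
# queries would be safer if the keywords ever came from raw user input.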
def build_mysql_string_by_template(template_question: str, template_question_type: str) -> str:
    """
    Build a MySQL statement (with slot placeholders) from a template type.
    :param template_question: template question
    :param template_question_type: template question type
    :return: the corresponding MySQL statement
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the MySQL statement...")
    search_table = template_question_type
    # Extract the slots from the template sentence.
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # Build the SQL statement; each bracketed clause marks an optional slot.
    mysql_string = "select * from " + search_table + " where "
    for i_slot in range(len(slots)):
        if i_slot == len(slots) - 1:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[i_slot] + "'" + "]"
        else:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[i_slot] + "' and " + "]"
    mysql_string += ";"
    function_logger.info("MySQL statement built!")
    return mysql_string
def read_pdf_to_text(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Reading pdf file!")
    # Prints the text boxes of each page.
    source_pdf = open(path, 'rb')
    # Create a PDF resource manager to share resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Iterate over the page list, processing one page at a time.
    for page in PDFPage.get_pages(source_pdf):
        interpreter.process_page(page)
        # Fetch the LTPage object for this page; its children are the parsed
        # objects (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...).
        layout = device.get_result()
        # The text lives in the text attribute of the horizontal text boxes.
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text()
                print(results)
                print("-----------")
    source_pdf.close()
    function_logger.info("pdf text extraction finished!")
def build_template_by_infos(template_path: str, fields_question_condition: list,
                            fields_question_target: list,
                            template_sentence_questions: list,
                            template_sentence_answers: list):
    """
    Build question templates from the provided information.
    :param template_path: template path
    :param fields_question_condition: question condition words, e.g.
        ["school 学校", "year 年份", "major 专业", "district 省份", "classy 类别"]
    :param fields_question_target: question target words, e.g.
        ["numbers 招生人数 招生计划 招多少人 招生计划是多少 招生人数是多少"]
    :param template_sentence_questions: template question sentences
    :param template_sentence_answers: template answer sentences
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the question templates of %s..." % template_path.split("\\")[-1])
    # Store the templates with pickle.
    template_dict = {}
    template_dict["fq_conditon"] = fields_question_condition
    template_dict["fq_target"] = fields_question_target
    template_dict["ts_answers"] = template_sentence_answers
    build_question_sentences = []
    # Collect the English field names of the condition words.
    template_fields_en = []
    for fq_condition in fields_question_condition:
        template_fields_en.append(fq_condition.split(" ")[0])
    # Enumerate all subsets of the English field names; every subset of slots
    # may be omitted, which yields all question permutations by substitution.
    fields_en_subset = get_subset_binary(template_fields_en)
    print(len(fields_en_subset))
    for i_question in range(len(template_sentence_questions)):
        # Find the target word contained in the current template question.
        match_question_target = []
        for fq_target in fields_question_target:
            if fq_target.split(" ")[0] in template_sentence_questions[i_question]:
                match_question_target = fq_target.split(" ")
                # Stop at the first hit; a question is assumed to hold
                # exactly one target word.
                break
        # Substitute every phrasing of the target word.
        target_word = match_question_target[0]
        for question_mode in match_question_target[1:]:
            # Substitute for every subset of the condition fields.
            for subset in fields_en_subset:
                sentence = template_sentence_questions[i_question].replace(
                    "(" + target_word + ")", question_mode) + "--" + str(i_question)
                if not subset:
                    # Empty subset: keep all slots.
                    build_question_sentences.append(sentence)
                elif len(subset) == len(template_fields_en):
                    # Full set: dropping every slot is not useful.
                    continue
                else:
                    # Drop the slots in the subset, then add the sentence.
                    for field_en in subset:
                        sentence = sentence.replace("(" + field_en + ")", "")
                    build_question_sentences.append(sentence)
    template_dict["ts_questions"] = build_question_sentences
    with open(template_path, "wb") as p_file:
        pickle.dump(template_dict, p_file)
    function_logger.info("Question templates of %s built!" % template_path.split("\\")[-1])
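# get_subset_binary is referenced above but defined elsewhere; a minimal
# bitmask-based sketch that enumerates every subset of the field list,
# which is what the permutation logic above relies on:
def get_subset_binary_sketch(fields: list) -> list:
    subsets = []
    for mask in range(2 ** len(fields)):
        # Bit i of mask decides whether fields[i] joins this subset.
        subsets.append([fields[i] for i in range(len(fields)) if mask >> i & 1])
    return subsets

# get_subset_binary_sketch(["school", "year"])
#   -> [[], ['school'], ['year'], ['school', 'year']]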
def build_university_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the major-name dictionary...")
    source_path = "Information/大学/大学学科(百度百科网页源码).txt"
    with open(source_path, "r", encoding="utf-8") as source_file:
        main_page_source = source_file.read()
    # Extract the university major catalogue with a regex: a 4-6 digit code,
    # an optional K/T flag, then the Chinese major name.
    result = re.findall(r'\d{4,6}[KT]*\s*[\u4e00-\u9fa5]+', main_page_source)
    # Trim the entries at both ends that do not belong to the catalogue.
    result = result[4:-4]
    # Build the dictionary from the extracted entries.
    with open(dictionary_path + "/major.txt", "w", encoding="utf-8") as major_dict:
        major_dict.truncate()
        for item in result:
            major_dict.write(re.findall(r'[\u4e00-\u9fa5]+', item)[0] + "\n")
    function_logger.info("Major-name dictionary built")
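# A quick check of the catalogue regex above on made-up sample text:
#
#   re.findall(r'\d{4,6}[KT]*\s*[\u4e00-\u9fa5]+',
#              "080901K 计算机科学与技术 080902 软件工程")
#       -> ['080901K 计算机科学与技术', '080902 软件工程']
#
# re.findall(r'[\u4e00-\u9fa5]+', item)[0] then keeps only the major name.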
def label_data():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    label_dir = "Information/大学/常问问题集/label"
    file_list = os.listdir(pickle_dir)
    function_logger.debug("Number of universities: %d" % len(file_list))
    line_1, line_2, line_3, line_4, line_5, line_6, line_7 = [], [], [], [], [], [], []
    all_count = 0
    for file in file_list:
        print(file)
        university_name = file
        with open(pickle_dir + "/" + university_name, "rb") as p_file:
            lines = pickle.load(p_file)
        lines_count = len(lines)
        all_count += lines_count
        print(all_count)
def read_pdf_to_words(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Reading pdf file!")
    with pdfplumber.open(path) as pdf:
        all_words = []
        for page in pdf.pages:
            words = page.extract_words()
            all_words.append(words)
    function_logger.info("pdf word extraction finished!")
    return all_words
def build_classy_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the category-name dictionary...")
    classy = ["文科", "理科", "文史", "理工"]
    with open(dictionary_path + "/classy.txt", "w", encoding="utf-8") as classy_dict:
        classy_dict.truncate()
        for item in classy:
            classy_dict.write(item + "\n")
    function_logger.info("Category-name dictionary built!")
def create_admission_plan_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_plan" in tables:
        function_logger.info("Table admission_plan already exists!")
        function_logger.info("Dropping table admission_plan...")
        mycursor.execute("DROP TABLE admission_plan;")
    # Admission plans (school, district, year, major, category, numbers).
    mycursor.execute("CREATE TABLE admission_plan("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy VARCHAR(10),"
                     "numbers VARCHAR(10))")
    function_logger.info("Table admission_plan recreated!")
def insert_all_school_table_admission_score():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    already_get = ["复旦大学", "复旦大学上海医学部"]
    for school in already_get:
        dir_path = "Information/九校联盟/" + school + "/录取分数"
        file_list = read_all_file_list(dir_path)
        for file in file_list:
            table_format = file.split("-")[-1]
            if table_format == "major":
                mysql_content = score_major_doc_to_mysql_table_tuple(file, school)
                insert_table_admission_score_major(mysql_content)
            elif table_format == "pro":
                mysql_content = score_pro_doc_to_mysql_table_tuple(file, school)
                insert_table_admission_score_pro(mysql_content)
def create_admission_score_pro_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_pro" in tables:
        function_logger.info("Table admission_score_pro already exists!")
        function_logger.info("Dropping table admission_score_pro...")
        mycursor.execute("DROP TABLE admission_score_pro;")
    # Provincial score lines (school, year, district, batch, category, line).
    mycursor.execute("CREATE TABLE admission_score_pro("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "year INT,"
                     "district VARCHAR(10),"
                     "batch VARCHAR(30),"
                     "classy VARCHAR(10),"
                     "line VARCHAR(30))")
    function_logger.info("Table admission_score_pro created!")
def read_pdf_to_tables(path):
    """
    Read a pdf and return a table list.
    :param path: the pdf path
    :return tables: table list
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Reading pdf file!")
    with pdfplumber.open(path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("pdf table extraction finished!")
    return all_tables
def search_table_in_db(db_name: str) -> list:
    """
    List the table names in a database.
    :param db_name: database name
    :return: list of the table names in the database
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    mycursor.execute("SHOW TABLES")
    tables = []
    function_logger.debug("Database " + db_name + " holds the following tables:")
    for table in mycursor:
        tables.append(table[0])
        function_logger.debug(table[0])
    return tables
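# Example wiring of the database helpers above (assumes a reachable MySQL
# server and the connect_* helpers from this module):
#
#   create_database("university_admission")
#   search_table_in_db("university_admission")
#       -> e.g. ['admission_plan', 'admission_score_major', 'admission_score_pro']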
def read_pdf_to_tables(file_path):
    """
    Parse the tables in a pdf file.
    :param file_path: pdf file path
    :return: list of table data
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Reading pdf file!")
    with pdfplumber.open(file_path) as pdf:
        all_tables = []
        # Extract the tables of every page and collect them in a list.
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("pdf table extraction finished!")
    return all_tables
def create_admission_score_major_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_major" in tables:
        function_logger.info("Table admission_score_major already exists!")
        function_logger.info("Dropping table admission_score_major...")
        mycursor.execute("DROP TABLE admission_score_major;")
    # Per-major score lines (school, district, year, major, category,
    # highest, average, lowest, admitted count).
    mycursor.execute("CREATE TABLE admission_score_major("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy VARCHAR(30),"
                     "highest VARCHAR(10) NULL,"
                     "average VARCHAR(10) NULL,"
                     "lowest VARCHAR(10),"
                     "amount VARCHAR(10) NULL)")
    function_logger.info("Table admission_score_major created!")
def score_major_doc_to_mysql_table_tuple(file_path, school):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_content = read_file_content(file_path)
    file_name = file_path.split("\\")[-1]
    year = file_name.split("-")[0]
    district = file_name.split("-")[1]
    table_format = file_name.split("-")[-1]
    table_content = []
    for i in range(len(file_content)):
        file_content[i] = file_content[i].strip().replace("-", "NULL")
        table_content.append(file_content[i].split("\t"))
    table_head = table_content[0]
    table_content = table_content[1:]
    mysql_content = []
    for item in table_content:
        major, classy = item[0], item[1]
        highest, average, lowest, amount = item[2], item[3], item[4], item[5]
        mysql_content.append((school, district, year, major, classy,
                              highest, average, lowest, amount))
    return mysql_content
def build_school_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Building the school-name dictionary...")
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    # Common abbreviations of the schools above.
    c9_j = ["北大", "清华", "复旦", "上交", "浙大", "南大", "中科大", "哈工大", "西交大",
            "北大医学部", "上交医学部", "复旦医学部"]
    with open(dictionary_path + "/school.txt", "w", encoding="utf-8") as school_dict:
        school_dict.truncate()
        for item in c9:
            school_dict.write(item + "\n")
        for item in c9_j:
            school_dict.write(item + "\n")
    function_logger.info("School-name dictionary built!")
def get_undergraduate_university_info():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Main page of the university database (undergraduate level).
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(main_page_soup.find("li", class_="lip dot").next_sibling.text)
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("Page progress (%d, %d)" % (i_page + 1, page_count))
        function_logger.info("Page url: %s" % page_url)
        browser = selenium_chrome(page_url)
        page_source = browser.find_element_by_class_name("ch-table").get_attribute("innerHTML")
        browser.quit()
        page_soup = BeautifulSoup(page_source, "lxml")
        page_soup.prettify()
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        print(head)
        for tr in page_soup.find_all("tr")[1:]:
            info = {}
            td_list = tr.find_all("td")
            info["url"] = "https://gaokao.chsi.com.cn" + td_list[0].find("a")["href"]
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(
                " ", "").replace("\u2002", " ")
            info[head[6]] = td_list[6].text.strip().replace(
                "\ue664", "有") if td_list[6].text.strip() != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        print(info)
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
def create_plan_score_folder_c9():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # The C9 league and their medical schools.
    c9 = ["北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学",
          "中国科学技术大学", "哈尔滨工业大学", "西安交通大学",
          "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"]
    catalog = ["招生计划", "录取分数"]
    root_path = "Information/九校联盟"
    for university in c9:
        function_logger.info("Creating the folders of %s" % university)
        if not os.path.exists(root_path + "/" + university):
            os.makedirs(root_path + "/" + university)
        for cat in catalog:
            if not os.path.exists(root_path + "/" + university + "/" + cat):
                os.makedirs(root_path + "/" + university + "/" + cat)
            # Create a source folder for the raw crawled data.
            if not os.path.exists(root_path + "/" + university + "/" + cat + "/source"):
                os.makedirs(root_path + "/" + university + "/" + cat + "/source")
        function_logger.info("Folders of %s created!" % university)
# -*- coding: utf-8 -*-
"""
@File  : HanLPAPI.py
@Author: SangYu
@Date  : 2018/12/27 14:56
@Desc  : API wrapper for the HanLP platform
"""
from pyhanlp import *
from Log.Logger import MyLog


# Word segmentation (with part-of-speech tags)
def hanlp_nlp_segmentor(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return str(nlp_tokenizer.analyze(sentence)).split(" ")


# Word segmentation (without part-of-speech tags)
def hanlp_nlp_segmentor_without_nature(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    word_list = str(nlp_tokenizer.analyze(sentence)).split(" ")
    return [word.split("/")[0] for word in word_list]


if __name__ == "__main__":
    mylogger = MyLog(logger=__name__).getlog()
    mylogger.info("start...")
    print(type(hanlp_nlp_segmentor("2015年哈工大软件工程在河南招多少人?")))
    print(hanlp_nlp_segmentor("一五年哈工大软件工程在河南招多少人?"))
    mylogger.info("end...")
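# Expected output shapes, hedged since the exact tags depend on the HanLP
# model in use: NLPTokenizer.analyze() stringifies to "word/nature" pairs
# separated by spaces, which is what the two wrappers above rely on.
#
#   hanlp_nlp_segmentor("2015年哈工大软件工程在河南招多少人?")
#       -> ['2015年/t', '哈工大/nt', '软件工程/nz', ...]
#   hanlp_nlp_segmentor_without_nature(...) strips the "/nature" suffix:
#       -> ['2015年', '哈工大', '软件工程', ...]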
if __name__ == '__main__':
    main_logger = MyLog(__name__).getlog()
    main_logger.debug("start...")
    get_question_yggk()
    main_logger.debug("end...")
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # University Q&A page url.
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学", str(53593)]]
    for school in school_urls:
        function_logger.info("Collecting the admission questions of " + school[0] + "...")
        # Create the school's question csv and write the header.
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv", "w", encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of pages.
        page_count = main_page_soup.find("li", class_="lip dot").next_sibling.a.string
        # Number of pinned questions.
        top_question_count = len(main_page_soup.find(
            "table", class_="ch-table zx-table").find_all("span", class_="question_top_txt"))
        # Questions per page.
        page_question_count = 15
        # Visit every page through its constructed url.
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = (main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-"
                        + school[1] + ",start-" + str(i_page * page_question_count) + ".dhtml")
            function_logger.info("Page progress (%d, %d)" % (i_page + 1, int(page_count)))
            function_logger.info("Page url: %s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table", class_="ch-table zx-table").contents
            # Drop the whitespace-only nodes between rows.
            tr_list = [item for item in tr_list if item != "\n"]
            records = []
            # Record the pinned QA pairs only once (on the first page).
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = str(tr_list[i_qa_pair].find(
                    "a", class_="question_t_txt").string).strip()
                function_logger.debug("Title: %s" % question_title)
                question_from = str(tr_list[i_qa_pair].find(
                    "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("From: %s" % question_from)
                question_time = str(tr_list[i_qa_pair].find(
                    "td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("Time: %s" % question_time)
                # Long questions and answers are truncated on the list page;
                # follow the "[详细]" link to fetch the full text.
                question_text_class = tr_list[i_qa_pair + 1].find("div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    question_text = str(turn_page_soup.find("div", class_="question").text).strip()
                function_logger.debug("Question: %s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find("div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace("[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "", str(
                        turn_page_soup.find("div", class_="question_a").text)).replace("[回复]", "")
                function_logger.debug("Answer: %s" % answer_text)
                records.append([question_title, question_from, question_time,
                                question_text, answer_text])
            with open(file_path + "/" + school[0] + "常用问题集.csv", "a", encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("Question collection of %s finished!" % school[0])
def frequent_question_normalize(dir_path: str):
    """
    Normalize the frequent-question csv files (question and answer parts).
    :param dir_path: folder path
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("Starting data normalization...")
    file_list = read_all_file_list(dir_path + "/source")
    for file in file_list:
        function_logger.debug(file)
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("Reading the frequent questions of %s..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0].replace(" ", "")
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3].replace("\u3000", "").replace(
                        "\n", ",").replace(" ", "")
                    line["answer"] = (row[4].replace("\ue63c", "").replace("\u3000", "")
                                      .replace("\n", ",").replace(" ", "").lstrip(","))
                    fqa_lines.append(line)
        # Drop the header row.
        fqa_lines.pop(0)
        function_logger.info("Finished reading the frequent questions of %s!" % school_name)
        function_logger.info("Writing the frequent questions of %s..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("Finished writing the frequent questions of %s!" % school_name)
    function_logger.info("Data normalization finished!")
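# Reading one school's normalized QA records back, a minimal sketch with a
# hypothetical dir_path matching the layout used above:
import pickle

with open("Information/大学/常问问题集/预处理/pickle/北京大学", "rb") as p_file:
    fqa_lines = pickle.load(p_file)
# Each record is a dict with "title", "from", "time", "question", "answer".
print(fqa_lines[0]["title"], fqa_lines[0]["answer"])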