def pretreat_crawl_questions():
    """
    Read the crawled FQA csv files and pickle each university's records as a list of dicts.
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    file_list = os.listdir(data_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    for file in file_list:
        university_name = file[:-9]
        function_logger.debug(university_name)
        function_logger.info("开始读取%s的常问问题集..." % university_name)
        with open(data_dir + "/" + file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {"title": row[0], "from": row[1], "time": row[2],
                            "question": row[3], "answer": row[4]}
                    fqa_lines.append(line)
            # drop the csv header row
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % university_name)
        function_logger.info("开始写入%s的常用问题集..." % university_name)
        with open(pickle_dir + "/" + university_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % university_name)
    function_logger.info("数据处理完成!")
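# Round-trip sketch for the pickle produced above: label_data() below reloads
# these files the same way, so a quick sanity check looks like
#     with open(pickle_dir + "/" + university_name, "rb") as p_file:
#         fqa_lines = pickle.load(p_file)
#     assert all(len(line) == 5 for line in fqa_lines)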
def frequent_question_normalize(dir_path: str):
    """
    Normalize the frequent-question csv files: clean the question and answer fields.
    :param dir_path: directory holding the csv files
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始进行数据处理...")
    file_list = read_all_file_list(dir_path + "/source")
    for file in file_list:
        function_logger.debug(file)
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("开始读取%s的常问问题集..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {"title": row[0].replace(" ", ""),
                            "from": row[1],
                            "time": row[2],
                            "question": row[3].replace("\u3000", "").replace("\n", ",").replace(" ", ""),
                            "answer": row[4].replace("\ue63c", "").replace("\u3000", "")
                                            .replace("\n", ",").replace(" ", "").lstrip(",")}
                    fqa_lines.append(line)
            # drop the csv header row
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % school_name)
        function_logger.info("开始写入%s的常用问题集..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % school_name)
    function_logger.info("数据处理完成!")
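# A minimal, isolated sketch of the cleaning chain used above, so the intent of
# the chained replace() calls is easy to see (the sample string is made up):
def _normalize_answer_demo(raw: str) -> str:
    return (raw.replace("\ue63c", "")    # drop a private-use glyph left by the site
               .replace("\u3000", "")    # drop full-width ideographic spaces
               .replace("\n", ",")       # fold line breaks into commas
               .replace(" ", "")         # drop ASCII spaces
               .lstrip(","))             # a leading line break would otherwise become ","

# _normalize_answer_demo("\n\u3000你好 考生\n欢迎报考") == "你好考生,欢迎报考"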
def load_table_content(file_path: str):
    """
    Load table content from an excel workbook and log the distinct values per column.
    :param file_path: path to the xlsx file
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # load the workbook
    function_logger.info("加载表格:%s" % file_path.split("\\")[-1])
    wb = load_workbook(file_path)
    sheet_names = wb.sheetnames
    sheet_first = wb[sheet_names[0]]
    # header row
    table_head = []
    for item in range(1, sheet_first.max_column + 1):
        table_head.append(sheet_first.cell(row=1, column=item).value)
    function_logger.debug("表头:%s" % str(table_head))
    # collect the distinct values of every column
    table_attr = {}
    for i_column in range(1, sheet_first.max_column + 1):
        column_name = sheet_first.cell(row=1, column=i_column).value
        column_value = set()
        for i_row in range(2, sheet_first.max_row + 1):
            column_value.add(sheet_first.cell(row=i_row, column=i_column).value)
        table_attr[column_name] = str(list(column_value))
    for key in table_attr:
        function_logger.debug(key)
        value_list = [value.replace("'", "").strip()
                      for value in table_attr[key][1:-1].split(",")]
        value_list.sort()
        function_logger.debug("列表长度:%d" % len(value_list))
        function_logger.debug(str(value_list))
    function_logger.info("加载表格:%s完成!" % file_path.split("\\")[-1])
def search_table_in_db(db_name: str) -> list:
    """
    List the table names in a database.
    :param db_name: database name
    :return: list of table names in the database
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    mycursor.execute("SHOW TABLES")
    tables = []
    function_logger.debug(db_name + "数据库中有以下表:")
    for table in mycursor:
        tables.append(table[0])
        function_logger.debug(table[0])
    return tables
def create_database(db_name: str):
    """
    Create the given database (e.g. university_admission) if it does not exist yet.
    :param db_name: database name
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_without_db()
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    dbs = []
    function_logger.debug("数据库如下:")
    for db in mycursor:
        dbs.append(db[0])
        function_logger.debug(db[0])
    if db_name in dbs:
        function_logger.info("数据库" + db_name + "已存在!")
    else:
        mycursor.execute("CREATE DATABASE " + db_name)
        function_logger.info(db_name + "已创建!")
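# Usage sketch for the two MySQL helpers above (assumes the connect_mysql_*
# helpers point at a reachable server; the table name is only an example):
#     create_database("university_admission")
#     tables = search_table_in_db("university_admission")
#     if "admission_plan" not in tables:
#         ...  # create or import the missing table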
def label_data():
    """
    Count the pickled FQA records of every university (the labelling buckets are placeholders for now).
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    label_dir = "Information/大学/常问问题集/label"
    file_list = os.listdir(pickle_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    # placeholders for per-category labelling (not filled in yet)
    line_1 = []
    line_2 = []
    line_3 = []
    line_4 = []
    line_5 = []
    line_6 = []
    line_7 = []
    all_count = 0
    for file in file_list:
        print(file)
        university_name = file
        with open(pickle_dir + "/" + university_name, "rb") as p_file:
            lines = pickle.load(p_file)
        lines_count = len(lines)
        all_count += lines_count
    print(all_count)
def get_plan_info_ustc():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # fetch the index page that links every year/district
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div", class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        table_content = []
        for line in all_lines[1:]:
            if line[0] != "合计" and line[0] != "小计":
                # the 浙江/上海 tables carry an extra category column
                if district == "浙江" or district == "上海":
                    table_content.append([line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + "的招生计划已存入文件")
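# request_url is a project helper whose implementation is not shown in this
# section. A minimal sketch of what the callers above assume it does (a plain
# GET with a few retries, returning a requests.Response) -- an assumption, not
# the project's actual code:
def _request_url_sketch(url: str, retries: int = 3):
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(1)  # brief pause before retrying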
def get_plan_info_xjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # A first attempt crawled the paginated list pages one by one; the results
    # needed a lot of post-processing, so the code is kept only for reference:
    # mylogger.info("开始获取网页源码...共五个网页")
    # with open(file_path + "/source/page_url_list", "w", encoding="utf-8") as url_file:
    #     for i in range(1, 6):
    #         main_url = "http://zs.xjtu.edu.cn/lmy.jsp?a43639t=5&a43639p=" + str(i) \
    #                    + "&a43639c=10&urltype=tree.TreeTempUrl&wbtreeid=1005"
    #         main_page_source = requests.get(main_url).text
    #         main_page_soup = BeautifulSoup(main_page_source, "lxml")
    #         main_page_soup.prettify()
    #         for item in main_page_soup.find("div", id="fybt").find("ul").find_all("a"):
    #             url_file.write(str(item) + "\n")
    #     mylogger.info("招生计划页面url获取完成")
    # mylogger.info("开始获取具体页面信息")
    # with open(file_path + "/source/page_url_list", "r", encoding="utf-8") as url_file:
    #     url_source = url_file.read()
    #     url_soup = BeautifulSoup(url_source, "lxml")
    #     url_soup.prettify()
    #     for page_url in url_soup.find_all("a"):
    #         print(page_url)
    # Instead, query the official site directly by posting the search form.
    # First collect the selectable years and districts:
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="sf").find_all("option")[1:]:
        districts.append(district.string)
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the search button (54x22 px)
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for tr in return_soup.find("div", id="fybt").find_all("tr"):
                line = []
                for td in tr:
                    if td.string != "\n":
                        line.append(str(td.string).strip())
                all_lines.append(line)
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            table_content = []
            for line in all_lines[1:-1]:
                classy = line[2]
                if classy == "理":
                    classy = "理工"
                if classy == "文":
                    classy = "文史"
                table_content.append([line[0], classy, line[4]])
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + district + "的招生计划已存入文件")
def get_plan_info_nju():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://bkzs.nju.edu.cn"
    file_path = "Information/九校联盟/南京大学/招生计划"
    # Stage 1 (kept for reference): use selenium to dump the hidden map markup.
    # browser = selenium_chrome(main_url + "/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path + "/source/" + "index", "w", encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))
    # Stage 2 (kept for reference): parse the dump with bs4 and save each page's html.
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url + url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}", BeautifulSoup(page_source, "lxml").find("p").text)[0]
    #     with open(file_path + "/source/" + year + "-" + pro + ".html", "w", encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)
    # Stage 3 (kept for reference): download the pdf linked from every page.
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source, "lxml")
    #         for item in page_soup.find_all("div", class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url + pdf_url)
    #             with open(file_path + "/source/" + pdf_name + ".pdf", "wb") as pdf_file:
    #                 pdf_file.write(pdf_source.content)
    # Stage 4: parse the downloaded pdf files.
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # split the flat row list into tables, one per "科类" header row
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                        table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # push each table's batch label ("国家专项计划"/"提前批") into its rows
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([line[0], str(line[1]) + "(" + sign + ")", line[2]])
                else:
                    for line in table:
                        all_lines.append(line)
            table_content = []
            for line in all_lines:
                if line[0] == "科类" or line[0] == "总计" or line[1].find("小计") != -1 \
                        or line[1].find("None") != -1 or line[2] == "" or line[2] == "0" \
                        or line[2] is None:
                    continue
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append([line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "招生计划已存入文件")
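# read_pdf_to_tables is likewise a project helper not shown here. The nesting
# consumed above (pages -> tables -> rows) matches pdfplumber's extract_tables
# output, so a plausible sketch is (an assumption, not the actual helper):
def _read_pdf_to_tables_sketch(pdf_path: str) -> list:
    import pdfplumber  # third-party PDF table extractor
    with pdfplumber.open(pdf_path) as pdf:
        # one entry per page; each entry is a list of tables, each table a list of rows
        return [page.extract_tables() for page in pdf.pages]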
def get_plan_info_hit():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://zsb.hit.edu.cn/information/plan"
    # fetch the page that lists the selectable provinces and years
    main_page_source = requests.get(main_url).text
    main_page_soup = BeautifulSoup(main_page_source, "lxml")
    main_page_soup.prettify()
    # provinces with admission plans
    mylogger.info("解析招生地区...")
    province = []
    for item in main_page_soup.find(class_="province").find_all(name='a'):
        province.append(item.string.strip())
    mylogger.debug("哈工大招生地区:" + str(province))
    # years with admission plans
    mylogger.info("解析招生年份...")
    years = []
    for item in main_page_soup.find_all(class_="year-select"):
        years.append(item.string.strip())
    mylogger.debug("哈工大招生年份:" + str(years))
    # extract the data of every year and province
    mylogger.info("开始获取各年各地区数据...")
    for pro in province:
        for year in years:
            mylogger.info("开始获取" + year + pro + "的招生计划")
            # build the query url
            specific_url = main_url + "?" + "year=" + year + "&" + "province=" + pro
            page_source = requests.get(specific_url).text
            page_soup = BeautifulSoup(page_source, "lxml")
            page_soup.prettify()
            # table name
            table_name = year + "-" + pro
            mylogger.debug("表名:" + table_name)
            # table head
            table_head = []
            for item in page_soup.find(class_="info_table").thead.find_all(name="td"):
                table_head.append(item.string.strip())
            mylogger.debug("表头:" + str(table_head))
            # table body
            table_content = []
            for item in page_soup.find(class_="info_table").tbody.find_all(name="tr"):
                temp = []
                for sub_item in item.find_all(name="td"):
                    temp.append(sub_item.string.strip())
                table_content.append(temp)
            # drop the "无数据" placeholder rows; filter into a new list rather
            # than calling remove() while iterating over the same list
            table_content = [item for item in table_content if item[0] != "无数据"]
            # (rows with item[1] == "统计" were once dropped here as well)
            mylogger.debug("表内容如下:")
            for item in table_content:
                mylogger.debug(item)
            # write the table to a text file
            file_path = "Information/九校联盟/哈尔滨工业大学/招生计划"
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "的招生计划已存入文件")
def get_plan_info_fudan():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path_benbu = "Information/九校联盟/复旦大学/招生计划"
    file_path_yixue = "Information/九校联盟/复旦大学上海医学部/招生计划"
    # query the official site directly by posting the search form;
    # first collect the selectable years and districts
    main_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.html"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="ss").find_all("option"):
        districts.append(district.string)
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.action"
    # data is available for 2006-2015
    for year in years:
        for district in districts:
            params = {"lb": "plan", "nf": year, "ss": district}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for div in return_soup.find_all("div", class_="inquirytable_result"):
                for tr in div.find_all("tr"):
                    line = []
                    for td in tr:
                        if td.string != "\n":
                            line.append(str(td.string).strip())
                    all_lines.append(line)
            table_name = year + "-" + district
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # the query returned no data
            if len(all_lines) < 3:
                continue
            # start extracting
            table_content_benbu = []
            table_content_yixue = []
            # from 2013 on, 复旦大学 and 复旦大学上海医学部 recruit separately
            if int(year) < 2013:
                for line in all_lines[1:-1]:
                    # skip the 文史/理工 summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # the 上海 tables carry a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
            else:
                # split the rows of the main campus from those of the medical school;
                # a second "专业名称" header marks where the medical school starts
                index = 0
                for i_line in range(1, len(all_lines)):
                    if all_lines[i_line][0] == "专业名称":
                        index = i_line
                        break
                if index == 0:
                    all_lines_benbu = all_lines
                    all_lines_yixue = []
                else:
                    all_lines_benbu = all_lines[:index]
                    all_lines_yixue = all_lines[index:]
                for line in all_lines_benbu[1:-1]:
                    # skip the 文史/理工 summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # the 上海 tables carry a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
                if len(all_lines_yixue) != 0:
                    for line in all_lines_yixue[1:-1]:
                        if line[0] == "文史汇总" or line[0] == "理工汇总":
                            continue
                        if district == "上海":
                            table_content_yixue.append([line[0], line[1], line[5]])
                        else:
                            table_content_yixue.append([line[0], line[1], line[3]])
            mylogger.debug("本部招生计划:")
            for line in table_content_benbu:
                mylogger.debug(str(line))
            mylogger.debug("医学院招生计划:")
            for line in table_content_yixue:
                mylogger.debug(str(line))
            write_table(file_path_benbu, table_name, table_head, table_content_benbu)
            mylogger.info("本部" + year + district + "的招生计划已存入文件")
            if len(table_content_yixue) != 0:
                write_table(file_path_yixue, table_name, table_head, table_content_yixue)
                mylogger.info("医学院" + year + district + "的招生计划已存入文件")
def build_mysql_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("获取招生计划表中的专业字段...")
    # distinct major names in the admission plan table
    plan_sql_string = "SELECT major FROM admission_plan GROUP BY major;"
    myresult = mysql_query_sentence(plan_sql_string)
    function_logger.debug("招生计划表中专业数%d:" % len(myresult))
    # strip a trailing bracketed qualifier and everything after it
    pattern = re.compile(r"[(([].*?[))\]].*")
    plan_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        plan_major_set.add(temp)
    function_logger.debug("招生计划表中专业数(统计合并后):%d" % len(plan_major_set))
    function_logger.debug(str(sorted(list(plan_major_set),
                                     key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # distinct major names in the admission score table
    score_sql_string = "SELECT major FROM admission_score_major GROUP BY major;"
    myresult = mysql_query_sentence(score_sql_string)
    function_logger.debug("录取分数表中专业数%d:" % len(myresult))
    score_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        score_major_set.add(temp)
    function_logger.debug("录取分数表中专业数(统计合并后):%d" % len(score_major_set))
    function_logger.debug(str(sorted(list(score_major_set),
                                     key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # intersection of the two sets
    function_logger.debug("以上两者的交集为:")
    major_and_set = plan_major_set.intersection(score_major_set)
    function_logger.debug("交集长度为:%d", len(major_and_set))
    function_logger.debug(str(sorted(list(major_and_set),
                                     key=lambda x: lazy_pinyin(x.lower())[0][0])))
    # union of the two sets
    function_logger.debug("以上两者的并集为:")
    major_or_set = plan_major_set.union(score_major_set)
    function_logger.debug("并集长度为:%d", len(major_or_set))
    function_logger.debug(str(sorted(list(major_or_set),
                                     key=lambda x: lazy_pinyin(x.lower())[0][0])))
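# The suffix-stripping pattern above cuts a bracketed qualifier (full-width or
# ASCII) and anything after it, so spelling variants of one major collapse to a
# single key. Illustration (the major names here are made-up samples):
#     pattern = re.compile(r"[(([].*?[))\]].*")
#     re.sub(pattern, "", "计算机科学与技术(中外合作办学)")  # -> "计算机科学与技术"
#     re.sub(pattern, "", "软件工程[卓越班]")               # -> "软件工程"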
def test_frequent_question(file_path: str):
    """
    Read a frequent-question csv file and run every question through the QA pipeline.
    :param file_path: path to the csv file
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    school_name = file_path.split("\\")[-1][:-14]
    print(school_name)
    function_logger.info("开始读取%s的常问问题集" % school_name)
    with open(file_path, "r", encoding="utf-8") as csvfile:
        csv_reader = csv.reader(csvfile)
        table_lines = []
        for row in csv_reader:
            table_lines.append(row)
    table_head = table_lines[0]
    table_content = table_lines[1:]
    # print(table_head)
    # for line in table_content:
    #     print(line)
    # run the test
    function_logger.info("读取完成,开始进行测试!")
    question_count = len(table_content)
    print(question_count)
    answer_count = 0
    answer_null_count = 0
    mysql_string_null_count = 0
    mysql_string_only_school_count = 0
    current_index = 1
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    if not os.path.exists("record/" + school_name):
        os.makedirs("record/" + school_name)
    with open("record/" + school_name + "/mysql_string_null", "w", encoding="utf-8") as msn_file, \
            open("record/" + school_name + "/mysql_string_only_school", "w", encoding="utf-8") as msos_file, \
            open("record/" + school_name + "/answer_null", "w", encoding="utf-8") as an_file, \
            open("record/" + school_name + "/answer_not_null", "w", encoding="utf-8") as ann_file:
        for record in table_content:
            function_logger.info("%s测试进度%d/%d" % (school_name, current_index, question_count))
            if len(record) == 5:
                question = record[3]
                function_logger.debug(question)
                start_time = time.time()
                mid_result, answer = answer_question_by_template(question, 1, school_name)
                end_time = time.time()
                function_logger.debug("查询时间:%s" % (end_time - start_time))
                function_logger.debug(mid_result["mysql_string"])
                function_logger.debug(answer[0])
                if answer == ["问句条件词为空,无法构建查询语句!"]:
                    mysql_string_null_count += 1
                    msn_file.write(question + "\n")
                    msn_file.write(mid_result["mysql_string"] + "\n")
                    msn_file.write(str(answer) + "\n")
                elif answer == ["问句条件词只有学校,查询过宽!"]:
                    mysql_string_only_school_count += 1
                    msos_file.write(question + "\n")
                    msos_file.write(mid_result["mysql_string"] + "\n")
                    msos_file.write(str(answer) + "\n")
                elif answer == ["查询结果为空!"]:
                    answer_null_count += 1
                    an_file.write(question + "\n")
                    an_file.write(mid_result["mysql_string"] + "\n")
                    an_file.write(str(answer) + "\n")
                else:
                    answer_count += 1
                    # for more than three answers, keep the first three plus the total count
                    ann_file.write(question + "\n")
                    ann_file.write(mid_result["mysql_string"] + "\n")
                    ann_file.write("答案条数:" + str(len(answer)) + "\t")
                    if len(answer) > 3:
                        ann_file.write(str(answer[:3]) + "\n")
                    else:
                        ann_file.write(str(answer) + "\n")
            current_index += 1
        for input_file in [msn_file, msos_file, an_file, ann_file]:
            input_file.write("总问题数%d\t查询语句构造为空数%d\t只有学校关键词数%d\t查询结果为空数%d\t有回答数%d\n"
                             % (question_count, mysql_string_null_count,
                                mysql_string_only_school_count, answer_null_count, answer_count))
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    with open("record/all.txt", "a", encoding="utf-8") as record_file:
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        record_file.write(now_time + "\t" + school_name + "\n")
        record_file.write("总问题数%d\t查询语句构造为空数%d\t只有学校关键词数%d\t查询结果为空数%d\t有回答数%d\n"
                          % (question_count, mysql_string_null_count,
                             mysql_string_only_school_count, answer_null_count, answer_count))
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # consultation page url
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    # forum ids already fetched by an earlier run (kept for reference;
    # the working list below is rebuilt from university_info)
    allready_get = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                    ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                    ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                    ["复旦大学", str(7243)], ["南京大学", str(4453)],
                    ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                    ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学", str(53593)]]
    university_formid = []
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for info in university_infos:
        if "985" in info["院校特性"] or "211" in info["院校特性"]:
            if info["forum_id"] != "":
                university_formid.append([info["院校名称"], info["forum_id"]])
    function_logger.info("共有%d所985、211大学" % len(university_formid))
    for university in university_formid:
        begin = time.time()
        function_logger.info("开始抓取" + university[0] + "的招生问题数据...")
        main_page_url = "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-" \
                        + university[1] + ",start-0.dhtml"
        try:
            main_page_source = request_url(main_page_url)
            main_page_source.encoding = main_page_source.apparent_encoding
            main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
            # total page count; the pager is laid out with or without an ellipsis
            if main_page_soup.find("li", class_="lip dot"):
                page_count = main_page_soup.find("li", class_="lip dot").next_sibling.a.string
            else:
                page_count = main_page_soup.find("ul", class_="ch-page clearfix").find_all("li")[-2].a.string
            # number of pinned questions
            top_question_count = len(main_page_soup.find("table", class_="ch-table zx-table")
                                     .find_all("span", class_="question_top_txt"))
            function_logger.debug("页面总数:%d 置顶问题个数:%d"
                                  % (int(page_count), int(top_question_count)))
        except Exception as e:
            # three universities have no data on their consultation page
            function_logger.error("%s咨询界面没有数据,页面链接为:%s" % (university[0], main_page_url))
            function_logger.error("错误信息:%s" % e)
            continue
        # create the csv for this university and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        csvfile = open(file_path + "/" + university[0] + "常用问题集.csv", "w",
                       newline="", encoding='utf-8')
        csvfile.truncate()
        writer = csv.writer(csvfile)
        writer.writerow(table_head)
        record_queue = Queue()
        # run 10 download threads per batch, plus one writer thread
        start_index = 0
        end_index = 10
        while True:
            if start_index > int(page_count):
                break
            dThread = [DownloadPageInfo(university[1], page_id, int(page_count),
                                        top_question_count, record_queue)
                       for page_id in range(start_index, end_index)]
            sThread = SavePageInfo(record_queue, writer)
            for d in dThread:
                d.start()
            sThread.start()
            for d in dThread:
                d.join()
            record_queue.put(-1)
            sThread.join()
            start_index += 10
            end_index += 10
            if end_index > int(page_count):
                end_index = int(page_count)
        csvfile.close()
        function_logger.info("抓取%s的信息用时:%ds" % (university[0], time.time() - begin))
if __name__ == '__main__':
    main_logger = MyLog(__name__).getlog()
    main_logger.debug("start...")
    get_question_yggk()
    main_logger.debug("end...")
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # consultation page url
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学", str(53593)]]
    for school in school_urls:
        function_logger.info("开始抓取" + school[0] + "的招生问题数据...")
        # create the csv for this school and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv", "w", newline="",
                  encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # total page count
        page_count = main_page_soup.find("li", class_="lip dot").next_sibling.a.string
        # number of pinned questions
        top_question_count = len(main_page_soup.find("table", class_="ch-table zx-table")
                                 .find_all("span", class_="question_top_txt"))
        # questions per page
        page_question_count = 15
        # build the url of every page and walk through it
        # (note: page index 10 is skipped here)
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" \
                       + school[1] + ",start-" + str(i_page * page_question_count) + ".dhtml"
            # xls row base (page questions + pinned questions + header), kept for reference:
            # if i_page == 0:
            #     base_count = 1
            # else:
            #     base_count = i_page * page_question_count + top_question_count + 1
            function_logger.info("页面抓取进度(%d,%d)" % (i_page + 1, int(page_count)))
            function_logger.info("页面url%s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table", class_="ch-table zx-table").contents
            # filter into a new list rather than removing while iterating
            tr_list = [item for item in tr_list if item != "\n"]
            records = []
            # record the pinned QA pairs only once, on the first page
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = str(tr_list[i_qa_pair].find("a", class_="question_t_txt").string).strip()
                function_logger.debug("标题:%s" % question_title)
                question_from = str(tr_list[i_qa_pair].find("i", title="提问人").next_sibling.string).strip()
                function_logger.debug("来源:%s" % question_from)
                question_time = str(tr_list[i_qa_pair].find("td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("时间:%s" % question_time)
                # a long question or answer is cut off on the list page and
                # needs a jump to its detail page
                question_text_class = tr_list[i_qa_pair + 1].find("div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    question_text = str(turn_page_soup.find("div", class_="question").text).strip()
                function_logger.debug("问题:%s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find("div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace("[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find("a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "",
                                         str(turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([question_title, question_from, question_time,
                                question_text, answer_text])
            with open(file_path + "/" + school[0] + "常用问题集.csv", "a", newline="",
                      encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕!" % school[0])
            result_edit.append("问句条件词为空,无法构建查询语句!")
        else:
            # the school is the only condition word; the query would be too broad
            if "and" not in mysql_string:
                result_edit.append("问句条件词只有学校,查询过宽!")
            else:
                result = mysql_query_sentence(mysql_string)
                if len(result) == 0:
                    result_edit.append("查询结果为空!")
                else:
                    mid_result["search_result"] = result
                    for item in result:
                        answer_string = build_mysql_answer_string_by_template(match_template_answer, item)
                        result_edit.append(answer_string)
    return mid_result, result_edit


if __name__ == '__main__':
    main_logger = MyLog(logger=__name__).getlog()
    main_logger.info("start...")
    test_question = "哈工大前年软件工程石家庄招生人数?"
    main_logger.debug(test_question)
    test_mid_result, test_result = answer_question_by_template(test_question)
    for mid in test_mid_result:
        main_logger.debug(str(mid) + ":" + str(test_mid_result[mid]))
    main_logger.debug("查询结果:")
    for result in test_result:
        main_logger.debug(str(result))
    main_logger.info("end...")