def get_answer_text(self, turn_page_url):
    try:
        turn_page_source = request_url(turn_page_url)
        turn_page_source.encoding = turn_page_source.apparent_encoding
        turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
        answer_text = str(
            turn_page_soup.find("div", class_="question_a").text).strip()
        return answer_text
    except Exception as e:
        self.thread_logger.error("Failed to fetch answer page %s, reason: %s" % (turn_page_url, e))
        return ""
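# NOTE: get_page_info() below also calls self.get_question_text() for questions that are
# truncated on the list page. That method is not part of this excerpt; the sketch below is
# only an assumption that mirrors get_answer_text() but targets the "question" div seen on
# the detail pages.
def get_question_text(self, turn_page_url):
    try:
        turn_page_source = request_url(turn_page_url)
        turn_page_source.encoding = turn_page_source.apparent_encoding
        turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
        return str(turn_page_soup.find("div", class_="question").text).strip()
    except Exception as e:
        self.thread_logger.error("Failed to fetch question page %s, reason: %s" % (turn_page_url, e))
        return ""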
def get_plan_info_ustc():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the category (navigation) page
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div", class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        table_content = []
        for line in all_lines[1:]:
            # Skip the "合计" (total) and "小计" (subtotal) rows
            if line[0] != "合计" and line[0] != "小计":
                # The Zhejiang and Shanghai tables have a different column layout
                if district == "浙江" or district == "上海":
                    table_content.append(
                        [line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + " enrollment plan saved to file")
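# request_url() is a helper defined elsewhere in this project. Judging from how it is used
# above, it wraps requests.get() and returns a requests.Response. A minimal sketch under
# that assumption; the header value, timeout, and retry count are illustrative only:
import requests  # presumably already imported at the top of the module

def request_url(url, retries=3):
    headers = {"User-Agent": "Mozilla/5.0"}  # assumed; the real helper may set more headers
    for _ in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            continue
    raise ConnectionError("request_url failed for %s" % url)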
def get_undergraduate_university_info():
    # Main page of the university database (undergraduate level)
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(
        main_page_soup.find("li", class_="lip dot").next_sibling.text)
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("Page progress (%d, %d)" % (i_page + 1, int(page_count)))
        function_logger.info("Page url: %s" % page_url)
        browser = selenium_chrome(page_url)
        page_source = browser.find_element_by_class_name(
            "ch-table").get_attribute("innerHTML")
        browser.quit()
        page_soup = BeautifulSoup(page_source, "lxml")
        page_soup.prettify()
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        print(head)
        for tr in page_soup.find_all("tr")[1:]:
            info = {}
            td_list = tr.find_all("td")
            info["url"] = "https://gaokao.chsi.com.cn" + td_list[0].find(
                "a")["href"]
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(
                " ", "").replace("\u2002", " ")
            info[head[6]] = td_list[6].text.strip().replace(
                "\ue664", "有") if td_list[6].text.strip() != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        print(info)
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
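# selenium_chrome() is another project helper used above. From the call site it appears to
# return a Selenium Chrome driver that has already loaded the given URL. A minimal sketch,
# assuming a headless Chrome setup; the options are illustrative, not the original ones:
from selenium import webdriver  # presumably already imported at the top of the module

def selenium_chrome(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run without a visible browser window
    browser = webdriver.Chrome(options=options)
    browser.get(url)
    return browser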
def get_consultation_forum_id():
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for i_info in range(len(university_infos)):
        info = university_infos[i_info]
        # if "985" not in info["院校特性"] and "211" not in info["院校特性"]:
        #     continue
        print(info)
        try:
            page_source = request_url(info["url"])
            page_source.encoding = page_source.apparent_encoding
            # The forum id is the numeric tail of the consultation button's href
            forum_id = \
                BeautifulSoup(page_source.text, "lxml").find(
                    "a", class_="ch-btn zx-question")["href"].split("-")[-1][:-6]
            print(forum_id)
        except Exception:
            forum_id = ""
        university_infos[i_info]["forum_id"] = forum_id
    # Re-save the infos with the new forum_id field
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
    for info in university_infos:
        if "985" in info["院校特性"] and "211" in info["院校特性"]:
            print(info)
# Earlier, single-threaded version of get_question_yggk(); a threaded rewrite with the
# same name appears further below.
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Base url of the consultation forum
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)],
                   ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)],
                   ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)],
                   ["清华大学", str(36710)],
                   ["复旦大学", str(7243)],
                   ["南京大学", str(4453)],
                   ["浙江大学", str(43617)],
                   ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)],
                   ["西安交通大学", str(53593)]]
    for school in school_urls:
        function_logger.info("Start fetching admission questions for " + school[0] + "...")
        # Create the question CSV for this school and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv", "w", encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of pages
        page_count = main_page_soup.find(
            "li", class_="lip dot").next_sibling.a.string
        # Number of pinned questions
        top_question_count = len(
            main_page_soup.find("table", class_="ch-table zx-table").find_all(
                "span", class_="question_top_txt"))
        # Number of questions per page
        page_question_count = 15
        # Visit each page by constructing its url
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" \
                + school[1] + ",start-" + str(i_page * page_question_count) + ".dhtml"
            # xls row offset (questions per page + pinned questions + header)
            # if i_page == 0:
            #     base_count = 1
            # else:
            #     base_count = i_page * page_question_count + top_question_count + 1
            function_logger.info("Page progress (%d, %d)" % (i_page + 1, int(page_count)))
            function_logger.info("Page url: %s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table", class_="ch-table zx-table").contents
            # Drop the whitespace-only children
            tr_list = [item for item in tr_list if item != "\n"]
            records = []
            # Pinned questions are recorded only once (on the first page)
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = "q_title"
                question_from = ""
                question_time = ""
                question_text = "q_text"
                answer_text = "a_text"
                question_title = str(tr_list[i_qa_pair].find(
                    "a", class_="question_t_txt").string).strip()
                function_logger.debug("Title: %s" % question_title)
                question_from = str(tr_list[i_qa_pair].find(
                    "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("From: %s" % question_from)
                question_time = str(tr_list[i_qa_pair].find(
                    "td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("Time: %s" % question_time)
                # Question and answer may be truncated on this page; follow the
                # "[详细]" link to the detail page when necessary
                question_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    question_text = str(
                        turn_page_soup.find("div", class_="question").text).strip()
                function_logger.debug("Question: %s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text, "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "", str(
                        turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("Answer: %s" % answer_text)
                records.append([
                    question_title, question_from, question_time, question_text,
                    answer_text
                ])
            with open(file_path + "/" + school[0] + "常用问题集.csv", "a", encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("Finished collecting the question set for %s!" % school[0])
def get_plan_info_xjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # Scraping the individual list pages needs a lot of post-processing; kept here for reference
    # mylogger.info("Start fetching page sources... five pages in total")
    # with open(file_path + "/source/page_url_list", "w", encoding="utf-8") as url_file:
    #     for i in range(1, 6):
    #         main_url = "http://zs.xjtu.edu.cn/lmy.jsp?a43639t=5&a43639p=" + str(i) \
    #             + "&a43639c=10&urltype=tree.TreeTempUrl&wbtreeid=1005"
    #         # Fetch the category page
    #         main_page_source = requests.get(main_url).text
    #         main_page_soup = BeautifulSoup(main_page_source, "lxml")
    #         main_page_soup.prettify()
    #         for item in main_page_soup.find("div", id="fybt").find("ul").find_all("a"):
    #             url_file.write(str(item) + "\n")
    #     mylogger.info("Enrollment plan page urls fetched")
    # mylogger.info("Start fetching the detail pages")
    # with open(file_path + "/source/page_url_list", "r", encoding="utf-8") as url_file:
    #     url_source = url_file.read()
    #     url_soup = BeautifulSoup(url_source, "lxml")
    #     url_soup.prettify()
    #     for page_url in url_soup.find_all("a"):
    #         print(page_url)

    # Query the official site directly via a form POST
    # First collect the selectable years and districts
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="sf").find_all("option")[1:]:
        districts.append(district.string)
    mylogger.debug("Selectable years: " + str(years))
    mylogger.debug("Selectable provinces: " + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the search button (its size is 54x22)
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for tr in return_soup.find("div", id="fybt").find_all("tr"):
                line = []
                for td in tr:
                    if td.string != "\n":
                        line.append(str(td.string).strip())
                all_lines.append(line)
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            table_content = []
            for line in all_lines[1:-1]:
                classy = line[2]
                if classy == "理":
                    classy = "理工"
                if classy == "文":
                    classy = "文史"
                table_content.append([line[0], classy, line[4]])
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + district + " enrollment plan saved to file")
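# write_table() is the helper that all of the get_plan_info_* functions use to persist one
# table per (year, district). It is defined elsewhere in the project; a plausible CSV-based
# sketch, assuming it writes "<table_name>.csv" under file_path with a header row followed
# by the content rows (the file name and format are assumptions):
import csv
import os

def write_table(file_path, table_name, table_head, table_content):
    os.makedirs(file_path, exist_ok=True)
    with open(os.path.join(file_path, table_name + ".csv"), "w",
              newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(table_head)
        writer.writerows(table_content)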
def get_plan_info_fudan():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path_benbu = "Information/九校联盟/复旦大学/招生计划"
    file_path_yixue = "Information/九校联盟/复旦大学上海医学部/招生计划"
    # Query the official site directly via a form POST
    # First collect the selectable years and districts
    main_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.html"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="ss").find_all("option"):
        districts.append(district.string)
    mylogger.debug("Selectable years: " + str(years))
    mylogger.debug("Selectable provinces: " + str(districts))
    search_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.action"
    # Data is available for 2006-2015
    for year in years:
        for district in districts:
            params = {"lb": "plan", "nf": year, "ss": district}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for div in return_soup.find_all("div", class_="inquirytable_result"):
                for tr in div.find_all("tr"):
                    line = []
                    for td in tr:
                        if td.string != "\n":
                            line.append(str(td.string).strip())
                    all_lines.append(line)
            table_name = year + "-" + district
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # The query returned no data
            if len(all_lines) < 3:
                continue
            # Extract the data
            table_content_benbu = []
            table_content_yixue = []
            # Since 2013 the main campus and the Shanghai Medical College recruit separately
            if int(year) < 2013:
                for line in all_lines[1:-1]:
                    # Skip the liberal-arts / science summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # The Shanghai table has a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
            else:
                # Split the rows into main campus and medical college parts
                index = 0
                for i_line in range(1, len(all_lines)):
                    if all_lines[i_line][0] == "专业名称":
                        index = i_line
                        break
                if index == 0:
                    all_lines_benbu = all_lines
                    all_lines_yixue = []
                else:
                    all_lines_benbu = all_lines[:index]
                    all_lines_yixue = all_lines[index:]
                for line in all_lines_benbu[1:-1]:
                    # Skip the liberal-arts / science summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # The Shanghai table has a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
                if len(all_lines_yixue) != 0:
                    for line in all_lines_yixue[1:-1]:
                        # Skip the liberal-arts / science summary rows
                        if line[0] == "文史汇总" or line[0] == "理工汇总":
                            continue
                        # The Shanghai table has a different header layout
                        if district == "上海":
                            table_content_yixue.append(
                                [line[0], line[1], line[5]])
                        else:
                            table_content_yixue.append(
                                [line[0], line[1], line[3]])
            mylogger.debug("Main campus enrollment plan:")
            for line in table_content_benbu:
                mylogger.debug(str(line))
            mylogger.debug("Medical college enrollment plan:")
            for line in table_content_yixue:
                mylogger.debug(str(line))
            write_table(file_path_benbu, table_name, table_head,
                        table_content_benbu)
            mylogger.info("Main campus " + year + district + " enrollment plan saved to file")
            if len(table_content_yixue) != 0:
                write_table(file_path_yixue, table_name, table_head,
                            table_content_yixue)
                mylogger.info("Medical college " + year + district + " enrollment plan saved to file")
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Base url of the consultation forum
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    allready_get = [["北京大学", str(26232)],
                    ["哈尔滨工业大学", str(26617)],
                    ["北京大学医学部", str(6405529)],
                    ["上海交通大学", str(6217)],
                    ["上海交通大学医学院", str(61811)],
                    ["清华大学", str(36710)],
                    ["复旦大学", str(7243)],
                    ["南京大学", str(4453)],
                    ["浙江大学", str(43617)],
                    ["中国科学技术大学", str(6280)],
                    ["哈尔滨工业大学(威海)", str(62646117)],
                    ["西安交通大学", str(53593)]]
    university_formid = []
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for info in university_infos:
        if "985" in info["院校特性"] or "211" in info["院校特性"]:
            if info["forum_id"] != "":
                university_formid.append([info["院校名称"], info["forum_id"]])
    function_logger.info("There are %d 985/211 universities in total" % len(university_formid))
    for university in university_formid:
        begin = time.time()
        function_logger.info("Start fetching admission questions for " + university[0] + "...")
        main_page_url = "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-" \
            + university[1] + ",start-0.dhtml"
        try:
            main_page_source = request_url(main_page_url)
            main_page_source.encoding = main_page_source.apparent_encoding
            main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
            # Total page count; the pager is parsed differently depending on whether it
            # contains an ellipsis item
            if main_page_soup.find("li", class_="lip dot"):
                page_count = main_page_soup.find(
                    "li", class_="lip dot").next_sibling.a.string
            else:
                page_count = main_page_soup.find(
                    "ul", class_="ch-page clearfix").find_all("li")[-2].a.string
            # Number of pinned questions
            top_question_count = len(
                main_page_soup.find("table", class_="ch-table zx-table").find_all(
                    "span", class_="question_top_txt"))
            function_logger.debug("Total pages: %d  pinned questions: %d" %
                                  (int(page_count), int(top_question_count)))
        except Exception as e:
            # The consultation page has no data (affects three universities)
            function_logger.error("%s has no consultation data, page url: %s" %
                                  (university[0], main_page_url))
            function_logger.error("Error: %s" % e)
            continue
        # Create the question CSV for this university and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        csvfile = open(file_path + "/" + university[0] + "常用问题集.csv",
                       "w", newline="", encoding='utf-8')
        csvfile.truncate()
        writer = csv.writer(csvfile)
        writer.writerow(table_head)
        record_queue = Queue()
        # Start 10 download threads per batch, plus one thread that saves the records
        start_index = 0
        end_index = 10
        while True:
            if start_index > int(page_count):
                break
            else:
                dThread = [
                    DownloadPageInfo(university[1], page_id, int(page_count),
                                     top_question_count, record_queue)
                    for page_id in range(start_index, end_index)
                ]
                sThread = SavePageInfo(record_queue, writer)
                for d in dThread:
                    d.start()
                sThread.start()
                for d in dThread:
                    d.join()
                record_queue.put(-1)
                sThread.join()
                start_index += 10
                end_index += 10
                if end_index > int(page_count):
                    end_index = int(page_count)
        csvfile.close()
        function_logger.info("Fetching %s took %ds" % (university[0], time.time() - begin))
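# SavePageInfo is the consumer thread started above: it drains record_queue, writes each
# record with the csv writer, and stops at the -1 sentinel that get_question_yggk() enqueues
# after the download threads have joined. The class is defined elsewhere in the project;
# this is only a sketch under those assumptions (each queue item is taken to be the list of
# records returned by DownloadPageInfo.get_page_info()):
import threading

class SavePageInfo(threading.Thread):
    def __init__(self, record_queue, writer):
        super().__init__()
        self.record_queue = record_queue
        self.writer = writer

    def run(self):
        while True:
            records = self.record_queue.get()
            if records == -1:  # sentinel pushed after the download threads join
                break
            for record in records:
                self.writer.writerow(record)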
def get_page_info(self):
    main_url = "https://gaokao.chsi.com.cn"
    page_question_count = 15
    page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" \
        + self.university_id + ",start-" + str(self.page_id * page_question_count) + ".dhtml"
    self.thread_logger.info("Page progress (%d, %d)" % (self.page_id + 1, self.page_count))
    self.thread_logger.info("Page url %s" % page_url)
    try:
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        # All children of the consultation table
        tr_list = page_soup.find("table", class_="ch-table zx-table").contents
        # Drop the whitespace-only children
        tr_list = [item for item in tr_list if item != "\n"]
        # Pinned questions are recorded only once (on the first page)
        if self.page_id == 0:
            start_index = 0
        else:
            start_index = self.top_question_count * 2
        page_infos = []
        # Substrings stripped from both question and answer text
        replace_str = [
            "回复", "\n", "\r", "\t", "\xa0", "\ue63c", "\ue5e5",
            "\u3000", "[", "]", " "
        ]
        for i_qa_pair in range(start_index, len(tr_list), 2):
            question_title = "q_title"
            question_from = ""
            question_time = ""
            question_text = "q_text"
            answer_text = "a_text"
            question_title = str(tr_list[i_qa_pair].find(
                "a", class_="question_t_txt").string).strip().replace(",", ",")
            # self.thread_logger.debug("Title: %s" % question_title)
            question_from = str(tr_list[i_qa_pair].find(
                "i", title="提问人").next_sibling.string).strip().replace(",", ",")
            # self.thread_logger.debug("From: %s" % question_from)
            question_time = str(tr_list[i_qa_pair].find(
                "td", class_="question_t ch-table-center").text).strip().replace(",", ",")
            # self.thread_logger.debug("Time: %s" % question_time)
            # Question and answer may be truncated on this page; follow the
            # "[详细]" link to the detail page when necessary
            question_text_class = tr_list[i_qa_pair + 1].find(
                "div", class_="question")
            if question_text_class.find(text='[详细]') is None:
                question_text = str(question_text_class.text).strip()
            else:
                turn_page_url = main_url + question_text_class.find(
                    "a", text='[详细]')["href"]
                question_text = self.get_question_text(turn_page_url)
            for r_str in replace_str:
                question_text = question_text.replace(r_str, "")
            question_text = question_text.replace(",", ",")
            # self.thread_logger.debug("Question: %s" % question_text)
            answer_text_class = tr_list[i_qa_pair + 1].find(
                "div", class_="question_a")
            if answer_text_class.find(text='[详细]') is None:
                answer_text = str(answer_text_class.text).replace(
                    "[ 回复 ]", "").strip()
            else:
                turn_page_url = main_url + answer_text_class.find(
                    "a", text='[详细]')["href"]
                answer_text = self.get_answer_text(turn_page_url)
            for r_str in replace_str:
                answer_text = answer_text.replace(r_str, "")
            answer_text = answer_text.replace(",", ",")
            # self.thread_logger.debug("Answer: %s" % answer_text)
            page_infos.append([
                question_title, question_from, question_time, question_text,
                answer_text
            ])
        return page_infos
    except Exception as e:
        self.thread_logger.error("Error: %s" % e)
        return []
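# get_page_info(), get_answer_text(), and (presumably) get_question_text() are methods of
# the DownloadPageInfo thread class constructed in get_question_yggk(). That class is not
# part of this excerpt; the sketch below only shows how its run() method might feed the
# record queue consumed by SavePageInfo. The constructor signature is taken from the call
# site above; everything else is an assumption:
import threading

class DownloadPageInfo(threading.Thread):
    def __init__(self, university_id, page_id, page_count, top_question_count, record_queue):
        super().__init__()
        self.university_id = university_id
        self.page_id = page_id
        self.page_count = page_count
        self.top_question_count = top_question_count
        self.record_queue = record_queue
        self.thread_logger = MyLog(logger=self.__class__.__name__).getlog()

    def run(self):
        # Push the whole page's records as one queue item; SavePageInfo iterates over it.
        self.record_queue.put(self.get_page_info())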