def login(self):
    """Log in to the OA system.

    Fetches the login page to scrape the hidden "lt" token, then posts
    the credentials with it.

    Returns:
        True on success (also sets ``self.login_state``), False on an
        HTTP 200 reply without the success marker, or the int 503 on a
        non-200 reply.
        NOTE(review): 503 is truthy, so callers must test ``== 503``
        BEFORE a plain truth test — confirm all callers do.

    Raises:
        exception.CrawlerException: on any unexpected error (ce9).
    """
    try:
        main_page = self.__session.get(url=self.__url)
        soup = BeautifulSoup(main_page.text, "html.parser")
        # assumes the 5th <input> on the page is the hidden "lt" token
        # — TODO confirm against the live login form
        lt = soup.find_all('input')[4]
        lt_code = lt.attrs["value"]
        data = {
            "username": self.__username,
            "password": self.__password,
            "imageCodeName": "",
            "errors": 0,
            "lt": lt_code,
            "_eventId": "submit"
        }
        page = self.__session.post(url=self.__url, data=data)
        # print(page.content.decode("utf-8"))
        if page.status_code == 200:
            # success is reported in the page body, not the status code
            if "登录成功" in page.text:
                self.login_state = True
                return True
            else:
                return False
        else:
            return 503
    except Exception as e:
        print(e)
        raise exception.CrawlerException("ce9:内部错误:oa系统访问失败稍后重试")
def login(self): page_lt = self.__session.get(url=self.__login_page) # print(page_lt.text) # soup = BeautifulSoup(page_lt.text, "html.parser") # lt = soup.find_all("input")[3] html = etree.HTML(page_lt.content.decode("utf-8")) lt = html.xpath("//input[@name='lt']/@value")[0] # print(lt) execution = html.xpath("//input[@name='execution']/@value")[0] self.data["lt"] = lt self.data["execution"] = execution # print(self.data) page = self.__session.post(url=self.__login_page, data=self.data) # print(page.content.decode("utf-8")) if page.status_code == 200: if "密码错误" in page.text or "账户不存在" in page.text or "为空" in page.text or "密码有误" in page.text: return False elif "请输入验证码" in page.text: raise exceptions.CrawlerException( "ce13:验证码输入:需要输入验证码,本系统暂不支持自动输入验证码,请在学校网站登录一次你的用户和密码") else: return True return False
def get_course_table_with_stuid(self, stuid, week=1, semester=662):
    """Fetch the course table for the given student id.

    Returns a de-duplicated list of
    ``{"username", "course_id", "semester"}`` dicts, or None when no
    ``TaskActivity`` entries are found in the page scripts.

    Raises:
        exceptions.CrawlerException: ce14 when the server does not
            answer HTTP 200.
    """
    data = {
        "ignoreHead": 1,
        "setting.kind": "std",
        "startWeek": week,
        "semester.id": semester,
        "ids": stuid
    }
    page = self.__session.post(url=self.__course_table_page, data=data)
    if page.status_code !=200:
        raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
    soup = BeautifulSoup(page.text, "html.parser")
    # the course data lives in the second-to-last <script> block as
    # JavaScript "activity = new TaskActivity(...)" calls
    script = str(soup.find_all("script")[-2])
    course_list = re.findall("activity\s=\snew\sTaskActivity\((.*?)\)", script)
    course_id_list = []
    if len(course_list) != 0:
        for course in course_list:
            # assumes each captured argument string contains a "(" whose
            # tail is the course id — TODO confirm against a live page
            course_id = str(course).split("(")[1]
            course_id_list.append(course_id)
        course_id_list = list(set(course_id_list))  # dedupe -> list of course ids
        course_dic_list = []  # course-id list => list of course-id dicts
        for course_id in course_id_list:
            dic = {
                "username": self.__username,
                "course_id": course_id,
                "semester": semester
            }
            course_dic_list.append(dic)
        return course_dic_list
    else:
        return None
def get_course_table_another_way(self):
    """Fetch the course table via the login-free endpoint.

    No login is required, so the teaching-evaluation reminder never
    appears. Returns a de-duplicated list of
    ``{"username", "course_id", "semester"}`` dicts (semester hardcoded
    to 662), or None when no ``TaskActivity`` entries are found.

    Raises:
        exceptions.CrawlerException: ce14 when the server does not
            answer HTTP 200.
    """
    url = self.__another_course_table_page+"?stdCode="+self.__username
    page = self.__session.get(url)
    if page.status_code !=200:
        raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
    soup = BeautifulSoup(page.text, "html.parser")
    # the course data lives in the last <script> block as JavaScript
    # "activity = new TaskActivity(...)" calls
    script = str(soup.find_all("script")[-1])
    course_list = re.findall("activity\s=\snew\sTaskActivity\((.*?)\)", script)
    course_id_list = []
    if len(course_list) != 0:
        for course in course_list:
            # assumes each captured argument string contains a "(" whose
            # tail is the course id — TODO confirm against a live page
            course_id = str(course).split("(")[1]
            course_id_list.append(course_id)
        course_id_list = list(set(course_id_list))  # dedupe -> list of course ids
        course_dic_list = []  # course-id list => list of course-id dicts
        for course_id in course_id_list:
            dic = {
                "username": self.__username,
                "course_id": course_id,
                "semester": 662
            }
            course_dic_list.append(dic)
        return course_dic_list
    else:
        return None
def get_user_detail(self):
    """Scrape the student profile page into a dict.

    Returns the student's basic info plus either a saved photo path
    ("photo") or the default avatar ("avatar"), depending on
    ``self.save_photo()``. Also caches the scraped student number on
    ``self.__username``.

    Raises:
        exceptions.CrawlerException: ce14 when the server does not
            answer HTTP 200.
    """
    # the page also carries a hidden studentId field:
    # <input type="hidden" name="studentId" value="246944"/>
    page = self.__session.get(url=self.__user_detail_page)
    if page.status_code != 200:
        raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
    soup = BeautifulSoup(page.text, "html.parser")
    td = soup.find_all("td")
    # (removed: leftover debug print of the whole parsed page)
    username = td[2].string  # student number
    self.__username = username
    # build the shared fields once instead of two near-identical literals
    dic = {
        "username": username,
        "name": td[4].string,          # name
        "sex": td[9].string,           # sex
        "grade": td[11].string,        # grade
        "level": td[17].string,        # degree level
        "department": td[21].string,   # department
        "profession": td[23].string,   # major
        "class": td[43].string,        # class
        "campus": td[45].string,       # campus
        # NOTE(review): index 56 differs from the sibling scraper, which
        # reads birthday from td[47] and phone from td[56] — confirm.
        "birthday": td[56].string,
    }
    photo_state = self.save_photo()
    if photo_state == -1:
        dic["photo"] = None
        dic["avatar"] = "/static/media/avatar/default.png"
    else:
        dic["photo"] = photo_state
        dic["avatar"] = None
    return dic
def get_stuid(self):
    """Return the internal student id (the hidden "ids" form field).

    Also caches it as an int on ``self.__stuid``. Any failure is
    appended to log.log and signalled by returning -1 instead of
    raising.
    """
    try:
        resp = self.__session.get(url="https://jx.sspu.edu.cn/eams/courseTableForStd.action")
        if resp.status_code != 200:
            raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        markup = BeautifulSoup(resp.text, "html.parser")
        # the id is injected by JS in the last <script> block
        last_script = markup.find_all("script")[-1]
        ids_found = re.findall(r"bg.form.addInput\(form,\"ids\",\"(.*?)\"\);", str(last_script))
        self.__stuid = int(ids_found[0])
        return ids_found[0]
    except Exception as err:
        # best effort: record the failure, then report it with -1
        with open("log.log", "a+") as log_file:
            log_file.write(str(self.__username) + "place error" + str(err) + "\n")
        return -1  # stuid is -1 on error
def get_all_semester_summary(self):
    """Return per-semester GPA summaries plus an overall "sum" row.

    The score page lays out one <tr> per school year; the second-to-last
    row holds the overall totals in <th> cells, and the very last row
    carries no data.

    Raises:
        exceptions.CrawlerException: ce14 when the server does not
            answer HTTP 200.
    """
    page = self.__session.post(url=self.__all_grade_page,
                               data={"projectType": "MAJOR"})
    if page.status_code != 200:
        raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
    soup = BeautifulSoup(page.text, "html.parser")
    rows = soup.find_all("tbody")[0].find_all("tr")
    total = len(rows)
    summaries = []
    for index, row in enumerate(rows):
        if index >= total - 1:
            continue  # the last row carries no summary data
        if index == total - 2:
            # overall totals live in <th> cells on this row
            cells = row.find_all("th")
            summaries.append({
                "username": self.__username,
                "school_year": "sum",
                "lesson_num": cells[1].text,
                "total_credit": cells[2].text,
                "average_score": cells[3].text  # grade-point average
            })
        else:
            cells = row.find_all("td")
            if cells:
                summaries.append({
                    "username": self.__username,
                    "school_year": cells[0].text + " " + cells[1].text,
                    "lesson_num": cells[2].text,
                    "total_credit": cells[3].text,
                    "average_score": cells[4].text  # grade-point average
                })
    return summaries
def login(username, password, school, system=None):
    """Log a user into the requested school system.

    Args:
        username/password: the user's credentials.
        school: 3 = SSPU, 4 = Shanghai Lixin, 5 = Sanda University,
            7 = Shanghai University of Electric Power.
        system: for school 3 only — 0 = OA, 1 = EAMS, None = try OA
            then EAMS and return the string "oa" or "eams" naming the
            system that accepted the credentials. Other branches return
            the logged-in session object.

    Raises:
        exceptions.CrawlerException: on bad credentials (ce1/ce3/ce4),
            server failure (ce9) or no response (ce10).
    """
    if school == 3:  # Shanghai Polytechnic University
        try:
            if system == 0:
                oa = OASession(username, password)
                login_state = oa.login()
                # BUGFIX: test the 503 sentinel FIRST — 503 is truthy,
                # so the old plain truth test returned a failed session
                # instead of raising ce9.
                if login_state == 503:
                    raise exceptions.CrawlerException("ce9:内部错误:oa系统访问失败稍后重试")
                elif login_state:
                    return oa
                else:
                    raise exceptions.CrawlerException("ce3:登陆错误:oa密码错误")
            elif system == 1:
                eams = EAMSSession(username, password)
                eams_state = eams.login()
                if eams_state == 503:
                    raise exceptions.CrawlerException("ce9:内部错误:教务系统访问失败稍后重试")
                elif eams_state:
                    return eams
                else:
                    raise exceptions.CrawlerException("ce4:登陆错误:eams密码错误")
            if not system:
                # try OA first, then fall back to EAMS
                oa = OASession(username, password)
                login_state = oa.login()
                if login_state and login_state != 503:
                    return "oa"
                eams = EAMSSession(username, password)
                eams_state = eams.login()
                # check 503 before the truth test (same truthiness trap)
                if eams_state == 503:
                    raise exceptions.CrawlerException("ce9:内部错误:教务系统访问失败稍后重试")
                if eams_state:
                    return "eams"
                raise exceptions.CrawlerException("ce1:登陆错误:用户提供的用户名、密码对错误,oa,eams系统都登陆不上")
        except requests.exceptions.ConnectionError:
            raise exceptions.CrawlerException("ce10:访问失败:学校服务器对你的请求没有响应,访问失败")
        # CrawlerException and other exceptions propagate unchanged
        # (the old explicit "raise e" handlers were no-ops).
    elif school == 4:  # Shanghai Lixin University of Accounting and Finance
        if not system:
            try:
                li_session = LiXinSession(username, password)
                if li_session.login():
                    return li_session
                raise exceptions.CrawlerException("ce1:密码错误:请检查你输入的密码是否正确")
            except requests.exceptions.ConnectionError:
                raise exceptions.CrawlerException("ce10:访问失败:学校服务器对你的请求没有响应,访问失败")
    elif school == 5:  # Sanda University
        if not system:
            try:
                san_session = SanDauSession(username, password)
                if san_session.login():
                    return san_session
                raise exceptions.CrawlerException("ce1:密码错误:请检查你输入的密码是否正确")
            except requests.exceptions.ConnectionError:
                raise exceptions.CrawlerException("ce10:访问失败:学校服务器对你的请求没有响应,访问失败")
    elif school == 7:  # Shanghai University of Electric Power
        if not system:
            try:
                ship_session = SHIEPSession(username, password)
                if ship_session.login():
                    return ship_session
                raise exceptions.CrawlerException("ce1:密码错误:请检查你输入的密码是否正确")
            except requests.exceptions.ConnectionError:
                raise exceptions.CrawlerException("ce10:访问失败:学校服务器对你的请求没有响应,访问失败")
def transaction(self):
    """Fetch the campus-card balance and the consume records between
    ``self.begin`` and ``self.end``.

    Returns:
        ``{"balance": <str or None>, "detail": <list of row dicts,
        or a "ce6:..." message string when the window has no data>}``

    Raises:
        exception.CrawlerException: ce14 on non-200, ce12 when the page
            shows the user is not actually logged in (no "退出" link).
    """
    # Important: the site must be visited twice before the
    # JSESSIONID-bearing page can be obtained.
    # url1's response sets the JSESSIONID cookie.
    self.session.get(self.url1)
    # visiting url3 re-uses the session so the fresh cookie takes effect
    page = self.session.get(self.url3)
    money = re.findall("余额:(.*?)元", page.text)[0] if len(
        re.findall("余额:(.*?)元", page.text)) > 0 else None
    # anti-forgery token needed by the query POST below
    _csrf = etree.HTML(
        page.text).xpath("//input[@name='_csrf']//@value")[0]
    if page.status_code != 200:
        raise exception.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
    if "退出" not in page.text:
        raise exception.CrawlerException("ce12:校卡余额并不能被eams密码查到,请用oa密码登陆")
    check_record = "https://card.sspu.edu.cn/epay/consume/query"
    post_data = {
        "starttime": self.begin,
        "endtime": self.end,
        "_csrf": _csrf
    }
    html = self.session.post(check_record, post_data).text
    html_etree = etree.HTML(html)
    # read the "x/y页" pager text to learn how many result pages exist
    page_numer = html_etree.xpath(
        "//table[@align='right']/tr[1]/td[1]//text()")
    tbody_tr = []
    # collect the result rows from every page
    if len(page_numer) > 1:
        for number in range(
                1, int(re.findall(".*/(\d+)页", "".join(page_numer))[0]) + 1):
            post_data["pageNo"] = number
            html_etree = etree.HTML(
                self.session.post(check_record, data=post_data).text)
            tbody_tr.extend(
                html_etree.xpath("//div[@class='tab-content']//tbody//tr"))
    else:
        tbody_tr = html_etree.xpath(
            "//div[@class='tab-content']//tbody//tr")
    if len(tbody_tr) == 0:  # no transactions in the requested window
        tran_list = "ce6:{}到{}没有交易数据".format(self.begin, self.end)
    else:  # build one dict per transaction row
        tran_list = []
        for tr in tbody_tr:
            tran_dic = {
                "number": "".join(tr.xpath("./td[1]/div[2]/text()")).strip(),
                "transaction number": "".join(tr.xpath("./td[2]/div/text()")),
                "happen_datetime": "".join(tr.xpath("./td[1]/div[1]/text()")).strip(),
                "way": "".join(tr.xpath("./td[2]/a/text()")),
                "money": "".join(tr.xpath("./td[4]/text()")).replace(
                    "\xa0", "").replace("\r", "").replace("\n", "").replace("\t", ""),
                "amount_money": "".join(tr.xpath("./td[5]/text()")).replace("\xa0", "").strip(),
                "is_ok": "".join(tr.xpath("./td[6]/span/text()"))
            }
            tran_list.append(tran_dic)
    return {"balance": money,
            "detail": tran_list}
def get_all_semester_summary(self):
    """Return ``{"all_semester": ...}`` with per-semester GPA summaries.

    On success: ``{"all_semester": {"state": 1, "data": [...]}}``;
    on failure: ``{"all_semester": {"state": -1, "error_code": ...,
    "reason": ...}}``.
    """
    try:
        data = {"projectType": "MAJOR"}
        # Hold the mutex only around the shared-session request and
        # release it in a finally: the old code leaked the lock when the
        # ce14 raise fired (the CrawlerException handler never released
        # it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            page = self.__session.post(url=self.__all_score, data=data)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        soup = BeautifulSoup(page.text, "html.parser")
        tbodys = soup.find_all("tbody")[0]
        trs = tbodys.find_all("tr")
        average_grade_list = []
        length = len(trs)
        for i, tr in enumerate(trs):
            if i < length - 1:  # the very last row carries no data
                if i == length - 2:
                    # second-to-last row: overall totals in <th> cells
                    ths = tr.find_all("th")
                    average_grade_list.append({
                        "username": self.__username,
                        "school_year": "sum",
                        "lesson_num": ths[1].text,
                        "total_credit": ths[2].text,
                        "average_score": ths[3].text  # grade-point average
                    })
                else:
                    tds = tr.find_all("td")
                    if tds:
                        average_grade_list.append({
                            "username": self.__username,
                            "school_year": tds[0].text + " " + tds[1].text,
                            "lesson_num": tds[2].text,
                            "total_credit": tds[3].text,
                            "average_score": tds[4].text  # grade-point average
                        })
        return {"all_semester": {"state": 1, "data": average_grade_list}}
    except requests.exceptions.ConnectionError:
        return {
            "all_semester": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "all_semester": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        return {
            "all_semester": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_all_score(self):
    """Return ``{"score": ...}`` with one dict per course score row.

    On success: ``{"score": {"state": 1, "data": [...]}}``;
    on failure: ``{"score": {"state": -1, "error_code": ...,
    "reason": ...}}``.
    """
    try:
        # NOTE(review): "projectType1" differs from the "projectType"
        # key used by the summary endpoint — confirm it is intentional.
        data = {"projectType1": "MAJOR"}
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            html = self.__session.post(self.__all_score, data=data)
            if html.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        html = etree.HTML(html.content.decode("utf-8"))
        tr_data = html.xpath("//table/tbody[contains(@id,'data')]//tr")
        score_data = []
        for tr in tr_data:
            if not tr.xpath(".//td"):
                continue  # skip header/placeholder rows
            semester = "".join(tr.xpath(".//td[1]//text()"))
            # trim surrounding whitespace by capturing first..last \S
            courseid = "".join(
                re.findall("(\S.*\S)", "".join(tr.xpath(".//td[2]//text()"))))
            course_name = "".join(
                re.findall("(\S.*\S)", "".join(tr.xpath(".//td[4]//text()"))))
            code = "".join(tr.xpath(".//td[3]//text()"))
            # keep a trailing "/" marker for codes that contained one
            if "/" in code:
                coursecode = re.sub("\W*", "", code) + "/"
            else:
                coursecode = re.sub("\W*", "", code)
            courseevaluation = self.hand_space("".join(
                tr.xpath(".//td")[-3].xpath(".//text()")))
            coursescore = self.hand_space("".join(
                tr.xpath(".//td")[-2].xpath(".//text()")))
            score_data.append({
                "semester": semester,
                "username": self.__username,
                "course_id": courseid,
                "course_name": course_name,
                "course_code": coursecode,
                "course_evaluation": courseevaluation,
                "course_score": coursescore,
            })
        return {"score": {"state": 1, "data": score_data}}
    except requests.exceptions.ConnectionError:
        return {
            "score": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "score": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        return {
            "score": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_course_table(self, from_week=1, semester=163):
    """Return ``{"course": ...}`` with the parsed course table.

    Parses the JS ``new TaskActivity(...)`` calls embedded in the page
    scripts into course dicts, attaches each course's credit from the
    score table, and post-processes duplicates via
    ``self.hand_course_same``.
    """
    try:
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            self.get_stuid()
            data = {
                "ignoreHead": 1,
                "setting.kind": "std",
                "startWeek": from_week,
                "semester.id": semester,
                "ids": self.__stuid
            }
            page = self.__session.post(url=self.__course_table_page, data=data)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        soup = BeautifulSoup(page.text, "html.parser")
        script = str(soup.find_all("script")[-3])
        HTML = etree.HTML(page.content.decode("utf-8"))
        tbody = HTML.xpath('//tbody[contains(@id,"data")]//tr')
        name_score_list = {}  # course name -> credit
        for tr in tbody:
            name = tr.xpath(".//td[4]/text()")[0].replace(
                "\t", "").replace("\r", "").replace("\n", "").strip()
            score = tr.xpath(".//td[9]/text()")[0].replace(
                "\t", "").replace("\r", "").replace("\n", "").strip()
            name_score_list.setdefault(name, score)
        course_dic_list = []
        # each chunk after the first is one "new TaskActivity(...)" call
        course_all_list = script.split("activity = new TaskActivity")
        for i in range(1, len(course_all_list)):
            course_list = re.match("\((.+)\)", course_all_list[i]).group()
            teacher = course_list.split(',"')[1]
            # trailing argument holds the occupied-weeks bitmask string
            cour_aoccpy_week = course_list.split(",\"")[-1][:-2]
            course_id = re.findall("(.*)\)", course_list.split("(")[2])[0]
            place = course_list.split(',"')[5][:-1]
            vally_time = GetWeek()
            result = vally_time.marshal(cour_aoccpy_week, 2, from_week, 19)
            name = re.match("(.+)\(\d{2}", course_list.split(',"')[3]).group(1)
            # "index = d*unitCount+d" lines give weekday/period (0-based)
            course_week_time_list = re.findall("index\s=(.*);?", course_all_list[i])
            week = []
            time = []
            for week_time in course_week_time_list:
                week_time = re.findall("\d{1,2}", week_time)
                week.append(str(int(week_time[0]) + 1))
                time.append(str(int(week_time[1]) + 1))
            dic = {
                "username": self.__username,
                "course_id": course_id,  # course serial number
                "course_code": str(course_id).split(".")[0] if "."
                in str(course_id) else course_id,  # course code
                "teacher": teacher,
                "week_place": place,  # location
                "name": name,
                "duration": result,
                "course_score": name_score_list[name],
                "week": ";".join(list(set(week))),  # weekday(s)
                "week_pitch": "-".join(time) if len(time) <= 2 else
                time[0] + "-" + time[len(time) - 1],  # period range
                "semester": semester
            }
            course_dic_list.append(dic)
        hand_course_list = self.hand_course_same(course_dic_list)
        return {"course": {"state": 1, "data": hand_course_list}}
    except requests.exceptions.ConnectionError:
        return {
            "course": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "course": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        traceback.print_exc()
        return {
            "course": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_detail(self):
    """Return ``{"detail": ...}`` with the student's profile.

    Scrapes the profile page (which also carries the hidden studentId
    field: ``<input type="hidden" name="studentId" value="246944"/>``),
    resolves the internal student id, and attaches either a photo path
    or the default avatar depending on ``self.save_photo()``.
    """
    try:
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            page = self.__session.get(url=self.__user_detail_page)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        soup = BeautifulSoup(page.text, "html.parser")
        td = soup.find_all("td")
        stuid = self.get_stuid()  # internal student ID
        username = td[2].string  # student number
        self.__username = username
        # Build the shared fields once: the old "photo present" branch
        # silently dropped the "phone" field.
        detail = {
            "username": username,
            "stuid": int(stuid),
            "name": td[4].string,          # name
            "sex": td[9].string,           # sex
            "grade": td[11].string,        # grade
            "level": td[17].string,        # degree level
            "department": td[21].string,   # department
            "profession": td[23].string,   # major
            "class": td[43].string,        # class
            "campus": td[45].string,       # campus
            "birthday": td[47].string,     # birthday
            "phone": td[56].string,        # phone number
        }
        photo_state = self.save_photo()
        if photo_state == -1:
            detail["photo"] = None
            detail["avatar"] = "/static/media/avatar/default.png"
        else:
            detail["photo"] = photo_state
            detail["avatar"] = None
        return {"detail": {"state": 1, "data": detail}}
    except requests.exceptions.ConnectionError:
        return {
            "detail": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "detail": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        traceback.print_exc()
        return {
            "detail": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_course_table(self, week=1, semester=1640420192):
    """Return ``{"course": ...}`` with the parsed course table.

    First resolves the user's "ids" value from the course page, fetches
    the table, then queries each course's schedule individually to
    collect week/period/place entries (with odd/even-week flags).
    """
    try:
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            page = self.session.get(url=self.__course)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        soup = BeautifulSoup(page.text, "html.parser")
        script = str(soup.find_all("script")[-1])
        # the user's "ids" value is embedded in the last script block
        ids = re.findall("addInput.*\"(\d+)", script)[0]
        data = {
            "setting.kind": "std",
            "weekSpan": week,
            "project.id": 5,
            "semester.id": semester,
            "ids": ids
        }
        page = self.session.post(url=self.__course_table_url, data=data)
        html_contend = etree.HTML(page.content.decode("utf-8"))
        course_list = []
        for tr in html_contend.xpath("//tbody/tr"):
            course_id = tr.xpath("./td[2]/text()")[0]
            course_code = tr.xpath("./td[3]/text()")[0]
            course_list.append([course_id, course_code])
        course_dict_list = []
        # query the weekly schedule of each course
        for course in course_list:
            data = {
                "lesson.semester.id": semester,
                "lesson.project.id": 5,
                "lesson.no": course[0],
                "lesson.course.code": course[1]
            }
            page = self.session.post(url=self.__all_course_url, data=data)
            td_list = etree.HTML(page.content.decode("utf-8")).xpath("//tbody//td")
            teacher = "".join(td_list[6].xpath(".//text()")).replace(
                "\t", "").replace("\n", "").strip()
            prepare_week = td_list[5].xpath(".//text()")
            time_list = []
            for time in prepare_week:
                period = 0  # 0 = every week, 1 = odd weeks, 2 = even weeks
                time = time.replace("\t", "").replace("\n", "").strip()
                temp_list = time.split()
                if len(temp_list) > 0:
                    if "单" in temp_list[2]:
                        period = 1
                        temp_list[2] = temp_list[2].replace("单", "")
                    elif "双" in temp_list[2]:
                        period = 2
                        temp_list[2] = temp_list[2].replace("双", "")
                    time_list.append({
                        "week": temp_list[0],
                        "week_pitch": temp_list[1],
                        "duration": temp_list[2],
                        "place": temp_list[3],
                        "period": period
                    })
            course_dict_list.append({
                "username": self.__username,
                "course_id": course[0],
                "course_code": course[1],
                "teacher": teacher,
                "name": td_list[3].xpath("./a/text()")[0],
                "course_score": td_list[-2].xpath(".//text()")[0],
                "time": time_list,
                "semester": semester
            })
        return {"course": {"state": 1, "data": course_dict_list}}
    except requests.exceptions.ConnectionError:
        return {
            "course": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "course": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        traceback.print_exc()
        return {
            "course": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_detail(self):
    """Return ``{"detail": ...}`` with the student's profile fields.

    Basic fields come from the main table cells; sex and birthday come
    from the ``tabPage2`` section of the page.
    """
    try:
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            page = self.session.get(url=self.__detail)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        soup = BeautifulSoup(page.text, "html.parser")
        td = soup.find_all("td")
        html = etree.HTML(page.content.decode("utf-8"))
        td_page_2 = html.xpath("//div[@id='tabPage2']//td")
        detail = {
            "username": td[2].string,                        # student number
            "name": td[4].string,                            # name
            "sex": td_page_2[2].xpath(".//text()")[0],       # sex
            "grade": td[7].string,                           # grade
            "level": td[11].string,                          # degree level
            "department": td[15].string,                     # department
            "profession": td[19].string,                     # major
            "class": td[21].string,                          # class
            "campus": td[27].string,                         # campus
            "birthday": td_page_2[4].xpath(".//text()")[0],  # birthday
        }
        return {"detail": {"state": 1, "data": detail}}
    except requests.exceptions.ConnectionError:
        return {
            "detail": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "detail": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        return {
            "detail": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }
def get_all_score(self):
    """Return ``{"score": ...}`` with every course score row.

    The score page holds one ``<h4>`` title plus one ``<tbody>`` per
    semester (the first tbody is not score data); the semester label is
    cut from the title up to its "学期" suffix.
    """
    try:
        # Release the mutex in a finally: the old code leaked the lock
        # when the ce14 raise fired (the CrawlerException handler never
        # released it), and could double-release in the generic handler.
        self.mutex.acquire()
        try:
            page = self.session.get(url=self.__score)
            if page.status_code != 200:
                raise exceptions.CrawlerException("ce14:教育系统崩溃了,请稍后在尝试")
        finally:
            self.mutex.release()
        html = etree.HTML(page.content.decode("utf-8"))
        score_title = html.xpath("//h4/text()")
        # skip the first tbody: one tbody per semester follows it
        tbody_list = html.xpath("//tbody")[1:]
        score_dic_list = []
        title_num = 1
        for tbody in tbody_list:
            for tr in tbody.xpath(".//tr"):
                evaluation = tr.xpath(".//td")[-2].xpath("./text()")[0]
                point = tr.xpath(".//td")[-1].xpath("./text()")[0]
                score_dic = {
                    "semester": score_title[title_num]
                    [:score_title[title_num].find("学期") - 1],
                    "username": self.__username,
                    "course_code": tr.xpath(".//td[2]/text()")[0],
                    "course_name": tr.xpath(".//td[3]/text()")[0],
                    # keep the numeric value when present, otherwise the
                    # whitespace-stripped textual grade
                    "course_evaluation": re.findall("\d+.\d+", evaluation)[0]
                    if re.findall("\d+.\d+", evaluation)
                    else re.sub("\s+", "", evaluation),
                    "course_score": re.findall("\d+.\d+", point)[0]
                    if re.findall("\d+.\d+", point)
                    else re.sub("\s+", "", point)
                }
                score_dic_list.append(score_dic)
            title_num += 1
        return {"score": {"state": 1, "data": score_dic_list}}
    except requests.exceptions.ConnectionError:
        return {
            "score": {
                "state": -1,
                "error_code": "ce10",
                "reason": "学校服务器对你的请求没有响应,访问失败"
            }
        }
    except exceptions.CrawlerException as e:
        error = str(e).split(":")
        return {
            "score": {
                "state": -1,
                "error_code": error[0],
                "reason": error[1]
            }
        }
    except Exception as e:
        return {
            "score": {
                "state": -1,
                "error_code": "ce8",
                "reason": "其他错误:" + str(e)
            }
        }