def run(self):
    """Log in and grab the current student's course table.

    Stores the parsed courses on ``self.courses`` and returns them
    pretty-formatted.  Raises GrabError when the site redirects the
    student to the mandatory survey page instead of the timetable.
    """
    self._login()
    url_course = self.url_prefix + "xskbcx.aspx?xh=" + self.username
    r_course = requests.get(url_course, cookies=self.cookies)
    # The site shows a survey page instead of the timetable until the
    # student fills the questionnaire in.
    if u"调查问卷".encode(self.charset) in r_course.content:
        raise GrabError("无法抓取您的课程,请先填写教务网调查问卷。")
    # Parse only the course grid table to keep the soup small.
    strainer = SoupStrainer("table", id="xsgrid")
    soup = BeautifulSoup(r_course.content, parse_only=strainer)
    courses = []
    for row in soup.select("tr"):
        # Skip the data-grid header row.  Tag.has_key() was removed from
        # bs4, so attribute presence is tested via .get().
        if row.get('class') == ["datagridhead"]:
            continue
        cols = row.select("td")
        semester_text = cols[3].get_text(strip=True)
        time_texts = list(cols[4].stripped_strings)
        locations = list(cols[5].stripped_strings)
        lessons = self.get_lessons(time_texts, locations, semester_text)
        courses.append({
            'original_id': cols[0].get_text(strip=True),
            'name': cols[1].get_text(strip=True),
            'teacher': cols[2].get_text(strip=True),
            'lessons': lessons,
        })
    self.courses = courses
    return pretty_format(courses)
def run(self):
    """Log in and grab the current student's selected courses.

    Stores the parsed courses on ``self.courses`` and returns them
    pretty-formatted.
    """
    self._login()
    url_course = self.url_prefix + 'xkAction.do?actionType=6'
    r_course = requests.get(url_course, cookies=self.cookies)
    # HACK: the page marks its data rows with class="odd"; rewriting
    # 'class' to 'id' in the raw HTML lets us filter rows below via the
    # id attribute.
    soup = BeautifulSoup(r_course.content.replace('class', 'id'))
    # The course grid is the 8th <table> on the page.
    table = soup.find_all("table")[7]
    courses = []
    for r in table.select("tr"):
        # Keep data rows (originally class="odd") and attribute-less rows.
        # Tag.has_key() was removed from bs4; use .get() instead.
        row_id = r.get('id')
        if row_id is not None and row_id != "odd":
            continue
        cols = r.select("td")
        if not cols:
            continue
        location = (cols[15].get_text(strip=True) + ' ' +
                    cols[16].get_text(strip=True) + ' ' +
                    cols[17].get_text(strip=True))
        teacher = self.get_teachers(cols[7].get_text(strip=True))
        weeks_text = cols[11].get_text(strip=True)
        day_text = cols[12].get_text(strip=True)
        # Columns 13/14 give the starting period and the period count;
        # convert to an inclusive "start-end" range string.
        start = int(cols[13].get_text(strip=True))
        length = int(cols[14].get_text(strip=True))
        start_end_text = cols[13].get_text(strip=True) + '-' + str(start + length - 1)
        lessons = self.get_lessons(weeks_text, day_text, start_end_text,
                                   location)
        courses.append({
            'original_id': cols[1].get_text(strip=True),
            'name': cols[2].get_text(strip=True),
            'teacher': teacher,
            'lessons': lessons,
        })
    self.courses = courses
    return pretty_format(courses)
# NOTE(review): fragment of a BJTU csv->yaml converter.  The enclosing
# row loop, the `try:` matching the `except IOError:` below, and the
# definitions of `row`, `teacher`, `lesson`, `code_name`, `courses` and
# `prev_code_name` are outside this view -- the indentation below is
# reconstructed and should be confirmed against the full file.
            course = {
                'original_id': row[1],
                'name': row[2],
                'credit': str(float(row[4])),
                'teacher': teacher,
                'lessons': [lesson],
            }
            # Merge consecutive rows that describe the same course section
            # (same id, same teacher, same section code) into one course
            # accumulating all its lessons; otherwise start a new course.
            if courses:
                if (course['original_id'] == courses[-1]['original_id'] and
                        course['teacher'] == courses[-1]['teacher'] and
                        prev_code_name == code_name):
                    courses[-1]['lessons'].append(lesson)
                else:
                    courses.append(course)
            else:
                courses.append(course)
            prev_code_name = code_name
        #print courses
        total_courses = len(courses)
        print "Converted %d courses. Writing to yaml...\n" % total_courses
        if courses != []:
            with open(('bjtu.yaml'), 'w') as yaml_file:
                yaml_file.write(pretty_format(courses))
    except IOError:
        print "Cannot open data/bjtu.csv, exiting."
        exit()
def grab_all(self): self._local_setup() self.next_url = 'http://portal.ruc.edu.cn/cas/login?service=http%3A%2F%2Fportal.ruc.edu.cn%2Fidc%2Feducation%2Fselectcourses%2Fresultquery%2FResultQueryAction.do%3Fmethod%3DforwardAllQueryXkjg' self._login() r_cookies = requests.post(self.next_url, cookies=self.cookies, verify=False) content = r_cookies.content.decode(self.charset) self.cookies = r_cookies.cookies '''parser, start.''' ''' - get colleges''' strainer_colleges = SoupStrainer("select", id="condition_yx") soup_colleges = BeautifulSoup(r_cookies.content.decode('gbk'), parse_only=strainer_colleges) colleges = [option['value'] for option in soup_colleges.select("option") if option['value']] colleges_name = [option.get_text() for option in soup_colleges.select("option") if option['value']] pretty_print(colleges_name) print "{0} colleges.".format(len(colleges)) ''' - iter colleges''' total_courses = 0 for i, college in enumerate(colleges): courses = [] url_courses = 'http://portal.ruc.edu.cn/idc/education/selectcourses/resultquery/ResultQueryAction.do' '''get courses''' for j in range(1, 15): data = { 'method': "allJxb", 'condition_xnd': "2012-2013", 'condition_xq': "1", 'condition_yx': college.encode('gbk'), 'isNeedInitSQL': "true", 'ksj1': j, 'ksj2': j, } r_courses = requests.post(url_courses, data=data, cookies=self.cookies) content = r_courses.content.decode('gbk') soup_courses = BeautifulSoup(content) rows = soup_courses.find_all("row") if len(rows) == 1: continue for r in rows: teacher = r.select("xm")[0].get_text(strip=True).replace('/', ',') time_and_location_texts = r.select("sksj > tagbr") lessons = self.get_lessons(time_and_location_texts) course = { 'original_id': r.select("jxbh")[0].get_text(strip=True), 'name': r.select("kcmc")[0].get_text(strip=True), 'credit': str(float(r.select("xf")[0].get_text(strip=True))), 'teacher': teacher, 'lessons': lessons, } courses.append(course) print "#{0} {1}: {2} courses.".format(i, colleges_name[i].encode('utf8'), 
len(courses)) if len(courses) == 0: continue total_courses += len(courses) output_dir = os.path.join(os.path.dirname(__file__), 'ruc') if not os.path.exists(output_dir): os.makedirs(output_dir) if courses != []: with open(os.path.join(output_dir, colleges_name[i] + '.yaml'), 'w') as yaml_file: yaml_file.write(pretty_format(courses)) print "Done! Totally exported {0} courses.".format(total_courses)
def grab_all(self): # self._local_setup() # self._login() self._fake_login() url_courses = self.url_prefix + "jxrw_zd.aspx?xh=" + self.username '''get viewstate''' r_viewstate = requests.get(url_courses, cookies=self.cookies) result = re.search( '<input type="hidden" name="__VIEWSTATE" value="(.+)" />', r_viewstate.content) viewstate = result.group(1) print "Get viewstate: done." '''parser, start.''' ''' - get colleges''' strainer_colleges = SoupStrainer("select", id="ddlXY") soup_colleges = BeautifulSoup(r_viewstate.content.decode(self.charset), parse_only=strainer_colleges) colleges = [ option['value'] for option in soup_colleges.select("option") if option['value'] ] pretty_print(colleges) print "{} colleges.".format(len(colleges)) ''' - iter colleges''' total_courses = 0 for i, college in enumerate(colleges): '''get courses''' data = { '__EVENTTARGET': "", '__EVENTARGUMENT': "", '__VIEWSTATE': viewstate, 'ddlXN': "2012-2013", 'ddlXQ': "1", 'ddlXY': college.encode(self.charset), 'ddlZY': "", 'ddlKC': "", 'btnFilter': u' 查 询 '.encode(self.charset), } r_courses = requests.post(url_courses, data=data, cookies=self.cookies) content = r_courses.content.decode(self.charset) strainer_courses = SoupStrainer("table", id="DBGrid") soup_courses = BeautifulSoup(content, parse_only=strainer_courses) rows = soup_courses.select("tr") courses = [] for r in rows: if r.has_key('class') and r['class'] == ["datagridhead"]: continue cols = r.select("td") semester_text = cols[0].get_text(strip=True) teacher = cols[7].get_text(strip=True).replace('/', ',') time_texts = map(string.strip, cols[8].get_text().split(';')) locations = map(string.strip, cols[9].get_text().split(';')) lessons = self.get_lessons(time_texts, locations, semester_text) course = { 'original_id': cols[3].get_text(strip=True), 'name': cols[4].get_text(strip=True), 'credit': float(cols[6].get_text(strip=True)), 'teacher': teacher, 'lessons': lessons, } courses.append(course) print "#{} {}: {} courses.".format(i, 
college.encode("utf8"), len(courses)) total_courses += len(courses) output_dir = os.path.join(os.path.dirname(__file__), 'zju') if not os.path.exists(output_dir): os.makedirs(output_dir) with open(os.path.join(output_dir, str(i) + '.yaml'), 'w') as yaml_file: yaml_file.write(pretty_format(courses)) # with open(os.path.join(output_dir, str(i) + '.html'), 'w') as html_file: # html_file.write(soup_courses.prettify().encode("utf8")) print "Done! Totally exported {} courses.".format(total_courses)
def grab_all(self): self._local_setup() self.next_url = 'http://portal.ruc.edu.cn/cas/login?service=http%3A%2F%2Fportal.ruc.edu.cn%2Fidc%2Feducation%2Fselectcourses%2Fresultquery%2FResultQueryAction.do%3Fmethod%3DforwardAllQueryXkjg' self._login() r_cookies = requests.post(self.next_url, cookies=self.cookies, verify=False) content = r_cookies.content.decode(self.charset) self.cookies = r_cookies.cookies '''parser, start.''' ''' - get colleges''' strainer_colleges = SoupStrainer("select", id="condition_yx") soup_colleges = BeautifulSoup(r_cookies.content.decode('gbk'), parse_only=strainer_colleges) colleges = [ option['value'] for option in soup_colleges.select("option") if option['value'] ] colleges_name = [ option.get_text() for option in soup_colleges.select("option") if option['value'] ] pretty_print(colleges_name) print "{0} colleges.".format(len(colleges)) ''' - iter colleges''' total_courses = 0 for i, college in enumerate(colleges): courses = [] url_courses = 'http://portal.ruc.edu.cn/idc/education/selectcourses/resultquery/ResultQueryAction.do' '''get courses''' for j in range(1, 15): data = { 'method': "allJxb", 'condition_xnd': "2012-2013", 'condition_xq': "1", 'condition_yx': college.encode('gbk'), 'isNeedInitSQL': "true", 'ksj1': j, 'ksj2': j, } r_courses = requests.post(url_courses, data=data, cookies=self.cookies) content = r_courses.content.decode('gbk') soup_courses = BeautifulSoup(content) rows = soup_courses.find_all("row") if len(rows) == 1: continue for r in rows: teacher = r.select("xm")[0].get_text(strip=True).replace( '/', ',') time_and_location_texts = r.select("sksj > tagbr") lessons = self.get_lessons(time_and_location_texts) course = { 'original_id': r.select("jxbh")[0].get_text(strip=True), 'name': r.select("kcmc")[0].get_text(strip=True), 'credit': str(float(r.select("xf")[0].get_text(strip=True))), 'teacher': teacher, 'lessons': lessons, } courses.append(course) print "#{0} {1}: {2} courses.".format( i, colleges_name[i].encode('utf8'), 
len(courses)) if len(courses) == 0: continue total_courses += len(courses) output_dir = os.path.join(os.path.dirname(__file__), 'ruc') if not os.path.exists(output_dir): os.makedirs(output_dir) if courses != []: with open(os.path.join(output_dir, colleges_name[i] + '.yaml'), 'w') as yaml_file: yaml_file.write(pretty_format(courses)) print "Done! Totally exported {0} courses.".format(total_courses)
def grab_all(self): self._local_setup() self._login() url_courses = self.url_prefix + 'courseSearchAction.do?temp=1' '''get TOKEN''' r_viewstate = requests.get(url_courses, cookies=self.cookies) result = re.search('<input type="hidden" name="org.apache.struts.taglib.html.TOKEN" value="(.+)">', r_viewstate.content) TOKEN = result.group(1) print "Get TOKEN: done." '''parser, start.''' ''' - get colleges''' strainer_colleges = SoupStrainer('select', id="xsjc") soup_colleges = BeautifulSoup(r_viewstate.content.decode('gbk').replace('name', 'id'), parse_only=strainer_colleges) colleges = [option['value'] for option in soup_colleges.select("option") if option['value']] pretty_print(colleges) print "{0} colleges.".format(len(colleges)) ''' - iter colleges''' url_courses = self.url_prefix + 'courseSearchAction.do' total_courses = 0 for i, college in enumerate(colleges): '''get courses''' showColumn = [u'kch#课程号'.encode('gbk'), u'kcm#课程名'.encode('gbk'), u'xf#学分'.encode('gbk'), u'skjs#教师'.encode('gbk'), u'zcsm#周次'.encode('gbk'), u'skxq#星期'.encode('gbk'), u'skjc#节次'.encode('gbk'), u'xqm#校区'.encode('gbk'), u'jxlm#教学楼'.encode('gbk'), u'jasm#教室'.encode('gbk'), u'kxh#课序号'.encode('gbk')] data = { 'org.apache.struts.taglib.html.TOKEN': TOKEN.encode('gbk'), 'pageNumber': "0".encode('gbk'), 'actionType': "1".encode('gbk'), 'xsjc': college.encode('gbk'), 'pageSize': '1000'.encode('gbk'), 'showColumn': showColumn, } r_courses = requests.post(url_courses, data=data, cookies=self.cookies) content = r_courses.content.decode('gbk') strainer_courses = SoupStrainer("table", id="titleTop2") soup_courses = BeautifulSoup(content.replace('class', 'id'), parse_only=strainer_courses) rows = soup_courses.select("tr") prev_code_name = '-1' courses = [] for r in rows: if not r.has_key('id'): continue cols = r.select("td") try: test_text = cols[0].get_text(strip=True) except: break teacher = self.get_teachers(cols[3].get_text(strip=True)) weeks_text = cols[4].get_text(strip=True) day_text = 
cols[5].get_text(strip=True) start_end_text = cols[6].get_text(strip=True) location = cols[7].get_text(strip=True) + ' ' + cols[8].get_text(strip=True) + ' ' + cols[9].get_text(strip=True) lessons = self.get_lessons(weeks_text, day_text, start_end_text, location) code_name = cols[10].get_text(strip=True) course = { 'original_id': cols[0].get_text(strip=True), 'name': cols[1].get_text(strip=True), 'credit': str(float(cols[2].get_text(strip=True).replace(' ', ''))), 'teacher': teacher, 'lessons': lessons, } try: last_course = courses.pop() except: pass else: if course['original_id'] == last_course['original_id'] and course['teacher'] == last_course['teacher'] and prev_code_name == code_name: course['lessons'] = course['lessons'] + last_course['lessons'] else: courses.append(last_course) prev_code_name = code_name courses.append(course) print "#{0} {1}: {2} courses.".format(i, college.encode("utf8"), len(courses)) total_courses += len(courses) output_dir = os.path.join(os.path.dirname(__file__), 'bupt') if not os.path.exists(output_dir): os.makedirs(output_dir) if courses != []: with open(os.path.join(output_dir, str(i) + '.yaml'), 'w') as yaml_file: yaml_file.write(pretty_format(courses)) print "Done! Totally exported {0} courses.".format(total_courses)
def grab_all(self): # self._local_setup() # self._login() self._fake_login() url_courses = self.url_prefix + "jxrw_zd.aspx?xh=" + self.username '''get viewstate''' r_viewstate = requests.get(url_courses, cookies=self.cookies) result = re.search('<input type="hidden" name="__VIEWSTATE" value="(.+)" />', r_viewstate.content) viewstate = result.group(1) print "Get viewstate: done." '''parser, start.''' ''' - get colleges''' strainer_colleges = SoupStrainer("select", id="ddlXY") soup_colleges = BeautifulSoup(r_viewstate.content.decode(self.charset), parse_only=strainer_colleges) colleges = [option['value'] for option in soup_colleges.select("option") if option['value']] pretty_print(colleges) print "{} colleges.".format(len(colleges)) ''' - iter colleges''' total_courses = 0 for i, college in enumerate(colleges): '''get courses''' data = { '__EVENTTARGET': "", '__EVENTARGUMENT': "", '__VIEWSTATE': viewstate, 'ddlXN': "2012-2013", 'ddlXQ': "1", 'ddlXY': college.encode(self.charset), 'ddlZY': "", 'ddlKC': "", 'btnFilter': u' 查 询 '.encode(self.charset), } r_courses = requests.post(url_courses, data=data, cookies=self.cookies) content = r_courses.content.decode(self.charset) strainer_courses = SoupStrainer("table", id="DBGrid") soup_courses = BeautifulSoup(content, parse_only=strainer_courses) rows = soup_courses.select("tr") courses = [] for r in rows: if r.has_key('class') and r['class'] == ["datagridhead"]: continue cols = r.select("td") semester_text = cols[0].get_text(strip=True) teacher = cols[7].get_text(strip=True).replace('/', ',') time_texts = map(string.strip, cols[8].get_text().split(';')) locations = map(string.strip, cols[9].get_text().split(';')) lessons = self.get_lessons(time_texts, locations, semester_text) course = { 'original_id': cols[3].get_text(strip=True), 'name': cols[4].get_text(strip=True), 'credit': float(cols[6].get_text(strip=True)), 'teacher': teacher, 'lessons': lessons, } courses.append(course) print "#{} {}: {} courses.".format(i, 
college.encode("utf8"), len(courses)) total_courses += len(courses) output_dir = os.path.join(os.path.dirname(__file__), 'zju') if not os.path.exists(output_dir): os.makedirs(output_dir) with open(os.path.join(output_dir, str(i) + '.yaml'), 'w') as yaml_file: yaml_file.write(pretty_format(courses)) # with open(os.path.join(output_dir, str(i) + '.html'), 'w') as html_file: # html_file.write(soup_courses.prettify().encode("utf8")) print "Done! Totally exported {} courses.".format(total_courses)