-
Notifications
You must be signed in to change notification settings - Fork 0
/
course.py
executable file
·285 lines (262 loc) · 12.7 KB
/
course.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# encoding=utf-8
"""
@author: hxer
@contact: hxer7963@gmail.com
@time: 2017/4/21 23:08
"""
import sys, re
from collections import namedtuple
from lxml import html
from time import perf_counter as pc
from request import date, request
from xls_data_process import xls_search_answer, json_extract
def main_page():
index_url = '/student/index.do?{}&'.format(date()) # 作为之后登陆的Referer
text = request(index_url).text
tree = html.fromstring(text)
user_info = tree.xpath('//div[@class="bottom"]')[0].text.split()[:-1]
print(' '.join(user_info))
course_list = []
list_course = tree.xpath('//ul[@class="subNav"]')[0]
for node in list_course.xpath('./li'):
course_homepage_href = node.get('onclick').split("'")[1]
course_name = node.xpath('./span/@title')[0]
course_list.append((course_name, course_homepage_href))
print('您有{}门试题库: {}'.format(len(course_list), ', '.join([course_name for course_name, _ in course_list])))
return course_list
def course(name, course_homepage_href):
try:
learning_url, bbs_url = courses_homepage(name, course_homepage_href)
except TimeoutError as te:
print(te)
return None
course_id = course_homepage_href.split('&')[-1].split('=')[-1]
teaching_task_id = learning_url.split('&')[-1].split('=')[-1]
# 获取交流界面,查看是否需要评论
forum_id, discuss_cnt, bbs_hrefs = bbs_page(bbs_url, name)
if discuss_cnt > 0 and bbs_hrefs: # disuss
bbs_task(name, teaching_task_id, course_id, forum_id, discuss_cnt, bbs_hrefs)
# 点击进入学习->获取课程任务页面:题库下载、自测、考试
resource_url, exam_time, task_entities = task_assignment(name, learning_url, teaching_task_id, forum_id) # 返回set,存储各测试的url
qst_entities, course_abstract = extract_qst_entities(name, exam_time, task_entities)
if qst_entities:
abstract = manage_assignment(name, resource_url, qst_entities)
course_abstract.append(abstract)
return '\n'.join([_ for _ in course_abstract])
def manage_assignment(name, resource_url, qst_entities):
""" Firstly, Download the excel resource and decompress to dict."""
from xls_data_process import excel_dict
xls_dict = excel_dict(resource_url) # 获取excel文件hash title存储到dict中
if not xls_dict:
return '《%s》课程excel文件获取失败!'
" each coroutine execute 10 requests"
MAX_WORKERS = max(2, len(qst_entities)//30 + 1)
from concurrent import futures
from time import perf_counter as pc
t0 = pc()
with futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
to_do_list = []
for qst in qst_entities:
future = executor.submit(trackle_qst, xls_dict, qst)
to_do_list.append(future)
from tqdm import tqdm
done_iter = tqdm(futures.as_completed(to_do_list), total=len(to_do_list), ncols=90)
result = []
for future in done_iter:
res = future.result()
if res:
result.append(res)
msg = '《{}》已完成,用时{:.2f}s;'.format(name, pc()-t0)
print(msg, '\n未保存的题数有:', len(result))
return msg
def extract_qst_entities(name, exam_time, tasks_entities):
qst_entities, abstract, qst_cnt = [], [], 0
for category, task_url in tasks_entities: # 使用多线程,assignment_document请求document,manage_exam中请求xhr
try:
entities = assignment_document(name, task_url, exam_time) # 请求任务document
except ValueError as ve:
print(ve)
else:
unimcompleted_cnt = len(entities)
if unimcompleted_cnt == 0:
msg = '《{}》{}已完成,但*未提交*'.format(name, category)
print(msg)
abstract.append(msg)
else:
qst_entities.extend(entities)
qst_cnt += unimcompleted_cnt
if qst_cnt > 0:
print('==>《{}》课程共有{}道题未写:-)'.format(name, qst_cnt))
return qst_entities, abstract
def courses_homepage(name, course_url):
""" check the time and return the task_homepage with the bbs discuss href"""
texts = request(course_url).text
homepage_closed = '课程开始时间:(.*?),暂时不能进入学习!'
t0 = re.search(homepage_closed, texts)
if t0:
raise TimeoutError('《{}》课程开始时间:{}, 暂时不能进入学习'.format(name, t0.group(1)))
# raise TimeoutError('%s课程开始时间:%r,暂时不能进入学习', % (name, begin_time.group(1)))
if texts.find('课程已结束') != -1:
raise TimeoutError('{}课程已结束,不能进入学习'.format(name))
homepage = html.fromstring(texts)
tasks = homepage.xpath('//li[@class="item "]') # //table[@id="insideNav"]
task_homepage = tasks[0].get('onclick').split("'")[1]
bbs = tasks[-1].get('onclick').split("'")[1]
return task_homepage, bbs # 进入学习 and 交流 页面
def extract_bbs(href):
text = request(href).text
tree = html.fromstring(text)
title = tree.xpath('//div[@class="assignment-head"]/text()')
content = tree.xpath('//div[@class="BbsContent"]/span//text()')
if not content:
content = title[0].strip()
else:
content = '\n'.join([stm.strip() for stm in content])
return (title[0].strip(), content)
# 请求“交流”页面,同时获取高跟帖问题
def bbs_page(bbs_url, name):
text = request(bbs_url).text
tree = html.fromstring(text)
user_info = tree.xpath('//div[@class="bottom"]')[0].text.split()[:-1][-1]
questions = tree.xpath('//tr[@class="a"]')
entities = []
for question in questions:
try:
qst_href = question.xpath('td/a/@href')[0].strip()
except IndexError: # maybe there is a XSS reflect
continue
comments = int([item.strip() for item in question.xpath('td/text()') if item.strip()][1])
entities.append((comments, qst_href))
entities.sort()
user_name_num = len(re.findall(user_info, text))
discuss_cnt = 3 - user_name_num # 还需评论数
forum_id = '&forumId=(.*?)&' # forumId=
forum_id = re.search(forum_id, text).group(1)
if len(entities) > 13:
entities = entities[-13:]
hrefs = [href for _, href in entities]
return forum_id, discuss_cnt, hrefs
def bbs_task(name, teaching_task_id, course_id, forum_id, dis_cnt, qst_href): # 提问题
from random import choice
hrefs = [choice(qst_href) for _ in range(dis_cnt)]
qst = [extract_bbs(href) for href in hrefs]
print('《{}》课程讨论问题:'.format(name))
for i in range(dis_cnt):
post_url = '/student/bbs/manageDiscuss.do?{}&method=toAdd&teachingTaskId={}&forumId={}&' \
'isModerator=false'.format(date(), teaching_task_id, forum_id)
request(post_url) # 获取发布讨论的网页
title, content = qst[i][0], qst[i][1].replace('\n', '')
data = {
'forumId': forum_id,
'teachingTaskId': teaching_task_id,
'isModerator': 'false',
'topic': title,
'content': content,
}
dispatch = '/student/bbs/manageDiscuss.do?{}&method=add'.format(date())
request(dispatch, data) # 这是一个302转发
# bbs_index = '/student/bbs/index.do?teachingTaskId={}&forumId={}&{}&'\
# .format(teaching_task_id, course_id, date()) # 直接添加就行
# request(bbs_index)
# print(type(content), content)
# if len(content.decode('utf8')) > 50:
# content = content.decode('utf-8')[:30] + '...' + content.decode('utf-8')[-15:] + '(omitted!)'
print('\t标题: {}\t内容: {}'.format(title, content))
return forum_id
def task_assignment(name, task_homepage, teaching_task_id, forum_id): # 获取自测url
texts = request(task_homepage).text # 进入主页面
tree = html.fromstring(texts)
cur_state_idx = [_.strip() for _ in tree.xpath('//tr[@class="bg_tr"]/th/text()') if _.strip()].index('当前状态')
tr = tree.xpath('//tr[@class="a"]')
tasks_entities = []
for task in tr:
category = task.xpath('.//td[@title]')[0].text.strip()
if category == '题库下载':
resource_url = task.xpath('.//a/@href')[0]
else:
if category == '考试':
exam_time = task.xpath('./td[4]')[0].text.strip()
cur_state = [_.strip() for _ in task.xpath('./td//text()') if _.strip()]
if '分' in cur_state[cur_state_idx]:
print('《%s》%s %s ^_^' % (name, cur_state[1], cur_state[cur_state_idx]))
continue
task_url = task.xpath('.//a/@href')[0] # 没有完成任务则给出链接
tasks_entities.append((category, task_url))
return resource_url, exam_time, tasks_entities # 答题的链接
# examReplyId examId teachingTaskId在不同的自测任务中不同。exercises_id为list, 每道题不同.
constant = namedtuple('Constant', 'date_time examReplyId examId teachingTaskId exam exercise_id idx')
def assignment_document(name, task_url, exam_time): # 获取没有加载xhr题目的页面,得到xhr的参数
text = request(task_url).text
alert = '任务未开始'
if re.search(alert, text):
raise ValueError('《{}》课程*考试*尚未开始, 截止日期: {}'.format(name, exam_time))
# xpath to get examReplyId examId teachingTaskId in Form
tree = html.fromstring(text)
fmt = '/student/exam/manageExam.do?(.*?)&method=getExerciseInfo'
date_time = re.search(fmt, text).group(1)[1:]
form_input = tree.xpath('//*[@id="saveAnswerForm"]')[0]
exam_reply_id = form_input.xpath('//*[@id="examReplyId"]/@value')[0]
exam_id = form_input.xpath('//*[@id="examId"]/@value')[0]
teaching_task_id = form_input.xpath('//*[@id="teachingTaskId"]/@value')[0]
fmt = '"complete":false,"examStudentExerciseId":(.*?),"exerciseId":(.*?),"index":(.*?)}' #
const = [ constant(date_time, exam_reply_id, exam_id, teaching_task_id, exam, exercise_id, idx) for exam, exercise_id, idx in re.findall(fmt, text) ]
return const
def xhr_question(date_time, reply_id, student_exercise_id, exercise_id):
url = '/student/exam/manageExam.do'
data = {
str(date_time): '',
'method': 'getExerciseInfo',
'examReplyId': reply_id,
'exerciseId': exercise_id,
'examStudentExerciseId': student_exercise_id,
}
return request(url, data).json()
# assert 'optionsC' in r.json()
def trackle_qst(xls_dict, qst):
date_time, exam_reply_id, examId, teachingTaskId, examStudentExerciseId, exerciseId, idx = qst
data = {
'examReplyId': exam_reply_id,
'examStudentExerciseId': '',
'exerciseId': '',
'examId': examId,
'teachingTaskId': teachingTaskId,
'content': '',
}
json = xhr_question(date_time, exam_reply_id, examStudentExerciseId, exerciseId)
try:
title, options_answers, category = json_extract(json)
answer = xls_search_answer(xls_dict, title, options_answers, category)
except (ValueError, TypeError) as exc: # 没有找打就不保存题目,直接下一题
return idx
else:
data['examStudentExerciseId'] = examStudentExerciseId
data['exerciseId'] = exerciseId
# print('答案为:', ' '.join(answer))
# print('正在保存...', end=' ')
sys.stdout.flush()
types = json['type']
save_answer(types, answer, data)
return None
def save_answer(types, answers_list, data):
"""
duoxAnswer:A,B,C DXanswer:B DuoXanswerA:A
DuoXanswerB:B DuoXanswerC:C
"""
data_copy = data.copy()
if types == 1:
prefix = 'DXanswer'
elif types == 4:
prefix = 'DuoXanswer{}'
data_copy['duoxAnswer'] = ','.join(answers_list) # 考试时,多选题多了一个Json数据
else: # 好像多了一个单选A DXanswer: A
prefix = 'PDanswer'
if types != 4:
data_copy[prefix] = answers_list[0] # answers_list[0]
else:
data_copy['duoxAnswer'] = ','.join(answers_list)
for answer in answers_list:
key = prefix.format(answer)
data_copy[key] = 'true' # 切莫用data,因为传过来的data是按址传参的...
hand_url = '/student/exam/manageExam.do?i%s&method=saveAnswer' % date()
response = request(hand_url, data_copy) # response "status": "ok"
return response.json() # AssertionError