async def get_pages(self, item):
    """Request page 1 of the test list and schedule one task per page.

    ``item.data`` is read as a format template with two positional
    slots; they are filled with the page number and a fresh
    ``random.random()`` value. The page-1 payload is sent first, and the
    JSON reply is used to decide how many ``get_questions`` tasks to
    schedule.

    On any failure (no response, empty body, or a reply whose ``info`` /
    ``status`` fields are not ``'success'`` / ``1``) the same item is
    handed back to ``add_task('get_pages', ...)`` and nothing else is
    scheduled.

    Args:
        item: Request descriptor; this method reads and writes
            ``item.data`` and reads ``item.info``.

    Returns:
        None in every case; results are communicated via ``add_task``.
    """
    template = item.data
    item.data = template.format(1, random.random())
    logging.info('get_pages: ' + item.data)

    def _requeue():
        # Restore the unformatted template first: a retry re-enters this
        # method and reads item.data as the template again. If the page-1
        # payload were left in place, every later page task would be
        # built from a string with no slots and would request page 1.
        item.data = template
        self.add_task('get_pages', item)

    resp = await self.async_web_request(item)
    if resp is None:
        _requeue()
        return None

    html_string = sync_text(resp)
    if not html_string:
        _requeue()
        return None

    js = json.loads(html_string)
    if js['info'] != 'success' or js['status'] != 1:
        # Both values must be arguments of format(); passing the JSON dump
        # to logging.error instead left the second slot unfilled and
        # raised IndexError before the item could be re-queued.
        logging.error('[get_pages]: {}\n{}'.format(
            item.data, json.dumps(js, ensure_ascii=False)))
        _requeue()
        return None

    # NOTE(review): js['data'][1] is presumably the total number of hits
    # with 20 per page; if so, a partial last page is not scheduled by
    # this floor division -- confirm against the API before changing.
    for page_num in range(1, int(js['data'][1]) // 20 + 1):
        qs_item = Item(
            dict(
                method='POST',
                url='http://www.wln100.com/Home/Index/getTestList.html',
                data=template.format(page_num, random.random()),
                info=item.info,
                headers=headers,
            ))
        self.add_task('get_questions', qs_item)
async def get_questions(self, item):
    """Fetch one page of the test list, store it, and schedule answers.

    Does nothing once ``self.no_new_question`` is greater than 5.
    Otherwise the request described by ``item`` is sent; on failure (no
    response, empty body, or a reply whose ``info`` / ``status`` fields
    are not ``'success'`` / ``1``) the item is re-queued via
    ``add_task('get_questions', ...)``.

    On success the whole reply is passed to ``save_question`` and a
    ``get_answer`` task is scheduled for every entry of ``js['data'][0]``
    whose ``testid`` is not reported by ``is_archived``. When no entry
    on the page was new, ``self.no_new_question`` is incremented.

    Args:
        item: Request descriptor; ``item.data``, ``item.info`` and
            ``item.json()`` are read.

    Returns:
        None in every case.
    """
    if self.no_new_question > 5:
        return None

    logging.info('get_questions: ' + item.data)
    resp = await self.async_web_request(item)
    if resp is None:
        self.add_task('get_questions', item)
        return None

    html_string = sync_text(resp)
    if not html_string:
        self.add_task('get_questions', item)
        return None

    js = json.loads(html_string)
    if js['info'] != 'success' or js['status'] != 1:
        # Label the message with this method's name so failures are not
        # attributed to get_pages in the log.
        logging.error('[get_questions]: {}'.format(item.data))
        self.add_task('get_questions', item)
        return None

    save_question(js, item.info, json.dumps(item.json(), ensure_ascii=False))

    no_new = True
    for qs in js['data'][0]:
        if is_archived(qs['testid']):
            continue
        no_new = False
        as_item = Item(
            dict(
                method='POST',
                url='http://www.wln100.com/Home/Index/getOneTestById.html',
                data='id={}&width=500&s={}'.format(
                    qs['testid'], random.random()),
                headers=headers,
                info=item.info,
                cookies=self.cookies,
            ))
        # testid travels with the task as an extra argument to get_answer.
        self.add_task('get_answer', as_item, qs['testid'])

    if no_new:
        self.no_new_question += 1
async def get_questions(self, item):
    """Fetch one page of questions through a proxy and store new ones.

    Does nothing once ``self.no_new_question`` is greater than 5.
    Otherwise ``item`` is given a proxy from ``_proxy.get()``, a retry
    limit of 2, a 10 second timeout and ``self.cookies`` before the
    request is sent.

    On failure (no response or empty body) a new proxy is assigned and
    the item is re-queued. When the reply itself reports an error
    (``error_code`` not 0 or ``success`` not equal to True) the item is
    re-queued as well, the current account ``en_accounts[self.u]`` is
    flagged with ``'block'`` and ``self.login17()`` is called.

    On success every sub-question under
    ``js['data']['questions'][*]['qs']`` that ``is_archived`` does not
    report is passed to ``save_html``. When nothing on the page was new,
    ``self.no_new_question`` is incremented.

    Args:
        item: Request descriptor; proxy, retry, timeout and cookie
            attributes are written, ``item.data``, ``item.info`` and
            ``item.json()`` are read.

    Returns:
        None in every case.
    """
    if self.no_new_question > 5:
        return None

    item.proxy = 'http://' + _proxy.get()
    item.max_retry = 2
    item.timeout = 10
    item.cookies = self.cookies
    logging.info('get_questions: ' + item.data)

    resp = await self.async_web_request(item)
    if resp is None:
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_questions', item)
        return None

    html_string = sync_text(resp)
    if not html_string:
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_questions', item)
        return None

    js = json.loads(html_string)
    # Equality (not identity) with True is kept on purpose so that the
    # accepted values stay exactly what they were.
    if js['error_code'] != 0 or js['success'] != True:  # noqa: E712
        # Label the message with this method's name so failures are not
        # attributed to get_pages in the log.
        logging.error('[get_questions]: {}\n{}'.format(item.data, js))
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_questions', item)
        en_accounts[self.u]['block'] = True
        self.login17()
        return None

    no_new = True
    for qs in js['data']['questions']:
        for s_qs in qs['qs']:
            _id = s_qs['_id']
            if is_archived('17zuoye_qs_' + _id):
                continue
            no_new = False
            save_html(s_qs, item.info,
                      json.dumps(item.json(), ensure_ascii=False))

    if no_new:
        self.no_new_question += 1
async def get_pages(self, item):
    """Request the question search once and schedule a task per page.

    ``item`` is given a proxy from ``_proxy.get()``, a retry limit of 2,
    a 10 second timeout and ``self.cookies`` before the request is sent.

    On failure (no response or empty body) a new proxy is assigned and
    the item is re-queued via ``add_task('get_pages', ...)``. When the
    reply itself reports an error (``error_code`` not 0 or ``success``
    not equal to True) the item is re-queued as well, the current
    account ``en_accounts[self.u]`` is flagged with ``'block'`` and
    ``self.login17()`` is called.

    On success one ``get_questions`` task is scheduled for each page
    number in ``range(1, page_count)``, where ``page_count`` comes from
    ``js['data']['page_count']``.

    Args:
        item: Request descriptor; proxy, retry, timeout and cookie
            attributes are written, ``item.data`` and ``item.info`` are
            read.

    Returns:
        None in every case.
    """
    item.proxy = 'http://' + _proxy.get()
    item.max_retry = 2
    item.timeout = 10
    item.cookies = self.cookies
    logging.info('get_pages: ' + item.data)

    resp = await self.async_web_request(item)
    if resp is None:
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_pages', item)
        return None

    html_string = sync_text(resp)
    if not html_string:
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_pages', item)
        return None

    js = json.loads(html_string)
    # Equality (not identity) with True is kept on purpose so that the
    # accepted values stay exactly what they were.
    if js['error_code'] != 0 or js['success'] != True:  # noqa: E712
        # Both values must be arguments of format(); passing the JSON dump
        # to logging.error instead left the second slot unfilled and
        # raised IndexError before the retry and re-login could happen.
        logging.error('[get_pages]: {}\n{}'.format(
            item.data, json.dumps(js, ensure_ascii=False)))
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_pages', item)
        en_accounts[self.u]['block'] = True
        self.login17()
        return None

    # NOTE(review): the upper bound excludes page_count itself; whether
    # that skips the last page depends on how the API numbers pages --
    # confirm before changing.
    for page_num in range(1, int(js['data']['page_count'])):
        qs_item = Item(
            dict(
                method='POST',
                url='http://zx.17zuoye.com/teacher/assign/searchQuestions',
                data='book_id=BK_20300001489009&lesson_id=BKC_20300076895304&page={}'
                .format(page_num),
                headers=headers,
                cookies=self.cookies,
                info=item.info,
                max_retry=2,
                timeout=10,
            ))
        self.add_task('get_questions', qs_item)
async def get_answer(self, item, testid):
    """Fetch the answer for one test and store it.

    The request described by ``item`` is sent; on failure (no response,
    empty body, or a reply whose ``info`` / ``status`` fields are not
    ``'success'`` / ``1``) the task is re-queued via
    ``add_task('get_answer', item, testid)``. On success the reply is
    passed to ``save_answer`` together with ``item.info``, the
    serialized item and ``testid``.

    Args:
        item: Request descriptor; ``item.data``, ``item.info`` and
            ``item.json()`` are read.
        testid: Identifier of the test, forwarded to ``save_answer``.

    Returns:
        None in every case.
    """

    def _retry():
        # testid is a required parameter of this method, so it has to be
        # forwarded with the retry just as it is when the task is first
        # scheduled; without it the retried task cannot be invoked.
        self.add_task('get_answer', item, testid)

    logging.info('get_answer: ' + item.data)
    resp = await self.async_web_request(item)
    if resp is None:
        _retry()
        return None

    html_string = sync_text(resp)
    if not html_string:
        _retry()
        return None

    js = json.loads(html_string)
    if js['info'] != 'success' or js['status'] != 1:
        # Label the message with this method's name so failures are not
        # attributed to get_pages in the log.
        logging.error('[get_answer]: {}'.format(item.data))
        _retry()
        return None

    save_answer(js, item.info,
                json.dumps(item.json(), ensure_ascii=False), testid)