async def get_pages(self, item):
    """Request page 1 of a listing and queue a ``get_questions`` task per page.

    ``item.data`` is used as a template with two ``{}`` placeholders: the
    page number and a random value. The first page is fetched to read the
    value at ``js['data'][1]``, from which the number of page tasks is
    derived.

    On any failure (no response, empty body, unsuccessful payload) the item
    is re-queued under ``get_pages`` and ``None`` is returned.
    """
    template = item.data

    def _requeue():
        # Put the unformatted template back before retrying: the retry runs
        # this method again and must still find the ``{}`` placeholders,
        # otherwise every generated page request would ask for page 1.
        item.data = template
        self.add_task('get_pages', item)

    item.data = template.format(1, random.random())
    logging.info('get_pages: ' + item.data)

    resp = await self.async_web_request(item)
    if resp is None:
        _requeue()
        return None

    html_string = sync_text(resp)
    if not html_string:
        _requeue()
        return None

    js = json.loads(html_string)
    if js['info'] != 'success' or js['status'] != 1:
        # Both values belong inside ``format``; previously the payload was
        # passed to ``logging.error`` instead, so ``format`` raised
        # IndexError and the retry below never happened.
        logging.error('[get_pages]: {}\n{}'.format(
            item.data, json.dumps(js, ensure_ascii=False)))
        _requeue()
        return None

    # NOTE(review): if js['data'][1] is a total row count at 20 rows per
    # page, floor division skips a trailing partial page -- confirm.
    for page_num in range(1, int(js['data'][1]) // 20 + 1):
        qs_item = Item(
            dict(
                method='POST',
                url='http://www.wln100.com/Home/Index/getTestList.html',
                data=template.format(page_num, random.random()),
                info=item.info,
                headers=headers,
            ))
        self.add_task('get_questions', qs_item)
def make_qs_item(qid):
    """Return a GET request item for the wln100 test page of ``qid``.

    The item carries its own copy of ``HEADERS``, retries up to 3 times and
    uses a 30 second timeout.
    """
    request = {
        'method': 'GET',
        'url': 'http://www.wln100.com/Test/{}.html'.format(qid),
        'max_retry': 3,
        'timeout': 30,
        'headers': dict(HEADERS),
    }
    return Item(request)
def make_item(qid):
    """Return a GET request item whose URL is ``URL`` formatted with ``qid``."""
    return Item({
        'method': 'GET',
        'url': URL.format(qid),
        'max_retry': 2,
        'timeout': 60,
    })
def make_item(url, data, info):
    """Return a POST request item for ``url`` with body ``data``.

    ``info`` is stored on the item unchanged. The item retries up to 2 times
    with a 120 second timeout.
    """
    request = {
        'method': 'POST',
        'url': url,
        'data': data,
        'info': info,
        'max_retry': 2,
        'timeout': 120,
    }
    return Item(request)
def make_page_item(info):
    """Return a GET request item for the dz101 ``Problems`` listing.

    The query string is ``PARAM`` formatted with the keys of ``info``.
    """
    endpoint = 'http://www.dz101.com/zujuan/zhishidian/Problems'
    query = PARAM.format(**info)
    return Item({
        'method': 'GET',
        'url': endpoint + '?' + query,
        'max_retry': 2,
        'timeout': 120,
    })
def make_as_item(qid):
    """Return a POST request item for ``getOneTestById`` with id ``qid``.

    The body also carries a fresh ``random.random()`` value in ``s``; the
    item gets its own copy of ``HEADERS``.
    """
    body = 'id={}&width=500&s={}'.format(qid, random.random())
    request = {
        'method': 'POST',
        'url': 'http://www.wln100.com/Test/TestPreview/getOneTestById.html',
        'data': body,
        'max_retry': 3,
        'timeout': 30,
        'headers': dict(HEADERS),
    }
    return Item(request)
def make_item(qid):
    """Return a GET request item for ``URL`` formatted with ``qid``.

    The module-level ``HEADERS`` object is passed through as-is (not copied).
    """
    request = {
        'method': 'GET',
        'url': URL.format(qid),
        'headers': HEADERS,
        'max_retry': 2,
        'timeout': 20,
    }
    return Item(request)
def iter_pre_item():
    """Yield one ``getTestList`` POST item per known subject.

    Each item's ``data`` has the subject id filled in and keeps two ``{}``
    placeholders (``page`` and ``rand``) for a later ``format`` call. The
    ``info`` dict holds the ``SubjectID`` / ``aft_subid`` pair.
    """
    # (SubjectID, aft_subid) pairs.
    subject_pairs = (
        ("12", 21),
        ("13", 22),
        ("14", 23),
        ("15", 25),
        ("16", 26),
        ("17", 29),
        ("18", 30),
        ("19", 28),
        ("20", 27),
    )
    # Doubled braces survive the first format() as literal ``{}``.
    body_template = (
        'sid={}&kid=0&tid=0&dtid=0&dif=0&o=0&page={{}}&sourceid=0&rand={{}}'
    )
    for subject_id, aft_subid in subject_pairs:
        info = {"SubjectID": subject_id, "aft_subid": aft_subid}
        yield Item({
            'method': 'POST',
            'url': 'http://www.wln100.com/Home/Index/getTestList.html',
            'data': body_template.format(info['SubjectID']),
            'info': info,
            'headers': headers,
        })
async def run(self):
    """Disabled entry point: returns ``None`` immediately.

    The statements after the ``return`` are unreachable and kept only as the
    parked implementation. It fetches the book list and queues one
    ``get_pages`` task per book.
    """
    return None  # already done once; do not run it again

    item = Item(
        dict(
            method='GET',
            url='http://zx.17zuoye.com/teacher/assign/books?grade_id=0&_={}'
            .format(int(time.time() * 1000)),
            headers=headers,
            cookies=self.cookies,
        ))
    while True:
        resp = await self.async_web_request(item)
        # NOTE(review): other handlers in this file decode the response with
        # sync_text(resp) before json.loads -- confirm which form applies
        # before re-enabling this code.
        js = json.loads(resp)
        if js['error_code'] != 0 or js['success'] != True:
            # Both values belong inside ``format``; the original passed the
            # payload to ``logging.error`` instead, which makes ``format``
            # raise IndexError (two placeholders, one argument).
            logging.error('[get_pages]: {}\n{}'.format(
                item.data, json.dumps(js, ensure_ascii=False)))
            item.proxy = 'http://' + _proxy.get()
            self.stop()
        else:
            break
    for book in js['data']['books']:
        book['subject'] = 2
        item = Item(
            dict(
                method='POST',
                url='http://zx.17zuoye.com/teacher/assign/searchQuestions',
                data='book_id={}&lesson_id={}&page=1'.format(
                    book['_id'], book['series_id']),
                headers=headers,
                info=book,
                cookies=self.cookies,
                max_retry=2,
                timeout=10,
            ))
        self.add_task('get_pages', item)
def make_item(subj, page, _type='xt'):
    """Return a POST request item for one manfen5 question-list page.

    Args:
        subj: Value substituted for ``CourseID`` in the URL.
        page: Value substituted for ``page`` in the URL.
        _type: ``'xt'`` selects the system question list (``STListAjax``),
            ``'qp'`` selects the Quanpin question list (``UserSTListAjax``).

    Returns:
        The item, with ``subj``, ``_type`` and ``name`` attributes set.

    Raises:
        ValueError: If ``_type`` is neither ``'xt'`` nor ``'qp'``.
    """
    if _type == 'qp':
        url = ('http://tiku.manfen5.com/zujuan/UserSTListAjax.aspx?'
               'type=getUserST&UnionID=10050&CourseID={}&ZSDZJType='
               '&ZSDZJID=&EndID=0&STTX=&STLeavel=&page={}').format(subj, page)
    elif _type == 'xt':
        url = ('http://tiku.manfen5.com/zujuan/STListAjax.aspx?'
               'type=getST&CourseID={}&ZSDZJType=&ZSDZJID='
               '&EndID=0&STTX=&STLeavel=&IsOnlineTest=&page={}').format(
                   subj, page)
    else:
        # Previously an unknown type fell through with ``url`` unbound and
        # surfaced as an UnboundLocalError at the Item construction below.
        raise ValueError(
            "unknown _type {!r}; expected 'xt' or 'qp'".format(_type))

    item = Item(dict(
        method='POST',
        url=url,
        max_retry=2,
        timeout=80,
    ))
    item.subj = subj
    item._type = _type
    item.name = '{}_{}_{}'.format(subj, _type, page)
    return item
async def run(self):
    """Disabled entry point: returns ``None`` immediately.

    The statements after the ``return`` are unreachable. They would queue a
    single hard-coded ``searchQuestions`` request under ``get_pages``.
    """
    return None  # use task_queue

    seed_request = {
        'method': 'POST',
        'url': 'http://zx.17zuoye.com/teacher/assign/searchQuestions',
        'data':
        'book_id=BK_20300001489009&lesson_id=BKC_20300076895304&page=1',
        'headers': headers,
        'info': {'subject': 3},
        'cookies': self.cookies,
        'max_retry': 2,
        'timeout': 10,
    }
    self.add_task('get_pages', Item(seed_request))
async def get_questions(self, item):
    """Fetch one listing page, save it, and queue answer requests.

    Returns ``None`` immediately once ``self.no_new_question`` exceeds 5.
    On a missing response, empty body or unsuccessful payload the item is
    re-queued under ``get_questions``.

    On success the payload is passed to ``save_question``. A ``get_answer``
    task is then queued for every entry of ``js['data'][0]`` whose
    ``testid`` is not archived. If the page had no such entry,
    ``self.no_new_question`` is incremented.
    """
    if self.no_new_question > 5:
        return None

    logging.info('get_questions: ' + item.data)
    resp = await self.async_web_request(item)
    if resp is None:
        self.add_task('get_questions', item)
        return None

    html_string = sync_text(resp)
    if not html_string:
        self.add_task('get_questions', item)
        return None

    js = json.loads(html_string)
    if js['info'] != 'success' or js['status'] != 1:
        # Tag corrected from '[get_pages]' so the log points at this handler.
        logging.error('[get_questions]: {}'.format(item.data))
        self.add_task('get_questions', item)
        return None

    save_question(js, item.info, json.dumps(item.json(), ensure_ascii=False))

    no_new = True
    for qs in js['data'][0]:
        if is_archived(qs['testid']):
            continue
        no_new = False
        as_item = Item(
            dict(
                method='POST',
                url='http://www.wln100.com/Home/Index/getOneTestById.html',
                data='id={}&width=500&s={}'.format(qs['testid'],
                                                   random.random()),
                headers=headers,
                info=item.info,
                cookies=self.cookies,
            ))
        self.add_task('get_answer', as_item, qs['testid'])

    if no_new:
        self.no_new_question += 1
async def get_pages(self, item):
    """Fetch a ``searchQuestions`` page and queue ``get_questions`` tasks.

    The item is given a proxy from ``_proxy``, ``max_retry=2``,
    ``timeout=10`` and this instance's cookies before the request is sent.

    On a missing response or empty body the item gets a new proxy and is
    re-queued under ``get_pages``. On an unsuccessful payload the same
    happens, and in addition the current account (``self.u``) is flagged as
    blocked in ``en_accounts`` and ``self.login17()`` is called.
    """

    def _retry_with_new_proxy():
        # Swap the proxy, then put the same item back on the queue.
        item.proxy = 'http://' + _proxy.get()
        self.add_task('get_pages', item)

    item.proxy = 'http://' + _proxy.get()
    item.max_retry = 2
    item.timeout = 10
    item.cookies = self.cookies
    logging.info('get_pages: ' + item.data)

    resp = await self.async_web_request(item)
    if resp is None:
        _retry_with_new_proxy()
        return None

    html_string = sync_text(resp)
    if not html_string:
        _retry_with_new_proxy()
        return None

    js = json.loads(html_string)
    if js['error_code'] != 0 or js['success'] != True:
        # Both values belong inside ``format``; previously the payload was
        # passed to ``logging.error`` instead, so ``format`` raised
        # IndexError and none of the recovery steps below ran.
        logging.error('[get_pages]: {}\n{}'.format(
            item.data, json.dumps(js, ensure_ascii=False)))
        _retry_with_new_proxy()
        en_accounts[self.u]['block'] = True
        self.login17()
        return None

    # NOTE(review): the range stops at page_count - 1; if pages are 1-based
    # and page_count is the total, the last page is never queued -- confirm.
    # NOTE(review): book_id / lesson_id are hard-coded here rather than
    # taken from the incoming item -- confirm this is intended.
    for page_num in range(1, int(js['data']['page_count'])):
        qs_item = Item(
            dict(
                method='POST',
                url='http://zx.17zuoye.com/teacher/assign/searchQuestions',
                data=
                'book_id=BK_20300001489009&lesson_id=BKC_20300076895304&page={}'
                .format(page_num),
                headers=headers,
                cookies=self.cookies,
                info=item.info,
                max_retry=2,
                timeout=10,
            ))
        self.add_task('get_questions', qs_item)
async def run(self):
    """Queue a ``get_answer`` task for every stored question not yet archived.

    For each row from ``get_question_ids()``, the test id is ``row[0]`` with
    its first 10 characters dropped, and ``row[1]`` is passed along as
    ``aft_subid`` in the item's ``info``.
    """
    for record in get_question_ids():
        testid = record[0][10:]
        if is_archived(testid):
            continue

        request = {
            'method': 'POST',
            'url': 'http://www.wln100.com/Home/Index/getOneTestById.html',
            'data': 'id={}&width=500&s={}'.format(testid, random.random()),
            'headers': headers,
            'info': {'aft_subid': record[1]},
            'cookies': self.cookies,
        }
        self.add_task('get_answer', Item(request), testid)