async def fetch_status(self, first):
    """Scrape the submission-status table for this account.

    :param first: pagination cursor (run id to start from) interpolated
        into ``status_prefix`` together with the account nickname.
    :return: list of submit-status dicts on success, ``False`` when the
        page could not be loaded, ``None`` when scraping raised
        (the error is logged).
    """
    url = self.status_prefix.format(self.account.nickname, first)
    status_list = []
    try:
        # The status page requires a logged-in session cookie.
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        status_table = soup.find('table', class_='table_text')
        for row in status_table.children:
            if row.name != 'tr':
                continue
            # Skip the table header row.
            if row.get('class') and 'table_header' in row.get('class'):
                continue
            # NOTE(review): assumes every child of <tr> is a <td>; a stray
            # whitespace text node would shift these indices — confirm
            # against the live markup.
            td_text = [td.text for td in row.children]
            status = {
                'type': DataType.Submit,
                'account': self.account,
                'status': submit.SubmitStatus.BROKEN,
                'run_id': td_text[0],
                'submit_time': td_text[1],
                'result': td_text[2],
                'pro_id': td_text[3],
                'run_time': td_text[4][:-2],  # strip trailing 2-char unit
                'memory': td_text[5][:-1],    # strip trailing 1-char unit
                'lang': td_text[7],
                'code': None
            }
            status_list.append(status)
        return status_list
    except Exception as ex:
        logger.error(ex)
        logger.error('{} fetch status account: {} first: {}'.format(
            self.TAG, self.account, first))
async def get_status(self, handle, start=1, length=50):
    """Fetch recent submissions for *handle* via the Codeforces status API.

    :param handle: Codeforces handle.
    :param start: 1-based index of the first submission to fetch.
    :param length: maximum number of submissions to fetch.
    :return: list of submit dicts, ``False`` when the request failed or
        the API status is not ``OK``, ``None`` when an exception was logged.
    """
    # Gym contests have ids of 6+ digits; their submissions are skipped.
    is_gym = lambda cid: len(str(cid)) >= 6
    url = self.status_prefix.format(handle, start, length)
    try:
        response = await self.load_page(url)
        if not response:
            return False
        response_data = json.loads(response.body.decode())
        if response_data['status'] != 'OK':
            return False
        result = response_data['result']
        status_list = []
        for row in result:
            if is_gym(row['contestId']):
                continue
            # Problem id is contest id + problem index, e.g. "1234A".
            pro_id = '{0}{1}'.format(row['contestId'], row['problem']['index'])
            submit_at = datetime.fromtimestamp(row['creationTimeSeconds'])
            # code = yield self.get_code(row['contestId'], row['id'])
            status = {
                'type': DataType.Submit,
                'account': self.account,
                'status': submit.SubmitStatus.BROKEN,
                'pro_id': pro_id,
                'run_id': row['id'],
                'submit_time': submit_at,
                'run_time': row['timeConsumedMillis'],
                'memory': row['memoryConsumedBytes'] // 1024,  # bytes -> KiB
                'lang': row['programmingLanguage'],
                'code': None,
                'result': row['verdict']
            }
            status_list.append(status)
        return status_list
    except Exception as e:
        logger.error(e)
async def fetch_status(self, first=''):
    """Scrape the submission-status table for this account.

    :param first: pagination cursor (run id to start from); the empty
        string fetches the first page.
    :return: list of submit dicts, ``False`` when the page could not be
        loaded, ``None`` when scraping raised (the error is logged).
    """
    url = self.status_prefix.format(self.account.nickname, first)
    status_list = []
    try:
        response = await self.load_page(url)
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        status_table = soup.find('table', class_='a')
        for row in status_table.children:
            if row.name != 'tr':
                continue
            # Skip rows carrying the "in" class (header rows).
            if row.get('class') and 'in' in row.get('class'):
                continue
            td_text = [td.text for td in row.children if td.name == 'td']
            # code = yield self.get_code(td_text[0])
            # Strip trailing units; an empty cell falls back to '-1'.
            run_time = td_text[5][:-2] or '-1'
            memory = td_text[4][:-1] or '-1'
            status = {
                'type': DataType.Submit,
                'account': self.account,
                'status': submit.SubmitStatus.BROKEN,
                'run_id': td_text[0],
                'submit_time': td_text[8],
                'result': td_text[3],
                'pro_id': td_text[2],
                'run_time': run_time,
                'memory': memory,
                'lang': td_text[6],
                'code': None
            }
            status_list.append(status)
        return status_list
    except Exception as ex:
        logger.error(ex)
        logger.error('{} fetch status => user_id: {} top: {}'.format(
            self.TAG,self.account.nickname, first))
async def fetch_status(self, first):
    """Scrape the submission-status table for this account.

    Returns a list of submit dicts on success, ``False`` when the page
    could not be loaded, or ``None`` when scraping raised (logged).
    """
    url = self.status_prefix.format(self.account.nickname, first)
    statuses = []
    try:
        # Page requires a logged-in session cookie.
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        table = soup.find('table', class_='table_text')
        for row in table.children:
            if row.name != 'tr':
                continue
            row_classes = row.get('class')
            if row_classes and 'table_header' in row_classes:
                continue
            cells = [cell.text for cell in row.children]
            statuses.append({
                'type': DataType.Submit,
                'account': self.account,
                'status': submit.SubmitStatus.BROKEN,
                'run_id': cells[0],
                'submit_time': cells[1],
                'result': cells[2],
                'pro_id': cells[3],
                'run_time': cells[4][:-2],
                'memory': cells[5][:-1],
                'lang': cells[7],
                'code': None,
            })
        return statuses
    except Exception as ex:
        logger.error(ex)
        logger.error('{} fetch status account: {} first: {}'.format(self.TAG, self.account, first))
def init_http_client():
    """Switch tornado's AsyncHTTPClient to the curl-based implementation."""
    backend = "tornado.curl_httpclient.CurlAsyncHTTPClient"
    try:
        httpclient.AsyncHTTPClient.configure(backend)
        logger.info('[ACM-Spider] 配置 CurlAsyncHTTPClient 成功')
    except Exception as ex:
        logger.error('[ACM-Spider] 配置 CurlAsyncHTTPClient 失败: {}'.format(ex))
async def load_page(url, headers=None, **kwargs):
    """Load *url* via ``Spider.fetch`` and return the response, or ``None``.

    Bug fix: the original ended with ``finally: return response``, which
    unconditionally swallowed the ``LoadPageException`` raised in the
    ``except`` clause (and any other in-flight exception) — the ``raise``
    was dead code.  The observable contract was therefore "return the
    response, or ``None`` on any error"; this version keeps that contract
    but makes it explicit instead of relying on a return-in-finally.

    :param url: page URL to fetch.
    :param headers: optional HTTP headers passed through to the fetch.
    :param kwargs: extra keyword arguments forwarded to ``Spider.fetch``.
    :return: the HTTP response object, or ``None`` on failure.
    """
    try:
        return await Spider.fetch(url, headers=headers, **kwargs)
    except httpclient.HTTPError as ex:
        logger.error('加载 {} 失败: {}'.format(url, ex))
    except Exception as ex:
        # Previously swallowed silently by the finally-return; log it now.
        logger.error('加载 {} 失败: {}'.format(url, ex))
    return None
async def get_code(self, run_id, **kwargs):
    """Download the source code of submission *run_id*.

    Returns the code text, or ``None`` when fetching/parsing failed
    (the error and traceback are logged).
    """
    url = self.code_url_prefix.format(run_id)
    try:
        response = await self.load_page(
            url, {'cookie': self.cookie}, validate_cert=False)
        page = self.get_lxml_bs4(response.body)
        return page.find('pre', class_='sh-c').text
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())
async def get_code(self, run_id, **kwargs):
    """Fetch the source of a Codeforces submission.

    :param run_id: submission id.
    :param kwargs: must contain ``pro_id`` (e.g. ``"1234A"``); its
        leading digits are the contest id used to build the URL.
    :return: the code text, ``None`` when the page is unavailable or an
        exception was logged.
    """
    pro_id = kwargs['pro_id']
    contest_id = re.match(r'^\d+', pro_id).group()
    url = self.code_prefix.format(contest_id, run_id)
    try:
        response = await self.load_page(url)
        if not response:
            return None
        document = self.get_lxml_bs4(response.body)
        return document.find('pre', class_='program-source').text
    except Exception as e:
        logger.error(e)
async def get_rating(self):
    """Query the rating API and report current and maximum rating.

    Returns ``{'rating': ..., 'maxRating': ...}`` on success, ``False``
    when the request failed, ``None`` when the history is empty or an
    exception was logged.
    """
    url = self.rating_api_prefix.format(self.account.nickname)
    try:
        response = await self.fetch(url)
        if not response:
            return False
        history = json.loads(response.body.decode())
        if not history:
            return None
        # Last entry is the most recent rating; scan for the peak.
        best = max(history, key=lambda entry: entry['rating'])
        return dict(rating=history[-1]['rating'], maxRating=best['rating'])
    except Exception as ex:
        logger.error(ex)
        logger.error('{} {} get Rating error'.format(self.TAG, self.account))
async def get_rating(self):
    """Fetch the account's rating history and report current / max rating.

    :return: ``dict(rating=..., maxRating=...)`` on success, ``False``
        when the request failed, ``None`` when the history is empty or
        an exception was logged.
    """
    url = self.rating_api_prefix.format(self.account.nickname)
    try:
        response = await self.fetch(url)
        if not response:
            return False
        res = json.loads(response.body.decode())
        if len(res) > 0:
            # The last history entry is the current rating; the max over
            # all entries is the peak rating.
            max_rating = max(res, key=lambda x: x['rating'])
            return dict(rating=res[-1]['rating'],
                        maxRating=max_rating['rating'])
    except Exception as ex:
        logger.error(ex)
        logger.error('{} {} get Rating error'.format(
            self.TAG, self.account))
async def get_code(self, run_id, **kwargs):
    """Fetch the source of submission *run_id* (needs the login cookie).

    Returns the code text; ``False`` when the page failed to load or
    contains no ``<pre>`` node; ``None`` when an exception was logged.
    """
    url = self.source_code_prefix.format(run_id)
    try:
        response = await self.load_page(url, {'cookie': self.cookie})
        if not response:
            return False
        document = self.get_lxml_bs4(response.body)
        pre = document.find('pre')
        if not pre:
            return False
        logger.debug("{} fetch {}'s code {} success".format(
            self.TAG, self.account, run_id))
        return pre.text
    except Exception as ex:
        logger.error(ex)
        logger.error("{} fetch {}'s {} code error".format(
            self.TAG, self.account, run_id))
async def get_solved(self):
    """Scrape solved / submitted counters from the user profile page.

    :return: ``{'solved': ..., 'submitted': ...}`` on success,
        ``False`` when the page could not be loaded.
    :raises Exception: re-raised after logging when scraping fails.
    """
    url = self.user_url_prefix.format(self.account.nickname)
    try:
        response = await self.load_page(url, {'cookie': self.cookie})
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        # The solved count is the text node directly before the "showac"
        # button; fragile — depends on exact page layout.
        solved = soup.find('button', id='showac').previous_sibling.string.strip()
        # The submitted count is the text of this user's status link.
        submitted = soup.find(
            'a',
            href='status.php?showname={}'.format(self.account.nickname)).text
        return {
            'solved': solved,
            'submitted': submitted,
            # 'solved_list': self._get_solved_list(soup)
        }
    except Exception as ex:
        logger.error('{} get Solved/Submitted error {}: {}'.format(
            self.TAG, self.account, ex))
        raise ex
async def get_solved(self):
    """Fetch rating info for this account from the user-info API.

    Returns ``{'rating': ..., 'maxRating': ...}`` on success; ``False``
    when the request failed or the API status is not ``OK``; ``None``
    when an exception was logged.
    """
    url = self.user_info_prefix.format(self.account.nickname)
    try:
        response = await self.load_page(url)
        if not response:
            return False
        payload = json.loads(response.body.decode())
        if payload['status'] != 'OK':
            return False
        info = payload['result'][0]
        return {'rating': info['rating'], 'maxRating': info['maxRating']}
    except Exception as e:
        logger.error(e)
async def get_code_zip(self, min, max):
    """Download a zip of source files for a run-id range and queue each one.

    NOTE(review): the parameters shadow the ``min``/``max`` builtins;
    kept as-is because renaming them would break keyword callers.

    :param min: first run id of the range (interpolated into the URL).
    :param max: last run id of the range.
    """
    url = self.code_zip_url.format(min, max)
    try:
        response = await self.fetch(url, method=HttpMethod.GET,
                                    headers={'cookie': self.cookie},
                                    validate_cert=False)
        buffer = response.buffer
        with ZipFile(buffer) as code_zip:
            for name in code_zip.namelist():
                # Run id is the third token after splitting the entry name
                # on '/' or '_' — assumes a fixed archive layout; TODO
                # confirm against a real archive.
                run_id = re.split(r'/|_', name)[2]
                with code_zip.open(name) as code_fp:
                    code = code_fp.read()
                status = {
                    'type': DataType.Code,
                    'account': self.account,
                    'run_id': run_id,
                    'code': code
                }
                await self.put_queue([status])
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())
async def get_solved(self):
    """Scrape solved / submitted counters from the user profile page.

    Fix: the regex literals were non-raw strings containing ``\\?`` — an
    invalid escape sequence (W605, DeprecationWarning since Python 3.6).
    They are now raw strings; the compiled patterns are unchanged.

    :return: ``{'solved': ..., 'submitted': ...}`` on success, ``False``
        when the page could not be loaded.
    :raises Exception: re-raised after logging on scraping errors.
    """
    url = self.user_url_prefix.format(self.account.nickname)
    try:
        response = await self.load_page(url)
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        # Solved count: link to the accepted-only status filter.
        solved_count = soup.find('a', href=re.compile(r'^status\?result=0')).text
        # Submitted count: link to the per-user status page.
        submitted_count = soup.find('a', href=re.compile(r'^status\?user_id')).text
        # solved list
        # solved_list = self._get_solved_list(soup)
        return {
            'solved': solved_count,
            'submitted': submitted_count,
            # 'solved_list': solved_list
        }
    except Exception as ex:
        logger.error('{} {} get Solved/Submitted error: {}'.format(
            self.TAG, self.account, ex))
        raise ex
async def spider_runner(idx):
    """Spider worker loop: forever take accounts off the queue and update them.

    :param idx: worker index, used only in log messages.
    """
    logger.info('[SpiderRunner #{0}] 开始运行 ...'.format(idx))
    while True:
        cur_account = await AccountQueue.get()
        logger.info(
            '[SpiderRunner #{0}] {1} <=== account_queue(size={2})'.format(
                idx, cur_account, AccountQueue.qsize()))
        # let spider.run()
        # Borrow a spider worker for this account's OJ.
        worker = await SpiderFactory[cur_account.oj_name].get()
        worker.account = cur_account
        try:
            cur_account.set_status(account.AccountStatus.UPDATING)
            cur_account.save()
            await worker.run()
            cur_account.set_status(account.AccountStatus.NORMAL)
        except LoginException as ex:
            # Credentials rejected — mark the account itself as broken.
            logger.error(ex)
            cur_account.set_status(account.AccountStatus.ACCOUNT_ERROR)
        except Exception as ex:
            logger.error(ex)
            logger.error(traceback.format_exc())
            cur_account.set_status(account.AccountStatus.UPDATE_ERROR)
        finally:
            # Persist whichever status was set above, success or failure.
            cur_account.save()
        # work done: acknowledge both queues, then return the worker so
        # another runner can borrow it.
        logger.info('[SpiderRunner #{0}] {1} work done'.format(
            idx, cur_account))
        SpiderFactory[cur_account.oj_name].task_done()
        AccountQueue.task_done()
        await SpiderFactory[cur_account.oj_name].put(worker)
async def spider_runner(idx):
    """Spider worker loop: forever take accounts off the queue and update them.

    Variant with a two-minute cooldown after any failure before the
    worker is released back to the pool.

    :param idx: worker index, used only in log messages.
    """
    logger.info('[SpiderRunner #{0}] 开始运行 ...'.format(idx))
    while True:
        cur_account = await AccountQueue.get()
        logger.info('[SpiderRunner #{0}] {1} <=== account_queue(size={2})'
                    .format(idx, cur_account, AccountQueue.qsize()))
        # let spider.run()
        # Borrow a spider worker for this account's OJ.
        worker = await SpiderFactory[cur_account.oj_name].get()
        worker.account = cur_account
        try:
            cur_account.set_status(account.AccountStatus.UPDATING)
            cur_account.save()
            await worker.run()
            cur_account.set_status(account.AccountStatus.NORMAL)
        except LoginException as ex:
            # Credentials rejected — mark the account itself as broken,
            # then cool down before touching the site again.
            logger.error(ex)
            cur_account.set_status(account.AccountStatus.ACCOUNT_ERROR)
            await gen.sleep(60 * 2)
        except Exception as ex:
            logger.error(ex)
            logger.error(traceback.format_exc())
            cur_account.set_status(account.AccountStatus.UPDATE_ERROR)
            await gen.sleep(60 * 2)
        finally:
            # Persist whichever status was set above, success or failure.
            cur_account.save()
        # work done: acknowledge both queues, then return the worker so
        # another runner can borrow it.
        logger.info('[SpiderRunner #{0}] {1} work done'.format(idx, cur_account))
        SpiderFactory[cur_account.oj_name].task_done()
        AccountQueue.task_done()
        await SpiderFactory[cur_account.oj_name].put(worker)
async def get_solved(self):
    """Scrape 'Problems Solved' / 'Problems Submitted' from the profile page.

    Returns ``{'solved': ..., 'submitted': ...}`` on success; ``False``
    when the page could not be loaded.  Any scraping exception is logged
    and re-raised.
    """
    url = self.user_url_prefix.format(self.account.nickname)
    try:
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        document = self.get_lxml_bs4(response.body)
        # The two label cells; each counter sits in the next sibling cell.
        labels = document.find_all(
            'td', text=['Problems Submitted', 'Problems Solved'])
        return {
            'solved': labels[1].next_sibling.text,
            'submitted': labels[0].next_sibling.text,
        }
    except Exception as ex:
        logger.error('{} {} get Solved/Submitted error: {}'.format(
            self.TAG, self.account, ex))
        raise ex
async def get_solved(self):
    """Scrape solved / submitted counters from the user profile page.

    :return: ``{'solved': ..., 'submitted': ...}`` on success, ``False``
        when the page could not be loaded.
    :raises Exception: re-raised after logging on scraping errors.
    """
    url = self.user_url_prefix.format(self.account.nickname)
    try:
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        # solved count: find both label cells; each counter value lives
        # in the sibling cell right after its label.
        count = soup.find_all(
            'td', text=['Problems Submitted', 'Problems Solved'])
        submitted_count = count[0].next_sibling.text
        solved_count = count[1].next_sibling.text
        # solved list
        # solved_list = self._get_solved_list(soup)
        return {
            'solved': solved_count,
            'submitted': submitted_count,
            # 'solved_list': solved_list
        }
    except Exception as ex:
        logger.error('{} {} get Solved/Submitted error: {}'.format(
            self.TAG, self.account, ex))
        raise ex
async def get_code(self, run_id, **kwargs):
    """Fetch the source of submission *run_id* from the 'usercode' textarea.

    Returns the code text; ``False`` when the page failed to load or the
    textarea is missing; ``None`` when an exception was logged.
    """
    url = self.source_code_prefix.format(run_id)
    try:
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        document = self.get_lxml_bs4(response.body)
        area = document.find('textarea', id='usercode')
        if area is None:
            logger.error('{} {} Fail to load code {} page'.format(
                self.TAG, self.account, run_id))
            logger.error('{}: {}'.format(self.TAG, area))
            return False
        code = area.text
        logger.debug('{} {} Success to load code {} page'.format(
            self.TAG, self.account, run_id))
        return code
    except Exception as ex:
        logger.error(ex)
        logger.error("{} fetch {}'s {} code error".format(
            self.TAG, self.account, run_id))
async def get_code(self, run_id, **kwargs):
    """Fetch the source of submission *run_id* from the 'usercode' textarea.

    :param run_id: submission id interpolated into ``source_code_prefix``.
    :return: the code text; ``False`` when the page failed to load or the
        textarea is missing; ``None`` when an exception was logged.
    """
    url = self.source_code_prefix.format(run_id)
    try:
        response = await self.load_page(url, {'Cookie': self.cookie})
        if not response:
            return False
        soup = self.get_lxml_bs4(response.body)
        code_area = soup.find('textarea', id='usercode')
        if not code_area:
            logger.error('{} {} Fail to load code {} page'.format(
                self.TAG, self.account, run_id))
            logger.error('{}: {}'.format(self.TAG, code_area))
            return False
        code = code_area.text
        logger.debug('{} {} Success to load code {} page'.format(
            self.TAG, self.account, run_id))
        return code
    except Exception as ex:
        logger.error(ex)
        logger.error('{} fetch {}\'s {} code error'.format(
            self.TAG, self.account, run_id))
async def get_code(self, run_id, **kwargs):
    """Fetch submission source as JSON and return the unescaped code text.

    Returns the code string on success; ``False`` when the page could
    not be loaded; ``None`` when an exception was logged.
    """
    url = self.code_prefix.format(run_id)
    try:
        response = await self.load_page(url, {'cookie': self.cookie})
        if not response:
            logger.error('{} {} Fail to load code {} page'.format(
                self.TAG, self.account, run_id))
            logger.error('{}: response => {}'.format(self.TAG, response))
            return False
        code = json.loads(response.body.decode('utf-8'))['source']
        logger.debug('{} {} Success to load code {} page'.format(
            self.TAG, self.account, run_id))
        # The API HTML-escapes the source; undo that before returning.
        return unescape(code)
    except Exception as ex:
        logger.error("{} fetch {}'s {} code error {}".format(
            self.TAG, self.account, run_id, ex))
async def wrapper(*args, **kwargs):
    """Retry wrapper: await *function* until it returns a truthy value.

    Retries up to ``times`` attempts (closure variable), sleeping
    ``duration`` minutes after each falsy result; returns the last
    result, which may be falsy if every attempt failed.

    Fix: ``logger.warn`` is a deprecated alias — replaced with
    ``logger.warning`` (same output, no DeprecationWarning).
    """
    left_times = times
    call_state, ret = False, None
    while left_times > 0 and call_state is False:
        try:
            if left_times != times:
                # times - left_times == number of attempts already burned.
                logger.warning('[重试第 {0} 次] ===> {1}({2})'.format(
                    times - left_times, function.__name__, args))
            ret = await function(*args, **kwargs)
            call_state = True if ret else False
            if not call_state:
                await gen.sleep(duration * 60)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())
        finally:
            # Count the attempt whether it returned or raised.
            left_times -= 1
    if call_state is False:
        message = '[已经重试 {0} 次] def {1}({2}) call fail'.format(
            times, function.__name__, args)
        logger.error(message)
    return ret
async def wrapper(*args, **kwargs):
    """Retry wrapper: await *function* until it returns a truthy value.

    Retries up to ``times`` attempts (closure variable), sleeping
    ``duration`` minutes after each falsy result; returns the last
    result, which may be falsy if every attempt failed.

    Fix: ``logger.warn`` is a deprecated alias — replaced with
    ``logger.warning`` (same output, no DeprecationWarning).
    """
    left_times = times
    call_state, ret = False, None
    while left_times > 0 and call_state is False:
        try:
            if left_times != times:
                # times - left_times == number of attempts already burned.
                logger.warning('[重试第 {0} 次] ===> {1}({2})'.format(
                    times - left_times, function.__name__, args))
            ret = await function(*args, **kwargs)
            call_state = True if ret else False
            if not call_state:
                await gen.sleep(duration * 60)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())
        finally:
            # Count the attempt whether it returned or raised.
            left_times -= 1
    if call_state is False:
        message = '[已经重试 {0} 次] def {1}({2}) call fail'.format(
            times, function.__name__, args)
        logger.error(message)
    return ret
def init_http_client():
    """Configure tornado to use the curl-based AsyncHTTPClient backend."""
    try:
        httpclient.AsyncHTTPClient.configure(
            "tornado.curl_httpclient.CurlAsyncHTTPClient")
        logger.info('[ACM-Spider] 配置 CurlAsyncHTTPClient 成功')
    except Exception as ex:
        logger.error(
            '[ACM-Spider] 配置 CurlAsyncHTTPClient 失败: {}'.format(ex))