def problem_parse(self, response, pid, url):
    """Parse a Codeforces problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_oj = 'Codeforces'
    problem.remote_id = pid
    problem.remote_url = url
    # Map transport-level failures onto Problem.Status before any parsing.
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    elif response.status_code == 302:
        # Codeforces redirects away from non-existent problems.
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    elif response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    elif response.text is None:
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    website = response.text
    soup = BeautifulSoup(website, 'lxml')
    match_groups = soup.find('div', attrs={'class': 'title'})
    if match_groups:
        problem.title = match_groups.string
        # Drop the first two characters — presumably the "A." index prefix
        # of the Codeforces title; TODO confirm against a live page.
        problem.title = str(problem.title)[2:]
    match_groups = soup.find(name='div', attrs={'class': 'time-limit'})
    if match_groups:
        # Last child of the limit div is the bare text, e.g. "2 seconds".
        problem.time_limit = match_groups.contents[-1]
    match_groups = soup.find(name='div', attrs={'class': 'memory-limit'})
    if match_groups:
        problem.memory_limit = match_groups.contents[-1]
    match_groups = soup.find(name='div', attrs={'class': 'problem-statement'})
    problem.html = ''
    if match_groups and isinstance(match_groups, element.Tag):
        for child in match_groups.children:
            # Skip the statement "header" block (title/limits already parsed).
            if isinstance(child, element.Tag) and child.get('class') and set(
                    child['class']).intersection({'header'}):
                pass
            elif isinstance(child, element.Tag):
                # Classify each grandchild tag as title or content so the
                # front end can restyle it via the class/style markers.
                for tag in child:
                    if isinstance(tag, element.Tag):
                        if tag.get('class') is None:
                            tag['class'] = ()
                        if tag.get('class') and set(
                                tag['class']).intersection(
                                    {'section-title'}):
                            tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
                            tag['style'] = HtmlTag.TagStyle.TITLE.value
                        else:
                            tag['class'] += (
                                HtmlTag.TagDesc.CONTENT.value, )
                            tag['style'] = HtmlTag.TagStyle.CONTENT.value
                problem.html += str(
                    HtmlTag.update_tag(child, self._static_prefix))
            else:
                # Non-tag nodes (e.g. strings) are passed through as well.
                problem.html += str(
                    HtmlTag.update_tag(child, self._static_prefix))
    # Wrap with the injected script so rendered pages pick up MathJax etc.
    # (exact purpose of self._script not visible here).
    problem.html = '<html>' + problem.html + self._script + '</html>'
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Parse a WUST problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'WUST'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    if status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('Problem is not Available', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    # Title/limits are scraped with regexes straight off the raw HTML.
    match_groups = re.search(r'[\d]{4,}: ([\s\S]*?)</h2>', website_data)
    if match_groups:
        problem.title = match_groups.group(1)
    match_groups = re.search(r'(\d* Sec)', website_data)
    if match_groups:
        problem.time_limit = match_groups.group(1)
    match_groups = re.search(r'(\d* MB)', website_data)
    if match_groups:
        problem.memory_limit = match_groups.group(1)
    problem.special_judge = re.search(r'class=red>Special Judge</span>',
                                      website_data) is not None
    soup = BeautifulSoup(website_data, 'lxml')
    problem.html = ''
    for tag in soup.find('div', attrs={'class': 'rich_text'}).children:
        # Fix: use isinstance instead of `type(tag) == element.Tag`
        # (idiomatic, and robust to Tag subclasses).
        if isinstance(tag, element.Tag):
            if tag.name in ['h2', 'div']:
                if not tag.get('class'):
                    tag['class'] = ()
                if tag.name == 'h2':
                    # Strip decorative children from section headers.
                    if tag.div:
                        tag.div.decompose()
                    if tag.img:
                        tag.img.decompose()
                    tag['style'] = HtmlTag.TagStyle.TITLE.value
                    tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
                    problem.html += str(
                        HtmlTag.update_tag(
                            tag,
                            self._static_prefix,
                            update_style=HtmlTag.TagStyle.TITLE.value))
                else:
                    tag['style'] = HtmlTag.TagStyle.CONTENT.value
                    tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
                    problem.html += str(
                        HtmlTag.update_tag(
                            tag,
                            self._static_prefix,
                            update_style=HtmlTag.TagStyle.CONTENT.value))
    problem.html = '<body>' + problem.html + '</body>'
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Parse an FZU problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'FZU'
    # Transport failures short-circuit before any parsing work.
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    page = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No Such Problem!', page):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    soup = BeautifulSoup(page, 'lxml')
    # Header metadata comes from regexes over the raw HTML.
    title_match = re.search(r'<b> Problem [\d]* ([\s\S]*?)</b>', page)
    if title_match:
        problem.title = title_match.group(1)
    time_match = re.search(r'(\d* mSec)', page)
    if time_match:
        problem.time_limit = time_match.group(1)
    mem_match = re.search(r'(\d* KB)', page)
    if mem_match:
        problem.memory_limit = mem_match.group(1)
    problem.special_judge = re.search(
        r'<font color="blue">Special Judge</font>', page) is not None
    # Rebuild the statement body: mark <h2> nodes as titles, <div> nodes as
    # content, then serialize every child through HtmlTag.update_tag.
    problem.html = ''
    content = soup.find('div', attrs={'class': 'problem_content'})
    for node in content.children:
        if node.name == 'h2':
            if node.img:
                node.img.decompose()
            if not node.get('class'):
                node['class'] = (HtmlTag.TagDesc.TITLE.value,)
            else:
                node['class'] += (HtmlTag.TagDesc.TITLE.value,)
            node['style'] = HtmlTag.TagStyle.TITLE.value
        if node.name == 'div':
            if not node.get('class'):
                node['class'] = (HtmlTag.TagDesc.CONTENT.value,)
            else:
                node['class'] += (HtmlTag.TagDesc.CONTENT.value,)
            node['style'] = HtmlTag.TagStyle.CONTENT.value
        problem.html += str(HtmlTag.update_tag(node, self._static_prefix))
    problem.html = '<body>' + self._global_style + problem.html + '</body>'
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def get_problem(self, pid, account, **kwargs):
    """Fetch problem `pid` through the configured OJ backend.

    Returns a Problem with STATUS_OJ_NOT_EXIST when no backend is
    configured; otherwise delegates to the backend and normalizes the
    whitespace of title/time_limit/memory_limit before returning.
    """
    if not self._oj:
        # No backend available: report a stub problem with failure status.
        missing = Problem()
        missing.remote_oj = self._origin_name
        missing.remote_id = pid
        missing.status = Problem.Status.STATUS_OJ_NOT_EXIST
        return missing
    self._oj.set_cookies(account.cookies)
    problem = self._oj.get_problem(pid=pid, account=account, **kwargs)
    # Strip stray spaces/newlines from the scraped header fields.
    for field in ('title', 'time_limit', 'memory_limit'):
        cleaned = self._space_and_enter_strip(getattr(problem, field))
        setattr(problem, field, cleaned)
    return problem
def problem_parse(self, response, pid, url):
    """Parse an HDU problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'HDU'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    if status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No such problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    soup = BeautifulSoup(website_data, 'lxml')
    # Header metadata is scraped with regexes off the raw HTML.
    match_groups = re.search(r'color:#1A5CC8\'>([\s\S]*?)</h1>', website_data)
    if match_groups:
        problem.title = match_groups.group(1)
    match_groups = re.search(r'(\d* MS)', website_data)
    if match_groups:
        problem.time_limit = match_groups.group(1)
    match_groups = re.search(r'/(\d* K)', website_data)
    if match_groups:
        problem.memory_limit = match_groups.group(1)
    problem.special_judge = re.search(r'color=red>Special Judge</font>',
                                      website_data) is not None
    problem.html = ''
    # Walk the siblings of the <h1> title: keep only the panel_* blocks,
    # tagging panel_title as TITLE and the rest as CONTENT.
    for tag in soup.find('h1').parent.children:
        # Fix: isinstance instead of `type(tag) == element.Tag` (idiomatic,
        # robust to Tag subclasses).
        if isinstance(tag, element.Tag) and tag.get('class') and set(
                tag['class']).intersection(
                    {'panel_title', 'panel_content', 'panel_bottom'}):
            if set(tag['class']).intersection({'panel_title', }):
                tag['class'] += (HtmlTag.TagDesc.TITLE.value,)
                tag['style'] = HtmlTag.TagStyle.TITLE.value
            else:
                tag['class'] += (HtmlTag.TagDesc.CONTENT.value,)
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
            problem.html += str(HtmlTag.update_tag(tag, self._static_prefix))
    problem.html += self._script
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def get_problem(self, *args, **kwargs):
    """Scrape a POJ problem (kwargs['pid']) into a Problem object.

    Returns the populated Problem, or None if the request or any of the
    mandatory regex extractions fails.
    """
    url = 'http://poj.org/problem?id=' + str(kwargs['pid'])
    problem = Problem()
    try:
        res = self.request.get(url=url)
        website_data = res.text
        problem.remote_id = kwargs['pid']
        problem.remote_url = url
        problem.remote_oj = 'POJ'
        # Mandatory fields: a failed .group(1) on any of these raises
        # AttributeError, which falls through to the `return None` below.
        problem.title = re.search(r'ptt" lang="en-US">([\s\S]*?)</div>',
                                  website_data).group(1)
        problem.time_limit = re.search(r'(\d*MS)', website_data).group(1)
        problem.memory_limit = re.search(r'Memory Limit:</b> ([\s\S]*?)</td>',
                                         website_data).group(1)
        problem.special_judge = re.search(r'red;">Special Judge</td>',
                                          website_data) is not None
        problem.description = re.search(
            r'>Description</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
            website_data).group(1)
        problem.input = re.search(
            r'>Input</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
            website_data).group(1)
        problem.output = re.search(
            r'>Output</p>[\s\S]*?lang="en-US">([\s\S]*?)</div>',
            website_data).group(1)
        # Optional sample I/O blocks.
        input_data = ''
        match_group = re.search(r'>Sample Input</p>([\s\S]*?)<p class',
                                website_data)
        if match_group:
            input_data = re.search(r'"sio">([\s\S]*?)</pre>',
                                   match_group.group(1)).group(1)
        output_data = ''
        match_group = re.search(r'>Sample Output</p>([\s\S]*?)<p class',
                                website_data)
        if match_group:
            output_data = re.search(r'"sio">([\s\S]*?)</pre>',
                                    match_group.group(1)).group(1)
        problem.sample = [{'input': input_data, 'output': output_data}]
        # Optional hint/source sections.
        match_group = re.search(r'>Hint</p>[\s\S]*?"en-US">([\s\S]*?)</div>',
                                website_data)
        if match_group:
            problem.hint = match_group.group(1)
        match_group = re.search(r'>Source</p>[\s\S]*?"en-US">([\s\S]*?)</div>',
                                website_data)
        if match_group:
            problem.source = match_group.group(1)
        return problem
    except Exception:
        # Fix: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt. Any scrape failure means "no problem".
        return None
def get_problem(self, *args, **kwargs):
    """Scrape an HDU problem (kwargs['pid']) into a Problem object.

    Side effect: stores the response cookies on self.cookies.
    Returns the populated Problem, or None if the request or any of the
    mandatory regex extractions fails.
    """
    url = 'http://acm.hdu.edu.cn/showproblem.php?pid=' + str(kwargs['pid'])
    problem = Problem()
    try:
        website_data = self.request.get(url)
        self.cookies = website_data.cookies
        problem.remote_id = kwargs['pid']
        problem.remote_url = url
        problem.remote_oj = 'HDU'
        # Mandatory fields: a failed .group() raises and we return None.
        problem.title = re.search(r'color:#1A5CC8\'>([\s\S]*?)</h1>',
                                  website_data.text).group(1)
        problem.time_limit = re.search(r'(\d* MS)',
                                       website_data.text).group(1)
        problem.memory_limit = re.search(r'/(\d* K)',
                                         website_data.text).group(1)
        problem.special_judge = re.search(r'color=red>Special Judge</font>',
                                          website_data.text) is not None
        problem.description = re.search(
            r'>Problem Description</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
            website_data.text).group(1)
        problem.input = re.search(
            r'>Input</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
            website_data.text).group(1)
        problem.output = re.search(
            r'>Output</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
            website_data.text).group(1)
        # Optional sample I/O; strip the leading <pre><div...> wrapper.
        input_data = ''
        match_group = re.search(
            r'>Sample Input</div>[\s\S]*?panel_content>([\s\S]*?)</div',
            website_data.text)
        if match_group:
            input_data = re.search(r'(<pre><div[\s\S]*?>)?([\s\S]*)',
                                   match_group.group(1)).group(2)
        output_data = ''
        match_group = re.search(
            r'>Sample Output</div>[\s\S]*?panel_content>([\s\S]*?)</div',
            website_data.text)
        if match_group:
            output_data = re.search(r'(<pre><div[\s\S]*?>)?([\s\S]*)',
                                    match_group.group(1)).group(2)
            # Trailing <div> (e.g. hint block) is cut off the sample output.
            if re.search('<div', output_data):
                output_data = re.search(r'([\s\S]*?)<div',
                                        output_data).group(1)
        problem.sample = [{'input': input_data, 'output': output_data}]
        # Optional author/hint sections.
        match_group = re.search(
            r'>Author</div>[\s\S]*?panel_content>([\s\S]*?)</div>',
            website_data.text)
        if match_group:
            problem.author = match_group.group(1)
        match_group = re.search(r'<i>Hint</i>[\s\S]*?/div>[\s]*([\s\S]+?)</div>',
                                website_data.text)
        if match_group:
            problem.hint = match_group.group(1)
    except Exception:
        # Fix: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt. Any scrape failure means "no problem".
        return None
    return problem
def problem_parse(self, response, pid, url):
    """Parse an Aizu problem (JSON API payload) into a Problem object.

    response -- HTTP response whose body is JSON containing 'html',
                'time_limit' and 'memory_limit' keys (may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_oj = 'Aizu'
    problem.remote_url = url
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    # The Aizu API answers 401/404 for unknown problems.
    if status_code in [401, 404]:
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    elif status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    site_data = json.loads(website_data)
    soup = BeautifulSoup(site_data.get('html'), 'lxml')
    problem.title = str(soup.find('h1').get_text())
    problem.time_limit = str(site_data.get('time_limit')) + ' sec'
    problem.memory_limit = str(site_data.get('memory_limit')) + ' KB'
    problem.special_judge = False
    problem.html = ''
    # Keep only the statement-relevant tags, marking <h2> as section titles
    # and everything else as content.
    for tag in soup.body:
        # Fix: isinstance instead of `type(tag) == element.Tag` (idiomatic,
        # robust to Tag subclasses).
        if isinstance(tag, element.Tag) and tag.name in [
                'p', 'h2', 'pre', 'center'
        ]:
            if not tag.get('class'):
                tag['class'] = ()
            if tag.name == 'h2':
                tag['style'] = HtmlTag.TagStyle.TITLE.value
                tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
            else:
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
                tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
            problem.html += str(
                HtmlTag.update_tag(tag, self._static_prefix))
    problem.html += self._script
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Parse a POJ problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'POJ'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    if status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('Can not find problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    soup = BeautifulSoup(website_data, 'lxml')
    # Header metadata is scraped with regexes off the raw HTML.
    match_groups = re.search(r'ptt" lang="en-US">([\s\S]*?)</div>',
                             website_data)
    if match_groups:
        problem.title = match_groups.group(1)
    match_groups = re.search(r'(\d*MS)', website_data)
    if match_groups:
        problem.time_limit = match_groups.group(1)
    match_groups = re.search(r'Memory Limit:</b> ([\s\S]*?)</td>',
                             website_data)
    if match_groups:
        problem.memory_limit = match_groups.group(1)
    problem.special_judge = re.search(r'red;">Special Judge</td>',
                                      website_data) is not None
    problem.html = ''
    # Walk the siblings after the title div, keeping only the ptx/pst/sio
    # statement blocks; pst blocks are section titles, the rest content.
    for tag in soup.find('div', attrs={'class': 'ptt'}).next_siblings:
        # Fix 1: isinstance instead of `type(tag) == Tag`.
        # Fix 2: `tag.get('class')` returns None for class-less tags, and
        # `set(None)` raised TypeError — guard with `or ()`.
        if isinstance(tag, Tag) and set(tag.get('class') or ()).intersection(
                {'ptx', 'pst', 'sio'}):
            if set(tag['class']).intersection({'pst', }):
                tag['style'] = HtmlTag.TagStyle.TITLE.value
                tag['class'] += (HtmlTag.TagDesc.TITLE.value,)
            else:
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
                tag['class'] += (HtmlTag.TagDesc.CONTENT.value,)
            problem.html += str(HtmlTag.update_tag(tag, self._static_prefix))
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Parse a ZOJ problem page into a Problem object.

    response -- HTTP response for the problem page (requests-style; may be None)
    pid      -- remote problem id
    url      -- remote problem URL
    Returns a Problem whose .status reflects the crawl outcome.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'ZOJ'
    if not response:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    if status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No such problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    soup = BeautifulSoup(website_data, 'lxml')
    problem.title = str(
        soup.find('span', attrs={
            'class': 'bigProblemTitle'
        }).get_text())
    # Time/memory limits are scraped with regexes off the raw HTML.
    match_groups = re.search(r'(\d* Second)', website_data)
    if match_groups:
        problem.time_limit = match_groups.group(1)
    match_groups = re.search(r'(\d* KB)', website_data)
    if match_groups:
        problem.memory_limit = match_groups.group(1)
    problem.special_judge = re.search(
        r'<font color="blue">Special Judge</font>', website_data) is not None
    problem.html = ''
    problem.html += self._script
    raw_html = soup.find('div', attrs={'id': 'content_body'})
    for tag in raw_html.children:
        # Bare text nodes are passed through unchanged.
        if type(tag) == element.NavigableString:
            problem.html += str(tag)
        # Keep all tags except layout separators (<center>, <hr>).
        if type(tag) == element.Tag and tag.name not in ['center', 'hr']:
            # Skip the boilerplate link to the sample-format FAQ.
            if tag.name == 'a' and tag.get(
                    'href') == '/onlinejudge/faq.do#sample':
                continue
            if tag.name == 'h2':
                # Section headers become TITLE-styled.
                tag['style'] = HtmlTag.TagStyle.TITLE.value
                if tag.get('class'):
                    tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
                else:
                    tag['class'] = (HtmlTag.TagDesc.TITLE.value, )
            elif tag.name == 'p' and tag.b and tag.b.string in [
                    'Input', 'Output', 'Sample Input', 'Sample Output'
            ]:
                # <p><b>Input</b>...</p> style headers: style the <b> child.
                # NOTE(review): the condition checks tag.get('class') but
                # mutates tag.b['class'] — possibly intended to check
                # tag.b.get('class'); confirm before changing.
                tag.b['style'] = HtmlTag.TagStyle.TITLE.value
                if tag.get('class'):
                    tag.b['class'] += (HtmlTag.TagDesc.TITLE.value, )
                else:
                    tag.b['class'] = (HtmlTag.TagDesc.TITLE.value, )
            else:
                tag['style'] = HtmlTag.TagStyle.CONTENT.value
                if tag.get('class'):
                    tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
                else:
                    tag['class'] = (HtmlTag.TagDesc.CONTENT.value, )
            # update_tag mutates `tag` in place here; its return value is
            # deliberately discarded and str(tag) is appended instead.
            HtmlTag.update_tag(tag,
                               self._static_prefix)
            problem.html += str(tag)
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem