def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched WUST problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None``, the status
        code is not 200, or the page has no ``div.rich_text`` container;
        ``STATUS_PROBLEM_NOT_EXIST`` when the page contains
        "Problem is not Available"; ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'WUST'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('Problem is not Available', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    # Title and limits are scraped from the raw markup with regexes; a field
    # is simply left untouched when its pattern does not match.
    found = re.search(r'[\d]{4,}: ([\s\S]*?)</h2>', website_data)
    if found:
        problem.title = found.group(1)
    found = re.search(r'(\d* Sec)', website_data)
    if found:
        problem.time_limit = found.group(1)
    found = re.search(r'(\d* MB)', website_data)
    if found:
        problem.memory_limit = found.group(1)
    problem.special_judge = re.search(r'class=red>Special Judge</span>',
                                      website_data) is not None

    soup = BeautifulSoup(website_data, 'lxml')
    problem.html = ''
    container = soup.find('div', attrs={'class': 'rich_text'})
    if container is None:
        # find() returns None when nothing matches; report a failed crawl
        # instead of raising AttributeError on ``.children``.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    fragments = []
    for tag in container.children:
        # Only direct <h2>/<div> children contribute to the statement.
        if not isinstance(tag, element.Tag) or tag.name not in ('h2', 'div'):
            continue
        if not tag.get('class'):
            tag['class'] = ()
        if tag.name == 'h2':
            # Section headings: drop any nested <div>/<img> before styling.
            if tag.div:
                tag.div.decompose()
            if tag.img:
                tag.img.decompose()
            style = HtmlTag.TagStyle.TITLE.value
            desc = HtmlTag.TagDesc.TITLE.value
        else:
            style = HtmlTag.TagStyle.CONTENT.value
            desc = HtmlTag.TagDesc.CONTENT.value
        # 'style' is assigned before the class is extended, as before, so the
        # serialised attribute order is unchanged.
        tag['style'] = style
        tag['class'] += (desc, )
        # NOTE(review): update_tag is defined elsewhere; its return value is
        # what gets serialised into the statement.
        fragments.append(
            str(
                HtmlTag.update_tag(tag,
                                   self._static_prefix,
                                   update_style=style)))
    problem.html = '<body>' + ''.join(fragments) + '</body>'
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched Codeforces problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None`` or the status
        code is neither 200 nor 302; ``STATUS_PROBLEM_NOT_EXIST`` when the
        status code is 302 or ``response.text`` is ``None``;
        ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_oj = 'Codeforces'
    problem.remote_id = pid
    problem.remote_url = url
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if response.status_code == 302:
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website = response.text
    if website is None:
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    soup = BeautifulSoup(website, 'lxml')

    title_tag = soup.find('div', attrs={'class': 'title'})
    if title_tag:
        raw_title = title_tag.string
        if raw_title is None:
            # ``.string`` is None when the div has several children; fall
            # back to the full text so the title never becomes 'ne'
            # (i.e. str(None)[2:]).
            raw_title = title_tag.get_text()
        # The first two characters are dropped, as before.
        # NOTE(review): presumably the problem-index prefix - confirm.
        problem.title = str(raw_title)[2:]

    # The limit value is the last child of its div. It is converted to a
    # plain str so the Problem does not hold a NavigableString that keeps the
    # whole parse tree alive.
    limit_tag = soup.find(name='div', attrs={'class': 'time-limit'})
    if limit_tag and limit_tag.contents:
        problem.time_limit = str(limit_tag.contents[-1])
    limit_tag = soup.find(name='div', attrs={'class': 'memory-limit'})
    if limit_tag and limit_tag.contents:
        problem.memory_limit = str(limit_tag.contents[-1])

    statement = soup.find(name='div', attrs={'class': 'problem-statement'})
    fragments = []
    if statement and isinstance(statement, element.Tag):
        for child in statement.children:
            if isinstance(child, element.Tag):
                if child.get('class') and set(
                        child['class']).intersection({'header'}):
                    # The header block is left out of the statement HTML.
                    continue
                # Tag each nested element as a section title or as content.
                for tag in child:
                    if not isinstance(tag, element.Tag):
                        continue
                    if tag.get('class') is None:
                        tag['class'] = ()
                    if tag.get('class') and set(
                            tag['class']).intersection({'section-title'}):
                        tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
                        tag['style'] = HtmlTag.TagStyle.TITLE.value
                    else:
                        tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
                        tag['style'] = HtmlTag.TagStyle.CONTENT.value
            # Non-tag children (text nodes) are passed through update_tag
            # too, exactly as the tag children are.
            fragments.append(
                str(HtmlTag.update_tag(child, self._static_prefix)))
    problem.html = '<html>' + ''.join(fragments) + self._script + '</html>'
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched FZU problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None``, the status
        code is not 200, or the page has no ``div.problem_content``;
        ``STATUS_PROBLEM_NOT_EXIST`` when the page contains
        "No Such Problem!"; ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'FZU'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No Such Problem!', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    soup = BeautifulSoup(website_data, 'lxml')

    # Metadata comes from the raw markup; unmatched fields stay untouched.
    found = re.search(r'<b> Problem [\d]* ([\s\S]*?)</b>', website_data)
    if found:
        problem.title = found.group(1)
    found = re.search(r'(\d* mSec)', website_data)
    if found:
        problem.time_limit = found.group(1)
    found = re.search(r'(\d* KB)', website_data)
    if found:
        problem.memory_limit = found.group(1)
    problem.special_judge = re.search(
        r'<font color="blue">Special Judge</font>',
        website_data) is not None

    problem.html = ''
    container = soup.find('div', attrs={'class': 'problem_content'})
    if container is None:
        # find() returns None when nothing matches; report a failed crawl
        # instead of raising AttributeError on ``.children``.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    fragments = []
    for tag in container.children:
        # <h2> children are headings, <div> children are content; any other
        # child is forwarded to update_tag unmodified.
        if tag.name == 'h2':
            if tag.img:
                tag.img.decompose()
            desc = HtmlTag.TagDesc.TITLE.value
            style = HtmlTag.TagStyle.TITLE.value
        elif tag.name == 'div':
            desc = HtmlTag.TagDesc.CONTENT.value
            style = HtmlTag.TagStyle.CONTENT.value
        else:
            desc = style = None
        if desc is not None:
            if not tag.get('class'):
                tag['class'] = (desc, )
            else:
                tag['class'] += (desc, )
            tag['style'] = style
        fragments.append(str(HtmlTag.update_tag(tag, self._static_prefix)))
    problem.html = ('<body>' + self._global_style + ''.join(fragments) +
                    '</body>')
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched HDU problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None``, the status
        code is not 200, or the page has no ``<h1>`` to anchor on;
        ``STATUS_PROBLEM_NOT_EXIST`` when the page contains
        "No such problem"; ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'HDU'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No such problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    soup = BeautifulSoup(website_data, 'lxml')

    # Metadata comes from the raw markup; unmatched fields stay untouched.
    found = re.search(r'color:#1A5CC8\'>([\s\S]*?)</h1>', website_data)
    if found:
        problem.title = found.group(1)
    found = re.search(r'(\d* MS)', website_data)
    if found:
        problem.time_limit = found.group(1)
    found = re.search(r'/(\d* K)', website_data)
    if found:
        problem.memory_limit = found.group(1)
    problem.special_judge = re.search(r'color=red>Special Judge</font>',
                                      website_data) is not None

    problem.html = ''
    heading = soup.find('h1')
    if heading is None:
        # The statement panels are located as siblings of the <h1>; without
        # it there is nothing to walk, so report a failed crawl instead of
        # raising AttributeError.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    panel_classes = {'panel_title', 'panel_content', 'panel_bottom'}
    fragments = []
    for tag in heading.parent.children:
        if not isinstance(tag, element.Tag):
            continue
        classes = tag.get('class')
        if not classes or not panel_classes.intersection(classes):
            continue
        if {'panel_title'}.intersection(classes):
            tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
            tag['style'] = HtmlTag.TagStyle.TITLE.value
        else:
            tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
            tag['style'] = HtmlTag.TagStyle.CONTENT.value
        fragments.append(str(HtmlTag.update_tag(tag, self._static_prefix)))
    problem.html = ''.join(fragments) + self._script
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched Aizu problem description.

    The response body is parsed as JSON; its ``html``, ``time_limit`` and
    ``memory_limit`` entries are read.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None``, the status
        code is anything other than 200/401/404, the body is not a JSON
        object, or it carries no ``html``;
        ``STATUS_PROBLEM_NOT_EXIST`` when the status code is 401 or 404;
        ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_oj = 'Aizu'
    problem.remote_url = url
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    status_code = response.status_code
    if status_code in [401, 404]:
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem
    if status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    try:
        site_data = json.loads(website_data)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; a body that is not
        # JSON is treated as a failed crawl rather than raised.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    statement_html = (site_data.get('html')
                      if isinstance(site_data, dict) else None)
    if not statement_html:
        # Without statement markup there is nothing to parse.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    soup = BeautifulSoup(statement_html, 'lxml')
    heading = soup.find('h1')
    if heading is not None:
        problem.title = str(heading.get_text())
    problem.time_limit = str(site_data.get('time_limit')) + ' sec'
    problem.memory_limit = str(site_data.get('memory_limit')) + ' KB'
    problem.special_judge = False

    fragments = []
    # ``soup.body`` can be None for markup with no element content, so fall
    # back to an empty iteration.
    for tag in (soup.body or ()):
        if not isinstance(tag, element.Tag):
            continue
        if tag.name not in ('p', 'h2', 'pre', 'center'):
            continue
        if not tag.get('class'):
            tag['class'] = ()
        if tag.name == 'h2':
            tag['style'] = HtmlTag.TagStyle.TITLE.value
            tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
        else:
            tag['style'] = HtmlTag.TagStyle.CONTENT.value
            tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
        fragments.append(str(HtmlTag.update_tag(tag, self._static_prefix)))
    problem.html = ''.join(fragments) + self._script
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched POJ problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``, or ``None``
            when no response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is ``None``, the status
        code is not 200, or the page has no ``div.ptt`` title block;
        ``STATUS_PROBLEM_NOT_EXIST`` when the page contains
        "Can not find problem"; ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'POJ'
    if response is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('Can not find problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    soup = BeautifulSoup(website_data, 'lxml')

    # Metadata comes from the raw markup; unmatched fields stay untouched.
    found = re.search(r'ptt" lang="en-US">([\s\S]*?)</div>', website_data)
    if found:
        problem.title = found.group(1)
    found = re.search(r'(\d*MS)', website_data)
    if found:
        problem.time_limit = found.group(1)
    found = re.search(r'Memory Limit:</b> ([\s\S]*?)</td>', website_data)
    if found:
        problem.memory_limit = found.group(1)
    problem.special_judge = re.search(r'red;">Special Judge</td>',
                                      website_data) is not None

    problem.html = ''
    title_block = soup.find('div', attrs={'class': 'ptt'})
    if title_block is None:
        # The statement is read from the siblings following the title div;
        # without it, report a failed crawl instead of raising
        # AttributeError.
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    statement_classes = {'ptx', 'pst', 'sio'}
    fragments = []
    for tag in title_block.next_siblings:
        if not isinstance(tag, Tag):
            continue
        # Siblings without a class attribute return None from get(); the
        # previous ``set(None)`` raised TypeError on them.
        classes = tag.get('class') or ()
        if not statement_classes.intersection(classes):
            continue
        if {'pst'}.intersection(classes):
            tag['style'] = HtmlTag.TagStyle.TITLE.value
            tag['class'] += (HtmlTag.TagDesc.TITLE.value, )
        else:
            tag['style'] = HtmlTag.TagStyle.CONTENT.value
            tag['class'] += (HtmlTag.TagDesc.CONTENT.value, )
        fragments.append(str(HtmlTag.update_tag(tag, self._static_prefix)))
    problem.html = ''.join(fragments)
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem
def problem_parse(self, response, pid, url):
    """Build a ``Problem`` from a fetched ZOJ problem page.

    Args:
        response: Object exposing ``text`` and ``status_code``; a falsy
            value means no usable response is available.
        pid: Remote problem id; copied to ``remote_id``.
        url: Remote problem URL; copied to ``remote_url``.

    Returns:
        The ``Problem``, whose ``status`` is
        ``STATUS_SUBMIT_FAILED`` when ``response`` is falsy, the status
        code is not 200, or the page has no ``#content_body`` element;
        ``STATUS_PROBLEM_NOT_EXIST`` when the page contains
        "No such problem"; ``STATUS_CRAWLING_SUCCESS`` otherwise.
    """
    problem = Problem()
    problem.remote_id = pid
    problem.remote_url = url
    problem.remote_oj = 'ZOJ'
    if not response:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    website_data = response.text
    if response.status_code != 200:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem
    if re.search('No such problem', website_data):
        problem.status = Problem.Status.STATUS_PROBLEM_NOT_EXIST
        return problem

    soup = BeautifulSoup(website_data, 'lxml')
    title_tag = soup.find('span', attrs={'class': 'bigProblemTitle'})
    if title_tag is not None:
        # find() returns None when the span is absent; leave the title
        # untouched in that case instead of raising AttributeError.
        problem.title = str(title_tag.get_text())
    found = re.search(r'(\d* Second)', website_data)
    if found:
        problem.time_limit = found.group(1)
    found = re.search(r'(\d* KB)', website_data)
    if found:
        problem.memory_limit = found.group(1)
    problem.special_judge = re.search(
        r'<font color="blue">Special Judge</font>',
        website_data) is not None

    problem.html = ''
    problem.html += self._script
    raw_html = soup.find('div', attrs={'id': 'content_body'})
    if raw_html is None:
        problem.status = Problem.Status.STATUS_SUBMIT_FAILED
        return problem

    title_desc = HtmlTag.TagDesc.TITLE.value
    content_desc = HtmlTag.TagDesc.CONTENT.value
    section_names = ['Input', 'Output', 'Sample Input', 'Sample Output']
    fragments = []
    for tag in raw_html.children:
        # Exact type test on purpose: NavigableString subclasses (comments
        # and the like) must not be copied into the statement as text.
        if type(tag) is element.NavigableString:
            fragments.append(str(tag))
            continue
        if not isinstance(tag, element.Tag) or tag.name in ('center', 'hr'):
            continue
        if tag.name == 'a' and tag.get(
                'href') == '/onlinejudge/faq.do#sample':
            continue
        if tag.name == 'h2':
            tag['style'] = HtmlTag.TagStyle.TITLE.value
            if tag.get('class'):
                tag['class'] += (title_desc, )
            else:
                tag['class'] = (title_desc, )
        elif tag.name == 'p' and tag.b and tag.b.string in section_names:
            # A paragraph whose bold child names a section: style the <b>
            # itself as a title. The class test must look at the <b>, not
            # the <p>; testing the <p> raised KeyError when only the <p>
            # had a class.
            heading = tag.b
            heading['style'] = HtmlTag.TagStyle.TITLE.value
            if heading.get('class'):
                heading['class'] += (title_desc, )
            else:
                heading['class'] = (title_desc, )
        else:
            tag['style'] = HtmlTag.TagStyle.CONTENT.value
            if tag.get('class'):
                tag['class'] += (content_desc, )
            else:
                tag['class'] = (content_desc, )
        # NOTE(review): the return value of update_tag is ignored here, so
        # this relies on it modifying ``tag`` in place - confirm.
        HtmlTag.update_tag(tag, self._static_prefix)
        fragments.append(str(tag))
    problem.html += ''.join(fragments)
    problem.status = Problem.Status.STATUS_CRAWLING_SUCCESS
    return problem