def scrape_submissions(**query_args):
    """
    Scrapes submissions from the Timus status page.
    Extra keyword arguments are forwarded as query-string parameters.
    :return: a generator of submission objects, in dict format
    """
    url = "https://acm.timus.ru/status.aspx"
    response = get_page(
        url,
        space=query_args.pop('space', 1),
        count=query_args.pop('count', 1000),
        **query_args,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.select_one("table.status")

    for row in table.select("tr.odd, tr.even"):
        cells = row.select('td')
        assert len(cells) == 9

        submission = dict(
            judge_id=TIMUS_JUDGE_ID,
            submission_id=cells[0].text,
            author_id=cells[2].find("a", href=True)['href'].split('id=')[-1],
            task_id=str(int(cells[3].text.split('.')[0])),
            submitted_on=parse_date(cells[1].contents),
            verdict=parse_verdict(cells[5].text),
            language=cells[4].text,
        )
        if cells[7].text:
            submission['time_exec'] = parse_time_exec(cells[7].text)
        if cells[8].text:
            submission['memory_used'] = parse_memory_used(cells[8].text)
        yield submission
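# A hedged usage sketch for scrape_submissions above: extra keyword arguments are
# simply forwarded to the Timus status page as query-string parameters, so the call
# below only relies on the 'count' default handled inside the function itself.
def _example_scrape_timus_submissions():
    for submission in scrape_submissions(count=100):
        print(submission['submission_id'], submission['verdict'])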
def get_submission_count(**query_dict):
    page_url = "https://www.infoarena.ro/monitor"
    page = get_page(page_url, **query_dict)
    soup = BeautifulSoup(page.content, 'html.parser')

    submission_count_text = soup.select_one('#monitor-table .pager .count').text
    submission_count = parsers.parse_submission_count(submission_count_text)
    return submission_count
def scrape_user_info(handle, default_avatar):
    """
    Scrapes user information for a given handle.
    :param handle: the handle of the infoarena user
    :param default_avatar: obtainable from get_default_avatar()
    :return: user information, in dict format
    """
    page_url = "https://www.infoarena.ro/utilizator/" + handle
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.select_one('table.compact')
    cells = list(table.select('td'))

    user_info = {
        'judge_id': INFOARENA_JUDGE_ID,
        'handle': handle.lower(),
        'rating': int(cells[3].text),
    }

    # FIXME: This may not be right!?
    full_name = cells[1].text
    if len(full_name.split()) == 1:
        user_info.update({
            'first_name': full_name,
            'last_name': None,
        })
    else:
        first_name, last_name = full_name.rsplit(' ', 1)
        user_info.update({
            'first_name': first_name,
            'last_name': last_name,
        })

    avatar_url = cells[0].find("a", href=True)['href']
    if avatar_url.lower() != f'/avatar/full/{handle.lower()}':
        raise Exception('Avatar url is not as expected.')

    user_avatar = get_page(__get_avatar_url(handle)).content
    if user_avatar == default_avatar:
        user_info['photo_url'] = None
    else:
        user_info['photo_url'] = __get_avatar_url(handle)

    return user_info
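# __get_avatar_url is used above and in get_default_avatar below, but is not defined in
# this file. A minimal sketch, assuming it simply builds the absolute URL of the
# full-size avatar that scrape_user_info checks for ('/avatar/full/<handle>'):
def __get_avatar_url(handle):
    # Absolute URL of the full-size avatar for an infoarena handle.
    return f"https://www.infoarena.ro/avatar/full/{handle.lower()}"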
def __scrape_paginated_table_rows(page_url, from_page, to_page,
                                  table_css_selector, **query_dict):
    for page_id in range(from_page, to_page + 1):
        page = get_page(page_url, **query_dict, page=page_id)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select_one(table_css_selector)

        rows = table.find_all("tr")[1:]
        if len(rows) == 0:
            break

        for row in rows:
            yield row.find_all("td")
def scrape_task_info(task_id, space=1):
    """
    Scrapes task information for a given Timus task id.
    :return: task information, in dict format
    """
    url = "https://acm.timus.ru/problem.aspx"
    response = get_page(url, num=task_id, space=space)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.select_one("h2.problem_title").text.split('.', 1)[1].strip()
    contents = soup.select_one("div.problem_limits").contents
    tl, ml = contents[0], contents[2]

    return {
        'judge_id': TIMUS_JUDGE_ID,
        'task_id': task_id,
        'title': title,
        'time_limit': parse_time_limit(tl),
        'memory_limit': parse_memory_limit(ml),
    }
def _api_get(api_method: str, kwargs) -> Any:
    page_url = f"https://codeforces.com/api/{api_method}"
    try:
        response = get_page(page_url, **kwargs)
    except Exception as ex:
        log.error(f"GET request got exception: {ex}")
        return []

    json_data = response.json()
    status = json_data['status']
    if status != 'OK':
        log.error(f"Codeforces API error "
                  f"(expected status: 'OK' got: '{status}', "
                  f"message: '{json_data.get('comment')}')")
        return []
    return json_data['result']
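# A hedged usage sketch for _api_get: 'user.status' with 'handle' and 'count' is a real
# Codeforces API method; the concrete handle is only illustrative.
def _example_codeforces_user_status():
    # Returns the user's 10 most recent submissions, or [] on any error.
    return _api_get("user.status", {"handle": "tourist", "count": 10})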
def scrape_task_info(task_id):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task
    :return: task information, in dict format
    """
    page_url = "https://oj.uz/problem/view/" + task_id
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    title_div = soup.select_one('.problem-title')
    title = title_div.find('h1').text.lstrip().split('\n')[0].rstrip()
    cols = title_div.parent.select_one('.table-responsive').find_all('td')

    return {
        'judge_id': OJUZ_JUDGE_ID,
        'task_id': task_id.lower(),
        'title': title,
        'time_limit': parsers.parse_time_limit(cols[0].text),
        'memory_limit': parsers.parse_memory_limit(cols[1].text),
    }
def process(url):
    log.info(f"Processing {url}")
    response = get_page(f"https://codeforces.com{url}", max_retries=1)
    soup = BeautifulSoup(response.content, "html.parser")

    for a in soup.find_all('a', href=True):
        # Normalize the link to a site-relative path and skip anchors,
        # query strings and external links.
        href = a['href']
        href = href.split('codeforces.com')[-1].rstrip('/')
        if '#' in href or '?' in href or href.startswith('http'):
            continue

        out.write(f"{url} {href}\n")
        if '/problem/' in href:
            log.info(f"Found edge: {url}->{href}")

        if href in seen:
            continue

        # Treat profile pages as their blog page and only enqueue blog links.
        href = href.replace('profile', 'blog')
        if 'blog' in href:
            seen.add(href)
            queue.append(href)
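# process() relies on module-level state (seen, queue, out) that is not shown in this
# file. A minimal driver sketch under that assumption: a breadth-first crawl seeded
# with a site-relative path, writing the discovered edges to stdout.
import sys
from collections import deque

seen = set()
queue = deque()
out = sys.stdout  # the real crawler presumably writes to a file instead

def crawl(seed="/problemset", limit=100):
    # Process at most `limit` pages, breadth-first, starting from `seed`.
    queue.append(seed)
    processed = 0
    while queue and processed < limit:
        process(queue.popleft())
        processed += 1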
def __scrape_paginated_table_rows(page_url, from_page, to_page, results_per_page,
                                  table_css_selector, **query_dict):
    if results_per_page > 250:
        raise Exception(
            "Infoarena does not support more than 250 results per page")

    for page_id in range(from_page, to_page + 1):
        first_entry = results_per_page * (page_id - 1)
        page = get_page(page_url, **query_dict,
                        first_entry=first_entry,
                        display_entries=results_per_page)
        soup = BeautifulSoup(page.content, 'html.parser')

        table = soup.select_one(table_css_selector)
        if table is None:
            break
        # "Nici o solutie" is Romanian for "no solution": the page has no results.
        if "Nici o solutie" in table.text:
            break

        rows = table.find_all("tr")[1:]
        for row in rows:
            yield row.find_all("td")
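# A hedged usage sketch for the paginated helper above: fetch the first monitor pages
# for a single task. The 'task' filter matches what scrape_task_info passes to
# get_submission_count below; the '#monitor-table table' selector is an assumption
# about the monitor page's markup.
def _example_monitor_rows(task_id):
    return list(
        __scrape_paginated_table_rows("https://www.infoarena.ro/monitor",
                                      from_page=1,
                                      to_page=3,
                                      results_per_page=250,
                                      table_css_selector="#monitor-table table",
                                      task=task_id))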
def scrape_past_contest_ids():
    """
    Scrapes all past (finished) contests.
    :return: a list of contest ids (e.g. ['arc092', 'agc003'])
    """
    page_url = "https://atcoder.jp/contest/archive"
    contest_ids = set()
    for page_id in range(1, PAGE_LIMIT):
        page = get_page(page_url, p=page_id)
        soup = BeautifulSoup(page.content, 'html.parser')

        old_len = len(contest_ids)
        table = soup.find('table')
        for a in table.find_all('a', href=True):
            contest_url = a['href']
            contest_id = __parse_contest_id(contest_url)
            if contest_id is not None:
                contest_ids.add(contest_id)

        if old_len == len(contest_ids):
            break
    return list(contest_ids)
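# __parse_contest_id is used above but not defined in this file. A minimal sketch,
# assuming contest links on the archive page look like '/contests/agc003' (the exact
# URL shape is an assumption): return the last path component, or None otherwise.
def __parse_contest_id(contest_url):
    parts = contest_url.rstrip('/').split('/')
    if len(parts) >= 2 and parts[-2] == 'contests':
        return parts[-1]
    return None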
def scrape_task_info(task_id: str):
    """
    Scrapes the task info for a given task.
    :param task_id: the id of the task (e.g. 'agc003/agc003_a')
    :return: a task info object
    """
    contest_id, task_id = task_id.split('/')
    page_url = f"https://atcoder.jp/contests/{contest_id}/tasks/{task_id}"
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    main_div = soup.select_one('span.h2').parent
    time_limit_text, memory_limit_text = map(
        str.strip, main_div.select_one('p').text.split('/'))

    task_info = {
        'judge_id': ATCODER_JUDGE_ID,
        'task_id': "/".join([contest_id, task_id]).lower(),
        'title': parse_title(main_div.select_one('span.h2').text),
        'time_limit': parse_time_limit(time_limit_text),
        'memory_limit': parse_memory_limit(memory_limit_text),
        'tags': [],
        'source': soup.select_one('.contest-title').text,
    }
    return task_info
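# A hedged usage sketch for the AtCoder task scraper above, using the task id format
# from its docstring ('<contest_id>/<task_id>'):
def _example_atcoder_task_info():
    info = scrape_task_info('agc003/agc003_a')
    print(info['title'], info['time_limit'], info['memory_limit'])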
def scrape_task_statement(task_id: str):
    """
    Scrapes the statement of an infoarena task and converts it to markdown.
    :return: a dict with the markdown statement and the parsed examples
    """
    response = get_page(f"https://www.infoarena.ro/problema/{task_id}")
    soup = BeautifulSoup(response.text, 'html.parser')
    text_block = soup.select_one("#main > .wiki_text_block")

    # Keep only the content between the first <h1> and the examples section
    # ("Exempl" matches the Romanian "Exemple" heading).
    found_h1 = False
    text_lines = []
    html = ""
    for child in text_block.find_all(recursive=False):
        if child.name == 'h1':
            found_h1 = True
            continue
        if not found_h1:
            continue
        if "Exempl" in child.text:
            break
        text_lines.append(child.text)
        html += str(child)

    html = html\
        .replace('`', '\'')\
        .replace('<var>', '<code>')\
        .replace('</var>', '</code>')\
        .replace('<h2>', '<h3>')\
        .replace('</h2>', '</h3>')

    def remove_text(match):
        # Heuristic: if a <code> span is mostly plain words, unwrap the words and
        # keep <code> only around the non-word tokens.
        if len(match.split()) <= 6:
            return None

        ret = []
        found_words = 0
        word_streak = 0
        for token in match.split():
            is_word = False
            stripped = __strip_accents(token)
            for c in string.punctuation:
                stripped = stripped.rstrip(c)
            stripped = "".join([c for c in stripped if c != '-'])

            if (len(stripped) > 1 and stripped.isalpha()) or \
                    (stripped.lower() in ['o', 'a', 'e'] and word_streak >= 2):
                is_word = True
                found_words += len(stripped)

            if not is_word:
                ret.append(f"<code>{token}</code>")
                word_streak = 0
            else:
                ret.append(token)
                word_streak += 1

        if found_words <= 12:
            return None
        return " ".join(ret)

    for match in set(re.findall(r'<code>(.*?)<\/code>', html)):
        if '.in' in match or '.out' in match:
            if '<' in match:
                repl = re.sub(r"<[^>]*>", "", match)
                html = html.replace(match, repl)
            continue
        replace = remove_text(match)
        if replace:
            html = html.replace(f"<code>{match}</code>", f"{replace}")

    # Convert the remaining <code> spans to inline LaTeX.
    for match in set(re.findall(r'<code>(.*?)<\/code>', html)):
        if '.in' in match or '.out' in match:
            continue
        for c in ['\'', '\"']:
            if match.startswith(c) and match.endswith(c):
                continue

        latex = match
        latex = latex.replace('&lt;', '<').replace('&gt;', '>')
        for c in "&%$#_{}":
            latex = latex.replace(c, '\\' + c)
        latex = latex.replace('\\\'', '\'') \
            .replace('<sub>', '_{') \
            .replace('</sub>', '}') \
            .replace('<sup>', '^{') \
            .replace('</sup>', '}') \
            .replace('<b>', '\\textbf{') \
            .replace('</b>', '}') \
            .replace(' * ', ' \\cdot ') \
            .replace('<i>', '\\textit{') \
            .replace('</i>', '}')
        latex = re.sub('<[^>]+>', '', latex)

        # Protect already-braced groups while wrapping bare words in \text{}.
        sub = {}
        for idx, occ in enumerate(set(re.findall(r"\{([^\}]*)\}", latex))):
            nidx = f"__{idx}__"
            latex = latex.replace("{" + occ + "}", "{" + nidx + "}")
            sub[nidx] = occ
        latex = re.sub(r'([^a-zA-Z\\]|^)([a-zA-Z][a-zA-Z]*[a-zA-Z])',
                       r"\g<1>\\text{\g<2>}", latex)
        latex = re.sub(r"\s+", ' ', latex)
        latex = latex.replace(" ", "$</code> <code>$")
        for nidx, occ in sub.items():
            latex = latex.replace("{" + nidx + "}", "{" + occ + "}")

        html = html.replace(f"<code>{match}</code>", f"<code>${latex}$</code>")

    html = re.sub(r"([(\[])<code>\$([^<]+)\$</code>([)\]])",
                  r"<code>$\g<1>\g<2>\g<3>$</code>", html)

    examples = []
    for table in soup.select("table.example"):
        for tr in table.select("tr"):
            tds = list(tr.select("td"))
            if len(tds) == 0:
                continue
            if len(tds) == 1:
                log.critical(
                    f"Failed parsing example for task: {task_id} -- odd number of tds"
                )
                continue
            examples.append({
                "input": tds[0].text,
                "output": tds[1].text,
            })

    # Replace rendered LaTeX images with their alt-text formulas.
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        if 'latex' in img['src']:
            latex_formula = img['alt']
            latex_tag = soup.new_tag('code')
            latex_tag.string = "$" + latex_formula + "$"
            img.replace_with(latex_tag)
    html = str(soup)

    md = html2text(html, bodywidth=0)
    return {
        "statement": markdown.prettify(md),
        "examples": examples,
    }
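# __strip_accents is used by remove_text above but not defined in this file. A common
# implementation based on Unicode normalization (an assumption, not necessarily the
# project's own helper):
import unicodedata

def __strip_accents(text):
    # Decompose accented characters and drop the combining marks,
    # e.g. 'numărul' -> 'numarul'.
    return "".join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')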
def search_result(request):
    query = request.GET.get('query')
    if query is None or query.strip() == '':
        return redirect('home')

    client_ip = get_client_ip(request)
    page = get_page(request.GET.get('page', 1))
    query = query.strip()
    links_only = query[-5:] == ':link' and len(query) > 5
    if links_only:
        query = query[:-5]

    # check if getting results from db is possible
    time_limit = request.session.get('time_limit', TIME_LIMIT_DEFAULT)
    time_limit_delta = timezone.now() - timedelta(minutes=time_limit)
    search_result_query = Query.objects.filter(timestamp__gte=time_limit_delta,
                                               client_ip=client_ip,
                                               search_phrase=query.lower(),
                                               page_number=page)

    # get results from db
    if search_result_query.exists():
        q = search_result_query.last()
        total_results = q.total_results
        if links_only:
            items = [{
                'link': item.link,
            } for item in q.result_items.all()]
            most_common_words = None
        else:
            items = [{
                'title': item.title,
                'link': item.link,
                'formatted_url': item.formatted_url,
                'html_snippet': mark_safe(item.html_snippet)
            } for item in q.result_items.all()]
            most_common_words = q.most_common_words.all()
    # download results from google api
    else:
        start_index = 1 if page == 1 else page * 10 - 9
        response = google_search(search_query=query, start_index=start_index)

        # check if there are results
        try:
            response['items']
        except KeyError:
            return render(request, 'scraper/no_results.html')

        # save search result to db
        q = Query.objects.create(
            client_ip=client_ip,
            search_phrase=query.lower(),
            total_results=response['searchInformation']['totalResults'],
            page_number=page)
        for item in response['items']:
            result_item, created = ResultItem.objects.get_or_create(
                title=item['title'],
                link=item['link'],
                formatted_url=item['formattedUrl'],
                html_snippet=item['htmlSnippet'])
            q.result_items.add(result_item)

        # get 10 most common words from page
        all_words = ''
        for item in response['items']:
            all_words += item['title'].lower() + item['snippet'].lower()
        most_common_words = get_most_common_words(all_words, 10)
        for word in most_common_words:
            common_word, created = Word.objects.get_or_create(text=word)
            q.most_common_words.add(common_word)
        q.save()

        total_results = response['searchInformation']['totalResults']
        if links_only:
            items = [{
                'link': item['link'],
            } for item in response['items']]
            most_common_words = None
        else:
            items = [{
                'title': item['title'],
                'link': item['link'],
                'formatted_url': item['formattedUrl'],
                'html_snippet': mark_safe(item['htmlSnippet'])
            } for item in response['items']]

    ctx = {
        'total_results': total_results,
        'items': items,
        'most_common_words': most_common_words,
        'page_no': page,
        'previous_page': page - 1 if page > 1 else None,
        'next_page': page + 1 if page < 10 else None,
        'links_only': links_only,
    }
    return render(request, 'scraper/search_result.html', ctx)
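# A hedged wiring sketch for the search_result view above: a minimal URL configuration
# that would go in the app's urls.py. The 'search/' route and the URL name are
# assumptions about the project layout; only the view itself comes from this file.
from django.urls import path

urlpatterns = [
    path('search/', search_result, name='search_result'),
]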
def get_default_avatar():
    return get_page(__get_avatar_url(USER_WITH_DEFAULT_AVATAR)).content
def scrape_task_info(task_id):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task
    :return: task information, in dict format
    """
    page_url = "https://www.infoarena.ro/problema/" + task_id
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    main_view = soup.find(id='main')
    info_table = main_view.find('table')

    title = main_view.find('h1').text.strip()
    input_file, output_file = map(
        str.strip,
        info_table.find_all('tr')[0].find_all('td')[1].text.split(','))
    time_limit = info_table.find_all('tr')[2].find_all('td')[1].text
    memory_limit = info_table.find_all('tr')[2].find_all('td')[3].text
    source = info_table.find_all('tr')[0].find_all('td')[3].text

    tags = []
    for tag_a in main_view.select('a.tag_search_anchor'):
        tag = parsers.parse_tag(tag_a.text)
        if tag is not None:
            tags.append(tag)

    task_info = {
        'judge_id': INFOARENA_JUDGE_ID,
        'task_id': task_id.lower(),
        'title': title,
        'source': source,
        'time_limit': parsers.parse_time_limit(time_limit),
        'memory_limit': parsers.parse_memory_limit(memory_limit),
        'input_file': input_file,
        'output_file': output_file,
        'tags': tags,
    }

    try:
        # Go to the monitor to find out submission count and first submission date.
        task_info.update(
            dict(total_submission_count=get_submission_count(task=task_id),
                 accepted_submission_count=get_submission_count(
                     task=task_id, score_begin=100)))

        submission_count = task_info['total_submission_count']
        if submission_count > 0:
            # A little hack to get only the very first submissions.
            first_few_submissions = list(
                scrape_submissions(from_page=max(1, submission_count // 20 - 1),
                                   results_per_page=20,
                                   task=task_id))
            if len(first_few_submissions) == 0:
                raise Exception("BUG: First few submissions are non-existent")
            first_submitted_on = min(
                [sub['submitted_on'] for sub in first_few_submissions])
            task_info['first_submitted_on'] = first_submitted_on
    except Exception as ex:
        log.warning(f"Failed to parse extra data for task {task_id}: {ex}")

    return task_info
def scrape_task_statement(task_id: str):
    """
    Scrapes the statement of a Codeforces task and converts it to markdown.
    :param task_id: the id of the task, in '<contest_id>/<letter>' form
    :return: a dict with limits, the markdown statement and the parsed examples
    """
    contest_id, task_letter = task_id.split('/')
    contest_or_gym = "gym" if int(contest_id) >= 100000 else "contest"
    response = get_page(
        f"https://codeforces.com/{contest_or_gym}/{contest_id}/problem/{task_letter}"
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    statement = soup.select_one(".problem-statement")
    result = ""

    task_info = {
        "time_limit": parse_time_limit(
            statement.select_one('.time-limit').find(text=True, recursive=False)),
        "memory_limit": parse_memory_limit(
            statement.select_one('.memory-limit').find(text=True, recursive=False)),
        "input_file": parse_filename(
            statement.select_one(".input-file").find(text=True, recursive=False)),
        "output_file": parse_filename(
            statement.select_one(".output-file").find(text=True, recursive=False)),
    }

    inputs = []
    outputs = []
    for child in statement.find_all("div", recursive=False):
        klass = child.get("class", [])
        if "header" in klass:
            continue
        if "sample-tests" in klass:
            for test in child.select('.sample-test'):
                for br in test.find_all('br'):
                    br.replace_with("\n" + br.text)
                for tag in test.select('.input pre'):
                    inputs.append(tag.text)
                for tag in test.select(".output pre"):
                    outputs.append(tag.text)
            continue
        result += str(child)

    result = f"<div>{result}</div>"
    soup = BeautifulSoup(result, 'html.parser')
    for node in soup.select(".section-title"):
        node.wrap(soup.new_tag("h3"))

    # Convert .tex-span nodes to <latex> tags holding the reconstructed formula.
    for node in soup.select(".tex-span"):
        inner_text = str(node)
        inner_text = inner_text.replace("&lt;", "<")
        inner_text = inner_text.replace("&gt;", ">")
        for c in "&%$#_{}":
            inner_text = inner_text.replace(c, '\\' + c)
        inner_text = re.sub(r"<span[^>]*>(.*)<\/span>", r"\g<1>", inner_text)
        inner_text = re.sub(r"<sub[^>]*>(.*?)<\/sub>", r"_{\g<1>}", inner_text)
        inner_text = re.sub(r"<sup[^>]*>(.*?)<\/sup>", r"^{\g<1>}", inner_text)
        inner_text = re.sub(r"<i[^>]*>(.*?)<\/i>", r"\g<1>", inner_text)

        code = soup.new_tag("latex")
        code.string = inner_text
        node.replace_with(code)

    for node in soup.select(".tex-font-style-tt"):
        code = soup.new_tag("latex")
        code.string = node.text
        node.replace_with(code)

    result = str(soup)
    latex_codes = []
    result = re.sub(r'\$\$\$(.*?)\$\$\$', r'<latex>\g<1></latex>', result)

    md = markdown.html2text(result)
    for match, repl in zip(
            reversed(list(re.finditer(r'`\[\[\[LATEX\]\]\]`', md))), latex_codes):
        b, e = match.span()
        md = md[:b] + '$' + repl + '$' + md[e:]

    examples = None
    if len(inputs) == len(outputs):
        examples = [{"input": i, "output": o} for i, o in zip(inputs, outputs)]
    else:
        log.critical(
            f"Could not parse examples for {task_id}: unequal number of inputs and outputs"
        )

    print(md)
    task_info.update({
        "statement": md,
        "examples": examples,
    })
    return task_info
def scrape_submissions_for_contest(contest_id, query_dict,
                                   from_page=1, to_page=PAGE_LIMIT):
    """
    Scrapes all the submissions for a given contest.
    :param contest_id: the id of the contest (e.g. 'agc003')
    :param from_page: the page from which to start
    :param to_page: the page at which to end
    :param query_dict: query parameters for the url (e.g. user_screen_name='tourist')
    :return: a generator of submission objects
    """
    base_url = __get_contest_url(contest_id)
    for page_id in range(from_page, to_page + 1):
        page_url = f"{base_url}submissions"
        page = get_page(page_url, page=page_id, **query_dict)
        soup = BeautifulSoup(page.content, 'html.parser')
        rows = __scrape_table_rows(soup.select_one(".panel-submission"),
                                   table_css_selector="table")

        submission_found = False
        for row in rows:
            # Convert the timezone-aware timestamp to a naive datetime.
            submitted_on = datetime.datetime.strptime(row[0].find('time').text,
                                                      "%Y-%m-%d %H:%M:%S%z")
            ts = time.mktime(submitted_on.utctimetuple())
            submitted_on = datetime.datetime.utcfromtimestamp(ts)

            submission = {
                'judge_id': ATCODER_JUDGE_ID,
                'submission_id': row[-1].find('a', href=True)['href'].split('/')[-1],
                'submitted_on': submitted_on,
                'task_id': "/".join([
                    contest_id,
                    row[1].find('a', href=True)['href'].split('/')[-1]
                ]).lower(),
                'author_id': row[2].find('a', href=True)['href'].split('/')[-1].lower(),
                'language': row[3].text,
                'source_size': int(row[5].text.split()[0]),
                'verdict': row[6].select_one('span.label').text.split()[-1],
            }
            if row[4].text != '-':
                submission['score'] = int(row[4].text)
            if len(row) == 10:
                submission.update({
                    'exec_time': int(row[7].text.split()[0]),
                    'memory_used': int(row[8].text.split()[0]),
                })
            submission_found = True
            yield submission

        if not submission_found:
            break
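# A hedged usage sketch for scrape_submissions_for_contest, using the query parameter
# named in its docstring (user_screen_name); the contest id is only illustrative.
def _example_atcoder_user_submissions():
    for submission in scrape_submissions_for_contest(
            'agc003', {'user_screen_name': 'tourist'}, to_page=2):
        print(submission['submission_id'], submission['verdict'])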