def scrape_submissions(**query_args):
    """
    Scrapes submissions from the Timus status page.
    Extra keyword arguments are forwarded as query-string parameters.
    :return: a generator of submission objects, in dict format
    """
    url = "https://acm.timus.ru/status.aspx"
    response = get_page(
        url,
        space=query_args.pop('space', 1),
        count=query_args.pop('count', 1000),
        **query_args,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.select_one("table.status")

    for row in table.select("tr.odd, tr.even"):
        cells = row.select('td')
        assert len(cells) == 9

        submission = dict(
            judge_id=TIMUS_JUDGE_ID,
            submission_id=cells[0].text,
            author_id=cells[2].find("a", href=True)['href'].split('id=')[-1],
            task_id=str(int(cells[3].text.split('.')[0])),
            submitted_on=parse_date(cells[1].contents),
            verdict=parse_verdict(cells[5].text),
            language=cells[4].text,
        )
        if cells[7].text:
            submission['time_exec'] = parse_time_exec(cells[7].text)
        if cells[8].text:
            submission['memory_used'] = parse_memory_used(cells[8].text)
        yield submission
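# A hedged usage sketch for scrape_submissions above: extra keyword arguments are
# simply forwarded to the Timus status page as query-string parameters, so the call
# below only relies on the 'count' default handled inside the function itself.
def _example_scrape_timus_submissions():
    for submission in scrape_submissions(count=100):
        print(submission['submission_id'], submission['verdict'])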
def get_submission_count(**query_dict):
    page_url = "https://www.infoarena.ro/monitor"
    page = get_page(page_url, **query_dict)
    soup = BeautifulSoup(page.content, 'html.parser')

    submission_count_text = soup.select_one('#monitor-table .pager .count').text
    submission_count = parsers.parse_submission_count(submission_count_text)
    return submission_count
def scrape_user_info(handle, default_avatar):
    """
    Scrapes user information for a given handle.
    :param handle: the handle of the infoarena user
    :param default_avatar: obtainable from get_default_avatar()
    :return: user information, in dict format
    """
    page_url = "https://www.infoarena.ro/utilizator/" + handle
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.select_one('table.compact')
    cells = list(table.select('td'))

    user_info = {
        'judge_id': INFOARENA_JUDGE_ID,
        'handle': handle.lower(),
        'rating': int(cells[3].text),
    }

    # FIXME: This may not be right!?
    full_name = cells[1].text
    if len(full_name.split()) == 1:
        user_info.update({
            'first_name': full_name,
            'last_name': None,
        })
    else:
        first_name, last_name = full_name.rsplit(' ', 1)
        user_info.update({
            'first_name': first_name,
            'last_name': last_name,
        })

    avatar_url = cells[0].find("a", href=True)['href']
    if avatar_url.lower() != f'/avatar/full/{handle.lower()}':
        raise Exception('Avatar url is not as expected.')

    user_avatar = get_page(__get_avatar_url(handle)).content
    if user_avatar == default_avatar:
        user_info['photo_url'] = None
    else:
        user_info['photo_url'] = __get_avatar_url(handle)

    return user_info
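# __get_avatar_url is used above and in get_default_avatar below, but is not defined in
# this file. A minimal sketch, assuming it simply builds the absolute URL of the
# full-size avatar that scrape_user_info checks for ('/avatar/full/<handle>'):
def __get_avatar_url(handle):
    # Absolute URL of the full-size avatar for an infoarena handle.
    return f"https://www.infoarena.ro/avatar/full/{handle.lower()}"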
def __scrape_paginated_table_rows(page_url, from_page, to_page,
                                  table_css_selector, **query_dict):
    for page_id in range(from_page, to_page + 1):
        page = get_page(page_url, **query_dict, page=page_id)
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.select_one(table_css_selector)

        rows = table.find_all("tr")[1:]
        if len(rows) == 0:
            break

        for row in rows:
            yield row.find_all("td")
def scrape_task_info(task_id, space=1):
    """
    Scrapes task information for a given Timus task id.
    :return: task information, in dict format
    """
    url = "https://acm.timus.ru/problem.aspx"
    response = get_page(url, num=task_id, space=space)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.select_one("h2.problem_title").text.split('.', 1)[1].strip()
    contents = soup.select_one("div.problem_limits").contents
    tl, ml = contents[0], contents[2]

    return {
        'judge_id': TIMUS_JUDGE_ID,
        'task_id': task_id,
        'title': title,
        'time_limit': parse_time_limit(tl),
        'memory_limit': parse_memory_limit(ml),
    }
def _api_get(api_method: str, kwargs) -> Any:
    page_url = f"https://codeforces.com/api/{api_method}"
    try:
        response = get_page(page_url, **kwargs)
    except Exception as ex:
        log.error(f"GET request got exception: {ex}")
        return []

    json_data = response.json()
    status = json_data['status']
    if status != 'OK':
        log.error(f"Codeforces API error "
                  f"(expected status: 'OK' got: '{status}', "
                  f"message: '{json_data.get('comment')}')")
        return []
    return json_data['result']
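# A hedged usage sketch for _api_get: 'user.status' with 'handle' and 'count' is a real
# Codeforces API method; the concrete handle is only illustrative.
def _example_codeforces_user_status():
    # Returns the user's 10 most recent submissions, or [] on any error.
    return _api_get("user.status", {"handle": "tourist", "count": 10})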
def scrape_task_info(task_id):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task
    :return: task information, in dict format
    """
    page_url = "https://oj.uz/problem/view/" + task_id
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    title_div = soup.select_one('.problem-title')
    title = title_div.find('h1').text.lstrip().split('\n')[0].rstrip()
    cols = title_div.parent.select_one('.table-responsive').find_all('td')

    return {
        'judge_id': OJUZ_JUDGE_ID,
        'task_id': task_id.lower(),
        'title': title,
        'time_limit': parsers.parse_time_limit(cols[0].text),
        'memory_limit': parsers.parse_memory_limit(cols[1].text),
    }
def process(url):
    log.info(f"Processing {url}")
    response = get_page(f"https://codeforces.com{url}", max_retries=1)
    soup = BeautifulSoup(response.content, "html.parser")

    for a in soup.find_all('a', href=True):
        # Normalize the link to a site-relative path and skip anchors,
        # query strings and external links.
        href = a['href']
        href = href.split('codeforces.com')[-1].rstrip('/')
        if '#' in href or '?' in href or href.startswith('http'):
            continue

        out.write(f"{url} {href}\n")
        if '/problem/' in href:
            log.info(f"Found edge: {url}->{href}")

        if href in seen:
            continue

        # Treat profile pages as their blog page and only enqueue blog links.
        href = href.replace('profile', 'blog')
        if 'blog' in href:
            seen.add(href)
            queue.append(href)
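# process() relies on module-level state (seen, queue, out) that is not shown in this
# file. A minimal driver sketch under that assumption: a breadth-first crawl seeded
# with a site-relative path, writing the discovered edges to stdout.
import sys
from collections import deque

seen = set()
queue = deque()
out = sys.stdout  # the real crawler presumably writes to a file instead

def crawl(seed="/problemset", limit=100):
    # Process at most `limit` pages, breadth-first, starting from `seed`.
    queue.append(seed)
    processed = 0
    while queue and processed < limit:
        process(queue.popleft())
        processed += 1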
def __scrape_paginated_table_rows(page_url, from_page, to_page, results_per_page,
                                  table_css_selector, **query_dict):
    if results_per_page > 250:
        raise Exception(
            "Infoarena does not support more than 250 results per page")

    for page_id in range(from_page, to_page + 1):
        first_entry = results_per_page * (page_id - 1)
        page = get_page(page_url, **query_dict,
                        first_entry=first_entry,
                        display_entries=results_per_page)
        soup = BeautifulSoup(page.content, 'html.parser')

        table = soup.select_one(table_css_selector)
        if table is None:
            break
        # "Nici o solutie" is Romanian for "no solution": the page has no results.
        if "Nici o solutie" in table.text:
            break

        rows = table.find_all("tr")[1:]
        for row in rows:
            yield row.find_all("td")
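# A hedged usage sketch for the paginated helper above: fetch the first monitor pages
# for a single task. The 'task' filter matches what scrape_task_info passes to
# get_submission_count below; the '#monitor-table table' selector is an assumption
# about the monitor page's markup.
def _example_monitor_rows(task_id):
    return list(
        __scrape_paginated_table_rows("https://www.infoarena.ro/monitor",
                                      from_page=1,
                                      to_page=3,
                                      results_per_page=250,
                                      table_css_selector="#monitor-table table",
                                      task=task_id))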
def scrape_past_contest_ids():
    """
    Scrapes all past (finished) contests.
    :return: a list of contest ids (e.g. ['arc092', 'agc003'])
    """
    page_url = "https://atcoder.jp/contest/archive"
    contest_ids = set()
    for page_id in range(1, PAGE_LIMIT):
        page = get_page(page_url, p=page_id)
        soup = BeautifulSoup(page.content, 'html.parser')

        old_len = len(contest_ids)
        table = soup.find('table')
        for a in table.find_all('a', href=True):
            contest_url = a['href']
            contest_id = __parse_contest_id(contest_url)
            if contest_id is not None:
                contest_ids.add(contest_id)

        if old_len == len(contest_ids):
            break
    return list(contest_ids)
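# __parse_contest_id is used above but not defined in this file. A minimal sketch,
# assuming contest links on the archive page look like '/contests/agc003' (the exact
# URL shape is an assumption): return the last path component, or None otherwise.
def __parse_contest_id(contest_url):
    parts = contest_url.rstrip('/').split('/')
    if len(parts) >= 2 and parts[-2] == 'contests':
        return parts[-1]
    return None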
def scrape_task_info(task_id: str):
    """
    Scrapes the task info for a given task.
    :param task_id: the id of the task (e.g. 'agc003/agc003_a')
    :return: a task info object
    """
    contest_id, task_id = task_id.split('/')
    page_url = f"https://atcoder.jp/contests/{contest_id}/tasks/{task_id}"
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    main_div = soup.select_one('span.h2').parent
    time_limit_text, memory_limit_text = map(
        str.strip, main_div.select_one('p').text.split('/'))

    task_info = {
        'judge_id': ATCODER_JUDGE_ID,
        'task_id': "/".join([contest_id, task_id]).lower(),
        'title': parse_title(main_div.select_one('span.h2').text),
        'time_limit': parse_time_limit(time_limit_text),
        'memory_limit': parse_memory_limit(memory_limit_text),
        'tags': [],
        'source': soup.select_one('.contest-title').text,
    }
    return task_info
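# A hedged usage sketch for the AtCoder task scraper above, using the task id format
# from its docstring ('<contest_id>/<task_id>'):
def _example_atcoder_task_info():
    info = scrape_task_info('agc003/agc003_a')
    print(info['title'], info['time_limit'], info['memory_limit'])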
def scrape_task_statement(task_id: str):
    """
    Scrapes the statement of an infoarena task and converts it to markdown.
    :return: a dict with the markdown statement and the parsed examples
    """
    response = get_page(f"https://www.infoarena.ro/problema/{task_id}")
    soup = BeautifulSoup(response.text, 'html.parser')
    text_block = soup.select_one("#main > .wiki_text_block")

    # Keep only the content between the first <h1> and the examples section
    # ("Exempl" matches the Romanian "Exemple" heading).
    found_h1 = False
    text_lines = []
    html = ""
    for child in text_block.find_all(recursive=False):
        if child.name == 'h1':
            found_h1 = True
            continue
        if not found_h1:
            continue
        if "Exempl" in child.text:
            break
        text_lines.append(child.text)
        html += str(child)

    html = html\
        .replace('`', '\'')\
        .replace('<var>', '<code>')\
        .replace('</var>', '</code>')\
        .replace('<h2>', '<h3>')\
        .replace('</h2>', '</h3>')

    def remove_text(match):
        # Heuristic: if a <code> span is mostly plain words, unwrap the words and
        # keep <code> only around the non-word tokens.
        if len(match.split()) <= 6:
            return None

        ret = []
        found_words = 0
        word_streak = 0
        for token in match.split():
            is_word = False
            stripped = __strip_accents(token)
            for c in string.punctuation:
                stripped = stripped.rstrip(c)
            stripped = "".join([c for c in stripped if c != '-'])

            if (len(stripped) > 1 and stripped.isalpha()) or \
                    (stripped.lower() in ['o', 'a', 'e'] and word_streak >= 2):
                is_word = True
                found_words += len(stripped)

            if not is_word:
                ret.append(f"<code>{token}</code>")
                word_streak = 0
            else:
                ret.append(token)
                word_streak += 1

        if found_words <= 12:
            return None
        return " ".join(ret)

    for match in set(re.findall(r'<code>(.*?)<\/code>', html)):
        if '.in' in match or '.out' in match:
            if '<' in match:
                repl = re.sub(r"<[^>]*>", "", match)
                html = html.replace(match, repl)
            continue
        replace = remove_text(match)
        if replace:
            html = html.replace(f"<code>{match}</code>", f"{replace}")

    # Convert the remaining <code> spans to inline LaTeX.
    for match in set(re.findall(r'<code>(.*?)<\/code>', html)):
        if '.in' in match or '.out' in match:
            continue
        for c in ['\'', '\"']:
            if match.startswith(c) and match.endswith(c):
                continue

        latex = match
        latex = latex.replace('&lt;', '<').replace('&gt;', '>')
        for c in "&%$#_{}":
            latex = latex.replace(c, '\\' + c)
        latex = latex.replace('\\\'', '\'') \
            .replace('<sub>', '_{') \
            .replace('</sub>', '}') \
            .replace('<sup>', '^{') \
            .replace('</sup>', '}') \
            .replace('<b>', '\\textbf{') \
            .replace('</b>', '}') \
            .replace(' * ', ' \\cdot ') \
            .replace('<i>', '\\textit{') \
            .replace('</i>', '}')
        latex = re.sub('<[^>]+>', '', latex)

        # Protect already-braced groups while wrapping bare words in \text{}.
        sub = {}
        for idx, occ in enumerate(set(re.findall(r"\{([^\}]*)\}", latex))):
            nidx = f"__{idx}__"
            latex = latex.replace("{" + occ + "}", "{" + nidx + "}")
            sub[nidx] = occ
        latex = re.sub(r'([^a-zA-Z\\]|^)([a-zA-Z][a-zA-Z]*[a-zA-Z])',
                       r"\g<1>\\text{\g<2>}", latex)
        latex = re.sub(r"\s+", ' ', latex)
        latex = latex.replace(" ", "$</code> <code>$")
        for nidx, occ in sub.items():
            latex = latex.replace("{" + nidx + "}", "{" + occ + "}")

        html = html.replace(f"<code>{match}</code>", f"<code>${latex}$</code>")

    html = re.sub(r"([(\[])<code>\$([^<]+)\$</code>([)\]])",
                  r"<code>$\g<1>\g<2>\g<3>$</code>", html)

    examples = []
    for table in soup.select("table.example"):
        for tr in table.select("tr"):
            tds = list(tr.select("td"))
            if len(tds) == 0:
                continue
            if len(tds) == 1:
                log.critical(
                    f"Failed parsing example for task: {task_id} -- odd number of tds"
                )
                continue
            examples.append({
                "input": tds[0].text,
                "output": tds[1].text,
            })

    # Replace rendered LaTeX images with their alt-text formulas.
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        if 'latex' in img['src']:
            latex_formula = img['alt']
            latex_tag = soup.new_tag('code')
            latex_tag.string = "$" + latex_formula + "$"
            img.replace_with(latex_tag)
    html = str(soup)

    md = html2text(html, bodywidth=0)
    return {
        "statement": markdown.prettify(md),
        "examples": examples,
    }
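# __strip_accents is used by remove_text above but not defined in this file. A common
# implementation based on Unicode normalization (an assumption, not necessarily the
# project's own helper):
import unicodedata

def __strip_accents(text):
    # Decompose accented characters and drop the combining marks,
    # e.g. 'numărul' -> 'numarul'.
    return "".join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')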
def search_result(request):
    query = request.GET.get('query')
    if query is None or query.strip() == '':
        return redirect('home')

    client_ip = get_client_ip(request)
    page = get_page(request.GET.get('page', 1))
    query = query.strip()
    links_only = query[-5:] == ':link' and len(query) > 5
    if links_only:
        query = query[:-5]

    # check if getting results from db is possible
    time_limit = request.session.get('time_limit', TIME_LIMIT_DEFAULT)
    time_limit_delta = timezone.now() - timedelta(minutes=time_limit)
    search_result_query = Query.objects.filter(timestamp__gte=time_limit_delta,
                                               client_ip=client_ip,
                                               search_phrase=query.lower(),
                                               page_number=page)

    # get results from db
    if search_result_query.exists():
        q = search_result_query.last()
        total_results = q.total_results
        if links_only:
            items = [{
                'link': item.link,
            } for item in q.result_items.all()]
            most_common_words = None
        else:
            items = [{
                'title': item.title,
                'link': item.link,
                'formatted_url': item.formatted_url,
                'html_snippet': mark_safe(item.html_snippet)
            } for item in q.result_items.all()]
            most_common_words = q.most_common_words.all()
    # download results from google api
    else:
        start_index = 1 if page == 1 else page * 10 - 9
        response = google_search(search_query=query, start_index=start_index)

        # check if there are results
        try:
            response['items']
        except KeyError:
            return render(request, 'scraper/no_results.html')

        # save search result to db
        q = Query.objects.create(
            client_ip=client_ip,
            search_phrase=query.lower(),
            total_results=response['searchInformation']['totalResults'],
            page_number=page)
        for item in response['items']:
            result_item, created = ResultItem.objects.get_or_create(
                title=item['title'],
                link=item['link'],
                formatted_url=item['formattedUrl'],
                html_snippet=item['htmlSnippet'])
            q.result_items.add(result_item)

        # get 10 most common words from page
        all_words = ''
        for item in response['items']:
            all_words += item['title'].lower() + item['snippet'].lower()
        most_common_words = get_most_common_words(all_words, 10)
        for word in most_common_words:
            common_word, created = Word.objects.get_or_create(text=word)
            q.most_common_words.add(common_word)
        q.save()

        total_results = response['searchInformation']['totalResults']
        if links_only:
            items = [{
                'link': item['link'],
            } for item in response['items']]
            most_common_words = None
        else:
            items = [{
                'title': item['title'],
                'link': item['link'],
                'formatted_url': item['formattedUrl'],
                'html_snippet': mark_safe(item['htmlSnippet'])
            } for item in response['items']]

    ctx = {
        'total_results': total_results,
        'items': items,
        'most_common_words': most_common_words,
        'page_no': page,
        'previous_page': page - 1 if page > 1 else None,
        'next_page': page + 1 if page < 10 else None,
        'links_only': links_only,
    }
    return render(request, 'scraper/search_result.html', ctx)
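# A hedged wiring sketch for the search_result view above: a minimal URL configuration
# that would go in the app's urls.py. The 'search/' route and the URL name are
# assumptions about the project layout; only the view itself comes from this file.
from django.urls import path

urlpatterns = [
    path('search/', search_result, name='search_result'),
]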
def get_default_avatar():
    return get_page(__get_avatar_url(USER_WITH_DEFAULT_AVATAR)).content
def scrape_task_info(task_id):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task
    :return: task information, in dict format
    """
    page_url = "https://www.infoarena.ro/problema/" + task_id
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    main_view = soup.find(id='main')
    info_table = main_view.find('table')

    title = main_view.find('h1').text.strip()
    input_file, output_file = map(
        str.strip,
        info_table.find_all('tr')[0].find_all('td')[1].text.split(','))
    time_limit = info_table.find_all('tr')[2].find_all('td')[1].text
    memory_limit = info_table.find_all('tr')[2].find_all('td')[3].text
    source = info_table.find_all('tr')[0].find_all('td')[3].text

    tags = []
    for tag_a in main_view.select('a.tag_search_anchor'):
        tag = parsers.parse_tag(tag_a.text)
        if tag is not None:
            tags.append(tag)

    task_info = {
        'judge_id': INFOARENA_JUDGE_ID,
        'task_id': task_id.lower(),
        'title': title,
        'source': source,
        'time_limit': parsers.parse_time_limit(time_limit),
        'memory_limit': parsers.parse_memory_limit(memory_limit),
        'input_file': input_file,
        'output_file': output_file,
        'tags': tags,
    }

    try:
        # Go to the monitor to find out submission count and first submission date.
        task_info.update(
            dict(total_submission_count=get_submission_count(task=task_id),
                 accepted_submission_count=get_submission_count(
                     task=task_id, score_begin=100)))

        submission_count = task_info['total_submission_count']
        if submission_count > 0:
            # A little hack to get only the very first submissions.
            first_few_submissions = list(
                scrape_submissions(from_page=max(1, submission_count // 20 - 1),
                                   results_per_page=20,
                                   task=task_id))
            if len(first_few_submissions) == 0:
                raise Exception("BUG: First few submissions are non-existent")
            first_submitted_on = min(
                [sub['submitted_on'] for sub in first_few_submissions])
            task_info['first_submitted_on'] = first_submitted_on
    except Exception as ex:
        log.warning(f"Failed to parse extra data for task {task_id}: {ex}")

    return task_info
def scrape_task_statement(task_id: str):
    """
    Scrapes the statement of a Codeforces task and converts it to markdown.
    :param task_id: the id of the task, in '<contest_id>/<letter>' form
    :return: a dict with limits, the markdown statement and the parsed examples
    """
    contest_id, task_letter = task_id.split('/')
    contest_or_gym = "gym" if int(contest_id) >= 100000 else "contest"
    response = get_page(
        f"https://codeforces.com/{contest_or_gym}/{contest_id}/problem/{task_letter}"
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    statement = soup.select_one(".problem-statement")
    result = ""

    task_info = {
        "time_limit": parse_time_limit(
            statement.select_one('.time-limit').find(text=True, recursive=False)),
        "memory_limit": parse_memory_limit(
            statement.select_one('.memory-limit').find(text=True, recursive=False)),
        "input_file": parse_filename(
            statement.select_one(".input-file").find(text=True, recursive=False)),
        "output_file": parse_filename(
            statement.select_one(".output-file").find(text=True, recursive=False)),
    }

    inputs = []
    outputs = []
    for child in statement.find_all("div", recursive=False):
        klass = child.get("class", [])
        if "header" in klass:
            continue
        if "sample-tests" in klass:
            for test in child.select('.sample-test'):
                for br in test.find_all('br'):
                    br.replace_with("\n" + br.text)
                for tag in test.select('.input pre'):
                    inputs.append(tag.text)
                for tag in test.select(".output pre"):
                    outputs.append(tag.text)
            continue
        result += str(child)

    result = f"<div>{result}</div>"
    soup = BeautifulSoup(result, 'html.parser')
    for node in soup.select(".section-title"):
        node.wrap(soup.new_tag("h3"))

    # Convert .tex-span nodes to <latex> tags holding the reconstructed formula.
    for node in soup.select(".tex-span"):
        inner_text = str(node)
        inner_text = inner_text.replace("&lt;", "<")
        inner_text = inner_text.replace("&gt;", ">")
        for c in "&%$#_{}":
            inner_text = inner_text.replace(c, '\\' + c)
        inner_text = re.sub(r"<span[^>]*>(.*)<\/span>", r"\g<1>", inner_text)
        inner_text = re.sub(r"<sub[^>]*>(.*?)<\/sub>", r"_{\g<1>}", inner_text)
        inner_text = re.sub(r"<sup[^>]*>(.*?)<\/sup>", r"^{\g<1>}", inner_text)
        inner_text = re.sub(r"<i[^>]*>(.*?)<\/i>", r"\g<1>", inner_text)

        code = soup.new_tag("latex")
        code.string = inner_text
        node.replace_with(code)

    for node in soup.select(".tex-font-style-tt"):
        code = soup.new_tag("latex")
        code.string = node.text
        node.replace_with(code)

    result = str(soup)
    latex_codes = []
    result = re.sub(r'\$\$\$(.*?)\$\$\$', r'<latex>\g<1></latex>', result)

    md = markdown.html2text(result)
    for match, repl in zip(
            reversed(list(re.finditer(r'`\[\[\[LATEX\]\]\]`', md))), latex_codes):
        b, e = match.span()
        md = md[:b] + '$' + repl + '$' + md[e:]

    examples = None
    if len(inputs) == len(outputs):
        examples = [{"input": i, "output": o} for i, o in zip(inputs, outputs)]
    else:
        log.critical(
            f"Could not parse examples for {task_id}: unequal number of inputs and outputs"
        )

    print(md)
    task_info.update({
        "statement": md,
        "examples": examples,
    })
    return task_info
def scrape_submissions_for_contest(contest_id, query_dict,
                                   from_page=1, to_page=PAGE_LIMIT):
    """
    Scrapes all the submissions for a given contest.
    :param contest_id: the id of the contest (e.g. 'agc003')
    :param from_page: the page from which to start
    :param to_page: the page at which to end
    :param query_dict: query parameters for the url (e.g. user_screen_name='tourist')
    :return: a generator of submission objects
    """
    base_url = __get_contest_url(contest_id)
    for page_id in range(from_page, to_page + 1):
        page_url = f"{base_url}submissions"
        page = get_page(page_url, page=page_id, **query_dict)
        soup = BeautifulSoup(page.content, 'html.parser')
        rows = __scrape_table_rows(soup.select_one(".panel-submission"),
                                   table_css_selector="table")

        submission_found = False
        for row in rows:
            # Convert the timezone-aware timestamp to a naive datetime.
            submitted_on = datetime.datetime.strptime(row[0].find('time').text,
                                                      "%Y-%m-%d %H:%M:%S%z")
            ts = time.mktime(submitted_on.utctimetuple())
            submitted_on = datetime.datetime.utcfromtimestamp(ts)

            submission = {
                'judge_id': ATCODER_JUDGE_ID,
                'submission_id': row[-1].find('a', href=True)['href'].split('/')[-1],
                'submitted_on': submitted_on,
                'task_id': "/".join([
                    contest_id,
                    row[1].find('a', href=True)['href'].split('/')[-1]
                ]).lower(),
                'author_id': row[2].find('a', href=True)['href'].split('/')[-1].lower(),
                'language': row[3].text,
                'source_size': int(row[5].text.split()[0]),
                'verdict': row[6].select_one('span.label').text.split()[-1],
            }
            if row[4].text != '-':
                submission['score'] = int(row[4].text)
            if len(row) == 10:
                submission.update({
                    'exec_time': int(row[7].text.split()[0]),
                    'memory_used': int(row[8].text.split()[0]),
                })
            submission_found = True
            yield submission

        if not submission_found:
            break
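# A hedged usage sketch for scrape_submissions_for_contest, using the query parameter
# named in its docstring (user_screen_name); the contest id is only illustrative.
def _example_atcoder_user_submissions():
    for submission in scrape_submissions_for_contest(
            'agc003', {'user_screen_name': 'tourist'}, to_page=2):
        print(submission['submission_id'], submission['verdict'])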