def get_page(page_url, max_retries=10, **query_dict):
    """
    Sends a GET request, while also logging the requested URL.
    :param page_url: the url of the GET request
    :param max_retries: the maximum number of retries
    :param query_dict: the GET query parameters
    :return: the response received, or None if no request was made
    """
    if query_dict:
        page_url += "?" + urllib.parse.urlencode(query_dict)

    page = None
    for tries in range(max_retries):
        log.debug(f"GET: {page_url}")
        page = requests.get(page_url)
        # 400 and 404 are meaningful answers (bad query / missing resource),
        # so retrying them would be pointless.
        if page.status_code in (200, 400, 404):
            break
        log.warning(
            f'Request failed (status code: {page.status_code}). Sleeping for 2 seconds...')
        time.sleep(2)
        log.info('Retrying...')

    # NOTE: a non-2xx requests.Response is falsy, so check for None explicitly.
    if page is None:
        log.error("Request failed: no response received.")
    elif page.status_code != 200:
        log.error(f"Request failed. Status code: {page.status_code}")
    return page
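# A minimal usage sketch for get_page(); the query parameter below is
# illustrative, not a documented API of the target site.
# page = get_page("https://www.infoarena.ro/monitor", first_entry=0)
# if page is not None and page.status_code == 200:
#     soup = BeautifulSoup(page.content, 'html.parser')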
def write_tasks(tasks):
    tasks = list(tasks)
    task_tags = {tag.tag_id: tag for tag in MethodTag.objects.all()}
    judges = {judge.judge_id: judge for judge in Judge.objects.all()}

    total_created = 0
    for task_info in tasks:
        judge = judges[task_info['judge_id']]
        task, created = Task.objects.get_or_create(
            judge=judge, task_id=task_info['task_id'])
        if created:
            total_created += 1

        task.name = task_info['title']

        if 'statement' in task_info:
            statement, _ = TaskStatement.objects.get_or_create(task=task)
            if statement.modified_by_user:
                log.info(
                    f"Skipped updating statement for {task}: modified by user")
            else:
                if 'time_limit' in task_info:
                    statement.time_limit_ms = task_info['time_limit']
                if 'memory_limit' in task_info:
                    statement.memory_limit_kb = task_info['memory_limit']
                if 'input_file' in task_info:
                    statement.input_file = task_info['input_file']
                if 'output_file' in task_info:
                    statement.output_file = task_info['output_file']
                statement.text = task_info['statement']
                statement.examples = task_info['examples']
                statement.save()

        for tag_id in task_info.get('tags', []):
            if tag_id in task_tags:
                task.tags.add(task_tags[tag_id])
            else:
                log.warning(f'Skipped adding tag {tag_id}: does not exist.')

        if 'source' in task_info:
            source_id = slugify(task_info['source'])
            source, _ = TaskSource.objects.get_or_create(
                judge=task.judge,
                source_id=source_id,
                defaults={'name': task_info['source']})
            task.source = source

        task.save()

        statistic_defaults = dict(
            total_submission_count=task_info.get('total_submission_count'),
            accepted_submission_count=task_info.get(
                'accepted_submission_count'),
            first_submitted_on=task_info.get('first_submitted_on'),
        )
        # Keep zero counts: drop only missing values, not all falsy ones.
        statistic_defaults = {
            k: v for k, v in statistic_defaults.items() if v is not None}
        if statistic_defaults:
            JudgeTaskStatistic.objects.get_or_create(
                task=task, defaults=statistic_defaults)

    log.success(
        f"Successfully updated {len(tasks)} tasks! ({total_created} created)")
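# For reference, the shape of a task_info dict that write_tasks() consumes,
# reconstructed from the keys read above (values are illustrative):
# {
#     'judge_id': 'ia', 'task_id': 'adunare', 'title': 'Adunare',
#     'statement': '...', 'examples': [...],
#     'time_limit': 1000, 'memory_limit': 65536,
#     'input_file': 'adunare.in', 'output_file': 'adunare.out',
#     'tags': ['math'], 'source': 'Arhiva educationala',
#     'total_submission_count': 1000, 'accepted_submission_count': 500,
#     'first_submitted_on': datetime(2010, 1, 1),
# }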
def scrape_submissions_for_users(*user_ids, from_days=0, to_days=100000):
    log.info(f"Scraping submissions for users {user_ids}...")
    from_date = datetime.now() - timedelta(days=from_days)
    to_date = datetime.now() - timedelta(days=to_days)
    log.info(f'Dates between {to_date} and {from_date}...')

    # Group expanded handles by judge.
    handle_dict = {}
    for user in user_ids:
        judge_id, handle = user.split('/', 1)
        handles = __expand_handle(judge_id, handle)
        handle_dict[judge_id] = handle_dict.get(judge_id, []) + handles

    for judge_id, handles in handle_dict.items():
        handles = list(set(handles))
        log.info(f"Scraping user submissions from judge '{judge_id}':")
        log.info(f'Handles: {handles}')
        scraper = scrapers.create_scraper(judge_id)

        # Submissions are yielded newest-first: skip those newer than
        # from_date, then stop once they get older than to_date.
        for handle in handles:
            try:
                submissions = scraper.scrape_submissions_for_user(handle)
                submissions = itertools.dropwhile(
                    lambda x: x['submitted_on'] > from_date, submissions)
                submissions = itertools.takewhile(
                    lambda x: x['submitted_on'] >= to_date, submissions)
                queries.write_submissions(submissions)
            except NotImplementedError:
                log.warning(
                    f'Scraping submissions not implemented for {scraper.__class__.__name__}.')
                break  # Move on to the next judge instead of aborting entirely.
            except Exception as ex:
                log.exception(ex)
def scrape_submissions_for_tasks(*tasks, from_days=0, to_days=100000):
    log.info(f"Scraping submissions for tasks {tasks}...")
    from_date = datetime.now() - timedelta(days=from_days)
    to_date = datetime.now() - timedelta(days=to_days)
    log.info(f'Dates between {to_date} and {from_date}...')

    # Group expanded task ids by judge.
    task_dict = {}
    for task in tasks:
        judge_id, task_id = task.split('/', 1)
        task_ids = __expand_task(judge_id, task_id)
        task_dict[judge_id] = task_dict.get(judge_id, []) + task_ids

    for judge_id, task_ids in task_dict.items():
        task_ids = list(set(task_ids))
        log.info(f"Scraping task submissions from judge '{judge_id}':")
        log.info(f'Task ids: {task_ids}')
        scraper = scrapers.create_scraper(judge_id)

        # Submissions are yielded newest-first: skip those newer than
        # from_date, then stop once they get older than to_date.
        for task_id in task_ids:
            try:
                submissions = scraper.scrape_submissions_for_task(task_id)
                submissions = itertools.dropwhile(
                    lambda x: x['submitted_on'] > from_date, submissions)
                submissions = itertools.takewhile(
                    lambda x: x['submitted_on'] >= to_date, submissions)
                queries.write_submissions(submissions)
            except NotImplementedError:
                log.warning(
                    f'Scraping submissions not implemented for {scraper.__class__.__name__}.')
                break
            except Exception as ex:
                log.exception(ex)
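# Example invocations (the handle and task ids are illustrative; both drivers
# expect '<judge_id>/<identifier>' strings, as split above):
# scrape_submissions_for_users('ia/some_handle', from_days=0, to_days=30)
# scrape_submissions_for_tasks('ia/adunare', to_days=365)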
def get_object(self, queryset=None):
    sheet_id = self.kwargs['sheet_id']
    task_id = self.kwargs['task_id']
    return get_object_or_404(TaskSheetTask,
                             task=task_id,
                             sheet__sheet_id=sheet_id)
def __scrape_table_rows(node, table_css_selector): table = node.select_one(table_css_selector) if table is None: log.warning(f'Could not find table with selector {table_css_selector}') return for row in table.find("tbody").find_all("tr"): yield row.find_all("td")
def write_submissions(submissions):
    submissions = list(submissions)

    # Map (judge_id, lowercase handle) -> UserHandle for all mentioned authors.
    handles = {
        (handle.judge.judge_id, handle.handle.lower()): handle
        for handle in UserHandle.objects.annotate(handle_lower=Lower('handle')).filter(
            handle_lower__in={sub['author_id'].lower() for sub in submissions}).select_related('judge')
    }
    # Map (judge_id, task_id) -> Task for all mentioned tasks.
    tasks = {(task.judge.judge_id, task.task_id): task
             for task in Task.objects.filter(
                 task_id__in={sub['task_id'] for sub in submissions}).select_related('judge')}

    log.info(f"Writing {len(submissions)} submissions to database...")
    log.debug(f"TASKS: {tasks}")
    log.debug(f"HANDLES: {handles}")

    submission_models = []
    for sub in submissions:
        # Lowercase the author id to match the lowercase keys built above.
        author = handles.get((sub['judge_id'], sub['author_id'].lower()))
        task = tasks.get((sub['judge_id'], sub['task_id']))
        if not author or not task:
            continue

        fields = dict(
            submission_id=sub['submission_id'],
            author=author,
            submitted_on=timezone.make_aware(sub['submitted_on']),
            task=task,
            verdict=sub['verdict'],
            language=sub.get('language'),
            source_size=sub.get('source_size'),
            score=sub.get('score'),
            exec_time=sub.get('exec_time'),
            memory_used=sub.get('memory_used'),
        )
        if fields['score'] and math.isnan(fields['score']):
            fields['score'] = None
        fields = {k: v for k, v in fields.items() if v is not None}
        submission_models.append(Submission(**fields))

    if submission_models:
        result = Submission.objects.bulk_create(submission_models,
                                                ignore_conflicts=True)
        to_update = [x for x in result if x.pk is None]
        log.warning("TODO: Implement update!")
        log.success(
            f"Successfully wrote {len(submission_models)} submissions! "
            f"({len(result) - len(to_update)} created, 0 updated)")
    else:
        log.info("No submissions to upsert.")
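# For reference, the submission dict shape write_submissions() expects,
# reconstructed from the keys read above (values are illustrative):
# {
#     'judge_id': 'ia', 'submission_id': '123456',
#     'author_id': 'some_handle', 'task_id': 'adunare',
#     'submitted_on': datetime(2020, 1, 1), 'verdict': 'AC',
#     # optional: 'language', 'source_size', 'score', 'exec_time', 'memory_used'
# }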
def scrape_submissions(from_page=1, to_page=SCRAPER_LIMIT, results_per_page=200, **query_dict): """ Scrapes all submissions from the eval monitor. :param from_page: first page of the pagination :param to_page: last page of the pagination :param results_per_page: number of results to get for each request :param query_dict: optional GET query params to give to the monitor (e.g. user='******') """ page_url = "https://www.infoarena.ro/monitor" rows = __scrape_paginated_table_rows(page_url, from_page, to_page, results_per_page, table_css_selector="#monitor-table", **query_dict) for row in rows: if len(row) != 7: raise Exception("Unexpected number of columns.") # Parse required information. submission_id = None try: verdict_text = row[6].find("span").text submission_id = row[0].find("a", href=True)['href'].split('/')[-1] if not row[4].find("a"): log.debug(f"Skipped submission #{submission_id}: private.") continue if verdict_text.startswith("Eroare"): log.debug( f"Skipped submission #{submission_id}: system error.") continue submission = dict( judge_id=INFOARENA_JUDGE_ID, submission_id=submission_id, author_id=row[1].find( "a", href=True)['href'].split('/')[-1].lower(), task_id=row[2].find("a", href=True)['href'].split('/')[-1].lower(), source_size=parsers.parse_source_size(row[4].find("a").text), submitted_on=parsers.parse_date(row[5].text), verdict=parsers.parse_verdict(verdict_text), score=parsers.parse_score(verdict_text), ) yield submission except (TypeError, AttributeError) as e: # Probably task name was hidden. log.warning(f"Error scraping submission #{submission_id}: {e}")
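# Since scrape_submissions() is a generator over paginated monitor rows,
# callers can consume it lazily; e.g. the 100 most recent submissions for a
# task (the task name below is illustrative):
# recent = list(itertools.islice(scrape_submissions(task='adunare'), 100))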
def user_step(user_ratings, task_ratings, coef=1):
    new_user_ratings = user_ratings.copy()
    for user_i in range(user_count):
        tasks_solved = user_tasks[user_i]
        tasks_solved_ratings = task_ratings[tasks_solved]

        def neg_log_like(user_rating):
            # Square the free variable so the rating stays positive.
            user_rating = user_rating**2 + 1e-9
            log_like = np.log(
                user_rating) - tasks_solved_ratings / user_rating
            log_prior = -coef * user_rating
            return -(log_prior + np.sum(log_like))

        result = minimize_scalar(neg_log_like)
        if not result.success:
            log.warning('Minimization did not succeed.')
        # result.x is the unconstrained variable; square it to recover the
        # actual positive rating, matching the parameterization above.
        new_user_ratings[user_i] = result.x**2 + 1e-9
    return new_user_ratings
def task_step(user_ratings, task_ratings, coef=3):
    new_task_ratings = task_ratings.copy()
    for task_i in range(task_count):
        users_solved = task_users[task_i]
        users_solved_ratings = user_ratings[users_solved]

        def neg_log_like(task_rating):
            # Square the free variable so the rating stays positive.
            task_rating = task_rating**2 + 1e-9
            log_like = -np.log(
                task_rating) - task_rating / users_solved_ratings
            log_prior = -coef / task_rating  # * 0.5 + np.log(2)
            return -(log_prior + np.sum(log_like))

        result = minimize_scalar(neg_log_like)
        if not result.success:
            log.warning('Minimization did not succeed.')
        # As in user_step, square the optimizer's variable to get the rating.
        new_task_ratings[task_i] = result.x**2 + 1e-9
    return new_task_ratings
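# A minimal sketch (not part of the original source) of the alternating
# optimization these two steps imply: hold task ratings fixed while updating
# users, then the reverse, until the estimates stabilize. user_count,
# task_count, user_tasks and task_users are the module-level globals the
# steps already rely on.
def fit_ratings(num_iterations=20):
    user_ratings = np.ones(user_count)
    task_ratings = np.ones(task_count)
    for _ in range(num_iterations):
        user_ratings = user_step(user_ratings, task_ratings)
        task_ratings = task_step(user_ratings, task_ratings)
    return user_ratings, task_ratings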
def scrape_submissions(from_page=1, to_page=SCRAPER_LIMIT, **query_dict):
    """
    Scrapes all submissions from the eval monitor.
    :param from_page: first page of the pagination
    :param to_page: last page of the pagination
    :param query_dict: optional GET query params to give to the monitor
        (e.g. problem='BOI18_genetics', handle='retrograd')
    """
    page_url = "https://oj.uz/submissions"
    rows = __scrape_paginated_table_rows(
        page_url, from_page, to_page,
        table_css_selector=".container .table", **query_dict)

    for row in rows:
        if len(row) != 8:
            raise Exception("Unexpected number of columns.")

        # Parse required information.
        try:
            verdict_text = row[5].text
            submission = dict(
                judge_id=OJUZ_JUDGE_ID,
                submission_id=row[0].find("a", href=True)['href'].split('/')[-1],
                submitted_on=parsers.parse_date(row[1].text),
                author_id=row[2].find(
                    "a", href=True)['href'].split('/')[-1].lower(),
                task_id=row[3].find("a", href=True)['href'].split('/')[-1].lower(),
                verdict=parsers.parse_verdict(verdict_text),
                score=parsers.parse_score(verdict_text),
            )
            if submission['verdict'] != 'CE':
                # Keys named to match what write_submissions() reads.
                submission.update(
                    dict(
                        exec_time=parsers.parse_time_exec(row[6].text),
                        memory_used=parsers.parse_memory_used(row[7].text),
                    ))
            yield submission
        except (TypeError, AttributeError) as e:
            # Probably the task name was hidden.
            log.warning(f"Skipped one submission. Error: {e}")
def scrape_task_info(task_id: str):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task, in '<contest_id>/<index>' format
    :return: task information, in dict format, or None if not found
    """
    task_id = task_id.lower()
    contest_id = task_id.split('/')[0]
    response = _api_get('contest.standings', kwargs={'contestId': contest_id})

    for task_data in response['problems']:
        curr_task_id = '/'.join(
            [str(task_data['contestId']), task_data['index']]).lower()
        if task_id != curr_task_id:
            continue

        log.info(f"Updating task '{task_id}' [{task_data['name']}]...")

        tags = []
        for tag_data in task_data['tags']:
            tag = parse_tag(tag_data)
            if tag:
                tags.append(tag)

        task_info = {
            'judge_id': CODEFORCES_JUDGE_ID,
            'task_id': task_id,
            'title': task_data['name'],
            'tags': tags,
            'source': response['contest']['name'],
        }

        creation_time_seconds = response['contest'].get('startTimeSeconds')
        if creation_time_seconds:
            task_info[
                'first_submitted_on'] = datetime.datetime.utcfromtimestamp(
                    creation_time_seconds)
        return task_info

    log.warning(f"Task id '{task_id}' not found.")
    return None
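# Example (hypothetical contest id and problem index): Codeforces task ids
# are '<contestId>/<index>' strings, matched case-insensitively against the
# contest.standings response.
# info = scrape_task_info('1335/A')
# # -> {'judge_id': ..., 'task_id': '1335/a', 'title': ..., 'tags': [...], ...}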
def parse_submission(submission_data):
    try:
        submission_id = submission_data['id']
        task_id = '/'.join([
            str(submission_data['problem']['contestId']),
            submission_data['problem']['index']
        ])
        # Check for a missing verdict before reading it.
        if 'verdict' not in submission_data:
            log.warning(f'Skipped submission {submission_id}: no verdict.')
            return
        if submission_data['verdict'] == 'TESTING':
            log.info(f'Skipped submission {submission_id}: still testing.')
            return

        for author in submission_data['author']['members']:
            author_id = author['handle']
            submission = dict(
                judge_id=CODEFORCES_JUDGE_ID,
                submission_id=str(submission_id),
                task_id=task_id.lower(),
                submitted_on=datetime.datetime.utcfromtimestamp(
                    submission_data['creationTimeSeconds']),
                language=submission_data['programmingLanguage'],
                verdict=parse_verdict(submission_data['verdict']),
                author_id=author_id.lower(),
                # Key named to match what write_submissions() reads.
                exec_time=submission_data['timeConsumedMillis'],
                memory_used=round(submission_data['memoryConsumedBytes'] / 1024),
            )
            yield submission
    except Exception as ex:
        log.error(
            f"Failed to parse submission.\nSubmission data: {submission_data}\nError: {ex}"
        )
def parse_task_url(url: str) -> Optional[ParseTaskResult]:
    result = re.search(
        r"https://competitive\.herokuapp\.com/"
        r"task/(?P<judge_id>[^/]+)/(?P<task_id>[^/]+)", url)
    if result:
        return ParseTaskResult(judge_id=result.group("judge_id"),
                               task_id=result.group("task_id"))

    for parser in TASK_PARSERS:
        re_search = parser.regex.search(url)
        if not re_search:
            continue
        if parser.type == ParserType.SIMPLE:
            task_id = re_search.group('task_id')
        else:
            task_id = '/'.join(
                [re_search.group('contest_id'), re_search.group('task_id')])
        return ParseTaskResult(judge_id=parser.judge_id, task_id=task_id)

    log.warning(f'Could not parse URL: {url}')
    return None
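# Example: the explicit pattern above maps an internal task URL straight to
# its judge/task pair (the ids here are illustrative):
# parse_task_url("https://competitive.herokuapp.com/task/ia/adunare")
# # -> ParseTaskResult(judge_id='ia', task_id='adunare')
# Any other URL falls through to the judge-specific TASK_PARSERS regexes.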
def scrape_handle_info(handle):
    log.info(f"Scraping info for handle '{handle}'...")
    judge_id, handle_id = handle.split('/', 1)
    handles = __expand_handle(judge_id, handle_id)
    log.info(f"Handles: {handles}")
    scraper = scrapers.create_scraper(judge_id)

    user_infos = []
    for handle in handles:
        try:
            user_info = scraper.scrape_user_info(handle)
            log.info(f"Successfully scraped user info for '{handle}'")
            log.debug(user_info)
            if user_info:
                user_infos.append(user_info)
        except NotImplementedError:
            log.warning(
                f'Scraping handles not implemented for {scraper.__class__.__name__}.')
            break  # Still write whatever was collected so far.
        except Exception as ex:
            log.exception(ex)

    queries.write_handles(user_infos)
def scrape_task_info(task):
    log.info(f"Scraping task info for task '{task}'...")
    judge_id, task_id = task.split('/', 1)
    task_ids = __expand_task(judge_id, task_id)
    scraper = scrapers.create_scraper(judge_id)

    task_infos = []
    log.info(f"Task ids: {task_ids}")
    for task_id in task_ids:
        try:
            task_info = scraper.scrape_task_info(task_id)
            if task_info is None:
                log.warning(
                    f"Did not find task info for '{task_id}'. Skipping...")
                continue

            log.debug(task_info)
            log.info(
                f"Successfully scraped '{task_id}' [{task_info['title']}]...")

            try:
                statement_info = scraper.scrape_task_statement(task_id)
                task_info.update(statement_info)
            except NotImplementedError:
                log.warning(
                    f"Could not get statement of task {task_id}: not implemented.")
            except Exception as ex:
                log.warning(f"Could not get statement of task {task_id}: {ex}")

            task_infos.append(task_info)
        except NotImplementedError:
            log.warning(
                f'Scraping tasks not implemented for {scraper.__class__.__name__}.')
            break  # Still write whatever was collected so far.
        except Exception as ex:
            log.exception(ex)

    queries.write_tasks(task_infos)
def translate_ro_en(text: str, use_glossary=False):
    client = __get_client()
    parent = client.location_path(PROJECT_ID, 'us-central1')

    html_text = markdownify(text)

    # Replace the Romanian section headers with self-closing placeholder
    # tags, so the translator cannot mangle them.
    html_text = re.sub(r"(<h.>)[^<>]*[Cc]erin[^<>]*(</h.>)",
                       r"\g<1><task/>\g<2>", html_text)
    html_text = re.sub(r"(<h.>)[^<>]*ntrare[^<>]*(</h.>)",
                       r"\g<1><input/>\g<2>", html_text)
    html_text = re.sub(r"(<h.>)[^<>]*e.ire[^<>]*(</h.>)",
                       r"\g<1><output/>\g<2>", html_text)
    html_text = re.sub(r"(<h.>)[^<>]*estric[^<>]*(</h.>)",
                       r"\g<1><constraints/>\g<2>", html_text)
    html_text = re.sub(r"(<h.>)[^<>]*reciz.r[^<>]*(</h.>)",
                       r"\g<1><notes/>\g<2>", html_text)

    # Protect inline code spans with numbered placeholders; they are
    # restored verbatim after translation.
    replace = {}
    for idx, code in enumerate(
            set(re.findall(r"<code>(.*?)</code>", html_text))):
        placeholder = f"<span id=\"{idx}\">0</span>"
        replace[code] = placeholder
        html_text = html_text.replace(code, placeholder)

    # Romanian glossary: rephrase constructs that tend to translate poorly.
    html_text = re.sub(r"Sa( se)? ", "Trebuie sa ", html_text)
    html_text = re.sub(r"Să( se)? ", "Trebuie să ", html_text)
    html_text = re.sub("Fie ", "Consideră ", html_text)
    html_text = re.sub("Se da ", "Se consideră ", html_text)
    html_text = re.sub("Se dau ", "Se consideră ", html_text)
    html_text = re.sub("Se dă ", "Se consideră ", html_text)
    # Shield 'modulo' from translation by disguising it; undone below.
    html_text = re.sub(' mod ', ' modulo ', html_text)
    html_text = re.sub('modulo', 'mmoodduulloo', html_text)

    response = None
    for tries in range(3):
        try:
            response = client.translate_text(parent=parent,
                                             contents=[html_text],
                                             source_language_code='ro',
                                             target_language_code='en',
                                             mime_type='text/html')
            break
        except Exception as ex:
            if tries == 2:
                raise
            if "RESOURCE_EXHAUSTED" in str(ex):
                log.warning("RESOURCE_EXHAUSTED. Sleeping for 60s...")
                time.sleep(60)

    translated = response.translations[0].translated_text

    # English glossary: fix common mistranslations of graph terminology.
    glossary = [
        ('peak', 'vertex'),
        ('peaks', 'vertices'),
        ('tip', 'vertex'),
        ('tips', 'vertices'),
        ('mmoodduulloo', 'modulo'),
    ]
    for word, rep in glossary:
        translated = re.sub(rf'([^a-zA-Z]|^){word}([^a-zA-Z]|$)',
                            rf'\g<1>{rep}\g<2>', translated)

    # Restore the protected code spans and section headers.
    for code, placeholder in replace.items():
        translated = translated.replace(placeholder, code)

    translated = translated \
        .replace('<task/>', 'Task') \
        .replace('<input/>', 'Input') \
        .replace('<output/>', 'Output') \
        .replace('<constraints/>', 'Constraints') \
        .replace('<notes/>', 'Notes')

    # Normalize header capitalization.
    for pref, header_text, suff in set(
            re.findall(r"(<h.>)([^<>]*)(</h.>)", translated)):
        log.debug(f"Header: {pref}{header_text}{suff}")
        translated = translated.replace(
            pref + header_text + suff,
            pref + header_text.strip().capitalize() + suff)

    translated = html2text(translated, bodywidth=0)
    return markdown.prettify(translated)
def scrape_task_info(task_id):
    """
    Scrapes task information for a given task id.
    :param task_id: the id of the task
    :return: task information, in dict format
    """
    page_url = "https://www.infoarena.ro/problema/" + task_id
    page = get_page(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')

    main_view = soup.find(id='main')
    info_table = main_view.find('table')
    title = main_view.find('h1').text.strip()
    input_file, output_file = map(
        str.strip,
        info_table.find_all('tr')[0].find_all('td')[1].text.split(','))
    time_limit = info_table.find_all('tr')[2].find_all('td')[1].text
    memory_limit = info_table.find_all('tr')[2].find_all('td')[3].text
    source = info_table.find_all('tr')[0].find_all('td')[3].text

    tags = []
    for tag_a in main_view.select('a.tag_search_anchor'):
        tag = parsers.parse_tag(tag_a.text)
        if tag is not None:
            tags.append(tag)

    task_info = {
        'judge_id': INFOARENA_JUDGE_ID,
        'task_id': task_id.lower(),
        'title': title,
        'source': source,
        'time_limit': parsers.parse_time_limit(time_limit),
        'memory_limit': parsers.parse_memory_limit(memory_limit),
        'input_file': input_file,
        'output_file': output_file,
        'tags': tags,
    }

    try:
        # Go to the monitor to find out submission count and first submission date.
        task_info.update(
            dict(total_submission_count=get_submission_count(task=task_id),
                 accepted_submission_count=get_submission_count(
                     task=task_id, score_begin=100)))

        submission_count = task_info['total_submission_count']
        if submission_count > 0:
            # A little hack to get only the very first submissions: jump
            # straight to the last pages of the reverse-chronological monitor.
            first_few_submissions = list(
                scrape_submissions(from_page=max(1, submission_count // 20 - 1),
                                   results_per_page=20,
                                   task=task_id))
            if len(first_few_submissions) == 0:
                raise Exception("BUG: First few submissions are non-existent")
            first_submitted_on = min(
                [sub['submitted_on'] for sub in first_few_submissions])
            task_info['first_submitted_on'] = first_submitted_on
    except Exception as ex:
        log.warning(f"Failed to parse extra data for task {task_id}: {ex}")

    return task_info
def parse_verdict(verdict_text):
    if verdict_text in VERDICT_DICT:
        return VERDICT_DICT[verdict_text]
    # Unknown verdicts are conservatively treated as wrong answers.
    log.warning(f'Unknown verdict: {verdict_text}.')
    return 'WA'
def parse_tag(tag_text): if tag_text in TAG_DICT: return TAG_DICT[tag_text] log.warning(f"Unknown tag: {tag_text}.") return None
def __parse_contest_id(contest_url: str): result = re.search(r'https?://(.*)\.contest\.atcoder\.jp/?', contest_url) if result is None: log.warning(f"Could not parse contest id from {contest_url}.") return None return result.group(1)
def generate_new_task(ladder, commit=True):
    profile = ladder.profile
    log.info(f"Generating new task for {profile}...")

    handles = list(
        UserHandle.objects.filter(user=profile).select_related('judge'))
    judges = {handle.judge for handle in handles}

    # Exclude tasks the user has already tried or already received in a ladder.
    tried_tasks = set(
        Submission.objects.filter(author__in=handles).values_list(
            'task', flat=True).distinct())
    previous_tasks = set(ladder.tasks.values_list('task', flat=True))
    forbidden_tasks = tried_tasks | previous_tasks

    available_tasks = [
        task for task in Task.objects.filter(
            judge__in=judges, statistics__isnull=False).select_related(
                'statistics')
        if task.pk not in forbidden_tasks
        and task.statistics.users_solved_count >= 2
    ]
    if not available_tasks:
        log.warning("Could not generate: no tasks to choose from.")
        return None

    solved_tasks_scores = [
        score for _, score in Submission.objects.filter(
            author__in=handles, verdict='AC').values_list(
                'task', 'task__statistics__difficulty_score').distinct()
        if score is not None
    ]

    # Default difficulty window; with enough history, center it around a
    # score sampled from the user's harder solved tasks instead.
    bounds = 25, 60
    if len(solved_tasks_scores) >= 25:
        solved_tasks_scores.sort()
        solved_tasks_scores = solved_tasks_scores[-50:-5]
        mid_score = random.choice(solved_tasks_scores)
        bounds = mid_score * 0.9, mid_score * 1.1

    if profile.user.username == "adrian.budau":
        bounds = (bounds[0] * 1.5, bounds[1] * 1.5)

    sought_score = random.randint(int(bounds[0]), int(bounds[1]))
    log.info(f"Sought score: {sought_score} (bounds: {bounds})")

    # Pick the available task whose difficulty is closest to the sought
    # score; shuffling first breaks ties randomly.
    random.shuffle(available_tasks)
    best_error, chosen_task = None, None
    for task in available_tasks:
        curr_error = abs(task.statistics.difficulty_score - sought_score)
        if not chosen_task or best_error > curr_error:
            best_error, chosen_task = curr_error, task

    log.info(
        f"Chosen task: {chosen_task} (score: {chosen_task.statistics.difficulty_score})"
    )
    duration = datetime.timedelta(minutes=120)
    ladder_task = LadderTask(ladder=ladder,
                             task=chosen_task,
                             duration=duration,
                             status=LadderTask.Status.NEW)
    if commit:
        ladder_task.save()
    return ladder_task