def get_page(page_url, max_retries=10, **query_dict):
    """
    Sends a GET request, while also logging the request to console.
    :param page_url: the url of the GET request
    :param max_retries: the maximum number of retries
    :param query_dict: the GET query parameters
    :return: the page received
    """
    if len(query_dict) > 0:
        query_string = urllib.parse.urlencode(query_dict)
        page_url += "?" + query_string

    page = None
    for _ in range(max_retries):
        log.debug(f"GET: {page_url}")
        page = requests.get(page_url)
        # 400 and 404 are definitive answers; anything else is retried.
        if page.status_code in (200, 400, 404):
            break
        log.warning(f'Request failed (status code: {page.status_code}). '
                    f'Sleeping for 2 seconds...')
        time.sleep(2)
        log.info('Retrying...')

    # NOTE: `if not page` would also trigger for 4xx/5xx responses, since
    # requests.Response is falsy for error status codes; check None explicitly.
    if page is None:
        log.error("Request failed: no response received.")
    elif page.status_code != 200:
        log.error(f"Request failed. Status code: {page.status_code}")
    return page
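# Illustrative call (hypothetical URL and query parameter, shown only to
# document the **query_dict interface, not taken from the project):
#
#   response = get_page('https://codeforces.com/api/user.info',
#                       max_retries=3, handles='tourist')
#   if response is not None and response.status_code == 200:
#       print(response.text)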
def create_handle(sender, instance, created, **kwargs):
    if created and settings.USE_CELERY:
        log.info('Created new handle: updating info...')
        handle = '/'.join([instance.judge.judge_id, instance.handle])
        services.scraper_services.scrape_handle_info.si(handle).apply_async()
        services.scraper_services.scrape_submissions_for_users.si(handle).apply_async()
def make_dataset(augment_from_mongo):
    dataset = [
        (s.task.id, s.author.user.user.username)
        for s in Submission.objects.best().filter(verdict='AC').select_related(
            'task', 'task__judge', 'author__user__user')
    ]

    # Add mongodb submissions.
    if augment_from_mongo:
        mongo_task_map = {
            ":".join([t.judge.judge_id, t.task_id]): t
            for t in Task.objects.select_related('judge').all()
        }
        mongo_author_map = {
            uh.handle: uh.user.user.username
            for uh in UserHandle.objects.select_related("user__user").all()
        }
        log.info(f'Dataset size before augmentation: {len(dataset)}')
        for mongo_sub in database.find_submissions(database.get_db(), verdict='AC'):
            task = mongo_task_map.get(
                ":".join([mongo_sub['judge_id'], mongo_sub['task_id']]))
            if task:
                username = mongo_author_map.get(mongo_sub['author_id'],
                                                mongo_sub['author_id'])
                dataset.append((task.id, username))
        dataset = set(dataset)
        log.info(f'Dataset size after augmentation: {len(dataset)}')

    tasks = [t for (t, _) in dataset]
    users = [u for (_, u) in dataset]
    return tasks, users
def write_tasks(tasks):
    tasks = list(tasks)
    task_tags = {tag.tag_id: tag for tag in MethodTag.objects.all()}
    judges = {judge.judge_id: judge for judge in Judge.objects.all()}
    total_created = 0

    for task_info in tasks:
        judge = judges[task_info['judge_id']]
        task, created = Task.objects.get_or_create(
            judge=judge, task_id=task_info['task_id'])
        if created:
            total_created += 1
        task.name = task_info['title']

        if 'statement' in task_info:
            statement, _ = TaskStatement.objects.get_or_create(task=task)
            if 'time_limit' in task_info:
                statement.time_limit_ms = task_info['time_limit']
            if 'memory_limit' in task_info:
                statement.memory_limit_kb = task_info['memory_limit']
            if 'input_file' in task_info:
                statement.input_file = task_info['input_file']
            if 'output_file' in task_info:
                statement.output_file = task_info['output_file']
            if statement.modified_by_user:
                log.info(f"Skipped updating statement for {task}: modified by user")
            else:
                statement.text = task_info['statement']
                statement.examples = task_info['examples']
            statement.save()

        for tag_id in task_info.get('tags', []):
            if tag_id in task_tags:
                task.tags.add(task_tags[tag_id])
            else:
                log.warning(f'Skipped adding tag {tag_id}: does not exist.')

        if 'source' in task_info:
            source_id = slugify(task_info['source'])
            source, _ = TaskSource.objects.get_or_create(
                judge=task.judge, source_id=source_id,
                defaults={'name': task_info['source']})
            task.source = source

        task.save()

        statistic_defaults = dict(
            total_submission_count=task_info.get('total_submission_count'),
            accepted_submission_count=task_info.get('accepted_submission_count'),
            first_submitted_on=task_info.get('first_submitted_on'),
        )
        statistic_defaults = {k: v for k, v in statistic_defaults.items() if v}
        if statistic_defaults:
            JudgeTaskStatistic.objects.get_or_create(
                task=task, defaults=statistic_defaults)

    log.success(f"Successfully updated {len(tasks)} tasks! ({total_created} created)")
def compute_ladder_statistics():
    log.info('Computing ladder statistics...')
    scores = {}
    for task in (LadderTask.objects
                 .filter(status=LadderTask.Status.COMPLETED)
                 .select_related('task', 'task__statistics', 'ladder')):
        ladder = task.ladder
        score = scores.get(ladder, 0)
        try:
            score += task.task.statistics.difficulty_score
        except ObjectDoesNotExist:
            pass
        scores[ladder] = score

    stats = []
    for ladder in Ladder.objects.all():
        stat, _ = LadderStatistics.objects.update_or_create(
            ladder=ladder, defaults=dict(total_points=scores.get(ladder, 0)))
        stats.append(stat)

    ranks = compute_ranks(stats, key=lambda x: x.total_points, reverse=True)
    for stat, rank in zip(stats, ranks):
        stat.rank = rank
    for stat in stats:
        stat.save(force_update=True)
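# compute_ranks is not defined in this module. A minimal sketch of a
# compatible implementation (an assumption, not necessarily the project's
# actual code): standard competition ranking, where ties share a rank and
# the returned list is aligned with the input order.
def compute_ranks(items, key, reverse=False):
    order = sorted(range(len(items)), key=lambda i: key(items[i]), reverse=reverse)
    ranks = [0] * len(items)
    for pos, idx in enumerate(order):
        prev = order[pos - 1]
        if pos > 0 and key(items[prev]) == key(items[idx]):
            ranks[idx] = ranks[prev]  # tie: share the previous item's rank
        else:
            ranks[idx] = pos + 1
    return ranks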
def form_valid(self, form):
    task_url = form.cleaned_data['task_url']
    parse_result = urlparsers.parse_task_url(task_url)
    if not parse_result:
        messages.add_message(self.request, messages.ERROR,
                             'Could not parse task from url.')
    else:
        log.info(parse_result)
        judge = Judge.objects.get(judge_id=parse_result.judge_id)
        task, _ = Task.objects.get_or_create(
            judge=judge,
            task_id=parse_result.task_id.lower(),
        )
        _, created = TaskSheetTask.objects.get_or_create(
            task=task,
            sheet=self.object,
        )
        if created:
            messages.add_message(
                self.request, messages.SUCCESS,
                f'Task __{task.name_or_id()}__ added successfully!')
        else:
            messages.add_message(
                self.request, messages.ERROR,
                f'Task __{task.name_or_id()}__ is already present in the sheet.')
    return redirect(self.request.META.get('HTTP_REFERER', reverse_lazy('home')))
def create_task(sender, instance, created, **kwargs):
    if created and settings.USE_CELERY:
        task_path = instance.get_path()
        log.info(f'Created new task {task_path}: updating info async...')
        services.scraper_services.scrape_task_info.si(task_path).apply_async()
        log.info('Scraping submissions and updating users async...')
        services.scraper_services.scrape_submissions_for_tasks.si(task_path).apply_async()
def write_submissions(submissions):
    submissions = list(submissions)

    # Get all handles (keys are lowercased, so lookups must be too).
    handles = {
        (handle.judge.judge_id, handle.handle.lower()): handle
        for handle in UserHandle.objects
        .annotate(handle_lower=Lower('handle'))
        .filter(handle_lower__in={sub['author_id'].lower() for sub in submissions})
        .select_related('judge')
    }
    # Get all required tasks.
    tasks = {
        (task.judge.judge_id, task.task_id): task
        for task in Task.objects
        .filter(task_id__in={sub['task_id'] for sub in submissions})
        .select_related('judge')
    }

    log.info(f"Writing {len(submissions)} submissions to database...")
    log.debug(f"TASKS: {tasks}")
    log.debug(f"HANDLES: {handles}")

    submission_models = []
    for sub in submissions:
        author = handles.get((sub['judge_id'], sub['author_id'].lower()))
        task = tasks.get((sub['judge_id'], sub['task_id']))
        if not author or not task:
            continue
        fields = dict(
            submission_id=sub['submission_id'],
            author=author,
            submitted_on=timezone.make_aware(sub['submitted_on']),
            task=task,
            verdict=sub['verdict'],
            language=sub.get('language'),
            source_size=sub.get('source_size'),
            score=sub.get('score'),
            # The scrapers emit this field as 'time_exec'; accept both spellings.
            exec_time=sub.get('exec_time', sub.get('time_exec')),
            memory_used=sub.get('memory_used'),
        )
        if fields['score'] and math.isnan(fields['score']):
            fields['score'] = None
        fields = {k: v for k, v in fields.items() if v is not None}
        submission_models.append(Submission(**fields))

    if submission_models:
        result = Submission.objects.bulk_create(submission_models,
                                                ignore_conflicts=True)
        to_update = [x for x in result if x.pk is None]
        log.warning("TODO: Implement update!")
        log.success(
            f"Successfully upserted {len(submission_models)} submissions! "
            f"({len(result) - len(to_update)} created, 0 updated)")
    else:
        log.info("No submissions to upsert.")
def scrape_submissions_for_users(*user_ids, from_days=0, to_days=100000):
    log.info(f"Scraping submissions for users {user_ids}...")
    from_date = datetime.now() - timedelta(days=from_days)
    to_date = datetime.now() - timedelta(days=to_days)
    log.info(f'Dates between {to_date} and {from_date}...')

    handle_dict = {}
    for user in user_ids:
        judge_id, handle = user.split('/', 1)
        handles = __expand_handle(judge_id, handle)
        handle_dict[judge_id] = handle_dict.get(judge_id, []) + handles

    for judge_id, handles in handle_dict.items():
        handles = list(set(handles))
        log.info(f"Scraping user submissions from judge '{judge_id}':")
        log.info(f'Handles: {handles}')
        scraper = scrapers.create_scraper(judge_id)

        for handle in handles:
            try:
                submissions = scraper.scrape_submissions_for_user(handle)
                # Submissions arrive newest first; stop once they get too old.
                submissions = itertools.takewhile(
                    lambda x: x['submitted_on'] >= to_date, submissions)
                queries.write_submissions(submissions)
            except NotImplementedError:
                log.warning(f'Scraping submissions not implemented '
                            f'for {scraper.__class__.__name__}.')
                # Move on to the next judge instead of aborting everything.
                break
            except Exception as ex:
                log.exception(ex)
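# Example invocation (illustrative handles): user ids are namespaced as
# '<judge_id>/<handle>', matching the split('/', 1) above.
#
#   scrape_submissions_for_users('cf/tourist', 'ia/some_user', to_days=30)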
def scrape_submissions_for_tasks(*tasks, from_days=0, to_days=100000):
    log.info(f"Scraping submissions for tasks {tasks}...")
    from_date = datetime.now() - timedelta(days=from_days)
    to_date = datetime.now() - timedelta(days=to_days)
    log.info(f'Dates between {to_date} and {from_date}...')

    task_dict = {}
    for task in tasks:
        judge_id, task_id = task.split('/', 1)
        task_ids = __expand_task(judge_id, task_id)
        task_dict[judge_id] = task_dict.get(judge_id, []) + task_ids

    for judge_id, task_ids in task_dict.items():
        task_ids = list(set(task_ids))
        log.info(f"Scraping task submissions from judge '{judge_id}':")
        log.info(f'Task ids: {task_ids}')
        scraper = scrapers.create_scraper(judge_id)

        for task_id in task_ids:
            try:
                submissions = scraper.scrape_submissions_for_task(task_id)
                submissions = itertools.takewhile(
                    lambda x: x['submitted_on'] >= to_date, submissions)
                queries.write_submissions(submissions)
            except NotImplementedError:
                log.warning(f'Scraping submissions not implemented '
                            f'for {scraper.__class__.__name__}.')
                break
            except Exception as ex:
                log.exception(ex)
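# Example invocation (illustrative ids): task paths are namespaced as
# '<judge_id>/<task_id>', and the task id itself may contain slashes
# (e.g. Codeforces uses '<contest>/<index>'), hence the split('/', 1) above.
#
#   scrape_submissions_for_tasks('cf/1136/a', to_days=7)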
def fix_task_ids():
    for task in Task.objects.select_related("judge").all():
        log.info(f"OLD TASK ID: {task.task_id}")
        if task.judge.judge_id == "cf":
            # e.g. "1000_a" -> "1000/a"
            task.task_id = task.task_id.replace("_", "/")
        if task.judge.judge_id == "ac":
            if "/" in task.task_id:
                contest_id, task_id = task.task_id.split('/')
            else:
                # AtCoder task ids embed the contest, e.g. "abc123_a".
                contest_id, _ = task.task_id.rsplit('_', 1)
                task_id = task.task_id
            task.task_id = "/".join([contest_id.replace("_", "-"), task_id])
        log.info(f"NEW TASK ID: {task.task_id}")
        task.save()
def write_handles(db, handles, chunk_size=100):
    """
    Writes a list of handles to the database.

    :param db: Database instance.
    :param handles: list/generator of handle info
    :param chunk_size: how many handles to be written at once
    :return: the number of handles inserted
    """
    total_inserted = 0
    for chunk in split_into_chunks(handles, chunk_size):
        log.info("Writing chunk to database...")
        num_inserted = database.insert_handles(db, chunk)
        log.info(f"{num_inserted} handles written to database.")
        total_inserted += num_inserted
    return total_inserted
def write_submissions(db, submissions, chunk_size=100):
    """
    Writes a list of submissions to the database.

    :param db: Database instance.
    :param submissions: list/generator of submissions
    :param chunk_size: how many submissions to be written at once
    :return: the number of submissions inserted
    """
    total_inserted = 0
    for chunk in split_into_chunks(submissions, chunk_size):
        log.info(f"Writing chunk of size {len(chunk)} to database...")
        num_inserted = database.insert_submissions(db, chunk)
        log.info(f"{num_inserted} submissions written to database.")
        total_inserted += num_inserted
    return total_inserted
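# split_into_chunks is imported from elsewhere. A minimal sketch of a
# compatible helper (an assumption): it lazily groups an iterable into
# lists of at most chunk_size items, so generators are consumed incrementally.
def split_into_chunks(iterable, chunk_size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk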
def scrape_task_info(task_id: str):
    """
    Scrapes task information for the given task id.

    :param task_id: the id of the task, e.g. '1136/a'
    :return: task information, in dict format, or None if not found
    """
    contest_id = task_id.split('/')[0]
    response = _api_get('contest.standings', kwargs={'contestId': contest_id})

    for task_data in response['problems']:
        curr_task_id = '/'.join(
            [str(task_data['contestId']), task_data['index']]).lower()
        if task_id != curr_task_id:
            continue

        log.info(f"Updating task '{task_id}' [{task_data['name']}]...")
        tags = []
        for tag_data in task_data['tags']:
            tag = parse_tag(tag_data)
            if tag:
                tags.append(tag)

        task_info = {
            'judge_id': CODEFORCES_JUDGE_ID,
            'task_id': task_id.lower(),
            'title': task_data['name'],
            'tags': tags,
            'source': response['contest']['name'],
        }
        creation_time_seconds = response['contest'].get('startTimeSeconds')
        if creation_time_seconds:
            task_info['first_submitted_on'] = \
                datetime.datetime.utcfromtimestamp(creation_time_seconds)
        # Returning here makes the old `found` flag unnecessary.
        return task_info

    log.warning(f"Task id '{task_id}' not found.")
    return None
def update_all_users():
    log.info("Updating all users' profile pictures...")
    # Photo urls: prefer handles from higher-priority judges.
    judge_priority = {'cf': 3, 'ia': 2, 'csa': 1}
    handles = list(
        UserHandle.objects.filter(
            judge__judge_id__in=judge_priority.keys()).select_related(
                'user', 'judge'))
    handles.sort(key=lambda h: judge_priority[h.judge.judge_id], reverse=True)

    computed_users = set()
    users_to_update = []
    for handle in handles:
        if not handle.photo_url or handle.user in computed_users:
            continue
        computed_users.add(handle.user)
        if handle.photo_url != handle.user.avatar_url:
            handle.user.avatar_url = handle.photo_url
            users_to_update.append(handle.user)

    UserProfile.objects.bulk_update(users_to_update, ['avatar_url'])
    log.success(f"{len(users_to_update)} users updated.")
def process(url):
    log.info(f"Processing {url}")
    response = get_page(f"https://codeforces.com{url}", max_retries=1)
    soup = BeautifulSoup(response.content, "html.parser")
    for a in soup.find_all('a', href=True):
        href = a['href']
        href = href.split('codeforces.com')[-1].rstrip('/')
        # Skip anchors, query strings and external links.
        if '#' in href or '?' in href or href.startswith('http'):
            continue
        out.write(f"{url} {href}\n")
        if '/problem/' in href:
            log.info(f"Found edge: {url}->{href}")
        if href in seen:
            continue
        # Crawl blog pages instead of profile pages.
        href = href.replace('profile', 'blog')
        if 'blog' in href:
            seen.add(href)
            queue.append(href)
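# process() relies on module-level crawler state (`out`, `seen`, `queue`).
# A minimal driver sketch under that assumption (the seed page and output
# file name are hypothetical, not the original script's):
#
#   seen = set()
#   queue = ['/top']
#   with open('edges.txt', 'w') as out:
#       while queue:
#           process(queue.pop(0))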
def compute_task_statistics():
    """Computes a TaskStatistics object for each task and saves it to the database."""
    log.info('Computing task statistics...')
    BASE_SCORE = 100

    task_ratings, user_ratings = multipliers.compute_task_and_user_ratings(
        augment_from_mongo=False)
    default_multiplier = max(task_ratings.values())

    def get_multiplier(task):
        return task_ratings.get(task.id, default_multiplier)

    task_count = 0
    for task in Task.objects.all():
        users_solved_count = Submission.objects.best().filter(
            task=task, verdict='AC').count()
        users_tried_count = Submission.objects.best().filter(task=task).count()
        submission_count = Submission.objects.filter(task=task).count()
        favorited_count = task.favorite_users.count()
        TaskStatistics.objects.update_or_create(
            task=task,
            defaults=dict(
                users_tried_count=users_tried_count,
                users_solved_count=users_solved_count,
                submission_count=submission_count,
                favorited_count=favorited_count,
            ))
        task_count += 1

    if task_count == 0:
        return

    mean_multiplier = sum(get_multiplier(task) for task in Task.objects.all())
    mean_multiplier /= task_count
    log.info(f'MEAN MULTIPLIER: {mean_multiplier}')

    scores = {
        task: normalize_range(BASE_SCORE * get_multiplier(task) / mean_multiplier,
                              min=5, max=1000, step=5)
        for task in Task.objects.all()
    }
    log.info(scores)

    for task in Task.objects.select_related('statistics'):
        statistics = task.statistics
        statistics.difficulty_score = scores[task]
        statistics.save()
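# normalize_range is imported from elsewhere. A plausible sketch (an
# assumption about its semantics, inferred from the call site): clamp the
# value into [min, max] and round to the nearest multiple of step.
# (min/max mirror the keyword arguments used at the call site.)
def normalize_range(value, min, max, step):
    clamped = sorted((min, value, max))[1]
    return int(round(clamped / step) * step)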
def parse_submission(submission_data):
    try:
        submission_id = submission_data['id']
        task_id = '/'.join([
            str(submission_data['problem']['contestId']),
            submission_data['problem']['index']
        ])
        # Check for a missing verdict *before* reading it.
        if 'verdict' not in submission_data:
            log.warning(f'Skipped submission {submission_id}: no verdict.')
            return
        if submission_data['verdict'] == 'TESTING':
            log.info(f'Skipped submission {submission_id}: still testing.')
            return

        for author in submission_data['author']['members']:
            author_id = author['handle']
            submission = dict(
                judge_id=CODEFORCES_JUDGE_ID,
                submission_id=str(submission_id),
                task_id=task_id.lower(),
                submitted_on=datetime.datetime.utcfromtimestamp(
                    submission_data['creationTimeSeconds']),
                language=submission_data['programmingLanguage'],
                verdict=parse_verdict(submission_data['verdict']),
                author_id=author_id.lower(),
                time_exec=submission_data['timeConsumedMillis'],
                memory_used=round(submission_data['memoryConsumedBytes'] / 1024),
            )
            yield submission
    except Exception as ex:
        log.error(f"Failed to parse submission.\n"
                  f"Submission data: {submission_data}\n"
                  f"Error: {ex}")
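# parse_verdict is defined elsewhere in this scraper. A plausible sketch
# (an assumption; the project's actual mapping and fallback may differ):
# it translates Codeforces API verdict strings into short verdict codes.
_VERDICT_MAP = {
    'OK': 'AC',
    'WRONG_ANSWER': 'WA',
    'TIME_LIMIT_EXCEEDED': 'TLE',
    'MEMORY_LIMIT_EXCEEDED': 'MLE',
    'RUNTIME_ERROR': 'RE',
    'COMPILATION_ERROR': 'CE',
}

def parse_verdict(verdict):
    # Hypothetical fallback for less common verdicts.
    return _VERDICT_MAP.get(verdict, 'WA')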
def scrape_task_info(task):
    log.info(f"Scraping task info for task '{task}'...")
    judge_id, task_id = task.split('/', 1)
    task_ids = __expand_task(judge_id, task_id)
    scraper = scrapers.create_scraper(judge_id)

    task_infos = []
    log.info(f"Task ids: {task_ids}")
    for task_id in task_ids:
        try:
            task_info = scraper.scrape_task_info(task_id)
            if task_info is None:
                log.warning(f"Did not find task info for '{task_id}'. Skipping...")
                continue
            log.debug(task_info)
            log.info(f"Successfully scraped '{task_id}' [{task_info['title']}]...")
            try:
                statement_info = scraper.scrape_task_statement(task_id)
                task_info.update(statement_info)
            except NotImplementedError:
                log.warning(f"Could not get statement of task {task_id}: "
                            f"not implemented.")
            except Exception as ex:
                log.warning(f"Could not get statement of task {task_id}: {ex}")
            task_infos.append(task_info)
        except NotImplementedError:
            log.warning(f'Scraping tasks not implemented '
                        f'for {scraper.__class__.__name__}.')
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_tasks(task_infos)
def scrape_handle_info(handle):
    log.info(f"Scraping info for handle '{handle}'...")
    judge_id, handle_id = handle.split('/', 1)
    handles = __expand_handle(judge_id, handle_id)
    log.info(f"Handles: {handles}")
    scraper = scrapers.create_scraper(judge_id)

    user_infos = []
    for handle in handles:
        try:
            user_info = scraper.scrape_user_info(handle)
            log.info(f"Successfully scraped user info for '{handle}'")
            log.debug(user_info)
            if user_info:
                user_infos.append(user_info)
        except NotImplementedError:
            log.warning(f'Scraping handles not implemented '
                        f'for {scraper.__class__.__name__}.')
            return
        except Exception as ex:
            log.exception(ex)

    queries.write_handles(user_infos)
def compute_task_and_user_ratings(num_epochs=10, augment_from_mongo=True):
    tasks, users = make_dataset(augment_from_mongo)

    # Transform strs into ints.
    task_le = LabelEncoder().fit(tasks)
    user_le = LabelEncoder().fit(users)
    tasks = task_le.transform(tasks)
    users = user_le.transform(users)
    task_count = task_le.classes_.shape[0]
    user_count = user_le.classes_.shape[0]
    log.info(f'Tasks: {task_count} Users: {user_count}')

    task_users = [users[tasks == i] for i in range(task_count)]
    user_tasks = [tasks[users == i] for i in range(user_count)]

    def user_step(user_ratings, task_ratings, coef=1):
        new_user_ratings = user_ratings.copy()
        for user_i in range(user_count):
            tasks_solved = user_tasks[user_i]
            tasks_solved_ratings = task_ratings[tasks_solved]

            def neg_log_like(user_rating):
                # Square to keep the rating positive; epsilon avoids log(0).
                user_rating = user_rating**2 + 1e-9
                log_like = np.log(user_rating) - tasks_solved_ratings / user_rating
                log_prior = -coef * user_rating
                return -(log_prior + np.sum(log_like))

            result = minimize_scalar(neg_log_like)
            if not result.success:
                log.warning('Minimization did not succeed.')
            new_user_ratings[user_i] = result.x
        return new_user_ratings

    def task_step(user_ratings, task_ratings, coef=3):
        new_task_ratings = task_ratings.copy()
        for task_i in range(task_count):
            users_solved = task_users[task_i]
            users_solved_ratings = user_ratings[users_solved]

            def neg_log_like(task_rating):
                task_rating = task_rating**2 + 1e-9
                log_like = -np.log(task_rating) - task_rating / users_solved_ratings
                log_prior = -coef / task_rating  # * 0.5 + np.log(2)
                return -(log_prior + np.sum(log_like))

            result = minimize_scalar(neg_log_like)
            if not result.success:
                log.warning('Minimization did not succeed.')
            new_task_ratings[task_i] = result.x
        return new_task_ratings

    # Alternating optimization, starting from constant ratings.
    task_ratings = np.ones((task_count,)) * 2
    user_ratings = np.ones((user_count,)) * 1
    for epoch in range(1, num_epochs + 1):
        log.info(f'Epoch {epoch}/{num_epochs}')
        user_ratings = user_step(user_ratings, task_ratings, coef=1)
        task_ratings = task_step(user_ratings, task_ratings, coef=4)

    task_ratings = {tn: tr for tn, tr in zip(task_le.classes_, task_ratings)}
    user_ratings = {un: ur for un, ur in zip(user_le.classes_, user_ratings)}
    return task_ratings, user_ratings
from django.test import TestCase

# Create your tests here.
from search.queries import search_task
from core.logging import log

log.info(search_task("Round 352"))
def parse_submissions(csrf_token, task_name_dict, user_id, task_id, from_date):
    publicuser, evaljobs = get_eval_jobs(csrf_token, user_id, task_id, from_date)

    # Make user id to username map. We use usernames. :)
    user_id_to_username = {}
    for user in publicuser:
        username = user['username']
        if username:
            username = username.lower()
        # NOTE: store under user['id'] directly; reusing the name `user_id`
        # here would shadow the function parameter.
        user_id_to_username[user['id']] = username

    # Parse submissions.
    for eval_job in sorted(evaljobs, key=lambda ej: ej['id'], reverse=True):
        submission_id = str(eval_job['id'])
        if not eval_job['isDone']:
            log.info(f'Skipping submission {submission_id}: not finished evaluating.')
            continue

        task_name = task_name_dict[eval_job['evalTaskId']]

        # Parse easy data.
        submission = dict(
            judge_id=CSACADEMY_JUDGE_ID,
            submission_id=submission_id,
            submitted_on=datetime.datetime.utcfromtimestamp(
                eval_job['timeSubmitted']),
            task_id=task_name,
            author_id=user_id_to_username[eval_job['userId']],
            source_size=len(eval_job['sourceText']),
            verdict='CE',
        )

        # Parse verdict.
        verdict = 'CE'
        score = None
        if eval_job['compileOK']:
            score = round(eval_job['score'] * 100)
            verdict = 'AC' if score == 100 else 'WA'
        if verdict != 'CE':
            submission.update(dict(verdict=verdict, score=score))

        # Parse memory_used and time_exec.
        time_exec = 0
        memory_used = 0
        for test in eval_job['tests']:
            time_exec = max(time_exec, test['wallTime'])
            memory_used = max(memory_used, test['memUsage'])
        time_exec = round(time_exec * 1000)
        memory_used = round(memory_used / 1024)
        submission.update(dict(time_exec=time_exec, memory_used=memory_used))

        # If author has no username, put the user id (Facebook-created accounts?).
        if submission['author_id'] is None:
            submission['author_id'] = 'uid:%s' % eval_job['userId']

        yield submission
def start():
    log.info("Scheduler starting...")
    scheduler.start()
def generate_new_task(ladder, commit=True):
    profile = ladder.profile
    log.info(f"Generating new task for {profile}...")

    handles = list(UserHandle.objects.filter(user=profile).select_related('judge'))
    judges = {handle.judge for handle in handles}
    tried_tasks = set(
        Submission.objects.filter(author__in=handles)
        .values_list('task', flat=True).distinct())
    previous_tasks = set(ladder.tasks.values_list('task', flat=True))
    forbidden_tasks = tried_tasks | previous_tasks

    available_tasks = [
        task for task in Task.objects.filter(
            judge__in=judges, statistics__isnull=False).select_related('statistics')
        if task.pk not in forbidden_tasks
        and task.statistics.users_solved_count >= 2
    ]
    if not available_tasks:
        log.warning("Could not generate: no tasks to choose from.")
        return None

    solved_tasks_scores = [
        score for _, score in Submission.objects.filter(
            author__in=handles, verdict='AC').values_list(
                'task', 'task__statistics__difficulty_score').distinct()
        if score is not None
    ]

    bounds = 25, 60
    if len(solved_tasks_scores) >= 25:
        # Pick a reference score among the user's hardest solves,
        # excluding the very top ones.
        solved_tasks_scores.sort()
        solved_tasks_scores = solved_tasks_scores[-50:-5]
        mid_score = random.choice(solved_tasks_scores)
        bounds = mid_score * 0.9, mid_score * 1.1

    if profile.user.username == "adrian.budau":
        bounds = (bounds[0] * 1.5, bounds[1] * 1.5)

    sought_score = random.randint(int(bounds[0]), int(bounds[1]))
    log.info(f"Sought score: {sought_score} (bounds: {bounds})")

    # Shuffle so that ties on error are broken randomly.
    random.shuffle(available_tasks)
    best_error, chosen_task = None, None
    for task in available_tasks:
        curr_error = abs(task.statistics.difficulty_score - sought_score)
        if not chosen_task or best_error > curr_error:
            best_error, chosen_task = curr_error, task

    log.info(f"Chosen task: {chosen_task} "
             f"(score: {chosen_task.statistics.difficulty_score})")

    duration = datetime.timedelta(minutes=120)
    ladder_task = LadderTask(ladder=ladder,
                             task=chosen_task,
                             duration=duration,
                             status=LadderTask.Status.NEW)
    if commit:
        ladder_task.save()
    return ladder_task