def get_standings(self, users=None, statistics=None):
    result = {}

    start_time = self.start_time.replace(tzinfo=None)
    if not self.standings_url and datetime.now() - start_time < timedelta(days=30):
        re_round_overview = re.compile(
            r'''
(?:<td[^>]*>
(?:
[^<]*<a[^>]*href="(?P<url>[^"]*/stat[^"]*rd=(?P<rd>[0-9]+)[^"]*)"[^>]*>(?P<title>[^<]*)</a>[^<]*|
(?P<date>[0-9]+\.[0-9]+\.[0-9]+)
)</td>[^<]*
){2}
            ''',
            re.VERBOSE,
        )
        for url in [
            'https://www.topcoder.com/tc?module=MatchList&nr=100500',
            'https://community.topcoder.com/longcontest/stats/?module=MatchList&nr=100500',
        ]:
            page = REQ.get(url)
            matches = re_round_overview.finditer(str(page))
            opt = 0.61803398875
            for match in matches:
                date = datetime.strptime(match.group('date'), '%m.%d.%Y')
                if abs(date - start_time) < timedelta(days=2):
                    title = match.group('title')
                    intersection = len(set(title.split()) & set(self.name.split()))
                    union = len(set(title.split()) | set(self.name.split()))
                    iou = intersection / union
                    if iou > opt:
                        opt = iou
                        self.standings_url = urljoin(url, match.group('url'))

    if not self.standings_url:
        raise InitModuleException('Not set standings url for %s' % self.name)

    url = self.standings_url + '&nr=100000042'
    page = REQ.get(url)
    result_urls = re.findall(r'<a[^>]*href="(?P<url>[^"]*)"[^>]*>Results</a>', str(page), re.I)

    if not result_urls:  # marathon match
        match = re.search('<[^>]*>Problem:[^<]*<a[^>]*href="(?P<href>[^"]*)"[^>]*>(?P<name>[^<]*)<', page)
        problem_name = match.group('name').strip()
        problems_info = [{
            'short': problem_name,
            'url': urljoin(url, match.group('href').replace('&amp;', '&')),
        }]
        rows = etree.HTML(page).xpath("//table[contains(@class, 'stat')]//tr")
        header = None
        for row in rows:
            r = parsed_table.ParsedTableRow(row)
            if len(r.columns) < 8:
                continue
            values = [c.value.strip().replace(u'\xa0', '') for c in r.columns]
            if header is None:
                header = values
                continue
            d = OrderedDict(list(zip(header, values)))
            handle = d.pop('Handle').strip()
            d = self._dict_as_number(d)
            if 'rank' not in d or users and handle not in users:
                continue
            row = result.setdefault(handle, OrderedDict())
            row.update(d)
            score = row.pop('final_score' if 'final_score' in row else 'provisional_score')
            row['member'] = handle
            row['place'] = row.pop('rank')
            row['solving'] = score
            row['solved'] = {'solving': 1 if score > 0 else 0}
            problems = row.setdefault('problems', {})
            problem = problems.setdefault(problem_name, {})
            problem['result'] = score
            history_index = values.index('submission history')
            if history_index:
                column = r.columns[history_index]
                href = column.node.xpath('a/@href')
                if href:
                    problem['url'] = urljoin(url, href[0])
    else:  # single round match
        matches = re.finditer('<table[^>]*>.*?</table>', page, re.DOTALL)
        problems_sets = []
        for match in matches:
            problems = re.findall(
                '<a[^>]*href="(?P<href>[^"]*c=problem_statement[^"]*)"[^>]*>(?P<name>[^/]*)</a>',
                match.group(),
                re.IGNORECASE,
            )
            if problems:
                problems_sets.append([{'short': n, 'url': urljoin(url, u)} for u, n in problems])

        problems_info = dict() if len(problems_sets) > 1 else list()
        for problems_set, result_url in zip(problems_sets, result_urls):
            url = urljoin(self.standings_url, result_url + '&em=1000000042')
            url = url.replace('&amp;', '&')
            division = int(parse_qs(url)['dn'][0])
            for p in problems_set:
                d = problems_info
                if len(problems_sets) > 1:
                    d = d.setdefault('division', OrderedDict())
                    d = d.setdefault('I' * division, [])
                d.append(p)

            page = REQ.get(url)
            rows = etree.HTML(page).xpath("//tr[@valign='middle']")
            header = None
            url_infos = []
            for row in rows:
                r = parsed_table.ParsedTableRow(row)
                if len(r.columns) < 10:
                    continue
                values = [c.value for c in r.columns]
                if header is None:
                    header = values
                    continue
                d = OrderedDict(list(zip(header, values)))
                handle = d.pop('Coders').strip()
                d = self._dict_as_number(d)
                if 'division_placed' not in d or users and handle not in users:
                    continue
                row = result.setdefault(handle, OrderedDict())
                row.update(d)
                if not row.get('new_rating') and not row.get('old_rating') and not row.get('rating_change'):
                    row.pop('new_rating', None)
                    row.pop('old_rating', None)
                    row.pop('rating_change', None)
                row['member'] = handle
                row['place'] = row.pop('division_placed')
                row['solving'] = row['point_total']
                row['solved'] = {'solving': 0}
                row['division'] = 'I' * division
                if 'adv.' in row:
                    row['advanced'] = row.pop('adv.').lower().startswith('y')
                url_info = urljoin(url, r.columns[0].node.xpath('a/@href')[0])
                url_infos.append(url_info)

            def fetch_solution(url):
                for i in range(2):
                    try:
                        page = REQ.get(url, time_out=60)
                        match = re.search('<td[^>]*class="problemText"[^>]*>(?P<solution>.*?)</td>',
                                          page, re.DOTALL | re.IGNORECASE)
                        ret = html.unescape(match.group('solution'))
                        ret = ret.strip()
                        ret = ret.replace('<BR>', '\n')
                        ret = ret.replace('\xa0', ' ')
                        return ret
                    except FailOnGetResponse:
                        sleep(i * 10 + 3)
                return None

            def fetch_info(url):
                delay = 3
                for _ in range(5):
                    try:
                        page = REQ.get(url)
                        break
                    except Exception:
                        sleep(delay)
                        delay *= 2
                else:
                    # Keep the same arity as the success path so the caller can
                    # unpack safely (the original returned only three Nones).
                    return url, None, None, None, None, 0
                match = re.search('class="coderBrackets">.*?<a[^>]*>(?P<handle>[^<]*)</a>', page, re.IGNORECASE)
                handle = html.unescape(match.group('handle').strip())
                match = re.search(r' Room\s*(?P<room>[0-9]+)', page)
                room = match.group('room') if match else None
                matches = re.finditer(
                    r'''
<td[^>]*>[^<]*<a[^>]*href="(?P<url>[^"]*c=problem_solution[^"]*)"[^>]*>(?P<short>[^<]*)</a>[^<]*</td>[^<]*
<td[^>]*>[^<]*</td>[^<]*
<td[^>]*>[^<]*</td>[^<]*
<td[^>]*>(?P<time>[^<]*)</td>[^<]*
<td[^>]*>(?P<status>[^<]*)</td>[^<]*
<td[^>]*>(?P<result>[^<]*)</td>[^<]*
                    ''',
                    page,
                    re.VERBOSE | re.IGNORECASE,
                )
                problems = {}
                n_fetch_solution = 0
                for match in matches:
                    d = match.groupdict()
                    short = d.pop('short')
                    solution_url = urljoin(url, d['url'])
                    d['url'] = solution_url
                    d = self._dict_as_number(d)
                    if d['status'] in ['Challenge Succeeded', 'Failed System Test']:
                        d['result'] = -d['result']
                    if abs(d['result']) < 1e-9:
                        d.pop('result')
                    if re.match('^[0.:]+$', d['time']):
                        d.pop('time')
                    solution = (statistics or {}).get(handle, {}).get('problems', {}).get(short, {}).get('solution')
                    if not solution:
                        n_fetch_solution += 1
                        solution = fetch_solution(solution_url)
                    d['solution'] = solution
                    problems[short] = d
                challenges = []
                matches = re.finditer(
                    r'''
<td[^>]*>[^<]*<a[^>]*href="[^"]*module=MemberProfile[^"]*"[^>]*>(?P<target>[^<]*)</a>[^<]*</td>[^<]*
<td[^>]*>(?P<problem>[^<]*)</td>[^<]*
<td[^>]*>(?P<status>[^<]*)</td>[^<]*
<td[^>]*>(?P<time>[^<]*)</td>[^<]*
<td[^>]*>(?P<result>[^<]*)</td>[^<]*
<td[^>]*>[^<]*<a[^>]*href="(?P<url>[^"]*)"[^>]*>\s*details\s*</a>[^<]*</td>[^<]*
                    ''',
                    page,
                    re.VERBOSE | re.IGNORECASE,
                )
                for match in matches:
                    d = match.groupdict()
                    d = {k: v.strip() for k, v in d.items()}
                    d['result'] = float(d['result'].replace(',', '.'))
                    d['url'] = urljoin(url, d['url'])
                    p = problems.setdefault(d['problem'], {})
                    p.setdefault('extra_score', 0)
                    p['extra_score'] += d['result']
                    p.setdefault('extra_info', []).append(f'{d["target"]}: {d["result"]}')
                    challenges.append(d)
                return url, handle, room, problems, challenges, n_fetch_solution

            with PoolExecutor(max_workers=20) as executor, tqdm.tqdm(total=len(url_infos)) as pbar:
                n_fetch_solution = 0
                for url, handle, room, problems, challenges, n_sol in executor.map(fetch_info, url_infos):
                    n_fetch_solution += n_sol
                    pbar.set_description(f'div{division} {url}')
                    pbar.set_postfix(n_solution=n_fetch_solution)
                    pbar.update()
                    if handle is not None:
                        if handle not in result:
                            LOG.error(f'{handle} not in result, url = {url}')
                        result[handle]['url'] = url
                        if room:
                            result[handle]['room'] = room
                        result[handle]['problems'] = problems
                        result[handle]['challenges'] = challenges
                        for p in problems.values():
                            if p.get('result', 0) > 1e-9:
                                result[handle]['solved']['solving'] += 1
                        if challenges:
                            h = result[handle].setdefault('hack', {
                                'title': 'challenges',
                                'successful': 0,
                                'unsuccessful': 0,
                            })
                            for c in challenges:
                                h['successful' if c['status'].lower() == 'yes' else 'unsuccessful'] += 1

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': problems_info,
        'options': {
            'fixed_fields': [('hack', 'Challenges')],
        },
    }
    if re.search(r'\bfinals?(?:\s+rounds?)?$', self.name, re.I):
        standings['options']['medals'] = [{'name': name, 'count': 1}
                                          for name in ('gold', 'silver', 'bronze')]
    return standings
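# The standings-URL discovery above scores candidate round titles against the
# contest name by word-set IoU (Jaccard index) and keeps the best candidate
# above a golden-ratio threshold. A minimal standalone sketch of that
# heuristic; `find_standings_url` and its arguments are illustrative names,
# not part of the module API:
def find_standings_url(contest_name, candidates):
    """candidates: iterable of (title, url) pairs; returns best url or None."""
    best_url, best_iou = None, 0.61803398875  # same threshold as above
    target = set(contest_name.split())
    for title, url in candidates:
        words = set(title.split())
        union = len(words | target) or 1
        iou = len(words & target) / union
        if iou > best_iou:
            best_iou, best_url = iou, url
    return best_url

# Example: find_standings_url('SRM 800', [('SRM 800', 'https://...'), ('SRM 799', '...')])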
def get_standings(self, users=None, statistics=None):
    # REQ.get('https://www.codechef.com/')
    # try:
    #     form = REQ.form()
    #     form['post'].update({
    #         'name': self._username,
    #         'pass': self._password,
    #     })
    #     page = REQ.get(form['url'], post=form['post'])
    #     form = REQ.form()
    #     if form['url'] == '/session/limit':
    #         for field in form['unchecked'][:-1]:
    #             form['post'][field['name']] = field['value'].encode('utf8')
    #         page = REQ.get(form['url'], post=form['post'])
    # except Exception:
    #     pass

    url = self.API_CONTEST_URL_FORMAT_.format(**self.__dict__)
    page = REQ.get(url)
    data = json.loads(page)
    if data['status'] != 'success':
        raise ExceptionParseStandings(json.dumps(data))
    if 'child_contests' in data:
        contest_infos = {
            d['contest_code']: {'division': k}
            for k, d in data['child_contests'].items()
        }
    else:
        contest_infos = {self.key: {}}

    result = {}
    problems_info = dict() if len(contest_infos) > 1 else list()

    for key, contest_info in contest_infos.items():
        url = self.STANDINGS_URL_FORMAT_.format(key=key)
        page = REQ.get(url)
        match = re.search('<input[^>]*name="csrfToken"[^>]*id="edit-csrfToken"[^>]*value="([^"]*)"', page)
        csrf_token = match.group(1)

        n_page = 0
        per_page = 150
        n_total_page = None
        pbar = None
        contest_type = None
        while n_total_page is None or n_page < n_total_page:
            n_page += 1
            time.sleep(2)
            url = self.API_RANKING_URL_FORMAT_.format(key=key, page=n_page, per_page=per_page)
            if users:
                urls = [f'{url}&search={user}' for user in users]
            else:
                urls = [url]
            for url in urls:
                delay = 10
                for _ in range(10):
                    try:
                        headers = {
                            'x-csrf-token': csrf_token,
                            'x-requested-with': 'XMLHttpRequest',
                        }
                        page = REQ.get(url, headers=headers)
                        data = json.loads(page)
                        assert data.get('status') != 'rate_limit_exceeded'
                        break
                    except Exception:
                        traceback.print_exc()
                        delay = min(300, delay * 2)
                        sys.stdout.write(f'url = {url}\n')
                        sys.stdout.write(f'Sleep {delay}... ')
                        sys.stdout.flush()
                        time.sleep(delay)
                        sys.stdout.write('Done\n')
                else:
                    raise ExceptionParseStandings(f'Failed getting {n_page} by url {url}')

                if 'status' in data and data['status'] != 'success':
                    raise ExceptionParseStandings(json.dumps(data))

                unscored_problems = data['contest_info']['unscored_problems']

                if n_total_page is None:
                    for p in data['problems']:
                        if p['code'] in unscored_problems:
                            continue
                        d = problems_info
                        if 'division' in contest_info:
                            d = d.setdefault('division', OrderedDict())
                            d = d.setdefault(contest_info['division'], [])
                        d.append({
                            'short': p['code'],
                            'name': p['name'],
                            'url': f"https://www.codechef.com/problems/{p['code']}",
                        })
                    n_total_page = data['availablePages']
                    pbar = tqdm.tqdm(total=n_total_page * len(urls))
                    contest_type = data['contest_info'].get('type')

                for d in data['list']:
                    handle = d.pop('user_handle')
                    d.pop('html_handle', None)
                    problems_status = d.pop('problems_status')
                    if d['score'] < 1e-9 and not problems_status:
                        LOG.warning(f'Skip handle = {handle}: {d}')
                        continue
                    row = result.setdefault(handle, {})
                    row['member'] = handle
                    row['place'] = d.pop('rank')
                    row['solving'] = d.pop('score')

                    problems = row.setdefault('problems', {})
                    solved, upsolved = 0, 0
                    if problems_status:
                        for k, v in problems_status.items():
                            t = 'upsolving' if k in unscored_problems else 'result'
                            v[t] = v.pop('score')
                            solved += 1 if v.get('result', 0) > 0 else 0
                            upsolved += 1 if v.get('upsolving', 0) > 0 else 0
                            if contest_type == '1' and 'penalty' in v:
                                penalty = v.pop('penalty')
                                if v[t] > 0:
                                    v[t] = f'+{"" if penalty == 0 else penalty}'
                                else:
                                    v[t] = f'-{penalty}'
                            problems[k] = v
                        row['solved'] = {'solving': solved, 'upsolving': upsolved}

                    country = d.pop('country_code')
                    if country:
                        d['country'] = country

                    row.update(d)
                    row.update(contest_info)

                pbar.set_description(f'key={key} url={url}')
                pbar.update()

        has_penalty = False
        for row in result.values():
            p = row.get('penalty')
            has_penalty = has_penalty or p and str(p) != "0"
        if not has_penalty:
            for row in result.values():
                row.pop('penalty', None)

        if pbar is not None:
            pbar.close()

    standings = {
        'result': result,
        'url': self.url,
        'problems': problems_info,
    }
    return standings
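# The ranking loop above retries each API page with capped exponential backoff
# and treats 'rate_limit_exceeded' as a retryable failure. A minimal sketch of
# the same pattern using only the standard library; fetch_json_with_backoff is
# an illustrative helper, not part of the module:
import json
import time
import urllib.request

def fetch_json_with_backoff(url, attempts=10, delay=5, max_delay=300):
    for _ in range(attempts):
        try:
            with urllib.request.urlopen(url) as resp:
                data = json.loads(resp.read())
            if data.get('status') == 'rate_limit_exceeded':
                raise RuntimeError('rate limited')  # retry like any other failure
            return data
        except Exception:
            time.sleep(delay)
            delay = min(max_delay, delay * 2)  # double the wait, capped
    raise RuntimeError(f'failed to fetch {url} after {attempts} attempts')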
def get_standings(self, users=None, statistics=None):
    result = {}
    hidden_fields = []
    fields_types = {}
    order = None
    writers = defaultdict(int)

    start_time = self.start_time.replace(tzinfo=None)
    if not self.standings_url and datetime.now() - start_time < timedelta(days=30):
        opt = 0.61803398875

        def canonize_title(value):
            value = value.lower()
            value = re.sub(r'\s+-[^-]+$', '', value)
            value = re.sub(r'\bsingle\s+round\s+match\b', 'srm', value)
            value = re.sub(r'\bmarathon\s+match\b', 'mm', value)
            value = re.sub(r'[0-9]*([0-9]{2})\s*tco(\s+)', r'tco\1\2', value)
            value = re.sub(r'tco\s*[0-9]*([0-9]{2})(\s+)', r'tco\1\2', value)
            value = re.sub(r'^[0-9]{2}([0-9]{2})(\s+)', r'tco\1\2', value)
            return set(re.split('[^A-Za-z0-9]+', value))

        def process_match(date, title, url):
            nonlocal opt
            if abs(date - start_time) > timedelta(days=2):
                return
            a1 = canonize_title(title)
            a2 = canonize_title(self.name)
            intersection = 0
            for w1 in a1:
                for w2 in a2:
                    if w1.isdigit() or w2.isdigit():
                        if w1 == w2:
                            intersection += 1
                            break
                    elif w1.startswith(w2) or w2.startswith(w1):
                        intersection += 1
                        break
            union = len(a1) + len(a2) - intersection
            iou = intersection / union
            if iou > opt:
                opt = iou
                self.standings_url = url

        url = 'https://www.topcoder.com/tc?module=MatchList&nr=100500'
        page = REQ.get(url)
        re_round_overview = re.compile(
            r'''
(?:<td[^>]*>(?:
[^<]*<a[^>]*href="(?P<url>[^"]*/stat[^"]*rd=(?P<rd>[0-9]+)[^"]*)"[^>]*>(?P<title>[^<]*)</a>[^<]*|
(?P<date>[0-9]+\.[0-9]+\.[0-9]+)
)</td>[^<]*){2}
            ''',
            re.VERBOSE,
        )
        matches = re_round_overview.finditer(str(page))
        for match in matches:
            date = datetime.strptime(match.group('date'), '%m.%d.%Y')
            process_match(date, match.group('title'), urljoin(url, match.group('url')))

        url = 'https://www.topcoder.com/tc?module=BasicData&c=dd_round_list'
        page = REQ.get(url)
        root = ET.fromstring(page)
        for child in root:
            data = {}
            for field in child:
                data[field.tag] = field.text
            date = dateutil.parser.parse(data['date'])
            url = 'https://www.topcoder.com/stat?c=round_overview&er=5&rd=' + data['round_id']
            process_match(date, data['full_name'], url)

    for url in self.url, self.standings_url:
        if url:
            match = re.search('/challenges/(?P<cid>[0-9]+)', url)
            if match:
                challenge_id = match.group('cid')
                break
    else:
        challenge_id = None

    if challenge_id:  # marathon match
        url = conf.TOPCODER_API_MM_URL_FORMAT.format(challenge_id)
        page = REQ.get(url)
        data = json.loads(page)
        problems_info = []
        hidden_fields.extend(['time', 'submits', 'style'])
        fields_types = {'delta_rank': ['delta'], 'delta_score': ['delta']}
        order = ['place_as_int', '-solving', 'addition__provisional_rank', '-addition__provisional_score']
        for row in data:
            handle = row.pop('member')
            r = result.setdefault(handle, OrderedDict())
            r['member'] = handle
            r['place'] = row.pop('finalRank', None)
            r['provisional_rank'] = row.pop('provisionalRank', None)
            r['style'] = row.pop('style')
            if r['place'] and r['provisional_rank']:
                r['delta_rank'] = r['provisional_rank'] - r['place']
            submissions = row.pop('submissions')
            has_solution = False
            for s in submissions:
                score = s.get('finalScore')
                if not score or score == '-':
                    if 'provisional_score' not in r:
                        p_score = s.pop('provisionalScore', None)
                        if isinstance(p_score, str):
                            p_score = asfloat(p_score)
                        if p_score is not None:
                            r['provisional_score'] = round(p_score, 2) if p_score >= 0 else False
                            r['time'] = s['created']
                            has_solution = True
                    continue
                r['solving'] = score
                r['solved'] = {'solving': int(score > 0)}
                p_score = s.pop('provisionalScore')
                if isinstance(p_score, str):
                    p_score = asfloat(p_score)
                if p_score is not None and p_score > 0:
                    r['provisional_score'] = round(p_score, 2)
                    r['delta_score'] = round(score - p_score, 2)
                r['time'] = s['created']
                has_solution = True
                break
            if not has_solution:
                continue
            r['submits'] = len(submissions)
        if not result:
            raise ExceptionParseStandings('empty standings')
    else:  # single round match
        if not self.standings_url:
            raise InitModuleException('Not set standings url for %s' % self.name)
        url = self.standings_url + '&nr=100000042'
        page = REQ.get(url, time_out=100)
        result_urls = re.findall(r'<a[^>]*href="(?P<url>[^"]*)"[^>]*>Results</a>', str(page), re.I)
        if not result_urls:
            raise ExceptionParseStandings('not found result urls')

        dd_round_results = {}
        match = re.search('rd=(?P<rd>[0-9]+)', url)
        if match:
            rd = match.group('rd')
            url = f'https://www.topcoder.com/tc?module=BasicData&c=dd_round_results&rd={rd}'
            try:
                dd_round_results_page = REQ.get(url)
                root = ET.fromstring(dd_round_results_page)
                for child in root:
                    data = {}
                    for field in child:
                        data[field.tag] = field.text
                    handle = data.pop('handle')
                    dd_round_results[handle] = self._dict_as_number(data)
            except FailOnGetResponse:
                pass

        hidden_fields.extend(['coding_phase', 'challenge_phase', 'system_test', 'point_total', 'room'])

        matches = re.finditer('<table[^>]*>.*?</table>', page, re.DOTALL)
        problems_sets = []
        for match in matches:
            problems = re.findall(
                '<a[^>]*href="(?P<href>[^"]*c=problem_statement[^"]*)"[^>]*>(?P<name>[^/]*)</a>',
                match.group(),
                re.IGNORECASE,
            )
            if problems:
                problems_sets.append([{'short': n, 'url': urljoin(url, u)} for u, n in problems])

        problems_info = dict() if len(problems_sets) > 1 else list()
        for problems_set, result_url in zip(problems_sets, result_urls):
            url = urljoin(self.standings_url, result_url + '&em=1000000042')
            url = url.replace('&amp;', '&')
            division = int(parse_qs(url)['dn'][0])
            division_str = 'I' * division

            with PoolExecutor(max_workers=3) as executor:
                def fetch_problem(p):
                    errors = set()
                    for attempt in range(3):
                        try:
                            page = REQ.get(p['url'], time_out=30)
                            match = re.search('<a[^>]*href="(?P<href>[^"]*module=ProblemDetail[^"]*)"[^>]*>', page)
                            page = REQ.get(urljoin(p['url'], match.group('href')), time_out=30)
                            matches = re.findall(r'<td[^>]*class="statTextBig"[^>]*>(?P<key>[^<]*)</td>\s*<td[^>]*>(?P<value>.*?)</td>', page, re.DOTALL)  # noqa
                            for key, value in matches:
                                key = key.strip().rstrip(':').lower()
                                if key == 'categories':
                                    tags = [t.strip().lower() for t in value.split(',')]
                                    tags = [t for t in tags if t]
                                    if tags:
                                        p['tags'] = tags
                                elif key.startswith('writer') or key.startswith('tester'):
                                    key = key.rstrip('s') + 's'
                                    p[key] = re.findall('(?<=>)[^<>,]+(?=<)', value)
                            for w in p.get('writers', []):
                                writers[w] += 1
                            info = p.setdefault('info', {})
                            matches = re.finditer('<table[^>]*paddingTable2[^>]*>.*?</table>', page, re.DOTALL)
                            for match in matches:
                                html_table = match.group(0)
                                rows = parsed_table.ParsedTable(html_table)
                                for row in rows:
                                    key, value = None, None
                                    for k, v in row.items():
                                        if k == "":
                                            key = v.value
                                        elif k and division_str in k.split():
                                            value = v.value
                                    if key and value:
                                        key = re.sub(' +', '_', key.lower())
                                        info[key] = value
                                        if key == 'point_value':
                                            value = toint(value) or asfloat(value)
                                            if value is not None:
                                                p['full_score'] = value
                        except Exception as e:
                            errors.add(f'error parse problem info {p}: {e}')
                            sleep(5 + attempt)
                        else:
                            errors = None
                            break  # success, stop retrying
                    if errors:
                        LOG.error(errors)
                    return p

                for p in tqdm.tqdm(executor.map(fetch_problem, problems_set), total=len(problems_set)):
                    d = problems_info
                    if len(problems_sets) > 1:
                        d = d.setdefault('division', OrderedDict())
                        d = d.setdefault(division_str, [])
                    d.append(p)

            if not users and users is not None:
                continue

            page = REQ.get(url)
            rows = etree.HTML(page).xpath("//tr[@valign='middle']")
            header = None
            url_infos = []
            for row in rows:
                r = parsed_table.ParsedTableRow(row)
                if len(r.columns) < 10:
                    continue
                values = [c.value for c in r.columns]
                if header is None:
                    header = values
                    continue
                d = OrderedDict(list(zip(header, values)))
                handle = d.pop('Coders').strip()
                d = self._dict_as_number(d)
                if users and handle not in users:
                    continue
                row = result.setdefault(handle, OrderedDict())
                row.update(d)
                if not row.get('new_rating') and not row.get('old_rating') and not row.get('rating_change'):
                    row.pop('new_rating', None)
                    row.pop('old_rating', None)
                    row.pop('rating_change', None)
                row['member'] = handle
                row['place'] = row.pop('division_placed', None)
                row['solving'] = row['point_total']
                row['solved'] = {'solving': 0}
                row['division'] = 'I' * division
                if 'adv.' in row:
                    row['advanced'] = row.pop('adv.').lower().startswith('y')
                url_info = urljoin(url, r.columns[0].node.xpath('a/@href')[0])
                url_infos.append(url_info)

            def fetch_solution(url):
                for i in range(2):
                    try:
                        page = REQ.get(url, time_out=60)
                        match = re.search('<td[^>]*class="problemText"[^>]*>(?P<solution>.*?)</td>',
                                          page, re.DOTALL | re.IGNORECASE)
                        if not match:
                            break
                        ret = html.unescape(match.group('solution'))
                        ret = ret.strip()
                        ret = ret.replace('<BR>', '\n')
                        ret = ret.replace('\xa0', ' ')
                        return ret
                    except FailOnGetResponse:
                        sleep(i * 10 + 3)
                return None

            n_failed_fetch_info = 0

            def fetch_info(url):
                nonlocal n_failed_fetch_info
                if n_failed_fetch_info > 10:
                    return
                delay = 10
                for _ in range(5):
                    try:
                        page = REQ.get(url, time_out=delay)
                        match = re.search('class="coderBrackets">.*?<a[^>]*>(?P<handle>[^<]*)</a>',
                                          page, re.IGNORECASE)
                        if match:
                            break
                    except Exception:
                        sleep(delay + _)
                else:
                    n_failed_fetch_info += 1
                    return
                handle = html.unescape(match.group('handle').strip())
                match = re.search(r' Room\s*(?P<room>[0-9]+)', page)
                room = match.group('room') if match else None
                matches = re.finditer(
                    r'''
<td[^>]*>[^<]*<a[^>]*href="(?P<url>[^"]*c=problem_solution[^"]*)"[^>]*>(?P<short>[^<]*)</a>[^<]*</td>[^<]*
<td[^>]*>[^<]*</td>[^<]*
<td[^>]*>[^<]*</td>[^<]*
<td[^>]*>(?P<time>[^<]*)</td>[^<]*
<td[^>]*>(?P<status>[^<]*)</td>[^<]*
<td[^>]*>(?P<result>[^<]*)</td>[^<]*
                    ''',
                    page,
                    re.VERBOSE | re.IGNORECASE,
                )
                problems = {}
                n_fetch_solution = 0
                for match in matches:
                    d = match.groupdict()
                    short = d.pop('short')
                    solution_url = urljoin(url, d['url'])
                    d['url'] = solution_url
                    d = self._dict_as_number(d)
                    if d['status'] in ['Challenge Succeeded', 'Failed System Test']:
                        d['result'] = -d['result']
                    if abs(d['result']) < 1e-9:
                        d.pop('result')
                    if re.match('^[0.:]+$', d['time']):
                        d.pop('time')
                    else:
                        time_in_seconds = 0
                        for t in d['time'].split(':'):
                            time_in_seconds = time_in_seconds * 60 + asfloat(t)
                        d['time_in_seconds'] = time_in_seconds
                    solution = (statistics or {}).get(handle, {}).get('problems', {}).get(short, {}).get('solution')
                    if not solution:
                        n_fetch_solution += 1
                        solution = fetch_solution(solution_url)
                    d['solution'] = solution
                    problems[short] = d
                challenges = []
                matches = re.finditer(
                    r'''
<td[^>]*>[^<]*<a[^>]*href="[^"]*module=MemberProfile[^"]*"[^>]*>(?P<target>[^<]*)</a>[^<]*</td>[^<]*
<td[^>]*>(?P<problem>[^<]*)</td>[^<]*
<td[^>]*>(?P<status>[^<]*)</td>[^<]*
<td[^>]*>(?P<time>[^<]*)</td>[^<]*
<td[^>]*>(?P<result>[^<]*)</td>[^<]*
<td[^>]*>[^<]*<a[^>]*href="(?P<url>[^"]*)"[^>]*>\s*details\s*</a>[^<]*</td>[^<]*
                    ''',
                    page,
                    re.VERBOSE | re.IGNORECASE,
                )
                for match in matches:
                    d = match.groupdict()
                    d = {k: v.strip() for k, v in d.items()}
                    d['result'] = float(d['result'].replace(',', '.'))
                    d['url'] = urljoin(url, d['url'])
                    p = problems.setdefault(d['problem'], {})
                    p.setdefault('extra_score', 0)
                    p['extra_score'] += d['result']
                    p.setdefault('extra_info', []).append(f'{d["target"]}: {d["result"]}')
                    challenges.append(d)
                return url, handle, room, problems, challenges, n_fetch_solution

            with PoolExecutor(max_workers=20) as executor, tqdm.tqdm(total=len(url_infos)) as pbar:
                n_fetch_solution = 0
                for info in executor.map(fetch_info, url_infos):
                    if info is None:
                        continue
                    url, handle, room, problems, challenges, n_sol = info
                    n_fetch_solution += n_sol
                    pbar.set_description(f'div{division} {url}')
                    pbar.set_postfix(n_solution=n_fetch_solution, n_failed_fetch_info=n_failed_fetch_info)
                    pbar.update()
                    if handle is not None:
                        if handle not in result:
                            LOG.error(f'{handle} not in result, url = {url}')
                        row = result[handle]
                        row['url'] = url
                        if room:
                            row['room'] = room
                        row['problems'] = problems
                        row['challenges'] = challenges
                        for p in problems.values():
                            if p.get('result', 0) > 1e-9:
                                row['solved']['solving'] += 1
                        if challenges:
                            h = row.setdefault('hack', {
                                'title': 'challenges',
                                'successful': 0,
                                'unsuccessful': 0,
                            })
                            for c in challenges:
                                h['successful' if c['status'].lower() == 'yes' else 'unsuccessful'] += 1

        if dd_round_results:
            fields = set()
            hidden_fields_set = set(hidden_fields)
            for data in result.values():
                for field in data.keys():
                    fields.add(field)

            k_mapping = {'new_vol': 'new_volatility', 'advanced': None}
            for handle, data in dd_round_results.items():
                if handle not in result:
                    continue
                row = result[handle]
                for k, v in data.items():
                    k = k_mapping.get(k, k)
                    if k and k not in fields:
                        if k in {'new_rating', 'old_rating'} and not v:
                            continue
                        row[k] = v
                        if k not in hidden_fields_set:
                            hidden_fields_set.add(k)
                            hidden_fields.append(k)
                        ks = k.split('_')
                        if ks[0] == 'level' and ks[-1] == 'language' and v and v.lower() != 'unspecified':
                            idx = {'one': 0, 'two': 1, 'three': 2}.get(ks[1], None)
                            d = problems_info
                            if len(problems_sets) > 1:
                                d = d['division'][row['division']]
                            if idx is not None and 0 <= idx < len(d) and d[idx]['short'] in row['problems']:
                                row['problems'][d[idx]['short']]['language'] = v

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': problems_info,
        'hidden_fields': hidden_fields,
        'fields_types': fields_types,
        'options': {
            'fixed_fields': [('hack', 'Challenges')],
        },
    }
    if writers:
        writers = [w[0] for w in sorted(writers.items(), key=lambda w: w[1], reverse=True)]
        standings['writers'] = writers
    if re.search(r'\bfinals?(?:\s+rounds?)?$', self.name, re.I):
        standings['options']['medals'] = [{'name': name, 'count': 1}
                                          for name in ('gold', 'silver', 'bronze')]
    if order:
        standings['options']['order'] = order
    return standings
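# fetch_info above folds a colon-separated duration such as '1:02:03' into
# seconds by repeated multiply-by-60. The same fold in isolation;
# hms_to_seconds is a hypothetical helper name:
def hms_to_seconds(value):
    seconds = 0.0
    for part in value.split(':'):
        seconds = seconds * 60 + float(part)
    return seconds

assert hms_to_seconds('1:02:03') == 3723.0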
def _update_submissions(self, fusers, standings):
    result = standings['result']
    with PoolExecutor(max_workers=8) as executor:
        submissions = list(tqdm(
            executor.map(self.fetch_submissions, fusers),
            total=len(fusers),
            desc='getting first page',
        ))
        for fuser, page_submissions in zip(fusers, submissions):
            if page_submissions is None:
                break
            url, page, table, _, n_page = page_submissions
            submissions_times = {}

            def process_page(url, page, table):
                last_submission_time = 0
                for r in table:
                    row = dict()
                    for k, v in list(r.items()):
                        if v.value == 'Detail':
                            href = first(v.column.node.xpath('.//a/@href'))
                            if href:
                                row['url'] = urllib.parse.urljoin(url, href)
                                row['external_solution'] = True
                        elif k == 'User':
                            row['name'] = v.value
                            href = v.column.node.xpath('.//a/@href')[0]
                            row['user'] = href.split('/')[-1]
                        else:
                            k = k.lower().replace(' ', '_')
                            row[k] = v.value
                    submission_time = arrow.get(row['submission_time'])
                    upsolve = submission_time >= self.end_time
                    row['submission_time'] = submission_time.timestamp
                    last_submission_time = max(last_submission_time, row['submission_time'])
                    row['verdict'] = row.pop('status')
                    user = row.pop('user')
                    name = row.pop('name')
                    task = row.pop('task').split()[0]
                    score = float(row.pop('score'))

                    res = result.setdefault(user, collections.OrderedDict())
                    res.setdefault('member', user)
                    if name != user:
                        res['name'] = name
                    problems = res.setdefault('problems', {})
                    problem = problems.setdefault(task, {})
                    problem_score = problem.get('result', 0)
                    eps = 1e-9
                    if upsolve:
                        problem = problem.setdefault('upsolving', {})
                        st = submissions_times.setdefault((user, task), problem.get('submission_time'))
                        problem_score = problem.get('result', 0)
                        if 'submission_time' not in problem:
                            problem_score = 0
                        if score > eps:
                            row['result'] = score
                        elif problem_score < eps and (not st or st < row['submission_time']):
                            problem['result'] = problem_score - 1
                            row.pop('result', None)
                    if 'submission_time' in problem and row['submission_time'] <= problem['submission_time']:
                        continue
                    if problem_score > eps and problem_score > score:
                        continue
                    problem.update(row)
                return last_submission_time

            last_submission_time = process_page(url, page, table)

            st_data = result.get(fuser, {}) if fuser else standings
            submissions_info = st_data.setdefault('_submissions_info', {})
            limit_st = submissions_info.pop('last_submission_time', self.DEFAULT_LAST_SUBMISSION_TIME)
            last_page = submissions_info.pop('last_page', self.DEFAULT_LAST_PAGE)
            last_page_st = submissions_info.pop('last_page_st', self.DEFAULT_LAST_SUBMISSION_TIME)
            c_page = last_page
            self._stop = False
            fetch_submissions_user = functools.partial(self.fetch_submissions, fuser)
            for page_submissions in tqdm(
                executor.map(fetch_submissions_user, range(last_page + 1, n_page + 1)),
                total=n_page - last_page,
                desc=f'getting submissions for ({last_page};{n_page}]',
            ):
                if page_submissions is None:
                    submissions_info['last_page'] = c_page
                    submissions_info['last_page_st'] = last_page_st
                    LOG.info(f'stopped after ({last_page};{c_page}] of {n_page}')
                    self._stop = True
                    break
                url, page, table, c_page, _ = page_submissions
                submission_time = process_page(url, page, table)
                last_page_st = max(last_page_st, submission_time)
                if submission_time < limit_st:
                    self._stop = True
                    break
            if 'last_page' not in submissions_info:
                submissions_info['last_submission_time'] = \
                    last_submission_time if last_page == self.DEFAULT_LAST_PAGE else last_page_st
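# _update_submissions crawls submissions incrementally: it keeps the newest
# submission time seen as a watermark and, on the next run, stops paging once
# a page's newest submission is older than that watermark. A simplified sketch
# of that control flow; crawl_new_submissions and fetch_page are illustrative
# names, not the module's API:
def crawl_new_submissions(fetch_page, n_pages, watermark):
    """fetch_page(i) -> (rows, newest_time) or None; returns the new watermark."""
    newest = watermark
    for i in range(1, n_pages + 1):
        page = fetch_page(i)
        if page is None:
            break  # transient failure: keep the old watermark, retry later
        rows, newest_time = page
        newest = max(newest, newest_time)
        if newest_time < watermark:
            break  # everything older than the watermark is already stored
    return newest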
def get_standings(self, users=None, statistics=None):
    result = {}

    if users and statistics:
        fusers = users
        for member in users:
            if member not in statistics:
                continue
            info = collections.OrderedDict(deepcopy(statistics.get(member)))
            info['member'] = member
            for f in ('place', 'solving', 'upsolving'):
                info[f] = '__unchanged__'
            result[member] = info
        standings = {'result': result}
    else:
        fusers = []
        page = self._get(self.url)

        match = re.search(r'(?<=<li>)Writer:.*', page)
        writers = []
        if match:
            matches = re.findall('(?<=>)[^<]+(?=</)', match.group())
            writers = list()
            for m in matches:
                writers.extend(map(str.strip, re.split(r'[,\s]+', m)))
            writers = [w for w in writers if w and w != '?']

        url = f'{self.RESULTS_URL_.format(self)}/'
        page = self._get(url)
        match = re.search(r'var\s*results\s*=\s*(\[[^\n]*\]);$', page, re.MULTILINE)
        data = json.loads(match.group(1))
        results = {}
        for row in data:
            if not row.get('IsRated'):
                continue
            handle = row.pop('UserScreenName')
            if users and handle not in users:
                continue
            if 'NewRating' not in row:
                continue
            r = collections.OrderedDict()
            for k in ['OldRating', 'NewRating', 'Performance']:
                if k in row:
                    r[k] = row[k]
            results[handle] = r

        url = f'{self.STANDING_URL_.format(self)}/json'
        try:
            page = self._get(url)
        except FailOnGetResponse as e:
            if e.code == 404:
                return {'action': 'delete'}
            raise e
        data = json.loads(page)

        task_info = collections.OrderedDict()
        for t in data['TaskInfo']:
            k = t['TaskScreenName']
            task_info[k] = {
                'short': t['Assignment'],
                'name': t['TaskName'],
                'url': self.PROBLEM_URL_.format(self, self.key.replace('-', '_'), t['Assignment'].lower()),
            }

        has_rated = False
        has_new_rating = False
        for row in data['StandingsData']:
            if not row['TaskResults'] and not row.get('IsRated'):
                continue
            handle = row.pop('UserScreenName')
            if users is not None and handle not in users:
                continue
            r = result.setdefault(handle, collections.OrderedDict())
            r['member'] = handle
            if row.pop('UserIsDeleted', None):
                r['action'] = 'delete'
                continue
            r['place'] = row.pop('Rank')
            total_result = row.pop('TotalResult')
            penalty = total_result['Elapsed'] // 10**9
            r['penalty'] = f'{penalty // 60}:{penalty % 60:02d}'
            r['solving'] = total_result['Score'] / 100.
            r['country'] = row.pop('Country')
            if 'UserName' in row:
                r['name'] = row.pop('UserName')
            r['url'] = self.SUBMISSIONS_URL_.format(self) + f'?f.User={handle}'

            stats = (statistics or {}).get(handle, {})
            problems = r.setdefault('problems', stats.get('problems', {}))
            solving = 0
            task_results = row.pop('TaskResults', {})
            for k, v in task_results.items():
                if 'Score' not in v:
                    continue
                letter = task_info[k]['short']
                p = problems.setdefault(letter, {})
                if v['Score'] > 0:
                    solving += 1
                    p['result'] = v['Score'] / 100.
                    seconds = v['Elapsed'] // 10**9
                    p['time_in_seconds'] = seconds
                    p['time'] = f'{seconds // 60}:{seconds % 60:02d}'
                    if v['Penalty'] > 0:
                        p['penalty'] = v['Penalty']
                else:
                    p['result'] = -v['Failure']
            r['solved'] = {'solving': solving}

            row.update(r)
            row.pop('Additional', None)
            if 'AtCoderRank' in row:
                row['AtcoderRank'] = row.pop('AtCoderRank')
            rating = row.pop('Rating', None)
            if rating is not None:
                r['info'] = {'rating': rating}
            old_rating = row.pop('OldRating', None)
            for k, v in sorted(row.items()):
                r[k] = v
            if handle in results:
                r.update(results.pop(handle))
            if old_rating is not None and (old_rating or 'NewRating' in r):
                r['OldRating'] = old_rating
            if r.get('IsRated'):
                has_rated = True
                if r.get('NewRating') is not None:
                    has_new_rating = True

        url = f'{self.STANDING_URL_.format(self)}/virtual/json'
        page = self._get(url)
        data = json.loads(page)
        for row in data['StandingsData']:
            if not row['TaskResults']:
                continue
            handle = row.pop('UserScreenName')
            if users is not None and handle not in users:
                continue
            r = result.setdefault(handle, collections.OrderedDict())
            r['member'] = handle
            if row.pop('UserIsDeleted', None):
                r['action'] = 'delete'
                continue
            r['country'] = row.pop('Country')
            if 'UserName' in row:
                r['name'] = row.pop('UserName')
            r['url'] = self.SUBMISSIONS_URL_.format(self) + f'?f.User={handle}'

            stats = (statistics or {}).get(handle, {})
            problems = r.setdefault('problems', stats.get('problems', {}))
            task_results = row.pop('TaskResults', {})
            for k, v in task_results.items():
                if 'Score' not in v:
                    continue
                letter = task_info[k]['short']
                p = problems.setdefault(letter, {})
                p = p.setdefault('upsolving', {})
                p_result = p.get('result', 0)
                score = v['Score'] / 100.
                if score > 0 and score > p_result:
                    p['result'] = score
                    seconds = v['Elapsed'] // 10**9
                    p['time_in_seconds'] = seconds
                    p['time'] = f'{seconds // 60}:{seconds % 60:02d}'
                    if v['Penalty'] > 0:
                        p['penalty'] = v['Penalty']
                elif score <= 0 and p_result <= 0 and -v['Failure'] < p_result:
                    p['result'] = -v['Failure']

        if self.info.get('_submissions_info', {}).get('last_submission_time', -1) > 0:
            for r in result.values():
                problems = r.get('problems', {})
                if not problems:
                    continue
                no_url = False
                for p in problems.values():
                    u = p.get('upsolving', {})
                    if 'result' in p and 'url' not in p or 'result' in u and 'url' not in u:
                        no_url = True
                        break
                if no_url:
                    fusers.append(r['member'])

        if statistics:
            for member, row in statistics.items():
                if member not in result:
                    has_result = any('result' in p for p in row.get('problems', {}).values())
                    if has_result:
                        continue
                    row['member'] = member
                    result[member] = row

        standings = {
            'result': result,
            'url': self.STANDING_URL_.format(self),
            'problems': list(task_info.values()),
            'writers': writers,
        }

        if (has_rated and not has_new_rating
                and self.end_time + timedelta(hours=3) > datetime.utcnow().replace(tzinfo=pytz.utc)):
            standings['timing_statistic_delta'] = timedelta(minutes=30)

    if users or users is None:
        self._stop = False
        page_submissions = self.fetch_submissions()
        if page_submissions is not None:
            standings['_submissions_info'] = {} if statistics is None else self.info.pop('_submissions_info', {})
            standings['info_fields'] = ['_submissions_info']
            *_, n_page = page_submissions

            if not users:
                if fusers:
                    LOG.info(f'Number of users without urls for some problems: {len(fusers)}')
                if not fusers or 'last_page' in standings['_submissions_info']:
                    fusers = [None]
                elif len(fusers) > n_page:
                    standings['_submissions_info'].pop('_last_submission_time', None)
                    fusers = [None]

            self._update_submissions(fusers, standings)

        if page_submissions is None or 'last_page' in standings['_submissions_info']:
            delta = timedelta(minutes=15)
            LOG.info(f'Repeat statistics update after {delta}')
            standings['timing_statistic_delta'] = delta

    for row in result.values():
        has_result = any('result' in p for p in row.get('problems', {}).values())
        if has_result or row.get('IsRated'):
            row.pop('_no_update_n_contests', None)
        else:
            row['_no_update_n_contests'] = True

    standings['hidden_fields'] = [
        'Affiliation',
        'AtcoderRank',
        'Competitions',
        'IsRated',
        'IsTeam',
    ]
    return standings
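# AtCoder reports elapsed time in nanoseconds; the parser above converts it
# with `Elapsed // 10**9` and renders 'M:SS'. The conversion in isolation;
# elapsed_to_time is a hypothetical helper name:
def elapsed_to_time(elapsed_ns):
    seconds = elapsed_ns // 10**9
    return f'{seconds // 60}:{seconds % 60:02d}'

assert elapsed_to_time(3_723_000_000_000) == '62:03'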
def get_standings(self, users=None, statistics=None):
    # REQ.get('https://www.codechef.com/')
    # try:
    #     form = REQ.form()
    #     form['post'].update({
    #         'name': self._username,
    #         'pass': self._password,
    #     })
    #     page = REQ.get(form['url'], post=form['post'])
    #     form = REQ.form()
    #     if form['url'] == '/session/limit':
    #         for field in form['unchecked'][:-1]:
    #             form['post'][field['name']] = field['value'].encode('utf8')
    #         page = REQ.get(form['url'], post=form['post'])
    # except Exception:
    #     pass

    url = self.API_CONTEST_URL_FORMAT_.format(**self.__dict__)
    page = REQ.get(url)
    data = json.loads(page)
    if data['status'] != 'success':
        raise ExceptionParseStandings(json.dumps(data))
    if 'child_contests' in data:
        contest_infos = {
            d['contest_code']: {'division': k}
            for k, d in data['child_contests'].items()
        }
    else:
        contest_infos = {self.key: {}}

    result = {}

    problems_info = dict() if len(contest_infos) > 1 else list()
    hidden_fields = set()
    problems_data = defaultdict(dict)
    writers = defaultdict(int)

    for key, contest_info in contest_infos.items():
        url = self.STANDINGS_URL_FORMAT_.format(key=key)
        page = REQ.get(url)

        match = re.search('<input[^>]*name="csrfToken"[^>]*id="edit-csrfToken"[^>]*value="([^"]*)"', page)
        if not match:
            raise ExceptionParseStandings('not found csrf token')
        csrf_token = match.group(1)
        headers = {'x-csrf-token': csrf_token, 'x-requested-with': 'XMLHttpRequest'}

        n_page = 0
        per_page = 150
        n_total_page = None
        pbar = None
        contest_type = None
        while n_total_page is None or n_page < n_total_page:
            n_page += 1
            time.sleep(2)
            url = self.API_RANKING_URL_FORMAT_.format(key=key, page=n_page, per_page=per_page)
            if users:
                urls = [f'{url}&search={user}' for user in users]
            else:
                urls = [url]
            for url in urls:
                delay = 5
                for _ in range(10):
                    try:
                        page = REQ.get(url, headers=headers)
                        data = json.loads(page)
                        assert data.get('status') != 'rate_limit_exceeded'
                        break
                    except Exception:
                        traceback.print_exc()
                        delay = min(300, delay * 2)
                        sys.stdout.write(f'url = {url}\n')
                        sys.stdout.write(f'Sleep {delay}... ')
                        sys.stdout.flush()
                        time.sleep(delay)
                        sys.stdout.write('Done\n')
                else:
                    raise ExceptionParseStandings(f'Failed getting {n_page} by url {url}')

                if 'status' in data and data['status'] != 'success':
                    raise ExceptionParseStandings(json.dumps(data))

                unscored_problems = data['contest_info']['unscored_problems']

                if n_total_page is None:
                    for p in data['problems']:
                        if p['code'] in unscored_problems:
                            continue
                        d = problems_info
                        if 'division' in contest_info:
                            d = d.setdefault('division', OrderedDict())
                            d = d.setdefault(contest_info['division'], [])
                        code = p['code']
                        problem_info = {
                            'short': code,
                            'name': p['name'],
                            'url': f'https://www.codechef.com/problems/{code}',
                        }
                        d.append(problem_info)

                        if code not in problems_data:
                            problem_url = self.API_PROBLEM_URL_FORMAT_.format(code=code)
                            page = REQ.get(problem_url, headers=headers)
                            problem_data = json.loads(page)

                            writer = problem_data.get('problem_author')
                            if writer:
                                writers[writer] += 1
                                problems_data[code]['writers'] = [writer]

                            tags = problem_data.get('tags')
                            if tags:
                                matches = re.findall('<a[^>]*>([^<]+)</a>', tags)
                                problems_data[code]['tags'] = matches

                        problem_info.update(problems_data[code])

                    n_total_page = data['availablePages']
                    pbar = tqdm.tqdm(total=n_total_page * len(urls))
                    contest_type = data['contest_info'].get('type')

                for d in data['list']:
                    handle = d.pop('user_handle')
                    d.pop('html_handle', None)
                    problems_status = d.pop('problems_status')
                    if d['score'] < 1e-9 and not problems_status:
                        LOG.warning(f'Skip handle = {handle}: {d}')
                        continue
                    row = result.setdefault(handle, OrderedDict())
                    row['member'] = handle
                    row['place'] = d.pop('rank')
                    row['solving'] = d.pop('score')
                    for k in 'time', 'total_time':
                        if k in d:
                            row['time'] = d.pop(k)
                            break

                    problems = row.setdefault('problems', {})
                    solved, upsolved = 0, 0
                    if problems_status:
                        for k, v in problems_status.items():
                            t = 'upsolving' if k in unscored_problems else 'result'
                            v[t] = v.pop('score')
                            solved += 1 if v.get('result', 0) > 0 else 0
                            upsolved += 1 if v.get('upsolving', 0) > 0 else 0

                            if contest_type == '1' and 'penalty' in v:
                                penalty = v.pop('penalty')
                                if v[t] > 0:
                                    v[t] = f'+{"" if penalty == 0 else penalty}'
                                else:
                                    v[t] = f'-{penalty}'

                            if v.get('time'):
                                time_in_seconds = 0
                                for t in str(v['time']).split(':'):
                                    time_in_seconds = time_in_seconds * 60 + int(t)
                                v['time_in_seconds'] = time_in_seconds

                            problems[k] = v
                        row['solved'] = {'solving': solved, 'upsolving': upsolved}

                    country = d.pop('country_code')
                    if country:
                        d['country'] = country

                    rating = d.pop('rating', None)
                    if rating and rating != '0':
                        hidden_fields.add('rating')
                        row['rating'] = rating

                    row.update(d)
                    row.update(contest_info)

                    if statistics and handle in statistics:
                        stat = statistics[handle]
                        for k in ('rating_change', 'new_rating'):
                            if k in stat:
                                row[k] = stat[k]

                    hidden_fields |= set(list(d.keys()))

                pbar.set_description(f'key={key} url={url}')
                pbar.update()

        has_penalty = False
        for row in result.values():
            p = row.get('penalty')
            has_penalty = has_penalty or p and str(p) != "0"
        if not has_penalty:
            for row in result.values():
                row.pop('penalty', None)

        if pbar is not None:
            pbar.close()

    standings = {
        'result': result,
        'url': self.url,
        'problems': problems_info,
        'hidden_fields': list(hidden_fields),
    }

    if writers:
        writers = [w[0] for w in sorted(writers.items(), key=lambda w: w[1], reverse=True)]
        standings['writers'] = writers

    return standings
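# For ICPC-style contests (contest_type == '1') the loop above rewrites a
# problem score into the conventional '+', '+k', '-k' attempt notation based
# on the wrong-attempt penalty. The mapping in isolation; format_attempts is
# a hypothetical helper name:
def format_attempts(score, penalty):
    if score > 0:
        return '+' if penalty == 0 else f'+{penalty}'
    return f'-{penalty}'

assert format_attempts(1, 0) == '+'
assert format_attempts(1, 2) == '+2'
assert format_attempts(0, 3) == '-3'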