def get_from_icpc(year):
    """Fetch ICPC medal counts for `year` from the icpc.global CMS API.

    Returns an OrderedDict {'gold': n, 'silver': n, 'bronze': n}, or None
    when the page is missing, has no medal table, or no medals are counted.
    """
    medal_result_url = f'https://icpc.global/api/help/cms/virtpublic/community/results-{year}'
    page = REQ.get(medal_result_url)
    try:
        json_data = json.loads(page)
    except json.decoder.JSONDecodeError:
        return
    regex = '''<table[^>]*id=["']medalTable[^>]*>.*?</table>'''
    match = re.search(regex, json_data['content'], re.DOTALL)
    if not match:
        return
    html_table = match.group(0)
    table = parsed_table.ParsedTable(html_table)
    medals = OrderedDict()
    fields = ('gold', 'silver', 'bronze')
    for f in fields:
        medals[f] = 0
    for r in table:
        # The first cell's CSS class names the medal of the row, if any.
        _, v = next(iter(r.items()))
        for attr in v.attrs.get('class', '').split():
            if attr in fields:
                medals[attr] += 1
                break
    # FIX: the dict is pre-seeded with zeros and therefore always truthy;
    # the guard must check whether any medal was actually counted.
    if not any(medals.values()):
        return
    return medals
def fetch_ratings(user, account):
    """Fetch rating history and profile info for a toph.co user.

    Returns a (user, info, ratings) tuple:
    - info is False on fetch failure or for virtual accounts, None when
      the profile is gone (HTTP 404), otherwise a dict of profile fields;
    - ratings maps contest key -> {'new_rating': int} (None on 404/skip).
    """
    # Virtual accounts have no real profile page to scrape.
    if account.info.get('is_virtual'):
        return user, False, None
    try:
        page = REQ.get(f'https://toph.co/u/{user}/ratings')
    except FailOnGetResponse as e:
        if e.code == 404:
            # Profile no longer exists.
            return user, None, None
        return user, False, None
    tables = re.findall('<table[^>]*>.*?</table>', page, re.DOTALL)
    # The rating history is the last table on the page.
    t = parsed_table.ParsedTable(html=tables[-1])
    ratings = {}
    info = {}
    for row in t:
        # Contest key is the last path component of the contest link.
        href = row['Contest'].column.node.xpath('.//a/@href')[0]
        key = href.rstrip('/').split('/')[-1]
        rating = int(row['Rating'].value)
        ratings[key] = {'new_rating': rating}
        # Rows are newest-first: the first row's rating is the current one.
        info.setdefault('rating', rating)
    # Profile stat cards come as value/title div pairs.
    matches = re.finditer(
        '''
        <div[^>]*class="?value"?[^>]*>(?P<value>[^<]*)</div>[^<]*
        <div[^>]*class="?title"?>(?P<key>[^<]*)</div>
        ''',
        page,
        re.DOTALL | re.VERBOSE)
    for match in matches:
        key = match.group('key').lower()
        value = match.group('value')
        info[key] = value
    return user, info, ratings
def fetch_submissions(self, fuser=None, c_page=1):
    """Fetch one page of the submissions table.

    Args:
        fuser: optional user filter appended to the query string.
        c_page: 1-based page number to fetch.

    Returns (url, page, table, c_page, n_page), where n_page is the
    largest page number seen in the paginator, or None when stopped or
    when all fetch attempts failed.
    """
    url = self.SUBMISSIONS_URL_.format(self) + f'?page={c_page}'
    if fuser:
        url += f'&f.User={fuser}'
    # Retry with linear backoff (0, 1, 2, 3 seconds between attempts).
    for attempt in range(4):
        if self._stop:
            return
        try:
            page = self._get(url)
            break
        except FailOnGetResponse:
            time.sleep(attempt)
    else:
        # Every attempt failed.
        return
    regex = '<table[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table, with_duplicate_colspan=True)
    # Collect the page numbers from pagination links to find the total.
    pages = re.findall(
        r'''<a[^>]*href=["'][^"']*/submissions\?[^"']*page=([0-9]+)[^"']*["'][^>]*>[0-9]+</a>''',
        page)  # noqa
    n_page = max(map(int, pages))
    return url, page, table, c_page, n_page
def get_standings(self, users=None, statistics=None):
    """Parse an ir-contest-standings table into a standings dict.

    Member keys are name + season suffix; duplicate names within one
    standings get an extra '-N' suffix.
    """
    # Season runs September..August.
    year = self.start_time.year - (0 if self.start_time.month > 8 else 1)
    season = f'{year}-{year + 1}'
    result = {}
    page = REQ.get(self.standings_url)
    table = parsed_table.ParsedTable(
        html=page,
        xpath="//table[@class='ir-contest-standings']//tr")
    problems_info = collections.OrderedDict()
    for r in table:
        row = collections.OrderedDict()
        problems = row.setdefault('problems', {})
        # A total-score column marks IOI-style (partial) scoring.
        ioi_total_fields = ['Sum', 'Сумма']
        ioi_style = any((f in r for f in ioi_total_fields))
        for k, v in list(r.items()):
            classes = v.attrs['class'].split()
            if 'ir-column-contestant' in classes:
                row['member'] = v.value + ' ' + season
                row['name'] = v.value
            elif 'ir-column-place' in classes:
                row['place'] = v.value
            elif 'ir-column-penalty' in classes:
                row['penalty'] = int(v.value)
            elif 'ir-problem-count' in classes or k in ioi_total_fields:
                row['solving'] = int(v.value)
            elif len(k.split()[0]) == 1:
                # Single-letter header => a problem column.
                letter = k.split()[0]
                problems_info[letter] = {'short': letter}
                if v.value == DOT:
                    continue
                p = problems.setdefault(letter, {})
                # Normalize unicode minus before splitting "result time".
                values = v.value.replace('−', '-').split(' ')
                p['result'] = values[0]
                if len(values) > 1:
                    p['time'] = values[1]
                if ioi_style and p['result'].isdigit():
                    val = int(p['result'])
                    if val:
                        # Below full score (100) counts as partial.
                        p['partial'] = val < 100
            else:
                row[k.lower()] = v.value
        if not problems or users and row['member'] not in users:
            continue
        member = row['member']
        # Disambiguate members that share the same name within a season.
        if member in result:
            idx = 0
            while member + f'-{idx}' in result:
                idx += 1
            member += f'-{idx}'
            row['member'] = member
        result[member] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
        'problems_time_format': '{H}:{m:02d}',
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse a past-event CTF rating table into standings.

    Returns {'action': 'delete'} when the event page is gone (404).
    """
    try:
        page = REQ.get(self.url)
    except FailOnGetResponse as e:
        # A 404 means the event was removed upstream.
        return {'action': 'delete'} if e.code == 404 else {}
    match = re.search('<table[^>]*past_event_rating[^>]*>.*?</table>', page, re.DOTALL)
    if not match:
        raise ExceptionParseStandings('not found table')
    header_mapping = {
        'Team': 'name',
        'Place': 'place',
        'CTF points': 'solving',
    }
    table = parsed_table.ParsedTable(html=match.group(0), header_mapping=header_mapping)
    results = {}
    max_score = 0
    for r in table:
        row = OrderedDict()
        for k, v in r.items():
            k = k.strip('*')
            k = k.strip(' ')
            # Multi-column cells are joined into one string value.
            value = ' '.join([c.value for c in v]).strip() if isinstance(
                v, list) else v.value
            if k == 'name':
                # The team id is the numeric suffix of the team link.
                href = v.column.node.xpath('.//a/@href')[0]
                match = re.search('/([0-9]+)/?$', href)
                row['member'] = match.group(1)
                row['name'] = value
            else:
                value = as_number(value)
                row[k] = value
        max_score = max(max_score, row.get('solving', 0))
        results[row['member']] = row
    # Express each score as a percentage of the best score.
    if max_score > 0:
        for row in results.values():
            if 'solving' in row:
                row['percent'] = f'{row["solving"] * 100 / max_score:.2f}'
    # Only finals (and not qualifiers) award a gold medal to the winner.
    has_medals = not re.search(r'\bqual', self.name, flags=re.I) and re.search(
        r'\bfinal', self.name, flags=re.I)
    medals = [{'name': 'gold', 'count': 1}] if has_medals else []
    return dict(
        standings_url=self.url,
        result=results,
        options={'medals': medals},
    )
def fetch_members(r):
    """Populate r['members'] from the row's members_url, when present."""
    members_url = r.pop('members_url', None)
    if not members_url:
        return r
    raw = self._get(members_url)
    data = json.loads(raw)['data']
    for member_row in parsed_table.ParsedTable(data):
        developer = member_row['Developer'].value.strip()
        # The handle is the last whitespace-separated token.
        _, handle = developer.rsplit(' ', 1)
        r['members'].append(handle)
    return r
def get_standings(self, users=None, statistics=None):
    """Parse an olympiad results table into standings.

    Problem columns are detected by the 'taskscore' header class and
    numbered in order of appearance; full score is assumed to be 100.
    """
    result = {}
    problems_info = OrderedDict()
    if not self.standings_url:
        self.standings_url = self.url.replace('/olympiads/', '/results/')
    page = REQ.get(self.standings_url)
    regex = '<table[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table, as_list=True)
    for r in table:
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        problem_idx = 0
        for k, v in r:
            if 'taskscore' in v.header.attrs.get('class', '').split():
                problem_idx += 1
                d = problems_info.setdefault(problem_idx, {})
                d['short'] = str(problem_idx)
                d['full_score'] = 100
                d['name'] = k
                try:
                    score = float(v.value)
                    p = problems.setdefault(str(problem_idx), {})
                    p['result'] = v.value
                    p['partial'] = score < 100
                except Exception:
                    # Non-numeric cell: no attempt on this problem.
                    pass
            elif k == 'Abs.':
                row['solving'] = float(v.value)
            elif k == 'Rank':
                row['place'] = v.value.strip('*').strip('.')
            elif k == 'Contestant':
                # The handle is the last path component of the profile URL.
                url = first(v.column.node.xpath('a[@href]/@href'))
                member = url.strip('/').split('/')[-1]
                row['member'] = member
                row['name'] = v.value
            elif k == 'Country':
                # Drop trailing participant counters, e.g. "Poland 12".
                row['country'] = re.sub(r'\s*[0-9]+$', '', v.value)
            else:
                row[k] = v.value
        result[row['member']] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def fetch_problem(p):
    """Enrich problem dict `p` with tags, writers/testers and point value.

    Retries up to three times with backoff; accumulated errors are logged
    and the (possibly partially filled) dict is returned either way.
    """
    errors = set()
    for attempt in range(3):
        try:
            page = REQ.get(p['url'], time_out=30)
            # Follow the link to the detailed problem statistics page.
            match = re.search('<a[^>]*href="(?P<href>[^"]*module=ProblemDetail[^"]*)"[^>]*>', page)
            page = REQ.get(urljoin(p['url'], match.group('href')), time_out=30)
            matches = re.findall(r'<td[^>]*class="statTextBig"[^>]*>(?P<key>[^<]*)</td>\s*<td[^>]*>(?P<value>.*?)</td>', page, re.DOTALL)  # noqa
            for key, value in matches:
                key = key.strip().rstrip(':').lower()
                if key == 'categories':
                    tags = [t.strip().lower() for t in value.split(',')]
                    tags = [t for t in tags if t]
                    if tags:
                        p['tags'] = tags
                elif key.startswith('writer') or key.startswith('tester'):
                    # Normalize singular/plural keys to 'writers'/'testers'.
                    key = key.rstrip('s') + 's'
                    p[key] = re.findall('(?<=>)[^<>,]+(?=<)', value)
            for w in p.get('writers', []):
                writers[w] += 1
            info = p.setdefault('info', {})
            matches = re.finditer('<table[^>]*paddingTable2[^>]*>.*?</table>', page, re.DOTALL)
            for match in matches:
                html_table = match.group(0)
                rows = parsed_table.ParsedTable(html_table)
                for row in rows:
                    key, value = None, None
                    for k, v in row.items():
                        if k == "":
                            key = v.value
                        elif k and division_str in k.split():
                            # Keep only the value for the current division.
                            value = v.value
                    if key and value:
                        key = re.sub(' +', '_', key.lower())
                        info[key] = value
                        if key == 'point_value':
                            value = toint(value) or asfloat(value)
                            if value is not None:
                                p['full_score'] = value
        except Exception as e:
            errors.add(f'error parse problem info {p}: {e}')
            sleep(5 + attempt)
        else:
            errors = None
            # FIX: stop retrying after a successful parse; previously the
            # loop continued, refetching the page and (on a later failure)
            # calling .add() on the None left here.
            break
    if errors:
        LOG.error(errors)
    return p
def get_standings(self, users=None, statistics=None):
    """Parse the Project Euler 'fastest solvers' table for this problem."""
    if not self.standings_url:
        self.standings_url = f'https://projecteuler.net/fastest={self.key}'
    result = {}
    page = REQ.get(self.standings_url, headers=conf.PROJECTEULER_COOKIE_HEADER)
    regex = '<table[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table)
    for r in table:
        row = OrderedDict()
        # Everyone listed solved the (single) problem.
        row['solving'] = 1
        for k, v in r.items():
            if isinstance(v, list):
                # Combined cell: rank number plus country flag icon.
                place, country = v
                row['place'] = re.match('[0-9]+', place.value).group(0)
                country = first(country.column.node.xpath('.//@title'))
                if country:
                    row['country'] = country
            elif k == 'Time To Solve':
                # Value like "3 days, 2 hours" -> timedelta kwargs.
                params = {}
                for x in v.value.split(', '):
                    value, field = x.split()
                    if field[-1] != 's':
                        field += 's'
                    params[field] = int(value)
                delta = timedelta(**params)
                # Penalty is expressed in minutes.
                row['penalty'] = f'{delta.total_seconds() / 60:.2f}'
            elif k == 'User':
                member = first(v.column.node.xpath('.//@title')) or v.value
                row['member'] = member
            else:
                row[k.lower()] = v.value
        if 'member' not in row:
            continue
        result[row['member']] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': [],
    }
    return standings
def parse_problems_infos():
    """Parse the contest's problems page into an OrderedDict keyed by short name.

    Rows belonging to other rounds (section headers whose text does not
    appear in the contest name) are skipped.
    """
    problem_url = self.standings_url.replace('/ranking', '/p')
    page = REQ.get(problem_url)
    match = re.search(
        r'<h1[^>]*>[^<]*</h1>(\s*<[^/][^>]*>)*\s*(?P<table><table[^>]*>.*?</table>)',
        page, re.DOTALL)
    if not match:
        raise ExceptionParseStandings('Not found problems table')
    table = parsed_table.ParsedTable(html=match.group('table'), ignore_wrong_header_number=False)
    skip = False
    problems_infos = collections.OrderedDict()
    for r in table:
        if isinstance(r, parsed_table.ParsedTableRow):
            # Section header row: its text (round name, parenthetical
            # suffix stripped) decides whether following rows are ours.
            runda = re.sub(r'\s*\(.*\)\s*$', '', r.columns[0].value).strip()
            skip = runda.lower() not in self.name.lower()
            continue
        if skip:
            continue
        problem_info = {}
        for k, vs in list(r.items()):
            if isinstance(vs, list):
                v = ' '.join([v.value for v in vs]).strip()
            else:
                v = vs.value
            if not k:
                problem_info['short'] = v
            elif k in ('Nazwa', 'Name'):
                match = re.search(r'\[(?P<letter>[^\]]+)\]$', v)
                if match:
                    problem_info['_letter'] = match.group('letter')
                problem_info['name'] = v
                # FIX: use a relative XPath ('.//a') — the absolute '//a'
                # searched the whole document and returned the first link
                # on the page instead of the link inside this cell.
                href = vs.column.node.xpath('.//a/@href')
                if href:
                    problem_info['url'] = urljoin(problem_url, href[0])
        if problem_info:
            problems_infos[problem_info['short']] = problem_info
    return problems_infos
def _get_medals(year):
    """Scrape ICPC medal counts for `year` from the Baylor site.

    Falls back to four medals of each kind when any scraping step fails
    or no medal rows are found.
    """
    default = OrderedDict([(k, 4) for k in ('gold', 'silver', 'bronze')])
    main_url = 'https://icpc.baylor.edu/'
    page = REQ.get(main_url)
    # The XWIKI endpoint is embedded in the site's main JS bundle.
    match = re.search('src="(?P<js>/static/js/main.[^"]*.js)"', page)
    if not match:
        return default
    js_url = match.group('js')
    page = REQ.get(js_url)
    match = re.search('XWIKI:"(?P<xwiki>[^"]*)"', page)
    if not match:
        return default
    xwiki_url = match.group('xwiki')
    xwiki_url = urljoin(main_url, xwiki_url).rstrip('/') + '/'
    medal_result_url = urljoin(xwiki_url, f'community/results-{year}')
    page = REQ.get(medal_result_url)
    json_data = json.loads(page)
    regex = '''<table[^>]*id=["']medalTable[^>]*>.*?</table>'''
    match = re.search(regex, json_data['content'], re.DOTALL)
    if not match:
        return default
    html_table = match.group(0)
    table = parsed_table.ParsedTable(html_table)
    medals = OrderedDict()
    fields = ('gold', 'silver', 'bronze')
    for f in fields:
        medals[f] = 0
    for r in table:
        # The first cell's CSS class names the medal of the row, if any.
        _, v = next(iter(r.items()))
        for attr in v.attrs.get('class', '').split():
            if attr in fields:
                medals[attr] += 1
                break
    # FIX: the dict is pre-seeded with zeros and therefore always truthy;
    # fall back to the default when no medal was actually counted.
    if not any(medals.values()):
        return default
    return medals
def fetch_table(page):
    """Fetch standings page number `page` and return its parsed table.

    Returns None when the page title no longer matches the contest
    codename (i.e. we've paged past this contest's standings).
    """
    nonlocal web_archive_url
    nonlocal total_num_pages
    nonlocal standings_url
    url = standings_url
    if n_page > 1:
        url += f'/page/{page}'
    if not web_archive_url:
        # Live site only: force the English locale.
        url += '?locale=en'
    # NOTE(review): `page` is rebound here from the page number to the
    # fetched page body.
    page = Statistic.get(url)
    match = re.search('<title>[^<]*-(?P<name>[^<]*)</title>', page)
    if codename not in match.group('name'):
        return
    if total_num_pages is None:
        # The last pageindex link carries the total number of pages.
        matches = re.findall(
            '<span[^>]*class="[^"]*page-index[^"]*"[^>]*pageindex="([0-9]+)"[^>]*>',
            page,
            re.I,
        )
        if matches:
            total_num_pages = int(matches[-1])
    regex = '''<table[^>]*class="[^>]*table[^>]*"[^>]*>.*?</table>'''
    match = re.search(regex, page, re.DOTALL)
    # Map Russian headers onto their English column names.
    table = parsed_table.ParsedTable(
        match.group(0),
        header_mapping={
            '№': '#',
            'Участник': 'Participant',
            'Бои': 'Games',
            'Игры': 'Games',
            'Побед': 'Won',
            'Рейтинг': 'Rating',
            'Язык': 'Language',
        },
    )
    return table
def get_standings(self, users=None, statistics=None):
    """Parse the ir-contest-standings table; member keys carry a season suffix."""
    season = self.key.split()[0]
    standings_page = REQ.get(self.standings_url)
    table = parsed_table.ParsedTable(
        html=standings_page,
        xpath="//table[@class='ir-contest-standings']//tr")
    problems_info = collections.OrderedDict()
    result = {}
    for record in table:
        row = {}
        problems = row.setdefault('problems', {})
        for header, cell in list(record.items()):
            classes = cell.attrs['class'].split()
            if 'ir-column-contestant' in classes:
                row['member'] = cell.value + ' ' + season
                row['name'] = cell.value
                continue
            if 'ir-column-place' in classes:
                row['place'] = cell.value
                continue
            if 'ir-column-penalty' in classes:
                row['penalty'] = int(cell.value)
                continue
            if 'ir-problem-count' in classes:
                row['solving'] = int(cell.value)
                continue
            # Remaining columns are problems, named by the header's first word.
            short = header.split()[0]
            problems_info[short] = {'short': short}
            if cell.value == DOT:
                continue
            problem = problems.setdefault(short, {})
            # Normalize unicode minus before splitting "result time".
            tokens = cell.value.replace('−', '-').split(' ')
            problem['result'] = tokens[0]
            if len(tokens) > 1:
                problem['time'] = tokens[1]
        result[row['member']] = row
    return {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
def get_standings(self, users=None, statistics=None):
    """Parse a 'monitor'-style standings table (forcing the English locale)."""
    result = {}
    page = REQ.get(self.standings_url + ('&' if '?' in self.standings_url else '?') + 'locale=en')
    table = parsed_table.ParsedTable(html=page, xpath="//table[@class='monitor']//tr")
    problems_info = collections.OrderedDict()
    for r in table:
        row = {}
        problems = row.setdefault('problems', {})
        for k, v in list(r.items()):
            title = first(v.header.node.xpath('a[@title]/@title'))
            if k in ['Участник', 'Participant']:
                # Member id is the numeric suffix of the profile URL.
                url = first(v.column.node.xpath('a[@href]/@href'))
                row['member'] = re.search('([0-9]+)/?$', url).group(1)
                row['name'] = v.value
            elif k in ['Место', 'Rank']:
                row['place'] = v.value
            elif k in ['Время', 'Time']:
                row['penalty'] = int(v.value)
            elif k in ['Решено', 'Solved']:
                row['solving'] = int(v.value)
            elif len(k) == 1 and title is not None:
                # Single-letter column with a titled header => a problem.
                problems_info[k] = {'short': k, 'name': title}
                url = first(v.header.node.xpath('a[@href]/@href'))
                if url is not None:
                    problems_info[k]['url'] = urllib.parse.urljoin(self.standings_url, url)
                if v.value:
                    p = problems.setdefault(k, {})
                    # Normalize en-dash before splitting "result time".
                    values = v.value.replace('–', '-').split(' ')
                    p['result'] = values[0]
                    if len(values) > 1:
                        p['time'] = values[1]
        result[row['member']] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Fetch the standings page through a rotating proxy and parse the first table."""
    season = self.get_season()

    def standings_page(req):
        # Connect callback run by the proxy pool for each candidate proxy.
        return req.get(self.standings_url)

    # FIX: removed a stray debug print(self.standings_url) left in the code.
    with REQ(
        with_proxy=True,
        args_proxy=dict(
            time_limit=3,
            n_limit=30,
            connect=standings_page,
        ),
    ) as req:
        page = req.proxer.get_connect_ret()
    html_table = re.search('<table[^>]*>.*?</table>', page, re.MULTILINE | re.DOTALL)
    if not html_table:
        raise ExceptionParseStandings('Not found html table')
    mapping = {
        'Rank': 'place',
        'Name': 'name',
        'Language': 'language',
    }
    table = parsed_table.ParsedTable(html_table.group(0), header_mapping=mapping)
    result = {}
    for r in table:
        row = dict()
        for k, v in r.items():
            if v.value:
                row[k] = v.value
        if 'member' not in row:
            # Names are not globally unique; qualify them with the season.
            row['member'] = f'{row["name"]} {season}'
        result[row['member']] = row
    return {'result': result}
def get_table(page):
    """Extract and parse the silver-background standings table from `page`."""
    pattern = '<table[^>]*bgcolor="silver"[^>]*>.*?</table>'
    match = re.search(pattern, page, re.MULTILINE | re.DOTALL)
    return parsed_table.ParsedTable(match.group(0))
def get_standings(self, users=None, statistics=None):
    """Parse Project Euler fastest-solvers standings, signing in when needed.

    Signing in requires solving the site's 5-digit captcha via
    pytesseract OCR; up to 20 sign-in attempts are made.
    """
    if not self.standings_url:
        self.standings_url = f'https://projecteuler.net/fastest={self.key}'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'  # noqa
    page = REQ.get(self.standings_url, headers={'User-Agent': user_agent})
    # A sign-out form on the page means we are already authenticated.
    sign_out = re.search('<form[^>]*action="sign_out"[^>]*>', page)
    if not sign_out:
        for attempt in range(20):
            # Keep fetching captcha images until OCR yields exactly 5 digits.
            while True:
                value = f'{random.random():.16f}'
                image_bytes = REQ.get(f'https://projecteuler.net/captcha/show_captcha.php?{value}')
                image_stream = io.BytesIO(image_bytes)
                image_rgb = Image.open(image_stream)
                text = pytesseract.image_to_string(image_rgb, config='--oem 0 --psm 13 digits')
                text = text.strip()
                if re.match('^[0-9]{5}$', text):
                    break
            REQ.get('https://projecteuler.net/sign_in')
            page = REQ.submit_form(
                name='sign_in_form',
                action=None,
                data={
                    'username': conf.PROJECTEULER_USERNAME,
                    'password': conf.PROJECTEULER_PASSWORD,
                    'captcha': text,
                    'remember_me': '1',
                },
            )
            # A warning paragraph indicates a failed sign-in attempt.
            match = re.search('<p[^>]*class="warning"[^>]*>(?P<message>[^<]*)</p>', page)
            if match:
                REQ.print(match.group('message'))
            else:
                break
        else:
            raise ExceptionParseStandings('Did not recognize captcha for sign in')
        # Refetch the standings now that we are signed in.
        page = REQ.get(self.standings_url)
    result = {}
    problem_name = self.name.split('.', 1)[1].strip()
    problems_info = [{'name': problem_name, 'url': self.url}]
    regex = '<table[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL)
    if html_table:
        table = parsed_table.ParsedTable(html_table.group(0))
        for r in table:
            row = OrderedDict()
            # Everyone listed solved the (single) problem.
            row['solving'] = 1
            for k, v in r.items():
                if isinstance(v, list):
                    # Combined cell: rank number plus country flag icon.
                    place, country = v
                    row['place'] = re.match('[0-9]+', place.value).group(0)
                    country = first(country.column.node.xpath('.//@title'))
                    if country:
                        row['country'] = country
                elif k == 'Time To Solve':
                    # Value like "3 days, 2 hours" -> relativedelta kwargs.
                    params = {}
                    for x in v.value.split(', '):
                        value, field = x.split()
                        if field[-1] != 's':
                            field += 's'
                        params[field] = int(value)
                    rel_delta = relativedelta(**params)
                    now = timezone.now()
                    # Convert the calendar-aware relativedelta into an
                    # absolute timedelta anchored at the current moment.
                    delta = now - (now - rel_delta)
                    row['penalty'] = f'{delta.total_seconds() / 60:.2f}'
                elif k == 'User':
                    member = first(v.column.node.xpath('.//@title')) or v.value
                    row['member'] = member
                else:
                    row[k.lower()] = v.value
            problems = row.setdefault('problems', {})
            problem = problems.setdefault(problem_name, {})
            problem['result'] = '+'
            problem['binary'] = True
            row['_skip_for_problem_stat'] = True
            if 'member' not in row:
                continue
            result[row['member']] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': problems_info,
    }
    # Refetch soon after the contest while the table is still filling up.
    if len(result) < 100:
        delta = timezone.now() - self.start_time
        if delta < timedelta(days=1):
            standings['timing_statistic_delta'] = timedelta(minutes=60)
        elif delta < timedelta(days=30):
            standings['timing_statistic_delta'] = timedelta(days=1)
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse paginated standings into a standings dict.

    Follows 'next page' links until none remain; supports both ICPC-style
    (+/-) and score-style problem cells.
    """
    if not hasattr(self, 'season'):
        # Season runs September..August.
        year = self.start_time.year - (0 if self.start_time.month > 8 else 1)
        season = f'{year}-{year + 1}'
    else:
        season = self.season
    result = {}
    problems_info = OrderedDict()
    # Standings URLs without a numeric contest id cannot be parsed.
    if not re.search('/[0-9]+/', self.standings_url):
        return {}
    url = self.standings_url
    n_page = 1
    while True:
        page = REQ.get(url)
        match = re.search(
            '<table[^>]*class="[^"]*standings[^>]*>.*?</table>',
            page,
            re.MULTILINE | re.DOTALL)
        if not match:
            raise ExceptionParseStandings('Not found table standings')
        html_table = match.group(0)
        table = parsed_table.ParsedTable(html_table)
        for r in table:
            row = {}
            problems = row.setdefault('problems', {})
            solved = 0
            has_solved = False
            for k, v in list(r.items()):
                if 'table__cell_role_result' in v.attrs['class']:
                    letter = k.split(' ', 1)[0]
                    if letter == 'X':
                        # Extra / out-of-competition column.
                        continue
                    p = problems_info.setdefault(letter, {'short': letter})
                    names = v.header.node.xpath('.//span/@title')
                    if len(names) == 1:
                        p['name'] = names[0]
                    p = problems.setdefault(letter, {})
                    n = v.column.node
                    if n.xpath('img[contains(@class,"image_type_success")]'):
                        res = '+'
                        p['binary'] = True
                    elif n.xpath('img[contains(@class,"image_type_fail")]'):
                        res = '-'
                        p['binary'] = False
                    else:
                        if ' ' not in v.value:
                            # Empty cell: no attempt on this problem.
                            problems.pop(letter)
                            continue
                        res = v.value.split(' ', 1)[0]
                    p['result'] = res
                    p['time'] = v.value.split(' ', 1)[-1]
                    if 'table__cell_firstSolved_true' in v.attrs['class']:
                        p['first_ac'] = True
                    if '+' in res or res.startswith('100'):
                        solved += 1
                    try:
                        # A positive numeric (non '+') result marks
                        # partial-score (IOI-style) standings.
                        has_solved = has_solved or '+' not in res and float(
                            res) > 0
                    except ValueError:
                        pass
                elif 'table__cell_role_participant' in v.attrs['class']:
                    title = v.column.node.xpath('.//@title')
                    if title:
                        name = title[0]
                    else:
                        name = v.value.replace(' ', '', 1)
                    row['name'] = name
                    # One-word names are treated as unique handles; others
                    # are qualified with the season.
                    row['member'] = name if ' ' not in name else f'{name} {season}'
                elif 'table__cell_role_place' in v.attrs['class']:
                    row['place'] = v.value
                elif 'table__header_type_penalty' in v.attrs['class']:
                    row['penalty'] = int(
                        v.value) if v.value.isdigit() else v.value
                elif 'table__header_type_score' in v.attrs['class']:
                    row['solving'] = int(round(float(v.value)))
            if has_solved:
                row['solved'] = {'solving': solved}
            result[row['member']] = row
        # Follow the link to the next standings page, if present.
        n_page += 1
        match = re.search(
            f'<a[^>]*href="(?P<href>[^"]*standings[^"]*p[^"]*={n_page})"[^>]*>',
            page)
        if not match:
            break
        url = urljoin(url, match.group('href'))
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse standings plus per-member profile pages.

    Also extracts the event's key/value info table and tries to link
    this contest to a related contest on the official platform
    (see the nested find_related helper).
    """
    standings_url = self.standings_url or self.url
    page = REQ.get(standings_url)
    standings = {'url': standings_url}
    options = standings.setdefault('options', {'parse': {}})
    # First (plain) table: key/value event metadata.
    regex = '<table>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    if match:
        html_table = match.group(0)
        table = parsed_table.ParsedTable(html_table, without_header=True, ignore_wrong_header_number=False)
        infos = {}
        for r in table:
            k, v = [col.value for col in r.columns]
            # Normalize keys: 'Official page:' -> 'official_page'.
            k = k.strip(':').lower().replace(' ', '_')
            infos[k] = v
        options['parse'] = infos

    def find_related(statistics):
        """Find the matching contest on the official host and mark it as related."""
        infos = deepcopy(self.info.get('standings', {}).get('parse', {}))
        # Reuse a previously resolved relation when it still exists.
        if '_related' in infos and Contest.objects.get(pk=infos['_related']):
            options['parse']['_related'] = infos['_related']
            return
        related = None
        infos.update(options.get('parse', {}))
        host_mapping = self.resource.info['_host_mapping']
        host = infos.get('official_page')
        if host:
            match = re.search('.*https?://(?P<host>[^/]*)/', host)
            host = match.group('host')
        else:
            host = infos.get('series')
        ignore_n_statistics = False
        ignore_title = None
        # Map the scraped host/series onto a known resource host.
        for mapping in host_mapping:
            if re.search(mapping['regex'], host):
                host = mapping['host']
                ignore_title = mapping.get('ignore_title')
                ignore_n_statistics = mapping.get('ignore_n_statistics', ignore_n_statistics)
                break
        if host:
            # Candidate contests start or end within +-3 days of ours.
            delta_start = timedelta(days=3)
            qs = Contest.objects.filter(resource__host=host)
            qs = qs.filter(
                Q(start_time__gte=self.start_time - delta_start,
                  start_time__lte=self.start_time + delta_start) |
                Q(end_time__gte=self.start_time - delta_start,
                  end_time__lte=self.start_time + delta_start))
            if not ignore_n_statistics:
                # Compare participant counts with a 15% tolerance.
                teams = set()
                for r in statistics.values():
                    if 'team_id' in r:
                        teams.add(r['team_id'])
                n_statistics = len(teams) if teams else len(statistics)
                delta_n = round(n_statistics * 0.15)
                qs = qs.filter(n_statistics__gte=n_statistics - delta_n,
                               n_statistics__lte=n_statistics + delta_n)
            if ignore_title:
                qs = qs.exclude(title__iregex=ignore_title)
            if len(qs) > 1:
                # Tie-break by requiring the same first-place account.
                first = None
                for stat in statistics.values():
                    if stat.get('place') == '1':
                        first = stat['member'].split(':', 1)[-1]
                qs = qs.filter(statistics__place_as_int=1, statistics__account__key=first)
            if len(qs) == 1:
                related = qs.first().pk
        if related is not None:
            options['parse']['_related'] = related
            # A related contest exists: hide this mirror.
            standings['invisible'] = True
        else:
            standings['invisible'] = False

    # Second (classed) table: the actual standings.
    regex = '<table[^>]*class="[^"]*table[^"]*"[^>]*>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    html_table = match.group(0)
    table = parsed_table.ParsedTable(html_table)
    profile_urls = {}
    for r in table:
        row = OrderedDict()
        rank = r.pop('Rank')
        row['place'] = rank.value
        medal = rank.column.node.xpath('.//img[contains(@alt,"medal")]/@title')
        if medal:
            row['medal'] = medal[0].lower()
        name_key = 'Name' if 'Name' in r else 'Team'
        name = r.pop(name_key)
        members = name.column.node.xpath('.//a')
        val = name.value
        if name_key == 'Team':
            # Team cells look like "Team name: members"; keep the name part.
            if ':' in val:
                val = val.rsplit(': ', 1)[0]
            row['team_id'] = val
        row['name'] = val
        val = r.pop('Score').value.strip()
        row['solving'] = as_number(val) if val and val != '?' else 0
        row['_no_update_name'] = True
        for k, v in r.items():
            k = k.lower()
            if k in row:
                continue
            v = v.value.strip()
            if not v or v == '?':
                continue
            row[k.lower()] = as_number(v)
        # One queued row per member profile link in the name cell.
        for member in members:
            url = urljoin(standings_url, member.attrib['href'])
            row['_profile_url'] = url
            profile_urls[url] = deepcopy(row)

    # Index previously parsed statistics by profile URL to skip refetching.
    statistics_profiles_urls = {}
    if statistics:
        for s in statistics.values():
            if '_profile_url' in s:
                statistics_profiles_urls[s['_profile_url']] = s

    def get_handle(row):
        """Resolve the member handle (and profile info) for one queued row."""
        url = row['_profile_url']
        if 'university' in url:
            # University pages are not individual members.
            row['_skip'] = True
        if url in statistics_profiles_urls:
            # Reuse cached member/info from the previous parse.
            stat = statistics_profiles_urls[url]
            for k, v in stat.items():
                if k not in row:
                    row[k] = v
            if '_member' in row and '_info' in row:
                row['member'] = row['_member']
                row['info'] = row['_info']
                return row
        page = REQ.get(url)
        info = row.setdefault('info', {})
        if 'university' in url:
            # Build a synthetic handle from the URL path.
            handle = unquote(urlparse(url).path)
            handle = handle.strip('/')
            handle = handle.replace('/', ':')
            row['member'] = handle
        else:
            match = re.search(
                '<link[^>]*rel="canonical"[^>]*href="[^"]*/profile/(?P<handle>[^"]*)"[^>]*>',
                page)
            handle = match.group('handle')
            row['member'] = handle
        match = re.search(
            r'>[^<]*prize[^<]*money[^<]*(?:<[^>]*>)*[^<]*\$(?P<val>[.0-9]+)',
            page, re.IGNORECASE)
        if match:
            info['prize_money'] = as_number(match.group('val'))
        match = re.search(
            r'>country:</[^>]*>(?:\s*<[^>]*>)*\s*<a[^>]*href="[^"]*/country/(?P<country>[^"]*)"',
            page, re.IGNORECASE)
        if match:
            info['country'] = match.group('country')
        match = re.search('<h3[^>]*>(?P<name>[^>]*)<', page)
        info['name'] = match.group('name').strip()
        # Cache the resolved values for the next parse.
        row['_member'] = row['member']
        row['_info'] = dict(info)
        return row

    result = {}
    members = defaultdict(list)
    # NOTE(review): result is empty here, so tqdm's total starts at 0.
    with PoolExecutor(max_workers=4) as executor, tqdm(
            total=len(result), desc='urls') as pbar:
        for row in executor.map(get_handle, profile_urls.values()):
            pbar.update()
            result[row['member']] = row
            skip = row.pop('_skip', False)
            if not skip and 'team_id' in row:
                members[row['team_id']].append({
                    'account': row['member'],
                    'name': row['info']['name']
                })
    if members:
        # Attach the full member list to every row of each team.
        for row in result.values():
            if 'team_id' in row:
                row['_members'] = members[row['team_id']]
    find_related(result)
    standings['result'] = result
    return standings
def get_standings(self, users=None, statistics=None):
    """Discover (when needed) and parse the silver-table standings page.

    When standings_url is unset, the page is located by navigating the
    site's Russian menu links and matching the contest date/name.
    """
    if not self.standings_url:
        page = REQ.get(urljoin(self.url, '/'))
        # Navigate: Competitions -> Training olympiads.
        for name in (
            'Соревнования',
            'Тренировочные олимпиады',
        ):
            match = re.search(
                '<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(name), page)
            page = REQ.get(match.group('url'))
        # Find the "past trainings results" link under this contest name.
        match = re.search(
            '{}.*?<a[^>]*href="(?P<url>[^"]*)"[^>]*>{}<'.format(
                re.escape(self.name), 'Результаты прошедших тренировок'),
            page,
            re.DOTALL,
        )
        if not match:
            raise ExceptionParseStandings('Not found standing url')
        url = match.group('url')
        page = REQ.get(url)
        date = self.start_time.strftime('%Y-%m-%d')
        # Rows of the results index: date | title | link.
        matches = re.findall(
            r'''
            <tr[^>]*>[^<]*<td[^>]*>{}</td>[^<]*
            <td[^>]*>(?P<title>[^<]*)</td>[^<]*
            <td[^>]*>[^<]*<a[^>]*href\s*=["\s]*(?P<url>[^">]*)["\s]*[^>]*>
            '''.format(date),
            page,
            re.MULTILINE | re.VERBOSE)
        urls = [(title, urljoin(url, u)) for title, u in matches]
        if len(urls) > 1:
            # Drop school / junior-grade standings when several match.
            urls = [(
                title,
                urljoin(url, u)
            ) for title, u in matches if not re.search(
                r'[0-9]\s*-\s*[0-9].*(?:[0-9]\s*-\s*[0-9].*\bкл\b|школа)',
                title, re.I)]
        if not urls:
            raise ExceptionParseStandings('Not found standing url')
        if len(urls) > 1:
            # Multiple candidates must share the same parent directory.
            ok = True
            urls_set = set()
            for _, u in urls:
                page = REQ.get(u)
                path = re.findall(
                    '<td[^>]*nowrap><a[^>]*href="(?P<href>[^"]*)"', page)
                if len(path) < 2:
                    ok = False
                parent = urljoin(u, path[-2])
                urls_set.add(parent)
            if len(urls_set) > 1 or not ok:
                raise ExceptionParseStandings('Too much standing url')
            url = urls_set.pop()
        else:
            _, url = urls[0]
        page = REQ.get(url)
        # last_url reflects any redirects followed.
        self.standings_url = REQ.last_url
    else:
        page = REQ.get(self.standings_url)
    html_table = re.search('<table[^>]*bgcolor="silver"[^>]*>.*?</table>',
                           page,
                           re.MULTILINE | re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table)
    problems_info = OrderedDict()
    max_score = defaultdict(float)
    result = {}
    for r in table:
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        solved = 0
        for k, v in list(r.items()):
            if k == 'Имя':
                href = v.column.node.xpath('a/@href')
                if not href:
                    continue
                # Member id is the numeric suffix of the profile link.
                uid = re.search('[0-9]+$', href[0]).group(0)
                row['member'] = uid
                row['name'] = v.value
            elif k == 'Место':
                row['place'] = v.value
            elif k == 'Время':
                row['penalty'] = int(v.value)
            elif k in ['Сумма', 'Задачи']:
                row['solving'] = float(v.value)
            elif re.match('^[a-zA-Z0-9]+$', k):
                # Alphanumeric header => a problem column.
                problems_info[k] = {'short': k}
                if v.value:
                    p = problems.setdefault(k, {})
                    p['result'] = v.value
                    try:
                        max_score[k] = max(max_score[k], float(v.value))
                    except ValueError:
                        pass
            elif k:
                row[k.strip()] = v.value.strip()
            elif v.value.strip().lower() == 'log':
                href = v.column.node.xpath('.//a/@href')
                if href:
                    row['url'] = urljoin(self.standings_url, href[0])
        result[row['member']] = row
    # Second pass: a problem counts as solved on '+' or on matching the
    # best score seen for that problem.
    for r in result.values():
        solved = 0
        for k, p in r['problems'].items():
            score = p['result']
            if score.startswith('+'):
                solved += 1
            else:
                try:
                    score = float(score)
                except ValueError:
                    # NOTE(review): a non-numeric score stays a str here and
                    # the comparison below would raise TypeError — confirm
                    # such values cannot occur.
                    pass
                if abs(max_score[k] - score) < 1e-9 and score > 0:
                    solved += 1
        r['solved'] = {'solving': solved}
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse ICPC World Finals standings.

    Tries known mirrors (Kattis static dump, zibada.guru replay,
    web.archive.org snapshot) unless ``self.standings_url`` is already set.
    The zibada mirror is parsed from embedded JS data — either an event log
    or a precomputed team table; other mirrors are parsed as HTML
    scoreboards. The first mirror that yields at least one submission wins.

    Raises ExceptionParseStandings when no usable mirror is found.
    """
    # An ICPC season (year-1)-(year) finishes in the year the finals run.
    year = self.start_time.year
    year = year + 1 if self.start_time.month >= 9 else year
    season = '%d-%d' % (year - 1, year)

    standings_urls = []
    if not self.standings_url:
        for url in (
            f'http://static.kattis.com/icpc/wf{year}/',
            f'https://zibada.guru/finals/{year}/',
            f'http://web.archive.org/web/{year}/https://icpc.baylor.edu/scoreboard/',
        ):
            try:
                page = REQ.get(url)
            except FailOnGetResponse:
                continue
            # Reject archive.org redirects to a snapshot of a wrong year.
            if 'web.archive.org' in REQ.last_url and f'/{year}' not in REQ.last_url:
                continue
            standings_urls.append(url)
    else:
        standings_urls.append(self.standings_url)

    if not standings_urls:
        raise ExceptionParseStandings(
            f'Not found standings url year = {year}')

    for standings_url in standings_urls:
        page = REQ.get(standings_url)

        result = {}
        problems_info = OrderedDict()
        has_submission = False

        if 'zibada' in standings_url:
            # Team names live in one embedded JS literal; the scoreboard
            # data usually lives in standings.js.
            match = re.search(r' = (?P<data>[\{\[].*?);?\s*$', page, re.MULTILINE)
            if match:
                names = self._json_load(match.group('data'))
            else:
                names = None
            try:
                page = REQ.get('standings.js')
                match = re.search(r' = (?P<data>\{.*?);?\s*$', page, re.MULTILINE)
                data = self._json_load(match.group('data'))
            except Exception:
                assert names
                data = names

            for p_name in data['problems']:
                problems_info[p_name] = {'short': p_name}

            events = data.pop('events', None)
            if events:
                # Event-log format: replay "<team> <problem> <status>
                # <attempt> <time>" records in chronological order.
                teams = {}
                time_divider = 60
                events.sort(key=lambda e: int(e.split()[-1]))
                for e in events:
                    tid, p_name, status, attempt, time = e.split()
                    time = int(time)
                    team = teams.setdefault(tid, {})
                    problems = team.setdefault('problems', {})
                    result = problems.get(p_name, {}).get('result', '')
                    # A pending ('?') event must not override a final verdict.
                    if not result.startswith('?') and status.startswith('?'):
                        continue
                    has_submission = True
                    if status == '+':
                        attempt = int(attempt) - 1
                    p_info = problems_info[p_name]
                    problems[p_name] = {
                        'time': time,
                        'result': '+' if status == '+' and attempt == 0 else f'{status}{attempt}',
                    }
                for tid, team in teams.items():
                    name = names[int(tid)][0]
                    name = html.unescape(name)
                    team['member'] = f'{name} {season}'
                    team['name'] = name
                    # Recompute score/penalty (20 min per wrong attempt).
                    penalty = 0
                    solving = 0
                    for p_name, problem in team.get('problems', {}).items():
                        if problem['result'].startswith('+'):
                            solving += 1
                            attempt_penalty = (int(
                                problem['result'].lstrip('+')
                                or 0)) * 20 * time_divider
                            penalty += problem['time'] + attempt_penalty
                    team['penalty'] = int(round(penalty / time_divider))
                    team['solving'] = solving
            else:
                # Precomputed table format: one record per team, either a
                # dict or a positional list.
                teams = {}
                time_divider = 1
                data_teams = data['teams']
                if isinstance(data_teams, dict):
                    data_teams = data_teams.values()
                for team in data_teams:
                    row = {}

                    def get(key, index):
                        return team[key] if isinstance(team, dict) else team[index]

                    name = get('name', 0)
                    name = html.unescape(name)
                    row['member'] = f'{name} {season}'
                    row['name'] = name
                    row['solving'] = int(get('score', 2))
                    row['penalty'] = int(get('time', 3))

                    if isinstance(team, dict):
                        team['problems'] = [
                            team[str(index)]
                            for index in range(len(data['problems']))
                        ]

                    problems = row.setdefault('problems', {})
                    for p_name, verdict in zip(data['problems'], get('problems', 4)):
                        if not verdict:
                            continue
                        if isinstance(verdict, dict):
                            # Keys are abbreviated to their first letter:
                            # a(ttempts), p(ending), s(olved), t(ime).
                            verdict = {k[0]: v for k, v in verdict.items()}
                            verdict['a'] = int(verdict['a'])
                            if isinstance(verdict.get('p'), int):
                                verdict['a'] += verdict['p']
                            if isinstance(verdict['s'], str):
                                verdict['s'] = int(verdict['s'])
                            status = '+' if verdict['s'] else (
                                '?' if verdict.get('p', False) else '-')
                            time = verdict['t']
                            result = verdict['a']
                            time_divider = 1000 * 60
                            if not result:
                                continue
                        else:
                            status, result = verdict.split(' ', 1)
                            if ' ' in result:
                                result, time = result.split()
                                time = int(time)
                            else:
                                time = None
                            result = int(result)
                        has_submission = True
                        problem = problems.setdefault(p_name, {})
                        if status == '+':
                            problem['time'] = time
                            problem['result'] = '+' if result == 1 else f'+{result - 1}'
                        else:
                            problem['result'] = f'{status}{result}'
                    teams[row['member']] = row

            # Rank teams by (solved desc, penalty asc); ties share a place.
            teams = list(teams.values())
            teams.sort(key=lambda t: (t['solving'], -t['penalty']), reverse=True)
            rank = 0
            prev = None
            for i, t in enumerate(teams):
                curr = (t['solving'], t['penalty'])
                if prev != curr:
                    rank = i + 1
                    prev = curr
                t['place'] = rank
            result = {t['member']: t for t in teams}
            problems_info = OrderedDict(sorted(problems_info.items()))
        else:
            # Plain HTML scoreboard.
            regex = '''<table[^>]*(?:id=["']standings|class=["']scoreboard)[^>]*>.*?</table>'''
            match = re.search(regex, page, re.DOTALL)
            html_table = match.group(0)
            table = parsed_table.ParsedTable(html_table)
            time_divider = 1
            for r in table:
                row = {}
                problems = row.setdefault('problems', {})
                for k, vs in r.items():
                    if isinstance(vs, list):
                        v = ' '.join(i.value for i in vs if i.value)
                    else:
                        v = vs.value
                    k = k.lower().strip('.')
                    v = v.strip()
                    if k in ('rank', 'rk'):
                        row['place'] = v
                    elif k == 'team':
                        row['member'] = f'{v} {season}'
                        row['name'] = v
                    elif k == 'time':
                        row['penalty'] = int(v)
                    elif k == 'slv':
                        row['solving'] = int(v)
                    elif k == 'score':
                        if ' ' in v:
                            row['solving'], row['penalty'] = map(int, v.split())
                        else:
                            row['solving'] = int(v)
                    elif len(k) == 1:
                        # One-letter headers are problem columns.
                        k = k.title()
                        if k not in problems_info:
                            problems_info[k] = {'short': k}
                            if 'title' in vs.header.attrs:
                                problems_info[k]['name'] = vs.header.attrs['title']
                        # Normalize "<time> <attempts> tries" cells to
                        # "<attempts> <time>" and drop placeholders.
                        v = re.sub(r'([0-9]+)\s+([0-9]+)\s+tr.*', r'\2 \1', v)
                        v = re.sub('tr[a-z]*', '', v)
                        v = re.sub('-*', '', v)
                        v = v.strip()
                        if not v:
                            continue
                        has_submission = True
                        p = problems.setdefault(k, {})
                        if ' ' in v:
                            pnt, time = map(int, v.split())
                            p['result'] = '+' if pnt == 1 else f'+{pnt - 1}'
                            p['time'] = time
                            if ('solvedfirst' in vs.column.attrs.get('class', '')
                                    or vs.column.node.xpath(
                                        './/*[contains(@class, "score_first")]')):
                                p['first_ac'] = True
                        else:
                            p['result'] = f'-{v}'
                result[row['member']] = row

        if not has_submission:
            continue

        # Mark the first accepted solution per problem and overall.
        first_ac_of_all = None
        for team in result.values():
            for p_name, problem in team['problems'].items():
                p_info = problems_info[p_name]
                if not problem['result'].startswith('+'):
                    continue
                time = problem['time']
                if 'first_ac' not in p_info or time < p_info['first_ac']:
                    p_info['first_ac'] = time
                if first_ac_of_all is None or time < first_ac_of_all:
                    first_ac_of_all = time
                if problem.get('first_ac'):
                    p_info['has_first_ac'] = True

        for team in result.values():
            for p_name, problem in team['problems'].items():
                p_info = problems_info[p_name]
                if problem['result'].startswith('+'):
                    # Scoreboard-provided first-AC flags take precedence.
                    if p_info.get('has_first_ac') and not problem.get('first_ac'):
                        continue
                    if problem['time'] == p_info['first_ac']:
                        problem['first_ac'] = True
                    if problem['time'] == first_ac_of_all:
                        problem['first_ac_of_all'] = True
                if 'time' in problem:
                    problem['time'] = int(round(problem['time'] / time_divider))

        # Pending ('?') verdicts mean the contest is unjudged; skip medals.
        without_medals = any(p['result'].startswith('?')
                             for row in result.values()
                             for p in row.get('problems', {}).values())

        options = {'per_page': None}
        if not without_medals:
            medals = self._get_medals(year)
            # FIX: the medal source may return None/empty (see get_from_icpc
            # in this file); guard before .items() to avoid AttributeError,
            # matching the newer ICPC parser below.
            if medals:
                medals = [{'name': k, 'count': v} for k, v in medals.items()]
                options['medals'] = medals

        standings = {
            'result': result,
            'url': standings_url,
            'problems': list(problems_info.values()),
            'options': options,
        }
        return standings

    raise ExceptionParseStandings(
        f'Not found standings url from {standings_urls}')
def get_standings(self, users=None, statistics=None):
    """Parse olympiad standings published either as an embedded JS array
    (variable ``M`` plus size variables) or as an HTML table of class
    ``olimp``.

    Returns a standings dict, or ``{'action': 'delete'}`` when the site
    reports that the olympiad does not exist.
    """
    # The season (Sep..Aug) this contest belongs to.
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)

    page = REQ.get(self.url)
    match = re.search(
        r'''<a[^>]*href=["']?(?P<href>[^"' ]*rating[^"' ]*)["']?[^>]*>\[Рейтинг\]''',
        page)
    if not match and re.search(
            r'''<b>Олимпиада №[0-9]+ не существует!</b>''', page):
        return {'action': 'delete'}
    page = REQ.get(match.group('href'))
    standings_url = REQ.last_url

    # Standings data embedded as JS: "var tn=..,nk=.., M=new Array(...)".
    match = re.search(
        r'''var(?P<vars>(?:\s*[a-z]+=[0-9]+,)+)\s*M=(?:new Array)?[\[\(]?(?P<data>.*?)[\]\)]\s*(?:function|var)''',
        page)  # noqa

    result = {}
    problems_info = OrderedDict()

    def canonize_name(name):
        # Flatten whitespace/markup: <br> becomes a comma separator.
        name = name.replace('\r', ' ')
        name = name.replace('\n', ' ')
        name = re.sub(r'\s+', ' ', name)
        name = re.sub(r'<br/?>', ',', name)
        name = re.sub(r'<[^>]*>', '', name)
        name = re.sub(r'\s*,\s*', ', ', name)
        name = name.strip()
        return name

    if match:
        # Convert the single-quoted JS literal into valid JSON.
        data = match.group('data')
        data = data.replace('\\', '\\\\')
        data = data.replace('"', r'\"')
        data = data.replace("'", '"')
        data = re.sub(r'\s+', ' ', data)
        data = json.loads(f'[{data}]')

        variables = {}
        for var in re.split(r',\s*', match.group('vars').strip()):
            if not var:
                continue
            k, v = var.split('=')
            variables[k] = v

        # The JS indexes M as M[(offset + ...)]; recover that offset.
        match = re.search(r'''M\[\((?P<val>[0-9]+)\+''', page)
        offset = int(match.group('val'))
        n_problems = int(variables['tn'])
        n_teams = int(variables['nk'])
        # Per team: `offset` header fields + 3 fields per problem.
        n_fields = offset + 3 * n_problems

        place = 0
        last = None
        for rank, st in enumerate(range(0, n_teams * n_fields, n_fields), start=1):
            row = data[st:st + n_fields]
            name = canonize_name(row[0])
            member = name + ', ' + season
            r = result.setdefault(member, {})
            r['name'] = name
            r['member'] = member
            r['solving'] = int(row[1])
            r['penalty'] = int(row[2])

            # Equal (solving, penalty) pairs share a place.
            score = r['solving'], r['penalty']
            if score != last:
                place = rank
                last = score
            r['place'] = place

            n_problems_fields = 3
            problems = r.setdefault('problems', {})
            for idx in range(0, n_problems):
                # Each problem occupies (status, errors, seconds).
                p_info = row[offset + idx * n_problems_fields:offset
                             + (idx + 1) * n_problems_fields]
                stat, errors, seconds = map(int, p_info)
                key = chr(ord('A') + idx) if n_problems < 27 else f'{idx + 1:02d}'
                if key not in problems_info:
                    info = {'short': key}
                    # |errors| >= 1000 encodes a partial score (1000 + score).
                    if abs(errors) >= 1000:
                        info['full_score'] = 100
                    problems_info[key] = info
                if not stat:
                    continue
                p = problems.setdefault(key, {})
                p['time'] = self.to_time(seconds, num=2)
                if abs(errors) < 1000:
                    # ICPC-style verdict: +/ +k / -k.
                    p['result'] = f'+{errors if errors else ""}' if stat == 1 else f'-{errors}'
                else:
                    solved = r.setdefault('solved', {'solving': 0})
                    score = errors - 1000
                    p['result'] = score
                    if score > 0:
                        p['partial'] = score < problems_info[key]['full_score']
                        if not p['partial']:
                            solved['solving'] += 1
            if not problems:
                result.pop(member)
    else:
        # Fallback: standings rendered as an HTML table of class "olimp".
        regex = '''<table[^>]*class=["']?olimp["']?[^>]*>.*?</table>'''
        match = re.search(regex, page, re.DOTALL)
        if not match and 'Рейтинг олимпиады' not in page:
            return {'action': 'delete'}
        table = parsed_table.ParsedTable(match.group(0))
        for row in table:
            r = OrderedDict()
            problems = r.setdefault('problems', {})
            for k, v in list(row.items()):
                if k == '=':
                    r['solving'] = int(v.value)
                elif k == 'Место':
                    r['place'] = int(v.value)
                elif k == 'Время':
                    r['penalty'] = int(v.value)
                elif k == 'Участник':
                    name = canonize_name(v.value)
                    r['name'] = name
                    r['member'] = name + ', ' + season
                elif len(k) == 1 and k not in ['№']:
                    # One-letter headers (except '№') are problem columns.
                    if k not in problems_info:
                        info = {'short': k}
                        problems_info[k] = info
                    if v.value != DOT:
                        p = problems.setdefault(k, {})
                        p['result'], *values = v.value.split()
                        if values:
                            p['time'] = values[0]
            if not problems:
                continue
            result[r['member']] = r

    standings = {
        'result': result,
        'url': standings_url,
        'problems': list(problems_info.values()),
    }
    return standings
def get_users_infos(users, resource=None, accounts=None, pbar=None):
    """Yield profile info for each user, in the order of ``users``.

    Each yielded dict contains ``info`` (None when the profile is gone),
    plus either ``coders`` (for team pages) or ``contest_addition_update``
    (per-contest rating deltas for individual profiles).
    """

    @RateLimiter(max_calls=5, period=1)
    def fetch_profle_page(user):
        # NOTE(review): name misspelled ("profle"); local-only, harmless.
        # Try the personal profile URL first, then the team URL.
        for format_url in (
            Statistic.PROFILE_URL_FORMAT_,
            Statistic.TEAM_URL_FORMAT_,
        ):
            page = None
            url = format_url.format(user=user)
            try:
                ret = REQ.get(url, return_url=True)
                if not ret:
                    continue
                page, page_url = ret
                # A redirect away from the requested URL means "not found".
                if url != page_url:
                    page = None
                break
            except FailOnGetResponse as e:
                # NOTE(review): uses e.args[0].code here while other code in
                # this file uses e.code — confirm both forms are valid.
                if e.args[0].code == 404:
                    page = None
                else:
                    raise e
        return page

    with PoolExecutor(max_workers=4) as executor:
        # executor.map preserves input order, so results align with `users`.
        for user, page in zip(users, executor.map(fetch_profle_page, users)):
            if pbar:
                pbar.update()
            if page is None:
                yield {'info': None}
                continue

            # Profile data is embedded via jQuery.extend(Drupal.settings, ...).
            match = re.search(r'jQuery.extend\(Drupal.settings,(?P<data>[^;]*)\);$',
                              str(page), re.MULTILINE)
            data = json.loads(match.group('data'))
            if 'date_versus_rating' not in data:
                # Team page: scrape the info table and the member links.
                info = {}
                info['is_team'] = True
                regex = '<table[^>]*cellpadding=""[^>]*>.*?</table>'
                match = re.search(regex, page, re.DOTALL)
                if match:
                    html_table = match.group(0)
                    table = parsed_table.ParsedTable(html_table)
                    for r in table:
                        for k, v in list(r.items()):
                            k = k.lower().replace(' ', '_')
                            info[k] = v.value

                matches = re.finditer(r'''
                    <td[^>]*>\s*<b[^>]*>Member[^<]*</b>\s*</td>\s*
                    <td[^>]*><a[^>]*href\s*=\s*"[^"]*/users/(?P<member>[^"/]*)"[^>]*>
                ''', page, re.VERBOSE)
                coders = set()
                for match in matches:
                    coders.add(match.group('member'))
                if coders:
                    info['members'] = list(coders)

                ret = {'info': info, 'coders': coders}
            else:
                # Individual profile: key/value list plus rating history.
                data = data['date_versus_rating']['all']
                matches = re.finditer(
                    r'''
                        <li[^>]*>\s*<label[^>]*>(?P<key>[^<]*):\s*</label>\s*
                        <span[^>]*>(?P<value>[^<]*)</span>\s*</li>
                    ''',
                    page,
                    re.VERBOSE,
                )
                info = {}
                for match in matches:
                    key = match.group('key').strip().replace(' ', '_').lower()
                    value = match.group('value').strip()
                    info[key] = value

                contest_addition_update = {}
                prev_rating = None
                for row in data:
                    rating = row.get('rating')
                    if not rating:
                        continue
                    rating = int(rating)
                    # Keep the latest rating in info.
                    info['rating'] = rating

                    code = row.get('code')
                    if code:
                        # Division contests use a shared key: strip the
                        # trailing A/B marker from e.g. "COOK100A".
                        if re.search(r'\bdiv(ision)?[-_\s]+[AB12]', row['name'], re.I) \
                                and re.search('[AB]$', code):
                            code = code[:-1]
                        update = contest_addition_update.setdefault(code, OrderedDict())
                        update['rating_change'] = rating - prev_rating if prev_rating is not None else None
                        update['new_rating'] = rating
                    prev_rating = rating

                ret = {'info': info, 'contest_addition_update': contest_addition_update}
            yield ret
def get_standings(self, users=None, statistics=None):
    """Parse ICPC World Finals standings (newer, multi-mirror variant).

    Candidate sources: Kattis static dumps, zibada.guru replays,
    web.archive.org snapshots, cphof.org, and the icpc.global CMS API.
    Region/team metadata is additionally merged from the icpc.global team
    API when the scoreboard itself lacks it.

    Raises ExceptionParseStandings when no mirror yields results.
    """
    year = int(re.search(r'\b[0-9]{4}\b', self.key).group(0))
    season = '%d-%d' % (year - 1, year)

    icpc_standings_url = f'https://icpc.global/community/results-{year}'
    icpc_api_standings_url = f'https://icpc.global/api/help/cms/virtpublic/community/results-{year}'

    standings_urls = []
    if not self.standings_url:
        for url in (
            f'http://static.kattis.com/icpc/wf{year}/',
            f'https://zibada.guru/finals/{year}/',
            f'http://web.archive.org/web/{year}/https://icpc.baylor.edu/scoreboard/',
            f'http://web.archive.org/web/{year}/https://icpc.global/scoreboard/',
            f'https://cphof.org/standings/icpc/{year}',
            icpc_api_standings_url,
        ):
            try:
                page = REQ.get(url)
            except FailOnGetResponse:
                continue
            # Reject archive.org redirects to a snapshot of a wrong year.
            if 'web.archive.org' in REQ.last_url and f'/{year}' not in REQ.last_url:
                continue
            # The page must actually mention this year's world finals.
            if not re.search(
                    rf'\b(world\s*finals\s*{year}|{year}\s*world\s*finals)\b',
                    page, re.IGNORECASE):
                continue
            standings_urls.append(url)
    else:
        # The human-facing ICPC page is backed by the CMS API endpoint.
        if self.standings_url == icpc_standings_url:
            standings_urls.append(icpc_api_standings_url)
        else:
            standings_urls.append(self.standings_url)

    if not standings_urls:
        raise ExceptionParseStandings(
            f'Not found standings url year = {year}')

    for standings_url in standings_urls:
        is_icpc_api_standings_url = standings_url == icpc_api_standings_url
        page = REQ.get(standings_url)

        result = {}
        hidden_fields = set(self.info.get('hidden_fields', [])) | {'region'}
        problems_info = OrderedDict()

        if 'zibada' in standings_url:
            # Team names live in one embedded JS literal; the scoreboard
            # data usually lives in standings.js.
            match = re.search(r' = (?P<data>[\{\[].*?);?\s*$', page, re.MULTILINE)
            if match:
                names = self._json_load(match.group('data'))
            else:
                names = None
            try:
                page = REQ.get('standings.js')
                match = re.search(r' = (?P<data>\{.*?);?\s*$', page, re.MULTILINE)
                data = self._json_load(match.group('data'))
            except Exception:
                assert names
                data = names

            for p_name in data['problems']:
                problems_info[p_name] = {'short': p_name}

            events = data.pop('events', None)
            if events:
                # Event-log format: replay "<team> <problem> <status>
                # <attempt> <time>" records in chronological order.
                teams = {}
                time_divider = 60
                events.sort(key=lambda e: int(e.split()[-1]))
                for e in events:
                    tid, p_name, status, attempt, time = e.split()
                    time = int(time)
                    team = teams.setdefault(tid, {})
                    problems = team.setdefault('problems', {})
                    result = problems.get(p_name, {}).get('result', '')
                    # A pending ('?') event must not override a final verdict.
                    if not result.startswith('?') and status.startswith('?'):
                        continue
                    if status == '+':
                        attempt = int(attempt) - 1
                    p_info = problems_info[p_name]
                    problems[p_name] = {
                        'time': time,
                        'result': '+' if status == '+' and attempt == 0 else f'{status}{attempt}',
                    }
                for tid, team in teams.items():
                    name = names[int(tid)][0]
                    name = html.unescape(name)
                    team['member'] = f'{name} {season}'
                    team['name'] = name
                    # Recompute score/penalty (20 min per wrong attempt).
                    penalty = 0
                    solving = 0
                    for p_name, problem in team.get('problems', {}).items():
                        if problem['result'].startswith('+'):
                            solving += 1
                            attempt_penalty = (int(
                                problem['result'].lstrip('+')
                                or 0)) * 20 * time_divider
                            penalty += problem['time'] + attempt_penalty
                    team['penalty'] = int(round(penalty / time_divider))
                    team['solving'] = solving
            else:
                # Precomputed table format: one record per team, either a
                # dict or a positional list.
                teams = {}
                time_divider = 1
                data_teams = data['teams']
                if isinstance(data_teams, dict):
                    data_teams = data_teams.values()
                for team in data_teams:
                    row = {}

                    def get(key, index):
                        return team[key] if isinstance(team, dict) else team[index]

                    name = get('name', 0)
                    name = html.unescape(name)
                    row['member'] = f'{name} {season}'
                    row['name'] = name
                    row['solving'] = int(get('score', 2))
                    row['penalty'] = int(get('time', 3))

                    if isinstance(team, dict):
                        team['problems'] = [
                            team[str(index)]
                            for index in range(len(data['problems']))
                        ]

                    problems = row.setdefault('problems', {})
                    for p_name, verdict in zip(data['problems'], get('problems', 4)):
                        if not verdict:
                            continue
                        if isinstance(verdict, dict):
                            # Keys are abbreviated to their first letter:
                            # a(ttempts), p(ending), s(olved), t(ime).
                            verdict = {k[0]: v for k, v in verdict.items()}
                            verdict['a'] = int(verdict['a'])
                            if isinstance(verdict.get('p'), int):
                                verdict['a'] += verdict['p']
                            if isinstance(verdict['s'], str):
                                verdict['s'] = int(verdict['s'])
                            status = '+' if verdict['s'] else (
                                '?' if verdict.get('p', False) else '-')
                            time = verdict['t']
                            result = verdict['a']
                            time_divider = 1000 * 60
                            if not result:
                                continue
                        else:
                            status, result = verdict.split(' ', 1)
                            if ' ' in result:
                                result, time = result.split()
                                time = int(time)
                            else:
                                time = None
                            result = int(result)
                        problem = problems.setdefault(p_name, {})
                        if status == '+':
                            problem['time'] = time
                            problem['result'] = '+' if result == 1 else f'+{result - 1}'
                        else:
                            problem['result'] = f'{status}{result}'
                    teams[row['member']] = row

            # Rank teams by (solved desc, penalty asc); ties share a place.
            teams = list(teams.values())
            teams.sort(key=lambda t: (t['solving'], -t['penalty']), reverse=True)
            rank = 0
            prev = None
            for i, t in enumerate(teams):
                curr = (t['solving'], t['penalty'])
                if prev != curr:
                    rank = i + 1
                    prev = curr
                t['place'] = rank
            result = {t['member']: t for t in teams}
            problems_info = OrderedDict(sorted(problems_info.items()))
        else:
            if is_icpc_api_standings_url:
                # The CMS API splits one logical table into several <table>
                # fragments; stitch them back together.
                page = re.sub(
                    r'</table>\s*<table>\s*(<tr[^>]*>\s*<t[^>]*>)',
                    r'\1', page, flags=re.I)
            regex = '''(?:<table[^>]*(?:id=["']standings|class=["']scoreboard)[^>]*>|"content":"[^"]*<table[^>]*>|<table[^>]*class="[^"]*(?:table[^"]*){3}"[^>]*>).*?</table>'''  # noqa
            match = re.search(regex, page, re.DOTALL)
            if match:
                html_table = match.group(0)
                table = parsed_table.ParsedTable(
                    html_table, with_not_full_row=is_icpc_api_standings_url)
            else:
                table = []
            time_divider = 1
            last_place = None
            # Non-numeric "rank" cells start a trailing honorable-mention
            # list that spans the remaining rows.
            honorables = []
            for r in table:
                row = {}
                problems = row.setdefault('problems', {})
                for k, vs in r.items():
                    if isinstance(vs, list):
                        v = ' '.join(i.value for i in vs if i.value)
                    else:
                        v = vs.value
                    k = k.lower().strip('.')
                    v = v.strip()
                    if honorables:
                        if v:
                            honorables.append(v)
                        continue
                    if k in ('rank', 'rk', 'place'):
                        if not isinstance(vs, list):
                            medal = vs.column.node.xpath('.//img/@alt')
                            if medal and medal[0].endswith('medal'):
                                row['medal'] = medal[0].split()[0]
                        if v and not v[0].isdigit():
                            honorables.append(v)
                        row['place'] = v
                    elif k in ('team', 'name', 'university'):
                        if isinstance(vs, list):
                            # Pull the logo and region badge out of the
                            # multi-cell team column.
                            for el in vs:
                                logo = el.column.node.xpath('.//img/@src')
                                if logo:
                                    logo = urllib.parse.urljoin(
                                        standings_url, logo[0])
                                    row.setdefault('info', {})['logo'] = logo
                                    break
                            for el in vs:
                                region = el.column.node.xpath(
                                    './/*[@class="badge badge-warning"]')
                                if region:
                                    region = ''.join([
                                        s.strip()
                                        for s in region[0].xpath('text()')
                                    ])
                                    if region:
                                        row['region'] = region
                        if 'cphof' in standings_url:
                            member = vs.column.node.xpath(
                                './/a/text()')[0].strip()
                            row['member'] = f'{member} {season}'
                        else:
                            row['member'] = f'{v} {season}'
                        row['name'] = v
                    elif k in ('time', 'penalty', 'total time (min)', 'minutes'):
                        if v:
                            row['penalty'] = int(v)
                    elif k in ('slv', 'solved', '# solved'):
                        row['solving'] = int(v)
                    elif k == 'score':
                        if ' ' in v:
                            row['solving'], row['penalty'] = map(int, v.split())
                        else:
                            row['solving'] = int(v)
                    elif len(k) == 1:
                        # One-letter headers are problem columns.
                        k = k.title()
                        if k not in problems_info:
                            problems_info[k] = {'short': k}
                            if 'title' in vs.header.attrs:
                                problems_info[k]['name'] = vs.header.attrs['title']
                        # Normalize "<time> <attempts> tries" cells to
                        # "<attempts> <time>" and drop placeholders.
                        v = re.sub(r'([0-9]+)\s+([0-9]+)\s+tr.*', r'\2 \1', v)
                        v = re.sub('tr[a-z]*', '', v)
                        v = re.sub('-*', '', v)
                        v = v.strip()
                        if not v:
                            continue
                        p = problems.setdefault(k, {})
                        if '+' in v:
                            # Pending attempts, e.g. "2 + 1".
                            v = v.replace(' ', '')
                            p['result'] = f'?{v}'
                        elif ' ' in v:
                            pnt, time = map(int, v.split())
                            p['result'] = '+' if pnt == 1 else f'+{pnt - 1}'
                            p['time'] = time
                            if ('solvedfirst' in vs.column.attrs.get('class', '')
                                    or vs.column.node.xpath(
                                        './/*[contains(@class, "score_first")]')):
                                p['first_ac'] = True
                        else:
                            p['result'] = f'-{v}'
                # Rows with an empty rank cell inherit the previous place.
                if row.get('place'):
                    last_place = row['place']
                elif last_place:
                    row['place'] = last_place
                if 'member' not in row or row['member'].startswith(' '):
                    continue
                result[row['member']] = row

            # Some pages list honorable mentions as cards after the table.
            elements = etree.HTML(page).xpath(
                '//div[@class="card-header"]/following-sibling::div[@class="card-body"]//li'
            )  # noqa
            for el in elements:
                name = ''.join([s.strip() for s in el.xpath('text()')])
                member = f'{name} {season}'
                row = result.setdefault(member, {
                    'member': member,
                    'name': name
                })
                logo = el.xpath('./img/@src')
                if logo:
                    row.setdefault('info', {})['logo'] = urllib.parse.urljoin(
                        standings_url, logo[0])
                # Walk up to the nearest preceding card-header: its text is
                # the region name for this card.
                while el is not None:
                    prv = el.getprevious()
                    if prv is not None and prv.tag == 'div' and prv.get(
                            'class') == 'card-header':
                        break
                    el = el.getparent()
                if el is not None:
                    region = ''.join(
                        [s.strip() for s in prv.xpath('text()')])
                    row['region'] = region

            if result and honorables:
                for name in honorables:
                    if 'honorable' in name.lower():
                        continue
                    row = dict(name=name, member=f'{name} {season}')
                    result[row['member']] = row

        if not result:
            continue

        # Restore fields from previously stored statistics when missing.
        if statistics:
            for team, row in result.items():
                stat = statistics.get(team)
                if not stat:
                    continue
                for k, v in stat.items():
                    if k not in row:
                        hidden_fields.add(k)
                        row[k] = v

        # Merge region/team metadata from the ICPC team API, matched by
        # (canonized) university/team name.
        if any(['region' not in r for r in result.values()]):
            try:
                url = f'https://icpc.global/api/team/wf/{year}/published'
                page = REQ.get(url, time_out=60)
                data = self._json_load(page)
            except Exception:
                traceback.print_exc()
                data = None

            if data:
                def canonize_name(name):
                    name = name.lower()
                    name = name.replace('&', ' and ')
                    name = re.sub(r'\s{2,}', ' ', name)
                    name = re.split(r'(?:\s-\s|\s-|-\s|,\s)', name)
                    name = tuple(sorted([n.strip() for n in name]))
                    return name

                matching = {}
                for key, row in result.items():
                    name = row['name']
                    matching.setdefault(name, key)
                    name = canonize_name(name)
                    matching.setdefault(name, key)

                for site in data:
                    region = site['siteName']
                    for team in site['teams']:
                        name = team['university']
                        if name not in matching:
                            name = canonize_name(name)
                            if name not in matching:
                                name = tuple(
                                    sorted(name + canonize_name(team['name'])))
                        if name not in matching:
                            logger.warning(f'Not found team = {name}')
                        else:
                            row = result[matching[name]]
                            row['region'] = region
                            for k, v in team.items():
                                k = k.lower()
                                if k not in row:
                                    hidden_fields.add(k)
                                    row[k] = v

        # Mark the first accepted solution per problem and overall.
        first_ac_of_all = None
        for team in result.values():
            for p_name, problem in team.get('problems', {}).items():
                p_info = problems_info[p_name]
                if not problem['result'].startswith('+'):
                    continue
                time = problem['time']
                if 'first_ac' not in p_info or time < p_info['first_ac']:
                    p_info['first_ac'] = time
                if first_ac_of_all is None or time < first_ac_of_all:
                    first_ac_of_all = time
                if problem.get('first_ac'):
                    p_info['has_first_ac'] = True

        for team in result.values():
            for p_name, problem in team.get('problems', {}).items():
                p_info = problems_info[p_name]
                if problem['result'].startswith('+'):
                    # Scoreboard-provided first-AC flags take precedence.
                    if p_info.get('has_first_ac') and not problem.get('first_ac'):
                        continue
                    if problem['time'] == p_info['first_ac']:
                        problem['first_ac'] = True
                    if problem['time'] == first_ac_of_all:
                        problem['first_ac_of_all'] = True
                if 'time' in problem:
                    problem['time'] = int(
                        round(problem['time'] / time_divider))

        # Pending ('?') verdicts mean the contest is unjudged; skip medals.
        without_medals = any(p['result'].startswith('?')
                             for row in result.values()
                             for p in row.get('problems', {}).values())

        options = {'per_page': None}
        if not without_medals:
            medals = self._get_medals(year)
            if medals:
                medals = [{
                    'name': k,
                    'count': v
                } for k, v in medals.items()]
                options['medals'] = medals

        standings = {
            'result': result,
            # Expose the human-facing page even when parsed via the API.
            'url': icpc_standings_url if is_icpc_api_standings_url else standings_url,
            'problems': list(problems_info.values()),
            'options': options,
            'hidden_fields': list(hidden_fields),
        }
        return standings

    raise ExceptionParseStandings(
        f'Not found standings url from {standings_urls}')
def get_standings(self, users=None, statistics=None):
    """Parse per-division standings (divisions linked as
    ``..._<division>_results.html``), including per-problem test-case
    status strings rendered as '*'/'x' sequences.

    Returns a standings dict whose ``problems`` entry is a mapping with a
    per-division problem list under the 'division' key.
    """

    def parse_problems(page, full=False):
        """Extract problem panels; with full=True, split into consecutive
        problemsets whenever the index sequence restarts."""
        matches = re.finditer(
            r'''
            <div[^>]*class=['"]panel\s*historypanel['"][^>]*>\s*
            <div[^>]*>\s*<h[^>]*>(?P<index>[^<]*)</h[^>]*>\s*</div>\s*
            <div[^>]*>(\s*<[^>]*>)*(?P<name>[^<]+)
            (\s*<[^>]*>)*\s*<a[^>]*href=["'](?P<url>[^"']*)["'][^>]*>
            ''',
            page,
            re.VERBOSE)
        problems = []
        problemsets = []
        prev_index = None
        for match in matches:
            index = match.group('index')
            # A non-increasing index means a new problemset begins.
            if prev_index and index <= prev_index:
                if full:
                    problemsets.append(problems)
                    problems = []
                else:
                    break
            prev_index = index
            url = urllib.parse.urljoin(self.standings_url, match.group('url'))
            cpid = re.search('cpid=([0-9]+)', url).group(1)
            problems.append({
                'short': str(len(problems) + 1),
                'code': cpid,
                'name': match.group('name'),
                'url': url,
            })
        if problems:
            problemsets.append(problems)
        return problemsets if full else problems

    page = REQ.get(self.standings_url)

    # Division result links, and the description text between them (each
    # description belongs to the division link preceding it).
    divisions = list(
        re.finditer(
            '<a[^>]*href="(?P<url>[^"]*data[^"]*_(?P<name>[^_]*)_results.html)"[^>]*>',
            page))
    descriptions = []
    prev_span = None
    for division_match in divisions:
        curr_span = division_match.span()
        if prev_span is not None:
            descriptions.append(page[prev_span[1]:curr_span[0]])
        prev_span = curr_span
    if prev_span is not None:
        descriptions.append(page[prev_span[1]:])

    problems_info = OrderedDict()

    # Prefer the dedicated problems page when it exists.
    match = re.search(
        '''<a[^>]*href=["'](?P<href>[^"']*page=[a-z0-9]+problems)["'][^>]*>''',
        page)
    if match:
        url = urllib.parse.urljoin(self.standings_url, match.group('href'))
        page = REQ.get(url)
        problemsets = parse_problems(page, full=True)
        assert len(divisions) == len(problemsets)
    else:
        problemsets = None

    result = {}
    d0_set = set()
    for division_idx, (division_match, description) in enumerate(
            zip(divisions, descriptions)):
        division = division_match.group('name')
        d_problems = parse_problems(
            description
        ) if problemsets is None else problemsets[division_idx]
        division_info = problems_info.setdefault('division', OrderedDict())
        division_info[division] = d_problems
        # Problem shorts are prefixed with the division's first letter,
        # which must be unique across divisions.
        d0 = division[0].upper()
        assert d0 not in d0_set
        d0_set.add(d0)
        for p in d_problems:
            p['short'] = d0 + p['short']

        url = urllib.parse.urljoin(self.standings_url, division_match.group('url'))
        page = REQ.get(url)
        # Each list ("title") precedes its own results table.
        tables = re.finditer(
            r'>(?P<title>[^<]*)</[^>]*>\s*(?P<html><table[^>]*>.*?</table>)',
            page, re.DOTALL)
        for table_match in tables:
            title = table_match.group('title')
            table = parsed_table.ParsedTable(table_match.group('html'))
            for r in table:
                row = OrderedDict()
                problems = row.setdefault('problems', {})
                solved = 0
                idx = 0
                for key, value in r.items():
                    # NOTE(review): the first replace() argument is
                    # presumably a non-breaking space (U+00A0) normalized to
                    # a regular space — confirm against the original bytes.
                    key = key.replace(' ', ' ').strip()
                    if not key:
                        continue
                    if isinstance(value, list):
                        # Problem column: cells concatenate to a per-test
                        # status string; all '*' means fully solved.
                        status = ''.join(v.value for v in value)
                        idx += 1
                        if not status:
                            continue
                        partial = not bool(re.match(r'^[\*]+$', status))
                        solved += not partial
                        problems[d0 + str(idx)] = {
                            'partial': partial,
                            # Scale: 1000 points split evenly per problem,
                            # proportional to passed tests.
                            'result': 1000 / len(d_problems) * status.count('*') / len(status),
                            'status': status,
                        }
                    elif key == 'Score':
                        row['solving'] = int(value.value)
                    else:
                        row[key.lower()] = value.value.replace(' ', ' ').strip()
                row['member'] = f'{row["name"]}, {row["country"]}'
                row['division'] = division
                row['list'] = title.strip().strip(':')
                row['solved'] = {'solving': solved}
                result[row['member']] = row

    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': problems_info,
        'hidden_fields': ['list'],
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse standings of a Russian school olympiad from an HTML table
    with heterogeneous, mostly Russian column headers.

    Participant countries are resolved from city/extra fields through the
    Nominatim geocoder, with results cached in ``self.LOCATION_CACHE_FILE``
    (the cache is flushed even when parsing fails).
    """
    geolocator = Nominatim(user_agent="clist.by")
    geocode_func = partial(geolocator.geocode, timeout=10)
    geocode = RateLimiter(geocode_func, min_delay_seconds=1, max_retries=3)

    season = self.key.split('.')[0]

    if not self.standings_url:
        return {}

    page = REQ.get(self.standings_url)
    # Some pages misspell <tr> as <tl>; repair before parsing.
    page = re.sub('<(/?)tl([^>]*)>', r'<\1tr\2>', page)

    regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    if not match:
        # Fallback: take the LAST bordered table on the page.
        regex = r'<table\s*(?:align="center"\s*)?border="1"\s*(?:align="center"\s*)?>.*?</table>'
        matches = re.finditer(regex, page, re.DOTALL)
        for match in matches:
            pass
    if not match:
        raise ExceptionParseStandings('not found standings table')
    html_table = match.group(0)

    # Header (lowercased) -> normalized field name.
    c_mapping = {
        'place': 'place',
        'место': 'place',
        'user': '******',
        'team': 'name',
        'участник': 'name',
        'solved': 'solved',
        'total': 'solved',
        'имя': 'first_name',
        'фамилия': 'last_name',
        'отчество': 'middle_name',
        'логин': 'login',
        'login': '******',
        'класс': 'class',
        'город': 'city',
        'субъект российской федерации (для иностранных участников - государство)': 'city',
        'балл': 'solving',
        'сумма': 'solving',
        'баллы': 'solving',
        'score': 'solving',
        'sum': 'solving',
        'диплом': 'diploma',
        'степень диплома': 'diploma',
        'номер диплома': 'diploma_number',
        'страна': 'country',
        'школа (сокр.)': 'school',
        'школа': 'school',
        'учебное зачедение, класс': 'school',
        'регион/статус': 'region',
        'регион': 'region',
        'имя в таблице': 'handle',
        'uid': 'uid',
    }

    table = parsed_table.ParsedTable(html_table, strip_empty_columns=True)

    locations = None
    if os.path.exists(self.LOCATION_CACHE_FILE):
        with open(self.LOCATION_CACHE_FILE, 'r') as fo:
            locations = yaml.safe_load(fo)
    if locations is None:
        locations = {}

    def get_location(loc_info):
        """Geocode a location string (both ru/en), memoized in `locations`."""
        loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
        if loc_info not in locations:
            try:
                ru = geocode(loc_info, language='ru')
                en = geocode(loc_info, language='en')
                if ru is None and en is None:
                    locations[loc_info] = None
                else:
                    locations[loc_info] = {
                        'ru': ru.address,
                        'en': en.address
                    }
            except Exception:
                pass
        return locations.get(loc_info)

    def get_country(address):
        # The country is the last component of the English address.
        *_, country = map(str.strip, address['en'].split(','))
        if country.startswith('The '):
            country = country[4:]
        return country

    try:
        result = {}
        problems_info = OrderedDict()
        has_bold = False
        last, place, placing = None, None, {}
        for idx, r in enumerate(tqdm.tqdm(table, total=len(table)), start=1):
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            letter = chr(ord('A') - 1)
            solved = 0
            for k, v in list(r.items()):
                is_russian = bool(re.search('[а-яА-Я]', k))
                c = v.attrs.get('class')
                c = c.split()[0] if c else k.lower()
                if c and c.startswith('st_'):
                    c = c[3:].lower()
                # Unmapped non-Russian headers are treated as problem columns.
                if c in ['prob'] or c not in c_mapping and not is_russian:
                    letter = chr(ord(letter) + 1)
                    problem_info = problems_info.setdefault(
                        letter, {
                            'short': letter,
                            'full_score': 100,
                        })
                    if letter.lower() != k.lower():
                        problem_info['name'] = k
                    if 'title' in v.attrs:
                        problem_info['name'] = v.attrs['title']
                    if v.value != DOT and v.value:
                        p = problems.setdefault(letter, {})
                        # Bold score means a full (non-partial) solution.
                        if v.column.node.xpath('b'):
                            p['partial'] = False
                            has_bold = True
                        v = v.value
                        if SPACE in v:
                            # Trailing "(n)" is an attempts count; anything
                            # else after the score is a time.
                            v, t = v.split(SPACE, 1)
                            t = t.strip()
                            m = re.match(r'^\((?P<val>[0-9]+)\)$', t)
                            if m:
                                t = int(m.group('val'))
                                if t > 1:
                                    p['attempts'] = t - 1
                            else:
                                p['time'] = t
                        try:
                            score = float(v)
                            p['result'] = v
                            p['partial'] = score < problem_info['full_score']
                        except ValueError:
                            pass
                        if 'partial' in p and not p['partial']:
                            solved += 1
                else:
                    v = v.value.strip()
                    if not v or v == '-':
                        continue
                    c = c_mapping.get(c, c).lower()
                    row[c] = v
                    if c == 'diploma':
                        row['_medal_title_field'] = 'diploma'
                        v = v.lower().split()[0]
                        # Out-of-competition entries get no medal.
                        if re.search('(^в.к|^вне)', v):
                            continue
                        if v in ['gold', 'i', '1'] or v.startswith('перв'):
                            row['medal'] = 'gold'
                        elif v in ['silver', 'ii', '2'] or v.startswith('втор'):
                            row['medal'] = 'silver'
                        elif v in ['bronze', 'iii', '3'] or v.startswith('трет'):
                            row['medal'] = 'bronze'
                        else:
                            row['medal'] = 'honorable'
            if 'solving' not in row:
                if 'solved' in row:
                    row['solving'] = row.pop('solved')
                else:
                    continue
            row['solved'] = {'solving': solved}
            if 'place' not in row:
                if place is None and idx != 1:
                    continue
                # Equal scores share a place; remember each place's last row
                # index to render ranges like "3-5" afterwards.
                if row['solving'] != last:
                    place = idx
                    last = row['solving']
                placing[place] = idx
                row['place'] = place
            if 'name' not in row:
                if 'first_name' in row and 'last_name' in row:
                    row['name'] = row['last_name'] + ' ' + row['first_name']
                elif 'first_name' in row and 'last_name' not in row:
                    row['name'] = row.pop('first_name')
            # Member key preference: login > full name + season > synthetic.
            if 'login' in row:
                row['member'] = row['login']
                if 'name' in row:
                    row['_name_instead_key'] = True
            elif 'name' in row:
                name = row['name']
                if ' ' in name:
                    row['member'] = name + ' ' + season
                else:
                    row.pop('name')
                    row['member'] = name
            else:
                row['member'] = f'{self.pk}-{idx}'

            addition = (statistics or {}).get(row['member'], {})
            if addition:
                country = addition.get('country')
                if country:
                    row.setdefault('country', country)

            if 'country' not in row:
                # Geocode the city / "extra" fields to recover the country.
                locs = []
                if 'city' in row:
                    locs.append(row['city'])
                if 'extra' in row:
                    extra = row['extra']
                    # FIX: re.IGNORECASE was passed as the positional
                    # `count` argument of re.sub (re.IGNORECASE == 2), which
                    # capped replacements at 2 and did NOT enable
                    # case-insensitive matching; pass it as flags=.
                    extra = re.sub(
                        r'\s*(Не\s*РФ|Not\s*RF|Участник\s*вне\s*конкурса):\s*',
                        ' ', extra, flags=re.IGNORECASE)
                    extra = re.sub('<[^>]*>', '', extra)
                    locs.extend(re.split('[,:]', extra))
                for loc in locs:
                    loc = re.sub(r'\s*[0-9]+\s*', ' ', loc)
                    loc = loc.strip()
                    address = get_location(loc)
                    if address:
                        country = get_country(address)
                        row['country'] = country
                        break

            result[row['member']] = row

        if placing:
            for row in result.values():
                place = row['place']
                last = placing[place]
                row['place'] = str(place) if place == last else f'{place}-{last}'

        # When bold marks full solutions, every non-bold score is partial.
        if has_bold:
            for row in result.values():
                for p in row.get('problems').values():
                    if 'partial' not in p and 'result' in p:
                        p['partial'] = True
    finally:
        # Always persist the geocoding cache, even on failure.
        with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
            yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)

    standings = {
        'result': result,
        'problems': list(problems_info.values()),
        'hidden_fields': [
            'extra',
            'first_name',
            'last_name',
            'middle_name',
            'class',
            'city',
            'country',
            'diploma',
            'school',
            'login',
            'region',
            'uid',
            'handle',
            'diploma_number',
        ],
    }
    if not statistics and result:
        standings['timing_statistic_delta'] = timedelta(minutes=5)
    return standings
def get_results(standings_url, division_data):
    """Scrape one division's results and return rows keyed by member handle.

    Two page formats are supported:
    * ``format == 'json'`` — a JSON payload with teams plus an optional
      per-problem / per-tournament score breakdown;
    * anything else — an HTML page whose table is located by xpath and
      parsed through ``parsed_table.ParsedTable`` with a header mapping.

    When the page carries no explicit ranking column, places are derived
    from scores (ties share a place).
    """
    page = REQ.get(standings_url)
    if division_data.get('format') == 'json':
        data = json.loads(page)
        # Detect which score breakdown the payload carries, if any.
        if 'problems' in data:
            scores_field = 'problem'
        elif 'tournaments' in data:
            scores_field = 'tournament'
        else:
            scores_field = None
        if scores_field:
            # Map raw score ids to short column labels.
            scores_fields_mapping = {'submission': 'T', 'request': 'R'}
            scores_mapping = OrderedDict()
            for score in data[f'{scores_field}s']:
                name = str(score[f'{scores_field}Id'])
                scores_mapping[name] = scores_fields_mapping.get(name, name.split(':')[-1])
        table = []
        for team in data['teams']:
            entry = OrderedDict()
            entry['name'] = team['team']['teamName']
            entry['solving'] = team['score']
            entry['country'] = team['team']['customData']['country']
            if scores_field:
                breakdown = entry.setdefault('_scores', OrderedDict())
                scores = team[f'{scores_field}s']
                for field, short in scores_mapping.items():
                    if field in scores:
                        breakdown[short] = as_number(scores.get(field, {}).get('score'))
            table.append(entry)
    else:
        mapping = {
            'Rank': 'place',
            '': 'place',
            'Score': 'solving',
            'score': 'solving',
            'Total Score': 'solving',
            'Team': 'name',
            'name': 'name',
            'score + unspent LAM': 'unspent_lam',
        }
        xpath = division_data.get('xpath', '//table//tr')
        table = parsed_table.ParsedTable(html=page, header_mapping=mapping, xpath=xpath)

    season = self.get_season()
    ret = {}
    was_place = False
    for src in table:
        row = OrderedDict()
        for key, val in src.items():
            if key == 'place':
                was_place = True
            if isinstance(val, parsed_table.ParsedTableValue):
                val = val.value
            if key == 'name':
                row['name'] = val
                row['member'] = f'{val} {season}'
            elif key in {'place', 'solving'}:
                row[key] = as_number(val)
            else:
                row[key] = val
        ret[row['member']] = row

    if not was_place:
        # No ranking column on the page: rank by score, ties sharing a place.
        ordered = sorted(ret.values(), key=lambda r: r['solving'], reverse=True)
        place = None
        last_score = None
        for idx, row in enumerate(ordered, start=1):
            if row['solving'] != last_score:
                last_score = row['solving']
                place = idx
            row['place'] = place
    return ret
def get_standings(self, users=None, statistics=None):
    """Parse an ICPC-style standings HTML page into a standings dict.

    Tries to fetch an XML companion (same URL with ``.xml`` instead of
    ``.html``) whose per-team problem results, when present, are merged
    into the rows parsed from the HTML table.
    """
    # Season label such as '2023-2024'; the academic year rolls over in September.
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)
    result = {}
    problems_info = OrderedDict()
    try:
        # Optional machine-readable companion; missing page is not an error.
        standings_xml = REQ.get(self.standings_url.replace('.html', '.xml'),
                                detect_charsets=False)
        xml_result = parse_xml(standings_xml)
    except FailOnGetResponse:
        xml_result = {}
    page = REQ.get(self.standings_url)
    regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
    match = re.search(regex, page, re.DOTALL)
    if not match:
        # Fallback: drop wrapper tables and take the first remaining table.
        page = re.sub('<table[^>]*wrapper[^>]*>', '', page)
        regex = '<table[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
    html_table = match.group(0)
    table = parsed_table.ParsedTable(html_table)
    # Optional per-resource regex (from config) to extract a university name.
    university_regex = self.info.get('standings', {}).get('1st_u', {}).get('regex')
    for r in table:
        row = {}
        problems = row.setdefault('problems', {})
        for k, v in list(r.items()):
            # Only the first word of the header is significant.
            k = k.split()[0]
            if k == 'Total' or k == '=':
                row['solving'] = int(v.value)
            elif len(k) <= 3:
                # Short headers are problem columns (e.g. 'A', 'B1').
                problems_info[k] = {'short': k}
                if 'title' in v.attrs:
                    problems_info[k]['name'] = v.attrs['title']
                # Non-empty verdicts look like '+', '+2', '-1', '?', optionally
                # followed by a time after a space.
                if '-' in v.value or '+' in v.value or '?' in v.value:
                    p = problems.setdefault(k, {})
                    if ' ' in v.value:
                        point, time = v.value.split()
                        p['time'] = time
                    else:
                        point = v.value
                    p['result'] = point
                    first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                    if len(first_ac):
                        p['first_ac'] = True
            elif k == 'Time':
                row['penalty'] = int(v.value)
            elif k.lower() in ['place', 'rank']:
                row['place'] = v.value.strip('.')
            elif 'team' in k.lower() or 'name' in k.lower():
                # Merge the XML per-problem data keyed by team name, if fetched.
                if xml_result:
                    problems.update(xml_result[v.value])
                row['member'] = v.value + ' ' + season
                row['name'] = v.value
            else:
                row[k] = v.value
        # Normalize a diploma/medal column (Cyrillic or Latin initials) into
        # the canonical medal field; first matching column wins.
        for f in 'diploma', 'medal':
            medal = row.pop(f, None) or row.pop(f.title(), None)
            if medal:
                if medal in ['З', 'G']:
                    row['medal'] = 'gold'
                elif medal in ['С', 'S']:
                    row['medal'] = 'silver'
                elif medal in ['Б', 'B']:
                    row['medal'] = 'bronze'
                break
        if university_regex:
            match = re.search(university_regex, row['name'])
            if match:
                u = match.group('key').strip()
                row['university'] = u
        result[row['member']] = row
    standings = {
        'result': result,
        'url': self.standings_url,
        'problems': list(problems_info.values()),
        'problems_time_format': '{M}:{s:02d}',
        'hidden_fields': ['university'],
    }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse an olympiad standings table with per-problem score columns.

    Rows below a 'qualification threshold' separator are marked as not
    advanced; for final rounds, one gold/silver/bronze medal option each
    is attached to the standings.
    """
    if not self.standings_url:
        raise ExceptionParseStandings('Not set stnadings url')
    is_final = self.name.lower().startswith('final round')
    now = datetime.utcnow().replace(tzinfo=pytz.utc)
    # Non-final standings are only parsed up to three days after the contest.
    if not is_final and self.end_time + timedelta(days=3) < now:
        raise ExceptionParseStandings('Too late')
    page = REQ.get(self.standings_url)
    html_table = re.search('<table[^>]*>.*?</table>', page,
                           re.MULTILINE | re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table, as_list=True,
                                     ignore_wrong_header_number=False,
                                     ignore_display_none=True)
    problems_info = OrderedDict()
    result = {}
    season = self.get_season()
    advanced = False
    for r in table:
        if isinstance(r, parsed_table.ParsedTableRow):
            # Separator row: everyone already parsed is above the
            # qualification threshold and therefore advances.
            if re.search(r'qualification\s*threshold', r.columns[0].value, re.I):
                advanced = True
                for row in result.values():
                    row['advanced'] = True
            continue
        row = OrderedDict()
        problems = row.setdefault('problems', {})
        if advanced:
            # Rows after the separator default to not advanced.
            row['advanced'] = False
        pid = 0
        for k, v in r:
            if k == '#':
                row['place'] = v.value
            elif k == 'Name':
                row['name'] = v.value
            elif k.startswith('Total'):
                row['solving'] = v.value
            elif '_top_column' in v.header.attrs:
                # Problem columns are grouped under a top header cell.
                problem_key = str(pid)
                if problem_key not in problems_info:
                    name = v.header.attrs['_top_column'].value
                    p_info = {'code': problem_key}
                    # Header text like 'Problem name (100)': name + full score.
                    p_info_regex = r'^(?P<name>.*)\s+\(?(?P<score>[0-9]{2,})\)?$'
                    match = re.search(p_info_regex, name)
                    if match:
                        name = match.group('name').strip()
                    match = re.search(p_info_regex, k)
                    if match:
                        p_info['subname'] = match.group('name').strip()
                        p_info['full_score'] = int(match.group('score'))
                    p_info['name'] = name
                    href = v.header.node.xpath('a/@href')
                    if href:
                        p_info['suburl'] = href[0]
                        p_info['url'] = href[0]
                    problems_info[problem_key] = p_info
                if v.value:
                    try:
                        val = float(v.value)
                        if val:
                            p = problems.setdefault(problem_key, {})
                            p['result'] = v.value
                            full_score = problems_info[problem_key].get('full_score')
                            if full_score is not None:
                                p['partial'] = val < full_score
                            else:
                                # No declared full score: infer partiality from
                                # cell color and remember this score as the
                                # full score.
                                style = v.attrs.get('style')
                                if style:
                                    if 'yellow' in style:
                                        p['partial'] = True
                                    elif 'lightgreen' in style:
                                        p['partial'] = False
                                # NOTE(review): full_score is always None in
                                # this branch, so the guard is redundant.
                                if full_score is None:
                                    problems_info[problem_key]['full_score'] = int(round(val, 0))
                    except ValueError:
                        # Non-numeric cell (e.g. a dash) carries no score.
                        pass
                pid += 1
            else:
                # Unrecognized columns are preserved as auxiliary info.
                row.setdefault('_info', {})[k] = v.value
        # Skip rows without any problem data and duplicate handles.
        if not problems:
            continue
        handle = row['name'] + ' ' + season
        row['member'] = handle
        if handle in result:
            continue
        result[handle] = row
    standings = {
        'result': result,
        'problems': list(problems_info.values()),
    }
    if is_final:
        standings['options'] = {
            'medals': [{
                'name': k,
                'count': 1
            } for k in ('gold', 'silver', 'bronze')]
        }
    return standings
def get_standings(self, users=None, statistics=None):
    """Parse IOI-style standings, enriching rows with geocoded locations.

    Location strings extracted from participant names are resolved via
    Nominatim (rate-limited) and cached in a YAML file; the cache is
    persisted even when parsing fails midway.
    """
    geolocator = Nominatim(user_agent="clist.by")
    # Respect Nominatim usage policy: at most one request per second.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3)
    # Season label such as '2023-2024'; the academic year rolls over in September.
    year = self.start_time.year
    year = year if self.start_time.month >= 9 else year - 1
    season = '%d-%d' % (year, year + 1)
    if not self.standings_url:
        return {}
    try:
        # Optional machine-readable companion; missing page is not an error.
        standings_xml = REQ.get(self.standings_url.replace('.html', '.xml'),
                                detect_charsets=False)
        xml_result = parse_xml(standings_xml)
    except FailOnGetResponse:
        xml_result = {}
    page = REQ.get(self.standings_url)
    regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
    html_table = re.search(regex, page, re.DOTALL).group(0)
    table = parsed_table.ParsedTable(html_table)
    # Map CSS class names of non-problem cells to canonical row fields.
    mapping_key = {
        'rank': 'place',
        'rankl': 'place',
        'party': 'name',
        'solved': 'solving',
    }
    locations = None
    if os.path.exists(self.LOCATION_CACHE_FILE):
        with open(self.LOCATION_CACHE_FILE, 'r') as fo:
            locations = yaml.safe_load(fo)
    if locations is None:
        locations = {}
    try:
        result = {}
        problems_info = OrderedDict()
        for r in tqdm.tqdm(table):
            row = OrderedDict()
            problems = row.setdefault('problems', {})
            for k, v in list(r.items()):
                # First CSS class of the cell decides how it is interpreted.
                c = v.attrs['class'].split()[0]
                if c in ['problem', 'ioiprob']:
                    problems_info[k] = {'short': k, 'name': v.attrs['title']}
                    if v.value != DOT:
                        p = problems.setdefault(k, {})
                        first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                        if len(first_ac):
                            p['first_ac'] = True
                        # Underlined IOI cells denote partial scores.
                        partial = v.column.node.xpath('self::td[@class="ioiprob"]/u')
                        if partial:
                            p['partial'] = True
                        v = v.value
                        if SPACE in v:
                            # Cell like '<score> <time>'.
                            v, t = v.split(SPACE, 1)
                            p['time'] = t
                        p['result'] = v
                else:
                    c = mapping_key.get(c, c)
                    row[c] = v.value
                    # Merge XML per-problem data keyed by participant name.
                    if xml_result and c == 'name':
                        problems.update(xml_result[v.value])
            if 'penalty' not in row:
                # IOI-style row: the name may end with '(grade, location)'.
                match = re.search(r'\s*\((?P<info>[^\)]*)\)\s*$', row['name'])
                if match:
                    row['name'] = row['name'][:match.span()[0]]
                    group_info = match.group('info')
                    if u'класс' in group_info:
                        # 'класс' is Russian for grade/class.
                        row['degree'], loc_info = map(str.strip, group_info.split(',', 1))
                    else:
                        loc_info = group_info
                    loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
                    if loc_info not in locations:
                        # Best-effort geocoding; failures are cached as None
                        # so the same string is not retried on every run.
                        try:
                            locations[loc_info] = {
                                'ru': geocode(loc_info, language='ru').address,
                                'en': geocode(loc_info, language='en').address,
                            }
                        except Exception:
                            locations[loc_info] = None
                    address = locations[loc_info]
                    if address:
                        # Country is the last component of the English address.
                        *_, country = map(str.strip, address['en'].split(','))
                        if country.startswith('The '):
                            country = country[4:]
                        row['country'] = country
                        if ', ' in address['ru']:
                            row['city'], *_ = map(str.strip, address['ru'].split(','))
                # Count fully solved problems (score of exactly '100').
                solved = [p for p in list(problems.values()) if p['result'] == '100']
                row['solved'] = {'solving': len(solved)}
            elif re.match('^[0-9]+$', row['penalty']):
                row['penalty'] = int(row['penalty'])
            # Normalize a diploma/medal column (Cyrillic or Latin initials)
            # into the canonical medal field; first matching column wins.
            for f in 'diploma', 'medal':
                medal = row.pop(f, None) or row.pop(f.title(), None)
                if medal:
                    if medal in ['З', 'G']:
                        row['medal'] = 'gold'
                    elif medal in ['С', 'S']:
                        row['medal'] = 'silver'
                    elif medal in ['Б', 'B']:
                        row['medal'] = 'bronze'
                    break
            row['member'] = row['name'] + ' ' + season
            result[row['member']] = row
    finally:
        # Persist the geocoding cache even if parsing raised above.
        with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
            yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)
    standings = {
        'result': result,
        'problems': list(problems_info.values()),
    }
    return standings