Esempio n. 1
0
    def get_standings(self, users=None, statistics=None):
        """Build the standings dict from the HTML standings page.

        The page at ``self.standings_url`` is fetched and its standings table
        is parsed row by row. The same URL with ``.html`` replaced by ``.xml``
        is also requested; when that request raises ``FailOnGetResponse`` the
        XML data is simply not used.

        ``users`` and ``statistics`` are accepted for interface compatibility
        and are not read here.

        Returns a dict with the keys ``result`` (rows keyed by
        ``"<team name> <season>"``), ``url``, ``problems``,
        ``problems_time_format`` and ``hidden_fields``.
        """
        year = self.start_time.year
        # Months before September are attributed to the season that started
        # in the previous calendar year.
        year = year if self.start_time.month >= 9 else year - 1
        season = '%d-%d' % (year, year + 1)

        result = {}
        problems_info = OrderedDict()

        try:
            xml_url = self.standings_url.replace('.html', '.xml')
            standings_xml = REQ.get(xml_url, detect_charsets=False)
            xml_result = parse_xml(standings_xml)
        except FailOnGetResponse:
            xml_result = {}

        page = REQ.get(self.standings_url)

        regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
        if not match:
            # Fallback: drop "wrapper" table openers and take the first table.
            page = re.sub('<table[^>]*wrapper[^>]*>', '', page)
            regex = '<table[^>]*>.*?</table>'
            match = re.search(regex, page, re.DOTALL)
        # NOTE: if the page contains no table at all, ``match`` is None and
        # this line raises AttributeError (behaviour kept from the original).
        html_table = match.group(0)
        table = parsed_table.ParsedTable(html_table)

        university_regex = self.info.get('standings', {}).get('1st_u', {}).get('regex')
        for r in table:
            row = {}
            problems = row.setdefault('problems', {})
            for k, v in list(r.items()):
                # Only the first word of the column header is significant.
                k = k.split()[0]
                if k in ('Total', '='):
                    row['solving'] = int(v.value)
                elif len(k) <= 3:
                    # Short headers are treated as problem columns. Reuse the
                    # existing entry so that a name picked up from one row is
                    # not dropped when a later row's cell has no title.
                    info = problems_info.setdefault(k, {'short': k})
                    if 'title' in v.attrs:
                        info['name'] = v.attrs['title']

                    if any(sign in v.value for sign in '-+?'):
                        p = problems.setdefault(k, {})
                        if ' ' in v.value:
                            point, solved_time = v.value.split()
                            p['time'] = solved_time
                        else:
                            point = v.value
                        p['result'] = point

                        first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                        if len(first_ac):
                            p['first_ac'] = True
                elif k == 'Time':
                    row['penalty'] = int(v.value)
                elif k.lower() in ['place', 'rank']:
                    row['place'] = v.value.strip('.')
                elif 'team' in k.lower() or 'name' in k.lower():
                    if xml_result:
                        # A team missing from the XML must not abort parsing.
                        problems.update(xml_result.get(v.value, {}))
                    row['member'] = v.value + ' ' + season
                    row['name'] = v.value
                else:
                    row[k] = v.value

            # Translate a one-letter diploma/medal mark into a medal name.
            # The first non-empty column wins; unknown marks are discarded.
            for f in 'diploma', 'medal':
                medal = row.pop(f, None) or row.pop(f.title(), None)
                if medal:
                    if medal in ['З', 'G']:
                        row['medal'] = 'gold'
                    elif medal in ['С', 'S']:
                        row['medal'] = 'silver'
                    elif medal in ['Б', 'B']:
                        row['medal'] = 'bronze'
                    break

            if university_regex:
                match = re.search(university_regex, row['name'])
                if match:
                    row['university'] = match.group('key').strip()
            result[row['member']] = row

        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
            'problems_time_format': '{M}:{s:02d}',
            'hidden_fields': ['university'],
        }
        return standings
Esempio n. 2
0
    def get_standings(self, users=None, statistics=None):
        """Build the standings dict from the HTML standings page.

        The page at ``self.standings_url`` is fetched and its standings table
        is parsed row by row. The same URL with ``.html`` replaced by ``.xml``
        is also requested; when that request raises ``FailOnGetResponse`` the
        XML data is simply not used.

        When ``statistics`` is truthy and ``self.info`` has the
        ``use_icpc.kimden.online`` flag set, each row that has an entry in
        ``statistics`` also gets a ``region``: the stored one if present,
        otherwise one looked up on https://icpc.kimden.online/. Teams that
        cannot be found there are left without a region.

        ``users`` is accepted for interface compatibility and is not read.

        Returns a dict with the keys ``result`` (rows keyed by
        ``"<team name> <season>"``), ``url``, ``problems``,
        ``problems_time_format`` and ``hidden_fields``.
        """
        year = self.start_time.year
        # Months before September are attributed to the season that started
        # in the previous calendar year.
        year = year if self.start_time.month >= 9 else year - 1
        season = '%d-%d' % (year, year + 1)

        result = {}
        problems_info = OrderedDict()

        page = REQ.get(self.standings_url)

        try:
            xml_url = self.standings_url.replace('.html', '.xml')
            standings_xml = REQ.get(xml_url, detect_charsets=False)
            xml_result = parse_xml(standings_xml)
        except FailOnGetResponse:
            xml_result = {}

        regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
        match = re.search(regex, page, re.DOTALL)
        if not match:
            # Fallback: drop "wrapper" table openers and take the first table.
            page = re.sub('<table[^>]*wrapper[^>]*>', '', page)
            regex = '<table[^>]*>.*?</table>'
            match = re.search(regex, page, re.DOTALL)
        # NOTE: if the page contains no table at all, ``match`` is None and
        # this line raises AttributeError (behaviour kept from the original).
        html_table = match.group(0)
        table = parsed_table.ParsedTable(html_table, as_list=True)

        university_regex = self.info.get('standings', {}).get('1st_u', {}).get('regex')
        for r in table:
            row = {}
            problems = row.setdefault('problems', {})
            for k, v in r:
                # Only the first word of the column header is significant.
                k = k.split()[0]
                if k in ('Total', '='):
                    row['solving'] = int(v.value)
                elif len(k) <= 3:
                    # Short headers are treated as problem columns. Reuse the
                    # existing entry so that a name picked up from one row is
                    # not dropped when a later row's cell has no title.
                    info = problems_info.setdefault(k, {'short': k})
                    if 'title' in v.attrs:
                        info['name'] = v.attrs['title']

                    if any(sign in v.value for sign in '-+?'):
                        p = problems.setdefault(k, {})
                        if ' ' in v.value:
                            point, solved_time = v.value.split()
                        else:
                            point = v.value
                            solved_time = None
                        # Data already stored for this problem (e.g. from the
                        # XML) is discarded when its result disagrees with
                        # the HTML cell.
                        if 'result' in p and point != p.get('result'):
                            p.clear()
                        p['result'] = point
                        if solved_time is not None:
                            p['time'] = solved_time

                        first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                        if len(first_ac):
                            p['first_ac'] = True
                elif k == 'Time':
                    row['penalty'] = int(v.value)
                elif k.lower() in ['place', 'rank']:
                    row['place'] = v.value.strip('.')
                elif 'team' in k.lower() or 'name' in k.lower():
                    if xml_result:
                        # A team missing from the XML must not abort parsing.
                        problems.update(xml_result.get(v.value, {}))
                    row['member'] = v.value + ' ' + season
                    row['name'] = v.value
                else:
                    row[k] = v.value

            # Translate a one-letter diploma/medal mark into a medal name.
            # The first non-empty column wins; unknown marks are discarded.
            for f in 'diploma', 'medal':
                medal = row.pop(f, None) or row.pop(f.title(), None)
                if medal:
                    if medal in ['З', 'G']:
                        row['medal'] = 'gold'
                    elif medal in ['С', 'S']:
                        row['medal'] = 'silver'
                    elif medal in ['Б', 'B']:
                        row['medal'] = 'bronze'
                    break

            if university_regex:
                match = re.search(university_regex, row['name'])
                if match:
                    row['university'] = match.group('key').strip()
            result[row['member']] = row

        if statistics and self.info.get('use_icpc.kimden.online'):
            # None means "not downloaded yet"; an empty dict is a valid
            # (if useless) loaded state and must not trigger a re-download.
            team_regions = None

            def canonize_name(name):
                """Drop colons and collapse whitespace runs to one space."""
                name = re.sub(':', '', name)
                name = re.sub(r'\s+', ' ', name)
                return name

            def load_team_regions():
                """Download the external page and map team name -> region."""
                regions_page = REQ.get('https://icpc.kimden.online/')

                # Region selectors: CSS-class-like key -> region title.
                regions = {}
                for label in re.finditer(
                    '<label[^>]*for="(?P<selector>[^"]*)"[^"]*onclick="setRegion[^"]*"[^>]*>(?P<name>[^>]*)</',
                    regions_page,
                ):
                    selector = label.group('selector').replace('selector', '').replace('--', '-')
                    regions[selector] = label.group('name')

                # Team rows: the first class that names a region decides.
                loaded = {}
                for team in re.finditer(
                    r'''
                    <tr[^>]*class="(?P<class>[^"]*)"[^>]*>\s*<td[^>]*>[^<]*</td>\s*<td[^>]*title="(?P<name>[^"]*)">[^<]*</td>
                    ''',
                    regions_page,
                    re.VERBOSE,
                ):
                    name = canonize_name(team.group('name'))
                    for c in team.group('class').split():
                        if c in regions:
                            loaded[name] = regions[c]
                            break
                return loaded

            def get_region(team_name):
                """Return the region of ``team_name`` or None if unknown."""
                nonlocal team_regions
                if team_regions is None:
                    team_regions = load_team_regions()
                return team_regions.get(canonize_name(team_name))

            for row in result.values():
                stat = statistics.get(row['member'])
                if not stat:
                    continue
                region = stat.get('region') or get_region(row['name'])
                if region:
                    row['region'] = region

        standings = {
            'result': result,
            'url': self.standings_url,
            'problems': list(problems_info.values()),
            'problems_time_format': '{M}:{s:02d}',
            'hidden_fields': ['university', 'region', 'medal'],
        }
        return standings
Esempio n. 3
0
    def get_standings(self, users=None, statistics=None):
        """Build the standings dict from the HTML standings page.

        The page at ``self.standings_url`` is fetched and its
        ``class="standings"`` table is parsed row by row; cells are dispatched
        on the first word of their ``class`` attribute. The same URL with
        ``.html`` replaced by ``.xml`` is also requested; when that request
        raises ``FailOnGetResponse`` the XML data is simply not used.

        For rows without a ``penalty`` column, a trailing ``(...)`` group in
        the name is split off and geocoded to fill ``country`` / ``city``.
        Geocoding results are cached in ``self.LOCATION_CACHE_FILE``, which is
        rewritten at the end even when parsing fails.

        ``users`` and ``statistics`` are accepted for interface compatibility
        and are not read here.

        Returns ``{}`` when there is no standings URL, otherwise a dict with
        the keys ``result`` (rows keyed by ``"<name> <season>"``) and
        ``problems``.
        """
        geolocator = Nominatim(user_agent="clist.by")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3)

        year = self.start_time.year
        # Months before September are attributed to the season that started
        # in the previous calendar year.
        year = year if self.start_time.month >= 9 else year - 1
        season = '%d-%d' % (year, year + 1)

        if not self.standings_url:
            return {}

        try:
            standings_xml = REQ.get(self.standings_url.replace('.html', '.xml'), detect_charsets=False)
            xml_result = parse_xml(standings_xml)
        except FailOnGetResponse:
            xml_result = {}

        page = REQ.get(self.standings_url)

        regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
        # NOTE: raises AttributeError when the page has no standings table
        # (behaviour kept from the original).
        html_table = re.search(regex, page, re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table)
        # Cell class -> row key.
        mapping_key = {
            'rank': 'place',
            'rankl': 'place',
            'party': 'name',
            'solved': 'solving',
        }

        locations = None
        if os.path.exists(self.LOCATION_CACHE_FILE):
            with open(self.LOCATION_CACHE_FILE, 'r') as fo:
                locations = yaml.safe_load(fo)
        if locations is None:
            locations = {}

        try:
            result = {}
            problems_info = OrderedDict()
            for r in tqdm.tqdm(table):
                row = OrderedDict()
                problems = row.setdefault('problems', {})
                for k, v in list(r.items()):
                    c = v.attrs['class'].split()[0]
                    if c in ['problem', 'ioiprob']:
                        # A cell without a title must not abort parsing, and
                        # must not erase a name seen in an earlier row.
                        info = problems_info.setdefault(k, {'short': k})
                        if 'title' in v.attrs:
                            info['name'] = v.attrs['title']

                        if v.value != DOT:
                            p = problems.setdefault(k, {})

                            first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                            if len(first_ac):
                                p['first_ac'] = True

                            partial = v.column.node.xpath('self::td[@class="ioiprob"]/u')
                            if partial:
                                p['partial'] = True

                            value = v.value
                            if SPACE in value:
                                value, t = value.split(SPACE, 1)
                                p['time'] = t
                            p['result'] = value
                    else:
                        c = mapping_key.get(c, c)
                        row[c] = v.value
                        if xml_result and c == 'name':
                            # A name missing from the XML must not abort parsing.
                            problems.update(xml_result.get(v.value, {}))

                if 'penalty' not in row:
                    match = re.search(r'\s*\((?P<info>[^\)]*)\)\s*$', row['name'])
                    if match:
                        row['name'] = row['name'][:match.span()[0]]
                        group_info = match.group('info')
                        if u'класс' in group_info:
                            # "<grade>, <location>"; the location part may be
                            # absent, in which case only the grade is stored.
                            degree, _, loc_info = group_info.partition(',')
                            row['degree'] = degree.strip()
                        else:
                            loc_info = group_info

                        loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
                        if loc_info and loc_info not in locations:
                            try:
                                locations[loc_info] = {
                                    'ru': geocode(loc_info, language='ru').address,
                                    'en': geocode(loc_info, language='en').address,
                                }
                            except Exception:
                                # Best effort: any geocoding failure (including
                                # a lookup that yields no ``.address``) is
                                # cached as "unknown".
                                locations[loc_info] = None
                        address = locations.get(loc_info)
                        if address:
                            # Country is the last comma-separated component of
                            # the English address.
                            *_, country = map(str.strip, address['en'].split(','))
                            if country.startswith('The '):
                                country = country[4:]
                            row['country'] = country
                            if ', ' in address['ru']:
                                row['city'], *_ = map(str.strip, address['ru'].split(','))

                    n_solved = sum(1 for p in problems.values() if p.get('result') == '100')
                    row['solved'] = {'solving': n_solved}
                elif re.match('^[0-9]+$', row['penalty']):
                    row['penalty'] = int(row['penalty'])

                # Translate a one-letter diploma/medal mark into a medal name.
                # The first non-empty column wins; unknown marks are discarded.
                for f in 'diploma', 'medal':
                    medal = row.pop(f, None) or row.pop(f.title(), None)
                    if medal:
                        if medal in ['З', 'G']:
                            row['medal'] = 'gold'
                        elif medal in ['С', 'S']:
                            row['medal'] = 'silver'
                        elif medal in ['Б', 'B']:
                            row['medal'] = 'bronze'
                        break
                row['member'] = row['name'] + ' ' + season
                result[row['member']] = row
        finally:
            # Persist whatever was geocoded so far, even on failure.
            with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
                yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)

        standings = {
            'result': result,
            'problems': list(problems_info.values()),
        }
        return standings
Esempio n. 4
0
    def get_standings(self, users=None, statistics=None):
        """Build the standings dict from the HTML standings page.

        The page at ``self.standings_url`` is fetched and its
        ``class="standings"`` table is parsed row by row; cells are dispatched
        on the first word of their ``class`` attribute. The same URL with
        ``.html`` replaced by ``.xml`` is also requested; when that request
        raises ``FailOnGetResponse`` the XML data is simply not used.

        For rows without a ``penalty`` column, trailing information in the
        name (a ``(...)`` group or everything after the first comma) is split
        off, a numeric grade is stored as ``degree`` and the rest is geocoded
        to fill ``country`` / ``city``. Successful geocoding results are
        cached in ``self.LOCATION_CACHE_FILE``, which is rewritten at the end
        even when parsing fails.

        ``statistics``, when given, is a mapping keyed by member; a stored
        ``country`` is used as a default, and ``self.info['_detect_location']``
        may drive a further country guess from the name. ``users`` is accepted
        for interface compatibility and is not read.

        Returns ``{}`` when there is no standings URL, otherwise a dict with
        the keys ``result`` and ``problems``.
        """
        geolocator = Nominatim(user_agent="clist.by")
        geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, max_retries=3)

        year = self.start_time.year
        # Months before September are attributed to the season that started
        # in the previous calendar year.
        year = year if self.start_time.month >= 9 else year - 1
        season = '%d-%d' % (year, year + 1)

        if not self.standings_url:
            return {}

        try:
            standings_xml = REQ.get(self.standings_url.replace('.html', '.xml'), detect_charsets=False)
            xml_result = parse_xml(standings_xml)
        except FailOnGetResponse:
            xml_result = {}

        page = REQ.get(self.standings_url)

        regex = '<table[^>]*class="standings"[^>]*>.*?</table>'
        # NOTE: raises AttributeError when the page has no standings table
        # (behaviour kept from the original).
        html_table = re.search(regex, page, re.DOTALL).group(0)
        table = parsed_table.ParsedTable(html_table)
        # Cell class -> row key.
        mapping_key = {
            'rank': 'place',
            'rankl': 'place',
            'party': 'name',
            'solved': 'solving',
        }

        locations = None
        if os.path.exists(self.LOCATION_CACHE_FILE):
            with open(self.LOCATION_CACHE_FILE, 'r') as fo:
                locations = yaml.safe_load(fo)
        if locations is None:
            locations = {}

        # Lookups that failed during this call. They are deliberately not
        # written to the persistent cache (so a later run retries them), but
        # remembering them here avoids re-querying the rate-limited geocoder
        # for every row that carries the same unresolvable string.
        failed_lookups = set()

        def get_location(loc_info):
            """Return the cached/geocoded address dict for ``loc_info`` or None."""
            loc_info = re.sub(r'[.,\s]+', ' ', loc_info).strip().lower()
            if loc_info in locations:
                return locations[loc_info]
            if not loc_info or loc_info in failed_lookups:
                return None
            try:
                locations[loc_info] = {
                    'ru': geocode(loc_info, language='ru').address,
                    'en': geocode(loc_info, language='en').address,
                }
            except Exception:
                # Best effort: any geocoding failure (including a lookup that
                # yields no ``.address``) just means "unknown location".
                failed_lookups.add(loc_info)
                return None
            return locations[loc_info]

        def get_country(address):
            """Return the last component of the English address, sans 'The '."""
            *_, country = map(str.strip, address['en'].split(','))
            if country.startswith('The '):
                country = country[4:]
            return country

        try:
            result = {}
            problems_info = OrderedDict()
            for r in tqdm.tqdm(table):
                row = OrderedDict()
                problems = row.setdefault('problems', {})
                for k, v in list(r.items()):
                    c = v.attrs['class'].split()[0]
                    if c in ['problem', 'ioiprob']:
                        problems_info[k] = {'short': k}
                        if 'title' in v.attrs:
                            problems_info[k]['name'] = v.attrs['title']

                        if v.value != DOT:
                            p = problems.setdefault(k, {})

                            first_ac = v.column.node.xpath('.//*[@class="first-to-solve"]')
                            if len(first_ac):
                                p['first_ac'] = True

                            partial = v.column.node.xpath('self::td[@class="ioiprob"]/u')
                            if partial:
                                p['partial'] = True

                            value = v.value
                            if SPACE in value:
                                value, t = value.split(SPACE, 1)
                                p['time'] = t
                            p['result'] = value
                    else:
                        c = mapping_key.get(c, c).lower()
                        row[c] = v.value.strip()
                        if xml_result and c == 'name':
                            # A name missing from the XML must not abort parsing.
                            problems.update(xml_result.get(v.value, {}))

                        if c in ('diploma', 'medal'):
                            # Known one-letter marks become a medal name; any
                            # other non-empty value is kept under the
                            # lower-cased column header.
                            medal = row.pop(c, None)
                            if medal:
                                if medal in ['З', 'G']:
                                    row['medal'] = 'gold'
                                elif medal in ['С', 'S']:
                                    row['medal'] = 'silver'
                                elif medal in ['Б', 'B']:
                                    row['medal'] = 'bronze'
                                else:
                                    row[k.lower()] = medal
                # Full, untrimmed name; may be used for the member key below.
                name = row['name']

                if 'penalty' not in row:
                    for regex_info in (
                        r'\s*\((?P<info>[^\)]*)\)\s*$',
                        r',(?P<info>.*)$',
                    ):
                        match = re.search(regex_info, row['name'])
                        if not match:
                            continue

                        row['name'] = row['name'][:match.span()[0]]
                        if ',' in row['name']:
                            row['name'] = re.sub(r'[\s,]+', ' ', row['name'])

                        group_info = match.group('info')

                        infos = [s.strip() for s in group_info.split(',')]

                        # The first purely numeric item (optionally followed
                        # by "класс") is the grade; the rest is location text.
                        loc_infos = []
                        for info in infos:
                            if 'degree' not in row:
                                degree_match = re.match(r'^(?P<class>[0-9]+)(?:\s*класс)?$', info, re.IGNORECASE)
                                if degree_match:
                                    row['degree'] = int(degree_match.group('class'))
                                    continue
                            loc_infos.append(info)

                        if not loc_infos:
                            break

                        # Try the full location first, then progressively drop
                        # trailing components until something resolves.
                        for n_parts in range(len(loc_infos), 0, -1):
                            address = get_location(', '.join(loc_infos[:n_parts]))
                            if address:
                                break
                        else:
                            address = None

                        if address:
                            row['country'] = get_country(address)
                            if ', ' in address['ru']:
                                row['city'], *_ = map(str.strip, address['ru'].split(','))
                        break

                    n_solved = sum(1 for p in problems.values() if p.get('result') == '100')
                    row['solved'] = {'solving': n_solved}
                elif re.match('^[0-9]+$', row['penalty']):
                    row['penalty'] = int(row['penalty'])

                if self.resource.info.get('statistics', {}).get('key_as_full_name'):
                    row['member'] = name + ' ' + season
                else:
                    row['member'] = row['name'] + ' ' + season

                addition = (statistics or {}).get(row['member'], {})
                if addition:
                    country = addition.get('country')
                    if country:
                        row.setdefault('country', country)
                    detect_location = self.info.get('_detect_location')
                    if 'country' not in row and detect_location:
                        match = re.search(detect_location['regex'], row['name'])
                        if match:
                            loc = match.group('location')
                            split = detect_location.get('split')
                            locs = loc.split(split) if split else [loc]

                            # Accept the guess only when every resolvable part
                            # points to the same country.
                            countries = defaultdict(int)
                            for loc in locs:
                                address = get_location(loc)
                                if address:
                                    country = get_country(address)
                                    countries[country] += 1
                            if len(countries) == 1:
                                country = list(countries.keys())[0]
                                row['country'] = country

                result[row['member']] = row
        finally:
            # Persist whatever was geocoded so far, even on failure.
            with open(self.LOCATION_CACHE_FILE, 'wb') as fo:
                yaml.dump(locations, fo, encoding='utf8', allow_unicode=True)

        standings = {
            'result': result,
            'problems': list(problems_info.values()),
        }
        return standings