Python get_pageの例、HtmlParser.common.SiteConnector.get_page Pythonの例

コード例 #1

0

ファイルを表示

ファイル: LeaguePage.py プロジェクト: DeyLak/dotabuf-parser

    def parse(self):
        """Parse every year linked from the league page.

        Fetches the page for ``self.league``, walks the children of the first
        ``#content table`` element (skipping the first child) and, for each
        one, parses the linked year page with ``YearPage``.

        Returns:
            ``self.new_parsed_data``, with one entry per year link: the key is
            the link text and the value is whatever ``YearPage.parse()``
            returned.

        Raises:
            IndexError: if the page has no ``#content table`` element or a
                row lacks the expected nested link element.
        """
        print('Parsing league', self.league)
        page = get_page(get_league_url(self.league))

        rows = page.getroot().cssselect('#content table')[0].getchildren()
        # The first child is skipped (presumably a header row - confirm
        # against the live markup).
        for row in rows[1:]:
            # First grandchild of the row is the element carrying the href.
            year_link = row.getchildren()[0].getchildren()[0]
            href = year_link.get('href')
            self.new_parsed_data[year_link.text] = YearPage(href).parse()

        return self.new_parsed_data

コード例 #2

0

ファイルを表示

    def parse(self):
        """Download the match page and run all parsing steps on it.

        Matches listed in ``BAD_MATCHES`` are not downloaded at all.

        Returns:
            An empty list for a match in ``BAD_MATCHES``; otherwise the value
            of ``self.get_result_data()`` after the team results, the winner
            and the match info have been parsed from ``self.page``.
        """
        print('Parsing match', self.match_id)

        # Known-bad matches are skipped before any network access happens.
        if self.match_id in BAD_MATCHES:
            print('Bad match!')
            return []

        self.page = get_page(get_match_url(self.match_id))

        # The steps run in this fixed order, each working on self.page.
        self.parse_team_results()
        self.parse_winner()
        self.parse_match_info()

        return self.get_result_data()

コード例 #3

0

ファイルを表示

    def parse(self):
        """Parse the paginated teams table, following the pagination link.

        Starting from ``self.offset``, each page is fetched and every row of
        the ``.card > table > tbody`` element is turned into a list of ten
        values that is appended to ``self.new_parsed_data``. The offset of the
        following page is read from the pagination block; parsing stops when
        that link carries no ``offset=<digits>`` parameter.

        Returns:
            ``self.new_parsed_data`` with one list appended per table row.

        Raises:
            IndexError: if the table or the pagination block is missing, or a
                row has fewer cells/children than expected.
        """
        while self.offset is not None:
            print('Parsing offset', self.offset)

            page_url = get_teams_url(self.season, self.year_stamp, self.offset)
            root = get_page(page_url).getroot()

            body_rows = root.cssselect('.card > table > tbody')[0].getchildren()
            for row in body_rows:
                record = [
                    # Team name: text of the second child of the first cell.
                    row[0].getchildren()[1].text,
                    # Country: "title" attribute nested two levels into cell 1.
                    row[1].getchildren()[0].getchildren()[0].get('title'),
                ]
                # Cells 2-5 and 7-8 keep their value in a child element,
                # cell 6 holds it directly as text.
                record.extend(
                    row[col].getchildren()[0].text for col in range(2, 6))
                record.append(row[6].text)
                record.extend(
                    row[col].getchildren()[0].text for col in range(7, 9))
                # The last column goes through parse_money.
                record.append(parse_money(row[9].text))
                self.new_parsed_data.append(record)

            # The second child of the pagination block wraps the link whose
            # href is searched for the next offset.
            pagination = root.cssselect('.card-footer > .pagination')[0]
            next_href = pagination.getchildren()[1].getchildren()[0].get('href')
            next_offset = re.search(r'offset=(\d+)', next_href)
            if next_offset is None:
                break
            # Stored as the matched string, not converted to int.
            self.offset = next_offset.group(1)

        return self.new_parsed_data

コード例 #4

0

ファイルを表示

ファイル: YearPage.py プロジェクト: DeyLak/dotabuf-parser

    def parse(self):
        """Parse the standings table of one year page.

        Returns:
            A list with one dict per row of the ``.standings`` table, keyed by
            the entries of ``PROPERTIES`` in cell order; an empty list when
            the page has no ``.standings`` element.

        Raises:
            IndexError: if a row has more cells than ``PROPERTIES`` has
                entries, or the table lacks the expected structure.
        """
        print('Parsing year', self.year_url)
        page = get_page(get_year_url(self.year_url))

        tables = page.getroot().cssselect('.standings')
        if not tables:
            return []

        # Rows are the children of the table's second child (presumably the
        # tbody - confirm against the markup).
        body_rows = tables[0].getchildren()[1].getchildren()

        data = []
        for row in body_rows:
            cells = row.getchildren()
            # The first cell keeps its value inside a child element ...
            first_value = cells[0].getchildren()[0].text
            record = {PROPERTIES[0]: first_value}
            # ... while the remaining cells hold their text directly.
            for position, cell in enumerate(cells[1:], start=1):
                record[PROPERTIES[position]] = cell.text
            data.append(record)

        return data

コード例 #5

0

ファイルを表示

    def parse(self):
        """Derive a team name from the first Wikipedia link on a search page.

        The search page for ``self.team_name`` is fetched and the first link
        matching ``.b_algo a[href^="https://en.wikipedia"]`` is taken. Its
        text content is cleaned up in four steps: the ``- Wikipedia`` suffix
        is removed, the text is URL-unquoted, a leading English-Wikipedia URL
        prefix is removed, and underscores become spaces.

        Returns:
            The cleaned-up name as a string.

        Raises:
            IndexError: if the page contains no matching link.
        """
        search_page = get_page(get_team_search_page_url(self.team_name))

        # Only the first matching result is considered.
        first_link = search_page.getroot().cssselect(
            '.b_algo a[href^="https://en.wikipedia"]')[0]

        title = first_link.text_content().replace('- Wikipedia', '')
        # The link text may itself be a percent-encoded wiki URL, hence the
        # unquote and prefix removal (NOTE(review): inferred from the
        # replacements below - confirm with real search output).
        title = unquote(title)
        title = title.replace('https://en.wikipedia.org/wiki/', '')
        return title.replace('_', ' ')

コード例 #6

0

ファイルを表示

    def parse(self):
        """Walk the team's match pages and parse every match not seen before.

        ``self.page_number`` is incremented before each request, so paging
        starts at the page following its initial value. For each row of the
        ``.recent-esports-matches`` table the match id is taken from the link
        in the row, and a ``MatchPage`` is parsed for ids that are not already
        in ``self.parsed_data``.

        Paging stops when:
          * a page has no ``.recent-esports-matches`` table,
          * a match older than ``PATCH_700_DATE`` is reached, or
          * the "<from> - <to> of <total>" counter reports that the last
            entry of the page is also the last entry overall.

        Returns:
            ``self.new_parsed_data``: match id -> result of
            ``MatchPage.parse()`` for every newly parsed match.

        Raises:
            IndexError: if the page lacks the ``div.viewport`` counter, its
                text does not match the expected pattern, or a row lacks the
                expected cells.
        """
        print('Parsing team', self.team_id)
        while self.page_number is not None:
            self.page_number += 1
            url = get_teams_matches_url(self.team_id, self.page_number)
            self.page = get_page(url)

            matches_table = self.page.getroot().cssselect(
                '.recent-esports-matches')
            if not matches_table:
                break
            # Rows are the children of the table's second child.
            matches_table = matches_table[0].getchildren()[1].getchildren()
            for row in matches_table:
                current_td = row.getchildren()[1].getchildren()
                # NOTE(review): `parse` here resolves to a module-level name,
                # not this method - presumably a date parser such as
                # dateutil's; confirm against the file's imports.
                current_match_date = parse(
                    current_td[1].getchildren()[0].get('datetime'))
                if PATCH_700_DATE > current_match_date:
                    # Older than the cut-off date: stop paging after this
                    # page and ignore the remaining rows.
                    self.page_number = None
                    break
                current_match_id = current_td[0].getchildren()[0].get(
                    'href').split('/')[-1]
                # Checking for match full players data, for some reason some
                # matches have only one team
                opponent_team_td = row.getchildren()[5]
                if len(opponent_team_td.getchildren()[0].getchildren()) == 0:
                    print(current_match_id, 'doesn\'t have opponents team')
                    continue

                if current_match_id not in self.parsed_data:
                    new_match = MatchPage(current_match_id)
                    self.new_parsed_data[current_match_id] = new_match.parse()

            table_counter = self.page.getroot().cssselect(
                'div.viewport')[0].text
            # Raw string: '\d' in a normal string literal is an invalid escape
            # sequence (a SyntaxWarning on Python 3.12+). The pattern itself
            # is unchanged.
            counters = re.findall(r'\d+ - (\d+) of (\d+)', table_counter)
            # Captured groups are (last shown entry, total entries); equal
            # values mean this was the final page.
            if counters[0][0] == counters[0][1]:
                self.page_number = None
        return self.new_parsed_data

コード例 #7

0

ファイルを表示

ファイル: index.py プロジェクト: DeyLak/dotabuf-parser

import time
from urllib.error import HTTPError

from HtmlParser.DotabuffParser.dotabuffParser.CsvWriter import *
from HtmlParser.DotabuffParser.dotabuffParser.DotabuffUrls import get_teams_url
from HtmlParser.DotabuffParser.dotabuffParser.TeamsMatchesPage import TeamsMatchesPage
from HtmlParser.common.SiteConnector import get_page

# Script body: collect the team ids listed on the teams page, then parse the
# matches of every team with TeamsMatchesPage.
# NOTE(review): this snippet looks truncated after the `except` branch below -
# `time`, `should_wait` and `parsed_ids` are never used/updated in the visible
# part. Confirm against the full file before relying on these comments.

# start_new_project is not imported by name, so it has to come from the
# CsvWriter star import (presumably it prepares the output - confirm).
start_new_project()

teams_url = get_teams_url()
teams_page = get_page(teams_url)

# Team rows are the children of the second child of the first matching table.
table_selector = 'article.r-tabbed-table table'
match_info_table = teams_page.getroot().cssselect(table_selector)[0].getchildren()[1].getchildren()
teams_ids = []
for team in match_info_table:
    # The team id is the last path segment of the link in the row's second cell.
    teams_ids.append(team.getchildren()[1].getchildren()[0].get('href').split('/')[-1])

# parsed_data accumulates the results of all teams and is handed to every
# TeamsMatchesPage so already parsed matches can be skipped.
parsed_data = {}
new_parsed_data = {}
should_wait = False
parsed_ids = []
for team_id in teams_ids:
    # Retry loop: keeps working on the same team until its id shows up in
    # parsed_ids.
    # NOTE(review): nothing visible here appends to parsed_ids, so as shown
    # this loop cannot terminate - the missing tail of the snippet presumably
    # does the append and the waiting; verify.
    while team_id not in parsed_ids:
        try:
            new_page = TeamsMatchesPage(team_id, parsed_data)
            new_parsed_data = new_page.parse()
            parsed_data.update(new_parsed_data)
        except HTTPError as e:
            # An HTTP error is only reported; the while loop then retries.
            print(e, 'Let\'s wait')

コード例 #8

0

ファイルを表示

ファイル: LeaguePage.py プロジェクト: DeyLak/dotabuf-parser

    def parse(self):
        """Parse transfer data of every team for each season and window.

        Iterates the years ``range(FIRST_SEASON, LAST_SEASON, -1)`` and, for
        each entry of ``TRANSFER_WINDOWS``, fetches the league page and
        inspects every ``.box`` element that contains a ``.responsive-table``.
        For each such team a flat list is built:

          * two values per transfer type taken from the matching
            ``.transfer-zusatzinfo-box`` (``NO_DATA`` twice when absent),
          * then, per transfer type, ``VALUABLE_TRANSFERS_COUNT`` values taken
            from the sixth cell of the first table rows (``NO_DATA`` for a
            missing row or a row that does not have exactly nine cells).

        The list is stored under
        ``self.new_parsed_data[season][team_name][transfer_window]``, where
        the season key has the form ``"YY/YY"``. Data of the
        ``SEASON_WINTER`` window is filed under the following season's key.

        Returns:
            ``self.new_parsed_data``.

        Raises:
            IndexError: if a team box has fewer transfer tables or info
                children than expected.
        """
        print('Parsing league', self.league)
        for year in range(FIRST_SEASON, LAST_SEASON, -1):
            print('Season', year)
            # Season keys are built from the last digits of two consecutive
            # years, e.g. 2016 -> "16/17" and "17/18".
            season_label = '/'.join((str(year)[2:], str(year + 1)[2:]))
            next_season_label = '/'.join((str(year + 1)[2:], str(year + 2)[2:]))
            for label in (season_label, next_season_label):
                if label not in self.new_parsed_data:
                    self.new_parsed_data[label] = {}

            for transfer_window in TRANSFER_WINDOWS:
                print('Window', transfer_window)
                page_url = get_league_url(self.league, year, transfer_window)
                boxes = get_page(page_url).getroot().cssselect('.box')

                for box in boxes:
                    # Only boxes holding a transfer table are treated as teams.
                    if not box.cssselect('.responsive-table'):
                        continue

                    raw_name = box.getchildren()[0].getchildren()[1].text
                    team_name = get_team_name(raw_name, self.league)
                    print('Team', team_name)

                    team_info = []

                    # One summary selector per transfer type: arrivals first,
                    # departures second.
                    summary_selectors = [
                        '.responsive-table + .transfer-zusatzinfo-box',
                        '.responsive-table > .transfer-zusatzinfo-box',
                    ]
                    for type_index in range(len(TRANSFERS_TYPES)):
                        summary = box.cssselect(summary_selectors[type_index])
                        if len(summary) == 0:
                            team_info.extend([NO_DATA, NO_DATA])
                        else:
                            # Second and third child carry the money values.
                            summary_parts = summary[0].getchildren()
                            team_info.extend([
                                parse_money(summary_parts[1].text),
                                parse_money(summary_parts[2].text),
                            ])

                    tables = box.cssselect('.responsive-table > table > tbody')
                    for type_index in range(len(TRANSFERS_TYPES)):
                        rows = tables[type_index].getchildren()
                        for slot in range(VALUABLE_TRANSFERS_COUNT):
                            cells = None
                            if slot < len(rows):
                                cells = rows[slot].getchildren()
                            # Missing rows and rows without exactly nine cells
                            # are padded so every team list has equal length.
                            if cells is None or len(cells) != 9:
                                team_info.append(NO_DATA)
                            else:
                                team_info.append(parse_money(cells[5].text))

                    # Make sure the team has an entry under both season keys.
                    for label in (season_label, next_season_label):
                        season_bucket = self.new_parsed_data[label]
                        if team_name not in season_bucket:
                            season_bucket[team_name] = {}

                    target_season = (
                        next_season_label
                        if transfer_window == SEASON_WINTER
                        else season_label)
                    team_entry = self.new_parsed_data[target_season][team_name]
                    team_entry[transfer_window] = team_info

        return self.new_parsed_data