def parse(self):
    """Parse every year linked from the league page.

    Fetches the page for ``self.league``, takes the children of the first
    element matched by ``#content table`` and, for each of them except the
    first, follows the link found in the first child of its first cell and
    parses it with ``YearPage``.

    Returns:
        ``self.new_parsed_data``, with one entry per row keyed by the text
        of that row's link and holding the result of ``YearPage.parse()``.
    """
    print('Parsing league', self.league)
    league_root = get_page(get_league_url(self.league)).getroot()
    table_rows = league_root.cssselect('#content table')[0].getchildren()
    # The first child is skipped (presumably a header row -- confirm
    # against the page markup).
    for table_row in table_rows[1:]:
        year_link = table_row.getchildren()[0].getchildren()[0]
        year_result = YearPage(year_link.get('href')).parse()
        self.new_parsed_data[year_link.text] = year_result
    return self.new_parsed_data
def parse(self):
    """Fetch one match page and run the individual parsing steps on it.

    Returns:
        An empty list when ``self.match_id`` is listed in ``BAD_MATCHES``
        (no page is fetched in that case); otherwise the value of
        ``self.get_result_data()`` after the page has been stored in
        ``self.page`` and the team results, winner and match info have
        been parsed, in that order.
    """
    print('Parsing match', self.match_id)
    if self.match_id not in BAD_MATCHES:
        self.page = get_page(get_match_url(self.match_id))
        self.parse_team_results()
        self.parse_winner()
        self.parse_match_info()
        return self.get_result_data()
    print('Bad match!')
    return []
def parse(self):
    """Parse every page of the teams table, following the pagination.

    Starting at ``self.offset``, each page is fetched and every row of the
    table matched by ``.card > table > tbody`` is flattened into one list:
    the text of the second child of cell 0, the ``title`` of the element
    nested in cell 1, the first-child text of cells 2-5, the text of
    cell 6, the first-child text of cells 7-8 and ``parse_money`` applied
    to the text of cell 9.

    The offset of the following page is taken from the ``offset=`` query
    parameter of the link inside the second pagination item. Parsing stops
    when that link has no ``href`` or its ``href`` carries no offset.

    Returns:
        ``self.new_parsed_data`` with one list appended per table row.
    """
    while self.offset is not None:
        print('Parsing offset', self.offset)
        page_root = get_page(
            get_teams_url(self.season, self.year_stamp, self.offset)).getroot()

        teams_table = page_root.cssselect(
            '.card > table > tbody')[0].getchildren()
        for row in teams_table:
            new_data = [
                row[0].getchildren()[1].text,
                row[1].getchildren()[0].getchildren()[0].get('title'),
            ]
            new_data.extend(row[i].getchildren()[0].text for i in range(2, 6))
            new_data.append(row[6].text)
            new_data.extend(row[i].getchildren()[0].text for i in range(7, 9))
            new_data.append(parse_money(row[9].text))
            self.new_parsed_data.append(new_data)

        pagination = page_root.cssselect('.card-footer > .pagination')[0]
        next_button_url = pagination.getchildren()[1].getchildren()[0].get(
            'href')
        # ``get`` returns None when the link has no href attribute; passing
        # None to re.search raises TypeError, so fall back to an empty
        # string and let the "no offset found" exit below handle it.
        offset = re.search(r'offset=(\d+)', next_button_url or '')
        if offset is None:
            break
        self.offset = offset.group(1)
    return self.new_parsed_data
def parse(self):
    """Parse the standings table of one year page.

    Fetches the page for ``self.year_url`` and reads the children of the
    second child of the first ``.standings`` element. Each of them becomes
    a dict keyed by the entries of ``PROPERTIES`` in cell order: the first
    value is the text of the first cell's first child, the remaining
    values are the cells' own text.

    Returns:
        A list with one dict per row, or an empty list when the page has
        no ``.standings`` element.
    """
    print('Parsing year', self.year_url)
    page_root = get_page(get_year_url(self.year_url)).getroot()
    found_tables = page_root.cssselect('.standings')
    if not found_tables:
        return []

    data = []
    for table_row in found_tables[0].getchildren()[1].getchildren():
        cells = table_row.getchildren()
        record = {}
        # Only the first cell keeps its text inside a nested element.
        record[PROPERTIES[0]] = cells[0].getchildren()[0].text
        for position, cell in enumerate(cells[1:], start=1):
            record[PROPERTIES[position]] = cell.text
        data.append(record)
    return data
def parse(self):
    """Derive a team name from the first Wikipedia link on a search page.

    Fetches the search page for ``self.team_name`` and takes the first
    anchor inside ``.b_algo`` whose ``href`` starts with
    ``https://en.wikipedia``. Its text content is cleaned up by removing
    ``- Wikipedia``, unquoting it, stripping the
    ``https://en.wikipedia.org/wiki/`` prefix and replacing underscores
    with spaces.

    Returns:
        The cleaned-up name as a string.
    """
    search_root = get_page(
        get_team_search_page_url(self.team_name)).getroot()
    wiki_anchor = search_root.cssselect(
        '.b_algo a[href^="https://en.wikipedia"]')[0]

    name = wiki_anchor.text_content().replace('- Wikipedia', '')
    name = unquote(name).replace('https://en.wikipedia.org/wiki/', '')
    return name.replace('_', ' ')
def parse(self):
    """Parse the match pages of one team, page by page.

    ``self.page_number`` is incremented before each fetch. For every row
    of the ``.recent-esports-matches`` table the match date and match id
    are read; matches whose id is not already in ``self.parsed_data`` are
    parsed with ``MatchPage`` and stored in ``self.new_parsed_data``.

    Paging ends when
      * a page has no ``.recent-esports-matches`` table,
      * a match older than ``PATCH_700_DATE`` is reached, or
      * the ``div.viewport`` counter ("<from> - <to> of <total>") shows
        that the last entry has been reached. A missing or unreadable
        counter is treated the same way instead of raising.

    Returns:
        ``self.new_parsed_data``, keyed by match id.
    """
    print('Parsing team', self.team_id)
    while self.page_number is not None:
        self.page_number += 1
        url = get_teams_matches_url(self.team_id, self.page_number)
        self.page = get_page(url)
        page_root = self.page.getroot()

        matches_table = page_root.cssselect('.recent-esports-matches')
        if not matches_table:
            break
        matches_table = matches_table[0].getchildren()[1].getchildren()

        for row in matches_table:
            current_td = row.getchildren()[1].getchildren()
            # ``parse`` here is the module-level name (presumably a date
            # parser such as dateutil's -- confirm), not this method.
            current_match_date = parse(
                current_td[1].getchildren()[0].get('datetime'))
            if PATCH_700_DATE > current_match_date:
                self.page_number = None
                break
            current_match_id = current_td[0].getchildren()[0].get(
                'href').split('/')[-1]
            # Some matches list only one team; skip them because the
            # opponent's data is missing.
            opponent_team_td = row.getchildren()[5]
            if len(opponent_team_td.getchildren()[0].getchildren()) == 0:
                print(current_match_id, 'doesn\'t have opponents team')
                continue
            if current_match_id not in self.parsed_data:
                new_match = MatchPage(current_match_id)
                self.new_parsed_data[current_match_id] = new_match.parse()

        viewport = page_root.cssselect('div.viewport')
        table_counter = viewport[0].text if viewport else None
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        counters = re.findall(r'\d+ - (\d+) of (\d+)', table_counter or '')
        # Without a readable counter there is no way to tell whether more
        # pages exist, so stop rather than fail on counters[0].
        if not counters or counters[0][0] == counters[0][1]:
            self.page_number = None
    return self.new_parsed_data
import time
from urllib.error import HTTPError

from HtmlParser.DotabuffParser.dotabuffParser.CsvWriter import *
from HtmlParser.DotabuffParser.dotabuffParser.DotabuffUrls import get_teams_url
from HtmlParser.DotabuffParser.dotabuffParser.TeamsMatchesPage import TeamsMatchesPage
from HtmlParser.common.SiteConnector import get_page

# Driver script: read the team ids from the teams page, then parse the
# matches of every team and merge the results into ``parsed_data``.

# Presumably provided by the CsvWriter star import -- confirm.
start_new_project()

teams_url = get_teams_url()
teams_page = get_page(teams_url)
table_selector = 'article.r-tabbed-table table'
# Children of the second child of the first matched table, one per team.
match_info_table = teams_page.getroot().cssselect(table_selector)[0].getchildren()[1].getchildren()

# A team id is the last path segment of the link in the row's second cell.
teams_ids = []
for team in match_info_table:
    teams_ids.append(team.getchildren()[1].getchildren()[0].get('href').split('/')[-1])

# ``parsed_data`` accumulates every parsed match and is handed to each
# TeamsMatchesPage together with the team id.
parsed_data = {}
new_parsed_data = {}
should_wait = False
parsed_ids = []

for team_id in teams_ids:
    # Retry the same team until its id shows up in ``parsed_ids``.
    # NOTE(review): nothing visible here appends to ``parsed_ids``, and
    # ``time`` / ``should_wait`` are unused, so as shown this loop never
    # ends; the tail of the script looks cut off -- confirm against the
    # full file.
    while team_id not in parsed_ids:
        try:
            new_page = TeamsMatchesPage(team_id, parsed_data)
            new_parsed_data = new_page.parse()
            parsed_data.update(new_parsed_data)
        except HTTPError as e:
            # Only HTTP errors are caught; anything else propagates.
            print(e, 'Let\'s wait')
def parse(self):
    """Collect per-team transfer figures for every season and window.

    For each year in ``range(FIRST_SEASON, LAST_SEASON, -1)`` and each
    entry of ``TRANSFER_WINDOWS`` the league page is fetched and every
    ``.box`` containing a ``.responsive-table`` is treated as one team.

    The list stored per team is built in this order:
      1. for each transfer type, two ``parse_money`` values read from the
         second and third child of the matching summary box, or
         ``NO_DATA`` twice when the box is absent;
      2. for each transfer type, ``VALUABLE_TRANSFERS_COUNT`` values read
         from the sixth cell of the leading table rows, with ``NO_DATA``
         for a missing row or a row that does not have exactly nine cells.

    Results are stored under ``"YY/YY"`` season keys; the
    ``SEASON_WINTER`` window is filed under the following season's key.

    Returns:
        ``self.new_parsed_data`` as ``{season: {team: {window: list}}}``.
    """
    print('Parsing league', self.league)

    # Selector order follows the transfer types: arrivals, then departures.
    arrivals_selector = '.responsive-table + .transfer-zusatzinfo-box'
    departures_selector = '.responsive-table > .transfer-zusatzinfo-box'
    summary_selectors = [arrivals_selector, departures_selector]

    for year in range(FIRST_SEASON, LAST_SEASON, -1):
        print('Season', year)
        season_key = '/'.join((str(year)[2:], str(year + 1)[2:]))
        next_season_key = '/'.join((str(year + 1)[2:], str(year + 2)[2:]))
        self.new_parsed_data.setdefault(season_key, {})
        self.new_parsed_data.setdefault(next_season_key, {})

        for window in TRANSFER_WINDOWS:
            print('Window', window)
            page_root = get_page(
                get_league_url(self.league, year, window)).getroot()

            for box in page_root.cssselect('.box'):
                # Boxes without a transfer table are not team boxes.
                if not box.cssselect('.responsive-table'):
                    continue

                team_name = get_team_name(
                    box.getchildren()[0].getchildren()[1].text, self.league)
                print('Team', team_name)
                team_info = []

                # Part 1: the two summary figures per transfer type.
                for type_index in range(len(TRANSFERS_TYPES)):
                    summary = box.cssselect(summary_selectors[type_index])
                    if summary:
                        summary_cells = summary[0].getchildren()
                        team_info.append(parse_money(summary_cells[1].text))
                        team_info.append(parse_money(summary_cells[2].text))
                    else:
                        team_info.append(NO_DATA)
                        team_info.append(NO_DATA)

                # Part 2: the leading rows of each transfer table.
                tables = box.cssselect('.responsive-table > table > tbody')
                for type_index in range(len(TRANSFERS_TYPES)):
                    player_rows = tables[type_index].getchildren()
                    for position in range(VALUABLE_TRANSFERS_COUNT):
                        if position < len(player_rows):
                            player_cells = player_rows[position].getchildren()
                        else:
                            player_cells = ()
                        if len(player_cells) == 9:
                            team_info.append(
                                parse_money(player_cells[5].text))
                        else:
                            team_info.append(NO_DATA)

                # Both seasons get an entry for the team, even though only
                # one of them receives this window's data.
                self.new_parsed_data[season_key].setdefault(team_name, {})
                self.new_parsed_data[next_season_key].setdefault(
                    team_name, {})
                target_season = (
                    next_season_key if window == SEASON_WINTER
                    else season_key)
                self.new_parsed_data[target_season][team_name][
                    window] = team_info
    return self.new_parsed_data