class PlayersScraper:

    def __init__(self):
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        self.base_url = 'http://www.laliga.es/estadisticas-historicas/plantillas/'
        self.sender = Sender()
        self.sender.set_debug_level(2)
        self.collection = 'players'

    def get_teams(self, season):
        response = self.sender.get(self.base_url + season, {})
        result = []
        if response != '':
            html = BeautifulSoup(response, 'html.parser')
            for item in html.find('select', {'id': 'select_equipos_estadisticas_historicas'}).find_all('option'):
                team = item['value']
                if team != '':
                    result.append(team)
        return result

    def scrape_season(self, season):
        teams = self.get_teams(season)
        for team in teams:
            self.scrape_team(team, season)

    def scrape_team(self, team, season):
        self.logger.debug('Processing ' + team + ' ' + season)
        response = self.sender.get(self.base_url + season + '/' + team, {})
        if response != '':
            html = BeautifulSoup(response, 'lxml')
            container = html.find('div', {'class': 'container main table clearfix'})
            for row in container.find('tbody').find_all('tr'):
                player_entry = {
                    'player': row.find('td').getText(),
                    'team': team,
                    'season': season
                }
                self.mongo_wrapper.write_dictionary(self.collection, player_entry)
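
# Minimal usage sketch: scrape the squads of one season. Assumes the Logger,
# Sender and PrefixedMongoWrapper helpers used above are importable in this
# module and that a MongoDB instance is reachable; the season slug follows the
# 'primera/2016-17' convention used elsewhere in the project.
def scrape_primera_squads_example():
    scraper = PlayersScraper()
    # Writes one 'players' document per squad member of every team of the season.
    scraper.scrape_season('primera/2016-17')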
def __init__(self):
    self.logger = Logger(2)
    self.mongo_wrapper = PrefixedMongoWrapper('aggregated_match_results')
    self.mapper = TeamsNormalizer()
    self.template = {
        'season': 'primera/2017-18',
        'day_num': '',
        'home': '',
        'away': '',
        'score_home': '',
        'score_away': ''
    }
class LineUpManager:

    def __init__(self):
        self.logger = Logger()
        self.normalizer = PlayerNormalizer()

    def create_by_match_id(self, match_id, team):
        mongo_wrapper = PrefixedMongoWrapper('laliga_web_primera')
        stats = mongo_wrapper.get_collection('popups_matches_stats').find(
            {'match_id': match_id, 'team': team, 'main_lineup': True})
        result = []
        for stat in stats:
            player_id = self.normalizer.find_player_id('master', stat['player'])
            if player_id != '':
                result.append(str(player_id))
            else:
                self.logger.log(400, 'Unmatched player ' + stat['player'])
        if len(result) != 11:
            self.logger.error(100, 'Lineup of ' + str(len(result)) + ' players')
        return self._generate_key(result)

    def _generate_key(self, player_ids):
        result = player_ids.copy()
        result.sort()
        return '_'.join(result)

    def create_by_list(self, players):
        result = []
        for player in players:
            player_id = self.normalizer.find_player_id('marca', player)
            if player_id != '':
                result.append(str(player_id))
            else:
                self.logger.log(400, 'Unmatched player ' + player)
        return self._generate_key(result)
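
# Minimal usage sketch of the lineup key: create_by_list() maps each name to a
# player id and joins the sorted ids with '_', so two lineups with the same
# eleven players produce the same key regardless of order. The names below are
# placeholders; real ones must exist in the players_mapping.csv that
# PlayerNormalizer loads.
def lineup_key_example():
    manager = LineUpManager()
    players_from_marca = ['Player A', 'Player B', 'Player C']  # hypothetical names
    return manager.create_by_list(players_from_marca)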
class MatchDetailsScraper:

    def __init__(self, url):
        self.url = url
        self.logger = Logger(2)

    def extract_lineups(self):
        sender = Sender()
        sender.set_debug_level(2)
        response = sender.get(self.url, {})
        self.logger.debug('Processing lineups of ' + self.url)
        html = BeautifulSoup(response, 'html.parser')
        result = {'home': '', 'away': ''}
        teams_container_main = html.find('section', {'class': 'laliga-fantasy columna2'})
        if teams_container_main is not None:
            teams_container = teams_container_main.find_all('section')[:2]
            home_container = teams_container[0]
            away_container = teams_container[1]
            result = {
                'home': self._extract_players(home_container),
                'away': self._extract_players(away_container)
            }
        else:
            self.logger.error(500, 'No lineup found in ' + self.url)
        return result

    def _extract_players(self, team_container):
        result = []
        for player in team_container.find_all('li'):
            result.append(player.getText().strip())
        #self.logger.debug('Retrieving ' + str(result))
        if len(result) != 11:
            self.logger.error(100, 'Team with ' + str(len(result)) + ' players')
        return result
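
# Minimal usage sketch: download one match page and extract both lineups.
# Assumes Sender and Logger are available as above; the URL is a placeholder
# for a real marca.com match page.
def extract_lineups_example():
    scraper = MatchDetailsScraper('http://www.marca.com/some-match.html')  # hypothetical URL
    lineups = scraper.extract_lineups()
    # lineups is a dict such as {'home': [... 11 names ...], 'away': [... 11 names ...]}
    return lineups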
class PlayerNormalizer:

    def __init__(self):
        self.logger = Logger(2)
        self.default_csv_filename = './players_mapping.csv'
        self.loaded = False

    def find_player_id(self, source, player):
        self._init_data()
        results_indexes = self.data['master'].index[self.data[source] == player]
        if len(results_indexes) > 1:
            self.logger.error(300, 'More than one candidate (' + str(len(results_indexes)) + '): ' + player)
        for result in results_indexes:
            return result
        self.logger.error(100, 'Cannot find map for ' + source + ': ' + player)
        return ''

    def _init_data(self):
        if not self.loaded:
            self.logger.debug('Loading map file')
            if not os.path.isfile(self.default_csv_filename):
                self.init_map_file()
            self.data = pd.read_csv(self.default_csv_filename)
            self.loaded = True

    def init_map_file(self):
        self.logger.debug('Generating master')
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        # Data taken from the "plantillas" (squads) section
        #result = mongo_wrapper.get_collection('players').find({'season': 'primera/2016-17'}).distinct('player')
        #result += mongo_wrapper.get_collection('players').find({'season': 'segunda/2016-17'}).distinct('player')
        # The 1928-29 season players are missing, so we merge in the primera and segunda match results
        result = mongo_wrapper.get_collection('players').distinct('player')
        result += mongo_wrapper.get_collection('primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection('segunda_popups_matches_stats').distinct('player')
        self.logger.debug('Done')
        data = {'master': list(set(result))}
        self.save_csv(data)

    def _get_marca_list(self):
        result = []
        mongo_wrapper = PrefixedMongoWrapper('marca_current_season')
        for day in mongo_wrapper.get_collection('results').find({"results.home_lineup": {"$exists": True}}):
            for match in day['results']:
                result += match['home_lineup']
                result += match['away_lineup']
        return list(set(result))

    def normalize(self):
        self._init_data()
        self.logger.debug('Normalizing data...')
        return self._normalize_one('marca', self._get_marca_list())

    def save_csv(self, result):
        self.logger.debug('Creating ' + self.default_csv_filename)
        csv_filename = self.default_csv_filename
        repo = pd.DataFrame(result)
        repo.index += 1
        repo.to_csv(csv_filename)

    def get_valid_players(self):
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        result = mongo_wrapper.get_collection('primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection('players').find({'season': 'primera/2016-17'}).distinct('player')
        result += mongo_wrapper.get_collection('players').find({'season': 'segunda/2016-17'}).distinct('player')
        return list(set(result))

    def _normalize_one(self, source, players):
        result = {
            'master': [],
            source: [],
        }
        num_matched = 0
        valid_players = self.get_valid_players()
        #print(valid_players)
        #exit()
        already_got = []
        for master_player in self.data['master']:
            best_similarity = 0
            second_best_similarity = 0
            matched = ''
            if master_player in valid_players:
                for player in players:
                    matcher = SequenceMatcher(None, self.preprocess_name(master_player), self.preprocess_name(player))
                    similarity = matcher.ratio()
                    if (similarity > best_similarity) and \
                       (similarity > 0.95) and \
                       (second_best_similarity < 0.60) and \
                       (player not in already_got):
                        second_best_similarity = best_similarity
                        best_similarity = similarity
                        matched = player
            if matched != '':
                self.logger.debug('Matched "' + matched + '" with "' + master_player + '" ' + str(best_similarity))
                already_got.append(matched)
                num_matched += 1
            result['master'].append(master_player)
            result[source].append(matched)
        self.logger.debug(str(len(players)) + ' players, ' + str(num_matched) + ' matched')
        return result

    def preprocess_name(self, name):
        result = name.lower()
        result = result.replace(',', '')
        return result
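
# Minimal usage sketch of the mapping workflow: build the master list from
# mongo if the CSV does not exist yet, fuzzy-match the marca names against it
# and persist the result. Assumes PrefixedMongoWrapper, pandas (pd), os and
# difflib's SequenceMatcher are imported as in the methods above.
def rebuild_players_mapping_example():
    normalizer = PlayerNormalizer()
    mapping = normalizer.normalize()   # {'master': [...], 'marca': [...]}
    normalizer.save_csv(mapping)       # rewrites ./players_mapping.csv
    # A fresh instance reads the CSV back and resolves a marca name to its master index.
    return PlayerNormalizer().find_player_id('marca', 'Some Player')  # hypothetical name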
class AsClassificationScraper:

    def __init__(self):
        self.writer_obj = False
        self.collection = 'as_classifications_data'
        self.started = datetime.datetime.now().isoformat()
        self.logger = Logger(2)

    def writer(self):
        if not self.writer_obj:
            self.writer_obj = prefixed_mongo_wrapper.PrefixedMongoWrapper(self.collection)
        return self.writer_obj

    def scrape_page(self):
        self.writer().drop_collection('classification')
        self.logger.debug('Downloading as web classifications data')
        sender = scrape_request.Sender()
        sender.set_debug_level(2)
        raw_html = sender.get('https://resultados.as.com/resultados/futbol/primera/clasificacion/', {})
        self.process_page(raw_html)
        self.logger.debug('Done')

    def process_page(self, raw_html):
        html = BeautifulSoup(raw_html, 'html.parser')
        main_table = html.find('table', {'class': 'tabla-datos table-hover'})
        header = self.process_header(main_table)
        data_table = main_table.find('tbody')
        for row in data_table.find_all('tr'):
            self.process_row(row, header)

    def process_header(self, main_table):
        result = []
        head = main_table.find_all('th', {'scope': 'col'})
        for column in head:
            result.append(column.getText().replace('.', ''))
        return result

    def process_row(self, row, header):
        team_content = row.find('th').find('span', {'class': 'nombre-equipo'}).getText()
        result = {'process_id': self.started, 'team': team_content, 'total': {}, 'home': {}, 'away': {}}
        cell_counter = 1
        for cell in row.find_all('td'):
            if cell_counter <= 7:
                result['total'][header[cell_counter]] = cell.getText()
            elif cell_counter <= 14:
                result['home'][header[cell_counter]] = cell.getText()
            else:
                result['away'][header[cell_counter]] = cell.getText()
            cell_counter = cell_counter + 1
        self.writer().write_dictionary('classification', result)
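
# Minimal usage sketch: download the current AS classification table and store
# one 'classification' document per team, tagged with the process start time.
# Assumes the scrape_request and prefixed_mongo_wrapper modules used above are
# importable and that MongoDB is reachable.
def scrape_as_classification_example():
    scraper = AsClassificationScraper()
    scraper.scrape_page()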
def __init__(self, logger=Logger()):
    self.logger = logger
    self.delay = 0
class MatchesFactsAggregator:
    '''
    Generates the match facts table as CSV, considering only finished matches.

    self.counters keeps the incremental values up to date, e.g. the goals
    scored so far. To change the files that get generated, define the template
    of the records in self.counter_template and update the matching values in
    _update_counters.
    '''

    def __init__(self, log_detail_level=0):
        self.prefix = 'etl'
        self.mongo_wrapper = PrefixedMongoWrapper(self.prefix)
        self.collection = 'results_all'
        self.results = []
        self.counters = {}
        self.tournament_positions = {}
        self.logger = Logger(log_detail_level)
        # Dictionary with each team's recent data. See self._update_counters
        self.teams_recent_history = {}
        self.counter_template = {
            'played_home': 0,
            'played_away': 0,
            'score_competition': 0,
            #'score_competition_home': 0,
            #'score_competition_away': 0,
            'matches_won_home': 0,
            'matches_won_away': 0,
            'matches_tied_home': 0,
            'matches_tied_away': 0,
            'matches_lost_home': 0,
            'matches_lost_away': 0,
            'goals_scored_home': 0,
            'goals_scored_away': 0,
            'goals_conceded_home': 0,
            'goals_conceded_away': 0,
            'num_days_without_goals_home': 0,
            'num_days_without_goals_away': 0,
            'num_days_without_victory_home': 0,
            'num_days_without_victory_away': 0,
            'ranking_home': 0,
            'ranking_away': 0
        }

    def _update_counters(self, match):
        match_winner = self._winner(match)
        if match_winner != '':
            # Goals scored by each team
            self._add_to_counter(match['home'], 'goals_scored_home', match['score_home'])
            self._add_to_counter(match['away'], 'goals_scored_away', match['score_away'])

            # Matches played
            self._add_to_counter(match['home'], 'played_home', 1)
            self._add_to_counter(match['away'], 'played_away', 1)

            self._add_to_counter(match['home'], 'goals_conceded_home', match['score_away'])
            self._add_to_counter(match['away'], 'goals_conceded_away', match['score_home'])

            # Matches won, tied, lost
            key_map = {'home': 'matches_won_home', 'away': 'matches_lost_home', 'none': 'matches_tied_home'}
            self._add_to_counter(match['home'], key_map[match_winner], 1)
            key_map = {'home': 'matches_lost_away', 'away': 'matches_won_away', 'none': 'matches_tied_away'}
            self._add_to_counter(match['away'], key_map[match_winner], 1)

            # Add the goals scored to each team's recent history
            self.teams_recent_history[match['home']]['goals'].append(int(match['score_home']))
            self.teams_recent_history[match['away']]['goals'].append(int(match['score_away']))

            # Sum of the goals scored over the last 5 match days
            self._set_counter(match['home'], 'ranking_home',
                              sum(self.teams_recent_history[match['home']]['goals'][-5:]))
            self._set_counter(match['away'], 'ranking_away',
                              sum(self.teams_recent_history[match['away']]['goals'][-5:]))

            # Competition points
            key_map = {'home': 3, 'away': 0, 'none': 1}
            #self._add_to_counter(match['home'], 'score_competition_home', key_map[match_winner])
            self._add_to_counter(match['home'], 'score_competition', key_map[match_winner])
            key_map = {'home': 0, 'away': 3, 'none': 1}
            #self._add_to_counter(match['away'], 'score_competition_away', key_map[match_winner])
            self._add_to_counter(match['away'], 'score_competition', key_map[match_winner])

            # Match days without a win
            if match_winner == 'home':
                self._set_counter(match['home'], 'num_days_without_victory_home', 0)
                self._add_to_counter(match['away'], 'num_days_without_victory_away', 1)
            if match_winner == 'away':
                self._set_counter(match['away'], 'num_days_without_victory_away', 0)
                self._add_to_counter(match['home'], 'num_days_without_victory_home', 1)
            if match_winner == 'none':
                self._add_to_counter(match['home'], 'num_days_without_victory_home', 1)
                self._add_to_counter(match['away'], 'num_days_without_victory_away', 1)

            # Match days without scoring
            if int(match['score_home']) > 0:
                self._set_counter(match['home'], 'num_days_without_goals_home', 0)
            else:
                self._add_to_counter(match['home'], 'num_days_without_goals_home', 1)
            if int(match['score_away']) > 0:
                self._set_counter(match['away'], 'num_days_without_goals_away', 0)
            else:
                self._add_to_counter(match['away'], 'num_days_without_goals_away', 1)

    def _generate_tournament_positions(self):
        tournament_scores = {}
        raw_data = self.counters
        for team in raw_data.keys():
            score_competition = raw_data[team]['score_competition']
            if score_competition not in tournament_scores.keys():
                tournament_scores[score_competition] = []
            tournament_scores[score_competition].append(team)
        tournament_positions = {}
        current_position = 1
        for team_score in reversed(sorted(tournament_scores)):
            if current_position not in tournament_positions:
                tournament_positions[current_position] = []
            tournament_positions[current_position] += tournament_scores[team_score]
            current_position += 1
        result = {}
        for current_position in tournament_positions.keys():
            for team in tournament_positions[current_position]:
                result[team] = current_position
        self.tournament_positions = result

    def process_matches_played(self, season):
        self._init_counters(season)
        for match in self._collection().find({'season': season}).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] != '':
                self._add_to_results(entry)

    def process_matches_to_play(self, season):
        self._init_counters(season)
        for match in self._collection().find({'season': season}).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] == '':
                self._add_to_results(entry)

    def write_data_mongo(self):
        '''
        Writes the results of the process to mongo
        :return:
        '''
        self.mongo_wrapper.drop_collection('aggregated_results')
        self.mongo_wrapper.write_dictionaries_list('aggregated_results', self.results)

    def write_data_csv(self, filename):
        '''
        Exports the results of the process to CSV
        :param filename:
        :return:
        '''
        import pandas as pd
        data = {}
        #print(self.results[0].keys())
        #exit()
        for column in self.results[0].keys():
            data[column] = []
        for result in self.results:
            for attribute_name in result.keys():
                data[attribute_name].append(result[attribute_name])
        repo = pd.DataFrame(data)
        repo.to_csv(filename)

    def _process_match(self, match):
        self.logger.debug('processing ' + str(match))
        home_stats = self.counters[match['home']]
        away_stats = self.counters[match['away']]
        entry = {}
        self._generate_tournament_positions()
        entry['score_competition_diff'] = self.counters[match['home']]['score_competition'] - \
            self.counters[match['away']]['score_competition']
        entry['tournament_position_home'] = self.tournament_positions[match['home']]
        entry['tournament_position_away'] = self.tournament_positions[match['away']]
        entry['season'] = match['season']
        entry['day_num'] = match['day_num']
        entry['team_home'] = match['home']
        entry['team_away'] = match['away']
        entry['lineup_home'] = match['lineup_home']
        entry['lineup_away'] = match['lineup_away']
        entry['score_home'] = match['score_home']
        entry['score_away'] = match['score_away']
        #entry['ranking_home'] = 2 * entry['ranking_home']
        #entry['ranking_away'] = 2 * entry['ranking_away']
        entry['winner'] = self._winner(match)
        for key in home_stats.keys():
            entry[key + '_home'] = home_stats[key]
        for key in away_stats.keys():
            entry[key + '_away'] = away_stats[key]
        self._update_counters(match)
        return entry

    def _winner(self, match):
        if match['score_home'] == '':
            return ''
        if int(match['score_home']) > int(match['score_away']):
            return 'home'
        if int(match['score_home']) < int(match['score_away']):
            return 'away'
        if int(match['score_home']) == int(match['score_away']):
            return 'none'

    def _add_to_results(self, entry):
        self.results.append(entry)

    def _add_to_counter(self, team, key, qty):
        self.counters[team][key] += int(qty)

    def _set_counter(self, team, key, value):
        self.counters[team][key] = value

    def _init_counters(self, season):
        self.counters = {}
        self.teams_recent_history = {}
        for team in self._collection().find({'season': season}).distinct("home"):
            self.counters[team] = self.counter_template.copy()
            self.teams_recent_history[team] = {'goals': []}

    def _collection(self):
        return self.mongo_wrapper.get_collection(self.collection)
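
# Minimal usage sketch: aggregate the finished matches of one season into the
# facts table and export it both to mongo and to CSV. Assumes the 'results_all'
# collection under the 'etl' prefix has already been populated (see
# ResultsMerger.save) and that PrefixedMongoWrapper, pymongo and pandas are
# available as above.
def build_facts_table_example():
    aggregator = MatchesFactsAggregator(log_detail_level=2)
    aggregator.process_matches_played('primera/2016-17')
    aggregator.write_data_mongo()                    # rewrites the 'aggregated_results' collection
    aggregator.write_data_csv('./facts_played.csv')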
class ResultsMerger:

    def __init__(self):
        self.logger = Logger(2)
        self.mapper = TeamsNormalizer()
        self.template = {
            'season': 'primera/2017-18',
            'day_num': '',
            'home': '',
            'away': '',
            'score_home': '',
            'score_away': ''
        }

    def merge(self):
        self.logger.debug('Merging...')
        results = []
        results += self._get_archive_results()
        results += self._get_current_results()
        self.logger.debug('Processed ' + str(len(results)) + ' matches')
        self.logger.debug('Done')
        return results

    def save(self, result):
        self.logger.debug('Saving')
        wrapper = PrefixedMongoWrapper('etl_results')
        wrapper.drop_collection('all')
        wrapper.write_dictionaries_list('all', result)
        self.logger.debug('Done')

    def _get_archive_results(self):
        self.logger.debug('Getting archive results ... this will take time ...')
        wrapper = scraping.laliga.utils.create_mongo_writer()
        archive = wrapper.get_collection('primera_results').\
            find({'season': {'$in': ['primera/2015-16', 'primera/2016-17']}}).\
            sort([('day_num', pymongo.ASCENDING)])
        result = []
        lineup_manager = LineUpManager()
        # The historical seasons
        for archive_match in archive:
            entry = self.template.copy()
            entry['season'] = archive_match['season']
            entry['day_num'] = int(archive_match['day_num'])
            entry['home'] = archive_match['home']
            entry['away'] = archive_match['away']
            entry['score_home'] = int(archive_match['score_home'])
            entry['score_away'] = int(archive_match['score_away'])
            entry['lineup_home'] = lineup_manager.create_by_match_id(archive_match['match_id'], archive_match['home'])
            entry['lineup_away'] = lineup_manager.create_by_match_id(archive_match['match_id'], archive_match['away'])
            self.logger.debug('Processing ' + str(entry))
            result.append(entry)
        return result

    def _extract_scores(self, text):
        result = {}
        splitted = text.split('-')
        if len(splitted) == 2:
            if (len(splitted[0]) <= 2) and (len(splitted[1]) <= 2):
                result['score_home'] = int(splitted[0])
                result['score_away'] = int(splitted[1])
        return result

    def _get_current_results(self):
        self.logger.debug('Getting current season results')
        result = []
        wrapper = PrefixedMongoWrapper('marca')
        lineup_manager = LineUpManager()
        # The current matches downloaded from the marca website
        for day in wrapper.get_collection('current_season_results').find({}):
            for match in day['results']:
                match['result'] = self._marca_process_result(match['result'])
                entry = self.template.copy()
                entry['day_num'] = int(day['day']['num_day'].replace('Jornada', '').strip())
                entry['home'] = self.mapper.find_team_id('marca', match['home'])
                entry['away'] = self.mapper.find_team_id('marca', match['away'])
                scores = self._extract_scores(match['result'])
                if len(scores) == 2:
                    entry['score_home'] = scores['score_home']
                    entry['score_away'] = scores['score_away']
                else:
                    entry['score_home'] = match['result']
                    entry['score_away'] = match['result']
                if 'home_lineup' in match.keys():
                    entry['lineup_home'] = lineup_manager.create_by_list(match['home_lineup'])
                else:
                    entry['lineup_home'] = ''
                if 'away_lineup' in match.keys():
                    entry['lineup_away'] = lineup_manager.create_by_list(match['away_lineup'])
                else:
                    entry['lineup_away'] = ''
                result.append(entry)
        return result

    def _marca_process_result(self, text):
        '''
        On the marca website the result cells can hold three kinds of content, for example:
            1-1
            Sab- 13:00
            Sin confirmar
        We filter simply by the length of the text.
        :param text:
        :return:
        '''
        result = ''
        if len(text) < 8:
            result = text
        return result
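
# Minimal usage sketch: merge the archived laliga.es results with the current
# marca.com season and persist the unified list for the aggregation step.
# Assumes the scraping collections referenced above already exist in mongo.
def merge_results_example():
    merger = ResultsMerger()
    all_results = merger.merge()   # archive seasons + current season entries
    merger.save(all_results)       # rewrites the 'all' collection under the 'etl_results' prefix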
class SeasonScraper:

    def __init__(self):
        self.sender = scrape_request.Sender()
        self.sender.set_delay(2)
        self.sender.set_debug_level(2)
        self.logger = Logger(2)

    def scrape_page(self, season):
        splitted = season.split('/')
        league = splitted[0]
        page_content = self.do_request(season)
        writer = utils.create_mongo_writer(league)
        if page_content:
            html = BeautifulSoup(page_content, 'html.parser')
            days = html.find_all('div', {'class': 'jornada-calendario-historico'})
            matches_results = []
            for day in days[:]:
                table_title = day.find('div', {'class': 'nombre_jornada'})
                day_str = self.extract_day(table_title.contents[0])
                day_num = self.extract_daynum(table_title.contents[0])
                tables = day.find_all('table', {'class': 'tabla_jornada_calendario_historico'})
                for table in tables[:]:
                    rows = table.find_all('tr', {'onclick': re.compile('^abrir_partido')})
                    for row in rows[:]:
                        js_params = self.extract_popup_win_js_params(row['onclick'])
                        if js_params != False:
                            match_id = js_params['temporada'] + '_' + js_params['jornada'] + '_' + \
                                js_params['equipo'] + '_' + js_params['competicion']
                            cell = row.find('td')
                            content_to_process = str(cell.contents[0])
                            txt = self.extract_result(content_to_process)
                            matches_results.append({
                                'season': season,
                                'day': day_str,
                                'day_num': day_num,
                                'home': str.strip(txt.group(1)),
                                'away': str.strip(txt.group(3)),
                                'score_home': txt.group(2),
                                'score_away': txt.group(4),
                                'match_id': match_id
                            })
                            popup_scraper = PopUpScraper.PopUpScraper(match_id, writer)
                            popup_scraper.scrape_popup(js_params)
            writer.write_dictionaries_list('results', matches_results)

    def do_request(self, path):
        sender = self.sender
        url = 'http://www.laliga.es/estadisticas-historicas/calendario/' + path + '/'
        return sender.get(url, {})

    def extract_result(self, content_to_process):
        '''
        :param content_to_process: strings such as '<span>RCD Mallorca: <b>1</b><br>Real Madrid: <b>2</b></span>'
        :return: a regex match with 4 groups: team name and goals for each side
        '''
        content_to_process = content_to_process.replace(":", "")
        cell_pattern_str = '<span>(.+?)<b>(.+?)</b><br/>(.+?)<b>(.+?)</b></span>'
        return re.search(cell_pattern_str, content_to_process)

    def extract_daynum(self, content_to_process):
        '''
        :param content_to_process: strings such as 'Jornada: 02 - 26/08/2016'
        :return: the match day number
        '''
        cell_pattern_str = r'(\d+)'
        parsed = re.search(cell_pattern_str, content_to_process)
        return str.strip(parsed.group(1))

    def extract_day(self, content_to_process):
        cell_pattern_str = r'(\d+/\d+/\d+)'
        parsed = re.search(cell_pattern_str, content_to_process)
        return str.strip(str.replace(parsed.group(1), "/", "-"))

    def extract_popup_win_js_params(self, function_call_str):
        '''
        Extracts the parameters to pass to the JS call that opens the popup with
        the details of a match, e.g. 'abrir_partido(115,37,"barcelona",1)'
        '''
        pattern = r'abrir_partido\((.+?),(.+?),"(.+?)",(.+?)\)'
        parsed = re.search(pattern, function_call_str)
        if parsed is None:
            self.logger.error(400, 'Error in extract_popup_win_js_params ' + function_call_str)
            return False
        else:
            return {
                'temporada': parsed.group(1),
                'jornada': parsed.group(2),
                'equipo': parsed.group(3),
                'competicion': parsed.group(4)
            }
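
# Minimal usage sketch: scrape the full calendar of one historical season,
# including the per-match popups. Assumes scrape_request, utils, PopUpScraper
# and Logger resolve to the project's own modules; the season slug follows the
# 'league/years' format expected by scrape_page.
def scrape_historical_season_example():
    scraper = SeasonScraper()
    scraper.scrape_page('primera/2016-17')  # writes to the 'primera' writer's 'results' collection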
class ResultsCurrentYearScraper:

    def __init__(self):
        self.sender = Sender()
        self.logger = Logger(2)
        self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
        self.raw_content = ''
        self.writer = PrefixedMongoWrapper('marca')
        self.collection_name = 'current_season_results'

    def _getPage(self):
        self.raw_content = self.sender.get(self.url, {})
        if self.raw_content == '':
            self.logger.error(500, 'Empty page')
            exit()

    def scrape(self):
        self.logger.debug('Downloading marca web data')
        self._getPage()
        self.writer.drop_collection(self.collection_name)
        html = BeautifulSoup(self.raw_content, 'html.parser')
        for day_table in html.find_all('li', {'id': 'contenedorCalendarioInt'}):
            day_info = self.extract_day(day_table)
            self.logger.debug('Processing "' + day_info['num_day'] + ', ' + day_info['date'] + '"')
            results = self.process_results(day_table)
            dictionary_to_insert = {'day': day_info, 'results': results}
            self.writer.write_dictionary(self.collection_name, dictionary_to_insert)
        self.logger.debug('Done')

    def extract_day(self, day_table):
        header = day_table.find('span')
        num_day = header.find('h2').getText()
        date = header.contents[2].strip()
        return {'num_day': num_day, 'date': date}

    def process_results(self, day_table):
        results = []
        for row in day_table.find('ul', {'class': 'partidos-jornada'}).find_all('a'):
            counter = 0
            result = {}
            colmap = {0: 'home', 1: 'away', 2: 'result'}
            for cell in row.find_all('span'):
                result[colmap[counter % 3]] = cell.getText()
                counter = counter + 1
            if row.has_attr('href'):
                lineups = self._get_lineups(row['href'])
                result['home_lineup'] = lineups['home']
                result['away_lineup'] = lineups['away']
            results.append(result)
        self.logger.debug('Inserted ' + str(len(results)) + ' items')
        return results

    def _get_lineups(self, url):
        scraper = MatchDetailsScraper(url)
        return scraper.extract_lineups()
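
# Minimal usage sketch: download the current marca.com calendar, following each
# match link for the lineups, and rewrite the 'current_season_results'
# collection. Assumes Sender, Logger, PrefixedMongoWrapper and
# MatchDetailsScraper are available as above and that MongoDB is reachable.
def scrape_current_season_example():
    scraper = ResultsCurrentYearScraper()
    scraper.scrape()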