def __init__(self, log_detail_level=0):
        """Prepare storage access, logging and the per-team counter template.

        :param log_detail_level: value handed to ``Logger``; defaults to 0.
        """
        self.prefix = 'etl'
        self.mongo_wrapper = PrefixedMongoWrapper(self.prefix)
        self.collection = 'results_all'
        self.results = []

        self.counters = {}
        self.tournament_positions = {}
        self.logger = Logger(log_detail_level)

        # Dictionary with the recent data. See self._update_counters
        self.teams_recent_history = {}

        # Names (and order) of the counters; each one starts at zero.
        counter_names = (
            'played_home',
            'played_away',
            'score_competition',
            'matches_won_home',
            'matches_won_away',
            'matches_tied_home',
            'matches_tied_away',
            'matches_lost_home',
            'matches_lost_away',
            'goals_scored_home',
            'goals_scored_away',
            'goals_conceded_home',
            'goals_conceded_away',
            'num_days_without_goals_home',
            'num_days_without_goals_away',
            'num_days_without_victory_home',
            'num_days_without_victory_away',
            'ranking_home',
            'ranking_away',
        )
        self.counter_template = dict.fromkeys(counter_names, 0)
 def __init__(self):
     """Create the collaborators and fix the page/collection settings."""
     # Collaborators are built in the same order as before.
     self.sender = Sender()
     self.logger = Logger(2)
     self.writer = PrefixedMongoWrapper('marca')

     # Page that is downloaded and the collection the results are written to.
     self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
     self.collection_name = 'current_season_results'

     # Filled in once the page has been downloaded.
     self.raw_content = ''
    def _get_current_results(self):
        """Build one entry per match stored in ``current_season_results``.

        Every document of the collection (read through
        ``PrefixedMongoWrapper('marca')``) is walked match by match; each match
        yields a copy of ``self.template`` filled with the day number, the team
        ids resolved by ``self.mapper`` and the scores.

        :return: list with one entry dictionary per match.
        """
        self.logger.debug('Getting current season results')

        entries = []
        marca = PrefixedMongoWrapper('marca')
        season_days = marca.get_collection('current_season_results').find()

        # the current results, as downloaded from the marca website
        for day in season_days:
            for match in day['results']:
                # The match dictionary keeps the processed result.
                processed = self._marca_process_result(match['result'])
                match['result'] = processed

                entry = self.template.copy()
                day_label = day['day']['num_day'].replace('Jornada', '')
                entry['day_num'] = int(day_label.strip())
                entry['home'] = self.mapper.find_team_id('marca',
                                                         match['home'])
                entry['away'] = self.mapper.find_team_id('marca',
                                                         match['away'])

                scores = self._extract_scores(processed)
                has_both_scores = len(scores) == 2

                # Without two extracted scores the processed result itself is
                # stored in both score fields.
                entry['score_home'] = (scores['score_home']
                                       if has_both_scores else processed)
                entry['score_away'] = (scores['score_away']
                                       if has_both_scores else processed)

                entries.append(entry)
        return entries
Esempio n. 4
0
    def __init__(self):
        """Create the logger, storage wrapper and HTTP sender of the scraper."""
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('laliga_web')

        sender = Sender()
        sender.set_debug_level(2)
        self.sender = sender

        # Root of the pages to scrape and the collection that receives rows.
        self.base_url = 'http://www.laliga.es/estadisticas-historicas/plantillas/'
        self.collection = 'players'
Esempio n. 5
0
    def _get_marca_list(self):
        """Return the distinct names found in the stored marca lineups.

        Reads the ``results`` collection through
        ``PrefixedMongoWrapper('marca_current_season')`` and gathers the
        ``home_lineup`` and ``away_lineup`` values of every match.

        :return: list of unique lineup items, in no particular order.
        """
        players = set()

        mongo_wrapper = PrefixedMongoWrapper('marca_current_season')
        days_with_lineups = mongo_wrapper.get_collection('results').find(
            {"results.home_lineup": {
                "$exists": True
            }})

        for day in days_with_lineups:
            for match in day['results']:
                # The filter selects a day as soon as ONE of its matches has a
                # lineup, so other matches of that day may lack these keys;
                # skip them instead of failing with KeyError.
                players.update(match.get('home_lineup') or ())
                players.update(match.get('away_lineup') or ())

        return list(players)
Esempio n. 6
0
class PlayersScraper:
    """Scrapes squad pages below ``base_url`` and stores one row per player."""

    def __init__(self):
        """Create the logger, storage wrapper and HTTP sender."""
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        self.base_url = 'http://www.laliga.es/estadisticas-historicas/plantillas/'
        self.sender = Sender()
        self.sender.set_debug_level(2)
        self.collection = 'players'

    def get_teams(self, season):
        """Return the non-empty team values offered for *season*.

        :param season: string appended to ``base_url`` to build the page url.
        :return: list of option values; empty when the response is ''.
        """
        response = self.sender.get(self.base_url + season, {})
        if response == '':
            return []

        html = BeautifulSoup(response, 'html.parser')
        team_selector = html.find(
            'select', {'id': 'select_equipos_estadisticas_historicas'})
        option_values = (option['value']
                         for option in team_selector.find_all('option'))
        return [value for value in option_values if value != '']

    def scrape_season(self, season):
        """Scrape every team returned by ``get_teams`` for *season*."""
        for team in self.get_teams(season):
            self.scrape_team(team, season)

    def scrape_team(self, team, season):
        """Store one ``{player, team, season}`` row per table row of the page.

        :param team: team value, used in the url and in the stored rows.
        :param season: season string, used in the url and in the stored rows.
        """
        self.logger.debug('Processing ' + team + ' ' + season)
        team_url = self.base_url + season + '/' + team
        response = self.sender.get(team_url, {})
        if response == '':
            return

        html = BeautifulSoup(response, 'lxml')
        container = html.find('div',
                              {'class': 'container main table clearfix'})
        rows = container.find('tbody').find_all('tr')
        for row in rows:
            # The text of the first cell of the row is taken as the player.
            record = {
                'player': row.find('td').getText(),
                'team': team,
                'season': season
            }
            self.mongo_wrapper.write_dictionary(self.collection, record)
Esempio n. 7
0
    def get_valid_players(self):
        """Return the unique ``player`` values considered valid.

        Combines the distinct players of ``primera_popups_matches_stats`` with
        those of the ``players`` collection for the seasons
        ``primera/2016-17`` and ``segunda/2016-17``.

        :return: de-duplicated list, in no particular order.
        """
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')

        players = mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')

        for season in ('primera/2016-17', 'segunda/2016-17'):
            season_rows = mongo_wrapper.get_collection('players').find(
                {'season': season})
            players += season_rows.distinct('player')

        return list(set(players))
Esempio n. 8
0
    def init_map_file(self):
        """Save a csv whose ``master`` column lists every distinct player.

        The names are the distinct ``player`` values of three collections read
        through ``PrefixedMongoWrapper('laliga_web')``, de-duplicated and
        passed to ``self.save_csv``.
        """
        self.logger.debug('Generating master')

        mongo_wrapper = PrefixedMongoWrapper('laliga_web')

        # 'players' holds the data taken from the "plantillas" section; the
        # players of the 1928-29 season are missing there, so the primera and
        # segunda match stats are merged in as well.
        source_collections = (
            'players',
            'primera_popups_matches_stats',
            'segunda_popups_matches_stats',
        )
        names = []
        for collection_name in source_collections:
            names += mongo_wrapper.get_collection(collection_name).distinct(
                'player')

        self.logger.debug('Done')
        self.save_csv({'master': list(set(names))})
Esempio n. 9
0
    def create_by_match_id(self, match_id, team):
        """Build the lineup key of *team* for the match *match_id*.

        Looks up the ``main_lineup`` rows of ``popups_matches_stats`` for the
        match and team, maps each player through ``self.normalizer`` and hands
        the matched ids (as strings) to ``self._generate_key``.

        :param match_id: value matched against the ``match_id`` field.
        :param team: value matched against the ``team`` field.
        :return: whatever ``self._generate_key`` returns for the id list.
        """
        mongo_wrapper = PrefixedMongoWrapper('laliga_web_primera')
        query = {'match_id': match_id, 'team': team, 'main_lineup': True}
        stats = mongo_wrapper.get_collection('popups_matches_stats').find(query)

        lineup_ids = []
        for stat in stats:
            player_id = self.normalizer.find_player_id('master',
                                                       stat['player'])
            if player_id == '':
                # Players without an id are reported and left out of the key.
                self.logger.log(400, 'Unmatched player ' + stat['player'])
                continue
            lineup_ids.append(str(player_id))

        lineup_size = len(lineup_ids)
        if lineup_size != 11:
            # Only reported; the key is still generated.
            self.logger.error(100,
                              'Lineup of ' + str(lineup_size) + ' players')

        return self._generate_key(lineup_ids)
    def __init__(self):
        """Create the collaborators and the blank result-entry template."""
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('aggregated_match_results')
        self.mapper = TeamsNormalizer()

        # Entry skeleton: the season is fixed, every other field starts blank.
        template = {'season': 'primera/2017-18'}
        for field in ('day_num', 'home', 'away', 'score_home', 'score_away'):
            template[field] = ''
        self.template = template
class MatchesFactsAggregator:
    '''
    Generates the match facts table (exportable to csv or mongo) from the
    stored results.

    self.counters holds the incremental per-team values, such as the goals
    scored to date.

    To change the generated records, define the record template in
    self.counter_template and set the matching values in _update_counters.
    '''
    def __init__(self, log_detail_level=0):
        '''
        Prepare storage access, logging and the per-team counter template.

        :param log_detail_level: value handed to Logger
        '''
        self.prefix = 'etl'
        self.mongo_wrapper = PrefixedMongoWrapper(self.prefix)
        self.collection = 'results_all'
        self.results = []

        self.counters = {}
        self.tournament_positions = {}
        self.logger = Logger(log_detail_level)

        # Dictionary with the recent data. See self._update_counters
        self.teams_recent_history = {}
        self.counter_template = {
            'played_home': 0,
            'played_away': 0,
            'score_competition': 0,
            'matches_won_home': 0,
            'matches_won_away': 0,
            'matches_tied_home': 0,
            'matches_tied_away': 0,
            'matches_lost_home': 0,
            'matches_lost_away': 0,
            'goals_scored_home': 0,
            'goals_scored_away': 0,
            'goals_conceded_home': 0,
            'goals_conceded_away': 0,
            'num_days_without_goals_home': 0,
            'num_days_without_goals_away': 0,
            'num_days_without_victory_home': 0,
            'num_days_without_victory_away': 0,
            'ranking_home': 0,
            'ranking_away': 0
        }

    def _update_counters(self, match):
        '''
        Fold one match into the counters of its home and away team.

        Matches for which _winner returns '' (no home score yet) are ignored.

        :param match: dictionary with 'home', 'away', 'score_home' and
            'score_away'
        '''
        match_winner = self._winner(match)

        if match_winner == '':
            return

        home = match['home']
        away = match['away']

        # Goals scored by each team
        self._add_to_counter(home, 'goals_scored_home', match['score_home'])
        self._add_to_counter(away, 'goals_scored_away', match['score_away'])

        # Matches played
        self._add_to_counter(home, 'played_home', 1)
        self._add_to_counter(away, 'played_away', 1)

        # Goals conceded by each team
        self._add_to_counter(home, 'goals_conceded_home', match['score_away'])
        self._add_to_counter(away, 'goals_conceded_away', match['score_home'])

        # Matches won, tied, lost
        key_map = {
            'home': 'matches_won_home',
            'away': 'matches_lost_home',
            'none': 'matches_tied_home'
        }
        self._add_to_counter(home, key_map[match_winner], 1)

        key_map = {
            'home': 'matches_lost_away',
            'away': 'matches_won_away',
            'none': 'matches_tied_away'
        }
        self._add_to_counter(away, key_map[match_winner], 1)

        # Append the goals scored to the recent history
        self.teams_recent_history[home]['goals'].append(
            int(match['score_home']))
        self.teams_recent_history[away]['goals'].append(
            int(match['score_away']))

        # Sum of the goals in the last 5 entries of the team history
        self._set_counter(home, 'ranking_home',
                          sum(self.teams_recent_history[home]['goals'][-5:]))
        self._set_counter(away, 'ranking_away',
                          sum(self.teams_recent_history[away]['goals'][-5:]))

        # Points
        key_map = {'home': 3, 'away': 0, 'none': 1}
        self._add_to_counter(home, 'score_competition', key_map[match_winner])

        key_map = {'home': 0, 'away': 3, 'none': 1}
        self._add_to_counter(away, 'score_competition', key_map[match_winner])

        # Days without a victory. The home team only touches its *_home
        # streak and the away team its *_away streak: the winner's streak is
        # reset, every other outcome extends it. (The away-win branch used
        # to write the two keys on the wrong teams.)
        if match_winner == 'home':
            self._set_counter(home, 'num_days_without_victory_home', 0)
        else:
            self._add_to_counter(home, 'num_days_without_victory_home', 1)

        if match_winner == 'away':
            self._set_counter(away, 'num_days_without_victory_away', 0)
        else:
            self._add_to_counter(away, 'num_days_without_victory_away', 1)

        # Days without scoring
        if int(match['score_home']) > 0:
            self._set_counter(home, 'num_days_without_goals_home', 0)
        else:
            self._add_to_counter(home, 'num_days_without_goals_home', 1)

        if int(match['score_away']) > 0:
            self._set_counter(away, 'num_days_without_goals_away', 0)
        else:
            self._add_to_counter(away, 'num_days_without_goals_away', 1)

    def _generate_tournament_positions(self):
        '''
        Recompute self.tournament_positions from the current points.

        Teams are grouped by 'score_competition'; the highest score gets
        position 1, the next distinct score position 2, and so on. Teams with
        the same score share a position.
        '''
        teams_by_score = {}
        for team, stats in self.counters.items():
            teams_by_score.setdefault(stats['score_competition'],
                                      []).append(team)

        positions = {}
        ranked_scores = sorted(teams_by_score, reverse=True)
        for position, score in enumerate(ranked_scores, start=1):
            for team in teams_by_score[score]:
                positions[team] = position

        self.tournament_positions = positions

    def process_matches_played(self, season):
        '''
        Append to self.results the entries of the matches of the season that
        have a winner value (i.e. a score).

        Counters are reset first; matches are read sorted by ascending
        day_num. self.results itself is not cleared.

        :param season: value matched against the 'season' field
        '''
        self._init_counters(season)
        for match in self._collection().find({
                'season': season
        }).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] != '':
                self._add_to_results(entry)

    def process_matches_to_play(self, season):
        '''
        Append to self.results the entries of the matches of the season that
        have no score yet (winner == '').

        Counters are reset first; matches are read sorted by ascending
        day_num. self.results itself is not cleared.

        :param season: value matched against the 'season' field
        '''
        self._init_counters(season)
        for match in self._collection().find({
                'season': season
        }).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] == '':
                self._add_to_results(entry)

    def write_data_mongo(self):
        '''
        Writes the results of the process to mongo, replacing the
        'aggregated_results' collection
        :return:
        '''
        self.mongo_wrapper.drop_collection('aggregated_results')
        self.mongo_wrapper.write_dictionaries_list('aggregated_results',
                                                   self.results)

    def write_data_csv(self, filename):
        '''
        Exports the results of the process to csv

        The columns are the keys of the first result, so self.results must
        not be empty.

        :param filename: path handed to DataFrame.to_csv
        :return:
        '''
        import pandas as pd

        data = {}
        for column in self.results[0].keys():
            data[column] = []

        for result in self.results:
            for attribute_name in result.keys():
                data[attribute_name].append(result[attribute_name])

        repo = pd.DataFrame(data)
        repo.to_csv(filename)

    def _process_match(self, match):
        '''
        Build the facts entry of a match and then update the counters with it.

        The counter values copied into the entry are the ones accumulated
        BEFORE this match, because _update_counters runs last.

        :param match: stored match document
        :return: dictionary with the entry
        '''
        self.logger.debug('processing ' + str(match))

        home_stats = self.counters[match['home']]
        away_stats = self.counters[match['away']]

        entry = {}
        self._generate_tournament_positions()

        entry['score_competition_diff'] = self.counters[
            match['home']]['score_competition'] - self.counters[
                match['away']]['score_competition']
        entry['tournament_position_home'] = self.tournament_positions[
            match['home']]
        entry['tournament_position_away'] = self.tournament_positions[
            match['away']]

        entry['season'] = match['season']
        entry['day_num'] = match['day_num']

        entry['team_home'] = match['home']
        entry['team_away'] = match['away']

        entry['lineup_home'] = match['lineup_home']
        entry['lineup_away'] = match['lineup_away']

        entry['score_home'] = match['score_home']
        entry['score_away'] = match['score_away']

        entry['winner'] = self._winner(match)

        # Every counter of the home team is copied with a '_home' suffix and
        # every counter of the away team with an '_away' suffix.
        for key in home_stats.keys():
            entry[key + '_home'] = home_stats[key]

        for key in away_stats.keys():
            entry[key + '_away'] = away_stats[key]

        self._update_counters(match)

        return entry

    def _winner(self, match):
        '''
        :param match: dictionary with 'score_home' and 'score_away'
        :return: '' when there is no home score yet, otherwise 'home',
            'away' or 'none' (tie)
        '''
        if match['score_home'] == '':
            return ''

        score_home = int(match['score_home'])
        score_away = int(match['score_away'])
        if score_home > score_away:
            return 'home'
        if score_home < score_away:
            return 'away'
        return 'none'

    def _add_to_results(self, entry):
        '''Append an entry to self.results.'''
        self.results.append(entry)

    def _add_to_counter(self, team, key, qty):
        '''Add int(qty) to the counter key of the team.'''
        self.counters[team][key] += int(qty)

    def _set_counter(self, team, key, value):
        '''Overwrite the counter key of the team with value.'''
        self.counters[team][key] = value

    def _init_counters(self, season):
        '''
        Reset counters and recent history for every team that appears as
        'home' in the matches of the season.

        :param season: value matched against the 'season' field
        '''
        self.counters = {}
        self.teams_recent_history = {}

        for team in self._collection().find({
                'season': season
        }).distinct("home"):
            self.counters[team] = self.counter_template.copy()
            self.teams_recent_history[team] = {'goals': []}

    def _collection(self):
        '''Return the collection named self.collection from the wrapper.'''
        return self.mongo_wrapper.get_collection(self.collection)
Esempio n. 12
0
 def save(self, result):
     """Replace the stored 'all' collection with *result*.

     The collection is dropped and rewritten through
     ``PrefixedMongoWrapper('etl_results')``.

     :param result: list of dictionaries handed to ``write_dictionaries_list``.
     """
     self.logger.debug('Saving')

     target_collection = 'all'
     storage = PrefixedMongoWrapper('etl_results')
     # Drop first, so that only the new result remains stored.
     storage.drop_collection(target_collection)
     storage.write_dictionaries_list(target_collection, result)

     self.logger.debug('Done')
class ResultsCurrentYearScraper:
    """Downloads the marca calendar page and stores one document per day."""

    def __init__(self):
        """Create the collaborators and fix the page/collection settings."""
        self.sender = Sender()
        self.logger = Logger(2)
        self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
        self.raw_content = ''
        self.writer = PrefixedMongoWrapper('marca')
        self.collection_name = 'current_season_results'

    def _getPage(self):
        """Download ``self.url`` into ``self.raw_content``.

        An empty response is logged as an error and ends the program.
        """
        self.raw_content = self.sender.get(self.url, {})

        if self.raw_content != '':
            return

        self.logger.error(500, 'Empty page')
        exit()

    def scrape(self):
        """Download the page and rewrite the collection, one document per day.

        The collection is dropped after the download and before the page is
        parsed.
        """
        self.logger.debug('Downloading marca web data')

        self._getPage()
        self.writer.drop_collection(self.collection_name)

        html = BeautifulSoup(self.raw_content, 'html.parser')
        day_tables = html.find_all('li', {'id': 'contenedorCalendarioInt'})

        for day_table in day_tables:
            day_info = self.extract_day(day_table)
            progress = ('Processing "' + day_info['num_day'] + ', ' +
                        day_info['date'] + '"')
            self.logger.debug(progress)

            day_document = {
                'day': day_info,
                'results': self.process_results(day_table)
            }
            self.writer.write_dictionary(self.collection_name, day_document)

        self.logger.debug('Done')

    def extract_day(self, day_table):
        """Read the day label and date from the first ``span`` of the table.

        :param day_table: element holding one day of the calendar.
        :return: ``{'num_day': ..., 'date': ...}``.
        """
        header = day_table.find('span')
        return {
            # Text of the h2 inside the header.
            'num_day': header.find('h2').getText(),
            # Third child of the header, stripped of surrounding whitespace.
            'date': header.contents[2].strip()
        }

    def process_results(self, day_table):
        """Return one dictionary per match link of the day.

        The ``span`` cells of each link are stored cyclically under 'home',
        'away' and 'result'; links carrying an ``href`` also get the lineups
        returned by ``_get_lineups``.

        :param day_table: element holding one day of the calendar.
        :return: list of match dictionaries.
        """
        column_names = ('home', 'away', 'result')
        results = []

        matches_list = day_table.find('ul', {'class': 'partidos-jornada'})
        for row in matches_list.find_all('a'):
            result = {}
            for index, cell in enumerate(row.find_all('span')):
                result[column_names[index % 3]] = cell.getText()

            if row.has_attr('href'):
                lineups = self._get_lineups(row['href'])
                result['home_lineup'] = lineups['home']
                result['away_lineup'] = lineups['away']

            results.append(result)

        self.logger.debug('Inserted ' + str(len(results)) + ' items')
        return results

    def _get_lineups(self, url):
        """Return what ``MatchDetailsScraper(url).extract_lineups()`` yields."""
        return MatchDetailsScraper(url).extract_lineups()