コード例 #1
0
 def __init__(self):
     """Set up the scraper: logger, HTTP sender and Mongo output target."""
     self.logger = Logger(2)
     self.sender = Sender()
     # Calendar page of the Spanish first division on marca.com.
     self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
     self.raw_content = ''
     # Scraped results are persisted through a prefixed Mongo wrapper.
     self.writer = PrefixedMongoWrapper('marca')
     self.collection_name = 'current_season_results'
コード例 #2
0
    def __init__(self, log_detail_level=0):
        """Initialize ETL state: Mongo access, counters and history buffers.

        :param log_detail_level: verbosity forwarded to Logger
        """
        self.prefix = 'etl'
        self.mongo_wrapper = PrefixedMongoWrapper(self.prefix)
        self.collection = 'results_all'
        self.results = []

        self.logger = Logger(log_detail_level)
        self.counters = {}
        self.tournament_positions = {}

        # Recent per-team data; see self._update_counters.
        self.teams_recent_history = {}

        # Every per-team counter starts at zero.
        self.counter_template = dict.fromkeys([
            'played_home',
            'played_away',
            'score_competition',
            'matches_won_home',
            'matches_won_away',
            'matches_tied_home',
            'matches_tied_away',
            'matches_lost_home',
            'matches_lost_away',
            'goals_scored_home',
            'goals_scored_away',
            'goals_conceded_home',
            'goals_conceded_away',
            'num_days_without_goals_home',
            'num_days_without_goals_away',
            'num_days_without_victory_home',
            'num_days_without_victory_away',
            'ranking_home',
            'ranking_away',
        ], 0)
コード例 #3
0
    def __init__(self):
        """Prepare logger, Mongo wrapper and HTTP sender for squad scraping."""
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        self.collection = 'players'
        # Historical squads section of laliga.es; season/team appended later.
        self.base_url = 'http://www.laliga.es/estadisticas-historicas/plantillas/'
        # Sender configured with verbose debug output.
        self.sender = Sender()
        self.sender.set_debug_level(2)
コード例 #4
0
    def __init__(self):
        """Create the logger, the team-name normalizer and the blank record."""
        self.logger = Logger(2)
        self.mapper = TeamsNormalizer()

        # Blank record describing a single fixture; copied per match.
        self.template = dict(
            season='primera/2017-18',
            day_num='',
            home='',
            away='',
            score_home='',
            score_away='',
        )
コード例 #5
0
class PlayersScraper:
    """Scrapes season squads ("plantillas") from laliga.es and stores one
    document per player row in the 'players' Mongo collection."""

    def __init__(self):

        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        self.base_url = 'http://www.laliga.es/estadisticas-historicas/plantillas/'
        self.sender = Sender()
        self.sender.set_debug_level(2)
        self.collection = 'players'


    def get_teams(self, season):
        """Return the team slugs listed in the season's team selector.

        :param season: season path fragment, e.g. 'primera/2016-17'
        :return: list of non-empty team identifiers (empty list on failure)
        """
        response = self.sender.get(self.base_url + season, {})
        result = []
        if response != '':
            html = BeautifulSoup(response, 'html.parser')

            # The drop-down holds one <option> per team; the placeholder
            # option has an empty value and is skipped.
            for item in html.find('select', {'id': 'select_equipos_estadisticas_historicas'}).find_all('option'):

                team = item['value']
                if team != '':
                    result.append(team)

        return result

    def scrape_team(self, team, season):
        """Download one team's squad page and persist each player row.

        :param team: team slug as returned by get_teams
        :param season: season path fragment, e.g. 'primera/2016-17'
        """
        self.logger.debug('Processing ' + team + ' ' + season)
        response = self.sender.get(self.base_url + season + '/' + team, {})

        if response != '':
            html = BeautifulSoup(response, 'lxml')

            container = html.find('div', {'class': 'container main table clearfix'})
            for row in container.find('tbody').find_all('tr'):
                # Renamed from 'dict' to avoid shadowing the builtin.
                record = {
                    'player': row.find('td').getText(),
                    'team': team,
                    'season': season
                }

                self.mongo_wrapper.write_dictionary(self.collection, record)

    def scrape_season(self, season):
        """Scrape the squad of every team that played the given season."""
        for team in self.get_teams(season):
            self.scrape_team(team, season)
コード例 #6
0
    def __init__(self):
        """Create logger, Mongo wrapper, team normalizer and blank record."""
        self.logger = Logger(2)
        self.mongo_wrapper = PrefixedMongoWrapper('aggregated_match_results')
        self.mapper = TeamsNormalizer()

        # Blank record describing a single fixture; copied per match.
        self.template = dict(
            season='primera/2017-18',
            day_num='',
            home='',
            away='',
            score_home='',
            score_away='',
        )
コード例 #7
0
class LineUpManager:
    """Builds canonical line-up keys: sorted, underscore-joined player ids."""

    def __init__(self):
        self.logger = Logger()
        self.normalizer = PlayerNormalizer()

    def _resolve_ids(self, source, player_names):
        """Map raw player names to master ids via the normalizer.

        Unmatched names are logged (code 400) and skipped.  Extracted from
        the duplicated loops in create_by_match_id / create_by_list.

        :param source: normalizer source key, e.g. 'master' or 'marca'
        :param player_names: iterable of raw player names
        :return: list of matched player ids, as strings
        """
        result = []
        for name in player_names:
            player_id = self.normalizer.find_player_id(source, name)
            if player_id != '':
                result.append(str(player_id))
            else:
                self.logger.log(400, 'Unmatched player ' + name)
        return result

    def create_by_match_id(self, match_id, team):
        """Build the line-up key of a team's starting eleven for a match.

        Reads 'popups_matches_stats'; a line-up without exactly 11 players
        is reported (code 100) but its key is still returned.
        """
        mongo_wrapper = PrefixedMongoWrapper('laliga_web_primera')

        stats = mongo_wrapper.get_collection('popups_matches_stats').find({'match_id': match_id, 'team': team, 'main_lineup': True})

        result = self._resolve_ids('master',
                                   (stat['player'] for stat in stats))

        if len(result) != 11:
            self.logger.error(100, 'Lineup of ' + str(len(result)) + ' players')

        return self._generate_key(result)

    def _generate_key(self, player_ids):
        """Return the ids sorted lexicographically and joined with '_'."""
        # sorted() replaces the old copy()-then-sort() pair.
        return '_'.join(sorted(player_ids))

    def create_by_list(self, players):
        """Build the line-up key from an explicit list of marca names."""
        return self._generate_key(self._resolve_ids('marca', players))
コード例 #8
0
class MatchDetailsScraper:
    """Extracts the starting line-ups from a single match detail page."""

    def __init__(self, url):
        """:param url: full URL of the match detail page"""
        self.url = url
        self.logger = Logger(2)

    def extract_lineups(self):
        """Download the match page and return its line-ups.

        :return: dict with 'home' and 'away' keys; each value is a list of
                 player names, or '' when no line-up section is found.
        """
        sender = Sender()
        sender.set_debug_level(2)
        response = sender.get(self.url, {})
        self.logger.debug('Processing lineups of ' + self.url)

        html = BeautifulSoup(response, 'html.parser')
        result = {'home': '', 'away': ''}

        teams_container_main = html.find('section',
                                         {'class': 'laliga-fantasy columna2'})
        # PEP 8 idiom: 'is not None' instead of 'not ... is None'.
        if teams_container_main is not None:
            # The first two inner <section>s are home then away.
            teams_container = teams_container_main.find_all('section')[:2]
            home_container = teams_container[0]
            away_container = teams_container[1]

            result = {
                'home': self._extract_players(home_container),
                'away': self._extract_players(away_container)
            }
        else:
            self.logger.error(500, 'No lineup found in ' + self.url)

        return result

    def _extract_players(self, team_container):
        """Collect the player names (one per <li>) of a team container.

        A team with other than 11 players is reported (code 100); the list
        is returned unchanged either way.
        """
        result = []
        for player in team_container.find_all('li'):
            result.append(player.getText().strip())

        if len(result) != 11:
            self.logger.error(100,
                              'Team with ' + str(len(result)) + ' players')
        return result
コード例 #9
0
 def __init__(self):
     """Configure a throttled, verbose HTTP sender and the logger."""
     self.logger = Logger(2)
     self.sender = scrape_request.Sender()
     # Two-second delay between requests; trace every request.
     self.sender.set_delay(2)
     self.sender.set_debug_level(2)
コード例 #10
0
class PlayerNormalizer:
    """Maps player names from different sources onto a master player id.

    The mapping lives in a CSV file (one master name per row, one column
    per source); it is generated from Mongo data when missing and loaded
    lazily on first lookup.
    """

    def __init__(self):

        self.logger = Logger(2)
        self.default_csv_filename = './players_mapping.csv'
        self.loaded = False

    def find_player_id(self, source, player):
        """Return the master index of *player* as named by *source*.

        Logs an error (300) when several rows match (first one wins);
        logs (100) and returns '' when no row matches.
        """
        self._init_data()
        results_indexes = self.data['master'].index[self.data[source] ==
                                                    player]
        if len(results_indexes) > 1:
            self.logger.error(
                300, 'More than a candidate (' + str(len(results_indexes)) +
                '): ' + player)

        # Explicit "first match" instead of the old for-loop that only
        # ever executed its first iteration.
        if len(results_indexes) > 0:
            return results_indexes[0]

        self.logger.error(100, 'Cannot find map for ' + source + ': ' + player)
        return ''

    def _init_data(self):
        """Lazily load the mapping CSV, generating it first if missing."""
        # Idiom fix: 'not self.loaded' instead of '== False'.
        if not self.loaded:
            self.logger.debug('Loading map file')
            if not os.path.isfile(self.default_csv_filename):
                self.init_map_file()

            self.data = pd.read_csv(self.default_csv_filename)
            self.loaded = True

    def init_map_file(self):
        """Create the initial mapping CSV containing only the master column."""
        self.logger.debug('Generating master')

        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        # Master list: distinct players of the squads ("plantillas") section
        # plus the per-match stats of both divisions (the stats also cover
        # seasons missing from the squad data).
        result = mongo_wrapper.get_collection('players').distinct('player')
        result += mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection(
            'segunda_popups_matches_stats').distinct('player')

        self.logger.debug('Done')
        data = {'master': list(set(result))}
        self.save_csv(data)

    def _get_marca_list(self):
        """Collect the distinct player names seen in marca line-ups."""
        result = []

        mongo_wrapper = PrefixedMongoWrapper('marca_current_season')
        for day in mongo_wrapper.get_collection('results').find(
            {"results.home_lineup": {
                "$exists": True
            }}):
            for match in day['results']:
                result += match['home_lineup']
                result += match['away_lineup']

        return list(set(result))

    def normalize(self):
        """Fuzzy-match the marca player names against the master list."""
        self._init_data()
        self.logger.debug('Normalizing data...')

        return self._normalize_one('marca', self._get_marca_list())

    def save_csv(self, result):
        """Write the mapping dict to the CSV file (1-based row index)."""
        self.logger.debug('Creating ' + self.default_csv_filename)
        csv_filename = self.default_csv_filename

        repo = pd.DataFrame(result)
        repo.index += 1
        repo.to_csv(csv_filename)

    def get_valid_players(self):
        """Players considered valid match targets (recent-season activity)."""
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')

        result = mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection('players').find({
            'season':
            'primera/2016-17'
        }).distinct('player')
        result += mongo_wrapper.get_collection('players').find({
            'season':
            'segunda/2016-17'
        }).distinct('player')

        return list(set(result))

    def _normalize_one(self, source, players):
        """Fuzzy-match *players* against the master list.

        For every master player (restricted to get_valid_players), pick the
        most similar not-yet-matched candidate with ratio > 0.95.

        :return: dict with parallel 'master' and *source* lists ('' where
                 no candidate matched)
        """
        result = {
            'master': [],
            source: [],
        }

        num_matched = 0
        valid_players = self.get_valid_players()

        already_got = []

        for master_player in self.data['master']:

            best_similarity = 0
            second_best_similarity = 0
            matched = ''

            if master_player in valid_players:

                for player in players:

                    matcher = SequenceMatcher(
                        None, self.preprocess_name(master_player),
                        self.preprocess_name(player))
                    similarity = matcher.ratio()
                    if (similarity > best_similarity) and \
                            (similarity > 0.95) and \
                            (second_best_similarity < 0.60) and \
                            (player not in already_got):

                        second_best_similarity = best_similarity
                        best_similarity = similarity
                        matched = player

                if matched != '':
                    self.logger.debug('Matched "' + matched + '" with "' +
                                      master_player + '" ' +
                                      str(best_similarity))
                    already_got.append(matched)
                    num_matched += 1

            result['master'].append(master_player)
            result[source].append(matched)

        self.logger.debug(
            str(len(players)) + ' players, ' + str(num_matched) + ' matched')
        return result

    def preprocess_name(self, name):
        """Lower-case the name and strip commas before fuzzy comparison."""
        result = name.lower()
        result = result.replace(',', '')
        return result
コード例 #11
0
 def __init__(self):
     """Record the run start time and prepare lazy-writer state."""
     self.logger = Logger(2)
     self.collection = 'as_classifications_data'
     # Writer is created lazily; False means "not built yet".
     self.writer_obj = False
     # ISO timestamp identifying this scraping run.
     self.started = datetime.datetime.now().isoformat()
コード例 #12
0
class AsClassificationScraper:
    """Scrapes the league classification table from resultados.as.com."""

    def __init__(self):
        # Writer is built lazily in writer(); False means "not created yet".
        self.writer_obj = False
        self.collection = 'as_classifications_data'
        # ISO timestamp identifying this scraping run.
        self.started = datetime.datetime.now().isoformat()
        self.logger = Logger(2)

    def writer(self):
        """Return the Mongo writer, creating it on first use."""
        if not self.writer_obj:
            self.writer_obj = prefixed_mongo_wrapper.PrefixedMongoWrapper(self.collection)

        return self.writer_obj

    def scrape_page(self):
        """Download the classification page and rebuild the collection."""
        self.writer().drop_collection('classification')
        self.logger.debug('Downloading as web classifications data')

        sender = scrape_request.Sender()
        sender.set_debug_level(2)
        raw_html = sender.get('https://resultados.as.com/resultados/futbol/primera/clasificacion/', {})
        self.process_page(raw_html)

        self.logger.debug('Done')

    def process_page(self, raw_html):
        """Parse the main table and store one document per team row."""
        html = BeautifulSoup(raw_html, 'html.parser')

        # Bug fix: the attrs argument was the SET {'class', '...'} (comma
        # typo for colon), not a dict, so the class filter never applied.
        main_table = html.find('table', {'class': 'tabla-datos table-hover'})
        header = self.process_header(main_table)

        data_table = main_table.find('tbody')
        for row in data_table.find_all('tr'):
            self.process_row(row, header)

    def process_header(self, main_table):
        """Return the column captions ('.' stripped) of the table header."""
        result = []
        head = main_table.find_all('th', {'scope': 'col'})

        for column in head:
            result.append(column.getText().replace('.', ''))

        return result

    def process_row(self, row, header):
        """Store one team row, split into total/home/away stat buckets.

        Cells 1-7 are overall stats, 8-14 home stats, the rest away stats
        (cell 0 is the team-name header cell).
        """
        team_content = row.find('th').find('span', {'class': 'nombre-equipo'}).getText()
        result = {'process_id': self.started, 'team': team_content, 'total': {}, 'home': {}, 'away': {}}
        cell_counter = 1
        for cell in row.find_all('td'):

            if cell_counter <= 7:
                result['total'][header[cell_counter]] = cell.getText()
            else:
                if cell_counter > 7 and cell_counter <= 14:
                    result['home'][header[cell_counter]] = cell.getText()
                else:
                    result['away'][header[cell_counter]] = cell.getText()
            cell_counter = cell_counter + 1
        self.writer().write_dictionary('classification', result)
コード例 #13
0
 def __init__(self, logger=None):
     """Store the logger and reset the request delay.

     Bug fix: the old default ``logger=Logger()`` was evaluated once at
     function-definition time, so every instance created without an
     explicit logger shared the same Logger object.  A ``None`` sentinel
     gives each instance its own fresh Logger instead.

     :param logger: optional Logger; a new one is created when omitted
     """
     self.logger = Logger() if logger is None else logger
     self.delay = 0
コード例 #14
0
class MatchesFactsAggregator:
    '''
    Generates the match facts table (Mongo / CSV) from finished matches.

    self.counters accumulates per-team incremental values (e.g. goals
    scored to date).  To change the generated records, adjust the template
    in self.counter_template and the matching updates in _update_counters.
    '''
    def __init__(self, log_detail_level=0):
        """:param log_detail_level: verbosity forwarded to Logger"""
        self.prefix = 'etl'
        self.mongo_wrapper = PrefixedMongoWrapper(self.prefix)
        self.collection = 'results_all'
        self.results = []

        self.counters = {}
        self.tournament_positions = {}
        self.logger = Logger(log_detail_level)

        # Recent per-team data; see self._update_counters.
        self.teams_recent_history = {}
        # Every per-team counter starts at zero.
        self.counter_template = {
            'played_home': 0,
            'played_away': 0,
            'score_competition': 0,
            'matches_won_home': 0,
            'matches_won_away': 0,
            'matches_tied_home': 0,
            'matches_tied_away': 0,
            'matches_lost_home': 0,
            'matches_lost_away': 0,
            'goals_scored_home': 0,
            'goals_scored_away': 0,
            'goals_conceded_home': 0,
            'goals_conceded_away': 0,
            'num_days_without_goals_home': 0,
            'num_days_without_goals_away': 0,
            'num_days_without_victory_home': 0,
            'num_days_without_victory_away': 0,
            'ranking_home': 0,
            'ranking_away': 0
        }

    def _update_counters(self, match):
        """Fold one finished match into the per-team counters.

        Matches without a result (empty score -> _winner returns '') are
        ignored.
        """
        match_winner = self._winner(match)

        if match_winner != '':

            # Goals scored by each side
            self._add_to_counter(match['home'], 'goals_scored_home',
                                 match['score_home'])
            self._add_to_counter(match['away'], 'goals_scored_away',
                                 match['score_away'])

            # Matches played
            self._add_to_counter(match['home'], 'played_home', 1)
            self._add_to_counter(match['away'], 'played_away', 1)

            # Goals conceded by each side
            self._add_to_counter(match['home'], 'goals_conceded_home',
                                 match['score_away'])
            self._add_to_counter(match['away'], 'goals_conceded_away',
                                 match['score_home'])

            # Matches won / tied / lost, seen from each team's perspective
            key_map = {
                'home': 'matches_won_home',
                'away': 'matches_lost_home',
                'none': 'matches_tied_home'
            }
            self._add_to_counter(match['home'], key_map[match_winner], 1)

            key_map = {
                'home': 'matches_lost_away',
                'away': 'matches_won_away',
                'none': 'matches_tied_away'
            }
            self._add_to_counter(match['away'], key_map[match_winner], 1)

            # Append the goals scored to each team's recent history
            self.teams_recent_history[match['home']]['goals'].append(
                int(match['score_home']))
            self.teams_recent_history[match['away']]['goals'].append(
                int(match['score_away']))

            # Sum of goals scored over the last 5 match days
            self._set_counter(
                match['home'], 'ranking_home',
                sum(self.teams_recent_history[match['home']]['goals'][-5:]))
            self._set_counter(
                match['away'], 'ranking_away',
                sum(self.teams_recent_history[match['away']]['goals'][-5:]))

            # League points (3 / 1 / 0)
            key_map = {'home': 3, 'away': 0, 'none': 1}
            self._add_to_counter(match['home'], 'score_competition',
                                 key_map[match_winner])

            key_map = {'home': 0, 'away': 3, 'none': 1}
            self._add_to_counter(match['away'], 'score_competition',
                                 key_map[match_winner])

            # Match days without a win
            if match_winner == 'home':
                self._set_counter(match['home'],
                                  'num_days_without_victory_home', 0)
                self._add_to_counter(match['away'],
                                     'num_days_without_victory_away', 1)

            if match_winner == 'away':
                # Bug fix: the *_home / *_away keys were swapped here, so an
                # away win reset the away team's "home" streak and bumped
                # the home team's "away" streak.
                self._set_counter(match['away'],
                                  'num_days_without_victory_away', 0)
                self._add_to_counter(match['home'],
                                     'num_days_without_victory_home', 1)

            if match_winner == 'none':
                self._add_to_counter(match['home'],
                                     'num_days_without_victory_home', 1)
                self._add_to_counter(match['away'],
                                     'num_days_without_victory_away', 1)

            # Match days without scoring
            if int(match['score_home']) > 0:
                self._set_counter(match['home'], 'num_days_without_goals_home',
                                  0)
            else:
                self._add_to_counter(match['home'],
                                     'num_days_without_goals_home', 1)

            if int(match['score_away']) > 0:
                self._set_counter(match['away'], 'num_days_without_goals_away',
                                  0)
            else:
                self._add_to_counter(match['away'],
                                     'num_days_without_goals_away', 1)

    def _generate_tournament_positions(self):
        """Rebuild self.tournament_positions (team -> position) from the
        current score_competition counters; equal scores share a position."""
        tournament_scores = {}

        raw_data = self.counters

        for team in raw_data.keys():

            score_competition = raw_data[team]['score_competition']

            if score_competition not in tournament_scores.keys():
                tournament_scores[score_competition] = []

            tournament_scores[score_competition].append(team)

        tournament_positions = {}
        current_position = 1
        for team_score in reversed(sorted(tournament_scores)):

            if current_position not in tournament_positions:
                tournament_positions[current_position] = []

            tournament_positions[current_position] += tournament_scores[
                team_score]
            current_position += 1

        result = {}
        for current_position in tournament_positions.keys():
            for team in tournament_positions[current_position]:
                result[team] = current_position

        self.tournament_positions = result

    def process_matches_played(self, season):
        """Accumulate facts for every *finished* match of the season."""
        self._init_counters(season)
        for match in self._collection().find({
                'season': season
        }).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] != '':
                self._add_to_results(entry)

    def process_matches_to_play(self, season):
        """Accumulate facts for every *pending* match of the season."""
        self._init_counters(season)
        for match in self._collection().find({
                'season': season
        }).sort([('day_num', pymongo.ASCENDING)]):
            entry = self._process_match(match)
            if entry['winner'] == '':
                self._add_to_results(entry)

    def write_data_mongo(self):
        '''
        Write the processed results to Mongo (rebuilds the collection).
        :return:
        '''
        self.mongo_wrapper.drop_collection('aggregated_results')
        self.mongo_wrapper.write_dictionaries_list('aggregated_results',
                                                   self.results)

    def write_data_csv(self, filename):
        '''
        Export the processed results to a CSV file.
        :param filename:
        :return:
        '''
        import pandas as pd

        data = {}
        for column in self.results[0].keys():
            data[column] = []

        for result in self.results:
            for attribute_name in result.keys():
                data[attribute_name].append(result[attribute_name])

        repo = pd.DataFrame(data)
        repo.to_csv(filename)

    def _process_match(self, match):
        """Build one fact record for *match* from the counters as they stand
        BEFORE the match, then fold the match into the counters."""
        self.logger.debug('processing ' + str(match))

        home_stats = self.counters[match['home']]
        away_stats = self.counters[match['away']]

        entry = {}
        self._generate_tournament_positions()

        entry['score_competition_diff'] = self.counters[
            match['home']]['score_competition'] - self.counters[
                match['away']]['score_competition']
        entry['tournament_position_home'] = self.tournament_positions[
            match['home']]
        entry['tournament_position_away'] = self.tournament_positions[
            match['away']]

        entry['season'] = match['season']
        entry['day_num'] = match['day_num']

        entry['team_home'] = match['home']
        entry['team_away'] = match['away']

        entry['lineup_home'] = match['lineup_home']
        entry['lineup_away'] = match['lineup_away']

        entry['score_home'] = match['score_home']
        entry['score_away'] = match['score_away']

        entry['winner'] = self._winner(match)

        # Snapshot of the pre-match counters, suffixed per side
        for key in home_stats.keys():
            entry[key + '_home'] = home_stats[key]

        for key in away_stats.keys():
            entry[key + '_away'] = away_stats[key]

        self._update_counters(match)

        return entry

    def _winner(self, match):
        """Return 'home' / 'away' / 'none' (tie), or '' when not played."""
        if match['score_home'] == '':
            return ''
        if int(match['score_home']) > int(match['score_away']):
            return 'home'
        if int(match['score_home']) < int(match['score_away']):
            return 'away'
        if int(match['score_home']) == int(match['score_away']):
            return 'none'

    def _add_to_results(self, entry):
        self.results.append(entry)

    def _add_to_counter(self, team, key, qty):
        self.counters[team][key] += int(qty)

    def _set_counter(self, team, key, value):
        self.counters[team][key] = value

    def _init_counters(self, season):
        """Reset counters and recent history for every team of *season*."""
        self.counters = {}
        self.teams_recent_history = {}

        for team in self._collection().find({
                'season': season
        }).distinct("home"):
            self.counters[team] = self.counter_template.copy()
            self.teams_recent_history[team] = {'goals': []}

    def _collection(self):
        return self.mongo_wrapper.get_collection(self.collection)
コード例 #15
0
    def __init__(self):
        """Set up logging and the lazily-loaded player-mapping CSV state."""
        self.logger = Logger(2)
        # Mapping file is read on first use; see the loaded flag.
        self.loaded = False
        self.default_csv_filename = './players_mapping.csv'
コード例 #16
0
class ResultsMerger:
    """Merges archived league results with the current marca season into a
    single normalized list of match records."""

    def __init__(self):

        self.logger = Logger(2)
        self.mapper = TeamsNormalizer()

        # Blank record describing a single fixture; copied per match.
        self.template = {
            'season': 'primera/2017-18',
            'day_num': '',
            'home': '',
            'away': '',
            'score_home': '',
            'score_away': ''
        }

    def merge(self):
        """Return archive results plus current-season results as one list."""
        self.logger.debug('Merging...')

        results = []
        results += self._get_archive_results()
        results += self._get_current_results()

        self.logger.debug('Processed ' + str(len(results)) + ' matches')

        self.logger.debug('Done')
        return results

    def save(self, result):
        """Rebuild the 'all' collection of 'etl_results' with *result*."""
        self.logger.debug('Saving')
        wrapper = PrefixedMongoWrapper('etl_results')
        wrapper.drop_collection('all')
        wrapper.write_dictionaries_list('all', result)
        self.logger.debug('Done')

    def _get_archive_results(self):
        """Load the 2015-16 / 2016-17 archive matches, including line-ups."""
        self.logger.debug(
            'Getting archive results ... this will take time ...')
        wrapper = scraping.laliga.utils.create_mongo_writer()
        archive = wrapper.get_collection('primera_results').\
            find({'season': {'$in': ['primera/2015-16', 'primera/2016-17']}}).\
            sort([('day_num', pymongo.ASCENDING)])

        result = []

        lineup_manager = LineUpManager()

        # Historical matches
        for archive_match in archive:

            entry = self.template.copy()
            entry['season'] = archive_match['season']
            entry['day_num'] = int(archive_match['day_num'])
            entry['home'] = archive_match['home']
            entry['away'] = archive_match['away']
            entry['score_home'] = int(archive_match['score_home'])
            entry['score_away'] = int(archive_match['score_away'])
            entry['lineup_home'] = lineup_manager.create_by_match_id(
                archive_match['match_id'], archive_match['home'])
            entry['lineup_away'] = lineup_manager.create_by_match_id(
                archive_match['match_id'], archive_match['away'])

            self.logger.debug('Processing ' + str(entry))
            result.append(entry)

        return result

    def _extract_scores(self, text):
        """Parse 'H-A' into ints; return {} when *text* is not a score."""
        result = {}
        splitted = text.split('-')

        if len(splitted) == 2:
            # Scores have at most two characters per side; anything longer
            # is a date or free text such as 'Sab-  13:00'.
            if (len(splitted[0]) <= 2) and (len(splitted[1]) <= 2):
                result['score_home'] = int(splitted[0])
                result['score_away'] = int(splitted[1])

        return result

    def _get_current_results(self):
        """Load the current season scraped from marca, normalizing teams."""
        self.logger.debug('Getting current season results')

        result = []
        wrapper = PrefixedMongoWrapper('marca')
        lineup_manager = LineUpManager()

        # Current matches downloaded from the marca web site
        for day in wrapper.get_collection('current_season_results').find({}):
            for match in day['results']:
                match['result'] = self._marca_process_result(match['result'])

                entry = self.template.copy()
                entry['day_num'] = int(day['day']['num_day'].replace(
                    'Jornada', '').strip())
                entry['home'] = self.mapper.find_team_id(
                    'marca', match['home'])
                entry['away'] = self.mapper.find_team_id(
                    'marca', match['away'])

                scores = self._extract_scores(match['result'])

                if len(scores) == 2:
                    entry['score_home'] = scores['score_home']
                    entry['score_away'] = scores['score_away']
                else:
                    # Not played yet: keep the raw cell text in both fields.
                    entry['score_home'] = match['result']
                    entry['score_away'] = match['result']

                # Idiom fix: membership test on the dict itself instead of
                # the old 'in match.keys()'.
                if 'home_lineup' in match:
                    entry['lineup_home'] = lineup_manager.create_by_list(
                        match['home_lineup'])
                else:
                    entry['lineup_home'] = ''

                if 'away_lineup' in match:
                    entry['lineup_away'] = lineup_manager.create_by_list(
                        match['away_lineup'])
                else:
                    entry['lineup_away'] = ''

                result.append(entry)
        return result

    def _marca_process_result(self, text):
        '''
        Marca result cells can hold three kinds of content, e.g.:
        1-1
        Sab-  13:00
        Sin confirmar

        Filtered simply by text length.

        :param text: raw cell text
        :return: the text when short enough to be a score, else ''
        '''
        result = ''
        if len(text) < 8:
            result = text

        return result
コード例 #17
0
 def __init__(self):
     """Create the player-name normalizer and the logger used for lookups."""
     self.normalizer = PlayerNormalizer()
     self.logger = Logger()
コード例 #18
0
class SeasonScraper:
    """Scrapes a season's calendar page from laliga.es.

    For every match day it extracts the played matches (teams, scores,
    a synthetic match id) and delegates the per-match detail popup to
    ``PopUpScraper``; the collected rows are written to mongo.
    """

    def __init__(self):
        self.sender = scrape_request.Sender()
        self.sender.set_delay(2)
        self.sender.set_debug_level(2)
        self.logger = Logger(2)

    def scrape_page(self, season):
        """Download and process one season's calendar.

        :param season: path fragment such as 'league/2016-17'; the part
            before the first '/' selects the mongo writer (league).
        """
        league = season.split('/')[0]
        page_content = self.do_request(season)
        writer = utils.create_mongo_writer(league)

        # Nothing to parse (and nothing is written) when the request failed.
        if not page_content:
            return

        html = BeautifulSoup(page_content, 'html.parser')
        days = html.find_all('div', {'class': 'jornada-calendario-historico'})

        matches_results = []

        for day in days:
            table_title = day.find('div', {'class': 'nombre_jornada'})
            day_str = self.extract_day(table_title.contents[0])
            day_num = self.extract_daynum(table_title.contents[0])

            tables = day.find_all(
                'table', {'class': 'tabla_jornada_calendario_historico'})

            for table in tables:
                # Only rows wired to the match-details popup are matches.
                rows = table.find_all(
                    'tr', {'onclick': re.compile('^abrir_partido')})

                for row in rows:
                    js_params = self.extract_popup_win_js_params(
                        row['onclick'])
                    if not js_params:
                        continue

                    match_id = '_'.join([
                        js_params['temporada'],
                        js_params['jornada'],
                        js_params['equipo'],
                        js_params['competicion'],
                    ])

                    cell = row.find('td')
                    txt = self.extract_result(str(cell.contents[0]))

                    matches_results.append({
                        'season': season,
                        'day': day_str,
                        'day_num': day_num,
                        'home': txt.group(1).strip(),
                        'away': txt.group(3).strip(),
                        'score_home': txt.group(2),
                        'score_away': txt.group(4),
                        'match_id': match_id,
                    })
                    popup_scraper = PopUpScraper.PopUpScraper(
                        match_id, writer)
                    popup_scraper.scrape_popup(js_params)

        writer.write_dictionaries_list('results', matches_results)

    def do_request(self, path):
        """GET the calendar page for ``path`` and return the raw body."""
        url = ('http://www.laliga.es/estadisticas-historicas/calendario/'
               + path + '/')
        return self.sender.get(url, {})

    def extract_result(self, content_to_process):
        '''
        Parse one result cell.

        :param content_to_process: HTML such as
            '<span>RCD Mallorca: <b>1</b><br/>Real Madrid: <b>2</b></span>'
        :return: re match with 4 groups (home team, home goals, away team,
            away goals), or None when the cell does not match.
        '''
        # Colons are stripped so team names end cleanly before the <b> tags.
        content_to_process = content_to_process.replace(":", "")
        cell_pattern = r'<span>(.+?)<b>(.+?)</b><br/>(.+?)<b>(.+?)</b></span>'
        return re.search(cell_pattern, content_to_process)

    def extract_daynum(self, content_to_process):
        '''
        :param content_to_process: text such as 'Jornada: 02 - 26/08/2016'
        :return: the first run of digits (the match-day number), stripped.
        '''
        parsed = re.search(r'(\d+)', content_to_process)
        return parsed.group(1).strip()

    def extract_day(self, content_to_process):
        '''
        :param content_to_process: text containing a date like '26/08/2016'
        :return: that date reformatted as 'dd-mm-yyyy'.
        '''
        parsed = re.search(r'(\d+/\d+/\d+)', content_to_process)
        return parsed.group(1).replace("/", "-").strip()

    def extract_popup_win_js_params(self, function_call_str):
        '''
        Extract the arguments of the JS call that opens a match popup,
        e.g. 'abrir_partido(115,37,"barcelona",1)'.

        :return: dict with the four parameters, or False (after logging)
            when the string does not match.
        '''
        pattern = r'abrir_partido\((.+?),(.+?),"(.+?)",(.+?)\)'
        parsed = re.search(pattern, function_call_str)

        if parsed is None:
            self.logger.error(
                400,
                'Error in extract_popup_win_js_params ' + function_call_str)
            return False
        return {
            'temporada': parsed.group(1),
            'jornada': parsed.group(2),
            'equipo': parsed.group(3),
            'competicion': parsed.group(4),
        }
# --- Code example #19 ---
 def __init__(self, url):
     """Remember the page URL to scrape and set up a verbose logger.

     :param url: URL of the page this scraper will download.
     """
     self.url = url
     self.logger = Logger(2)
# --- Code example #20 ---
class ResultsCurrentYearScraper:
    """Scrapes the current-season calendar from marca.com.

    Each match day (header info plus per-match results and, when a match
    page is linked, both line-ups) is written as one document into the
    'current_season_results' mongo collection.
    """

    def __init__(self):
        self.sender = Sender()
        self.logger = Logger(2)
        self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
        self.raw_content = ''
        self.writer = PrefixedMongoWrapper('marca')
        self.collection_name = 'current_season_results'

    def _getPage(self):
        """Download the calendar page; abort the run when the body is empty."""
        self.raw_content = self.sender.get(self.url, {})
        if self.raw_content == '':
            self.logger.error(500, 'Empty page')
            exit()

    def scrape(self):
        """Fetch the calendar, wipe the collection and store every match day."""
        self.logger.debug('Downloading marca web data')

        self._getPage()
        self.writer.drop_collection(self.collection_name)

        parsed = BeautifulSoup(self.raw_content, 'html.parser')
        day_tables = parsed.find_all('li', {'id': 'contenedorCalendarioInt'})

        for day_table in day_tables:
            day_info = self.extract_day(day_table)
            self.logger.debug('Processing "' + day_info['num_day'] + ', ' +
                              day_info['date'] + '"')
            document = {
                'day': day_info,
                'results': self.process_results(day_table),
            }
            self.writer.write_dictionary(self.collection_name, document)

        self.logger.debug('Done')

    def extract_day(self, day_table):
        """Return {'num_day', 'date'} parsed from a day-table header."""
        header = day_table.find('span')
        return {
            'num_day': header.find('h2').getText(),
            'date': header.contents[2].strip(),
        }

    def process_results(self, day_table):
        """Parse every match row of one day.

        :return: list of dicts with home/away/result and, when the row
            links a match page, home_lineup/away_lineup.
        """
        field_names = ('home', 'away', 'result')
        results = []
        match_list = day_table.find('ul', {'class': 'partidos-jornada'})
        for row in match_list.find_all('a'):
            match = {}
            # The three <span> cells repeat in home/away/result order.
            for idx, cell in enumerate(row.find_all('span')):
                match[field_names[idx % 3]] = cell.getText()
            if row.has_attr('href'):
                lineups = self._get_lineups(row['href'])
                match['home_lineup'] = lineups['home']
                match['away_lineup'] = lineups['away']

            results.append(match)

        self.logger.debug('Inserted ' + str(len(results)) + ' items')
        return results

    def _get_lineups(self, url):
        """Delegate line-up extraction to the match-details scraper."""
        return MatchDetailsScraper(url).extract_lineups()