Example #1

# Imports this example needs to run; Logger and Sender are project-local
# modules (their exact import paths are an assumption).
from bs4 import BeautifulSoup
class MatchDetailsScraper:
    def __init__(self, url):
        self.url = url
        self.logger = Logger(2)

    def extract_lineups(self):

        sender = Sender()
        sender.set_debug_level(2)
        response = sender.get(self.url, {})
        self.logger.debug('Processing lineups of ' + self.url)

        html = BeautifulSoup(response, 'html.parser')
        result = {'home': '', 'away': ''}

        teams_container_main = html.find('section',
                                         {'class': 'laliga-fantasy columna2'})
        if teams_container_main is not None:
            teams_container = teams_container_main.find_all('section')[:2]
            home_container = teams_container[0]
            away_container = teams_container[1]

            result = {
                'home': self._extract_players(home_container),
                'away': self._extract_players(away_container)
            }
        else:
            self.logger.error(500, 'No lineup found in ' + self.url)

        return result

    def _extract_players(self, team_container):
        result = []
        for player in team_container.find_all('li'):
            result.append(player.getText().strip())

        #self.logger.debug('Retrieving ' + str(result))
        if len(result) != 11:
            self.logger.error(100,
                              'Team with ' + str(len(result)) + ' players')
        return result
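
A minimal usage sketch for this class (the match URL below is hypothetical; Logger and Sender come from the project's own modules):

scraper = MatchDetailsScraper('http://www.marca.com/some-match.html')
lineups = scraper.extract_lineups()
print(lineups['home'])  # list of the 11 home players if a lineup was found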
Example #2

# Imports this example needs to run; scrape_request, utils, PopUpScraper,
# and Logger are project-local modules (their exact import paths are an
# assumption).
import re
from bs4 import BeautifulSoup
class SeasonScraper:
    def __init__(self):
        self.sender = scrape_request.Sender()
        self.sender.set_delay(2)
        self.sender.set_debug_level(2)
        self.logger = Logger(2)

    def scrape_page(self, season):

        league = season.split('/')[0]
        page_content = self.do_request(season)
        writer = utils.create_mongo_writer(league)

        if page_content:
            html = BeautifulSoup(page_content, 'html.parser')
            days = html.find_all('div',
                                 {'class': 'jornada-calendario-historico'})

            matches_results = []

            for day in days:

                table_title = day.find('div', {'class': 'nombre_jornada'})
                day_str = self.extract_day(table_title.contents[0])
                day_num = self.extract_daynum(table_title.contents[0])

                tables = day.find_all(
                    'table', {'class': 'tabla_jornada_calendario_historico'})

                for table in tables:
                    rows = table.find_all(
                        'tr', {'onclick': re.compile('^abrir_partido')})

                    for row in rows:
                        js_params = self.extract_popup_win_js_params(
                            row['onclick'])

                        if js_params:

                            match_id = '_'.join([
                                js_params['temporada'], js_params['jornada'],
                                js_params['equipo'], js_params['competicion']
                            ])

                            cell = row.find('td')
                            content_to_process = str(cell.contents[0])
                            txt = self.extract_result(content_to_process)

                            matches_results.append({
                                'season': season,
                                'day': day_str,
                                'day_num': day_num,
                                'home': txt.group(1).strip(),
                                'away': txt.group(3).strip(),
                                'score_home': txt.group(2),
                                'score_away': txt.group(4),
                                'match_id': match_id
                            })
                            popup_scraper = PopUpScraper.PopUpScraper(
                                match_id, writer)
                            popup_scraper.scrape_popup(js_params)

            writer.write_dictionaries_list('results', matches_results)

    def do_request(self, path):
        sender = self.sender
        url = 'http://www.laliga.es/estadisticas-historicas/calendario/' + path + '/'

        return sender.get(url, {})

    def extract_result(self, content_to_process):
        '''
        :param content_to_process: strings like '<span>RCD Mallorca: <b>1</b><br>Real Madrid: <b>2</b></span>'
        :return: regex match whose four groups are the two team names and their goals
        '''
        content_to_process = content_to_process.replace(":", "")
        cell_pattern_str = '<span>(.+?)<b>(.+?)</b><br/>(.+?)<b>(.+?)</b></span>'
        return re.search(cell_pattern_str, content_to_process)

    def extract_daynum(self, content_to_process):
        '''
        :param content_to_process: strings like 'Jornada: 02 - 26/08/2016'
        :return: matchday number as a string
        '''
        cell_pattern_str = r'(\d+)'
        parsed = re.search(cell_pattern_str, content_to_process)
        return parsed.group(1).strip()

    def extract_day(self, content_to_process):
        '''
        :param content_to_process: strings like 'Jornada: 02 - 26/08/2016'
        :return: date in dd-mm-yyyy format
        '''
        cell_pattern_str = r'(\d+/\d+/\d+)'
        parsed = re.search(cell_pattern_str, content_to_process)
        return parsed.group(1).replace('/', '-').strip()

    def extract_popup_win_js_params(self, function_call_str):
        '''
        Extracts the parameters to pass to the JS call that opens the pop-up
        with a match's details, e.g. 'abrir_partido(115,37,"barcelona",1)'.
        '''
        pattern = r'abrir_partido\((.+?),(.+?),"(.+?)",(.+?)\)'
        parsed = re.search(pattern, function_call_str)

        if parsed is None:
            self.logger.error(
                400,
                'Error in extract_popup_win_js_params ' + function_call_str)
            return False
        else:
            return {
                'temporada': parsed.group(1),
                'jornada': parsed.group(2),
                'equipo': parsed.group(3),
                'competicion': parsed.group(4)
            }
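
A minimal usage sketch for this class (the season path format comes from do_request; 'primera/2016-17' is an assumed example value):

scraper = SeasonScraper()
scraper.scrape_page('primera/2016-17')  # scrapes every matchday and writes the results to Mongo

Example #3

# Imports this example needs to run; Sender, Logger, PrefixedMongoWrapper,
# and MatchDetailsScraper (Example #1) are project-local (their exact import
# paths are an assumption).
from bs4 import BeautifulSoup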
class ResultsCurrentYearScraper:
    def __init__(self):
        self.sender = Sender()
        self.logger = Logger(2)
        self.url = 'http://www.marca.com/futbol/primera-division/calendario.html'
        self.raw_content = ''
        self.writer = PrefixedMongoWrapper('marca')
        self.collection_name = 'current_season_results'

    def _getPage(self):

        self.raw_content = self.sender.get(self.url, {})

        if self.raw_content == '':
            self.logger.error(500, 'Empty page')
            raise SystemExit  # nothing to scrape, abort

    def scrape(self):
        self.logger.debug('Downloading marca web data')

        self._getPage()
        self.writer.drop_collection(self.collection_name)

        html = BeautifulSoup(self.raw_content, 'html.parser')

        for day_table in html.find_all('li',
                                       {'id': 'contenedorCalendarioInt'}):
            day_info = self.extract_day(day_table)
            self.logger.debug('Processing "' + day_info['num_day'] + ', ' +
                              day_info['date'] + '"')
            results = self.process_results(day_table)

            dictionary_to_insert = {'day': day_info, 'results': results}
            self.writer.write_dictionary(self.collection_name,
                                         dictionary_to_insert)

        self.logger.debug('Done')

    def extract_day(self, day_table):
        header = day_table.find('span')
        num_day = header.find('h2').getText()
        date = header.contents[2].strip()

        return {'num_day': num_day, 'date': date}

    def process_results(self, day_table):
        results = []
        matches_list = day_table.find('ul', {'class': 'partidos-jornada'})
        for row in matches_list.find_all('a'):
            result = {}
            colmap = {0: 'home', 1: 'away', 2: 'result'}
            for counter, cell in enumerate(row.find_all('span')):
                result[colmap[counter % 3]] = cell.getText()
            if row.has_attr('href'):
                lineups = self._get_lineups(row['href'])
                result['home_lineup'] = lineups['home']
                result['away_lineup'] = lineups['away']

            results.append(result)

        self.logger.debug('Processed ' + str(len(results)) + ' items')
        return results

    def _get_lineups(self, url):
        scraper = MatchDetailsScraper(url)
        return scraper.extract_lineups()
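
A minimal usage sketch for this class (it drops and rebuilds the 'current_season_results' collection in the 'marca' Mongo database):

scraper = ResultsCurrentYearScraper()
scraper.scrape()  # downloads the calendar page and stores one document per matchday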
Example #4

# Imports this example needs to run; Logger and PrefixedMongoWrapper are
# project-local modules (their exact import paths are an assumption).
import os
import pandas as pd
from difflib import SequenceMatcher
class PlayerNormalizer:
    def __init__(self):

        self.logger = Logger(2)
        self.default_csv_filename = './players_mapping.csv'
        self.loaded = False

    def find_player_id(self, source, player):

        self._init_data()
        results_indexes = self.data['master'].index[self.data[source] == player]
        if len(results_indexes) > 1:
            self.logger.error(
                300, 'More than one candidate (' +
                str(len(results_indexes)) + '): ' + player)

        # return the first matching master index, if any
        if len(results_indexes) > 0:
            return results_indexes[0]

        self.logger.error(100, 'Cannot find map for ' + source + ': ' + player)
        return ''

    def _init_data(self):
        if not self.loaded:
            self.logger.debug('Loading map file')
            if not os.path.isfile(self.default_csv_filename):
                self.init_map_file()

            self.data = pd.read_csv(self.default_csv_filename)
            self.loaded = True

    def init_map_file(self):
        self.logger.debug('Generating master')

        mongo_wrapper = PrefixedMongoWrapper('laliga_web')
        # data taken from the "plantillas" (squads) section
        #result = mongo_wrapper.get_collection('players').find({'season': 'primera/2016-17'}).distinct('player')
        #result += mongo_wrapper.get_collection('players').find({'season': 'segunda/2016-17'}).distinct('player')

        # the 1928-29 season is missing from there, so we merge in the players
        # from the primera and segunda match results
        result = mongo_wrapper.get_collection('players').distinct('player')
        result += mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection(
            'segunda_popups_matches_stats').distinct('player')

        self.logger.debug('Done')
        data = {'master': list(set(result))}
        self.save_csv(data)

    def _get_marca_list(self):
        result = []

        mongo_wrapper = PrefixedMongoWrapper('marca_current_season')
        for day in mongo_wrapper.get_collection('results').find(
                {'results.home_lineup': {'$exists': True}}):
            for match in day['results']:
                result += match['home_lineup']
                result += match['away_lineup']

        return list(set(result))

    def normalize(self):
        self._init_data()
        self.logger.debug('Normalizing data...')

        return self._normalize_one('marca', self._get_marca_list())

    def save_csv(self, result):
        self.logger.debug('Creating ' + self.default_csv_filename)
        csv_filename = self.default_csv_filename

        repo = pd.DataFrame(result)
        repo.index += 1
        repo.to_csv(csv_filename)

    def get_valid_players(self):
        mongo_wrapper = PrefixedMongoWrapper('laliga_web')

        result = mongo_wrapper.get_collection(
            'primera_popups_matches_stats').distinct('player')
        result += mongo_wrapper.get_collection('players').find(
            {'season': 'primera/2016-17'}).distinct('player')
        result += mongo_wrapper.get_collection('players').find(
            {'season': 'segunda/2016-17'}).distinct('player')

        return list(set(result))

    def _normalize_one(self, source, players):

        result = {
            'master': [],
            source: [],
        }

        num_matched = 0
        valid_players = self.get_valid_players()
        #print(valid_players)
        #exit()

        already_got = []

        for master_player in self.data['master']:

            best_similarity = 0
            second_best_similarity = 0
            matched = ''

            if master_player in valid_players:

                # fuzzy-match the master name against each candidate; accept a
                # new best only when it is very close (> 0.95), the runner-up
                # is clearly worse (< 0.60), and the candidate is unclaimed
                for player in players:

                    matcher = SequenceMatcher(
                        None, self.preprocess_name(master_player),
                        self.preprocess_name(player))
                    similarity = matcher.ratio()
                    if (similarity > best_similarity) and \
                            (similarity > 0.95) and \
                            (second_best_similarity < 0.60) and \
                            (player not in already_got):
                        second_best_similarity = best_similarity
                        best_similarity = similarity
                        matched = player

                if matched != '':
                    self.logger.debug('Matched "' + matched + '" with "' +
                                      master_player + '" ' +
                                      str(best_similarity))
                    already_got.append(matched)
                    num_matched += 1

            result['master'].append(master_player)
            result[source].append(matched)

        self.logger.debug(
            str(len(players)) + ' players, ' + str(num_matched) + ' matched')
        return result

    def preprocess_name(self, name):
        result = name.lower()
        result = result.replace(',', '')
        return result
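
A minimal usage sketch for this class (it reads or creates ./players_mapping.csv and expects the Mongo collections referenced above to exist):

normalizer = PlayerNormalizer()
mapping = normalizer.normalize()  # {'master': [...], 'marca': [...]}
normalizer.save_csv(mapping)      # persists the master-to-marca name map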