Example #1
import asyncio
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from lxml import html

# ScrapeError, Tournament, Event and Fencer are project-specific classes that
# are referenced below but defined elsewhere.


class Scraper:
    def __init__(self, tournament_url):
        self.tournament_url = tournament_url

    def scrape(self):
        """Scrape the tournament page and its events; return the Tournament."""

        # Get tournament info
        try:
            results = requests.get(self.tournament_url)
        except requests.exceptions.MissingSchema:
            # Retry with an explicit scheme if the URL was given without one.
            results = requests.get("http://{}".format(self.tournament_url))
        results_tree = html.fromstring(results.content)
        try:
            tournament_name = results_tree.xpath(
                '//span[@class="tournName"]/text()')[0]
            updated = (results_tree.xpath('//span[@class="lastUpdate"]/text()')
                       [0].replace('Last Updated:', '').strip())
        except IndexError:
            raise ScrapeError("Tournament info not found.")

        self.tournament = Tournament(tournament_name, results.url, updated)

        # Get tournament events
        event_urls = results_tree.xpath(
            '//div[@id="schedule"]/table/tr/td/a[text()="View"]/@href')
        # xpath() returns a list, so check for an empty result directly
        # rather than catching IndexError.
        if not event_urls:
            raise ScrapeError("No event schedule found.")

        asyncio.run(self.scrape_events(event_urls))

        return self.tournament

    async def scrape_events(self, event_urls):
        """Fetch all event pages concurrently and add them to the tournament."""

        with ThreadPoolExecutor(max_workers=20) as executor:
            loop = asyncio.get_running_loop()
            futures = []

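            # requests.get blocks, so each download is handed off to the
            # thread pool and collected as a future.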
            for event_url in event_urls:
                if not urlparse(event_url).netloc:
                    event_url = urljoin(self.tournament.url, event_url)
                futures.append(
                    loop.run_in_executor(executor, requests.get, event_url))

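            # Wait for every download to finish, then parse each event page
            # and attach it to the tournament.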
            for response in await asyncio.gather(*futures):
                event = self.parse_event(response)
                self.tournament.add_event(event)

            self.tournament.count_all_fencers()

    def parse_event(self, response):
        """Parse one fetched event page into an Event."""
        # Get the event details (name, time) as text
        event_tree = html.fromstring(response.content)
        event_details = event_tree.xpath(
            '//span[@class="tournDetails"]/text()')
        try:
            event_name = event_details[0]
            event_time = event_details[1]
        except IndexError:
            raise ScrapeError(
                "Failed to interpret live results for event at {}.".format(
                    response.url))

        # Get the event status
        if event_tree.xpath('//a[text()="Final Results"]'):
            fencers = event_tree.xpath(
                '//div[@id="finalResults"]/table/tr/td[2]/text()')
            fencers = [Fencer(f, True) for f in fencers]
            event_status = Event.STATUS_FINISHED
        elif event_tree.xpath('//a[text()="Seeding"]'):
            fencers = event_tree.xpath(
                '//div[@id="Round1Seeding"]/table/tr/td[2]/text()')
            fencers = [Fencer(f, True) for f in fencers]
            event_status = Event.STATUS_STARTED
        elif event_tree.xpath('//a[text()="Check-In Status"]'):
            event_status = Event.STATUS_REGISTRATION
            # A fencer counts as checked in when the first table cell has
            # child elements.
            fencers_checked_in = [
                bool(len(cell)) for cell in event_tree.xpath(
                    '//div[@id="checkIn"]/table/tr/td[1]')
            ]
            fencers = event_tree.xpath(
                '//div[@id="checkIn"]/table/tr/td[2]/text()')
            fencers = [
                Fencer(f, ci) for (f, ci) in zip(fencers, fencers_checked_in)
            ]
        else:
            raise ScrapeError(
                "Could not determine the status of event at {}.".format(
                    response.url))

        return Event(
            event_name, event_time, event_status, response.url, fencers)
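
A minimal usage sketch, assuming ScrapeError, Tournament, Event and Fencer are importable alongside Scraper; the URL is a placeholder and the attribute names read off the returned Tournament/Event objects are assumptions, not shown in the code above:

# Hypothetical usage; the URL below is a placeholder.
scraper = Scraper("www.example.com/tournaments/1234")
tournament = scraper.scrape()

# Attribute names here are assumptions for illustration only.
print(tournament.name, tournament.url)
for event in tournament.events:
    print(event.name, event.status, len(event.fencers))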