Beispiel #1
 def __init__(self):
     """Start with a blank schedule URL and a fresh GetSchedule helper."""
     # NOTE(review): the URL is an empty string in this snippet; presumably
     # it has to point at the schedule page before crawling -- confirm.
     self.url = ""
     self.schedule = GetSchedule()
Beispiel #2
class WomensProSoccer(object):
    """Download schedule from official WPS website and process information."""

    def __init__(self):
        """Create the page loader and set the (blank) schedule URL."""
        self.schedule = GetSchedule()
        # NOTE(review): the URL is an empty string here; presumably it must be
        # set to the schedule page before crawl() is useful -- confirm.
        self.url = ""

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Rows that yield a single centered <div> are treated as date headers;
        every other row is read as a match that belongs to the most recent
        date header.

        Returns:
            A list of dicts, one per match row, with the keys 'date',
            'team1', 'team2', 'venue' and 'attendance'.
        """
        schedule = []
        soup = self.schedule.load_page(self.url)
        print('Locating scheduling section')
        tbody = soup.find("tbody").findAll('tr')
        del tbody[0]  # Remove table header
        date = None  # Date of the last header row seen so far
        for row in tbody:
            stat = row.findAll("div", {"align": "center"})
            stat = [self.cleaner(info, 1) for info in stat]  # Remove <div>
            # If the row consists of one column it contains the date. This is
            # quicker than searching for <th> with BeautifulSoup
            if len(stat) == 1:
                # NOTE(review): this call was garbled in the source
                # ("date =[0])"); reconstructed as self.date(stat[0]) --
                # confirm against the original file.
                date = self.date(stat[0])
                continue
            match = {}  # Dictionary to hold match information
            match['date'] = date  # Assign the date of the last header row
            match['team1'] = stat[0][0]
            match['team2'] = stat[1][0]
            match['venue'] = stat[2][0]
            # Fourth element is a link to match report
            # NOTE(review): stat[4] is skipped and score() is never called
            # from here; presumably that cell holds the score -- confirm.
            print(stat[5])
            match['attendance'] = self.attendance(stat[5])
            print(match)
            # Add match dictionary to schedule list
            schedule.append(match)
        return schedule

    def cleaner(self, html, repeat):
        """Remove tags for a certain count, returning a list.

        The markup is re-parsed and its tags are visited in the order
        findAll(True) returns them.

        Args:
            html: Markup (or an object whose unicode() form is markup).
            repeat: How many tags to step through.

        Returns:
            The ``contents`` list of the ``repeat``-th tag, the contents of
            the last tag when there are fewer tags than ``repeat``, or an
            empty list when no tag was visited.
        """
        html = unicode(html)
        soup = BeautifulSoup(html)
        content = []
        for count, element in enumerate(soup.findAll(True)):
            if count == repeat:
                break
            content = element.contents
        return content

    def date(self, html):
        """Find the date in a table row. If playoff match create element.

        Args:
            html: Markup of the date cell; one level of tags is stripped
                with cleaner().

        Returns:
            The only element of the cleaned cell, or for multi-element
            (playoff) cells the last NavigableString found in it.
        """
        row = self.cleaner(html, 1)  # Remove <strong>
        if len(row) == 1:
            return row[0]
        # Process ugly formatting of playoff games
        row = [element for element in row
               if isinstance(element, NavigableString)]
        return row[-1]  # TODO(pamolloy): Store game title

    def score(self, section):
        """Process the score into number of (penalty) goals for each team.

        Args:
            section: Sequence describing the score cell. ``section[0]`` is
                'Postponed', a newline (the digits are then read from
                ``section[1]``), or a score string with the goals at
                positions 0 and 4.

        Returns:
            A dict with 'goals1' and 'goals2', plus 'pens1' and 'pens2' for
            the newline case; an empty dict for a postponed match.
        """
        match = {}
        if section[0] == 'Postponed':
            # NOTE(review): the body of this branch is missing from the
            # source; no score is recorded for postponed matches here.
            return match
        if section[0] == '\n':  # Penalty goals
            goals = re.findall(r'\d', unicode(section[1]))
            match['goals1'] = int(goals[0])
            match['goals2'] = int(goals[1])
            match['pens1'] = int(goals[2])
            match['pens2'] = int(goals[3])
        else:
            text = section[0]
            match['goals1'] = int(text[0])
            match['goals2'] = int(text[4])
        return match

    def attendance(self, html):
        """Find the attendance within list.

        Args:
            html: List holding the contents of the attendance cell.

        Returns:
            0 for an empty list (postponed game), the only element of a
            one-element list, otherwise the first NavigableString in the
            list.

        Raises:
            IndexError: If a multi-element list holds no NavigableString.
        """
        print(html)
        if len(html) == 0:  # Postponed game
            return 0
        if len(html) == 1:
            return html[0]
        # Two or more elements: pick the text node among the tags.
        print(html)
        strings = [element for element in html
                   if isinstance(element, NavigableString)]
        print(strings)
        return strings[0]
Beispiel #3
 def __init__(self):
     """Store the table attribute filter, a blank URL and a GetSchedule helper."""
     # Attribute dict naming the "genericTable" class.
     self.att = {"class": "genericTable"}
     # NOTE(review): the URL is an empty string in this snippet; presumably
     # it has to point at the schedule page before crawling -- confirm.
     self.url = ""
     self.schedule = GetSchedule()
Beispiel #4
class USSoccer(object):
    """Download schedule from official USSF website and process information."""

    def __init__(self):
        """Create the page loader, the (blank) URL and the table filter."""
        self.schedule = GetSchedule()
        # NOTE(review): the URL is an empty string here; presumably it must be
        # set to the schedule page before crawl() is useful -- confirm.
        self.url = ""
        # Attribute filter used by crawl() to find the schedule <table>.
        self.att = {"class": "genericTable"}

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Returns:
            A list of dicts, one per table row after the header, with the
            keys 'date', 'time', 'venue', 'team1', 'team2' and one
            'tv<N>' key per listed TV channel.
        """
        schedule = []
        soup = self.schedule.load_page(self.url)
        # list() keeps this working where dict.values() is not indexable.
        print('Locating scheduling section: {}'.format(
            list(self.att.values())[0]))
        table = soup.find("table", self.att).findAll('tr')
        del table[0]  # Remove table header
        for row in table:
            # Drop the newline nodes between cells, then remove <td>
            stats = [element.contents for element in row.contents
                     if element != '\n']
            match = {}  # Dictionary to hold match information
            match['date'] = self.cleaner(stats[0], 1)[0]
            match['time'] = self.cleaner(stats[2], 1)[0]
            stadium = self.cleaner(stats[3][0], 1)[0]
            city = stats[3][2]
            match['venue'] = '{}, {}'.format(stadium, city)
            teams = stats[1][0].split(' vs. ')
            match['team1'] = teams[0]
            match['team2'] = teams[1]
            match.update(self._tv_channels(stats[4][0]))
            # Fifth element is "Info Center"
            print(match)
            # Add match dictionary to schedule list
            schedule.append(match)
        return schedule

    @staticmethod
    def _tv_channels(text):
        """Split a ', '-separated channel string into 'tv0', 'tv1', ... keys.

        Returns an empty dict when the stripped text is empty or is the
        '&nbsp;' placeholder, i.e. when no channel is listed.
        """
        text = text.strip()
        if not text or text == '&nbsp;':
            return {}
        return {'tv{}'.format(count): station
                for count, station in enumerate(text.split(', '))}

    def cleaner(self, html, repeat):
        """Remove tags for a certain count, returning a list.

        The markup is re-parsed and its tags are visited in the order
        findAll(True) returns them.

        Args:
            html: Markup (or an object whose unicode() form is markup).
            repeat: How many tags to step through.

        Returns:
            The ``contents`` list of the ``repeat``-th tag, the contents of
            the last tag when there are fewer tags than ``repeat``, or an
            empty list when no tag was visited.
        """
        html = unicode(html)
        soup = BeautifulSoup(html)
        content = []
        for count, element in enumerate(soup.findAll(True)):
            if count == repeat:
                break
            content = element.contents
        return content
Beispiel #5
 def __init__(self):
     """Store the section class name, a blank URL and a GetSchedule helper."""
     # Class name "schedule-page".
     self.att = "schedule-page"
     # NOTE(review): the URL is an empty string in this snippet; presumably
     # it has to point at the schedule page before crawling -- confirm.
     self.url = ""
     self.schedule = GetSchedule()
Beispiel #6
class MLSSoccer(object):
    """Download schedule from official MLS website and process information."""

    def __init__(self):
        """Create the page loader, the (blank) URL and the section class."""
        self.schedule = GetSchedule()
        # NOTE(review): the URL is an empty string here; presumably it must be
        # set to the schedule page before crawl() is useful -- confirm.
        self.url = ""
        # Class of the <div> that crawl() searches for schedule tables.
        self.att = "schedule-page"

    def crawl(self):
        """Crawl the page with BeautifulSoup for applicable information.

        Returns:
            A list of dicts, one per table row, with the keys 'date',
            'venue', 'team1' and 'team2', plus whatever details() and
            score() return for that row.
        """
        schedule = []
        soup = self.schedule.load_page(self.url)
        print('Locating scheduling section: {}'.format(self.att))
        section = soup.find("div", {"class": self.att})
        for table in section.findAll("table"):
            table_body = table.find('tbody')
            # NOTE(review): this call was truncated in the source ("date =");
            # reconstructed as self.date(table) -- confirm against the
            # original file.
            date = self.date(table)
            for row in table_body.findAll('tr'):
                match = {}
                # NOT easily processed information
                match['date'] = date
                match.update(self.details(row))
                match.update(self.score(row))
                # Easily processed information
                match['venue'] = self.generic(row, "views-field venue")
                match['team1'] = self.generic(row, "views-field home-team")
                match['team2'] = self.generic(row, "views-field away-team")
                # Add match dictionary to schedule list
                schedule.append(match)
        return schedule

    def date(self, section):
        """Find the date of each match based on the last preceding <h3> tag.

        Args:
            section: Tag whose previous <h3> sibling holds the date.

        Returns:
            The first child of that <h3>.

        Raises:
            AttributeError: If there is no preceding <h3> sibling.
        """
        heading = section.findPreviousSibling("h3")
        return heading.contents[0]  # Remove tags

    def details(self, section):
        """Process the start time and channels from the details section.

        Args:
            section: Table row containing a "views-field start-time" cell.

        Returns:
            A dict with 'hour' and one 'tv<N>' key per <strong> element in
            the cell; an empty dict when the cell is empty or starts with
            u'Final'.
        """
        match = {}
        html = section.find("td", {"class": "views-field start-time"})
        details = html.contents
        # If the game has passed, ignore "Final"
        if not details or details[0] == u'Final':
            return match
        match['hour'] = details[0]
        for count, station in enumerate(html.findAll('strong')):
            # Read the channel name straight from the <strong> tag.
            match['tv{}'.format(count)] = station.contents[0]
        return match

    def score(self, section):
        """Process the score into number of (penalty) goals for each team.

        Args:
            section: Table row containing a "views-field score" cell.

        Returns:
            A dict with 'goals1' and 'goals2' (and 'pens1'/'pens2' when the
            score text contains parentheses); an empty dict when the cell
            is empty or holds no digit.
        """
        score = section.find("td", {"class": "views-field score"}).contents
        # Ignore score for upcoming games, which return empty list
        if not score:
            return {}
        return self._parse_score(score[0])  # Select first string from list

    @staticmethod
    def _parse_score(score):
        """Turn one score string into a dict of goals and penalty goals.

        Values are read from fixed character positions: 0 and 4 for a plain
        score, and 0, 3, 8 and 11 when the text contains parentheses.
        """
        match = {}
        # NOTE(review): the regex calls were garbled in the source
        # ("if'(\(|\))', score):"); reconstructed as re.search -- confirm.
        if re.search(r'(\(|\))', score):  # Store penalties
            match['goals1'] = int(score[0])
            match['goals2'] = int(score[8])
            match['pens1'] = int(score[3])
            match['pens2'] = int(score[11])
        elif re.search(r'[0-9]', score):
            match['goals1'] = int(score[0])
            match['goals2'] = int(score[4])
        return match

    def generic(self, section, attribute):
        """Return the first child of the <td> whose class is ``attribute``.

        Args:
            section: Table row to search.
            attribute: Value of the cell's class attribute.
        """
        info = section.find("td", {"class": attribute}).contents[0]
        return info