コード例 #1
0
 def parse_data(self, response):
     """Yield one item per row of the yearly and game-by-game stat tables.

     Args:
         response: Response whose body contains the HTML tables and whose
             ``meta`` dict provides the ``'stat'`` and ``'team'`` values.

     Yields:
         Dynamically created items: first one per row of the table returned
         by ``self.yearlystats``, then one per row of the table returned by
         ``self.gamestats``.
     """
     # Parse every HTML table in the response body into DataFrames.
     tables = pd.read_html(response.body)

     # Yearly table (built from the third parsed table).
     yearly = self.yearlystats(tables[2])

     # Strip separator characters from the stat label before it becomes part
     # of the generated item class names.
     stat = response.meta['stat']
     for char in (" ", "/", ".", "-"):
         stat = stat.replace(char, "")

     YearlyItem = create_item_class("yearly_" + stat, yearly.columns)

     # 'records' is the full orient name; the abbreviated 'record' spelling
     # is rejected by pandas 2.x.
     for record in yearly.to_dict(orient='records'):
         # A fresh item per row: re-yielding one shared instance would make
         # every yielded item alias the same object, so later rows would
         # overwrite rows the consumer still holds.
         item = YearlyItem()
         for field, value in record.items():
             item[field] = value
         yield item

     # Game-by-game table (built from the fifth parsed table and the team).
     gamebygame = self.gamestats(tables[4], response.meta['team'])

     GameItem = create_item_class('gamebygame_' + stat, gamebygame.columns)

     for record in gamebygame.to_dict(orient='records'):
         item = GameItem()
         for field, value in record.items():
             item[field] = value
         yield item
コード例 #2
0
    def parse(self, response):
        """Yield rushing-stat items, game-by-game link items and follow-up requests.

        Args:
            response: Response for a team page; ``response.meta['team']``
                must be set.

        Yields:
            * one ``rushing`` item per row of the cleaned table,
            * one ``links_gamebygames`` item per link matching
              ``self.gamebygameregex``,
            * one ``scrapy.Request`` (callback ``self.parse_stats``) per link
              matching ``self.xpath`` / ``self.regex``.

        Raises:
            IndexError: If no selected year option is found on the page.
        """
        team = response.meta['team']

        # Text of the currently selected option in the year drop-down.
        year = response.selector.xpath(
            "//body//div[@id='contentarea']//fieldset//div/"
            "/form[@id='change_sport_form']//select[@id='year_list']"
            "//option[@selected='selected']//text()").extract()[0]

        # Rushing stats table
        stat = 'rushing'
        table = self.table_cleaner(html=response.body,
                                   target_table=2,
                                   stat=stat,
                                   year=year,
                                   team=team,
                                   trim_n_rows=1)

        RushingItem = create_item_class(stat, table.columns)

        # 'records' is the full orient name; the abbreviated 'record' spelling
        # is rejected by pandas 2.x.
        for record in table.to_dict(orient='records'):
            # Fresh item per row so yielded items never alias each other.
            item = RushingItem()
            for field, value in record.items():
                item[field] = value
            yield item

        # Game-by-game links
        gamebygamelinks = LinkExtractor(
            allow=self.gamebygameregex).extract_links(response)

        GameLinkItem = create_item_class('links_gamebygames',
                                         ['team', 'year', 'link'])

        for link in gamebygamelinks:
            # Fresh item per link for the same aliasing reason as above.
            gameitem = GameLinkItem()
            gameitem['team'] = team
            gameitem['year'] = year
            gameitem['link'] = link.url
            yield gameitem

        # Links to the remaining stat pages, restricted by xpath and regex.
        le = LinkExtractor(restrict_xpaths=(self.xpath, ), allow=self.regex)

        # Schedule each stat page, passing along the context parse_stats reads.
        for link in le.extract_links(response):
            yield scrapy.Request(link.url,
                                 callback=self.parse_stats,
                                 meta={
                                     'year': year,
                                     'team': team,
                                     'stat': link.text
                                 })
コード例 #3
0
    def parse(self, response):
        """Yield a ``Links_Team`` item for every team link on the crawled page.

        Args:
            response: Response whose URL carries an ``academic_year`` query
                parameter.

        Yields:
            Items with ``Link`` (the link URL), ``Team`` (the link text) and
            ``Year`` (the ``academic_year`` query value).

        Raises:
            KeyError: If the URL has no ``academic_year`` query parameter.
        """
        # Extract the current year from the response URL's query string.
        query = urllib.parse.parse_qs(urllib.parse.urlparse(response.url).query)
        curr_year = query['academic_year'][0]

        # Get all the links on the page
        links = LinkExtractor().extract_links(response)

        # Pattern for team links. A raw string keeps the backslashes intact
        # without relying on Python passing unknown escapes through, which
        # newer interpreters warn about. Compiled once, outside the loop.
        team_url_pattern = re.compile(
            r"http\:\/\/stats\.ncaa\.org\/team\/\d+\/\d+")

        # The item class is the same for every link, so build it only once.
        DynamicItem = create_item_class('Links_Team', ['Link', 'Team', 'Year'])

        for link in links:
            if team_url_pattern.search(link.url) is None:
                continue

            item = DynamicItem()
            item['Link'] = link.url
            item['Team'] = link.text
            item['Year'] = curr_year
            yield item
コード例 #4
0
ファイル: Coach_spider.py プロジェクト: cjohns38/ncaafootball
    def parse(self, response):
        """Yield one ``Coaches`` item per row of the second table on the page.

        The last row of the table is dropped, columns are optionally
        truncated at the first ``'Unnamed: '`` column, and ``Name`` / ``Team``
        columns are added from ``response.meta``.

        Args:
            response: Response whose body contains the HTML tables and whose
                ``meta`` dict provides ``'name'`` and ``'team'``.

        Yields:
            One dynamically created ``Coaches`` item per remaining row.
        """
        tables = pd.read_html(response.body)
        coaches = tables[1]

        # "WL%" is renamed to "WL" before the columns become item fields.
        coaches.rename(columns={"WL%": "WL"}, inplace=True)

        # Flag the columns whose label contains 'Unnamed: '.
        unnamed_flags = ['Unnamed: ' in column for column in coaches.columns]

        # NOTE(review): columns are truncated only when MORE than one
        # 'Unnamed: ' column exists; a single such column is kept. Preserved
        # as-is -- confirm this threshold is intended.
        if sum(unnamed_flags) > 1:
            stop_index = unnamed_flags.index(True)
        else:
            stop_index = len(coaches.columns)

        # Drop the last row and keep only the columns before stop_index.
        coaches = coaches[:-1].iloc[:, :stop_index].copy()
        coaches['Name'] = response.meta['name']
        coaches['Team'] = response.meta['team']

        CoachItem = create_item_class('Coaches', coaches.columns)

        # 'records' is the full orient name; the abbreviated 'record' spelling
        # is rejected by pandas 2.x.
        for record in coaches.to_dict(orient='records'):
            # Fresh item per row so yielded items never alias each other.
            item = CoachItem()
            for field, value in record.items():
                item[field] = value
            yield item
コード例 #5
0
    def parse_data(self, response):
        """Yield a single item pairing the team's short and long names.

        The long name is the text of the link inside the content area's
        fieldset legend; the short name comes from ``response.meta['team']``.
        """
        long_name = response.xpath(
            '//*[@id="contentarea"]/fieldset/legend/a/text()'
        ).extract_first()

        names = {
            'shortName': response.meta['team'],
            'longName': long_name,
        }

        # Build the item class from the record's own keys and fill one item.
        TeamNameItem = create_item_class('gamebygame_teamNames', names.keys())
        team_item = TeamNameItem()
        for field, value in names.items():
            team_item[field] = value

        yield team_item
コード例 #6
0
    def parse(self, response):
        """Yield one ``History`` item per row of the first table on the page.

        Args:
            response: Response whose body contains the HTML tables and whose
                ``meta`` dict provides ``'team'``.

        Yields:
            One dynamically created ``History`` item per row, excluding the
            table's last row.
        """
        tables = pd.read_html(response.body)
        history = tables[0]
        history['Team'] = response.meta['team']

        # Rename the two columns whose labels contain '%' or a space before
        # the columns become item fields.
        history.rename(columns={"WL%": "WL", "Head Coaches": "HeadCoaches"},
                       inplace=True)

        HistoryItem = create_item_class('History', history.columns)

        # The last row is skipped. 'records' is the full orient name; the
        # abbreviated 'record' spelling is rejected by pandas 2.x.
        for record in history[:-1].to_dict(orient='records'):
            # Fresh item per row so yielded items never alias each other.
            item = HistoryItem()
            for field, value in record.items():
                item[field] = value
            yield item
コード例 #7
0
    def parse(self, response):
        """Yield one ``Roster`` item per row of the first table on the page.

        Args:
            response: Response whose body contains the HTML tables and whose
                ``meta`` dict provides ``'team'``.

        Yields:
            One dynamically created ``Roster`` item per row, with ``Year``
            and ``Team`` columns added.

        Raises:
            IndexError: If no selected year option is found on the page.
        """
        # Text of the currently selected option in the year drop-down.
        year = response.selector.xpath(
            "//body//div[@id='contentarea']//fieldset//div/"
            "/form[@id='change_sport_form']//select[@id='year_list']"
            "//option[@selected='selected']//text()").extract()

        tables = pd.read_html(response.body)
        roster = tables[0]

        # Drop the outermost level of the column index.
        roster.columns = roster.columns.droplevel(0)
        roster['Year'] = year[0]
        roster['Team'] = response.meta['team']

        RosterItem = create_item_class('Roster', roster.columns)

        # 'records' is the full orient name; the abbreviated 'record' spelling
        # is rejected by pandas 2.x.
        for record in roster.to_dict(orient='records'):
            # Fresh item per row so yielded items never alias each other.
            item = RosterItem()
            for field, value in record.items():
                item[field] = value
            yield item
コード例 #8
0
    def parse_stats(self, response):
        """Yield one item per row of the cleaned stat table for this page.

        Args:
            response: Response whose body contains the HTML tables and whose
                ``meta`` dict provides ``'stat'``, ``'year'`` and ``'team'``.

        Yields:
            Dynamically created items named after the stat (with spaces,
            slashes and periods removed), one per table row.
        """
        meta = response.meta
        print(meta['stat'], meta['year'], meta['team'])

        # Parse tables
        table = self.table_cleaner(html=response.body,
                                   target_table=2,
                                   stat=meta['stat'],
                                   year=meta['year'],
                                   team=meta['team'],
                                   trim_n_rows=3)

        # Strip separator characters from the stat label before it is used as
        # the item class name.
        stat = meta['stat'].replace(" ", "").replace("/", "").replace(".", "")
        StatItem = create_item_class(stat, table.columns)

        # 'records' is the full orient name; the abbreviated 'record' spelling
        # is rejected by pandas 2.x.
        for record in table.to_dict(orient='records'):
            # Fresh item per row so yielded items never alias each other.
            item = StatItem()
            for field, value in record.items():
                item[field] = value
            yield item
コード例 #9
0
 def parse(self, response):
     """Yield link items and table-row items scraped from a team page.

     Args:
         response: Response for a team page; ``response.meta['year']`` must
             be set.

     Yields:
         * one ``links_teaminfo`` item for every (link, pattern) pair where
           the link URL matches a pattern in ``self.patterns``,
         * one ``results`` item per row of the second parsed table,
         * one ``teamstats`` item per row of the third parsed table
           (last row excluded),
         * one ``individualleaders`` item per row of the fourth parsed table
           (last row excluded).

     Raises:
         IndexError: If the team name link is not found on the page.
     """
     year = response.meta['year']

     # Extract Team Name
     team = response.selector.xpath(
         "//body//div//fieldset//legend//a/text()").extract()[0]

     # Get all the links on the page
     links = LinkExtractor().extract_links(response)

     # Emit one item per link/pattern match.
     LinkItem = create_item_class(
         'links_teaminfo', ['team', 'link', 'txt', 'key', 'year'])
     for link in links:
         for key, pattern in self.patterns.items():
             if re.search(pattern, link.url) is not None:
                 # A fresh item per match: a link matching several patterns
                 # must not have its earlier item overwritten.
                 link_item = LinkItem()
                 link_item['team'] = team
                 link_item['link'] = link.url
                 link_item['txt'] = link.text
                 link_item['key'] = key
                 link_item['year'] = year
                 yield link_item

     # Find all the tables
     tables = pd.read_html(response.body)

     # Results table: relabel the columns using the values of row 1, then
     # drop rows 0 and 1.
     results = tables[1]
     results.rename(columns=results.iloc[1], inplace=True)
     results.drop([0, 1], inplace=True)
     results['Team'] = team

     print(results.columns)
     ResultItem = create_item_class('results', results.columns)
     # 'records' is the full orient name; the abbreviated 'record' spelling
     # is rejected by pandas 2.x.
     for record in results.to_dict(orient='records'):
         # Fresh item per row so yielded items never alias each other.
         result_item = ResultItem()
         for field, value in record.items():
             result_item[field] = value
         yield result_item

     # Team stats: same header handling, plus a Year column.
     team_stats = tables[2]
     team_stats.rename(columns=team_stats.iloc[1], inplace=True)
     team_stats.drop([0, 1], inplace=True)
     team_stats['Team'] = team
     team_stats['Year'] = year

     TeamStatItem = create_item_class('teamstats', team_stats.columns)
     # The last row of this table is skipped.
     for record in team_stats[:-1].to_dict(orient='records'):
         stat_item = TeamStatItem()
         for field, value in record.items():
             stat_item[field] = value
         yield stat_item

     # Individual stats: same header handling, plus a Year column.
     leaders = tables[3]
     leaders.rename(columns=leaders.iloc[1], inplace=True)
     leaders.drop([0, 1], inplace=True)
     leaders['Team'] = team
     leaders['Year'] = year

     LeaderItem = create_item_class('individualleaders', leaders.columns)
     # The last row of this table is skipped.
     for record in leaders[:-1].to_dict(orient='records'):
         leader_item = LeaderItem()
         for field, value in record.items():
             leader_item[field] = value
         yield leader_item