# Module-level imports assumed by the callbacks in this section:
#   import re
#   import urllib.parse
#   import pandas as pd
#   import scrapy
#   from scrapy.linkextractors import LinkExtractor

def parse_data(self, response):
    # Parse every HTML table on the page
    tables = pd.read_html(response.body)

    # Build the yearly stats table
    yearly = self.yearlystats(tables[2])

    # Sanitize the stat name so it can be used as an item class name
    stat = (response.meta['stat'].replace(" ", "").replace("/", "")
            .replace(".", "").replace("-", ""))

    # Create the dynamic item class and yield one item per row
    field_list = yearly.columns
    DynamicItem = create_item_class("yearly_" + stat, field_list)
    for record in yearly.to_dict(orient='records'):
        item = DynamicItem()  # fresh instance per row, so yielded items are not mutated later
        for k, v in record.items():
            item[k] = v
        yield item

    # Build the game-by-game table and repeat the same pattern
    gamebygame = self.gamestats(tables[4], response.meta['team'])
    field_list = gamebygame.columns
    DynamicItem = create_item_class('gamebygame_' + stat, field_list)
    for record in gamebygame.to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item
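# Every callback in this section funnels rows through create_item_class,
# which is not defined here. A minimal sketch of what the call sites imply,
# assuming the helper just builds a scrapy.Item subclass with one Field per
# column name (a hypothetical reconstruction, not the original code):
import scrapy

def create_item_class(class_name, field_list):
    """Return a dynamically declared scrapy.Item subclass."""
    fields = {str(field): scrapy.Field() for field in field_list}
    # type() defers to scrapy's ItemMeta metaclass via the scrapy.Item base,
    # so the returned class behaves like a normally declared Item.
    return type(class_name, (scrapy.Item,), fields)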
def parse(self, response):
    # Extract the selected year from the sport/year dropdown
    year = response.selector.xpath(
        "//body//div[@id='contentarea']//fieldset//div"
        "//form[@id='change_sport_form']//select[@id='year_list']"
        "//option[@selected='selected']//text()").extract()[0]

    # Clean the rushing stats table
    table = self.table_cleaner(html=response.body,
                               target_table=2,
                               stat='rushing',
                               year=year,
                               team=response.meta['team'],
                               trim_n_rows=1)

    # Create the dynamic item class and yield one item per row
    stat = 'rushing'
    field_list = table.columns
    DynamicItem = create_item_class(stat, field_list)
    for record in table.to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item

    # Write out the game-by-game links
    gamebygame = LinkExtractor(allow=self.gamebygameregex)
    gamebygamelinks = gamebygame.extract_links(response)
    gamebygameItem = create_item_class('links_gamebygames',
                                       ['team', 'year', 'link'])
    for link in gamebygamelinks:
        gameitem = gamebygameItem()  # fresh instance per link
        gameitem['team'] = response.meta['team']
        gameitem['year'] = year
        gameitem['link'] = link.url
        yield gameitem

    # Follow every per-stat link on the page
    le = LinkExtractor(restrict_xpaths=(self.xpath,), allow=self.regex)
    links = le.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url,
                             callback=self.parse_stats,
                             meta={'year': year,
                                   'team': response.meta['team'],
                                   'stat': link.text})
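# self.table_cleaner is referenced here and in parse_stats below but not
# shown. Judging only from its keyword arguments, it plausibly selects one
# table from the page, trims footer rows, and tags the result; a sketch
# under those assumptions (the Stat/Year/Team column names are guesses):
import pandas as pd

def table_cleaner(self, html, target_table, stat, year, team, trim_n_rows):
    tables = pd.read_html(html)
    table = tables[target_table]
    if trim_n_rows:
        table = table[:-trim_n_rows]  # drop footer/totals rows
    table = table.copy()
    table['Stat'] = stat
    table['Year'] = year
    table['Team'] = team
    return table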
def parse(self, response): """Parse the crawled pages""" # Extract current year from the returned object curr_year = urllib.parse.parse_qs(urllib.parse.urlparse(response.url).query)['academic_year'][0] # Get all the links on the page le = LinkExtractor() links = le.extract_links(response) # Regex pattern to find just the team links team_url_pattern = "http\:\/\/stats\.ncaa\.org\/team\/\d+\/\d+" # Extract the links, put into item object, yield item object to pipeline which saves results for link in links: match = re.search(team_url_pattern, link.url) if match != None: # Create dynamic items field_list = ['Link', 'Team', 'Year'] DynamicItem = create_item_class('Links_Team', field_list) item = DynamicItem() # yield items for k, v in {'Link':link.url, 'Team':link.text, 'Year':curr_year }.items(): item[k] = v yield item
def parse(self, response):
    # Parse every HTML table on the page
    tables = pd.read_html(response.body)

    # Normalize the header, then trim the trailing placeholder columns
    # ('Unnamed: N') that read_html emits for blank header cells
    tables[1].rename(columns={"WL%": "WL"}, inplace=True)
    stop_index = [1 if 'Unnamed: ' in column else 0
                  for column in tables[1].columns]
    if sum(stop_index) > 1:
        stop_index = stop_index.index(1)
    else:
        stop_index = len(tables[1].columns)
    tables[1] = tables[1][:-1].iloc[:, :stop_index].copy()  # also drop the footer row
    tables[1]['Name'] = response.meta['name']
    tables[1]['Team'] = response.meta['team']

    # Create the dynamic item class and yield one item per row
    field_list = tables[1].columns
    DynamicItem = create_item_class('Coaches', field_list)
    for record in tables[1].to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item
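# A self-contained illustration of the 'Unnamed:' trimming above, on
# hypothetical data. Note that it mirrors the original's sum(...) > 1 test,
# so a single placeholder column would be left in place:
import pandas as pd

df = pd.DataFrame([["2016-17", 0.75, None, None]],
                  columns=["Year", "WL", "Unnamed: 2", "Unnamed: 3"])
flags = [1 if "Unnamed: " in col else 0 for col in df.columns]
stop = flags.index(1) if sum(flags) > 1 else len(df.columns)
print(df.iloc[:, :stop].columns.tolist())  # ['Year', 'WL']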
def parse_data(self, response):
    # Map the short team name from the request meta to the full name on the page
    teamName = response.xpath(
        '//*[@id="contentarea"]/fieldset/legend/a/text()').extract_first()
    record = {'shortName': response.meta['team'], 'longName': teamName}

    # Create the dynamic item class and yield the single name-mapping item
    field_list = record.keys()
    DynamicItem = create_item_class('gamebygame_teamNames', field_list)
    item = DynamicItem()
    for k, v in record.items():
        item[k] = v
    yield item
def parse(self, response):
    # Parse every HTML table on the page and normalize the headers
    tables = pd.read_html(response.body)
    tables[0]['Team'] = response.meta['team']
    tables[0].rename(columns={"WL%": "WL", "Head Coaches": "HeadCoaches"},
                     inplace=True)

    # Create the dynamic item class and yield one item per row,
    # skipping the footer row
    field_list = tables[0].columns
    DynamicItem = create_item_class('History', field_list)
    for record in tables[0][:-1].to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item
def parse(self, response):
    # Extract the selected year from the sport/year dropdown
    year = response.selector.xpath(
        "//body//div[@id='contentarea']//fieldset//div"
        "//form[@id='change_sport_form']//select[@id='year_list']"
        "//option[@selected='selected']//text()").extract()

    # Parse the roster table and flatten its two-level header
    tables = pd.read_html(response.body)
    tables[0].columns = tables[0].columns.droplevel(0)
    tables[0]['Year'] = year[0]
    tables[0]['Team'] = response.meta['team']

    # Create the dynamic item class and yield one item per row
    field_list = tables[0].columns
    DynamicItem = create_item_class('Roster', field_list)
    for record in tables[0].to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item
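# read_html returns the roster with a two-level header, and droplevel(0)
# keeps only the inner level. A small demonstration with hypothetical
# column names:
import pandas as pd

cols = pd.MultiIndex.from_tuples([("Roster", "Jersey"), ("Roster", "Player")])
df = pd.DataFrame([[10, "Smith, John"]], columns=cols)
df.columns = df.columns.droplevel(0)
print(df.columns.tolist())  # ['Jersey', 'Player']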
def parse_stats(self, response):
    # Log which stat/year/team is being parsed
    self.logger.info("%s %s %s", response.meta['stat'],
                     response.meta['year'], response.meta['team'])

    # Clean the target stats table
    table = self.table_cleaner(html=response.body,
                               target_table=2,
                               stat=response.meta['stat'],
                               year=response.meta['year'],
                               team=response.meta['team'],
                               trim_n_rows=3)

    # Sanitize the stat name so it can be used as an item class name
    stat = (response.meta['stat'].replace(" ", "")
            .replace("/", "").replace(".", ""))

    # Create the dynamic item class and yield one item per row
    field_list = table.columns
    DynamicItem = create_item_class(stat, field_list)
    for record in table.to_dict(orient='records'):
        item = DynamicItem()
        for k, v in record.items():
            item[k] = v
        yield item
def parse(self, response):
    # Extract the team name
    team = response.selector.xpath(
        "//body//div//fieldset//legend//a/text()").extract()[0]

    # Get all the links on the page
    le = LinkExtractor()
    links = le.extract_links(response)

    # Match each link against the known URL patterns and yield a link item
    # for every match
    field_list0 = ['team', 'link', 'txt', 'key', 'year']
    dyn_item0 = create_item_class('links_teaminfo', field_list0)
    for link in links:
        for key, pattern in self.patterns.items():
            if re.search(pattern, link.url) is not None:
                item0 = dyn_item0()  # fresh instance per match
                record = {'team': team,
                          'link': link.url,
                          'txt': link.text,
                          'key': key,
                          'year': response.meta['year']}
                for k, v in record.items():
                    item0[k] = v
                yield item0

    # Parse every HTML table on the page
    tables = pd.read_html(response.body)

    # Results table: promote the second row to the header, drop the header rows
    tables[1].rename(columns=tables[1].iloc[1], inplace=True)
    tables[1].drop([0, 1], inplace=True)
    tables[1]['Team'] = team

    # Convert the table to records and yield one item per row
    field_list1 = tables[1].columns
    dyn_item1 = create_item_class('results', field_list1)
    for record in tables[1].to_dict(orient='records'):
        item1 = dyn_item1()
        for k, v in record.items():
            item1[k] = v
        yield item1

    # Team stats table: same header fix, plus Team and Year columns
    tables[2].rename(columns=tables[2].iloc[1], inplace=True)
    tables[2].drop([0, 1], inplace=True)
    tables[2]['Team'] = team
    tables[2]['Year'] = response.meta['year']
    field_list2 = tables[2].columns
    dyn_item2 = create_item_class('teamstats', field_list2)
    for record in tables[2][:-1].to_dict(orient='records'):
        item2 = dyn_item2()
        for k, v in record.items():
            item2[k] = v
        yield item2

    # Individual leaders table: same pattern again
    tables[3].rename(columns=tables[3].iloc[1], inplace=True)
    tables[3].drop([0, 1], inplace=True)
    tables[3]['Team'] = team
    tables[3]['Year'] = response.meta['year']
    field_list3 = tables[3].columns
    dyn_item3 = create_item_class('individualleaders', field_list3)
    for record in tables[3][:-1].to_dict(orient='records'):
        item3 = dyn_item3()
        for k, v in record.items():
            item3[k] = v
        yield item3
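# Every callback above repeats the same DataFrame-to-item loop. One possible
# consolidation (a hypothetical helper, not part of the original spiders):
def yield_table_items(df, item_name):
    DynamicItem = create_item_class(item_name, df.columns)
    for record in df.to_dict(orient='records'):
        item = DynamicItem()  # one fresh item per row
        for k, v in record.items():
            item[k] = v
        yield item

# Inside a callback, the three table loops above would then collapse to, e.g.:
#   yield from yield_table_items(tables[1], 'results')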