def parse(self, response):
    """Parse one game page: scrape the box-score table, save it as a raw
    CSV under box/, then convert the rows into two Team_Game_Statistics
    records (visitor, home) and append them to
    "2014 Stats/team-game-statistics.csv".

    NOTE(review): indentation reconstructed from a flattened source --
    confirm nesting (especially the per-row cleanup loop) against the
    original file.
    """
    # Get this game code from file (stashed earlier for this URL; the
    # filename is the URL stripped to its alphanumeric characters).
    with open(os.getcwd() + "/tmpfiles/" + ''.join(e for e in response.url if e.isalnum()) + ".txt") as f:
        data = f.read()
    m = re.search(r"Code: (?P<code>\d+)", data)
    code = str(m.group('code')).zfill(16)
    # Scrape box score and save raw file
    table = response.xpath('//table[contains(@class, "mod-data")]')
    rows = []
    # Game-code layout: visitor team in the top digits, home team in the
    # next three (after / 1e8, % 1000), date in the low eight.  Integer
    # division here is Python-2 floor division on (long) ints, so it is
    # exact -- do not convert to float division.
    visitor = int(code) / 1000000000000
    home = (int(code) / 100000000) % 1000
    date = int(code) % 100000000
    for row in table.xpath('.//tr'):
        # Collect data cells (td) and header cells (th) separately; a
        # 3-wide header row gets an empty second column inserted so all
        # rows line up 4 wide.
        new_rows1 = [x.xpath('.//text()').extract() for x in row.xpath('.//td')]
        if len(new_rows1) > 0:
            rows.append(new_rows1)
        new_rows2 = [x.xpath('.//text()').extract() for x in row.xpath('.//th')]
        if len(new_rows2) > 0:
            if len(new_rows2) == 3:
                new_rows2 = [new_rows2[0], "", new_rows2[1], new_rows2[2]]
            rows.append(new_rows2)
        # Flatten each cell of the row just appended: str() of the
        # extracted list yields "[u'...']" repr noise which the regex
        # strips (including u'\xa0' non-breaking spaces).
        # NOTE(review): if the first tr has neither td nor th cells,
        # rows is still empty here and rows[len(rows)-1] raises -- verify
        # the table never starts with such a row.
        for i in range(0, len(rows[len(rows)-1])):
            rows[len(rows)-1][i] = ''.join([re.sub(r"\[u'\\xa0'\]|', |\[u'|u'|'\]|\[|\]", '', str(rows[len(rows)-1][i]))])
    # Save the raw box score under box/<visitor><home><date>.csv
    Write_CSV(rows, "box/" + str(visitor).zfill(4) + str(home).zfill(4) + str(date) + ".csv")
    # Convert to team-game-statistics format
    visitor_TGS = Team_Game_Statistics(code, visitor)
    home_TGS = Team_Game_Statistics(code, home)
    team_names = Read_CSV("2014 Stats/team.csv")
    team_names = team_names[1:]  # drop the header row
    team_abbvs = Read_CSV("2014 Stats/abbrevations.csv")
    # Get score: at each quarter header, skip forward over the
    # scoring-play rows (width >= 5) to the last one; its columns 4/5
    # hold the running visitor/home totals, so the FOURTH QUARTER hit
    # leaves the final score in place.
    # NOTE(review): rebinding i inside the body does not advance the
    # range() iteration (later iterations rescan the same rows), and
    # rows[i+1] can IndexError if a quarter section ends the table; the
    # assignments also read rows[i][5], which needs width >= 6 although
    # the guard only checks >= 5 -- confirm against real pages.
    for i in range(0, len(rows)):
        first_qtr = re.search(r"FIRST QUARTER", rows[i][0])
        if first_qtr:
            while len(rows[i+1]) >= 5:
                i += 1
            visitor_TGS.Points = rows[i][4]
            home_TGS.Points = rows[i][5]
        second_qtr = re.search(r"SECOND QUARTER", rows[i][0])
        if second_qtr:
            while len(rows[i+1]) >= 5:
                i += 1
            visitor_TGS.Points = rows[i][4]
            home_TGS.Points = rows[i][5]
        third_qtr = re.search(r"THIRD QUARTER", rows[i][0])
        if third_qtr:
            while len(rows[i+1]) >= 5:
                i += 1
            visitor_TGS.Points = rows[i][4]
            home_TGS.Points = rows[i][5]
        fourth_qtr = re.search(r"FOURTH QUARTER", rows[i][0])
        if fourth_qtr:
            while len(rows[i+1]) >= 5:
                i += 1
            visitor_TGS.Points = rows[i][4]
            home_TGS.Points = rows[i][5]
    # Box score stats: scan every row for known stat labels in column 0;
    # columns 1/2 are the visitor/home values.
    for i in range(0, len(rows)):
        # Total 1st downs
        first_downs = re.search(r"1st Downs", rows[i][0])
        if first_downs:
            visitor_TGS.First_Down_Total = rows[i][1]
            home_TGS.First_Down_Total = rows[i][2]
        # 3rd down conversions, cell format "conversions-attempts"
        third_downs = re.search(r"3rd down efficiency", rows[i][0])
        if third_downs:
            eff = re.match(r"(?P<conv>\d+)\-(?P<att>\d+)", rows[i][1])
            visitor_TGS.Third_Down_Att = eff.group("att")
            visitor_TGS.Third_Down_Conv = eff.group("conv")
            eff = re.match(r"(?P<conv>\d+)\-(?P<att>\d+)", rows[i][2])
            home_TGS.Third_Down_Att = eff.group("att")
            home_TGS.Third_Down_Conv = eff.group("conv")
        # 4th down conversions, same "conversions-attempts" format
        fourth_downs = re.search(r"4th down efficiency", rows[i][0])
        if fourth_downs:
            eff = re.match(r"(?P<conv>\d+)\-(?P<att>\d+)", rows[i][1])
            visitor_TGS.Fourth_Down_Att = eff.group("att")
            visitor_TGS.Fourth_Down_Conv = eff.group("conv")
            eff = re.match(r"(?P<conv>\d+)\-(?P<att>\d+)", rows[i][2])
            home_TGS.Fourth_Down_Att = eff.group("att")
            home_TGS.Fourth_Down_Conv = eff.group("conv")
        # Penalties, cell format "count-yards"
        penalties = re.search(r"Penalties", rows[i][0])
        if penalties:
            num_yrds = re.search(r"(?P<num>\d+)\-(?P<yrds>\d+)", rows[i][1])
            visitor_TGS.Penalty = num_yrds.group("num")
            visitor_TGS.Penalty_Yard = num_yrds.group("yrds")
            num_yrds = re.search(r"(?P<num>\d+)\-(?P<yrds>\d+)", rows[i][2])
            home_TGS.Penalty = num_yrds.group("num")
            home_TGS.Penalty_Yard = num_yrds.group("yrds")
        # Possession (stored as the raw cell text)
        possession = re.search(r"Possession", rows[i][0])
        if possession:
            visitor_TGS.Time_Of_Possession = rows[i][1]
            home_TGS.Time_Of_Possession = rows[i][2]
        # Fumbles Lost: one team's fumble returns are the other's losses
        fum_lost = re.search(r"Fumbles lost", rows[i][0])
        if fum_lost:
            visitor_TGS.Fum_Lost = rows[i][1]
            home_TGS.Fum_Lost = rows[i][2]
            visitor_TGS.Fum_Ret = home_TGS.Fum_Lost
            home_TGS.Fum_Ret = visitor_TGS.Fum_Lost
    # Find stats (player tables) via the shared Parse_Box helper
    visitor_TGS = Parse_Box(rows, visitor_TGS, team_abbvs)
    # START DEBUG --
    #if int(visitor_TGS.Rush_Att) + int(visitor_TGS.Pass_Att) == 0:
        #pdb.set_trace()
        #visitor_TGS = Parse_Box(rows, visitor_TGS, team_abbvs)
    # END DEBUG --
    home_TGS = Parse_Box(rows, home_TGS, team_abbvs)
    # START DEBUG --
    #if int(home_TGS.Rush_Att) + int(home_TGS.Pass_Att) == 0:
        #pdb.set_trace()
        #home_TGS = Parse_Box(rows, visitor_TGS, team_abbvs)
    # END DEBUG --
    # Append two rows (visitor, home); the header row is written only
    # when the file is first created.
    if os.path.isfile("2014 Stats/team-game-statistics.csv"):
        f = open("2014 Stats/team-game-statistics.csv","a")
        data_writer = csv.writer(f, lineterminator = '\n')
        new_rows = []
        new_rows.append(visitor_TGS.Compile())
        new_rows.append(home_TGS.Compile())
        data_writer.writerows(new_rows)
        f.close()
    else:
        new_rows = []
        new_rows.append(visitor_TGS.Header())
        new_rows.append(visitor_TGS.Compile())
        new_rows.append(home_TGS.Compile())
        Write_CSV(new_rows, "2014 Stats/team-game-statistics.csv")
def parse(self, response):
    """Scrape the full ESPN box score (passing, rushing, kick/punt
    returns, interceptions, kicking, punting, points) for one game and
    append two rows (away team, home team) to
    '<year> Stats/boxscore-stats.csv'.
    """
    # Get this game code from file (stashed earlier; the filename is the
    # URL stripped to its alphanumeric characters).
    with open(os.getcwd() + "/tmpfiles/" + ''.join(e for e in response.url if e.isalnum()) + ".txt") as f:
        data = f.read()
    m = re.search(r"Code: (?P<code>\d+)", data)
    code = str(m.group('code')).zfill(16)
    # Decode away/home team codes and the date with exact integer
    # arithmetic.  The previous int(long(code) / 1e12) float path can
    # corrupt the low digits: a 16-digit code exceeds a double's 53-bit
    # exact integer range.
    away = int(code) // 10**12
    home = (int(code) // 10**8) % 10**3
    date = int(code) % 10**8  # retained for parity; not used below
    away_TGS = Team_Game_Statistics(code, away)
    home_TGS = Team_Game_Statistics(code, home)
    # MOVE SOME OF THESE TO MATCHUP SCRAPER
    # Every stat section has the same shape -- a gamepackage div holding
    # an away-wrap and a home-wrap sub-div, fed to a section-specific
    # extractor -- so drive them from a table instead of seven copies.
    sections = [
        ('gamepackage-passing', Extract_Passing),
        ('gamepackage-rushing', Extract_Rushing),
        ('gamepackage-kickReturns', Extract_KickReturns),
        ('gamepackage-puntReturns', Extract_PuntReturns),
        ('gamepackage-interceptions', Extract_Interceptions),
        ('gamepackage-kicking', Extract_Kicking),
        ('gamepackage-punting', Extract_Punting),
    ]
    for div_id, extract in sections:
        section_div = response.xpath('//div[@id="%s"]' % div_id)
        away_TGS = extract(away_TGS, section_div.xpath('.//div[contains(@class,"gamepackage-away-wrap")]'))
        home_TGS = extract(home_TGS, section_div.xpath('.//div[contains(@class,"gamepackage-home-wrap")]'))
    # Get points from the scoreboard header
    points_div = response.xpath('//div[@class="competitors"]')
    away_points = points_div.xpath('.//div[contains(@class,"away")]')
    away_TGS = Extract_Points(away_TGS, away_points)
    home_points = points_div.xpath('.//div[contains(@class,"home")]')
    home_TGS = Extract_Points(home_TGS, home_points)
    # Write stats to file; the header row is written only when the file
    # is first created.  'with' closes the handle even if writerows raises.
    out_path = str(year) + " Stats/boxscore-stats.csv"
    if os.path.isfile(out_path):
        with open(out_path, "a") as f:
            data_writer = csv.writer(f, lineterminator='\n')
            data_writer.writerows([away_TGS.Compile(), home_TGS.Compile()])
    else:
        new_rows = [away_TGS.Header(), away_TGS.Compile(), home_TGS.Compile()]
        Write_CSV(new_rows, out_path)
play_data.append(play) Write_CSV(play_data, str(year) + " Stats temp/unparsed_plays.csv") # Build team-game-statistics prev_game_code = 0 allTGS = [] for play in allPlays: # found a new game if float(play.Game_Code) != prev_game_code: # save old data if prev_game_code != 0: allTGS.append(home_tgs) allTGS.append(visitor_tgs) visitor_code = int(math.floor(float(play.Game_Code) / 1e12)) home_code = int(math.floor(float(play.Game_Code) / 1e8)) % 1e4 home_tgs = Team_Game_Statistics(play.Game_Code, home_code) visitor_tgs = Team_Game_Statistics(play.Game_Code, visitor_code) prev_game_code = float(play.Game_Code) # increment data if play.Offense == home_tgs.Team_Code: home_tgs.Extract_Play_Offense(play) elif play.Offense == visitor_tgs.Team_Code: visitor_tgs.Extract_Play_Offense(play) # Write team-game-statistics to file tgs_data = [] tgs_data.append(allTGS[0].Header()) for tgs in allTGS: tgs_data.append(tgs.Compile_Stats()) Write_CSV(tgs_data, str(year) + " Stats temp/play_TGS.csv")
def parse(self, response):
    """Scrape team-level matchup statistics (first downs, turnovers,
    3rd/4th-down efficiency, time of possession, penalties) for one game
    page and append two rows (away team, home team) to
    '<year> Stats/matchup-stats.csv'.
    """
    # Recover the 16-digit game code previously stashed for this URL.
    with open(os.getcwd() + "/tmpfiles/" + ''.join(e for e in response.url if e.isalnum()) + ".txt") as f:
        data = f.read()
    m = re.search(r"Code: (?P<code>\d+)", data)
    code = str(m.group('code')).zfill(16)
    # Decode the code with exact integer arithmetic.  The previous
    # int(long(code) / 1e12) float path can lose low-order digits because
    # a 16-digit code exceeds a double's 53-bit exact integer range.
    away = int(code) // 10**12
    home = (int(code) // 10**8) % 10**3
    date = int(code) % 10**8  # retained for parity; not used below
    away_TGS = Team_Game_Statistics(code, away)
    home_TGS = Team_Game_Statistics(code, home)

    def clean(text):
        # Strip stray backslashes, tabs and newlines (and literal '|')
        # from a cell.  NOTE(review): pattern kept byte-identical to the
        # original so existing output is unchanged.
        return re.sub(r'[\\\t|\\\n]', '', text)

    def stat_cells(attr):
        # Return the (away, home) text cells of the stat row whose
        # data-stat-attr equals `attr`.  Raises IndexError when the row
        # or its cells are absent (same as the original inline code).
        row = response.xpath('//tr[@data-stat-attr="%s"]' % attr)
        cells = row.xpath('.//td/text()').extract()
        return clean(cells[1]), clean(cells[2])

    def dash_pair(cell):
        # Split an "x-y" cell (e.g. "5-13") into its two numeric strings.
        m2 = re.search(r'(?P<f>\d+)\-(?P<a>\d+)', cell)
        return m2.group('f'), m2.group('a')

    # First downs
    away_TGS.First_Down_Total, home_TGS.First_Down_Total = stat_cells("firstDowns")
    # Turnovers: each side's fumble recoveries are the other's losses
    away_TGS.Fum_Lost, home_TGS.Fum_Lost = stat_cells("fumblesLost")
    away_TGS.Fum_Ret = home_TGS.Fum_Lost
    home_TGS.Fum_Ret = away_TGS.Fum_Lost
    # 3rd-down efficiency, formatted "conversions-attempts"
    fatt_away, fatt_home = stat_cells("thirdDownEff")
    away_TGS.Third_Down_Conv, away_TGS.Third_Down_Att = dash_pair(fatt_away)
    home_TGS.Third_Down_Conv, home_TGS.Third_Down_Att = dash_pair(fatt_home)
    # 4th-down efficiency
    fatt_away, fatt_home = stat_cells("fourthDownEff")
    away_TGS.Fourth_Down_Conv, away_TGS.Fourth_Down_Att = dash_pair(fatt_away)
    home_TGS.Fourth_Down_Conv, home_TGS.Fourth_Down_Att = dash_pair(fatt_home)
    # Time of possession; a missing cell defaults to an even "30:00".
    # The original used bare except: -- narrowed to the exceptions that
    # actually occur so real bugs are no longer swallowed.
    top_cells = response.xpath('//tr[@data-stat-attr="possessionTime"]').xpath('.//td/text()').extract()
    try:
        top_away = clean(top_cells[1])
    except IndexError:
        top_away = "30:00"
    try:
        top_home = clean(top_cells[2])
    except IndexError:
        top_home = "30:00"
    m_away = re.search(r'(?P<h>\d+)\:(?P<m>\d+)', top_away)
    m_home = re.search(r'(?P<h>\d+)\:(?P<m>\d+)', top_home)
    try:
        # Store possession as total minutes-of-clock units (mm*60 + ss).
        away_TGS.Time_Of_Possession = str(60 * int(m_away.group('h')) + int(m_away.group('m')))
        home_TGS.Time_Of_Possession = str(60 * int(m_home.group('h')) + int(m_home.group('m')))
    except AttributeError:
        # search() returned None for an unparseable clock: even split.
        away_TGS.Time_Of_Possession = 1800
        home_TGS.Time_Of_Possession = 1800
    # If exactly one side kept a real figure, rebalance so both sum to 3600.
    if int(away_TGS.Time_Of_Possession) == 1800 and int(home_TGS.Time_Of_Possession) != 1800:
        away_TGS.Time_Of_Possession = str(3600 - int(home_TGS.Time_Of_Possession))
    elif int(home_TGS.Time_Of_Possession) == 1800 and int(away_TGS.Time_Of_Possession) != 1800:
        home_TGS.Time_Of_Possession = str(3600 - int(away_TGS.Time_Of_Possession))
    # Penalties, formatted "count-yards"
    pen_away, pen_home = stat_cells("totalPenaltiesYards")
    away_TGS.Penalty, away_TGS.Penalty_Yard = dash_pair(pen_away)
    home_TGS.Penalty, home_TGS.Penalty_Yard = dash_pair(pen_home)
    # Write stats to file; the header row is written only when the file
    # is first created.  'with' closes the handle even on error.
    out_path = str(year) + " Stats/matchup-stats.csv"
    if os.path.isfile(out_path):
        with open(out_path, "a") as f:
            data_writer = csv.writer(f, lineterminator='\n')
            data_writer.writerows([away_TGS.Compile(), home_TGS.Compile()])
    else:
        new_rows = [away_TGS.Header(), away_TGS.Compile(), home_TGS.Compile()]
        Write_CSV(new_rows, out_path)
def parse(self, response):
    """Scrape per-game matchup statistics from an ESPN box-score page and
    append two rows (away team, home team) to
    '<year> Stats/matchup-stats.csv'.
    """
    # Get this game code from file (stashed earlier, keyed by the URL
    # stripped to its alphanumeric characters).
    with open(os.getcwd() + "/tmpfiles/" + ''.join(e for e in response.url if e.isalnum()) + ".txt") as f:
        data = f.read()
    m = re.search(r"Code: (?P<code>\d+)", data)
    code = str(m.group('code')).zfill(16)
    # Decode away/home/date fields with exact integer arithmetic; the old
    # int(long(code) / 1e12) float division can corrupt the low digits of
    # a 16-digit code (beyond a double's 53-bit exact range).
    away = int(code) // 10**12
    home = (int(code) // 10**8) % 10**3
    date = int(code) % 10**8  # retained for parity; not used below
    away_TGS = Team_Game_Statistics(code, away)
    home_TGS = Team_Game_Statistics(code, home)
    # Scrape first downs (cells [1]/[2] are away/home values)
    first_div = response.xpath('//tr[@data-stat-attr="firstDowns"]')
    away_TGS.First_Down_Total = re.sub(r'[\\\t|\\\n]','',first_div.xpath('.//td/text()').extract()[1])
    home_TGS.First_Down_Total = re.sub(r'[\\\t|\\\n]','',home_div_cells(first_div)[2]) if False else re.sub(r'[\\\t|\\\n]','',first_div.xpath('.//td/text()').extract()[2])
    # Scrape turnovers: each side's fumble returns = the other's losses
    fumble_div = response.xpath('//tr[@data-stat-attr="fumblesLost"]')
    away_TGS.Fum_Lost = re.sub(r'[\\\t|\\\n]','',fumble_div.xpath('.//td/text()').extract()[1])
    home_TGS.Fum_Lost = re.sub(r'[\\\t|\\\n]','',fumble_div.xpath('.//td/text()').extract()[2])
    away_TGS.Fum_Ret = home_TGS.Fum_Lost
    home_TGS.Fum_Ret = away_TGS.Fum_Lost
    # Scrape first down efficiency on 3rd down ("conversions-attempts")
    eff_div = response.xpath('//tr[@data-stat-attr="thirdDownEff"]')
    fatt_away = re.sub(r'[\\\t|\\\n]','',eff_div.xpath('.//td/text()').extract()[1])
    fatt_home = re.sub(r'[\\\t|\\\n]','',eff_div.xpath('.//td/text()').extract()[2])
    # away
    m = re.search(r'(?P<f>\d+)\-(?P<a>\d+)', fatt_away)
    away_TGS.Third_Down_Att = m.group('a')
    away_TGS.Third_Down_Conv = m.group('f')
    # home
    m = re.search(r'(?P<f>\d+)\-(?P<a>\d+)', fatt_home)
    home_TGS.Third_Down_Att = m.group('a')
    home_TGS.Third_Down_Conv = m.group('f')
    # Scrape first down efficiency on 4th down
    eff_div = response.xpath('//tr[@data-stat-attr="fourthDownEff"]')
    fatt_away = re.sub(r'[\\\t|\\\n]','',eff_div.xpath('.//td/text()').extract()[1])
    fatt_home = re.sub(r'[\\\t|\\\n]','',eff_div.xpath('.//td/text()').extract()[2])
    # away
    m = re.search(r'(?P<f>\d+)\-(?P<a>\d+)', fatt_away)
    away_TGS.Fourth_Down_Att = m.group('a')
    away_TGS.Fourth_Down_Conv = m.group('f')
    # home
    m = re.search(r'(?P<f>\d+)\-(?P<a>\d+)', fatt_home)
    home_TGS.Fourth_Down_Att = m.group('a')
    home_TGS.Fourth_Down_Conv = m.group('f')
    # Scrape time of possession; a missing cell defaults to an even
    # "30:00".  Bare except: narrowed to the exception that occurs.
    top_div = response.xpath('//tr[@data-stat-attr="possessionTime"]')
    try:
        top_away = re.sub(r'[\\\t|\\\n]','',top_div.xpath('.//td/text()').extract()[1])
    except IndexError:
        top_away = "30:00"
    try:
        top_home = re.sub(r'[\\\t|\\\n]','',top_div.xpath('.//td/text()').extract()[2])
    except IndexError:
        top_home = "30:00"
    # away
    m_away = re.search(r'(?P<h>\d+)\:(?P<m>\d+)', top_away)
    # home
    m_home = re.search(r'(?P<h>\d+)\:(?P<m>\d+)', top_home)
    try:
        away_TGS.Time_Of_Possession = str(60*int(m_away.group('h')) + int(m_away.group('m')))
        home_TGS.Time_Of_Possession = str(60*int(m_home.group('h')) + int(m_home.group('m')))
    except AttributeError:
        # search() returned None for an unparseable clock: even split
        away_TGS.Time_Of_Possession = 1800
        home_TGS.Time_Of_Possession = 1800
    # If only one side fell back to 1800, rebalance so the two sum to 3600.
    if int(away_TGS.Time_Of_Possession) == 1800 and int(home_TGS.Time_Of_Possession) != 1800:
        away_TGS.Time_Of_Possession = str(3600 - int(home_TGS.Time_Of_Possession))
    elif int(home_TGS.Time_Of_Possession) == 1800 and int(away_TGS.Time_Of_Possession) != 1800:
        home_TGS.Time_Of_Possession = str(3600 - int(away_TGS.Time_Of_Possession))
    # Scrape penalties, formatted "count-yards"
    pen_div = response.xpath('//tr[@data-stat-attr="totalPenaltiesYards"]')
    pen_away = re.sub(r'[\\\t|\\\n]','',pen_div.xpath('.//td/text()').extract()[1])
    pen_home = re.sub(r'[\\\t|\\\n]','',pen_div.xpath('.//td/text()').extract()[2])
    # away
    m = re.search(r'(?P<tot>\d+)\-(?P<yds>\d+)', pen_away)
    away_TGS.Penalty = m.group('tot')
    away_TGS.Penalty_Yard = m.group('yds')
    # home
    m = re.search(r'(?P<tot>\d+)\-(?P<yds>\d+)', pen_home)
    home_TGS.Penalty = m.group('tot')
    home_TGS.Penalty_Yard = m.group('yds')
    # Write stats to file; header only when the file is first created.
    # 'with' guarantees the handle closes even if writerows raises.
    if os.path.isfile(str(year) + " Stats/matchup-stats.csv"):
        with open(str(year) + " Stats/matchup-stats.csv", "a") as f:
            data_writer = csv.writer(f, lineterminator='\n')
            data_writer.writerows([away_TGS.Compile(), home_TGS.Compile()])
    else:
        new_rows = [away_TGS.Header(), away_TGS.Compile(), home_TGS.Compile()]
        Write_CSV(new_rows, str(year) + " Stats/matchup-stats.csv")