def main(argv):
    starturl = "http://www.espn.com/college-football/schedule"
    path = "{0}{1}/{2}".format(settings.predict_root, year, settings.predict_sched)
    print("Scrape Schedule Tool")
    print("**************************")
    print("data is from {0}".format(starturl))
    print()
    print("Year is: {0}".format(year))
    print("Directory location: {0}".format(path))
    print("**************************")
    Path(path).mkdir(parents=True, exist_ok=True)
    # Remove schedule files left over from a previous run
    for p in Path(path).glob("sched*.*"):
        p.unlink()
    # Build the list of week URLs: regular season (seasontype 2) plus bowls (seasontype 3)
    url = []
    url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    if (year == int(now.year)):
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/seasontype/2".format(starturl, week))
        url.append("{0}/_/week/1/seasontype/3".format(starturl))
    else:
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/year/{2}/seasontype/2".format(
                starturl, week, year))
        url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    # Fetch every week page and parse it with BeautifulSoup
    pages = []
    for item in url:
        req = Request(
            url=item,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
            })
        try:
            page = urlopen(req)
        except HTTPError as e:
            page = e.read()
        pages.append(BeautifulSoup(page, "html5lib"))
    loop = 0
    for page in pages:
        loop += 1
        dates = page.findAll("h2", {"class": "table-caption"})
        tables = page.findAll('table', {"class": "schedule"})
        dateidx = 0
        index = 0
        IDX = []
        Y = []
        A = []
        B = []
        C = []
        D = []
        F = []
        G = []
        for table in tables:
            teams = table.findAll('abbr')
            home = table.findAll('td', {"class": "home"})
            scores = table.findAll('td')
            # Collect the score (or status) cell for each game
            E = []
            for score in scores:
                data = score.find(text=True)
                if (data is not None
                        and ("Canceled" in data or "Postponed" in data)):
                    E.append(data)
                elif data is not None and ',' in data and num_there(data):
                    E.append(data)
                else:
                    E.append("?")
            # The last page fetched (bowls) has seven cells per row,
            # regular-season pages have six
            if loop == len(pages):
                for item in range(2, len(E), 7):
                    F.append(E[item])
            else:
                for item in range(2, len(E), 6):
                    F.append(E[item])
            neutral = table.findAll('tr', {'class': ['odd', 'even']})
            line = 0
            count = 0
            for team in teams:
                if (line % 2 == 0):
                    # Even lines are the visiting team; start a new game row
                    if dateidx < len(dates):
                        theDate = dates[dateidx].find(text=True)
                    else:
                        theDate = "?"
                    A.append(theDate)
                    # January games belong to the following calendar year
                    if "January" not in theDate:
                        Y.append(year)
                    else:
                        Y.append(year + 1)
                    B.append(pyBlitz.CleanString(team['title']))
                    if loop != len(pages):
                        try:
                            if (neutral[count]['data-is-neutral-site'] == 'true'):
                                C.append("Neutral")
                            else:
                                C.append("?")
                        except KeyError as e:
                            C.append("Neutral")
                    else:
                        C.append("Neutral")
                    if (index < len(F)):
                        G.append(F[index])
                    else:
                        G.append("?")
                    count += 1
                    index += 1
                    IDX.append(index)
                else:
                    # Odd lines are the home team; finish the game row
                    D.append(pyBlitz.CleanString(team['title']))
                    if (C[-1] == '?'):
                        C[-1] = D[-1]
                line += 1
            dateidx += 1
        df = pd.DataFrame(IDX, columns=['Index'])
        df['Year'] = Y
        df['Date'] = A
        df['TeamA'] = B
        df['Home'] = C
        df['TeamB'] = D
        df['Score'] = G
        if (not df.empty):
            # Write one sched<week>.json / sched<week>.csv pair per page
            filename = "{0}sched{1}.json".format(path, loop)
            with open(filename, 'w') as f:
                f.write(df.to_json(orient='index'))
            with open(filename) as sched_json:
                dict_sched = json.load(sched_json, object_pairs_hook=OrderedDict)
            filename = "{0}sched{1}.csv".format(path, loop)
            sched_sheet = open(filename, 'w', newline='')
            csvwriter = csv.writer(sched_sheet)
            count = 0
            for row in dict_sched.values():
                if (count == 0):
                    header = row.keys()
                    csvwriter.writerow(header)
                    count += 1
                csvwriter.writerow(row.values())
            sched_sheet.close()
    # Make the output tree readable/writable for everyone
    for root, dirs, files in os.walk(settings.predict_root):
        for d in dirs:
            os.chmod(os.path.join(root, d),
                     stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
        for f in files:
            os.chmod(
                os.path.join(root, f), stat.S_IRUSR | stat.S_IWUSR
                | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
    print("done.")
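# num_there() is called in the score loop above but is not defined in this
# excerpt. A minimal sketch of the helper, assuming it simply reports whether
# the string contains at least one digit (so a cell like "31, 24" is treated
# as a final score while "TBD" is not):
def num_there(s):
    return any(ch.isdigit() for ch in s)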
D = []
E = []
F = []
index = 0
for row in dict_bpi.values():    # Main key: put every one in
    A.append(row["School"])
    B.append("?")
    C.append("?")
    D.append("?")
    E.append("?")
    F.append(row["Class"])
    index += 1
    IDX.append(str(index))
for item in dict_stats_merge:
    teamrankings = pyBlitz.CleanString(item['teamrankings'])
    team = pyBlitz.CleanString(item['BPI'])
    if (item['corrected BPI'].strip() != ""):
        team = pyBlitz.CleanString(item['corrected BPI'])
    index = GetIndex(A, team, F)
    for row in dict_teamrankings.values():
        if (row['Team'].lower().strip() == teamrankings.lower().strip()):
            if (index > -1):
                B[index] = teamrankings
            break
for item in dict_abbr_merge:
    abbr_team = pyBlitz.CleanString(item['abbr team'])
    stats = pyBlitz.CleanString(item["stats team"].lower().strip())
    if (item["corrected stats team"].lower().strip()):
        stats = pyBlitz.CleanString(
team_set = set(AllTeams)
stats_teams = list(team_set)
stats_teams.sort()
file = "{0}merge_schedule.csv".format(settings.data_path)
merge_sheet = open(file, 'w', newline='')
csvwriter = csv.writer(merge_sheet)
dict_merge = OrderedDict()
dict_merge["scheduled team"] = []
dict_merge["match ratio"] = []
dict_merge["stats team"] = []
dict_merge["corrected stats team"] = []
values = []
for item in sched_teams:
    key = process.extractOne(item, stats_teams, scorer=fuzz.QRatio)
    dict_merge["scheduled team"].append(pyBlitz.CleanString(item))
    dict_merge["match ratio"].append(key[1])
    dict_merge["stats team"].append(pyBlitz.CleanString(key[0]))
    ovr = GetOverride(item, list_overrides)
    dict_merge["corrected stats team"].append(ovr)
    values.append([item, key[1], key[0], ovr])
#pdb.set_trace()
csvwriter.writerow(dict_merge.keys())
for value in values:
    #pdb.set_trace()
    csvwriter.writerow(value)
merge_sheet.close()
print("done.")
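# GetOverride() and list_overrides come from elsewhere in this tool. A minimal
# sketch under the assumption that list_overrides pairs a scraped team name
# with a manually corrected stats name, and that an empty string means "no
# correction needed" (downstream code tests this column with .strip() != ""):
def GetOverride(team, overrides):
    for bad_name, good_name in overrides:
        if team.lower().strip() == bad_name.lower().strip():
            return good_name
    return ""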
r = requests.post(url, data=data6, headers=headers)
soup = BeautifulSoup(r.content, "html5lib")
table6 = soup.findAll("table")
IDX = []
A = []
B = []
C = []
index = 0
for row in table1[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0 and col[0].find(text=True) != "School":
        index += 1
        IDX.append(index)
        A.append(pyBlitz.CleanString(col[0].find(text=True)))
        B.append(col[1].find(text=True))
        C.append(col[2].find(text=True))
for row in table2[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0 and col[0].find(text=True) != "School":
        index += 1
        IDX.append(index)
        A.append(pyBlitz.CleanString(col[0].find(text=True)))
        B.append(col[1].find(text=True))
        C.append(col[2].find(text=True))
for row in table3[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0 and col[0].find(text=True) != "School":
        index += 1
        IDX.append(index)
AllTeams = []
for item in dict_teamrankings.values():
    AllTeams.append(item["Team"])
team_set = set(AllTeams)
teamrankings = list(team_set)
teamrankings.sort()
file = "{0}merge_stats.csv".format(settings.data_path)
merge_sheet = open(file, 'w', newline='')
csvwriter = csv.writer(merge_sheet)
dict_merge = OrderedDict()
dict_merge["teamrankings"] = []
dict_merge["match ratio"] = []
dict_merge["BPI"] = []
dict_merge["corrected BPI"] = []
values = []
for item in teamrankings:
    key = process.extractOne(item, bpi, scorer=fuzz.QRatio)
    dict_merge["teamrankings"].append(pyBlitz.CleanString(item))
    dict_merge["match ratio"].append(key[1])
    dict_merge["BPI"].append(pyBlitz.CleanString(key[0]))
    ovr = GetOverride(item, list_overrides)
    dict_merge["corrected BPI"].append(ovr)
    values.append([item, key[1], key[0], ovr])
csvwriter.writerow(dict_merge.keys())
for value in values:
    csvwriter.writerow(value)
merge_sheet.close()
print("done.")
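# For reference: fuzzywuzzy's process.extractOne() returns a (choice, score)
# tuple, so key[0] above is the best-matching BPI school name and key[1] is
# the 0-100 QRatio similarity written out as "match ratio". Illustrative call
# (the score value shown is made up):
#
#   process.extractOne("Ohio State", bpi, scorer=fuzz.QRatio)
#   # -> ("Ohio State Buckeyes", 69)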
def main(argv):
    starturl = "http://www.espn.com/college-football/schedule"
    print("Scrape abbreviations Tool")
    print("**************************")
    print("data is from {0}".format(starturl))
    print()
    print("Year is: {0}".format(year))
    print("Directory location: {0}".format(settings.data_path))
    print("**************************")
    Path(settings.data_path).mkdir(parents=True, exist_ok=True)
    # Build the list of week URLs: regular season (seasontype 2) plus bowls (seasontype 3)
    url = []
    url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    if (year == int(now.year)):
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/seasontype/2".format(starturl, week))
        url.append("{0}/_/week/1/seasontype/3".format(starturl))
    else:
        for week in range(1, 17):
            url.append("{0}/_/week/{1}/year/{2}/seasontype/2".format(
                starturl, week, year))
        url.append("{0}/_/week/1/year/{1}/seasontype/3".format(starturl, year))
    # Fetch every week page and parse it with BeautifulSoup
    pages = []
    for item in url:
        req = Request(
            url=item,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'
            })
        try:
            page = urlopen(req)
        except HTTPError as e:
            page = e.read()
        pages.append(BeautifulSoup(page, "html5lib"))
    Path(settings.data_path).mkdir(parents=True, exist_ok=True)
    stats_sheet = open(settings.data_path + 'abbreviation.csv', 'w', newline='')
    csvwriter = csv.writer(stats_sheet)
    index = 0
    A = []
    B = []
    C = []
    D = []
    # Collect every (full name, abbreviation) pair that appears on the schedule pages
    for page in pages:
        tables = page.findAll('table', {"class": "schedule"})
        for table in tables:
            teams = table.findAll('abbr')
            for team in teams:
                A.append(pyBlitz.CleanString(team['title']))
                B.append(team.text)
                index += 1
    # De-duplicate while preserving order
    C = list(OrderedDict.fromkeys(A))
    D = list(OrderedDict.fromkeys(B))
    index = len(C)
    IDX = []
    for loop in range(1, index + 1):
        IDX.append(loop)
    df = pd.DataFrame(IDX, columns=['Index'])
    df['Team'] = C
    df['Abbreviation'] = D
    if (not df.empty):
        with open(settings.data_path + 'abbreviation.json', 'w') as f:
            f.write(df.to_json(orient='index'))
        with open(settings.data_path + "abbreviation.json") as stats_json:
            dict_stats = json.load(stats_json, object_pairs_hook=OrderedDict)
        count = 0
        for row in dict_stats.values():
            if (count == 0):
                header = row.keys()
                csvwriter.writerow(header)
                count += 1
            csvwriter.writerow(row.values())
    stats_sheet.close()
    # Make the output tree readable/writable for everyone
    for root, dirs, files in os.walk(settings.data_path):
        for d in dirs:
            os.chmod(os.path.join(root, d),
                     stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
        for f in files:
            os.chmod(
                os.path.join(root, f), stat.S_IRUSR | stat.S_IWUSR
                | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH)
    print("done.")
# Add any Missing Teams Here
AddSchool("ALABAMA-BIRMINGHAM", "UAB")
AddSchool("ALABAMA A&M", "AAMU")
AddSchool("ALBANY-NY", "ALBY")
AddSchool("WESTERN KENTUCKY", "WKU")
# Add any Missing Teams Here
for row in tables[0].findAll("tr"):
    col = row.findAll('td')
    if len(col) > 0:
        tag = str(col[0].find(text=True)).strip()
        tag2 = str(col[0].find(href=True)).lower().strip()
        if (tag != "None"):
            if ("#f" in tag2):
                index += 1
                IDX.append(index)
                A.append(pyBlitz.CleanString(tag))
                B.append(col[1].find(text=True))
df = pd.DataFrame(IDX, columns=['Index'])
df['Team'] = A
df['Abbreviation'] = B
Path(settings.data_path).mkdir(parents=True, exist_ok=True)
with open(settings.data_path + 'abbreviation.json', 'w') as f:
    f.write(df.to_json(orient='index'))
with open(settings.data_path + "abbreviation.json") as stats_json:
    dict_stats = json.load(stats_json, object_pairs_hook=OrderedDict)
stats_sheet = open(settings.data_path + 'abbreviation.csv', 'w', newline='')
csvwriter = csv.writer(stats_sheet)
count = 0
print ("teamrankings file is missing, run the scrape_teamrankings tool to create") exit() with open(file) as stats_file: dict_teamrankings = json.load(stats_file, object_pairs_hook=OrderedDict) IDX=[] A=[] B=[] C=[] D=[] E=[] F=[] G=[] H=[] index = 0 for item in dict_merge.values(): teamrankings = pyBlitz.CleanString(item['teamrankings']) team = pyBlitz.CleanString(item['BPI']) row_team = [] for row in dict_teamrankings.values(): if(row['Team'].lower().strip()==teamrankings.lower().strip()): row_team = row break for row in dict_bpi.values(): if(row['School'].lower().strip()==team.lower().strip() and row['Class'].upper().strip()=="DIVISION 1 FBS"): index+=1 IDX.append(str(index)) A.append(team) B.append(teamrankings) if (row_team):