def download_events(competition_id, event_type):
    """Download all event data for a competition.

    Args:
        competition_id: Id of competition
        event_type: String value of event type

    Returns:
        None
    """
    logging.info("Downloading events")
    # Use lazy %-style logging args; the original call passed event_type as a
    # %-format argument with no placeholder, which triggers a logging
    # formatting error instead of printing both values.
    logging.debug("competition_id=%s event_type=%s", competition_id, event_type)
    comps = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "stb", "competitions",
                     "competitions_None.csv"))
    season_ids = comps[comps.competition_id == competition_id].season_id.values
    for season_id in season_ids:
        logging.debug(season_id)
        mats = pd.read_csv(
            os.path.join(
                config.SOURCE_DIR,
                "stb",
                "matches",
                "matches_{0}_{1}.csv".format(competition_id, season_id),
            ))
        match_ids = mats.match_id.values
        # sb saves into the current working directory, so move there first.
        os.chdir(os.path.join(config.SOURCE_DIR, "stb", "events"))
        for match_id in match_ids:
            logging.debug(match_id)
            sb.Events(event_id=str(match_id)).save_data(event_type=event_type)
    return
def main():
    """Entry point for CLI usage of the events module."""
    logging.info("Executing events module")
    comp_id, ev_type = 11, "shot"
    download_competitions()
    download_matches(competition_id=comp_id)
    download_events(competition_id=comp_id, event_type=ev_type)
    build_event_data(competition_id=comp_id, event_type=ev_type)
def main():
    """Entry point for CLI usage of the results module."""
    logging.info("Executing results module")
    download_results()  # NOTE(review): some dates look wrong -- verify source
    unzip_results_files()
    format_results()  # NOTE(review): some team names do not load/parse
    archive_results_files()
    results_analysis()
def build_event_data(competition_id, event_type, directory=config.MASTER_DIR):
    """Build formatted event data from source files.

    Args:
        competition_id: Id of competition to filter competitions/matches on
        event_type: String value of event type (used in the output file name)
        directory: Folder for processed data

    Returns:
        data: Combined competitions/matches/events dataframe
    """
    logging.info("Building event data")
    comps = utilities.folder_loader("stb", "competitions")
    logging.debug(comps.info())
    matches = utilities.folder_loader("stb", "matches")
    logging.debug(matches.info())
    events = utilities.folder_loader("stb", "events", "match_event")
    logging.debug(events.info())
    # Join competitions -> matches on season, then attach event rows on
    # match_id; both joins are inner, so only fully matched rows survive.
    data = (
        comps.loc[
            comps.competition_id == competition_id,
            ["season_id", "country_name", "competition_name", "season_name"],
        ]
        .merge(
            matches.loc[
                matches.competition == competition_id,
                ["match_id", "match_date", "kick_off", "season"],
            ],
            how="inner",
            left_on="season_id",
            right_on="season",
        ).merge(
            events.loc[
                :,
                [
                    "event_type",
                    "period",
                    "minute",
                    "team",
                    "player",
                    "statsbomb_xg",
                    "type",
                    "outcome",
                    "start_location_x",
                    "start_location_y",
                    "end_location_x",
                    "end_location_y",
                    "end_location_z",
                    "match_id",
                ],
            ],
            how="inner",
            on="match_id",
        ))
    logging.debug(data.info())
    utilities.save_master(data, "events_{0}".format(event_type), directory=directory)
    return data
def download_results(directory=config.SOURCE_DIR, season_filter=()):
    """Download results.

    INPUT:
        directory: Parent folder to save results under
        season_filter: Optional list of seasons to keep (empty = keep all)

    OUTPUT:
        None

    RAISES:
        ValueError: If a scraped link is not an http(s) URL
    """
    logging.info("Downloading results")
    remotepath = os.path.dirname(config.RESULTS_SCRAPE["ftd"][0])
    soup = utilities.make_soup(config.RESULTS_SCRAPE["ftd"][0])
    logging.info(
        "Scrape list of zip files from {0}".format(config.RESULTS_SCRAPE["ftd"][0])
    )
    for table in soup.findAll("table", attrs={"cellspacing": "2", "border": "0"}):
        for item in table.findAll("a", href=True):
            season = item.text.replace("Season ", "").replace("/", "-")
            # Only apply the filter when one was supplied. The previous
            # condition (len(season) > 0 and season not in season_filter)
            # skipped EVERY season when the default empty filter was used.
            if season_filter and season not in season_filter:
                continue
            remotefile = remotepath + "/" + item["href"]
            logging.info("Remote file: {0}".format(remotefile))
            localfile = os.path.join(
                directory,
                config.RESULTS_SCRAPE["ftd"][1],
                season,
                os.path.basename(item["href"]),
            )
            logging.info("Local file: {0}".format(localfile))
            utilities.ensure_dir(localfile)
            # Guard against non-http schemes (e.g. file://) before opening.
            if remotefile.lower().startswith("http"):
                req = urllib.request.Request(remotefile)
            else:
                raise ValueError from None
            with urllib.request.urlopen(req) as response:
                with open(localfile, "wb") as output:
                    output.write(response.read())
            logging.info("Retrieve OK: " + str([remotefile, localfile]))
def download_competitions():
    """Download all competition data for events.

    Args:
        None

    Returns:
        None
    """
    logging.info("Downloading event competitions")
    # sb saves into the current working directory, so move there first.
    target_dir = os.path.join(config.SOURCE_DIR, "stb", "competitions")
    os.chdir(target_dir)
    sb.Competitions().save_data()
    return
def get_outfile(source_name):
    """Return outfile stub for given source.

    INPUT:
        source_name: String containing name of the data source

    OUTPUT:
        outfile_stub: Stub to use when saving output

    RAISES:
        ValueError: If source_name is not a recognized data source
    """
    logging.info("Mapping {0} to outfile".format(source_name))
    if source_name == "tmk_cnt":
        outfile_stub = "players_contract"
    elif source_name == "tmk_psm":
        outfile_stub = "players_performance"
    else:
        # Previously an unknown source fell through and raised a confusing
        # UnboundLocalError on the debug line below; fail explicitly instead.
        raise ValueError("Unknown source name: {0}".format(source_name))
    logging.debug(outfile_stub)
    return outfile_stub
def archive_results_files(
    parentDirectory=config.SOURCE_DIR, subDirectory=config.RESULTS_SCRAPE["ftd"][1]
):
    """Zip/cleardown all source files.

    INPUT:
        parentDirectory: Parent directory to traverse looking for files to zip/clear
        subDirectory: Sub-Directory to traverse looking for files to zip/clear

    OUTPUT:
        None
    """
    directory = os.path.join(parentDirectory, subDirectory)
    logging.info("Removing/zipping unzipped results files from {0}".format(directory))
    for root, _dirs, names in os.walk(directory):
        for name in names:
            full_path = os.path.join(root, name)
            if name.endswith(".csv"):
                # Plain csv extracts are simply removed.
                logging.info("File to delete: {0}".format(full_path))
                os.remove(full_path)
            elif name.endswith((".xls", ".xlsx")):
                # Spreadsheets are archived next to the original, then removed.
                zip_path = os.path.splitext(full_path)[0] + ".zip"
                logging.info("File to zip: {0}".format((full_path, zip_path)))
                with zipfile.ZipFile(zip_path, "w") as archive:
                    archive.write(full_path, name)
                os.remove(full_path)
def results_analysis(
    directory=config.MASTER_DIR,
    buckets=("Div", "Season"),
    stats=("HS", "AS", "HST", "AST", "FTHG", "FTAG"),
    filteron="HomeTeam",
    # Fixed: the default was ("Barcelona") -- a bare string, not a tuple --
    # which made .isin() raise "only list-like objects are allowed".
    values=("Barcelona",),
    aggfunc="mean",
):
    """Customizable analysis of results.

    INPUT:
        directory: Directory containing processed results
        buckets: List of Fields to group by
        stats: Aggregated metrics to include
        filteron: Field to filter on
        values: List of values to filter on
        aggfunc: Aggregation method

    OUTPUT:
        selected: Analysis dataframe
    """
    logging.info("Results analysis")
    dframe = utilities.get_master("results", directory)
    # Human-readable description of the query being run, for the log.
    pseudocode = (
        "SELECT " + aggfunc + " OF " + str(stats)
        + " WHERE " + filteron + " IS " + str(values)
        + " GROUPED BY " + str(buckets)
    )
    logging.info("Analysis pseudocode: {0}".format(pseudocode))
    selected = (
        dframe[dframe[filteron].isin(values)].groupby(buckets)[stats].agg(aggfunc)
    )
    logging.info(selected.describe(include="all"))
    return selected
def download_matches(competition_id):
    """Download all match data for events.

    Args:
        competition_id: Id of competition

    Returns:
        None
    """
    logging.info("Downloading event matches")
    comps_path = os.path.join(
        config.SOURCE_DIR, "stb", "competitions", "competitions_None.csv")
    comps = pd.read_csv(comps_path)
    season_ids = comps[comps.competition_id == competition_id].season_id.values
    # sb saves into the current working directory, so move there first.
    os.chdir(os.path.join(config.SOURCE_DIR, "stb", "matches"))
    for season in season_ids:
        sb.Matches(event_id=competition_id, season_id=season).save_data()
    return
def download_stadiums(
    scrapeSource=config.STADIUMS_SCRAPE, directoryIn=config.SOURCE_DIR
):
    """Download stadiums data.

    INPUT:
        scrapeSource: Info on scraped data (mapping of code -> (url, filename))
        directoryIn: Location to download to

    OUTPUT:
        Boolean: True if every download ok, False otherwise
    """
    logging.info("Downloading stadiums")
    success = True
    for _code, endpoints in scrapeSource.items():
        try:
            df = pd.read_csv(endpoints[0])
            df.to_csv(os.path.join(directoryIn, endpoints[1]), index=False)
            logging.info("retrieve OK: {0}".format(endpoints))
        except Exception as e:
            # Best-effort: log the failure and carry on with the remaining
            # sources. The previous version returned inside the loop, so
            # only the first endpoint was ever processed.
            logging.warning("retrieve FAILED: {0}".format(endpoints))
            logging.error(e)
            success = False
    return success
def unzip_results_files(
    parentDirectory=config.SOURCE_DIR, subDirectory=config.RESULTS_SCRAPE["ftd"][1]
):
    """Unzip downloaded results files.

    INPUT:
        parentDirectory: Parent directory to traverse looking for zip files
        subDirectory: Sub-Directory to traverse looking for zip files

    OUTPUT:
        None
    """
    directory = os.path.join(parentDirectory, subDirectory)
    logging.info("Unzipping results files in {0}".format(directory))
    for root, _dirs, names in os.walk(directory):
        for name in names:
            if name != "data.zip":
                continue
            archive_path = os.path.join(root, name)
            logging.info("File to unzip: {0}".format(archive_path))
            # Extract alongside the archive, then remove the zip itself.
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(root)
            os.remove(archive_path)
def format_stadiums(
    dgl_file=config.STADIUMS_SCRAPE["dgl"][1],
    ops_file=config.STADIUMS_SCRAPE["ops"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format stadiums data.

    INPUT:
        dgl_file: Path for "dgl" stadiums file
        ops_file: Path for "ops" stadiums file
        directoryOut: Directory to save formatted data to

    OUTPUT:
        None
    """
    logging.info("Formatting stadiums")

    logging.info("Parsing: {0}".format(dgl_file))
    dgl = pd.read_csv(dgl_file, encoding="utf8", sep=",")
    dgl = dgl.rename(columns={"Name": "Stadium"})
    dgl = dgl.set_index("Team")
    logging.debug("\n{0}".format(dgl))

    logging.info("Parsing: {0}".format(ops_file))
    ops = pd.read_csv(ops_file, encoding="utf8", sep=",")
    ops = ops.rename(columns={"Team": "TeamFull", "FDCOUK": "Team"})
    ops = ops.set_index("Team")
    logging.debug("\n{0}".format(ops))

    ## TODO - fuzzy matching teams? (name inconsistencies?)
    logging.info("Create combined stadiums data")
    # Prefer "ops" values, falling back to "dgl" where ops is missing.
    combo = ops.combine_first(dgl)
    combo = combo.reset_index(level=0)
    logging.debug("\n{0}".format(combo))
    utilities.save_master(combo, "stadiums", directory=directoryOut)
    return
def format_summaries(
    directoryOut=config.MASTER_DIR,
):
    """Format national statistical summary data.

    Combines per-year Elo rating json files with the Penn World Table
    (GDP/population) spreadsheet into one nations summary dataframe.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        summary: Combined Elo + Penn World Table dataframe
    """
    logging.info("Formatting national statistical summary data")
    elo_list = []
    for file in os.listdir(os.path.join(config.SOURCE_DIR, "elo")):
        if not file.endswith(".json"):
            continue
        df_json = pd.read_json(os.path.join(config.SOURCE_DIR, "elo", file))
        for i in df_json.index.values:
            df_tmp = pd.DataFrame(df_json.Team[i], index=[i])
            for col in df_tmp.columns.drop("Team"):
                # Source uses the unicode minus sign and "+" prefixes;
                # normalize to plain ints.
                df_tmp[col] = (
                    df_tmp[col].str.replace("−", "-").str.replace("+", "").astype(int)
                )
            df_tmp["Filename"] = file
            # Assumes filenames start with a 4-digit year -- TODO confirm.
            df_tmp["Year"] = int(file[:4])
            elo_list.append(df_tmp)
    elo = pd.concat(elo_list, ignore_index=True)
    # Harmonize team naming with the rest of the data set.
    elo.loc[elo.Team == "Czechia", "Team"] = "Czech Republic"
    elo.loc[elo.Team == "Yugoslavia", "Team"] = "Serbia"
    elo.loc[elo.Team == "Ireland", "Team"] = "Republic of Ireland"
    # Country is the sovereign state used to join against Penn data; the four
    # home nations all map to the United Kingdom.
    elo["Country"] = elo.Team
    elo.loc[
        elo.Team.isin(["England", "Scotland", "Wales", "Northern Ireland"]), "Country"
    ] = "United Kingdom"
    elo.loc[(elo.Team == "Russia"), "Country"] = "Russian Federation"
    penn = pd.read_excel(
        os.path.join(config.SOURCE_DIR, "rug", "pwt100.xlsx"), sheet_name="Data"
    )
    penn = penn[["country", "year", "rgdpe", "pop"]]
    penn.columns = ["Country", "Data Year", "GDP (PPP)", "Population"]
    penn.dropna(axis="index", inplace=True)
    # merge_asof requires the join key to be sorted.
    penn.sort_values(by=["Data Year"], inplace=True)
    penn.loc[penn.Country == "Ireland", "Country"] = "Republic of Ireland"
    # For each Elo year, take the most recent strictly-earlier Penn data year
    # (within 4 years) for the same country.
    summary = pd.merge_asof(
        elo,
        penn,
        left_by="Country",
        right_by="Country",
        left_on="Year",
        right_on="Data Year",
        tolerance=4,
        allow_exact_matches=False,
    )
    utilities.save_master(summary, "nations_summaries", directory=directoryOut)
    return summary
def main():
    """Entry point for CLI usage of the players module."""
    logging.info("Executing players module")
    for source in ("tmk_cnt", "tmk_psm"):
        clean_data(source)
def format_results(
    parentDirectory=config.SOURCE_DIR,
    subDirectory=config.RESULTS_SCRAPE["ftd"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format raw results and save processed output.

    INPUT:
        parentDirectory: Parent directory to traverse looking for files to zip/clear
        subDirectory: Sub-Directory to traverse looking for files to zip/clear
        directoryOut: Directory to save output to

    OUTPUT:
        None
    """
    directoryIn = os.path.join(parentDirectory, subDirectory)
    logging.info("Format results in {0}".format(directoryIn))
    pieces = []
    # Rows missing any of these columns are dropped as unusable.
    core_cols = ["Div", "Date"]  # ,'HomeTeam','AwayTeam','FTHG','FTAG','FTR']
    # Columns kept in the saved master output.
    use_cols = [
        "Season",
        "Div",
        "Country",
        "Tier",
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        "HTHG",
        "HTAG",
        "HTR",
        "Attendance",
        "Referee",
        "HS",
        "AS",
        "HST",
        "AST",
        "HHW",
        "AHW",
        "HC",
        "AC",
        "HF",
        "AF",
        "HO",
        "AO",
        "HY",
        "AY",
        "HR",
        "AR",
        "HBP",
        "ABP",
    ]
    for root, _dirs, files in os.walk(directoryIn):
        for file in files:
            if file.endswith(".csv"):
                filepath = os.path.join(root, file)
                logging.info("Filepath: {0}".format(filepath))
                df = pd.read_csv(
                    filepath,
                    error_bad_lines=False,
                    warn_bad_lines=False,
                    encoding="latin9",
                )  # , parse_dates=['Date'])
                logging.debug("Input columns: {0}".format(df.columns))
                # Season folder name, e.g. "2006-2007", is the last 9 chars
                # of the directory path.
                df["Season"] = root[-9:]
                if set(["HomeTeam", "AwayTeam"]).issubset(df.columns):
                    try:
                        df["HomeTeam"] = df[
                            "HomeTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AwayTeam"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except BaseException:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan
                elif set(["HT", "AT"]).issubset(df.columns):
                    # Older files use abbreviated HT/AT column names.
                    try:
                        df["HomeTeam"] = df[
                            "HT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                        df["AwayTeam"] = df[
                            "AT"
                        ]  # .apply(lambda x: x.decode('latin9').encode('utf-8'))
                    except BaseException:
                        df["HomeTeam"] = np.nan
                        df["AwayTeam"] = np.nan
                else:
                    # NOTE(review): a bare `raise` outside an except clause
                    # raises RuntimeError("No active exception to re-raise");
                    # an explicit exception type would be clearer -- confirm
                    # intent before changing.
                    raise
                # drop useless rows
                df = df.dropna(subset=core_cols)
                logging.debug("Output columns: {0}".format(df.columns))
                pieces.append(df)
    logging.info("Concatenate everything into a single DataFrame")
    dframe = pd.concat(pieces, ignore_index=True, sort=False)
    # Derive Country/Tier from the division code via the func_div helper.
    dframe["Country"], dframe["Tier"] = zip(*dframe["Div"].map(func_div))
    dframe.Date = pd.to_datetime(dframe.Date, dayfirst=True)
    logging.info(dframe[use_cols].info())
    utilities.save_master(
        dframe[use_cols], "results", directory=directoryOut
    )  # , enc="ascii")
def main():
    """Entry point for CLI usage of the stadiums module."""
    logging.info("Executing stadiums module")
    for step in (download_stadiums, format_stadiums):
        step()
def format_matches(
    directoryOut=config.MASTER_DIR,
):
    """Format national team match data.

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        match: National match data dataframe
    """
    logging.info("Formatting national team match data")
    comp = utilities.folder_loader(
        "fbr",
        "competition",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "Score",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    # Second layout variant additionally carries xG columns.
    comp2 = utilities.folder_loader(
        "fbr",
        "competition2",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "xG_1",
            "Score",
            "xG_2",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp = pd.concat([comp, comp2], axis=0, sort=False, ignore_index=True)
    comp.dropna(subset=["Round"], inplace=True)
    comp.reset_index(drop=True, inplace=True)
    # Assumes Date strings start with a 4-digit year -- TODO confirm.
    comp["Year"] = comp.Date.str[:4]
    # Team_1 cells end with a 3-letter abbreviation, Team_2 cells start
    # with one; split the abbreviation off into its own column.
    comp["Team_abbrev_1"] = comp["Team_1"].str[-3:].str.strip()
    comp["Team_1"] = comp["Team_1"].str[:-3].str.strip()
    comp["Team_abbrev_2"] = comp["Team_2"].str[:3].str.strip()
    comp["Team_2"] = comp["Team_2"].str[3:].str.strip()
    # Score strings may include penalty shoot-out parts in parentheses,
    # e.g. "(3) 1-1 (4)"; extract only the regulation goals.
    comp["Goals_1"] = comp.Score.str.extract(pat="(?:^|\) )([0-9]{1,2})[^0-9]+[0-9]{1,2}")
    comp["Goals_2"] = comp.Score.str.extract(pat="[0-9]{1,2}[^0-9]+([0-9]{1,2})(?:$| \()")
    for i in range(1, 3):
        comp["Goals_" + str(i)] = pd.to_numeric(
            comp["Goals_" + str(i)], errors="coerce"
        )
    comp["Goal_diff"] = comp.Goals_1 - comp.Goals_2
    logging.debug("\n{0}".format(comp.info()))
    venue = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "wkp", "wkp_std", "wkp_std_nat.csv"),
        encoding="latin9",
        sep=",",
    )
    venue.columns = ["Venue_country", "Venue_city", "Venue", "Venue_URL"]
    logging.debug("\n{0}".format(venue.info()))
    match = pd.merge(comp, venue, on="Venue", how="left")
    ## workaround for venues that aren't mapping
    match.loc[match.Venue == "Stadion Energa Gdańsk", "Venue_country"] = "Poland"
    match.loc[match.Venue == "Bakı Olimpiya Stadionu", "Venue_country"] = "Azerbaijan"
    match.loc[match.Venue == "Arena Naţională", "Venue_country"] = "Romania"
    # Flag each side as "home" when it plays in the venue's country.
    for i in range(1, 3):
        match["Home_" + str(i)] = 0
        match.loc[match["Team_" + str(i)] == match.Venue_country, "Home_" + str(i)] = 1
    logging.debug("\n{0}".format(match.info()))
    utilities.save_master(match, "nations_matches", directory=directoryOut)
    return match
def main():
    """Entry point for CLI usage of the data quality module."""
    logging.info("Executing data quality module")
    calculate_quality()
def calculate_quality(directory=config.MASTER_DIR):
    """Calculate data quality.

    Scores every master file in `directory` against a battery of data
    quality tests (consistency, completeness, timeliness, uniqueness,
    validity), saves the per-test scores as the "quality" master file, and
    returns the mean score.

    INPUT:
        directory: Location of data to assess

    OUTPUT:
        overall_score: Overall quality score across all files
    """
    logging.info("Calculating data quality")
    calc_date = datetime.datetime.today().strftime("%Y-%m-%d")
    logging.info("Current date is {0}".format(calc_date))
    dq_data = []

    def record(file_stub, file_date, category, test, score):
        # One output row per (file, category, test) measurement.
        dq_data.append({
            "file": file_stub,
            "file_date": file_date,
            "calc_date": calc_date,
            "category": category,
            "test": test,
            "score": score,
        })

    for file in os.listdir(directory):
        if file == "ftb_quality.txt":
            continue  # never score our own output
        if not file.endswith(".txt"):
            continue
        logging.info("Assessing {0}".format(file))
        file_path = os.path.join(directory, file)
        file_date = datetime.datetime.fromtimestamp(
            os.path.getmtime(file_path)).strftime("%Y-%m-%d")
        # Fixed: previously logged calc_date here instead of the file's date.
        logging.info("File modification date is {0}".format(file_date))
        file_stub = file.replace("ftb_", "").replace(".txt", "")
        df = utilities.get_master(file_stub, directory=directory)
        if df.shape[0] > 50000:
            # Cap very large files; fixed seed keeps the scores reproducible.
            df = df.sample(50000, replace=False, random_state=42)
        no_of_rows, no_of_columns = df.shape
        no_of_cells = no_of_rows * no_of_columns

        # Consistency, coherence, or clarity
        category = "Consistency"
        logging.info("Running {0} tests".format(category))
        test = None
        if file in ["ftb_events_shot.txt"]:
            test = "xG between 0 and 1"
            score = 1 - (df[~df.statsbomb_xg.between(0, 1)].shape[0] / df.shape[0])
        elif file in ["ftb_fulldata.txt"]:
            test = "Goals <= Shots"
            score = 1 - (df[df["Goals"] > df["Shots"]].shape[0] / df.shape[0])
        elif file in ["ftb_managers.txt"]:
            test = "DateFrom <= DateTo"
            score = 1 - (df[df.DateFrom > df.DateTo].shape[0] / df.shape[0])
        elif file in ["ftb_players_contract.txt"]:
            test = "Joined <= Contract expires"
            score = 1 - (df[df["Joined"] > df["Contract expires"]].shape[0]
                         / df.shape[0])
        elif file in ["ftb_players_performance.txt"]:
            test = "Games started <= In squad"
            score = 1 - (df[df["Games started"] > df["In squad"]].shape[0]
                         / df.shape[0])
        elif file in ["ftb_results.txt"]:
            test = "Home goals <= Home shots"
            score = 1 - (df[df["FTHG"] > df["HS"]].shape[0] / df.shape[0])
        elif file in ["ftb_nations_matches.txt"]:
            test = "Max one home team"
            score = 1 - (df[df["Home_1"] + df["Home_2"] == 2].shape[0] / df.shape[0])
        if test:
            record(file_stub, file_date, category, test, score)

        # Completeness or comprehensiveness
        category = "Completeness"
        logging.info("Running {0} tests".format(category))
        record(file_stub, file_date, category, "Missing values",
               df.count().sum() / no_of_cells)

        # Timeliness or latency
        category = "Timeliness"
        logging.info("Running {0} tests".format(category))
        # Crude recency metric on yyyymmdd integers; clipped at zero.
        score = max(
            1 - ((int(calc_date.replace("-", "")) -
                  int(file_date.replace("-", ""))) / 100000),
            0,
        )
        record(file_stub, file_date, category, "Days since file updated", score)
        date_field = None
        if file in [
                "ftb_fulldata.txt", "ftb_results.txt", "ftb_nations_matches.txt"
        ]:
            date_field = "Date"
        elif file in ["ftb_events_shot.txt"]:
            date_field = "match_date"
        elif file in ["ftb_managers.txt"]:
            date_field = "DateTo"
        if date_field:
            score = max(
                1 - ((int(calc_date.replace("-", "")) -
                      int(df[date_field].max().replace("-", ""))) / 100000),
                0,
            )
            record(file_stub, file_date, category,
                   "Days since last match date", score)

        # Accuracy or correctness
        category = "Accuracy"
        logging.info("Running {0} tests".format(category))
        # Spot tests against reference data (TODO):
        #   Wikipedia (Ajax, Frankfurt)
        #   Don Balon (Spain 08/09)
        #   SkySports Football Yearbook (England & Scotland 07/08)

        # Uniqueness
        category = "Uniqueness"
        logging.info("Running {0} tests".format(category))
        record(file_stub, file_date, category, "Duplicated rows",
               df.drop_duplicates().shape[0] / no_of_rows)
        # Fixed: compare unique-column count against total columns; the old
        # expression (shape[0] / no_of_rows after a double transpose) was
        # always 1.0 regardless of duplicated columns.
        record(file_stub, file_date, category, "Duplicated columns",
               df.T.drop_duplicates().T.shape[1] / no_of_columns)

        # Validity or reasonableness
        category = "Validity"
        logging.info("Running {0} tests".format(category))
        score = 1 - (
            ((df < (df.mean() - 3 * df.std())) |
             (df > df.mean() + 3 * df.std())).sum().sum() / no_of_cells)
        record(file_stub, file_date, category, "3 stdev from mean", score)
        Q1 = df.quantile(0.25)
        Q3 = df.quantile(0.75)
        IQR = Q3 - Q1
        score = 1 - (((df < (Q1 - 1.5 * IQR)) |
                      (df > (Q3 + 1.5 * IQR))).sum().sum() / no_of_cells)
        record(file_stub, file_date, category, "1.5 IQR rule", score)

        # Other dimensions not yet implemented: orderliness, auditability,
        # conformity, accessibility/availability, comparability,
        # credibility/reliability/reputation, relevance/usefulness.

    df_dq = pd.DataFrame(dq_data)
    df_dq["score"] = df_dq["score"].clip(lower=0, upper=1)
    utilities.save_master(df_dq, "quality", directory=directory)
    overall_score = df_dq.score.mean()
    logging.info("Overall score is {0}".format(overall_score))
    return overall_score
def main():
    """Entry point for CLI usage of the clubs module."""
    logging.info("Executing clubs module")
    build_fulldata()
    get_summary("Div")
def clean_data(source_name, directory=config.MASTER_DIR):
    """Clean raw player data and save processed version.

    INPUT:
        source_name: String containing name of the data source
                     ("tmk_cnt" = contracts, "tmk_psm" = performance)
        directory: Directory to save output to

    OUTPUT:
        df: Dataframe containing the cleaned data
    """
    logging.info("Loading {0} data".format(source_name))
    # Per-source column layout and cleaning configuration.
    if source_name == "tmk_cnt":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Date of birth",
            "Nationality",
            "Height",
            "Foot",
            "Joined",
            "Signed from",
            "Contract expires",
            "Market value",
        ]
        drop_cols = ["Nationality", "Signed from", "Competition"]
        notna_cols = ["Market value"]
    elif source_name == "tmk_psm":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Age",
            "Nationality",
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        drop_cols = ["Nationality"]
        notna_cols = ["In squad"]
    df = utilities.folder_loader(source_name[:3], source_name, "comp_season",
                                 source_header=source_header)
    ## Name and Position are mis-aligned in the source files; realign by
    ## back-filling Name and shifting it into Position.
    df["Name"].fillna(method="bfill", inplace=True)
    df["Position"] = df.Name.shift(-1)
    df.loc[df.Position == df.Name, "Position"] = df.Name.shift(-2)
    df.drop(axis=1, columns=drop_cols, inplace=True)
    df.dropna(subset=notna_cols, inplace=True)
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    # Normalize the source's various "no data" markers to NaN.
    df = df.replace("-", np.nan)
    df = df.replace("Was not used during this season", np.nan)
    df = df.replace("Not in squad during this season", np.nan)
    df = df.replace("Not used during this season", np.nan)
    df["Shirt number"] = pd.to_numeric(df["Shirt number"], downcast="integer")
    # Bucket free-text positions into G/D/M/F groups by keyword.
    df["Position group"] = None
    df.loc[(df.Position.str.upper().str.contains("KEEPER")) |
           (df.Position.str.upper().str.contains("GOAL")),
           "Position group",
    ] = "G"
    df.loc[(df.Position.str.upper().str.contains("BACK")) |
           (df.Position.str.upper().str.contains("DEF")),
           "Position group",
    ] = "D"
    df.loc[(df.Position.str.upper().str.contains("MID")) |
           (df.Position.str.upper().str.contains("MIT")) |
           (df.Position.str.upper().str.contains("WING")),
           "Position group",
    ] = "M"
    df.loc[(df.Position.str.upper().str.contains("STRIKER")) |
           (df.Position.str.upper().str.contains("FORW")),
           "Position group",
    ] = "F"
    if source_name == "tmk_cnt":
        # "Date of birth" looks like "Mon DD, YYYY (age)"; split out both.
        df["Age"] = (df["Date of birth"].str.extract(
            r".*([0-9]{2})", expand=False).astype("int"))
        df["Date of birth"] = pd.to_datetime(
            df["Date of birth"].str.extract(r"(.*) \([0-9]{2}\)", expand=False),
            format="%b %d, %Y",
        )
        df["Joined"] = pd.to_datetime(df.Joined, format="%b %d, %Y")
        df["Contract expires"] = pd.to_datetime(df["Contract expires"],
                                                format="%d.%m.%Y")
        # Height like "1,85 m" -> 185.0 (centimetres, presumably -- verify).
        df["Height"] = (df["Height"].str.strip().str.replace(
            " ", "").str.replace(",", "").str.replace("m", "").replace({
                "-": np.nan,
                "": np.nan
            }).astype(float))
        # Back-fill Height/Foot across seasons for players who have the value
        # in some seasons but not others.
        df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
               & df.Name.isin(df[df.Height.isna()].Name.values),
               "Height",
        ] = (
            df.loc[df.Name.isin(df[df.Height.notna()].Name.values)
                   & df.Name.isin(df[df.Height.isna()].Name.values)].
            sort_values(by=["Name", "Season"]).Height.fillna(
                method="bfill"))
        df.loc[df.Name.isin(df[df.Foot.notna()].Name.values)
               & df.Name.isin(df[df.Foot.isna()].Name.values),
               "Foot",
        ] = (df.loc[
            df.Name.isin(df[df.Foot.notna()].Name.values)
            & df.Name.isin(df[df.Foot.isna()].Name.values)].sort_values(
                by=["Name", "Season"]).Foot.fillna(method="bfill"))
        # Market value like "£1.5m" / "£500k" -> numeric, in millions.
        df["Market value"] = (
            df["Market value"].str.strip().replace({
                "-": np.nan
            }).replace(r"[£kmTh\.]", "", regex=True).astype(float) *
            df["Market value"].str.extract(
                r"[\d\.]+([kmTh\.]+)", expand=False).fillna(1).replace(
                    ["k", "Th.", "m"], [10**3, 10**3, 10**6]).astype(int) /
            10**6)
    elif source_name == "tmk_psm":
        # PPG uses a decimal comma; minutes use "." thousands separators
        # and a trailing apostrophe.
        df["PPG"] = df["PPG"].str.strip().replace(r"[,]", ".", regex=True).astype(float)
        df["Minutes played"] = (df["Minutes played"].str.strip().replace(
            r"[.\']", "", regex=True).astype(float))
        # Missing performance counters mean "none", not "unknown".
        df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]] = df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]].fillna(0)
        df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]] = df[[
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]].astype(float)
    logging.debug(df.describe(include="all"))
    logging.info("Saving processed data to ")
    utilities.save_master(df, get_outfile(source_name), directory=directory)
    return df
def main():
    """Entry point for CLI usage of the nations module."""
    logging.info("Executing nations module")
    for step in (format_matches, format_summaries):
        step()
def build_fulldata(directory=config.MASTER_DIR):
    """Combine results, stadiums and managers data into full dataset for clubs.

    INPUT:
        directory: Directory to save output to

    OUTPUT:
        fulldata: Dataframe containing all the clubs data
    """
    logging.info(
        "Building fulldata dataframe from results, stadiums, managers ...")
    # Column renames for the home-team perspective; the away-team renames are
    # derived below by swapping the "Opp" suffix.
    home_renames = {
        "HomeTeam": "Team",
        "AwayTeam": "TeamOpp",
        "FTHG": "Goals",
        "FTAG": "GoalsOpp",
        "HTHG": "Goals1stHalf",
        "HTAG": "Goals1stHalfOpp",
        "HS": "Shots",
        "AS": "ShotsOpp",
        "HST": "ShotsOnTarget",
        "AST": "ShotsOnTargetOpp",
        "HHW": "ShotsHitWoodwork",
        "AHW": "ShotsHitWoodworkOpp",
        "HC": "Corners",
        "AC": "CornersOpp",
        "HF": "Fouls",
        "AF": "FoulsOpp",
        "HO": "Offsides",
        "AO": "OffsidesOpp",
        "HY": "YellowCards",
        "AY": "YellowCardsOpp",
        "HR": "RedCards",
        "AR": "RedCardsOpp",
        "HBP": "BookingPoints",
        "ABP": "BookingPointsOpp",
    }
    away_renames = {}
    for key, val in home_renames.items():
        if val.endswith("Opp"):
            away_renames[key] = val[:-3]
        else:
            away_renames[key] = val + "Opp"
    # Stats to turn into per-match Diff/Total derived columns.
    stat_to_diff = [
        "Goals",
        "Goals1stHalf",
        "Shots",
        "ShotsOnTarget",
        "ShotsHitWoodwork",
        "Corners",
        "Fouls",
        "Offsides",
        "YellowCards",
        "RedCards",
        "BookingPoints",
    ]
    logging.info("Process results")
    # Each fixture appears twice: once from each team's perspective.
    results = utilities.get_master("results", directory=directory)
    homeresults = results.rename(columns=home_renames)
    homeresults["HomeAway"] = "Home"
    awayresults = results.rename(columns=away_renames)
    awayresults["HomeAway"] = "Away"
    allresults = pd.concat([homeresults, awayresults],
                           ignore_index=True,
                           sort=False)
    allresults.drop(["FTR", "HTR", "Unnamed: 0"], axis=1, inplace=True)
    for stat in stat_to_diff:
        allresults[stat + "Diff"] = allresults[stat] - allresults[stat + "Opp"]
        allresults["Total" + stat] = allresults[stat] + allresults[stat + "Opp"]
    # Saves = opponent's on-target shots that did not become goals.
    allresults[
        "Saves"] = allresults["ShotsOnTargetOpp"] - allresults["GoalsOpp"]
    allresults["SavesOpp"] = allresults["ShotsOnTarget"] - allresults["Goals"]
    allresults["SavesDiff"] = allresults["Saves"] - allresults["SavesOpp"]
    allresults[
        "Goals2ndHalf"] = allresults["Goals"] - allresults["Goals1stHalf"]
    allresults["Goals2ndHalfOpp"] = (allresults["GoalsOpp"] -
                                     allresults["Goals1stHalfOpp"])
    allresults["Goals2ndHalfDiff"] = (allresults["GoalsDiff"] -
                                      allresults["Goals1stHalfDiff"])
    # Result,Points,PointsOpp,Win,WinDraw,Draw,DrawLoss,Loss,CleanSheet,...
    # are all derived from the goal difference via the func_score helper.
    (
        allresults["Result"],
        allresults["Points"],
        allresults["PointsOpp"],
        allresults["Win"],
        allresults["WinDraw"],
        allresults["Draw"],
        allresults["DrawLoss"],
        allresults["Loss"],
        allresults["WinShare"],
    ) = zip(*allresults["GoalsDiff"].map(func_score))
    allresults["CleanSheet"] = allresults["Goals"].map(func_nogoal)
    allresults["CleanSheetOpp"] = allresults["GoalsOpp"].map(func_nogoal)
    # GameWeek = 1-based match count per team within a season/division.
    allresults["GameWeek"] = (allresults.sort_values("Date").groupby(
        ["Season", "Div", "Team"]).cumcount() + 1)
    ## TODO - Validate derived values
    logging.info("Process stadiums")
    # Attach stadium info for both the team and its opponent.
    stadiums = utilities.get_master("stadiums", directory=directory)
    stadiums.drop(["Country", "TeamFull"], axis=1, inplace=True)
    fulldata = pd.merge(allresults, stadiums, on="Team", how="left")
    stadiums.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        stadiums,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata.drop(["Unnamed: 0", "Unnamed: 0Opp"], axis=1, inplace=True)
    # Straight-line distance in lat/long degrees between the two grounds.
    fulldata["EuclideanDistance"] = (
        (fulldata.Latitude - fulldata.LatitudeOpp)**2 +
        (fulldata.Longitude - fulldata.LongitudeOpp)**2)**0.5
    logging.info("Process managers")
    # Attach the manager whose tenure covers the match date (kept as NaN when
    # no manager record matches); applied for the team, then the opponent.
    managers = utilities.get_master("managers", directory=directory)
    managers.dropna(subset=["Manager"], inplace=True)
    fulldata = pd.merge(fulldata, managers, on="Team", how="left")
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["Manager"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()
    managers.rename(columns={"Team": "TeamOpp"}, inplace=True)
    fulldata = pd.merge(fulldata,
                        managers,
                        on="TeamOpp",
                        how="left",
                        suffixes=("", "Opp"))
    fulldata = fulldata[((fulldata["Date"] >= fulldata["DateFrom"]) &
                         (fulldata["Date"] <= fulldata["DateTo"])) |
                        (fulldata["ManagerOpp"].isnull())]
    fulldata.drop(
        ["Unnamed: 0", "DateFrom", "DateTo", "Duration", "YearRange"],
        axis=1,
        inplace=True,
    )
    fulldata = fulldata.drop_duplicates()
    utilities.save_master(fulldata, "fulldata", directory=directory)
    return fulldata