Exemple #1
0
def build_event_data(competition_id, event_type, directory=config.MASTER_DIR):
    """Build formatted event data from source files.

    Loads the "stb" competitions, matches and events folders, keeps the
    seasons and matches that belong to ``competition_id``, joins the event
    rows onto them and saves the result as ``events_<event_type>``.

    Args:
        competition_id: Value matched against ``comps.competition_id`` and
            ``matches.competition``
        event_type: String value of event type (used in the output name)
        directory: Folder for processed data

    Returns:
        The merged dataframe that was passed to ``utilities.save_master``
    """
    import io

    def _debug_info(frame):
        # DataFrame.info() writes its summary to a buffer and returns None,
        # so capture the text instead of logging the return value.
        # Assumes folder_loader returns pandas DataFrames - TODO confirm.
        if not logging.getLogger().isEnabledFor(logging.DEBUG):
            return
        buffer = io.StringIO()
        frame.info(buf=buffer)
        logging.debug("\n%s", buffer.getvalue())

    logging.info("Building event data")

    comps = utilities.folder_loader("stb", "competitions")
    _debug_info(comps)
    matches = utilities.folder_loader("stb", "matches")
    _debug_info(matches)
    events = utilities.folder_loader("stb", "events", "match_event")
    _debug_info(events)

    comp_cols = ["season_id", "country_name", "competition_name", "season_name"]
    match_cols = ["match_id", "match_date", "kick_off", "season"]
    event_cols = [
        "event_type",
        "period",
        "minute",
        "team",
        "player",
        "statsbomb_xg",
        "type",
        "outcome",
        "start_location_x",
        "start_location_y",
        "end_location_x",
        "end_location_y",
        "end_location_z",
        "match_id",
    ]

    # Seasons of the requested competition -> their matches -> their events.
    selected_comps = comps.loc[comps.competition_id == competition_id, comp_cols]
    selected_matches = matches.loc[matches.competition == competition_id,
                                   match_cols]
    data = selected_comps.merge(
        selected_matches,
        how="inner",
        left_on="season_id",
        right_on="season",
    ).merge(
        events.loc[:, event_cols],
        how="inner",
        on="match_id",
    )
    _debug_info(data)

    utilities.save_master(data,
                          "events_{0}".format(event_type),
                          directory=directory)

    return data
Exemple #2
0
def get_outfile(source_name):
    """Return outfile stub for given source.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" or "tmk_psm")

    OUTPUT:
        outfile_stub: Stub to use when saving output

    RAISES:
        ValueError: If source_name is not one of the known sources
    """
    logging.info("Mapping {0} to outfile".format(source_name))

    stubs = {
        "tmk_cnt": "players_contract",
        "tmk_psm": "players_performance",
    }
    if source_name not in stubs:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError(
            "No outfile mapping for source {0!r}; expected one of {1}".format(
                source_name, sorted(stubs)))

    outfile_stub = stubs[source_name]
    logging.debug(outfile_stub)
    return outfile_stub
Exemple #3
0
def download_events(competition_id, event_type):
    """Download all event data.

    Reads the competitions file to find the seasons of ``competition_id``,
    reads each season's matches file, and calls
    ``sb.Events(...).save_data`` for every match id while the working
    directory is the "stb/events" source folder.

    Args:
        competition_id: Id of competition
        event_type: String value of event type

    Returns:
        None
    """
    logging.info("Downloading events")

    # logging.debug(msg, *args): the first argument must be the format string.
    logging.debug("competition_id=%s, event_type=%s", competition_id,
                  event_type)

    # Resolve paths once, before any chdir, so a relative SOURCE_DIR still
    # points at the same place for the second and later seasons.
    stb_dir = os.path.abspath(os.path.join(config.SOURCE_DIR, "stb"))
    events_dir = os.path.join(stb_dir, "events")

    comps = pd.read_csv(
        os.path.join(stb_dir, "competitions", "competitions_None.csv"))
    season_ids = comps[comps.competition_id == competition_id].season_id.values

    starting_dir = os.getcwd()
    try:
        for season_id in season_ids:
            logging.debug(season_id)
            mats = pd.read_csv(
                os.path.join(
                    stb_dir,
                    "matches",
                    "matches_{0}_{1}.csv".format(competition_id, season_id),
                ))

            # NOTE(review): save_data presumably writes relative to the
            # current directory, hence the chdir - confirm against sb.Events.
            os.chdir(events_dir)
            for match_id in mats.match_id.values:
                logging.debug(match_id)
                sb.Events(event_id=str(match_id)).save_data(
                    event_type=event_type)
    finally:
        # Do not leave the process in the events folder after returning.
        os.chdir(starting_dir)

    return
def format_stadiums(
    dgl_file=config.STADIUMS_SCRAPE["dgl"][1],
    ops_file=config.STADIUMS_SCRAPE["ops"][1],
    directoryOut=config.MASTER_DIR,
):
    """Combine the two scraped stadium files into one master dataset.

    Both files are indexed by "Team"; values from the "ops" file take
    priority and gaps are filled from the "dgl" file.

    INPUT:
        dgl_file: Path for "dgl" stadiums file
        ops_file: Path for "ops" stadiums file
        directoryOut: Directory to save formatted data to

    OUTPUT:
        None
    """

    def _load_by_team(path, renames):
        # Read one source, normalise its column names and key it on Team.
        logging.info("Parsing: {0}".format(path))
        frame = (pd.read_csv(path, encoding="utf8", sep=",")
                 .rename(columns=renames)
                 .set_index("Team"))
        logging.debug("\n{0}".format(frame))
        return frame

    logging.info("Formatting stadiums")

    dgl_frame = _load_by_team(dgl_file, {"Name": "Stadium"})
    # In the "ops" file the FDCOUK column holds the team key; the original
    # Team column is kept as TeamFull.
    ops_frame = _load_by_team(ops_file, {"Team": "TeamFull", "FDCOUK": "Team"})

    ## TODO - fuzzy matching teams? (name inconsistencies?)

    logging.info("Create combined stadiums data")
    merged = ops_frame.combine_first(dgl_frame).reset_index(level=0)
    logging.debug("\n{0}".format(merged))

    utilities.save_master(merged, "stadiums", directory=directoryOut)

    return
Exemple #5
0
def format_matches(
    directoryOut=config.MASTER_DIR,
):
    """Format national team match data.

    Loads the two "fbr" competition folders (without and with xG columns),
    stacks them, derives year, team abbreviations, goals and goal
    difference, attaches venue details and flags which team (if any) is
    playing in its own country, then saves the result as "nations_matches".

    INPUT:
        directoryOut: Directory to save formatted data to

    OUTPUT:
        match: National match data dataframe
    """
    import io

    def _debug_info(frame):
        # DataFrame.info() writes to a buffer and returns None, so capture
        # the text instead of formatting the return value.
        if not logging.getLogger().isEnabledFor(logging.DEBUG):
            return
        buffer = io.StringIO()
        frame.info(buf=buffer)
        logging.debug("\n%s", buffer.getvalue())

    logging.info("Formatting national team match data")

    comp = utilities.folder_loader(
        "fbr",
        "competition",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "Score",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    # Same layout as above plus an xG column either side of the score.
    comp2 = utilities.folder_loader(
        "fbr",
        "competition2",
        source_header=[
            "Round",
            "Wk",
            "Day",
            "Date",
            "Time",
            "Team_1",
            "xG_1",
            "Score",
            "xG_2",
            "Team_2",
            "Attendance",
            "Venue",
            "Referee",
            "Match Report",
            "Notes",
        ],
    )
    comp = pd.concat([comp, comp2], axis=0, sort=False, ignore_index=True)
    comp = comp.dropna(subset=["Round"]).reset_index(drop=True)
    comp["Year"] = comp.Date.str[:4]

    # Team_1 ends with a three-character abbreviation, Team_2 starts with one.
    comp["Team_abbrev_1"] = comp["Team_1"].str[-3:].str.strip()
    comp["Team_1"] = comp["Team_1"].str[:-3].str.strip()
    comp["Team_abbrev_2"] = comp["Team_2"].str[:3].str.strip()
    comp["Team_2"] = comp["Team_2"].str[3:].str.strip()

    # The main score may be preceded by "(n) " and followed by " (n)";
    # take only the two numbers of the main score.
    comp["Goals_1"] = comp.Score.str.extract(
        r"(?:^|\) )([0-9]{1,2})[^0-9]+[0-9]{1,2}", expand=False)
    comp["Goals_2"] = comp.Score.str.extract(
        r"[0-9]{1,2}[^0-9]+([0-9]{1,2})(?:$| \()", expand=False)
    for goals_col in ("Goals_1", "Goals_2"):
        comp[goals_col] = pd.to_numeric(comp[goals_col], errors="coerce")
    comp["Goal_diff"] = comp.Goals_1 - comp.Goals_2
    _debug_info(comp)

    venue = pd.read_csv(
        os.path.join(config.SOURCE_DIR, "wkp", "wkp_std", "wkp_std_nat.csv"),
        encoding="latin9",
        sep=",",
    )
    venue.columns = ["Venue_country", "Venue_city", "Venue", "Venue_URL"]
    _debug_info(venue)

    match = pd.merge(comp, venue, on="Venue", how="left")

    ## workaround for venues that aren't mapping
    match.loc[match.Venue == "Stadion Energa Gdańsk", "Venue_country"] = "Poland"
    match.loc[match.Venue == "Bakı Olimpiya Stadionu", "Venue_country"] = "Azerbaijan"
    match.loc[match.Venue == "Arena Naţională", "Venue_country"] = "Romania"

    # Home_n is 1 when team n's name equals the venue's country, else 0.
    for i in range(1, 3):
        match["Home_" + str(i)] = 0
        match.loc[match["Team_" + str(i)] == match.Venue_country, "Home_" + str(i)] = 1

    _debug_info(match)

    utilities.save_master(match, "nations_matches", directory=directoryOut)

    return match
Exemple #6
0
def get_summary(
        group_key,
        df=None,
        agg_method="mean",
        base_filters=None,
        metric_mins=None,
        output_metrics=(),
):
    """Generate summarised clubs data.

    INPUT:
        group_key: Name of the column to group data on
        df: (Optional) clubs dataframe; loaded from the "fulldata" master
            when omitted. A passed-in frame is not modified.
        agg_method: Aggregation method handed to groupby().agg()
        base_filters: Dictionary of Field -> allowed values; base rows are
            kept when the field value is in the given values
        metric_mins: Dictionary of Field -> minimum value; aggregated rows
            are kept when the field is >= the minimum
        output_metrics: Metric fields to include in output (all columns when
            empty); any sequence of column names

    OUTPUT:
        df: Aggregated dataframe, one row per group_key value, with a
            NumberOfMatches count column and the derived shot metrics
    """
    logging.debug("Get summarised data")

    # Avoid shared mutable defaults.
    base_filters = {} if base_filters is None else base_filters
    metric_mins = {} if metric_mins is None else metric_mins

    if df is None:
        # fetch from master csv
        df = utilities.get_master("fulldata")

    # filter unwanted records
    for field, vals in base_filters.items():
        df = df[df[field].isin(vals)]

    # Non-inplace so the caller's dataframe is left untouched.
    df = df.dropna(subset=[group_key])

    # aggregate data
    df_avg = df.groupby(group_key).agg(agg_method)
    # Name the count series explicitly: the name value_counts() gives its
    # result differs between pandas versions.
    df_cnt = df[group_key].value_counts().rename("NumberOfMatches")
    df = pd.concat([df_cnt, df_avg], axis=1, sort=True)
    if "Unnamed: 0" in df.columns:
        df = df.drop(["Unnamed: 0"], axis=1)

    # add derived metrics
    df["ShotAccuracy"] = df["ShotsOnTarget"] / df["Shots"]
    df["ShotAccuracyOpp"] = df["ShotsOnTargetOpp"] / df["ShotsOpp"]
    df["ShotPercent"] = df["Goals"] / df["ShotsOnTarget"]
    df["ShotPercentOpp"] = df["GoalsOpp"] / df["ShotsOnTargetOpp"]
    df["SavePercent"] = df["Saves"] / df["ShotsOnTargetOpp"]
    df["SavePercentOpp"] = df["SavesOpp"] / df["ShotsOnTarget"]
    df["ShotConversion"] = df["Goals"] / df["Shots"]
    df["ShotConversionOpp"] = df["GoalsOpp"] / df["ShotsOpp"]
    df["TSR"] = df["Shots"] / df["TotalShots"]
    df["TSROpp"] = df["ShotsOpp"] / df["TotalShots"]
    df["ShotOnTargetRatio"] = df["ShotsOnTarget"] / df["TotalShotsOnTarget"]
    df["ShotOnTargetRatioOpp"] = df["ShotsOnTargetOpp"] / df[
        "TotalShotsOnTarget"]
    df["ShotDominance"] = df["Shots"] / df["ShotsOpp"]
    df["ShotPace"] = df["TotalShots"]
    df["PDO"] = 1000 * (df["ShotPercent"] + df["SavePercent"])
    df["PDOOpp"] = 1000 * (df["ShotPercentOpp"] + df["SavePercentOpp"])
    df["%TSoTt"] = df["ShotAccuracy"] + (1 - df["ShotAccuracyOpp"])
    df["%TSoTtOpp"] = df["ShotAccuracyOpp"] + (1 - df["ShotAccuracy"])
    # Each component is pulled towards its centre (0.5, 1.0, 1000) by the
    # square root of a fixed weight before the three are multiplied.
    df["GraysonRating"] = ((0.5 + (df["TSR"] - 0.5) * 0.732**0.5) *
                           (1.0 + (df["%TSoTt"] - 1.0) * 0.166**0.5) *
                           (1000 + (df["PDO"] - 1000) * 0.176**0.5))
    df["GraysonRatingOpp"] = ((0.5 + (df["TSROpp"] - 0.5) * 0.732**0.5) *
                              (1.0 + (df["%TSoTtOpp"] - 1.0) * 0.166**0.5) *
                              (1000 + (df["PDOOpp"] - 1000) * 0.176**0.5))
    # Rescale the rating linearly so that 363 maps to 0 and 695 maps to 10.
    df["GraysonScore"] = 10 * (df["GraysonRating"] - 363) / (695 - 363)
    df["GraysonScoreOpp"] = 10 * (df["GraysonRatingOpp"] - 363) / (695 - 363)

    # filter unwanted aggregate data
    for field, val in metric_mins.items():
        df = df[df[field] >= val]

    if output_metrics:
        # A tuple passed to df[...] is looked up as one column key, so turn
        # any non-string sequence into a list of column names first.
        if isinstance(output_metrics, str):
            df = df[output_metrics]
        else:
            df = df[list(output_metrics)]
    logging.debug("Showing summarised dataframe...\n{0}".format(df))
    return df
Exemple #7
0
def _read_results_csv(filepath):
    """Read one latin9 results CSV, skipping malformed lines without warning."""
    try:
        return pd.read_csv(filepath, on_bad_lines="skip", encoding="latin9")
    except TypeError:
        # Older pandas has no on_bad_lines; use the flags it replaced.
        return pd.read_csv(
            filepath,
            error_bad_lines=False,
            warn_bad_lines=False,
            encoding="latin9",
        )


def format_results(
    parentDirectory=config.SOURCE_DIR,
    subDirectory=config.RESULTS_SCRAPE["ftd"][1],
    directoryOut=config.MASTER_DIR,
):
    """Format raw results and save processed output.

    Walks the input directory, reads every ``.csv`` file, tags its rows with
    a season taken from the folder path, normalises the team column names,
    stacks everything and saves the selected columns as "results".

    INPUT:
        parentDirectory: Parent directory of the raw results
        subDirectory: Sub-directory (under the parent) that is walked for
            csv files
        directoryOut: Directory to save output to

    OUTPUT:
        None

    RAISES:
        RuntimeError: If a csv file has neither HomeTeam/AwayTeam nor HT/AT
            columns
    """
    import io

    directoryIn = os.path.join(parentDirectory, subDirectory)
    logging.info("Format results in {0}".format(directoryIn))
    pieces = []
    core_cols = ["Div", "Date"]
    use_cols = [
        "Season",
        "Div",
        "Country",
        "Tier",
        "Date",
        "HomeTeam",
        "AwayTeam",
        "FTHG",
        "FTAG",
        "FTR",
        "HTHG",
        "HTAG",
        "HTR",
        "Attendance",
        "Referee",
        "HS",
        "AS",
        "HST",
        "AST",
        "HHW",
        "AHW",
        "HC",
        "AC",
        "HF",
        "AF",
        "HO",
        "AO",
        "HY",
        "AY",
        "HR",
        "AR",
        "HBP",
        "ABP",
    ]

    for root, _dirs, files in os.walk(directoryIn):
        for filename in files:
            if not filename.endswith(".csv"):
                continue

            filepath = os.path.join(root, filename)
            logging.info("Filepath: {0}".format(filepath))
            df = _read_results_csv(filepath)
            logging.debug("Input columns: {0}".format(df.columns))

            # The season label is the last nine characters of the folder path.
            df["Season"] = root[-9:]

            # Some files name the team columns HT/AT instead.
            if not {"HomeTeam", "AwayTeam"}.issubset(df.columns):
                if {"HT", "AT"}.issubset(df.columns):
                    df["HomeTeam"] = df["HT"]
                    df["AwayTeam"] = df["AT"]
                else:
                    # The original bare `raise` here also produced a
                    # RuntimeError, just without a useful message.
                    raise RuntimeError(
                        "No HomeTeam/AwayTeam or HT/AT columns in {0}".format(
                            filepath))

            # drop useless rows
            df = df.dropna(subset=core_cols)
            logging.debug("Output columns: {0}".format(df.columns))

            pieces.append(df)

    logging.info("Concatenate everything into a single DataFrame")
    dframe = pd.concat(pieces, ignore_index=True, sort=False)

    # func_div maps a Div code to a (Country, Tier) pair.
    dframe["Country"], dframe["Tier"] = zip(*dframe["Div"].map(func_div))

    dframe.Date = pd.to_datetime(dframe.Date, dayfirst=True)

    # DataFrame.info() writes to a buffer and returns None, so capture the
    # text rather than logging the return value.
    summary = io.StringIO()
    dframe[use_cols].info(buf=summary)
    logging.info("\n%s", summary.getvalue())

    utilities.save_master(
        dframe[use_cols], "results", directory=directoryOut
    )
Exemple #8
0
def clean_data(source_name, directory=config.MASTER_DIR):
    """Clean raw player data and save processed version.

    INPUT:
        source_name: String containing name of the data source
            ("tmk_cnt" for contract data, "tmk_psm" for performance data)
        directory: Directory to save output to

    OUTPUT:
        df: Dataframe containing the cleaned data

    RAISES:
        ValueError: If source_name is not one of the known sources
    """
    logging.info("Loading {0} data".format(source_name))

    if source_name == "tmk_cnt":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Date of birth",
            "Nationality",
            "Height",
            "Foot",
            "Joined",
            "Signed from",
            "Contract expires",
            "Market value",
        ]
        drop_cols = ["Nationality", "Signed from", "Competition"]
        notna_cols = ["Market value"]

    elif source_name == "tmk_psm":
        source_header = [
            "Shirt number",
            "Position",
            "Name",
            "Age",
            "Nationality",
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        drop_cols = ["Nationality"]
        notna_cols = ["In squad"]

    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("Unknown source_name: {0!r}".format(source_name))

    df = utilities.folder_loader(source_name[:3],
                                 source_name,
                                 "comp_season",
                                 source_header=source_header)

    ## Name and Position are mis-aligned in the source files

    # Assign the result back: an in-place fillna on the selected column does
    # not reliably write through to df under pandas copy-on-write.
    df["Name"] = df["Name"].bfill()

    # The position text sits in the Name column one row below (two rows
    # below when the next row just repeats the name).
    df["Position"] = df.Name.shift(-1)
    df.loc[df.Position == df.Name, "Position"] = df.Name.shift(-2)

    df.drop(axis=1, columns=drop_cols, inplace=True)

    df.dropna(subset=notna_cols, inplace=True)

    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    # Placeholder texts that stand for "no value".
    df = df.replace(
        [
            "-",
            "Was not used during this season",
            "Not in squad during this season",
            "Not used during this season",
        ],
        np.nan,
    )

    df["Shirt number"] = pd.to_numeric(df["Shirt number"], downcast="integer")

    # Later assignments win, e.g. a position matching both DEF and MID
    # ends up in group "M".
    position = df.Position.str.upper()
    df["Position group"] = None
    df.loc[position.str.contains("KEEPER") | position.str.contains("GOAL"),
           "Position group"] = "G"
    df.loc[position.str.contains("BACK") | position.str.contains("DEF"),
           "Position group"] = "D"
    df.loc[position.str.contains("MID")
           | position.str.contains("MIT")
           | position.str.contains("WING"),
           "Position group"] = "M"
    df.loc[position.str.contains("STRIKER") | position.str.contains("FORW"),
           "Position group"] = "F"

    if source_name == "tmk_cnt":
        # "Date of birth" looks like "<date> (NN)"; the trailing two digits
        # are taken as the age.
        df["Age"] = (df["Date of birth"].str.extract(
            r".*([0-9]{2})", expand=False).astype("int"))

        df["Date of birth"] = pd.to_datetime(
            df["Date of birth"].str.extract(r"(.*) \([0-9]{2}\)",
                                            expand=False),
            format="%b %d, %Y",
        )

        df["Joined"] = pd.to_datetime(df.Joined, format="%b %d, %Y")

        df["Contract expires"] = pd.to_datetime(df["Contract expires"],
                                                format="%d.%m.%Y")

        df["Height"] = (df["Height"].str.strip().str.replace(
            " ", "").str.replace(",", "").str.replace("m", "").replace({
                "-": np.nan,
                "": np.nan
            }).astype(float))

        # Players that have a height on some rows but not on others: fill
        # the gaps from later rows once sorted by Name and Season.
        # NOTE(review): the backfill is not grouped by Name, so a gap on a
        # player's last row can pick up the next player's value - confirm
        # whether that is intended. Assumes folder_loader adds "Season".
        partial_height = (
            df.Name.isin(df[df.Height.notna()].Name.values)
            & df.Name.isin(df[df.Height.isna()].Name.values))
        df.loc[partial_height, "Height"] = (
            df.loc[partial_height].sort_values(
                by=["Name", "Season"]).Height.bfill())

        # Same treatment for the preferred foot.
        partial_foot = (
            df.Name.isin(df[df.Foot.notna()].Name.values)
            & df.Name.isin(df[df.Foot.isna()].Name.values))
        df.loc[partial_foot, "Foot"] = (
            df.loc[partial_foot].sort_values(
                by=["Name", "Season"]).Foot.bfill())

        # Numeric part times the unit suffix (k / Th. = 10**3, m = 10**6),
        # expressed in millions.
        amount = (df["Market value"].str.strip().replace({
            "-": np.nan
        }).replace(r"[£kmTh\.]", "", regex=True).astype(float))
        multiplier = (df["Market value"].str.extract(
            r"[\d\.]+([kmTh\.]+)", expand=False).fillna(1).replace(
                ["k", "Th.", "m"], [10**3, 10**3, 10**6]).astype(int))
        df["Market value"] = amount * multiplier / 10**6

    elif source_name == "tmk_psm":
        # Decimal comma -> decimal point.
        df["PPG"] = df["PPG"].str.strip().replace(r"[,]", ".",
                                                  regex=True).astype(float)
        # Drop thousands dots and the trailing minute mark.
        df["Minutes played"] = (df["Minutes played"].str.strip().replace(
            r"[.\']", "", regex=True).astype(float))

        stat_cols = [
            "In squad",
            "Games started",
            "Goals",
            "Assists",
            "Yellow cards",
            "Second yellow cards",
            "Red cards",
            "Substitutions on",
            "Substitutions off",
            "PPG",
            "Minutes played",
        ]
        # Missing stats count as zero; store all of them as floats.
        df[stat_cols] = df[stat_cols].fillna(0).astype(float)
    logging.debug(df.describe(include="all"))

    logging.info("Saving processed data to {0}".format(directory))
    utilities.save_master(df, get_outfile(source_name), directory=directory)

    return df