Ejemplo n.º 1
def cleanse_metrica_event_data(game, reverse):
    Function to clean the Metrica event data. Notice that quite a lot of the code is needed to make the Metrica data
    compatible with the Wyscout format
    :param game: (int) GameId
    :param reverse: (bool) If True, the away team is playing left to right in the first half
    :return: None

    logging.info(f"Cleansing metrica event data for game {game}")

    df_events = io.read_data("event_data",

    # rename columns to camelStyle
    df_events.columns = [

    # make sure that the position is in meters and events are always from the perspective of the
    # team having the event
    df_events = set_positions(df_events, reverse)

    # make sure that the end frame is always at least the start frame
    df_events["endFrame"] = df_events[["startFrame", "endFrame"]].max(axis=1)
    df_events["endTime"] = df_events[["startTime", "endTime"]].max(axis=1)

    df_events["subtype"].fillna("  ", inplace=True)

    # identify goals and own goals
    df_events["goal"] = 1 * (df_events.apply(
        lambda row: row["type"] == "SHOT" and "-GOAL" in row["subtype"],
    df_events["ownGoal"] = 1 * (df_events.apply(
        lambda row: row["type"] == "BALL OUT" and "-GOAL" in row["subtype"],

    df_events = compute_wyscout_columns(df_events, game)

    df_events.sort_values(["startFrame", "endFrame"], inplace=True)
Ejemplo n.º 2
def cleanse_tracking_data(game):
    Function to clean the Metrica tracking data. Despite the obvious steps, the ball data is also cleaned in such a
    way that it floats more smoothly and has fewer hiccups.
    :param game: (int) GameId
    :return: None
    logging.info(f"Cleansing metrica tracking data for game {game}")

    # read the raw data of the home and away team
    df_home = io.read_data("home_team_tracking",
    df_away = io.read_data("away_team_tracking",

    # extract the ball data and clean it so that it has fewer hiccups
    df_ball = extract_ball_data(df_home)
    df_ball = cleanse_ball_tracking_data(df_ball, game)

    # convert the data frames into a long format to be able to work with them more easily
    df_home = convert_to_long_data_frame(df_home, "Home")
    df_away = convert_to_long_data_frame(df_away, "Away")

    # convert position into meters
    df_players = pd.concat([df_home, df_away])
    df_players = convert_positions_to_meters(df_players)
    df_players["xPosMetrica"] = df_players["xPos"].copy()
    df_players["yPosMetrica"] = df_players["yPos"].copy()

    # combine player data with ball data
    df_all = pd.concat([df_players, df_ball])
    df_all.sort_values(["frame", "playerId"], inplace=True)

    df_all.drop(["outOfBounds", "ballOut"], axis=1, inplace=True)
    df_all = pd.merge(df_all,
                      df_ball[["frame", "outOfBounds", "ballOut"]],

    # consider whether the ball is in play rather than whether it is out
    df_all.rename(columns={"ballOut": "ballInPlay"}, inplace=True)
    df_all["ballInPlay"] = 1 - df_all["ballInPlay"]

    df_all.drop(["outOfBounds", "outPeriod"], axis=1, inplace=True)

    # save to parquet file
Ejemplo n.º 3
def build_formation_data(game):
    Function builds the formation data, i.e. which player played for how long, based on the event data
    :param game: (int) GameId
    :return: None

    logging.info(f"Building formation data for game {game}")

    df_track = io.read_tracking_data(game=game, clean=True)

    # get the first and last time the player was seen in the tracking data
    df = (df_track.groupby("playerId").agg(minTime=("time", "min"),
                                           maxTime=("time", "max"),
                                           team=("team", "min")).reset_index())

    # convert seconds to minutes
    df["minuteStart"] = (df["minTime"] / 60).clip(lower=0).astype(int)
    df["minuteEnd"] = (df["maxTime"] / 60).clip(upper=90).astype(int)

    # fill other relevant columns
    df = df[df["playerId"] != -1].copy()
    df["matchId"] = game
    df["teamId"] = np.where(df["team"] == "Home", 1, 2)
    df["lineup"] = 1 * (df["minuteStart"] == 0)
    df["substituteIn"] = 1 * (df["minuteStart"] > 0)
    df["substituteOut"] = 1 * (df["maxTime"] < df["maxTime"].max())
    df["minutesPlayed"] = df["minuteEnd"] - df["minuteStart"]

    # make sure it is the same format as the wyscout data
    cols = [

    df = df[cols].copy()

    # save to parquet file
Ejemplo n.º 4
def cleanse_wyscout_match_data(country):
    Function to cleanse the wyscout match data and save it in the data folder
    :param country: (str) Country for which the event data should be cleansed
    :return: None

    logging.info(f"Cleansing wyscout match data for {country}")

    # read the JSON file with matches
    matches = io.read_data("match_data",

    # save relevant information in data frame
    df_matches = pd.concat(
        [get_team_view(matches, 0),
         get_team_view(matches, 1)], axis=0)

    # attach the points per team
    df_matches["points"] = np.where(
        df_matches["score"] > df_matches["oppScore"],
        np.where(df_matches["score"] == df_matches["oppScore"], 1, 0),

    df_matches["dateutc"] = pd.to_datetime(df_matches["dateutc"])

    df_matches["scoreDiff"] = df_matches["score"] - df_matches["oppScore"]

    df_matches.sort_values(["matchId", "side"],
                           ascending=[True, False],
    io.write_data(df_matches, "match_data", league=country.lower())

    df_formations = get_all_formations(matches)
    io.write_data(df_formations, "formation_data", league=country.lower())
Ejemplo n.º 5
def cleanse_wyscout_event_data(country):
    Function to cleanse the wyscout event data and save it in the data folder
    :param country: (str) Country for which the event data should be cleansed
    :return: None

    logging.info(f"Cleansing wyscout event data for {country}")

    # read event data
    events = io.read_data("event_data",

    # normalize to get a pandas data frame
    df_events = pd.json_normalize(events)

    # save positions in different columns
    df_events["posBeforeX"] = df_events["positions"].map(lambda x: x[0]["x"])
    df_events["posBeforeY"] = df_events["positions"].map(lambda x: x[0]["y"])
    df_events["posAfterX"] = df_events["positions"].map(
        lambda x: x[1]["x"] if len(x) > 1 else np.nan)
    df_events["posAfterY"] = df_events["positions"].map(
        lambda x: x[1]["y"] if len(x) > 1 else np.nan)

    # save tags in different columns
    # read the tags that contain a description for each event code
    tags = io.read_data("tags", sep=";", data_folder="raw_data_wyscout")
    dict_tags = {row["Tag"]: row["Description"] for _, row in tags.iterrows()}
    df_events["tags"] = df_events["tags"].map(
        lambda x: [tag["id"] for tag in x])
    for key in dict_tags:
            dict_tags[key]] = 1 * df_events["tags"].map(lambda x: key in x)

    # drop columns that are not needed
    df_events.drop(["positions", "tags"], axis=1, inplace=True)

    num_cols = ["subEventId"]
    for col in num_cols:
        df_events[col] = pd.to_numeric(df_events[col], errors="coerce")

    # make sure that the event "Offside" also leads to a subevent "Offside"
    df_events["subEventName"] = np.where(df_events["eventName"] == "Offside",
                                         "Offside", df_events["subEventName"])

    # make sure the goal kick is always taken at the own goal
    df_events["posBeforeX"] = np.where(
        df_events["subEventName"] == "Goal kick", 5, df_events["posBeforeX"])
    df_events["posBeforeY"] = np.where(
        df_events["subEventName"] == "Goal kick", 50, df_events["posBeforeY"])

    # make sure the save attempt always happens at the own goal (currently at (0,0) or (100,100))
    df_events["posBeforeX"] = np.where(
        df_events["subEventName"].isin(["Save attempt", "Reflexes"]),
    df_events["posBeforeY"] = np.where(
        df_events["subEventName"].isin(["Save attempt", "Reflexes"]),

    # change position of the event into meters

    # read the field length and the field width
    with open(io._get_config_file(), "r", encoding="utf-8") as f:
        config = ruamel.yaml.YAML().load(f)

    field_length = config["general"]["field_length"]
    field_width = config["general"]["field_width"]

    # compute the position in meters
    df_events = add_position_in_meters(
        cols_length=["posBeforeX", "posAfterX"],
        cols_width=["posBeforeY", "posAfterY"],

    # Prepare the output table

    # drop columns that are not needed any more
    pos_cols = [
        col for col in df_events.columns if col.startswith("Position:")
    cols_drop = [
        "Free space right",
        "Free space left",
        "Missed ball",
        "Take on left",
        "Take on right",
        "Sliding tackle",
        "Red card",
        "Yellow card",
        "Second yellow card",
    ] + pos_cols
    cols_drop = [col for col in cols_drop if col in df_events.columns]
    df_events.drop(cols_drop, axis=1, inplace=True)

    # add some player information
    df_players = io.read_data("player_data")
    df_players = df_players[[
        "playerId", "playerName", "playerStrongFoot", "playerPosition"
    df_events = pd.merge(df_events, df_players, on="playerId", how="left")

    # add home and away team
    df_matches = io.read_data("match_data", league=country.lower())
    for side in ["home", "away"]:
        df_side = df_matches[df_matches["side"] == side][["matchId", "teamId"]]
        df_side.rename(columns={"teamId": f"{side}TeamId"}, inplace=True)
        df_events = pd.merge(df_events, df_side, on="matchId", how="left")

    # compute the team that is currently in possession of the ball
    df_events["teamPossession"] = df_events.apply(
        lambda row: compute_possession(row), axis=1)

    # change column names to camelCase
    lowercase_cols = [col[0].lower() + col[1:] for col in df_events.columns]
    df_events.columns = lowercase_cols

    col_changes = {
        "own goal": "ownGoal",
        "key pass": "******",
        "counter attack": "counterAttack",
        "left foot": "leftFoot",
        "right foot": "rightFoot",
        "dangerous ball lost": "dangerousBallLost",
        "not accurate": "notAccurate",

    df_events.rename(columns=col_changes, inplace=True)

    # bring columns into correct order
    col_order = [

    other_cols = [col for col in df_events.columns if col not in col_order]
    col_order = col_order + other_cols
    df_events = df_events[col_order].copy()

    io.write_data(df_events, "event_data", league=country.lower())
Ejemplo n.º 6
def cleanse_wyscout_player_data():
    Function to cleanse the wyscout player data and save the data in the data folder
    :return: None

    logging.info("Cleansing wyscout player data")

    # read the JSON file
    players = io.read_data("player_data", data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_players = pd.json_normalize(players)

    # make sure the encoding is done correctly
    for col in df_players.select_dtypes("object").columns:
            df_players[col] = df_players[col].map(
                lambda x: codecs.unicode_escape_decode(x)[0])
        except TypeError:

    # rename to playerId so that it can be easily merged with other tables
    df_players.rename(columns={"wyId": "playerId"}, inplace=True)

    df_players["birthDate"] = pd.to_datetime(df_players["birthDate"])
    df_players["weight"] = np.where(df_players["weight"] > 0,
                                    df_players["weight"], np.nan)
    df_players["height"] = np.where(df_players["height"] > 0,
                                    df_players["height"], np.nan)
    df_players["foot"] = np.where(df_players["foot"].isin(["null", ""]),
                                  "unknown", df_players["foot"])

    id_cols = ["currentTeamId", "currentNationalTeamId"]
    for col in id_cols:
        df_players[col] = pd.to_numeric(df_players[col], errors="coerce")

    # drop duplicate columns that are not needed
    drop_cols = [
    df_players.drop(drop_cols, axis=1, inplace=True)

            "role.code2": "playerPosition",
            "foot": "playerStrongFoot",
            "shortName": "playerName",

    cols_keep = [col for col in df_players.columns if col.startswith("player")]
    df_players = df_players[cols_keep].copy()

    io.write_data(df_players, data_type="player_data")
Ejemplo n.º 7
def cleanse_wyscout_team_data(country):
    Function to cleanse the wyscout team data and save the data in the data folder
    :param country: (str) Country for which the team data should be cleansed
    :return: None

    valid_countries = ["Germany", "England", "Spain", "Italy", "France"]
    if country not in valid_countries:
        raise KeyError(
            f"Country '{country}' not supported. Choose one out of: {', '.join(valid_countries)}"

    logging.info(f"Cleansing wyscout team data for {country}")

    # read the JSON file
    teams = io.read_data("team_data", data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_teams = pd.json_normalize(teams)

    # make sure the encoding is done correctly
    for col in df_teams.select_dtypes("object").columns:
            df_teams[col] = df_teams[col].map(
                lambda x: codecs.unicode_escape_decode(x)[0])
        except TypeError:

            "wyId": "teamId",
            "name": "teamName",
            "area.name": "country"

    # only keep club teams from the specified country
    df_teams = df_teams[(df_teams["type"] == "club")
                        & (df_teams["country"] == country)].copy()
    df_teams = df_teams[["teamId", "teamName"]].copy()

    # attach the table to the teams to get a good feeling on how good each team is
    df_matches = io.read_data("match_data", league=country.lower())
    df_table = gen_helper.get_table(df_matches)
    df_table.drop("week", axis=1, inplace=True)
    df_teams = pd.merge(df_teams, df_table, on="teamId", how="left")

    df_teams.sort_values("position", inplace=True)
    df_teams = df_teams[[

    io.write_data(df_teams, data_type="team_data", league=country.lower())