Ejemplo n.º 1
0
def cleanse_metrica_event_data(game, reverse):
    """
    Function to clean the Metrica event data. Notice that quite a lot of the code is needed to make the Metrica data
    compatible with the Wyscout format.

    The raw events are read from the "raw_data_metrica" folder, cleaned, enriched with the Wyscout-style columns and
    written to the "metrica_data" folder.

    :param game: (int) GameId
    :param reverse: (bool) If True, the away team is playing left to right in the first half
    :return: None
    """

    logging.info(f"Cleansing metrica event data for game {game}")

    df_events = io.read_data("event_data",
                             league=str(game),
                             sep=",",
                             data_folder="raw_data_metrica")

    # rename columns to camelStyle
    df_events.columns = [
        "team",
        "type",
        "subtype",
        "period",
        "startFrame",
        "startTime",
        "endFrame",
        "endTime",
        "from",
        "to",
        "xPosStart",
        "yPosStart",
        "xPosEnd",
        "yPosEnd",
    ]

    # make sure that the position is in meters and events are always from the perspective of the
    # team having the event
    df_events = set_positions(df_events, reverse)

    # make sure that the end frame is always at least the start frame
    df_events["endFrame"] = df_events[["startFrame", "endFrame"]].max(axis=1)
    df_events["endTime"] = df_events[["startTime", "endTime"]].max(axis=1)

    # assign the filled column back: an in-place fillna on a column selection works on a temporary
    # object and is not guaranteed to update the data frame (it does not under pandas Copy-on-Write)
    df_events["subtype"] = df_events["subtype"].fillna("  ")

    # identify goals and own goals; both are marked by "-GOAL" in the subtype and only differ in the event type
    has_goal_tag = df_events["subtype"].str.contains("-GOAL", regex=False)
    df_events["goal"] = 1 * ((df_events["type"] == "SHOT") & has_goal_tag)
    df_events["ownGoal"] = 1 * (
        (df_events["type"] == "BALL OUT") & has_goal_tag)

    df_events = compute_wyscout_columns(df_events, game)

    df_events = df_events.sort_values(["startFrame", "endFrame"])
    io.write_data(df_events,
                  "event_data",
                  league=str(game),
                  data_folder="metrica_data")
Ejemplo n.º 2
0
def cleanse_tracking_data(game):
    """
    Clean the Metrica tracking data of one game and store the result in the "metrica_data" folder.
    Besides the obvious steps, the ball data is also cleansed so that the ball moves more smoothly
    and has fewer hiccups.
    :param game: (int) GameId
    :return: None
    """
    logging.info(f"Cleansing metrica tracking data for game {game}")

    league = str(game)
    raw_folder = "raw_data_metrica"

    # raw tracking data of both teams
    home_raw = io.read_data("home_team_tracking",
                            league=league,
                            data_folder=raw_folder)
    away_raw = io.read_data("away_team_tracking",
                            league=league,
                            data_folder=raw_folder)

    # the ball data is extracted from the home team file and cleansed separately
    ball = cleanse_ball_tracking_data(extract_ball_data(home_raw), game)

    # long format is easier to work with than the raw wide format
    home_long = convert_to_long_data_frame(home_raw, "Home")
    away_long = convert_to_long_data_frame(away_raw, "Away")

    # positions in meters; keep a copy of the converted positions in separate columns
    players = convert_positions_to_meters(pd.concat([home_long, away_long]))
    for axis in ("x", "y"):
        players[f"{axis}PosMetrica"] = players[f"{axis}Pos"].copy()

    # stack players and ball, ordered by frame and player
    combined = pd.concat([players, ball]).sort_values(["frame", "playerId"])

    # replace the per-row out flags by the ball's flags of the same frame
    combined = combined.drop(columns=["outOfBounds", "ballOut"])
    ball_flags = ball[["frame", "outOfBounds", "ballOut"]]
    combined = combined.merge(ball_flags, on="frame", how="left")

    # store whether the ball is in play rather than whether it is out
    combined = combined.rename(columns={"ballOut": "ballInPlay"})
    combined["ballInPlay"] = 1 - combined["ballInPlay"]

    combined = combined.drop(columns=["outOfBounds", "outPeriod"])

    # save to parquet file
    io.write_data(combined,
                  "tracking_data",
                  league=league,
                  data_folder="metrica_data")
Ejemplo n.º 3
0
def build_formation_data(game):
    """
    Build the formation data, i.e. which player played from which minute to which minute. The values
    are derived from the first and last time each player appears in the cleaned tracking data.
    :param game: (int) GameId
    :return: None
    """

    logging.info(f"Building formation data for game {game}")

    tracking = io.read_tracking_data(game=game, clean=True)

    # first and last time each player was seen in the tracking data
    seen = tracking.groupby("playerId").agg(
        minTime=("time", "min"),
        maxTime=("time", "max"),
        team=("team", "min"),
    )
    seen = seen.reset_index()

    # convert seconds to minutes, limited to the range [0, 90]
    seen["minuteStart"] = seen["minTime"].div(60).clip(lower=0).astype(int)
    seen["minuteEnd"] = seen["maxTime"].div(60).clip(upper=90).astype(int)

    # rows with playerId -1 are excluded from the formation
    formation = seen.loc[seen["playerId"] != -1].copy()

    # fill other relevant columns
    formation["matchId"] = game
    formation["teamId"] = np.where(formation["team"] == "Home", 1, 2)

    started_at_kickoff = formation["minuteStart"] == 0
    formation["lineup"] = 1 * started_at_kickoff
    formation["substituteIn"] = 1 * (formation["minuteStart"] > 0)

    # a player whose last appearance is before the latest appearance of any player left early
    last_seen = formation["maxTime"].max()
    formation["substituteOut"] = 1 * (formation["maxTime"] < last_seen)
    formation["minutesPlayed"] = formation["minuteEnd"].sub(
        formation["minuteStart"])

    # same columns and order as the wyscout formation data
    output_columns = [
        "playerId",
        "lineup",
        "matchId",
        "teamId",
        "substituteIn",
        "substituteOut",
        "minuteStart",
        "minuteEnd",
        "minutesPlayed",
    ]
    formation = formation.loc[:, output_columns].copy()

    # save to parquet file
    io.write_data(formation,
                  "formation_data",
                  league=str(game),
                  data_folder="metrica_data")
Ejemplo n.º 4
0
def cleanse_wyscout_match_data(country):
    """
    Cleanse the wyscout match data and save it, together with the formation data of the same
    matches, in the data folder.
    :param country: (str) Country for which the match data should be cleansed
    :return: None
    """

    logging.info(f"Cleansing wyscout match data for {country}")

    league = country.lower()

    # raw matches as read from the JSON file
    matches = io.read_data("match_data",
                           league=country,
                           data_folder="raw_data_wyscout")

    # one row per match and team: stack the view of both teams
    team_views = [get_team_view(matches, team_idx) for team_idx in (0, 1)]
    df_matches = pd.concat(team_views, axis=0)

    # 3 points for a win, 1 for a draw, 0 otherwise
    won = df_matches["score"] > df_matches["oppScore"]
    drew = df_matches["score"] == df_matches["oppScore"]
    df_matches["points"] = np.where(won, 3, np.where(drew, 1, 0))

    df_matches["dateutc"] = pd.to_datetime(df_matches["dateutc"])

    df_matches["scoreDiff"] = df_matches["score"].sub(df_matches["oppScore"])

    df_matches = df_matches.sort_values(["matchId", "side"],
                                        ascending=[True, False])
    io.write_data(df_matches, "match_data", league=league)

    io.write_data(get_all_formations(matches), "formation_data", league=league)
Ejemplo n.º 5
0
def cleanse_wyscout_event_data(country):
    """
    Function to cleanse the wyscout event data and save it in the data folder.

    The raw events are flattened into a data frame, the positions and tags are split into separate
    columns, positions are converted into meters, player and home/away team information is attached
    and the columns are renamed to camelCase before the table is written.

    Requires that the cleansed player data and the cleansed match data of *country* have already
    been written, since both are read here.

    :param country: (str) Country for which the event data should be cleansed
    :return: None
    """

    logging.info(f"Cleansing wyscout event data for {country}")

    # read event data
    #################
    events = io.read_data("event_data",
                          league=country,
                          data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_events = pd.json_normalize(events)

    # save positions in different columns; the end position is not always available
    df_events["posBeforeX"] = df_events["positions"].map(lambda x: x[0]["x"])
    df_events["posBeforeY"] = df_events["positions"].map(lambda x: x[0]["y"])
    df_events["posAfterX"] = df_events["positions"].map(
        lambda x: x[1]["x"] if len(x) > 1 else np.nan)
    df_events["posAfterY"] = df_events["positions"].map(
        lambda x: x[1]["y"] if len(x) > 1 else np.nan)

    # save tags in different columns
    ################
    # read the tags that contain a description for each event code
    tags = io.read_data("tags", sep=";", data_folder="raw_data_wyscout")
    dict_tags = dict(zip(tags["Tag"], tags["Description"]))
    df_events["tags"] = df_events["tags"].map(
        lambda x: [tag["id"] for tag in x])
    # one 0/1 indicator column per tag, named after the tag description
    for key, description in dict_tags.items():
        df_events[description] = 1 * df_events["tags"].map(
            lambda x, key=key: key in x)

    # drop columns that are not needed
    df_events = df_events.drop(["positions", "tags"], axis=1)

    num_cols = ["subEventId"]
    for col in num_cols:
        df_events[col] = pd.to_numeric(df_events[col], errors="coerce")

    # make sure that the event "Offside" also leads to a subevent "Offside"
    df_events["subEventName"] = np.where(df_events["eventName"] == "Offside",
                                         "Offside", df_events["subEventName"])

    # make sure the goal kick is always taken at the own goal
    is_goal_kick = df_events["subEventName"] == "Goal kick"
    df_events["posBeforeX"] = np.where(is_goal_kick, 5,
                                       df_events["posBeforeX"])
    df_events["posBeforeY"] = np.where(is_goal_kick, 50,
                                       df_events["posBeforeY"])

    # make sure the save attempt always happens at the own goal (currently at (0,0) or (100,100))
    is_save = df_events["subEventName"].isin(["Save attempt", "Reflexes"])
    df_events["posBeforeX"] = np.where(is_save, 0, df_events["posBeforeX"])
    df_events["posBeforeY"] = np.where(is_save, 50, df_events["posBeforeY"])

    # change position of the event into meters
    ##############

    # read the field length and the field width
    with open(io._get_config_file(), "r", encoding="utf-8") as f:
        config = ruamel.yaml.YAML().load(f)

    field_length = config["general"]["field_length"]
    field_width = config["general"]["field_width"]

    # compute the position in meters
    df_events = add_position_in_meters(
        df_events,
        cols_length=["posBeforeX", "posAfterX"],
        cols_width=["posBeforeY", "posAfterY"],
        field_length=field_length,
        field_width=field_width,
    )

    # Prepare the output table
    ##########################

    # drop columns that are not needed any more
    pos_cols = [
        col for col in df_events.columns if col.startswith("Position:")
    ]
    cols_drop = [
        "eventId",
        "subEventId",
        "posBeforeX",
        "posAfterX",
        "posBeforeY",
        "posAfterY",
        "Free space right",
        "Free space left",
        "Missed ball",
        "Take on left",
        "Take on right",
        "Sliding tackle",
        "Through",
        "Fairplay",
        "Lost",
        "Neutral",
        "Won",
        "Red card",
        "Yellow card",
        "Second yellow card",
        "Anticipated",
        "Anticipation",
        "High",
        "Low",
        "Interception",
        "Clearance",
        "Opportunity",
        "Feint",
        "Blocked",
    ] + pos_cols
    # not every listed column necessarily exists, so only drop the ones that are present
    cols_drop = [col for col in cols_drop if col in df_events.columns]
    df_events = df_events.drop(cols_drop, axis=1)

    # add some player information
    ########################
    df_players = io.read_data("player_data")
    df_players = df_players[[
        "playerId", "playerName", "playerStrongFoot", "playerPosition"
    ]].copy()
    df_events = pd.merge(df_events, df_players, on="playerId", how="left")

    # add home and away team
    ########################
    df_matches = io.read_data("match_data", league=country.lower())
    for side in ["home", "away"]:
        # rename returns a new frame, so the filtered slice of df_matches is never modified in place
        df_side = df_matches.loc[df_matches["side"] == side,
                                 ["matchId", "teamId"]].rename(
                                     columns={"teamId": f"{side}TeamId"})
        df_events = pd.merge(df_events, df_side, on="matchId", how="left")

    # compute the team that is currently in possession of the ball
    df_events["teamPossession"] = df_events.apply(compute_possession, axis=1)

    # change column names to camelCase: first lower-case the first character ...
    df_events.columns = [
        col[:1].lower() + col[1:] for col in df_events.columns
    ]

    # ... then remove the blanks of the multi-word tag columns
    col_changes = {
        "own goal": "ownGoal",
        "key pass": "keyPass",
        "counter attack": "counterAttack",
        "left foot": "leftFoot",
        "right foot": "rightFoot",
        "dangerous ball lost": "dangerousBallLost",
        "not accurate": "notAccurate",
    }

    df_events = df_events.rename(columns=col_changes)

    # bring columns into correct order
    col_order = [
        "id",
        "matchId",
        "matchPeriod",
        "eventSec",
        "eventName",
        "subEventName",
        "teamId",
        "posBeforeXMeters",
        "posBeforeYMeters",
        "posAfterXMeters",
        "posAfterYMeters",
        "playerId",
        "playerName",
        "playerPosition",
        "playerStrongFoot",
        "teamPossession",
        "homeTeamId",
        "awayTeamId",
        "accurate",
        "notAccurate",
    ]

    # all remaining columns are appended in their current order
    other_cols = [col for col in df_events.columns if col not in col_order]
    col_order = col_order + other_cols
    df_events = df_events[col_order].copy()

    io.write_data(df_events, "event_data", league=country.lower())
Ejemplo n.º 6
0
def cleanse_wyscout_player_data():
    """
    Function to cleanse the wyscout player data and save the data in the data folder.

    Only the columns whose name starts with "player" after renaming (playerId, playerName,
    playerStrongFoot, playerPosition) are kept in the output.

    :return: None
    """

    logging.info("Cleansing wyscout player data")

    # read the JSON file
    players = io.read_data("player_data", data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_players = pd.json_normalize(players)

    # make sure the encoding is done correctly. Decode value by value so that a single non-text
    # entry (e.g. a missing value) does not prevent the rest of the column from being decoded
    for col in df_players.select_dtypes("object").columns:
        df_players[col] = df_players[col].map(
            lambda x: codecs.unicode_escape_decode(x)[0]
            if isinstance(x, (str, bytes)) else x)

    # rename to playerId so that it can be easily merged with other tables
    df_players = df_players.rename(columns={"wyId": "playerId"})

    df_players["birthDate"] = pd.to_datetime(df_players["birthDate"])

    # non-positive weights and heights are treated as missing
    df_players["weight"] = np.where(df_players["weight"] > 0,
                                    df_players["weight"], np.nan)
    df_players["height"] = np.where(df_players["height"] > 0,
                                    df_players["height"], np.nan)
    df_players["foot"] = np.where(df_players["foot"].isin(["null", ""]),
                                  "unknown", df_players["foot"])

    # ids that cannot be parsed as a number become NaN
    id_cols = ["currentTeamId", "currentNationalTeamId"]
    for col in id_cols:
        df_players[col] = pd.to_numeric(df_players[col], errors="coerce")

    # drop duplicates columns that are not needed
    drop_cols = [
        "birthArea.alpha3code",
        "birthArea.alpha2code",
        "role.code3",
        "role.name",
        "passportArea.alpha3code",
        "passportArea.alpha2code",
        "middleName",
        "birthArea.id",
        "passportArea.id",
    ]
    df_players = df_players.drop(drop_cols, axis=1)

    df_players = df_players.rename(
        columns={
            "role.code2": "playerPosition",
            "foot": "playerStrongFoot",
            "shortName": "playerName",
        })

    cols_keep = [col for col in df_players.columns if col.startswith("player")]
    df_players = df_players[cols_keep].copy()

    io.write_data(df_players, data_type="player_data")
Ejemplo n.º 7
0
def cleanse_wyscout_team_data(country):
    """
    Function to cleanse the wyscout team data and save the data in the data folder.

    Requires that the cleansed match data of *country* has already been written, since the league
    table attached to the teams is computed from it.

    :param country: (str) Country for which the team data should be cleansed
    :return: None
    :raises KeyError: If *country* is not one of the supported countries
    """

    valid_countries = ["Germany", "England", "Spain", "Italy", "France"]
    if country not in valid_countries:
        raise KeyError(
            f"Country '{country}' not supported. Choose one out of: {', '.join(valid_countries)}"
        )

    logging.info(f"Cleansing wyscout team data for {country}")

    # read the JSON file
    teams = io.read_data("team_data", data_folder="raw_data_wyscout")

    # normalize to get a pandas data frame
    df_teams = pd.json_normalize(teams)

    # make sure the encoding is done correctly. Decode value by value so that a single non-text
    # entry (e.g. a missing value) does not prevent the rest of the column from being decoded
    for col in df_teams.select_dtypes("object").columns:
        df_teams[col] = df_teams[col].map(
            lambda x: codecs.unicode_escape_decode(x)[0]
            if isinstance(x, (str, bytes)) else x)

    df_teams = df_teams.rename(columns={
        "wyId": "teamId",
        "name": "teamName",
        "area.name": "country"
    })

    # only keep club teams from the specified country
    is_club_of_country = (df_teams["type"] == "club") & (df_teams["country"]
                                                         == country)
    df_teams = df_teams.loc[is_club_of_country, ["teamId", "teamName"]].copy()

    # attach the table to the teams to get a good feeling on how good each team is
    df_matches = io.read_data("match_data", league=country.lower())
    df_table = gen_helper.get_table(df_matches)
    df_table = df_table.drop("week", axis=1)
    df_teams = pd.merge(df_teams, df_table, on="teamId", how="left")

    df_teams = df_teams.sort_values("position")
    df_teams = df_teams[[
        "position",
        "teamId",
        "teamName",
        "matches",
        "goals",
        "concededGoals",
        "goalsDiff",
        "points",
    ]].copy()

    io.write_data(df_teams, data_type="team_data", league=country.lower())