def main():
    """
    This example shows the use of Statsbomb datasets, and how we can pass
    argument to the dataset loader.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        stream=sys.stdout, level=logging.INFO, format=log_format
    )
    logger = logging.getLogger(__name__)

    # Only the four event types this example cares about are loaded.
    load_options = {"event_types": ["pass", "take_on", "carry", "shot"]}
    dataset = datasets.load("statsbomb", load_options)

    with performance_logging("transform", logger=logger):
        # convert to TRACAB coordinates
        dataset = transform(
            dataset,
            to_orientation="FIXED_HOME_AWAY",
            to_pitch_dimensions=[(-5500, 5500), (-3300, 3300)],
        )

    with performance_logging("to pandas", logger=logger):
        dataframe = to_pandas(dataset)

    print(dataframe[:50].to_string())
def test_read(self):
    """Parsing the sample EPTS raw file should yield at least one record."""
    base_dir = os.path.dirname(__file__)
    meta_path = f"{base_dir}/files/epts_meta.xml"
    raw_path = f"{base_dir}/files/epts_raw.txt"

    with open(meta_path, "rb") as meta_data_fp:
        meta_data = load_meta_data(meta_data_fp)

    with open(raw_path, "rb") as raw_data:
        records = read_raw_data(raw_data, meta_data)
        with performance_logging("load"):
            # materializing the iterator must produce a non-empty list
            assert list(records)
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> EventDataset:
    """
    Deserialize Opta event data into a `EventDataset`.

    Parameters
    ----------
    inputs : dict
        input `f24_data` should point to a `Readable` object containing
        the 'xml' formatted event data. input `f7_data` should point
        to a `Readable` object containing the 'xml' formatted f7 data.
    options : dict
        Options for deserialization of the Opta file. Possible options are
        `event_types` (list of event types) to specify the event types that
        should be returned. Valid types: "shot", "pass", "carry", "take_on"
        and "generic". Generic is everything other than the first 4. Those
        events are barely parsed. This type of event can be used to do the
        parsing yourself. Every event has a 'raw_event' attribute which
        contains the original dictionary.

    Returns
    -------
    dataset : EventDataset

    Raises
    ------

    See Also
    --------

    Examples
    --------
    >>> serializer = OptaSerializer()
    >>> with open("123_f24.xml", "rb") as f24_data, \
    >>>     open("123_f7.xml", "rb") as f7_data:
    >>>
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'f24_data': f24_data,
    >>>             'f7_data': f7_data
    >>>         },
    >>>         options={
    >>>             'event_types': ["pass", "take_on", "carry", "shot"]
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    with performance_logging("load data", logger=logger):
        # f7 carries the line-ups, f24 carries the events
        f7_root = objectify.fromstring(inputs["f7_data"].read())
        f24_root = objectify.fromstring(inputs["f24_data"].read())

    # an empty list means "keep every event type"
    wanted_event_types = [
        EventType[event_type.upper()]
        for event_type in options.get("event_types", [])
    ]

    with performance_logging("parse data", logger=logger):
        matchdata_path = objectify.ObjectPath(
            "SoccerFeed.SoccerDocument.MatchData")
        team_elms = list(
            matchdata_path.find(f7_root).iterchildren("TeamData"))

        # Build per-side player_id -> shirt number maps from the f7 line-ups.
        away_player_map = {}
        home_player_map = {}
        home_team_id = None
        away_team_id = None
        for team_elm in team_elms:
            # Opta prefixes player refs with 'p' and team refs with 't'
            player_map = {
                player_elm.attrib["PlayerRef"].lstrip("p"):
                player_elm.attrib["ShirtNumber"]
                for player_elm in team_elm.find(
                    "PlayerLineUp").iterchildren("MatchPlayer")
            }
            team_id = team_elm.attrib["TeamRef"].lstrip("t")
            if team_elm.attrib["Side"] == "Home":
                home_player_map = player_map
                home_team_id = team_id
            elif team_elm.attrib["Side"] == "Away":
                away_player_map = player_map
                away_team_id = team_id
            else:
                raise Exception(f"Unknown side: {team_elm.attrib['Side']}")

        if not away_player_map or not home_player_map:
            raise Exception("LineUp incomplete")

        game_elm = f24_root.find("Game")
        # Timestamps are filled in below from the start/end period events.
        periods = [
            Period(
                id=1,
                start_timestamp=None,
                end_timestamp=None,
            ),
            Period(
                id=2,
                start_timestamp=None,
                end_timestamp=None,
            ),
        ]
        events = []
        for event_elm in game_elm.iterchildren("Event"):
            event_id = event_elm.attrib["id"]
            type_id = int(event_elm.attrib["type_id"])
            timestamp = _parse_f24_datetime(event_elm.attrib["timestamp"])
            period_id = int(event_elm.attrib["period_id"])
            for period in periods:
                if period.id == period_id:
                    break
            else:
                # for-else: no known period matched, skip this event
                logger.debug(
                    f"Skipping event {event_id} because period doesn't match {period_id}"
                )
                continue

            if type_id == EVENT_TYPE_START_PERIOD:
                logger.debug(
                    f"Set start of period {period.id} to {timestamp}")
                period.start_timestamp = timestamp
            elif type_id == EVENT_TYPE_END_PERIOD:
                logger.debug(
                    f"Set end of period {period.id} to {timestamp}")
                period.end_timestamp = timestamp
            else:
                if not period.start_timestamp:
                    # not started yet
                    continue

                if event_elm.attrib["team_id"] == home_team_id:
                    team = Team.HOME
                    current_team_map = home_player_map
                elif event_elm.attrib["team_id"] == away_team_id:
                    team = Team.AWAY
                    current_team_map = away_player_map
                else:
                    raise Exception(
                        f"Unknown team_id {event_elm.attrib['team_id']}")

                x = float(event_elm.attrib["x"])
                y = float(event_elm.attrib["y"])
                outcome = int(event_elm.attrib["outcome"])
                # <Q> children carry optional qualifier values keyed by id
                qualifiers = {
                    int(qualifier_elm.attrib["qualifier_id"]):
                    qualifier_elm.attrib.get("value")
                    for qualifier_elm in event_elm.iterchildren("Q")
                }
                player_jersey_no = None
                if "player_id" in event_elm.attrib:
                    player_jersey_no = current_team_map[
                        event_elm.attrib["player_id"]]

                generic_event_kwargs = dict(
                    # from DataRecord
                    period=period,
                    # timestamps are made relative to the period start
                    timestamp=timestamp - period.start_timestamp,
                    ball_owning_team=None,
                    ball_state=BallState.ALIVE,
                    # from Event
                    event_id=event_id,
                    team=team,
                    player_jersey_no=player_jersey_no,
                    position=Point(x=x, y=y),
                    raw_event=event_elm,
                )

                if type_id == EVENT_TYPE_PASS:
                    pass_event_kwargs = _parse_pass(qualifiers, outcome)
                    event = PassEvent(
                        **pass_event_kwargs,
                        **generic_event_kwargs,
                    )
                elif type_id == EVENT_TYPE_OFFSIDE_PASS:
                    pass_event_kwargs = _parse_offside_pass()
                    event = PassEvent(
                        **pass_event_kwargs,
                        **generic_event_kwargs,
                    )
                elif type_id == EVENT_TYPE_TAKE_ON:
                    take_on_event_kwargs = _parse_take_on(outcome)
                    event = TakeOnEvent(
                        **take_on_event_kwargs,
                        **generic_event_kwargs,
                    )
                elif type_id in (
                        EVENT_TYPE_SHOT_MISS,
                        EVENT_TYPE_SHOT_POST,
                        EVENT_TYPE_SHOT_SAVED,
                        EVENT_TYPE_SHOT_GOAL,
                ):
                    shot_event_kwargs = _parse_shot(
                        qualifiers,
                        type_id,
                        position=generic_event_kwargs["position"],
                    )
                    kwargs = {}
                    kwargs.update(generic_event_kwargs)
                    kwargs.update(shot_event_kwargs)
                    event = ShotEvent(**kwargs)
                else:
                    # any other type id is kept as a barely-parsed generic
                    event = GenericEvent(**generic_event_kwargs,
                                         result=None)

                if (not wanted_event_types
                        or event.event_type in wanted_event_types):
                    events.append(event)

    # Opta pitch coordinates are percentages of pitch length/width (0-100)
    return EventDataset(
        flags=DatasetFlag.BALL_OWNING_TEAM,
        orientation=Orientation.ACTION_EXECUTING_TEAM,
        pitch_dimensions=PitchDimensions(x_dim=Dimension(0, 100),
                                         y_dim=Dimension(0, 100)),
        periods=periods,
        records=events,
    )
def deserialize(
    self, inputs: Dict[str, Readable], options: Dict = None
) -> EventDataset:
    """
    Deserialize StatsBomb event data into a `EventDataset`.

    Parameters
    ----------
    inputs : dict
        input `event_data` should point to a `Readable` object containing
        the 'json' formatted event data. input `lineup_data` should point
        to a `Readable` object containing the 'json' formatted lineup data.
    options : dict
        Options for deserialization of the StatsBomb file. Possible options
        are `event_types` (list of event types) to specify the event types
        that should be returned. Valid types: "shot", "pass", "carry",
        "take_on" and "generic". Generic is everything other than the first
        4. Those events are barely parsed. This type of event can be used
        to do the parsing yourself. Every event has a 'raw_event' attribute
        which contains the original dictionary.

    Returns
    -------
    dataset : EventDataset

    Raises
    ------

    See Also
    --------

    Examples
    --------
    >>> serializer = StatsBombSerializer()
    >>> with open("events/12312312.json", "rb") as event_data, \
    >>>     open("lineups/123123123.json", "rb") as lineup_data:
    >>>
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'event_data': event_data,
    >>>             'lineup_data': lineup_data
    >>>         },
    >>>         options={
    >>>             'event_types': ["pass", "take_on", "carry", "shot"]
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    with performance_logging("load data", logger=logger):
        raw_events = json.load(inputs["event_data"])
        # the lineup file is a two-element list: [home, away]
        home_lineup, away_lineup = json.load(inputs["lineup_data"])
        (
            shot_fidelity_version,
            xy_fidelity_version,
        ) = _determine_xy_fidelity_versions(raw_events)
        logger.info(
            f"Determined Fidelity versions: shot v{shot_fidelity_version} / XY v{xy_fidelity_version}"
        )

    with performance_logging("parse data", logger=logger):
        home_team = Team(
            team_id=str(home_lineup["team_id"]),
            name=home_lineup["team_name"],
            ground=Ground.HOME,
        )
        home_team.players = [
            Player(
                player_id=str(player["player_id"]),
                team=home_team,
                name=player["player_name"],
                jersey_no=int(player["jersey_number"]),
            )
            for player in home_lineup["lineup"]
        ]

        away_team = Team(
            team_id=str(away_lineup["team_id"]),
            name=away_lineup["team_name"],
            ground=Ground.AWAY,
        )
        away_team.players = [
            Player(
                player_id=str(player["player_id"]),
                team=away_team,
                name=player["player_name"],
                jersey_no=int(player["jersey_number"]),
            )
            for player in away_lineup["lineup"]
        ]

        teams = [home_team, away_team]

        # an empty list means "keep every event type"
        wanted_event_types = [
            EventType[event_type.upper()]
            for event_type in options.get("event_types", [])
        ]

        periods = []
        period = None
        events = []
        for raw_event in raw_events:
            if raw_event["team"]["id"] == home_lineup["team_id"]:
                team = teams[0]
            elif raw_event["team"]["id"] == away_lineup["team_id"]:
                team = teams[1]
            else:
                raise Exception(
                    f"Unknown team_id {raw_event['team']['id']}"
                )

            if (
                raw_event["possession_team"]["id"]
                == home_lineup["team_id"]
            ):
                possession_team = teams[0]
            elif (
                raw_event["possession_team"]["id"]
                == away_lineup["team_id"]
            ):
                possession_team = teams[1]
            else:
                raise Exception(
                    f"Unknown possession_team_id: {raw_event['possession_team']}"
                )

            timestamp = parse_str_ts(raw_event["timestamp"])
            period_id = int(raw_event["period"])
            if not period or period.id != period_id:
                # a new period starts: StatsBomb timestamps restart at
                # 00:00:00 per period, so later periods are offset by the
                # previous period's end
                period = Period(
                    id=period_id,
                    start_timestamp=(
                        timestamp
                        if not period
                        # period = [start, end], add millisecond to prevent overlapping
                        else timestamp + period.end_timestamp + 0.001
                    ),
                    end_timestamp=None,
                )
                periods.append(period)
            else:
                # keep extending the current period's end as events arrive
                period.end_timestamp = period.start_timestamp + timestamp

            player = None
            if "player" in raw_event:
                player = team.get_player_by_id(raw_event["player"]["id"])

            event_type = raw_event["type"]["id"]
            # fidelity version selects the coordinate cell-offset scheme
            if event_type == SB_EVENT_TYPE_SHOT:
                fidelity_version = shot_fidelity_version
            elif event_type in (
                SB_EVENT_TYPE_CARRY,
                SB_EVENT_TYPE_DRIBBLE,
                SB_EVENT_TYPE_PASS,
            ):
                fidelity_version = xy_fidelity_version
            else:
                # TODO: Uh ohhhh.. don't know which one to pick
                fidelity_version = xy_fidelity_version

            generic_event_kwargs = dict(
                # from DataRecord
                period=period,
                timestamp=timestamp,
                ball_owning_team=possession_team,
                ball_state=BallState.ALIVE,
                # from Event
                event_id=raw_event["id"],
                team=team,
                player=player,
                coordinates=(
                    _parse_coordinates(
                        raw_event.get("location"),
                        fidelity_version
                    )
                    if "location" in raw_event
                    else None
                ),
                raw_event=raw_event,
            )

            if event_type == SB_EVENT_TYPE_PASS:
                pass_event_kwargs = _parse_pass(
                    pass_dict=raw_event["pass"],
                    team=team,
                    fidelity_version=fidelity_version,
                )
                event = PassEvent(
                    # TODO: Consider moving this to _parse_pass
                    receive_timestamp=timestamp + raw_event["duration"],
                    **pass_event_kwargs,
                    **generic_event_kwargs,
                )
            elif event_type == SB_EVENT_TYPE_SHOT:
                shot_event_kwargs = _parse_shot(
                    shot_dict=raw_event["shot"]
                )
                event = ShotEvent(
                    **shot_event_kwargs, **generic_event_kwargs
                )
            # For dribble and carry the definitions
            # are flipped between Statsbomb and kloppy
            elif event_type == SB_EVENT_TYPE_DRIBBLE:
                take_on_event_kwargs = _parse_take_on(
                    take_on_dict=raw_event["dribble"]
                )
                event = TakeOnEvent(
                    **take_on_event_kwargs, **generic_event_kwargs
                )
            elif event_type == SB_EVENT_TYPE_CARRY:
                carry_event_kwargs = _parse_carry(
                    carry_dict=raw_event["carry"],
                    fidelity_version=fidelity_version,
                )
                event = CarryEvent(
                    # TODO: Consider moving this to _parse_carry
                    end_timestamp=timestamp + raw_event["duration"],
                    **carry_event_kwargs,
                    **generic_event_kwargs,
                )
            else:
                # everything else becomes a barely-parsed generic event
                event = GenericEvent(
                    result=None,
                    event_name=raw_event["type"]["name"],
                    **generic_event_kwargs,
                )

            if (
                not wanted_event_types
                or event.event_type in wanted_event_types
            ):
                events.append(event)

    # StatsBomb pitch is 120x80 in its own coordinate system
    metadata = Metadata(
        teams=teams,
        periods=periods,
        pitch_dimensions=PitchDimensions(
            x_dim=Dimension(0, 120), y_dim=Dimension(0, 80)
        ),
        frame_rate=None,
        orientation=Orientation.ACTION_EXECUTING_TEAM,
        flags=DatasetFlag.BALL_OWNING_TEAM,
        score=None,
    )

    return EventDataset(metadata=metadata, records=events,)
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> TrackingDataset:
    """
    Deserialize Metrica tracking data into a `TrackingDataset`.

    Parameters
    ----------
    inputs : dict
        input `raw_data_home` should point to a `Readable` object
        containing the 'csv' formatted raw data for the home team.
        input `raw_data_away` should point to a `Readable` object
        containing the 'csv' formatted raw data for the away team.
    options : dict
        Options for deserialization of the Metrica file. Possible options
        are `sample_rate` (float between 0 and 1) to specify the amount of
        frames that should be loaded, `limit` to specify the max number of
        frames that will be returned.

    Returns
    -------
    dataset : TrackingDataset

    Raises
    ------
    ValueError when both input files don't seem to belong to each other

    See Also
    --------

    Examples
    --------
    >>> serializer = MetricaTrackingSerializer()
    >>> with open("Sample_Game_1_RawTrackingData_Away_Team.csv", "rb") as raw_home, \
    >>>     open("Sample_Game_1_RawTrackingData_Home_Team.csv", "rb") as raw_away:
    >>>
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'raw_data_home': raw_home,
    >>>             'raw_data_away': raw_away
    >>>         },
    >>>         options={
    >>>             'sample_rate': 1/12
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    sample_rate = float(options.get('sample_rate', 1.0))
    limit = int(options.get('limit', 0))

    # consider reading this from data
    frame_rate = 25

    with performance_logging("prepare", logger=logger):
        # each file yields per-team partial frames; zip pairs them up by
        # position in the stream
        home_iterator = self.__create_iterator(inputs['raw_data_home'],
                                               sample_rate, frame_rate)
        away_iterator = self.__create_iterator(inputs['raw_data_away'],
                                               sample_rate, frame_rate)
        partial_frames = zip(home_iterator, away_iterator)

    with performance_logging("loading", logger=logger):
        frames = []
        periods = []

        partial_frame_type = self.__PartialFrame
        home_partial_frame: partial_frame_type
        away_partial_frame: partial_frame_type
        for n, (home_partial_frame,
                away_partial_frame) in enumerate(partial_frames):
            # raises when the two files don't describe the same frame
            self.__validate_partials(home_partial_frame,
                                     away_partial_frame)

            period: Period = home_partial_frame.period
            frame_id: int = home_partial_frame.frame_id

            frame = Frame(frame_id=frame_id,
                          timestamp=frame_id / frame_rate -
                          period.start_timestamp,
                          ball_position=home_partial_frame.ball_position,
                          home_team_player_positions=home_partial_frame.
                          player_positions,
                          away_team_player_positions=away_partial_frame.
                          player_positions,
                          period=period,
                          ball_state=None,
                          ball_owning_team=None)

            frames.append(frame)

            # collect each period once, in order of appearance
            if not periods or period.id != periods[-1].id:
                periods.append(period)

            if not period.attacking_direction_set:
                period.set_attacking_direction(
                    attacking_direction=attacking_direction_from_frame(
                        frame))

            n += 1
            if limit and n >= limit:
                break

    orientation = (
        Orientation.FIXED_HOME_AWAY
        if periods[0].attacking_direction == AttackingDirection.HOME_AWAY
        else Orientation.FIXED_AWAY_HOME)

    # Metrica coordinates are normalized to a unit pitch (0..1)
    return TrackingDataset(
        flags=~(DatasetFlag.BALL_STATE | DatasetFlag.BALL_OWNING_TEAM),
        frame_rate=frame_rate,
        orientation=orientation,
        pitch_dimensions=PitchDimensions(x_dim=Dimension(0, 1),
                                         y_dim=Dimension(0, 1)),
        periods=periods,
        records=frames)
def deserialize(
    self, inputs: Dict[str, Readable], options: Dict = None
) -> TrackingDataset:
    """
    Deserialize EPTS tracking data into a `TrackingDataset`.

    Parameters
    ----------
    inputs : dict
        input `raw_data` should point to a `Readable` object containing
        the 'csv' formatted raw data. input `metadata` should point to
        the xml metadata data.
    options : dict
        Options for deserialization of the EPTS file. Possible options are
        `sample_rate` (float between 0 and 1) to specify the amount of
        frames that should be loaded, `limit` to specify the max number of
        frames that will be returned.

    Returns
    -------
    dataset : TrackingDataset

    Raises
    ------
    -

    See Also
    --------

    Examples
    --------
    >>> serializer = EPTSSerializer()
    >>> with open("metadata.xml", "rb") as meta, \
    >>>     open("raw.dat", "rb") as raw:
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'metadata': meta,
    >>>             'raw_data': raw
    >>>         },
    >>>         options={
    >>>             'sample_rate': 1/12
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    sample_rate = float(options.get("sample_rate", 1.0))
    limit = int(options.get("limit", 0))

    with performance_logging("Loading metadata", logger=logger):
        metadata = load_metadata(inputs["metadata"])

    with performance_logging("Loading data", logger=logger):
        # assume rows arrive sorted by time
        row_stream = read_raw_data(
            raw_data=inputs["raw_data"],
            metadata=metadata,
            sensor_ids=["position"],  # we don't care about other sensors
            sample_rate=sample_rate,
            limit=limit,
        )
        frames = [self._frame_from_row(row, metadata) for row in row_stream]

    return TrackingDataset(records=frames, metadata=metadata)
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> EventDataset:
    """
    Deserialize StatsBomb event data into a `EventDataset`.

    Parameters
    ----------
    inputs : dict
        input `event_data` should point to a `Readable` object containing
        the 'json' formatted event data. input `lineup_data` should point
        to a `Readable` object containing the 'json' formatted lineup data.
    options : dict
        Options for deserialization of the StatsBomb file. Possible options
        are `event_types` (list of event types) to specify the event types
        that should be returned. Valid types: "shot", "pass", "carry",
        "take_on" and "generic". Generic is everything other than the first
        4. Those events are barely parsed. This type of event can be used
        to do the parsing yourself. Every event has a 'raw_event' attribute
        which contains the original dictionary.

    Returns
    -------
    dataset : EventDataset

    Raises
    ------

    See Also
    --------

    Examples
    --------
    >>> serializer = StatsBombSerializer()
    >>> with open("events/12312312.json", "rb") as event_data, \
    >>>     open("lineups/123123123.json", "rb") as lineup_data:
    >>>
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'event_data': event_data,
    >>>             'lineup_data': lineup_data
    >>>         },
    >>>         options={
    >>>             'event_types': ["pass", "take_on", "carry", "shot"]
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    with performance_logging("load data", logger=logger):
        raw_events = json.load(inputs['event_data'])
        # the lineup file is a two-element list: [home, away]
        home_lineup, away_lineup = json.load(inputs['lineup_data'])
        shot_fidelity_version, xy_fidelity_version = _determine_xy_fidelity_versions(
            raw_events)
        logger.info(
            f"Determined Fidelity versions: shot v{shot_fidelity_version} / XY v{xy_fidelity_version}"
        )

    with performance_logging("parse data", logger=logger):
        # per-side player_id -> shirt number maps built from the lineups
        home_player_map = {
            player['player_id']: str(player['jersey_number'])
            for player in home_lineup['lineup']
        }
        away_player_map = {
            player['player_id']: str(player['jersey_number'])
            for player in away_lineup['lineup']
        }
        # an empty list means "keep every event type"
        wanted_event_types = [
            EventType[event_type.upper()]
            for event_type in options.get('event_types', [])
        ]

        periods = []
        period = None
        events = []
        for raw_event in raw_events:
            if raw_event['team']['id'] == home_lineup['team_id']:
                team = Team.HOME
                current_team_map = home_player_map
            elif raw_event['team']['id'] == away_lineup['team_id']:
                team = Team.AWAY
                current_team_map = away_player_map
            else:
                raise Exception(
                    f"Unknown team_id {raw_event['team']['id']}")

            if raw_event['possession_team']['id'] == home_lineup[
                    'team_id']:
                possession_team = Team.HOME
            elif raw_event['possession_team']['id'] == away_lineup[
                    'team_id']:
                possession_team = Team.AWAY
            else:
                raise Exception(
                    f"Unknown possession_team_id: {raw_event['possession_team']}"
                )

            timestamp = parse_str_ts(raw_event['timestamp'])
            period_id = int(raw_event['period'])
            if not period or period.id != period_id:
                # timestamps restart at 00:00:00 per period, so later
                # periods are offset by the previous period's end
                period = Period(id=period_id,
                                start_timestamp=timestamp if not period
                                else timestamp + period.end_timestamp,
                                end_timestamp=None)
                periods.append(period)
            else:
                # keep extending the current period's end as events arrive
                period.end_timestamp = period.start_timestamp + timestamp

            player_jersey_no = None
            if 'player' in raw_event:
                player_jersey_no = current_team_map[raw_event['player']
                                                    ['id']]

            event_type = raw_event['type']['id']
            # fidelity version selects the coordinate cell-offset scheme
            if event_type == SB_EVENT_TYPE_SHOT:
                fidelity_version = shot_fidelity_version
            elif event_type in (SB_EVENT_TYPE_CARRY,
                                SB_EVENT_TYPE_DRIBBLE,
                                SB_EVENT_TYPE_PASS):
                fidelity_version = xy_fidelity_version
            else:
                # TODO: Uh ohhhh.. don't know which one to pick
                fidelity_version = xy_fidelity_version

            generic_event_kwargs = dict(
                # from DataRecord
                period=period,
                timestamp=timestamp,
                ball_owning_team=possession_team,
                ball_state=BallState.ALIVE,
                # from Event
                event_id=raw_event['id'],
                team=team,
                player_jersey_no=player_jersey_no,
                position=(_parse_position(raw_event.get('location'),
                                          fidelity_version)
                          if 'location' in raw_event else None),
                raw_event=raw_event)

            if event_type == SB_EVENT_TYPE_PASS:
                pass_event_kwargs = _parse_pass(
                    pass_dict=raw_event['pass'],
                    current_team_map=current_team_map,
                    fidelity_version=fidelity_version)
                event = PassEvent(
                    # TODO: Consider moving this to _parse_pass
                    receive_timestamp=timestamp + raw_event['duration'],
                    **pass_event_kwargs,
                    **generic_event_kwargs)
            elif event_type == SB_EVENT_TYPE_SHOT:
                shot_event_kwargs = _parse_shot(
                    shot_dict=raw_event['shot'])
                event = ShotEvent(**shot_event_kwargs,
                                  **generic_event_kwargs)
            # For dribble and carry the definitions
            # are flipped between Statsbomb and kloppy
            elif event_type == SB_EVENT_TYPE_DRIBBLE:
                take_on_event_kwargs = _parse_take_on(
                    take_on_dict=raw_event['dribble'])
                event = TakeOnEvent(**take_on_event_kwargs,
                                    **generic_event_kwargs)
            elif event_type == SB_EVENT_TYPE_CARRY:
                carry_event_kwargs = _parse_carry(
                    carry_dict=raw_event['carry'],
                    fidelity_version=fidelity_version)
                event = CarryEvent(
                    # TODO: Consider moving this to _parse_carry
                    end_timestamp=timestamp + raw_event['duration'],
                    **carry_event_kwargs,
                    **generic_event_kwargs)
            else:
                # everything else becomes a barely-parsed generic event
                event = GenericEvent(result=None, **generic_event_kwargs)

            if not wanted_event_types or event.event_type in wanted_event_types:
                events.append(event)

    # StatsBomb pitch is 120x80 in its own coordinate system
    return EventDataset(flags=DatasetFlag.BALL_OWNING_TEAM,
                        orientation=Orientation.ACTION_EXECUTING_TEAM,
                        pitch_dimensions=PitchDimensions(
                            x_dim=Dimension(0, 120),
                            y_dim=Dimension(0, 80)),
                        periods=periods,
                        records=events)
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> TrackingDataSet:
    """
    Deserialize TRACAB tracking data into a `TrackingDataSet`.

    Parameters
    ----------
    inputs : dict
        input `raw_data` should point to a `Readable` object containing
        the 'csv' formatted raw data. input `meta_data` should point to
        the xml metadata data.
    options : dict
        Options for deserialization of the TRACAB file. Possible options
        are `only_alive` (boolean) to specify that only frames with alive
        ball state should be loaded, or `sample_rate` (float between 0 and
        1) to specify the amount of frames that should be loaded.

    Returns
    -------
    data_set : TrackingDataSet

    Raises
    ------
    -

    See Also
    --------

    Examples
    --------
    >>> serializer = TRACABSerializer()
    >>> with open("metadata.xml", "rb") as meta, \
    >>>     open("raw.dat", "rb") as raw:
    >>>     data_set = serializer.deserialize(
    >>>         inputs={
    >>>             'meta_data': meta,
    >>>             'raw_data': raw
    >>>         },
    >>>         options={
    >>>             'only_alive': True,
    >>>             'sample_rate': 1/12
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    sample_rate = float(options.get('sample_rate', 1.0))
    only_alive = bool(options.get('only_alive', True))

    with performance_logging("Loading metadata"):
        match = objectify.fromstring(inputs['meta_data'].read()).match
        frame_rate = int(match.attrib['iFrameRateFps'])
        pitch_size_width = float(match.attrib['fPitchXSizeMeters'])
        pitch_size_height = float(match.attrib['fPitchYSizeMeters'])

        periods = []
        for period in match.iterchildren(tag='period'):
            start_frame_id = int(period.attrib['iStartFrame'])
            end_frame_id = int(period.attrib['iEndFrame'])
            # a period with both frame ids at zero did not take place
            # (e.g. no extra time was played)
            if start_frame_id != 0 or end_frame_id != 0:
                periods.append(
                    Period(id=int(period.attrib['iId']),
                           start_timestamp=start_frame_id / frame_rate,
                           end_timestamp=end_frame_id / frame_rate))

    with performance_logging("Loading data"):

        def _iter():
            # yields every sample-th in-period line; n counts only
            # lines that fall inside a known period
            n = 0
            sample = 1. / sample_rate
            for line in inputs['raw_data'].readlines():
                line = line.strip().decode("ascii")
                if not line:
                    continue

                # frame id is the prefix before the first ':'
                frame_id = int(line[:10].split(":", 1)[0])
                if only_alive and not line.endswith("Alive;:"):
                    continue

                for period in periods:
                    if period.contains(frame_id / frame_rate):
                        if n % sample == 0:
                            yield period, line
                        n += 1

        frames = []
        for period, line in _iter():
            frame = self._frame_from_line(period, line, frame_rate)

            frames.append(frame)

            if not period.attacking_direction_set:
                period.set_attacking_direction(
                    attacking_direction=attacking_direction_from_frame(
                        frame))

    orientation = (
        Orientation.FIXED_HOME_AWAY
        if periods[0].attacking_direction == AttackingDirection.HOME_AWAY
        else Orientation.FIXED_AWAY_HOME)

    # TRACAB coordinates are centimeters centered on the pitch middle
    return TrackingDataSet(
        flags=DataSetFlag.BALL_OWNING_TEAM | DataSetFlag.BALL_STATE,
        frame_rate=frame_rate,
        orientation=orientation,
        pitch_dimensions=PitchDimensions(
            x_dim=Dimension(-1 * pitch_size_width / 2,
                            pitch_size_width / 2),
            y_dim=Dimension(-1 * pitch_size_height / 2,
                            pitch_size_height / 2),
            x_per_meter=100,
            y_per_meter=100),
        periods=periods,
        records=frames)
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> TrackingDataset:
    """
    Deserialize EPTS tracking data into a `TrackingDataset`.

    Parameters
    ----------
    inputs : dict
        input `raw_data` should point to a `Readable` object containing
        the 'csv' formatted raw data. input `meta_data` should point to
        the xml metadata data.
    options : dict
        Options for deserialization of the EPTS file. Possible options are
        `sample_rate` (float between 0 and 1) to specify the amount of
        frames that should be loaded, `limit` to specify the max number of
        frames that will be returned.

    Returns
    -------
    dataset : TrackingDataset

    Raises
    ------
    -

    See Also
    --------

    Examples
    --------
    >>> serializer = EPTSSerializer()
    >>> with open("metadata.xml", "rb") as meta, \
    >>>     open("raw.dat", "rb") as raw:
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'meta_data': meta,
    >>>             'raw_data': raw
    >>>         },
    >>>         options={
    >>>             'sample_rate': 1/12
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    sample_rate = float(options.get("sample_rate", 1.0))
    limit = int(options.get("limit", 0))

    with performance_logging("Loading metadata", logger=logger):
        meta_data = load_meta_data(inputs["meta_data"])

    periods = meta_data.periods

    with performance_logging("Loading data", logger=logger):
        # assume they are sorted
        frames = [
            self._frame_from_row(row, meta_data)
            for row in read_raw_data(
                raw_data=inputs["raw_data"],
                meta_data=meta_data,
                sensor_ids=["position"
                            ],  # we don't care about other sensors
                sample_rate=sample_rate,
                limit=limit,
            )
        ]

    # derive attacking direction: prefer the metadata periods, fall back
    # to the first frame, and give up when there is neither
    if periods:
        start_attacking_direction = periods[0].attacking_direction
    elif frames:
        start_attacking_direction = attacking_direction_from_frame(
            frames[0])
    else:
        start_attacking_direction = None

    orientation = (
        (Orientation.FIXED_HOME_AWAY
         if start_attacking_direction == AttackingDirection.HOME_AWAY
         else Orientation.FIXED_AWAY_HOME)
        if start_attacking_direction != AttackingDirection.NOT_SET
        else None)

    return TrackingDataset(
        flags=~(DatasetFlag.BALL_STATE | DatasetFlag.BALL_OWNING_TEAM),
        frame_rate=meta_data.frame_rate,
        orientation=orientation,
        pitch_dimensions=meta_data.pitch_dimensions,
        periods=periods,
        records=frames,
    )
def deserialize(self, inputs: Dict[str, Readable],
                options: Dict = None) -> EventDataset:
    """
    Deserialize Metrica Sports event data json format into a
    `EventDataset`.

    Parameters
    ----------
    inputs : dict
        input `raw_data` should point to a `Readable` object containing
        the 'json' formatted event data. input `metadata` should point
        to a `Readable` object containing the `xml` metadata file.
    options : dict
        Options for deserialization of the Metrica Sports event json
        file. Possible options are `event_types` (list of event types) to
        specify the event types that should be returned. Valid types:
        "shot", "pass", "carry", "take_on" and "generic". Generic is
        everything other than the first 4. Those events are barely parsed.
        This type of event can be used to do the parsing yourself. Every
        event has a 'raw_event' attribute which contains the original
        dictionary.

    Returns
    -------
    dataset : EventDataset

    Raises
    ------

    See Also
    --------

    Examples
    --------
    >>> serializer = MetricaEventsJsonSerializer()
    >>> with open("events.json", "rb") as raw_data, \
    >>>     open("metadata.xml", "rb") as metadata:
    >>>
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'raw_data': raw_data,
    >>>             'metadata': metadata
    >>>         },
    >>>         options={
    >>>             'event_types': ["pass", "take_on", "carry", "shot"]
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)
    if not options:
        options = {}

    with performance_logging("load data", logger=logger):
        raw_events = json.load(inputs["raw_data"])
        metadata = load_metadata(inputs["metadata"],
                                 provider=Provider.METRICA)

    with performance_logging("parse data", logger=logger):
        # an empty list means "keep every event type"
        wanted_event_types = [
            EventType[event_type.upper()]
            for event_type in options.get("event_types", [])
        ]

        events = []
        for raw_event in raw_events["data"]:
            if raw_event["team"]["id"] == metadata.teams[0].team_id:
                team = metadata.teams[0]
            elif raw_event["team"]["id"] == metadata.teams[1].team_id:
                team = metadata.teams[1]
            else:
                raise Exception(
                    f"Unknown team_id {raw_event['team']['id']}")

            player = team.get_player_by_id(raw_event["from"]["id"])
            event_type = raw_event["type"]["id"]
            subtypes = _parse_subtypes(raw_event)
            # look up the Period object matching this event's period id
            period = [
                period for period in metadata.periods
                if period.id == raw_event["period"]
            ][0]

            generic_event_kwargs = dict(
                # from DataRecord
                period=period,
                timestamp=raw_event["start"]["time"],
                ball_owning_team=_parse_ball_owning_team(event_type,
                                                         team),
                ball_state=BallState.ALIVE,
                # from Event
                event_id=None,
                team=team,
                player=player,
                coordinates=(_parse_coordinates(raw_event["start"])),
                raw_event=raw_event,
            )

            if event_type in MS_PASS_TYPES:
                pass_event_kwargs = _parse_pass(
                    event=raw_event,
                    subtypes=subtypes,
                    team=team,
                )
                event = PassEvent(
                    **pass_event_kwargs,
                    **generic_event_kwargs,
                )
            elif event_type == MS_EVENT_TYPE_SHOT:
                shot_event_kwargs = _parse_shot(event=raw_event,
                                                subtypes=subtypes)
                event = ShotEvent(**shot_event_kwargs,
                                  **generic_event_kwargs)
            # dribbles are flagged through subtypes, not a dedicated type
            elif subtypes and MS_EVENT_TYPE_DRIBBLE in subtypes:
                take_on_event_kwargs = _parse_take_on(subtypes=subtypes)
                event = TakeOnEvent(**take_on_event_kwargs,
                                    **generic_event_kwargs)
            elif event_type == MS_EVENT_TYPE_CARRY:
                carry_event_kwargs = _parse_carry(event=raw_event, )
                event = CarryEvent(
                    **carry_event_kwargs,
                    **generic_event_kwargs,
                )
            else:
                # everything else becomes a barely-parsed generic event
                event = GenericEvent(
                    result=None,
                    event_name=raw_event["type"]["name"],
                    **generic_event_kwargs,
                )

            if (not wanted_event_types
                    or event.event_type in wanted_event_types):
                events.append(event)

    return EventDataset(
        metadata=metadata,
        records=events,
    )
def deserialize(self, inputs: Dict[str, Readable], options: Dict = None) -> TrackingDataset:
    """
    Deserialize TRACAB tracking data into a `TrackingDataset`.

    Parameters
    ----------
    inputs : dict
        input `raw_data` should point to a `Readable` object containing
        the 'csv' formatted raw data. input `metadata` should point to
        the xml metadata data.
    options : dict
        Options for deserialization of the TRACAB file. Possible options
        are `only_alive` (boolean) to specify that only frames with
        alive ball state should be loaded, or `sample_rate` (float
        between 0 and 1) to specify the amount of frames that should be
        loaded, `limit` to specify the max number of frames that will be
        returned.

    Returns
    -------
    dataset : TrackingDataset

    Raises
    ------
    -

    See Also
    --------

    Examples
    --------
    >>> serializer = TRACABSerializer()
    >>> with open("metadata.xml", "rb") as meta, \
    >>>     open("raw.dat", "rb") as raw:
    >>>     dataset = serializer.deserialize(
    >>>         inputs={
    >>>             'metadata': meta,
    >>>             'raw_data': raw
    >>>         },
    >>>         options={
    >>>             'only_alive': True,
    >>>             'sample_rate': 1/12
    >>>         }
    >>>     )
    """
    self.__validate_inputs(inputs)

    if not options:
        options = {}

    sample_rate = float(options.get("sample_rate", 1.0))
    limit = int(options.get("limit", 0))
    only_alive = bool(options.get("only_alive", True))

    # TODO: also used in Metrica, extract to a method
    home_team = Team(team_id="home", name="home", ground=Ground.HOME)
    away_team = Team(team_id="away", name="away", ground=Ground.AWAY)
    teams = [home_team, away_team]

    with performance_logging("Loading metadata", logger=logger):
        match = objectify.fromstring(inputs["metadata"].read()).match
        frame_rate = int(match.attrib["iFrameRateFps"])
        pitch_size_width = float(match.attrib["fPitchXSizeMeters"])
        pitch_size_height = float(match.attrib["fPitchYSizeMeters"])

        periods = []
        for period in match.iterchildren(tag="period"):
            start_frame_id = int(period.attrib["iStartFrame"])
            end_frame_id = int(period.attrib["iEndFrame"])
            # A period with both frame ids at 0 was never played
            # (e.g. no extra time) and must be skipped.
            if start_frame_id != 0 or end_frame_id != 0:
                periods.append(
                    Period(
                        id=int(period.attrib["iId"]),
                        start_timestamp=start_frame_id / frame_rate,
                        end_timestamp=end_frame_id / frame_rate,
                    )
                )

    with performance_logging("Loading data", logger=logger):

        def _iter():
            """Yield (period, line) pairs, down-sampled by sample_rate."""
            n = 0
            sample = 1.0 / sample_rate
            for line_ in inputs["raw_data"].readlines():
                line_ = line_.strip().decode("ascii")
                if not line_:
                    continue

                # Frame id is the prefix before the first ':'.
                frame_id = int(line_[:10].split(":", 1)[0])
                if only_alive and not line_.endswith("Alive;:"):
                    continue

                for period_ in periods:
                    if period_.contains(frame_id / frame_rate):
                        # NOTE(review): float modulo sampling only
                        # selects exact multiples of 1/sample_rate;
                        # non-divisor rates under-sample slightly.
                        if n % sample == 0:
                            yield period_, line_
                        n += 1

        frames = []
        for period, line in _iter():
            frame = self._frame_from_line(teams, period, line, frame_rate)

            frames.append(frame)

            if not period.attacking_direction_set:
                period.set_attacking_direction(
                    attacking_direction=attacking_direction_from_frame(
                        frame
                    )
                )

            # Bug fix: the original `enumerate`-based check
            # (`if limit and n >= limit`) broke only after appending
            # frame index `limit`, returning limit + 1 frames. Compare
            # against the number of collected frames instead so exactly
            # `limit` frames are returned.
            if limit and len(frames) >= limit:
                break

    orientation = (
        Orientation.FIXED_HOME_AWAY
        if periods[0].attacking_direction == AttackingDirection.HOME_AWAY
        else Orientation.FIXED_AWAY_HOME
    )

    metadata = Metadata(
        teams=teams,
        periods=periods,
        # TRACAB coordinates are centimeters; pitch is centered at
        # (0, 0), hence the half-size dimensions and 100 units/meter.
        pitch_dimensions=PitchDimensions(
            x_dim=Dimension(
                -1 * pitch_size_width / 2, pitch_size_width / 2
            ),
            y_dim=Dimension(
                -1 * pitch_size_height / 2, pitch_size_height / 2
            ),
            x_per_meter=100,
            y_per_meter=100,
        ),
        score=None,
        frame_rate=frame_rate,
        orientation=orientation,
        provider=Provider.TRACAB,
        flags=DatasetFlag.BALL_OWNING_TEAM | DatasetFlag.BALL_STATE,
    )

    return TrackingDataset(
        records=frames,
        metadata=metadata,
    )
def run_query(argv=sys.argv[1:]): parser = argparse.ArgumentParser(description="Run query on event data") parser.add_argument('--input-statsbomb', help="StatsBomb event input files (events.json,lineup.json)") parser.add_argument('--output-xml', help="Output file") parser.add_argument('--with-success', default=True, help="Input existence of success capture in output") parser.add_argument('--prepend-time', default=7, help="Seconds to prepend to match") parser.add_argument('--append-time', default=5, help="Seconds to append to match") parser.add_argument('--query-file', help="File containing the query", required=True) parser.add_argument('--stats', default="none", help="Show matches stats", choices=["text", "json", "none"]) parser.add_argument('--show-events', default=False, help="Show events for each match", action="store_true") logger = logging.getLogger("run_query") logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") opts = parser.parse_args(argv) query = load_query(opts.query_file) dataset = None if opts.input_statsbomb: with performance_logging("load dataset", logger=logger): events_filename, lineup_filename = opts.input_statsbomb.split(",") dataset = load_statsbomb_event_data( events_filename.strip(), lineup_filename.strip(), options={ "event_types": query.event_types } ) if not dataset: raise Exception("You have to specify a dataset.") with performance_logging("searching", logger=logger): matches = pm.search(dataset, query.pattern) video_fragments = [] counter = Counter() for i, match in enumerate(matches): team = match.events[0].team success = 'success' in match.captures counter.update({ f"{team}_total": 1, f"{team}_success": 1 if success else 0 }) if opts.show_events: print_match(i, match, success, str(team)) if opts.output_xml: label = str(team) if opts.with_success and success: label += " success" start_timestamp = ( match.events[0].timestamp + match.events[0].period.start_timestamp - 
opts.prepend_time ) end_timestamp = ( match.events[-1].timestamp + match.events[-1].period.start_timestamp + opts.append_time ) video_fragments.append( VideoFragment( id_=str(i), start=start_timestamp, end=end_timestamp, label=label ) ) if opts.output_xml: write_to_xml(video_fragments, opts.output_xml) logger.info(f"Wrote {len(video_fragments)} video fragments to file") if opts.stats == "text": print("Home:") print(f" total count: {counter['home_total']}") print( f" success: {counter['home_success']} ({counter['home_success'] / counter['home_total'] * 100:.0f}%)") print( f" no success: {counter['home_total'] - counter['home_success']} ({(counter['home_total'] - counter['home_success']) / counter['home_total'] * 100:.0f}%)") print("") print("Away:") print(f" total count: {counter['away_total']}") print( f" success: {counter['away_success']} ({counter['away_success'] / counter['away_total'] * 100:.0f}%)") print( f" no success: {counter['away_total'] - counter['away_success']} ({(counter['away_total'] - counter['away_success']) / counter['away_total'] * 100:.0f}%)") elif opts.stats == "json": import json print(json.dumps(counter, indent=4))