def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: discrete_features_v2.DiscreteFeaturesV2,
         agent: QLearningAgentV5, actions: DiscreteActionsV5,
         reward_funct) -> float:
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
    between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
    agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run the test episodes with the greedy (exploit-only) policy
    sum_score = 0
    for ep in range(num_episodes):
        print('<Test> {}/{}:'.format(ep, num_episodes))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Test loop:
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            action_name = actions.map_action_to_str(action_idx, has_ball)
            # Step: repeat the mapped HFO action num_rep times
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(hfo_action_params,
                                                          has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Update features:
            features.update_features(observation)
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            sum_score += reward
        # Reset player:
        agent.reset(training=False)
        # Game reset
        game_interface.reset()
    return sum_score / num_episodes
def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: DiscreteFeatures, agent: QLearningAgentTest,
         actions: DiscreteActionsV5, reward_funct) -> float:
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
    between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
    agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run the test episodes with the greedy (exploit-only) policy
    sum_score = 0
    for ep in range(num_episodes):
        # Check if server still up:
        if game_interface.hfo.step() == SERVER_DOWN:
            raise ServerDownError("testing; episode={}".format(ep))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Test loop:
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            if not has_ball:
                # Without the ball, simply move towards it:
                hfo_action_params = GO_TO_BALL
                num_rep = 5
            else:
                # Act:
                debug_counter += 1
                action_idx = agent.exploit_actions(curr_state_id)
                hfo_action_params, num_rep = \
                    actions.map_action_idx_to_hfo_action(
                        agent_pos=features.get_pos_tuple(),
                        has_ball=has_ball,
                        action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params,
                repetitions=num_rep,
                has_ball=has_ball,
                game_interface=game_interface)
            # Update features and score:
            reward = reward_funct(status)
            features.update_features(observation)
            sum_score += reward
        # Game reset
        game_interface.reset()
    print("## AVR Test reward = ", sum_score / num_episodes)
    return sum_score / num_episodes
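# NOTE: the test loop above (and the train loops below) delegate stepping to
# an execute_action helper that is not shown in this excerpt. The sketch
# below is only an assumption inferred from the call sites: repeat the mapped
# HFO action up to `repetitions` times while the episode is still running and
# return the last (status, observation) pair.
def execute_action(action_params: tuple, repetitions: int, has_ball: bool,
                   game_interface: HFOAttackingPlayer) -> tuple:
    # Always step at least once, then keep repeating while in game:
    status, observation = game_interface.step(action_params, has_ball)
    rep = 1
    while game_interface.in_game() and rep < repetitions:
        status, observation = game_interface.step(action_params, has_ball)
        rep += 1
    return status, observation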
def go_to_origin_position(game_interface: HFOAttackingPlayer,
                          features: DiscreteFeatures,
                          actions: DiscreteActionsV5,
                          random_start: bool = True):
    if random_start:
        pos_name, origin_pos = random.choice(list(ORIGIN_POSITIONS.items()))
    else:
        pos_name = "Fixed start"
        origin_pos = features.get_pos_tuple()
    print("Moving to starting point: {0}".format(pos_name))
    pos = features.get_pos_tuple(round_ndigits=1)
    while origin_pos != pos:
        has_ball = features.has_ball()
        hfo_action: tuple = actions.dribble_to_pos(origin_pos)
        status, observation = game_interface.step(hfo_action, has_ball)
        features.update_features(observation)
        pos = features.get_pos_tuple(round_ndigits=1)
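# ORIGIN_POSITIONS is used above (and indexed with "MID LEFT" further down)
# but is not defined in this excerpt. A purely illustrative sketch of its
# shape, mapping a position name to an (x, y) tuple in HFO's normalized field
# coordinates; the coordinates below are placeholders, not the project's
# actual values:
ORIGIN_POSITIONS = {
    "TOP LEFT": (-0.5, -0.7),
    "MID LEFT": (-0.5, 0.0),
    "BOTTOM LEFT": (-0.5, 0.7),
}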
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer, features: DiscreteFeatures,
          agent: QLearningAgentTest, actions: DiscreteActionsV5,
          reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of training episodes
    @param game_interface: game interface that manages the interaction
    between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
    agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: None (the agent is trained in place)
    """
    sum_score = 0
    for ep in range(num_train_episodes):
        # Check if server still up:
        if game_interface.hfo.step() == SERVER_DOWN:
            raise ServerDownError("training; episode={}".format(ep))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Start learning loop
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            if not has_ball:
                # Without the ball, simply move towards it:
                hfo_action_params = GO_TO_BALL
                num_rep = 5
                status, observation = execute_action(
                    action_params=hfo_action_params,
                    repetitions=num_rep,
                    has_ball=has_ball,
                    game_interface=game_interface)
                features.update_features(observation)
                reward = reward_funct(status)
            else:
                # Act:
                action_idx = agent.act(curr_state_id)
                hfo_action_params, num_rep = \
                    actions.map_action_idx_to_hfo_action(
                        agent_pos=features.get_pos_tuple(),
                        has_ball=has_ball,
                        action_idx=action_idx)
                # Step:
                status, observation = execute_action(
                    action_params=hfo_action_params,
                    repetitions=num_rep,
                    has_ball=has_ball,
                    game_interface=game_interface)
                # Update environment features:
                reward = reward_funct(status)
                sum_score += reward
                features.update_features(observation)
                new_state_id = features.get_state_index()
                agent.store_ep(state_idx=curr_state_id,
                               action_idx=action_idx,
                               reward=reward,
                               next_state_idx=new_state_id,
                               has_ball=has_ball,
                               done=not game_interface.in_game())
        agent.learn_buffer(reward)
        agent.update_hyper_parameters(num_total_episodes=num_total_train_ep)
        # Game reset
        game_interface.reset()
    print("## AVR Train reward = ", sum_score / num_train_episodes)
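# ServerDownError is raised in the test/train variants above when the HFO
# server stops responding, and NoActionPlayedError appears in the last train
# variant below. Both are project-specific exceptions defined elsewhere; a
# minimal sketch of how they could be declared:
class ServerDownError(Exception):
    """Raised when the HFO server stops responding during training/testing."""


class NoActionPlayedError(Exception):
    """Raised when an episode ends before the agent was able to act."""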
num_episodes = (num_train_ep + num_test_ep) * num_repetitions
# Directory
save_dir = args.save_dir or mkdir()
print("Starting Training - id={}; num_opponents={}; num_teammates={}; "
      "num_episodes={};".format(agent_id, num_op, num_team, num_episodes))
# Initialize connection with the HFO server
hfo_interface = HFOAttackingPlayer(agent_id=agent_id,
                                   num_opponents=num_op,
                                   num_teammates=num_team)
hfo_interface.connect_to_server()
# Agent set-up
reward_function = basic_reward
features_manager = DiscreteFeatures(num_team, num_op)
actions_manager = DiscreteActionsV5()
agent = QLearningAgentTest(num_states=features_manager.get_num_states(),
                           num_actions=actions_manager.get_num_actions(),
                           learning_rate=0.1,
                           discount_factor=0.9,
                           epsilon=0.8)
# Save metrics structures
avr_rewards_list = []
avr_epsilons_list = []
trained_eps_list = []
q_tables_list = []
# Test once before any training:
av_reward = test(num_episodes=num_test_ep, agent=agent,
                 game_interface=hfo_interface, features=features_manager,
                 actions=actions_manager, reward_funct=reward_function)
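# basic_reward is assigned above but not defined in this excerpt. A minimal
# sketch of a status-based reward, assuming the standard HFO status codes
# (GOAL, CAPTURED_BY_DEFENSE, OUT_OF_BOUNDS, OUT_OF_TIME); the exact values
# the project uses may differ:
def basic_reward(status: int) -> int:
    if status == GOAL:
        return 1
    elif status in (CAPTURED_BY_DEFENSE, OUT_OF_BOUNDS, OUT_OF_TIME):
        return -1
    return 0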
save_dir = args.save_dir
print("Q Table file: ", load_file)
print("Starting Test - num_opponents={}; num_teammates={}; "
      "num_episodes={};".format(num_op, num_team, num_episodes))
# Initialize connection with the HFO server
hfo_interface = HFOAttackingPlayer(agent_id=agent_id,
                                   num_opponents=num_op,
                                   num_teammates=num_team)
hfo_interface.connect_to_server()
# Agent set-up
reward_function = reward_functions.basic_reward
features_manager = discrete_features_v2.DiscreteFeaturesV2(num_team, num_op)
actions_manager = DiscreteActionsV5(
    origin_pos=learning_agent.ORIGIN_POSITIONS["MID LEFT"])
agent = learning_agent.QLearningAgentV5(
    num_states=features_manager.get_num_states(),
    num_actions=actions_manager.get_num_actions(),
    dir=save_dir)
agent.load_q_table(load_file)
# Run the test episodes with the loaded Q-table:
av_reward = learning_agent.test(num_episodes=num_episodes,
                                agent=agent,
                                game_interface=hfo_interface,
                                features=features_manager,
                                actions=actions_manager,
                                reward_funct=reward_function)
print("Av reward = {}".format(av_reward))
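# load_q_table (used above) and save_model (used in the train variants) are
# agent methods whose implementation is not part of this excerpt. A minimal
# numpy-based sketch of what such Q-table persistence could look like; the
# file format and method names are assumptions, not the project's actual API:
import numpy as np

def save_q_table(q_table: np.ndarray, file_path: str):
    # np.save appends ".npy" if the path has no extension
    np.save(file_path, q_table)

def load_q_table(file_path: str) -> np.ndarray:
    return np.load(file_path)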
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer,
          features: discrete_features_v2.DiscreteFeaturesV2,
          agent: QLearningAgentV5, actions: DiscreteActionsV5,
          save_metrics: bool, reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of training episodes
    @param game_interface: game interface that manages the interaction
    between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
    agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param save_metrics: flag; if true, save the metrics;
    @param reward_funct: reward function used
    @return: (QLearningAgentV5) the trained agent
    """
    for ep in range(num_train_episodes):
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Start learning loop
        aux_positions_names = set()
        aux_actions_played = set()
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step: repeat the mapped HFO action num_rep times
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(hfo_action_params,
                                                          has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Save metrics:
            if save_metrics:
                agent.save_visited_state(curr_state_id, action_idx)
                agent.cum_reward += reward
                aux_positions_names.add(features.get_position_name())
                action_name = actions.map_action_to_str(action_idx, has_ball)
                aux_actions_played.add(action_name)
            # Update environment features:
            prev_state_id = curr_state_id
            features.update_features(observation)
            curr_state_id = features.get_state_index()
            agent.store_ep(state_idx=prev_state_id,
                           action_idx=action_idx,
                           reward=reward,
                           next_state_idx=curr_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
            agent.learn()
        # print(':: Episode: {}; reward: {}; epsilon: {}; positions: {}; '
        #       'actions: {}'.format(ep, agent.cum_reward, agent.epsilon,
        #                            aux_positions_names, aux_actions_played))
        if save_metrics:
            agent.save_metrics(agent.old_q_table, agent.q_table)
        # Reset player:
        agent.reset()
        agent.update_hyper_parameters(episode=agent.train_eps,
                                      num_total_episodes=num_total_train_ep)
        # Game reset
        game_interface.reset()
    agent.save_model()
    if save_metrics:
        actions_name = [actions.map_action_to_str(i, has_ball=True)
                        for i in range(agent.num_actions)]
        agent.export_metrics(training=True, actions_name=actions_name)
    return agent
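# agent.learn() above (and learn_buffer() in the other train variants) apply
# the Q-learning update to stored transitions; the agent classes themselves
# are not part of this excerpt. A sketch of the standard tabular update,
# assuming q_table is a 2-D numpy array indexed by [state, action]; the
# default learning_rate and discount_factor mirror the 0.1 and 0.9 used in
# the set-up script above, but the rest is illustrative only:
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
def q_learning_update(q_table, state, action, reward, next_state, done,
                      learning_rate=0.1, discount_factor=0.9):
    target = reward
    if not done:
        target += discount_factor * q_table[next_state].max()
    q_table[state, action] += learning_rate * (target - q_table[state, action])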
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer, features: DiscreteFeaturesV2,
          agent: QLearningAgentV6, actions: DiscreteActionsV5,
          reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of training episodes
    @param game_interface: game interface that manages the interaction
    between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
    agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: None (the agent is trained in place)
    """
    sum_score = 0
    sum_epsilons = 0
    agent.counter_explorations = 0
    agent.counter_exploitations = 0
    for ep in range(num_train_episodes):
        # Check if server still up:
        # if game_interface.hfo.step() == SERVER_DOWN:
        #     raise ServerDownError("training; episode={}".format(ep))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Start learning loop
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            debug_counter += 1
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params,
                repetitions=num_rep,
                has_ball=has_ball,
                game_interface=game_interface)
            # Update environment features:
            reward = reward_funct(status)
            sum_score += reward
            features.update_features(observation)
            new_state_id = features.get_state_index()
            agent.store_ep(state_idx=curr_state_id,
                           action_idx=action_idx,
                           reward=reward,
                           next_state_idx=new_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
            if status == OUT_OF_TIME:
                if debug_counter < 5:
                    raise NoActionPlayedError(
                        "agent was only able to choose {}".format(
                            debug_counter))
        agent.learn_buffer()
        agent.update_hyper_parameters(num_total_episodes=num_total_train_ep)
        sum_epsilons += agent.epsilon
        # Game reset
        game_interface.reset()
    print("<<TRAIN>> AVR reward = ", sum_score / num_train_episodes)
    print("<<TRAIN>> %Explorations={}%".format(
        round(agent.counter_explorations /
              (agent.counter_exploitations + agent.counter_explorations),
              4) * 100))
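# update_hyper_parameters is called after every episode in the train variants
# above, but its body is not part of this excerpt. A plausible sketch only: a
# linear decay of epsilon over the total number of training episodes, with
# the starting value matching the epsilon=0.8 used in the set-up script above
# and the floor of 0.05 being an assumption:
def linear_epsilon(episode: int, num_total_episodes: int,
                   eps_start: float = 0.8, eps_end: float = 0.05) -> float:
    fraction = min(1.0, episode / num_total_episodes)
    return eps_start + fraction * (eps_end - eps_start)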