def run(team_name, number, first_player):
    interface = inter.AgentInterface(team_name, number)
    if first_player:
        interface.set_home(3, 2)
    else:
        interface.set_home(4, 2)
    obs = interface.observe_from_server()
    # Block until the server announces the start of the match.
    while ("start", 0) not in obs:
        obs = interface.observe_from_server()
    # Act once per simulation cycle until the server signals a stop.
    while ("stop", 0) not in obs:
        new_cycle = False
        for o in obs:
            if o[0] == "cycle":
                new_cycle = True
                obs = []
                break
        if new_cycle:
            interface.send_action("move", 1)
        obs = interface.observe_from_server()
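# --- Usage sketch (editor addition, not part of the original file) ----------
# A minimal, hedged example of how run() above might be launched: one process
# per player, so each agent can block on observe_from_server() independently.
# The team name "my_team" and the __main__ guard are assumptions.
from multiprocessing import Process

if __name__ == "__main__":
    players = [Process(target=run, args=("my_team", n, n == 1)) for n in (1, 2)]
    for p in players:
        p.start()
    for p in players:
        p.join()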
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    num_episodes = 0
    self.agents = []
    self.agent_obs = []
    for a in range(Config.num_players_per_team):
        agent = AgentInterface('COMA', a + 1)
        agent.set_home(
            int(Config.rows / 2) - int(Config.num_players_per_team / 2) + 1 + a,
            2)
        self.agents.append(agent)
        obs = agent.observe_from_server()
        self.agent_obs.append(obs)
    initial_state = None
    for a in range(Config.num_players_per_team):
        while ("start", 0) not in self.agent_obs[a]:
            self.agent_obs[a] = self.agents[a].observe_from_server()
    # Wait until every agent has seen at least one cycle observation.
    initial_obs = 0
    while initial_obs < Config.num_players_per_team:
        for a in range(Config.num_players_per_team):
            new_cycle = False
            agent = self.agents[a]
            self.agent_obs[a] = agent.observe_from_server()
            obs = self.agent_obs[a]
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                initial_state = self.get_state(agent, obs)
                initial_obs += 1
    i = 0  # samples processed since the last target-network sync
    self.t = 0
    episodes = 0
    goals_scored = 0
    while True:
        states = None
        actions = None
        TDTargets = None
        # Collect a batch of episodes. (Loop variable renamed from `i` to `b`:
        # the original shadowed the target-update counter above.)
        for b in range(Config.batch_size):
            estates, eactions, erewards, next_state, won = self.sample_episode(
                initial_state)
            episodes += 1
            if won:
                goals_scored += 1
            if episodes % Config.num_train_episodes == 0:
                print("TRAIN PROPORTION OF EPISODES WON %s" % (
                    float(goals_scored) / Config.num_train_episodes))
                with open(Config.model_name + '.out', 'a') as f:
                    f.write('%s,%s\n' % (
                        self.t,
                        float(goals_scored) / Config.num_train_episodes))
                if Config.save_model:
                    self.save()
                episodes = 0
                goals_scored = 0
            initial_state = next_state
            batch_targets = self.getTDTargets(estates, eactions, erewards)
            if TDTargets is None:
                TDTargets = batch_targets
            else:
                TDTargets = np.append(TDTargets, batch_targets, axis=0)
            if states is None:
                states = estates
            else:
                states = np.append(states, estates, axis=0)
            if actions is None:
                actions = eactions
            else:
                actions = np.append(actions, eactions, axis=0)
        self.t += len(states)
        # Train on the collected batch in mini-batches.
        mini_batch_start = 0
        while mini_batch_start < states.shape[0]:
            next_mini_batch = min(states.shape[0],
                                  mini_batch_start + Config.max_batch_size)
            mstates = states[mini_batch_start:next_mini_batch]
            mactions = actions[mini_batch_start:next_mini_batch]
            mTDTargets = TDTargets[mini_batch_start:next_mini_batch]
            # Critic update: fit the centralized Q to the TD targets.
            for a in range(Config.num_players_per_team):
                a_minus = np.delete(mactions, a, 1)
                agent_actions = mactions[:, a]
                loss, _ = self.sess.run(
                    [self.loss, self.train_op_global],
                    feed_dict={
                        self.s: mstates,
                        self.a: agent_actions,
                        self.a_minus: a_minus,
                        self.TD_targets: mTDTargets[:, a],
                        self.agent: [
                            a for _ in range(next_mini_batch - mini_batch_start)
                        ],
                        self.e: self.get_epsilon()
                    })
            # Actor update: compute counterfactual advantages, then step the policy.
            for a in range(Config.num_players_per_team):
                a_minus = np.delete(mactions, a, 1)
                agent_actions = mactions[:, a]
                advantages = self.sess.run(
                    [self.advantage],
                    feed_dict={
                        self.s: mstates,
                        self.s_local: self.get_states_local(a, mstates),
                        self.a: agent_actions,
                        self.a_minus: a_minus,
                        self.agent: [
                            a for _ in range(next_mini_batch - mini_batch_start)
                        ],
                        self.e: self.get_epsilon()
                    })[0]
                self.sess.run(
                    [self.policy_op],
                    feed_dict={
                        self.s: mstates,
                        self.s_local: self.get_states_local(a, mstates),
                        self.a: agent_actions,
                        self.agent: [
                            a for _ in range(next_mini_batch - mini_batch_start)
                        ],
                        self.advantages_in: advantages,
                        self.e: self.get_epsilon()
                    })
            if i > Config.target_update_freq:
                self.update_target_params()
                i = 0
            i += next_mini_batch - mini_batch_start
            mini_batch_start = next_mini_batch
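# --- Illustrative sketch (editor addition; an assumption, not the repo's
# actual TensorFlow graph). The self.advantage op fetched above presumably
# implements COMA's counterfactual advantage for agent a:
#     A^a(s, u) = Q(s, (u^-a, u^a)) - sum_{u'} pi^a(u' | s) * Q(s, (u^-a, u'))
# i.e. the joint Q-value of the taken action minus a baseline that
# marginalises out agent a's own action under its current policy.
import numpy as np

def counterfactual_advantage(q_values, pi, chosen_action):
    """q_values[k]: Q(s, (u^-a, k)) for each candidate action k of agent a.
    pi[k]: agent a's policy probability of action k."""
    baseline = np.dot(pi, q_values)  # expected Q with agent a's action marginalised out
    return q_values[chosen_action] - baseline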
def train():
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team_prev = 0
    score_opp_prev = 0
    i = 0
    while True:
        i += 1
        agent = AgentInterface('Q-Tabular', player)
        agent.set_home(3 * player - 1, 3)
        obs = agent.observe_from_server()
        state_prev = None
        action_prev = None
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
        while True:
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                state_new = get_state(agent, obs)
                if action_prev is not None:
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            # The server reports (left, right); flip so that
                            # score[0] is always this agent's team.
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    score_team_new, score_opp_new = score[0], score[1]
                    reward_prev = reward(state_prev, state_new, action_prev,
                                         score_team_prev, score_opp_prev,
                                         score_team_new, score_opp_new)
                    print(reward_prev)
                    score_team_prev = score_team_new
                    score_opp_prev = score_opp_new
                    update_Q(state_prev, action_prev, reward_prev, state_new)
                action_new = get_action(state_new)
                if action_new <= 8:
                    agent.send_action("move", action_new)
                else:
                    teammate = 2 if player == 1 else 1
                    agent.send_action("pass", teammate)
                state_prev = state_new.copy()
                action_prev = action_new
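# --- Hedged sketch of update_Q (editor addition; the real implementation
# lives elsewhere in this repo). Standard one-step tabular Q-learning,
# assuming states are hashable (e.g. tuples); ALPHA, GAMMA and the global
# table Q are hypothetical names.
from collections import defaultdict

ALPHA, GAMMA = 0.1, 0.99
Q = defaultdict(lambda: defaultdict(float))  # Q[state][action] -> value

def update_Q_sketch(state_prev, action_prev, reward_prev, state_new):
    best_next = max(Q[state_new].values() or [0.0])
    Q[state_prev][action_prev] += ALPHA * (
        reward_prev + GAMMA * best_next - Q[state_prev][action_prev])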
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team_prev = 0
    score_opp_prev = 0
    i = 0
    while True:
        i += 1
        agent = AgentInterface('SMART', self.player)
        agent.set_home(2 + self.player, 2)
        obs = agent.observe_from_server()
        state_prev = None
        action_prev = None
        t = 0
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
        while True:
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                state_new = self.get_state(agent, obs)
                if action_prev is not None:
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    score_team_new, score_opp_new = score[0], score[1]
                    reward_prev = self.reward(state_prev, state_new, action_prev,
                                              score_team_prev, score_opp_prev,
                                              score_team_new, score_opp_new)
                    print(reward_prev)
                    score_team_prev = score_team_new
                    score_opp_prev = score_opp_new
                    # One-step TD update on the latest transition.
                    loss, _ = self.sess.run(
                        [self.loss, self.train_op],
                        feed_dict={
                            self.s: [state_prev],
                            self.s_: [state_new],
                            self.r: [reward_prev],
                            self.a: [action_prev]
                        })
                    if t % Config.target_update_freq == 0:
                        self.update_target_params()
                    t += 1
                action_new = self.get_action(state_new)[0]
                if action_new <= 8:
                    agent.send_action("move", action_new)
                else:
                    teammate = 2 if self.player == 1 else 1
                    agent.send_action("pass", teammate)
                state_prev = state_new.copy()
                action_prev = action_new
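# --- Hedged sketch of update_target_params (editor addition; an assumption
# about this repo's TF1 graph, with hypothetical scope names "q" and
# "target_q"). A hard sync copies every online-network variable into the
# target network, which is what the Config.target_update_freq schedule
# above relies on.
import tensorflow as tf

def build_target_update_op():
    q_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="q")
    t_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_q")
    return tf.group(*[tf.assign(t, q) for q, t in zip(q_vars, t_vars)])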
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team = 0
    score_opp = 0
    i = 0
    while True:
        i += 1
        agents = []
        agent_obs = []
        for a in range(Config.num_players_per_team):
            agent = AgentInterface('SMART', a)
            agent.set_home(3 + a, 2)
            agents.append(agent)
            obs = agent.observe_from_server()
            agent_obs.append(obs)
        states = np.zeros((Config.num_players_per_team, Config.state_size))
        for a in range(Config.num_players_per_team):
            while ("start", 0) not in agent_obs[a]:
                agent_obs[a] = agents[a].observe_from_server()
            states[a] = self.get_state(agents[a], agent_obs[a])
        new_states = np.zeros(
            (Config.num_players_per_team, Config.state_size))
        while True:
            for a in range(Config.num_players_per_team):
                agent_obs[a] = agents[a].observe_from_server()
                new_cycle = False
                for o in agent_obs[a]:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    action = self.get_action([states[a]])[0]
                    if action <= 8:
                        agents[a].send_action("move", action)
                    else:
                        # Pass-target indices skip the agent's own index.
                        if action - 9 < a:
                            agents[a].send_action("pass", action - 9)
                        else:
                            agents[a].send_action("pass", action - 8)
                    new_states[a] = self.get_state(agents[a], agent_obs[a])
                    score = None
                    for o in agent_obs[a]:
                        if o[0] == "score":
                            if agents[a].left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    reward = 0.
                    if score[0] > score_team:
                        reward = 1.
                    elif score[1] > score_opp:
                        reward = -1.
                    score_team = score[0]
                    score_opp = score[1]
                    loss, _ = self.sess.run(
                        [self.loss, self.train_op],
                        feed_dict={
                            self.s: [states[a]],
                            self.r: [reward],
                            self.a: [action]
                        })
                    states[a] = new_states[a]
def train():
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team_prev = [0 for a in range(Config.num_players_per_team)]
    score_opp_prev = [0 for a in range(Config.num_players_per_team)]
    i = 0
    while True:
        i += 1
        agents = []
        agent_obs = []
        for a in range(Config.num_players_per_team):
            agent = AgentInterface('Q-Tabular-PS', a)
            agent.set_home(3 * a + 2, 3)
            agents.append(agent)
            obs = agent.observe_from_server()
            agent_obs.append(obs)
        # NOTE: the two-element lists below (and `teammate = 1 - a`) assume
        # exactly two players per team.
        state_prev = [None, None]
        action_prev = [None, None]
        for a in range(Config.num_players_per_team):
            while ("start", 0) not in agent_obs[a]:
                agent_obs[a] = agents[a].observe_from_server()
        while True:
            for a in range(Config.num_players_per_team):
                agent = agents[a]
                agent_obs[a] = agent.observe_from_server()
                obs = agent_obs[a]
                new_cycle = False
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    state_new = get_state(agent, obs)
                    if action_prev[a] is not None:
                        score = None
                        for o in obs:
                            if o[0] == "score":
                                if agent.left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        score_team_new, score_opp_new = score[0], score[1]
                        reward_prev = reward(state_prev[a], state_new,
                                             action_prev[a],
                                             score_team_prev[a],
                                             score_opp_prev[a],
                                             score_team_new, score_opp_new)
                        score_team_prev[a] = score_team_new
                        score_opp_prev[a] = score_opp_new
                        update_Q(state_prev[a], action_prev[a], reward_prev,
                                 state_new)
                    action_new = get_action(state_new)
                    if action_new <= 8:
                        agent.send_action("move", action_new)
                    else:
                        teammate = 1 - a
                        agent.send_action("pass", teammate)
                    state_prev[a] = state_new.copy()
                    action_prev[a] = action_new
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team = 0
    score_opp = 0
    i = 0
    while True:
        i += 1
        agent = AgentInterface('SMART', player)
        agent.set_home(3 + player, 2)
        obs = agent.observe_from_server()
        state = np.zeros(Config.state_size)
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
        state = self.get_state(agent, obs)
        new_state = np.zeros(Config.state_size)
        while True:
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                action = self.get_action([state])[0]
                if action <= 8:
                    agent.send_action("move", action)
                else:
                    # Pass-target indices skip the agent's own index.
                    if action - 9 < player:
                        agent.send_action("pass", action - 9)
                    else:
                        agent.send_action("pass", action - 8)
                new_state = self.get_state(agent, obs)
                score = None
                for o in obs:
                    if o[0] == "score":
                        if agent.left_team:
                            score = [o[1][0], o[1][1]]
                        else:
                            score = [o[1][1], o[1][0]]
                reward = 0.
                if score[0] > score_team:
                    reward = 1.
                elif score[1] > score_opp:
                    reward = -1.
                else:
                    reward = -0.2  # small step penalty to encourage scoring quickly
                score_team = score[0]
                score_opp = score[1]
                loss, _, grad_norm = self.sess.run(
                    [self.loss, self.train_op, self.grad_norm],
                    feed_dict={self.s: [state],
                               self.r: [reward],
                               self.a: [action]})
                print(grad_norm)
                state = new_state.copy()
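# --- Hedged sketch (editor addition) of how self.grad_norm is typically
# wired in a TF1 graph when it is printed for debugging as above; the
# optimizer argument and clip value are hypothetical. Clipping by global
# norm keeps the single-sample updates used in this loop from exploding.
import tensorflow as tf

def build_clipped_train_op(optimizer, loss, clip_norm=10.0):
    grads_and_vars = optimizer.compute_gradients(loss)
    grads, variables = zip(*grads_and_vars)
    clipped, grad_norm = tf.clip_by_global_norm(list(grads), clip_norm)
    return optimizer.apply_gradients(zip(clipped, variables)), grad_norm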
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team_prev = 0
    score_opp_prev = 0
    num_episodes = 0
    agent = AgentInterface(Config.model_name, self.player)
    agent.set_home(
        int(Config.rows / 2) - int(Config.num_players_per_team / 2) +
        self.player, 2)
    obs = agent.observe_from_server()
    self.i = 0
    state_prev = None
    action_prev = None
    train_episodes_won = 0
    last_print_episode = 0
    last_save_episode = 0
    while ("start", 0) not in obs:
        obs = agent.observe_from_server()
    while True:
        obs = agent.observe_from_server()
        new_cycle = False
        for o in obs:
            if o[0] == "cycle":
                new_cycle = True
                break
        if new_cycle:
            # Periodically report the training win rate and save the model
            # (player 1 does the logging so the stats file is written once).
            if num_episodes % (Config.num_train_episodes +
                               Config.num_eval_episodes) == 0 \
                    and num_episodes > last_print_episode:
                if self.player == 1:
                    print("NUMBER OF TRAINING ITERATIONS: %d" % self.t)
                    last_print_episode = num_episodes
                    print("TRAIN PROPORTION OF EPISODES WON %s" % (
                        float(train_episodes_won) / Config.num_train_episodes))
                    with open(Config.model_name + '.out', 'a') as f:
                        f.write('%s,%s\n' % (
                            self.t,
                            float(train_episodes_won) /
                            Config.num_train_episodes))
                    train_episodes_won = 0
                    if Config.save_model and num_episodes > last_save_episode:
                        self.save()
                        last_save_episode = num_episodes
            if num_episodes % (Config.num_train_episodes +
                               Config.num_eval_episodes
                               ) < Config.num_train_episodes:
                # TRAINING
                state_new = self.get_state(agent, obs)
                if action_prev is not None:
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    score_team_new, score_opp_new = score[0], score[1]
                    # A score change marks an episode boundary.
                    if (score_team_new, score_opp_new) != (score_team_prev,
                                                           score_opp_prev):
                        self.i = 0
                        num_episodes += 1
                        if score_team_new > score_team_prev:
                            train_episodes_won += 1
                    reward_prev = self.reward(state_prev, state_new,
                                              action_prev, score_team_prev,
                                              score_opp_prev, score_team_new,
                                              score_opp_new)
                    loss, _ = self.sess.run(
                        [self.loss, self.train_op],
                        feed_dict={
                            self.s: [state_prev],
                            self.s_: [state_new],
                            self.r: [reward_prev],
                            self.a: [action_prev]
                        })
                    # Sync the target network at episode boundaries.
                    if (score_team_new, score_opp_new) != (score_team_prev,
                                                           score_opp_prev):
                        self.update_target_params()
                    self.t += 1
                    score_team_prev = score_team_new
                    score_opp_prev = score_opp_new
                action_new = self.get_action(state_new, True)[0]
                # (A disabled branch here restarted over-long episodes via
                # agent.send_action("restart", False).)
                if action_new <= 8:
                    agent.send_action("move", action_new)
                else:
                    if action_new - 8 < self.player:
                        agent.send_action("pass", action_new - 8)
                    else:
                        agent.send_action("pass", action_new - 7)
                state_prev = state_new.copy()
                action_prev = action_new
            # else: EVALUATION -- a disabled branch mirrored the training
            # branch above (same score bookkeeping with eval_episodes_won,
            # greedy actions via self.get_action(state_new, False)[0], and
            # no learning update).
            self.i += 1
def train(self):
    """
    Performs training of Q.
    """
    # initialize bookkeeping variables
    score_team_prev = 0
    score_opp_prev = 0
    i = 0
    while True:
        i += 1
        agent = AgentInterface('SMART', self.player)
        agent.set_home(
            int(Config.rows / 2) - int(Config.num_players_per_team / 2) + 1 +
            self.player, 2)
        obs = agent.observe_from_server()
        state_prev = None
        action_prev = None
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
        while True:
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                state_new = self.get_state(agent, obs)
                if action_prev is not None:
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    score_team_new, score_opp_new = score[0], score[1]
                    reward_prev = self.reward(state_prev, state_new,
                                              action_prev, score_team_prev,
                                              score_opp_prev, score_team_new,
                                              score_opp_new)
                    self.episode_opp_goals += score_opp_new - score_opp_prev
                    self.episode_team_goals += score_team_new - score_team_prev
                    score_team_prev = score_team_new
                    score_opp_prev = score_opp_new
                    # Hand the transition to the external learner process.
                    self.write_sars(state_prev, state_new, reward_prev,
                                    action_prev, self.player)
                    if self.t % Config.target_update_freq == 0:
                        self.update_target_params()
                        self.episode_opp_goals = 0
                        self.episode_team_goals = 0
                    # Reload the model if the learner published a new checkpoint.
                    if self.t % Config.model_update_freq == 0 and os.path.exists(
                            'tmp/model.ckpt.data-00000-of-00001'
                    ) and self.last_model_update != os.path.getmtime(
                            'tmp/model.ckpt.data-00000-of-00001'):
                        self.last_model_update = os.path.getmtime(
                            'tmp/model.ckpt.data-00000-of-00001')
                        try:
                            self.saver.restore(self.sess, "tmp/model.ckpt")
                            print('load model successful')
                        except Exception:
                            print('load model failed')
                    self.t += 1
                action_new = self.get_action(state_new)[0]
                if action_new <= 8:
                    agent.send_action("move", action_new)
                else:
                    if action_new - 9 < self.player:
                        agent.send_action("pass", action_new - 9)
                    else:
                        agent.send_action("pass", action_new - 8)
                state_prev = state_new.copy()
                action_prev = action_new
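# --- Hedged sketch (editor addition) of the producer side of the
# asynchronous setup above. write_sars is assumed to hand (s, a, r, s')
# transitions to a separate learner process that trains and periodically
# publishes tmp/model.ckpt, which the loop above then restores. The file
# name and JSON-lines serialisation here are assumptions, not the repo's
# actual format.
import json

def write_sars_sketch(state_prev, state_new, reward_prev, action_prev, player):
    with open("tmp/sars_%d.jsonl" % player, "a") as f:
        f.write(json.dumps({
            "s": list(state_prev),
            "s_": list(state_new),
            "r": float(reward_prev),
            "a": int(action_prev),
        }) + "\n")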