def run_maze_learner(mission, clients):
    if 'malmopy.visualization.tensorboard' in sys.modules:
        visualizer = TensorboardVisualizer()
        visualizer.initialize(logdir, None)
    else:
        visualizer = ConsoleVisualizer()
    # with TensorboardVisualizer() as visualizer:
    env = MazeEnvironment(mission, [str.split(client, ':') for client in clients])
    env.recording = False

    # explorer = LinearEpsilonGreedyExplorer(1, 0.1, 10000)
    # model = DeepQNeuralNetwork((4, 84, 84), (env.available_actions,), momentum=0, visualizer=visualizer)
    # memory = TemporalMemory(50000, model.input_shape[1:], model.input_shape[0], False)
    agent = RandomAgent("rand", 3)
    # DQNAgent("Maze DQN Agent", env.available_actions, model, memory, explorer=explorer,
    #          visualizer=visualizer)
    # exp = SingleAgentExperiment("Malmo Cliff Walking", agent, env, 500000, warm_up_timesteps=500,
    #                             visualizer=visualizer)
    # exp.episode_end += on_episode_end
    # visualizer.initialize(MALMO_MAZE_FOLDER, model, CntkConverter())
    # with Popen(['tensorboard', '--logdir=%s' % path.join(MALMO_MAZE_FOLDER, path.pardir), '--port=6006']):

    EPOCH_SIZE = 250000
    max_training_steps = 50 * EPOCH_SIZE

    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []

    for step in range(1, max_training_steps + 1):
        # action = agent.act(state, reward, agent_done, is_training=True)

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)
        print('ACTION BEING TAKEN: ', action)

        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        # no DQN model is defined while the random agent is used (the DQN setup above
        # is commented out), so the checkpoint call is disabled to avoid a NameError
        # if (step % EPOCH_SIZE) == 0:
        #     model.save('%s-%s-dqn_%d.model' % (backend, environment, step / EPOCH_SIZE))
def agent_factory(name, role, type, clients, max_epochs, logdir, visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)

    builder = PigChaseSymbolicStateBuilder()
    env = PigChaseEnvironment(clients, builder, role=role,
                              randomize_positions=True)

    if role == 0:
        agent = PigChaseChallengeAgent(name)

        obs = env.reset()
        reward = 0
        agent_done = False

        while True:
            if env.done:
                obs = env.reset()

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)

    else:
        if type == 'astar':
            agent = FocusedAgent(name, ENV_TARGET_NAMES[0])
        else:
            agent = RandomAgent(name, env.available_actions)

        obs = env.reset()
        reward = 0
        agent_done = False
        viz_rewards = []

        max_training_steps = EPOCH_SIZE * max_epochs
        for step in range(1, max_training_steps + 1):

            # check if env needs reset
            if env.done:
                visualize_training(visualizer, step, viz_rewards)
                viz_rewards = []
                obs = env.reset()

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)
            viz_rewards.append(reward)

            agent.inject_summaries(step)
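# The training loops above log per-episode statistics through a visualize_training
# helper that is not shown in these snippets. A minimal sketch, assuming only the
# add_entry(step, tag, value) visualizer call already used elsewhere in this code;
# the metric names are illustrative:
def visualize_training(visualizer, step, rewards, tag='Training'):
    # skip empty episodes so min()/max() are never called on an empty list
    if len(rewards) == 0:
        return
    visualizer.add_entry(step, '%s/reward per episode' % tag, sum(rewards))
    visualizer.add_entry(step, '%s/max reward' % tag, max(rewards))
    visualizer.add_entry(step, '%s/min reward' % tag, min(rewards))
    visualizer.add_entry(step, '%s/actions per episode' % tag, len(rewards))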
def __init__(self, name, visualizer=None):
    nb_actions = len(ENV_ACTIONS)
    super(PigChaseChallengeAgent, self).__init__(name, nb_actions,
                                                 visualizer=visualizer)
    self._agents = []
    self._agents.append(FocusedAgent(name, ENV_TARGET_NAMES[0],
                                     visualizer=visualizer))
    self._agents.append(RandomAgent(name, nb_actions,
                                    visualizer=visualizer))
    self.current_agent = self._select_agent(P_FOCUSED)
def __init__(self, name, visualizer=None, p_focused=0.75):
    nb_actions = len(ENV_ACTIONS)
    super(PigChaseChallengeAgent, self).__init__(name, nb_actions,
                                                 visualizer=visualizer)
    self.epi_counter = 0
    self._agents = []
    self._agents.append(
        FocusedAgent(name, ENV_TARGET_NAMES[0], visualizer=visualizer))
    self._agents.append(
        RandomAgent(name, nb_actions, visualizer=visualizer))
    self.p_focused = p_focused
    self.current_agent = self._select_agent(p_focused)
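# Both constructors above delegate to a _select_agent helper that is not shown.
# A minimal sketch, assuming it is a simple weighted choice between the focused
# agent (index 0) and the random agent (index 1), with numpy imported as np:
def _select_agent(self, p_focused):
    # pick the FocusedAgent with probability p_focused, the RandomAgent otherwise
    return self._agents[np.random.choice(2, p=[p_focused, 1.0 - p_focused])]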
def __init__(self, name, visualizer=None, focused=True, random=True,
             bad_guy=True, standstill=False):
    nb_actions = len(ENV_ACTIONS)
    super(ChallengerFactory, self).__init__(name, nb_actions,
                                            visualizer=visualizer)

    # List of possible agents
    self._agents = []
    self._agent_probabilities = []
    self._helmets = []

    print("Allowing challengers:")
    if focused:
        self._agents.append(
            FocusedAgent(name, ENV_TARGET_NAMES[0], visualizer=visualizer))
        self._agent_probabilities.append(FocusedAgentWeight)
        self._helmets.append(ChallengerFactory.AGENT_TYPE[FocusedAgent])
        print(" FocusedAgent")
    if random:
        self._agents.append(
            RandomAgent(name, nb_actions, visualizer=visualizer))
        self._agent_probabilities.append(RandomAgentWeight)
        self._helmets.append(ChallengerFactory.AGENT_TYPE[RandomAgent])
        print(" RandomAgent")
    if bad_guy:
        self._agents.append(BadGuy(name, visualizer=visualizer))
        self._agent_probabilities.append(BadGuyWeight)
        self._helmets.append(ChallengerFactory.AGENT_TYPE[BadGuy])
        print(" BadGuy")
    if standstill:
        self._agents.append(StandstillAgent(name, visualizer=visualizer))
        self._agent_probabilities.append(StandstillAgentWeight)
        self._helmets.append(ChallengerFactory.AGENT_TYPE[StandstillAgent])
        print(" StandstillAgent")

    # Select first agent (normalize the weights into probabilities first)
    n = sum(self._agent_probabilities)
    self._agent_probabilities = [
        item / n for item in self._agent_probabilities
    ]
    self.current_agent = self._select_agent(self._agent_probabilities)
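# The factory above also relies on a class-level AGENT_TYPE mapping and on passing
# a full probability list to _select_agent; neither is shown. A minimal sketch,
# assuming AGENT_TYPE just maps each challenger class to an integer "helmet" id used
# to mark the challenger in-game (the ids here are placeholders) and that selection
# is a weighted draw over self._agents (numpy imported as np):
AGENT_TYPE = {FocusedAgent: 0, RandomAgent: 1, BadGuy: 2, StandstillAgent: 3}

def _select_agent(self, probabilities):
    # weighted draw over the configured challengers
    return self._agents[np.random.choice(len(self._agents), p=probabilities)]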
def __init__(self, name, visualizer=None, p_focused_new=0.7):
    # fall back to the default P_FOCUSED when no override is given
    # (the original branches were inverted and always discarded the argument)
    if p_focused_new is None:
        self.p_focused = P_FOCUSED
    else:
        self.p_focused = p_focused_new
    nb_actions = len(ENV_ACTIONS)
    super(PigChaseChallengeAgent, self).__init__(name, nb_actions,
                                                 visualizer=visualizer)
    self._agents = []
    self._agents.append(
        FocusedAgent(name, ENV_TARGET_NAMES[0], visualizer=visualizer))
    self._agents.append(
        RandomAgent(name, nb_actions, visualizer=visualizer))
    self.current_agent = self._select_agent(self.p_focused)
def run_maze_learner(mission, clients):
    if 'malmopy.visualization.tensorboard' in sys.modules:
        visualizer = TensorboardVisualizer()
        visualizer.initialize(logdir, None)
    else:
        visualizer = ConsoleVisualizer()

    env = MazeEnvironment(mission, [str.split(client, ':') for client in clients])
    env.recording = False

    agent = RandomAgent("rand", 3, delay_between_action=1.5)  # taking random actions

    EPOCH_SIZE = 250000
    max_training_steps = 50 * EPOCH_SIZE

    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []

    for step in range(1, max_training_steps + 1):
        # action = agent.act(state, reward, agent_done, is_training=True)  # redundant: the action is selected again below

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)
        print('ACTION BEING TAKEN: ', action)

        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        # no DQN model is trained here (the agent is random), so there is nothing to
        # checkpoint; the original save call referenced an undefined `model` and is disabled
        # if (step % EPOCH_SIZE) == 0:
        #     model.save('%s-%s-dqn_%d.model' % (backend, environment, step / EPOCH_SIZE))
def agent_factory(name, role, type, clients, max_epochs, logdir, visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)

    builder = PigChaseSymbolicStateBuilder()
    env = PigChaseEnvironment(clients, builder, role=role,
                              randomize_positions=True)

    if role == 0:
        agent1 = FocusedAgent(name, ENV_TARGET_NAMES[0])
        agent2 = RandomAgent(name, env.available_actions)
        agent3 = BadAgent(name)
        agent_list = [agent1, agent2, agent3]  # three types of agents
        agent = agent1

        obs = env.reset()
        reward = 0
        agent_done = False

        max_training_steps = EPOCH_SIZE * max_epochs
        epoch = 0
        for step in range(1, max_training_steps + 1):
            if env.done:
                obs = env.reset()
                epoch += 1
                agent = agent_list[epoch // 10 % 3]  # switch agent every 10 episodes

            # select an action
            action = agent.act(obs, reward, agent_done)
            # take a step
            obs, reward, agent_done = env.do(action)

    else:
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            agent1 = BayesAgent(name, ENV_TARGET_NAMES[0], 'Agent_1', True, sess)
            agent2 = RandomAgent(name, env.available_actions)
            agent3 = BadAgent(name)
            agent4 = FocusedAgent(name, ENV_TARGET_NAMES[0])
            if not agent1.save:
                sess.run(tf.global_variables_initializer())
                print("Initialize")
            agent_list = [agent1, agent2, agent3, agent4]  # four types of agents
            agent = agent1

            obs = env.reset()
            agent1.reset(obs)
            reward = 0
            agent_done = False
            viz_rewards = []
            avg = []
            epoch = 0
            s = 1

            max_training_steps = EPOCH_SIZE * max_epochs
            for step in range(1, max_training_steps + 1):

                # check if env needs reset
                if agent_done:
                    obs = env.reset()
                    agent1.reset(obs)
                    avg.append(sum(viz_rewards))
                    print("Epoch: %d, cumulative rewards: %d" % (epoch, sum(viz_rewards)))
                    visualize_training(visualizer, step, viz_rewards)
                    viz_rewards = []
                    epoch += 1
                    agent = agent_list[epoch // 5 % 4]  # switch collaborator every 5 episodes
                    if epoch % 10 == 0:
                        agent1.reset_collaborator()
                    s = 1

                # select an action
                action = agent.act(obs, reward, agent_done, is_training=True)
                # take a step
                next_obs, reward, agent_done = env.do(action)
                agent1.collecting(obs, action, reward, next_obs, agent_done, s)
                s += 1
                obs = next_obs
                viz_rewards.append(reward)
                if step % 100 == 0:
                    agent1.save_replay_buffer()
                # agent1.inject_summaries(step)

            print("Average Reward: ", 1. * sum(avg) / len(avg))
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ===================================================================================================================

from common import ENV_AGENT_NAMES
from evaluation import PigChaseEvaluator
from environment import PigChaseTopDownStateBuilder
from malmopy.agent import RandomAgent

if __name__ == '__main__':
    # Warn for Agent name !!!
    clients = [('127.0.0.1', 10000), ('127.0.0.1', 10001)]
    agent = RandomAgent(ENV_AGENT_NAMES[1], 3)

    eval = PigChaseEvaluator(clients, agent, agent, PigChaseTopDownStateBuilder())
    eval.run()
    eval.save('My Exp 1', 'pig_chase_results.json')
def agent_factory(name, role, baseline_agent, clients, max_epochs,
                  logdir, visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)
    batch_size = 32

    builder = PigChaseSymbolicStateBuilder()
    env = PigChaseEnvironment(clients, builder, role=role,
                              randomize_positions=True)

    if role == 0:
        agent = PigChaseChallengeAgent(name)
        if type(agent.current_agent) == RandomAgent:
            agent_type = PigChaseEnvironment.AGENT_TYPE_1
        else:
            agent_type = PigChaseEnvironment.AGENT_TYPE_2

        ## The state would need to be reshaped here into the form the neural network expects
        state = env.reset(agent_type)
        reward = 0
        agent_done = False
        num_actions = 0

        while True:
            # take a step
            # reset if needed
            if env.done:
                print(agent.check_memory(batch_size))
                if type(agent.current_agent) == RandomAgent:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_1
                else:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_2
                ## The state would need to be adapted again here
                if num_actions > batch_size:
                    print('Entering replay 1')
                    agent.replay(batch_size)
                state = env.reset(agent_type)

            # select an action
            # print('Role 1 action')
            action = agent.act(state, reward, agent_done, is_training=True)
            next_state, reward, agent_done = env.do(action)
            num_actions = num_actions + 1
            next_state2 = adapt_state(next_state)
            agent.remember(state, action, reward, next_state2, agent_done)
            ## Here state = obs (the previous, already-adapted state)
            state = next_state
            ## Not sure whether this belongs here because of the while True loop (unclear when it ends);
            ## it should run when a game finishes.
            ## Check whether the replay actually runs. If it never does, move the replay call inside the
            ## if env.done block (env.done means one episode has finished and another starts, so it should be fine there)

    else:
        if baseline_agent == 'astar':
            agent = FocusedAgent(name, ENV_TARGET_NAMES[0])
        else:
            agent = RandomAgent(name, env.available_actions)

        state = env.reset()
        reward = 0
        agent_done = False
        viz_rewards = []

        max_training_steps = EPOCH_SIZE * max_epochs
        for step in six.moves.range(1, max_training_steps + 1):

            # check if env needs reset
            if env.done:
                visualize_training(visualizer, step, viz_rewards)
                viz_rewards = []
                ## Not sure whether this also needs to happen here; check
                if agent.check_memory(batch_size) > batch_size:
                    print('Entering replay 2')
                    agent.replay(batch_size)
                state = env.reset()

            # select an action
            # print('Role 2 action')
            action = agent.act(state, reward, agent_done, is_training=True)

            # take a step
            next_state, reward, agent_done = env.do(action)
            next_state2 = adapt_state(next_state)
            agent.remember(state, action, reward, next_state2, agent_done)
            ## Here state = obs (the previous, already-adapted state)
            state = next_state
            # obs, reward, agent_done = env.do(action)
            viz_rewards.append(reward)

            agent.inject_summaries(step)
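# adapt_state is used above but never defined. A minimal sketch under the assumption
# that the symbolic observation is a (board, entities) pair, that each entity carries
# 'x', 'z' and 'yaw' fields, and that the network only needs a flat numeric vector;
# the exact encoding is illustrative, not the project's actual one:
import numpy as np

def adapt_state(obs):
    if obs is None:
        return None
    board, entities = obs
    features = []
    for entity in entities:
        # coordinates and heading of every entity on the board (field names assumed)
        features.extend([entity.get('x', 0), entity.get('z', 0), entity.get('yaw', 0)])
    return np.array(features, dtype=np.float32).reshape(1, -1)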
def agent_factory(name, role, baseline_agent, clients, max_epochs,
                  logdir, visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)

    builder = PigChaseSymbolicStateBuilder()
    env = PigChaseEnvironment(clients, builder, role=role,
                              randomize_positions=True)

    if role == 0:
        agent = PigChaseChallengeAgent(name)
        obs = env.reset(get_agent_type(agent))

        reward = 0
        agent_done = False

        while True:
            if env.done:
                while True:
                    obs = env.reset(get_agent_type(agent))
                    if obs:
                        break

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)

            # reset if needed
            if env.done:
                obs = env.reset(get_agent_type(agent))

            # take a step
            obs, reward, agent_done = env.do(action)

    else:
        if baseline_agent == 'tabq':
            agent = TabularQLearnerAgent(name, visualizer)
        elif baseline_agent == 'astar':
            agent = FocusedAgent(name, ENV_TARGET_NAMES[0])
        else:
            agent = RandomAgent(name, env.available_actions)

        obs = env.reset()
        reward = 0
        agent_done = False
        viz_rewards = []

        max_training_steps = EPOCH_SIZE * max_epochs
        for step in six.moves.range(1, max_training_steps + 1):

            # check if env needs reset
            if env.done:
                while True:
                    if len(viz_rewards) == 0:
                        viz_rewards.append(0)
                    visualize_training(visualizer, step, viz_rewards)

                    tag = "Episode End Conditions"
                    visualizer.add_entry(
                        step, '%s/timeouts per episode' % tag,
                        env.end_result == "command_quota_reached")
                    visualizer.add_entry(
                        step, '%s/agent_1 defaults per episode' % tag,
                        env.end_result == "Agent_1_defaulted")
                    visualizer.add_entry(
                        step, '%s/agent_2 defaults per episode' % tag,
                        env.end_result == "Agent_2_defaulted")
                    visualizer.add_entry(
                        step, '%s/pig caught per episode' % tag,
                        env.end_result == "caught_the_pig")

                    agent.inject_summaries(step)
                    viz_rewards = []
                    obs = env.reset()
                    if obs:
                        break

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)
            viz_rewards.append(reward)
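# get_agent_type is called above but not shown. A minimal sketch, assuming it only
# reports whether the challenge agent is currently acting randomly or following the
# focused A* policy, mirroring the explicit check used in the DQN variant earlier:
def get_agent_type(agent):
    if type(agent.current_agent) == RandomAgent:
        return PigChaseEnvironment.AGENT_TYPE_1
    return PigChaseEnvironment.AGENT_TYPE_2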