def mp_explore(args, pipe2_exp, worker_id):
    # Rollout worker: explores the environment and exchanges data with the learner over pipe2_exp.
    args.init_before_training(if_main=False)

    '''basic arguments'''
    env = args.env
    agent = args.agent
    rollout_num = args.rollout_num

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    gamma = args.gamma
    reward_scale = args.reward_scale

    random_seed = args.random_seed
    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer'''
    agent.init(net_dim, state_dim, action_dim)
    agent.state = env.reset()

    if_on_policy = getattr(agent, 'if_on_policy', False)
    buffer = ReplayBuffer(max_len=max_memo // rollout_num + max_step, if_on_policy=if_on_policy,
                          state_dim=state_dim, action_dim=1 if if_discrete else action_dim,
                          if_gpu=False)

    '''start exploring'''
    exp_step = target_step // rollout_num
    with torch.no_grad():
        if not if_on_policy:
            explore_before_training(env, buffer, exp_step, reward_scale, gamma)
            buffer.update_now_len_before_sample()

            pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))
            # buf_state, buf_other = pipe1_exp.recv()

            buffer.empty_buffer_before_explore()

        while True:
            agent.explore_env(env, buffer, exp_step, reward_scale, gamma)
            buffer.update_now_len_before_sample()

            pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))
            # buf_state, buf_other = pipe1_exp.recv()

            buffer.empty_buffer_before_explore()

            # pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()

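For context, a minimal sketch of the learner-side counterpart of this pipe protocol, mirroring the commented-out pipe1_exp lines above. The mp_learn name and the build_agent_and_buffer helper are hypothetical, and extend_buffer is an assumed ReplayBuffer method; the initial off-policy pre-fill recv is omitted for brevity.

def mp_learn(args, pipe1_exps):
    # Hypothetical learner loop: gather transitions from each rollout worker,
    # update the networks, then broadcast the refreshed actor back.
    agent, buffer = build_agent_and_buffer(args)  # assumed helper, not shown here
    while True:
        for pipe1_exp in pipe1_exps:                    # one pipe end per rollout worker
            buf_state, buf_other = pipe1_exp.recv()     # transitions pushed by mp_explore()
            buffer.extend_buffer(buf_state, buf_other)  # assumed ReplayBuffer method

        buffer.update_now_len_before_sample()
        agent.update_net(buffer, args.target_step, args.batch_size, args.repeat_times)

        for pipe1_exp in pipe1_exps:
            pipe1_exp.send(agent.act)  # matches 'agent.act = pipe2_exp.recv()' in the worker
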
def __init__(self, num_agents, state_size, action_size):
    self.agents = []
    for i in range(num_agents):
        self.agents.append(Agent(state_size, action_size))
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

def __init__(self, num_agents, state_size, action_size, random_seed):
    self.num_agents = num_agents
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.agents = [
        Agent(state_size, action_size, self.memory, BATCH_SIZE, random_seed)
        for _ in range(num_agents)
    ]

def mp_explore_in_env(args, pipe2_exp, worker_id):
    env = args.env
    reward_scale = args.reward_scale
    gamma = args.gamma
    random_seed = args.random_seed

    agent_rl = args.agent_rl
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    rollout_num = args.rollout_num
    del args

    torch.manual_seed(random_seed + worker_id)
    np.random.seed(random_seed + worker_id)

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''build agent'''
    agent = agent_rl(state_dim, action_dim, net_dim)  # training agent
    agent.state = env.reset()
    # agent.device = torch.device('cpu')
    # env_cpu--act_cpu a little faster than env_cpu--act_gpu, but high cpu-util

    '''build replay buffer, init: total_step, reward_avg'''
    if_on_policy = bool(agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO', 'AgentInterPPO'})
    buffer = ReplayBuffer(max_memo // rollout_num + max_step, state_dim, if_on_policy=if_on_policy,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    exp_step = target_step // rollout_num
    with torch.no_grad():
        while True:
            # pipe1_exp.send(agent.act)
            agent.act = pipe2_exp.recv()

            agent.update_buffer(env, buffer, exp_step, reward_scale, gamma)

            buffer.update__now_len__before_sample()
            pipe2_exp.send((buffer.buf_state[:buffer.now_len], buffer.buf_other[:buffer.now_len]))

def __init__(self, num_agents=2, state_size=24, action_size=2):
    """Initialize a maddpg_agent wrapper.

    Params
    ======
        num_agents (int): the number of agents in the environment
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.agents = [
        ddpg(state_size, action_size, i + 1, random_seed=0)
        for i in range(num_agents)
    ]

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

class MultiAgent:
    def __init__(self, num_agents, state_size, action_size):
        self.agents = []
        for i in range(num_agents):
            self.agents.append(Agent(state_size, action_size))
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

    def step(self, states, actions, rewards, next_states, done):
        # store one transition per agent in the shared replay buffer
        for i in range(len(states)):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], done)

        if len(self.memory) > BATCH_SIZE:
            experiences, indexes = self.memory.sample()
            for agent in self.agents:
                error = agent.learn(experiences, GAMMA)
                # update priority replay memory with the new TD error
                self.memory.update(indexes, abs(error))

    def act(self, states, add_noise=True, noise_weight=1.0):
        actions = []
        for i in range(len(self.agents)):
            actions.append(self.agents[i].act(states[i], add_noise, noise_weight))
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

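A minimal usage sketch for the wrapper above, assuming a hypothetical multi-agent environment multi_env whose reset() returns one state row per agent and whose step() returns (next_states, rewards, done); the MultiAgent/Agent API is the one defined above.

multi_agent = MultiAgent(num_agents=2, state_size=24, action_size=2)
for episode in range(1000):
    states = multi_env.reset()           # assumed: shape (num_agents, state_size)
    multi_agent.reset()                  # reset each agent's exploration noise
    done = False
    while not done:
        actions = multi_agent.act(states, add_noise=True)
        next_states, rewards, done = multi_env.step(actions)  # assumed env signature
        multi_agent.step(states, actions, rewards, next_states, done)
        states = next_states
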
def __init__(self, action_size, state_size, shared_replay_buffer, num_agents):
    self.shared_replay_buffer = shared_replay_buffer
    memory_fn = lambda: ReplayBuffer(action_size, int(1e6), BATCH_SIZE, SEED, DEVICE)

    memory = None
    if shared_replay_buffer:
        self.memory = memory_fn()
        memory = self.memory

    self.ddpg_agents = [
        DDPGAgent(action_size, state_size, shared_replay_buffer, memory)
        for _ in range(num_agents)
    ]
    self.t_step = 0

def __init__(self, num_agents, state_size, action_size, random_seed=0,
             discount_factor=0.95, tau=0.02):
    super(MADDPG, self).__init__()

    # self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.memories = [
        ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        for _ in range(num_agents)
    ]
    self.agents = [
        Agent(state_size, action_size, random_seed)
        for _ in range(num_agents)
    ]

    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0

def train_and_evaluate(args):
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # passed to Evaluator as agent_id
    env_eval = args.env_eval

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    gamma = args.gamma
    reward_scale = args.reward_scale

    '''evaluating arguments'''
    show_gap = args.show_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step, if_on_policy=if_on_policy, if_gpu=True,
                          state_dim=state_dim, action_dim=1 if if_discrete else action_dim)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator

    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        agent.act_target.load_state_dict(agent.act.state_dict()) if getattr(agent, 'act_target', None) else None
        agent.cri_target.load_state_dict(agent.cri.state_dict()) if getattr(agent, 'cri_target', None) else None
    total_step = steps

    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.explore_env(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        with torch.no_grad():  # speed up running
            if_reach_goal = evaluator.evaluate_save(agent.act, steps, obj_a, obj_c)

def train_and_evaluate(args):
    args.init_before_training()

    cwd = args.cwd
    env = args.env
    env_eval = args.env_eval
    agent_id = args.gpu_id
    agent_rl = args.agent_rl  # basic arguments

    gamma = args.gamma  # training arguments
    net_dim = args.net_dim
    max_memo = args.max_memo
    target_step = args.target_step
    batch_size = args.batch_size
    repeat_times = args.repeat_times
    reward_scale = args.reward_scale
    if_per = args.if_per

    show_gap = args.show_gap  # evaluate arguments
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    break_step = args.break_step
    if_break_early = args.if_break_early
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: env'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    max_step = env.max_step

    '''init: Agent, Evaluator, ReplayBuffer'''
    agent = agent_rl(net_dim, state_dim, action_dim)  # build AgentRL
    agent.state = env.reset()

    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=agent.device, env=env_eval,
                          eval_times1=eval_times1, eval_times2=eval_times2,
                          show_gap=show_gap)  # build Evaluator

    if_on_policy = agent_rl.__name__ in {'AgentPPO', 'AgentGaePPO'}
    buffer = ReplayBuffer(max_memo + max_step, state_dim, if_on_policy=if_on_policy, if_per=if_per,
                          action_dim=1 if if_discrete else action_dim)  # build experience replay buffer

    if if_on_policy:
        steps = 0
    else:
        with torch.no_grad():  # update replay buffer
            steps = _explore_before_train(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        agent.act_target.load_state_dict(agent.act.state_dict()) if 'act_target' in dir(agent) else None
    total_step = steps

    if_solve = False
    while not ((if_break_early and if_solve)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        with torch.no_grad():  # speed up running
            steps = agent.update_buffer(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        obj_a, obj_c = agent.update_net(buffer, target_step, batch_size, repeat_times)

        with torch.no_grad():  # speed up running
            if_solve = evaluator.evaluate_act__save_checkpoint(agent.act, steps, obj_a, obj_c)

args = parse_arguments()

# Setup RNG
if args.seed:
    print('torch RNG using seed:', args.seed)
    seed = torch.manual_seed(args.seed)

# Setup Environment
env = UnityEnvironment(file_name="Banana.app", no_graphics=args.no_render)
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# Setup Agent and Experience Replay Buffer
state_size = brain.vector_observation_space_size
action_size = brain.vector_action_space_size
memory = ReplayBuffer(action_size, args.memory, args.batch_size)
agent = BananAgent(state_size, action_size, memory=memory,
                   checkpoint_filename=args.checkpoint
                   if args.checkpoint and os.path.exists(args.checkpoint) else None)
print('Number of actions:', action_size)
print('State Features:', state_size)

# Retrieve hyperparameters
epsilon = args.eps
epsilon_decay = args.eps_decay
epsilon_mininum = args.eps_min
evaluate = args.evaluate
if evaluate:
    print('Running in evaluation mode!')

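A hedged sketch of the training loop that would typically follow this setup, using the legacy UnityEnvironment brain API shown above; args.episodes and the BananAgent.act/step signatures are assumptions, not part of the snippet.

scores = []
for episode in range(1, args.episodes + 1):                  # args.episodes: assumed CLI argument
    env_info = env.reset(train_mode=not evaluate)[brain_name]
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = agent.act(state, epsilon)                   # epsilon-greedy action (assumed signature)
        env_info = env.step(action)[brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        if not evaluate:
            agent.step(state, action, reward, next_state, done)  # store transition and learn (assumed)
        state = next_state
        score += reward
        if done:
            break
    scores.append(score)
    epsilon = max(epsilon * epsilon_decay, epsilon_mininum)  # decay exploration toward the configured minimum
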
def train_and_evaluate(args):
    args.init_before_training()

    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id  # passed to Evaluator as agent_id

    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    if_break_early = args.if_allow_break
    if_per = args.if_per
    gamma = args.gamma
    reward_scale = args.reward_scale

    '''evaluating arguments'''
    eval_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if args.env_eval is not None:
        env_eval = args.env_eval
    elif env.env_name in set(gym.envs.registry.env_specs.keys()):
        env_eval = PreprocessEnv(gym.make(env.env_name))
    else:
        env_eval = deepcopy(env)
    del args  # In order to show these hyper-parameters clearly, I put them above.

    '''init: environment'''
    max_step = env.max_step
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete

    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, if_per)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = ReplayBuffer(max_len=max_memo + max_step, state_dim=state_dim,
                          action_dim=1 if if_discrete else action_dim,
                          if_on_policy=if_on_policy, if_per=if_per, if_gpu=True)

    evaluator = Evaluator(cwd=cwd, agent_id=gpu_id, device=agent.device, env=env_eval,
                          eval_gap=eval_gap, eval_times1=eval_times1, eval_times2=eval_times2)

    '''prepare for training'''
    agent.state = env.reset()
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            steps = explore_before_training(env, buffer, target_step, reward_scale, gamma)
        agent.update_net(buffer, target_step, batch_size, repeat_times)  # pre-training and hard update
        agent.act_target.load_state_dict(agent.act.state_dict()) if getattr(agent, 'act_target', None) else None
        agent.cri_target.load_state_dict(agent.cri.state_dict()) if getattr(agent, 'cri_target', None) else None
    total_step = steps

    '''start training'''
    if_reach_goal = False
    while not ((if_break_early and if_reach_goal)
               or total_step > break_step
               or os.path.exists(f'{cwd}/stop')):
        steps = agent.explore_env(env, buffer, target_step, reward_scale, gamma)
        total_step += steps

        train_record = agent.update_net(buffer, target_step, batch_size, repeat_times)

        if_reach_goal = evaluator.evaluate_save(agent.act, steps, train_record)

    evaluator.draw_plot()
    print(f'| SavedDir: {cwd}\n'
          f'| UsedTime: {time.time() - evaluator.start_time:.0f}')

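For orientation, a minimal invocation sketch of train_and_evaluate(), assuming the Arguments and PreprocessEnv helpers these snippets rely on; the constructor parameters, the AgentSAC name, and the hyper-parameter values are assumptions, not confirmed by the code above.

import gym

args = Arguments(if_on_policy=False)                   # assumed constructor signature
args.agent = AgentSAC()                                # any off-policy agent exposing init/explore_env/update_net
args.env = PreprocessEnv(env=gym.make('Pendulum-v0'))
args.reward_scale = 2 ** -3                            # demo values, not tuned settings
args.net_dim = 2 ** 7
train_and_evaluate(args)
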
class maddpg:
    """Wrapper class managing different agents in the environment."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.

        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.agents = [
            ddpg(state_size, action_size, i + 1, random_seed=0)
            for i in range(num_agents)
        ]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False):
        """Picks an action for each agent given its observation."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory and learn every LEARNING_PERIOD timesteps."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)

    def learn(self, experiences, agent_number):
        """
        The critic takes as its input the combined observations and actions
        from all agents. Collect actions from each agent for the 'experiences'.
        """
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)

    def save_weights(self, dir):
        for i in range(self.num_agents):
            torch.save(self.agents[i].actor_local.state_dict(),
                       os.path.join(dir, 'checkpoint_actor_{}.pth'.format(i)))
            torch.save(self.agents[i].critic_local.state_dict(),
                       os.path.join(dir, 'checkpoint_critic_{}.pth'.format(i)))

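A natural counterpart to save_weights(), sketched as an extra method under the assumption that checkpoints follow the per-agent file layout written above; load_weights is not part of the original class.

    def load_weights(self, dir):
        """Load each agent's actor/critic weights saved by save_weights()."""
        for i in range(self.num_agents):
            self.agents[i].actor_local.load_state_dict(
                torch.load(os.path.join(dir, 'checkpoint_actor_{}.pth'.format(i))))
            self.agents[i].critic_local.load_state_dict(
                torch.load(os.path.join(dir, 'checkpoint_critic_{}.pth'.format(i))))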