# Shared imports for the scripts below. The agent classes (DDPGAgent,
# D4PGAgent), the EnvWrapper/Config/set_config helpers and the hyperparameter
# constants (BATCH_SIZE, LR_ACTOR, LR_CRITIC) are defined elsewhere in the
# repository.
import time
from collections import deque

import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from unityagents import UnityEnvironment


def train_agent(episodes=100, model='DDPG', print_every=10):
    """Train a DDPG or D4PG agent on the Reacher environment."""
    if model.lower() == 'd4pg':
        agent = D4PGAgent()
        print('Using D4PG agent...\n')
    else:
        agent = DDPGAgent()
        print('Using default DDPG agent...\n')
    print('Batch size:', BATCH_SIZE)
    print('Actor learning rate:', LR_ACTOR)
    print('Critic learning rate:', LR_CRITIC)
    print('\n')
    # raw string so the backslash in the Windows path is not treated as an
    # escape sequence
    env = EnvWrapper(file_name=r'Reacher_Windows_x86_64\Reacher.exe',
                     train_mode=True)
    scores = []
    scores_window = deque(maxlen=100)
    for ep in range(1, episodes + 1):
        agent.reset()
        agent.states = env.reset()
        for _ in range(agent.max_steps):
            agent.actions = agent.act(add_noise=True)
            agent.rewards, agent.next_states, agent.dones = env.step(agent.actions)
            agent.step()
            agent.states = agent.next_states
        scores.append(agent.scores.mean())
        scores_window.append(agent.scores.mean())
        if ep % print_every == 0:
            print('Episode %d, avg score: %.2f' % (ep, agent.scores.mean()))
        if np.mean(scores_window) >= 30:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                  .format(ep - 100, np.mean(scores_window)))
            torch.save(agent.actor.state_dict(),
                       'checkpoints/reacher_%s_actor_checkpoint.pth' % model)
            torch.save(agent.critic.state_dict(),
                       'checkpoints/reacher_%s_critic_checkpoint.pth' % model)
            break  # stop once the environment is solved
    env.close()
    return scores, agent
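
# Usage sketch (hypothetical driver, not from the original repo): train with
# the defaults and plot the per-episode mean score. Assumes matplotlib is
# installed; everything else is defined above.
def plot_training(model='DDPG'):
    import matplotlib.pyplot as plt

    scores, _agent = train_agent(episodes=100, model=model, print_every=10)
    plt.plot(range(1, len(scores) + 1), scores)
    plt.xlabel('Episode')
    plt.ylabel('Mean score across agents')
    plt.title('%s on Reacher' % model)
    plt.savefig('reacher_%s_scores.png' % model.lower())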
def run():
    env = gym.make('Pendulum-v0')
    seed = 30
    env.seed(seed)
    agent = DDPGAgent(seed=seed,
                      n_state=env.observation_space.shape[0],
                      n_action=env.action_space.shape[0])
    # Alternative agent construction kept from the original, commented out:
    # agent = Agent(state_size=env.observation_space.shape[0],
    #               action_size=env.action_space.shape[0],
    #               random_seed=seed)
    episodes_n = 2000
    steps_max = 300
    scores = []
    print_every = 100
    scores_deque = deque(maxlen=print_every)
    # range's upper bound is exclusive, so use episodes_n + 1 to actually
    # run episodes_n episodes
    for i_episode in range(1, episodes_n + 1):
        state = env.reset()
        agent.reset()
        score = 0
        done_step = 0
        for step in range(steps_max):
            action = agent.act(state)
            state_next, reward, done, info = env.step(action)
            agent.step(state, action, reward, state_next, done)
            state = state_next
            score += reward
            done_step += 1
            if done:
                break
        scores.append(score)
        scores_deque.append(score)
        print_line(i_episode, scores_deque, end="")
        if i_episode % print_every == 0:
            print_line(i_episode, scores_deque, end="\n")
    return scores
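
# print_line is called above but not defined in this snippet. A minimal
# sketch of what it presumably does (hypothetical helper; the real repo may
# format the line differently): overwrite the current console line with the
# rolling average of the recent scores.
def print_line(i_episode, scores_deque, end="\n"):
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_deque)), end=end)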
def main(path=''):
    """Show the environment controlled by the 20 smart agents.

    Args:
        path: (string) directory prefix for the saved network weights
    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64',
                           no_graphics=False)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    config = Config()
    config.n_agents = num_agents
    config.gamma = 0.99
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    config.seed = 42
    config.leak = 0.001
    config.tau = 1e-3
    config.hdl1 = 256
    config.hdl2 = 128
    config.hdl3 = 128
    config.lr_actor = 0.001
    config.lr_critic = 0.001
    config.batch_size = 1024
    config.weight_decay = 0.99
    config.memory_capacity = int(1e6)
    agent = DDPGAgent(config)
    agent.actor_local.load_state_dict(torch.load(path + 'checkpoint_actor.pth'))
    agent.critic_local.load_state_dict(torch.load(path + 'checkpoint_critic.pth'))
    # watch three episodes with the trained agent
    for _ in range(3):
        episode_reward = []
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        agent.reset_noise()
        total_steps = 0
        while True:
            total_steps += 1
            # act greedily: epsilon 0, exploration noise disabled
            actions = agent.act(states, 0, False)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = np.array(env_info.local_done)
            episode_reward.append(np.mean(rewards))
            scores += rewards
            states = next_states
            if np.any(dones):
                print('total steps:', total_steps)
                print('episode reward:', sum(episode_reward))
                print('average:', np.mean(scores))
                print('min:', np.min(episode_reward))
                print('max:', np.max(episode_reward))
                break
    env.close()
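
# Config is used above and again in main(arg) below as a plain attribute bag.
# If the repository does not provide one, a minimal stand-in could look like
# this (hypothetical; the real class may validate fields or set defaults):
class Config:
    """Namespace for agent hyperparameters; attributes are assigned ad hoc."""
    pass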
def run(env, device, episodes, experiment_name, update_rate, action_size,
        state_size, brain_name, epsilon_start=1.0, epsilon_min=0.05,
        epsilon_decay=0.995, max_score=30., num_agents=1):
    epsilon = epsilon_start
    agent = DDPGAgent(state_space=state_size,
                      action_space=action_size,
                      buffer_size=int(1e5),
                      batch_size=512,
                      learning_rate_actor=0.001,
                      learning_rate_critic=0.001,
                      update_rate=update_rate,
                      gamma=0.995,
                      tau=0.001,
                      device=device,
                      seed=5,
                      num_agents=num_agents)
    score_window = deque(maxlen=100)
    all_scores = []
    tb_writer = SummaryWriter('{}/{}'.format('logs', experiment_name))
    for episode in range(episodes):
        agent.reset()
        scores = np.zeros(num_agents)
        dones = np.zeros(num_agents, dtype=bool)
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        while not np.any(dones):
            actions = agent.act(states, epsilon)
            actions = np.clip(actions, -1, 1)         # all actions between -1 and 1
            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                # get reward (for each agent)
            dones = env_info.local_done               # see if episode finished
            scores += env_info.rewards                # update the score (for each agent)
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agent.step(state, action, reward, next_state, done)
            states = next_states
        episode_score = np.mean(scores)
        score_window.append(episode_score)
        all_scores.append(episode_score)
        # decay exploration noise every 10 episodes
        if episode % 10 == 0:
            epsilon = max(epsilon_min, epsilon * epsilon_decay)
        print('\rEpisode: {}\tAverage Score: {:.2f}'.format(
            episode, np.mean(score_window)), end="")
        if episode % 100 == 0:
            tb_writer.add_scalar('Episode_Accum_score',
                                 np.mean(score_window), episode)
            print('\rEpisode: {}\tAverage Score: {:.2f}'.format(
                episode, np.mean(score_window)))
        if np.mean(score_window) >= max_score:
            torch.save(agent.actor_local_network.state_dict(),
                       'actor_checkpoint_{}.pth'.format(experiment_name))
            torch.save(agent.critic_local_network.state_dict(),
                       'critic_checkpoint_{}.pth'.format(experiment_name))
            break
    tb_writer.close()
    return all_scores
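
# Hypothetical driver for run() above, sketching how the Unity environment
# and the state/action dimensions would be wired up. The file path, episode
# count, update_rate and experiment name are assumptions, not values from
# the original repo.
def launch(file_name='Reacher_Linux/Reacher.x86_64'):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    env = UnityEnvironment(file_name=file_name, no_graphics=True)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    scores = run(env, device, episodes=500, experiment_name='ddpg_reacher',
                 update_rate=4,
                 action_size=brain.vector_action_space_size,
                 state_size=env_info.vector_observations.shape[1],
                 brain_name=brain_name,
                 num_agents=len(env_info.agents))
    env.close()
    return scores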
def main(arg):
    """Train a DDPG agent on the Reacher environment.

    Args:
        arg: parsed command-line arguments holding the hyperparameters
    """
    env = UnityEnvironment(file_name='Reacher_Linux/Reacher.x86_64',
                           no_graphics=True, seed=arg.seed)
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    states = env_info.vector_observations
    print('Size of each action:', brain.vector_action_space_size)
    print('Size of each state:', states.shape[1])
    epsilon = arg.epsilon
    epsilon_min = arg.epsilon_min
    epsilon_decay = arg.epsilon_decay
    config = Config()
    config.state_dim = states.shape[1]
    config.action_dim = brain.vector_action_space_size
    config.n_agents = num_agents
    set_config(config, arg)
    t_0 = time.time()
    n_episodes = arg.n_episodes
    train_every = arg.train_every
    scores_window = deque(maxlen=100)  # last 100 scores
    agent = DDPGAgent(config)
    scores = []
    print('Start training')
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations  # get the current state (for each agent)
        agent.reset_noise()
        episode_reward = np.zeros(num_agents)
        for t in range(arg.t_max):
            actions = agent.act(states, epsilon)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            episode_reward += np.array(rewards)         # update the score (for each agent)
            for state, action, reward, next_state, done in zip(
                    states, actions, rewards, next_states, dones):
                agent.memory.add(state, action, reward, next_state, done)
            states = next_states
            if t % train_every == 0:
                for _ in range(arg.repeat_learning):
                    agent.learn()
            if np.any(dones):
                break
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        scores_window.append(np.mean(episode_reward))
        scores.append(np.mean(episode_reward))
        duration = time.time() - t_0
        minutes, sec = divmod(duration, 60)
        print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}\tTime: {:.0f} min {:.0f} sec'
              .format(i_episode, np.mean(scores_window),
                      np.mean(episode_reward), minutes, sec))
        if np.mean(scores_window) >= 30:
            print('Environment solved, saving the trained agent')
            torch.save(agent.actor_local.state_dict(), 'checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'checkpoint_critic.pth')
            break
    env.close()
    return scores
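
# set_config is called above but not defined in this snippet. A plausible
# sketch, assuming it copies the remaining hyperparameters from the parsed
# command-line arguments onto the config object. The config attribute names
# follow the fields used in main(path='') above; the arg field names are
# hypothetical and should be matched to the real argument parser.
def set_config(config, arg):
    config.seed = arg.seed
    config.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    config.gamma = arg.gamma
    config.tau = arg.tau
    config.lr_actor = arg.lr_actor
    config.lr_critic = arg.lr_critic
    config.batch_size = arg.batch_size
    config.memory_capacity = int(arg.memory_capacity)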