import numpy as np

# assumes `env` (the Gym MountainCar environment) and `agent` (a DDPG agent)
# were constructed above
agent.load_models()
np.random.seed(0)

score_history = []
for i in range(200):
    obs = env.reset()
    done = False
    score = 0
    step = 0
    while not done:
        step += 1
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        # store the transition and learn from a sampled minibatch every step
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        env.render()
    score_history.append(score)
    # optionally checkpoint every 25 episodes:
    # if i % 25 == 0:
    #     agent.save_models()
    print('episode ', i, 'score %.2f' % score,
          'trailing 128 games avg %.3f' % np.mean(score_history[-128:]),
          'finished after ', step, ' steps')

env.close()
agent.save_models()
filename = 'MountainCar-alpha000025-beta00025-400-300.png'
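# The script above defines `filename` but never writes the figure. A minimal
# sketch of the missing plotting step, assuming matplotlib is available and
# that a trailing-average learning curve (matching the printed 128-game
# average) is what was intended:
import matplotlib.pyplot as plt

window = 128  # same trailing window as the printed average
running_avg = [np.mean(score_history[max(0, t - window + 1):t + 1])
               for t in range(len(score_history))]

plt.plot(score_history, alpha=0.4, label='episode score')
plt.plot(running_avg, label='trailing {} avg'.format(window))
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.savefig(filename)
plt.close()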
import os
from collections import deque
from time import gmtime, strftime

import numpy as np
import torch

from ddpg_agent import Agent  # assumed local module defining the DDPG Agent


def ddpg(env, state_size, action_size, num_agents, brain_name,
         n_episodes=1000, max_t=1000, print_every=10, title=None,
         batch_size=128, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3,
         weight_decay=0, device="cuda:0", fc1_units=128, fc2_units=64,
         n_updates=10, update_intervals=20):
    agent = Agent(state_size=state_size, action_size=action_size,
                  random_seed=2, num_agents=num_agents,
                  batch_size=batch_size, gamma=gamma, tau=tau,
                  lr_actor=lr_actor, lr_critic=lr_critic,
                  weight_decay=weight_decay, device=device,
                  fc1_units=fc1_units, fc2_units=fc2_units)

    # create the save directory
    if title is None:
        title = "experiment"
    current_time = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
    title = title + "_" + current_time

    # start a fresh scores file
    os.makedirs("experiments/{}".format(title), exist_ok=True)
    open("experiments/{}/scores.txt".format(title), "w").close()

    scores_deque = deque(maxlen=100)
    mean_scores = []
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        agent.reset()
        for t in range(max_t):
            # 1. choose actions with the current policy mu_theta plus exploration noise
            actions = agent.act(states)
            # 2. execute the actions in the environment and observe (s, a, r, s', d)
            env_info = env.step(actions)[brain_name]  # send all actions to the environment
            next_states = env_info.vector_observations  # next state (for each agent)
            rewards = env_info.rewards                  # reward (for each agent)
            dones = env_info.local_done                 # whether each agent's episode finished
            # 3. save the experiences to the replay buffer
            agent.remember(states, actions, rewards, next_states, dones)
            # 4. if it is time to update, learn from n_updates sampled minibatches
            agent.update(n_updates, update_intervals, t)
            scores += env_info.rewards  # update the score (for each agent)
            states = next_states        # roll over states to the next time step
            if np.any(dones):
                break
        scores_deque.append(np.mean(scores))
        print('\rEpisode {}\tLast 100 average score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")

        # log the score and checkpoint the model every print_every episodes
        if i_episode % print_every == 0:
            with open("experiments/{}/scores.txt".format(title), "a") as f:
                f.write("{},{}\n".format(i_episode, np.mean(scores_deque)))
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
            mean_scores.append(np.mean(scores_deque))
            # save only if this is the best model so far
            if np.mean(scores_deque) == max(mean_scores):
                torch.save(agent.actor_local.state_dict(),
                           'experiments/{}/checkpoint_actor.pth'.format(title))
                torch.save(agent.critic_local.state_dict(),
                           'experiments/{}/checkpoint_critic.pth'.format(title))
        if np.mean(scores_deque) >= 30:
            print("\rEnvironment solved with average score of 30")
            break

    return mean_scores, title
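# A minimal usage sketch for the trainer above. It assumes the `unityagents`
# package and a Unity ML-Agents Reacher build (as in the Udacity deep RL
# projects, which this reset/step API matches); the file path and the episode
# settings are placeholders for whatever your local setup uses.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Reacher.app')  # hypothetical local build path
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# derive the sizes the trainer needs from one environment reset
env_info = env.reset(train_mode=True)[brain_name]
num_agents = len(env_info.agents)
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

mean_scores, title = ddpg(env, state_size, action_size, num_agents,
                          brain_name, n_episodes=500, title='reacher_ddpg')
env.close()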