loss = loss_function(prediction, q_target)
gradients = tape.gradient(loss, agent.model.trainable_variables)
optimizer.apply_gradients(
    zip(gradients, agent.model.trainable_variables))

new_weights = agent.model.get_weights()

# set new weights
agent.set_weights(new_weights)
manager.set_agent(new_weights)
# get new weights
agent = manager.get_agent()

# update aggregator
time_steps = manager.test(test_steps)
manager.update_aggregator(loss=loss, time_steps=time_steps)
# print progress
print(
    f"epoch ::: {e} loss ::: {loss} avg env steps ::: {np.mean(time_steps)}"
)

# you can also alter your manager's parameters
if e % 5 == 0:
    epsilon = epsilon * .9
    manager.set_epsilon(epsilon=epsilon)
    print(f"New epsilon: {epsilon}")

# if e % saving_after == 0:
#     # you can save models
#     manager.save_model(saving_path, e)
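# Note on the schedule above: multiplying epsilon by 0.9 every 5 epochs anneals
# the exploration rate exponentially, so the agent explores less and acts
# greedily more often as training progresses (assuming epsilon-greedy action
# sampling in the manager).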
manager.set_agent(agent.get_weights())

print('TEST')
# Update aggregator
steps, current_rewards = manager.test(
    max_steps=1000,
    test_episodes=10,
    render=False,
    evaluation_measure="time_and_reward",
)
# if (e+1) % 5 == 0: manager.test(max_steps=1000, test_episodes=1, render=True)
manager.update_aggregator(loss=losses, reward=current_rewards, time=steps)

# Collect all rewards
rewards.extend(current_rewards)
# Average reward over the last 100 episodes
avg_reward = sum(rewards[-100:]) / min(len(rewards), 100)

# Print progress
print(
    f"epoch ::: {e} loss ::: {np.mean(losses)} avg_current_reward ::: {np.mean(current_rewards)} avg_reward ::: {avg_reward} avg_timesteps ::: {np.mean(steps)}"
)

if avg_reward > env.spec.reward_threshold:
    print(f'\n\nEnvironment solved after {e+1} episodes!')
    # Save model
# positive critic loss for gradient descent with MSE
critic_loss = tf.reduce_mean((mc - agent.v(state))**2)
critic_gradients = tape.gradient(
    critic_loss, agent.model.critic.trainable_variables)
optimizer.apply_gradients(
    zip(critic_gradients, agent.model.critic.trainable_variables))
total_loss += actor_loss + critic_loss

# Update the agent
manager.set_agent(agent.get_weights())
agent = manager.get_agent()

reward = manager.test(test_steps, evaluation_measure="reward")
manager.update_aggregator(loss=total_loss, reward=reward)
# print progress
print(
    f"epoch ::: {e} loss ::: {total_loss} avg reward ::: {np.mean(reward)}"
)

if e % saving_after == 0:
    # you can save models
    manager.save_model(saving_path, e)
    # and load models
    manager.load_model(saving_path)

print("done")
print("testing optimized agent")
manager.test(test_steps, test_episodes=10, render=True)
data = manager.get_data()
manager.store_in_buffer(data)

# sample data to optimize on from the buffer
experience_dict = manager.sample(sample_size)

print('optimizing...')
for states, actions, rewards, states_new, not_dones in zip(
        *[experience_dict[k] for k in optim_keys]):
    train_step(agent.model, states, actions, rewards, states_new, not_dones,
               learning_rate, gamma)

# set new weights, get optimized agent
manager.set_agent(agent.model.get_weights())

# update aggregator
time_steps, reward_agg = manager.test(
    test_steps, evaluation_measure='time_and_reward')
manager.update_aggregator(time_steps=time_steps, rewards=reward_agg)

if e % saving_after == 0:
    show_q(agent.model, e, saving_path, env_kwargs['action_dict'])

print(
    f"epoch ::: {e} avg env steps ::: {np.mean(time_steps)} avg reward ::: {np.mean(reward_agg)}"
)

print('done')
print('testing optimized agent')
manager.test(test_steps, render=True, evaluation_measure='time_and_reward')
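# NOTE: `train_step` and `show_q` are project helpers that are not shown in
# this listing. Purely as an illustration (not the original code), a DQN-style
# `train_step` could look like the sketch below; it assumes TensorFlow is
# imported as `tf`, `model` maps state batches to per-action Q-values, and
# `actions` is a vector of integer action indices. Names and the exact
# signature are assumptions mirroring the call above.
def train_step_sketch(model, states, actions, rewards, states_new, not_dones,
                      learning_rate, gamma):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # TD target: r + gamma * max_a' Q(s', a'), masked out for terminal states
    q_next = tf.reduce_max(model(states_new), axis=-1)
    q_target = rewards + gamma * not_dones * q_next
    with tf.GradientTape() as tape:
        q_values = model(states)
        # Q-value of the action that was actually taken
        one_hot = tf.one_hot(tf.cast(actions, tf.int32), q_values.shape[-1])
        q_taken = tf.reduce_sum(q_values * one_hot, axis=-1)
        loss = tf.reduce_mean((q_target - q_taken) ** 2)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss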
def train_td3(args, model, action_dimension=None):
    print(args)
    tf.keras.backend.set_floatx('float32')
    ray.init(log_to_driver=False)

    # hyperparameters
    buffer_size = args.buffer_size  # 10e6 in their repo, not possible with our RAM
    epochs = args.epochs
    saving_path = os.getcwd() + "/" + args.saving_dir
    saving_after = 5
    sample_size = args.sample_size
    optim_batch_size = args.batch_size
    gamma = args.gamma
    test_steps = 100  # 1000 in their repo
    policy_delay = 2
    rho = .046
    policy_noise = args.policy_noise
    policy_noise_clip = .5
    msg_dim = args.msg_dim  # 32 in their repo
    learning_rate = args.learning_rate

    save_args(args, saving_path)

    env_test_instance = gym.make('BipedalWalker-v3')
    if action_dimension is None:
        action_dimension = copy(env_test_instance.action_space.shape[0])
    model_kwargs = {
        # action dimension for modular actions
        'action_dimension': action_dimension,
        'min_action': copy(env_test_instance.action_space.low)[0],
        'max_action': copy(env_test_instance.action_space.high)[0],
        'msg_dimension': msg_dim,
        'fix_sigma': True,
        'hidden_units': args.hidden_units
    }
    del env_test_instance

    manager = SampleManager(model,
                            'BipedalWalker-v3',
                            num_parallel=(os.cpu_count() - 1),
                            total_steps=150,
                            action_sampling_type="continuous_normal_diagonal",
                            is_tf=True,
                            model_kwargs=model_kwargs)

    optim_keys = [
        'state',
        'action',
        'reward',
        'state_new',
        'not_done',
    ]

    manager.initialize_buffer(buffer_size, optim_keys)
    manager.initialize_aggregator(path=saving_path,
                                  saving_after=saving_after,
                                  aggregator_keys=["loss", "reward"])

    agent = manager.get_agent()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # fill buffer
    print("Filling buffer before training..")
    while len(manager.buffer.buffer[
            manager.buffer.keys[0]]) < manager.buffer.size:
        # gives you state-action-reward trajectories
        data = manager.get_data()
        manager.store_in_buffer(data)

    # track time while training
    timer = time.time()
    last_t = timer

    target_agent = manager.get_agent()

    for e in range(epochs):
        # off-policy: sample from the replay buffer
        sample_dict = manager.sample(sample_size, from_buffer=True)
        print(f"collected data for: {sample_dict.keys()}")

        # cast values to float32 and create data dict
        sample_dict['state'] = tf.cast(sample_dict['state'], tf.float32)
        sample_dict['action'] = tf.cast(sample_dict['action'], tf.float32)
        sample_dict['reward'] = tf.cast(sample_dict['reward'], tf.float32)
        sample_dict['state_new'] = tf.cast(sample_dict['state_new'], tf.float32)
        sample_dict['not_done'] = tf.cast(sample_dict['not_done'], tf.float32)
        data_dict = dict_to_dict_of_datasets(sample_dict,
                                             batch_size=optim_batch_size)

        total_loss = 0
        for state, action, reward, state_new, not_done in \
                zip(data_dict['state'], data_dict['action'],
                    data_dict['reward'], data_dict['state_new'],
                    data_dict['not_done']):

            action_new = target_agent.act(state_new)
            # add clipped noise to action_new (target policy smoothing)
            action_new = action_new + tf.clip_by_value(
                tf.random.normal(action_new.shape, 0., policy_noise),
                -policy_noise_clip, policy_noise_clip)
            # clip action_new to the action space
            action_new = tf.clip_by_value(
                action_new, manager.env_instance.action_space.low,
                manager.env_instance.action_space.high)

            # calculate target with clipped double-Q-learning
            state_action_new = tf.concat([state_new, action_new], axis=-1)
            q_values0 = target_agent.model.critic0(state_action_new)
            q_values1 = target_agent.model.critic1(state_action_new)
            q_values = tf.concat([q_values0, q_values1], axis=-1)
            q_targets = tf.squeeze(tf.reduce_min(q_values, axis=-1))
            critic_target = reward + gamma * not_done * q_targets

            state_action = tf.concat([state, action], axis=-1)

            # update critic 0
            with tf.GradientTape() as tape:
                q_output = agent.model.critic0(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))
            total_loss += loss
            gradients = tape.gradient(loss,
                                      agent.model.critic0.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic0.trainable_variables))

            # update critic 1
            with tf.GradientTape() as tape:
                q_output = agent.model.critic1(state_action)
                loss = tf.keras.losses.MSE(tf.squeeze(critic_target),
                                           tf.squeeze(q_output))
            total_loss += loss
            gradients = tape.gradient(loss,
                                      agent.model.critic1.trainable_variables)
            optimizer.apply_gradients(
                zip(gradients, agent.model.critic1.trainable_variables))

            # update actor with delayed policy update
            if e % policy_delay == 0:
                with tf.GradientTape() as tape:
                    actor_output = agent.model.actor(state)
                    action = reparam_action(actor_output,
                                            agent.model.action_dimension,
                                            agent.model.min_action,
                                            agent.model.max_action)
                    state_action = tf.concat([state, action], axis=-1)
                    q_val = agent.model.critic0(state_action)
                    actor_loss = -tf.reduce_mean(q_val)
                total_loss += actor_loss
                actor_gradients = tape.gradient(
                    actor_loss, agent.model.actor.trainable_variables)
                optimizer.apply_gradients(
                    zip(actor_gradients,
                        agent.model.actor.trainable_variables))

        # Update agent
        manager.set_agent(agent.get_weights())
        agent = manager.get_agent()

        if e % policy_delay == 0:
            # Polyak averaging of the target network weights
            new_weights = list(
                rho * np.array(target_agent.get_weights()) +
                (1. - rho) * np.array(agent.get_weights()))
            target_agent.set_weights(new_weights)

        reward = manager.test(test_steps, evaluation_measure="reward")
        manager.update_aggregator(loss=total_loss, reward=reward)
        print(
            f"epoch ::: {e} loss ::: {total_loss} avg reward ::: {np.mean(reward)}"
        )

        if e % saving_after == 0:
            manager.save_model(saving_path, e)

        # needed time and remaining time estimation
        current_t = time.time()
        time_needed = (current_t - last_t) / 60.
        time_remaining = (current_t - timer) / 60. / (e + 1) * (epochs - (e + 1))
        print(
            'Finished epoch %d of %d. Needed %.1f min for this epoch. Estimated time remaining: %.1f min'
            % (e + 1, epochs, time_needed, time_remaining))
        last_t = current_t

    manager.load_model(saving_path)
    print("done")
    print("testing optimized agent")
    manager.test(test_steps, test_episodes=10, render=True)
    ray.shutdown()
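# NOTE: `reparam_action`, `dict_to_dict_of_datasets` and `save_args` are
# project helpers that are not part of this listing. Purely as an illustration
# (names and tensor layout are assumptions, not the original code), a
# reparameterised, bounded action could be derived from a diagonal-Gaussian
# actor output roughly like this:
def reparam_action_sketch(actor_output, action_dimension, min_action,
                          max_action):
    # assume the actor outputs mean and sigma concatenated on the last axis
    mu = actor_output[..., :action_dimension]
    sigma = actor_output[..., action_dimension:]
    # reparameterisation trick: a = mu + sigma * eps with eps ~ N(0, 1),
    # which keeps the sampled action differentiable w.r.t. the actor weights
    eps = tf.random.normal(tf.shape(mu))
    action = mu + sigma * eps
    # keep the action inside the environment's valid range
    return tf.clip_by_value(action, min_action, max_action)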
# iterating through the dataset
old_table = agent.get_weights()
for s, a, r, n in zip(sample_dict['state'], sample_dict['action'],
                      sample_dict['reward'], sample_dict['state_new']):
    s_x, s_y = s  # unpacking state
    n_x, n_y = n  # unpacking new state
    # apply the Q-learning formula
    old_table[a, s_x, s_y] += alpha * (
        r + gamma * np.max(old_table[:, n_x, n_y]) - old_table[a, s_x, s_y])

# set new weights
manager.set_agent(old_table)
# get new weights
agent = manager.get_agent()

time_steps = manager.test(test_steps)
# update aggregator
manager.update_aggregator(time_steps=time_steps)
print(f"epoch ::: {e} avg env steps ::: {np.mean(time_steps)}")

print("Done!")
print("Testing optimized agent...")
manager.test(test_steps, test_episodes=3, render=True)
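# For reference, the update inside the loop above is the standard tabular
# Q-learning rule,
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
# with the Q-table indexed here as old_table[action, x, y].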
# TODO: optimize agent
dummy_losses = [
    np.mean(np.random.normal(size=(64, 100)), axis=0) for _ in range(1000)
]

new_weights = agent.model.get_weights()

# set new weights
manager.set_agent(new_weights)
# get new weights
agent = manager.get_agent()

# update aggregator
time_steps = manager.test(test_steps)
manager.update_aggregator(loss=dummy_losses, time_steps=time_steps)
# print progress
print(
    f"epoch ::: {e} loss ::: {np.mean([np.mean(l) for l in dummy_losses])} avg env steps ::: {np.mean(time_steps)}"
)

# you can also alter your manager's parameters
manager.set_epsilon(epsilon=0.99)

if e % saving_after == 0:
    # you can save models
    manager.save_model(saving_path, e)
    # and load models
    manager.load_model(saving_path)

print("done")
manager.set_agent(new_weights)
# get new agent
agent = manager.get_agent()

# update aggregator
time_steps, reward = manager.test(
    test_steps,
    render=True if e % 10 == 0 else False,
    evaluation_measure="time_and_reward")
# print("time_steps: ", len(time_steps))
# print("reward: ", len(reward))
# print("actor_loss:", len(actor_losses))
# print("critic_loss: ", len(critic_losses))
manager.update_aggregator(actor_loss=actor_losses,
                          time_steps=time_steps,
                          reward=reward)
print(
    f"epoch ::: {e} actor_loss ::: {np.mean([np.mean(l) for l in actor_losses])} avg env steps ::: {np.mean(time_steps)} avg reward ::: {np.mean(reward)}"
)

if e % saving_after == 0:
    manager.save_model(saving_path, e)
    # manager.agg.save_graphic()
print("---")

manager.load_model(saving_path)
print("done")
print("testing optimized agent")
manager.test(test_steps, test_episodes=10, render=True)
# TODO: optimize agent
old_q[h, w, a] += error
agent.set_weights(old_q)

# Set new weights
manager.set_agent(agent.get_weights())

# Update aggregator
steps = manager.test(
    max_steps=100,
    test_episodes=10,
    render=True,
    evaluation_measure="time",
)
manager.update_aggregator(error=error_aggregator, steps=steps)

# Print progress
print(
    f"epoch ::: {e} error ::: {np.mean(error_aggregator)} avg_timesteps ::: {np.mean(steps)}"
)

agent.model.print_optimal(action_dict)

print("testing optimized agent")
manager.test(
    max_steps=100,
    test_episodes=10,
    render=True,
    do_print=True,
    evaluation_measure="time",
)
for s, a, r, ns, nd in dataset:
    # ensure that the batches have at least 10 elements,
    # otherwise we run into problems with the MSE loss
    if len(s) >= 10:
        loss = train_q_network(agent, s, a, r, ns, nd, optimizer)
        losses.append(loss)
print(f'average loss: {np.mean(losses)}')

# update the weights of the manager
manager.set_agent(agent.get_weights())

print('# ================= validation ================== #')
render = e % RENDER_EPISODES == 0
time_steps, rewards = manager.test(MAX_TEST_STEPS,
                                   TEST_EPISODES,
                                   evaluation_measure='time_and_reward',
                                   render=render,
                                   do_print=False)
manager.update_aggregator(loss=losses, time_steps=time_steps, reward=rewards)
print(f'average reward: {np.mean(rewards)}')
print(f'average time steps: {np.mean(time_steps)}')

if e % SAVE_EPISODES == 0:
    print('# ================= save model ================== #')
    agent.model.deep_q_net.save(os.path.join(saving_path_model, f'epoch_{e}'))

print('# ============== TRAINING FINISHED ============== #')
print('# ============== SAVE FINAL MODELS ============== #')
agent.model.deep_q_net.save(os.path.join(saving_path_model, 'final'))
print('# ================= FINAL TEST ================== #')
manager.test(MAX_TEST_STEPS,
             10,
             render=True,
             do_print=True,
             evaluation_measure='time_and_reward')
loss_actor = -action_probs * advantage - entropy_coeff * entropy
gradients_actor = tape.gradient(
    loss_actor, agent.model.trainable_variables)
optimizer_actor.apply_gradients(
    zip(gradients_actor, agent.model.trainable_variables))

# set new weights
manager.set_agent(agent.get_weights())
# get new agent
agent = manager.get_agent()

# update aggregator
if epoch % 5 != 0:
    time_steps = manager.test(test_steps, test_episodes=10)
else:
    time_steps = manager.test(test_steps, test_episodes=10, render=True)
manager.update_aggregator(loss_critic=np.mean(loss_critic),
                          loss_actor=np.mean(loss_actor),
                          time_steps=time_steps)
# print progress
print(
    f"epoch ::: {epoch} critic loss ::: {np.mean(loss_critic)} actor loss ::: {np.mean(loss_actor)} avg env steps ::: {np.mean(time_steps)}"
)

# and load models
# manager.load_model(saving_path)

print("done")
print("testing optimized agent")
manager.test(test_steps, test_episodes=10, render=True, do_print=True)
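# For reference: the actor loss above is the policy-gradient objective with an
# entropy bonus, roughly -log pi(a|s) * A(s, a) - entropy_coeff * H(pi), where
# `action_probs` presumably holds the (log-)probabilities of the chosen actions
# and the entropy term discourages premature collapse to a deterministic policy.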