def main(cnf):
    # Train a critic to convergence against a fixed, pre-trained actor using a
    # pre-recorded replay buffer, then save the critic weights.
    # N is assumed to be a module-level constant giving the number of loaded
    # transitions (~1e6, judging by the original progress prints).
    env, agent = create_world(cnf)
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    agent._policy.actor.load_weights(
        './per_exp/eval_grads/model/converged_actor')
    buff = agent._replay_buffer
    # Mark the loaded transitions as the full buffer content.
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent = agent._policy
    for t in range(N + 5000):
        state, action, reward, next_state, done, *_ = buff.sample_uniformly(128)
        error = agent._train_step(state, action, reward, next_state, done,
                                  False, None)
        if not t % 5000:
            print(f'{t} of {N + 5000}')
    agent.critic.save_weights('converged_critic')
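# `sample_uniformly` (used above and in the next script) is not shown in this
# repo excerpt. It is presumably a helper on the prioritized replay buffer that
# draws a batch uniformly at random, bypassing the priority tree. A minimal
# sketch under that assumption; the field names (self.state, self.action, ...)
# and the exact return signature are hypothetical:
import numpy as np

def sample_uniformly(self, batch_size):
    # Draw indices uniformly, ignoring PER priorities.
    idxs = np.random.randint(0, self.size, size=batch_size)
    return (self.state[idxs], self.action[idxs], self.reward[idxs],
            self.next_state[idxs], self.done[idxs], idxs)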
def main(cnf):
    # Train an actor to convergence on a pre-recorded replay buffer and
    # periodically checkpoint it. N is assumed to be a module-level constant
    # (number of loaded transitions); wandb is imported at module level.
    env, agent = create_world(cnf)
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent = agent._policy
    for t in range(100000):
        # agent.train samples its own batches from the buffer.
        error, *_ = agent.train(buff, 128, t, True, None)
        if not t % 5000:
            print(f'{t} of 100000')
            agent.actor.save_weights('converged_actor')
            wandb.log({'actor_loss': error})
def main(cnf):
    # For every stored transition, compare the actor gradient computed from
    # that single sample against the "true" actor gradient computed over the
    # whole buffer, and record the similarity together with the per-sample TD
    # error. N, SAMPLES, N_TRAIN_TRUE_CRITIC and simil_metric are assumed to
    # be defined at module level.
    env, agent = create_world(cnf)
    cnf_old = cnf
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent._policy.actor.load_weights(
        './per_exp/eval_grads/model/converged_actor')
    trained_actor = agent._policy.actor
    print('Successfully loaded')
    print('Compute true gradient of true critic')
    sim_avg = []
    td_avg = []
    for k in range(SAMPLES):
        # True gradient of the true critic: train a fresh critic on the whole
        # buffer, then take the actor gradient over all stored states.
        true_critic = create_agent(cnf_old, env)
        train_the_critic(true_critic, buff, N_TRAIN_TRUE_CRITIC)
        true_critic_sample = true_critic._policy.critic
        state, *_ = buff.get_buffer()
        with tf.GradientTape() as tape:
            action = trained_actor(state)
            q_value, _ = true_critic_sample(tf.concat([state, action], axis=1))
            actor_loss = -tf.reduce_mean(q_value)
        gradients_true = tape.gradient(actor_loss,
                                       trained_actor.trainable_variables)
        gradients_true = [tf.reshape(x, [-1]) for x in gradients_true]
        state, action, reward, next_state, done, *_ = buff.get_buffer()
        print(f'state shape {state.shape}')
        for idx, s in enumerate(state):
            print(f'{idx} of {N}')
            s = tf.reshape(s, [1, s.shape[0]])
            ns = tf.reshape(next_state[idx], [1, next_state[idx].shape[0]])
            d = tf.reshape(done[idx], [1, done[idx].shape[0]])
            r = tf.reshape(reward[idx], [1, reward[idx].shape[0]])
            # In this variant the "approximate" critic is the fully trained one.
            approx_critic = true_critic
            td_errors = approx_critic._policy._compute_td_error(
                s, trained_actor(s), r, ns, d)
            with tf.GradientTape() as tape:
                action = trained_actor(s)
                q_value, _ = approx_critic._policy.critic(
                    tf.concat([s, action], axis=1))
                actor_loss = -tf.reduce_mean(q_value)
            gradients_sample = tape.gradient(actor_loss,
                                             trained_actor.trainable_variables)
            gradients_sample = [tf.reshape(x, [-1]) for x in gradients_sample]
            # simil_metric presumably returns a loss-style (negative)
            # similarity; negate it per weight tensor, then average.
            sims = [
                -simil_metric(x, y)
                for x, y in zip(gradients_true, gradients_sample)
            ]
            sims = tf.reduce_mean(sims)
            sim_avg.append(sims)
            td_avg.append(td_errors)
    np.save('sim_avg.npy', sim_avg)
    np.save('td_avg.npy', td_avg)
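# Neither `simil_metric` nor `train_the_critic` is defined in these scripts.
# Since their outputs are negated before averaging, simil_metric plausibly
# follows the tf.keras convention of returning the *negative* cosine
# similarity, and train_the_critic plausibly runs critic-only updates on the
# loaded buffer. Rough sketches under those assumptions (names, signatures and
# the batch size are guesses, not the repo's confirmed API):
import tensorflow as tf

def simil_metric(x, y):
    # Negative cosine similarity between two flattened gradient vectors
    # (same sign convention as tf.keras.losses.cosine_similarity).
    return tf.keras.losses.cosine_similarity(x, y, axis=-1)

def train_the_critic(critic_agent, buff, n_steps, batch_size=128):
    # Critic-only updates on uniformly sampled batches; the trailing arguments
    # mirror the `_train_step(..., False, None)` call used in the first script.
    for _ in range(n_steps):
        state, action, reward, next_state, done, *_ = buff.sample_uniformly(batch_size)
        critic_agent._policy._train_step(state, action, reward, next_state,
                                         done, False, None)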
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    # Create objects
    logger = Logger(cnf.log, cnf.minilog, env.time_limit)
    # Load previously trained model.
    if cnf.load_model:
        agent.load_model(f'./experiments/models/{agent._load_string}')
    # Training loop
    state, done = env.reset(), False
    switch = 0
    reward_fn = tf.Variable(0)
    for t in range(int(cnf.max_timesteps)):
        state_old = state
        # Toggle which reward function will be used; reward_fn itself is only
        # reassigned at episode end.
        if not t % cnf.switch_time:
            switch = (switch + 1) % 2
        action = agent.select_action(state, noise_bool=True, reward_fn=reward_fn)
        next_state, reward, done, _ = env.step(action, reward_fn)
        success_cd = done if env.success else 0
        # Get intrinsic reward from the agent.transitbuffer computation.
        intr_rew = agent.replay_add(state, action, reward, next_state, done,
                                    success_cd)
        maybe_verbose_output(t, agent, env, action, cnf, state, intr_rew)
        logger.inc(t, reward)
        if not cnf.flat_agent and not cnf.minilog:
            logger.most_important_plot(agent, state, action, reward,
                                       next_state, success_cd)
        if cnf.save_attention:
            attention_grad_meta(state, agent, t, flat=cnf.flat_agent)
        state = next_state
        if done:
            reward_fn.assign(0 if switch else 1)
            # Train at the end of the episode for the appropriate number of
            # steps; this makes collecting norms, stds and losses easier.
            if t > cnf.start_timesteps:
                agent.train(t, logger.episode_timesteps)
            print(
                f"Total T: {t+1} Episode Num: {logger.episode_num+1} Episode T: {logger.episode_timesteps} Reward: {logger.episode_reward}"
            )
            logger.log(t, intr_rew)
            agent.reset()
            logger.reset()
            state, done = env.reset(), False
        # Evaluate episode
        if (t + 1) % cnf.eval_freq == 0:
            (avg_ep_rew, avg_intr_rew, success_rate, rate_correct_solves,
             untouchable_steps) = agent.evaluation(env)
            state, done = env.reset(), False
            agent.reset()
            logger.reset(post_eval=True)
            logger.log_eval(t, avg_ep_rew, avg_intr_rew, success_rate,
                            rate_correct_solves, untouchable_steps)
    if cnf.save_model:
        agent.save_model(f'./experiments/models/{agent._load_string}')
def main(cnf):
    # Track which transitions a PER meta-agent samples during training and
    # record the TD error of every stored transition after each update step.
    # N and repetitions are assumed to be module-level constants (N is small
    # here, matching the 'errors_small' output file).
    env, agent = create_world(cnf)
    agent.load_model('here')
    cnf = cnf.main
    agent.meta_replay_buffer.load_data('./per_exp/buffer_data/')
    buff = agent.meta_replay_buffer
    buff.size = N
    print(buff.alpha)
    # Rebuild the reward array from the first N transitions; the inner
    # range(1) loop is a leftover knob for duplicating non-(-1) rewards and
    # currently just copies them.
    new_rew = []
    for re in buff.reward[:N]:
        if re == -1.:
            new_rew.append(re)
        else:
            for i in range(1):
                new_rew.append(re)
    buff.reward = np.asarray(new_rew)
    buff.max_size = N
    buff.tree.n_entries = N
    idx = np.where(buff.reward == -1.)[0]
    m1 = []
    m2 = []
    counter = 0
    idxs = np.zeros([N, N])
    errors = np.zeros([N, N])
    for t in range(1000):
        agent._meta_agent.train(buff, 10, t, False, None)
        state, action, reward, next_state, done = buff.get_buffer()
        error = agent._meta_agent._compute_td_error(state, action, reward,
                                                    next_state, done)
        errors[t] = error[:, 0]
        # Count and remember every sampled transition whose reward is not -1.
        for i in range(buff.batch_idxs.shape[0]):
            if buff.reward[buff.batch_idxs[i]] != -1.:
                counter += 1
                print(f'counter {counter}')
                m1.append(buff.batch_idxs[i])
                m2.append(t)
    np.save('m1.npy', m1)
    np.save('m2.npy', m2)
    np.save(f'errors_small_{repetitions}.npy', errors)
    env.close()
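# `_compute_td_error` belongs to the agent's policy and is not shown in this
# excerpt. For the TD3-style agents used in these scripts it would look roughly
# like the sketch below; the attribute names (critic, critic_target,
# actor_target, discount) are assumptions, not the repo's actual API.
import tensorflow as tf

def _compute_td_error(self, state, action, reward, next_state, done):
    # One-step TD error: r + gamma * (1 - done) * min(Q1', Q2')(s', pi'(s')) - Q1(s, a).
    next_action = self.actor_target(next_state)
    q1_t, q2_t = self.critic_target(tf.concat([next_state, next_action], axis=1))
    target = reward + self.discount * (1. - done) * tf.minimum(q1_t, q2_t)
    current_q, _ = self.critic(tf.concat([state, action], axis=1))
    return target - current_q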
def main(cnf):
    # Measure how well actor gradients from mini-batches of different sizes
    # align with the full-buffer actor gradient of a freshly trained critic.
    # N, SAMPLES, N_TRAIN_TRUE_CRITIC, simil_metric and update_buffer are
    # assumed to be defined at module level.
    env, agent = create_world(cnf)
    cnf_old = cnf
    cnf = cnf.main
    agent._replay_buffer.load_data('./per_exp/eval_grads/buffer_data/')
    accum = Accumulator()
    buff = agent._replay_buffer
    buff.size = N
    buff.ptr = N
    buff.max_size = N
    buff.tree.n_entries = N
    agent._policy.actor.load_weights(
        './per_exp/eval_grads/model/converged_actor')
    trained_actor = agent._policy.actor
    print('Compute true gradient of true critic')
    batch_range = np.array([2, 128, 256, 512, 1024, 2048])
    ret = []
    print('Update Buffer')
    for batch_size in batch_range:
        simil_list = []
        print(f'Batch {batch_size}')
        for i in range(SAMPLES):
            # True gradient of the true critic: train a fresh critic on the
            # whole buffer and refresh the buffer priorities with it.
            true_critic = create_agent(cnf_old, env)
            train_the_critic(true_critic, buff, N_TRAIN_TRUE_CRITIC)
            true_critic_sample = true_critic._policy.critic
            approx_critic = true_critic
            update_buffer(buff, approx_critic)
            state, *_ = buff.get_buffer()
            with tf.GradientTape() as tape:
                action = trained_actor(state)
                q_value, _ = true_critic_sample(tf.concat([state, action], axis=1))
                actor_loss = -tf.reduce_mean(q_value)
            gradients_true = tape.gradient(actor_loss,
                                           trained_actor.trainable_variables)
            gradients_true = [tf.reshape(x, [-1]) for x in gradients_true]
            simil_avg = []
            for j in range(10):
                state, *_ = buff.sample(batch_size)
                with tf.GradientTape() as tape:
                    action = trained_actor(state)
                    q_value, _ = approx_critic._policy.critic(
                        tf.concat([state, action], axis=1))
                    actor_loss = -tf.reduce_mean(q_value)
                gradients_sample = tape.gradient(
                    actor_loss, trained_actor.trainable_variables)
                gradients_sample = [tf.reshape(x, [-1]) for x in gradients_sample]
                sims = [
                    -simil_metric(x, y)
                    for x, y in zip(gradients_true, gradients_sample)
                ]
                sims = tf.reduce_mean(sims)
                simil_avg.append(sims.numpy())
            simil_list.append(np.mean(simil_avg))
        ret.append(simil_list)
    np.save(f'simil_list_{cnf_old.agent.sub_per}_{cnf_old.buffer.alpha}.npy', ret)
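# `update_buffer` is assumed to refresh the PER priorities of all stored
# transitions using TD errors from the freshly trained critic, so that the
# subsequent `buff.sample(batch_size)` calls draw according to the new
# priorities. A rough sketch under that assumption; `buff.update(idx, priority)`
# is a guess at the buffer's per-index priority setter, not a confirmed API.
import numpy as np

def update_buffer(buff, critic_agent):
    # Recompute TD errors over the whole buffer and write them back as priorities.
    state, action, reward, next_state, done, *_ = buff.get_buffer()
    td_errors = critic_agent._policy._compute_td_error(state, action, reward,
                                                       next_state, done)
    for idx, err in enumerate(np.abs(np.asarray(td_errors)[:, 0])):
        buff.update(idx, err)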
def main(cnf):
    env, agent = create_world(cnf)
    cnf = cnf.main
    # Create objects
    logger = Logger(cnf.log, cnf.time_limit)
    # Load previously trained model.
    if cnf.load_model:
        agent.load_model(f'./experiments/models/{agent._file_name}')
    # Training loop
    state, done = env.reset(), False
    for t in range(int(cnf.max_timesteps)):
        action = agent.select_noisy_action(state)
        maybe_verbose_output(t, agent, env, action, cnf, state)
        next_state, reward, done, _ = env.step(action)
        intr_rew = agent.replay_add(state, action, reward, next_state, done)
        if t > cnf.start_timesteps and not t % cnf.train_every:
            agent.train(t)
        state = next_state
        logger.inc(t, reward)
        if done:
            print(
                f"Total T: {t+1} Episode Num: {logger.episode_num+1} Episode T: {logger.episode_timesteps} Reward: {logger.episode_reward}"
            )
            # Reset environment
            agent.reset()
            hard_reset = logger.log(t, intr_rew)
            logger.reset()
            state, done = env.reset(), False
        # Evaluate episode
        if (t + 1) % cnf.eval_freq == 0:
            avg_ep_rew, avg_intr_rew, success_rate = agent.evaluation(env)
            state, done = env.reset(), False
            agent.reset()
            logger.reset(post_eval=True)
            logger.log_eval(t, avg_ep_rew, avg_intr_rew, success_rate)
    if cnf.save_model:
        agent.save_model(f'./experiments/models/{agent._file_name}')
import numpy as np
import tensorflow as tf
from environments.coppeliagym import CoppeliaEnv
from utils.utils import setup, create_world
import collections

# Simple environment sanity check: hold one action dimension at its limits
# while zeroing the others, and print a single observation entry.
args = ['--ee_j_pos', '--vrep', '--render']
args = setup(args)
env, agent = create_world(args)

for eps in range(10):
    state = env.reset()
    env.render()
    for t in range(args.time_limit):
        # Max action for the first 150 steps, min action afterwards.
        if t < 150:
            action = env.action_space.high
        else:
            action = env.action_space.low
        # Zero every action dimension except index 4.
        np.putmask(action, [1, 1, 1, 1, 0, 1, 1, 1], 0)
        next_state, *_ = env.step(action)
        print(next_state[7])