# Shared module-level imports for the main() variants below.
import os
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from absl import flags

# setup_render, render_with_target and weight_reset are helpers defined
# elsewhere in the repo; experiment_name, evaluating, lp_training, rendering,
# save_freq, save_goals and forget_freq are injected by globals().update(FLAGS).


def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(n_dmp_basis, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_rnn import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)

    # load rnn if saved. TODO: save only the weights, to avoid bugs after we
    # change the architecture.
    if os.path.isfile("lprnn" + experiment_name + ".pt"):
        temp_net = torch.load("lprnn" + experiment_name + ".pt")
        net.load_state_dict(temp_net.state_dict())
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    optimizer = optim.RMSprop(net.parameters())
    goal_loss = nn.MSELoss()
    goal_reconstruction_loss = nn.MSELoss()
    action_reconstruction_loss = nn.MSELoss()

    # initial values of several variables
    previous_goal_reward = torch.Tensor([-1.0])
    previous_lp_value = torch.zeros(1)
    learning_progress = torch.zeros(1)
    #average_reward_estimate = 0
    average_lp_estimate = 0

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()

    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        #goal = np.tile(np.expand_dims(pen_goal, 0), (n_steps, 1))
        #goal = np.reshape(goal.T, (-1))
        goal = torch.Tensor(pen_goal)

    if rendering:
        setup_render(env)

    rewards = []
    lps = []

    # a function that backpropagates a loss without accumulating gradients in
    # certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    pen_vars_slice = slice(54, 61)
    pen_pos_center = torch.Tensor([1.0, 0.90, 0.15]).unsqueeze(0).unsqueeze(0)
    print(observations.shape)
    reset_env = False
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters, _ = net.predict_action(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get desired goal, actions (and
            # their log-probabilities), and the predicted values of the action
            # and the goal
            actions, log_prob_action, goal, log_prob_goal, value, lp_value = net(
                observations,
                learning_progress.detach().unsqueeze(0).unsqueeze(0))
            # learn the difference between the value and -1, because at the
            # beginning most values will be close to -1
            value = value - 1
            pen_pos = observations[:, :, pen_vars_slice][..., :3]
            pen_rot = observations[:, :, pen_vars_slice][..., 3:]
            rot_goal = goal[:, :, 3:]
            rel_rot_goal = (rot_goal - pen_rot) * 0.1 + pen_rot
            # normalize the quaternion part of the goal (torch.norm rather
            # than np.linalg.norm, which fails on tensors that require grad)
            goal = torch.cat([(goal[:, :, :3] - pen_pos) * 0.002 + pen_pos,
                              rel_rot_goal / torch.norm(rel_rot_goal)],
                             dim=2)
            #goal += 0.05*torch.randn_like(goal)
            goal = Variable(goal.data, requires_grad=True)
            if lp_training:
                action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = (
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :],
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :])
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters, log_prob_action, _, _, value, lp_value = (
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :],
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :])

        action_parameters = action_parameters.detach().numpy()

        #run action using DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                #env.render()
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                #torch.save(net, "lprnn.pt")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                lps = []
            if save_goals:
                if iteration == 0:
                    goals = np.expand_dims(goal, 0)
                else:
                    goals = np.concatenate(
                        [goals, np.expand_dims(goal, 0)], axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        # we detach the RNN every so often, to determine how far to
        # backpropagate through time.
        # TODO: do this in a way that doesn't start from scratch, but instead
        # backpropagates forget_freq many iterations into the past at *every*
        # time step!!
        if iteration % forget_freq == forget_freq - 1:
            net.forget()

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                previous_lp_value = lp_value
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:
            #if not evaluating, then train
            optimizer.zero_grad()

            # train the policy to maximize the goal reward (which is
            # -goal_loss, i.e. -|observation - goal|^2); here we only look at
            # the pen part of the observation
            #goal_reward = -1*goal_loss(new_observations[0,0,pen_vars_slice], goal)
            #goal_reward = goal_reward*(goal_reward>-1)
            goal_reward = env.compute_reward(
                new_observations[0, 0, pen_vars_slice].numpy(),
                goal.detach().numpy(), None)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            rewards.append(goal_reward)
            hindsight_goal = new_observations[:, :, pen_vars_slice]

            #if iteration <= 100000:
            #    # we train the goal reconstruction part of the network.
            #    # we use the hindsight_goal (the outcome of our action) to
            #    # ensure we autoencode reachable goals, and explore more effectively
            #    reconstructed_goal = net.autoencode_goal(hindsight_goal + 0.01*torch.randn_like(hindsight_goal))
            #    loss = goal_reconstruction_loss(goal, reconstructed_goal)
            #    print("goal_reconstruction_loss", loss.data.item())
            #    partial_backprop(loss)

            # we also learn to predict the actions we just performed when the
            # goal is the observed outcome. this is called hindsight experience
            # replay (but this is a version that doesn't seem to work well)
            #predicted_action_parameters, _ = net.predict_action(hindsight_goal, observations, output_mean=True)
            #loss = action_reconstruction_loss(predicted_action_parameters, torch.Tensor(action_parameters))
            #partial_backprop(loss, [net.goal_encoder])

            # we update the policy and value function following a
            # non-bootstrapped actor-critic approach.
            # we update the state-action value function by computing delta:
            # delta is an unbiased estimator of the difference between the
            # predicted value `value` and the true expected reward (estimated
            # by the observed `goal_reward`).
            # we train the value function to minimize their squared difference delta**2
            delta = goal_reward - value
            print("value", value.data.item())
            reward_value_fun = 0.5 * delta**2
            partial_backprop(reward_value_fun, [])

            # then we update the policy using a policy gradient update, where
            # delta is used as the advantage.
            # note that we detach delta, so that it becomes a scalar and
            # gradients are no longer backpropagated through it
            loss_policy = -delta.detach() * log_prob_action
            partial_backprop(loss_policy)

            ###Hindsight Experience Replay
            #value = net.compute_value(hindsight_goal, observations)[0,0,:]
            #value = value - 1
            #log_prob_action = net.get_log_prob_action(hindsight_goal, observations, actions)
            #goal_reward = env.compute_reward(new_observations[0,0,pen_vars_slice].numpy(), hindsight_goal[0,0,:].detach().numpy(), None)
            #delta2 = goal_reward - value
            #reward_value_fun = 0.5*delta2**2
            #partial_backprop(reward_value_fun)
            #loss_policy = -delta2.detach()*log_prob_action
            #partial_backprop(loss_policy)

            # we define absolute learning progress as the absolute value of the
            # "Bellman" error, `delta`. If delta is high, the reward we got was
            # significantly different from our expectations, which means we
            # updated our policy a lot, which I am interpreting as "you have
            # learned a lot" -- you have made significant learning progress.
            # on the other hand, if delta is very small, you didn't learn
            # anything you didn't already know.
            #learning_progress = torch.abs(delta + delta2)
            learning_progress = torch.abs(delta)
            lps.append(learning_progress.data.item())

            # we use `learning_progress` (lp) as the reward to train the
            # goal-generating process (aka the goal policy).
            # because the agent will be exploring goals in a continual way, we
            # use a "continual learning" method for RL: the average reward
            # method, explained in Sutton and Barto, 2nd edition (10.3, 13.6).
            # In short, average-reward RL uses differential value functions,
            # which estimate the difference between {the expected average
            # reward following a state-action} and {the average reward over all
            # time -- assuming ergodicity} -- called the differential return.
            # this difference measures the transient advantage of performing
            # this action over other actions.
            # there is a version of the policy gradient theorem which shows
            # that using this in place of the expected raw reward of the
            # episodic setting means we are maximizing the average reward of
            # our policy over all time, which is the desired objective in the
            # continual setting.
            # yes, this theory has assumptions like ergodicity, and optimality
            # can only be proven for tabular RL or linear models, not NNs, but
            # these are problems with all of RL theory, really.
            # anyway, `delta` is now the Bellman error measuring the difference
            # between {the previous estimate of the expected differential
            # return (`previous_lp_value`)} and {the new bootstrapped estimate
            # `learning_progress.detach() - average_lp_estimate + lp_value.detach()`}
            delta = (learning_progress.detach() - average_lp_estimate
                     + lp_value.detach() - previous_lp_value)
            # note that we detach the learning progress and lp_value; this is
            # standard for Bellman errors: we don't backpropagate through the
            # bootstrapped target!

            # also update `average_lp_estimate`, the estimate of the average reward
            average_lp_estimate = average_lp_estimate + alpha * delta.detach()

            if iteration > 0 and lp_training:
                # only do this once we have a previous_lp_value
                # update the differential value function for the goal policy
                loss_lp_value_fun = 0.5 * delta**2
                partial_backprop(loss_lp_value_fun, [net.goal_decoder])

                # update the goal policy using policy gradient.
                # we don't update the goal_decoder, because that way we are
                # just training the RNN to produce certain latent vectors.
                # after the autoencoder has trained well, each latent vector
                # represents a goal 1-to-1, and the action decoder can learn to
                # map to the actions corresponding to that goal.
                # if we kept changing the goal_decoder, the action decoder
                # might "get confused", as its actual goal would keep changing
                # even for a fixed input latent vector.
                loss_goal_policy = -delta.detach() * log_prob_goal
                partial_backprop(loss_goal_policy, [])

            optimizer.step()
            previous_lp_value = lp_value

        observations = new_observations
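#%%
# Hedged example: the goal-policy update above is the average-reward
# ("continuing") actor-critic described in Sutton & Barto, 2nd ed.,
# Secs. 10.3 and 13.6. Below is a minimal, self-contained sketch of that
# update rule on a toy 1-D task. The toy dynamics, reward, network sizes and
# the function name are illustrative assumptions, not part of the training
# script above.
import torch
from torch import nn, optim

def average_reward_actor_critic_sketch(steps=100, alpha=0.1):
    policy = nn.Sequential(nn.Linear(1, 16), nn.Tanh(), nn.Linear(16, 2))  # outputs (mean, log_std)
    value = nn.Sequential(nn.Linear(1, 16), nn.Tanh(), nn.Linear(16, 1))   # differential value V
    opt = optim.Adam(list(policy.parameters()) + list(value.parameters()), lr=1e-3)
    average_reward_estimate = 0.0  # running estimate of the average reward (R-bar)
    state = torch.zeros(1, 1)
    for _ in range(steps):
        out = policy(state)
        dist = torch.distributions.Normal(out[:, :1], out[:, 1:].exp())
        action = dist.sample()
        log_prob = dist.log_prob(action).sum()
        next_state = (state + 0.1 * action).detach()  # toy dynamics
        reward = -next_state.pow(2).sum()             # toy reward: stay near 0
        # differential TD error: r - R_bar + V(s') - V(s)
        # (bootstrapped target detached, as in the script above)
        delta = (reward - average_reward_estimate
                 + value(next_state).detach() - value(state))
        average_reward_estimate += alpha * float(delta.detach())  # update R-bar
        critic_loss = 0.5 * delta.pow(2).sum()           # fit the differential value function
        actor_loss = -float(delta.detach()) * log_prob  # policy gradient, delta as advantage
        opt.zero_grad()
        (critic_loss + actor_loss).backward()
        opt.step()
        state = next_state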
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    #env = gym.make("HandManipulatePen-v0")
    #env = gym.make("HandManipulateEgg-v0")
    env = gym.make("FetchSlide-v1")
    #goal_vars_slice = slice(54, 61)
    goal_vars_slice = slice(3, 6)
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    #goal_dim = 7
    goal_dim = 3
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(n_dmp_basis, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, goal_vars_slice, n_hidden=256)

    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
        #print(net.goal_decoder.state_dict().items())
        net.goal_decoder.apply(weight_reset)
        #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()

    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)

    #if rendering:
    #    setup_render(env)

    # PPO-style clipping helper: g(epsilon, delta) equals (1 + epsilon)*delta
    # for non-negative delta and (1 - epsilon)*delta otherwise
    def g(epsilon, delta):
        if delta >= 0:
            return (1 + epsilon) * delta
        else:
            return (1 - epsilon) * delta

    rewards = []
    lps = []
    temp_buffer = []

    # a function that backpropagates a loss without accumulating gradients in
    # certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add a memory buffer)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get desired goals, actions (clean
            # and noisy), and goal log-probabilities
            actions, noisy_actions, noisy_goals, log_prob_goals = net(observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        #run action using DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                #render_with_target(env, goal.detach().numpy())
                env.render()
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal, 0)
            #    else:
            #        goals = np.concatenate([noisy_goals, np.expand_dims(goal, 0)], axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            #if not evaluating and lp_training:
            #    learning_progress = Variable(torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:
            #if not evaluating, then train
            hindsight_goal = new_observations[0, 0, goal_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, goal_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)

            # store the transition; the learning-progress slot (0 for now) is
            # filled in during the goal-policy training phase below
            temp_buffer.append((observations, noisy_actions, hindsight_goals,
                                0, new_observations, log_prob_goals.detach()))

            '''TRAIN GOAL POLICY'''
            if iteration > 0 and (iteration % 20) == 0 and lp_training:
                # several training iterations per phase, so the goal policy
                # updates quickly and can adapt to the learning of the agent
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    print(index)
                    (observations, noisy_actions, hindsight_goals, _,
                     new_observations, log_prob_goals_old) = temp_buffer[index]
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    #hindsight_actions_original = hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        noisy_actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                    new_actions = net.compute_actions(hindsight_goals, observations)
                    # learning progress: relative change of the policy's output
                    # for this goal, caused by the reconstruction update
                    action_difference = torch.norm(
                        new_actions - hindsight_actions) / torch.norm(hindsight_actions)
                    learning_progress = action_difference
                    print("learning progress", learning_progress.data.item())
                    lps.append(learning_progress.data.item())
                    temp_buffer[index] = (observations, noisy_actions,
                                          hindsight_goals,
                                          learning_progress.detach(),
                                          new_observations, log_prob_goals_old)

                for i in range(20):
                    optimizer.zero_grad()
                    index = np.random.choice(range(len(temp_buffer)))
                    (observations, _, hindsight_goals, learning_progress,
                     new_observations, _) = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = (learning_progress
                             + gamma * net.compute_vlp(new_observations).detach()
                             - previous_lp_value)
                    #delta = learning_progress.detach()
                    #print(delta, learning_progress, lp_value, previous_lp_value)
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()

                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    (observations, _, hindsight_goals, learning_progress,
                     new_observations, log_prob_goals_old) = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = (learning_progress
                             + gamma * net.compute_vlp(new_observations).detach()
                             - previous_lp_value)
                    optimizer.zero_grad()
                    #loss_goal_policy = -delta.detach()*log_prob_goals[0,0]
                    #loss_goal_policy = -delta.detach()*torch.exp(log_prob_goals[0,0] - log_prob_goals_old)
                    # PPO-style clipped surrogate objective (see g above)
                    loss_goal_policy = -torch.min(
                        delta.detach() * torch.exp(log_prob_goals[0, 0] - log_prob_goals_old),
                        g(0.5, delta.detach()))
                    partial_backprop(loss_goal_policy)
                    optimizer.step()

                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
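#%%
# Hedged example: loss_goal_policy above is PPO's clipped surrogate objective
# (Schulman et al., 2017, "Proximal Policy Optimization Algorithms"), written
# with the helper g(epsilon, delta) instead of the usual clamp() form. Below
# is a minimal sketch of that loss on its own; the function name
# ppo_clipped_loss is illustrative, not from the repo.
import torch

def ppo_clipped_loss(log_prob_new, log_prob_old, advantage, epsilon=0.5):
    advantage = torch.as_tensor(advantage)  # accept float or tensor advantages
    # g(epsilon, A) = (1 + epsilon)*A if A >= 0 else (1 - epsilon)*A,
    # i.e. the value of clamp(ratio, 1 - epsilon, 1 + epsilon) * A at the clip
    g = (1 + epsilon) * advantage if advantage >= 0 else (1 - epsilon) * advantage
    ratio = torch.exp(log_prob_new - log_prob_old)  # importance ratio
    # PPO maximizes min(ratio * A, g); return the negated objective as a loss
    return -torch.min(ratio * advantage, g)

# e.g., matching the update above, with delta as the advantage estimate:
#   loss_goal_policy = ppo_clipped_loss(log_prob_goals[0, 0],
#                                       log_prob_goals_old, delta.detach())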
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(n_dmp_basis, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_ddpg_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)

    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
        #print(net.goal_decoder.state_dict().items())
        net.goal_decoder.apply(weight_reset)
        #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()

    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)

    if rendering:
        setup_render(env)

    rewards = []
    lps = []
    temp_buffer = []

    # a function that backpropagates a loss without accumulating gradients in
    # certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add a memory buffer)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get desired goals, actions (clean
            # and noisy), and goal log-probabilities
            actions, noisy_actions, noisy_goals, log_prob_goals = net(observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        #run action using DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal, 0)
            #    else:
            #        goals = np.concatenate([noisy_goals, np.expand_dims(goal, 0)], axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:
            #if not evaluating, then train
            pen_vars_slice = slice(54, 61)
            hindsight_goal = new_observations[0, 0, pen_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)
            total_delta = 0

            ## update q value network on desired goal
            #optimizer.zero_grad()
            #value = net.compute_q_value(noisy_goals.detach(), observations, noisy_actions)
            #print("q-value", value.data.item())
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            ## update q value network on hindsight goal.
            ## by definition we have achieved the hindsight goal, so the reward is 0.0 (achieved goal)
            #goal_reward = 0.0
            #optimizer.zero_grad()
            #value = net.compute_q_value(hindsight_goals.detach(), observations, noisy_actions)
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            # update the policy to achieve actions with a higher q value.
            # this is good to make sure we learn about actions for goals which
            # are achievable.
            # TODO: alternative to try: because in this environment it is
            # probably unlikely that several actions lead to the same outcome,
            # we could also train the action decoder directly on the hindsight
            # goal, without any problem for intrinsic motivation, I think
            if np.random.rand() <= 1.0:
                #print(net.action_decoder.state_dict().items())
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    if i == 0:
                        hindsight_actions_original = hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                for i in range(1):
                    optimizer.zero_grad()
                    # push the policy's output for *perturbed* goals away from
                    # the taken action (note the negated loss)
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach()
                        + 0.1 * torch.rand_like(hindsight_goals), observations)
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(-action_reconstruction_loss)
                    optimizer.step()
                new_actions = net.compute_actions(hindsight_goals, observations)
                action_difference = torch.norm(
                    new_actions - hindsight_actions_original) / torch.norm(
                        hindsight_actions_original)
            else:
                actions = net.compute_actions(noisy_goals, observations)
                optimizer.zero_grad()
                loss_policy = -net.compute_q_value(noisy_goals.detach(),
                                                   observations, actions)
                partial_backprop(loss_policy, [net.q_value_decoder])
                optimizer.step()
                new_actions = net.compute_actions(noisy_goals, observations)
                action_difference = torch.norm(new_actions
                                               - actions) / torch.norm(actions)

            '''COMPUTE LEARNING PROGRESS'''
            print("action difference", action_difference)
            #learning_progress = torch.abs(delta) + action_difference
            #learning_progress = torch.max(total_delta, action_difference)
            #learning_progress = 0.1*torch.abs(total_delta) + 2*action_difference
            learning_progress = action_difference
            #learning_progress = action_difference + 0.001*goal_reward
            learning_progress *= 10
            #learning_progress = goal_reward
            print("learning progress", learning_progress.data.item())
            lps.append(learning_progress.data.item())

            temp_buffer.append((observations, hindsight_goals,
                                learning_progress.detach(), new_observations))

            '''TRAIN GOAL POLICY'''
            if iteration % 20 == 19 and lp_training:
                # several training iterations per phase, so the goal policy
                # updates quickly and can adapt to the learning of the agent
                for i in range(20):
                    optimizer.zero_grad()
                    # np.random.choice can't sample from a list of tuples, so
                    # sample an index instead (as in the variant above)
                    index = np.random.choice(len(temp_buffer))
                    (observations, hindsight_goals, learning_progress,
                     new_observations) = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = (learning_progress
                             + gamma * net.compute_vlp(new_observations).detach()
                             - previous_lp_value)
                    #delta = learning_progress.detach()
                    #print(delta, learning_progress, lp_value, previous_lp_value)
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()
                for i in range(20):
                    index = np.random.choice(len(temp_buffer))
                    (observations, hindsight_goals, learning_progress,
                     new_observations) = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = (learning_progress
                             + gamma * net.compute_vlp(new_observations).detach()
                             - previous_lp_value)
                    optimizer.zero_grad()
                    loss_goal_policy = -delta.detach() * log_prob_goals[0, 0]
                    partial_backprop(loss_goal_policy)
                    optimizer.step()
                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
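#%%
# Hedged example: in this variant, learning progress is measured as the
# relative change in the policy's output for a hindsight goal, before vs.
# after one hindsight reconstruction step. Minimal self-contained sketch of
# that measurement; the linear `policy`, the dimensions and the random data
# are illustrative assumptions, not the repo's GOALRNN.
import torch
from torch import nn, optim

def action_difference_lp_sketch(goal_dim=7, obs_dim=61, act_dim=20):
    policy = nn.Linear(goal_dim + obs_dim, act_dim)  # (goal, observation) -> action parameters
    opt = optim.Adam(policy.parameters())
    goal, obs = torch.randn(goal_dim), torch.randn(obs_dim)
    taken_action = torch.randn(act_dim)  # the noisy action whose outcome was `goal`
    inp = torch.cat([goal, obs])
    before = policy(inp).detach()
    # hindsight reconstruction: pull the policy's output toward the action
    # that actually produced this goal
    loss = 0.5 * torch.norm(taken_action - policy(inp)) ** 2
    opt.zero_grad()
    loss.backward()
    opt.step()
    after = policy(inp).detach()
    # learning progress = how much the policy moved for this goal, relative
    # to where it was before the update
    return torch.norm(after - before) / torch.norm(before)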
rewards = []
lps = []
for iteration in range(1000000):
    if not evaluating:
        action, log_prob_action, goal, log_prob_goal, value, lp_value = net(
            observations,
            learning_progress.detach().unsqueeze(0).unsqueeze(0))
        goal = Variable(goal.data, requires_grad=True)
        action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = (
            action[0, 0, :], log_prob_action[0, 0], goal[0, 0, :],
            log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :])
    else:
        action_parameters = net.predict_action(goal)
        print(action_parameters.shape)
    action_parameters = action_parameters.detach().numpy()

    for i in range(n_simulation_steps):
        action = dmp.action_rollout(None, action_parameters, i)
        results = env.step(action)
        if evaluating:
            # env.step returns a tuple; the goal dict lives in its first element
            pen_goal = results[0]["desired_goal"]
            goal = np.tile(np.expand_dims(pen_goal, 0), (n_steps, 1))
            goal = np.reshape(goal.T, (-1))
        if rendering:
            env.render()
        obs = results[0]["observation"]
        done = results[2]
        if done:
            print("resetting environment")
            results = env.reset()
            reset_env = True
            break
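#%%
# Hedged example: every high-level action in these loops is executed as a DMP
# rollout (dmp.action_rollout) over n_simulation_steps low-level steps. The
# repo's dmp.DMP is not shown in this file; below is a generic sketch of a
# discrete movement primitive with n_dmp_basis Gaussian basis functions per
# actuator, consistent with action_dim = n_actuators * (n_dmp_basis + 1)
# (basis weights plus one extra parameter per actuator). Every detail of this
# class is an assumption about the interface, not the repo's implementation.
import numpy as np

class SketchDMP:
    def __init__(self, n_basis, n_steps, n_actuators):
        self.n_basis, self.n_steps, self.n_actuators = n_basis, n_steps, n_actuators
        self.centers = np.linspace(0, 1, n_basis)  # basis centers over the phase
        self.width = n_basis ** 1.5                # basis sharpness

    def action_rollout(self, start, parameters, i):
        p = np.asarray(parameters).reshape(self.n_actuators, self.n_basis + 1)
        weights, offset = p[:, :-1], p[:, -1]
        t = i / (self.n_steps - 1)  # phase variable in [0, 1]
        phi = np.exp(-self.width * (t - self.centers) ** 2)
        forcing = weights @ phi / (phi.sum() + 1e-8)  # basis-weighted forcing term
        return np.tanh(forcing + t * offset)          # bounded actuator command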