def test_act_and_train(self):
    n_updates = 10
    n_dims = 2
    n_bfs = 4
    n_reps = 3
    duration = 0.5
    dt = 0.01
    dmps = [[] for _ in range(n_dims)]
    n_times = int(duration / dt)
    env = Point(dmps, n_dims, n_bfs, n_times, duration, dt)
    agent = MDSKDE(dmps=dmps,
                   n_updates=n_updates,
                   n_reps=n_reps,
                   n_dims=n_dims,
                   n_bfs=n_bfs,
                   n=5,
                   times=1.0,
                   r_gain=10**(-8),
                   r_normalize=False)
    for i in range(n_dims):
        dmps[i] = DMP(n_bfs, duration, 0.5, dt, True)
        dmps[i].reset()
        dmps[i].set_goal(env.gxi[i])
        agent.theta[i] = dmps[i].minimize_jerk()
    reward = 0
    obs = env.reset()
    t = 0
    k = 0
    agent.act_and_train(obs, reward, t, k)
def callback(self, msg):
    if self.pose_0 is None:  # first message: initialize both samples
        self.pose_0 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
        self.pose_1 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
    else:
        self.pose_0 = self.pose_1
        self.pose_1 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
    if np.sum(np.abs(np.array(self.pose_0) - np.array(self.pose_1))) > 0.01:
        print('Moving')
        self.traj.append(self.pose_1)
    else:
        if len(self.traj) > 5:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :],
                      path[-1, :],
                      Nb=500,
                      dt=0.01,
                      d=path.shape[1],
                      jnames=self.name)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            #print(params)
            dmp.reset_state()
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, dy_r, ddy_r = dmp.rollout()
            plt.subplot(1, 2, 1)
            plt.cla()
            plt.plot(y_r)
            plt.subplot(1, 2, 2)
            plt.cla()
            plt.plot(path)
            plt.draw()
            plt.pause(0.1)
            self.traj = []
        print('Stationary')
def callback(self, msg):
    if self.js_prev is None:  # first message: initialize both samples
        self.js_prev = msg
        self.js_current = msg
    else:
        self.js_prev = self.js_current
        self.js_current = msg
    # If the joints have changed by more than a small threshold amount, the
    # arm is moving: start recording joint angles.
    if np.sum(
            np.abs(
                np.array(self.js_prev.position) -
                np.array(self.js_current.position))) > 0.01:
        print('Moving')
        j_list = [
            msg.position[msg.name.index(j)] for j in self.joint_names
        ]
        self.traj.append(j_list)
    elif len(self.traj) > 0:
        # Fit DMP to motion segment
        path = np.array(self.traj)
        dmp = DMP(path[0, :],
                  path[-1, :],
                  num_basis_funcs=500,
                  dt=0.01,
                  d=path.shape[1],
                  jnames=self.joint_names)
        _, weights = imitate_path(
            path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]), dmp)
        print("Learned weights: {}".format(weights))
        dmp.weights = weights
        self.dmp_count += 1
        # Save it with pickle (binary mode)
        with open("./dmp{}.npy".format(self.dmp_count), "wb") as f:
            pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
        # Plot the path imitated by the learned DMP (rolled out) against the
        # actual trajectory path
        y_r, dy_r, _ = dmp.rollout()
        plot_rollout(y_r, path)
        jt = build_jt(msg.header, self.joint_names, y_r, dy_r)
        self.traj_pub.publish(jt)
        self.traj = []
    else:
        print("Stationary")
def callback(self, msg):
    if self.pose_prev is None:  # first message: initialize both samples
        self.pose_prev = [msg.position.x, msg.position.y, msg.position.z,
                          msg.orientation.x, msg.orientation.y,
                          msg.orientation.z, msg.orientation.w]
        self.pose_current = [msg.position.x, msg.position.y, msg.position.z,
                             msg.orientation.x, msg.orientation.y,
                             msg.orientation.z, msg.orientation.w]
    else:
        self.pose_prev = self.pose_current
        self.pose_current = [msg.position.x, msg.position.y, msg.position.z,
                             msg.orientation.x, msg.orientation.y,
                             msg.orientation.z, msg.orientation.w]
    if np.sum(np.abs(np.array(self.pose_prev) -
                     np.array(self.pose_current))) > 0.01:
        print('Moving')
        self.traj.append(self.pose_current)
    else:
        if len(self.traj) > 5:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :], path[-1, :], num_basis_funcs=500, dt=0.01,
                      d=path.shape[1], jnames=self.name)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            print(params)
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, _, _ = dmp.rollout()
            plot_rollout(y_r, path)
            self.traj = []
        print('Stationary')
def callback(self, msg):
    if self.js_0 is None:  # first message: initialize both samples
        self.js_0 = msg
        self.js_1 = msg
    else:
        self.js_0 = self.js_1
        self.js_1 = msg
    if np.sum(
            np.abs(
                np.array(self.js_0.position) -
                np.array(self.js_1.position))) > 0.01:
        print('Moving')
        # msg.name.index() raises ValueError if a joint is missing, so no
        # extra check is needed (truth-testing the index would wrongly skip
        # index 0).
        j_list = [msg.position[msg.name.index(joint)]
                  for joint in self.joint_names]
        self.traj.append(j_list)
    else:
        if len(self.traj) > 0:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :],
                      path[-1, :],
                      Nb=500,
                      dt=0.01,
                      d=path.shape[1],
                      jnames=self.joint_names)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            #print(params)
            dmp.reset_state()
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, dy_r, ddy_r = dmp.rollout()
            plt.subplot(1, 2, 1)
            plt.cla()
            plt.plot(y_r)
            plt.subplot(1, 2, 2)
            plt.cla()
            plt.plot(path)
            plt.draw()
            plt.pause(0.1)
            self.jt.header = msg.header
            self.jt.joint_names = self.joint_names
            for i in range(y_r.shape[0]):
                # build a fresh point each step; reusing one
                # JointTrajectoryPoint would append the same object repeatedly
                jtp = JointTrajectoryPoint()
                jtp.positions = y_r[i, :].tolist()
                jtp.velocities = dy_r[i, :].tolist()
                jtp.time_from_start = rospy.Duration(1.0)
                self.jt.points.append(jtp)
            self.traj_pub.publish(self.jt)
            self.traj = []
        print('Stationary')
        self.jt = JointTrajectory()
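# The callbacks above all share one motion-segmentation pattern: threshold the
# change between consecutive samples, buffer while moving, and fit a DMP once
# motion stops. A minimal, ROS-free sketch of that pattern (the function name
# is illustrative; the 0.01 threshold mirrors the code above):
import numpy as np

MOTION_THRESHOLD = 0.01  # summed absolute change between consecutive samples

def segment_stream(samples):
    """Yield motion segments: consecutive runs where the signal is changing."""
    traj = []
    prev = None
    for s in samples:
        s = np.asarray(s, dtype=float)
        if prev is not None and np.sum(np.abs(s - prev)) > MOTION_THRESHOLD:
            traj.append(s)        # moving: keep recording
        elif traj:
            yield np.array(traj)  # stationary: emit the finished segment
            traj = []
        prev = s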
from dmp import DynamicMovementPrimitive as DMP
import numpy as np
import matplotlib.pyplot as plt
from plots import plot as Plot

# Initialize the DMP class
my_dmp = DMP(20, 20, 0)

# Filtering parameters
window = 5
blends = 10

# Load the demo data
data = DMP.load_demo("demos/demo1.txt")

# Obtain the joint position data and the time vector
t, q = DMP.parse_demo(data)

# Get the phase from the time vector
phase = my_dmp.phase(t)
Plot.phase(phase)
print(q.shape)

# Normalize the time vector
t = DMP.normalize_vector(t)

# Compute velocity and acceleration
dq = np.zeros(q.shape)
ddq = np.zeros(q.shape)
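# The demo script above allocates dq and ddq but stops before filling them. A
# minimal completion sketch, assuming q has shape (n_samples, n_joints) and t
# is the (normalized) time vector; np.gradient uses central differences in the
# interior and one-sided differences at the boundaries:
for j in range(q.shape[1]):
    dq[:, j] = np.gradient(q[:, j], t)
    ddq[:, j] = np.gradient(dq[:, j], t)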
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_rnn import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)

    # load the RNN if saved. TODO: save only the weights, to avoid bugs after
    # the architecture changes
    if os.path.isfile("lprnn" + experiment_name + ".pt"):
        temp_net = torch.load("lprnn" + experiment_name + ".pt")
        net.load_state_dict(temp_net.state_dict())
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    optimizer = optim.RMSprop(net.parameters())
    goal_loss = nn.MSELoss()
    goal_reconstruction_loss = nn.MSELoss()
    action_reconstruction_loss = nn.MSELoss()

    # initial values of several variables
    previous_goal_reward = torch.Tensor([-1.0])
    previous_lp_value = torch.zeros(1)
    learning_progress = torch.zeros(1)
    #average_reward_estimate = 0
    average_lp_estimate = 0

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()

    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        #goal = np.tile(np.expand_dims(pen_goal,0),(n_steps,1))
        #goal = np.reshape(goal.T,(-1))
        goal = torch.Tensor(pen_goal)

    if rendering:
        setup_render(env)

    rewards = []
    lps = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    pen_vars_slice = slice(54, 61)
    pen_pos_center = torch.Tensor([1.0, 0.90, 0.15]).unsqueeze(0).unsqueeze(0)
    print(observations.shape)
    reset_env = False

    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters, _ = net.predict_action(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions (and
            # their probabilities), and the predicted values of action and goal
            actions, log_prob_action, goal, log_prob_goal, value, lp_value = net(
                observations,
                learning_progress.detach().unsqueeze(0).unsqueeze(0))
            # learn the difference between the value and -1, because at the
            # beginning most values will be close to -1
            value = value - 1
            pen_pos = observations[:, :, pen_vars_slice][..., :3]
            pen_rot = observations[:, :, pen_vars_slice][..., 3:]
            rot_goal = goal[:, :, 3:]
            rel_rot_goal = (rot_goal - pen_rot) * 0.1 + pen_rot
            goal = torch.cat([(goal[:, :, :3] - pen_pos) * 0.002 + pen_pos,
                              rel_rot_goal / np.linalg.norm(rel_rot_goal)],
                             dim=2)
            #goal += 0.05*torch.randn_like(goal)
            goal = Variable(goal.data, requires_grad=True)
            if lp_training:
                action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = \
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :], \
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters, log_prob_action, _, _, value, lp_value = \
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :], \
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                #torch.save(net, "lprnn.pt")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                lps = []
            if save_goals:
                if iteration == 0:
                    goals = np.expand_dims(goal, 0)
                else:
                    goals = np.concatenate([goals, np.expand_dims(goal, 0)],
                                           axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        # we detach the RNN every so often, to bound how far back we
        # backpropagate through time.
        # TODO: do this in a way that doesn't start from scratch, but instead
        # backpropagates forget_freq iterations into the past at *every* step
        if iteration % forget_freq == forget_freq - 1:
            net.forget()

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                previous_lp_value = lp_value
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            optimizer.zero_grad()
            # train the policy to maximize the goal reward (which is
            # -goal_loss, i.e. -|observation - goal|^2); here we only look at
            # the pen part of the observation
            #goal_reward = -1*goal_loss(new_observations[0,0,pen_vars_slice],goal)
            #goal_reward = goal_reward*(goal_reward>-1)
            goal_reward = env.compute_reward(
                new_observations[0, 0, pen_vars_slice].numpy(),
                goal.detach().numpy(), None)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            rewards.append(goal_reward)
            hindsight_goal = new_observations[:, :, pen_vars_slice]
            #if iteration <= 100000:
            #    # train the goal reconstruction part of the network, using the
            #    # hindsight_goal (the outcome of our action) to autoencode
            #    # reachable goals and explore more effectively
            #    reconstructed_goal = net.autoencode_goal(hindsight_goal+0.01*torch.randn_like(hindsight_goal))
            #    loss = goal_reconstruction_loss(goal, reconstructed_goal)
            #    print("goal_reconstruction_loss", loss.data.item())
            #    partial_backprop(loss)

            # we could also learn to predict the actions we just performed when
            # the goal is the observed outcome; this is hindsight experience
            # replay (but this version doesn't seem to work well)
            #predicted_action_parameters,_ = net.predict_action(hindsight_goal,observations, output_mean=True)
            #loss = action_reconstruction_loss(predicted_action_parameters, torch.Tensor(action_parameters))
            #partial_backprop(loss, [net.goal_encoder])

            # we update the policy and value function following a
            # non-bootstrapped actor-critic approach: delta is an unbiased
            # estimator of the difference between the predicted value `value`
            # and the true expected reward (estimated by the observed
            # `goal_reward`); we train the value function to minimize their
            # squared difference delta**2
            delta = goal_reward - value
            print("value", value.data.item())
            reward_value_fun = 0.5 * delta**2
            partial_backprop(reward_value_fun, [])

            # then we update the policy with a policy gradient step, using
            # delta as the advantage; note that we detach delta, so it becomes
            # a scalar and no gradients are backpropagated through it
            loss_policy = -delta.detach() * log_prob_action
            partial_backprop(loss_policy)

            ### Hindsight Experience Replay
            #value = net.compute_value(hindsight_goal, observations)[0,0,:]
            #value = value - 1
            #log_prob_action = net.get_log_prob_action(hindsight_goal, observations, actions)
            #goal_reward = env.compute_reward(new_observations[0,0,pen_vars_slice].numpy(), hindsight_goal[0,0,:].detach().numpy(), None)
            #delta2 = goal_reward - value
            #reward_value_fun = 0.5*delta2**2
            #partial_backprop(reward_value_fun)
            #loss_policy = -delta2.detach()*log_prob_action
            #partial_backprop(loss_policy)

            # we define absolute learning progress as the absolute value of the
            # "Bellman" error `delta`. If delta is high, the reward differed
            # significantly from our expectations, so we updated the policy a
            # lot, which I interpret as significant learning progress; if delta
            # is very small, we didn't learn anything we didn't already know.
            #learning_progress = torch.abs(delta+delta2)
            learning_progress = torch.abs(delta)
            lps.append(learning_progress.data.item())

            # we use `learning_progress` (lp) as the reward to train the
            # goal-generating process (the goal policy). Because the agent
            # explores goals continually, we use a continual-learning RL
            # method: the average reward method (Sutton and Barto, 2nd ed.,
            # secs. 10.3 and 13.6). Average-reward RL uses differential value
            # functions, which estimate the difference between {the expected
            # average reward following a state-action pair} and {the average
            # reward over all time, assuming ergodicity} -- the differential
            # return. This difference measures the transient advantage of
            # performing this action over others. A version of the policy
            # gradient theorem shows that using it in place of the expected raw
            # reward of the episodic setting maximizes the average reward over
            # all time, the desired objective in the continual setting. This
            # theory assumes ergodicity, and optimality is only provable for
            # tabular RL or linear models, not NNs, but those are problems with
            # all of RL theory really.
            # `delta` is now the Bellman error between {the previous estimate
            # of the expected differential return (`previous_lp_value`)} and
            # {the new bootstrapped estimate
            # `learning_progress.detach() - average_lp_estimate + lp_value.detach()`}
            delta = learning_progress.detach() - average_lp_estimate \
                + lp_value.detach() - previous_lp_value
            # note that we detach learning_progress and lp_value; this is
            # standard for Bellman errors: we don't backpropagate through the
            # bootstrapped target!
            # also update `average_lp_estimate`, the estimate of the average reward
            average_lp_estimate = average_lp_estimate + alpha * delta.detach()

            if iteration > 0 and lp_training:
                # only do this once we have a previous_lp_value
                # update the differential value function for the goal policy
                loss_lp_value_fun = 0.5 * delta**2
                partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                # update the goal policy using policy gradient. We don't update
                # the goal_decoder: once the autoencoder has trained well, each
                # latent vector represents a goal 1-to-1, and the action
                # decoder can learn the actions corresponding to that goal. If
                # we kept changing the goal_decoder, the action decoder could
                # "get confused", as its actual goal would change even for a
                # fixed latent vector.
                loss_goal_policy = -delta.detach() * log_prob_goal
                partial_backprop(loss_goal_policy, [])

            optimizer.step()
            previous_lp_value = lp_value

        observations = new_observations
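# A minimal, self-contained sketch of the average-reward actor-critic update
# used above (Sutton and Barto, 2nd ed., secs. 10.3 and 13.6). The function
# name and arguments are illustrative stand-ins for the network outputs, not
# this code's exact API; reward is a float, v_prev/v_next/log_prob are scalar
# tensors.
import torch

def average_reward_ac_update(reward, v_prev, v_next, log_prob, avg_reward,
                             alpha=0.1):
    # differential TD error: reward relative to the long-run average,
    # bootstrapped with the next differential value
    delta = reward - avg_reward + v_next.detach() - v_prev
    value_loss = 0.5 * delta ** 2             # train the differential value fn
    policy_loss = -delta.detach() * log_prob  # policy gradient, delta as advantage
    avg_reward = avg_reward + alpha * float(delta.detach())  # track avg reward
    return value_loss, policy_loss, avg_reward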
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_ddpg_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    if rendering:
        setup_render(env)

    rewards = []
    lps = []
    temp_buffer = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add one)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions, and
            # the predicted values of action and goal
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], \
                    noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            pen_vars_slice = slice(54, 61)
            hindsight_goal = new_observations[0, 0, pen_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)
            total_delta = 0

            ## update the q-value network on the desired goal
            #optimizer.zero_grad()
            #value = net.compute_q_value(noisy_goals.detach(), observations, noisy_actions)
            #print("q-value", value.data.item())
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            ## update the q-value network on the hindsight goal:
            ## by definition we have achieved the hindsight goal, so the reward is 0.0
            #goal_reward = 0.0
            #optimizer.zero_grad()
            #value = net.compute_q_value(hindsight_goals.detach(), observations, noisy_actions)
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            # update the policy towards actions with a higher q-value; this
            # helps it learn about actions for goals which are achievable.
            # TODO alternative: because several actions leading to the same
            # outcome is probably unlikely in this environment, we could train
            # the action decoder directly on the hindsight goal without any
            # problem for intrinsic motivation, I think.
            if np.random.rand() <= 1.0:
                #print(net.action_decoder.state_dict().items())
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    if i == 0:
                        hindsight_actions_original = \
                            hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for a perturbed
                    # hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach() +
                        0.1 * torch.rand_like(hindsight_goals), observations)
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(-action_reconstruction_loss)
                    optimizer.step()
                new_actions = net.compute_actions(hindsight_goals,
                                                  observations)
                action_difference = torch.norm(
                    new_actions - hindsight_actions_original) / torch.norm(
                        hindsight_actions_original)
            else:
                actions = net.compute_actions(noisy_goals, observations)
                optimizer.zero_grad()
                loss_policy = -net.compute_q_value(noisy_goals.detach(),
                                                   observations, actions)
                partial_backprop(loss_policy, [net.q_value_decoder])
                optimizer.step()
                new_actions = net.compute_actions(noisy_goals, observations)
                action_difference = torch.norm(
                    new_actions - actions) / torch.norm(actions)

            '''COMPUTE LEARNING PROGRESS'''
            print("action difference", action_difference)
            #learning_progress = torch.abs(delta) + action_difference
            #learning_progress = torch.max(total_delta,action_difference)
            #learning_progress = 0.1*torch.abs(total_delta)+2*action_difference
            learning_progress = action_difference
            #learning_progress = action_difference + 0.001*goal_reward
            learning_progress *= 10
            #learning_progress = goal_reward
            print("learning progress", learning_progress.data.item())
            lps.append(learning_progress.data.item())
            temp_buffer.append((observations, hindsight_goals,
                                learning_progress.detach(), new_observations))

            '''TRAIN GOAL POLICY'''
            if iteration % 20 == 19 and lp_training:
                # only do this once we have a previous_lp_value; several
                # training iterations so the goal policy updates quickly and
                # adapts to the agent's learning
                for i in range(20):
                    optimizer.zero_grad()
                    # np.random.choice cannot sample from a list of tuples, so
                    # sample an index instead
                    index = np.random.choice(len(temp_buffer))
                    observations, hindsight_goals, learning_progress, \
                        new_observations = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()
                for i in range(20):
                    index = np.random.choice(len(temp_buffer))
                    observations, hindsight_goals, learning_progress, \
                        new_observations = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    loss_goal_policy = -delta.detach() * log_prob_goals[0, 0]
                    partial_backprop(loss_goal_policy)
                    optimizer.step()
                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    #env = gym.make("HandManipulatePen-v0")
    #env = gym.make("HandManipulateEgg-v0")
    env = gym.make("FetchSlide-v1")
    #goal_vars_slice = slice(54, 61)
    goal_vars_slice = slice(3, 6)
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    #goal_dim = 7
    goal_dim = 3
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, goal_vars_slice, n_hidden=256)
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    #if rendering:
    #    setup_render(env)

    def g(epsilon, delta):
        # clipped branch of the PPO-style surrogate objective
        if delta >= 0:
            return (1 + epsilon) * delta
        else:
            return (1 - epsilon) * delta

    rewards = []
    lps = []
    temp_buffer = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add one)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions, and
            # the predicted values of action and goal
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], \
                    noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                #render_with_target(env, goal.detach().numpy())
                env.render()
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            #if not evaluating and lp_training:
            #    learning_progress = Variable(torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            hindsight_goal = new_observations[0, 0, goal_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, goal_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)
            temp_buffer.append((observations, noisy_actions, hindsight_goals,
                                0, new_observations, log_prob_goals.detach()))

            '''TRAIN GOAL POLICY'''
            if iteration > 0 and (iteration % 20) == 0 and lp_training:
                # only do this once we have a previous_lp_value; several
                # training iterations so the goal policy updates quickly and
                # adapts to the agent's learning
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    print(index)
                    observations, noisy_actions, hindsight_goals, _, \
                        new_observations, log_prob_goals_old = \
                        temp_buffer[index]
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    #hindsight_actions_original = hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        noisy_actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                    new_actions = net.compute_actions(hindsight_goals,
                                                      observations)
                    action_difference = torch.norm(
                        new_actions - hindsight_actions) / torch.norm(
                            hindsight_actions)
                    learning_progress = action_difference
                    print("learning progress",
                          learning_progress.data.item())
                    lps.append(learning_progress.data.item())
                    temp_buffer[index] = (observations, noisy_actions,
                                          hindsight_goals,
                                          learning_progress.detach(),
                                          new_observations,
                                          log_prob_goals_old)
                for i in range(20):
                    optimizer.zero_grad()
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, \
                        new_observations, _ = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, \
                        new_observations, log_prob_goals_old = \
                        temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    #loss_goal_policy = -delta.detach()*log_prob_goals[0,0]
                    #loss_goal_policy = -delta.detach()*torch.exp(log_prob_goals[0,0]-log_prob_goals_old)
                    # PPO-style clipped surrogate objective
                    loss_goal_policy = -torch.min(
                        delta.detach() *
                        torch.exp(log_prob_goals[0, 0] - log_prob_goals_old),
                        g(0.5, delta.detach()))
                    partial_backprop(loss_goal_policy)
                    optimizer.step()
                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
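# The goal-policy update above combines an importance-sampling ratio with the
# g(epsilon, delta) helper defined earlier; together they form the PPO clipped
# surrogate objective (Schulman et al., 2017). A standalone sketch with
# illustrative names:
import torch

def clipped_surrogate_loss(delta, log_prob_new, log_prob_old, epsilon=0.5):
    ratio = torch.exp(log_prob_new - log_prob_old)  # pi_new / pi_old
    # g(epsilon, delta): (1+eps)*delta for a positive advantage,
    # (1-eps)*delta for a negative one -- the clipped branch of the objective
    clipped = torch.where(delta >= 0, (1 + epsilon) * delta,
                          (1 - epsilon) * delta)
    # maximize the minimum of the unclipped and clipped terms
    return -torch.min(ratio * delta, clipped)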
def run(args):
    if args.seed is not None:
        np.random.seed(args.seed)

    # Set default parameters that depend on the environment
    if args.n_dims is None:
        if args.env in ('Point', 'Arm'):
            args.n_dims = 2
        elif args.env in ('OneDof', 'Quadratic'):
            args.n_dims = 1
        else:
            raise NotImplementedError
    if args.n_bfs is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.n_bfs = 10
        elif args.env == 'Quadratic':
            args.n_bfs = 1
        else:
            raise NotImplementedError
    if args.dt is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.dt = 0.01
        elif args.env == 'Quadratic':
            args.dt = 1.0
        else:
            raise NotImplementedError
    if args.duration is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.duration = 0.5
        elif args.env == 'Quadratic':
            args.duration = 2.0
        else:
            raise NotImplementedError

    # Set default parameters that depend on the agent. Every agent shares the
    # same default; only normalization changes it (earlier tuned values such
    # as 1.0/100000.0 were tried for MDS/AMDS/MDSKDE).
    known_agents = ('PI2', 'MDS', 'GMDS', 'AMDS', 'MDSKDE', 'REPS', 'AREPS')
    if args.r_gain is None:
        if args.agent not in known_agents:
            raise NotImplementedError
        args.r_gain = 10 if args.r_normalize else 10**8
    if args.std is None:
        if args.agent not in known_agents:
            raise NotImplementedError
        args.std = 1 if args.agent == 'MDSKDE' else 10

    if args.plotdir is not None:
        if not args.plot:
            args.plot = []
        for d in args.plotdir:
            directory = glob.glob('{}/*'.format(os.path.normpath(d)))
            args.plot.extend(directory)

    # Check parameters
    if args.env == 'Point':
        assert 2 == args.n_dims, \
            'Number Error: n_dims of the Point task allows only 2 dimensions'
    elif args.env == 'OneDof':
        assert 1 == args.n_dims, \
            'Number Error: n_dims of the OneDof task allows only 1 dimension'
    if args.env == 'Quadratic':
        assert 1 == args.n_bfs, \
            'Number Error: n_bfs of the Quadratic task allows only 1 dimension'
        assert 2 == args.duration, \
            'Number Error: duration of the Quadratic task allows only 2 [s]'
        assert 1 == args.dt, \
            'Number Error: dt of the Quadratic task allows only 1 [s]'

    args.outdir = prepare_output_dir(args, args.outdir, argv=sys.argv)
    Debug.set_outdir(args.outdir)
    Debug.set_n_updates(args.n_updates)

    # the Quadratic task does not use a DMP
    if args.env == 'Quadratic':
        dmps = None
    else:
        dmps = [[] for _ in range(args.n_dims)]
    n_times = int(args.duration / args.dt)

    # set environment
    if args.env == 'Arm' or args.env == 'Point':
        env = eval(args.env)(dmps, args.n_dims, args.n_bfs, n_times,
                             args.duration, args.dt)
    # elif args.env == 'Quadratic':
    #     env = eval(args.env)(args.n_dims)
    else:
        # TODO
        raise NotImplementedError

    # set agent
    if args.agent == 'PI2':
        agent = PI2(dmps, args.n_updates, args.n_reps, args.n_dims,
                    args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'GMDS':
        agent = GMDS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'MDS':
        agent = MDS(args.n_updates, args.n_reps, args.n_dims, args.n_bfs,
                    n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'AMDS':
        agent = AMDS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'MDSKDE':
        agent = MDSKDE(args.n_updates, args.n_reps, args.n_dims, args.n_bfs,
                       n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'REPS':
        agent = REPS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'AREPS':
        agent = AREPS(dmps, args.n_updates, args.n_reps, args.n_dims,
                      args.n_bfs, n_times, args.r_gain, args.r_normalize)
    else:
        raise NotImplementedError

    # Set up the DMPs, initializing each with a minimum-jerk trajectory
    if args.env == 'Point' or args.env == 'Arm':
        for i in range(args.n_dims):
            dmps[i] = DMP(args.n_bfs, args.duration, 0.5, args.dt, True)
            dmps[i].reset()
            dmps[i].set_goal(env.gxi[i])
            agent.theta[i] = dmps[i].minimize_jerk()

    if args.plot is None:
        learn(args, agent, env, n_times)
    else:
        plot(args)
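# dmps[i].minimize_jerk() is not shown in this excerpt; for reference, the
# standard minimum-jerk profile it presumably encodes is the quintic
# x(t) = x0 + (g - x0) * (10*tau^3 - 15*tau^4 + 6*tau^5), with tau = t/T,
# which has zero velocity and acceleration at both endpoints. A minimal sketch
# of that profile (function name is illustrative):
import numpy as np

def min_jerk_profile(x0, g, duration, dt):
    tau = np.arange(0.0, duration, dt) / duration
    return x0 + (g - x0) * (10 * tau**3 - 15 * tau**4 + 6 * tau**5)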
goal_loss = nn.MSELoss()
goal_reconstruction_loss = nn.MSELoss()

previous_goal_reward = torch.Tensor([-10])
# previous_value = torch.zeros(1)
previous_lp_value = torch.zeros(1)
learning_progress = torch.zeros(1)
average_reward_estimate = 0
average_lp_estimate = 0
alpha = 0.1

env.relative_control = True
from dmp import DMP
n_simulation_steps = 25
dmp = DMP(10, n_simulation_steps, n_actuators)

#%%
save_goals = False
#rendering = True
rendering = False
evaluating = False
save_freq = 300
forget_freq = 300
# forget_freq = 2

action = env.action_space.sample()
observation = env.step(action)[0]["observation"]
observations = np.expand_dims(np.expand_dims(observation, 0), 0)
observations = torch.Tensor(observations)
parser.add_argument('--camera', type=int, default=0)
parser.add_argument('--robot', action='store_true')
args = parser.parse_args()

g_net_param = torch.load(args.model_path,
                         map_location='cpu') if args.model_path else None
if g_net_param:
    cfg = g_net_param['config']
else:
    cfg = Config()

logger = SummaryWriter(os.path.join(cfg.log_path, cfg.experiment_name))
torch.cuda.set_device(0)
if cfg.use_DMP:
    dmp = DMP(cfg)

# loaders
generator_train = build_loader(cfg, True)   # function pointer
generator_test = build_loader(cfg, False)   # function pointer


class CMP(object):
    def __init__(self, config):
        self.cfg = config
        self.condition_net = NN_img_c(
            sz_image=self.cfg.image_size,
            ch_image=self.cfg.image_channels,
            tasks=self.cfg.number_of_tasks,
            task_img_sz=(self.cfg.object_size[0]
                         if self.cfg.img_as_task else -1))
def main(argv):
    '''PREPPING UP variables'''
    global FLAGS
    FLAGS = FLAGS.flag_values_dict()
    # print(FLAGS)
    globals().update(FLAGS)

    context_dim = env.observation_space["observation"].shape[0]
    FLAGS["context_dim"] = context_dim
    n_steps = n_simulation_steps // outcome_sampling_frequency
    FLAGS["n_steps"] = n_steps
    n_actuators = env.action_space.shape[0]
    FLAGS["n_actuators"] = n_actuators
    # +1 for the target position, in the DMP parametrization
    action_dim = n_actuators * (n_dmp_basis + 1)
    FLAGS["action_dim"] = action_dim
    outcome_dim = context_dim * n_steps
    FLAGS["outcome_dim"] = outcome_dim
    FLAGS["memory_dim"] = context_dim + outcome_dim + action_dim
    globals().update(FLAGS)

    indices_hand_pos = slice(0 * n_steps, 24 * n_steps)
    indices_hand_vel = slice(24 * n_steps, 48 * n_steps)
    indices_pen_pos = slice(48 * n_steps, 51 * n_steps)
    indices_pen_rot = slice(51 * n_steps, 55 * n_steps)
    indices_pen_vel = slice(55 * n_steps, 58 * n_steps)
    indices_pen_rotvel = slice(58 * n_steps, 61 * n_steps)

    global n_goal_spaces, goal_spaces_indices, goal_spaces_names, \
        reward_funs, find_memory_by_reward, find_memory_by_slice
    # goal_spaces_indices = [indices_hand_pos,indices_hand_vel,indices_pen_pos,indices_pen_rot,indices_pen_vel,indices_pen_rotvel]
    goal_spaces_indices = [
        indices_pen_pos, indices_pen_rot, indices_pen_vel, indices_pen_rotvel
    ]
    # goal_spaces_names = ["hand_pos","hand_vel","pen_pos","pen_rot","pen_vel","pen_rotvel"]
    # goal_spaces_names = ["pen_pos","pen_rot","pen_vel","pen_rotvel"]
    goal_spaces_names = ["pen_pos", "pen_rot"]
    goal_precisions = [0.7, 1.0, 2.0, 3.0, 10.0]
    # cross each goal space with each precision level
    goal_spaces_indices = [
        goal_spaces_indices[i] for i, g in enumerate(goal_spaces_names)
        for p in goal_precisions
    ]
    goal_spaces_names = [
        goal_space_name + "_" + "{0:1}".format(precision)
        for goal_space_name in goal_spaces_names
        for precision in goal_precisions
    ]
    n_goal_spaces = len(goal_spaces_indices)

    env.relative_control = True
    from dmp import DMP
    dmp = DMP(n_dmp_basis, n_simulation_steps, n_actuators)
    action_rollout = dmp.action_rollout

    # an outcome is a flattened (row-major) version of an array of dimensions
    # (observation_dim, n_steps)
    # goal_reward_vectorized = np.vectorize(goal_reward)

    def reward_funs(goal_space):
        precision = float(goal_spaces_names[goal_space].split("_")[-1])

        def reward_fun(context, outcomes, goal):
            outcome_slice = slice(goal_spaces_indices[goal_space].start,
                                  goal_spaces_indices[goal_space].stop)
            outcomes = outcomes[:, outcome_slice]
            return goal_reward(goal_space, goal, outcomes, context, precision)

        return reward_fun

    def find_memory_by_reward(context, goal_space, goal):
        outcomes = database[:, context_dim:-action_dim]
        rewards = reward_funs(goal_space)(context, outcomes, goal)
        goodness = -np.linalg.norm(database[:, :context_dim] - context,
                                   axis=1) + rewards
        index_of_memory = np.argmax(goodness, axis=0)
        return index_of_memory, goodness[index_of_memory]

    default_mask = np.zeros(memory_dim)
    default_mask[:context_dim] = 1

    def find_memory_by_slice(context, outcome_slice, goal, action=None,
                             mask=default_mask):
        # TODO: this is a hacky function, very specific to our implementation;
        # it works because the goal and outcome spaces are the same
        query_vector = np.zeros(memory_dim)
        query_vector[:context_dim] = context
        # indices corresponding to the part of the outcome we are querying against
        memory_slice = slice(context_dim + outcome_slice.start,
                             context_dim + outcome_slice.stop)
        query_vector[memory_slice] = goal / np.sqrt(goal.shape[0])
        mask[memory_slice] = 1
        if action is not None:
            query_vector[-action_dim:] = action
        distances = np.linalg.norm((database - query_vector) * mask, axis=1)
        index_of_memory = np.argmin(distances, axis=0)
        return index_of_memory, distances[index_of_memory]

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    if evaluating:
        assert size == 1
    if not evaluating:
        assert size == 2

    # initialize NN
    from meta_policy_neural_net import make_meta_policy_nn_model
    global meta_policy_nn_model
    meta_policy_nn_model = make_meta_policy_nn_model(FLAGS)
    if os.path.exists(os.path.join(THIS_DIR, "model_weights.p")):
        updated_model_weigths = pickle.load(
            open(os.path.join(THIS_DIR, "model_weights.p"), "rb"))
        meta_policy_nn_model.set_weights(updated_model_weigths)

    if rank == 0:
        # yes, this is ugly
        global database
        global intrinsic_rewards
        global goal_space_probs
        files = os.listdir(MEMORIES_DIR)
        # add path to each file
        files = [os.path.join(MEMORIES_DIR, f) for f in files]
        files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
        MAX_MEMORY_SIZE = 10000
        # MAX_MEMORY_SIZE = 100000
        print(files)
        if len(files) > 0:
            for ii, filename in enumerate(files):
                if ii == 0:
                    database = np.load(filename)
                else:
                    database = np.concatenate(
                        [database, np.load(filename)], 0)
                if len(database) > MAX_MEMORY_SIZE:
                    break
        else:
            database = None
        if os.path.exists(os.path.join(THIS_DIR, "intrinsic_rewards.p")):
            intrinsic_rewards = pickle.load(
                open(os.path.join(THIS_DIR, "intrinsic_rewards.p"), "rb"))
        else:
            intrinsic_rewards = np.array(
                [0.01 for index in goal_spaces_indices], dtype=np.float32)
        goal_space_probs = goal_space_probabilities(intrinsic_rewards)

        '''INITIALIZE ENVIRONMENT'''
        results = env.reset()
        context = results["observation"]
        if evaluating:
            pen_goal = results["desired_goal"]
            # pen_goal = pen_goal[:3]
            # pen_goal = pen_goal[3:]
            goal = np.tile(np.expand_dims(pen_goal, 0), (n_steps, 1))
            goal = np.reshape(goal.T, (-1))

        '''INITIALIZE MEMORY DATABASE VIA RANDOM EXPLORATION'''
        if database is None:  # cold start
            print("random warming up")
            reset_env = False
            memories = 0
            while memories < 1000:
                observations = []
                action_parameter = 2 * np.random.rand(action_dim) - 1
                for i in range(n_simulation_steps):
                    if rendering:
                        env.render()
                    action = action_rollout(context, action_parameter, i)
                    results = env.step(action)
                    obs = results[0]["observation"]
                    done = results[2]
                    if done:
                        print("resetting environment")
                        results = env.reset()
                        reset_env = True
                        break
                    if i % outcome_sampling_frequency == 0:
                        observations.append(obs)
                if reset_env:
                    reset_env = False
                    continue
                else:
                    print("Adding memory")
                    sys.stdout.flush()
                    memories += 1
                    outcome = np.reshape(np.stack(observations).T,
                                         (outcome_dim))
                    if database is None and memories == 1:
                        database = np.expand_dims(
                            np.concatenate(
                                [context, outcome, action_parameter]), 0)
                    else:
                        update_exploration_policy(context, outcome,
                                                  action_parameter)
        else:
            memories = database.shape[0]

        if not evaluating:
            # signal the consolidation process to keep going
            comm.send(True, dest=1, tag=11)

        '''TRAINING LOOP'''
        print("active goal babbling")
        reset_env = False
        for iteration in range(200000):
            print("iteration", iteration)
            # choose one of the goal spaces, as an index from 0 to
            # len(goal_spaces_indices) - 1
            goal_space = goal_space_policy(context)
            if not evaluating:
                if comm.Iprobe(source=1, tag=12):
                    updated_model_weigths = comm.recv(source=1, tag=12)
                    meta_policy_nn_model.set_weights(updated_model_weigths)
            # goal is of size len(goal_spaces_indices[goal_space])
            goal = goal_policy(goal_space, context)
            if evaluating:
                action_parameter = meta_policy(goal_space, goal, context)
            else:
                # use the exploration policy
                action_parameter = exploration_meta_policy(
                    goal_space, goal, context)

            observations = []
            for i in range(n_simulation_steps):
                action = action_rollout(context, action_parameter, i)
                results = env.step(action)
                if rendering:
                    env.render()
                obs = results[0]["observation"]
                done = results[2]
                if done:
                    print("resetting environment")
                    results = env.reset()
                    if evaluating:
                        pen_goal = results["desired_goal"]
                        # pen_goal = pen_goal[:3]
                        # pen_goal = pen_goal[3:]
                        goal = np.tile(np.expand_dims(pen_goal, 0),
                                       (n_steps, 1))
                        goal = np.reshape(goal.T, (-1))
                    reset_env = True
                    break
                if i % outcome_sampling_frequency == 0:
                    observations.append(obs)

            if reset_env:
                reset_env = False
                # continue
            else:
                memories += 1

            outcome = np.reshape(np.stack(observations).T, (outcome_dim))
            if not evaluating:
                intrinsic_rewards = update_intrinsic_reward(
                    intrinsic_rewards, goal_space, goal, context, outcome)
                print(goal_spaces_names)
                print(intrinsic_rewards)
                sys.stdout.flush()
            if not evaluating:
                update_exploration_policy(context, outcome, action_parameter)
                update_goal_space_policy()
                if iteration % save_freq == save_freq - 1:
                    print("Saving new batch of memories")
                    sys.stdout.flush()
                    # pickle.dump(database, open("database.p","wb"))
                    database = database[-MAX_MEMORY_SIZE:]
                    np.save(
                        os.path.join(MEMORIES_DIR,
                                     "database_" + str(iteration) + ".npy"),
                        database[-save_freq:])
                    pickle.dump(
                        intrinsic_rewards,
                        open(os.path.join(THIS_DIR, "intrinsic_rewards.p"),
                             "wb"))
                    # signal the consolidation process to keep going
                    comm.send(True, dest=1, tag=11)
            context = observations[-1]
        comm.send(False, dest=1, tag=11)

    if rank == 1:
        from meta_policy_neural_net import learn_from_database
        while comm.recv(source=0, tag=11):
            files = os.listdir(MEMORIES_DIR)
            # add path to each file
            files = [os.path.join(MEMORIES_DIR, f) for f in files]
            files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
            sys.stdout.flush()
            for filename in files:
                if not comm.Iprobe(source=0, tag=11):
                    database = np.load(filename)
                    print("Training on", filename)
                    sys.stdout.flush()
                    meta_policy_nn_model = learn_from_database(
                        meta_policy_nn_model, database, FLAGS)
                    updated_model_weigths = meta_policy_nn_model.get_weights()
                    comm.send(updated_model_weigths, dest=0, tag=12)
                    pickle.dump(
                        updated_model_weigths,
                        open(os.path.join(THIS_DIR, "model_weights.p"),
                             "wb"))
                else:
                    break
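# goal_space_probabilities() is called above but not defined in this excerpt.
# A plausible sketch, assuming goal spaces are sampled in proportion to their
# intrinsic reward, mixed with a little uniform exploration (eps is an
# illustrative parameter, not from the original code):
import numpy as np

def goal_space_probabilities(intrinsic_rewards, eps=0.1):
    r = np.maximum(np.asarray(intrinsic_rewards, dtype=np.float64), 0.0)
    if r.sum() > 0:
        probs = r / r.sum()
    else:
        probs = np.full(len(r), 1.0 / len(r))
    return (1 - eps) * probs + eps / len(r)  # mix with the uniform distribution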