def load_pilco(path, controller=None, reward=None, sparse=False, debug=False):
    """Restore a PILCO object from X/Y CSVs and per-GP parameter files saved under `path`."""
    X = np.loadtxt(path + 'X.csv', delimiter=',')
    Y = np.loadtxt(path + 'Y.csv', delimiter=',')
    if not sparse:
        pilco = PILCO(X, Y, controller=controller, reward=reward, debug=debug)
    else:
        with open(path + 'n_ind.txt', 'r') as f:
            n_ind = int(f.readline())
        pilco = PILCO(X, Y, num_induced_points=n_ind)
    # params = np.load(path + "pilco_values.npy").item()
    # pilco.assign(params)
    for i, m in enumerate(pilco.mgpr.models):
        values = np.load(path + "model_" + str(i) + ".npy").item()
        m.assign(values)
    return pilco
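For reference, a minimal usage sketch of the loader above. The directory name, state/control dimensions, and the target vector are illustrative assumptions; the path must end with a separator because the loader concatenates file names onto it directly.

# Hypothetical example: restore a saved PILCO agent (path and dims are assumptions).
model_path = './saved_models/'  # must end with '/'; file names are appended directly
controller = RbfController(state_dim=4, control_dim=2, num_basis_functions=5)
reward = ExponentialReward(state_dim=4, t=np.zeros(4))  # placeholder target state
pilco = load_pilco(model_path, controller=controller, reward=reward, sparse=False)
u = pilco.compute_action(np.zeros((1, 4)))  # query the restored policy for one state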
class pilco_node():
    goal = np.array([37.0, 106.0, 9.6664, 10.5664])
    max_steps = 50
    stop_distance = 0.08
    state_max = np.array([90.0164, 140.6792, 67.6822, 66.9443])
    state_min = np.array([-92.6794, 44.1619, 0.4516, 0.9372])

    def __init__(self):
        rospy.init_node('pilco_node', anonymous=True)
        self.rate = rospy.Rate(15)
        rospy.Subscriber('/RL/gripper_status', String, self.callbackGripperStatus)
        self.reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        self.move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        self.drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        self.obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        # self.plot_srv = rospy.ServiceProxy('/plot/plot', Empty)
        # self.clear_srv = rospy.ServiceProxy('/plot/clear', Empty)
        pub_goal = rospy.Publisher('/RL/goal', Float32MultiArray, queue_size=10)
        self.pub_act = rospy.Publisher('/RL/action', Float32MultiArray, queue_size=10)

        msg = Float32MultiArray()
        msg.data = self.goal

        # Initial random rollouts to generate a dataset
        X, Y = self.rollout(policy=self.random_policy, steps=self.max_steps)
        for i in range(1, 15):
            X_, Y_ = self.rollout(policy=self.random_policy, steps=self.max_steps)
            X = np.vstack((X, X_))
            Y = np.vstack((Y, Y_))

        data_size = X.shape[0]
        self.state_dim = Y.shape[1]
        control_dim = X.shape[1] - self.state_dim
        controller = RbfController(state_dim=self.state_dim, control_dim=control_dim, num_basis_functions=5)
        # controller = LinearController(state_dim=self.state_dim, control_dim=control_dim)

        # X = self.normz(X); Y = self.normz(Y)
        # Normalized goal, matching the min-max normalization applied in rollout()
        self.ngoal = np.zeros(self.state_dim)
        for i in range(self.state_dim):
            self.ngoal[i] = (self.goal[i] - self.state_min[i]) / (self.state_max[i] - self.state_min[i])

        # pilco = PILCO(X, Y, controller=controller, horizon=self.max_steps)
        # Example of a user-provided reward function, setting a custom target state
        # R = ExponentialReward(state_dim=self.state_dim, t=self.ngoal)
        R = ExponentialRewardPosition(state_dim=self.state_dim, t=self.ngoal)  # position only
        self.pilco = PILCO(X, Y, controller=controller, horizon=self.max_steps, reward=R,
                           num_induced_points=int(data_size / 10))

        # Example of fixing a parameter, optional, for a linear controller only
        # pilco.controller.b = np.array([[0.0]])
        # pilco.controller.b.trainable = False

        Iter = 0
        success_count = 0
        while not rospy.is_shutdown():
            pub_goal.publish(msg)
            Iter += 1
            self.pilco.optimize()
            # import pdb; pdb.set_trace()
            X_new, Y_new = self.rollout(policy=self.pilco_policy, steps=self.max_steps)
            # X_new = self.normz(X_new); Y_new = self.normz(Y_new)

            # Update dataset
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            self.pilco.mgpr.set_XY(X, Y)

            cur = np.array(self.obs_srv().state)
            d = np.linalg.norm((X_new[-1, :2] + Y_new[-1, :2]) - self.ngoal[:2])
            # print('[pilco_node] Iteration ' + str(Iter) + ', reached: ' + str(X_new[-1, :self.state_dim] + Y_new[-1, :]) + ', distance: ' + str(np.linalg.norm((X_new[-1, :self.state_dim] + Y_new[-1, :]) - self.ngoal)))
            print('[pilco_node] Goal: ' + str(self.goal) + ', normalized goal: ' + str(self.ngoal) + ', current position: ' + str(cur))
            print('[pilco_node] Iteration ' + str(Iter) + ', reached: ' + str(X_new[-1, :2] + Y_new[-1, :2]) + ', pos. distance: ' + str(d))

            # if np.linalg.norm(self.ngoal - X_new[-1, :self.state_dim]) < self.stop_distance:
            if d < self.stop_distance:
                success_count += 1
                print('[pilco_node] Goal reached after %d iterations, %d/3 successful trials.' % (Iter, success_count))
                if success_count >= 3:
                    break
            else:
                success_count = 0
            self.rate.sleep()

        print('[pilco_node] Found solution.')
        X, Y = self.rollout(policy=self.pilco_policy, steps=self.max_steps, normz=False)
        plt.figure()
        plt.plot(X[:, 0], X[:, 1], '-k')
        plt.plot(X[0, 0], X[0, 1], 'or')
        plt.plot(self.goal[0], self.goal[1], 'og')
        xend = X[-1, :self.state_dim] + Y[-1, :]
        plt.plot(xend[0], xend[1], 'ob')
        plt.axis('equal')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.show()

    def rollout(self, policy, steps, normz=True):
        X = []
        Y = []
        print('[pilco_node] Rollout...')

        # Reset system
        # self.clear_srv()
        self.reset_srv()
        while not self.gripper_closed:
            self.rate.sleep()

        # Get observation
        x = np.array(self.obs_srv().state)
        state_dim = len(x)

        for i in range(steps):
            # print('Step ' + str(i) + '...')
            xu = np.zeros(state_dim)
            for j in range(state_dim):
                xu[j] = (x[j] - self.state_min[j]) / (self.state_max[j] - self.state_min[j])

            # Choose action
            u = policy(xu)
            # print('u: ', u)
            u[2] = (u[2] + 1) / 2  # map third component from [-1, 1] to an action duration in [0, 1]
            msg = Float32MultiArray()
            msg.data = u
            self.pub_act.publish(msg)

            # Act
            for _ in range(int(u[2] * 150) + 1):
                suc = self.move_srv(u[:2]).success
                rospy.sleep(0.05)
                self.rate.sleep()
                if not suc:
                    break

            fail = self.drop_srv().dropped  # check if the object was dropped - end of episode

            if suc and not fail:
                # Get observation
                x_new = np.array(self.obs_srv().state)
            else:
                # End episode if overload or angle limits reached
                rospy.logerr('[pilco_node] Failed to move. Episode declared failed.')
                break

            xu = np.hstack((x, u))
            dx = x_new - x
            if normz:
                dx = np.zeros(state_dim)
                for j in range(state_dim):
                    xu[j] = (xu[j] - self.state_min[j]) / (self.state_max[j] - self.state_min[j])
                    dx[j] = (x_new[j] - self.state_min[j]) / (self.state_max[j] - self.state_min[j]) - xu[j]

            X.append(xu)
            Y.append(dx)
            x = x_new

            self.rate.sleep()

        # self.plot_srv()
        return np.stack(X), np.stack(Y)

    def random_policy(self, x):
        # Biased exploration: with probability 0.4 push both motor actions toward
        # their limits, otherwise sample uniformly in [-1, 1]. The third component
        # is mapped to an action duration inside rollout().
        a = np.array([0., 0.])
        if np.random.uniform(0., 1., 1) > 0.6:
            if np.random.uniform(0., 1., 1) > 0.5:
                a = np.random.uniform(0.75, 1.0, 2)
            else:
                a = np.random.uniform(-1.0, -0.75, 2)
        else:
            a = np.random.uniform(-1., 1., 2)
        return np.append(a, np.random.uniform(-1, 1, 1))

    def pilco_policy(self, x):
        # Query the learned PILCO controller for a single (normalized) state.
        return self.pilco.compute_action(x[None, :])[0, :]

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def normz(self, X):
        # Min-max normalize each state dimension to [0, 1].
        for i in range(self.state_dim):
            X[:, i] = (X[:, i] - self.state_min[i]) / (self.state_max[i] - self.state_min[i])
        return X
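A typical entry point for running the node; this block is not part of the original listing and assumes the script is executed directly, with the whole training loop driven by the constructor above.

# Hypothetical entry point: constructing the node runs the full PILCO training loop.
if __name__ == '__main__':
    try:
        pilco_node()
    except rospy.ROSInterruptException:
        pass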
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

# get env reward
env_reward = get_env_reward(args.env_name)

# load GP
if args.load_gp_model:
    try:
        pilco = load_pilco(model_path, controller, env_reward, debug=args.debug)
        print("loaded GP model successfully")
    except Exception:
        raise ValueError("cannot find saved GP models")
    # pilco = PILCO(X, Y, controller=controller, reward=env_reward)
else:
    pilco = PILCO(X, Y, controller=controller, reward=env_reward, debug=args.debug)

# for numerical stability
for model in pilco.mgpr.models:
    model.likelihood.variance = 0.001
    model.likelihood.variance.trainable = False

# reward_list = []
# ep_step_list = []
X_init = X[:, 0:state_dim]

# def random_policy(obs):
#     return env.action_space.sample()
# random policy as comparison policy
# policy2 = random_policy
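The loader above expects `X.csv`, `Y.csv`, one `model_<i>.npy` per GP, and `n_ind.txt` in the sparse case. A matching saver might look like the following sketch; the helper name is hypothetical, and `read_values()` is assumed from GPflow 1.x as the counterpart of the `m.assign(values)` calls used in `load_pilco`.

def save_pilco(path, X, Y, pilco, sparse=False, n_ind=None):
    # Hypothetical counterpart to load_pilco; mirrors the file layout it reads.
    np.savetxt(path + 'X.csv', X, delimiter=',')
    np.savetxt(path + 'Y.csv', Y, delimiter=',')
    if sparse:
        with open(path + 'n_ind.txt', 'w') as f:
            f.write('%d' % n_ind)  # number of inducing points, supplied by the caller
    for i, m in enumerate(pilco.mgpr.models):
        # read_values() assumed from GPflow 1.x, matching m.assign(values) in load_pilco
        np.save(path + 'model_' + str(i) + '.npy', m.read_values())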
with tf.Session() as sess:
    p_start = time.time()
    target = np.array([1.2, 0.38, 0.38])
    env = EnvWrapper(target)
    T = 25
    num_basis_functions = 50
    max_action = 0.3
    time_on_real_robot = 0

    X, Y, t = rollout(env, T, random=True, trial=0)
    time_on_real_robot += t

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - Y.shape[1]
    controller = RbfController(state_dim, control_dim, num_basis_functions, max_action)
    reward = ExponentialReward(3, t=target)
    pilco = PILCO(X, Y, controller=controller, reward=reward)
    plot(pilco, X, Y, T, 0)

    n = 4
    t_model = 0
    t_policy = 0
    for i in range(1, n):
        env.reset()
        t1 = time.time()
        pilco.optimize_models()
        t2 = time.time()
        t_model += t2 - t1
        print("model optimization done!")
        pilco.optimize_policy()
        t3 = time.time()
        t_policy += t3 - t2
        print("policy optimization done!")
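The loop above only accumulates timings; once it finishes, a summary such as the following could be printed inside the session block. This is a sketch using only variables already defined in the excerpt; the policy rollout that would normally follow `optimize_policy` is not shown here.

    # Hypothetical reporting block, appended after the training loop.
    p_end = time.time()
    print("total wall time: %.1f s" % (p_end - p_start))
    print("time optimizing dynamics models: %.1f s" % t_model)
    print("time optimizing the policy: %.1f s" % t_policy)
    print("time spent on the real robot: %.1f s" % time_on_real_robot)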
experiment_path = experiment_path[:-1]

if not os.path.exists(experiment_path):
    os.makedirs(experiment_path)
    os.makedirs(experiment_path + "/img")
    os.makedirs(experiment_path + "/img/rewards")
    os.makedirs(experiment_path + "/img/H_space")
    os.makedirs(experiment_path + "/img/H_space/live_inference")

checkpoint_path = experiment_path + "/model.ckpt"
config_path = experiment_path + "/config.json"
arg_dict["experiment_path"] = experiment_path
arg_dict["checkpoint_path"] = checkpoint_path

training_envs, test_envs, dataset, cost, labels = get_envs(**arg_dict)
num_envs = len(training_envs) + len(test_envs)
arg_dict["n_envs"] = num_envs
arg_dict["n_active_tasks"] = len(training_envs)
arg_dict["labels"] = labels

with open(config_path, "w") as f:
    json.dump(arg_dict, f)

model = create_model(**arg_dict)
agent = create_agent(model, cost, **arg_dict)
pilco = PILCO(model, agent, dataset, **arg_dict)
run_PILCO(pilco, training_envs, test_envs, **arg_dict)
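Because the full argument dictionary is serialized to `config.json`, a later run can recover the experiment settings from disk. A minimal sketch is shown below; restoring `model.ckpt` itself depends on how `create_model` builds the graph and is not covered here.

# Hypothetical: re-read the saved experiment configuration for inspection or resumption.
with open(config_path, "r") as f:
    restored_args = json.load(f)
assert restored_args["checkpoint_path"] == checkpoint_path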
                     timesteps=T, random=True, SUBS=SUBS, verbose=False, render=False)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
X, Y = torch.tensor(X, dtype=torch.float32), torch.tensor(Y, dtype=torch.float32)

# build PILCO model
pilco = PILCO(X, Y, horizon=T, m_init=m_init, S_init=S_init)

for rollouts in range(N):  # main PILCO loop
    print("**** ITERATION no", rollouts, " ****")
    # learn dynamics model
    pilco.optimize_models(maxiter=maxiter)
    # improve policy
    pilco.optimize_policy(maxiter=maxiter)
    # collect more data
    X_new, Y_new = rollout(env, pilco, timesteps=T_sim, verbose=False, render=False, SUBS=SUBS)
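The excerpt ends right after new data is collected. Following the pattern used elsewhere in this document (stack the new rollout onto the dataset and hand it back to the GP models), the loop body would plausibly continue along these lines; the `set_XY` call is an assumption carried over from the TensorFlow variants above, and the torch port may expose a different update method.

    # Hypothetical continuation of the loop body: fold the new rollout into the dataset.
    X_new = torch.tensor(X_new, dtype=torch.float32)
    Y_new = torch.tensor(Y_new, dtype=torch.float32)
    X = torch.cat((X, X_new), dim=0)
    Y = torch.cat((Y, Y_new), dim=0)
    pilco.mgpr.set_XY(X, Y)  # assumed API, mirroring mgpr.set_XY in the TF version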