Example #1
def load_pilco(path, controller=None, reward=None, sparse=False, debug=False):
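    """Rebuild a PILCO object from files under `path`: training inputs X.csv,
    targets Y.csv, optionally n_ind.txt (number of inducing points for the
    sparse variant), and one model_<i>.npy parameter dict per dynamics GP."""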
    X = np.loadtxt(path + 'X.csv', delimiter=',')
    Y = np.loadtxt(path + 'Y.csv', delimiter=',')
    if not sparse:
        pilco = PILCO(X, Y, controller=controller, reward=reward, debug=debug)
    else:
        with open(path + 'n_ind.txt', 'r') as f:
            n_ind = int(f.readline())
        pilco = PILCO(X, Y, controller=controller, reward=reward,
                      num_induced_points=n_ind, debug=debug)
    # params = np.load(path + "pilco_values.npy").item()
    # pilco.assign(params)
    for i, m in enumerate(pilco.mgpr.models):
        values = np.load(path + "model_" + str(i) + ".npy").item()
        m.assign(values)
    return pilco
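The loader above implies a matching writer with the same file layout. A minimal sketch of such a helper is shown below; it is hypothetical (not part of the example) and assumes GPflow 1.x-style models whose read_values() returns the parameter dict that assign() consumes:

import numpy as np

def save_pilco(path, X, Y, pilco, n_ind=None):
    # Hypothetical counterpart to load_pilco: write the files it expects.
    np.savetxt(path + 'X.csv', X, delimiter=',')
    np.savetxt(path + 'Y.csv', Y, delimiter=',')
    if n_ind is not None:  # only needed for the sparse variant
        with open(path + 'n_ind.txt', 'w') as f:
            f.write(str(n_ind))
    for i, m in enumerate(pilco.mgpr.models):
        np.save(path + 'model_' + str(i) + '.npy', m.read_values())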
Example #2
    def __init__(self):
        rospy.init_node('pilco_node', anonymous=True)
        self.rate = rospy.Rate(15)

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)

        self.reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        self.move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        self.drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        self.obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        # self.plot_srv = rospy.ServiceProxy('/plot/plot', Empty)
        # self.clear_srv = rospy.ServiceProxy('/plot/clear', Empty)

        pub_goal = rospy.Publisher('/RL/goal',
                                   Float32MultiArray,
                                   queue_size=10)
        self.pub_act = rospy.Publisher('/RL/action',
                                       Float32MultiArray,
                                       queue_size=10)
        msg = Float32MultiArray()

        msg.data = self.goal
        # Initial random rollouts to generate a dataset
        X, Y = self.rollout(policy=self.random_policy, steps=self.max_steps)
        for i in range(1, 15):
            X_, Y_ = self.rollout(policy=self.random_policy,
                                  steps=self.max_steps)
            X = np.vstack((X, X_))
            Y = np.vstack((Y, Y_))

        data_size = X.shape[0]
        self.state_dim = Y.shape[1]
        control_dim = X.shape[1] - self.state_dim
        controller = RbfController(state_dim=self.state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
        # controller = LinearController(state_dim=self.state_dim, control_dim=control_dim)

        # X = self.normz(X); Y = self.normz(Y)
        self.ngoal = np.zeros(self.state_dim)
        for i in range(self.state_dim):
            self.ngoal[i] = (self.goal[i] - self.state_min[i]) / (
                self.state_max[i] - self.state_min[i])

        # pilco = PILCO(X, Y, controller=controller, horizon=self.max_steps)
        # Example of user provided reward function, setting a custom target state
        # R = ExponentialReward(state_dim=self.state_dim, t=self.ngoal)
        R = ExponentialRewardPosition(state_dim=self.state_dim,
                                      t=self.ngoal)  # Only position
        self.pilco = PILCO(X,
                           Y,
                           controller=controller,
                           horizon=self.max_steps,
                           reward=R,
                           num_induced_points=int(data_size / 10))

        # Example of fixing a parameter, optional, for a linear controller only
        #pilco.controller.b = np.array([[0.0]])
        #pilco.controller.b.trainable = False

        Iter = 0
        success_count = 0
        while not rospy.is_shutdown():
            pub_goal.publish(msg)

            Iter += 1
            self.pilco.optimize()
            # import pdb; pdb.set_trace()
            X_new, Y_new = self.rollout(policy=self.pilco_policy,
                                        steps=self.max_steps)
            # X_new = self.normz(X_new); Y_new = self.normz(Y_new)
            # Update dataset
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            self.pilco.mgpr.set_XY(X, Y)

            cur = np.array(self.obs_srv().state)
            d = np.linalg.norm((X_new[-1, :2] + Y_new[-1, :2]) -
                               self.ngoal[:2])
            # print('[pilco_node] Iteration ' + str(Iter) + ', reached:' + str(X_new[-1,:self.state_dim]+Y_new[-1,:]) + ', distance: ' + str( np.linalg.norm((X_new[-1,:self.state_dim]+Y_new[-1,:])-self.ngoal) ))
            print('[pilco_node] Goal: ' + str(self.goal) +
                  ', normalized goal: ' + str(self.ngoal) +
                  ', current position: ' + str(cur))
            print('[pilco_node] Iteration ' + str(Iter) + ', reached:' +
                  str(X_new[-1, :2] + Y_new[-1, :2]) + ', pos. distance: ' +
                  str(d))
            # if np.linalg.norm(self.ngoal-X_new[-1,:self.state_dim]) < self.stop_distance:
            if d < self.stop_distance:
                success_count += 1
                print(
                    '[pilco_node] Goal reached after %d iterations, %d/3 trials!'
                    % (Iter, success_count))
                if success_count >= 3:
                    break
            else:
                success_count = 0
            self.rate.sleep()

        print('[pilco_node] Found solution.')
        X, Y = self.rollout(policy=self.pilco_policy,
                            steps=self.max_steps,
                            normz=False)
        plt.figure()
        plt.plot(X[:, 0], X[:, 1], '-k')
        plt.plot(X[0, 0], X[0, 1], 'or')
        plt.plot(self.goal[0], self.goal[1], 'og')
        xend = X[-1, :self.state_dim] + Y[-1, :]
        plt.plot(xend[0], xend[1], 'ob')
        plt.axis('equal')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.show()
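In these examples PILCO's dataset is laid out so that each row of X is a (normalized) state concatenated with the action taken, and each row of Y is the resulting state difference; that is why state_dim and control_dim can be recovered from the arrays alone, as in the small sketch below (illustrative only):

def split_dims(X, Y):
    # X rows: [state, action]; Y rows: next_state - state
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    return state_dim, control_dim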
Example #3
class pilco_node():
    goal = np.array([37.0, 106.0, 9.6664, 10.5664])
    max_steps = 50
    stop_distance = 0.08
    state_max = np.array([90.0164, 140.6792, 67.6822, 66.9443])
    state_min = np.array([-92.6794, 44.1619, 0.4516, 0.9372])

    def __init__(self):
        rospy.init_node('pilco_node', anonymous=True)
        self.rate = rospy.Rate(15)

        rospy.Subscriber('/RL/gripper_status', String,
                         self.callbackGripperStatus)

        self.reset_srv = rospy.ServiceProxy('/RL/ResetGripper', Empty)
        self.move_srv = rospy.ServiceProxy('/RL/MoveGripper', TargetAngles)
        self.drop_srv = rospy.ServiceProxy('/RL/IsObjDropped', IsDropped)
        self.obs_srv = rospy.ServiceProxy('/RL/observation', observation)
        # self.plot_srv = rospy.ServiceProxy('/plot/plot', Empty)
        # self.clear_srv = rospy.ServiceProxy('/plot/clear', Empty)

        pub_goal = rospy.Publisher('/RL/goal',
                                   Float32MultiArray,
                                   queue_size=10)
        self.pub_act = rospy.Publisher('/RL/action',
                                       Float32MultiArray,
                                       queue_size=10)
        msg = Float32MultiArray()

        msg.data = self.goal
        # Initial random rollouts to generate a dataset
        X, Y = self.rollout(policy=self.random_policy, steps=self.max_steps)
        for i in range(1, 15):
            X_, Y_ = self.rollout(policy=self.random_policy,
                                  steps=self.max_steps)
            X = np.vstack((X, X_))
            Y = np.vstack((Y, Y_))

        data_size = X.shape[0]
        self.state_dim = Y.shape[1]
        control_dim = X.shape[1] - self.state_dim
        controller = RbfController(state_dim=self.state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
        # controller = LinearController(state_dim=self.state_dim, control_dim=control_dim)

        # X = self.normz(X); Y = self.normz(Y)
        self.ngoal = np.zeros(self.state_dim)
        for i in range(self.state_dim):
            self.ngoal[i] = (self.goal[i] - self.state_min[i]) / (
                self.state_max[i] - self.state_min[i])

        # pilco = PILCO(X, Y, controller=controller, horizon=self.max_steps)
        # Example of user provided reward function, setting a custom target state
        # R = ExponentialReward(state_dim=self.state_dim, t=self.ngoal)
        R = ExponentialRewardPosition(state_dim=self.state_dim,
                                      t=self.ngoal)  # Only position
        self.pilco = PILCO(X,
                           Y,
                           controller=controller,
                           horizon=self.max_steps,
                           reward=R,
                           num_induced_points=int(data_size / 10))

        # Example of fixing a parameter, optional, for a linear controller only
        #pilco.controller.b = np.array([[0.0]])
        #pilco.controller.b.trainable = False

        Iter = 0
        success_count = 0
        while not rospy.is_shutdown():
            pub_goal.publish(msg)

            Iter += 1
            self.pilco.optimize()
            # import pdb; pdb.set_trace()
            X_new, Y_new = self.rollout(policy=self.pilco_policy,
                                        steps=self.max_steps)
            # X_new = self.normz(X_new); Y_new = self.normz(Y_new)
            # Update dataset
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            self.pilco.mgpr.set_XY(X, Y)

            cur = np.array(self.obs_srv().state)
            d = np.linalg.norm((X_new[-1, :2] + Y_new[-1, :2]) -
                               self.ngoal[:2])
            # print('[pilco_node] Iteration ' + str(Iter) + ', reached:' + str(X_new[-1,:self.state_dim]+Y_new[-1,:]) + ', distance: ' + str( np.linalg.norm((X_new[-1,:self.state_dim]+Y_new[-1,:])-self.ngoal) ))
            print('[pilco_node] Goal: ' + str(self.goal) +
                  ', normalized goal: ' + str(self.ngoal) +
                  ', current position: ' + str(cur))
            print('[pilco_node] Iteration ' + str(Iter) + ', reached:' +
                  str(X_new[-1, :2] + Y_new[-1, :2]) + ', pos. distance: ' +
                  str(d))
            # if np.linalg.norm(self.ngoal-X_new[-1,:self.state_dim]) < self.stop_distance:
            if d < self.stop_distance:
                success_count += 1
                print(
                    '[pilco_node] Goal reached after %d iterations, %d/3 trials!'
                    % (Iter, success_count))
                if success_count >= 3:
                    break
            else:
                success_count = 0
            self.rate.sleep()

        print('[pilco_node] Found solution.')
        X, Y = self.rollout(policy=self.pilco_policy,
                            steps=self.max_steps,
                            normz=False)
        plt.figure()
        plt.plot(X[:, 0], X[:, 1], '-k')
        plt.plot(X[0, 0], X[0, 1], 'or')
        plt.plot(self.goal[0], self.goal[1], 'og')
        xend = X[-1, :self.state_dim] + Y[-1, :]
        plt.plot(xend[0], xend[1], 'ob')
        plt.axis('equal')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.show()

    def rollout(self, policy, steps, normz=True):
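        """Execute up to `steps` actions from `policy` on the gripper and
        return (X, Y): X rows are [state, action] and Y rows are the observed
        state differences, min-max normalized when normz=True."""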
        X = []
        Y = []
        print('[pilco_node] Rollout...')

        # Reset system
        # self.clear_srv()
        self.reset_srv()
        while not self.gripper_closed:
            self.rate.sleep()

        # Get observation
        x = np.array(self.obs_srv().state)
        state_dim = len(x)

        for i in range(steps):
            # print('Step ' + str(i) + '...')

            xu = np.zeros(state_dim)
            for j in range(state_dim):
                xu[j] = (x[j] - self.state_min[j]) / (
                    self.state_max[j] - self.state_min[j])

            # Choose action
            u = policy(xu)
            # print('u: ', u)

            u[2] = (u[2] + 1) / 2

            msg = Float32MultiArray()
            msg.data = u
            self.pub_act.publish(msg)

            # Act
            for _ in range(int(u[2] * 150) + 1):
                suc = self.move_srv(u[:2]).success
                rospy.sleep(0.05)
                self.rate.sleep()
                if not suc:
                    break

            fail = self.drop_srv().dropped  # Check if dropped - end of episode

            if suc and not fail:
                # Get observation
                x_new = np.array(self.obs_srv().state)
            else:
                # End episode if overload or angle limits reached
                rospy.logerr(
                    '[pilco_node] Failed to move. Episode declared failed.')
                break

            xu = np.hstack((x, u))
            dx = x_new - x
            if normz:
                dx = np.zeros(state_dim)
                for j in range(state_dim):
                    xu[j] = (xu[j] - self.state_min[j]) / (
                        self.state_max[j] - self.state_min[j])
                    dx[j] = (x_new[j] - self.state_min[j]) / (
                        self.state_max[j] - self.state_min[j]) - xu[j]

            X.append(xu)
            Y.append(dx)
            x = x_new

            self.rate.sleep()

        # self.plot_srv()
        return np.stack(X), np.stack(Y)

    def random_policy(self, x):
        a = np.array([0., 0.])
        if np.random.uniform(0., 1., 1) > 0.6:
            if np.random.uniform(0., 1., 1) > 0.5:
                a = np.random.uniform(0.75, 1.0, 2)
            else:
                a = np.random.uniform(-1.0, -0.75, 2)
        else:
            a = np.random.uniform(-1., 1., 2)

        return np.append(a, np.random.uniform(-1, 1, 1))

    def pilco_policy(self, x):
        return self.pilco.compute_action(x[None, :])[0, :]

    def callbackGripperStatus(self, msg):
        self.gripper_closed = msg.data == "closed"

    def normz(self, X):
        for i in range(self.state_dim):
            X[:, i] = (X[:, i] - self.state_min[i]) / (self.state_max[i] -
                                                       self.state_min[i])

        return X
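The node works entirely in a min-max normalized state space: each state component is mapped to [0, 1] via (x - state_min) / (state_max - state_min), which is what normz(), the ngoal computation, and the rollout normalization above all implement. The inverse mapping recovers raw units for reporting; a small illustrative sketch (not part of the example):

import numpy as np

def normalize(x, x_min, x_max):
    # Map each state component into [0, 1]
    return (x - x_min) / (x_max - x_min)

def denormalize(xn, x_min, x_max):
    # Inverse of normalize: back to raw units
    return xn * (x_max - x_min) + x_min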
Example #4
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

# get env reward
env_reward = get_env_reward(args.env_name)

# load gp
if args.load_gp_model:
    try:
        pilco = load_pilco(model_path, controller, env_reward, debug=args.debug)
        print("load gp model successfully")
    except Exception:
        raise ValueError("cannot find saved gp models")
        # pilco = PILCO(X, Y, controller=controller, reward=env_reward)
else:
    pilco = PILCO(X, Y, controller=controller, reward=env_reward, debug=args.debug)

# for numerical stability
for model in pilco.mgpr.models:
    model.likelihood.variance = 0.001
    model.likelihood.variance.trainable = False

# reward_list = []
# ep_step_list = []
X_init = X[:, 0: state_dim]

# def random_policy(obs):
    # return env.action_space.sample()
# random policy as comparing policy
# policy2 = random_policy
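The commented-out lines above hint at using a random policy as a comparison baseline; made concrete, it would look like the sketch below (illustrative, assuming env is the Gym-style environment used elsewhere in this script):

def random_policy(obs):
    # Ignore the observation and sample a uniformly random action.
    return env.action_space.sample()

# policy2 = random_policy  # comparison policy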
Example #5
with tf.Session() as sess:
    p_start = time.time()
    target = np.array([1.2, 0.38, 0.38])
    env = EnvWrapper(target)
    T = 25
    num_basis_functions = 50
    max_action = 0.3
    time_on_real_robot = 0
    X, Y, t = rollout(env, T, random=True, trial=0)
    time_on_real_robot += t
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - Y.shape[1]
    controller = RbfController(state_dim, control_dim, num_basis_functions,
                               max_action)
    reward = ExponentialReward(3, t=target)
    pilco = PILCO(X, Y, controller=controller, reward=reward)
    plot(pilco, X, Y, T, 0)
    n = 4
    t_model = 0
    t_policy = 0
    for i in range(1, n):
        env.reset()
        t1 = time.time()
        pilco.optimize_models()
        t2 = time.time()
        t_model += t2 - t1
        print("model optimization done!")
        pilco.optimize_policy()
        t3 = time.time()
        t_policy += t3 - t2
        print("policy optimization done!")
    experiment_path = experiment_path[:-1]

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
        os.makedirs(experiment_path + "/img")
        os.makedirs(experiment_path + "/img/rewards")
        os.makedirs(experiment_path + "/img/H_space")
        os.makedirs(experiment_path + "/img/H_space/live_inference")

    checkpoint_path = experiment_path + "/model.ckpt"
    config_path = experiment_path + "/config.json"

    arg_dict["experiment_path"] = experiment_path
    arg_dict["checkpoint_path"] = checkpoint_path

    training_envs, test_envs, dataset, cost, labels = get_envs(**arg_dict)
    num_envs = len(training_envs) + len(test_envs)
    arg_dict["n_envs"] = num_envs
    arg_dict["n_active_tasks"] = len(training_envs)
    arg_dict["labels"] = labels

    with open(config_path, "w") as f:
        json.dump(arg_dict, f)

    model = create_model(**arg_dict)
    agent = create_agent(model, cost, **arg_dict)

    pilco = PILCO(model, agent, dataset, **arg_dict)

    run_PILCO(pilco, training_envs, test_envs, **arg_dict)
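The code above prepares checkpoint_path but the excerpt never writes to it. In a TF1 session, persisting the graph variables would typically look like the sketch below (illustrative only, not shown in the original code; placed inside the same tf.Session block):

    saver = tf.train.Saver()
    saver.save(sess, checkpoint_path)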
Example #7
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=False,
                         render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    X = torch.tensor(X, dtype=torch.float32)
    Y = torch.tensor(Y, dtype=torch.float32)

    # build PILCO model
    pilco = PILCO(X, Y, horizon=T, m_init=m_init, S_init=S_init)

    for rollouts in range(N):
        # main PILCO loop
        print("**** ITERATION no", rollouts, " ****")
        # learn dynamic model
        pilco.optimize_models(maxiter=maxiter)
        # improve policy
        pilco.optimize_policy(maxiter=maxiter)
        # collect more data
        X_new, Y_new = rollout(env,
                               pilco,
                               timesteps=T_sim,
                               verbose=False,
                               render=False,
                               SUBS=SUBS)
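The excerpt stops right after collecting new data. In the GPflow examples above, the loop continues by appending the rollout to the dataset and refreshing the dynamics model; a comparable continuation for this PyTorch variant might look like the sketch below (illustrative; set_XY is assumed to mirror the GPflow API and may differ in this port):

        # Append the new rollout and refresh the dynamics model's dataset
        X = torch.cat((X, torch.tensor(X_new, dtype=torch.float32)))
        Y = torch.cat((Y, torch.tensor(Y_new, dtype=torch.float32)))
        pilco.mgpr.set_XY(X, Y)  # assumed API, mirroring the GPflow examples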