def test_act_and_train(self):
    n_updates = 10
    n_dims = 2
    n_bfs = 4
    n_reps = 3
    duration = 0.5
    dt = 0.01
    dmps = [[] for _ in range(n_dims)]
    n_times = int(duration / dt)
    env = Point(dmps, n_dims, n_bfs, n_times, duration, dt)
    agent = MDSKDE(dmps=dmps,
                   n_updates=n_updates,
                   n_reps=n_reps,
                   n_dims=n_dims,
                   n_bfs=n_bfs,
                   n=5,
                   times=1.0,
                   r_gain=10**(-8),
                   r_normalize=False)
    for i in range(n_dims):
        dmps[i] = DMP(n_bfs, duration, 0.5, dt, True)
        dmps[i].reset()
        dmps[i].set_goal(env.gxi[i])
        agent.theta[i] = dmps[i].minimize_jerk()
    reward = 0
    obs = env.reset()
    t = 0
    k = 0
    agent.act_and_train(obs, reward, t, k)
def callback(self, msg):
    if self.pose_0 is None:  # first message: initialize both samples
        self.pose_0 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
        self.pose_1 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
    else:
        self.pose_0 = self.pose_1
        self.pose_1 = [
            msg.position.x, msg.position.y, msg.position.z,
            msg.orientation.x, msg.orientation.y, msg.orientation.z,
            msg.orientation.w
        ]
    if np.sum(np.abs(np.array(self.pose_0) - np.array(self.pose_1))) > 0.01:
        print('Moving')
        self.traj.append(self.pose_1)
    else:
        if len(self.traj) > 5:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :],
                      path[-1, :],
                      Nb=500,
                      dt=0.01,
                      d=path.shape[1],
                      jnames=self.name)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            #print(params)
            dmp.reset_state()
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, dy_r, ddy_r = dmp.rollout()
            plt.subplot(1, 2, 1)
            plt.cla()
            plt.plot(y_r)
            plt.subplot(1, 2, 2)
            plt.cla()
            plt.plot(path)
            plt.draw()
            plt.pause(0.1)
            self.traj = []
        print('Stationary')
def callback(self, msg):
    if self.js_prev is None:  # first message: initialize both samples
        self.js_prev = msg
        self.js_current = msg
    else:
        self.js_prev = self.js_current
        self.js_current = msg
    # If the joints have changed by more than a small threshold amount, the
    # arm is moving: start recording joint angles.
    if np.sum(
            np.abs(
                np.array(self.js_prev.position) -
                np.array(self.js_current.position))) > 0.01:
        print('Moving')
        j_list = [
            msg.position[msg.name.index(j)] for j in self.joint_names
        ]
        self.traj.append(j_list)
    elif len(self.traj) > 0:
        # Fit DMP to motion segment
        path = np.array(self.traj)
        dmp = DMP(path[0, :],
                  path[-1, :],
                  num_basis_funcs=500,
                  dt=0.01,
                  d=path.shape[1],
                  jnames=self.joint_names)
        _, weights = imitate_path(
            path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]), dmp)
        print("Learned weights: {}".format(weights))
        dmp.weights = weights
        self.dmp_count += 1
        # Save it with pickle (binary mode)
        with open("./dmp{}.npy".format(self.dmp_count), "wb") as f:
            pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
        # Plot the path imitated by the learned DMP (rolled out) against the
        # actual trajectory path
        y_r, dy_r, _ = dmp.rollout()
        plot_rollout(y_r, path)
        jt = build_jt(msg.header, self.joint_names, y_r, dy_r)
        self.traj_pub.publish(jt)
        self.traj = []
    else:
        print("Stationary")
def callback(self, msg):
    if self.pose_prev is None:  # first message: initialize both samples
        self.pose_prev = [msg.position.x, msg.position.y, msg.position.z,
                          msg.orientation.x, msg.orientation.y,
                          msg.orientation.z, msg.orientation.w]
        self.pose_current = [msg.position.x, msg.position.y, msg.position.z,
                             msg.orientation.x, msg.orientation.y,
                             msg.orientation.z, msg.orientation.w]
    else:
        self.pose_prev = self.pose_current
        self.pose_current = [msg.position.x, msg.position.y, msg.position.z,
                             msg.orientation.x, msg.orientation.y,
                             msg.orientation.z, msg.orientation.w]
    if np.sum(np.abs(np.array(self.pose_prev) -
                     np.array(self.pose_current))) > 0.01:
        print('Moving')
        self.traj.append(self.pose_current)
    else:
        if len(self.traj) > 5:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :], path[-1, :], num_basis_funcs=500, dt=0.01,
                      d=path.shape[1], jnames=self.name)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            print(params)
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, _, _ = dmp.rollout()
            plot_rollout(y_r, path)
            self.traj = []
        print('Stationary')
def callback(self, msg):
    if self.js_0 is None:  # first message: initialize both samples
        self.js_0 = msg
        self.js_1 = msg
    else:
        self.js_0 = self.js_1
        self.js_1 = msg
    if np.sum(
            np.abs(
                np.array(self.js_0.position) -
                np.array(self.js_1.position))) > 0.01:
        print('Moving')
        # msg.name.index() raises ValueError if a joint is missing, so no
        # extra check is needed (truth-testing the index would wrongly skip
        # index 0).
        j_list = [msg.position[msg.name.index(joint)]
                  for joint in self.joint_names]
        self.traj.append(j_list)
    else:
        if len(self.traj) > 0:
            # Fit DMP to motion segment
            path = np.array(self.traj)
            dmp = DMP(path[0, :],
                      path[-1, :],
                      Nb=500,
                      dt=0.01,
                      d=path.shape[1],
                      jnames=self.joint_names)
            params = dmp.imitate_path(
                path + 1e-5 * np.random.randn(path.shape[0], path.shape[1]))
            #print(params)
            dmp.reset_state()
            self.dmp_count += 1
            # pickle requires a binary file handle
            with open("./dmp%05d.npy" % self.dmp_count, "wb") as f:
                pickle.dump(dmp, f, pickle.HIGHEST_PROTOCOL)
            y_r, dy_r, ddy_r = dmp.rollout()
            plt.subplot(1, 2, 1)
            plt.cla()
            plt.plot(y_r)
            plt.subplot(1, 2, 2)
            plt.cla()
            plt.plot(path)
            plt.draw()
            plt.pause(0.1)
            self.jt.header = msg.header
            self.jt.joint_names = self.joint_names
            for i in range(y_r.shape[0]):
                # build a fresh point each step; reusing one
                # JointTrajectoryPoint would append the same object repeatedly
                jtp = JointTrajectoryPoint()
                jtp.positions = y_r[i, :].tolist()
                jtp.velocities = dy_r[i, :].tolist()
                jtp.time_from_start = rospy.Duration(1.0)
                self.jt.points.append(jtp)
            self.traj_pub.publish(self.jt)
            self.traj = []
        print('Stationary')
        self.jt = JointTrajectory()
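# The callbacks above all share one motion-segmentation pattern: threshold the
# change between consecutive samples, buffer while moving, and fit a DMP once
# motion stops. A minimal, ROS-free sketch of that pattern (the function name
# is illustrative; the 0.01 threshold mirrors the code above):
import numpy as np

MOTION_THRESHOLD = 0.01  # summed absolute change between consecutive samples

def segment_stream(samples):
    """Yield motion segments: consecutive runs where the signal is changing."""
    traj = []
    prev = None
    for s in samples:
        s = np.asarray(s, dtype=float)
        if prev is not None and np.sum(np.abs(s - prev)) > MOTION_THRESHOLD:
            traj.append(s)        # moving: keep recording
        elif traj:
            yield np.array(traj)  # stationary: emit the finished segment
            traj = []
        prev = s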
from dmp import DynamicMovementPrimitive as DMP
import numpy as np
import matplotlib.pyplot as plt
from plots import plot as Plot

# Initialize the DMP class
my_dmp = DMP(20, 20, 0)

# Filtering parameters
window = 5
blends = 10

# Load the demo data
data = DMP.load_demo("demos/demo1.txt")

# Obtain the joint position data and the time vector
t, q = DMP.parse_demo(data)

# Get the phase from the time vector
phase = my_dmp.phase(t)
Plot.phase(phase)
print(q.shape)

# Normalize the time vector
t = DMP.normalize_vector(t)

# Compute velocity and acceleration
dq = np.zeros(q.shape)
ddq = np.zeros(q.shape)
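# The demo script above allocates dq and ddq but stops before filling them. A
# minimal completion sketch, assuming q has shape (n_samples, n_joints) and t
# is the (normalized) time vector; np.gradient uses central differences in the
# interior and one-sided differences at the boundaries:
for j in range(q.shape[1]):
    dq[:, j] = np.gradient(q[:, j], t)
    ddq[:, j] = np.gradient(dq[:, j], t)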
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_rnn import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)

    # load the RNN if saved. TODO: save only the weights, to avoid bugs after
    # the architecture changes
    if os.path.isfile("lprnn" + experiment_name + ".pt"):
        temp_net = torch.load("lprnn" + experiment_name + ".pt")
        net.load_state_dict(temp_net.state_dict())
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    optimizer = optim.RMSprop(net.parameters())
    goal_loss = nn.MSELoss()
    goal_reconstruction_loss = nn.MSELoss()
    action_reconstruction_loss = nn.MSELoss()

    # initial values of several variables
    previous_goal_reward = torch.Tensor([-1.0])
    previous_lp_value = torch.zeros(1)
    learning_progress = torch.zeros(1)
    #average_reward_estimate = 0
    average_lp_estimate = 0

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()

    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        #goal = np.tile(np.expand_dims(pen_goal,0),(n_steps,1))
        #goal = np.reshape(goal.T,(-1))
        goal = torch.Tensor(pen_goal)

    if rendering:
        setup_render(env)

    rewards = []
    lps = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    pen_vars_slice = slice(54, 61)
    pen_pos_center = torch.Tensor([1.0, 0.90, 0.15]).unsqueeze(0).unsqueeze(0)
    print(observations.shape)
    reset_env = False

    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters, _ = net.predict_action(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions (and
            # their probabilities), and the predicted values of action and goal
            actions, log_prob_action, goal, log_prob_goal, value, lp_value = net(
                observations,
                learning_progress.detach().unsqueeze(0).unsqueeze(0))
            # learn the difference between the value and -1, because at the
            # beginning most values will be close to -1
            value = value - 1
            pen_pos = observations[:, :, pen_vars_slice][..., :3]
            pen_rot = observations[:, :, pen_vars_slice][..., 3:]
            rot_goal = goal[:, :, 3:]
            rel_rot_goal = (rot_goal - pen_rot) * 0.1 + pen_rot
            goal = torch.cat([(goal[:, :, :3] - pen_pos) * 0.002 + pen_pos,
                              rel_rot_goal / np.linalg.norm(rel_rot_goal)],
                             dim=2)
            #goal += 0.05*torch.randn_like(goal)
            goal = Variable(goal.data, requires_grad=True)
            if lp_training:
                action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = \
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :], \
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters, log_prob_action, _, _, value, lp_value = \
                    actions[0, 0, :], log_prob_action[0, 0], goal[0, 0, :], \
                    log_prob_goal[0, 0], value[0, 0, :], lp_value[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                #torch.save(net, "lprnn.pt")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                lps = []
            if save_goals:
                if iteration == 0:
                    goals = np.expand_dims(goal, 0)
                else:
                    goals = np.concatenate([goals, np.expand_dims(goal, 0)],
                                           axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        # we detach the RNN every so often, to bound how far back we
        # backpropagate through time.
        # TODO: do this in a way that doesn't start from scratch, but instead
        # backpropagates forget_freq iterations into the past at *every* step
        if iteration % forget_freq == forget_freq - 1:
            net.forget()

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                previous_lp_value = lp_value
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            optimizer.zero_grad()
            # train the policy to maximize the goal reward (which is
            # -goal_loss, i.e. -|observation - goal|^2); here we only look at
            # the pen part of the observation
            #goal_reward = -1*goal_loss(new_observations[0,0,pen_vars_slice],goal)
            #goal_reward = goal_reward*(goal_reward>-1)
            goal_reward = env.compute_reward(
                new_observations[0, 0, pen_vars_slice].numpy(),
                goal.detach().numpy(), None)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            rewards.append(goal_reward)
            hindsight_goal = new_observations[:, :, pen_vars_slice]
            #if iteration <= 100000:
            #    # train the goal reconstruction part of the network, using the
            #    # hindsight_goal (the outcome of our action) to autoencode
            #    # reachable goals and explore more effectively
            #    reconstructed_goal = net.autoencode_goal(hindsight_goal+0.01*torch.randn_like(hindsight_goal))
            #    loss = goal_reconstruction_loss(goal, reconstructed_goal)
            #    print("goal_reconstruction_loss", loss.data.item())
            #    partial_backprop(loss)

            # we could also learn to predict the actions we just performed when
            # the goal is the observed outcome; this is hindsight experience
            # replay (but this version doesn't seem to work well)
            #predicted_action_parameters,_ = net.predict_action(hindsight_goal,observations, output_mean=True)
            #loss = action_reconstruction_loss(predicted_action_parameters, torch.Tensor(action_parameters))
            #partial_backprop(loss, [net.goal_encoder])

            # we update the policy and value function following a
            # non-bootstrapped actor-critic approach: delta is an unbiased
            # estimator of the difference between the predicted value `value`
            # and the true expected reward (estimated by the observed
            # `goal_reward`); we train the value function to minimize their
            # squared difference delta**2
            delta = goal_reward - value
            print("value", value.data.item())
            reward_value_fun = 0.5 * delta**2
            partial_backprop(reward_value_fun, [])

            # then we update the policy with a policy gradient step, using
            # delta as the advantage; note that we detach delta, so it becomes
            # a scalar and no gradients are backpropagated through it
            loss_policy = -delta.detach() * log_prob_action
            partial_backprop(loss_policy)

            ### Hindsight Experience Replay
            #value = net.compute_value(hindsight_goal, observations)[0,0,:]
            #value = value - 1
            #log_prob_action = net.get_log_prob_action(hindsight_goal, observations, actions)
            #goal_reward = env.compute_reward(new_observations[0,0,pen_vars_slice].numpy(), hindsight_goal[0,0,:].detach().numpy(), None)
            #delta2 = goal_reward - value
            #reward_value_fun = 0.5*delta2**2
            #partial_backprop(reward_value_fun)
            #loss_policy = -delta2.detach()*log_prob_action
            #partial_backprop(loss_policy)

            # we define absolute learning progress as the absolute value of the
            # "Bellman" error `delta`. If delta is high, the reward differed
            # significantly from our expectations, so we updated the policy a
            # lot, which I interpret as significant learning progress; if delta
            # is very small, we didn't learn anything we didn't already know.
            #learning_progress = torch.abs(delta+delta2)
            learning_progress = torch.abs(delta)
            lps.append(learning_progress.data.item())

            # we use `learning_progress` (lp) as the reward to train the
            # goal-generating process (the goal policy). Because the agent
            # explores goals continually, we use a continual-learning RL
            # method: the average reward method (Sutton and Barto, 2nd ed.,
            # secs. 10.3 and 13.6). Average-reward RL uses differential value
            # functions, which estimate the difference between {the expected
            # average reward following a state-action pair} and {the average
            # reward over all time, assuming ergodicity} -- the differential
            # return. This difference measures the transient advantage of
            # performing this action over others. A version of the policy
            # gradient theorem shows that using it in place of the expected raw
            # reward of the episodic setting maximizes the average reward over
            # all time, the desired objective in the continual setting. This
            # theory assumes ergodicity, and optimality is only provable for
            # tabular RL or linear models, not NNs, but those are problems with
            # all of RL theory really.
            # `delta` is now the Bellman error between {the previous estimate
            # of the expected differential return (`previous_lp_value`)} and
            # {the new bootstrapped estimate
            # `learning_progress.detach() - average_lp_estimate + lp_value.detach()`}
            delta = learning_progress.detach() - average_lp_estimate \
                + lp_value.detach() - previous_lp_value
            # note that we detach learning_progress and lp_value; this is
            # standard for Bellman errors: we don't backpropagate through the
            # bootstrapped target!
            # also update `average_lp_estimate`, the estimate of the average reward
            average_lp_estimate = average_lp_estimate + alpha * delta.detach()

            if iteration > 0 and lp_training:
                # only do this once we have a previous_lp_value
                # update the differential value function for the goal policy
                loss_lp_value_fun = 0.5 * delta**2
                partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                # update the goal policy using policy gradient. We don't update
                # the goal_decoder: once the autoencoder has trained well, each
                # latent vector represents a goal 1-to-1, and the action
                # decoder can learn the actions corresponding to that goal. If
                # we kept changing the goal_decoder, the action decoder could
                # "get confused", as its actual goal would change even for a
                # fixed latent vector.
                loss_goal_policy = -delta.detach() * log_prob_goal
                partial_backprop(loss_goal_policy, [])

            optimizer.step()
            previous_lp_value = lp_value

        observations = new_observations
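# A minimal, self-contained sketch of the average-reward actor-critic update
# used above (Sutton and Barto, 2nd ed., secs. 10.3 and 13.6). The function
# name and arguments are illustrative stand-ins for the network outputs, not
# this code's exact API; reward is a float, v_prev/v_next/log_prob are scalar
# tensors.
import torch

def average_reward_ac_update(reward, v_prev, v_next, log_prob, avg_reward,
                             alpha=0.1):
    # differential TD error: reward relative to the long-run average,
    # bootstrapped with the next differential value
    delta = reward - avg_reward + v_next.detach() - v_prev
    value_loss = 0.5 * delta ** 2             # train the differential value fn
    policy_loss = -delta.detach() * log_prob  # policy gradient, delta as advantage
    avg_reward = avg_reward + alpha * float(delta.detach())  # track avg reward
    return value_loss, policy_loss, avg_reward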
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_ddpg_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, n_hidden=256)
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    if rendering:
        setup_render(env)

    rewards = []
    lps = []
    temp_buffer = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add one)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions, and
            # the predicted values of action and goal
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], \
                    noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            pen_vars_slice = slice(54, 61)
            hindsight_goal = new_observations[0, 0, pen_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)
            total_delta = 0

            ## update the q-value network on the desired goal
            #optimizer.zero_grad()
            #value = net.compute_q_value(noisy_goals.detach(), observations, noisy_actions)
            #print("q-value", value.data.item())
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            ## update the q-value network on the hindsight goal:
            ## by definition we have achieved the hindsight goal, so the reward is 0.0
            #goal_reward = 0.0
            #optimizer.zero_grad()
            #value = net.compute_q_value(hindsight_goals.detach(), observations, noisy_actions)
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            # update the policy towards actions with a higher q-value; this
            # helps it learn about actions for goals which are achievable.
            # TODO alternative: because several actions leading to the same
            # outcome is probably unlikely in this environment, we could train
            # the action decoder directly on the hindsight goal without any
            # problem for intrinsic motivation, I think.
            if np.random.rand() <= 1.0:
                #print(net.action_decoder.state_dict().items())
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    if i == 0:
                        hindsight_actions_original = \
                            hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for a perturbed
                    # hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach() +
                        0.1 * torch.rand_like(hindsight_goals), observations)
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(-action_reconstruction_loss)
                    optimizer.step()
                new_actions = net.compute_actions(hindsight_goals,
                                                  observations)
                action_difference = torch.norm(
                    new_actions - hindsight_actions_original) / torch.norm(
                        hindsight_actions_original)
            else:
                actions = net.compute_actions(noisy_goals, observations)
                optimizer.zero_grad()
                loss_policy = -net.compute_q_value(noisy_goals.detach(),
                                                   observations, actions)
                partial_backprop(loss_policy, [net.q_value_decoder])
                optimizer.step()
                new_actions = net.compute_actions(noisy_goals, observations)
                action_difference = torch.norm(
                    new_actions - actions) / torch.norm(actions)

            '''COMPUTE LEARNING PROGRESS'''
            print("action difference", action_difference)
            #learning_progress = torch.abs(delta) + action_difference
            #learning_progress = torch.max(total_delta,action_difference)
            #learning_progress = 0.1*torch.abs(total_delta)+2*action_difference
            learning_progress = action_difference
            #learning_progress = action_difference + 0.001*goal_reward
            learning_progress *= 10
            #learning_progress = goal_reward
            print("learning progress", learning_progress.data.item())
            lps.append(learning_progress.data.item())
            temp_buffer.append((observations, hindsight_goals,
                                learning_progress.detach(), new_observations))

            '''TRAIN GOAL POLICY'''
            if iteration % 20 == 19 and lp_training:
                # only do this once we have a previous_lp_value; several
                # training iterations so the goal policy updates quickly and
                # adapts to the agent's learning
                for i in range(20):
                    optimizer.zero_grad()
                    # np.random.choice cannot sample from a list of tuples, so
                    # sample an index instead
                    index = np.random.choice(len(temp_buffer))
                    observations, hindsight_goals, learning_progress, \
                        new_observations = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()
                for i in range(20):
                    index = np.random.choice(len(temp_buffer))
                    observations, hindsight_goals, learning_progress, \
                        new_observations = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    loss_goal_policy = -delta.detach() * log_prob_goals[0, 0]
                    partial_backprop(loss_goal_policy)
                    optimizer.step()
                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)

    '''ENVIRONMENT'''
    import gym
    #env = gym.make("HandManipulatePen-v0")
    #env = gym.make("HandManipulateEgg-v0")
    env = gym.make("FetchSlide-v1")
    #goal_vars_slice = slice(54, 61)
    goal_vars_slice = slice(3, 6)
    results = env.reset()
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    #goal_dim = 7
    goal_dim = 3
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1  # hyperparameter used in the average lp estimate for the goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''
    from learning_progress_a2c import GOALRNN
    net = GOALRNN(batch_size, number_layers, observation_dim, action_dim,
                  goal_dim, goal_vars_slice, n_hidden=256)
    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))
    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    # initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    #if rendering:
    #    setup_render(env)

    def g(epsilon, delta):
        # clipped branch of the PPO-style surrogate objective
        if delta >= 0:
            return (1 + epsilon) * delta
        else:
            return (1 - epsilon) * delta

    rewards = []
    lps = []
    temp_buffer = []

    # a function that runs backprop without accumulating gradients in certain
    # modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False

    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add one)
    for iteration in range(1000000):
        if evaluating:
            # if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            # feed observations to the net; get the desired goal, actions, and
            # the predicted values of action and goal
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], \
                    noisy_goals[0, 0, :]
            else:
                # if we are not training the goal policy, ignore the goal
                # policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()

        # run the action using the DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
            if evaluating:
                print(results[1])
                rewards.append(results[1])
            if rendering:
                #render_with_target(env, goal.detach().numpy())
                env.render()
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("resetting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break

        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc.
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []
            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            #if not evaluating and lp_training:
            #    learning_progress = Variable(torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  # if not evaluating, then train
            hindsight_goal = new_observations[0, 0, goal_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(
                hindsight_goal.numpy(), goal.detach().numpy(), None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, goal_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)
            temp_buffer.append((observations, noisy_actions, hindsight_goals,
                                0, new_observations, log_prob_goals.detach()))

            '''TRAIN GOAL POLICY'''
            if iteration > 0 and (iteration % 20) == 0 and lp_training:
                # only do this once we have a previous_lp_value; several
                # training iterations so the goal policy updates quickly and
                # adapts to the agent's learning
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    print(index)
                    observations, noisy_actions, hindsight_goals, _, \
                        new_observations, log_prob_goals_old = \
                        temp_buffer[index]
                    optimizer.zero_grad()
                    # find the actions the policy predicts for the hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    #hindsight_actions_original = hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        noisy_actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                    new_actions = net.compute_actions(hindsight_goals,
                                                      observations)
                    action_difference = torch.norm(
                        new_actions - hindsight_actions) / torch.norm(
                            hindsight_actions)
                    learning_progress = action_difference
                    print("learning progress",
                          learning_progress.data.item())
                    lps.append(learning_progress.data.item())
                    temp_buffer[index] = (observations, noisy_actions,
                                          hindsight_goals,
                                          learning_progress.detach(),
                                          new_observations,
                                          log_prob_goals_old)
                for i in range(20):
                    optimizer.zero_grad()
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, \
                        new_observations, _ = temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, \
                        new_observations, log_prob_goals_old = \
                        temp_buffer[index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    #loss_goal_policy = -delta.detach()*log_prob_goals[0,0]
                    #loss_goal_policy = -delta.detach()*torch.exp(log_prob_goals[0,0]-log_prob_goals_old)
                    # PPO-style clipped surrogate objective
                    loss_goal_policy = -torch.min(
                        delta.detach() *
                        torch.exp(log_prob_goals[0, 0] - log_prob_goals_old),
                        g(0.5, delta.detach()))
                    partial_backprop(loss_goal_policy)
                    optimizer.step()
                temp_buffer = []
            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
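# The goal-policy update above combines an importance-sampling ratio with the
# g(epsilon, delta) helper defined earlier; together they form the PPO clipped
# surrogate objective (Schulman et al., 2017). A standalone sketch with
# illustrative names:
import torch

def clipped_surrogate_loss(delta, log_prob_new, log_prob_old, epsilon=0.5):
    ratio = torch.exp(log_prob_new - log_prob_old)  # pi_new / pi_old
    # g(epsilon, delta): (1+eps)*delta for a positive advantage,
    # (1-eps)*delta for a negative one -- the clipped branch of the objective
    clipped = torch.where(delta >= 0, (1 + epsilon) * delta,
                          (1 - epsilon) * delta)
    # maximize the minimum of the unclipped and clipped terms
    return -torch.min(ratio * delta, clipped)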
def run(args):
    if args.seed is not None:
        np.random.seed(args.seed)

    # Set default parameters that depend on the environment
    if args.n_dims is None:
        if args.env in ('Point', 'Arm'):
            args.n_dims = 2
        elif args.env in ('OneDof', 'Quadratic'):
            args.n_dims = 1
        else:
            raise NotImplementedError
    if args.n_bfs is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.n_bfs = 10
        elif args.env == 'Quadratic':
            args.n_bfs = 1
        else:
            raise NotImplementedError
    if args.dt is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.dt = 0.01
        elif args.env == 'Quadratic':
            args.dt = 1.0
        else:
            raise NotImplementedError
    if args.duration is None:
        if args.env in ('Point', 'Arm', 'OneDof'):
            args.duration = 0.5
        elif args.env == 'Quadratic':
            args.duration = 2.0
        else:
            raise NotImplementedError

    # Set default parameters that depend on the agent. Every agent shares the
    # same default; only normalization changes it (earlier tuned values such
    # as 1.0/100000.0 were tried for MDS/AMDS/MDSKDE).
    known_agents = ('PI2', 'MDS', 'GMDS', 'AMDS', 'MDSKDE', 'REPS', 'AREPS')
    if args.r_gain is None:
        if args.agent not in known_agents:
            raise NotImplementedError
        args.r_gain = 10 if args.r_normalize else 10**8
    if args.std is None:
        if args.agent not in known_agents:
            raise NotImplementedError
        args.std = 1 if args.agent == 'MDSKDE' else 10

    if args.plotdir is not None:
        if not args.plot:
            args.plot = []
        for d in args.plotdir:
            directory = glob.glob('{}/*'.format(os.path.normpath(d)))
            args.plot.extend(directory)

    # Check parameters
    if args.env == 'Point':
        assert 2 == args.n_dims, \
            'Number Error: n_dims of the Point task allows only 2 dimensions'
    elif args.env == 'OneDof':
        assert 1 == args.n_dims, \
            'Number Error: n_dims of the OneDof task allows only 1 dimension'
    if args.env == 'Quadratic':
        assert 1 == args.n_bfs, \
            'Number Error: n_bfs of the Quadratic task allows only 1 dimension'
        assert 2 == args.duration, \
            'Number Error: duration of the Quadratic task allows only 2 [s]'
        assert 1 == args.dt, \
            'Number Error: dt of the Quadratic task allows only 1 [s]'

    args.outdir = prepare_output_dir(args, args.outdir, argv=sys.argv)
    Debug.set_outdir(args.outdir)
    Debug.set_n_updates(args.n_updates)

    # the Quadratic task does not use a DMP
    if args.env == 'Quadratic':
        dmps = None
    else:
        dmps = [[] for _ in range(args.n_dims)]
    n_times = int(args.duration / args.dt)

    # set environment
    if args.env == 'Arm' or args.env == 'Point':
        env = eval(args.env)(dmps, args.n_dims, args.n_bfs, n_times,
                             args.duration, args.dt)
    # elif args.env == 'Quadratic':
    #     env = eval(args.env)(args.n_dims)
    else:
        # TODO
        raise NotImplementedError

    # set agent
    if args.agent == 'PI2':
        agent = PI2(dmps, args.n_updates, args.n_reps, args.n_dims,
                    args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'GMDS':
        agent = GMDS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'MDS':
        agent = MDS(args.n_updates, args.n_reps, args.n_dims, args.n_bfs,
                    n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'AMDS':
        agent = AMDS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'MDSKDE':
        agent = MDSKDE(args.n_updates, args.n_reps, args.n_dims, args.n_bfs,
                       n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'REPS':
        agent = REPS(dmps, args.n_updates, args.n_reps, args.n_dims,
                     args.n_bfs, n_times, args.r_gain, args.r_normalize)
    elif args.agent == 'AREPS':
        agent = AREPS(dmps, args.n_updates, args.n_reps, args.n_dims,
                      args.n_bfs, n_times, args.r_gain, args.r_normalize)
    else:
        raise NotImplementedError

    # Set up the DMPs, initializing each with a minimum-jerk trajectory
    if args.env == 'Point' or args.env == 'Arm':
        for i in range(args.n_dims):
            dmps[i] = DMP(args.n_bfs, args.duration, 0.5, args.dt, True)
            dmps[i].reset()
            dmps[i].set_goal(env.gxi[i])
            agent.theta[i] = dmps[i].minimize_jerk()

    if args.plot is None:
        learn(args, agent, env, n_times)
    else:
        plot(args)
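# dmps[i].minimize_jerk() is not shown in this excerpt; for reference, the
# standard minimum-jerk profile it presumably encodes is the quintic
# x(t) = x0 + (g - x0) * (10*tau^3 - 15*tau^4 + 6*tau^5), with tau = t/T,
# which has zero velocity and acceleration at both endpoints. A minimal sketch
# of that profile (function name is illustrative):
import numpy as np

def min_jerk_profile(x0, g, duration, dt):
    tau = np.arange(0.0, duration, dt) / duration
    return x0 + (g - x0) * (10 * tau**3 - 15 * tau**4 + 6 * tau**5)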
goal_loss = nn.MSELoss()
goal_reconstruction_loss = nn.MSELoss()

previous_goal_reward = torch.Tensor([-10])
# previous_value = torch.zeros(1)
previous_lp_value = torch.zeros(1)
learning_progress = torch.zeros(1)
average_reward_estimate = 0
average_lp_estimate = 0
alpha = 0.1

env.relative_control = True
from dmp import DMP
n_simulation_steps = 25
dmp = DMP(10, n_simulation_steps, n_actuators)

#%%
save_goals = False
#rendering = True
rendering = False
evaluating = False
save_freq = 300
forget_freq = 300
# forget_freq = 2

action = env.action_space.sample()
observation = env.step(action)[0]["observation"]
observations = np.expand_dims(np.expand_dims(observation, 0), 0)
observations = torch.Tensor(observations)
parser.add_argument('--camera', type=int, default=0)
parser.add_argument('--robot', action='store_true')
args = parser.parse_args()

g_net_param = torch.load(args.model_path,
                         map_location='cpu') if args.model_path else None
if g_net_param:
    cfg = g_net_param['config']
else:
    cfg = Config()

logger = SummaryWriter(os.path.join(cfg.log_path, cfg.experiment_name))
torch.cuda.set_device(0)
if cfg.use_DMP:
    dmp = DMP(cfg)

# loaders
generator_train = build_loader(cfg, True)   # function pointer
generator_test = build_loader(cfg, False)   # function pointer


class CMP(object):
    def __init__(self, config):
        self.cfg = config
        self.condition_net = NN_img_c(
            sz_image=self.cfg.image_size,
            ch_image=self.cfg.image_channels,
            tasks=self.cfg.number_of_tasks,
            task_img_sz=(self.cfg.object_size[0]
                         if self.cfg.img_as_task else -1))
def main(argv):
    '''PREPPING UP variables'''
    global FLAGS
    FLAGS = FLAGS.flag_values_dict()
    # print(FLAGS)
    globals().update(FLAGS)

    context_dim = env.observation_space["observation"].shape[0]
    FLAGS["context_dim"] = context_dim
    n_steps = n_simulation_steps // outcome_sampling_frequency
    FLAGS["n_steps"] = n_steps
    n_actuators = env.action_space.shape[0]
    FLAGS["n_actuators"] = n_actuators
    # +1 for the target position, in the DMP parametrization
    action_dim = n_actuators * (n_dmp_basis + 1)
    FLAGS["action_dim"] = action_dim
    outcome_dim = context_dim * n_steps
    FLAGS["outcome_dim"] = outcome_dim
    FLAGS["memory_dim"] = context_dim + outcome_dim + action_dim
    globals().update(FLAGS)

    indices_hand_pos = slice(0 * n_steps, 24 * n_steps)
    indices_hand_vel = slice(24 * n_steps, 48 * n_steps)
    indices_pen_pos = slice(48 * n_steps, 51 * n_steps)
    indices_pen_rot = slice(51 * n_steps, 55 * n_steps)
    indices_pen_vel = slice(55 * n_steps, 58 * n_steps)
    indices_pen_rotvel = slice(58 * n_steps, 61 * n_steps)

    global n_goal_spaces, goal_spaces_indices, goal_spaces_names, \
        reward_funs, find_memory_by_reward, find_memory_by_slice
    # goal_spaces_indices = [indices_hand_pos,indices_hand_vel,indices_pen_pos,indices_pen_rot,indices_pen_vel,indices_pen_rotvel]
    goal_spaces_indices = [
        indices_pen_pos, indices_pen_rot, indices_pen_vel, indices_pen_rotvel
    ]
    # goal_spaces_names = ["hand_pos","hand_vel","pen_pos","pen_rot","pen_vel","pen_rotvel"]
    # goal_spaces_names = ["pen_pos","pen_rot","pen_vel","pen_rotvel"]
    goal_spaces_names = ["pen_pos", "pen_rot"]
    goal_precisions = [0.7, 1.0, 2.0, 3.0, 10.0]
    # cross each goal space with each precision level
    goal_spaces_indices = [
        goal_spaces_indices[i] for i, g in enumerate(goal_spaces_names)
        for p in goal_precisions
    ]
    goal_spaces_names = [
        goal_space_name + "_" + "{0:1}".format(precision)
        for goal_space_name in goal_spaces_names
        for precision in goal_precisions
    ]
    n_goal_spaces = len(goal_spaces_indices)

    env.relative_control = True
    from dmp import DMP
    dmp = DMP(n_dmp_basis, n_simulation_steps, n_actuators)
    action_rollout = dmp.action_rollout

    # an outcome is a flattened (row-major) version of an array of dimensions
    # (observation_dim, n_steps)
    # goal_reward_vectorized = np.vectorize(goal_reward)

    def reward_funs(goal_space):
        precision = float(goal_spaces_names[goal_space].split("_")[-1])

        def reward_fun(context, outcomes, goal):
            outcome_slice = slice(goal_spaces_indices[goal_space].start,
                                  goal_spaces_indices[goal_space].stop)
            outcomes = outcomes[:, outcome_slice]
            return goal_reward(goal_space, goal, outcomes, context, precision)

        return reward_fun

    def find_memory_by_reward(context, goal_space, goal):
        outcomes = database[:, context_dim:-action_dim]
        rewards = reward_funs(goal_space)(context, outcomes, goal)
        goodness = -np.linalg.norm(database[:, :context_dim] - context,
                                   axis=1) + rewards
        index_of_memory = np.argmax(goodness, axis=0)
        return index_of_memory, goodness[index_of_memory]

    default_mask = np.zeros(memory_dim)
    default_mask[:context_dim] = 1

    def find_memory_by_slice(context, outcome_slice, goal, action=None,
                             mask=default_mask):
        # TODO: this is a hacky function, very specific to our implementation;
        # it works because the goal and outcome spaces are the same
        query_vector = np.zeros(memory_dim)
        query_vector[:context_dim] = context
        # indices corresponding to the part of the outcome we are querying against
        memory_slice = slice(context_dim + outcome_slice.start,
                             context_dim + outcome_slice.stop)
        query_vector[memory_slice] = goal / np.sqrt(goal.shape[0])
        mask[memory_slice] = 1
        if action is not None:
            query_vector[-action_dim:] = action
        distances = np.linalg.norm((database - query_vector) * mask, axis=1)
        index_of_memory = np.argmin(distances, axis=0)
        return index_of_memory, distances[index_of_memory]

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    if evaluating:
        assert size == 1
    if not evaluating:
        assert size == 2

    # initialize NN
    from meta_policy_neural_net import make_meta_policy_nn_model
    global meta_policy_nn_model
    meta_policy_nn_model = make_meta_policy_nn_model(FLAGS)
    if os.path.exists(os.path.join(THIS_DIR, "model_weights.p")):
        updated_model_weigths = pickle.load(
            open(os.path.join(THIS_DIR, "model_weights.p"), "rb"))
        meta_policy_nn_model.set_weights(updated_model_weigths)

    if rank == 0:
        # yes, this is ugly
        global database
        global intrinsic_rewards
        global goal_space_probs
        files = os.listdir(MEMORIES_DIR)
        # add path to each file
        files = [os.path.join(MEMORIES_DIR, f) for f in files]
        files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
        MAX_MEMORY_SIZE = 10000
        # MAX_MEMORY_SIZE = 100000
        print(files)
        if len(files) > 0:
            for ii, filename in enumerate(files):
                if ii == 0:
                    database = np.load(filename)
                else:
                    database = np.concatenate(
                        [database, np.load(filename)], 0)
                if len(database) > MAX_MEMORY_SIZE:
                    break
        else:
            database = None
        if os.path.exists(os.path.join(THIS_DIR, "intrinsic_rewards.p")):
            intrinsic_rewards = pickle.load(
                open(os.path.join(THIS_DIR, "intrinsic_rewards.p"), "rb"))
        else:
            intrinsic_rewards = np.array(
                [0.01 for index in goal_spaces_indices], dtype=np.float32)
        goal_space_probs = goal_space_probabilities(intrinsic_rewards)

        '''INITIALIZE ENVIRONMENT'''
        results = env.reset()
        context = results["observation"]
        if evaluating:
            pen_goal = results["desired_goal"]
            # pen_goal = pen_goal[:3]
            # pen_goal = pen_goal[3:]
            goal = np.tile(np.expand_dims(pen_goal, 0), (n_steps, 1))
            goal = np.reshape(goal.T, (-1))

        '''INITIALIZE MEMORY DATABASE VIA RANDOM EXPLORATION'''
        if database is None:  # cold start
            print("random warming up")
            reset_env = False
            memories = 0
            while memories < 1000:
                observations = []
                action_parameter = 2 * np.random.rand(action_dim) - 1
                for i in range(n_simulation_steps):
                    if rendering:
                        env.render()
                    action = action_rollout(context, action_parameter, i)
                    results = env.step(action)
                    obs = results[0]["observation"]
                    done = results[2]
                    if done:
                        print("resetting environment")
                        results = env.reset()
                        reset_env = True
                        break
                    if i % outcome_sampling_frequency == 0:
                        observations.append(obs)
                if reset_env:
                    reset_env = False
                    continue
                else:
                    print("Adding memory")
                    sys.stdout.flush()
                    memories += 1
                    outcome = np.reshape(np.stack(observations).T,
                                         (outcome_dim))
                    if database is None and memories == 1:
                        database = np.expand_dims(
                            np.concatenate(
                                [context, outcome, action_parameter]), 0)
                    else:
                        update_exploration_policy(context, outcome,
                                                  action_parameter)
        else:
            memories = database.shape[0]

        if not evaluating:
            # signal the consolidation process to keep going
            comm.send(True, dest=1, tag=11)

        '''TRAINING LOOP'''
        print("active goal babbling")
        reset_env = False
        for iteration in range(200000):
            print("iteration", iteration)
            # choose one of the goal spaces, as an index from 0 to
            # len(goal_spaces_indices) - 1
            goal_space = goal_space_policy(context)
            if not evaluating:
                if comm.Iprobe(source=1, tag=12):
                    updated_model_weigths = comm.recv(source=1, tag=12)
                    meta_policy_nn_model.set_weights(updated_model_weigths)
            # goal is of size len(goal_spaces_indices[goal_space])
            goal = goal_policy(goal_space, context)
            if evaluating:
                action_parameter = meta_policy(goal_space, goal, context)
            else:
                # use the exploration policy
                action_parameter = exploration_meta_policy(
                    goal_space, goal, context)

            observations = []
            for i in range(n_simulation_steps):
                action = action_rollout(context, action_parameter, i)
                results = env.step(action)
                if rendering:
                    env.render()
                obs = results[0]["observation"]
                done = results[2]
                if done:
                    print("resetting environment")
                    results = env.reset()
                    if evaluating:
                        pen_goal = results["desired_goal"]
                        # pen_goal = pen_goal[:3]
                        # pen_goal = pen_goal[3:]
                        goal = np.tile(np.expand_dims(pen_goal, 0),
                                       (n_steps, 1))
                        goal = np.reshape(goal.T, (-1))
                    reset_env = True
                    break
                if i % outcome_sampling_frequency == 0:
                    observations.append(obs)

            if reset_env:
                reset_env = False
                # continue
            else:
                memories += 1

            outcome = np.reshape(np.stack(observations).T, (outcome_dim))
            if not evaluating:
                intrinsic_rewards = update_intrinsic_reward(
                    intrinsic_rewards, goal_space, goal, context, outcome)
                print(goal_spaces_names)
                print(intrinsic_rewards)
                sys.stdout.flush()
            if not evaluating:
                update_exploration_policy(context, outcome, action_parameter)
                update_goal_space_policy()
                if iteration % save_freq == save_freq - 1:
                    print("Saving new batch of memories")
                    sys.stdout.flush()
                    # pickle.dump(database, open("database.p","wb"))
                    database = database[-MAX_MEMORY_SIZE:]
                    np.save(
                        os.path.join(MEMORIES_DIR,
                                     "database_" + str(iteration) + ".npy"),
                        database[-save_freq:])
                    pickle.dump(
                        intrinsic_rewards,
                        open(os.path.join(THIS_DIR, "intrinsic_rewards.p"),
                             "wb"))
                    # signal the consolidation process to keep going
                    comm.send(True, dest=1, tag=11)
            context = observations[-1]
        comm.send(False, dest=1, tag=11)

    if rank == 1:
        from meta_policy_neural_net import learn_from_database
        while comm.recv(source=0, tag=11):
            files = os.listdir(MEMORIES_DIR)
            # add path to each file
            files = [os.path.join(MEMORIES_DIR, f) for f in files]
            files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
            sys.stdout.flush()
            for filename in files:
                if not comm.Iprobe(source=0, tag=11):
                    database = np.load(filename)
                    print("Training on", filename)
                    sys.stdout.flush()
                    meta_policy_nn_model = learn_from_database(
                        meta_policy_nn_model, database, FLAGS)
                    updated_model_weigths = meta_policy_nn_model.get_weights()
                    comm.send(updated_model_weigths, dest=0, tag=12)
                    pickle.dump(
                        updated_model_weigths,
                        open(os.path.join(THIS_DIR, "model_weights.p"),
                             "wb"))
                else:
                    break
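# goal_space_probabilities() is called above but not defined in this excerpt.
# A plausible sketch, assuming goal spaces are sampled in proportion to their
# intrinsic reward, mixed with a little uniform exploration (eps is an
# illustrative parameter, not from the original code):
import numpy as np

def goal_space_probabilities(intrinsic_rewards, eps=0.1):
    r = np.maximum(np.asarray(intrinsic_rewards, dtype=np.float64), 0.0)
    if r.sum() > 0:
        probs = r / r.sum()
    else:
        probs = np.full(len(r), 1.0 / len(r))
    return (1 - eps) * probs + eps / len(r)  # mix with the uniform distribution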