Code example #1
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)
    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    #env.observation_space["observation"].shape[0]
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    alpha = 0.1  # hyperparameter used in average lp estimate for goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''

    from learning_progress_rnn import GOALRNN
    net = GOALRNN(batch_size,
                  number_layers,
                  observation_dim,
                  action_dim,
                  goal_dim,
                  n_hidden=256)

    # load the RNN if saved. TODO: maybe save only the weights to avoid bugs after the architecture changes.
    if os.path.isfile("lprnn" + experiment_name + ".pt"):
        temp_net = torch.load("lprnn" + experiment_name + ".pt")
        net.load_state_dict(temp_net.state_dict())

    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    optimizer = optim.RMSprop(net.parameters())
    goal_loss = nn.MSELoss()
    goal_reconstruction_loss = nn.MSELoss()
    action_reconstruction_loss = nn.MSELoss()

    # initial values of several variables
    previous_goal_reward = torch.Tensor([-1.0])
    previous_lp_value = torch.zeros(1)
    learning_progress = torch.zeros(1)
    #average_reward_estimate = 0
    average_lp_estimate = 0

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        #goal = np.tile(np.expand_dims(pen_goal,0),(n_steps,1))
        #goal = np.reshape(goal.T,(-1))
        goal = torch.Tensor(pen_goal)
    if rendering:
        setup_render(env)

    rewards = []
    lps = []

    # a helper that backpropagates a loss without accumulating gradients in certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    pen_vars_slice = slice(54, 61)
    pen_pos_center = torch.Tensor([1.0, 0.90, 0.15]).unsqueeze(0).unsqueeze(0)
    print(observations.shape)
    reset_env = False
    for iteration in range(1000000):
        #print(observations)
        if evaluating:  #if evaluating we just use the action prediction part of the network
            #print(goal, observations)
            action_parameters, _ = net.predict_action(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
            #print(action_parameters.shape)
        else:
            #feed observations to the net, get the desired goal, the actions (and their log-probabilities), and the predicted values of the action and the goal
            actions, log_prob_action, goal, log_prob_goal, value, lp_value = net(
                observations,
                learning_progress.detach().unsqueeze(0).unsqueeze(0))
            value = value - 1  # learn the difference between the value and -1, because at the beginning most values will be close to -1
            pen_pos = observations[:, :, pen_vars_slice][..., :3]
            pen_rot = observations[:, :, pen_vars_slice][..., 3:]
            rot_goal = goal[:, :, 3:]
            rel_rot_goal = (rot_goal - pen_rot) * 0.1 + pen_rot
            goal = torch.cat([(goal[:, :, :3] - pen_pos) * 0.002 + pen_pos,
                              rel_rot_goal / torch.norm(rel_rot_goal)],
                             dim=2)
            #goal += 0.05*torch.randn_like(goal)
            goal = Variable(goal.data, requires_grad=True)
            if lp_training:
                action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = actions[
                    0,
                    0, :], log_prob_action[0, 0], goal[0, 0, :], log_prob_goal[
                        0, 0], value[0, 0, :], lp_value[0, 0, :]
            else:  # if we are not training the goal policy then ignore the goal policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters, log_prob_action, _, _, value, lp_value = actions[
                    0,
                    0, :], log_prob_action[0, 0], goal[0, 0, :], log_prob_goal[
                        0, 0], value[0, 0, :], lp_value[0, 0, :]

        action_parameters = action_parameters.detach().numpy()
        #print(action_parameters)
        #run action using DMP
        for i in range(n_simulation_steps):
            # print(context.shape)
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                #goal = np.tile(np.expand_dims(pen_goal,0),(n_steps,1))
                #goal = np.reshape(goal.T,(-1))
                goal = torch.Tensor(pen_goal)
                if evaluating:
                    print(results[1])
                    rewards.append(results[1])
            if rendering:
                #env.render()
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("reseting environment")
                results = env.reset()
                #print(results)
                obs = results["observation"]
                reset_env = True
                break
        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                #torch.save(net, "lprnn.pt")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                lps = []

            if save_goals:
                if iteration == 0:
                    goals = np.expand_dims(goal, 0)
                else:
                    goals = np.concatenate(
                        [goals, np.expand_dims(goal, 0)], axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        # we detach the RNN every so often, to determine how far to backpropagate through time
        # TODO: do this in a way that doesn't start from scratch, but instead backpropagates forget_freq many iterations in the past
        # at *every* time step!!
        if iteration % forget_freq == forget_freq - 1:
            net.forget()

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                previous_lp_value = lp_value
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  #if not evaluating, then train
            optimizer.zero_grad()

            # train the policy to maximize the goal reward (which is -goal_loss, i.e. -|observation-goal|^2); here we only look at the pen part of the observation
            #goal_reward = -1*goal_loss(new_observations[0,0,pen_vars_slice],goal)
            #goal_reward = goal_reward*(goal_reward>-1)
            goal_reward = env.compute_reward(
                new_observations[0, 0, pen_vars_slice].numpy(),
                goal.detach().numpy(), None)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            rewards.append(goal_reward)

            hindsight_goal = new_observations[:, :, pen_vars_slice]

            #if iteration <= 100000:
            #    # we train for the goal reconstruction part of the network
            #    # we use the hindsight_goal (the outcome of our action, to ensure we autoencode reachable goals, and explore more effectively
            #    reconstructed_goal = net.autoencode_goal(hindsight_goal+0.01*torch.randn_like(hindsight_goal))
            #    loss = goal_reconstruction_loss(goal, reconstructed_goal)
            #    print("goal_reconstruction_loss", loss.data.item())
            #    partial_backprop(loss)

            # we also learn to predict the actions we just performed when the goal is the observed outcome
            # this is called hindsight experience replay (but this version doesn't seem to work well)
            #predicted_action_parameters,_ = net.predict_action(hindsight_goal,observations, output_mean=True)
            #loss = action_reconstruction_loss(predicted_action_parameters, torch.Tensor(action_parameters))
            #partial_backprop(loss, [net.goal_encoder])

            # we update the policy and value function following a non-bootstrapped actor-critic approach
            # we update the state-action value function by computing delta,
            # delta is an unbiased estimator of the difference between the predicted value `value` and
            # the true expected reward (estimated by the observed `goal_reward`)
            # we train the value function to minimize their squared difference delta**2
            delta = goal_reward - value
            print("value", value.data.item())
            reward_value_fun = 0.5 * delta**2
            partial_backprop(reward_value_fun, [])

            # then we update the policy using a policy gradient update
            # where delta is used as the advantage
            # note that we detach delta, so that it becomes a scalar, and gradients aren't backpropagated through it anymore
            loss_policy = -delta.detach() * log_prob_action
            partial_backprop(loss_policy)

            ###Hindsight Experience Replay
            #value = net.compute_value(hindsight_goal, observations)[0,0,:]
            #value = value - 1
            #log_prob_action = net.get_log_prob_action(hindsight_goal, observations, actions)
            #goal_reward = env.compute_reward(new_observations[0,0,pen_vars_slice].numpy(), hindsight_goal[0,0,:].detach().numpy(), None)
            #delta2 = goal_reward - value
            #reward_value_fun = 0.5*delta2**2
            #partial_backprop(reward_value_fun)

            #loss_policy = -delta2.detach()*log_prob_action
            #partial_backprop(loss_policy)

            # we define absolute learning progress as the absolute value of the "Bellman" error, `delta`
            # If delta is high, that means that the reward we got was significantly different from our expectations
            # which means we updated our policy a lot
            # which I am interpreting as "you have learned a lot" -- you have made significant learning progress
            # on the other hand if delta is very small, you didn't learn anything you didn't already know.
            #learning_progress = torch.abs(delta+delta2)
            learning_progress = torch.abs(delta)
            lps.append(learning_progress.data.item())

            # we use `learning_progress` (lp) as reward to train the goal-generating process (aka goal policy).
            # because the agent will be exploring goals in a continual way
            # we use a "continual learning" method for RL
            # in particular we use the average reward method, explained in Sutton and Barto 2nd edition (10.3, 13.6)
            # In short average reward RL uses differential value functions
            # which estimate the difference between {expected average reward following a state-action} and {average reward over all time - assume ergodicity}
            # -- called the differential return --
            # this difference measures the transient advantage of performing this action over other actions
            # there is a version of the policy gradient theorem which shows that using this in place of the expected raw reward in the episodic setting,
            # means we are maximizing the average reward over all time of our policy, which is the desired objective in the continual setting.
            # yeah, this theory has assumptions like ergodicity, and also you can only prove optimality for tabular RL, or linear models, not NNs,
            # but these are problems with all of RL theory really.
            # anyway, `delta` is now the Bellman error measuring the difference between
            # {the previous estimation of the expected differential return (`previous_lp_value`)}
            # and {the new bootstrapped estimate `learning_progress.detach() - average_lp_estimate + lp_value.detach()`}
            delta = learning_progress.detach(
            ) - average_lp_estimate + lp_value.detach() - previous_lp_value
            # note that we detach the learning progress and lp_value, this is standard in Bellman errors I think
            # we don't backpropagate through the bootstrapped target!
            # also update `average_lp_estimate` which is the estimate of the average reward.
            average_lp_estimate = average_lp_estimate + alpha * delta.detach()

            if iteration > 0 and lp_training:  #only do this once we have a previous_lp_value
                # update the differential value function for goal policy
                loss_lp_value_fun = 0.5 * delta**2
                partial_backprop(loss_lp_value_fun, [net.goal_decoder])

                # update goal policy using policy gradient
                loss_goal_policy = -delta.detach() * log_prob_goal
                # we don't update the goal_decoder because that way we are just training the RNN to produce certain action vectors
                # once the autoencoder has trained well, each latent vector represents a goal 1-to-1
                # and the action decoder can learn to map to the actions corresponding to that goal
                # if we kept changing the goal_decoder, the action decoder might "get confused", as its actual goal would change even for a fixed input latent vector
                partial_backprop(loss_goal_policy, [])

            optimizer.step()

            previous_lp_value = lp_value

        observations = new_observations
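For reference, below is a minimal, self-contained sketch (not part of the project) of the two updates described in the comments of code example #1: the one-step actor-critic update that uses delta as the advantage, and the average-reward (differential) update used for the goal policy. All tensors here are hypothetical toy values.

import torch

# toy stand-ins for the quantities produced inside the training loop
goal_reward = torch.tensor(-0.8)                          # observed reward for the executed action
value = torch.tensor(-0.5, requires_grad=True)            # critic's predicted value for (state, goal)
log_prob_action = torch.tensor(-1.2, requires_grad=True)  # log-probability of the sampled action

# critic update: minimize 0.5 * delta^2, where delta = reward - value
delta = goal_reward - value
critic_loss = 0.5 * delta ** 2
critic_loss.backward(retain_graph=True)

# actor update: policy gradient with the detached delta as the advantage
actor_loss = -delta.detach() * log_prob_action
actor_loss.backward()

# goal policy (average-reward setting): learning progress |delta| is the reward,
# and the Bellman error uses differential values and a running average-reward estimate
alpha = 0.1
learning_progress = torch.abs(delta.detach())
lp_value = torch.tensor(0.2)           # differential value of the new state
previous_lp_value = torch.tensor(0.1)  # differential value of the previous state
average_lp_estimate = 0.0
lp_delta = learning_progress - average_lp_estimate + lp_value - previous_lp_value
average_lp_estimate = average_lp_estimate + alpha * lp_delta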
Code example #2
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)
    '''ENVIRONMENT'''
    import gym
    #env=gym.make("HandManipulatePen-v0")
    #env=gym.make("HandManipulateEgg-v0")
    env = gym.make("FetchSlide-v1")
    #goal_vars_slice = slice(54,61)
    goal_vars_slice = slice(3, 6)
    results = env.reset()
    #env.observation_space["observation"].shape[0]
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    #goal_dim = 7
    goal_dim = 3
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1 # hyperparameter used in average lp estimate for goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''

    from learning_progress_a2c import GOALRNN
    net = GOALRNN(batch_size,
                  number_layers,
                  observation_dim,
                  action_dim,
                  goal_dim,
                  goal_vars_slice,
                  n_hidden=256)

    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    #if rendering:
    #    setup_render(env)

    def g(epsilon, delta):
        if delta >= 0:
            return (1 + epsilon) * delta
        else:
            return (1 - epsilon) * delta

    rewards = []
    lps = []
    temp_buffer = []

    # a helper that backpropagates a loss without accumulating gradients in certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False
    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add a memory buffer)
    for iteration in range(1000000):
        if evaluating:  #if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            #feed observations to the net, get actions, noisy actions, noisy goals, and the goals' log-probabilities
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], noisy_goals[
                    0, 0, :]
            else:  # if we are not training the goal policy then ignore the goal policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()
        #print(action_parameters)
        #run action using DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
                if evaluating:
                    print(results[1])
                    rewards.append(results[1])
            if rendering:
                #render_with_target(env,goal.detach().numpy())
                env.render()
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("reseting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break
        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []

            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            #if not evaluating and lp_training:
            #    learning_progress = Variable(torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  #if not evaluating, then train

            hindsight_goal = new_observations[0, 0, goal_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(hindsight_goal.numpy(),
                                                    goal.detach().numpy(),
                                                    None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, goal_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)

            temp_buffer.append((observations, noisy_actions, hindsight_goals,
                                0, new_observations, log_prob_goals.detach()))
            '''TRAIN GOAL POLICY'''
            if iteration > 0 and (
                    iteration % 20
            ) == 0 and lp_training:  #only do this once we have a previous_lp_value
                # do several training iterations in a row so the goal policy updates quickly and can adapt to the agent's learning
                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    print(index)
                    observations, noisy_actions, hindsight_goals, _, new_observations, log_prob_goals_old = temp_buffer[
                        index]
                    optimizer.zero_grad()
                    # find the actions the policy predicts for hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    #hindsight_actions_original = hindsight_actions.detach().clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        noisy_actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                    new_actions = net.compute_actions(hindsight_goals,
                                                      observations)
                    action_difference = torch.norm(
                        new_actions -
                        hindsight_actions) / torch.norm(hindsight_actions)
                    learning_progress = action_difference
                    print("learning progress", learning_progress.data.item())
                    lps.append(learning_progress.data.item())
                    temp_buffer[index] = (observations, noisy_actions,
                                          hindsight_goals,
                                          learning_progress.detach(),
                                          new_observations, log_prob_goals_old)

                for i in range(20):
                    optimizer.zero_grad()
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, new_observations, _ = temp_buffer[
                        index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    #print(delta, learning_progress, lp_value, previous_lp_value)

                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()

                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, _, hindsight_goals, learning_progress, new_observations, log_prob_goals_old = temp_buffer[
                        index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    #loss_goal_policy = -delta.detach()*log_prob_goals[0,0]
                    #loss_goal_policy = -delta.detach()*torch.exp(log_prob_goals[0,0]-log_prob_goals_old)
                    loss_goal_policy = -torch.min(
                        delta.detach() *
                        torch.exp(log_prob_goals[0, 0] - log_prob_goals_old),
                        g(0.5, delta.detach()))
                    partial_backprop(loss_goal_policy)
                    optimizer.step()

                temp_buffer = []

            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
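The goal-policy update at the end of code example #2 is a PPO-style clipped surrogate: the (detached) advantage delta is weighted by the importance ratio exp(log_prob_new - log_prob_old) and clipped through the g(epsilon, delta) helper. A minimal sketch of just that objective, with hypothetical toy values:

import torch

def g(epsilon, delta):
    # clipping helper from the example: (1+eps)*delta if delta >= 0, else (1-eps)*delta
    if delta >= 0:
        return (1 + epsilon) * delta
    else:
        return (1 - epsilon) * delta

delta = torch.tensor(0.3)                              # advantage, detached in the example
log_prob_new = torch.tensor(-1.0, requires_grad=True)  # log prob of the goal under the current policy
log_prob_old = torch.tensor(-1.3)                      # log prob recorded when the goal was sampled

ratio = torch.exp(log_prob_new - log_prob_old)
# pessimistic (clipped) surrogate, negated to obtain a loss to minimize
loss_goal_policy = -torch.min(delta * ratio, g(0.5, delta))
loss_goal_policy.backward()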
Code example #3
File: train_goal_a2c.py  Project: MetaGenAI/vrai
def main(argv):
    FLAGS = flags.FLAGS
    print(FLAGS.flag_values_dict())
    FLAGS = FLAGS.flag_values_dict()
    globals().update(FLAGS)
    '''ENVIRONMENT'''
    import gym
    env = gym.make("HandManipulatePen-v0")
    results = env.reset()
    #env.observation_space["observation"].shape[0]
    observation_dim = env.observation_space["observation"].shape[0]
    n_actuators = env.action_space.shape[0]
    n_dmp_basis = 10
    action_dim = n_actuators * (n_dmp_basis + 1)
    #goal_dim = observation_dim
    goal_dim = 7
    batch_size = 1
    number_layers = 2
    gamma = 0.9
    #alpha = 0.1 # hyperparameter used in average lp estimate for goal policy

    #DMP
    env.relative_control = True
    from dmp import DMP
    n_simulation_steps = 25
    dmp = DMP(10, n_simulation_steps, n_actuators)

    #%%
    '''NET'''

    from learning_progress_ddpg_a2c import GOALRNN
    net = GOALRNN(batch_size,
                  number_layers,
                  observation_dim,
                  action_dim,
                  goal_dim,
                  n_hidden=256)

    if os.path.isfile("lprnn_weights" + experiment_name + ".pt"):
        print("LOADING WEIGHTS")
        net.load_state_dict(
            torch.load("lprnn_weights" + experiment_name + ".pt"))

    #print(net.goal_decoder.state_dict().items())
    net.goal_decoder.apply(weight_reset)
    #print(net.goal_decoder.state_dict().items())

    # optimizer and losses
    from torch import optim
    #optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
    #optimizer = optim.SGD(net.parameters(), lr=1e-6)
    #optimizer2 = optim.SGD(net.parameters(), lr=1e1)
    optimizer = optim.Adam(net.parameters())
    #optimizer = optim.RMSprop(net.parameters())

    # initial values of several variables
    learning_progress = torch.zeros(1)

    #initial run
    action = env.action_space.sample()
    results = env.step(action)
    observation = results[0]["observation"]
    observations = np.expand_dims(np.expand_dims(observation, 0), 0)
    observations = torch.Tensor(observations)

    if evaluating:
        net.eval()
    if evaluating or not lp_training:
        pen_goal = results[0]["desired_goal"]
        goal = torch.Tensor(pen_goal)
    if rendering:
        setup_render(env)

    rewards = []
    lps = []
    temp_buffer = []

    # a helper that backpropagates a loss without accumulating gradients in certain modules of the network
    def partial_backprop(loss, parts_to_ignore=[]):
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = False
        loss.backward(retain_graph=True)
        for part in parts_to_ignore:
            for parameter in part.parameters():
                parameter.requires_grad = True

    print(observations.shape)
    reset_env = False
    '''TRAINING LOOP'''
    # using DDPG on-policy (without a memory buffer for now; TODO: add a memory buffer)
    for iteration in range(1000000):
        if evaluating:  #if evaluating we just use the action prediction part of the network
            action_parameters = net.compute_actions(
                goal.unsqueeze(0).unsqueeze(0), observations)
            action_parameters = action_parameters[0, 0, :]
        else:
            #feed observations to the net, get actions, noisy actions, noisy goals, and the goals' log-probabilities
            actions, noisy_actions, noisy_goals, log_prob_goals = net(
                observations)
            #goals = Variable(goals.data, requires_grad=True)
            if lp_training:
                action_parameters, goal = noisy_actions[0, 0, :], noisy_goals[
                    0, 0, :]
            else:  # if we are not training the goal policy then ignore the goal policy variables; we'll use the goal provided by OpenAI Gym
                action_parameters = noisy_actions[0, 0, :]

        action_parameters = action_parameters.detach().numpy()
        #print(action_parameters)
        #run action using DMP
        for i in range(n_simulation_steps):
            action = dmp.action_rollout(None, action_parameters, i)
            results = env.step(action)
            if evaluating or not lp_training:
                pen_goal = results[0]["desired_goal"]
                goal = torch.Tensor(pen_goal)
                if evaluating:
                    print(results[1])
                    rewards.append(results[1])
            if rendering:
                render_with_target(env, goal.detach().numpy())
            obs = results[0]["observation"]
            done = results[2]
            if done:
                print("reseting environment")
                results = env.reset()
                obs = results["observation"]
                reset_env = True
                break
        new_observations = np.expand_dims(np.expand_dims(obs, 0), 0)
        new_observations = torch.Tensor(new_observations)

        if not evaluating:
            # saving rewards, learning progresses, etc
            if iteration % save_freq == save_freq - 1:
                print("Saving stuff")
                torch.save(net.state_dict(),
                           "lprnn_weights" + experiment_name + ".pt")
                with open("rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                    f.write("\n")
                rewards = []
                with open("learning_progresses" + experiment_name + ".txt",
                          "a") as f:
                    f.write("\n".join([str(lp) for lp in lps]))
                    f.write("\n")
                lps = []

            #if save_goals:
            #    if iteration == 0:
            #        goals = np.expand_dims(goal,0)
            #    else:
            #        goals = np.concatenate([noisy_goals,np.expand_dims(goal,0)],axis=0)
        else:
            if iteration % save_freq == save_freq - 1:
                with open("test_rewards" + experiment_name + ".txt", "a") as f:
                    f.write("\n".join([str(r) for r in rewards]))
                rewards = []

        if reset_env:
            observations = new_observations
            if not evaluating and lp_training:
                learning_progress = Variable(
                    torch.zeros_like(learning_progress))
            reset_env = False
            continue

        if not evaluating:  #if not evaluating, then train

            pen_vars_slice = slice(54, 61)
            hindsight_goal = new_observations[0, 0, pen_vars_slice]
            hindsight_goals = hindsight_goal.unsqueeze(0).unsqueeze(0)
            sparse_goal_reward = env.compute_reward(hindsight_goal.numpy(),
                                                    goal.detach().numpy(),
                                                    None)
            goal_reward = torch.clamp(
                -torch.norm(hindsight_goal - goal.detach()), -1, 1)
            print(new_observations[0, 0, pen_vars_slice], goal)
            print("goal_reward", goal_reward)
            print("sparse_goal_reward", sparse_goal_reward)
            rewards.append(sparse_goal_reward)

            total_delta = 0

            ## update q value network on desired goal
            #optimizer.zero_grad()
            #value = net.compute_q_value(noisy_goals.detach(), observations, noisy_actions)
            #print("q-value", value.data.item())
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            ### update q value network on hindsight goal
            ### by definition we have achieved the hindsight goal, so reward is 0.0 (achieved goal)
            #goal_reward = 0.0
            #optimizer.zero_grad()
            #value = net.compute_q_value(hindsight_goals.detach(), observations, noisy_actions)
            #delta = goal_reward - value
            #total_delta += delta
            #reward_value_fun = 0.5*delta**2
            #partial_backprop(reward_value_fun, [net.goal_decoder, net.action_decoder])
            #optimizer.step()

            # update policy to achieve actions with higher q value
            # this is good to make sure we learn about actions for goals which are achievable
            # TODO: alternative to try: because in this environment several actions leading to the same outcome is probably not very likely, we can also train the action decoder directly on the hindsight goal
            # without any problem for intrinsic motivation, I think
            if np.random.rand() <= 1.0:
                #print(net.action_decoder.state_dict().items())
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach(), observations)
                    if i == 0:
                        hindsight_actions_original = hindsight_actions.detach(
                        ).clone()
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(action_reconstruction_loss)
                    optimizer.step()
                for i in range(1):
                    optimizer.zero_grad()
                    # find the actions the policy predicts for hindsight goal
                    hindsight_actions = net.compute_actions(
                        hindsight_goals.detach() +
                        0.1 * torch.rand_like(hindsight_goals), observations)
                    action_reconstruction_loss = 0.5 * torch.norm(
                        actions.detach() - hindsight_actions)**2
                    partial_backprop(-action_reconstruction_loss)
                    optimizer.step()
                new_actions = net.compute_actions(hindsight_goals,
                                                  observations)
                action_difference = torch.norm(
                    new_actions - hindsight_actions_original) / torch.norm(
                        hindsight_actions_original)
            else:
                actions = net.compute_actions(noisy_goals, observations)
                optimizer.zero_grad()
                loss_policy = -net.compute_q_value(noisy_goals.detach(),
                                                   observations, actions)
                partial_backprop(loss_policy, [net.q_value_decoder])
                optimizer.step()
                new_actions = net.compute_actions(noisy_goals, observations)
                action_difference = torch.norm(new_actions -
                                               actions) / torch.norm(actions)
            '''COMPUTE LEARNING PROGRESS'''

            print("action difference", action_difference)
            #learning_progress = torch.abs(delta) + action_difference
            #learning_progress = torch.max(total_delta,action_difference)
            #learning_progress = 0.1*torch.abs(total_delta)+2*action_difference
            learning_progress = action_difference
            #learning_progress = action_difference + 0.001*goal_reward
            learning_progress *= 10
            #learning_progress = goal_reward
            print("learning progress", learning_progress.data.item())
            lps.append(learning_progress.data.item())

            temp_buffer.append((observations, hindsight_goals,
                                learning_progress.detach(), new_observations))
            '''TRAIN GOAL POLICY'''
            if iteration % 20 == 19 and lp_training:  #only do this once we have a previous_lp_value
                # do several training iterations in a row so the goal policy updates quickly and can adapt to the agent's learning
                for i in range(20):
                    optimizer.zero_grad()
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, hindsight_goals, learning_progress, new_observations = temp_buffer[
                        index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    #delta = learning_progress.detach()
                    #print(delta, learning_progress, lp_value, previous_lp_value)

                    loss_lp_value_fun = 0.5 * delta**2
                    partial_backprop(loss_lp_value_fun, [net.goal_decoder])
                    optimizer.step()

                for i in range(20):
                    index = np.random.choice(range(len(temp_buffer)))
                    observations, hindsight_goals, learning_progress, new_observations = temp_buffer[
                        index]
                    log_prob_goals = net.compute_log_prob_goals(
                        observations, hindsight_goals)
                    previous_lp_value = net.compute_vlp(observations)
                    delta = learning_progress + gamma * net.compute_vlp(
                        new_observations).detach() - previous_lp_value
                    optimizer.zero_grad()
                    loss_goal_policy = -delta.detach() * log_prob_goals[0, 0]
                    partial_backprop(loss_goal_policy)
                    optimizer.step()

                temp_buffer = []

            #print("lp value", previous_lp_value.data.item())

        observations = new_observations
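Code example #3 estimates learning progress as the relative change in the policy's output for a hindsight goal, before and after one reconstruction step toward the actions that were actually executed. A stripped-down sketch of that measurement, using a plain nn.Linear as a hypothetical stand-in for the project's net.compute_actions (the dimensions are toy values):

import torch
from torch import nn, optim

goal_dim, obs_dim, action_dim = 7, 61, 220
policy = nn.Linear(goal_dim + obs_dim, action_dim)  # stand-in for net.compute_actions
optimizer = optim.Adam(policy.parameters())

observations = torch.randn(1, obs_dim)
hindsight_goal = torch.randn(1, goal_dim)      # the outcome that was actually reached
executed_actions = torch.randn(1, action_dim)  # the actions that produced that outcome

inp = torch.cat([hindsight_goal, observations], dim=1)
old_actions = policy(inp).detach().clone()

# one hindsight reconstruction step: pull the prediction toward the executed actions
optimizer.zero_grad()
loss = 0.5 * torch.norm(executed_actions - policy(inp)) ** 2
loss.backward()
optimizer.step()

# learning progress = relative change of the prediction caused by that update
new_actions = policy(inp).detach()
learning_progress = torch.norm(new_actions - old_actions) / torch.norm(old_actions)
print("learning progress", learning_progress.item())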
Code example #4
rewards = []
lps = []

for iteration in range(1000000):

    if not evaluating:
        action, log_prob_action, goal, log_prob_goal, value, lp_value = net(observations, learning_progress.detach().unsqueeze(0).unsqueeze(0))
        goal = Variable(goal.data, requires_grad=True)
        action_parameters, log_prob_action, goal, log_prob_goal, value, lp_value = action[0,0,:], log_prob_action[0,0], goal[0,0,:], log_prob_goal[0,0], value[0,0,:], lp_value[0,0,:]
    else:
        action_parameters = net.predict_action(goal)
        print(action_parameters.shape)
    action_parameters = action_parameters.detach().numpy()
    for i in range(n_simulation_steps):
        # print(context.shape)
        action = dmp.action_rollout(None,action_parameters,i)
        results = env.step(action)
        if evaluating:
            pen_goal = results["desired_goal"]
            goal = np.tile(np.expand_dims(pen_goal,0),(n_steps,1))
            goal = np.reshape(goal.T,(-1))
        if rendering:
            env.render()
        obs = results[0]["observation"]
        # print(obs)
        done = results[2]
        if done:
            print("reseting environment")
            results = env.reset()
            obs = results["observation"]
            reset_env = True
            break
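All of the examples above rely on the partial_backprop helper to backpropagate a loss while leaving some submodules untouched. A small self-contained illustration of the same pattern with a hypothetical two-module model (the encoder/decoder here are illustrative, not the project's network):

import torch
from torch import nn

def partial_backprop(loss, parts_to_ignore=[]):
    # temporarily mark the listed modules' parameters as not requiring grad,
    # so backward() does not accumulate gradients into them
    for part in parts_to_ignore:
        for parameter in part.parameters():
            parameter.requires_grad = False
    loss.backward(retain_graph=True)
    for part in parts_to_ignore:
        for parameter in part.parameters():
            parameter.requires_grad = True

encoder = nn.Linear(4, 8)
decoder = nn.Linear(8, 2)

x = torch.randn(1, 4)
loss = decoder(encoder(x)).pow(2).sum()

partial_backprop(loss, parts_to_ignore=[decoder])
print(encoder.weight.grad is not None)  # True: the encoder received gradients
print(decoder.weight.grad)              # None: the decoder was skipped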