Example #1
    def train(self, world_model_path):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            losses = []
            all_rewards = []
            save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='env_model')
            saver = tf.train.Saver(var_list=save_vars)

            train_writer = tf.summary.FileWriter('./env_logs/train/',
                                                 graph=sess.graph)
            summary_op = tf.summary.merge_all()

            if self.n_envs == 1:
                envs = make_env()()
            else:
                envs = [make_env() for _ in range(self.n_envs)]
                envs = SubprocVecEnv(envs)

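            # generate_data yields (idx, states, actions, rewards, next_states, dones) batches from the env(s)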
            for idx, states, actions, rewards, next_states, dones in tqdm(
                    self.generate_data(envs), total=self.max_ep_len):
                actions = np.array(actions)
                actions = np.reshape(actions, (-1, 1))

                if self.has_rewards:
                    target_reward = reward_to_target(rewards)
                    loss, reward_loss, state_loss, summary, _ = sess.run(
                        [
                            self.loss, self.reward_loss, self.state_loss,
                            summary_op, self.opt
                        ],
                        feed_dict={
                            self.states_ph: states,
                            self.actions_ph: actions,
                            self.target_states: next_states,
                            self.target_rewards: target_reward
                        })
                else:
                    loss, summary, _ = sess.run(
                        [self.loss, summary_op, self.opt],
                        feed_dict={
                            self.states_ph: states,
                            self.actions_ph: actions,
                            self.target_states: next_states,
                        })

                if idx % self.log_interval == 0:
                    if self.has_rewards:
                        print(
                            '%i => Loss : %.4f, Reward Loss : %.4f, Image Loss : %.4f'
                            % (idx, loss, reward_loss, state_loss))
                    else:
                        print('%i => Loss : %.4f' % (idx, loss))
                    saver.save(sess,
                               '{}/env_model.ckpt'.format(world_model_path))
                    print('Environment model saved')

                train_writer.add_summary(summary, idx)
            envs.close()
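
For context, the reward_to_target call above turns raw environment rewards into discrete class indices for the env model's reward head. A minimal sketch of such a helper, assuming the environment emits rewards from a small fixed set (like the sokoban_rewards list imported in the last snippet below); the signature and reward_values parameter are illustrative, not this example's actual code:

import numpy as np

def reward_to_target(rewards, reward_values):
    # reward_values: ordered list of the discrete rewards the environment can emit
    # each raw reward becomes the index of its closest entry, usable as a class label
    return [int(np.argmin([abs(r - v) for v in reward_values])) for r in rewards]
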
Example #2
experiment_rewards = np.zeros((len(seeds), args.max_steps // args.n_steps))

#run the experiment once per seed
for seed in seeds:

    #seed numpy and torch
    np.random.seed(seed)
    torch.manual_seed(seed)

    #create environments for training
    #ensure each training env gets a distinct seed derived from the base seed
    train_envs = [
        make_env(args.env, (seed * args.n_envs) + i)
        for i in range(args.n_envs)
    ]
    train_envs = SubprocVecEnv(train_envs)

    #the environment we evaluate on gets a seed distinct from every environment we train on
    test_env = gym.make(args.env)
    test_env.seed(seed + args.n_envs)

    #create actor and critic
    actor = MLP(input_dim, args.hidden_dim, output_dim, args.n_layers,
                args.activation, args.dropout)
    critic = MLP(input_dim, args.hidden_dim, 1, args.n_layers, args.activation,
                 args.dropout)

    #initialize weights for actor and critic
    actor.apply(init_weights)
    critic.apply(init_weights)
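
init_weights is not shown in this example; apply() simply calls it on every submodule of the two MLPs. A minimal sketch of such an initializer, assuming Xavier-initialized Linear layers (an assumption, not this example's actual code):

import torch.nn as nn

def init_weights(m):
    # Module.apply() invokes this on every submodule; only Linear layers are touched
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
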
# TODO - REMOVE ACTOR CRITIC WHERE NOT REQUIRED
import sys

import numpy as np
import tensorflow as tf

from a2c import get_actor_critic, CnnPolicy
from env_model import make_env
from utils import SubprocVecEnv

N_ENVS = 1
N_STEPS = 5
END_REWARD = 49
MAX_TREE_STEPS = 7
NUM_ROLLOUTS = 10  # Hyperparameter of how far ahead in the future the agent "imagines"
DEBUG = False

A2C_MODEL_PATH = 'weights/a2c_3600.ckpt'
ENV_MODEL_PATH = 'weights/env_model.ckpt'

np.set_printoptions(threshold=sys.maxsize)

envs = [make_env() for i in range(N_ENVS)]
envs = SubprocVecEnv(envs)
ob_space = envs.observation_space.shape
ac_space = envs.action_space
nc, nw, nh = ob_space

g_actor_critic = None


def get_cache_loaded_a2c(sess, nenvs, nsteps, ob_space, ac_space):
    global g_actor_critic
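    # lazily construct the pretrained A2C on the first call; later calls reuse the cached instance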
    if g_actor_critic is None:
        with tf.variable_scope('actor'):
            # CnnPolicy and summarize=False are assumed here, matching the
            # positional get_actor_critic(sess, nenvs, nsteps, ob_space,
            # ac_space, policy, summarize) call used in train() below
            g_actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space,
                                              ac_space, CnnPolicy, False)
        g_actor_critic.load(A2C_MODEL_PATH)

    return g_actor_critic


def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nc, nw, nh = ob_space
    ac_space = envs.action_space

    obs = envs.reset()
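    # observations arrive as (N_ENVS, 1, w, h); drop the channel axis and re-add it
    # last so the policy network receives NHWC input of shape (N_ENVS, w, h, 1)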
    ob_np = np.copy(obs)
    ob_np = np.squeeze(ob_np, axis=1)
    ob_np = np.expand_dims(ob_np, axis=3)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
            ac_space, policy, summarize)
    if load_path is not None:
        actor_critic.load(load_path)
        print('Loaded a2c')

    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_path, graph=sess.graph)

    sess.run(tf.global_variables_initializer())

    batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

    dones = [False for _ in range(N_ENVS)]
    nbatch = N_ENVS * N_STEPS

    episode_rewards = np.zeros((N_ENVS, ))
    final_rewards   = np.zeros((N_ENVS, ))

    for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
        # mb stands for mini batch
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        for n in range(N_STEPS):
            
            ob_np = np.copy(obs)
            ob_np = np.squeeze(ob_np, axis=1)
            ob_np = np.expand_dims(ob_np, axis=3)

            actions, values, _ = actor_critic.act(ob_np)

            mb_obs.append(ob_np)
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(dones)

            obs, rewards, dones, _ = envs.step(actions)

            episode_rewards += rewards
            masks = 1 - np.array(dones)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            mb_rewards.append(rewards)

        mb_dones.append(dones)

        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
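        # mb_masks keeps the done flag *entering* each step, mb_dones the flag *after* it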
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        #bootstrap from the value of the newest observation (after the final step)
        ob_np = np.copy(obs)
        ob_np = np.squeeze(ob_np, axis=1)
        ob_np = np.expand_dims(ob_np, axis=3)
        last_values = actor_critic.critique(ob_np).tolist()

        #discount/bootstrap off value fn
        for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            d = d.tolist()
            if d[-1] == 0:
                rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
            else:
                rewards = discount_with_dones(rewards, d, GAMMA)
            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if summarize:
            loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                    mb_rewards, mb_masks, mb_actions, mb_values, update,
                    summary_op)
            writer.add_summary(summary, update)
        else:
            loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                    mb_rewards, mb_masks, mb_actions, mb_values, update)

        if update % LOG_INTERVAL == 0 or update == 1:
            print('%i => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f, Final Reward : %.4f' % (update, policy_loss, value_loss, policy_entropy, final_rewards.mean()))

        if update % SAVE_INTERVAL == 0:
            print('Saving model')
            actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

    actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
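
discount_with_dones is imported from elsewhere in the project; it appears to follow the usual OpenAI-baselines convention of computing discounted returns that reset at episode boundaries. A sketch under that assumption:

def discount_with_dones(rewards, dones, gamma):
    # walk the rollout backwards; a done flag zeroes the running return so
    # credit never leaks across episode boundaries
    discounted = []
    running = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]
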
import os
import time
import curses
import numpy as np
import tensorflow as tf
from env_model import make_env, create_env_model
from utils import SubprocVecEnv
from discretize_env import pix_to_target, rewards_to_target, _NUM_PIXELS, sokoban_rewards
from a2c import get_actor_critic, CnnPolicy
from imagine import convert_target_to_real
from safe_grid_gym.envs.gridworlds_env import GridworldEnv

nenvs = 16
nsteps = 5
envs = [make_env() for i in range(nenvs)]
envs = SubprocVecEnv(envs)

ob_space = envs.observation_space.shape
ac_space = envs.action_space
num_actions = envs.action_space.n

env = GridworldEnv("side_effects_sokoban")

done = False
states = env.reset()
num_actions = ac_space.n
nc, nw, nh = ob_space
print('Observation space ', ob_space)
print('Number of actions ', num_actions)
steps = 0
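
The snippet ends just before the interaction loop; a typical continuation steps the Gridworld until the episode finishes (the random placeholder action below is an illustrative assumption, not this example's code):

while not done:
    action = env.action_space.sample()  # placeholder; the real code would query the agent
    states, reward, done, _ = env.step(action)
    steps += 1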