Example #1
def load_checkpoint(checkpoint_path, rb_path, policy, args):
    fpath = os.path.join(checkpoint_path, 'model.pyth')
    checkpoint = torch.load(fpath, map_location='cpu')
    # change to default graph before loading
    policy.change_morphology([-1])
    # load and return checkpoint
    policy.actor.load_state_dict(checkpoint['actor_state'])
    policy.critic.load_state_dict(checkpoint['critic_state'])
    policy.actor_target.load_state_dict(checkpoint['actor_target_state'])
    policy.critic_target.load_state_dict(checkpoint['critic_target_state'])
    policy.actor_optimizer.load_state_dict(checkpoint['actor_optimizer_state'])
    policy.critic_optimizer.load_state_dict(
        checkpoint['critic_optimizer_state'])
    # load replay buffer
    all_rb_files = [f[:-4] for f in os.listdir(rb_path) if '.npy' in f]
    all_rb_files.sort()
    replay_buffer_new = dict()
    for name in all_rb_files:
        if len(all_rb_files) > args.rb_max // 1e6:
            replay_buffer_new[name] = utils.ReplayBuffer(
                max_size=args.rb_max // len(all_rb_files))
        else:
            replay_buffer_new[name] = utils.ReplayBuffer()
        replay_buffer_new[name].max_size = int(checkpoint['rb_max'][name])
        replay_buffer_new[name].ptr = int(checkpoint['rb_ptr'][name])
        replay_buffer_new[name].slicing_size = checkpoint['rb_slicing_size'][
            name]
        replay_buffer_new[name].storage = list(
            np.load(os.path.join(rb_path, '{}.npy'.format(name))))

    return checkpoint['total_timesteps'], \
            checkpoint['episode_num'], \
            replay_buffer_new, \
            checkpoint['num_samples'], \
            fpath
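
The loader above only touches a few attributes of utils.ReplayBuffer (max_size, ptr, slicing_size, storage). As a point of reference, a minimal list-backed buffer with that surface, inferred from these call sites rather than copied from the actual utils module, could look like this:

import numpy as np


class ReplayBuffer:
    """Illustrative sketch only: models just the attributes the checkpoint
    loader manipulates (max_size, ptr, slicing_size, storage)."""

    def __init__(self, max_size=1e6):
        self.max_size = int(max_size)
        self.ptr = 0
        self.slicing_size = None
        self.storage = []

    def add(self, data):
        # Overwrite the oldest transition once the buffer is full.
        if len(self.storage) == self.max_size:
            self.storage[self.ptr] = data
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(data)

    def sample(self, batch_size):
        idx = np.random.randint(0, len(self.storage), size=batch_size)
        return [self.storage[i] for i in idx]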
Example #2
    def __init__(self, params):

        self.state_size = params['state_size']
        self.action_size = params['action_size']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.nb_agents = params['nb_agents']
        self.learning_rate_Q = params['learning_rate_Q']
        self.learning_rate_mu = params['learning_rate_mu']
        self.memory = utils.ReplayBuffer(self.buffer_size, self.batch_size)
        self.device = params['device']
        self.tau = params['tau']
        self.gamma = params['gamma']
        
        self.Q = network.Q_estimator(
                                    self.state_size*self.nb_agents,
                                    self.action_size*self.nb_agents
                                    ).to(self.device)
        self.Q_hat = network.Q_estimator(
                                    self.state_size*self.nb_agents,
                                    self.action_size*self.nb_agents
                                    ).to(self.device)
        self.Q_hat.load_state_dict(self.Q.state_dict())
        self.optim_Q = torch.optim.Adam(self.Q.parameters(), lr=self.learning_rate_Q)

        self.mu = network.mu_estimator(self.state_size, self.action_size).to(self.device)
        self.mu_hat = network.mu_estimator(self.state_size, self.action_size).to(self.device)
        self.mu_hat.load_state_dict(self.mu.state_dict())
        self.optim_mu = torch.optim.Adam(self.mu.parameters(), lr=self.learning_rate_mu)
Example #3
def test_procedure(shared_actor, env):
    num_actions = env.action_space.n
    local_actor = nets.Actor(num_actions=num_actions)
    # load parameters from shared models
    begin_time = time.time()
    while True:
        replay_buffer = utils.ReplayBuffer(size=4, frame_history_len=4)
        local_actor.load_state_dict(shared_actor.state_dict())
        obs = env.reset()
        rewards = []
        while True:
            replay_buffer.store_frame(obs)
            states = replay_buffer.encode_recent_observation()

            states = np.expand_dims(states, axis=0) / 255.0 - .5
            logits = local_actor(
                Variable(torch.FloatTensor(states.astype(np.float32))))
            action = utils.epsilon_greedy(logits,
                                          num_actions=env.action_space.n,
                                          epsilon=-1.)
            obs, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                print("Time:{}, computer:{}, agent:{}".format(
                    time.time() - begin_time, sum(np.array(rewards) == -1),
                    sum(np.array(rewards) == 1)))
                break
Example #4
    def __init__(self, headless=1):
        self.gamma = 0.99
        self.batch_size = 128
        self.critic_learning_rate = 0.005
        self.actor_learning_rate = 0.005

        self.tau = 0.001  # copy rate between target net and real net
        ## ENVIRONMENT
        self.env = utils.Denv(headless=headless,
                              location="donkey-generated-roads-v0")

        ## POLICY
        self.critic = Critic(obs_dim=1, action_dim=2)
        self.critic_target = Critic(obs_dim=1, action_dim=2)

        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.actor = Actor()
        self.actor_target = Actor()
        # OPTIMIZERS
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.critic_learning_rate)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.actor_learning_rate)

        ## MEMORY
        self.memory = utils.ReplayBuffer(capacity=5000, seed=0)
Example #5
def main(args):
    r = redis.Redis(host='10.10.1.2', port=6379, db=0)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_obs_dim, model_output_dim = 4, 6
    # model_obs_dim, model_output_dim = np.size(utils.HL_obs(state)), np.size(utils.HL_delta_obs(state, state))
    utils.make_dir(args.save_dir)
    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, args.z_dim, model_output_dim, device,
        args.num_iters * args.num_latent_action_per_iteration)
    HL_replay_buffer.load_buffer('./save_data/trial_4')
    # HL_replay_buffer.idx = 1415
    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=args.z_dim,
        model_output_dim=model_output_dim,
        model_hidden_num=args.model_hidden_num,
        batch_size=args.batch_size,
        model_lr=args.model_lr,
        high_level_policy_type=args.high_level_policy_type,
        update_sample_policy=args.update_sample_policy,
        update_sample_policy_lr=args.update_sample_policy_lr,
        low_level_policy_type=args.low_level_policy_type,
        num_timestep_per_footstep=args.num_timestep_per_footstep,
        model_update_steps=args.model_update_steps,
        control_frequency=args.control_frequency)
    # collect_data_client(args, r, high_level_planning , HL_replay_buffer)

    train_model(args, HL_replay_buffer, high_level_planning)
Example #6
def train_BCQ(state_dim, action_dim, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"

    # Initialize policy
    policy = BCQ.BCQ(state_dim, action_dim, max_action, device, args.discount,
                     args.tau, args.lmbda, args.phi)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}", args.load_buffer_size)

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < args.max_timesteps:
        pol_vals = policy.train(replay_buffer,
                                iterations=int(args.eval_freq),
                                batch_size=args.batch_size)

        evaluations.append(eval_policy(policy, args.env, args.seed))
        np.save(
            f"./results/BCQ_N{args.load_buffer_size}_phi{args.phi}_{buffer_name}",
            evaluations)

        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
Example #7
def make_buffer(hdf5_path):
    """
    Add transition tuples from batch file to replay buffer.
    """
    rb = utils.ReplayBuffer()

    f = h5py.File(hdf5_path, "r")
    demos = list(f["data"].keys())
    total_transitions = f["data"].attrs["total"]
    print("Loading {} transitions from {}...".format(total_transitions,
                                                     hdf5_path))
    env_name = f["data"].attrs["env"]

    for i in range(len(demos)):
        ep = demos[i]
        obs = f["data/{}/obs".format(ep)][()]
        actions = f["data/{}/actions".format(ep)][()]
        rewards = f["data/{}/rewards".format(ep)][()]
        next_obs = f["data/{}/next_obs".format(ep)][()]
        dones = f["data/{}/dones".format(ep)][()]

        ### important: this is action clipping! ###
        actions = np.clip(actions, -1., 1.)

        zipped = zip(obs, actions, rewards, next_obs, dones)
        for item in zipped:
            ob, ac, rew, next_ob, done = item
            # Expects tuples of (state, next_state, action, reward, done)
            rb.add((ob, next_ob, ac, rew, done))
    f.close()

    return rb, env_name
Example #8
    def __init__(self, agent_dict={}, actor_dict={}, critic_dict={}):
        """ Initialize Agent object

        Params
        ======
            agent_dict(dict): dictionary containing parameters for agent
            actor_dict(dict): dictionary containing parameters for agents actor-model
            critic_dict(dict): dictionary containing parameters for agents critic-model
        """
        enable_cuda = agent_dict.get("enable_cuda", False)
        if enable_cuda:
            self.device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")

        self.num_agents = agent_dict.get("num_agents", 20)

        self.num_episodes = agent_dict.get("num_episodes", 10000)
        self.save_after = agent_dict.get("save_after", -1)
        self.name = agent_dict.get("name", "reacher")

        self.gamma = agent_dict.get("gamma", 0.9)

        self.tau = agent_dict.get("tau", 0.001)

        self.noise = utils.OUNoise((self.num_agents, 4), 0)

        self.num_replays = agent_dict.get("num_replays", 1)

        self.learning_rate_actor = agent_dict.get("learning_rate_actor", 1E-3)
        self.learning_rate_critic = agent_dict.get("learning_rate_critic",
                                                   1E-3)

        self.criterion = nn.MSELoss()

        memory_size = agent_dict.get("memory_size", 2**14)
        batchsize = agent_dict.get("batchsize", 2**10)
        replay_reg = agent_dict.get("replay_reg", 0.0)

        self.replay_buffer = utils.ReplayBuffer(memory_size, batchsize)

        self.actor = model.ActorModel(actor_dict).to(self.device)
        self.actor_target = model.ActorModel(actor_dict).to(self.device)

        self.critic = model.CriticModel(critic_dict).to(self.device)
        self.critic_target = model.CriticModel(critic_dict).to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.learning_rate_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=self.learning_rate_critic)

        utils.copy_model(self.actor, self.actor_target, tau=1.0)
        utils.copy_model(self.critic, self.critic_target, tau=1.0)

        seed = agent_dict.get("seed", 0)

        torch.manual_seed(seed)
        np.random.seed(seed)
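
Target initialization here goes through utils.copy_model(source, target, tau=1.0). A plausible Polyak-style helper matching that call signature, inferred from the call sites rather than taken from the project, is:

def copy_model(source, target, tau=1.0):
    # Hypothetical helper: Polyak-average source parameters into target;
    # tau=1.0 reproduces the hard copy used for target initialization above.
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)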
Example #9
def train(sess, env, args, actor, critic, action_bound):

    actor_loss = -tf.reduce_mean(critic.total_out)
    actor_train_step = tf.train.AdamOptimizer(args['actor_lr']).minimize(
        actor_loss, var_list=actor.network_params)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    while total_timesteps < args['max_timesteps']:

        if total_timesteps != 0 and total_timesteps % args[
                'save_timesteps'] == 0:
            saver.save(sess, os.path.join(args['save_dir'], args['env']))

        if done:

            if total_timesteps != 0:
                print("total - ", total_timesteps, "episode num ", episode_num,
                      "episode reward ", episode_reward)
                training(sess, actor, critic, actor_train_step, action_bound,
                         replay_buffer, args, episode_timesteps)

            if total_timesteps != 0 and episode_num % args[
                    'eval_episodes'] == 0:
                print('starting evaluation')
                eval(env, actor)

            s = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        if total_timesteps < args['start_timesteps']:
            action = env.action_space.sample()
        else:
            action = actor.predict(np.reshape(s, (1, actor.s_dim)))
            action = (action + np.random.normal(
                0, args['expl_noise'], size=action.shape)).clip(
                    env.action_space.low, env.action_space.high)
            action = np.reshape(action, [-1])
            # print('new shape is ', action.shape)
        s2, r, done, info = env.step(action)
        episode_reward += r
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(
            done)
        replay_buffer.add((s, s2, action, r, done_bool))
        s = s2
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
Example #10
def main(cfg):
    # define env & high level planning part & low level trajectory generator & replay buffer for HLP
    # initialize logger
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = daisy_API(sim=cfg.sim, render=False, logger=False)
    env.set_control_mode(cfg.control_mode)
    state = env.reset()
    com_utils = utils.CoM_frame_MPC()

    if cfg.sim:
        init_state = motion_library.exp_standing(env)

    model_obs_dim, model_output_dim = np.size(
        com_utils.HL_obs(state)), np.size(com_utils.HL_delta_obs(state, state))

    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, cfg.z_dim, model_output_dim, device,
        cfg.num_iters * cfg.num_latent_action_per_iteration)

    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=cfg.z_dim,
        model_output_dim=model_output_dim,
        model_hidden_num=cfg.model_hidden_num,
        model_layer_num=cfg.model_layer_num,
        batch_size=cfg.batch_size,
        model_lr=cfg.model_lr,
        high_level_policy_type=cfg.high_level_policy_type,
        update_sample_policy=cfg.update_sample_policy,
        update_sample_policy_lr=cfg.update_sample_policy_lr,
        low_level_policy_type=cfg.low_level_policy_type,
        num_timestep_per_footstep=cfg.num_timestep_per_footstep,
        model_update_steps=cfg.model_update_steps,
        control_frequency=cfg.control_frequency)

    low_level_TG = LLTG.low_level_TG(
        device=device,
        z_dim=cfg.z_dim,
        a_dim=cfg.a_dim,
        num_timestep_per_footstep=cfg.num_timestep_per_footstep,
        batch_size=cfg.batch_size,
        low_level_policy_type=cfg.low_level_policy_type,
        update_low_level_policy=cfg.update_low_level_policy,
        update_low_level_policy_lr=cfg.update_low_level_policy_lr,
        init_state=init_state,
    )

    if cfg.low_level_policy_type == 'NN':
        low_level_TG.load_model('.')

    # # # collect data
    collect_data(cfg, env, high_level_planning, low_level_TG, HL_replay_buffer,
                 com_utils)

    # train model
    train_model(cfg, HL_replay_buffer, high_level_planning)
Example #11
def create_replay_buffer(args: argparse.Namespace, env: utils.FrameStack,
                         device: torch.device) -> utils.ReplayBuffer:
    """"Method to create a replay buffer"""
    return utils.ReplayBuffer(
        obs_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )
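
For reference, the factory can be exercised directly from parsed arguments; the call below is a hypothetical smoke test (the gym environment and attribute values are placeholders, and the env is not wrapped in utils.FrameStack as the type hint suggests).

import argparse

import gym
import torch

args = argparse.Namespace(replay_buffer_capacity=100_000, batch_size=128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("Pendulum-v1")  # placeholder env with Box observation/action spaces
buffer = create_replay_buffer(args, env, device)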
Example #12
def main(cfg):
    print(cfg.pretty())

    # define env & high level planning part & low level trajectory generator & replay buffer for HLP
    # initialize logger
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = daisy_API(sim=cfg.sim, render=False, logger=False)
    env.set_control_mode(cfg.control_mode)
    state = env.reset()

    if cfg.sim:
        init_state = motion_library.exp_standing(env)

    model_obs_dim, model_output_dim = 2, 5

    HL_replay_buffer = utils.ReplayBuffer(
        model_obs_dim, cfg.z_dim, model_output_dim, device,
        cfg.num_iters * cfg.num_latent_action_per_iteration)

    high_level_planning = HLPM.high_level_planning(
        device=device,
        model_obs_dim=model_obs_dim,
        z_dim=3,
        model_output_dim=model_output_dim,
        model_hidden_num=cfg.model_hidden_num,
        model_layer_num=2,
        batch_size=cfg.batch_size,
        model_lr=cfg.model_lr,
        high_level_policy_type='raibert',
        update_sample_policy=cfg.update_sample_policy,
        update_sample_policy_lr=cfg.update_sample_policy_lr,
        low_level_policy_type=cfg.low_level_policy_type,
        num_timestep_per_footstep=50,
        model_update_steps=cfg.model_update_steps,
        control_frequency=cfg.control_frequency)

    low_level_TG = LLTG.low_level_TG(
        device=device,
        z_dim=3,
        a_dim=cfg.a_dim,
        num_timestep_per_footstep=50,
        batch_size=cfg.batch_size,
        low_level_policy_type='IK',
        update_low_level_policy=cfg.update_low_level_policy,
        update_low_level_policy_lr=cfg.update_low_level_policy_lr,
        init_state=init_state,
    )

    # # if args.low_level_policy_type =='NN':
    # #     low_level_TG.load_model('./save_data/trial_2')

    # # # collect data
    collect_data(cfg, env, high_level_planning, low_level_TG, HL_replay_buffer)
Example #13
def train_BCQ(state_dim, action_dim, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"

    # Initialize policy
    if args.model == 'BCQ':
        policy = BCQ.BCQ(state_dim, action_dim, max_action, device,
                         args.discount, args.tau, args.lmbda, args.phi)
    elif args.model == 'BCQREM':
        policy = BCQREM.BCQ(state_dim, action_dim, max_action, device,
                            args.discount, args.tau, args.lmbda, args.phi)
    elif args.model == 'BCQREMshareQ':
        policy = BCQREM_share_Qparam.BCQ(state_dim, action_dim, max_action,
                                         device, args.discount, args.tau,
                                         args.lmbda, args.phi)
    elif args.model == 'BCQREMshareVQ':
        policy = BCQREM_share_VQparam.BCQ(state_dim, action_dim, max_action,
                                          device, args.discount, args.tau,
                                          args.lmbda, args.phi)
    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}")

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    print("NO.1 evaluations...")
    evaluations.append(eval_policy(policy, args.env, args.seed))
    while training_iters < args.max_timesteps:
        vae_loss, actor_loss, critic_loss = policy.train(
            replay_buffer,
            iterations=int(args.eval_freq),
            batch_size=args.batch_size)
        # vae_loss, actor_loss, critic_loss = round(vae_loss.item(), 5), round(actor_loss.item(), 5), round(critic_loss.item(), 5)
        # print(f'times:{training_iters}/[{args.max_timesteps}],  VAE: {vae_loss}, Actor: {actor_loss}, Critic: {critic_loss}')

        evaluations.append(eval_policy(policy, args.env, args.seed))
        if args.model == 'BCQ':
            np.save(f"./results/BCQ_{setting}", evaluations)
        elif args.model == 'BCQREM':
            np.save(f"./results/BCQREM_{setting}", evaluations)
        elif args.model == 'BCQREMshareQ':
            np.save(f"./results/BCQREMshareQ_{setting}", evaluations)
        elif args.model == 'BCQREMshareVQ':
            np.save(f"./results/BCQREMshareVQ_{setting}", evaluations)

        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
Example #14
def train_BEAR(state_dim, action_dim, max_action, device, args):
    print("Training BEAR\n")
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"
    hp_setting = f"N{args.load_buffer_size}_phi{args.phi}_n{args.n_action}_ne{args.n_action_execute}" \
                 f"_{args.score_activation}_k{str(args.sigmoid_k)}_betac{str(args.beta_c)}_betaa{str(args.beta_a)}"

    # Initialize policy
    policy = BEAR.BEAR(2,
                       state_dim,
                       action_dim,
                       max_action,
                       delta_conf=0.1,
                       use_bootstrap=False,
                       version=args.version,
                       lambda_=0.0,
                       threshold=0.05,
                       mode=args.mode,
                       num_samples_match=args.num_samples_match,
                       mmd_sigma=args.mmd_sigma,
                       lagrange_thresh=args.lagrange_thresh,
                       use_kl=(True if args.distance_type == "KL" else False),
                       use_ensemble=(False if args.use_ensemble_variance
                                     == "False" else True),
                       kernel_type=args.kernel_type,
                       actor_lr=args.actor_lr)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/Extended-{buffer_name}",
                       args.load_buffer_size,
                       bootstrap_dim=4)

    if args.actor_lr != 1e-3:
        hp_setting += f"_lr{args.actor_lr}"
    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < args.max_timesteps:
        pol_vals = policy.train(replay_buffer,
                                iterations=int(args.eval_freq),
                                batch_size=args.batch_size)

        evaluations.append(eval_policy(policy, args.env, args.seed))
        np.save(f"./results/BEAR3_{hp_setting}_{buffer_name}", evaluations)

        training_iters += args.eval_freq
        print(f"Training iterations: {training_iters}")
Example #15
def load_checkpoint(checkpoint_path, rb_path, policy, args):
    fpath = os.path.join(checkpoint_path, "model.pyth")
    checkpoint = torch.load(fpath, map_location="cpu")
    # change to default graph before loading
    policy.change_morphology([-1])
    # load and return checkpoint
    policy.actor.load_state_dict(checkpoint["actor_state"])
    policy.critic.load_state_dict(checkpoint["critic_state"])
    policy.actor_target.load_state_dict(checkpoint["actor_target_state"])
    policy.critic_target.load_state_dict(checkpoint["critic_target_state"])
    policy.actor_optimizer.load_state_dict(checkpoint["actor_optimizer_state"])
    policy.critic_optimizer.load_state_dict(
        checkpoint["critic_optimizer_state"])
    # load replay buffer
    all_rb_files = [f[:-4] for f in os.listdir(rb_path) if ".npy" in f]
    all_rb_files.sort()
    replay_buffer_new = dict()
    for name in all_rb_files:
        if len(all_rb_files) > args.rb_max // 1e6:
            replay_buffer_new[name] = utils.ReplayBuffer(
                max_size=args.rb_max // len(all_rb_files))
        else:
            replay_buffer_new[name] = utils.ReplayBuffer()
        replay_buffer_new[name].max_size = int(checkpoint["rb_max"][name])
        replay_buffer_new[name].ptr = int(checkpoint["rb_ptr"][name])
        replay_buffer_new[name].slicing_size = checkpoint["rb_slicing_size"][
            name]
        replay_buffer_new[name].storage = list(
            np.load(os.path.join(rb_path, "{}.npy".format(name))))

    return (
        checkpoint["total_timesteps"],
        checkpoint["episode_num"],
        replay_buffer_new,
        checkpoint["num_samples"],
        fpath,
    )
Example #16
 def __init__(self,
              state_dim,
              action_dim,
              action_bound,
              discount_factor=1,
              seed=1,
              actor_lr=1e-3,
              critic_lr=1e-3,
              batch_size=100,
              namescope='default',
              tau=0.005,
              policy_noise=0.1,
              noise_clip=0.5,
              hidden_size=300):
     np.random.seed(int(seed))
     tf.set_random_seed(seed)
     self.state_dim = state_dim
     self.action_dim = action_dim
     # env.seed(int(seed))
     self.policy_noise = policy_noise
     self.noise_clip = noise_clip
     self.discount_factor = discount_factor
     self.batch_size = batch_size
     self.sess = tf.Session()
     self.hidden_size = hidden_size
     self.actor = Actor(self.sess,
                        state_dim,
                        action_dim,
                        action_bound,
                        actor_lr,
                        tau,
                        int(batch_size),
                        self.hidden_size,
                        namescope=namescope + str(seed))
     self.critic = Critic(self.sess,
                          state_dim,
                          action_dim,
                          critic_lr,
                          tau,
                          self.actor.scaled_out,
                          self.hidden_size,
                          namescope=namescope + str(seed))
     actor_loss = -tf.reduce_mean(self.critic.total_out)
     self.actor_train_step = tf.train.AdamOptimizer(actor_lr).minimize(
         actor_loss, var_list=self.actor.network_params)
     self.action_bound = action_bound
     self.sess.run(tf.global_variables_initializer())
     self.replay_buffer = utils.ReplayBuffer()
Example #17
 def __init__(self,
              env_name='Hopper-v2',
              total_episodes=1000,
              action_bound=1,
              episode_length=1000,
              learning_rate=0.02,
              weight=0.01,
              learning_steps=100,
              num_samples=8,
              noise=0.02,
              bc_index=[],
              std_dev=0.03,
              syn_step=10,
              num_best=4,
              meta_population_size=5,
              seed=1,
              hidden_size=300,
              coefficient=1):
     self.env = gym.make(env_name)
     np.random.seed(seed)
     self.env.seed(seed)
     self.action_bound = action_bound
     self.input_size = self.env.observation_space.shape[0]
     self.output_size = self.env.action_space.shape[0]
     self.total_episodes = total_episodes
     self.episode_length = episode_length
     self.lr = learning_rate
     self.num_best = num_best
     self.num_samples = num_samples
     self.noise = noise
     self.meta_population_size = meta_population_size
     self.seed = seed
     self.syn_step = syn_step
     self.coefficient = coefficient
     self.learning_steps = learning_steps
     self.bc_index = bc_index
     self.weight = weight
     self.normalizer = utils.Normalizer(self.env.observation_space.shape[0])
     self.hidden_size = hidden_size
     self.stddev = std_dev
     self.intrinsic_network = IntrinsicNetwork(state_dim=self.input_size,
                                               action_dim=self.output_size,
                                               seed=self.seed,
                                               namescope=str(seed),
                                               weight=self.weight)
     self.replay = utils.ReplayBuffer()
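
This agent also builds a utils.Normalizer(num_inputs) for observations. A running-statistics sketch compatible with that constructor is given below; the observe/normalize method names are assumptions, since only the constructor appears in the excerpt.

import numpy as np


class Normalizer:
    """Sketch of an online observation normalizer (Welford-style updates)."""

    def __init__(self, num_inputs):
        self.n = 0
        self.mean = np.zeros(num_inputs)
        self.mean_diff = np.zeros(num_inputs)

    def observe(self, x):
        # Update the running mean and the sum of squared deviations.
        self.n += 1
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)

    def normalize(self, x):
        var = self.mean_diff / max(self.n, 1)
        return (x - self.mean) / np.sqrt(np.maximum(var, 1e-2))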
Example #18
    def __init__(self, config):
        self.config = config

        buffer_config = {'seed': config['seed'], 'size': config['buffer_size']}
        self.buffer = utils.ReplayBuffer(config=buffer_config)

        self.rng = np.random.default_rng(config['seed'])

        net_config = {
            'seed': config['seed'],
            'depth': config['network_depth'],
            'width': config['network_width'],
            'num_actions': config['num_actions'],
            'input_dim': config['input_dim']
        }
        self.qnet = QNet(net_config)
        self.targetnet = QNet(net_config)
        self.optimizer = torch.optim.Adam(self.qnet.parameters(),
                                          lr=config['step_size'])
        self.loss = torch.nn.MSELoss()
Example #19
File: show.py  Project: llfl/TD3
def moduleShow(args):
    env = gym.make(args.env_name)
    state_dim = env.observation_space["observation"].shape[
        0] + env.observation_space["desired_goal"].shape[0]
    #state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]
    obs = env.reset()
Example #20
def train_BCQ(state_dim, action_dim, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"

    # Initialize policy
    policy = BCQ_brain.BCQ(state_dim, action_dim, max_action, device,
                           args.discount, args.tau, args.lmbda, args.phi)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}")

    training_iters = 0

    while training_iters < args.max_timesteps:
        pol_vals = policy.train(replay_buffer,
                                iterations=int(args.eval_freq),
                                batch_size=args.batch_size)
        training_iters += args.eval_freq
        #eval_policy(policy, training_iters)
        print(f"Training iterations: {training_iters}")
    return policy.actor_loss, policy.critic_loss, policy.vae_loss
Example #21
def test_vae_state(state_dim, action_dim, max_state, max_action, device, args):
    # For saving files
    setting = f"{args.env}_{args.seed}"
    buffer_name = f"{args.buffer_name}_{setting}"
    hp_setting = f"{args.score_activation}_k{str(args.sigmoid_k)}_betac{str(args.beta_c)}_betaa{str(args.beta_a)}"

    # Initialize policy
    policy = BCQ.BCQ_state(state_dim,
                           action_dim,
                           max_state,
                           max_action,
                           device,
                           args.discount,
                           args.tau,
                           args.lmbda,
                           args.phi,
                           beta_a=args.beta_a,
                           beta_c=args.beta_c,
                           sigmoid_k=args.sigmoid_k)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(state_dim, action_dim, device)
    replay_buffer.load(f"./buffers/{buffer_name}", args.load_buffer_size)

    training_iters = 0

    while training_iters < int(args.max_timesteps / 5):
        vae_loss = policy.train_vae(replay_buffer,
                                    iterations=int(args.eval_freq),
                                    batch_size=args.batch_size)
        print(f"Training iterations: {training_iters}")
        print("VAE loss", vae_loss)
        training_iters += args.eval_freq
    policy.vae2.save(f"./models/vae_{setting}")
    test_loss = policy.test_vae(replay_buffer, batch_size=100000)
    print(test_loss)
    np.save(f"./results/vae_pretrain/elbo_{args.seed}", test_loss)
Example #22
def train_DBCQ(dargs, device):

    if not os.path.exists("./results"):
        os.makedirs("./results")

    if not os.path.exists("./models"):
        os.makedirs("./models")

    # For saving files
    setting = f"{dargs.env}_{dargs.seed}"
    buffer_name = f"{dargs.buffer_name}_{setting}"

    # Initialize policy
    policy = DBCQ.DBCQ(dargs.parameters, dargs.env_properties, device)

    # Load buffer
    replay_buffer = utils.ReplayBuffer(dargs.env_properties["state_dim"],
                                       dargs.env_properties["num_actions"],
                                       device)
    replay_buffer.load(f"./buffers/{buffer_name}")

    evaluations = []
    episode_num = 0
    done = True
    training_iters = 0

    while training_iters < dargs.max_timesteps:
        pol_vals = policy.train(replay_buffer)

        evaluations.append(eval_policy(policy, dargs.env, dargs.seed))
        np.save(f"./results/BCQ_{setting}", evaluations)

        training_iters += dargs.eval_freq
        print(f"Training iterations: {training_iters}")

    return policy
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    if args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    if args.policy == "newDDPG":
        policy = newDDPG.DDPG(**kwargs)
    if args.policy == "newTD3":
        policy = newTD3.TD3(**kwargs)
    if args.policy == "A2C":
        policy = A2C.A2C(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim)

    # Initialize environment
    minerEnv = MinerEnv(HOST, PORT)
    minerEnv.start()
    #init environment

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, minerEnv)]
    train = False
    for episode_i in range(0, N_EPISODE):
        # Reset environment
        mapID = request_to_env(minerEnv, train)
        # init environment game
        minerEnv.reset()
        #action = policy.select_action(np.array(state))
Example #24
def evaluate(env, agent, args, video, adapt=False):
    """Evaluate an agent, optionally adapt using PAD"""
    episode_rewards = []

    for i in tqdm(range(args.pad_num_episodes)):
        ep_agent = deepcopy(agent)  # make a new copy

        if args.use_curl:  # initialize replay buffer for CURL
            replay_buffer = utils.ReplayBuffer(
                obs_shape=env.observation_space.shape,
                action_shape=env.action_space.shape,
                capacity=args.train_steps,
                batch_size=args.pad_batch_size)
        video.init(enabled=True)

        obs = env.reset()
        done = False
        episode_reward = 0
        losses = []
        step = 0
        ep_agent.train()

        while not done:
            # Take step
            with utils.eval_mode(ep_agent):
                action = ep_agent.select_action(obs)
            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            # Make self-supervised update if flag is true
            if adapt:
                if args.use_rot:  # rotation prediction

                    # Prepare batch of cropped observations
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.random_crop(batch_next_obs)

                    # Adapt using rotation prediction
                    losses.append(ep_agent.update_rot(batch_next_obs))

                if args.use_inv:  # inverse dynamics model

                    # Prepare batch of observations
                    batch_obs = utils.batch_from_obs(
                        torch.Tensor(obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_next_obs = utils.batch_from_obs(
                        torch.Tensor(next_obs).cuda(),
                        batch_size=args.pad_batch_size)
                    batch_action = torch.Tensor(action).cuda().unsqueeze(
                        0).repeat(args.pad_batch_size, 1)

                    # Adapt using inverse dynamics prediction
                    losses.append(
                        ep_agent.update_inv(utils.random_crop(batch_obs),
                                            utils.random_crop(batch_next_obs),
                                            batch_action))

                if args.use_curl:  # CURL

                    # Add observation to replay buffer for use as negative samples
                    # (only first argument obs is used, but we store all for convenience)
                    replay_buffer.add(obs, action, reward, next_obs, True)

                    # Prepare positive and negative samples
                    obs_anchor, obs_pos = get_curl_pos_neg(
                        next_obs, replay_buffer)

                    # Adapt using CURL
                    losses.append(
                        ep_agent.update_curl(obs_anchor, obs_pos, ema=True))

            video.record(env, losses)
            obs = next_obs
            step += 1

        video.save(
            f'{args.mode}_pad_{i}.mp4' if adapt else f'{args.mode}_{i}.mp4')
        episode_rewards.append(episode_reward)

    return np.mean(episode_rewards)
Example #25
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BNNTD3":
        policy = BNNTD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "BootstrapTD3":
        if args.actor_branches > 0:
            actor_branches = args.actor_branches
        else:
            actor_branches = args.branches
        policy = BootstrapTD3.TD3(state_dim, action_dim, max_action, args.branches, actor_branches)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    if args.actor_branches > 0:
        branches = args.actor_branches
    else:
        branches = args.branches
    branch = sample_branch(branches)
    done = True

    while total_timesteps < args.max_timesteps:
Example #26
        policy = TD3.TD3(**kwargs)
    if args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    if args.policy == "newDDPG":
        policy = newDDPG.DDPG(**kwargs)
    if args.policy == "TD3_conv":
        policy = TD3_conv.TD3(**kwargs)
    if args.policy == "A2C":
        policy = A2C.A2C(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = utils.ReplayBuffer(state_dim,
                                       action_dim=action_dim,
                                       max_size=int(10000))

    # Initialize environment
    minerEnv = MinerEnv(HOST, PORT)
    minerEnv.start()
    #init environment

    # Evaluate untrained policy
    #evaluations = [eval_policy(policy, minerEnv)]
    train = False
    best_score = {1: 0, 2: 0, 3: 0, 4: 0}
    for episode_i in range(0, N_EPISODE):
        # Reset environment
        mapID = request_to_env(minerEnv, train)
        # init environment game
Example #27
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)

    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) +'-b'  \
    + str(args.batch_size) + '-s' + str(args.seed)  + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically

        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
Example #28
def train(sess, env, args, actor, critic, action_bound):

    # update actor network by the deterministic policy gradient:
    actor_loss = -tf.reduce_mean(critic.total_out_scaled)
    actor_train_step = tf.train.AdamOptimizer(args['actor_lr']).minimize(
        actor_loss, var_list=actor.network_params)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    saver = tf.train.Saver()
    if tf.train.checkpoint_exists(os.path.join(args['save_dir'], args['env'])):
        saver.restore(sess, os.path.join(args['save_dir'], args['env']))
        print("Loading pre-trained model...")

    actor.update_target_network()
    critic.update_target_network()
    replay_buffer = utils.ReplayBuffer()

    total_timesteps = 0
    episode_num = 0
    done = True

    episode_reward = 0
    episode_timesteps = 0

    while total_timesteps < args['max_timesteps']:

        # start the trained model after a while, i.e., after save_timesteps
        if total_timesteps != 0 and total_timesteps % args[
                'save_timesteps'] == 0:
            print('start saving ...')
            saver.save(sess, os.path.join(args['save_dir'], args['env']))

        if done:
            # train
            if total_timesteps != 0:
                print("total - ", total_timesteps, "episode num ", episode_num,
                      "episode reward ", episode_reward)
                training(sess, actor, critic, actor_train_step, action_bound,
                         replay_buffer, args, episode_timesteps)
            # evaluate
            if total_timesteps != 0 and episode_num % args[
                    'eval_episodes'] == 0:
                print('start evaluating ...')
                eval(env, actor)

            # book-keeping
            summary_str = sess.run(summary_ops,
                                   feed_dict={
                                       summary_vars[0]: episode_reward,
                                       summary_vars[1]: episode_timesteps
                                   })
            writer.add_summary(summary_str, total_timesteps)
            writer.flush()

            s = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # sample action
        if total_timesteps < args['start_timesteps']:
            action = env.action_space.sample()
        else:
            action = actor.predict(np.reshape(s, (1, actor.s_dim)))
            action = (action + np.random.normal(
                0, args['expl_noise'], size=action.shape)).clip(
                    env.action_space.low, env.action_space.high)
            action = np.reshape(action, [-1])

        s2, r, done, info = env.step(action)
        episode_reward += r
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(
            done)
        replay_buffer.add((s, s2, action, r, done_bool))
        s = s2
        episode_timesteps += 1
        total_timesteps += 1
Example #29
def main(args):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    args.work_dir = os.path.join(
        args.work_dir,
        args.domain_name + "_" + args.task_name,
        args.exp_name,
        str(args.seed),
    )
    os.makedirs(args.work_dir, exist_ok=True)
    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    train_envs = [
        utils.make_env(np.random.randint(0, 255), args)
        for i in range(args.num_envs)
    ]
    eval_envs = [
        utils.make_env(np.random.randint(0, 255), args) for i in range(5)
    ]
    print("Train env backgrounds: ",
          [train_env.bg_color for train_env in train_envs])
    print("Eval env backgrounds: ",
          [eval_env.bg_color for eval_env in eval_envs])

    obs_shape = train_envs[0].observation_space.shape
    action_size = train_envs[0].action_space.shape[0]

    phi = Encoder(obs_shape, args.encoder_feature_dim).to(device)
    model = DynamicsModel(args.encoder_feature_dim, action_size).to(device)
    decoders = [
        Decoder(obs_shape, args.encoder_feature_dim).to(device)
        for i in range(args.num_envs)
    ]
    opt = torch.optim.Adam(list(phi.parameters()) + list(model.parameters()),
                           lr=args.lr)
    decoder_opt = torch.optim.Adam(np.concatenate(
        [list(decoder.parameters()) for decoder in decoders]),
                                   lr=args.lr)

    train_replay_buffer = utils.ReplayBuffer(
        obs_shape=train_envs[0].observation_space.shape,
        action_shape=train_envs[0].action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )
    eval_replay_buffer = utils.ReplayBuffer(
        obs_shape=train_envs[0].observation_space.shape,
        action_shape=train_envs[0].action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )

    logging_dict = {
        "model_error": [],
        "decoding_error": [],
        "eval_model_error": [],
        "steps": [],
    }

    # collect data across environments
    for env_id in range(args.num_envs):
        train_replay_buffer = utils.collect_random_data(
            train_envs[env_id],
            env_id,
            args.num_samples,
            train_replay_buffer,
            save_video=args.save_video,
        )
        eval_replay_buffer = utils.collect_random_data(eval_envs[env_id],
                                                       env_id,
                                                       args.num_samples,
                                                       eval_replay_buffer)

    # Train loop
    for iteration in range(args.num_iters):
        model_error = 0
        decoder_error = 0
        for i in range(args.num_envs):
            obses, actions, rewards, next_obses, not_dones = train_replay_buffer.sample(
                i)
            latent = phi(obses)
            pred_next_latent = model(latent, actions)
            true_next_latent = phi(next_obses).detach()
            error_e = F.mse_loss(pred_next_latent, true_next_latent)
            model_error += error_e

            if args.one_decoder:
                pred_next_obses = decoders[0](
                    pred_next_latent)  # only use one decoder
            else:
                pred_next_obses = decoders[i](pred_next_latent)
            decoder_error_e = F.mse_loss(pred_next_obses, next_obses)
            decoder_error += decoder_error_e

        opt.zero_grad()
        model_error.backward(retain_graph=True)
        opt.step()

        decoder_opt.zero_grad()
        decoder_error.backward()
        decoder_opt.step()
        if iteration % args.log_interval == 0:
            with torch.no_grad():
                logging_dict["steps"].append(iteration)
                logging_dict["model_error"].append(model_error.item())
                logging_dict["decoding_error"].append(decoder_error.item())
                print(
                    f"Iteration {iteration}: Mean train set model error: {model_error.mean()}, decoding error: {decoder_error.mean()}%%"
                )

                # Evaluate on test environment
                (
                    obses,
                    actions,
                    rewards,
                    next_obses,
                    not_dones,
                ) = eval_replay_buffer.sample()
                with torch.no_grad():
                    latent = phi(obses)
                    pred_next_latent = model(latent, actions)
                    true_next_latent = phi(next_obses).detach()
                    test_error = F.mse_loss(pred_next_latent, true_next_latent)
                logging_dict["eval_model_error"].append(test_error.item())
                print(f"Mean test set error: {test_error}")
            torch.save(logging_dict,
                       os.path.join(args.work_dir, "logging_dict.pt"))
Example #30
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        os.makedirs(os.path.join(args.log_dir, args.log_file))

    env = gym.make(args.env_name)
    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]

    max_action = int(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(state_dim=state_dim,
                                        goal_dim=state_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=args.ctrl_act_lr,
                                        critic_lr=args.ctrl_crit_lr,
                                        ctrl_rew_type=args.ctrl_rew_type)

    manager_policy = hiro.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=state_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals)

    calculate_controller_reward = hiro_controller_reward

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)

    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Logging Parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps, args.ctrl_batch_size,
                    args.discount, args.ctrl_tau)

                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss,
                                  total_timesteps)
                writer.add_scalar('data/controller_critic_loss',
                                  ctrl_crit_loss, total_timesteps)

                writer.add_scalar('data/controller_ep_rew', episode_reward,
                                  total_timesteps)
                writer.add_scalar('data/manager_ep_rew', manager_transition[4],
                                  total_timesteps)

                # Train Manager
                if timesteps_since_manager >= args.train_manager_freq:
                    print('Training Manager...')

                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)

                    writer.add_scalar('data/manager_actor_loss', man_act_loss,
                                      total_timesteps)
                    writer.add_scalar('data/manager_critic_loss',
                                      man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew,
                                      total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew',
                                      avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps,
                                      total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved',
                                      avg_env_finish, total_timesteps)

                    evaluations.append(
                        [avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller',
                                               directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager',
                                            directory="./pytorch_models")

                    np.save("./results/%s" % (file_name), evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have same length of sequences
                    if len(manager_transition[-2]
                           ) <= args.manager_propose_freq:
                        while len(manager_transition[-2]
                                  ) <= args.manager_propose_freq:
                            manager_transition[-1].append(np.inf)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']
            """
            obs = env.reset()  
            => {"observation", "achieved_goal", "desired_goal"}
                    (10, )        (3, )            (3, )
            goal = obs['desired_goal']  => (3, )
            state = obs['observation']  => (10, )
            """

            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

            # Create new manager transition
            subgoal = manager_policy.sample_goal(state, goal)

            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (nextst, r, d)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal,
                                                       next_state)

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        controller_buffer.add(
            (
                state, next_state, subgoal, \
                action, controller_reward, float(done), \
                [], []
            )
        )

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(True)

            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=np.inf)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

    # Final evaluation
    evaluations.append([
        evaluate_policy(env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))
    ])

    if args.save_models:
        controller_policy.save(file_name + '_controller',
                               directory="./pytorch_models")
        manager_policy.save(file_name + '_manager',
                            directory="./pytorch_models")

    np.save("./results/%s" % (file_name), evaluations)