Example #1
def run_experiment(args):
    torch.set_num_threads(1) # see: https://github.com/pytorch/pytorch/issues/13757

    from apex import env_factory, create_logger

    # # Environment
    # if(args.env in ["Cassie-v0", "Cassie-mimic-v0", "Cassie-mimic-walking-v0"]):
    #     # NOTE: importing cassie for some reason breaks openai gym, BUG ?
    #     from cassie import CassieEnv, CassieTSEnv, CassieIKEnv
    #     from cassie.no_delta_env import CassieEnv_nodelta
    #     from cassie.speed_env import CassieEnv_speed
    #     from cassie.speed_double_freq_env import CassieEnv_speed_dfreq
    #     from cassie.speed_no_delta_env import CassieEnv_speed_no_delta
    #     # set up cassie environment
    #     # import gym_cassie
    #     # env_fn = gym_factory(args.env_name)
    #     #env_fn = make_env_fn(state_est=args.state_est)
    #     #env_fn = functools.partial(CassieEnv_speed_dfreq, "walking", clock_based = True, state_est=args.state_est)
    #     env_fn = functools.partial(CassieIKEnv, clock_based=True, state_est=args.state_est)
    #     print(env_fn().clock_inds)
    #     obs_dim = env_fn().observation_space.shape[0]
    #     action_dim = env_fn().action_space.shape[0]

    #     # Mirror Loss
    #     if args.mirror:
    #         if args.state_est:
    #             # with state estimator
    #             env_fn = functools.partial(SymmetricEnv, env_fn, mirrored_obs=[0.1, 1, 2, 3, 4, -10, -11, 12, 13, 14, -5, -6, 7, 8, 9, 15, 16, 17, 18, 19, 20, -26, -27, 28, 29, 30, -21, -22, 23, 24, 25, 31, 32, 33, 37, 38, 39, 34, 35, 36, 43, 44, 45, 40, 41, 42, 46, 47, 48], mirrored_act=[-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4])
    #         else:
    #             # without state estimator
    #             env_fn = functools.partial(SymmetricEnv, env_fn, mirrored_obs=[0.1, 1, 2, 3, 4, 5, -13, -14, 15, 16, 17,
    #                                             18, 19, -6, -7, 8, 9, 10, 11, 12, 20, 21, 22, 23, 24, 25, -33,
    #                                             -34, 35, 36, 37, 38, 39, -26, -27, 28, 29, 30, 31, 32, 40, 41, 42],
    #                                             mirrored_act = [-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4])
    # else:
    #     import gym
    #     env_fn = gym_factory(args.env_name)
    #     #max_episode_steps = env_fn()._max_episode_steps
    #     obs_dim = env_fn().observation_space.shape[0]
    #     action_dim = env_fn().action_space.shape[0]
    #     max_episode_steps = 1000

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, state_est=args.state_est, mirror=args.mirror, speed=args.speed)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.previous is not None:
        policy = torch.load(args.previous)
        print("loaded model from {}".format(args.previous))
    else:
        policy = GaussianMLP_Actor(
            obs_dim, action_dim,
            env_name=args.env_name,
            nonlinearity=torch.nn.functional.relu, 
            bounded=True, 
            init_std=np.exp(-2), 
            learn_std=False,
            normc_init=False
        )
        policy_copy = GaussianMLP_Actor(
            obs_dim, action_dim, 
            env_name=args.env_name,
            nonlinearity=torch.nn.functional.relu, 
            bounded=True, 
            init_std=np.exp(-2), 
            learn_std=False,
            normc_init=False
        )
        critic = GaussianMLP_Critic(
            obs_dim, 
            env_name=args.env_name,
            nonlinearity=torch.nn.functional.relu, 
            bounded=True, 
            init_std=np.exp(-2), 
            learn_std=False,
            normc_init=False
        )

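        # compute input-normalization statistics once, then share them with the critic and the policy copy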
        policy.obs_mean, policy.obs_std = map(
            torch.Tensor,
            get_normalization_params(iter=args.input_norm_steps,
                                     noise_std=1,
                                     policy=policy,
                                     env_fn=env_fn))
        critic.obs_mean = policy.obs_mean
        critic.obs_std = policy.obs_std
        policy_copy.obs_mean = policy.obs_mean
        policy_copy.obs_std = policy.obs_std
        policy_copy.train(False)

    policy.train(False)
    critic.train(False)

    print("obs_dim: {}, action_dim: {}".format(obs_dim, action_dim))

    if args.mirror:
        algo = MirrorPPO(args=vars(args))
    else:
        algo = PPO(args=vars(args))

    # create a tensorboard logging object
    logger = create_logger(args)

    print()
    print("Synchronous Distributed Proximal Policy Optimization:")
    print("\tenv:            {}".format(args.env_name))
    print("\tmax traj len:   {}".format(args.max_traj_len))
    print("\tseed:           {}".format(args.seed))
    print("\tmirror:         {}".format(args.mirror))
    print("\tnum procs:      {}".format(args.num_procs))
    print("\tlr:             {}".format(args.lr))
    print("\teps:            {}".format(args.eps))
    print("\tlam:            {}".format(args.lam))
    print("\tgamma:          {}".format(args.gamma))
    print("\tentropy coeff:  {}".format(args.entropy_coeff))
    print("\tclip:           {}".format(args.clip))
    print("\tminibatch size: {}".format(args.minibatch_size))
    print("\tepochs:         {}".format(args.epochs))
    print("\tnum steps:      {}".format(args.num_steps))
    print("\tuse gae:        {}".format(args.use_gae))
    print("\tmax grad norm:  {}".format(args.max_grad_norm))
    print("\tmax traj len:   {}".format(args.max_traj_len))
    print()

    algo.train(env_fn, policy, policy_copy, critic, args.n_itr, logger=logger)
Example #2
    def __init__(self, args):

        self.logger = create_logger(args)
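Across these examples, the object returned by create_logger is used like a TensorBoard SummaryWriter (add_scalar, add_histogram) and exposes a .dir attribute that the scripts use to build checkpoint paths. A minimal usage sketch under that assumption; the args namespace and its fields here are hypothetical:

import os
from argparse import Namespace
from apex import create_logger

# hypothetical namespace; the real scripts build this with argparse and pass many more fields
args = Namespace(env_name="Cassie-v0", seed=0, logdir="./logs")

logger = create_logger(args)                       # assumed to return a SummaryWriter-like object
logger.add_scalar("Test/Return", 0.0, 0)           # log a scalar metric at step 0
save_path = os.path.join(logger.dir, "actor.pt")   # checkpoint path inside the run's log directory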
Example #3
def run_experiment(args):

  # wrapper function for creating parallelized envs
  env_thunk = env_factory(args.env_name)
  with env_thunk() as env:
      obs_space = env.observation_space.shape[0]
      act_space = env.action_space.shape[0]

  # wrapper function for creating parallelized policies
  def policy_thunk():
    from rl.policies.actor import FF_Actor, LSTM_Actor, Linear_Actor
    if args.load_model is not None:
      return torch.load(args.load_model)
    else:
      if not args.recurrent:
        policy = Linear_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()
      else:
        policy = LSTM_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()

      # policy parameters should be zero initialized according to ARS paper
      for p in policy.parameters():
        p.data = torch.zeros(p.shape)
      return policy

  # the 'black box' function that will get passed into ARS
  def eval_fn(policy, env, reward_shift, traj_len, visualize=False, normalize=False):
    if hasattr(policy, 'init_hidden_state'):
      policy.init_hidden_state()

    state = torch.tensor(env.reset()).float()
    rollout_reward = 0
    done = False

    timesteps = 0
    while not done and timesteps < traj_len:
      if normalize:
        state = policy.normalize_state(state)
      action = policy.forward(state).detach().numpy()
      state, reward, done, _ = env.step(action)
      state = torch.tensor(state).float()
      rollout_reward += reward - reward_shift
      timesteps+=1
    return rollout_reward, timesteps
  import locale
  locale.setlocale(locale.LC_ALL, '')

  print("Augmented Random Search:")
  print("\tenv:          {}".format(args.env_name))
  print("\tseed:         {}".format(args.seed))
  print("\ttimesteps:    {:n}".format(args.timesteps))
  print("\tstd:          {}".format(args.std))
  print("\tdeltas:       {}".format(args.deltas))
  print("\tstep size:    {}".format(args.lr))
  print("\treward shift: {}".format(args.reward_shift))
  print()
  algo = ARS(policy_thunk, env_thunk, deltas=args.deltas, step_size=args.lr, std=args.std, workers=args.workers, redis_addr=args.redis)

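  # ARS "V2" (args.algo == 'v2') adds online observation normalization on top of plain "V1";
  # it is applied inside eval_fn via policy.normalize_state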
  if args.algo not in ['v1', 'v2']:
    print("Valid arguments for --algo are 'v1' and 'v2'")
    exit(1)
  elif args.algo == 'v2':
    normalize_states = True
  else:
    normalize_states = False

  def black_box(p, env):
    return eval_fn(p, env, args.reward_shift, args.traj_len, normalize=normalize_states)

  avg_reward = 0
  timesteps = 0
  i = 0

  logger = create_logger(args)

#   if args.save_model is None:
#     args.save_model = os.path.join(logger.dir, 'actor.pt')

  args.save_model = os.path.join(logger.dir, 'actor.pt')

  env = env_thunk()
  while timesteps < args.timesteps:
    if not i % args.average_every:
      avg_reward = 0
      print()

    start = time.time()
    samples = algo.step(black_box)
    elapsed = time.time() - start
    iter_reward = 0
    for eval_rollout in range(10):
      reward, _ = eval_fn(algo.policy, env, 0, args.traj_len, normalize=normalize_states)
      iter_reward += reward / 10


    timesteps += samples
    avg_reward += iter_reward
    secs_per_sample = 1000 * elapsed / samples
    print(("iter {:4d} | "
           "ret {:6.2f} | "
           "last {:3d} iters: {:6.2f} | "
           "{:0.4f}s per 1k steps | "
           "timesteps {:10n}").format(i+1,  \
            iter_reward, (i%args.average_every)+1,      \
            avg_reward/((i%args.average_every)+1), \
            secs_per_sample, timesteps),    \
            end="\r")
    i += 1

    logger.add_scalar('eval', iter_reward, timesteps)
    torch.save(algo.policy, args.save_model)
Example #4
def run_experiment(args):
    from time import time

    from apex import env_factory, create_logger
    from rl.policies.critic import FF_Critic, LSTM_Critic
    from rl.policies.actor import FF_Actor, LSTM_Actor

    import locale, os
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env = env_factory(args.env_name)()
    eval_env = env_factory(args.env_name)()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if hasattr(env, 'seed'):
        env.seed(args.seed)

    obs_space = env.observation_space.shape[0]
    act_space = env.action_space.shape[0]

    if args.recurrent:
        actor = LSTM_Actor(obs_space,
                           act_space,
                           hidden_size=args.hidden_size,
                           env_name=args.env_name,
                           hidden_layers=args.layers)
        critic = LSTM_Critic(obs_space,
                             act_space,
                             hidden_size=args.hidden_size,
                             env_name=args.env_name,
                             hidden_layers=args.layers)
    else:
        actor = FF_Actor(obs_space,
                         act_space,
                         hidden_size=args.hidden_size,
                         env_name=args.env_name,
                         hidden_layers=args.layers)
        critic = FF_Critic(obs_space,
                           act_space,
                           hidden_size=args.hidden_size,
                           env_name=args.env_name,
                           hidden_layers=args.layers)

    algo = DPG(actor,
               critic,
               args.a_lr,
               args.c_lr,
               discount=args.discount,
               tau=args.tau,
               center_reward=args.center_reward,
               normalize=args.normalize)

    replay_buff = ReplayBuffer(obs_space, act_space, args.timesteps)

    if algo.recurrent:
        print("Recurrent Deterministic Policy Gradients:")
    else:
        print("Deep Deterministic Policy Gradients:")
    print("\tenv:            {}".format(args.env_name))
    print("\tseed:           {}".format(args.seed))
    print("\ttimesteps:      {:n}".format(args.timesteps))
    print("\tactor_lr:       {}".format(args.a_lr))
    print("\tcritic_lr:      {}".format(args.c_lr))
    print("\tdiscount:       {}".format(args.discount))
    print("\ttau:            {}".format(args.tau))
    print("\tnorm reward:    {}".format(args.center_reward))
    print("\tbatch_size:     {}".format(args.batch_size))
    print("\twarmup period:  {:n}".format(args.start_timesteps))
    print()

    iter = 0
    episode_reward = 0
    episode_timesteps = 0

    # create a tensorboard logging object
    logger = create_logger(args)

    if args.save_actor is None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    if args.save_critic is None:
        args.save_critic = os.path.join(logger.dir, 'critic.pt')

    # Keep track of some statistics for each episode
    training_start = time()
    episode_start = time()
    episode_loss = 0
    update_steps = 0
    best_reward = None

    # Fill replay buffer, update policy until n timesteps have passed
    timesteps = 0
    state = env.reset().astype(np.float32)
    while timesteps < args.timesteps:
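        # the buffer counts as ready once it holds more than one batch:
        # full episodes for recurrent policies, single transitions otherwise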
        buffer_ready = (algo.recurrent and iter > args.batch_size) or (
            not algo.recurrent and replay_buff.size > args.batch_size)
        warmup = timesteps < args.start_timesteps

        state, r, done = collect_experience(algo.behavioral_actor,
                                            env,
                                            replay_buff,
                                            state,
                                            episode_timesteps,
                                            max_len=args.traj_len,
                                            random_action=warmup,
                                            noise=args.expl_noise,
                                            do_trajectory=algo.recurrent,
                                            normalize=algo.normalize)
        episode_reward += r
        episode_timesteps += 1
        timesteps += 1

        # Update the policy once our replay buffer is big enough
        if buffer_ready and done and not warmup:
            update_steps = 0
            if not algo.recurrent:
                num_updates = episode_timesteps * args.updates
            else:
                num_updates = args.updates
            for _ in range(num_updates):
                u_loss, u_steps = algo.update_policy(replay_buff,
                                                     args.batch_size,
                                                     traj_len=args.traj_len)
                episode_loss += u_loss / num_updates
                update_steps += u_steps

        if done:
            episode_elapsed = (time() - episode_start)
            episode_secs_per_sample = episode_elapsed / episode_timesteps
            logger.add_scalar(args.env_name + ' episode length',
                              episode_timesteps, iter)
            logger.add_scalar(args.env_name + ' episode reward',
                              episode_reward, iter)
            logger.add_scalar(args.env_name + ' critic loss', episode_loss,
                              iter)

            completion = 1 - float(timesteps) / args.timesteps
            avg_sample_r = (time() - training_start) / timesteps
            secs_remaining = avg_sample_r * args.timesteps * completion
            hrs_remaining = int(secs_remaining // (60 * 60))
            min_remaining = int(secs_remaining - hrs_remaining * 60 * 60) // 60

            if iter % args.eval_every == 0 and iter != 0:
                eval_reward = eval_policy(algo.behavioral_actor,
                                          eval_env,
                                          max_traj_len=args.traj_len)
                logger.add_scalar(args.env_name + ' eval episode', eval_reward,
                                  iter)
                logger.add_scalar(args.env_name + ' eval timestep',
                                  eval_reward, timesteps)

                print(
                    "evaluation after {:4d} episodes | return: {:7.3f} | timesteps {:9n}{:100s}"
                    .format(iter, eval_reward, timesteps, ''))

                if best_reward is None or eval_reward > best_reward:
                    torch.save(algo.behavioral_actor, args.save_actor)
                    torch.save(algo.behavioral_critic, args.save_critic)
                    best_reward = eval_reward
                    print("\t(best policy so far! saving to {})".format(
                        args.save_actor))

        try:
            print(
                "episode {:5d} | episode timestep {:5d}/{:5d} | return {:5.1f} | update timesteps: {:7n} | {:3.1f}s/1k samples | approx. {:3d}h {:02d}m remain\t\t\t\t"
                .format(iter, episode_timesteps, args.traj_len, episode_reward,
                        update_steps, 1000 * episode_secs_per_sample,
                        hrs_remaining, min_remaining),
                end='\r')

        except NameError:
            pass

        if done:
            if hasattr(algo.behavioral_actor, 'init_hidden_state'):
                algo.behavioral_actor.init_hidden_state()

            episode_start, episode_reward, episode_timesteps, episode_loss = time(), 0, 0, 0
            iter += 1
Example #5
def run_experiment(args):
    from apex import env_factory, create_logger

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name,
                         state_est=args.state_est,
                         mirror=args.mirror,
                         history=args.history)
    max_traj_len = args.max_traj_len

    # Start ray
    ray.init(num_gpus=0, include_webui=True, redis_address=args.redis_address)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]
    max_action = 1.0
    #max_action = float(env.action_space.high[0])

    print()
    print("Synchronous Twin-Delayed Deep Deterministic policy gradients:")
    print("\tenv:            {}".format(args.env_name))
    print("\tmax traj len:   {}".format(args.max_traj_len))
    print("\tseed:           {}".format(args.seed))
    print("\tmirror:         {}".format(args.mirror))
    print("\tnum procs:      {}".format(args.num_procs))
    print("\tmin steps:      {}".format(args.min_steps))
    print("\ta_lr:           {}".format(args.a_lr))
    print("\tc_lr:           {}".format(args.c_lr))
    print("\ttau:            {}".format(args.tau))
    print("\tgamma:          {}".format(args.discount))
    print("\tact noise:      {}".format(args.act_noise))
    print("\tparam noise:    {}".format(args.param_noise))
    if args.param_noise:
        print("\tnoise scale:    {}".format(args.noise_scale))
    print("\tbatch size:     {}".format(args.batch_size))

    print("\tpolicy noise:   {}".format(args.policy_noise))
    print("\tnoise clip:     {}".format(args.noise_clip))
    print("\tpolicy freq:    {}".format(args.policy_freq))
    print()

    # Initialize policy, replay buffer
    policy = TD3(state_dim,
                 action_dim,
                 max_action,
                 a_lr=args.a_lr,
                 c_lr=args.c_lr,
                 env_name=args.env_name)

    replay_buffer = ReplayBuffer()

    # create a tensorboard logging object
    logger = create_logger(args)

    # Initialize param noise (or set to None)
    param_noise = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    total_timesteps = 0
    total_updates = 0
    timesteps_since_eval = 0
    episode_num = 0

    # Evaluate untrained policy once
    ret, eplen = evaluate_policy(env_fn(), policy)
    logger.add_scalar("Test/Return", ret, total_updates)
    logger.add_scalar("Test/Eplen", eplen, total_updates)

    policy.save(logger.dir)

    while total_timesteps < args.max_timesteps:

        # collect parallel experience and add to replay buffer
        merged_transitions, episode_timesteps = parallel_collect_experience(
            policy,
            env_fn,
            args.act_noise,
            args.min_steps,
            max_traj_len,
            num_procs=args.num_procs)
        replay_buffer.add_parallel(merged_transitions)
        total_timesteps += episode_timesteps
        timesteps_since_eval += episode_timesteps
        episode_num += args.num_procs

        # Logging rollouts
        print("Total T: {} Episode Num: {} Episode T: {}".format(
            total_timesteps, episode_num, episode_timesteps))

        # update the policy
        avg_q1, avg_q2, q_loss, pi_loss, avg_action = policy.train(
            replay_buffer, episode_timesteps, args.batch_size, args.discount,
            args.tau, args.policy_noise, args.noise_clip, args.policy_freq)
        total_updates += episode_timesteps  # this is how many iterations we did updates for

        # Logging training
        logger.add_scalar("Train/avg_q1", avg_q1, total_updates)
        logger.add_scalar("Train/avg_q2", avg_q2, total_updates)
        logger.add_scalar("Train/q_loss", q_loss, total_updates)
        logger.add_scalar("Train/pi_loss", pi_loss, total_updates)
        logger.add_histogram("Train/avg_action", avg_action, total_updates)

        # Evaluate episode
        if timesteps_since_eval >= args.eval_freq:
            timesteps_since_eval = 0
            ret, eplen = evaluate_policy(env_fn(), policy)

            # Logging Eval
            logger.add_scalar("Test/Return", ret, total_updates)
            logger.add_scalar("Test/Eplen", eplen, total_updates)
            logger.add_histogram("Test/avg_action", avg_action, total_updates)

            # Logging Totals
            logger.add_scalar("Misc/Timesteps", total_timesteps, total_updates)
            logger.add_scalar("Misc/ReplaySize", replay_buffer.ptr,
                              total_updates)

            print("Total T: {}\tEval Return: {}\t Eval Eplen: {}".format(
                total_timesteps, ret, eplen))

            if args.save_models:
                policy.save()

    # Final evaluation
    ret, eplen = evaluate_policy(env_fn(), policy)
    logger.add_scalar("Test/Return", ret, total_updates)
    logger.add_scalar("Test/Eplen", eplen, total_updates)

    # Final Policy Save
    if args.save_models:
        policy.save()
Example #6
def run_experiment(args):
    from apex import env_factory, create_logger

    torch.set_num_threads(1)

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name,
                         traj=args.traj,
                         state_est=args.state_est,
                         dynamics_randomization=args.dyn_random,
                         mirror=args.mirror,
                         clock_based=args.clock_based,
                         history=args.history)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.previous is not None:
        policy = torch.load(args.previous + "actor.pt")
        critic = torch.load(args.previous + "critic.pt")
        # TODO: add ability to load previous hyperparameters, if this is something that we even want
        # with open(args.previous + "experiment.pkl", 'rb') as file:
        #     args = pickle.loads(file.read())
        print("loaded model from {}".format(args.previous))
    else:
        if args.recurrent:
            policy = Gaussian_LSTM_Actor(obs_dim,
                                         action_dim,
                                         fixed_std=np.exp(-2),
                                         env_name=args.env_name)
            critic = LSTM_V(obs_dim)
        else:
            policy = Gaussian_FF_Actor(obs_dim,
                                       action_dim,
                                       fixed_std=np.exp(-2),
                                       env_name=args.env_name)
            critic = FF_V(obs_dim)

        with torch.no_grad():
            policy.obs_mean, policy.obs_std = map(
                torch.Tensor,
                get_normalization_params(iter=args.input_norm_steps,
                                         noise_std=1,
                                         policy=policy,
                                         env_fn=env_fn))
        critic.obs_mean = policy.obs_mean
        critic.obs_std = policy.obs_std

    print("obs_dim: {}, action_dim: {}".format(obs_dim, action_dim))

    # create a tensorboard logging object
    logger = create_logger(args)

    algo = PPO(args=vars(args), save_path=logger.dir)

    print()
    print("Synchronous Distributed Proximal Policy Optimization:")
    print("\tenv:            {}".format(args.env_name))
    print("\trun name:       {}".format(args.run_name))
    print("\tmax traj len:   {}".format(args.max_traj_len))
    print("\tseed:           {}".format(args.seed))
    print("\tmirror:         {}".format(args.mirror))
    print("\tnum procs:      {}".format(args.num_procs))
    print("\tlr:             {}".format(args.lr))
    print("\teps:            {}".format(args.eps))
    print("\tlam:            {}".format(args.lam))
    print("\tgamma:          {}".format(args.gamma))
    print("\tentropy coeff:  {}".format(args.entropy_coeff))
    print("\tclip:           {}".format(args.clip))
    print("\tminibatch size: {}".format(args.minibatch_size))
    print("\tepochs:         {}".format(args.epochs))
    print("\tnum steps:      {}".format(args.num_steps))
    print("\tuse gae:        {}".format(args.use_gae))
    print("\tmax grad norm:  {}".format(args.max_grad_norm))
    print("\tmax traj len:   {}".format(args.max_traj_len))
    print()

    algo.train(env_fn, policy, critic, args.n_itr, logger=logger)
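Each run_experiment above expects an argparse.Namespace carrying every hyperparameter it reads. A heavily trimmed, hypothetical driver; only a few flags are shown, their names mirror the attribute names read in the snippets, and the defaults are illustrative only:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env_name", default="Cassie-v0")        # environment id passed to env_factory
    parser.add_argument("--seed", type=int, default=0)            # torch / numpy seed
    parser.add_argument("--max_traj_len", type=int, default=300)  # rollout length cap
    args = parser.parse_args()

    run_experiment(args)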