コード例 #1
0
 def __init__(self,
              agent,
              env,
              steps_per_epoch=4000,
              epochs=50,
              seed=0,
              output_dir=None,
              output_fname='progress.txt',
              exp_name=None,
              max_ep_len=1000,
              gamma=0.99,
              lam=0.97):
     self.epoch_len, self.n_epochs = steps_per_epoch, epochs
     self.max_ep_len = max_ep_len
     self.logger = EpochLogger(output_dir=output_dir,
                               output_fname=output_fname,
                               exp_name=exp_name)
     print('locals')
     for key, val in locals().items():
         print('{}: {}'.format(key, len(str(val))))
     # self.logger.save_config(locals())
     self.env, self.agent = env, agent
     self.buffer = OnPolicyBuffer(steps_per_epoch, gamma=gamma, lam=lam)
     saver_kwargs = agent.build_graph(env.observation_space,
                                      env.action_space)
     self.logger.setup_tf_saver(**saver_kwargs)
     var_counts = tuple(
         tf_utils.trainable_count(scope) for scope in ['pi', 'v'])
     self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                     var_counts)
     np.random.seed(seed)
     tf.set_random_seed(seed)
コード例 #2
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPG')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

        self.start_steps = 10000
コード例 #3
0
    def __init__(self, env, EP_MAX=1000, EP_LEN=250, GAMMA=0.99, LR = 0.0001, BATCH=32, UPDATE_STEP=10, hidden_sizes = (64,64), activation=tf.tanh, output_activation=tf.tanh, act_noise_amount=0.01, logger_kwargs=dict()):
        
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        self.EP_MAX = EP_MAX
        self.EP_LEN = EP_LEN
        self.GAMMA = GAMMA
        self.LR = LR
        self.BATCH = BATCH
        self.UPDATE_STEP = UPDATE_STEP
        self.S_DIM = env.observation_space.shape[-1]
        self.A_DIM = env.action_space.shape[-1]
        self.act_high = env.action_space.high
        self.act_low = env.action_space.low
        self.hidden_sizes = hidden_sizes
        self.activation = activation
        self.output_activation = output_activation
        self.act_noise_amount = act_noise_amount
        self.sess = tf.Session()
        self.tfs = tf.placeholder(tf.float32, [None, self.S_DIM], 'state')
        self.tfa = tf.placeholder(tf.float32, [None, self.A_DIM], 'action')
    
        # policy
        self.pi, self.pi_params = self._build_net('pi', trainable=True)

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.square(self.pi -  self.tfa))
        with tf.variable_scope('train'):
            self.train_op = tf.train.AdamOptimizer(LR).minimize(self.loss)
        tf.summary.FileWriter("log/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # Setup model saving
        self.logger.setup_tf_saver(self.sess, inputs={'x': self.tfs, 'a': self.tfa}, outputs={'pi': self.pi})
コード例 #4
0
ファイル: logger.py プロジェクト: matrl-project/matrl
 def __init__(self, config):
     self.log_dir = config["output_dir"]
     logger_kwargs = setup_logger_kwargs(config["exp_name"], config["seed"])
     logger_kwargs["output_dir"] = config["output_dir"]
     self.csv_logger = EpochLogger(**logger_kwargs)
     self.csv_logger.save_config(config)
     self.tf_logger = SummaryWriter(os.path.join(self.log_dir))
コード例 #5
0
    def __init__(self,
                 env_maker: Callable,
                 ac_maker=core.MLPActorCritic,
                 ac_kwargs={},
                 seed: int = 0,
                 epochs: int = 50,
                 steps_per_epoch: int = 4000,
                 gamma: float = 0.99,
                 actor_lr: float = 3e-4,
                 critic_lr: float = 1e-3,
                 num_iter_train_critic: int = 80,
                 lam: float = 0.97,
                 max_episode_len: int = 1000,
                 logger_kwargs=dict(),
                 save_freq: int = 10):
        # Special function to avoid certain slowdowns from PyTorch + MPI combo.
        setup_pytorch_for_mpi()
        # Set up logger and save configuration
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
        # Random seed
        seed += 10000 * proc_id()
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.epochs = epochs
        self.steps_per_epoch = steps_per_epoch
        self.num_iter_train_critic = num_iter_train_critic
        self.max_episode_len = max_episode_len
        self.save_freq = save_freq

        # make env
        self.env = env_maker()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape

        # make actor-critic
        self.ac = ac_maker(self.env.observation_space, self.env.action_space,
                           **ac_kwargs)

        # make buffer
        self.local_steps_per_epoch = int(steps_per_epoch / num_procs())
        self.buffer = Buffer(self.obs_dim, self.act_dim,
                             self.local_steps_per_epoch, gamma, lam)

        # make optimizers
        self.actor_optimizer = Adam(self.ac.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(self.ac.critic.parameters(), lr=critic_lr)

        # Sync params across processes
        sync_params(self.ac)
        # Count variables
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.actor, self.ac.critic])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                        var_counts)
        # Set up model saving
        self.logger.setup_pytorch_saver(self.ac)
コード例 #6
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAutoQuery')
     self.logger = EpochLogger(**logger_kwargs)
     self.logger.save_config(globals())
     self.init_query = False
     self.init_reward = False
     self.query_reward = 0
コード例 #7
0
    def __init__(self, env_str, seed=0, logger_kwargs=dict()):
        self.env_str = env_str
        print(self.env_str)

        tf.set_random_seed(seed)
        np.random.seed(seed)

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())
コード例 #8
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
class MultiTaskDDPGAugmentedOracle(MultiTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPGAugmentedOracle')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

    def process_state(self, state):
        query = self.reward_function(None, None, None, True)
        return np.append(state, query)
コード例 #9
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
class MultiTaskDDPG(SingleTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPG')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

        self.start_steps = 10000

    def reset(self, reward_function):
        self.reward_function = reward_function
コード例 #10
0
ファイル: evaluate.py プロジェクト: bdsaglam/LearnRL
def evaluate_agent(env,
                   agent: Agent,
                   deterministic=True,
                   num_episodes=5,
                   render=False,
                   logger=None):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    assert env.spec.max_episode_steps > 0

    if logger is None:
        logger = EpochLogger()

    episode_info = []
    goal_grid_code = None
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        reward = 0

        done = False
        episode_return = 0
        t = 0
        while not done:
            if render:
                env.render()
                time.sleep(1e-3)

            with torch.no_grad():
                action = agent.act(obs, reward, goal_grid_code, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1

        if done:
            goal_grid_code = agent.current_grid_code.detach().cpu().numpy()

        episode_info.append((t, episode_return))
        logger.store(TestEpRet=episode_return, TestEpLen=t)

    logger.log_tabular('EpisodeLimit', env.spec.max_episode_steps)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.dump_tabular()

    env.close()

    return episode_info
コード例 #11
0
def get_mc(env_set="Hopper-v2", seed=1, buffer_type="sacpolicy_env_stopcrt_2_det_bear", cut_buffer_size='1000K',
           gamma=0.99, rollout=1000, augment_mc=True,
		   logger_kwargs=dict()):

	print('MClength:', rollout)
	print('Discount value', gamma)

	device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	print("running on device:", device)

	global logger
	logger = EpochLogger(**logger_kwargs)
	logger.save_config(locals())

	if not os.path.exists("./results"):
		os.makedirs("./results")

	setting_name = "%s_r%s_g%s" % (buffer_type.replace('env', env_set), rollout, gamma)
	setting_name += 'noaug' if not (augment_mc) else ''
	print("---------------------------------------")
	print("Settings: " + setting_name)
	print("---------------------------------------")

	# Load buffer
	if 'sac' in buffer_type:
		replay_buffer = utils.BEAR_ReplayBuffer()
		desire_stop_dict = {'Hopper-v2': 1000, 'Walker2d-v2': 500, 'HalfCheetah-v2': 4000, 'Ant-v2': 750}
		buffer_name = buffer_type.replace('env', env_set).replace('crt', str(desire_stop_dict[env_set]))
		replay_buffer.load(buffer_name)
		buffer_name += '_1000K'
		setting_name = setting_name.replace('crt', str(desire_stop_dict[env_set]))
	elif 'FinalSigma' in buffer_type:
		replay_buffer = utils.ReplayBuffer()
		buffer_name = buffer_type.replace('env', env_set)
		replay_buffer.load(buffer_name)
	else:
		raise FileNotFoundError('! Unknown type of dataset %s' % buffer_type)


	print('Starting MC calculation, type:', augment_mc)

	if augment_mc == 'gain':

		states, gains = calculate_mc_gain(replay_buffer, rollout=rollout, gamma=gamma)
		if not os.path.exists('./results/ueMC_%s_S.npy' % buffer_name):
			np.save('./results/ueMC_%s_S' % (buffer_name + '_' + cut_buffer_size), states)
		np.save('./results/ueMC_%s_Gain' % setting_name, gains)
	else:
            raise Exception('! undefined mc calculation type')

	print('Calculation finished ==')
コード例 #12
0
ファイル: logger.py プロジェクト: matrl-project/matrl
class Logger:
    def __init__(self, config):
        self.log_dir = config["output_dir"]
        logger_kwargs = setup_logger_kwargs(config["exp_name"], config["seed"])
        logger_kwargs["output_dir"] = config["output_dir"]
        self.csv_logger = EpochLogger(**logger_kwargs)
        self.csv_logger.save_config(config)
        self.tf_logger = SummaryWriter(os.path.join(self.log_dir))
        # self.tf_logger.set_as_default()

    def store(self, data_dict, agent):
        for k, v in data_dict.items():
            if len(agent) > 1:
                for i in agent:
                    key = "{}_{}".format(k, i)
                    k_v = {key: v[i]}
                    self.csv_logger.store(**k_v)
            else:
                key = "{}_{}".format(k, agent[0])
                k_v = {key: v}
                self.csv_logger.store(**k_v)

    def dump(self, keys, agents, step, mean_only):
        for k in keys:
            for i in agents:
                key = "{}_{}".format(k, i)
                value = self.csv_logger.epoch_dict[key]

                self.csv_logger.log_tabular(key, average_only=mean_only)

                if mean_only:
                    self.tf_logger.add_scalar(key, np.mean(value), step)
                    # self.tf_logger.add_scalar(scalar)
                else:
                    for p, q in zip(
                        ["min", "mean", "max", "std"],
                        [
                            np.min(value),
                            np.mean(value),
                            np.max(value),
                            np.std(value)
                        ],
                    ):
                        # scalar = tf.compat.v1.summary.scalar("{}_{}".format(key, p), q, step)
                        self.tf_logger.add_scalar("{}_{}".format(key, p), q,
                                                  step)

        self.csv_logger.dump_tabular()
        self.tf_logger.flush()
コード例 #13
0
ファイル: meil-v2.py プロジェクト: johnhallman/reil-algorithm
def meil(WORKING_DIR, EXPERT_DIR, args):

    expert_distribution = Gaussian_Density()
    with tf.Session() as sess:
        env = gym.make(args.env)
        expert = load_policy(sess, EXPERT_DIR)
        expert_distribution.train(env, expert, args.trajects, args.distr_gamma,
                                  args.iter_length)
        env.close()
    expert_density = expert_distribution.density()

    env = gym.make(args.env)
    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    logger_kwargs = setup_logger_kwargs("result", data_dir=WORKING_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    for i in range(args.rounds):
        reward = lambda s: expert_density(s) / (density(s) + args.eps)

        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reuse = (i > 0)
        ppo(logger,
            reuse,
            message,
            lambda: gym.make(args.env),
            reward,
            actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
            gamma=args.gamma,
            steps_per_epoch=args.steps,
            epochs=args.epochs,
            logger_kwargs=logger_kwargs)

        with tf.Session() as sess:
            policy = load_policy(sess, os.path.join(WORKING_DIR, str(i)))
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    env.close()
    opt_dir = reward_validation(WORKING_DIR, args)
    return opt_dir
コード例 #14
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
class MultiTaskDDPGQuery(MultiTaskDDPG):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger_kwargs = setup_logger_kwargs('MultiTaskDDPGQuery')
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(globals())

    def process_state(self, state):
        query_state = np.zeros(17)
        query_state[0] = -.05
        next_state = np.zeros(17)
        next_state[0] = .007
        action = self.rng.rand(6)
        query = self.reward_function(query_state, action, next_state)
        return np.append(state, query)
コード例 #15
0
ファイル: main_get_mcret.py プロジェクト: lanyavik/BAIL
def get_mc(env_set="Hopper-v2",
           seed=0,
           buffer_type='FinalSAC_env_0_1000K',
           gamma=0.99,
           rollout=1000,
           augment_mc='gain',
           logger_kwargs=dict()):

    print('MClength:', rollout)
    print('Discount value', gamma)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    if not os.path.exists("./results"):
        os.makedirs("./results")

    setting_name = "%s_r%s_g%s" % (buffer_type.replace(
        'env', env_set), rollout, gamma)
    setting_name += 'noaug' if not (augment_mc) else ''
    print("---------------------------------------")
    print("Settings: " + setting_name)
    print("---------------------------------------")

    # Load buffer
    replay_buffer = utils.ReplayBuffer()
    buffer_name = buffer_type.replace('env', env_set)
    replay_buffer.load(buffer_name)

    print('Starting MC calculation, type:', augment_mc)

    if augment_mc == 'gain':
        states, gains = calculate_mc_gain(replay_buffer,
                                          rollout=rollout,
                                          gamma=gamma)

        if not os.path.exists('./results/ueMC_%s_S.npy' % buffer_name):
            np.save('./results/ueMC_%s_S' % buffer_name, states)
        print(len(gains))
        np.save('./results/ueMC_%s_Gain' % setting_name, gains)
    else:
        raise Exception('! undefined mc calculation type')

    print('Calculation finished ==')
コード例 #16
0
def evaluate_agent(env,
                   agent: IAgent,
                   deterministic=True,
                   num_episodes=5,
                   episode_len_limit=None,
                   render=False,
                   logger=None):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    if episode_len_limit is None:
        if env.unwrapped.spec and env.unwrapped.spec.max_episode_steps:
            episode_len_limit = env.spec.max_episode_steps
        else:
            raise ValueError("Episode length limit must be specified")

    if logger is None:
        logger = EpochLogger()

    episode_info = []
    for _ in range(num_episodes):
        obs = env.reset()
        agent.reset()
        done = False
        episode_return = 0
        t = 0
        while not done and t != episode_len_limit:
            if render:
                env.render()
                time.sleep(1e-3)

            with torch.no_grad():
                action = agent.act(obs, deterministic)
            obs, reward, done, _ = env.step(action)
            episode_return += reward
            t += 1
        episode_info.append((t, episode_return))
        logger.store(TestEpRet=episode_return, TestEpLen=t)

    logger.log_tabular('EpisodeLimit', episode_len_limit)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.dump_tabular()

    return episode_info
コード例 #17
0
ファイル: ddpg.py プロジェクト: shariqahn/UROPFall2020
    def __init__(self,
                 action_space,
                 observation_space,
                 rng,
                 eps=0.9,
                 discount_factor=0.99,
                 alpha=1e-3):
        self.rng = rng
        logger_kwargs = setup_logger_kwargs('SingleTaskDDPG', self.rng)
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.actor_critic = MLPActorCritic
        # ac_kwargs=dict() ****?????*****
        # seed=0
        self.replay_size = int(1e6)
        self.polyak = 0.995
        self.gamma = discount_factor
        self.pi_lr = alpha
        self.q_lr = alpha
        self.batch_size = 100
        self.start_steps = 10000
        self.update_after = 1000
        self.update_every = 50
        self.act_noise = 0.1

        self.step_count = 0
        self.action_space = action_space
        self.observation_space = observation_space
        # self.observation_space = spaces.Box(-np.inf, np.inf, shape=(17,), dtype=np.float32) #fix

        # torch.manual_seed(seed)
        # np.random.seed(seed)

        # self.obs_dim = self.observation_space.shape
        self.act_dim = self.action_space.shape[0]
        # act_dim = self.action_space.n

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.action_space.high[0]

        self.net = False
コード例 #18
0
 def __init__(self,
              agent,
              env,
              steps_per_epoch=5000,
              epochs=50,
              seed=0,
              max_ep_len=1000,
              start_steps=10000,
              replay_size=int(1e6),
              batch_size=100,
              n_test_episodes=10,
              output_dir=None,
              output_fname='progress.txt',
              exp_name=None):
     self.epoch_len, self.n_epochs = steps_per_epoch, epochs
     self.max_ep_len, self.start_steps = max_ep_len, start_steps
     self.n_test_episodes = n_test_episodes
     self.logger = EpochLogger(output_dir=output_dir,
                               output_fname=output_fname,
                               exp_name=exp_name)
     print('locals')
     for key, val in locals().items():
         print('{}: {}'.format(key, len(str(val))))
     # self.logger.save_config(locals())
     self.env, self.agent = env, agent
     self.buffer = OffPolicyBuffer(buffer_size=replay_size,
                                   epoch_size=steps_per_epoch,
                                   batch_size=batch_size)
     saver_kwargs = agent.build_graph(env.observation_space,
                                      env.action_space)
     self.logger.setup_tf_saver(**saver_kwargs)
     var_counts = tuple(
         tf_utils.trainable_count(scope) for scope in ['pi', 'q'])
     self.logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' %
                     var_counts)
     np.random.seed(seed)
     tf.set_random_seed(seed)
コード例 #19
0
def worker_test(ps, start_time):

    from spinup.utils.run_utils import setup_logger_kwargs

    logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
    logger = EpochLogger(**logger_kwargs)
    config = locals()
    del config['ps']
    logger.save_config(config)

    agent = Model(args)
    keys = agent.get_weights()[0]

    weights = ray.get(ps.pull.remote(keys))
    agent.set_weights(keys, weights)
    test_env = gym.make(args.env)
    while True:
        ave_ret = agent.test_agent(test_env, args)
        # print("test Average Ret:", ave_ret, "time:", time.time()-start_time)
        logger.log_tabular('AverageTestEpRet', ave_ret)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
        weights = ray.get(ps.pull.remote(keys))
        agent.set_weights(keys, weights)
コード例 #20
0
def sppo(args,
         env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=200,
         target_kl=0.01,
         logger_kwargs=dict(),
         save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    ###########
    if args.alpha == 'auto':
        target_entropy = 0.35

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=tf.log(0.2))
        alpha = tf.exp(log_alpha)
    else:
        alpha = args.alpha
    ###########

    # Main outputs from computation graph
    mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph,
                                                  **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, h]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    ######

    if args.alpha == 'auto':
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(-h + target_entropy)
        )  # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 )

        alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5)
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])

    ######

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)

    # For PPO
    # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # ### Scheme1: SPPO NO.2: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp)
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    # ### Scheme3: SPPO NO.3: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    ### Scheme2: SPPO NO.2: add entropy
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(
        tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h)

    v_loss = tf.reduce_mean((ret_ph - v)**2)  #+(ret_ph - q)**2)/2.0

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        h)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(
        learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss)
    # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            if args.alpha == 'auto':
                sess.run(train_alpha_op, feed_dict=inputs)
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # for _ in range(train_v_iters):
        #     sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old),
                     Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, h_t = sess.run(get_action_ops,
                                           feed_dict={x_ph: o.reshape(1, -1)})
            # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a})
            # SPPO NO.1: add entropy
            # rh = r - args.alpha * logp_t
            if args.alpha == 'auto':
                rh = r + sess.run(alpha) * h_t
            else:
                rh = r + alpha * h_t  # exact entropy

            # save and log
            buf.store(o, a, rh, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r

            ep_len += 1
            # d = False if ep_len == max_ep_len else d

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # # Save model
        # if (epoch % save_freq == 0) or (epoch == epochs-1):
        #     logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Alpha', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #21
0
ファイル: egl.py プロジェクト: eladsar/spinningup
def egl(env_fn,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        eps=0.4,
        n_explore=32,
        device='cuda',
        architecture='mlp',
        sample='on_policy'):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    if architecture == 'mlp':
        actor_critic = core.MLPActorCritic
    elif architecture == 'spline':
        actor_critic = core.SplineActorCritic
    else:
        raise NotImplementedError

    device = torch.device(device)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2, ac.geps])
    logger.log(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t geps: %d\n'
        % var_counts)

    n_samples = 100
    cmin = 0.25
    cmax = 1.75
    greed = 0.01
    rand = 0.01

    def max_reroute(o):

        b, _ = o.shape
        o = repeat_and_reshape(o, n_samples)
        with torch.no_grad():
            ai, _ = ac.pi(o)

            q1 = ac.q1(o, ai)
            q2 = ac.q2(o, ai)
            qi = torch.min(q1, q2).unsqueeze(-1)

        qi = qi.view(n_samples, b, 1)
        ai = ai.view(n_samples, b, act_dim)
        rank = torch.argsort(torch.argsort(qi, dim=0, descending=True),
                             dim=0,
                             descending=False)
        w = cmin * torch.ones_like(ai)
        m = int((1 - cmin) * n_samples / (cmax - cmin))

        w += (cmax - cmin) * (rank < m).float()
        w += ((1 - cmin) * n_samples - m * (cmax - cmin)) * (rank == m).float()

        w -= greed
        w += greed * n_samples * (rank == 0).float()

        w = w * (1 - rand) + rand

        w = w / w.sum(dim=0, keepdim=True)

        prob = torch.distributions.Categorical(probs=w.permute(1, 2, 0))

        a = torch.gather(ai.permute(1, 2, 0), 2,
                         prob.sample().unsqueeze(2)).squeeze(2)

        return a, (ai, w.mean(-1))

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # # Set up function for computing EGL mean-gradient-losses
    # def compute_loss_g(data):
    #
    #     o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
    #
    #     a2 = ball_explore(a1, n_explore, eps)
    #
    #     a2 = a2.view(n_explore * len(r), act_dim)
    #     o_expand = repeat_and_reshape(o, n_explore)
    #
    #     # Bellman backup for Q functions
    #     with torch.no_grad():
    #
    #         q1 = ac.q1(o_expand, a2)
    #         q2 = ac.q2(o_expand, a2)
    #         q_dither = torch.min(q1, q2)
    #
    #         # Target actions come from *current* policy
    #         a_tag, logp_a_tag = ac.pi(o_tag)
    #
    #         # Target Q-values
    #         q1_pi_targ = ac_targ.q1(o_tag, a_tag)
    #         q2_pi_targ = ac_targ.q2(o_tag, a_tag)
    #         q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
    #         q_anchor = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a_tag)
    #
    #         q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)
    #
    #     geps = ac.geps(o, a1)
    #     geps = repeat_and_reshape(geps, n_explore)
    #     a1 = repeat_and_reshape(a1, n_explore)
    #
    #     geps = (geps * (a2 - a1)).sum(-1)
    #     # l1 loss against Bellman backup
    #
    #     loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor)
    #
    #     # Useful info for logging
    #     g_info = dict(GVals=geps.flatten().detach().cpu().numpy())
    #
    #     return loss_g, g_info

    # Set up function for computing EGL mean-gradient-losses
    def compute_loss_g(data):
        o, a1, r, o_tag, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        a2 = ball_explore(a1, n_explore, eps)

        a2 = a2.view(n_explore * len(r), act_dim)
        o_expand = repeat_and_reshape(o, n_explore)

        # Bellman backup for Q functions
        with torch.no_grad():
            q1 = ac.q1(o_expand, a2)
            q2 = ac.q2(o_expand, a2)
            q_dither = torch.min(q1, q2)

            # Target actions come from *current* policy

            # Target Q-values
            q1 = ac.q1(o, a1)
            q2 = ac.q2(o, a1)
            q_anchor = torch.min(q1, q2)

            q_anchor = repeat_and_reshape(q_anchor, n_explore).squeeze(-1)

        geps = ac.geps(o, a1)
        geps = repeat_and_reshape(geps, n_explore)
        a1 = repeat_and_reshape(a1, n_explore)

        geps = (geps * (a2 - a1)).sum(-1)
        # l1 loss against Bellman backup

        loss_g = F.smooth_l1_loss(geps, q_dither - q_anchor)

        # Useful info for logging
        g_info = dict(GVals=geps.flatten().detach().cpu().numpy())

        return loss_g, g_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        geps_pi = ac.geps(o, pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - (geps_pi * pi).sum(-1)).mean()

        beta = autograd.Variable(pi.detach().clone(), requires_grad=True)
        q1_pi = ac.q1(o, beta)
        q2_pi = ac.q2(o, beta)
        qa = torch.min(q1_pi, q2_pi).unsqueeze(-1)

        grad_q = autograd.grad(outputs=qa,
                               inputs=beta,
                               grad_outputs=torch.cuda.FloatTensor(
                                   qa.size()).fill_(1.),
                               create_graph=False,
                               retain_graph=False,
                               only_inputs=True)[0]

        # Useful info for logging
        pi_info = dict(
            LogPi=logp_pi.detach().cpu().numpy(),
            GradGAmp=torch.norm(geps_pi, dim=-1).detach().cpu().numpy(),
            GradQAmp=torch.norm(grad_q, dim=-1).detach().cpu().numpy(),
            GradDelta=torch.norm(geps_pi - grad_q,
                                 dim=-1).detach().cpu().numpy(),
            GradSim=F.cosine_similarity(geps_pi, grad_q,
                                        dim=-1).detach().cpu().numpy(),
        )

        return loss_pi, pi_info

    if architecture == 'mlp':
        # Set up optimizers for policy and q-function
        pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
        q_optimizer = Adam(q_params, lr=lr)
        g_optimizer = Adam(ac.geps.parameters(), lr=lr)
    elif architecture == 'spline':
        # Set up optimizers for policy and q-function
        pi_optimizer = SparseDenseAdamOptimizer(ac.pi,
                                                dense_args={'lr': lr},
                                                sparse_args={'lr': 10 * lr})
        q_optimizer = SparseDenseAdamOptimizer([ac.q1, ac.q2],
                                               dense_args={'lr': lr},
                                               sparse_args={'lr': 10 * lr})
        g_optimizer = SparseDenseAdamOptimizer(ac.geps,
                                               dense_args={'lr': lr},
                                               sparse_args={'lr': 10 * lr})
    else:
        raise NotImplementedError

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):

        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Next run one gradient descent step for the mean-gradient
        g_optimizer.zero_grad()
        loss_g, g_info = compute_loss_g(data)
        loss_g.backward()
        g_optimizer.step()

        # Record things
        logger.store(LossG=loss_g.item(), **g_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in ac.geps.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in ac.geps.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action_on_policy(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    def get_action_rbi(o, deterministic=False):
        o = torch.as_tensor(o, dtype=torch.float32, device=device)
        if deterministic:
            a = ac.act(o, deterministic)
        else:
            o = o.unsqueeze(0)
            a, _ = max_reroute(o)
            a = a.flatten().cpu().numpy()
        return a

    if sample == 'on_policy':
        get_action = get_action_on_policy
    elif sample == 'rbi':
        get_action = get_action_rbi
    else:
        raise NotImplementedError

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in tqdm(range(total_steps)):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)

            logger.log_tabular('GVals', with_min_and_max=True)
            logger.log_tabular('LossG', with_min_and_max=True)
            logger.log_tabular('GradGAmp', with_min_and_max=True)
            logger.log_tabular('GradQAmp', with_min_and_max=True)
            logger.log_tabular('GradDelta', with_min_and_max=True)
            logger.log_tabular('GradSim', with_min_and_max=True)

            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #22
0
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space below.                     #
    #                                                                         #
    #=========================================================================#

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)
    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing, by adding clipped noise to target actions
        pi_noise_targ = get_pi_noise_clipped(pi,
                                             noise_scale=target_noise,
                                             noise_clip=noise_clip,
                                             act_limit=act_limit)
        # Target Q-values, using action from smoothed target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, pi_noise_targ,
                                              **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    q_targ = get_q_target(q1_targ, q2_targ, r_ph, d=d_ph, gamma=0.99)
    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.losses.mean_squared_error(q_targ, q1)
    q2_loss = tf.losses.mean_squared_error(q_targ, q2)
    q_loss = q1_loss + q2_loss

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space above.                     #
    #                                                                         #
    #=========================================================================#

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #23
0
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(),  seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """
    Vanilla Policy Gradient 

    (with GAE-Lambda for advantage estimation)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    # obs_dim = env.observation_space.n
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing VPG policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        loss_pi = -(logp * adv).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            
            bayes_kl_loss = 0.
            if isinstance(ac.v, BayesMLPCritic):
                bayes_kl_loss = ac.v.compute_kl()

            total_loss_v = loss_v + bayes_kl_loss / data['obs'].shape[0]
            total_loss_v.backward()
            
            mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old),
                     BayesKL=bayes_kl_loss)

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    epoch_reward = []
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)
            
            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t==local_steps_per_epoch-1

            if terminal or epoch_ended:
                if epoch_ended and not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    epoch_reward.append(ep_ret)  
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0


        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        if epoch % 10 == 0:
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('BayesKL', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
    
    return epoch_reward
コード例 #24
0
def cvi_ad(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, alp = 0.8,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1, decay = None, squash = False):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. 
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    adv_ph = tf.placeholder(dtype = tf.float32, shape = (None,))
    alp_ph = tf.placeholder(dtype =  tf.float32)
    t_step = tf.placeholder(dtype = tf.float32)
    #adv_ph1 = tf.placeholder(dtype = tf.float32, shape = (None,))
    #adv_ph2 = tf.placeholder(dtype = tf.float32, shape = (None,))

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, ad1, ad2, ad1_pi, ad2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
    
    # Target value network
    with tf.variable_scope('target'):
        _, _, _, ad1_targ, ad2_targ, _, _, v_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)
    
    squash_eps = 1e-2
    if squash:
        print("Squashed")
        squash_func = lambda x: tf.sign(x) * (tf.sqrt(tf.abs(x) + 1) - 1) + x * squash_eps
        squash_ifunc = lambda x: tf.sign(x) * ((tf.sqrt(1 + 4 * squash_eps * (tf.abs(x) + 1 + squash_eps)) - 1)** 2 * (1 / (2 * squash_eps))** 2 - 1)
    else:
        print ("Not Squashed")
        squash_func = lambda x: x
        squash_ifunc = lambda x: x
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    q1 = v + ad1
    q2 = v + ad2
    q1_pi = v + ad1_pi
    q2_pi = v + ad2_pi

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(squash_func(r_ph + gamma*(1-d_ph)*squash_ifunc(v_targ) + alp_ph * adv_ph))
    #q_backup1 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph1)
    #q_backup2 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph2)

    v_backup = tf.stop_gradient(squash_func(squash_ifunc(min_q_pi) - alpha * logp_pi))

    # Soft actor-critic losses
    #alp = tf.Variable(0.2,dtype=tf.float32)
    #q_min = tf.minimum(q1,q2)
    pi_loss = tf.reduce_mean(alpha * logp_pi - squash_ifunc(min_q_pi))
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op 
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') 
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main') , get_vars('target'))])
        # target_update = tf.group([tf.assign(v_targ, tf.cond(tf.not_equal(t_step%1000,0), lambda: v_targ, lambda: v_main))
        #                           for v_main, v_targ in zip(get_vars('main') , get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
                train_pi_op, train_value_op, target_update]
    
    # adv_op = squash_ifunc(tf.minimum(q1_targ, q2_targ))-squash_ifunc(v_targ)
    adv_op = squash_ifunc(tf.minimum(ad1_targ, ad2_targ))
    #adv_op1 = q1_targ-v_targ
    #adv_op2 = q2_targ-v_targ

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs


    if decay:
       alp_val = 0.2
    else:
       alp_val = alp

    update_step = 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                update_step+=1
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x2_ph: batch['obs1'],
                             a_ph: batch['acts']
                            }
                advantage = sess.run(adv_op , feed_dict)
                #advantage = sess.run([adv_op1, adv_op2] , feed_dict)
          
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             t_step: update_step,
                             adv_ph : advantage,
                             alp_ph : alp_val
                             #adv_ph1 : advantage[0],
                             #adv_ph2 : advantage[1]
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            if decay:
               alp_val = eval(decay)(t//steps_per_epoch)

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            logger.log_tabular('VVals', with_min_and_max=True) 
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #25
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    maxRev = float("-inf")  #negative infinity in the beginning
    #maxRevActionSeq=[]
    maxRevTSTT = 0
    maxRevRevenue = 0
    maxRevThroughput = 0
    maxRevJAH = 0
    maxRevRemVeh = 0
    maxRevJAH2 = 0
    maxRevRMSE_MLvio = 0
    maxRevPerTimeVio = 0
    maxRevHOTDensity = pd.DataFrame()
    maxRevGPDensity = pd.DataFrame()
    maxtdJAHMax = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu
            numpyFromA = np.array(a[0])
            numpyFromA = ((numpyFromA + 1.0) *
                          (env.state.tollMax - env.state.tollMin) /
                          2.0) + env.state.tollMin
            a[0] = np.ndarray.tolist(numpyFromA)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    #get other stats and store them too
                    otherStats = env.getAllOtherStats()
                    if np.any(np.isnan(np.array(otherStats))):
                        sys.exit("Nan found in statistics! Error")
                    logger.store(EpTSTT=otherStats[0],
                                 EpRevenue=otherStats[1],
                                 EpThroughput=otherStats[2],
                                 EpJAH=otherStats[3],
                                 EpRemVeh=otherStats[4],
                                 EpJAH2=otherStats[5],
                                 EpMLViolRMSE=otherStats[6],
                                 EpPerTimeVio=otherStats[7],
                                 EptdJAHMax=otherStats[8])
                    #determine max rev profile
                    if ep_ret > maxRev:
                        maxRev = ep_ret
                        maxRevActionSeq = env.state.tollProfile
                        maxRevTSTT = otherStats[0]
                        maxRevRevenue = otherStats[1]
                        maxRevThroughput = otherStats[2]
                        maxRevJAH = otherStats[3]
                        maxRevRemVeh = otherStats[4]
                        maxRevJAH2 = otherStats[5]
                        maxRevRMSE_MLvio = otherStats[6]
                        maxRevPerTimeVio = otherStats[7]
                        maxRevHOTDensity = env.getHOTDensityData()
                        maxRevGPDensity = env.getGPDensityData()
                        maxtdJAHMax = otherStats[8]
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    print("Max cumulative reward obtained= %f " % maxRev)
    print(
        "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f"
        %
        (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
         maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [
        maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH,
        maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio,
        maxtdJAHMax
    ]
    #print("\n===Max rev action sequence is\n",maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
コード例 #26
0
def td3(env_fn: Callable,
        actor_critic: torch.nn.Module = core.MLPActorCritic,
        ac_kwargs: Dict = None,
        seed: int = 0,
        steps_per_epoch: int = 4000,
        epochs: int = 2000,
        replay_size: int = int(1e6),
        gamma: float = 0.99,
        polyak: float = 0.995,
        pi_lr: Union[Callable, float] = 1e-3,
        q_lr: Union[Callable, float] = 1e-3,
        batch_size: int = 100,
        start_steps: int = 10000,
        update_after: int = 1000,
        update_every: int = 100,
        act_noise: Union[Callable, float] = 0.1,
        target_noise: float = 0.2,
        noise_clip: float = 0.5,
        policy_delay: int = 2,
        num_test_episodes: int = 3,
        max_ep_len: int = 1000,
        logger_kwargs: Dict = None,
        save_freq: int = 1,
        random_exploration: Union[Callable, float] = 0.0,
        save_checkpoint_path: str = None,
        load_checkpoint_path: str = None,
        load_model_file: str = None):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float or callable): Learning rate for policy.

        q_lr (float or callable): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float or callable): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        random_exploration (float or callable): Probability to randomly select
            an action instead of selecting from policy.

        save_checkpoint_path (str): Path to save the model. If not set, no model
            will be saved

        load_checkpoint_path (str): Path to load the model. Cannot be set if
            save_model_path is set.
    """
    if logger_kwargs is None:
        logger_kwargs = dict()
    if ac_kwargs is None:
        ac_kwargs = dict()

    if save_checkpoint_path is not None:
        assert load_checkpoint_path is None, "load_model_path cannot be set when save_model_path is already set"
        if not os.path.exists(save_checkpoint_path):
            print(f"Folder {save_checkpoint_path} does not exist, creating...")
            os.makedirs(save_checkpoint_path)

    if load_checkpoint_path is not None:
        assert load_model_file is None, "load_checkpoint_path cannot be set when load_model_file is already set"
    # ------------ Initialisation begin ------------
    loaded_state_dict = None
    if load_checkpoint_path is not None:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())
        loaded_state_dict = load_latest_state_dict(load_checkpoint_path)

        logger.epoch_dict = loaded_state_dict['logger_epoch_dict']
        q_learning_rate_fn = loaded_state_dict['q_learning_rate_fn']
        pi_learning_rate_fn = loaded_state_dict['pi_learning_rate_fn']
        epsilon_fn = loaded_state_dict['epsilon_fn']
        act_noise_fn = loaded_state_dict['act_noise_fn']
        replay_buffer = loaded_state_dict['replay_buffer']
        env, test_env = loaded_state_dict['env'], loaded_state_dict['test_env']
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
        ac_targ = deepcopy(ac)
        ac.load_state_dict(loaded_state_dict['ac'])
        ac_targ.load_state_dict(loaded_state_dict['ac_targ'])
        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape[0]
        env.action_space.np_random.set_state(
            loaded_state_dict['action_space_state'])

        # List of parameters for both Q-networks (save this for convenience)
        q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
        t_ori = loaded_state_dict['t']
        pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(t_ori))
        pi_optimizer.load_state_dict(loaded_state_dict['pi_optimizer'])
        q_optimizer = Adam(q_params, lr=q_learning_rate_fn(t_ori))
        q_optimizer.load_state_dict(loaded_state_dict['q_optimizer'])
        np.random.set_state(loaded_state_dict['np_rng_state'])
        torch.set_rng_state(loaded_state_dict['torch_rng_state'])

    else:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)

        q_learning_rate_fn = get_schedule_fn(q_lr)
        pi_learning_rate_fn = get_schedule_fn(pi_lr)
        act_noise_fn = get_schedule_fn(act_noise)
        epsilon_fn = get_schedule_fn(random_exploration)

        env, test_env = env_fn(), env_fn()
        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape[0]

        env.action_space.seed(seed)

        # Experience buffer
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     size=replay_size)

        # Create actor-critic module and target networks
        if load_model_file is not None:
            assert os.path.exists(
                load_model_file
            ), f"Model file path does not exist: {load_model_file}"
            ac = torch.load(load_model_file)
        else:
            ac = actor_critic(env.observation_space, env.action_space,
                              **ac_kwargs)
        ac_targ = deepcopy(ac)

        # List of parameters for both Q-networks (save this for convenience)
        q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

        # Set up optimizers for policy and q-function
        pi_optimizer = Adam(ac.pi.parameters(), lr=pi_learning_rate_fn(0))
        q_optimizer = Adam(q_params, lr=q_learning_rate_fn(0))
        t_ori = 0

    act_limit = 1.0

    # ------------ Initialisation end ------------

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    torch.set_printoptions(profile="default")

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            pi_targ = ac_targ.pi(o2)
            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.detach().numpy(),
                         Q2Vals=q2.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize it at next DDPG step.
            for p in q_params:
                p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for _ in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                scaled_action = get_action(o, 0)
                o, r, d, _ = test_env.step(
                    unscale_action(env.action_space, scaled_action))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    if loaded_state_dict is not None:
        o = loaded_state_dict['o']
        ep_ret = loaded_state_dict['ep_ret']
        ep_len = loaded_state_dict['ep_len']
    else:
        o, ep_ret, ep_len = env.reset(), 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        t += t_ori
        # printMemUsage(f"start of step {t}")
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps and np.random.rand() > epsilon_fn(t):
            a = get_action(o, act_noise_fn(t))
            unscaled_action = unscale_action(env.action_space, a)
        else:
            unscaled_action = env.action_space.sample()
            a = scale_action(env.action_space, unscaled_action)
        # Step the env
        o2, r, d, _ = env.step(unscaled_action)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, timer=j)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            # Perform LR decay
            update_learning_rate(q_optimizer, q_learning_rate_fn(t))
            update_learning_rate(pi_optimizer, pi_learning_rate_fn(t))
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model and checkpoint
            save_checkpoint = False
            checkpoint_path = ""
            if save_checkpoint_path is not None:
                save_checkpoint = True
                checkpoint_path = save_checkpoint_path
            if load_checkpoint_path is not None:
                save_checkpoint = True
                checkpoint_path = load_checkpoint_path
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({}, None)

                if save_checkpoint:
                    checkpoint_file = os.path.join(checkpoint_path,
                                                   f'save_{epoch}.pt')
                    torch.save(
                        {
                            'ac':
                            ac.state_dict(),
                            'ac_targ':
                            ac_targ.state_dict(),
                            'replay_buffer':
                            replay_buffer,
                            'pi_optimizer':
                            pi_optimizer.state_dict(),
                            'q_optimizer':
                            q_optimizer.state_dict(),
                            'logger_epoch_dict':
                            logger.epoch_dict,
                            'q_learning_rate_fn':
                            q_learning_rate_fn,
                            'pi_learning_rate_fn':
                            pi_learning_rate_fn,
                            'epsilon_fn':
                            epsilon_fn,
                            'act_noise_fn':
                            act_noise_fn,
                            'torch_rng_state':
                            torch.get_rng_state(),
                            'np_rng_state':
                            np.random.get_state(),
                            'action_space_state':
                            env.action_space.np_random.get_state(),
                            'env':
                            env,
                            'test_env':
                            test_env,
                            'ep_ret':
                            ep_ret,
                            'ep_len':
                            ep_len,
                            'o':
                            o,
                            't':
                            t + 1
                        }, checkpoint_file)
                    delete_old_files(checkpoint_path, 10)
コード例 #27
0
def vpg(env_fn, actor_critic=core.ActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4,
        vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A reference to ActorCritic class which after instantiation
            takes an input ``x``, and action, ``a``, and returns:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a``
                                           | in states ``x``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x``. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # https://pytorch.org/docs/master/notes/randomness.html#cudnn
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Actor Critic model instance
    actor_critic = actor_critic(obs_dim, **ac_kwargs)
    actor_critic.to(device) # load to cpu/gpu

    # Count variables
    var_counts = tuple(core.count_vars(model) for model in [actor_critic.policy, actor_critic.value])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # Optimizers
    train_pi = optim.Adam(actor_critic.policy.parameters(), lr=pi_lr)
    train_v = optim.Adam(actor_critic.value.parameters(), lr=vf_lr)

    # Sync params across processes
    # sync_all_params() # TODO figure out the way to do use MPI for pytorch

    def update():
        actor_critic.train()
        obs, act, adv, ret, logp_old = map(lambda x: Tensor(x).to(device), buf.get())

        _ , logp, _, val = actor_critic(obs, act)

        ent = (-logp).mean()

        # VPG objectives
        pi_loss = -(logp * adv).mean()
        v_l_old = ((ret - val)**2).mean()

        # Policy gradient step
        train_pi.zero_grad()
        pi_loss.backward()
        train_pi.step()

        # Value function learning
        for _ in range(train_v_iters):
            val = actor_critic.value(obs)
            v_loss = (ret - val).pow(2).mean()
            train_v.zero_grad()
            v_loss.backward()
            train_v.step()

        actor_critic.eval()

        # Log changes from update
        _, logp, _, val = actor_critic(obs, act)
        pi_l_new = -(logp * adv).mean()
        v_l_new = ((ret - val)**2).mean()
        kl = (logp_old - logp).mean()

        logger.store(LossPi=pi_loss, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_loss),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, logp_t, logp_pi_t, v_t = actor_critic(Tensor(o.reshape(1,-1)).to(device))

            # save and log
            buf.store(o, a.cpu().numpy(), r, v_t.item(), logp_pi_t.cpu().detach().numpy())
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a.cpu().numpy())
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t==local_steps_per_epoch-1):
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else actor_critic(Tensor(o.reshape(1,-1)).to(device))[-1].item()
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, actor_critic, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
コード例 #28
0
def bc_ue_ptb_learn(env_set="Hopper-v2",
                    seed=0,
                    buffer_type="FinalSigma0.5",
                    buffer_seed=0,
                    buffer_size='1000K',
                    cut_buffer_size='1000K',
                    mcue_seed=1,
                    qloss_k=10000,
                    qgt_seed=0,
                    qlearn_type='learn_all_data',
                    border=0.75,
                    clip=0.85,
                    update_type='e',
                    eval_freq=float(1e3),
                    max_timesteps=float(1e6),
                    lr=1e-3,
                    lag_lr=1e-3,
                    search_lr=3e-2,
                    wd=0,
                    epsilon_base=1,
                    logger_kwargs=dict()):
    """parameters |max_timesteps|, |eval_freq|:
       for BC_ue_border_perturb_c, Totalsteps means the number of minibatch updates (default batch size=100)
       for BC_ue_border_perturb_5,
       for BC_ue_border_perturb_e, Totalsteps means the number of updates on each datapoint, i.e., a step is
                                   an iteration of one optimization step on each data in the buffer"""

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)
    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "BCue_per_e_%s_%s" % (env_set, seed)
    buffer_name = "%s_%s_%s_%s" % (buffer_type, env_set, buffer_seed,
                                   buffer_size)
    setting_name = "%s_r%s_g%s" % (buffer_name, 1000, 0.99)
    print
    ("---------------------------------------")
    print
    ("Settings: " + setting_name)
    print
    ("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    action_range = float(env.action_space.high[0]) - float(
        env.action_space.low[0])
    print('env', env_set, 'action range:', action_range)

    # Print out config used in MC Upper Envelope training
    rollout_list = [None, 1000, 200, 100, 10]
    k_list = [10000, 1000, 100]
    print('testing MClength:', rollout_list[mcue_seed % 10])
    print('Training loss ratio k:', k_list[mcue_seed // 10])

    selection_info = 'ue_border%s' % border
    selection_info += '_clip%s' % clip if clip is not None else ''
    print('selection_info:', selection_info)
    # Load the ue border selected buffer
    selected_buffer = utils.SARSAReplayBuffer()
    if buffer_size != cut_buffer_size:
        buffer_name = buffer_name + '_cutfinal' + cut_buffer_size

    selected_buffer.load(selection_info + '_' + buffer_name)
    buffer_length = selected_buffer.get_length()
    print(buffer_length)
    print('buffer setting:', selection_info + '_' + buffer_name)

    # Load the Q net trained with regression on Gts
    # And Load the corresponding Gts to the selected buffer
    selected_gts = np.load('./results/sele%s_ueMC_%s_Gt.npy' %
                           (selection_info, setting_name),
                           allow_pickle=True)

    if qlearn_type == 'learn_all_data':
        verbose_qnet = 'alldata_qgts%s' % qgt_seed + 'lok=%s' % qloss_k

    elif qlearn_type == 'learn_border_data':
        verbose_qnet = 'uebor%s_qgts%s' % (border, qgt_seed) if clip is None \
                       else 'uebor%s_clip%s_qgts%s' % (border, clip, qgt_seed)
        verbose_qnet += 'lok=%s' % qloss_k
    else:
        raise ValueError

    print('verbose_qnet:', verbose_qnet)

    Q_from_gt = QNet(state_dim, action_dim, activation='relu')
    Q_from_gt.load_state_dict(
        torch.load('%s/%s_Qgt.pth' %
                   ("./pytorch_models", setting_name + '_' + verbose_qnet)))
    print('load Qnet from',
          '%s/%s_UE.pth' % ("./pytorch_models", setting_name))

    # choose the epsilon plan for the constraints
    if update_type == 'c':
        epsilon = epsilon_plan(epsilon_base, action_range, selected_buffer, selected_gts, Q_from_gt, device,\
                               plan='common')
    else:
        epsilon = torch.FloatTensor([epsilon_base])
        print('one epsilon:', epsilon)

    print('policy train starts --')
    '''Initialize policy of the update type'''
    print("Updating approach: BC_ue_border_perturb_%s" % update_type)
    if update_type == "c":
        policy = BC_ue_border_perturb_c.BC_ue_perturb(state_dim, action_dim, max_action,\
                     lr=lr, lag_lr=lag_lr, wd=wd, num_lambda=buffer_length, Q_from_gt=Q_from_gt )
    elif update_type == "5":
        policy = BC_ue_border_perturb_5.BC_ue_perturb(state_dim, action_dim, max_action, \
                                                      lr=lr, lag_lr=lag_lr, wd=wd, Q_from_gt=Q_from_gt)
    elif update_type == "e":
        policy = BC_ue_border_perturb_e.BC_ue_perturb(state_dim, action_dim, max_action, \
                                                      lr=lr, wd=wd, Q_from_gt=Q_from_gt)
        policy.train_a_tilda(selected_buffer,
                             max_updates=50,
                             search_lr=search_lr,
                             epsilon=epsilon)

    episode_num = 0
    done = True

    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += 1
        if update_type == 'e':
            pol_vals = policy.behavioral_cloning(iterations=int(eval_freq),
                                                 logger=logger)
        else:  # "5" and "c"
            pol_vals = policy.train(selected_buffer,
                                    iterations=int(eval_freq),
                                    epsilon=epsilon,
                                    logger=logger)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)

        if update_type == 'c':
            logger.log_tabular('BCLoss', average_only=True)
            logger.log_tabular('ActorLoss', average_only=True)
            logger.log_tabular('LambdaMax', average_only=True)
            logger.log_tabular('LambdaMin', average_only=True)
            logger.log_tabular('ConstraintViolated', with_min_and_max=True)
        elif update_type == '5':
            logger.log_tabular('BCLoss', average_only=True)
            logger.log_tabular('ActorLoss', average_only=True)
            logger.log_tabular('Lambda', average_only=True)
            logger.log_tabular('ConstraintViolatedValue', average_only=True)

        elif update_type == 'e':
            logger.log_tabular('BCLoss', average_only=True)

        logger.dump_tabular()
コード例 #29
0
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #30
0
def vpg(env_fn, actor_critic=tabular_actor_critic.TabularVPGActorCritic,
        n_episodes=100, env_kwargs={}, logger_kwargs={}, ac_kwargs={},
        n_test_episodes=100, gamma=0.99, lam=0.95, bootstrap_n=3):
    """
    Environment has discrete observation and action spaces, both
    low dimensional so policy and value functions can be stored
    in table.

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic : The constructor method for an actor critic class
            with an ``act`` method, and attributes ``pi`` and ``v``.

        n_episodes (int): Number of episodes/rollouts of interaction (equivalent
            to number of policy updates) to perform.

        bootstrap_n (int) : (optional) Number of reward steps to use with a bootstrapped
            approximate Value function.  If None, use GAE-lambda advantage estimation.
    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    log_wandb = logger_kwargs.get('output_dir').startswith('wandb')

    env = env_fn(**env_kwargs)
    test_env = env_fn(**env_kwargs)

    obs_dim = env.observation_space.n
    act_dim = env.action_space.n

    ac = actor_critic(obs_dim, act_dim, **ac_kwargs)

    def test_agent():
        o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0

        episode = 0
        while episode < n_test_episodes:
            a, _ = ac.step(o)
            o2, r, d, _ = test_env.step(a)
            test_ep_ret += r
            test_ep_len += 1

            o = o2

            if d is True:
                logger.store(TestEpRet=test_ep_ret)
                logger.store(TestEpLen=test_ep_len)
                episode += 1
                o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0

    traj = Trajectory(gamma, lam, bootstrap_n)

    # Run test agent before any training happens
    episode = 0
    test_agent()
    print('Mean test returns from random agent:', np.mean(logger.epoch_dict['TestEpRet']), flush=True)
    logger.log_tabular('Epoch', episode)
    logger.log_tabular('TestEpRet', with_min_and_max=True)
    logger.log_tabular('TestEpLen', with_min_and_max=True)
    # Hack logger values for compatibility with main logging header keys
    logger.log_tabular('EpRet', 0)
    logger.log_tabular('EpLen', 0)
    logger.log_tabular('AverageVVals', 0)
    logger.log_tabular('MaxVVals', 0)
    logger.log_tabular('MinVVals', 0)
    logger.log_tabular('StdVVals', 0)
    logger.log_tabular('TotalEnvInteracts', 0)
    if log_wandb:
        wandb.log(logger.log_current_row, step=episode)
    logger.dump_tabular()

    episode += 1
    o, ep_ret, ep_len = env.reset(), 0, 0
    total_env_interacts = 0

    while episode < n_episodes:
        a, v = ac.step(o)
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        total_env_interacts += 1

        traj.store(o, a, r, v)
        logger.store(VVals=v)

        o = o2

        if d is True:
            traj.finish_path(last_obs=o, last_val=0)
            ac.update(traj)
            test_agent()

            logger.log_tabular('Epoch', episode)
            logger.log_tabular('EpRet', ep_ret)
            logger.log_tabular('EpLen', ep_len)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TestEpLen', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', total_env_interacts)
            if log_wandb:
                wandb.log(logger.log_current_row, step=episode)
            logger.dump_tabular()

            traj.reset()
            episode += 1
            o, ep_ret, ep_len = env.reset(), 0, 0

    print('pi', ac.pi, flush=True)
    print('logits_pi', ac.logits_pi, flush=True)
    print('value', ac.V, flush=True)
    if isinstance(ac, tabular_actor_critic.TabularReturnHCA) or isinstance(ac, tabular_actor_critic.TabularStateHCA):
        print('h', ac.h, flush=True)