class AntAgent:

    def __init__(self, render=False, model=None):
        # create an environment
        self.environment = gym.make('MountainCarContinuous-v0')
        # reset environment when an agent is initialized
        self.current_observation = self.reset_environment()
        self.render = render
        self.model = model

        self.buffer = ReplayBuffer()

    def reset_environment(self):
        current_observation = self.environment.reset()
        return current_observation

    def get_action(self, current_observation):
        """Fetch an action according to model policy"""
        if self.model is None:
            action = self.environment.action_space.sample()
        else:
            action = self.model.predict(current_observation)
        return action

    def get_transitions(self, action):
        """Take one step in the environment and return the observations"""
        next_observation, reward, done, _ = self.environment.step(action)
        if self.render:
            self.environment.render()
        return next_observation, reward, done

    def run_episode(self, num_episodes=1):
        """run episodes `num_episodes` times using `model` policy"""
        for episode in range(num_episodes):
            self.current_observation = self.reset_environment()
            episode_id = self.buffer.create_episode()

            done = False
            transition = dict()

            while not done:
                transition['current_observation'] = self.current_observation
                transition['action'] = self.get_action(self.current_observation)
                transition['next_observation'], transition['reward'], done = self.get_transitions(transition['action'])

                self.buffer.add_sample(episode_id, transition)

            self.buffer.add_episode(episode_id)

    def learn(self, step=0, restore=False):
        """Train SAC model using transitions in replay buffer"""
        if self.model is None:
            raise Exception("This agent has no brain! Add a model which implements fit() function to train.")

        # Sample array of transitions from replay buffer.
        transition_matrices = self.buffer.fetch_sample()

        if step != 0:
            restore = True

        # Fit the SAC model.
        self.model.fit(transition_matrices, restore=restore, global_step=step)
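
The AntAgent above relies on a ReplayBuffer exposing create_episode(), add_sample(episode_id, transition), add_episode(episode_id) and fetch_sample(). The original class is not shown here; the following is only a minimal sketch of that assumed interface, not the actual implementation.

# Hypothetical episode-structured replay buffer matching the calls made by
# AntAgent above; an illustrative assumption, not the original class.
import random


class ReplayBuffer:

    def __init__(self, max_episodes=1000):
        self.episodes = {}       # episode_id -> list of transition dicts
        self.completed = []      # ids of finished episodes
        self.max_episodes = max_episodes
        self._next_id = 0

    def create_episode(self):
        """Open a new episode and return its id."""
        episode_id = self._next_id
        self._next_id += 1
        self.episodes[episode_id] = []
        return episode_id

    def add_sample(self, episode_id, transition):
        # Copy so the caller may keep reusing its `transition` dict.
        self.episodes[episode_id].append(dict(transition))

    def add_episode(self, episode_id):
        """Mark the episode as complete, evicting the oldest one if full."""
        self.completed.append(episode_id)
        if len(self.completed) > self.max_episodes:
            del self.episodes[self.completed.pop(0)]

    def fetch_sample(self, batch_size=64):
        """Return a random batch of transitions from completed episodes."""
        transitions = [t for eid in self.completed for t in self.episodes[eid]]
        return random.sample(transitions, min(batch_size, len(transitions)))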
Example #2
class Agent(object):
    def __init__(self, computation_graph_args, sample_trajectory_args, estimate_return_args):
        super(Agent, self).__init__()
        self.ob_dim = computation_graph_args['ob_dim']
        self.ac_dim = computation_graph_args['ac_dim']
        self.task_dim = computation_graph_args['task_dim']
        self.reward_dim = 1
        self.terminal_dim = 1

        self.meta_ob_dim = self.ob_dim + self.ac_dim + self.reward_dim + self.terminal_dim
        self.scope = 'continuous_logits'
        self.size = computation_graph_args['size']
        self.gru_size = computation_graph_args['gru_size']
        self.n_layers = computation_graph_args['n_layers']
        self.learning_rate = computation_graph_args['learning_rate']
        self.history = computation_graph_args['history']
        self.num_value_iters = computation_graph_args['num_value_iters']
        self.l2reg = computation_graph_args['l2reg']
        self.recurrent = computation_graph_args['recurrent']

        self.animate = sample_trajectory_args['animate']
        self.max_path_length = sample_trajectory_args['max_path_length']
        self.min_timesteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']
        self.generalized = sample_trajectory_args['generalized']
        self.granularity = sample_trajectory_args['granularity']

        self.gamma = estimate_return_args['gamma']
        self.nn_critic = estimate_return_args['nn_critic']
        self.normalize_advantages = estimate_return_args['normalize_advantages']

        self.replay_buffer = ReplayBuffer(100000, [self.history, self.meta_ob_dim], [self.ac_dim], self.gru_size, self.task_dim)
        self.val_replay_buffer = ReplayBuffer(100000, [self.history, self.meta_ob_dim], [self.ac_dim], self.gru_size, self.task_dim)

    def init_tf_sess(self):
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        tf_config.gpu_options.allow_growth = True # may be needed if using a GPU
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__() # equivalent to `with self.sess:`
        tf.global_variables_initializer().run() #pylint: disable=E1101

    def define_placeholders(self):
        """
        placeholders for batch observations / actions / advantages in the policy gradient
        loss function.
        see Agent.build_computation_graph for notation

        returns:
            sy_ob_no: placeholder for meta-observations
            sy_ac_na: placeholder for actions
            sy_adv_n: placeholder for advantages
            sy_hidden: placeholder for RNN hidden state

            (PPO stuff)
            sy_lp_n: placeholder for pre-computed log-probs
            sy_fixed_lp_n: placeholder for pre-computed old log-probs
        """
        sy_ob_no = tf.placeholder(shape=[None, self.history, self.meta_ob_dim], name="ob", dtype=tf.float32)
        sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        sy_hidden = tf.placeholder(shape=[None, self.gru_size], name="hidden", dtype=tf.float32)

        sy_lp_n = tf.placeholder(shape=[None], name="logprob", dtype=tf.float32)
        sy_fixed_lp_n = tf.placeholder(shape=[None], name="fixed_logprob", dtype=tf.float32)
        return sy_ob_no, sy_ac_na, sy_adv_n, sy_hidden, sy_lp_n, sy_fixed_lp_n

    def policy_forward_pass(self, sy_ob_no, sy_hidden):
        """
        constructs the symbolic operation for the policy network outputs,
        which are the parameters of the policy distribution p(a|s)

        arguments:
            sy_ob_no: (batch_size, self.history, self.meta_ob_dim)
            sy_hidden: (batch_size, self.gru_size)

        returns:
            the parameters of the policy.

            the parameters are a tuple (mean, log_std) of a Gaussian
                distribution over actions. log_std should just be a trainable
                variable, not a network output.
                sy_mean: (batch_size, self.ac_dim)
                sy_logstd: (batch_size, self.ac_dim)

        """
        # ac_dim * 2 because we predict both mean and std
        sy_policy_params, sy_hidden = build_policy(sy_ob_no, sy_hidden, self.ac_dim*2, self.scope, n_layers=self.n_layers, size=self.size, gru_size=self.gru_size, recurrent=self.recurrent)
        return (sy_policy_params, sy_hidden)

    def sample_action(self, policy_parameters):
        """
        constructs a symbolic operation for stochastically sampling from the policy
        distribution

        arguments:
            policy_parameters
                (mean, log_std) of a Gaussian distribution over actions
                    sy_mean: (batch_size, self.ac_dim)
                    sy_logstd: (batch_size, self.ac_dim)

        returns:
            sy_sampled_ac:
                (batch_size, self.ac_dim)
        """
        sy_mean, sy_logstd = policy_parameters
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean), 0, 1)
        return sy_sampled_ac

    def get_log_prob(self, policy_parameters, sy_ac_na):
        """
        constructs a symbolic operation for computing the log probability of a set of actions
        that were actually taken according to the policy

        arguments:
            policy_parameters
                (mean, log_std) of a Gaussian distribution over actions
                    sy_mean: (batch_size, self.ac_dim)
                    sy_logstd: (batch_size, self.ac_dim)

            sy_ac_na: (batch_size, self.ac_dim)

        returns:
            sy_lp_n: (batch_size)

        """
        sy_mean, sy_logstd = policy_parameters
        sy_lp_n = tfp.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na)
        return sy_lp_n

    def build_computation_graph(self):
        """
        notes on notation:

        Symbolic variables have the prefix sy_, to distinguish them from the numerical values
        that are computed later in the function

        prefixes and suffixes:
        ob - observation
        ac - action
        _no - this tensor should have shape (batch size /n/, observation dim)
        _na - this tensor should have shape (batch size /n/, action dim)
        _n  - this tensor should have shape (batch size /n/)

        Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
        is None

        ----------------------------------------------------------------------------------
        loss: a function of self.sy_lp_n and self.sy_adv_n that we will differentiate
            to get the policy gradient.
        """
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n, self.sy_hidden, self.sy_lp_n, self.sy_fixed_lp_n = self.define_placeholders()

        # The policy takes in an observation and produces a distribution over the action space
        policy_outputs = self.policy_forward_pass(self.sy_ob_no, self.sy_hidden)
        self.policy_parameters = policy_outputs[:-1]

        # unpack mean and variance
        self.policy_parameters = tf.split(self.policy_parameters[0], 2, axis=1)

        # We can sample actions from this action distribution.
        # This will be called in Agent.sample_trajectory() where we generate a rollout.
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)

        # We can also compute the logprob of the actions that were actually taken by the policy
        # This is used in the loss function.
        self.sy_lp_n = self.get_log_prob(self.policy_parameters, self.sy_ac_na)

        # PPO critic update
        critic_regularizer = tf.contrib.layers.l2_regularizer(1e-3) if self.l2reg else None
        self.critic_prediction = tf.squeeze(build_critic(self.sy_ob_no, self.sy_hidden, 1, 'critic_network', n_layers=self.n_layers, size=self.size, gru_size=self.gru_size, recurrent=self.recurrent, regularizer=critic_regularizer))
        self.sy_target_n = tf.placeholder(shape=[None], name="critic_target", dtype=tf.float32)
        self.critic_loss = tf.losses.mean_squared_error(self.sy_target_n, self.critic_prediction)
        self.critic_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_network')
        self.critic_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.critic_loss)

        # PPO actor update
        self.sy_fixed_log_prob_n = tf.placeholder(shape=[None], name="fixed_log_prob", dtype=tf.float32)
        self.policy_surr_loss = self.ppo_loss(self.sy_lp_n, self.sy_fixed_lp_n, self.sy_adv_n)
        self.policy_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.policy_update_op = minimize_and_clip(optimizer, self.policy_surr_loss, var_list=self.policy_weights, clip_val=40)

    def sample_trajectories(self, itr, env, min_timesteps, is_evaluation=False):
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        stats = []
        while True:
            animate_this_episode = (len(stats) == 0 and (itr % 10 == 0) and self.animate)
            steps, s = self.sample_trajectory(env, animate_this_episode, is_evaluation=is_evaluation)
            stats += s
            timesteps_this_batch += steps
            if timesteps_this_batch > min_timesteps:
                break
        return stats, timesteps_this_batch

    def sample_trajectory(self, env, animate_this_episode, is_evaluation):
        """
        sample a task, then sample trajectories from that task until either
        max(self.history, self.max_path_length) timesteps have been sampled

        construct meta-observations by concatenating (s, a, r, d) into one vector
        inputs to the policy should have the shape (batch_size, self.history, self.meta_ob_dim)
        zero pad the input to maintain a consistent input shape

        add the entire input as observation to the replay buffer, along with a, r, d
        samples will be drawn from the replay buffer to update the policy

        arguments:
            env: the env to sample trajectories from
            animate_this_episode: if True then render
            is_evaluation: whether this is an evaluation rollout (True) or a training rollout (False)
        """
        env.reset_task(generalized=self.generalized, granularity=self.granularity, is_evaluation=is_evaluation)
        stats = []
        #====================================================================================#
        #                           ----------PROBLEM 1----------
        #====================================================================================#
        ep_steps = 0
        steps = 0

        num_samples = max(self.history, self.max_path_length + 1)
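        # meta_obs is over-allocated with self.history leading rows that stay
        # zero, so the sliding windows taken below are implicitly zero-padded
        # at the start of sampling.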
        meta_obs = np.zeros((num_samples + self.history + 1, self.meta_ob_dim))
        rewards = []

        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.1)

            if ep_steps == 0:
                ob = env.reset()
                # first meta ob has only the observation
                # set a, r, d to zero, construct first meta observation in meta_obs
                # YOUR CODE HERE
                meta_obs[steps + self.history, :self.ob_dim] = ob
                steps += 1

            # index into the meta_obs array to get the window that ends with the current timestep
            # please name the windowed observation `in_` for compatibility with the code below that adds to the replay buffer
            # YOUR CODE HERE
            in_ = meta_obs[steps: steps + self.history, :]
            
            hidden = np.zeros((1, self.gru_size), dtype=np.float32)

            # get action from the policy
            # YOUR CODE HERE
            ac = self.sess.run(self.sy_sampled_ac, feed_dict = {self.sy_ob_no: [in_], self.sy_hidden: hidden})
            ac = ac[0]
            # step the environment
            # YOUR CODE HERE
            obs, rew, done, _ = env.step(ac)
            ep_steps += 1

            done = bool(done) or ep_steps == self.max_path_length
            # construct the meta-observation and add it to meta_obs
            # YOUR CODE HERE
            #print(self.meta_ob_dim)
            meta_obs[steps + self.history] = np.concatenate((obs, ac, [rew], [done])) 

            rewards.append(rew)
            steps += 1

            # add sample to replay buffer
            if is_evaluation:
                self.val_replay_buffer.add_sample(in_, ac, rew, done, hidden, env._goal)
            else:
                self.replay_buffer.add_sample(in_, ac, rew, done, hidden, env._goal)

            # start new episode
            if done:
                # compute stats over trajectory
                s = dict()
                s['rewards'] = rewards[-ep_steps:]
                s['ep_len'] = ep_steps
                stats.append(s)
                ep_steps = 0

            if steps >= num_samples:
                break

        return steps, stats

    def compute_advantage(self, ob_no, re_n, hidden, masks, tau=0.95):
        """
        computes generalized advantage estimation (GAE).

        arguments:
            ob_no: (bsize, history, ob_dim)
            re_n: (bsize,) rewards
            hidden: (bsize, gru_size) RNN hidden states
            masks: (bsize,) terminal masks
            tau: scalar GAE parameter

        output:
            advantages: (bsize,)
            returns: (bsize,)

        requires:
            self.gamma
        """
        bsize = len(re_n)
        rewards = np.squeeze(re_n)
        masks = np.squeeze(masks)
        values = self.sess.run(self.critic_prediction, feed_dict={self.sy_ob_no: ob_no, self.sy_hidden: hidden})[:,None]
        gamma = self.gamma

        assert rewards.shape == masks.shape == (bsize,)
        assert values.shape == (bsize, 1)

        bsize = len(rewards)
        returns = np.empty((bsize,))
        deltas = np.empty((bsize,))
        advantages = np.empty((bsize,))

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
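        # Backward GAE recursion over the batch:
        #   returns[i]    = r_i + gamma * returns[i+1] * mask_i
        #   delta_i       = r_i + gamma * V[i+1] * mask_i - V[i]
        #   advantages[i] = delta_i + gamma * tau * advantages[i+1] * mask_i
        # The masks zero out the bootstrap terms across episode boundaries.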
        for i in reversed(range(bsize)):
            returns[i] = rewards[i] + gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
            advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]

        advantages = (advantages - np.mean(advantages, axis=0)) / np.std(advantages, axis=0)
        return advantages, returns


    def estimate_return(self, ob_no, re_n, hidden, masks):
        """
        estimates the returns over a set of trajectories.

        let sum_of_path_lengths be the sum of the lengths of the paths sampled from
            Agent.sample_trajectories
        let num_paths be the number of paths sampled from Agent.sample_trajectories

        arguments:
            ob_no: shape: (sum_of_path_lengths, history, meta_obs_dim)
            re_n: length: num_paths. Each element in re_n is a numpy array
                containing the rewards for the particular path
            hidden: hidden state of recurrent policy
            masks: terminals masks

        returns:
            q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                whose length is the sum of the lengths of the paths
            adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                advantages whose length is the sum of the lengths of the paths
        """
        adv_n, q_n = self.compute_advantage(ob_no, re_n, hidden, masks)
        return q_n, adv_n

    def update_parameters(self, ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n):
        """
        update the parameters of the policy and the critic,
        with PPO update

        arguments:
            ob_no: (minibsize, history, meta_obs_dim)
            hidden: shape: (minibsize, self.gru_size)
            ac_na: (minibsize)
            fixed_log_probs: (minibsize)
            adv_n: shape: (minibsize)
            q_n: shape: (sum_of_path_lengths)

        returns:
            nothing

        """
        self.update_critic(ob_no, hidden, q_n)
        self.update_policy(ob_no, hidden, ac_na, fixed_log_probs, adv_n)

    def update_critic(self, ob_no, hidden, q_n):
        """
        given:
            self.num_value_iters
            self.l2reg

        arguments:
            ob_no: (minibsize, history, meta_obs_dim)
            hidden: (minibsize, self.gru_size)
            q_n: (minibsize)

        requires:
            self.num_value_iters
        """
        target_n = (q_n - np.mean(q_n))/(np.std(q_n)+1e-8)
        for k in range(self.num_value_iters):
            critic_loss, _ = self.sess.run(
                [self.critic_loss, self.critic_update_op],
                feed_dict={self.sy_target_n: target_n, self.sy_ob_no: ob_no, self.sy_hidden: hidden})
        return critic_loss

    def update_policy(self, ob_no, hidden, ac_na, fixed_log_probs, advantages):
        '''
        arguments:
            fixed_log_probs: (minibsize)
            advantages: (minibsize)
            hidden: (minibsize, self.gru_size)
        '''
        policy_surr_loss, _ = self.sess.run(
            [self.policy_surr_loss, self.policy_update_op],
            feed_dict={self.sy_ob_no: ob_no, self.sy_hidden: hidden, self.sy_ac_na: ac_na, self.sy_fixed_lp_n: fixed_log_probs, self.sy_adv_n: advantages})
        return policy_surr_loss

    def ppo_loss(self, log_probs, fixed_log_probs, advantages, clip_epsilon=0.1, entropy_coeff=1e-4):
        """
        given:
            clip_epsilon

        arguments:
            advantages (mini_bsize,)
            states (mini_bsize,)
            actions (mini_bsize,)
            fixed_log_probs (mini_bsize,)

        intermediate results:
            states, actions --> log_probs
            log_probs, fixed_log_probs --> ratio
            advantages, ratio --> surr1
            ratio, clip_epsilon, advantages --> surr2
            surr1, surr2 --> policy_surr_loss
        """
        ratio = tf.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = tf.clip_by_value(ratio, clip_value_min=1.0-clip_epsilon, clip_value_max=1.0+clip_epsilon) * advantages
        policy_surr_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        probs = tf.exp(log_probs)
        entropy = tf.reduce_sum(-(log_probs * probs))
        policy_surr_loss -= entropy_coeff * entropy
        return policy_surr_loss
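
The ppo_loss above is the standard PPO clipped surrogate objective plus a small entropy bonus. As a quick sanity check of the same arithmetic (without the entropy term), here is the computation on toy NumPy arrays; the numbers are arbitrary and this is independent of the TF graph above.

import numpy as np

log_probs = np.array([-1.0, -0.5, -2.0])        # log pi_new(a|s)
fixed_log_probs = np.array([-1.2, -0.4, -1.0])  # log pi_old(a|s)
advantages = np.array([1.0, -1.0, 2.0])
clip_epsilon = 0.1

ratio = np.exp(log_probs - fixed_log_probs)     # pi_new / pi_old
surr1 = ratio * advantages
surr2 = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
policy_surr_loss = -np.mean(np.minimum(surr1, surr2))
print(policy_surr_loss)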
Example #3
    # Check if we need to load a checkpoint
    if FLAGS.checkpoint:
        _saver = tf.train.Saver(var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=GENERATOR_SCOPE))
        _saver.restore(sess, FLAGS.checkpoint)

    if FLAGS.mode == 'train':
        # Fill the replay buffer with the minimum number of samples
        obs = env.reset()
        for i in trange(REPLAY_BUFFER_MIN_SIZE):
            _old_state = obs
            # Generate random play
            a = env.action_space.sample()
            obs, reward, done, _ = env.step(a)
            # Add to replay buffer, sample of type: (state, action, reward, next_state, done)
            repbuf.add_sample((_old_state, a, reward, obs, done))
            # Check if done and reset if the game is ended
            if done:
                obs = env.reset()
        print("Loaded replay buffer with", len(repbuf.buffer), "samples.")

        # Now start training
        done = True
        episode_reward = 0
        episode_steps = 0
        rewards = deque([], maxlen=1000)
        epsilon = EPSILON_MAX
        episode_rewards_100 = 0
        episode_rewards_1000 = 0
        n_episode = 0
        total_max_q = 0
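
The fragment above assumes a flat replay buffer (repbuf) storing (state, action, reward, next_state, done) tuples and exposing add_sample() plus a .buffer container. That class is not shown; a minimal sketch of such an interface, for illustration only:

import random
from collections import deque


class ReplayBuffer:
    """Hypothetical FIFO buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def add_sample(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)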
Example #4
class StandardRLAgent:
    def __init__(self, env, device):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space,
                                             gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]

        gamma = 0.99
        self.gamma = gamma

        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNAlgo(self.obs_dim,
                                self.num_act,
                                gamma,
                                device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim,
                                 self.act_dim,
                                 gamma,
                                 device=device)

        self.replay_buffer = ReplayBuffer()

    def update_batch(self, batch_size=100):
        if self.replay_buffer.size() < batch_size * 10:
            return {}

        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)

    def run_episode(self):
        s, done = self.env.reset(), False
        zero = np.zeros_like(s)

        stats = Stats()

        epilen = 0
        R = 0.
        while not done:
            epilen += 1
            if len(self.replay_buffer) < 10000:
                a = self.env.action_space.sample()
            elif self.is_discrete_action:
                a = self.algo.get_action(s, zero, epsilon=0.05)
            else:
                a = self.algo.get_action(s, zero, sigma=0.1)
            sp, r, done, _ = self.env.step(a)
            mdone = modify_done(self.env, done)

            self.replay_buffer.add_sample((s, a, r, sp, mdone, zero))

            s = sp
            R += r

            if epilen % 1 == 0:
                info = self.update_batch()
                stats.update(info)

        print(f'Epilen: {epilen}\tR: {R:.2f}')
        print(stats)

        return epilen

    def test_episode(self):
        s, done = self.env.reset(), False
        zero = np.zeros_like(s)

        R = 0.
        R0 = 0.
        DiscR = 0.
        gamma_power = 1.

        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, zero, epsilon=0.)
            else:
                a = self.algo.get_action(s, zero, sigma=0.)
            sp, r, done, _ = self.env.step(a)

            if cnt == 1:
                R0 = self.algo.get_value(s, zero)

            R += r
            DiscR += r * gamma_power

            gamma_power *= self.gamma
            s = sp

        info = {
            'ExtR': R,
            'DiscExtR': DiscR,
            'DiscExtR_Est': R0,
        }

        return info
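
A minimal usage sketch for StandardRLAgent, assuming a Gym environment and that the project's DQNAlgo/DDPGAlgo, ReplayBuffer, Stats and modify_done helpers are importable; the environment name and episode count below are placeholders.

import gym
import torch

env = gym.make('Pendulum-v1')                  # placeholder environment
device = 'cuda' if torch.cuda.is_available() else 'cpu'
agent = StandardRLAgent(env, device)

for episode in range(200):
    agent.run_episode()                        # collect experience and update from the replay buffer
    if episode % 10 == 0:
        print(agent.test_episode())            # {'ExtR': ..., 'DiscExtR': ..., 'DiscExtR_Est': ...}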
Example #5
class UVFAWithRewardAgent:
    def __init__(self, env, device='cpu', use_td3=True):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]

        gamma = 0.99
        self.gamma = gamma

        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNWithRewardAlgo(self.obs_dim, self.num_act, gamma, use_td3=use_td3, device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim, self.act_dim, gamma, use_td3=use_td3, device=device)

        self.replay_buffer = ReplayBuffer()

        self.planner = Planner(trans_fn=self.planner_trans_fn, use_td3=use_td3, gamma=gamma, device=device)

        self.estimate_std()

    def estimate_std(self):
        tot_cnt = 0
        states = []
        action_deltas = []
        while tot_cnt < 10000:
            s, done = self.env.reset(), False
            while not done:
                tot_cnt += 1
                states.append(s)
                a = self.env.action_space.sample()
                sp, r, done, _ = self.env.step(a)
                action_deltas.append(sp - s)
                s = sp

        self.std = np.std(states, axis=0) + 1e-8
        self.astd = np.std(action_deltas, axis=0) + 1e-8

        print('Std', self.std, 'Action-Std', self.astd)

    def gen_goal(self, s):
        g = s + np.random.randn(*s.shape) * self.astd * np.random.randint(1, 10)
        return g

    def goal_reward(self, s, g):
        # r = (np.linalg.norm((s-g) / self.std, axis=-1) < 0.1).astype(np.float)
        r = (np.linalg.norm((s-g) / self.astd, axis=-1) < 1).astype(float)
        return r

    def update_batch(self, batch_size=32):
        if self.replay_buffer.size() < batch_size * 10:
            return {}

        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)


    def run_episode(self):
        s, done = self.env.reset(), False

        g = self.gen_goal(s)
        # g = s

        stats = Stats()

        episode = []

        epilen = 0
        extR = 0.
        intR = 0.
        while not done:
            epilen += 1
            if len(self.replay_buffer) < 10000:
                a = self.env.action_space.sample()
            elif self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.05)
            else:
                a = self.algo.get_action(s, g, sigma=0.1)
            sp, r, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)

            episode.append((s, a, r, sp, mdone, g))

            s = sp
            extR += r

            intR = max(intR, self.goal_reward(s, g) * (self.algo.gamma ** epilen))

            if epilen % 1 == 0:
                info = self.update_batch()
                stats.update(info)

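        # Hindsight Experience Replay style relabeling: with probability
        # her_prob, replace the goal with a state actually reached within the
        # next rpl_len steps; the goal reward and done flag are then recomputed
        # against the (possibly relabeled) goal before the transition is stored.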
        her_prob = 0.5
        rpl_len = 10
        for i in range(epilen):
            s, a, extr, sp, done, g = episode[i]
            if np.random.random() < her_prob:
                hg_idx = np.random.randint(i, min(epilen, i+rpl_len))
                g = episode[hg_idx][0]
            r = self.goal_reward(sp, g)
            done = np.logical_or((r > 0), done)
            self.replay_buffer.add_sample((s, a, r, extr, sp, done, g))

        print(f'Epilen: {epilen}\tExtR: {extR:.2f}\tIntR: {intR:.2f}')
        print(stats)

        return epilen


    def test_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = np.array([0.5, 0.0])

        ss = torch.from_numpy(s).float().to(self.device).unsqueeze(0)
        gg = torch.from_numpy(g).float().to(self.device).unsqueeze(0)
        Q0, R0, _ = self.algo.get_values(ss, gg)
        Q0 = float(Q0)
        R0 = float(R0)

        ExtR = 0.
        DiscExtR = 0.
        IntR = 0.

        gamma_power = 1.0

        min_dis = 1e9

        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.)
            else:
                a = self.algo.get_action(s, g, sigma=0.)
            sp, extr, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)
            r = self.goal_reward(sp, g)

            ExtR += extr
            DiscExtR += gamma_power * extr
            IntR += gamma_power * r
            
            gamma_power *= self.gamma

            min_dis = min(min_dis, np.linalg.norm((sp-g)/self.std))

            s = sp
            if r > 0:
                done = True

        info = {
            'ExtR': ExtR,
            'DiscExtR': DiscExtR,
            'DiscExtR_Est': R0,
            'IntR': IntR,
            'IntR_Est': Q0,
        }

        return info

    def planner_trans_fn(self, s, g, *args, **kwargs):
        n, m = s.shape[0], g.shape[0]
        s = s.unsqueeze(1).expand(-1, m, -1)
        g = g.unsqueeze(0).expand(n, -1, -1)

        with torch.no_grad():
            G, R, Pi = self.algo.get_values(s, g, *args, **kwargs)
        return G, R, Pi

    def update_planner(self):
        n = 1000
        if len(self.replay_buffer) < n:
            return False

        waypoints = self.replay_buffer.sample_batch(n, replace=False)[0] # s
        print(waypoints.shape)
        self.planner.set_waypoint_states(waypoints)
        self.planner.update_trans()
        self.planner.pre_plan()

        return True

    def plan_episode(self, show_plan=False):
        s, done = self.env.reset(), False

        if show_plan:
            self.planner.show_plan(s, self.env)

        ExtR = 0.
        DiscExtR = 0.
        V0 = 0.
        gamma_power = 1.0

        step = 0
        while not done:
            step += 1
            # if step >= 10: break
            # if show_plan:
                # self.env.render()

            a, v = self.planner.plan(s)
            if step == 1:
                V0 = v

            sp, extr, done, info = self.env.step(a)
            ExtR += extr
            DiscExtR += extr * gamma_power
            
            gamma_power *= self.gamma
            s = sp

        info = {
            'Plan_ExtR': ExtR,
            'Plan_DiscExtR': DiscExtR,
            'Plan_DiscExtR_Est': V0,
        }
        return info
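
A rough training loop for UVFAWithRewardAgent showing how the planner methods fit together; `env` is assumed to be a Gym environment built elsewhere, and the episode counts are arbitrary.

agent = UVFAWithRewardAgent(env, device='cpu', use_td3=True)

for episode in range(500):
    agent.run_episode()                        # goal-conditioned rollout + HER-style relabeling
    if episode % 50 == 0:
        print(agent.test_episode())            # ExtR / DiscExtR / IntR plus value estimates
        if agent.update_planner():             # needs at least 1000 transitions in the buffer
            print(agent.plan_episode(show_plan=False))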
Example #6
class DDPGAgent(object):
    def __init__(self, sess, env, test_env, args):
        self.sess = sess
        self.args = args
        self.env = env
        self.test_env = test_env
        self.ob_dim = env.observation_space.shape[0]
        self.ac_dim = env.action_space.shape[0]

        # Construct the networks and the experience replay buffer.
        self.actor = Actor(sess, env, args)
        self.critic = Critic(sess, env, args)
        self.rbuffer = ReplayBuffer(args.replay_size, self.ob_dim, self.ac_dim)

        # Initialize then run, also setting current=target to start.
        self._debug_print()
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_net(smooth=False)
        self.critic.update_target_net(smooth=False)

    def train(self):
        """ 
        Algorithm 1 in the DDPG paper. 
        """
        num_episodes = 0
        t_start = time.time()
        obs = self.env.reset()

        for t in range(self.args.n_iter):
            if (t % self.args.log_every_t_iter
                    == 0) and (t > self.args.wait_until_rbuffer):
                print("\n*** DDPG Iteration {} ***".format(t))

            # Sample actions with noise injection and manage buffer.
            act = self.actor.sample_action(obs, train=True)
            new_obs, rew, done, info = self.env.step(act)
            self.rbuffer.add_sample(s=obs, a=act, r=rew, done=done)
            if done:
                obs = self.env.reset()
                num_episodes += 1
            else:
                obs = new_obs

            if (t > self.args.wait_until_rbuffer) and (
                    t % self.args.learning_freq == 0):
                # Sample from the replay buffer.
                states_t_BO, actions_t_BA, rewards_t_B, states_tp1_BO, done_mask_B = \
                        self.rbuffer.sample(num=self.args.batch_size)

                feed = {
                    'obs_t_BO': states_t_BO,
                    'act_t_BA': actions_t_BA,
                    'rew_t_B': rewards_t_B,
                    'obs_tp1_BO': states_tp1_BO,
                    'done_mask_B': done_mask_B
                }

                # Update the critic, get sampled policy gradients, update actor.
                a_grads_BA, l2_error = self.critic.update_weights(feed)
                actor_gradients = self.actor.update_weights(feed, a_grads_BA)

                # Update both target networks.
                self.critic.update_target_net()
                self.actor.update_target_net()

            if (t % self.args.log_every_t_iter
                    == 0) and (t > self.args.wait_until_rbuffer):
                # Do some rollouts here and then record statistics.  Note that
                # some of these stats rely on stuff computed from sampling the
                # replay buffer, so be careful interpreting these. The code
                # probably needs to guard against this case as well.
                stats = self._do_rollouts()
                hours = (time.time() - t_start) / (60 * 60.)
                logz.log_tabular("MeanReward", np.mean(stats['reward']))
                logz.log_tabular("MaxReward", np.max(stats['reward']))
                logz.log_tabular("MinReward", np.min(stats['reward']))
                logz.log_tabular("StdReward", np.std(stats['reward']))
                logz.log_tabular("MeanLength", np.mean(stats['length']))
                logz.log_tabular("NumTrainingEps", num_episodes)
                logz.log_tabular("L2ErrorCritic", l2_error)
                logz.log_tabular("QaGradL2Norm", np.linalg.norm(a_grads_BA))
                logz.log_tabular("TimeHours", hours)
                logz.log_tabular("Iterations", t)
                logz.dump_tabular()

    def _do_rollouts(self):
        """ 
        Some rollouts to evaluate the agent's progress.  Returns a dictionary
        containing relevant statistics. Later, I should parallelize this using
        an array of environments.
        """
        num_episodes = 50
        stats = defaultdict(list)

        for i in range(num_episodes):
            obs = self.test_env.reset()
            ep_time = 0
            ep_reward = 0

            # Run one episode ...
            while True:
                act = self.actor.sample_action(obs, train=False)
                new_obs, rew, done, info = self.test_env.step(act)
                ep_time += 1
                ep_reward += rew
                if done:
                    break

            # ... and collect its information here.
            stats['length'].append(ep_time)
            stats['reward'].append(ep_reward)

        return stats

    def _debug_print(self):
        print("\n\t(A bunch of debug prints)\n")

        print("\nActor weights")
        for v in self.actor.weights:
            shp = v.get_shape().as_list()
            print("- {} shape:{} size:{}".format(v.name, shp, np.prod(shp)))
        print("Total # of weights: {}.".format(self.actor.num_weights))

        print("\nCritic weights")
        for v in self.critic.weights:
            shp = v.get_shape().as_list()
            print("- {} shape:{} size:{}".format(v.name, shp, np.prod(shp)))
        print("Total # of weights: {}.".format(self.critic.num_weights))
Example #7
class UVFAgent:
    def __init__(self, env, device):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space,
                                             gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]

        gamma = 0.99

        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNAlgo(self.obs_dim,
                                self.num_act,
                                gamma,
                                device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim,
                                 self.act_dim,
                                 gamma,
                                 device=device)

        self.replay_buffer = ReplayBuffer()

        self.estimate_std()

    def estimate_std(self):
        tot_cnt = 0
        states = []
        action_deltas = []
        while tot_cnt < 10000:
            s, done = self.env.reset(), False
            while not done:
                tot_cnt += 1
                states.append(s)
                a = self.env.action_space.sample()
                sp, r, done, _ = self.env.step(a)
                action_deltas.append(sp - s)
                s = sp

        self.std = np.std(states, axis=0) + 1e-8
        self.astd = np.std(action_deltas, axis=0) + 1e-8

        print('Std', self.std, 'Action-Std', self.astd)

    def gen_goal(self, s):
        g = s + np.random.randn(*s.shape) * self.astd
        return g

    def goal_reward(self, s, g):
        # r = (np.linalg.norm((s-g) / self.std, axis=-1) < 0.1).astype(np.float)
        r = (np.linalg.norm((s - g) / self.astd, axis=-1) < 1).astype(float)
        return r

    def update_batch(self, batch_size=32):
        if self.replay_buffer.size() < batch_size * 10:
            return {}

        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)

    def run_episode(self):
        s, done = self.env.reset(), False

        g = self.gen_goal(s)
        # g = s

        stats = Stats()

        episode = []

        epilen = 0
        extR = 0.
        intR = 0.
        while not done:
            epilen += 1
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.05)
            else:
                a = self.algo.get_action(s, g, sigma=0.1)
            sp, r, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)

            episode.append((s, a, r, sp, mdone, g))

            s = sp
            extR += r

            intR = max(intR,
                       self.goal_reward(s, g) * (self.algo.gamma**epilen))

            if epilen % 4 == 0:
                info = self.update_batch()
                stats.update(info)

        her_prob = 0.5
        rpl_len = 10
        for i in range(epilen):
            s, a, r, sp, done, g = episode[i]
            if np.random.random() < her_prob:
                hg_idx = np.random.randint(i, min(epilen, i + rpl_len))
                g = episode[hg_idx][0]
            r = self.goal_reward(sp, g)
            done = np.logical_or((r > 0), done)
            self.replay_buffer.add_sample((s, a, r, sp, done, g))

        print(f'Epilen: {epilen}\tExtR: {extR:.2f}\tIntR: {intR:.2f}')
        print(stats)

    def test_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = np.array([0.5, 0.0])

        R = 0

        min_dis = 1e9

        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.)
            else:
                a = self.algo.get_action(s, g, sigma=0.)
            sp, extr, done, info = self.env.step(a)
            r = self.goal_reward(sp, g)
            R += r

            min_dis = min(min_dis, np.linalg.norm((sp - g) / self.std))

            s = sp
            if r > 0:
                done = True

        return R, min_dis
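
A short evaluation sketch for UVFAgent; note that test_episode() returns the accumulated goal reward and the closest normalized distance to the goal rather than a stats dict. The environment name and episode count are placeholders.

import gym
import torch

env = gym.make('MountainCarContinuous-v0')     # placeholder environment
device = 'cuda' if torch.cuda.is_available() else 'cpu'
agent = UVFAgent(env, device)                  # __init__ also runs estimate_std() over 10k+ random steps

for episode in range(300):
    agent.run_episode()
    if episode % 20 == 0:
        R, min_dis = agent.test_episode()
        print(f'goal reward: {R}, closest normalized distance: {min_dis:.3f}')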