Code Example #1
class DDPG:
    def __init__(self, action_dim, action_bound, tau, lr_a, lr_c, state_dim,
                 gamma, batch_size):
        self.target = tf.placeholder(tf.float32, [None, 1], 'critic_target')
        self.s = tf.placeholder(tf.float32, [None, state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, state_dim], 'next_state')

        self.memory = ReplayBuffer(max_size=10000)
        self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))
        self.batch_size = batch_size
        self.gamma = gamma

        self.sess = tf.Session()

        self.actor = Actor(self.sess,
                           self.s,
                           self.s_,
                           action_dim,
                           action_bound,
                           tau,
                           lr_a,
                           f1_units=300)
        self.critic = Critic(self.sess,
                             lr_c,
                             self.s,
                             self.s_,
                             self.actor.a,
                             self.actor.a_,
                             self.target,
                             tau,
                             gamma,
                             state_dim,
                             action_dim,
                             f1_units=300)
        self.actor.add_grad_to_graph(self.critic.a_g)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        a = self.actor.choose_action(s)
        var = self.noise()
        a = a + var
        return a[0]

    def update_target_networks(self):
        self.sess.run([self.actor.replace, self.critic.replace])

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        bs, ba, br, bs_, bd = self.memory.sample(self.batch_size)

        # Bellman target: r + gamma * Q'(s', mu'(s')), zeroed out for terminal states
        q_ = self.sess.run(self.critic.q_, {self.s_: bs_})
        br = br[:, np.newaxis]
        bd = bd[:, np.newaxis]
        target_critic = br + self.gamma * (1.0 - bd) * q_
        self.critic.learn(bs, ba, target_critic)
        self.actor.learn(bs)
        self.update_target_networks()
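A hypothetical driver loop for the DDPG class above (not part of the original source). It assumes a Gym environment whose shapes match the constructor arguments; Pendulum-v1 (3-dimensional state, a single action bounded by 2.0) is used purely as an example, and the Actor, Critic and ReplayBuffer helpers are the ones referenced in the snippet.

import gym

env = gym.make('Pendulum-v1')
agent = DDPG(action_dim=1, action_bound=2.0, tau=0.01, lr_a=1e-4, lr_c=1e-3,
             state_dim=3, gamma=0.99, batch_size=64)

total_steps = 0
for episode in range(200):
    s = env.reset()
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        total_steps += 1
        if total_steps >= agent.batch_size:   # wait until one batch can be sampled
            agent.learn()
        s = s_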
Code Example #2
File: agent_TD3.py, Project: liuqx315/RLFluidControl
class Agent:
    def __init__(self, state_dim, action_dim, explore_noise = "Gaussian", *args, **kwargs):
        self.lr = 1e-4
        self.gamma = 0.99
        self.tau = 0.005
        self.bs = 512
        self.bfs = 1000000
        self.d = 2
        self.explore_noise = explore_noise
        self.explore_noise_size = 0.1 # or 0.01
        self.process_noise_generator = ProcessNoise(action_dim)
        self.criticreg_noise_size = 0.2
        self.criticreg_noise_clip = 0.5

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.actor_nn_dim = [256, 256, self.action_dim]
        self.critic_nn_dim = [256, 256, 1]

        self.state1_place = tf.placeholder(tf.float32, [None, self.state_dim])
        self.action_place = tf.placeholder(tf.float32, [None, self.action_dim])
        self.reward_place = tf.placeholder(tf.float32, [None,1])
        self.isdone_place = tf.placeholder(tf.float32, [None,1])
        self.state2_place = tf.placeholder(tf.float32, [None, self.state_dim])


        with tf.variable_scope("target_actor", reuse = tf.AUTO_REUSE):
            self.Q_next_action = self.actor_nn(self.state2_place)
        # target-policy smoothing noise; use the dynamic batch size so the graph
        # also works for batches smaller than self.bs (e.g. in eval_loss)
        self.Q_next_noise = tf.clip_by_value(
            tf.random.normal(tf.shape(self.Q_next_action), 0, self.criticreg_noise_size),
            - self.criticreg_noise_clip, self.criticreg_noise_clip)
        self.Q_next_noisy_action = tf.clip_by_value(self.Q_next_action + self.Q_next_noise, -1, 1)
        with tf.variable_scope("target_critic_1", reuse = tf.AUTO_REUSE):
            self.Q_critic_1 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        with tf.variable_scope("target_critic_2", reuse = tf.AUTO_REUSE):
            self.Q_critic_2 = self.critic_nn(self.state2_place, self.Q_next_noisy_action)
        self.Q_critic_min = tf.minimum(self.Q_critic_1, self.Q_critic_2)
        self.Q_y = self.reward_place + self.gamma * (1-self.isdone_place) * self.Q_critic_min
        with tf.variable_scope("main_critic_1", reuse = tf.AUTO_REUSE):
            self.Q_Q_1 = self.critic_nn(self.state1_place, self.action_place)
        with tf.variable_scope("main_critic_2", reuse = tf.AUTO_REUSE):
            self.Q_Q_2 = self.critic_nn(self.state1_place, self.action_place) 
        self.Q_loss = tf.reduce_mean((self.Q_Q_1 - self.Q_y)**2) + tf.reduce_mean((self.Q_Q_2 - self.Q_y)**2)

        with tf.variable_scope("main_actor", reuse = tf.AUTO_REUSE):
            self.P_this_action = self.actor_nn(self.state1_place)
        with tf.variable_scope("main_critic_1", reuse = tf.AUTO_REUSE):
            self.P_Q_1 = self.critic_nn(self.state1_place, self.P_this_action)
        self.P_loss = - tf.reduce_mean(self.P_Q_1)

        with tf.variable_scope("main_actor", reuse = tf.AUTO_REUSE):
            self.action = self.actor_nn(self.state1_place)


        all_variables = tf.trainable_variables()
        self.main_critic_var = [i for i in all_variables if "main_critic" in i.name]
        self.target_critic_var = [i for i in all_variables if "target_critic" in i.name]
        self.main_actor_var = [i for i in all_variables if "main_actor" in i.name]
        self.target_actor_var = [i for i in all_variables if "target_actor" in i.name]

        assert len(self.main_critic_var) == len(self.target_critic_var)
        assert len(self.main_actor_var) == len(self.target_actor_var)

        self.Q_op =  tf.train.AdamOptimizer(self.lr).minimize(self.Q_loss, var_list = self.main_critic_var) 
        self.P_op =  tf.train.AdamOptimizer(self.lr).minimize(self.P_loss, var_list = self.main_actor_var) 
        self.T_init = [tf.assign(T, M) for (T,M) in zip(self.target_critic_var + self.target_actor_var, 
                                                        self.main_critic_var + self.main_actor_var)]
        self.T_op = [tf.assign(T, self.tau * M + (1 - self.tau) * T) for (T,M) in zip(
            self.target_critic_var + self.target_actor_var, self.main_critic_var + self.main_actor_var)]


        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)

        self.step_count = 0
        self.total_step_count = 0
        self.episode_count = 0
        self.train_count = 0

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)
        self.saver = tf.train.Saver(max_to_keep=1000)


    def actor_nn(self, state, bound = True):
        dim = self.actor_nn_dim
        A = state
        for i in range(0,len(dim)-1):
            A = tf.layers.dense(A, units= dim[i], activation = tf.nn.relu)
        action = tf.layers.dense(A, units= dim[-1], activation = tf.nn.tanh)
        return action


    def critic_nn(self, state, action):
        dim = self.critic_nn_dim
        A = tf.concat([state, action], axis = 1)
        for i in range(0,len(dim)-1):
            A = tf.layers.dense(A, units= dim[i], activation = tf.nn.relu)
        critic = tf.layers.dense(A, units= dim[-1], activation = None)
        return critic



    def get_action(self, state_data, stochastic = True):
        this_action = self.sess.run(self.action, feed_dict= {self.state1_place: state_data})
        if stochastic:
            if self.explore_noise == "Gaussian":
                explore_noise = np.random.normal(0, self.explore_noise_size, [1, self.action_dim])
            elif self.explore_noise == "Process":
                explore_noise = self.explore_noise_size * self.process_noise_generator.next()
            else:
                raise NotImplementedError
            this_action = np.clip(this_action + explore_noise, -1, 1)
        return this_action

    def eval_loss(self, bs = None):
        if bs is None:
            bs = self.bs
        this_bs = np.minimum(bs, self.replay_buffer.size)
        this_batch = self.replay_buffer.sample_batch(this_bs)
        feed_dict = {self.state1_place: this_batch["obs1"],
                     self.action_place: this_batch["acts"],
                     self.reward_place: this_batch["rews"],
                     self.isdone_place: this_batch["done"],
                     self.state2_place: this_batch["obs2"]}
        # evaluate the current critic and actor losses without performing an update
        q_loss, p_loss = self.sess.run([self.Q_loss, self.P_loss], feed_dict=feed_dict)
        return q_loss, p_loss

    def train_iter(self):

        if self.bs <= self.replay_buffer.size:
            this_batch = self.replay_buffer.sample_batch(self.bs)
            
            feed_dict = {self.state1_place: this_batch["obs1"],
                        self.action_place: this_batch["acts"],
                        self.reward_place: this_batch["rews"],
                        self.isdone_place: this_batch["done"],
                        self.state2_place: this_batch["obs2"]}

            self.sess.run([self.Q_op], feed_dict=feed_dict)
            if self.total_step_count % self.d == 0:
                self.sess.run([self.P_op], feed_dict=feed_dict)
                self.sess.run(self.T_op)

            self.train_count += 1
            self.total_step_count += 1


    def record(self, this_state, this_action, this_reward, this_done, next_state):
        self.replay_buffer.store(obs=this_state, 
                                act=this_action,
                                rew=this_reward,
                                next_obs=next_state,
                                done=this_done)
        self.step_count += 1


    def reset_agent(self):

        self.replay_buffer = ReplayBuffer(self.state_dim, self.action_dim, self.bfs)

        self.step_count = 0
        self.total_step_count = 0
        self.train_count = 0
        self.episode_count = 0

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.T_init)

    def reset_episode(self):

        self.step_count = 0
        self.train_count = 0
        self.episode_count += 1
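The heart of the graph above is the clipped double-Q target. The standalone NumPy sketch below (illustration only, not part of agent_TD3.py) reproduces the same computation on toy values: Gaussian smoothing noise, clipped to the criticreg_noise_clip range, is added to the target action, and the minimum of the two target critics enters the Bellman backup.

import numpy as np

gamma, noise_size, noise_clip = 0.99, 0.2, 0.5
a_next = np.array([[0.7], [-0.95]])                 # target-actor output in [-1, 1]
noise = np.clip(np.random.normal(0, noise_size, a_next.shape),
                -noise_clip, noise_clip)
a_noisy = np.clip(a_next + noise, -1, 1)            # corresponds to Q_next_noisy_action
q1 = np.array([[1.3], [0.4]])                       # toy target_critic_1 outputs
q2 = np.array([[1.1], [0.6]])                       # toy target_critic_2 outputs
r = np.array([[0.5], [1.0]])
done = np.array([[0.0], [1.0]])
y = r + gamma * (1 - done) * np.minimum(q1, q2)     # corresponds to Q_y
print(y)                                            # [[1.589], [1.0]]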
Code Example #3
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(
            feature_shape=self.env.features_shape,
            a_num=self.env.num_actions,
            lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 -
                                                     self.parms.tau)

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _ema_update(self):

        paras = self.actor.trainable_weights + \
                self.critic.model.trainable_weights

        self.ema.apply(paras)

        for i, j in zip(self.target_actor.trainable_weights + \
            self.target_critic.model.trainable_weights, paras):
            i.assign(self.ema.average(j))

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch], dtype=np.float32)
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch], dtype=np.float32)

        # Reshape to column vectors so they broadcast against the critic output
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Train critic
        with tf.GradientTape() as tape:
            pi_next = self.target_actor(s_next)
            a_next = pi_next.sample()
            q_next = self.target_critic([s_next, a_next])
            y = r + self.parms.gamma * q_next * not_done
            q = self.critic([s, a])
            c_loss = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights)
        self.critic.model.optimizer.apply_gradients(
            zip(c_grads, self.critic.model.trainable_weights))

        # Train actor
        with tf.GradientTape() as tape:
            pi = self.actor(s)
            a = pi.sample()
            q = self.critic([s, a])
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(
            zip(a_grads, self.actor.trainable_weights))

        self._ema_update()

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
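The target networks above are tracked with tf.train.ExponentialMovingAverage instead of an explicit assign loop. A minimal sketch (not from the original source, TF2 eager execution assumed) showing that decay = 1 - tau yields the usual Polyak rule, target <- (1 - tau) * target + tau * main:

import tensorflow as tf

tau = 0.1
w = tf.Variable(1.0)                         # stands in for one "main" weight
ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
ema.apply([w])                               # creates the shadow copy (value 1.0)

w.assign(2.0)                                # pretend a gradient step moved w
ema.apply([w])                               # shadow <- (1 - tau) * shadow + tau * w
print(float(ema.average(w)))                 # 0.9 * 1.0 + 0.1 * 2.0 = 1.1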
Code Example #4
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        self.critic1, self.critic2, self.target_critic1, self.target_critic2 = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy params
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """ soft update the target net with Polyak averaging """
        for target_param, param in zip(target_net.trainable_weights,
                                       net.trainable_weights):
            target_param.assign(  # copy weight value into target parameters
                target_param * (1.0 - self.parms.tau) + param * self.parms.tau)

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch], dtype=np.float32)
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch], dtype=np.float32)

        # Reshape to column vectors so they broadcast against the critic output
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set target y
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train actor
        if self.train_step_cnt % self.parms.actor_interval == 0:

            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # update target network params
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1

            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
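A hypothetical driver for the Trainer above (not from the original source). The Parameters fields are inferred from the attributes the class reads off params (game, gamma, seed, replay_size, batch_size, lr_a, lr_c, tau, start_size, train_step_len, actor_interval); the concrete values are placeholders.

params = Parameters(game='Pendulum-v1', gamma=0.99, seed=0,
                    replay_size=100000, batch_size=64,
                    lr_a=1e-3, lr_c=1e-3, tau=0.005,
                    start_size=1000, train_step_len=1000,
                    actor_interval=2)
trainer = Trainer(params)
for epoch in range(100):
    mean_return = trainer.train_step()
    print('epoch', epoch, 'mean episode return', mean_return)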
Code Example #5
class Agent(object):
    def __init__(self, env, alpha, beta, tau, gamma,
                 state_dim = 8, action_dim = 2, max_replay_size = 1000000,
                 l1_dim = 400, l2_dim = 300, batch_size=64):

        self.env = env
        self.max_action = float(env.action_space.high[0])

        self.alpha = alpha # learning rate for actor network
        self.beta = beta # learning rate for critic network
        self.tau = tau # polyak averaging parameter
        self.gamma = gamma # discount factor of reward

        self.update_actor_count = 0
        self.update_actor_freq  = 2 

        self.policy_noise = .2
        self.noise_clip = .5

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.l1_dim = l1_dim
        self.l2_dim = l2_dim
        self.batch_size = batch_size
        self.max_replay_size = max_replay_size

        # build the agent
        self.build_agent()
        # with "tau = 1", we initialize the target network the same as the main network
        self.update_target_network(tau = 1)

    def build_agent(self):
        # build the actor-critic network and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim,self.beta)
        self.target_critic = copy.deepcopy(self.critic)

        # build the replaybuffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)
        # build the OUNoise for action selection 
        self.noise = OUNoise(self.action_dim)

    def act(self, state):
        state = T.tensor(state, dtype=T.float)
        action = self.actor(state)
        noisy_action = action + T.tensor(self.noise(), dtype=T.float)
        return noisy_action.cpu().detach().numpy()

    # store transition into the replay buffer
    def remember(self, state, action, reward, next_state, done):
        self.replaybuffer.store(state, action, reward, next_state, done)

    def sample_replaybuffer(self):
        # sample from the ReplayBuffer
        states, actions, rewards, next_states, dones = self.replaybuffer.sample(self.batch_size)
        states = T.tensor(states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.float)
        rewards = T.tensor(rewards, dtype=T.float)
        next_states = T.tensor(next_states, dtype=T.float)
        dones = T.tensor(dones)

        return states, actions, rewards, next_states, dones

    def step(self):
        # we cannot learn before the amount of transitions inside
        # the replay buffer is larger than the batch size
        if self.replaybuffer.mem_cntr < self.batch_size:
            return
        
        self.update_actor_count += 1
            
        # get transition samples from replayer buffer
        states, actions, rewards, next_states, dones = self.sample_replaybuffer()
        # update the critic network
        self.update_critic(states, actions, rewards, next_states, dones)

        if self.update_actor_count % self.update_actor_freq == 0:
            # update the actor network
            self.update_actor(states)
            # update target network parameters
            self.update_target_network()
        
    def update_critic(self, states, actions, rewards, next_states, dones):
        with T.no_grad():
            # Select action according to policy and add clipped noise
            noise = (T.randn_like(actions) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.target_actor(next_states) + noise).clamp(-self.max_action, self.max_action)
            
            # Compute the target Q value and use the minimum of them
            target_Q1, target_Q2 = self.target_critic(next_states, next_action)
            target_Q = T.min(target_Q1, target_Q2)
            # Bellman target, computed in one vectorised step (kept out of the graph by no_grad)
            target_Q = (rewards.view(self.batch_size, 1)
                        + self.gamma * target_Q * dones.view(self.batch_size, 1))
        
        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(states, actions)
        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def update_actor(self, states):
        # here we use the output from the actor network, NOT the noisy action,
        # because exploration noise is only needed when actually interacting
        # with the environment
        actions = self.actor(states)
        actor_loss = - self.critic.q1_forward(states, actions).mean()
        
        # Optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        # use the local tau so that update_target_network(tau=1) performs the
        # hard copy requested at initialisation
        tau = self.tau if tau is None else tau

        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
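A hypothetical training loop for the Agent above (not part of the original source). The default state_dim=8 / action_dim=2 match LunarLanderContinuous-v2, which is assumed here purely for illustration; the Actor, Critic, ReplayBuffer and OUNoise helpers are the ones referenced in the class.

import gym

env = gym.make('LunarLanderContinuous-v2')
agent = Agent(env, alpha=1e-3, beta=1e-3, tau=0.005, gamma=0.99)

for episode in range(500):
    state, done = env.reset(), False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.step()                        # learns only once a full batch is stored
        state = next_state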
Code Example #6
class SAC:
    def __init__(self,
                 env,
                 test_env,
                 actor_critic=core.MLPActorCritic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=4000,
                 epochs=100,
                 replay_size=int(1e6),
                 gamma=0.99,
                 polyak=0.995,
                 entropy_tuning: bool = False,
                 lr=1e-3,
                 alpha=0.2,
                 batch_size=100,
                 start_steps=10000,
                 update_after=1000,
                 update_every=50,
                 act_noise=0.01,
                 max_ep_len=1000,
                 device='cpu',
                 num_test_episodes=1,
                 save_freq=2,
                 log_mode: List[str] = ["stdout"],
                 log_key: str = "timestep",
                 save_model: str = "checkpoints",
                 checkpoint_path: str = None,
                 log_interval: int = 10,
                 load_model=False,
                 dir_prefix: str = None):

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.seed = seed
        self.env = env
        self.test_env = test_env
        self.obs_dim = env.observation_space.shape
        self.act_dim = env.action_space.shape[0]
        self.act_limit = env.action_space.high[0]
        self.replay_size = replay_size
        self.batch_size = batch_size
        #self.noise_scale = act_noise

        self.load_model = load_model
        self.log_key = log_key
        #self.logdir = logdir
        self.save_model = save_model
        self.checkpoint_path = checkpoint_path
        #self.log_interval = log_interval
        #self.logger = Logger(logdir=logdir, formats=[*log_mode])

        #self.pi_lr = pi_lr
        #self.q_lr = q_lr
        self.lr = lr
        self.ac_kwargs = ac_kwargs

        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.max_ep_len = max_ep_len

        self.gamma = gamma
        self.polyak = polyak
        self.alpha = alpha
        self.entropy_tuning = entropy_tuning

        self.start_steps = start_steps
        self.update_after = update_after
        self.update_every = update_every
        self.save_freq = save_freq

        self.action_time_step = 0  #no. of updates
        self.current_timestep = 0
        self.current_epoch = 0
        self.dir_prefix = dir_prefix

        # Store the weights and scores in a new directory
        self.directory = "logs/sac_single_Agent_{}{}/".format(
            self.dir_prefix,
            time.strftime("%Y%m%d-%H%M%S"))  # appends the timedate
        os.makedirs(self.directory, exist_ok=True)
        self.model_dir = os.path.join(self.directory, 'model_param/')
        os.makedirs(self.model_dir)

        # Tensorboard writer object
        self.writer = SummaryWriter(log_dir=self.directory + 'tensorboard/')
        print("Logging to {}\n".format(self.directory + 'tensorboard/'))

        #self.test_env = env
        self.num_test_episodes = num_test_episodes

        # Create actor-critic module and target networks
        self.ac = actor_critic(self.env.observation_space,
                               self.env.action_space,
                               **ac_kwargs).to(self.device)
        self.ac_targ = deepcopy(self.ac).to(self.device)
        #actually no need of saving the policy parameters as target above, since we do not need any target Actor in SAC.

        if self.load_model:
            if os.path.exists(self.checkpoint_path):
                self.ac.load_state_dict(
                    torch.load(os.path.abspath(self.checkpoint_path)))
                self.ac_targ = deepcopy(self.ac).to(self.device)

        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        # List of parameters for both Q-networks (save this for convenience)
        self.q_params = itertools.chain(self.ac.q1.parameters(),
                                        self.ac.q2.parameters())

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
        self.pi_scheduler = StepLR(self.pi_optimizer, step_size=1, gamma=0.96)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.q_scheduler = StepLR(self.q_optimizer, step_size=1, gamma=0.96)

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=self.act_dim,
                                          size=self.replay_size)

        # from https://github.com/SforAiDl/genrl/blob/master/genrl/deep/agents/sac/sac.py
        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = Adam([self.log_alpha], lr=self.lr)

        #else:
        #    self.alpha=self.alpha

        # no need to set action scales here:
        # action_limit is obtained directly inside the MLPActorCritic class, and
        # action_bias is not needed because in the CityLearn environment actions are
        # bounded to [-1/3, +1/3], so the bias sums to 0

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        #initialize logs
        self.empty_logs()

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        print(var_counts)
        self.logs["var_counts"] = var_counts
        print(
            colorize(
                '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
                var_counts,
                'green',
                bold=True))
        self.writer.add_scalar('Number of parameters/pi', var_counts[0])
        self.writer.add_scalar('Number of parameters/q1', var_counts[1])
        self.writer.add_scalar('Number of parameters/q2', var_counts[2])
        #print(colorize(msg, color, bold=True))

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from a pretrained model
        """
        self.ac.q1.load_state_dict(weights["q1_weights"])
        self.ac.q2.load_state_dict(weights["q2_weights"])
        self.ac.pi.load_state_dict(weights["policy_weights"])

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["q1_loss"] = []
        self.logs["q2_loss"] = []
        self.logs["policy_loss"] = []
        self.logs["alpha_loss"] = []
        self.logs["var_counts"] = ()

    @staticmethod
    def safe_mean(log: List[float]):
        """
        Returns 0 if there are no elements in the log
        """
        return np.mean(log) if len(log) > 0 else 0

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "q1_loss": safe_mean(self.logs["q1_loss"]),
            "q2_loss": safe_mean(self.logs["q2_loss"]),
            "alpha_loss": safe_mean(self.logs["alpha_loss"]),
        }

        self.empty_logs()
        return logs

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = self.ac.q1(o, a)
        q2 = self.ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = self.ac.pi(o2)

            # Target Q-values
            q1_pi_targ = self.ac_targ.q1(o2, a2)
            q2_pi_targ = self.ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + self.gamma * (1 - d) * (q_pi_targ -
                                                 self.alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        #logging into tensorboard
        self.writer.add_scalar('loss/Critic1_loss', loss_q1,
                               self.current_timestep)
        self.writer.add_scalar('loss/Critic2_loss', loss_q2,
                               self.current_timestep)

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        self.logs["q1_loss"].append(loss_q1.item())
        self.logs["q2_loss"].append(loss_q2.item())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self, data):
        o = data['obs']
        pi, logp_pi = self.ac.pi(o)
        q1_pi = self.ac.q1(o, pi)
        q2_pi = self.ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (self.alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        # alpha loss
        alpha_loss = torch.tensor(0.0).to(self.device)

        if self.entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (logp_pi + self.target_entropy).detach()).mean()
            self.writer.add_scalar('loss/entropy_tuning_loss', alpha_loss,
                                   self.current_timestep)
            self.logs["alpha_loss"].append(alpha_loss.item())
        else:
            alpha_loss = 0

        #logging into tensorboard
        self.writer.add_scalar('loss/Actor_loss', loss_pi,
                               self.current_timestep)

        self.logs["policy_loss"].append(loss_pi.item())

        return loss_pi, alpha_loss, pi_info

    def update(self, data):
        # First run one gradient descent step for Q1 and Q2
        self.q_optimizer.zero_grad()
        loss_q, q_info = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in self.q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi, alpha_loss, pi_info = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        if self.entropy_tuning:
            # Next run one gradient descent step for alpha.
            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()

            self.writer.add_scalar('entropy_tuning_param/alpha', self.alpha,
                                   self.current_timestep)

        # Unfreeze Q-network so you can optimize it at the next SAC step.
        for p in self.q_params:
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(),
                                 self.ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def reset_action_tracker(self):
        self.action_tracker = []

    def reset_reward_tracker(self):
        self.reward_tracker = []

    def get_action(self, o, deterministic=False):
        return self.ac.act(
            torch.as_tensor(o, dtype=torch.float32).to(self.device),
            deterministic)

    def eval_agent(self, test=True):
        if test == True:
            eval_env = self.test_env
            t_env = 'testing environment'
        else:
            eval_env = deepcopy(self.env)
            t_env = 'training environment'
        ep_rews = []
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = eval_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                # o = (o-self.replay_buffer.obs_buf_min) /(self.replay_buffer.obs_buf_max - self.replay_buffer.obs_buf_min)
                nom = o - self.replay_buffer.obs_buf_min
                denom = self.replay_buffer.obs_buf_max - self.replay_buffer.obs_buf_min
                denom[denom == 0] = 1
                o = nom / denom
                o, r, d, _ = eval_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            ep_rews.append(ep_ret)

        print("Evaluating on the {} for {} episode, Mean Reward: {}".format(
            t_env, self.num_test_episodes, np.mean(ep_rews)))
        #print('Final cost',eval_env.cost())

        self.writer.add_scalar("Scores/ramping",
                               eval_env.cost()['ramping'], self.current_epoch)
        self.writer.add_scalar("Scores/1-load_factor",
                               eval_env.cost()['1-load_factor'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/average_daily_peak",
                               eval_env.cost()['average_daily_peak'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/peak_demand",
                               eval_env.cost()['peak_demand'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/net_electricity_consumption",
                               eval_env.cost()['net_electricity_consumption'],
                               self.current_epoch)
        self.writer.add_scalar("Scores/total",
                               eval_env.cost()['total'], self.current_epoch)
        self.writer.add_scalar("Scores/test_episode_reward", np.mean(ep_rews),
                               self.current_epoch)

        return np.mean(ep_rews), eval_env.cost()['total']

    def learn(self) -> None:

        ep_num = 0
        best_score = 1.5
        return_per_episode = []

        # Prepare for interaction with environment
        total_steps = self.steps_per_epoch * self.epochs
        epoch_start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0
        #self.current_epoch=1
        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):

            self.current_timestep = t  #for logging

            # if t > 8759 update minmax of buffer and use it to normalize
            # so we collect data for one year and compute the min/max of observations and rewards

            if t == self.start_steps:
                self.replay_buffer.collect_minmax()

            # Until start_steps have elapsed, randomly sample actions
            # from a uniform distribution for better exploration. Afterwards,
            # use the learned policy.
            if t > self.start_steps:
                #print(t)
                a = self.get_action(o)
            else:
                a = self.env.action_space.sample()

            # Step the env
            o2, r, d, _ = self.env.step(a)
            self.writer.add_scalar('Rewards/single_Agent_reward', r,
                                   self.current_timestep)

            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == self.max_ep_len else d

            # Store experience to replay buffer
            self.replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            # End of trajectory handling
            if d or (ep_len == self.max_ep_len):
                #print('End of trajectory: Episode return is', ep_ret )
                #print('Cost function is', self.env.cost())
                ep_num += 1
                return_per_episode.append(ep_ret)
                self.writer.add_scalar('Rewards/return_per_episode', ep_ret,
                                       ep_num)

                o, ep_ret, ep_len = self.env.reset(), 0, 0

            # Update handling
            if t >= self.update_after and t % self.update_every == 0:
                #if t >= self.update_after: #instead of updating for some fixed steps, update for every step
                #print('updating')
                for _ in range(self.update_every):
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    #print(batch)
                    #print(batch.size)
                    #sys.exit()
                    self.update(data=batch)

            #End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch

                self.current_epoch += 1
                self.pi_scheduler.step()
                self.q_scheduler.step()

                print('Epoch:', epoch, 'Policy_LR:',
                      self.pi_scheduler.get_lr(), 'Critic_LR:',
                      self.q_scheduler.get_lr())

                print('time step: {} , epoch: {} ,time elapsed: {} '.format(
                    t + 1, epoch,
                    time.time() - epoch_start_time))
                train_mean_return, test_score = self.eval_agent(test=False)

                #test_mean_return=self.eval_agent(test=True)
                #print('time_per_epoch',time.time()-epoch_start_time)
                epoch_start_time = time.time()
                print('\n')

                # Save model
                if (epoch % self.save_freq == 0):
                    if test_score < best_score:
                        best_score = test_score
                        print(
                            'Better evaluation score and hence saving model to {}'
                            .format(
                                os.path.join(self.directory, 'model_param/')))
                        torch.save(
                            self.ac.state_dict(),
                            os.path.join(self.directory, 'model_param/') +
                            'checkpoint.pt')

            if (t + 1) % self.steps_per_epoch == 0:
                self.action_time_step = 0

            else:
                self.action_time_step += 1

        return epoch, train_mean_return * (self.batch_size)
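eval_agent above normalises observations with the replay buffer's running min/max and guards against zero-range features before dividing. A standalone NumPy sketch of that guard (illustration only, with made-up values):

import numpy as np

obs = np.array([3.0, 7.0, 5.0])
obs_min = np.array([0.0, 5.0, 5.0])
obs_max = np.array([10.0, 9.0, 5.0])        # third feature is constant

denom = obs_max - obs_min
denom[denom == 0] = 1                       # avoid division by zero for constant features
print((obs - obs_min) / denom)              # [0.3 0.5 0. ]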
Code Example #7
class DQNAgent:
    def __init__(self, state_dim, action_dim, tau, epsilon, mem_size,
                 batch_size, gamma, lr):
        self.sess = tf.Session()

        self.s = tf.placeholder(tf.float32, [None, *state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, *state_dim], 'next_state')
        self.t = tf.placeholder(tf.float32, [
            None,
        ], 'target')
        self.action_in = tf.placeholder(tf.int32, [
            None,
        ], 'action')
        self.action_dim = action_dim
        self.action = tf.one_hot(self.action_in, depth=action_dim)

        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.memory = ReplayBuffer(max_size=mem_size)

        # set the exploration params
        self.epsilon = epsilon
        self.decay_steps = 5000
        self.decay_inc = (epsilon - 0.1) / 4000

        # replace the target network params
        self.replace_counter = 0
        self.replace_iter = 300

        self.Q_net = QNet(sess=self.sess,
                          lr=1e-3,
                          action_dim=action_dim,
                          S=self.s,
                          S_=self.s_,
                          tau=tau)

        self.q_eval = tf.reduce_sum(tf.multiply(self.action, self.Q_net.q),
                                    axis=1)
        self.loss = tf.reduce_mean(tf.squared_difference(self.q_eval, self.t))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        if s.ndim < 2:
            s = [s]
        q_values = self.sess.run(self.Q_net.q, {self.s: s})
        a_best = np.argmax(q_values)
        a = a_best if np.random.random() > self.epsilon else np.random.randint(
            self.action_dim)
        return a

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample(
            self.batch_size)

        # use the target network to select the best action for next state
        action_next_target = np.argmax(self.sess.run(self.Q_net.q_,
                                                     {self.s_: next_states}),
                                       axis=1)

        # use the eval network to obtain the next state value and the target
        q_next = self.sess.run(self.q_eval, {
            self.s: next_states,
            self.action_in: action_next_target
        })
        q_next[dones] = 0
        target = rewards + self.gamma * q_next

        loss, _ = self.sess.run(
            [self.loss, self.train_op], {
                self.s: states,
                self.s_: next_states,
                self.action_in: actions,
                self.t: target
            })
        if self.replace_counter % self.replace_iter == 0:
            self.sess.run(self.Q_net.replace)

        self.epsilon = max(0.1, self.epsilon - self.decay_inc)
        self.replace_counter += 1
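A hypothetical driver for the DQNAgent above (not part of the original source). CartPole-v1 is assumed purely for illustration (4-dimensional state, two discrete actions); the QNet and ReplayBuffer helpers are the ones referenced in the class.

import gym

env = gym.make('CartPole-v1')
agent = DQNAgent(state_dim=env.observation_space.shape,
                 action_dim=env.action_space.n,
                 tau=0.01, epsilon=1.0, mem_size=50000,
                 batch_size=32, gamma=0.99, lr=1e-3)

total_steps = 0
for episode in range(300):
    s, done = env.reset(), False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        agent.store(s, a, r, s_, done)
        total_steps += 1
        if total_steps >= agent.batch_size:   # wait until one batch can be sampled
            agent.learn()
        s = s_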
Code Example #8
class Agent(object):
    def __init__(self, env, alpha, beta, tau, gamma,
                 max_replay_size = 1000000, batch_size=64,
                 l1_dim = 400, l2_dim = 300, state_dim = 8, action_dim = 2):

        self.env = env
        self.alpha = alpha # learning rate for actor network
        self.beta = beta # learning rate for critic network
        self.tau = tau # polyak averaging parameter
        self.gamma = gamma # discount factor of reward

        self.max_replay_size = max_replay_size
        self.batch_size = batch_size
        self.l1_dim = l1_dim
        self.l2_dim = l2_dim
        self.state_dim = state_dim
        self.action_dim = action_dim

        # build the agent
        self.build_agent()
        # with "tau = 1", we initialize the target network the same as the main network
        self.update_target_network(tau = 1)

    def build_agent(self):
        # build the actor-critic network and also their target networks
        self.actor = Actor(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.alpha)
        self.target_actor = copy.deepcopy(self.actor)
        self.critic = Critic(self.state_dim, self.action_dim, self.l1_dim, self.l2_dim, self.beta)
        self.target_critic = copy.deepcopy(self.critic)

        # build the replaybuffer
        self.replaybuffer = ReplayBuffer(self.max_replay_size, self.state_dim, self.action_dim)
        # build the OUNoise for action selection 
        self.noise = OUNoise(self.action_dim)

    def act(self, state):
        state = T.tensor(state, dtype=T.float)
        action = self.actor(state)
        noisy_action = action + T.tensor(self.noise(), dtype=T.float)
        return noisy_action.cpu().detach().numpy()

    # store transition into the replay buffer
    def remember(self, state, action, reward, next_state, done):
        self.replaybuffer.store(state, action, reward, next_state, done)

    def sample_replaybuffer(self):
        # sample from the ReplayBuffer
        states, actions, rewards, next_states, dones = self.replaybuffer.sample(self.batch_size)
        states = T.tensor(states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.float)
        rewards = T.tensor(rewards, dtype=T.float)
        next_states = T.tensor(next_states, dtype=T.float)
        dones = T.tensor(dones)

        return states, actions, rewards, next_states, dones

    def step(self):
        # we cannot learn before the amount of transitions inside
        # the replay buffer is larger than the batch size
        if self.replaybuffer.mem_cntr < self.batch_size:
            return
            
        # get transition samples from replayer buffer
        states, actions, rewards, next_states, dones = self.sample_replaybuffer()
        # update the critic network
        self.update_critic(states, actions, rewards, next_states, dones)
        # update the actor network
        self.update_actor(states)
        # update target network parameters
        self.update_target_network()
        
    def update_critic(self, states, actions, rewards, next_states, dones):
        # update the critic network
        target_actions = self.target_actor(next_states)
        target_critic_values = self.target_critic(next_states, target_actions)
        critic_values = self.critic(states, actions)

        # Bellman target, computed in one vectorised step and detached so that no
        # gradients flow back into the target networks
        target_critic_values = (rewards.view(self.batch_size, 1)
                                + self.gamma * target_critic_values * dones.view(self.batch_size, 1)).detach()

        critic_loss = F.mse_loss(target_critic_values, critic_values)

        # In PyTorch, we need to set the gradients to zero before starting backpropagation
        # because PyTorch accumulates the gradients on subsequent backward passes
        # optimize the critic
        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

    def update_actor(self, states):
        # here we use the output from the actor network, NOT the noisy action,
        # because exploration noise is only needed when actually interacting
        # with the environment

        actions = self.actor(states)

        # NOTICE: we do not split the gradient by hand into dQ/da and da/dtheta
        # as written in the original paper; autograd differentiates the combined
        # objective directly with respect to the actor parameters
        actor_loss = - self.critic(states, actions).mean()

        # optimize the actor
        self.actor.optimizer.zero_grad()
        actor_loss.backward()
        self.actor.optimizer.step()

    def update_target_network(self, tau=None):
        # use the local tau so that update_target_network(tau=1) performs the
        # hard copy requested at initialisation
        tau = self.tau if tau is None else tau

        # polyak averaging to update the target critic network
        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # polyak averaging to update the target actor network
        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
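A minimal PyTorch illustration (not from the original source) of the Polyak rule used in update_target_network: tau = 1 reduces to the hard copy requested at initialisation, while the usual small tau only nudges the target towards the main parameters.

import torch

main_w = torch.tensor(2.0)
target_w = torch.tensor(0.0)
for tau in (1.0, 0.005):
    updated = tau * main_w + (1 - tau) * target_w
    print(tau, updated.item())    # 1.0 -> 2.0 (hard copy), 0.005 -> 0.01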
Code Example #9
class DQNAgent:
    def __init__(self, state_dim, action_dim, f1, tau, epsilon, mem_size,
                 batch_size, gamma, lr):
        self.sess = tf.Session()

        self.s = tf.placeholder(tf.float32, [None, *state_dim], 'state')
        self.s_ = tf.placeholder(tf.float32, [None, *state_dim], 'next_state')
        self.t = tf.placeholder(tf.float32, [
            None,
        ], 'target')
        self.action_in = tf.placeholder(tf.int32, [
            None,
        ], 'action')

        self.action_dim = action_dim
        self.action = tf.one_hot(self.action_in, depth=action_dim)

        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.decay_steps = 5000
        self.decay_inc = (epsilon - 0.1) / 4000
        self.replace_counter = 0
        self.memory = ReplayBuffer(max_size=mem_size)
        self.Q_net = QNet(sess=self.sess,
                          lr=1e-3,
                          action_dim=action_dim,
                          f1=f1,
                          S=self.s,
                          S_=self.s_,
                          tau=tau)

        self.q_action = tf.reduce_sum(tf.multiply(self.action, self.Q_net.q),
                                      axis=1)

        self.error = tf.abs(self.q_action - self.t)
        self.loss = tf.reduce_mean(tf.square(self.error))
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        if s.ndim < 2:
            s = [s]
        q_values = self.sess.run(self.Q_net.q, {self.s: s})
        a_best = np.argmax(q_values)
        a = a_best if np.random.random() > self.epsilon else np.random.randint(
            self.action_dim)
        return a

    def store(self, s, a, r, s_, done):
        self.memory.store(s, a, r, s_, done)

    def learn(self):
        states, actions, rewards, next_states, dones = self.memory.sample(
            self.batch_size)
        q_next = self.sess.run(self.Q_net.q_, {self.s_: next_states})
        q_next[dones] = np.zeros([self.action_dim])
        target = rewards + self.gamma * np.max(q_next, axis=1)

        errors, _ = self.sess.run(
            [self.error, self.train_op], {
                self.s: states,
                self.s_: next_states,
                self.action_in: actions,
                self.t: target
            })
        if self.replace_counter % 300 == 0:
            self.sess.run(self.Q_net.replace)
        self.epsilon = max(0.1, self.epsilon - self.decay_inc)
        self.replace_counter += 1
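A small numeric check (illustration only) of the linear epsilon schedule used by the agent above: with decay_inc = (epsilon - 0.1) / 4000 the exploration rate reaches its 0.1 floor after roughly 4000 learn() calls (the separate decay_steps attribute is not used in the decrement).

epsilon = 1.0
decay_inc = (epsilon - 0.1) / 4000
for _ in range(4000):
    epsilon = max(0.1, epsilon - decay_inc)
print(round(epsilon, 6))    # 0.1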
Code Example #10
def main(args):

    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    data = np.load('./official_obs_scaler.npz')
    obs_mean, obs_std = data['mean'], data['std']

    # 1.Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2.Create actor, critic, EnvSampler() and PPO.
    if 'L2M2019Env' in args.env_name:
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(obs_dim,
                                  act_dim,
                                  hidden_sizes=args.hidden_sizes).to(device)

    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    gac = GAC(actor_critic,
              replay_buffer,
              device=device,
              gamma=args.gamma,
              alpha_start=args.alpha_start,
              alpha_min=args.alpha_min,
              alpha_max=args.alpha_max)

    def act_encoder(y):
        # y = [min, max] ==> x = [-1, 1]
        # if args.env_name == 'L2M2019Env':
        #     return y
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        # x = [-1, 1] ==> y = [min, max]
        # if args.env_name == 'L2M2019Env':
        #     return np.abs(x)
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def get_observation(env):
        obs = np.array(env.get_observation()[242:])

        obs = (obs - obs_mean) / obs_std

        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T

        return np.append(obs, v_tgt)

    def get_reward(env):
        reward = 10.0

        # Reward for not falling down
        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T

        vel_penalty = np.linalg.norm(v_body - v_tgt)

        muscle_penalty = 0
        for muscle in sorted(state_desc['muscles'].keys()):
            muscle_penalty += np.square(
                state_desc['muscles'][muscle]['activation'])

        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)

        if vel_penalty < 0.3:
            ret_r += 10

        return ret_r

    # 3.Start training.
    def get_action(o, deterministic=False):
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = actor_critic.act(o, deterministic)
        return a

    def test_agent():
        test_ret, test_len = 0, 0
        for j in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(o, True)
                a = act_decoder(a)

                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d: break

                o = get_observation(test_env)

            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
        if t <= args.start_steps:
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)

        a = act_decoder(a)

        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d: break

        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)

        d = False if ep_len == args.max_ep_len else d

        # if not d:
        #     new_o, new_r, new_o2 = generate_success(o, o2)
        #     replay_buffer.store(new_o, a, new_r * args.reward_scale, new_o2, d)

        # Store experience to replay buffer
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)

        o = o2
        if d or (ep_len == args.max_ep_len):
            _, ep_len = env.reset(obs_as_dict=False), 0
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
            gac.update_beta()
            print(
                "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                .format(loss_a, loss_c, alpha, gac.beta))

        # End of epoch handling
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print(
                "-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic
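A standalone NumPy check (illustration only) that the act_encoder/act_decoder pair defined above maps environment actions to [-1, 1] and back without distortion; the bounds used here are arbitrary placeholders.

import numpy as np

act_low, act_high = np.array([-0.5, 0.0]), np.array([0.5, 1.0])

def act_encoder(y):        # y in [low, high] -> x in [-1, 1]
    return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

def act_decoder(x):        # x in [-1, 1] -> y in [low, high]
    return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

y = np.array([0.25, 0.75])
assert np.allclose(act_decoder(act_encoder(y)), y)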