Example #1
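This appears to be the constructor of a two-player NFSP-style agent in PyTorch: each player gets a dueling double-DQN pair (evaluation and target networks) trained from a plain or n-step replay buffer, an average-policy network trained from a reservoir buffer, an exponentially decaying epsilon schedule, and separate Adam optimizers for the RL and supervised-learning parts.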
    def __init__(self, env, epsilon_init, decay, epsilon_min, update_freq, sl_lr, rl_lr, sl_capa, rl_capa, n_step, gamma, eta, max_episode, negative, rl_start, sl_start, train_freq, rl_batch_size, sl_batch_size, render, device):
        self.env = env
        self.epsilon_init = epsilon_init
        self.decay = decay
        self.epsilon_min = epsilon_min
        self.update_freq = update_freq
        self.sl_lr = sl_lr
        self.rl_lr = rl_lr
        self.sl_capa = sl_capa
        self.rl_capa = rl_capa
        self.n_step = n_step
        self.gamma = gamma
        self.eta = eta
        self.max_episode = max_episode
        self.negative = negative
        self.sl_start = sl_start
        self.rl_start = rl_start
        self.train_freq = train_freq
        self.rl_batch_size = rl_batch_size
        self.sl_batch_size = sl_batch_size
        self.render = render
        self.device = device

        self.observation_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n
        self.sl_p1_buffer = reservoir_buffer(self.sl_capa)
        self.sl_p2_buffer = reservoir_buffer(self.sl_capa)
        if self.n_step > 1:
            self.rl_p1_buffer = n_step_replay_buffer(self.rl_capa, self.n_step, self.gamma)
            self.rl_p2_buffer = n_step_replay_buffer(self.rl_capa, self.n_step, self.gamma)
        else:
            self.rl_p1_buffer = replay_buffer(self.rl_capa)
            self.rl_p2_buffer = replay_buffer(self.rl_capa)
        self.p1_dqn_eval = dueling_ddqn(self.observation_dim, self.action_dim).to(self.device)
        self.p1_dqn_target = dueling_ddqn(self.observation_dim, self.action_dim).to(self.device)
        self.p2_dqn_eval = dueling_ddqn(self.observation_dim, self.action_dim).to(self.device)
        self.p2_dqn_target = dueling_ddqn(self.observation_dim, self.action_dim).to(self.device)
        self.p1_dqn_target.load_state_dict(self.p1_dqn_eval.state_dict())
        self.p2_dqn_target.load_state_dict(self.p2_dqn_eval.state_dict())

        self.p1_policy = policy(self.observation_dim, self.action_dim).to(self.device)
        self.p2_policy = policy(self.observation_dim, self.action_dim).to(self.device)

        self.epsilon = lambda x: self.epsilon_min + (self.epsilon_init - self.epsilon_min) * math.exp(-1. * x / self.decay)

        self.p1_dqn_optimizer = torch.optim.Adam(self.p1_dqn_eval.parameters(), lr=self.rl_lr)
        self.p2_dqn_optimizer = torch.optim.Adam(self.p2_dqn_eval.parameters(), lr=self.rl_lr)
        self.p1_policy_optimizer = torch.optim.Adam(self.p1_policy.parameters(), lr=self.sl_lr)
        self.p2_policy_optimizer = torch.optim.Adam(self.p2_policy.parameters(), lr=self.sl_lr)
Example #2
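A TensorFlow 1.x DDPG constructor: an MLP actor-critic under the 'main' scope with a 'target' copy, a stop-gradient Bellman target, a policy loss of -E[Q(s, pi(s))], a 0.5-scaled squared TD error for the critic, OU exploration noise, and Polyak target updates (tau = 0.995) alongside a hard target initialization.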
    def __init__(self):
        self.sess = tf.Session()
        self.memory = replay_buffer(max_length=1e5)
        self.tau = 0.995
        self.gamma = 0.99
        self.state_size = 33
        self.output_size = 4
        self.action_limit = 1.0
        self.hidden = [400, 300]
        self.batch_size = 100
        self.pi_lr = 1e-4
        self.q_lr = 1e-4
        self.noise = OU_noise(self.output_size, 1)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
                self.x_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
                self.x2_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.target = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(self.q_pi)
        self.v_loss = tf.reduce_mean((self.q - self.target)**2) * 0.5
        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(
            self.pi_loss, var_list=cr.get_vars('main/pi'))
        self.v_train = self.v_optimizer.minimize(
            self.v_loss, var_list=cr.get_vars('main/q'))

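        # Polyak averaging of the target network: target <- tau * target + (1 - tau) * main,
        # so with tau = 0.995 the target tracks the main network slowly.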
        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #3
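The same NFSP-style setup as Example #1, but for a single agent: observation and action dimensions are passed in directly instead of being read from a Gym environment, and use_raw/eval_mode flags plus a choose_policy_mode() call (presumably defined elsewhere in the class) control how the agent acts at evaluation time.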
    def __init__(self, observation_dim, action_dim, epsilon_init, decay,
                 epsilon_min, update_freq, sl_lr, rl_lr, sl_capa, rl_capa,
                 n_step, gamma, eta, rl_start, sl_start, train_freq,
                 rl_batch_size, sl_batch_size, device, eval_mode):
        self.observation_dim = observation_dim
        self.action_dim = action_dim
        self.epsilon_init = epsilon_init
        self.decay = decay
        self.epsilon_min = epsilon_min
        self.update_freq = update_freq
        self.sl_lr = sl_lr
        self.rl_lr = rl_lr
        self.sl_capa = sl_capa
        self.rl_capa = rl_capa
        self.n_step = n_step
        self.gamma = gamma
        self.eta = eta
        self.sl_start = sl_start
        self.rl_start = rl_start
        self.train_freq = train_freq
        self.rl_batch_size = rl_batch_size
        self.sl_batch_size = sl_batch_size
        self.device = device
        self.use_raw = False
        self.eval_mode = eval_mode

        self.sl_buffer = reservoir_buffer(self.sl_capa)
        if self.n_step > 1:
            self.rl_buffer = n_step_replay_buffer(self.rl_capa, self.n_step,
                                                  self.gamma)
        else:
            self.rl_buffer = replay_buffer(self.rl_capa)
        self.dqn_eval = dueling_ddqn(self.observation_dim,
                                     self.action_dim).to(self.device)
        self.dqn_target = dueling_ddqn(self.observation_dim,
                                       self.action_dim).to(self.device)
        self.dqn_target.load_state_dict(self.dqn_eval.state_dict())

        self.policy = policy(self.observation_dim,
                             self.action_dim).to(self.device)

        self.epsilon = lambda x: self.epsilon_min + (
            self.epsilon_init - self.epsilon_min) * math.exp(-1. * x / self.decay)

        self.dqn_optimizer = torch.optim.Adam(self.dqn_eval.parameters(),
                                              lr=self.rl_lr)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.sl_lr)

        self.choose_policy_mode()

        self.count = 0
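Example #4
A TensorFlow 1.x constructor for what looks like a distributional TD3 variant built with dqpg_td3_actor_critic: twin quantile critics over 64 supports, target-policy smoothing noise, a risk-weighted policy loss via quantile_weight, log-cosh quantile-regression critic losses, and value, policy, and Polyak target updates chained through control dependencies.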
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 64
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = env_set['batch_size']
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.kappa = 1.0
        self.risk_factor = -1.0
        self.random_risk = False
        self.target_noise = 0.2
        self.noise_clip = 0.5
        tf.set_random_seed(10)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
        self.risk_factor_ph = tf.placeholder(tf.float32)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi, self.q2_pi = cr.dqpg_td3_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        with tf.variable_scope('target'):
            _, _, _, self.q1_pi_targ, self.q2_pi_targ = cr.dqpg_td3_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
        self.backup = tf.stop_gradient(tf.expand_dims(self.r_ph,axis=1)\
                                       + self.gamma*tf.expand_dims(1-self.d_ph,axis=1)*self.min_q_targ)
        self.quantile_weight = 1.0 - self.risk_factor_ph*\
            (2.0*tf.reshape(tf.range(0.5/self.support_size, 1, 1 / self.support_size), [1, self.support_size]) - 1.0)
        self.pi_loss = -tf.reduce_mean(
            tf.reduce_mean(self.q1_pi * self.quantile_weight))

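        # Quantile-regression critic loss: tile the backup and the predicted quantiles, then weight
        # a log-cosh error by |tau - 1{bellman_error < 0}| (the usual quantile-regression weighting).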
        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        tau = tf.reshape(
            tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
            [1, self.support_size])
        tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])

        theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2),
                                  [1, 1, self.support_size])
        #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
        bellman_errors = logit_valid_tile - theta_loss_tile
        Logcosh = bellman_errors + tf.math.softplus(
            -2. * bellman_errors) - tf.log(2.)
        Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
            bellman_errors < 0))) * Logcosh
        self.v1_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2),
                                  [1, 1, self.support_size])
        #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
        bellman_errors = logit_valid_tile - theta_loss_tile
        Logcosh = bellman_errors + tf.math.softplus(
            -2. * bellman_errors) - tf.log(2.)
        Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
            bellman_errors < 0))) * Logcosh
        self.v2_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

        self.v_loss = self.v1_loss + self.v2_loss

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.value_ops = [self.v_loss, self.train_value_op]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
        print(
            self.sess.run(self.quantile_weight,
                          feed_dict={self.risk_factor_ph: self.risk_factor}))
        self.saver = tf.train.Saver()
Example #5
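A TensorFlow 1.x Soft Actor-Critic constructor with a separate state-value network: twin Q critics, an entropy-regularized policy loss with a fixed temperature alpha, stop-gradient Q and V backups, and policy, value, and Polyak target updates chained through control dependencies.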
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        self.num_worker = 20
        self.noise = OU_noise(self.output_size, self.num_worker)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
                cr.sac_mlp_actor_critic(
                    x=self.x_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, self.v_targ = \
                cr.sac_mlp_actor_critic(
                    x=self.x2_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

        self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
        self.q_backup = tf.stop_gradient(self.r_ph + self.gamma *
                                         (1 - self.d_ph) * self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q_pi -
                                         self.alpha * self.logp_pi)

        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
        self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2)
        self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2)
        self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2)
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                      var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1,
            self.q2, self.v, self.logp_pi, self.train_pi_op,
            self.train_value_op, self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
Example #6
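A distributional actor-critic in TensorFlow 1.x (dipg_mlp_actor_critic) whose quantile fractions are fed in through the tau_ph placeholder: the critic is trained with an asymmetric quantile Huber loss, the policy gradient is divided by the support size before being applied, and the target networks are tracked with Polyak averaging.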
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 8
        self.target_update_tau = 0.995
        self.gamma = 0.99
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = 1e-4
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.support_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.dipg_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            _, _, self.q_pi_targ = cr.dipg_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                pi_q_noise=self.target_noise)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.backup = tf.stop_gradient(tf.tile(tf.expand_dims(self.r_ph,axis=1),[1,self.support_size])\
                    + self.gamma*tf.tile(tf.expand_dims(1-self.d_ph,axis=1),[1,self.support_size])*self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(tf.reduce_mean(self.q_pi))

        self.clip_tau = 5e-2
        theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                                  [1, 1, self.support_size])
        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        Huber_loss = tf.losses.huber_loss(logit_valid_tile,
                                          theta_loss_tile,
                                          reduction=tf.losses.Reduction.NONE)
        tau = tf.tile(tf.expand_dims(self.tau_ph, axis=2),
                      [1, 1, self.support_size])
        bellman_errors = logit_valid_tile - theta_loss_tile
        Loss = (
            tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) *
            Huber_loss)
        self.v_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1)))

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        grad = self.pi_optimizer.compute_gradients(self.pi_loss,
                                                   var_list=self.pi_params)
        grad = [(gr / self.support_size, var) for gr, var in grad]
        self.train_pi_op = self.pi_optimizer.apply_gradients(grad)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.v_loss, var_list=self.q_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(
                    v_targ, self.target_update_tau * v_targ +
                    (1 - self.target_update_tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])
        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
Example #7
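A TensorFlow 1.x TD3 constructor in which target-policy smoothing is written out by hand: clipped Gaussian noise is added to the target policy's action under a reused 'target' scope before the twin target critics are evaluated, and the minimum of the two target Q values forms the backup.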
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise = OU_noise(self.output_size, 1)
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

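        # Target-policy smoothing (TD3): perturb the target action with clipped Gaussian noise,
        # clip it to the action limit, then evaluate the twin target critics at the smoothed action.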
        with tf.variable_scope('target', reuse=True):
            self.eps = tf.random_normal(tf.shape(self.pi_targ),
                                        stddev=self.target_noise)
            self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip,
                                            self.noise_clip)
            self.a_prev = self.pi_targ + self.epsilon
            self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit,
                                       self.action_limit)
            _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(self.pi_loss,
                                                   var_list=self.pi_params)
        self.v_train = self.q_optimizer.minimize(self.v_loss,
                                                 var_list=self.q_params)

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #8
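Another TensorFlow 1.x TD3 constructor, this time with target-policy smoothing handled inside the td3_mlp_actor_critic builder via pi_q_noise and noise_clip; the value, policy, and Polyak target updates are chained with control dependencies and bundled into step_ops and value_ops, presumably so that policy updates can be delayed.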
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.5


        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi, _ = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
            )

        with tf.variable_scope('target'):
            self.pi_targ, self.q1_double_targ, self.q2_double_targ, self.q1_pi_targ, self.q2_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
        #self.min_q_targ = tf.minimum(self.q1_double_targ,self.q2_double_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.value_ops = [self.v_loss, self.train_value_op]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
        self.saver = tf.train.Saver()
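Example #9
A distributional Soft Actor-Critic constructor in TensorFlow 1.x (dipg_sac_mlp_actor_critic): quantile-valued twin critics and a state-value network driven by sampled quantile fractions in tau_ph, asymmetric squared-error quantile losses for q1, q2, and v, a fixed temperature alpha, and a target initialization that copies only the value network's weights.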
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 8
        self.target_update_tau = 0.995
        self.gamma = 0.99
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = 1e-4
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        
        self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.support_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = cr.dipg_sac_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size
            )

        with tf.variable_scope('target'):
            _, _, _, _, _, _, self.v_targ = cr.dipg_sac_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size
            )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')
        self.min_q = tf.where(tf.less(tf.reduce_mean(self.q1_pi),tf.reduce_mean(self.q2_pi)),self.q1_pi,self.q2_pi)
        self.q_backup = tf.stop_gradient(tf.tile(tf.expand_dims(self.r_ph,axis=1),[1,self.support_size])\
                    + self.gamma*tf.tile(tf.expand_dims(1-self.d_ph,axis=1),[1,self.support_size])*self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q\
                        - self.alpha*tf.tile(tf.expand_dims(self.logp_pi,axis=1),[1,self.support_size]))
        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - tf.reduce_mean(self.q1_pi*tf.square(self.tau_ph)))
        tau = self.tau_ph
        inv_tau = 1 - tau
        tau = tf.tile(tf.expand_dims(tau, axis=1), [1, self.support_size, 1])
        inv_tau = tf.tile(tf.expand_dims(inv_tau, axis=1), [1, self.support_size, 1])
        logit_valid_tile = tf.tile(tf.expand_dims(self.q_backup, axis=1), [1, self.support_size, 1])

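        # Asymmetric quantile loss for q1/q2: despite its name, Huber_loss is an unreduced squared
        # error, scaled by tau where the Bellman error is non-negative and left unscaled otherwise.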
        theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2), [1, 1, self.support_size])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), Huber_loss, tau * Huber_loss)
        self.q1_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2), [1, 1, self.support_size])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), Huber_loss, tau * Huber_loss)
        self.q2_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.v, axis=2), [1, 1, self.support_size])
        logit_valid_tile = tf.tile(tf.expand_dims(self.v_backup, axis=1), [1, self.support_size, 1])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), Huber_loss, tau * Huber_loss)
        self.v_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([tf.assign(v_targ, self.target_update_tau * v_targ + (1 - self.target_update_tau) * v_main)
                                           for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))])

        self.step_ops = [self.pi_loss, self.value_loss, self.train_pi_op, self.train_value_op, self.target_update]
        self.target_init = tf.group([tf.assign(v_targ, v_main)
                                    for v_main, v_targ in zip(cr.get_vars('main/v'), cr.get_vars('target/v'))])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #10
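A PyTorch constructor for a MADDPG-style multi-agent actor-critic: one decentralized policy network (with a target copy) per agent, centralized value networks that take the concatenated observations and actions of all agents, per-agent Adam optimizers, optional loading of pretrained models from ./models/{env_id}/, a shared replay buffer, and target networks initialized from the online networks.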
    def __init__(self,
                 env_id,
                 episode,
                 learning_rate,
                 gamma,
                 capacity,
                 batch_size,
                 value_iter,
                 policy_iter,
                 rho,
                 episode_len,
                 render,
                 train_freq,
                 entropy_weight,
                 start_count=10000,
                 model_path=False):
        self.env_id = env_id
        self.env = make_env.make_env(self.env_id)
        self.episode = episode
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.capacity = capacity
        self.batch_size = batch_size
        self.value_iter = value_iter
        self.policy_iter = policy_iter
        self.rho = rho
        self.render = render
        self.episode_len = episode_len
        self.train_freq = train_freq
        self.entropy_weight = entropy_weight
        self.model_path = model_path

        self.observation_dims = self.env.observation_space
        self.action_dims = self.env.action_space
        self.observation_total_dims = sum([
            self.env.observation_space[i].shape[0] for i in range(self.env.n)
        ])
        self.action_total_dims = sum([
            self.env.action_space[i].n
            if isinstance(self.env.action_space[i], Discrete) else
            sum(self.env.action_space[i].high) + self.env.action_space[i].shape
            for i in range(self.env.n)
        ])
        self.policy_nets = [
            policy_net(
                self.observation_dims[i].shape[0], self.action_dims[i].n
                if isinstance(self.env.action_space[i], Discrete) else
                sum(self.env.action_space[i].high) +
                self.env.action_space[i].shape) for i in range(self.env.n)
        ]
        self.target_policy_nets = [
            policy_net(
                self.observation_dims[i].shape[0], self.action_dims[i].n
                if isinstance(self.env.action_space[i], Discrete) else
                sum(self.env.action_space[i].high) +
                self.env.action_space[i].shape) for i in range(self.env.n)
        ]
        self.value_nets = [
            value_net(self.observation_total_dims, self.action_total_dims, 1)
            for i in range(self.env.n)
        ]
        self.target_value_nets = [
            value_net(self.observation_total_dims, self.action_total_dims, 1)
            for i in range(self.env.n)
        ]
        self.policy_optimizers = [
            torch.optim.Adam(policy_net.parameters(), lr=self.learning_rate)
            for policy_net in self.policy_nets
        ]
        self.value_optimizers = [
            torch.optim.Adam(value_net.parameters(), lr=self.learning_rate)
            for value_net in self.value_nets
        ]
        if self.model_path:
            for i in range(self.env.n):
                self.policy_nets[i] = torch.load(
                    './models/{}/policy_model{}.pkl'.format(self.env_id, i))
                self.value_nets[i] = torch.load(
                    './models/{}/value_model{}.pkl'.format(self.env_id, i))
        for target_policy_net, policy_net in zip(self.target_policy_nets,
                                                 self.policy_nets):
            target_policy_net.load_state_dict(policy_net.state_dict())
        for target_value_net, value_net in zip(self.target_value_nets,
                                               self.value_nets):
            target_value_net.load_state_dict(value_net.state_dict())
        self.buffer = replay_buffer(self.capacity)
        self.count = 0
        self.train_count = 0
        self.start_count = start_count