def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        """Build the TF1 training graph for a continuous-action agent.

        The constructor creates the vectorised environment, the summary
        writer, all input placeholders, two weight-sharing instances of the
        network (one fed by ``obs_ph`` for training, one fed by
        ``target_obs_ph`` for acting), the actor/critic losses and the Adam
        training op, and finally initialises all global variables in
        ``sess``.

        Args:
            sess: TensorFlow session used to initialise variables (and kept
                as ``self.sess``).
            base_name: Name stored as ``self.name``.
            observation_space: Space object; its ``shape`` and ``dtype`` are
                read.
            action_space: Space object; its ``low``, ``high`` and ``shape``
                are read.
            config: Mapping with the agent settings. Required keys read
                here: ``env_name``, ``ppo``, ``lr_schedule``, ``e_clip``,
                ``clip_value``, ``network``, ``reward_shaper``,
                ``num_actors``, ``steps_num``, ``normalize_advantage``,
                ``critic_coef``, ``name``, ``grad_norm``, ``gamma``,
                ``tau``, ``normalize_input``, ``seq_len``,
                ``minibatch_size``, ``entropy_coef``, ``truncate_grads``.
                Schedule-dependent keys: ``lr_threshold`` (adaptive),
                ``max_epochs`` (both decay schedules), ``decay_rate``
                (exponential decay). Optional keys: ``env_config``,
                ``bounds_loss_coef``, ``decay_power``.
        """
        self.name = base_name
        self.actions_low = action_space.low
        self.actions_high = action_space.high
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        # Default multiplier; replaced below when a decay schedule is chosen.
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.normalize_advantage = config['normalize_advantage']
        self.config = config
        self.state_shape = observation_space.shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("%d, %H:%M:%S"))

        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']
        self.normalize_input = self.config['normalize_input']
        self.seq_len = self.config['seq_len']

        # Per-actor bookkeeping. The builtin ``bool`` is used because the
        # ``np.bool`` alias was removed in NumPy 1.24.
        self.dones = np.zeros(self.num_actors, dtype=bool)
        self.current_rewards = np.zeros(self.num_actors, dtype=np.float32)
        self.current_lengths = np.zeros(self.num_actors, dtype=np.float32)
        self.game_rewards = deque([], maxlen=100)
        self.game_lengths = deque([], maxlen=100)

        # Input placeholders.
        self.obs_ph = tf.placeholder('float32', (None, ) + self.state_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder('float32',
                                            (None, ) + self.state_shape,
                                            name='target_obs')
        self.actions_num = action_space.shape[0]
        self.actions_ph = tf.placeholder('float32',
                                         (None, ) + action_space.shape,
                                         name='actions')
        self.old_mu_ph = tf.placeholder('float32',
                                        (None, ) + action_space.shape,
                                        name='old_mu_ph')
        self.old_sigma_ph = tf.placeholder('float32',
                                           (None, ) + action_space.shape,
                                           name='old_sigma_ph')
        self.old_neglogp_actions_ph = tf.placeholder('float32', (None, ),
                                                     name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')

        # Epoch counter kept inside the graph so the decay schedules below
        # can use it as their global step.
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)
        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.bounds_loss_coef = config.get('bounds_loss_coef', None)

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                end_learning_rate=0.001,
                power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                decay_rate=config['decay_rate'])

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        # Byte observations are rescaled to [0, 1].
        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            # The same normaliser is applied to both inputs; only the
            # training input is passed with train=True.
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # it is used only for current rnn implementation
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors,
            'games_num': self.num_actors,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
        }

        # The network is instantiated twice under the same 'agent' name:
        # first for training (reuse=False), then for acting (reuse=True).
        self.states = None
        if self.network.is_rnn():
            (self.neglogp_actions, self.state_values, self.action,
             self.entropy, self.mu, self.sigma, self.states_ph,
             self.masks_ph, self.lstm_state,
             self.initial_state) = self.network(self.train_dict, reuse=False)
            (self.target_neglogp, self.target_state_values,
             self.target_action, _, self.target_mu, self.target_sigma,
             self.target_states_ph, self.target_masks_ph,
             self.target_lstm_state,
             self.target_initial_state) = self.network(self.run_dict,
                                                       reuse=True)
            self.states = self.target_initial_state
        else:
            (self.neglogp_actions, self.state_values, self.action,
             self.entropy, self.mu,
             self.sigma) = self.network(self.train_dict, reuse=False)
            (self.target_neglogp, self.target_state_values,
             self.target_action, _, self.target_mu,
             self.target_sigma) = self.network(self.run_dict, reuse=True)

        # The clip range is scaled by the same multiplier as the
        # learning rate.
        curr_e_clip = self.e_clip * self.lr_multiplier
        if self.ppo:
            self.prob_ratio = tf.exp(self.old_neglogp_actions_ph -
                                     self.neglogp_actions)
            # Hard cap on the ratio, applied before the surrogate clipping.
            self.prob_ratio = tf.clip_by_value(self.prob_ratio, 0.0, 16.0)
            self.pg_loss_unclipped = -tf.multiply(self.advantages_ph,
                                                  self.prob_ratio)
            self.pg_loss_clipped = -tf.multiply(
                self.advantages_ph,
                tf.clip_by_value(self.prob_ratio, 1. - curr_e_clip,
                                 1. + curr_e_clip))
            self.actor_loss = tf.reduce_mean(
                tf.maximum(self.pg_loss_unclipped, self.pg_loss_clipped))
        else:
            self.actor_loss = tf.reduce_mean(self.neglogp_actions *
                                             self.advantages_ph)

        self.c_loss = (tf.squeeze(self.state_values) - self.rewards_ph)**2

        if self.clip_value:
            # Limit how far the new value estimate may move away from the
            # old one, then take the larger of the two squared errors.
            self.cliped_values = self.old_values_ph + tf.clip_by_value(
                tf.squeeze(self.state_values) - self.old_values_ph,
                -curr_e_clip, curr_e_clip)
            self.c_loss_clipped = tf.square(self.cliped_values -
                                            self.rewards_ph)
            self.critic_loss = tf.reduce_mean(
                tf.maximum(self.c_loss, self.c_loss_clipped))
        else:
            self.critic_loss = tf.reduce_mean(self.c_loss)

        # Defined outside this block; train() later reads self.kl_dist.
        self._calc_kl_dist()

        self.loss = (self.actor_loss +
                     0.5 * self.critic_coef * self.critic_loss -
                     self.config['entropy_coef'] * self.entropy)
        # Defined outside this block; train() later reads self.bounds_loss.
        self._apply_bound_loss()
        self.reg_loss = tf.losses.get_regularization_loss()
        self.loss += self.reg_loss
        self.train_step = tf.train.AdamOptimizer(self.current_lr *
                                                 self.lr_multiplier)
        self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='agent')

        grads = tf.gradients(self.loss, self.weights)
        if self.config['truncate_grads']:
            grads, _ = tf.clip_by_global_norm(grads, self.grad_norm)
        grads = list(zip(grads, self.weights))

        self.train_op = self.train_step.apply_gradients(grads)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
Beispiel #2
0
    def __init__(self,
                 sess,
                 base_name,
                 observation_space,
                 action_space,
                 config,
                 logger,
                 central_state_space=None):
        """Build the TF1 graph and replay buffer for a Q-learning agent
        that uses a central state.

        Args:
            sess: TensorFlow session, stored as ``self.sess``.
            base_name: Name stored as ``self.name``.
            observation_space: Space object; its ``shape`` and ``dtype`` are
                read.
            action_space: Space object; its ``n`` is used as the number of
                actions.
            config: Mapping with the agent settings (learning-rate schedule,
                replay buffer, epsilon/beta schedules, etc.).
            logger: Stored as ``self.logger``.
            central_state_space: Space object whose ``shape`` is used for
                the state placeholder. Must not be ``None``.

        Raises:
            NotImplementedError: If ``central_state_space`` is ``None`` or
                if ``config['replay_buffer_type']`` is not ``'normal'``.
        """
        observation_shape = observation_space.shape
        actions_num = action_space.n
        self.config = config
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        # Default multiplier; replaced below when a decay schedule is chosen.
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')
        self.games_to_track = tr_helpers.get_or_default(
            config, 'games_to_track', 100)
        self.max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs',
                                                    1e6)

        # NOTE(review): games_to_log reads the same 'games_to_track' key as
        # games_to_track above - confirm whether a separate key was intended.
        self.games_to_log = self.config.get('games_to_track', 100)
        self.game_rewards = deque([], maxlen=self.games_to_track)
        self.game_lengths = deque([], maxlen=self.games_to_track)
        self.game_scores = deque([], maxlen=self.games_to_log)

        # Epoch counter kept inside the graph so the decay schedules below
        # can use it as their global step.
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)
        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=self.max_epochs,
                end_learning_rate=0.001,
                power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=self.max_epochs,
                decay_rate=config['decay_rate'])

        self.env_name = config['env_name']
        self.network = config['network']
        self.batch_size = self.config['batch_size']

        self.obs_shape = observation_shape
        self.actions_num = actions_num
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("%d, %H:%M:%S"))
        self.epsilon = self.config['epsilon']
        self.rewards_shaper = self.config['reward_shaper']
        self.epsilon_processor = tr_helpers.LinearValueProcessor(
            self.config['epsilon'], self.config['min_epsilon'],
            self.config['epsilon_decay_frames'])
        self.beta_processor = tr_helpers.LinearValueProcessor(
            self.config['priority_beta'], self.config['max_beta'],
            self.config['beta_decay_frames'])
        # The environment is only created when an env name is configured.
        if self.env_name:
            self.env_config = config.get('env_config', {})
            self.env = env_configurations.configurations[
                self.env_name]['env_creator'](**self.env_config)
        self.sess = sess
        self.steps_num = self.config['steps_num']

        self.obs_act_rew = deque([], maxlen=self.steps_num)

        self.is_prioritized = config['replay_buffer_type'] != 'normal'
        self.atoms_num = self.config['atoms_num']
        # Only the single-atom case is accepted here.
        assert self.atoms_num == 1

        if central_state_space is not None:
            self.state_shape = central_state_space.shape
        else:
            raise NotImplementedError(
                "central_state_space input to VDN is NONE!")
        # NOTE(review): self.env is only assigned above when env_name is
        # truthy, yet it is read unconditionally here - confirm that
        # env_name is always set on this code path.
        self.n_agents = self.env.env_info['n_agents']

        if not self.is_prioritized:
            self.exp_buffer = experience.ReplayBufferCentralState(
                config['replay_buffer_size'], observation_space,
                central_state_space, self.n_agents)
        else:
            raise NotImplementedError(
                "Not implemented! PrioritizedReplayBuffer with CentralState")
            #self.exp_buffer = experience.PrioritizedReplayBufferCentralState(config['replay_buffer_size'], config['priority_alpha'])
            #self.sample_weights_ph = tf.placeholder(tf.float32, shape=[None, 1], name='sample_weights')

        # Input placeholders.
        self.batch_size_ph = tf.placeholder(tf.int32, name='batch_size_ph')
        self.obs_ph = tf.placeholder(observation_space.dtype,
                                     shape=(None, ) + self.obs_shape,
                                     name='obs_ph')
        # NOTE(review): the state placeholder takes its dtype from
        # observation_space rather than central_state_space - confirm the
        # two spaces always share a dtype.
        self.state_ph = tf.placeholder(observation_space.dtype,
                                       shape=(None, ) + self.state_shape,
                                       name='state_ph')
        self.actions_ph = tf.placeholder(tf.int32,
                                         shape=[None, 1],
                                         name='actions_ph')
        self.rewards_ph = tf.placeholder(tf.float32,
                                         shape=[None, 1],
                                         name='rewards_ph')
        self.next_obs_ph = tf.placeholder(observation_space.dtype,
                                          shape=(None, ) + self.obs_shape,
                                          name='next_obs_ph')
        self.is_done_ph = tf.placeholder(tf.float32,
                                         shape=[None, 1],
                                         name='is_done_ph')
        self.is_not_done = 1 - self.is_done_ph
        self.name = base_name

        self.gamma = self.config['gamma']
        # Discount factor raised to the number of steps (steps_num).
        self.gamma_step = self.gamma**self.steps_num
        self.grad_norm = config['grad_norm']
        self.input_obs = self.obs_ph
        self.input_next_obs = self.next_obs_ph
        # Byte observations are rescaled to [0, 1].
        if observation_space.dtype == np.uint8:
            print('scaling obs')
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_next_obs = tf.to_float(self.input_next_obs) / 255.0
        # Defined outside this block; the code below relies on it having set
        # self.td_loss_mean, self.weights, self.target_weights and
        # self.qvalues.
        self.setup_qvalues(actions_num)

        self.reg_loss = tf.losses.get_regularization_loss()
        self.td_loss_mean += self.reg_loss
        self.learning_rate = self.config['learning_rate']
        self.train_step = tf.train.AdamOptimizer(
            self.learning_rate * self.lr_multiplier
        )  # .minimize(self.td_loss_mean, var_list=self.weights)
        # Gradients are computed explicitly so they can be clipped by
        # global norm before being applied.
        grads = tf.gradients(self.td_loss_mean, self.weights)
        if self.config['truncate_grads']:
            grads, _ = tf.clip_by_global_norm(grads, self.grad_norm)
        grads = list(zip(grads, self.weights))
        self.train_op = self.train_step.apply_gradients(grads)

        self.saver = tf.train.Saver()
        # Ops that copy every online weight into the matching target weight.
        self.assigns_op = [
            tf.assign(w_target, w_self, validate_shape=True)
            for w_self, w_target in zip(self.weights, self.target_weights)
        ]
        self.variables = TensorFlowVariables(self.qvalues, self.sess)
        # Variables are only initialised here when an env name is configured.
        if self.env_name:
            sess.run(tf.global_variables_initializer())
        self._reset()

        self.logger = logger
        self.num_env_steps_train = 0
    def train(self):
        """Run the training loop until the target score or epoch limit.

        Each iteration collects one rollout with ``self.play_steps()``,
        runs ``config['mini_epochs']`` passes of minibatch updates over it,
        writes scalars to ``self.writer`` and saves a checkpoint whenever
        the mean tracked reward improves.

        Returns:
            tuple: ``(last_mean_rewards, epoch_num)``, returned either when
            the mean reward exceeds ``config['score_to_win']`` or when
            ``epoch_num`` exceeds ``max_epochs``.

        Raises:
            ValueError: If the rollout is smaller than one minibatch or
                ``config['mini_epochs']`` is below 1. No update step would
                ever run in that case and the logging code below would fail
                on undefined values.
        """
        batch_size = self.steps_num * self.num_actors * self.num_agents
        minibatch_size = self.config['minibatch_size']
        mini_epochs_num = self.config['mini_epochs']
        num_minibatches = batch_size // minibatch_size
        if num_minibatches < 1 or mini_epochs_num < 1:
            raise ValueError(
                'train() needs at least one minibatch per epoch: '
                'batch_size=%d, minibatch_size=%d, mini_epochs=%d' %
                (batch_size, minibatch_size, mini_epochs_num))

        max_epochs = tr_helpers.get_or_default(self.config, 'max_epochs', 1e6)
        self.obs = self.vec_env.reset()
        last_lr = self.config['learning_rate']

        last_mean_rewards = -100500
        # Pre-bound so the max-epochs exit can build its checkpoint name
        # even when no episode has finished yet.
        mean_rewards = last_mean_rewards

        frame = 0
        start_time = time.time()

        # The fetch list is identical for every minibatch, so build it once.
        has_bounds_loss = self.bounds_loss is not None
        run_ops = [
            self.actor_loss, self.critic_loss, self.entropy, self.kl_dist,
            self.current_lr, self.mu, self.sigma, self.lr_multiplier
        ]
        if has_bounds_loss:
            run_ops.append(self.bounds_loss)
        run_ops.append(self.train_op)
        run_ops.append(tf.get_collection(tf.GraphKeys.UPDATE_OPS))

        def run_minibatch(feed, stats):
            """Run one update step, record its losses, return lr/mu/sigma."""
            res = self.sess.run(run_ops, feed)
            stats['a_loss'].append(res[0])
            stats['c_loss'].append(res[1])
            stats['entropy'].append(res[2])
            stats['kl'].append(res[3])
            if has_bounds_loss:
                stats['b_loss'].append(res[8])
            # (current lr, new mu, new sigma, lr multiplier)
            return res[4], res[5], res[6], res[7]

        while True:
            play_time_start = time.time()
            epoch_num = self.update_epoch()
            frame += batch_size
            (obses, returns, dones, actions, values, neglogpacs, mus, sigmas,
             lstm_states, _) = self.play_steps()
            advantages = returns - values
            if self.normalize_advantage:
                advantages = (advantages -
                              advantages.mean()) / (advantages.std() + 1e-8)

            stats = {
                'a_loss': [],
                'c_loss': [],
                'b_loss': [],
                'entropy': [],
                'kl': [],
            }
            play_time = time.time() - play_time_start
            update_time_start = time.time()

            if self.network.is_rnn():
                # Shuffle whole sequences ("games") of seq_len steps so each
                # minibatch contains contiguous sequences.
                total_games = batch_size // self.seq_len
                num_games_batch = minibatch_size // self.seq_len
                game_indexes = np.arange(total_games)
                flat_indexes = np.arange(total_games * self.seq_len).reshape(
                    total_games, self.seq_len)
                # Keep only the recurrent state at the start of each sequence.
                lstm_states = lstm_states[::self.seq_len]
                for _ in range(mini_epochs_num):
                    np.random.shuffle(game_indexes)

                    for i in range(num_minibatches):
                        mb_indexes = game_indexes[i * num_games_batch:
                                                  (i + 1) * num_games_batch]
                        mbatch = flat_indexes[mb_indexes].ravel()

                        feed = {
                            self.old_values_ph: values[mbatch],
                            self.old_neglogp_actions_ph: neglogpacs[mbatch],
                            self.advantages_ph: advantages[mbatch],
                            self.rewards_ph: returns[mbatch],
                            self.actions_ph: actions[mbatch],
                            self.obs_ph: obses[mbatch],
                            self.old_mu_ph: mus[mbatch],
                            self.old_sigma_ph: sigmas[mbatch],
                            self.masks_ph: dones[mbatch],
                            # Index by the shuffled game ids so each sequence
                            # is paired with its own initial state (the
                            # original used the unshuffled positions).
                            self.states_ph: lstm_states[mb_indexes],
                            self.learning_rate_ph: last_lr,
                        }
                        last_lr, cmu, csigma, lr_mul = run_minibatch(
                            feed, stats)
                        mus[mbatch] = cmu
                        sigmas[mbatch] = csigma
            else:
                for _ in range(mini_epochs_num):
                    permutation = np.random.permutation(batch_size)

                    obses = obses[permutation]
                    returns = returns[permutation]
                    actions = actions[permutation]
                    values = values[permutation]
                    neglogpacs = neglogpacs[permutation]
                    advantages = advantages[permutation]
                    mus = mus[permutation]
                    sigmas = sigmas[permutation]

                    for i in range(num_minibatches):
                        batch = range(i * minibatch_size,
                                      (i + 1) * minibatch_size)
                        feed = {
                            self.obs_ph: obses[batch],
                            self.actions_ph: actions[batch],
                            self.rewards_ph: returns[batch],
                            self.advantages_ph: advantages[batch],
                            self.old_neglogp_actions_ph: neglogpacs[batch],
                            self.old_values_ph: values[batch],
                            self.old_mu_ph: mus[batch],
                            self.old_sigma_ph: sigmas[batch],
                            self.learning_rate_ph: last_lr,
                        }
                        last_lr, cmu, csigma, lr_mul = run_minibatch(
                            feed, stats)
                        mus[batch] = cmu
                        sigmas[batch] = csigma

            update_time_end = time.time()
            update_time = update_time_end - update_time_start
            sum_time = update_time + play_time
            total_time = update_time_end - start_time

            print('Frames per seconds: ', batch_size / sum_time)
            self.writer.add_scalar('performance/fps', batch_size / sum_time,
                                   frame)
            self.writer.add_scalar('performance/upd_time', update_time, frame)
            self.writer.add_scalar('performance/play_time', play_time, frame)
            self.writer.add_scalar('losses/a_loss', np.mean(stats['a_loss']),
                                   frame)
            self.writer.add_scalar('losses/c_loss', np.mean(stats['c_loss']),
                                   frame)
            if stats['b_loss']:
                self.writer.add_scalar('losses/bounds_loss',
                                       np.mean(stats['b_loss']), frame)
            self.writer.add_scalar('losses/entropy',
                                   np.mean(stats['entropy']), frame)
            self.writer.add_scalar('info/last_lr', last_lr * lr_mul, frame)
            self.writer.add_scalar('info/lr_mul', lr_mul, frame)
            self.writer.add_scalar('info/e_clip', self.e_clip * lr_mul, frame)
            self.writer.add_scalar('info/kl', np.mean(stats['kl']), frame)
            self.writer.add_scalar('epochs', epoch_num, frame)

            if self.game_rewards:
                mean_rewards = np.mean(self.game_rewards)
                mean_lengths = np.mean(self.game_lengths)
                self.writer.add_scalar('rewards/mean', mean_rewards, frame)
                self.writer.add_scalar('rewards/time', mean_rewards,
                                       total_time)
                self.writer.add_scalar('episode_lengths/mean', mean_lengths,
                                       frame)
                self.writer.add_scalar('episode_lengths/time', mean_lengths,
                                       total_time)

                if mean_rewards > last_mean_rewards:
                    print('saving next best rewards: ', mean_rewards)
                    last_mean_rewards = mean_rewards
                    self.save("./nn/" + self.name + self.env_name)
                    if last_mean_rewards > self.config['score_to_win']:
                        self.save("./nn/" + self.config['name'] + 'ep=' +
                                  str(epoch_num) + 'rew=' +
                                  str(mean_rewards))
                        return last_mean_rewards, epoch_num

            if epoch_num > max_epochs:
                print('MAX EPOCHS NUM!')
                self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' +
                          str(epoch_num) + 'rew=' + str(mean_rewards))
                return last_mean_rewards, epoch_num
Beispiel #4
0
    def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        """Build the TF1 graph for a discrete-action actor-critic agent.

        The constructor creates the vectorised environment, the summary
        writer, all input placeholders and two weight-sharing instances of
        the network (one fed by ``obs_ph`` for training, one fed by
        ``target_obs_ph`` for acting). Losses are set up through
        ``self.setup_losses()`` only when ``config['is_train']`` is true
        (the default). All global variables are initialised in ``sess`` at
        the end.

        Args:
            sess: TensorFlow session, stored as ``self.sess``.
            base_name: Name stored as ``self.name``.
            observation_space: Space object; its ``shape`` and ``dtype`` are
                read.
            action_space: Space object; its ``n`` is used as the number of
                actions.
            config: Mapping with the agent settings. Required keys read
                here: ``env_name``, ``ppo``, ``lr_schedule``, ``e_clip``,
                ``clip_value``, ``network``, ``reward_shaper``,
                ``num_actors``, ``steps_num``, ``seq_len``,
                ``normalize_advantage``, ``normalize_input``,
                ``critic_coef``, ``name``, ``grad_norm``, ``gamma``,
                ``tau``, ``minibatch_size``. Schedule-dependent keys:
                ``lr_threshold`` (adaptive), ``max_epochs`` (both decay
                schedules), ``decay_rate`` (exponential decay). Optional
                keys: ``use_action_masks``, ``is_train``, ``self_play``,
                ``env_config``, ``ignore_dead_batches``,
                ``games_to_track``, ``decay_power``.
        """
        observation_shape = observation_space.shape
        self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)
        self.self_play = config.get('self_play', False)
        self.name = base_name
        self.config = config
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        # Default multiplier; replaced below when a decay schedule is chosen.
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
        # Epoch counter kept inside the graph so the decay schedules below
        # can use it as their global step.
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = self.config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.seq_len = self.config['seq_len']
        self.normalize_advantage = config['normalize_advantage']
        self.normalize_input = self.config['normalize_input']

        self.state_shape = observation_shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("%d, %H:%M:%S"))
        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']

        self.ignore_dead_batches = self.config.get('ignore_dead_batches',
                                                   False)

        # One bookkeeping slot per (actor, agent) pair. The builtin ``bool``
        # is used because the ``np.bool`` alias was removed in NumPy 1.24.
        total_agents = self.num_actors * self.num_agents
        self.dones = np.zeros(total_agents, dtype=bool)
        self.current_rewards = np.zeros(total_agents, dtype=np.float32)
        self.current_lengths = np.zeros(total_agents, dtype=np.float32)
        self.games_to_log = self.config.get('games_to_track', 100)
        self.game_rewards = deque([], maxlen=self.games_to_log)
        self.game_lengths = deque([], maxlen=self.games_to_log)
        self.game_scores = deque([], maxlen=self.games_to_log)

        # Input placeholders.
        self.obs_ph = tf.placeholder(observation_space.dtype,
                                     (None, ) + observation_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder(observation_space.dtype,
                                            (None, ) + observation_shape,
                                            name='target_obs')
        self.actions_num = action_space.n
        self.actions_ph = tf.placeholder('int32', (None, ), name='actions')
        if self.use_action_masks:
            self.action_mask_ph = tf.placeholder('int32',
                                                 (None, self.actions_num),
                                                 name='actions_mask')
        else:
            self.action_mask_ph = None

        self.old_logp_actions_ph = tf.placeholder('float32', (None, ),
                                                  name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')

        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        # Byte observations are rescaled to [0, 1].
        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                end_learning_rate=0.001,
                power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                decay_rate=config['decay_rate'])
        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            # The same normaliser is applied to both inputs; only the
            # training input is passed with train=True.
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # it is used only for current rnn implementation
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
            'action_mask_ph': None
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors * self.num_agents,
            'games_num': self.num_actors * self.num_agents,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
            'action_mask_ph': self.action_mask_ph
        }

        # The network is instantiated twice under the same 'agent' name:
        # first for training (reuse=False), then for acting (reuse=True).
        # The acting instance is unpacked with one extra trailing output
        # (logits) compared with the training instance.
        self.states = None
        if self.network.is_rnn():
            (self.logp_actions, self.state_values, self.action, self.entropy,
             self.states_ph, self.masks_ph, self.lstm_state,
             self.initial_state) = self.network(self.train_dict, reuse=False)
            (self.target_neglogp, self.target_state_values,
             self.target_action, _, self.target_states_ph,
             self.target_masks_ph, self.target_lstm_state,
             self.target_initial_state,
             self.logits) = self.network(self.run_dict, reuse=True)
            self.states = self.target_initial_state
        else:
            (self.logp_actions, self.state_values, self.action,
             self.entropy) = self.network(self.train_dict, reuse=False)
            (self.target_neglogp, self.target_state_values,
             self.target_action, _,
             self.logits) = self.network(self.run_dict, reuse=True)

        self.saver = tf.train.Saver()
        self.variables = TensorFlowVariables([
            self.target_action, self.target_state_values, self.target_neglogp
        ], self.sess)

        # Losses and the training op are only needed when training.
        if self.is_train:
            self.setup_losses()

        self.sess.run(tf.global_variables_initializer())