Example no. 1
    def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        self.name = base_name
        self.actions_low = action_space.low
        self.actions_high = action_space.high
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.normalize_advantage = config['normalize_advantage']
        self.config = config
        self.state_shape = observation_space.shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("_%d-%H-%M-%S"))

        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']
        self.normalize_input = self.config['normalize_input']
        self.seq_len = self.config['seq_length']
        self.dones = np.asarray([False] * self.num_actors, dtype=bool)

        self.current_rewards = np.asarray([0] * self.num_actors,
                                          dtype=np.float32)
        self.current_lengths = np.asarray([0] * self.num_actors,
                                          dtype=np.float32)
        self.game_rewards = deque([], maxlen=100)
        self.game_lengths = deque([], maxlen=100)

        self.obs_ph = tf.placeholder('float32', (None, ) + self.state_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder('float32',
                                            (None, ) + self.state_shape,
                                            name='target_obs')
        self.actions_num = action_space.shape[0]
        self.actions_ph = tf.placeholder('float32',
                                         (None, ) + action_space.shape,
                                         name='actions')
        self.old_mu_ph = tf.placeholder('float32',
                                        (None, ) + action_space.shape,
                                        name='old_mu_ph')
        self.old_sigma_ph = tf.placeholder('float32',
                                           (None, ) + action_space.shape,
                                           name='old_sigma_ph')
        self.old_neglogp_actions_ph = tf.placeholder('float32', (None, ),
                                                     name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)
        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.bounds_loss_coef = config.get('bounds_loss_coef', None)

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                end_learning_rate=0.001,
                power=config.get('decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                global_step=self.epoch_num,
                decay_steps=config['max_epochs'],
                decay_rate=config['decay_rate'])

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # games_num is used only by the current RNN implementation
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors,
            'games_num': self.num_actors,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
        }

        self.states = None
        if self.network.is_rnn():
            self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma, self.target_states_ph, self.target_masks_ph, self.target_lstm_state, self.target_initial_state = self.network(
                self.run_dict, reuse=True)
            self.states = self.target_initial_state
        else:
            self.neglogp_actions, self.state_values, self.action, self.entropy, self.mu, self.sigma = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_mu, self.target_sigma = self.network(
                self.run_dict, reuse=True)

        curr_e_clip = self.e_clip * self.lr_multiplier
        if self.ppo:
            self.prob_ratio = tf.exp(self.old_neglogp_actions_ph -
                                     self.neglogp_actions)
            self.prob_ratio = tf.clip_by_value(self.prob_ratio, 0.0, 16.0)
            self.pg_loss_unclipped = -tf.multiply(self.advantages_ph,
                                                  self.prob_ratio)
            self.pg_loss_clipped = -tf.multiply(
                self.advantages_ph,
                tf.clip_by_value(self.prob_ratio, 1. - curr_e_clip,
                                 1. + curr_e_clip))
            self.actor_loss = tf.reduce_mean(
                tf.maximum(self.pg_loss_unclipped, self.pg_loss_clipped))
        else:
            self.actor_loss = tf.reduce_mean(self.neglogp_actions *
                                             self.advantages_ph)

        self.c_loss = (tf.squeeze(self.state_values) - self.rewards_ph)**2

        if self.clip_value:
            self.cliped_values = self.old_values_ph + tf.clip_by_value(
                tf.squeeze(self.state_values) - self.old_values_ph,
                -curr_e_clip, curr_e_clip)
            self.c_loss_clipped = tf.square(self.cliped_values -
                                            self.rewards_ph)
            self.critic_loss = tf.reduce_mean(
                tf.maximum(self.c_loss, self.c_loss_clipped))
        else:
            self.critic_loss = tf.reduce_mean(self.c_loss)

        self._calc_kl_dist()

        self.loss = self.actor_loss + 0.5 * self.critic_coef * self.critic_loss - self.config[
            'entropy_coef'] * self.entropy
        self._apply_bound_loss()
        self.reg_loss = tf.losses.get_regularization_loss()
        self.loss += self.reg_loss
        self.train_step = tf.train.AdamOptimizer(self.current_lr *
                                                 self.lr_multiplier)
        self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                         scope='agent')

        grads = tf.gradients(self.loss, self.weights)
        if self.config['truncate_grads']:
            grads, _ = tf.clip_by_global_norm(grads, self.grad_norm)
        grads = list(zip(grads, self.weights))

        self.train_op = self.train_step.apply_gradients(grads)
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
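
The clipped-surrogate objective assembled above can be sanity-checked outside the TensorFlow graph. The following is a minimal NumPy sketch of the same arithmetic; all input numbers and the e_clip value are illustrative assumptions (the lr_multiplier scaling of e_clip is ignored here), not values taken from any real run.

import numpy as np

# Illustrative inputs: one (advantage, log-prob) pair per sampled action.
e_clip = 0.2
old_neglogp = np.array([1.10, 0.90, 2.30])
neglogp = np.array([1.00, 1.20, 2.00])
advantages = np.array([0.5, -1.0, 0.2])

# Same quantities as in the graph above.
prob_ratio = np.exp(old_neglogp - neglogp)          # pi_new(a|s) / pi_old(a|s)
prob_ratio = np.clip(prob_ratio, 0.0, 16.0)
pg_loss_unclipped = -advantages * prob_ratio
pg_loss_clipped = -advantages * np.clip(prob_ratio, 1.0 - e_clip, 1.0 + e_clip)
actor_loss = np.mean(np.maximum(pg_loss_unclipped, pg_loss_clipped))
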
Example no. 2
    def base_init(self, base_name, config):
        self.config = config  # self.config is read further down in this method
        self.env_config = config.get('env_config', {})
        self.num_actors = config.get('num_actors', 1)
        self.env_name = config['env_name']
        print("Env name:", self.env_name)

        self.env_info = config.get('env_info')
        if self.env_info is None:
            self.vec_env = vecenv.create_vec_env(self.env_name,
                                                 self.num_actors,
                                                 **self.env_config)
            self.env_info = self.vec_env.get_env_info()

        self.sac_device = config.get('device', 'cuda:0')
        # temporary: reuse the SAC device for the PPO code paths as well
        self.ppo_device = self.sac_device
        print('Env info:')
        print(self.env_info)

        self.rewards_shaper = config['reward_shaper']
        self.observation_space = self.env_info['observation_space']
        self.weight_decay = config.get('weight_decay', 0.0)
        #self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)

        self.c_loss = nn.MSELoss()
        # self.c2_loss = nn.SmoothL1Loss()

        self.save_best_after = config.get('save_best_after', 500)
        self.print_stats = config.get('print_stats', True)
        self.rnn_states = None
        self.name = base_name

        self.max_epochs = self.config.get('max_epochs', 1e6)

        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_agents = self.env_info.get('agents', 1)
        self.obs_shape = self.observation_space.shape

        self.games_to_track = self.config.get('games_to_track', 100)
        self.game_rewards = torch_ext.AverageMeter(1, self.games_to_track).to(
            self.sac_device)
        self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(
            self.sac_device)
        self.obs = None

        self.min_alpha = torch.tensor(np.log(1)).float().to(self.sac_device)

        self.frame = 0
        self.update_time = 0
        self.last_mean_rewards = -100500
        self.play_time = 0
        self.epoch_num = 0

        run_name = config['name'] + datetime.now().strftime("_%d-%H-%M-%S")
        self.writer = SummaryWriter('runs/' + run_name)
        print("Run Directory:", run_name)

        self.is_tensor_obses = None
        self.is_rnn = False
        self.last_rnn_indices = None
        self.last_state_indices = None
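
Example 1 keeps the last 100 episode returns in a plain deque, whereas this base_init uses torch_ext.AverageMeter for the same statistic. A minimal sketch of the deque variant, using only the standard library and NumPy (the episode returns are made-up numbers):

import numpy as np
from collections import deque

game_rewards = deque([], maxlen=100)        # same container as in Example 1

for episode_return in (1.0, 3.0, 2.0):      # illustrative finished-episode returns
    game_rewards.append(episode_return)

mean_reward = np.mean(game_rewards)         # the value that would be logged, here 2.0
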
Example no. 3
    def __init__(self, base_name, config):
        self.config = config
        self.env_config = config.get('env_config', {})
        self.num_actors = config['num_actors']
        self.env_name = config['env_name']

        self.env_info = config.get('env_info')
        if self.env_info is None:
            self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors, **self.env_config)
            self.env_info = self.vec_env.get_env_info()

        self.ppo_device = config.get('device', 'cuda:0')
        print('Env info:')
        print(self.env_info)
        self.value_size = self.env_info.get('value_size', 1)
        self.observation_space = self.env_info['observation_space']
        self.weight_decay = config.get('weight_decay', 0.0)
        self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)

        self.central_value_config = self.config.get('central_value_config', None)
        self.has_central_value = self.central_value_config is not None

        if self.has_central_value:
            self.state_space = self.env_info.get('state_space', None)
            self.state_shape = None
            if self.state_space.shape is not None:
                self.state_shape = self.state_space.shape

        self.self_play_config = self.config.get('self_play_config', None)
        self.has_self_play_config = self.self_play_config is not None

        self.self_play = config.get('self_play', False)
        self.save_freq = config.get('save_frequency', 0)
        self.save_best_after = config.get('save_best_after', 100)
        self.print_stats = config.get('print_stats', True)
        self.rnn_states = None
        self.name = base_name

        self.ppo = config['ppo']
        self.max_epochs = self.config.get('max_epochs', 1e6)

        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.linear_lr = config['lr_schedule'] == 'linear'
        self.schedule_type = config.get('schedule_type', 'legacy')
        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
            self.scheduler = schedulers.AdaptiveScheduler(self.lr_threshold)
        elif self.linear_lr:
            self.scheduler = schedulers.LinearScheduler(float(config['learning_rate']), 
                max_steps=self.max_epochs, 
                apply_to_entropy=config.get('schedule_entropy', False),
                start_entropy_coef=config.get('entropy_coef'))
        else:
            self.scheduler = schedulers.IdentityScheduler()

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_agents = self.env_info.get('agents', 1)
        self.steps_num = config['steps_num']
        self.seq_len = self.config.get('seq_length', 4)
        self.normalize_advantage = config['normalize_advantage']
        self.normalize_input = self.config['normalize_input']
        self.normalize_value = self.config.get('normalize_value', False)

        self.obs_shape = self.observation_space.shape
 
        self.critic_coef = config['critic_coef']
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']

        self.games_to_track = self.config.get('games_to_track', 100)
        self.game_rewards = torch_ext.AverageMeter(self.value_size, self.games_to_track).to(self.ppo_device)
        self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(self.ppo_device)
        self.obs = None
        # games_num is used only by the current RNN implementation
        self.games_num = self.config['minibatch_size'] // self.seq_len
        self.batch_size = self.steps_num * self.num_actors * self.num_agents
        self.batch_size_envs = self.steps_num * self.num_actors
        self.minibatch_size = self.config['minibatch_size']
        self.mini_epochs_num = self.config['mini_epochs']
        self.num_minibatches = self.batch_size // self.minibatch_size
        assert self.batch_size % self.minibatch_size == 0

        self.last_lr = self.config['learning_rate']
        self.frame = 0
        self.update_time = 0
        self.last_mean_rewards = -100500
        self.play_time = 0
        self.epoch_num = 0
        
        self.entropy_coef = self.config['entropy_coef']
        self.writer = SummaryWriter('runs/' + config['name'] + datetime.now().strftime("_%d-%H-%M-%S"))

        if self.normalize_value:
            self.value_mean_std = RunningMeanStd((1,)).to(self.ppo_device)

        self.is_tensor_obses = False

        self.last_rnn_indices = None
        self.last_state_indices = None

        # self-play
        if self.has_self_play_config:
            print('Initializing SelfPlay Manager')
            self.self_play_manager = SelfPlayManager(self.self_play_config, self.writer)
        
        # features
        self.algo_observer = config['features']['observer']
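
The batch bookkeeping in this constructor reduces to a few integer relations. A worked example with purely illustrative sizes (none of these numbers come from a real config):

# Illustrative sizes only.
steps_num = 16
num_actors = 8
num_agents = 1
minibatch_size = 32
seq_len = 4

batch_size = steps_num * num_actors * num_agents    # 128 transitions gathered per epoch
batch_size_envs = steps_num * num_actors            # 128 here, since num_agents == 1
num_minibatches = batch_size // minibatch_size      # 4 optimizer steps per mini-epoch
games_num = minibatch_size // seq_len               # 8 sequences per minibatch (RNN path)
assert batch_size % minibatch_size == 0             # required by the constructor's assert
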
Example no. 4
    def __init__(self, sess, base_name, observation_space, action_space,
                 config):
        observation_shape = observation_space.shape
        self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)
        self.self_play = config.get('self_play', False)
        self.name = base_name
        self.config = config
        self.env_name = config['env_name']
        self.ppo = config['ppo']
        self.is_adaptive_lr = config['lr_schedule'] == 'adaptive'
        self.is_polynom_decay_lr = config['lr_schedule'] == 'polynom_decay'
        self.is_exp_decay_lr = config['lr_schedule'] == 'exp_decay'
        self.lr_multiplier = tf.constant(1, shape=(), dtype=tf.float32)
        self.epoch_num = tf.Variable(tf.constant(0, shape=(),
                                                 dtype=tf.float32),
                                     trainable=False)

        self.e_clip = config['e_clip']
        self.clip_value = config['clip_value']
        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_actors = config['num_actors']
        self.env_config = self.config.get('env_config', {})
        self.vec_env = vecenv.create_vec_env(self.env_name, self.num_actors,
                                             **self.env_config)
        self.num_agents = self.vec_env.get_number_of_agents()
        self.steps_num = config['steps_num']
        self.seq_len = self.config['seq_length']
        self.normalize_advantage = config['normalize_advantage']
        self.normalize_input = self.config['normalize_input']

        self.state_shape = observation_shape
        self.critic_coef = config['critic_coef']
        self.writer = SummaryWriter('runs/' + config['name'] +
                                    datetime.now().strftime("_%d-%H-%M-%S"))
        self.sess = sess
        self.grad_norm = config['grad_norm']
        self.gamma = self.config['gamma']
        self.tau = self.config['tau']

        self.ignore_dead_batches = self.config.get('ignore_dead_batches',
                                                   False)

        self.dones = np.asarray([False] * self.num_actors * self.num_agents,
                                dtype=bool)
        self.current_rewards = np.asarray([0] * self.num_actors *
                                          self.num_agents,
                                          dtype=np.float32)
        self.current_lengths = np.asarray([0] * self.num_actors *
                                          self.num_agents,
                                          dtype=np.float32)
        self.games_to_track = self.config.get('games_to_track', 100)
        self.game_rewards = deque([], maxlen=self.games_to_track)
        self.game_lengths = deque([], maxlen=self.games_to_track)
        self.game_scores = deque([], maxlen=self.games_to_track)
        self.obs_ph = tf.placeholder(observation_space.dtype,
                                     (None, ) + observation_shape,
                                     name='obs')
        self.target_obs_ph = tf.placeholder(observation_space.dtype,
                                            (None, ) + observation_shape,
                                            name='target_obs')
        self.actions_num = action_space.n
        self.actions_ph = tf.placeholder('int32', (None, ), name='actions')
        if self.use_action_masks:
            self.action_mask_ph = tf.placeholder('int32',
                                                 (None, self.actions_num),
                                                 name='actions_mask')
        else:
            self.action_mask_ph = None

        self.old_logp_actions_ph = tf.placeholder('float32', (None, ),
                                                  name='old_logpactions')
        self.rewards_ph = tf.placeholder('float32', (None, ), name='rewards')
        self.old_values_ph = tf.placeholder('float32', (None, ),
                                            name='old_values')
        self.advantages_ph = tf.placeholder('float32', (None, ),
                                            name='advantages')
        self.learning_rate_ph = tf.placeholder('float32', (), name='lr_ph')

        self.update_epoch_op = self.epoch_num.assign(self.epoch_num + 1)
        self.current_lr = self.learning_rate_ph

        self.input_obs = self.obs_ph
        self.input_target_obs = self.target_obs_ph

        if observation_space.dtype == np.uint8:
            self.input_obs = tf.to_float(self.input_obs) / 255.0
            self.input_target_obs = tf.to_float(self.input_target_obs) / 255.0

        if self.is_adaptive_lr:
            self.lr_threshold = config['lr_threshold']
        if self.is_polynom_decay_lr:
            self.lr_multiplier = tf.train.polynomial_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                end_learning_rate=0.001,
                power=tr_helpers.get_or_default(config, 'decay_power', 1.0))
        if self.is_exp_decay_lr:
            self.lr_multiplier = tf.train.exponential_decay(
                1.0,
                self.epoch_num,
                config['max_epochs'],
                decay_rate=config['decay_rate'])
        if self.normalize_input:
            self.moving_mean_std = MovingMeanStd(shape=observation_space.shape,
                                                 epsilon=1e-5,
                                                 decay=0.99)
            self.input_obs = self.moving_mean_std.normalize(self.input_obs,
                                                            train=True)
            self.input_target_obs = self.moving_mean_std.normalize(
                self.input_target_obs, train=False)

        # games_num is used only by the current RNN implementation
        games_num = self.config['minibatch_size'] // self.seq_len

        self.train_dict = {
            'name': 'agent',
            'inputs': self.input_obs,
            'batch_num': self.config['minibatch_size'],
            'games_num': games_num,
            'actions_num': self.actions_num,
            'prev_actions_ph': self.actions_ph,
            'action_mask_ph': None
        }

        self.run_dict = {
            'name': 'agent',
            'inputs': self.input_target_obs,
            'batch_num': self.num_actors * self.num_agents,
            'games_num': self.num_actors * self.num_agents,
            'actions_num': self.actions_num,
            'prev_actions_ph': None,
            'action_mask_ph': self.action_mask_ph
        }

        self.states = None
        if self.network.is_rnn():
            self.logp_actions, self.state_values, self.action, self.entropy, self.states_ph, self.masks_ph, self.lstm_state, self.initial_state = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.target_states_ph, self.target_masks_ph, self.target_lstm_state, self.target_initial_state, self.logits = self.network(
                self.run_dict, reuse=True)
            self.states = self.target_initial_state
        else:
            self.logp_actions, self.state_values, self.action, self.entropy = self.network(
                self.train_dict, reuse=False)
            self.target_neglogp, self.target_state_values, self.target_action, _, self.logits = self.network(
                self.run_dict, reuse=True)

        self.saver = tf.train.Saver()
        self.variables = TensorFlowVariables([
            self.target_action, self.target_state_values, self.target_neglogp
        ], self.sess)

        if self.is_train:
            self.setup_losses()

        self.sess.run(tf.global_variables_initializer())
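
Both TensorFlow constructors (Examples 1 and 4) pull the same core keys out of config. The dict below is only a sketch of those lookups: every value is a placeholder chosen for illustration, and the network, reward_shaper, and environment objects are assumed to be supplied by the surrounding framework.

# Illustrative only: the keys mirror the config[...] lookups in the constructors above.
config = {
    'name': 'test_run',
    'env_name': 'SomeRegisteredEnv',   # assumed to be registered with vecenv
    'env_config': {},
    'network': None,                   # placeholder for a framework network with is_rnn()
    'reward_shaper': None,             # placeholder for the reward-shaping callable
    'ppo': True,
    'lr_schedule': 'adaptive',
    'lr_threshold': 0.008,
    'e_clip': 0.2,
    'clip_value': True,
    'num_actors': 8,
    'steps_num': 16,
    'seq_length': 4,
    'minibatch_size': 32,
    'normalize_advantage': True,
    'normalize_input': True,
    'critic_coef': 2.0,
    'entropy_coef': 0.0,
    'grad_norm': 0.5,
    'truncate_grads': True,
    'gamma': 0.99,
    'tau': 0.95,
    'max_epochs': 1000,
}
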
Example no. 5
    def base_init(self, base_name, config):
        self.config = config
        self.env_config = config.get('env_config', {})
        self.num_actors = config.get('num_actors', 1)
        self.env_name = config['env_name']
        print("Env name:", self.env_name)

        self.env_info = config.get('env_info')
        if self.env_info is None:
            self.vec_env = vecenv.create_vec_env(self.env_name,
                                                 self.num_actors,
                                                 **self.env_config)
            self.env_info = self.vec_env.get_env_info()

        self.sac_device = config.get('device', 'cuda:0')
        print('Env info:')
        print(self.env_info)

        self.rewards_shaper = config['reward_shaper']

        self.observation_space = self.env_info['observation_space']
        self.weight_decay = config.get('weight_decay', 0.0)
        self.use_action_masks = config.get('use_action_masks', False)
        self.is_train = config.get('is_train', True)

        self.central_value_config = self.config.get('central_value_config',
                                                    None)
        self.has_central_value = self.central_value_config is not None

        if self.has_central_value:
            self.state_space = self.env_info.get('state_space', None)
            self.state_shape = None
            if self.state_space.shape is not None:
                self.state_shape = self.state_space.shape

        self.self_play_config = self.config.get('self_play_config', None)
        self.has_self_play_config = self.self_play_config is not None

        self.self_play = config.get('self_play', False)
        self.save_freq = config.get('save_frequency', 0)
        self.save_best_after = config.get('save_best_after', 500)
        self.print_stats = config.get('print_stats', True)
        self.rnn_states = None
        self.name = base_name

        self.max_epochs = self.config.get('max_epochs', 1e6)

        self.network = config['network']
        self.rewards_shaper = config['reward_shaper']
        self.num_agents = self.env_info.get('agents', 1)
        self.obs_shape = self.observation_space.shape

        self.games_to_track = self.config.get('games_to_track', 100)
        self.game_rewards = torch_ext.AverageMeter(1, self.games_to_track).to(
            self.sac_device)
        self.game_lengths = torch_ext.AverageMeter(1, self.games_to_track).to(
            self.sac_device)
        self.obs = None

        self.frame = 0
        self.update_time = 0
        self.last_mean_rewards = -100500
        self.play_time = 0
        self.epoch_num = 0

        run_name = config['name'] + datetime.now().strftime("_%d-%H-%M-%S")
        self.writer = SummaryWriter('runs/' + run_name)
        print("Run Directory:", run_name)

        self.is_tensor_obses = None

        self.curiosity_config = self.config.get('rnd_config', None)
        self.has_curiosity = self.curiosity_config is not None
        if self.has_curiosity:
            self.curiosity_gamma = self.curiosity_config['gamma']
            self.curiosity_lr = self.curiosity_config['lr']
            self.curiosity_rewards = deque([], maxlen=self.games_to_track)
            self.curiosity_mins = deque([], maxlen=self.games_to_track)
            self.curiosity_maxs = deque([], maxlen=self.games_to_track)
            self.rnd_adv_coef = self.curiosity_config.get('adv_coef', 1.0)

        self.is_rnn = False
        self.last_rnn_indices = None
        self.last_state_indices = None
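
Examples 3 and 5 only test for the presence of the optional central_value_config, self_play_config and rnd_config blocks and read a handful of keys from the last one. A hedged sketch of how those blocks might be nested; any keys beyond gamma, lr and adv_coef are determined by the consuming classes and are not shown here.

# Illustrative fragment: only the keys read in the snippets above appear.
optional_blocks = {
    'central_value_config': {},   # presence alone flips has_central_value to True
    'self_play_config': {},       # presence alone triggers SelfPlayManager creation
    'rnd_config': {               # curiosity block read in Example 5
        'gamma': 0.99,
        'lr': 5e-4,
        'adv_coef': 1.0,          # optional; defaults to 1.0 via .get()
    },
}
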