def __init__(
    self,
    obs_spec: Spec,
    act_spec: Spec,
    model_fn: ModelBuilder = None,
    policy_cls: PolicyType = None,
    sess_mgr: SessionManager = None,
    optimizer: tf.train.Optimizer = None,
    value_coef=DEFAULTS['value_coef'],
    entropy_coef=DEFAULTS['entropy_coef'],
    traj_len=DEFAULTS['traj_len'],
    batch_sz=DEFAULTS['batch_sz'],
    discount=DEFAULTS['discount'],
    gae_lambda=DEFAULTS['gae_lambda'],
    clip_rewards=DEFAULTS['clip_rewards'],
    clip_grads_norm=DEFAULTS['clip_grads_norm'],
    normalize_returns=DEFAULTS['normalize_returns'],
    normalize_advantages=DEFAULTS['normalize_advantages'],
):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

    if not sess_mgr:
        sess_mgr = SessionManager()

    if not optimizer:
        optimizer = tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])

    self.sess_mgr = sess_mgr
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef
    self.discount = discount
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_returns = normalize_returns
    self.normalize_advantages = normalize_advantages

    self.model = model_fn(obs_spec, act_spec)
    self.value = self.model.outputs[-1]
    self.policy = policy_cls(act_spec, self.model.outputs[:-1])
    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

    # noinspection PyShadowingBuiltins
    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
    self.grads_norm = tf.global_norm(grads)
    if clip_grads_norm > 0.:
        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step)
    self.minimize_ops = self.make_minimize_ops()

    sess_mgr.restore_or_init()
    self.n_batches = sess_mgr.start_step
    self.start_step = sess_mgr.start_step * traj_len
    self.logger = Logger()
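# A minimal sketch (not this repository's actual loss_fn) of how an A2C loss
# with value_coef / entropy_coef could be assembled; the placeholder names and
# the policy attributes `logli` and `entropy` are assumptions for illustration.
import tensorflow as tf

def example_a2c_loss(policy, value, value_coef=0.5, entropy_coef=0.01):
    advantages = tf.placeholder(tf.float32, [None], name='advantages')
    returns = tf.placeholder(tf.float32, [None], name='returns')
    policy_loss = -tf.reduce_mean(policy.logli * advantages)  # policy-gradient term
    value_loss = tf.reduce_mean(tf.square(value - returns))   # critic regression term
    entropy = tf.reduce_mean(policy.entropy)                  # exploration bonus
    loss = policy_loss + value_coef * value_loss - entropy_coef * entropy
    return loss, [policy_loss, value_loss, entropy], [advantages, returns]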
def __init__(self, env_spec, callbacks=None, model_class=FullyConvModel,
             optimizer=tf.train.AdamOptimizer, learning_rate=0.0001, discount=0.99,
             trajectory_length=16, batch_size=32, max_grads_norm=100,
             policy_factor=1, entropy_factor=0.0001, value_factor=0.5):
    self.callbacks = callbacks
    self.discount = discount
    self.policy_factor = policy_factor
    self.entropy_factor = entropy_factor
    self.value_factor = value_factor

    self.input_observations = {
        name: Input(shape=spec.shape, name='input_{}'.format(name))
        for name, spec in env_spec.observation_spec.items()
    }
    self.input_actions = {
        name: Input(shape=(), name='input_arg_{}_value'.format(name), dtype='int32')
        for name in env_spec.action_spec
    }
    self.input_returns = Input(shape=(), name='input_returns')

    self.function_args_mask = tf.constant(
        env_spec.action_spec['function_id'].args_mask,
        dtype=tf.float32, name='function_args_mask')

    self.model = model_class(self.input_observations, env_spec)
    self.loss = self.build_loss()

    self.optimizer = optimizer(learning_rate=learning_rate)
    grads, vars = zip(*self.optimizer.compute_gradients(self.loss))
    grads_norm = tf.global_norm(grads)
    if max_grads_norm > 0:
        grads, _ = tf.clip_by_global_norm(grads, max_grads_norm, grads_norm)
    self.train_op = self.optimizer.apply_gradients(
        zip(grads, vars), global_step=tf.train.get_or_create_global_step())

    self.history = History(trajectory_length, batch_size, env_spec)

    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('total_loss', self.loss, family='losses')
    tf.summary.scalar('grads_norm', grads_norm)
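# A hedged NumPy sketch of how the discounted returns fed into input_returns
# could be computed per trajectory; the bootstrap `next_value` argument and the
# reward/done array layout are assumptions, not taken from this class.
import numpy as np

def discounted_returns(rewards, dones, next_value, discount=0.99):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = next_value
    for t in reversed(range(len(rewards))):
        # reset the bootstrap at episode boundaries
        running = rewards[t] + discount * running * (1.0 - dones[t])
        returns[t] = running
    return returns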
def __init__(
        self,
        obs_spec: Spec,  # how does this just work like that?
        act_spec: Spec,
        model_fn: ModelBuilder = None,  # same with these. Has to do with gin and/or __init__ files
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        gamma=DEFAULTS['gamma'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages']):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

    if not sess_mgr:
        sess_mgr = SessionManager()

    if not optimizer:
        optimizer = tf.train.AdamOptimizer(DEFAULTS["learning_rate"])

    self.sess_mgr = sess_mgr
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef
    self.traj_len = traj_len
    self.gamma = gamma
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_returns = normalize_returns
    self.normalize_advantages = normalize_advantages

    self.model = model_fn(obs_spec, act_spec)  # this is fully_conv
    self.value = self.model.outputs[-1]  # very cool
    self.policy = policy_cls(act_spec, self.model.outputs[:-1])  # what's the advantage of doing it like this? Also Policy(act_spec, logits)
    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()  # does this have to change?

    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
    self.grads_norm = tf.global_norm(grads)
    if clip_grads_norm > 0.:  # currently defaults at 0.
        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step)
    self.minimize_ops = self.make_minimize_ops()  # what's the benefit of doing it like this instead of just setting them in the method?

    sess_mgr.restore_or_init()
    self.n_batches = sess_mgr.start_step
    self.start_step = sess_mgr.start_step * traj_len  # why is there no self on these?
    self.logger = Logger()  # what does this look like exactly?
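# A rough, hypothetical sketch of what a policy_cls wrapping the model's raw
# logits might look like (categorical action heads only); the attribute names
# sample / logli / entropy are assumptions chosen to mirror typical A2C usage.
import tensorflow as tf

class ExampleMultiPolicy:
    def __init__(self, act_spec, logits):
        # one categorical distribution per action component
        self.dists = [tf.distributions.Categorical(logits=l) for l in logits]
        self.sample = [d.sample() for d in self.dists]
        # placeholders for the actions actually taken during rollouts
        self.inputs = [tf.placeholder(tf.int32, [None]) for _ in logits]
        self.logli = tf.add_n([d.log_prob(a) for d, a in zip(self.dists, self.inputs)])
        self.entropy = tf.add_n([tf.reduce_mean(d.entropy()) for d in self.dists])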
def __init__(
    self,
    obs_spec: Spec,
    act_spec: Spec,
    model_fn: ModelBuilder,
    policy_cls: PolicyType,
    sess_mgr: SessionManager = None,
    traj_len=16,
    batch_sz=16,
    discount=0.99,
    gae_lambda=0.95,
    clip_rewards=0.0,
    normalize_advantages=True,
    bootstrap_terminals=False,
    clip_grads_norm=0.0,
    optimizer=tf.train.AdamOptimizer(),
    logger=Logger()
):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

    if not sess_mgr:
        sess_mgr = SessionManager()

    self.sess_mgr = sess_mgr
    self.discount = discount
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_advantages = normalize_advantages
    self.bootstrap_terminals = bootstrap_terminals
    self.logger = logger

    self.model = model_fn(obs_spec, act_spec)
    self.value = self.model.outputs[-1]
    self.policy = policy_cls(act_spec, self.model.outputs[:-1])
    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
    self.grads_norm = tf.global_norm(grads)
    if clip_grads_norm > 0.:
        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.sess_mgr.global_step)

    self.sess_mgr.restore_or_init()
    # NB! changing trajectory length in-between checkpoints will break the logs
    self.n_batches = self.sess_mgr.start_step
    self.start_step = self.sess_mgr.start_step * traj_len
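# A small sketch of the reward clipping that clip_rewards presumably toggles;
# treating clip_rewards <= 0 as "disabled" is an assumption based on the 0.0 default.
import numpy as np

def maybe_clip_rewards(rewards, clip_rewards=0.0):
    rewards = np.asarray(rewards, dtype=np.float32)
    if clip_rewards > 0.0:
        return np.clip(rewards, -clip_rewards, clip_rewards)
    return rewards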
def __init__(
    self,
    obs_spec,
    act_spec,
    model_fn=build_mlp,
    policy_cls=MultiPolicy,
    sess_mgr=None,
    traj_len=16,
    batch_sz=16,
    discount=0.99,
    gae_lambda=0.95,
    clip_rewards=0.0,
    normalize_advantages=True,
    bootstrap_terminals=False,
    clip_grads_norm=0.0,
    optimizer=tf.train.AdamOptimizer(),
    logger=Logger()
):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

    if not sess_mgr:
        sess_mgr = SessionManager()

    self.sess_mgr = sess_mgr
    self.discount = discount
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_advantages = normalize_advantages
    self.bootstrap_terminals = bootstrap_terminals
    self.logger = logger

    self.model = model_fn(obs_spec, act_spec)
    self.value = self.model.outputs[-1]
    self.policy = policy_cls(act_spec, self.model.outputs[:-1])
    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
    self.grads_norm = tf.global_norm(grads)
    if clip_grads_norm > 0.:
        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.sess_mgr.global_step)

    self.sess_mgr.restore_or_init()
    # NB! changing trajectory length in-between checkpoints will break the logs
    self.n_batches = self.sess_mgr.start_step
    self.start_step = self.sess_mgr.start_step * traj_len
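# A hedged NumPy sketch of generalized advantage estimation (GAE) matching the
# discount / gae_lambda parameters above; the bootstrap `next_value` argument
# and array shapes are illustrative assumptions.
import numpy as np

def gae_advantages(rewards, values, dones, next_value, discount=0.99, gae_lambda=0.95):
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    last_value = next_value
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + discount * last_value * nonterminal - values[t]
        last_adv = delta + discount * gae_lambda * nonterminal * last_adv
        advantages[t] = last_adv
        last_value = values[t]
    return advantages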
def __init__(
    self,
    obs_spec: Spec,
    act_spec: Spec,
    model_variable_scope=DEFAULTS['model_variable_scope'],
    model_fn: ModelBuilder = None,
    policy_cls: PolicyType = None,
    sess_mgr: SessionManager = None,
    optimizer: tf.train.Optimizer = None,
    value_coef=DEFAULTS['value_coef'],
    entropy_coef=DEFAULTS['entropy_coef'],
    traj_len=DEFAULTS['traj_len'],
    batch_sz=DEFAULTS['batch_sz'],
    discount=DEFAULTS['discount'],
    gae_lambda=DEFAULTS['gae_lambda'],
    clip_rewards=DEFAULTS['clip_rewards'],
    clip_grads_norm=DEFAULTS['clip_grads_norm'],
    normalize_returns=DEFAULTS['normalize_returns'],
    normalize_advantages=DEFAULTS['normalize_advantages'],
    **kwargs,
):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)
    print(LOGGING_MSG_HEADER + ": the traj_len is {} and batch_sz is {}".format(traj_len, batch_sz))

    if not sess_mgr:
        sess_mgr = SessionManager()

    self.subenvs = subenvs = kwargs['subenvs'] if 'subenvs' in kwargs else []

    if optimizer:
        optimizers = [copy.deepcopy(optimizer) for subenv in subenvs]
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])
        optimizers = [tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate']) for subenv in subenvs]

    self.sess_mgr = sess_mgr
    self.model_variable_scope = self.sess_mgr.model_variable_scope
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef
    self.discount = discount
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_returns = normalize_returns
    self.normalize_advantages = normalize_advantages
    self.traj_len = traj_len
    self.batch_sz = batch_sz

    print(LOGGING_MSG_HEADER + " : the current model_variable_scope is", self.model_variable_scope)

    # implement the a2c to support multiple subagents
    # self.model = model_fn(obs_spec, act_spec)
    with sess_mgr.sess.graph.as_default():
        # note this is name_scope as opposed to variable_scope, important
        with tf.name_scope(self.sess_mgr.main_tf_vs.original_name_scope):
            if subenvs:
                from collections import defaultdict
                self.subenv_dict = defaultdict(list)

                print(LOGGING_MSG_HEADER + ": Creating models for each individual subenvs: ", subenvs)
                for i, subenv in enumerate(subenvs):
                    subenv_model = model_fn(obs_spec, act_spec)
                    self.subenv_dict['models'].append(subenv_model)

                    subenv_value = subenv_model.outputs[-1]
                    self.subenv_dict['values'].append(subenv_value)

                    subenv_policy = policy_cls(act_spec, subenv_model.outputs[:-1])
                    self.subenv_dict['policies'].append(subenv_policy)

                    subenv_loss_op, subenv_loss_terms, subenv_loss_inputs = self.loss_fn(
                        policy=subenv_policy, value=subenv_value)
                    self.subenv_dict['loss_ops'].append(subenv_loss_op)
                    self.subenv_dict['loss_terms'].append(subenv_loss_terms)
                    self.subenv_dict['loss_inputs'].append(subenv_loss_inputs)

                    subenv_optimizer = optimizers[i]
                    grads, vars = zip(*subenv_optimizer.compute_gradients(subenv_loss_op))
                    subenv_grads_norm = tf.global_norm(grads)
                    self.subenv_dict['grads_norms'].append(subenv_grads_norm)
                    if clip_grads_norm > 0:
                        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, subenv_grads_norm)

                    self.subenv_dict['train_ops'].append(
                        subenv_optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step))
                    self.subenv_dict['minimize_ops'].append(self.make_minimize_ops(subenv_id=i))

                print(LOGGING_MSG_HEADER + ": Successfully created models for each individual subenvs")
            else:
                print(LOGGING_MSG_HEADER + ": Creating single model for the environment.")
                self.model = model_fn(obs_spec, act_spec)
                self.value = self.model.outputs[-1]
                self.policy = policy_cls(act_spec, self.model.outputs[:-1])
                self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

                grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
                self.grads_norm = tf.global_norm(grads)
                if clip_grads_norm > 0.:
                    grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
                self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step)
                self.minimize_ops = self.make_minimize_ops()

    print(LOGGING_MSG_HEADER + " : main_model setup on sess and graph complete")

    sess_mgr.restore_or_init()
    print(LOGGING_MSG_HEADER + " : main_model weights restore/init complete")

    self.n_batches = sess_mgr.start_step
    self.start_step = sess_mgr.start_step * traj_len
    self.logger = Logger()
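# A tiny sketch of the standardization that normalize_advantages / normalize_returns
# presumably apply before training; the epsilon constant is an assumption.
import numpy as np

def standardize(x, eps=1e-8):
    x = np.asarray(x, dtype=np.float32)
    return (x - x.mean()) / (x.std() + eps)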