Example #1
    def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

        if not sess_mgr:
            sess_mgr = SessionManager()

        if not optimizer:
            optimizer = tf.train.AdamOptimizer(
                learning_rate=DEFAULTS['learning_rate'])

        self.sess_mgr = sess_mgr
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_returns = normalize_returns
        self.normalize_advantages = normalize_advantages

        self.model = model_fn(obs_spec, act_spec)
        self.value = self.model.outputs[-1]
        self.policy = policy_cls(act_spec, self.model.outputs[:-1])
        self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

        # noinspection PyShadowingBuiltins
        grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
        self.grads_norm = tf.global_norm(grads)
        if clip_grads_norm > 0.:
            grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm,
                                              self.grads_norm)
        self.train_op = optimizer.apply_gradients(
            zip(grads, vars), global_step=sess_mgr.global_step)
        self.minimize_ops = self.make_minimize_ops()

        sess_mgr.restore_or_init()
        self.n_batches = sess_mgr.start_step
        self.start_step = sess_mgr.start_step * traj_len

        self.logger = Logger()
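
This and the following examples share the same TensorFlow 1.x gradient pipeline: compute per-variable gradients, record their global norm, optionally clip by that norm, then apply the update while advancing a global step. A minimal, generic sketch of that pattern (the toy model and loss below are placeholders, not the agent's own model_fn/loss_fn):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
y = tf.placeholder(tf.float32, [None, 1])
pred = tf.layers.dense(x, 1)                     # stand-in for the model
loss_op = tf.reduce_mean(tf.square(pred - y))    # stand-in for loss_fn()

optimizer = tf.train.AdamOptimizer(learning_rate=3e-4)
global_step = tf.train.get_or_create_global_step()
clip_grads_norm = 10.0                           # 0.0 disables clipping

grads, variables = zip(*optimizer.compute_gradients(loss_op))
grads_norm = tf.global_norm(grads)
if clip_grads_norm > 0.:
    # passing the precomputed norm avoids recomputing it inside the clip op
    grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, grads_norm)
train_op = optimizer.apply_gradients(zip(grads, variables), global_step=global_step)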
Example #2
    def __init__(self,
                 env_spec,
                 callbacks=None,
                 model_class=FullyConvModel,
                 optimizer=tf.train.AdamOptimizer,
                 learning_rate=0.0001,
                 discount=0.99,
                 trajectory_length=16,
                 batch_size=32,
                 max_grads_norm=100,
                 policy_factor=1,
                 entropy_factor=0.0001,
                 value_factor=0.5):
        self.callbacks = callbacks
        self.discount = discount
        self.policy_factor = policy_factor
        self.entropy_factor = entropy_factor
        self.value_factor = value_factor

        self.input_observations = {
            name: Input(shape=spec.shape, name='input_{}'.format(name))
            for name, spec in env_spec.observation_spec.items()
        }
        self.input_actions = {
            name: Input(shape=(),
                        name='input_arg_{}_value'.format(name),
                        dtype='int32')
            for name in env_spec.action_spec
        }
        self.input_returns = Input(shape=(), name='input_returns')

        self.function_args_mask = tf.constant(
            env_spec.action_spec['function_id'].args_mask,
            dtype=tf.float32,
            name='function_args_mask')

        self.model = model_class(self.input_observations, env_spec)

        self.loss = self.build_loss()

        self.optimizer = optimizer(learning_rate=learning_rate)
        grads, vars = zip(*self.optimizer.compute_gradients(self.loss))
        grads_norm = tf.global_norm(grads)
        if max_grads_norm > 0:
            grads, _ = tf.clip_by_global_norm(grads, max_grads_norm,
                                              grads_norm)
        self.train_op = self.optimizer.apply_gradients(
            zip(grads, vars), global_step=tf.train.get_or_create_global_step())

        self.history = History(trajectory_length, batch_size, env_spec)

        tf.summary.scalar('learning_rate', learning_rate)
        tf.summary.scalar('total_loss', self.loss, family='losses')
        tf.summary.scalar('grads_norm', grads_norm)
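
Example #2 only defines the scalar summaries; merging and writing them is left to the caller. A minimal, self-contained TF1 sketch of that step (the log directory and the standalone loss tensor are assumptions, not part of the snippet above):

import tensorflow as tf

loss = tf.constant(0.5)                               # placeholder for self.loss
tf.summary.scalar('total_loss', loss, family='losses')

merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('logs/a2c', graph=tf.get_default_graph())

with tf.Session() as sess:
    summary_str = sess.run(merged)                    # normally run alongside train_op
    writer.add_summary(summary_str, global_step=0)
writer.flush()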
Example #3
  def __init__(
    self,
    obs_spec: Spec,
    act_spec: Spec,
    model_fn: ModelBuilder = None,  # default values for these come from gin config / the package __init__ files
    policy_cls: PolicyType = None,
    sess_mgr: SessionManager = None,
    optimizer: tf.train.Optimizer = None,
    value_coef=DEFAULTS['value_coef'],
    entropy_coef=DEFAULTS['entropy_coef'],
    traj_len=DEFAULTS['traj_len'],
    batch_sz=DEFAULTS['batch_sz'],
    gamma=DEFAULTS['gamma'],
    gae_lambda=DEFAULTS['gae_lambda'],
    clip_rewards=DEFAULTS['clip_rewards'],
    clip_grads_norm=DEFAULTS['clip_grads_norm'],
    normalize_returns=DEFAULTS['normalize_returns'],
    normalize_advantages=DEFAULTS['normalize_advantages']):
    MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

    if not sess_mgr:
      sess_mgr = SessionManager()
    if not optimizer:
      optimizer = tf.train.AdamOptimizer(DEFAULTS['learning_rate'])

    self.sess_mgr = sess_mgr
    self.value_coef = value_coef
    self.entropy_coef = entropy_coef
    self.traj_len = traj_len
    self.gamma = gamma
    self.gae_lambda = gae_lambda
    self.clip_rewards = clip_rewards
    self.normalize_returns = normalize_returns
    self.normalize_advantages = normalize_advantages

    self.model = model_fn(obs_spec, act_spec)  # by default this is the fully_conv model builder
    self.value = self.model.outputs[-1]  # the last model output is the value estimate
    self.policy = policy_cls(act_spec, self.model.outputs[:-1])  # remaining outputs are the policy logits, i.e. Policy(act_spec, logits)
    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
    self.grads_norm = tf.global_norm(grads)
    if clip_grads_norm > 0.:  # clipping is off by default since clip_grads_norm defaults to 0.
      grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
    self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step)
    self.minimize_ops = self.make_minimize_ops()

    sess_mgr.restore_or_init()
    self.n_batches = sess_mgr.start_step
    self.start_step = sess_mgr.start_step * traj_len
    self.logger = Logger()
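
The comments in Example #3 suggest the DEFAULTS-driven keyword arguments are bound via gin config and the package's __init__ files. A minimal, generic gin-config sketch with made-up function and parameter names, not the library's actual config:

import gin


@gin.configurable
def make_agent(value_coef=0.5, entropy_coef=0.01):
    return {'value_coef': value_coef, 'entropy_coef': entropy_coef}


# these bindings would normally live in a .gin file
gin.parse_config("""
make_agent.value_coef = 0.25
make_agent.entropy_coef = 0.001
""")

print(make_agent())  # {'value_coef': 0.25, 'entropy_coef': 0.001}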
Example #4
    def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder,
        policy_cls: PolicyType,
        sess_mgr: SessionManager = None,
        traj_len=16,
        batch_sz=16,
        discount=0.99,
        gae_lambda=0.95,
        clip_rewards=0.0,
        normalize_advantages=True,
        bootstrap_terminals=False,
        clip_grads_norm=0.0,
        optimizer=tf.train.AdamOptimizer(),
        logger=Logger()
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

        if not sess_mgr:
            sess_mgr = SessionManager()

        self.sess_mgr = sess_mgr
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_advantages = normalize_advantages
        self.bootstrap_terminals = bootstrap_terminals
        self.logger = logger

        self.model = model_fn(obs_spec, act_spec)
        self.value = self.model.outputs[-1]
        self.policy = policy_cls(act_spec, self.model.outputs[:-1])
        self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

        grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
        self.grads_norm = tf.global_norm(grads)
        if clip_grads_norm > 0.:
            grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
        self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.sess_mgr.global_step)

        self.sess_mgr.restore_or_init()
        # NB! changing trajectory length in-between checkpoints will break the logs
        self.n_batches = self.sess_mgr.start_step
        self.start_step = self.sess_mgr.start_step * traj_len
Example #5
    def __init__(
        self,
        obs_spec,
        act_spec,
        model_fn=build_mlp,
        policy_cls=MultiPolicy,
        sess_mgr=None,
        traj_len=16,
        batch_sz=16,
        discount=0.99,
        gae_lambda=0.95,
        clip_rewards=0.0,
        normalize_advantages=True,
        bootstrap_terminals=False,
        clip_grads_norm=0.0,
        optimizer=tf.train.AdamOptimizer(),
        logger=Logger()
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

        if not sess_mgr:
            sess_mgr = SessionManager()

        self.sess_mgr = sess_mgr
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_advantages = normalize_advantages
        self.bootstrap_terminals = bootstrap_terminals
        self.logger = logger

        self.model = model_fn(obs_spec, act_spec)
        self.value = self.model.outputs[-1]
        self.policy = policy_cls(act_spec, self.model.outputs[:-1])
        self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

        grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
        self.grads_norm = tf.global_norm(grads)
        if clip_grads_norm > 0.:
            grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
        self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.sess_mgr.global_step)

        self.sess_mgr.restore_or_init()
        # NB! changing trajectory length in-between checkpoints will break the logs
        self.n_batches = self.sess_mgr.start_step
        self.start_step = self.sess_mgr.start_step * traj_len
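
The discount and gae_lambda arguments stored by these constructors are the parameters of generalized advantage estimation. A generic NumPy sketch of that computation (the standard GAE recurrence, not the library's own implementation):

import numpy as np


def compute_gae(rewards, values, next_value, dones, discount=0.99, gae_lambda=0.95):
    """A_t = delta_t + discount * gae_lambda * (1 - done_t) * A_{t+1},
    with delta_t = r_t + discount * V_{t+1} * (1 - done_t) - V_t."""
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)

    advantages = np.zeros_like(rewards)
    last_adv = 0.0
    next_v = next_value
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + discount * next_v * non_terminal - values[t]
        last_adv = delta + discount * gae_lambda * non_terminal * last_adv
        advantages[t] = last_adv
        next_v = values[t]
    returns = advantages + values   # regression targets for the value head
    return advantages, returns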
Example #6
    def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_variable_scope=DEFAULTS['model_variable_scope'],
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
        **kwargs,
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)
        print(LOGGING_MSG_HEADER +
              ": the traj_len is {} and batch_sz is {}".format(
                  traj_len, batch_sz))

        if not sess_mgr:
            sess_mgr = SessionManager()

        self.subenvs = subenvs = kwargs.get('subenvs', [])

        if optimizer:
            optimizers = [copy.deepcopy(optimizer) for subenv in subenvs]
        else:
            optimizer = tf.train.AdamOptimizer(
                learning_rate=DEFAULTS['learning_rate'])
            optimizers = [
                tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])
                for subenv in subenvs
            ]

        self.sess_mgr = sess_mgr
        self.model_variable_scope = self.sess_mgr.model_variable_scope
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_returns = normalize_returns
        self.normalize_advantages = normalize_advantages
        self.traj_len = traj_len
        self.batch_sz = batch_sz

        print(LOGGING_MSG_HEADER + " : the current model_variable_scope is",
              self.model_variable_scope)
        # implement the a2c to support multiple subagents
        # self.model = model_fn(obs_spec, act_spec)
        with sess_mgr.sess.graph.as_default():
            # note this is name_scope as opposed to variable_scope, important
            with tf.name_scope(self.sess_mgr.main_tf_vs.original_name_scope):

                if subenvs:
                    from collections import defaultdict
                    self.subenv_dict = defaultdict(list)
                    print(
                        LOGGING_MSG_HEADER +
                        ": Creating a model for each individual subenv: ",
                        subenvs)

                    for i, subenv in enumerate(subenvs):
                        subenv_model = model_fn(obs_spec, act_spec)
                        self.subenv_dict['models'].append(subenv_model)

                        subenv_value = subenv_model.outputs[-1]
                        self.subenv_dict['values'].append(subenv_value)
                        subenv_policy = policy_cls(act_spec,
                                                   subenv_model.outputs[:-1])
                        self.subenv_dict['policies'].append(subenv_policy)

                        subenv_loss_op, subenv_loss_terms, subenv_loss_inputs = self.loss_fn(
                            policy=subenv_policy, value=subenv_value)
                        self.subenv_dict['loss_ops'].append(subenv_loss_op)
                        self.subenv_dict['loss_terms'].append(
                            subenv_loss_terms)
                        self.subenv_dict['loss_inputs'].append(
                            subenv_loss_inputs)

                        subenv_optimizer = optimizers[i]
                        grads, vars = zip(*subenv_optimizer.compute_gradients(
                            subenv_loss_op))

                        subenv_grads_norm = tf.global_norm(grads)
                        self.subenv_dict['grads_norms'].append(
                            subenv_grads_norm)
                        if clip_grads_norm > 0:
                            grads, _ = tf.clip_by_global_norm(
                                grads, clip_grads_norm, subenv_grads_norm)
                        self.subenv_dict['train_ops'].append(
                            subenv_optimizer.apply_gradients(
                                zip(grads, vars),
                                global_step=sess_mgr.global_step))
                        self.subenv_dict['minimize_ops'].append(
                            self.make_minimize_ops(subenv_id=i))
                    print(
                        LOGGING_MSG_HEADER +
                        ": Successfully created a model for each individual subenv"
                    )
                else:
                    print(LOGGING_MSG_HEADER +
                          ": Creating single model for the environment.")

                    self.model = model_fn(obs_spec, act_spec)
                    self.value = self.model.outputs[-1]
                    self.policy = policy_cls(act_spec, self.model.outputs[:-1])
                    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn(
                    )

                    grads, vars = zip(
                        *optimizer.compute_gradients(self.loss_op))
                    self.grads_norm = tf.global_norm(grads)
                    if clip_grads_norm > 0.:
                        grads, _ = tf.clip_by_global_norm(
                            grads, clip_grads_norm, self.grads_norm)
                    self.train_op = optimizer.apply_gradients(
                        zip(grads, vars), global_step=sess_mgr.global_step)
                    self.minimize_ops = self.make_minimize_ops()

        print(LOGGING_MSG_HEADER +
              " : main_model setup on sess and graph complete")
        sess_mgr.restore_or_init()
        print(LOGGING_MSG_HEADER +
              " : main_model weights restore/init complete")

        self.n_batches = sess_mgr.start_step
        self.start_step = sess_mgr.start_step * traj_len

        self.logger = Logger()
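
Several of the constructors above accept clip_rewards, normalize_returns and normalize_advantages but only store them; the actual transformations happen later in the training step. A generic sketch of what these flags typically control (assumed behaviour, not the library's own code):

import numpy as np


def preprocess(rewards, returns, advantages, clip_rewards=0.0,
               normalize_returns=False, normalize_advantages=True):
    rewards = np.asarray(rewards, dtype=np.float32)
    returns = np.asarray(returns, dtype=np.float32)
    advantages = np.asarray(advantages, dtype=np.float32)

    if clip_rewards > 0.0:
        # e.g. clip_rewards=1.0 squashes every reward into [-1, 1]
        rewards = np.clip(rewards, -clip_rewards, clip_rewards)
    if normalize_returns:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    if normalize_advantages:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return rewards, returns, advantages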