Example 1
 def init_algo_params(self):
     '''Initialize other algorithm parameters'''
     algorithm_spec = self.agent.spec['algorithm']
     net_spec = self.agent.spec['net']
     self.action_policy = act_fns[algorithm_spec['action_policy']]
     self.action_policy_update = act_update_fns[
         algorithm_spec['action_policy_update']]
     util.set_attr(
         self,
         _.pick(
             algorithm_spec,
             [
                 # explore_var is epsilon, tau, etc., depending on the action policy
                 # these control the trade-off between exploration and exploitation
                 'explore_var_start',
                 'explore_var_end',
                 'explore_anneal_epi',
                 'gamma',  # the discount factor
                 'training_epoch',  # how many batches to train each time
                 'training_frequency',  # how often to train (once every few timesteps)
                 'training_iters_per_batch',  # how many times to train each batch
                 'training_min_timestep',  # how long before starting training
             ]))
     self.nanflat_explore_var_a = [self.explore_var_start] * self.agent.body_num
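For reference, _.pick here is pydash's pick: it filters the spec dict down to the listed keys before util.set_attr copies them onto the object, while later examples obtain the same effect by passing the key list to set_attr directly. A small standalone illustration of the pre-filtering step, using a made-up algorithm_spec purely for demonstration:

import pydash as _

algorithm_spec = {'gamma': 0.99, 'training_epoch': 4, 'name': 'DQN'}  # hypothetical spec for illustration
picked = _.pick(algorithm_spec, ['gamma', 'training_epoch'])
print(picked)  # {'gamma': 0.99, 'training_epoch': 4} -- keys not listed, such as 'name', are dropped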
Example 2
    def __init__(self, memory_spec, body):
        util.set_attr(self, memory_spec, [
            'batch_size',
            'seq_len',
            'game'
        ])

        self.total_reward = 0
        # NOTE: clip_reward is assumed to come from memory_spec; seq_len is already set by util.set_attr above
        self.clip_reward = memory_spec.get('clip_reward', False)

        data_folder = join('data', 'experience', self.game)
        self.episode_intervals, self.data = self.load_episodes(data_folder, self.game)

        valid_seq_idx_ranges = list()
        for start, end in self.episode_intervals:
            if end - start + 1 < self.seq_len:
                continue
            valid_seq_idx_ranges.append((0, end - self.seq_len + 2))
        self.valid_seq_idx_ranges = valid_seq_idx_ranges

        total = sum([end - start + 1 for start, end in self.valid_seq_idx_ranges])
        self.valid_seq_idx_weights = [(end - start + 1) / total
                                      for start, end in self.valid_seq_idx_ranges]

        self.is_episodic = False
Example 3
 def init_nets(self):
     '''Initialize the neural network used to learn the Q function from the spec'''
     body = self.agent.nanflat_body_a[0]  # single-body algo
     state_dim = body.state_dim  # dimension of the environment state, e.g. 4
     action_dim = body.action_dim  # dimension of the environment actions, e.g. 2
     net_spec = self.agent.spec['net']
     net_kwargs = util.compact_dict(
         dict(
             hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
             optim_param=_.get(net_spec, 'optim'),
             loss_param=_.get(net_spec, 'loss'),
             clamp_grad=_.get(net_spec, 'clamp_grad'),
             clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
         ))
     self.net = getattr(net,
                        net_spec['type'])(state_dim, net_spec['hid_layers'],
                                          action_dim, **net_kwargs)
     util.set_attr(
         self,
         _.pick(
             net_spec,
             [
                 # how many examples to learn per training iteration
                 'batch_size',
                 'decay_lr',
                 'decay_lr_frequency',
                 'decay_lr_min_timestep',
             ]))
Example 4
 def init_nets(self):
     super(DQN, self).init_nets()
     # Network update params
     net_spec = self.agent.spec['net']
     util.set_attr(self, _.pick(net_spec, [
         'update_type', 'update_frequency', 'polyak_weight',
     ]))
Example 5
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters.'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         action_policy_update='no_update',
         explore_var_start=np.nan,
         explore_var_end=np.nan,
         explore_anneal_epi=np.nan,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         'action_policy_update',
         # explore_var is epsilon, tau, etc., depending on the action policy
         # these control the trade-off between exploration and exploitation
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',  # the discount factor
         'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util, self.action_policy_update)
     for body in self.agent.nanflat_body_a:
         body.explore_var = self.explore_var_start
Example 6
def test_set_attr():
    class Foo:
        bar = 0
    foo = Foo()
    util.set_attr(foo, {'bar': 1, 'baz': 2})
    assert foo.bar == 1
    assert foo.baz == 2
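The test above pins down what util.set_attr does: it copies every key-value pair of the given dict onto the object as attributes. Many of the other examples call it with a third argument, a list of keys to keep. A minimal sketch consistent with both usages (not the library's actual source; pydash is assumed available as ps for the key filtering):

import pydash as ps

def set_attr(obj, attr_dict, keys=None):
    '''Copy each entry of attr_dict onto obj as an attribute; if keys is given, copy only those keys.'''
    if keys is not None:
        attr_dict = ps.pick(attr_dict, keys)  # keep only the listed keys
    for attr, val in attr_dict.items():
        setattr(obj, attr, val)
    return obj

Under this sketch the test above passes, and set_attr(foo, spec, ['bar']) would copy only 'bar' even if spec contains other keys.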
Example 7
    def __init__(self, memory_spec, body):
        util.set_attr(self, memory_spec, [
            'batch_size',
            'max_size',
            'stack_len',
            'use_cer',
            'game',
        ])
        Replay.__init__(self, memory_spec, body)
        self.states_shape = self.scalar_shape
        self.states = [None] * self.max_size

        self._accept_data = True

        data_folder = os.path.join('data', self.game)
        actions = np.load(os.path.join(data_folder, '{}_actions.npy'.format(self.game)))
        dones = np.load(os.path.join(data_folder, '{}_dones.npy'.format(self.game)))
        rewards = np.load(os.path.join(data_folder, '{}_rewards.npy'.format(self.game)))
        states = np.load(os.path.join(data_folder, '{}_states.npy'.format(self.game)))

        for i in range(states.shape[0]):
            state = LazyFrames(states[i])
            self.add_experience(state, actions[i], rewards[i],
                                self.last_state, dones[i])
            self.last_state = state

        self._accept_data = False
Example 8
 def init_algorithm_params(self):
     # set default
     util.set_attr(self, dict(
         action_pdtype='Argmax',
         action_policy='epsilon_greedy',
         action_policy_update='linear_decay',
         explore_var_start=1.0,
         explore_var_end=0.1,
         explore_anneal_epi=100,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         'action_policy_update',
         # explore_var is epsilon, tau, etc., depending on the action policy
         # these control the trade-off between exploration and exploitation
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',  # the discount factor
         'training_batch_epoch',  # how many gradient updates per batch
         'training_epoch',  # how many batches to train each time
         'training_frequency',  # how often to train (once every few timesteps)
         'training_min_timestep',  # how long before starting training
         'normalize_state',
     ])
     super(VanillaDQN, self).init_algorithm_params()
Example 9
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         action_policy_update='no_update',
         explore_var_start=np.nan,
         explore_var_end=np.nan,
         explore_anneal_epi=np.nan,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, PPO does not have a policy update, but this implementation supports it as an option
         'action_policy_update',
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',
         'lam',
         'clip_eps',
         'entropy_coef',
         'training_frequency',  # horizon
         'training_epoch',
     ])
     # use the same annealing epi as lr
     self.clip_eps_anneal_epi = self.net_spec['lr_decay_min_timestep'] + self.net_spec['lr_decay_frequency'] * 20
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util, self.action_policy_update)
     for body in self.agent.nanflat_body_a:
         body.explore_var = self.explore_var_start
Example 10
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         action_policy_update='no_update',
         explore_var_start=np.nan,
         explore_var_end=np.nan,
         explore_anneal_epi=np.nan,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, REINFORCE does not have a policy update, but this implementation supports it as an option
         'action_policy_update',
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',  # the discount factor
         'add_entropy',
         'entropy_coef',
         'continuous_action_clip',
         'training_frequency',
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util, self.action_policy_update)
     for body in self.agent.nanflat_body_a:
         body.explore_var = self.explore_var_start
Example 11
 def __init__(self, memory_spec, body):
     # set default
     util.set_attr(self, dict(cross_entropy=1.0, ))
     util.set_attr(self, memory_spec, [
         'cross_entropy',
     ])
     super().__init__(memory_spec, body)
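This compact example shows the recurring two-step idiom used throughout: first call set_attr with a plain dict of defaults, then call it again with the spec plus a key list, so only keys actually present in the spec override the defaults. A small sketch of the idiom, reusing the set_attr function sketched after Example 6 and a hypothetical DummyMemory class:

class DummyMemory:
    def __init__(self, memory_spec):
        set_attr(self, dict(cross_entropy=1.0))         # step 1: defaults, always applied
        set_attr(self, memory_spec, ['cross_entropy'])  # step 2: spec values override, filtered to the listed keys

mem = DummyMemory({'cross_entropy': 0.5, 'batch_size': 32})
assert mem.cross_entropy == 0.5          # the spec value overrides the default
assert not hasattr(mem, 'batch_size')    # not in the key list, so never set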
Example 12
 def init_algorithm_params(self):
     # set default
     util.set_attr(self, dict(
         action_pdtype='Argmax',
         action_policy='epsilon_greedy',
         action_policy_update='linear_decay',
         explore_var_start=1.0,
         explore_var_end=0.1,
         explore_anneal_epi=100,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         'action_policy_update',
         # explore_var is epsilon, tau, etc., depending on the action policy
         # these control the trade-off between exploration and exploitation
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',  # the discount factor
         'training_batch_epoch',  # how many gradient updates per batch
         'training_epoch',  # how many batches to train each time
         'training_frequency',  # how often to train (once every few timesteps)
         'training_min_timestep',  # how long before starting training
     ])
     super(VanillaDQN, self).init_algorithm_params()
Example 13
 def init_algorithm_params(self):
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, PPO does not have a policy update, but this implementation supports it as an option
         'explore_var_spec',
         'gamma',
         'lam',
         'clip_eps_spec',
         'entropy_coef_spec',
         'val_loss_coef',
         'minibatch_size',
         'time_horizon',  # training_frequency = actor * horizon
         'training_epoch',
     ])
     self.to_train = 0
     self.training_frequency = self.time_horizon * self.body.env.num_envs
     assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
Example 14
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             policy_loss_coef=1.0,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, AC does not have a policy update, but this implementation supports it as an option
             'explore_var_spec',
             'gamma',  # the discount factor
             'lam',
             'num_step_returns',
             'entropy_coef_spec',
             'policy_loss_coef',
             'val_loss_coef',
             'sil_policy_loss_coef',
             'sil_val_loss_coef',
             'training_frequency',
             'training_batch_iter',
             'training_iter',
         ])
     super().init_algorithm_params()
Example 15
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         center_return=False,
         explore_var_spec=None,
         entropy_coef_spec=None,
         policy_loss_coef=1.0,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         'center_return',  # center by the mean
         'explore_var_spec',
         'gamma',  # the discount factor
         'entropy_coef_spec',
         'policy_loss_coef',
         'training_frequency',
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
Example 16
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             action_policy_update='no_update',
             explore_var_start=np.nan,
             explore_var_end=np.nan,
             explore_anneal_epi=np.nan,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, REINFORCE does not have a policy update, but this implementation supports it as an option
             'action_policy_update',
             'explore_var_start',
             'explore_var_end',
             'explore_anneal_epi',
             'gamma',  # the discount factor
             'add_entropy',
             'entropy_coef',
             'training_frequency',
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util,
                                         self.action_policy_update)
     self.body.explore_var = self.explore_var_start
Example 17
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, REINFORCE does not have a policy update, but this implementation supports it as an option
             'explore_var_spec',
             'gamma',  # the discount factor
             'entropy_coef_spec',
             'training_frequency',
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
Example 18
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             training_iter=self.body.env.num_envs,
             training_start_step=self.body.memory.batch_size,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             'gamma',  # the discount factor
             'training_iter',
             'training_frequency',
             'training_start_step',
         ])
     if self.body.is_discrete:
         assert self.action_pdtype == 'GumbelSoftmax'
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
Example 19
 def init_nets(self):
     '''Initialize nets with multi-task dimensions, and set net params'''
     self.state_dims = [
         body.state_dim for body in self.agent.nanflat_body_a]
     self.action_dims = [
         body.action_dim for body in self.agent.nanflat_body_a]
     self.total_state_dim = sum(self.state_dims)
     self.total_action_dim = sum(self.action_dims)
     net_spec = self.agent.spec['net']
     net_kwargs = util.compact_dict(dict(
         hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
         optim_param=_.get(net_spec, 'optim'),
         loss_param=_.get(net_spec, 'loss'),
         clamp_grad=_.get(net_spec, 'clamp_grad'),
         clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
     ))
     self.net = getattr(net, net_spec['type'])(
         self.total_state_dim, net_spec['hid_layers'], self.total_action_dim, **net_kwargs)
     self.target_net = getattr(net, net_spec['type'])(
         self.total_state_dim, net_spec['hid_layers'], self.total_action_dim, **net_kwargs)
     self.online_net = self.target_net
     self.eval_net = self.target_net
     util.set_attr(self, _.pick(net_spec, [
         'batch_size', 'update_type', 'update_frequency', 'polyak_weight',
     ]))
Example 20
 def init_nets(self):
     '''Initialize nets with multi-task dimensions, and set net params'''
     # NOTE: Separate init from MultitaskDQN despite similarities so that this implementation can support arbitrary sized state and action heads (e.g. multiple layers)
     net_spec = self.agent.spec['net']
     if len(net_spec['hid_layers']) > 0:
         state_head_out_d = int(net_spec['hid_layers'][0] / 4)
     else:
         state_head_out_d = 16
     self.state_dims = [
         [body.state_dim, state_head_out_d] for body in self.agent.nanflat_body_a]
     self.action_dims = [
         [body.action_dim] for body in self.agent.nanflat_body_a]
     self.total_state_dim = sum([s[0] for s in self.state_dims])
     self.total_action_dim = sum([a[0] for a in self.action_dims])
     logger.debug(
         f'State dims: {self.state_dims}, total: {self.total_state_dim}')
     logger.debug(
         f'Action dims: {self.action_dims}, total: {self.total_action_dim}')
     net_kwargs = util.compact_dict(dict(
         hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
         optim_param=_.get(net_spec, 'optim'),
         loss_param=_.get(net_spec, 'loss'),
         clamp_grad=_.get(net_spec, 'clamp_grad'),
         clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
     ))
     self.net = getattr(net, net_spec['type'])(
         self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs)
     self.target_net = getattr(net, net_spec['type'])(
         self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs)
     self.online_net = self.target_net
     self.eval_net = self.target_net
     util.set_attr(self, _.pick(net_spec, [
         'batch_size', 'update_type', 'update_frequency', 'polyak_weight',
     ]))
Example 21
 def init_nets(self):
     '''Initialize networks'''
     body = self.agent.nanflat_body_a[0]  # single-body algo
     state_dim = body.state_dim
     action_dim = body.action_dim
     net_spec = self.agent.spec['net']
     net_kwargs = util.compact_dict(dict(
         hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
         optim_param=_.get(net_spec, 'optim'),
         loss_param=_.get(net_spec, 'loss'),
         clamp_grad=_.get(net_spec, 'clamp_grad'),
         clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
     ))
     self.net = getattr(net, net_spec['type'])(
         state_dim, net_spec['hid_layers'], action_dim, **net_kwargs)
     self.target_net = getattr(net, net_spec['type'])(
         state_dim, net_spec['hid_layers'], action_dim, **net_kwargs)
     self.online_net = self.target_net
     self.eval_net = self.target_net
     util.set_attr(self, _.pick(net_spec, [
         'batch_size',
     ]))
     # Default network update params for base
     self.update_type = 'replace'
     self.update_frequency = 1
     self.polyak_weight = 0.0
Example 22
    def init_algo_params(self):
        '''Initialize other algorithm parameters'''
        algorithm_spec = self.agent.spec['algorithm']
        net_spec = self.agent.spec['net']
        self.set_action_fn()
        util.set_attr(self, _.pick(algorithm_spec, [
            'gamma',
            'num_epis_to_collect',
            'add_entropy', 'entropy_weight',
            'continuous_action_clip',
            'lamda', 'num_step_returns',
            'training_frequency', 'training_iters_per_batch',
            'use_GAE',
            'policy_loss_weight', 'val_loss_weight',
        ]))
        util.set_attr(self, _.pick(net_spec, [
            'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep',
        ]))
        # Select appropriate function for calculating the state-action-value estimate (target)
        self.get_target = self.get_nstep_target
        if self.use_GAE:
            self.get_target = self.get_gae_target
        self.set_memory_flag()
        # To save on a forward pass, keep the log probs and entropy from each action
        self.saved_log_probs = []
        self.entropy = []
        self.to_train = 0
Example 23
 def init_algo_params(self):
     '''Initialize other algorithm parameters'''
     algorithm_spec = self.agent.spec['algorithm']
     net_spec = self.agent.spec['net']
     # Automatically selects appropriate discrete or continuous action policy if setting is default
     action_fn = algorithm_spec['action_policy']
     if action_fn == 'default':
         if self.is_discrete:
             self.action_policy = act_fns['softmax']
         else:
             self.action_policy = act_fns['gaussian']
     else:
         self.action_policy = act_fns[action_fn]
     util.set_attr(
         self,
         _.pick(algorithm_spec, [
             'gamma', 'num_epis_to_collect', 'add_entropy',
             'entropy_weight', 'continuous_action_clip'
         ]))
     util.set_attr(
         self,
         _.pick(net_spec, [
             'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep',
             'gpu'
         ]))
     if not hasattr(self, 'gpu'):
         self.gpu = False
     logger.info(f'Training on gpu: {self.gpu}')
     # To save on a forward pass keep the log probs from each action
     self.saved_log_probs = []
     self.entropy = []
     self.to_train = 0
Example 24
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters.'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # explore_var is epsilon, tau, etc., depending on the action policy
             # these control the trade-off between exploration and exploitation
             'explore_var_spec',
             'gamma',  # the discount factor
             'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
Example 25
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters.'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             action_policy_update='no_update',
             explore_var_start=np.nan,
             explore_var_end=np.nan,
             explore_anneal_epi=np.nan,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             'action_policy_update',
             # explore_var is epsilon, tau, etc., depending on the action policy
             # these control the trade-off between exploration and exploitation
             'explore_var_start',
             'explore_var_end',
             'explore_anneal_epi',
             'gamma',  # the discount factor
             'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util,
                                         self.action_policy_update)
     self.body.explore_var = self.explore_var_start
Example 26
 def init_algorithm_params(self):
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='Argmax',
             action_policy='epsilon_greedy',
             explore_var_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # explore_var is epsilon, tau, etc., depending on the action policy
             # these control the trade-off between exploration and exploitation
             'explore_var_spec',
             'gamma',  # the discount factor
             'training_batch_iter',  # how many gradient updates per batch
             'training_iter',  # how many batches to train each time
             'training_frequency',  # how often to train (once every few timesteps)
             'training_start_step',  # how long before starting training
         ])
     super().init_algorithm_params()
Example 27
 def load(self, train_df):
     '''Load clock from the last row of body.train_df'''
     last_row = train_df.iloc[-1]
     last_clock_vals = ps.pick(last_row,
                               *['epi', 't', 'wall_t', 'opt_step', 'frame'])
     util.set_attr(self, last_clock_vals)
     self.start_wall_t -= self.wall_t  # offset elapsed wall_t
Example 28
    def __init__(self, net_spec, in_dim, out_dim):
        '''
        net_spec:
        hid_layers: list containing dimensions of the hidden layers
        hid_layers_activation: activation function for the hidden layers
        init_fn: weight initialization function
        clip_grad_val: clip gradient norm if value is not None
        loss_spec: measure of error between model predictions and correct outputs
        optim_spec: parameters for initializing the optimizer
        lr_scheduler_spec: PyTorch optim.lr_scheduler
        update_type: method to update network weights: 'replace' or 'polyak'
        update_frequency: how many total timesteps per update
        polyak_coef: ratio of polyak weight update
        gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
        '''
        nn.Module.__init__(self)
        super(MLPNet, self).__init__(net_spec, in_dim, out_dim)
        # set default
        util.set_attr(self, dict(
            init_fn=None,
            clip_grad_val=None,
            loss_spec={'name': 'MSELoss'},
            optim_spec={'name': 'Adam'},
            lr_scheduler_spec=None,
            update_type='replace',
            update_frequency=1,
            polyak_coef=0.0,
            gpu=False,
        ))
        util.set_attr(self, self.net_spec, [
            'shared',
            'hid_layers',
            'hid_layers_activation',
            'init_fn',
            'clip_grad_val',
            'loss_spec',
            'optim_spec',
            'lr_scheduler_spec',
            'update_type',
            'update_frequency',
            'polyak_coef',
            'gpu',
        ])

        dims = [self.in_dim] + self.hid_layers
        self.model = net_util.build_fc_model(dims, self.hid_layers_activation)
        # add last layer with no activation
        # tails. avoid list for single-tail for compute speed
        if ps.is_integer(self.out_dim):
            self.model_tail = nn.Linear(dims[-1], self.out_dim)
        else:
            self.model_tails = nn.ModuleList([nn.Linear(dims[-1], out_d) for out_d in self.out_dim])

        net_util.init_layers(self, self.init_fn)
        for module in self.modules():
            module.to(self.device)
        self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
        self.optim = net_util.get_optim(self, self.optim_spec)
        self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec)
Example 29
 def __init__(self, memory_spec, body):
     util.set_attr(
         self,
         memory_spec,
         [
             'stack_len',  # number of stack states
         ])
     OnPolicyReplay.__init__(self, memory_spec, body)
Example 30
    def __init__(self, net_spec, in_dim, out_dim):
        state_dim, action_dim = in_dim
        assert len(state_dim) == 3  # image shape (c,w,h)
        # conv body
        nn.Module.__init__(self)
        Net.__init__(self, net_spec, state_dim, out_dim)
        # set default
        util.set_attr(
            self,
            dict(
                out_layer_activation=None,
                init_fn=None,
                normalize=False,
                batch_norm=True,
                clip_grad_val=None,
                loss_spec={'name': 'MSELoss'},
                optim_spec={'name': 'Adam'},
                lr_scheduler_spec=None,
                update_type='replace',
                update_frequency=1,
                polyak_coef=0.0,
                gpu=False,
            ))
        util.set_attr(self, self.net_spec, [
            'conv_hid_layers',
            'fc_hid_layers',
            'hid_layers_activation',
            'out_layer_activation',
            'init_fn',
            'normalize',
            'batch_norm',
            'clip_grad_val',
            'loss_spec',
            'optim_spec',
            'lr_scheduler_spec',
            'update_type',
            'update_frequency',
            'polyak_coef',
            'gpu',
        ])
        # state conv model
        self.conv_model = self.build_conv_layers(self.conv_hid_layers)
        self.conv_out_dim = self.get_conv_output_size()

        # state fc model
        self.fc_model = net_util.build_fc_model(
            [self.conv_out_dim + action_dim] + self.fc_hid_layers,
            self.hid_layers_activation)

        # output tail: affine transformation from the last fc layer to out_dim
        tail_in_dim = self.fc_hid_layers[-1]
        self.model_tail = net_util.build_fc_model([tail_in_dim, self.out_dim],
                                                  self.out_layer_activation)

        net_util.init_layers(self, self.init_fn)
        self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
        self.to(self.device)
        self.train()
Example 31
 def __init__(self, memory_spec, algorithm, body):
     util.set_attr(self, memory_spec, [
         'batch_size',
         'max_size',
     ])
     self.seq_len = algorithm.net_spec['seq_len']
     super(SeqReplay, self).__init__(memory_spec, algorithm, body)
     self.state_buffer = deque(maxlen=self.seq_len)
     self.reset()
Example 32
 def set_net_attributes(self):
     '''Initializes additional parameters from the net spec. Called by init_nets'''
     net_spec = self.agent.spec['net']
     util.set_attr(self, _.pick(net_spec, [
         'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep', 'gpu'
     ]))
     if not hasattr(self, 'gpu'):
         self.gpu = False
     logger.info(f'Training on gpu: {self.gpu}')
Example 33
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             minibatch_size=4,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, PPO does not have a policy update, but this implementation supports it as an option
             'explore_var_spec',
             'gamma',
             'lam',
             'clip_eps_spec',
             'entropy_coef_spec',
             'val_loss_coef',
             'minibatch_size',
             'time_horizon',  # training_frequency = actor * horizon
             'training_epoch',
         ])
     self.to_train = 0
     # guard
     num_envs = self.body.env.num_envs
     if self.minibatch_size % num_envs != 0 or self.time_horizon % num_envs != 0:
         self.minibatch_size = math.ceil(
             self.minibatch_size / num_envs) * num_envs
         self.time_horizon = math.ceil(
             self.time_horizon / num_envs) * num_envs
         logger.info(
             f'minibatch_size and time_horizon need to be multiples of num_envs; autocorrected values: minibatch_size: {self.minibatch_size}, time_horizon: {self.time_horizon}'
         )
     self.training_frequency = self.time_horizon  # since each memory entry already stores a num_envs-sized batch
     assert self.memory_spec[
         'name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
Example 34
 def __init__(self, memory_spec, algorithm, body):
     super(OnPolicyReplay, self).__init__(memory_spec, algorithm, body)
     # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames
     util.set_attr(self, self.agent_spec['algorithm'], ['training_frequency'])
     self.state_buffer = deque(maxlen=0)  # for API consistency
     # Don't want total_experiences to reset when the memory is reset
     self.is_episodic = True
     self.total_experiences = 0
     self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
     self.reset()
Example 35
 def __init__(self, memory_spec, algorithm, body):
     util.set_attr(self, memory_spec, [
         'alpha',
         'epsilon',
         'batch_size',
         'max_size',
         'use_cer',
     ])
     self.epsilon = torch.full((1,), self.epsilon)
     self.alpha = torch.full((1,), self.alpha)
     super(PrioritizedReplay, self).__init__(memory_spec, algorithm, body)
Example 36
 def __init__(self, memory_spec, algorithm, body):
     util.set_attr(self, memory_spec, [
         'batch_size',
         'max_size',
         'stack_len',  # num_stack_states
         'use_cer',
     ])
     self.raw_state_dim = deepcopy(body.state_dim)  # used for state_buffer
     body.state_dim = body.state_dim * self.stack_len  # modify to use for net init for flattened stacked input
     super(StackReplay, self).__init__(memory_spec, algorithm, body)
     self.state_buffer = deque(maxlen=self.stack_len)
     self.reset()
Example 37
 def __init__(self, memory_spec, algorithm, body):
     super(Replay, self).__init__(memory_spec, algorithm, body)
     util.set_attr(self, self.memory_spec, [
         'batch_size',
         'max_size',
         'use_cer',
     ])
     self.state_buffer = deque(maxlen=0)  # for API consistency
     self.batch_idxs = None
     self.total_experiences = 0  # To track total experiences encountered even with forgetting
     self.reset()
     self.print_memory_info()
Example 38
 def __init__(self, memory_spec, algorithm, body):
     self.atari = True  # Memory is specialized for playing Atari games
     util.set_attr(self, memory_spec, [
         'batch_size',
         'max_size',
         'stack_len',  # num_stack_states
         'use_cer',
     ])
     self.raw_state_dim = (84, 84)
     body.state_dim = self.raw_state_dim + (self.stack_len,)  # greyscale downsized, stacked
     Replay.__init__(self, memory_spec, algorithm, body)
     self.state_buffer = deque(maxlen=self.stack_len)
     self.reset()
Example 39
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         action_policy_update='no_update',
         explore_var_start=np.nan,
         explore_var_end=np.nan,
         explore_anneal_epi=np.nan,
         policy_loss_coef=1.0,
         val_loss_coef=1.0,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, AC does not have a policy update, but this implementation supports it as an option
         'action_policy_update',
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',  # the discount factor
         'use_gae',
         'lam',
         'use_nstep',
         'num_step_returns',
         'add_entropy',
         'entropy_coef',
         'policy_loss_coef',
         'val_loss_coef',
         'continuous_action_clip',
         'training_frequency',
         'training_epoch',
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util, self.action_policy_update)
     for body in self.agent.nanflat_body_a:
         body.explore_var = self.explore_var_start
     # Select appropriate methods to calculate adv_targets and v_targets for training
     if self.use_gae:
         self.calc_advs_v_targets = self.calc_gae_advs_v_targets
     elif self.use_nstep:
         self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
     else:
         self.calc_advs_v_targets = self.calc_td_advs_v_targets
Example 40
    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        self.e = e
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
        self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
        # spaces for NN auto input/output inference
        logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
        self.observation_spaces = []
        self.action_spaces = []
        for a in range(len(self.u_env.brain_names)):
            observation_shape = (self.get_observable_dim(a)['state'],)
            if self.get_brain(a).state_space_type == 'discrete':
                observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
            else:
                observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
            self.observation_spaces.append(observation_space)
            if self.is_discrete(a):
                action_space = gym.spaces.Discrete(self.get_action_dim(a))
            else:
                action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
            self.action_spaces.append(action_space)
        for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
            set_gym_space_attr(observation_space)
            set_gym_space_attr(action_space)

        # TODO experiment to find out optimal benchmarking max_timestep, set
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False
Example 41
    def __init__(self, env_spec, env_space, e=0):
        self.env_spec = env_spec
        self.env_space = env_space
        self.info_space = env_space.info_space
        util.set_attr(self, self.env_spec)
        self.name = self.env_spec['name']
        self.e = e
        self.body_e = None
        self.nanflat_body_e = None  # nanflatten version of bodies
        self.body_num = None

        self.u_env = gym.make(self.name)
        # spaces for NN auto input/output inference
        set_gym_space_attr(self.u_env.observation_space)
        self.observation_spaces = [self.u_env.observation_space]
        set_gym_space_attr(self.u_env.action_space)
        self.action_spaces = [self.u_env.action_space]

        self.max_timestep = self.max_timestep or self.u_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
        # TODO ensure clock_speed from env_spec
        self.clock_speed = 1
        self.clock = Clock(self.clock_speed)
        self.done = False
Example 42
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         action_policy_update='no_update',
         explore_var_start=np.nan,
         explore_var_end=np.nan,
         explore_anneal_epi=np.nan,
         val_loss_coef=1.0,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, PPO does not have a policy update, but this implementation supports it as an option
         'action_policy_update',
         'explore_var_start',
         'explore_var_end',
         'explore_anneal_epi',
         'gamma',
         'lam',
         'clip_eps',
         'entropy_coef',
         'val_loss_coef',
         'training_frequency',  # horizon
         'training_epoch',
     ])
     # use the same annealing epi as lr
     self.clip_eps_anneal_epi = self.net_spec['lr_decay_min_timestep'] + self.net_spec['lr_decay_frequency'] * 20
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.action_policy_update = getattr(policy_util, self.action_policy_update)
     for body in self.agent.nanflat_body_a:
         body.explore_var = self.explore_var_start
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
Example 43
    def __init__(self, net_spec, algorithm, in_dim, out_dim):
        '''
        net_spec:
        hid_layers: list with tuple consisting of two elements. (conv_hid, flat_hid)
                    Note: tuple must contain two elements, use empty list if no such layers.
            1. conv_hid: list containing dimensions of the convolutional hidden layers. Assumed to all come before the flat layers.
                Note: a convolutional layer should specify the in_channel, out_channels, kernel_size, stride (of kernel steps), padding, and dilation (spacing between kernel points) E.g. [3, 16, (5, 5), 1, 0, (2, 2)]
                For more details, see http://pytorch.org/docs/master/nn.html#conv2d and https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md

            2. flat_hid: list of dense layers following the convolutional layers
        hid_layers_activation: activation function for the hidden layers
        batch_norm: whether to add batch normalization after each convolutional layer, excluding the input layer.
        clip_grad: whether to clip the gradient
        clip_grad_val: the clip value
        loss_spec: measure of error between model predictions and correct outputs
        optim_spec: parameters for initializing the optimizer
        lr_decay: function to decay learning rate
        lr_decay_frequency: how many total timesteps per decay
        lr_decay_min_timestep: minimum amount of total timesteps before starting decay
        lr_anneal_timestep: timestep to anneal lr decay
        update_type: method to update network weights: 'replace' or 'polyak'
        update_frequency: how many total timesteps per update
        polyak_coef: ratio of polyak weight update
        gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
        '''
        # OpenAI gym provides images as W x H x C, PyTorch expects C x W x H
        in_dim = np.roll(in_dim, 1)
        # use generic multi-output for Convnet
        out_dim = np.reshape(out_dim, -1).tolist()
        nn.Module.__init__(self)
        super(ConvNet, self).__init__(net_spec, algorithm, in_dim, out_dim)
        # set default
        util.set_attr(self, dict(
            batch_norm=True,
            clip_grad=False,
            clip_grad_val=1.0,
            loss_spec={'name': 'MSELoss'},
            optim_spec={'name': 'Adam'},
            lr_decay='no_decay',
            update_type='replace',
            update_frequency=1,
            polyak_coef=0.0,
            gpu=False,
        ))
        util.set_attr(self, self.net_spec, [
            'hid_layers',
            'hid_layers_activation',
            'batch_norm',
            'clip_grad',
            'clip_grad_val',
            'loss_spec',
            'optim_spec',
            'lr_decay',
            'lr_decay_frequency',
            'lr_decay_min_timestep',
            'lr_anneal_timestep',
            'update_type',
            'update_frequency',
            'polyak_coef',
            'gpu',
        ])

        self.conv_hid_layers = self.hid_layers[0]
        self.dense_hid_layers = self.hid_layers[1]
        # conv layer
        self.conv_model = self.build_conv_layers(self.conv_hid_layers)
        # fc layer from flattened conv
        self.dense_model = self.build_dense_layers(self.dense_hid_layers)
        # tails
        tail_in_dim = self.dense_hid_layers[-1] if len(self.dense_hid_layers) > 0 else self.conv_out_dim
        self.model_tails = nn.ModuleList([nn.Linear(tail_in_dim, out_d) for out_d in self.out_dim])

        net_util.init_layers(self.modules())
        if torch.cuda.is_available() and self.gpu:
            for module in self.modules():
                module.cuda()
        self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
        self.optim = net_util.get_optim(self, self.optim_spec)
        self.lr_decay = getattr(net_util, self.lr_decay)
Example 44
    def __init__(self, net_spec, algorithm, in_dim, out_dim):
        '''
        net_spec:
        hid_layers: list containing dimensions of the hidden layers. The last element of the list should be the dimension of the hidden state for the recurrent layer. The other elements in the list are the dimensions of the MLP (if desired) which transforms the state space.
        hid_layers_activation: activation function for the state_proc hidden layers
        rnn_hidden_size: rnn hidden_size
        rnn_num_layers: number of recurrent layers
        seq_len: length of the history of being passed to the net
        clip_grad: whether to clip the gradient
        clip_grad_val: the clip value
        loss_spec: measure of error between model predictions and correct outputs
        optim_spec: parameters for initializing the optimizer
        lr_decay: function to decay learning rate
        lr_decay_frequency: how many total timesteps per decay
        lr_decay_min_timestep: minimum amount of total timesteps before starting decay
        lr_anneal_timestep: timestep to anneal lr decay
        update_type: method to update network weights: 'replace' or 'polyak'
        update_frequency: how many total timesteps per update
        polyak_coef: ratio of polyak weight update
        gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
        '''
        # use generic multi-output for RNN
        out_dim = np.reshape(out_dim, -1).tolist()
        nn.Module.__init__(self)
        super(RecurrentNet, self).__init__(net_spec, algorithm, in_dim, out_dim)
        # set default
        util.set_attr(self, dict(
            rnn_num_layers=1,
            clip_grad=False,
            clip_grad_val=1.0,
            loss_spec={'name': 'MSELoss'},
            optim_spec={'name': 'Adam'},
            lr_decay='no_decay',
            update_type='replace',
            update_frequency=1,
            polyak_coef=0.0,
            gpu=False,
        ))
        util.set_attr(self, self.net_spec, [
            'hid_layers',
            'hid_layers_activation',
            'rnn_hidden_size',
            'rnn_num_layers',
            'seq_len',
            'clip_grad',
            'clip_grad_val',
            'loss_spec',
            'optim_spec',
            'lr_decay',
            'lr_decay_frequency',
            'lr_decay_min_timestep',
            'lr_anneal_timestep',
            'update_type',
            'update_frequency',
            'polyak_coef',
            'gpu',
        ])
        # state processing model
        state_proc_dims = [self.in_dim] + self.hid_layers
        self.state_proc_model = net_util.build_sequential(state_proc_dims, self.hid_layers_activation)

        # RNN model
        self.rnn_input_dim = state_proc_dims[-1]
        self.rnn_model = nn.GRU(
            input_size=self.rnn_input_dim,
            hidden_size=self.rnn_hidden_size,
            num_layers=self.rnn_num_layers,
            batch_first=True)

        # tails
        self.model_tails = nn.ModuleList([nn.Linear(self.rnn_hidden_size, out_d) for out_d in self.out_dim])

        net_util.init_layers(self.modules())
        if torch.cuda.is_available() and self.gpu:
            for module in self.modules():
                module.cuda()
        self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
        self.optim = net_util.get_optim(self, self.optim_spec)
        self.lr_decay = getattr(net_util, self.lr_decay)