def init_algo_params(self):
    '''Initialize other algorithm parameters'''
    algorithm_spec = self.agent.spec['algorithm']
    net_spec = self.agent.spec['net']
    self.action_policy = act_fns[algorithm_spec['action_policy']]
    self.action_policy_update = act_update_fns[algorithm_spec['action_policy_update']]
    util.set_attr(self, _.pick(algorithm_spec, [
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'training_epoch',  # how many batches to train each time
        'training_frequency',  # how often to train (once every few timesteps)
        'training_iters_per_batch',  # how many times to train each batch
        'training_min_timestep',  # how long before starting training
    ]))
    self.nanflat_explore_var_a = [self.explore_var_start] * self.agent.body_num
def __init__(self, memory_spec, body):
    # clip_reward is expected to be provided in memory_spec
    util.set_attr(self, memory_spec, [
        'batch_size',
        'seq_len',
        'clip_reward',
        'game',
    ])
    self.total_reward = 0
    data_folder = os.path.join('data', 'experience', self.game)
    self.episode_intervals, self.data = self.load_episodes(data_folder, self.game)
    valid_seq_idx_ranges = list()
    for start, end in self.episode_intervals:
        if end - start + 1 < self.seq_len:
            continue
        valid_seq_idx_ranges.append((0, end - self.seq_len + 2))
    self.valid_seq_idx_ranges = valid_seq_idx_ranges
    total = sum([end - start + 1 for start, end in self.valid_seq_idx_ranges])
    self.valid_seq_idx_weights = [(end - start + 1) / total for start, end in self.valid_seq_idx_ranges]
    self.is_episodic = False
def init_nets(self): '''Initialize the neural network used to learn the Q function from the spec''' body = self.agent.nanflat_body_a[0] # single-body algo state_dim = body.state_dim # dimension of the environment state, e.g. 4 action_dim = body.action_dim # dimension of the environment actions, e.g. 2 net_spec = self.agent.spec['net'] net_kwargs = util.compact_dict( dict( hid_layers_activation=_.get(net_spec, 'hid_layers_activation'), optim_param=_.get(net_spec, 'optim'), loss_param=_.get(net_spec, 'loss'), clamp_grad=_.get(net_spec, 'clamp_grad'), clamp_grad_val=_.get(net_spec, 'clamp_grad_val'), )) self.net = getattr(net, net_spec['type'])(state_dim, net_spec['hid_layers'], action_dim, **net_kwargs) util.set_attr( self, _.pick( net_spec, [ # how many examples to learn per training iteration 'batch_size', 'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep', ]))
def init_nets(self): super(DQN, self).init_nets() # Network update params net_spec = self.agent.spec['net'] util.set_attr(self, _.pick(net_spec, [ 'update_type', 'update_frequency', 'polyak_weight', ]))
def init_algorithm_params(self):
    '''Initialize other algorithm parameters.'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        'action_policy_update',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    for body in self.agent.nanflat_body_a:
        body.explore_var = self.explore_var_start
def test_set_attr():
    class Foo:
        bar = 0
    foo = Foo()
    util.set_attr(foo, {'bar': 1, 'baz': 2})
    assert foo.bar == 1
    assert foo.baz == 2
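# The snippets above and below call util.set_attr in two forms: set_attr(obj, attr_dict) copies
# every key onto obj, while set_attr(obj, spec, keys) copies only the listed keys that are
# present in spec, so defaults set beforehand survive when the spec omits a key. The helper
# below is a minimal sketch of that behavior inferred from these call sites; the name
# set_attr_sketch and its body are assumptions, not the library's actual implementation.
def set_attr_sketch(obj, attr_dict, keys=None):
    '''Set attributes on obj from attr_dict; if keys is given, copy only those keys.'''
    if keys is not None:
        # keep only the requested keys that actually appear in attr_dict,
        # so earlier defaults are preserved for keys the spec does not define
        attr_dict = {k: v for k, v in attr_dict.items() if k in keys}
    for attr, val in attr_dict.items():
        setattr(obj, attr, val)
    return obj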
def __init__(self, memory_spec, body):
    util.set_attr(self, memory_spec, [
        'batch_size',
        'max_size',
        'stack_len',
        'use_cer',
        'game',
    ])
    Replay.__init__(self, memory_spec, body)
    self.states_shape = self.scalar_shape
    self.states = [None] * self.max_size
    self._accept_data = True
    data_folder = os.path.join('data', self.game)
    actions = np.load(os.path.join(data_folder, f'{self.game}_actions.npy'))
    dones = np.load(os.path.join(data_folder, f'{self.game}_dones.npy'))
    rewards = np.load(os.path.join(data_folder, f'{self.game}_rewards.npy'))
    states = np.load(os.path.join(data_folder, f'{self.game}_states.npy'))
    for i in range(states.shape[0]):
        state = LazyFrames(states[i])
        self.add_experience(state, actions[i], rewards[i], self.last_state, dones[i])
        self.last_state = state
    self._accept_data = False
def init_algorithm_params(self):
    # set default
    util.set_attr(self, dict(
        action_pdtype='Argmax',
        action_policy='epsilon_greedy',
        action_policy_update='linear_decay',
        explore_var_start=1.0,
        explore_var_end=0.1,
        explore_anneal_epi=100,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        'action_policy_update',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'training_batch_epoch',  # how many gradient updates per batch
        'training_epoch',  # how many batches to train each time
        'training_frequency',  # how often to train (once every few timesteps)
        'training_min_timestep',  # how long before starting training
        'normalize_state',
    ])
    super(VanillaDQN, self).init_algorithm_params()
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have a policy update; this implementation offers the option anyway
        'action_policy_update',
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',
        'lam',
        'clip_eps',
        'entropy_coef',
        'training_frequency',  # horizon
        'training_epoch',
    ])
    # use the same annealing epi as lr
    self.clip_eps_anneal_epi = self.net_spec['lr_decay_min_timestep'] + self.net_spec['lr_decay_frequency'] * 20
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    for body in self.agent.nanflat_body_a:
        body.explore_var = self.explore_var_start
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, REINFORCE does not have a policy update; this implementation offers the option anyway
        'action_policy_update',
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'add_entropy',
        'entropy_coef',
        'continuous_action_clip',
        'training_frequency',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    for body in self.agent.nanflat_body_a:
        body.explore_var = self.explore_var_start
def __init__(self, memory_spec, body): # set default util.set_attr(self, dict(cross_entropy=1.0, )) util.set_attr(self, memory_spec, [ 'cross_entropy', ]) super().__init__(memory_spec, body)
def init_algorithm_params(self):
    # set default
    util.set_attr(self, dict(
        action_pdtype='Argmax',
        action_policy='epsilon_greedy',
        action_policy_update='linear_decay',
        explore_var_start=1.0,
        explore_var_end=0.1,
        explore_anneal_epi=100,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        'action_policy_update',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'training_batch_epoch',  # how many gradient updates per batch
        'training_epoch',  # how many batches to train each time
        'training_frequency',  # how often to train (once every few timesteps)
        'training_min_timestep',  # how long before starting training
    ])
    super(VanillaDQN, self).init_algorithm_params()
def init_algorithm_params(self):
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have a policy update; this implementation offers the option anyway
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'minibatch_size',
        'time_horizon',  # training_frequency = actor * horizon
        'training_epoch',
    ])
    self.to_train = 0
    self.training_frequency = self.time_horizon * self.body.env.num_envs
    assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # extra variable decays for PPO
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        policy_loss_coef=1.0,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, AC does not have a policy update; this implementation offers the option anyway
        'explore_var_spec',
        'gamma',  # the discount factor
        'lam',
        'num_step_returns',
        'entropy_coef_spec',
        'policy_loss_coef',
        'val_loss_coef',
        'sil_policy_loss_coef',
        'sil_val_loss_coef',
        'training_frequency',
        'training_batch_iter',
        'training_iter',
    ])
    super().init_algorithm_params()
def init_algorithm_params(self): '''Initialize other algorithm parameters''' # set default util.set_attr(self, dict( action_pdtype='default', action_policy='default', center_return=False, explore_var_spec=None, entropy_coef_spec=None, policy_loss_coef=1.0, )) util.set_attr(self, self.algorithm_spec, [ 'action_pdtype', 'action_policy', 'center_return', # center by the mean 'explore_var_spec', 'gamma', # the discount factor 'entropy_coef_spec', 'policy_loss_coef', 'training_frequency', ]) self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy) self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec) self.body.explore_var = self.explore_var_scheduler.start_val if self.entropy_coef_spec is not None: self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec) self.body.entropy_coef = self.entropy_coef_scheduler.start_val
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, REINFORCE does not have a policy update; this implementation offers the option anyway
        'action_policy_update',
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'add_entropy',
        'entropy_coef',
        'training_frequency',
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    self.body.explore_var = self.explore_var_start
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, REINFORCE does not have a policy update; this implementation offers the option anyway
        'explore_var_spec',
        'gamma',  # the discount factor
        'entropy_coef_spec',
        'training_frequency',
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
def init_algorithm_params(self): '''Initialize other algorithm parameters''' # set default util.set_attr( self, dict( action_pdtype='default', action_policy='default', training_iter=self.body.env.num_envs, training_start_step=self.body.memory.batch_size, )) util.set_attr( self, self.algorithm_spec, [ 'action_pdtype', 'action_policy', 'gamma', # the discount factor 'training_iter', 'training_frequency', 'training_start_step', ]) if self.body.is_discrete: assert self.action_pdtype == 'GumbelSoftmax' self.to_train = 0 self.action_policy = getattr(policy_util, self.action_policy)
def init_nets(self): '''Initialize nets with multi-task dimensions, and set net params''' self.state_dims = [ body.state_dim for body in self.agent.nanflat_body_a] self.action_dims = [ body.action_dim for body in self.agent.nanflat_body_a] self.total_state_dim = sum(self.state_dims) self.total_action_dim = sum(self.action_dims) net_spec = self.agent.spec['net'] net_kwargs = util.compact_dict(dict( hid_layers_activation=_.get(net_spec, 'hid_layers_activation'), optim_param=_.get(net_spec, 'optim'), loss_param=_.get(net_spec, 'loss'), clamp_grad=_.get(net_spec, 'clamp_grad'), clamp_grad_val=_.get(net_spec, 'clamp_grad_val'), )) self.net = getattr(net, net_spec['type'])( self.total_state_dim, net_spec['hid_layers'], self.total_action_dim, **net_kwargs) self.target_net = getattr(net, net_spec['type'])( self.total_state_dim, net_spec['hid_layers'], self.total_action_dim, **net_kwargs) self.online_net = self.target_net self.eval_net = self.target_net util.set_attr(self, _.pick(net_spec, [ 'batch_size', 'update_type', 'update_frequency', 'polyak_weight', ]))
def init_nets(self): '''Initialize nets with multi-task dimensions, and set net params''' # NOTE: Separate init from MultitaskDQN despite similarities so that this implementation can support arbitrary sized state and action heads (e.g. multiple layers) net_spec = self.agent.spec['net'] if len(net_spec['hid_layers']) > 0: state_head_out_d = int(net_spec['hid_layers'][0] / 4) else: state_head_out_d = 16 self.state_dims = [ [body.state_dim, state_head_out_d] for body in self.agent.nanflat_body_a] self.action_dims = [ [body.action_dim] for body in self.agent.nanflat_body_a] self.total_state_dim = sum([s[0] for s in self.state_dims]) self.total_action_dim = sum([a[0] for a in self.action_dims]) logger.debug( f'State dims: {self.state_dims}, total: {self.total_state_dim}') logger.debug( f'Action dims: {self.action_dims}, total: {self.total_action_dim}') net_kwargs = util.compact_dict(dict( hid_layers_activation=_.get(net_spec, 'hid_layers_activation'), optim_param=_.get(net_spec, 'optim'), loss_param=_.get(net_spec, 'loss'), clamp_grad=_.get(net_spec, 'clamp_grad'), clamp_grad_val=_.get(net_spec, 'clamp_grad_val'), )) self.net = getattr(net, net_spec['type'])( self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs) self.target_net = getattr(net, net_spec['type'])( self.state_dims, net_spec['hid_layers'], self.action_dims, **net_kwargs) self.online_net = self.target_net self.eval_net = self.target_net util.set_attr(self, _.pick(net_spec, [ 'batch_size', 'update_type', 'update_frequency', 'polyak_weight', ]))
def init_nets(self): '''Initialize networks''' body = self.agent.nanflat_body_a[0] # single-body algo state_dim = body.state_dim action_dim = body.action_dim net_spec = self.agent.spec['net'] net_kwargs = util.compact_dict(dict( hid_layers_activation=_.get(net_spec, 'hid_layers_activation'), optim_param=_.get(net_spec, 'optim'), loss_param=_.get(net_spec, 'loss'), clamp_grad=_.get(net_spec, 'clamp_grad'), clamp_grad_val=_.get(net_spec, 'clamp_grad_val'), )) self.net = getattr(net, net_spec['type'])( state_dim, net_spec['hid_layers'], action_dim, **net_kwargs) self.target_net = getattr(net, net_spec['type'])( state_dim, net_spec['hid_layers'], action_dim, **net_kwargs) self.online_net = self.target_net self.eval_net = self.target_net util.set_attr(self, _.pick(net_spec, [ 'batch_size', ])) # Default network update params for base self.update_type = 'replace' self.update_frequency = 1 self.polyak_weight = 0.0
def init_algo_params(self): '''Initialize other algorithm parameters''' algorithm_spec = self.agent.spec['algorithm'] net_spec = self.agent.spec['net'] self.set_action_fn() util.set_attr(self, _.pick(algorithm_spec, [ 'gamma', 'num_epis_to_collect', 'add_entropy', 'entropy_weight', 'continuous_action_clip', 'lamda', 'num_step_returns', 'training_frequency', 'training_iters_per_batch', 'use_GAE', 'policy_loss_weight', 'val_loss_weight', ])) util.set_attr(self, _.pick(net_spec, [ 'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep', ])) '''Select appropriate function for calculating state-action-value estimate (target)''' self.get_target = self.get_nstep_target if self.use_GAE: self.get_target = self.get_gae_target self.set_memory_flag() '''To save on a forward pass keep the log probs and entropy from each action''' self.saved_log_probs = [] self.entropy = [] self.to_train = 0
def init_algo_params(self): '''Initialize other algorithm parameters''' algorithm_spec = self.agent.spec['algorithm'] net_spec = self.agent.spec['net'] # Automatically selects appropriate discrete or continuous action policy if setting is default action_fn = algorithm_spec['action_policy'] if action_fn == 'default': if self.is_discrete: self.action_policy = act_fns['softmax'] else: self.action_policy = act_fns['gaussian'] else: self.action_policy = act_fns[action_fn] util.set_attr( self, _.pick(algorithm_spec, [ 'gamma', 'num_epis_to_collect', 'add_entropy', 'entropy_weight', 'continuous_action_clip' ])) util.set_attr( self, _.pick(net_spec, [ 'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep', 'gpu' ])) if not hasattr(self, 'gpu'): self.gpu = False logger.info(f'Training on gpu: {self.gpu}') # To save on a forward pass keep the log probs from each action self.saved_log_probs = [] self.entropy = [] self.to_train = 0
def init_algorithm_params(self):
    '''Initialize other algorithm parameters.'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_spec',
        'gamma',  # the discount factor
        'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
def init_algorithm_params(self):
    '''Initialize other algorithm parameters.'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        'action_policy_update',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    self.body.explore_var = self.explore_var_start
def init_algorithm_params(self):
    # set default
    util.set_attr(self, dict(
        action_pdtype='Argmax',
        action_policy='epsilon_greedy',
        explore_var_spec=None,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # explore_var is epsilon, tau, etc. depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_spec',
        'gamma',  # the discount factor
        'training_batch_iter',  # how many gradient updates per batch
        'training_iter',  # how many batches to train each time
        'training_frequency',  # how often to train (once every few timesteps)
        'training_start_step',  # how long before starting training
    ])
    super().init_algorithm_params()
def load(self, train_df): '''Load clock from the last row of body.train_df''' last_row = train_df.iloc[-1] last_clock_vals = ps.pick(last_row, *['epi', 't', 'wall_t', 'opt_step', 'frame']) util.set_attr(self, last_clock_vals) self.start_wall_t -= self.wall_t # offset elapsed wall_t
def __init__(self, net_spec, in_dim, out_dim):
    '''
    net_spec:
    hid_layers: list containing dimensions of the hidden layers
    hid_layers_activation: activation function for the hidden layers
    init_fn: weight initialization function
    clip_grad_val: clip gradient norm if value is not None
    loss_spec: measure of error between model predictions and correct outputs
    optim_spec: parameters for initializing the optimizer
    lr_scheduler_spec: PyTorch optim.lr_scheduler
    update_type: method to update network weights: 'replace' or 'polyak'
    update_frequency: how many total timesteps per update
    polyak_coef: ratio of polyak weight update
    gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
    '''
    nn.Module.__init__(self)
    super(MLPNet, self).__init__(net_spec, in_dim, out_dim)
    # set default
    util.set_attr(self, dict(
        init_fn=None,
        clip_grad_val=None,
        loss_spec={'name': 'MSELoss'},
        optim_spec={'name': 'Adam'},
        lr_scheduler_spec=None,
        update_type='replace',
        update_frequency=1,
        polyak_coef=0.0,
        gpu=False,
    ))
    util.set_attr(self, self.net_spec, [
        'shared', 'hid_layers', 'hid_layers_activation', 'init_fn', 'clip_grad_val',
        'loss_spec', 'optim_spec', 'lr_scheduler_spec',
        'update_type', 'update_frequency', 'polyak_coef', 'gpu',
    ])
    dims = [self.in_dim] + self.hid_layers
    self.model = net_util.build_fc_model(dims, self.hid_layers_activation)
    # add last layer with no activation
    # tails. avoid list for single-tail for compute speed
    if ps.is_integer(self.out_dim):
        self.model_tail = nn.Linear(dims[-1], self.out_dim)
    else:
        self.model_tails = nn.ModuleList([nn.Linear(dims[-1], out_d) for out_d in self.out_dim])
    net_util.init_layers(self, self.init_fn)
    for module in self.modules():
        module.to(self.device)
    self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
    self.optim = net_util.get_optim(self, self.optim_spec)
    self.lr_scheduler = net_util.get_lr_scheduler(self, self.lr_scheduler_spec)
def __init__(self, memory_spec, body): util.set_attr( self, memory_spec, [ 'stack_len', # number of stack states ]) OnPolicyReplay.__init__(self, memory_spec, body)
def __init__(self, net_spec, in_dim, out_dim):
    state_dim, action_dim = in_dim
    assert len(state_dim) == 3  # image shape (c,w,h)
    # conv body
    nn.Module.__init__(self)
    Net.__init__(self, net_spec, state_dim, out_dim)
    # set default
    util.set_attr(self, dict(
        out_layer_activation=None,
        init_fn=None,
        normalize=False,
        batch_norm=True,
        clip_grad_val=None,
        loss_spec={'name': 'MSELoss'},
        optim_spec={'name': 'Adam'},
        lr_scheduler_spec=None,
        update_type='replace',
        update_frequency=1,
        polyak_coef=0.0,
        gpu=False,
    ))
    util.set_attr(self, self.net_spec, [
        'conv_hid_layers', 'fc_hid_layers', 'hid_layers_activation', 'out_layer_activation',
        'init_fn', 'normalize', 'batch_norm', 'clip_grad_val',
        'loss_spec', 'optim_spec', 'lr_scheduler_spec',
        'update_type', 'update_frequency', 'polyak_coef', 'gpu',
    ])
    # state conv model
    self.conv_model = self.build_conv_layers(self.conv_hid_layers)
    self.conv_out_dim = self.get_conv_output_size()
    # state fc model
    self.fc_model = net_util.build_fc_model([self.conv_out_dim + action_dim] + self.fc_hid_layers, self.hid_layers_activation)
    # tail: affine transformation applied to the FC output
    tail_in_dim = self.fc_hid_layers[-1]
    self.model_tail = net_util.build_fc_model([tail_in_dim, self.out_dim], self.out_layer_activation)
    net_util.init_layers(self, self.init_fn)
    self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
    self.to(self.device)
    self.train()
def __init__(self, memory_spec, algorithm, body): util.set_attr(self, memory_spec, [ 'batch_size', 'max_size', ]) self.seq_len = algorithm.net_spec['seq_len'] super(SeqReplay, self).__init__(memory_spec, algorithm, body) self.state_buffer = deque(maxlen=self.seq_len) self.reset()
def set_net_attributes(self): '''Initializes additional parameters from the net spec. Called by init_nets''' net_spec = self.agent.spec['net'] util.set_attr(self, _.pick(net_spec, [ 'decay_lr', 'decay_lr_frequency', 'decay_lr_min_timestep', 'gpu' ])) if not hasattr(self, 'gpu'): self.gpu = False logger.info(f'Training on gpu: {self.gpu}')
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        minibatch_size=4,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have a policy update; this implementation offers the option anyway
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'minibatch_size',
        'time_horizon',  # training_frequency = actor * horizon
        'training_epoch',
    ])
    self.to_train = 0
    # guard
    num_envs = self.body.env.num_envs
    if self.minibatch_size % num_envs != 0 or self.time_horizon % num_envs != 0:
        self.minibatch_size = math.ceil(self.minibatch_size / num_envs) * num_envs
        self.time_horizon = math.ceil(self.time_horizon / num_envs) * num_envs
        logger.info(f'minibatch_size and time_horizon need to be multiples of num_envs; autocorrected values: minibatch_size: {self.minibatch_size} time_horizon: {self.time_horizon}')
    self.training_frequency = self.time_horizon  # since all memories store num_envs by batch in a list
    assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # extra variable decays for PPO
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets
def __init__(self, memory_spec, algorithm, body):
    super(OnPolicyReplay, self).__init__(memory_spec, algorithm, body)
    # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames
    util.set_attr(self, self.agent_spec['algorithm'], ['training_frequency'])
    self.state_buffer = deque(maxlen=0)  # for API consistency
    # Don't want total experiences reset when memory is cleared
    self.is_episodic = True
    self.total_experiences = 0
    self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
    self.reset()
def __init__(self, memory_spec, algorithm, body): util.set_attr(self, memory_spec, [ 'alpha', 'epsilon', 'batch_size', 'max_size', 'use_cer', ]) self.epsilon = torch.full((1,), self.epsilon) self.alpha = torch.full((1,), self.alpha) super(PrioritizedReplay, self).__init__(memory_spec, algorithm, body)
def __init__(self, memory_spec, algorithm, body): util.set_attr(self, memory_spec, [ 'batch_size', 'max_size', 'stack_len', # num_stack_states 'use_cer', ]) self.raw_state_dim = deepcopy(body.state_dim) # used for state_buffer body.state_dim = body.state_dim * self.stack_len # modify to use for net init for flattened stacked input super(StackReplay, self).__init__(memory_spec, algorithm, body) self.state_buffer = deque(maxlen=self.stack_len) self.reset()
def __init__(self, memory_spec, algorithm, body): super(Replay, self).__init__(memory_spec, algorithm, body) util.set_attr(self, self.memory_spec, [ 'batch_size', 'max_size', 'use_cer', ]) self.state_buffer = deque(maxlen=0) # for API consistency self.batch_idxs = None self.total_experiences = 0 # To track total experiences encountered even with forgetting self.reset() self.print_memory_info()
def __init__(self, memory_spec, algorithm, body): self.atari = True # Memory is specialized for playing Atari games util.set_attr(self, memory_spec, [ 'batch_size', 'max_size', 'stack_len', # num_stack_states 'use_cer', ]) self.raw_state_dim = (84, 84) body.state_dim = self.raw_state_dim + (self.stack_len,) # greyscale downsized, stacked Replay.__init__(self, memory_spec, algorithm, body) self.state_buffer = deque(maxlen=self.stack_len) self.reset()
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
        policy_loss_coef=1.0,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, AC does not have a policy update; this implementation offers the option anyway
        'action_policy_update',
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',  # the discount factor
        'use_gae',
        'lam',
        'use_nstep',
        'num_step_returns',
        'add_entropy',
        'entropy_coef',
        'policy_loss_coef',
        'val_loss_coef',
        'continuous_action_clip',
        'training_frequency',
        'training_epoch',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    for body in self.agent.nanflat_body_a:
        body.explore_var = self.explore_var_start
    # Select appropriate methods to calculate adv_targets and v_targets for training
    if self.use_gae:
        self.calc_advs_v_targets = self.calc_gae_advs_v_targets
    elif self.use_nstep:
        self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
    else:
        self.calc_advs_v_targets = self.calc_td_advs_v_targets
def __init__(self, env_spec, env_space, e=0): self.env_spec = env_spec self.env_space = env_space self.info_space = env_space.info_space self.e = e util.set_attr(self, self.env_spec) self.name = self.env_spec['name'] self.body_e = None self.nanflat_body_e = None # nanflatten version of bodies self.body_num = None worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:]) self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id) # spaces for NN auto input/output inference logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.') self.observation_spaces = [] self.action_spaces = [] for a in range(len(self.u_env.brain_names)): observation_shape = (self.get_observable_dim(a)['state'],) if self.get_brain(a).state_space_type == 'discrete': observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32) else: observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32) self.observation_spaces.append(observation_space) if self.is_discrete(a): action_space = gym.spaces.Discrete(self.get_action_dim(a)) else: action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32) self.action_spaces.append(action_space) for observation_space, action_space in zip(self.observation_spaces, self.action_spaces): set_gym_space_attr(observation_space) set_gym_space_attr(action_space) # TODO experiment to find out optimal benchmarking max_timestep, set # TODO ensure clock_speed from env_spec self.clock_speed = 1 self.clock = Clock(self.clock_speed) self.done = False
def __init__(self, env_spec, env_space, e=0): self.env_spec = env_spec self.env_space = env_space self.info_space = env_space.info_space util.set_attr(self, self.env_spec) self.name = self.env_spec['name'] self.e = e self.body_e = None self.nanflat_body_e = None # nanflatten version of bodies self.body_num = None self.u_env = gym.make(self.name) # spaces for NN auto input/output inference set_gym_space_attr(self.u_env.observation_space) self.observation_spaces = [self.u_env.observation_space] set_gym_space_attr(self.u_env.action_space) self.action_spaces = [self.u_env.action_space] self.max_timestep = self.max_timestep or self.u_env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps') # TODO ensure clock_speed from env_spec self.clock_speed = 1 self.clock = Clock(self.clock_speed) self.done = False
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        action_policy_update='no_update',
        explore_var_start=np.nan,
        explore_var_end=np.nan,
        explore_anneal_epi=np.nan,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have a policy update; this implementation offers the option anyway
        'action_policy_update',
        'explore_var_start', 'explore_var_end', 'explore_anneal_epi',
        'gamma',
        'lam',
        'clip_eps',
        'entropy_coef',
        'val_loss_coef',
        'training_frequency',  # horizon
        'training_epoch',
    ])
    # use the same annealing epi as lr
    self.clip_eps_anneal_epi = self.net_spec['lr_decay_min_timestep'] + self.net_spec['lr_decay_frequency'] * 20
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.action_policy_update = getattr(policy_util, self.action_policy_update)
    for body in self.agent.nanflat_body_a:
        body.explore_var = self.explore_var_start
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets
def __init__(self, net_spec, algorithm, in_dim, out_dim):
    '''
    net_spec:
    hid_layers: list with tuple consisting of two elements. (conv_hid, flat_hid)
        Note: tuple must contain two elements, use empty list if no such layers.
        1. conv_hid: list containing dimensions of the convolutional hidden layers. Assumed to all come before the flat layers.
            Note: a convolutional layer should specify the in_channel, out_channels, kernel_size, stride (of kernel steps), padding, and dilation (spacing between kernel points)
            E.g. [3, 16, (5, 5), 1, 0, (2, 2)]
            For more details, see http://pytorch.org/docs/master/nn.html#conv2d and https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
        2. flat_hid: list of dense layers following the convolutional layers
    hid_layers_activation: activation function for the hidden layers
    batch_norm: whether to add batch normalization after each convolutional layer, excluding the input layer.
    clip_grad: whether to clip the gradient
    clip_grad_val: the clip value
    loss_spec: measure of error between model predictions and correct outputs
    optim_spec: parameters for initializing the optimizer
    lr_decay: function to decay learning rate
    lr_decay_frequency: how many total timesteps per decay
    lr_decay_min_timestep: minimum amount of total timesteps before starting decay
    lr_anneal_timestep: timestep to anneal lr decay
    update_type: method to update network weights: 'replace' or 'polyak'
    update_frequency: how many total timesteps per update
    polyak_coef: ratio of polyak weight update
    gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
    '''
    # OpenAI gym provides images as W x H x C, PyTorch expects C x W x H
    in_dim = np.roll(in_dim, 1)
    # use generic multi-output for Convnet
    out_dim = np.reshape(out_dim, -1).tolist()
    nn.Module.__init__(self)
    super(ConvNet, self).__init__(net_spec, algorithm, in_dim, out_dim)
    # set default
    util.set_attr(self, dict(
        batch_norm=True,
        clip_grad=False,
        clip_grad_val=1.0,
        loss_spec={'name': 'MSELoss'},
        optim_spec={'name': 'Adam'},
        lr_decay='no_decay',
        update_type='replace',
        update_frequency=1,
        polyak_coef=0.0,
        gpu=False,
    ))
    util.set_attr(self, self.net_spec, [
        'hid_layers', 'hid_layers_activation', 'batch_norm',
        'clip_grad', 'clip_grad_val',
        'loss_spec', 'optim_spec',
        'lr_decay', 'lr_decay_frequency', 'lr_decay_min_timestep', 'lr_anneal_timestep',
        'update_type', 'update_frequency', 'polyak_coef', 'gpu',
    ])
    self.conv_hid_layers = self.hid_layers[0]
    self.dense_hid_layers = self.hid_layers[1]
    # conv layer
    self.conv_model = self.build_conv_layers(self.conv_hid_layers)
    # fc layer from flattened conv
    self.dense_model = self.build_dense_layers(self.dense_hid_layers)
    # tails
    tail_in_dim = self.dense_hid_layers[-1] if len(self.dense_hid_layers) > 0 else self.conv_out_dim
    self.model_tails = nn.ModuleList([nn.Linear(tail_in_dim, out_d) for out_d in self.out_dim])
    net_util.init_layers(self.modules())
    if torch.cuda.is_available() and self.gpu:
        for module in self.modules():
            module.cuda()
    self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
    self.optim = net_util.get_optim(self, self.optim_spec)
    self.lr_decay = getattr(net_util, self.lr_decay)
def __init__(self, net_spec, algorithm, in_dim, out_dim):
    '''
    net_spec:
    hid_layers: list containing dimensions of the hidden layers. The last element of the list should be the dimension of the hidden state for the recurrent layer. The other elements in the list are the dimensions of the MLP (if desired) which is to transform the state space.
    hid_layers_activation: activation function for the state_proc hidden layers
    rnn_hidden_size: rnn hidden_size
    rnn_num_layers: number of recurrent layers
    seq_len: length of the state history passed to the net
    clip_grad: whether to clip the gradient
    clip_grad_val: the clip value
    loss_spec: measure of error between model predictions and correct outputs
    optim_spec: parameters for initializing the optimizer
    lr_decay: function to decay learning rate
    lr_decay_frequency: how many total timesteps per decay
    lr_decay_min_timestep: minimum amount of total timesteps before starting decay
    lr_anneal_timestep: timestep to anneal lr decay
    update_type: method to update network weights: 'replace' or 'polyak'
    update_frequency: how many total timesteps per update
    polyak_coef: ratio of polyak weight update
    gpu: whether to train using a GPU. Note this will only work if a GPU is available, otherwise setting gpu=True does nothing
    '''
    # use generic multi-output for RNN
    out_dim = np.reshape(out_dim, -1).tolist()
    nn.Module.__init__(self)
    super(RecurrentNet, self).__init__(net_spec, algorithm, in_dim, out_dim)
    # set default
    util.set_attr(self, dict(
        rnn_num_layers=1,
        clip_grad=False,
        clip_grad_val=1.0,
        loss_spec={'name': 'MSELoss'},
        optim_spec={'name': 'Adam'},
        lr_decay='no_decay',
        update_type='replace',
        update_frequency=1,
        polyak_coef=0.0,
        gpu=False,
    ))
    util.set_attr(self, self.net_spec, [
        'hid_layers', 'hid_layers_activation',
        'rnn_hidden_size', 'rnn_num_layers', 'seq_len',
        'clip_grad', 'clip_grad_val',
        'loss_spec', 'optim_spec',
        'lr_decay', 'lr_decay_frequency', 'lr_decay_min_timestep', 'lr_anneal_timestep',
        'update_type', 'update_frequency', 'polyak_coef', 'gpu',
    ])
    # state processing model
    state_proc_dims = [self.in_dim] + self.hid_layers
    self.state_proc_model = net_util.build_sequential(state_proc_dims, self.hid_layers_activation)
    # RNN model
    self.rnn_input_dim = state_proc_dims[-1]
    self.rnn_model = nn.GRU(
        input_size=self.rnn_input_dim,
        hidden_size=self.rnn_hidden_size,
        num_layers=self.rnn_num_layers,
        batch_first=True)
    # tails
    self.model_tails = nn.ModuleList([nn.Linear(self.rnn_hidden_size, out_d) for out_d in self.out_dim])
    net_util.init_layers(self.modules())
    if torch.cuda.is_available() and self.gpu:
        for module in self.modules():
            module.cuda()
    self.loss_fn = net_util.get_loss_fn(self, self.loss_spec)
    self.optim = net_util.get_optim(self, self.optim_spec)
    self.lr_decay = getattr(net_util, self.lr_decay)