def add_experience(self, state, action, reward, next_state, done, priority=1):
    '''Interface helper method for update() to add experience to memory'''
    self.states.append(state)
    self.actions.append(action)
    self.rewards.append(reward)
    self.next_states.append(next_state)
    self.dones.append(done)
    self.priorities.append(priority)
    # Set most recent
    self.most_recent[0] = state
    self.most_recent[1] = action
    self.most_recent[2] = reward
    self.most_recent[3] = next_state
    self.most_recent[4] = done
    self.most_recent[5] = priority
    # Track memory size and num experiences
    self.true_size += 1
    if self.true_size > 1000 and self.memory_warn_flag:
        logger.warn("Large memory size: {}".format(self.true_size))
        self.memory_warn_flag = False
    self.total_experiences += 1
    # Decide if agent is to train
    if done or len(self.states) == self.training_frequency:
        self.body.agent.algorithm.to_train = 1
def get_session_data(session):
    '''
    Gather data from session: MDP, Agent, Env data, hashed by aeb; then aggregate.
    @returns {dict, dict} session_mdp_data, session_data
    '''
    data_names = AGENT_DATA_NAMES + ENV_DATA_NAMES
    mdp_data_names = ['t', 'epi'] + data_names
    agg_data_names = ['epi'] + list(DATA_AGG_FNS.keys())
    data_h_v_dict = {data_name: session.aeb_space.get_history_v(data_name) for data_name in data_names}
    session_mdp_data, session_data = {}, {}
    for aeb in session.aeb_space.aeb_list:
        data_h_dict = {data_name: data_h_v[aeb] for data_name, data_h_v in data_h_v_dict.items()}
        # trim back to remove any incomplete sessions due to multienv termination
        complete_done_h = np.trim_zeros(data_h_dict['done'], 'b')
        # offset properly to bin separate episodes
        reset_bin = np.concatenate([[0.], complete_done_h[:-1]])
        data_len = len(reset_bin)
        reset_idx = reset_bin.astype('bool')
        nonreset_idx = ~reset_idx
        data_h_dict['t'] = np.ones(reset_idx.shape)
        data_h_dict['epi'] = reset_idx.astype(int).cumsum()
        mdp_df = pd.DataFrame({data_name: data_h_dict[data_name][:data_len] for data_name in mdp_data_names})
        mdp_df = mdp_df.reindex(mdp_data_names, axis=1)
        aeb_df = mdp_df[agg_data_names].groupby('epi').agg(DATA_AGG_FNS)
        aeb_df.reset_index(drop=False, inplace=True)
        session_mdp_data[aeb], session_data[aeb] = mdp_df, aeb_df
    logger.debug(f'{session_data}')
    data_size_in_bytes = util.memory_size(session_mdp_data)
    logger.debug(f'Size of session data: {data_size_in_bytes} MB')
    if data_size_in_bytes > 25:
        logger.warn('Session data > 25 MB')
    return session_mdp_data, session_data
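# A minimal, self-contained sketch (with made-up done flags, independent of the lab's
# data structures) of the episode-binning trick used in get_session_data() above:
# shifting the `done` history right by one step and taking a cumulative sum assigns
# each timestep an episode index.
import numpy as np

done_h = np.array([0., 0., 1., 0., 1., 0., 0.])           # hypothetical done flags
complete_done_h = np.trim_zeros(done_h, 'b')              # drop the trailing incomplete episode
reset_bin = np.concatenate([[0.], complete_done_h[:-1]])  # mark the step after each done as a reset
epi = reset_bin.astype(bool).astype(int).cumsum()         # episode index per timestep
print(epi)  # [0 0 0 1 1]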
def run_by_mode(spec_file, spec_name, run_mode):
    spec = spec_util.get(spec_file, spec_name)
    # TODO remove when analysis can save all plotly plots
    os.environ['run_mode'] = run_mode
    if run_mode == 'search':
        Experiment(spec).run()
    elif run_mode == 'train':
        Trial(spec).run()
    elif run_mode == 'enjoy':
        # TODO turn on save/load model mode
        # Session(spec).run()
        pass
    elif run_mode == 'generate_benchmark':
        benchmarker.generate_specs(spec, const='agent')
    elif run_mode == 'benchmark':
        # TODO allow changing const to env
        run_benchmark(spec, const='agent')
    elif run_mode == 'dev':
        os.environ['PY_ENV'] = 'test'  # to not save in viz
        spec = util.override_dev_spec(spec)
        Trial(spec).run()
    else:
        logger.warn('run_mode not recognized; must be one of `search, train, enjoy, benchmark, dev`.')
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    no_fitness_sr = pd.Series({'strength': 0., 'speed': 0., 'stability': 0.})
    if len(aeb_df) < MA_WINDOW:
        logger.warn(f'Run more than {MA_WINDOW} episodes to compute proper fitness')
        return no_fitness_sr
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.')
    aeb_df['total_t'] = aeb_df['t'].cumsum()
    aeb_df['strength'] = calc_strength(aeb_df, std['rand_epi_reward'], std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW).mean()
    aeb_df['strength_mono_inc'] = is_noisy_mono_inc(aeb_df['strength']).astype(int)
    strength = aeb_df['strength_ma'].max()
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength,
        'speed': speed,
        'stability': stability})
    return aeb_fitness_sr
def test_logger(test_multiline_str):
    logger.critical(test_multiline_str)
    logger.debug(test_multiline_str)
    logger.error(test_multiline_str)
    logger.exception(test_multiline_str)
    logger.info(test_multiline_str)
    logger.warn(test_multiline_str)
def run_by_mode(spec_file, spec_name, lab_mode):
    logger.info(f'Running lab in mode: {lab_mode}')
    spec = spec_util.get(spec_file, spec_name)
    info_space = InfoSpace()
    analysis.save_spec(spec, info_space, unit='experiment')
    # '@' is reserved for 'enjoy@{prepath}'
    os.environ['lab_mode'] = lab_mode.split('@')[0]
    os.environ['PREPATH'] = util.get_prepath(spec, info_space)
    reload(logger)  # to set PREPATH properly
    if lab_mode == 'search':
        info_space.tick('experiment')
        Experiment(spec, info_space).run()
    elif lab_mode.startswith('train'):
        if '@' in lab_mode:
            prepath = lab_mode.split('@')[1]
            spec, info_space = util.prepath_to_spec_info_space(prepath)
        else:
            info_space.tick('trial')
        Trial(spec, info_space).run()
    elif lab_mode.startswith('enjoy'):
        prepath = lab_mode.split('@')[1]
        spec, info_space = util.prepath_to_spec_info_space(prepath)
        Session(spec, info_space).run()
    elif lab_mode == 'dev':
        spec = util.override_dev_spec(spec)
        info_space.tick('trial')
        Trial(spec, info_space).run()
    else:
        logger.warn('lab_mode not recognized; must be one of `search, train, enjoy, benchmark, dev`.')
def run_by_mode(spec_file, spec_name, lab_mode):
    logger.info(f'Running lab in mode: {lab_mode}')
    spec = spec_util.get(spec_file, spec_name)
    info_space = InfoSpace()
    os.environ['PREPATH'] = util.get_prepath(spec, info_space)
    reload(logger)  # to set PREPATH properly
    # expose to runtime, '@' is reserved for 'enjoy@{prepath}'
    os.environ['lab_mode'] = lab_mode.split('@')[0]
    if lab_mode == 'search':
        info_space.tick('experiment')
        Experiment(spec, info_space).run()
    elif lab_mode == 'train':
        info_space.tick('trial')
        Trial(spec, info_space).run()
    elif lab_mode.startswith('enjoy'):
        prepath = lab_mode.split('@')[1]
        spec, info_space = util.prepath_to_spec_info_space(prepath)
        Session(spec, info_space).run()
    elif lab_mode == 'generate_benchmark':
        benchmarker.generate_specs(spec, const='agent')
    elif lab_mode == 'benchmark':
        # TODO allow changing const to env
        run_benchmark(spec, const='agent')
    elif lab_mode == 'dev':
        spec = util.override_dev_spec(spec)
        info_space.tick('trial')
        Trial(spec, info_space).run()
    else:
        logger.warn('lab_mode not recognized; must be one of `search, train, enjoy, benchmark, dev`.')
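# A tiny standalone illustration of the 'mode@prepath' convention parsed above
# (the prepath string here is made up): splitting on '@' separates the lab mode
# from the checkpoint prepath used to resume or replay a session.
lab_mode = 'enjoy@data/dqn_cartpole_t0_s0'
mode, prepath = lab_mode.split('@')
print(mode)     # enjoy
print(prepath)  # data/dqn_cartpole_t0_s0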
def test_logger(test_str):
    logger.critical(test_str)
    logger.debug(test_str)
    logger.error(test_str)
    logger.exception(test_str)
    logger.info(test_str)
    logger.warn(test_str)
def is_q_learning(algorithm):
    '''Check the algorithm is a Q-learning variant and action space is discrete'''
    assert hasattr(algorithm, 'body')
    is_q_algo = any(k in algorithm.algorithm_spec['name'] for k in ('DQN', 'SARSA'))
    is_q = algorithm.body.is_discrete and is_q_algo
    if not is_q:
        logger.warn('DuelingMLPNet only appropriate for Q-Learning algorithms. Currently implemented for single body algorithms in discrete action spaces')
    return is_q
def test_logger(test_multiline_str):
    logger.set_level('DEBUG')
    logger.critical(test_multiline_str)
    logger.debug(test_multiline_str)
    logger.error(test_multiline_str)
    logger.exception(test_multiline_str)
    logger.info(test_multiline_str)
    logger.warn(test_multiline_str)
def calc_session_metrics(session_df, env_name, info_prepath=None, df_mode=None):
    '''
    Calculate the session metrics: strength, efficiency, stability
    @param DataFrame:session_df Dataframe containing reward, frame, opt_step
    @param str:env_name Name of the environment to get its random baseline
    @param str:info_prepath Optional info_prepath to auto-save the output to
    @param str:df_mode Optional df_mode to save with info_prepath
    @returns dict:metrics Consists of scalar metrics and series local metrics
    '''
    rand_bl = random_baseline.get_random_baseline(env_name)
    if rand_bl is None:
        mean_rand_returns = 0.0
        logger.warn('Random baseline unavailable for environment. Please generate separately.')
    else:
        mean_rand_returns = rand_bl['mean']
    mean_returns = session_df['total_reward']
    frames = session_df['frame']
    opt_steps = session_df['opt_step']
    final_return_ma = mean_returns[-viz.PLOT_MA_WINDOW:].mean()
    str_, local_strs = calc_strength(mean_returns, mean_rand_returns)
    max_str, final_str = local_strs.max(), local_strs.iloc[-1]
    with warnings.catch_warnings():  # mute np.nanmean warning
        warnings.filterwarnings('ignore')
        sample_eff, local_sample_effs = calc_efficiency(local_strs, frames)
        train_eff, local_train_effs = calc_efficiency(local_strs, opt_steps)
        sta, local_stas = calc_stability(local_strs)
    # all the scalar session metrics
    scalar = {
        'final_return_ma': final_return_ma,
        'strength': str_,
        'max_strength': max_str,
        'final_strength': final_str,
        'sample_efficiency': sample_eff,
        'training_efficiency': train_eff,
        'stability': sta,
    }
    # all the session local metrics
    local = {
        'mean_returns': mean_returns,
        'strengths': local_strs,
        'sample_efficiencies': local_sample_effs,
        'training_efficiencies': local_train_effs,
        'stabilities': local_stas,
        'frames': frames,
        'opt_steps': opt_steps,
    }
    metrics = {
        'scalar': scalar,
        'local': local,
    }
    if info_prepath is not None:  # auto-save if info_prepath is given
        util.write(metrics, f'{info_prepath}_session_metrics_{df_mode}.pkl')
        util.write(scalar, f'{info_prepath}_session_metrics_scalar_{df_mode}.json')
        # save important metrics in info_prepath directly
        util.write(scalar, f'{info_prepath.replace("info/", "")}_session_metrics_scalar_{df_mode}.json')
    return metrics
def set_memory_flag(self):
    '''Flags if memory is episodic or discrete. This affects how the target and advantage functions are calculated'''
    body = self.agent.nanflat_body_a[0]
    memory = body.memory.__class__.__name__
    if memory.find('OnPolicyReplay') != -1:
        self.is_episodic = True
    elif memory.find('OnPolicyBatchReplay') != -1:
        self.is_episodic = False
    else:
        logger.warn(f'Error: Memory {memory} not recognized')
        raise NotImplementedError
def set_memory_flag(self):
    '''Flags if memory is episodic or discrete. This affects how self.sample() handles the batch it gets back from memory'''
    body = self.agent.nanflat_body_a[0]
    memory = body.memory.__class__.__name__
    if (memory.find('OnPolicyReplay') != -1) or (memory.find('OnPolicyNStepReplay') != -1):
        self.is_episodic = True
    elif (memory.find('OnPolicyBatchReplay') != -1) or (memory.find('OnPolicyNStepBatchReplay') != -1):
        self.is_episodic = False
    else:
        logger.warn(f'Error: Memory {memory} not recognized')
        raise NotImplementedError
def is_q_learning(algorithm):
    '''Check the algorithm is a Q-learning variant and action space is discrete'''
    assert hasattr(algorithm, 'body')
    is_q_algo = any(k in algorithm.algorithm_spec['name'] for k in ('DQN', 'SARSA'))
    is_q = algorithm.body.is_discrete and is_q_algo
    if not is_q:
        logger.warn('DuelingMLPNet only appropriate for Q-Learning algorithms. Currently implemented for single body algorithms in discrete action spaces')
    return is_q
def save_image(figure, filepath=None):
    if os.environ['PY_ENV'] == 'test':
        return
    if filepath is None:
        filepath = f'{PLOT_FILEDIR}/{ps.get(figure, "layout.title")}.png'
    filepath = util.smart_path(filepath)
    try:
        pio.write_image(figure, filepath)
        logger.info(f'Graph saved to {filepath}')
    except Exception as e:
        logger.warn(f'{e}\nFailed to generate graph. Fix the issue and run retro-analysis to generate graphs.')
def __init__(self, env_spec, env_space, e=0):
    self.env_spec = env_spec
    self.env_space = env_space
    self.info_space = env_space.info_space
    self.e = e
    util.set_attr(self, self.env_spec)
    self.name = self.env_spec['name']
    self.body_e = None
    self.nanflat_body_e = None  # nanflatten version of bodies
    self.body_num = None
    worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
    self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
    # spaces for NN auto input/output inference
    logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
    self.observation_spaces = []
    self.action_spaces = []
    for a in range(len(self.u_env.brain_names)):
        observation_shape = (self.get_observable_dim(a)['state'],)
        if self.get_brain(a).state_space_type == 'discrete':
            observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
        else:
            observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
        self.observation_spaces.append(observation_space)
        if self.is_discrete(a):
            action_space = gym.spaces.Discrete(self.get_action_dim(a))
        else:
            action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        self.action_spaces.append(action_space)
    for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
        set_gym_space_attr(observation_space)
        set_gym_space_attr(action_space)
    # TODO experiment to find out optimal benchmarking max_timestep, set
    # TODO ensure clock_speed from env_spec
    self.clock_speed = 1
    self.clock = Clock(self.clock_speed)
    self.done = False
def check_fn(*args, **kwargs):
    if not to_check_training_step():
        return fn(*args, **kwargs)
    net = args[0]  # first arg self
    # get pre-update parameters to compare
    pre_params = [param.clone() for param in net.parameters()]
    # run training_step, get loss
    loss = fn(*args, **kwargs)
    # get post-update parameters to compare
    post_params = [param.clone() for param in net.parameters()]
    if loss == 0.0:
        # if loss is 0, there should be no updates
        # TODO if without momentum, parameters should not change too
        for p_name, param in net.named_parameters():
            assert param.grad.norm() == 0
    else:
        # check parameter updates
        try:
            assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_params, post_params)), f'Model parameter is not updated in training_step(), check if your tensor is detached from graph. Loss: {loss:g}'
            logger.info(f'Model parameter is updated in training_step(). Loss: {loss:g}')
        except Exception as e:
            logger.error(e)
            if os.environ.get('PY_ENV') == 'test':
                # raise error if in unit test
                raise e
        # check grad norms
        min_norm, max_norm = 0.0, 1e5
        for p_name, param in net.named_parameters():
            try:
                grad_norm = param.grad.norm()
                assert min_norm < grad_norm < max_norm, f'Gradient norm for {p_name} is {grad_norm:g}, fails the extreme value check {min_norm} < grad_norm < {max_norm}. Loss: {loss:g}. Check your network and loss computation.'
                logger.info(f'Gradient norm for {p_name} is {grad_norm:g}; passes value check.')
            except Exception as e:
                logger.warn(e)
    logger.debug('Passed network parameter update check.')
    # store grad norms for debugging
    net.store_grad_norms()
    return loss
def assert_trained(post_model, loss):
    post_weights = [param.clone() for param in post_model.parameters()]
    if loss == 0:
        # TODO if without momentum, weights should not change too
        for p_name, param in post_model.named_parameters():
            assert param.grad.norm() == 0
    else:
        assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_weights, post_weights)), f'Model parameter is not updated in training_step(), check if your tensor is detached from graph. loss: {loss}'
        min_norm = 0
        max_norm = 1e5
        for p_name, param in post_model.named_parameters():
            try:
                assert min_norm < param.grad.norm() < max_norm, f'Gradient norm fails the extreme value check {min_norm} < {p_name}:{param.grad.norm()} < {max_norm}, which is bad. Loss: {loss}. Check your network and loss computation. Consider using the "clip_grad_val" net parameter.'
            except Exception as e:
                logger.warn(e)
    logger.debug('Passed network weight update assertion in dev lab_mode.')
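# A self-contained sketch (toy net and data, not the lab's API) of the pre/post
# parameter comparison used by the two checks above to verify that a training
# step actually updated the model.
import torch
import torch.nn as nn

net = nn.Linear(4, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
pre_params = [param.clone() for param in net.parameters()]

x, y = torch.randn(8, 4), torch.randn(8, 2)
loss = nn.functional.mse_loss(net(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()

post_params = [param.clone() for param in net.parameters()]
assert not all(torch.equal(w1, w2) for w1, w2 in zip(pre_params, post_params)), \
    'parameters did not change after the update'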
def set_action_fn(self):
    '''Sets the function used to select actions. Automatically selects appropriate discrete or continuous action policy under default setting'''
    body = self.agent.nanflat_body_a[0]
    algorithm_spec = self.agent.spec['algorithm']
    action_fn = algorithm_spec['action_policy']
    if action_fn == 'default':
        if self.is_discrete:
            self.action_policy = act_fns['softmax']
        else:
            if body.action_dim > 1:
                logger.warn(f'Action dim: {body.action_dim}. Continuous multidimensional action space not supported yet. Contact author')
                raise NotImplementedError
            else:
                self.action_policy = act_fns['gaussian']
    else:
        self.action_policy = act_fns[action_fn]
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.')
    # calculate the strength sr and the moving-average (to denoise) first before calculating fitness
    aeb_df['strength'] = calc_strength_sr(aeb_df, std['rand_epi_reward'], std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW, min_periods=0, center=False).mean()
    strength = calc_strength(aeb_df)
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength,
        'speed': speed,
        'stability': stability})
    return aeb_fitness_sr
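# A small standalone pandas illustration (toy data) of the moving-average denoising
# used for strength_ma above: with min_periods=0 the early, partially-filled window
# still yields a value, whereas the default would leave the first MA_WINDOW - 1
# entries as NaN.
import pandas as pd

strength = pd.Series([0.1, 0.3, 0.2, 0.5, 0.6])
print(strength.rolling(3).mean())                 # first two entries are NaN
print(strength.rolling(3, min_periods=0).mean())  # partial means for the first two entries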
def add_experience(self, state, action, reward, next_state, done, priority=1):
    '''Interface helper method for update() to add experience to memory'''
    self.current_episode['states'].append(state)
    self.current_episode['actions'].append(action)
    self.current_episode['rewards'].append(reward)
    self.current_episode['next_states'].append(next_state)
    self.current_episode['dones'].append(done)
    self.current_episode['priorities'].append(priority)
    # Set most recent
    self.most_recent[0] = state
    self.most_recent[1] = action
    self.most_recent[2] = reward
    self.most_recent[3] = next_state
    self.most_recent[4] = done
    self.most_recent[5] = priority
    # If episode ended, add to memory and clear current_episode
    if done:
        self.states.append(self.current_episode['states'])
        self.actions.append(self.current_episode['actions'])
        self.rewards.append(self.current_episode['rewards'])
        self.next_states.append(self.current_episode['next_states'])
        self.dones.append(self.current_episode['dones'])
        self.priorities.append(self.current_episode['priorities'])
        self.current_episode = {
            'states': [],
            'actions': [],
            'rewards': [],
            'next_states': [],
            'dones': [],
            'priorities': []
        }
        # If agent has collected the desired number of episodes, it is ready to train
        if len(self.states) == self.num_epis_to_collect:
            self.body.agent.algorithm.to_train = 1
    # Track memory size and num experiences
    self.true_size += 1
    if self.true_size > 1000 and self.memory_warn_flag:
        logger.warn("Large memory size: {}".format(self.true_size))
        self.memory_warn_flag = False
    self.total_experiences += 1
def __init__(self, memory_spec, algorithm, body):
    super(OnPolicyReplay, self).__init__(memory_spec, algorithm, body)
    # NOTE for OnPolicy replay, frequency = episode; for other classes below frequency = frames
    util.set_attr(self, self.agent_spec['algorithm'], ['training_frequency'])
    self.state_buffer = deque(maxlen=0)  # for API consistency
    self.is_episodic = True
    # Don't want total experiences reset when memory is
    self.total_experiences = 0
    self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
    self.reset()
def run_by_mode(spec_file, spec_name, run_mode):
    spec = spec_util.get(spec_file, spec_name)
    if run_mode == 'search':
        Experiment(spec).run()
    elif run_mode == 'train':
        Trial(spec).run()
    elif run_mode == 'enjoy':
        # TODO turn on save/load model mode
        # Session(spec).run()
        pass
    elif run_mode == 'benchmark':
        # TODO need to spread benchmark over spec on Experiment
        pass
    elif run_mode == 'dev':
        os.environ['PY_ENV'] = 'test'  # to not save in viz
        logger.set_level('DEBUG')
        spec = util.override_dev_spec(spec)
        Trial(spec).run()
    else:
        logger.warn('run_mode not recognized; must be one of `search, train, enjoy, benchmark, dev`.')
def calc_aeb_fitness_sr(aeb_df, env_name):
    '''Top level method to calculate fitness vector for AEB level data (strength, speed, stability)'''
    no_fitness_sr = pd.Series({
        'strength': 0., 'speed': 0., 'stability': 0.})
    if len(aeb_df) < MA_WINDOW:
        logger.warn(f'Run more than {MA_WINDOW} episodes to compute proper fitness')
        return no_fitness_sr
    std = FITNESS_STD.get(env_name)
    if std is None:
        std = FITNESS_STD.get('template')
        logger.warn(f'The fitness standard for env {env_name} is not built yet. Contact author. Using a template standard for now.')
    aeb_df['total_t'] = aeb_df['t'].cumsum()
    aeb_df['strength'] = calc_strength(aeb_df, std['rand_epi_reward'], std['std_epi_reward'])
    aeb_df['strength_ma'] = aeb_df['strength'].rolling(MA_WINDOW).mean()
    aeb_df['strength_mono_inc'] = is_noisy_mono_inc(aeb_df['strength']).astype(int)
    strength = aeb_df['strength_ma'].max()
    speed = calc_speed(aeb_df, std['std_timestep'])
    stability = calc_stability(aeb_df)
    aeb_fitness_sr = pd.Series({
        'strength': strength,
        'speed': speed,
        'stability': stability})
    return aeb_fitness_sr
def __init__(self, env_spec, env_space, e=0):
    self.env_spec = env_spec
    self.env_space = env_space
    self.info_space = env_space.info_space
    self.e = e
    util.set_attr(self, self.env_spec)
    self.name = self.env_spec['name']
    self.body_e = None
    self.nanflat_body_e = None  # nanflatten version of bodies
    self.body_num = None
    worker_id = int(f'{os.getpid()}{self.e+int(ps.unique_id())}'[-4:])
    self.u_env = UnityEnvironment(file_name=util.get_env_path(self.name), worker_id=worker_id)
    # spaces for NN auto input/output inference
    logger.warn('Unity environment observation_space and action_space are constructed with invalid range. Use only their shapes.')
    self.observation_spaces = []
    self.action_spaces = []
    for a in range(len(self.u_env.brain_names)):
        observation_shape = (self.get_observable_dim(a)['state'],)
        if self.get_brain(a).state_space_type == 'discrete':
            observation_space = gym.spaces.Box(low=0, high=1, shape=observation_shape, dtype=np.int32)
        else:
            observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=observation_shape, dtype=np.float32)
        self.observation_spaces.append(observation_space)
        if self.is_discrete(a):
            action_space = gym.spaces.Discrete(self.get_action_dim(a))
        else:
            action_space = gym.spaces.Box(low=0.0, high=1.0, shape=(1,), dtype=np.float32)
        self.action_spaces.append(action_space)
    for observation_space, action_space in zip(self.observation_spaces, self.action_spaces):
        set_gym_space_attr(observation_space)
        set_gym_space_attr(action_space)
    # TODO experiment to find out optimal benchmarking max_timestep, set
    # TODO ensure clock_speed from env_spec
    self.clock_speed = 1
    self.clock = Clock(self.clock_speed)
    self.done = False
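# A standalone sketch of the placeholder spaces constructed in the Unity env
# __init__ above (toy dimensions, plain gym, no Unity): only the shapes are
# meaningful, as the warning notes, since the [0, 1] bounds do not reflect the
# real observation range.
import numpy as np
import gym

observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(8,), dtype=np.float32)
action_space = gym.spaces.Discrete(4)
print(observation_space.shape)  # (8,)
print(action_space.n)           # 4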
def __init__(self, memory_spec, body):
    '''
    @param {*} body is the unit that stores its experience in this memory. Each body has a distinct memory.
    '''
    self.memory_spec = memory_spec
    self.body = body
    # declare what data keys to store
    self.data_keys = ['states', 'actions', 'rewards', 'next_states', 'dones', 'priorities']
    # the basic variables for every memory
    self.last_state = None
    # method to log size warning only once to prevent spamming log
    self.warn_size_once = ps.once(lambda msg: logger.warn(msg))
    # for API consistency, reset to some max_len in your specific memory class
    self.state_buffer = deque(maxlen=0)
    # total_reward and its history over episodes
    self.total_reward = 0
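# A minimal standalone sketch of the ps.once pattern used above: pydash.once wraps
# a function so it executes at most once, which keeps a repeated size warning from
# spamming the log. The stdlib logging module stands in for the lab's logger here.
import logging
import pydash as ps

logging.basicConfig(level=logging.WARNING)
warn_size_once = ps.once(lambda msg: logging.warning(msg))

for step in range(3):
    warn_size_once('Memory size is large; this warning is logged only once')
# only a single warning appears despite three calls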
def init_nets(self):
    '''Initialize the neural networks used to learn the actor and critic from the spec'''
    body = self.agent.nanflat_body_a[0]  # singleton algo
    state_dim = body.state_dim
    action_dim = body.action_dim
    self.is_discrete = body.is_discrete
    net_spec = self.agent.spec['net']
    mem_spec = self.agent.spec['memory']
    net_type = self.agent.spec['net']['type']
    actor_kwargs = util.compact_dict(dict(
        hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
        optim_param=_.get(net_spec, 'optim_actor'),
        loss_param=_.get(net_spec, 'loss'),  # Note: Not used for training actor
        clamp_grad=_.get(net_spec, 'clamp_grad'),
        clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
        gpu=_.get(net_spec, 'gpu'),
    ))
    if self.agent.spec['net']['use_same_optim']:
        logger.info('Using same optimizer for actor and critic')
        critic_kwargs = actor_kwargs
    else:
        logger.info('Using different optimizer for actor and critic')
        critic_kwargs = util.compact_dict(dict(
            hid_layers_activation=_.get(net_spec, 'hid_layers_activation'),
            optim_param=_.get(net_spec, 'optim_critic'),
            loss_param=_.get(net_spec, 'loss'),
            clamp_grad=_.get(net_spec, 'clamp_grad'),
            clamp_grad_val=_.get(net_spec, 'clamp_grad_val'),
            gpu=_.get(net_spec, 'gpu'),
        ))
    '''
    Below we automatically select an appropriate net based on three different conditions
    1. If the action space is discrete or continuous
        - Networks for continuous action spaces have two heads and return two values, the first is a tensor containing the mean of the action policy, the second is a tensor containing the std deviation of the action policy. The distribution is assumed to be a Gaussian (Normal) distribution.
        - Networks for discrete action spaces have a single head and return the logits for a categorical probability distribution over the discrete actions
    2. If the actor and critic are separate or share weights
        - If the networks share weights then the single network returns a list.
            - Continuous action spaces: The return list contains 3 elements: the first element contains the mean output for the actor (policy), the second element the std dev of the policy, and the third element is the state-value estimated by the network.
            - Discrete action spaces: The return list contains 2 elements. The first element is a tensor containing the logits for a categorical probability distribution over the actions. The second element contains the state-value estimated by the network.
    3. If the network type is feedforward, convolutional, or recurrent
        - Feedforward and convolutional networks take a single state as input and require an OnPolicyReplay or OnPolicyBatchReplay memory
        - Recurrent networks take n states as input and require an OnPolicyNStepReplay or OnPolicyNStepBatchReplay memory
    '''
    if net_type == 'MLPseparate':
        self.is_shared_architecture = False
        self.is_recurrent = False
        if self.is_discrete:
            self.actor = getattr(net, 'MLPNet')(state_dim, net_spec['hid_layers'], action_dim, **actor_kwargs)
            logger.info("Feedforward net, discrete action space, actor and critic are separate networks")
        else:
            self.actor = getattr(net, 'MLPHeterogenousHeads')(state_dim, net_spec['hid_layers'], [action_dim, action_dim], **actor_kwargs)
            logger.info("Feedforward net, continuous action space, actor and critic are separate networks")
        self.critic = getattr(net, 'MLPNet')(state_dim, net_spec['hid_layers'], 1, **critic_kwargs)
    elif net_type == 'MLPshared':
        self.is_shared_architecture = True
        self.is_recurrent = False
        if self.is_discrete:
            self.actorcritic = getattr(net, 'MLPHeterogenousHeads')(state_dim, net_spec['hid_layers'], [action_dim, 1], **actor_kwargs)
            logger.info("Feedforward net, discrete action space, actor and critic combined into single network, sharing params")
        else:
            self.actorcritic = getattr(net, 'MLPHeterogenousHeads')(state_dim, net_spec['hid_layers'], [action_dim, action_dim, 1], **actor_kwargs)
            logger.info("Feedforward net, continuous action space, actor and critic combined into single network, sharing params")
    elif net_type == 'Convseparate':
        self.is_shared_architecture = False
        self.is_recurrent = False
        if self.is_discrete:
            self.actor = getattr(net, 'ConvNet')(state_dim, net_spec['hid_layers'], action_dim, **actor_kwargs)
            logger.info("Convolutional net, discrete action space, actor and critic are separate networks")
        else:
            self.actor = getattr(net, 'ConvNet')(state_dim, net_spec['hid_layers'], [action_dim, action_dim], **actor_kwargs)
            logger.info("Convolutional net, continuous action space, actor and critic are separate networks")
        self.critic = getattr(net, 'ConvNet')(state_dim, net_spec['hid_layers'], 1, **critic_kwargs)
    elif net_type == 'Convshared':
        self.is_shared_architecture = True
        self.is_recurrent = False
        if self.is_discrete:
            self.actorcritic = getattr(net, 'ConvNet')(state_dim, net_spec['hid_layers'], [action_dim, 1], **actor_kwargs)
            logger.info("Convolutional net, discrete action space, actor and critic combined into single network, sharing params")
        else:
            self.actorcritic = getattr(net, 'ConvNet')(state_dim, net_spec['hid_layers'], [action_dim, action_dim, 1], **actor_kwargs)
            logger.info("Convolutional net, continuous action space, actor and critic combined into single network, sharing params")
    elif net_type == 'Recurrentseparate':
        self.is_shared_architecture = False
        self.is_recurrent = True
        if self.is_discrete:
            self.actor = getattr(net, 'RecurrentNet')(state_dim, net_spec['hid_layers'], action_dim, mem_spec['length_history'], **actor_kwargs)
            logger.info("Recurrent net, discrete action space, actor and critic are separate networks")
        else:
            self.actor = getattr(net, 'RecurrentNet')(state_dim, net_spec['hid_layers'], [action_dim, action_dim], mem_spec['length_history'], **actor_kwargs)
            logger.info("Recurrent net, continuous action space, actor and critic are separate networks")
        self.critic = getattr(net, 'RecurrentNet')(state_dim, net_spec['hid_layers'], 1, mem_spec['length_history'], **critic_kwargs)
    elif net_type == 'Recurrentshared':
        self.is_shared_architecture = True
        self.is_recurrent = True
        if self.is_discrete:
            self.actorcritic = getattr(net, 'RecurrentNet')(state_dim, net_spec['hid_layers'], [action_dim, 1], mem_spec['length_history'], **actor_kwargs)
            logger.info("Recurrent net, discrete action space, actor and critic combined into single network, sharing params")
        else:
            self.actorcritic = getattr(net, 'RecurrentNet')(state_dim, net_spec['hid_layers'], [action_dim, action_dim, 1], mem_spec['length_history'], **actor_kwargs)
            logger.info("Recurrent net, continuous action space, actor and critic combined into single network, sharing params")
    else:
        logger.warn("Incorrect network type. Please use 'MLPseparate', 'MLPshared', 'Convseparate', 'Convshared', 'Recurrentseparate', or 'Recurrentshared'.")
        raise NotImplementedError
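# A hypothetical spec fragment (illustrative values only, not a tested config) showing
# the knobs init_nets() above reads when choosing and building the actor/critic
# networks: 'type' selects MLP/Conv/Recurrent and separate/shared, 'use_same_optim'
# decides whether the critic reuses the actor's optimizer settings, and
# 'length_history' is only needed by the recurrent variants.
example_net_spec = {
    'type': 'MLPshared',  # one of MLPseparate, MLPshared, Convseparate, Convshared, Recurrentseparate, Recurrentshared
    'hid_layers': [64, 32],
    'hid_layers_activation': 'relu',
    'optim_actor': {'name': 'Adam', 'lr': 0.001},
    'optim_critic': {'name': 'Adam', 'lr': 0.001},
    'loss': 'MSELoss',
    'clamp_grad': True,
    'clamp_grad_val': 1.0,
    'use_same_optim': False,
    'gpu': False,
}
example_memory_spec = {
    'length_history': 4,  # used only by the Recurrent* network types
}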