def __init__(self, config, return_function):
    """
    Parameters:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    buff_sz                 int         10          buffer size
    batch_sz                int         1
    env_state_dims          list        [2,2]       dimensions of the observations to be stored in the buffer
    num_actions             int         2           number of actions available to the agent
    obs_dtype               np.type     np.uint8    the data type of the observations
    initial_rand_steps      int         0           number of random steps before decaying sigma
    rand_steps_count        int         0           number of random steps taken so far
    store_return            bool        True        save the computed return so that it can be reused
    """
    assert isinstance(config, Config)
    self.config = config
    self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
    self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
    self.env_state_dims = list(check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
    self.num_actions = check_attribute_else_default(self.config, 'num_actions', 2)
    self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype', np.uint8)
    self.initial_rand_steps = check_attribute_else_default(self.config, 'initial_rand_steps', 0)
    check_attribute_else_default(self.config, 'rand_steps_count', 0)
    self.store_return = check_attribute_else_default(self.config, 'store_return', True)

    """ Parameters for Return Function """
    assert isinstance(return_function, nStep_Retrace_ReturnFunction)
    self.return_function = return_function
    self.n = return_function.n

    """ Termination or Timeout Count for Applying the Decay on Sigma """
    self.episodes_since_last_decay = 0

    """ Parameters to keep track of the current state of the buffer """
    self.current_index = 0
    self.full_buffer = False

    """ Circular Buffers """
    self.state = CircularBuffer(self.buff_sz, shape=tuple(self.env_state_dims), dtype=self.obs_dtype)
    self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
    self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
    self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    self.bprobabilities = CircularBuffer(self.buff_sz, shape=(self.num_actions,), dtype=np.float64)
    self.estimated_return = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
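# The CircularBuffer class used above is not defined in this file. As a rough
# illustration, the behaviour these buffers are assumed to provide (fixed
# capacity, wrap-around writes, one pre-allocated array per field) could look
# like the following minimal sketch; the real implementation may differ:
#
#   class CircularBufferSketch:                          # hypothetical name
#       def __init__(self, maxlen, shape=(), dtype=np.float64):
#           self.data = np.zeros((maxlen,) + shape, dtype=dtype)
#           self.maxlen = maxlen
#           self.index = 0
#
#       def append(self, item):
#           self.data[self.index % self.maxlen] = item   # overwrite the oldest entry
#           self.index += 1
#
#       def __getitem__(self, i):
#           return self.data[i % self.maxlen]
#
# This is only meant to show why a fixed buff_sz bounds the memory used by
# each of the per-field buffers above.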
def __init__(self, config, return_function):
    """
    Parameters:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    buff_sz                 int         10          buffer size
    batch_sz                int         1
    frame_stack             int         4           number of frames to stack, see Mnih et al. (2015)
    env_state_dims          list        [2,2]       dimensions of the observations to be stored in the buffer
    num_actions             int         2           number of actions available to the agent
    obs_dtype               np.type     np.uint8    the data type of the observations
    reward_clipping         bool        False       clipping the reward, see Mnih et al. (2015)
    sigma                   float       0.5         Sigma parameter, see De Asis et al. (2018)
    sigma_decay             float       1.0         decay rate of sigma
    """
    self.config = config
    self.buff_sz = check_attribute_else_default(config, 'buff_sz', 10)
    self.batch_sz = check_attribute_else_default(config, 'batch_sz', 1)
    self.frame_stack = check_attribute_else_default(config, 'frame_stack', 4)
    self.env_state_dims = list(check_attribute_else_default(config, 'env_state_dims', [2, 2]))
    self.num_actions = check_attribute_else_default(config, 'num_actions', 2)
    self.obs_dtype = check_attribute_else_default(config, 'obs_dtype', np.uint8)
    self.reward_clipping = check_attribute_else_default(config, 'reward_clipping', False)
    self.sigma = check_attribute_else_default(config, 'sigma', 0.5)
    self.sigma_decay = check_attribute_else_default(config, 'sigma_decay', 1.0)

    """ Parameters for Return Function """
    assert isinstance(return_function, OnPolicyQSigmaReturnFunction)
    self.return_function = return_function
    self.n = return_function.n

    """ Parameters to keep track of the current state of the buffer """
    self.current_index = 0
    self.full_buffer = False

    """ Circular Buffers """
    self.state = CircularBuffer(self.buff_sz, shape=tuple(self.env_state_dims), dtype=self.obs_dtype)
    self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
    self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
    self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    self.sigma = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)  # rebinds the scalar self.sigma assigned above to a per-step buffer
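# A hypothetical sketch of how one environment transition could be written
# into the buffers above (the method name store_observation and its argument
# names are assumptions; the actual storing code is not shown in this file):
#
#   def store_observation(self, state, action, reward, terminate):
#       self.state.append(state)
#       self.action.append(action)
#       self.reward.append(np.clip(reward, -1, 1) if self.reward_clipping else reward)
#       self.terminate.append(terminate)
#       self.sigma.append(self.config.sigma)   # per-step sigma, since self.sigma was rebound to a buffer
#       self.current_index += 1
#       if self.current_index >= self.buff_sz:
#           self.full_buffer = True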
def __init__(self, config, return_function):
    """
    Parameters:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    buff_sz                 int         10          buffer size
    batch_sz                int         1
    frame_stack             int         4           number of frames to stack, see Mnih et al. (2015)
    env_state_dims          list        [2,2]       dimensions of the observations to be stored in the buffer
    num_actions             int         2           number of actions available to the agent
    obs_dtype               np.type     np.uint8    the data type of the observations
    reward_clipping         bool        False       clipping the reward, see Mnih et al. (2015)
    sigma                   float       0.5         Sigma parameter, see De Asis et al. (2018)
    sigma_decay             float       1.0         decay rate of sigma
    store_bprobs            bool        False       whether to store and use the behaviour policy probabilities
                                                    for the return function
    store_sigma             bool        False       whether to store sigma at every time step and use the stored
                                                    sigmas to compute the return. True = use the sigma from the
                                                    buffer, False = use the current sigma
    initial_rand_steps      int         0           number of random steps before decaying sigma
    rand_steps_count        int         0           number of random steps taken so far
    store_return            bool        True        save the computed return so that it can be reused
    """
    assert isinstance(config, Config)
    self.config = config
    self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
    self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
    self.frame_stack = check_attribute_else_default(self.config, 'frame_stack', 4)
    self.env_state_dims = list(check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
    self.num_actions = check_attribute_else_default(self.config, 'num_actions', 2)
    self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype', np.uint8)
    self.reward_clipping = check_attribute_else_default(self.config, 'reward_clipping', False)
    self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
    self.sigma_decay = check_attribute_else_default(self.config, 'sigma_decay', 1.0)
    self.store_bprobs = check_attribute_else_default(self.config, 'store_bprobs', False)
    self.store_sigma = check_attribute_else_default(self.config, 'store_sigma', False)
    self.initial_rand_steps = check_attribute_else_default(self.config, 'initial_rand_steps', 0)
    check_attribute_else_default(self.config, 'rand_steps_count', 0)
    self.store_return = check_attribute_else_default(self.config, 'store_return', True)

    """ Parameters for Return Function """
    assert isinstance(return_function, QSigmaReturnFunction)
    self.return_function = return_function
    self.n = return_function.n

    """ Parameters to keep track of the current state of the buffer """
    self.current_index = 0
    self.full_buffer = False

    """ Circular Buffers """
    self.state = CircularBuffer(self.buff_sz, shape=tuple(self.env_state_dims), dtype=self.obs_dtype)
    self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
    self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
    self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    if self.store_bprobs:
        self.bprobabilities = CircularBuffer(self.buff_sz, shape=(self.num_actions,), dtype=np.float64)
    if self.store_sigma:
        self.sigma_buffer = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.estimated_return = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
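# When frame_stack > 1, a training state is usually assembled by stacking the
# most recent frames stored in self.state. A simplified sketch of that idea
# (index arithmetic and episode boundaries are ignored here; the actual
# sampling code is not shown in this file):
#
#   frames = [self.state[j] for j in range(i - self.frame_stack + 1, i + 1)]
#   stacked_state = np.stack(frames, axis=0)   # shape: (frame_stack, *env_state_dims)
#
# The real sampling routine must also check the terminate flags so that a
# stack never crosses an episode boundary.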
def __init__(self, config, return_function):
    """
    Parameters:
    Name:                   Type:       Default:    Description: (Omitted when self-explanatory)
    buff_sz                 int         10          buffer size
    batch_sz                int         1
    env_state_dims          list        [2,2]       dimensions of the observations to be stored in the buffer
    num_actions             int         2           number of actions available to the agent
    obs_dtype               np.type     np.uint8    the data type of the observations
    sigma                   float       0.5         Sigma parameter, see De Asis et al. (2018)
    sigma_decay             float       1.0         decay rate of sigma
    decay_type              string      exp         decay type of sigma. Options: exp and lin
    decay_freq              int         1           how often to decay sigma, e.g. a decay frequency of 10 would
                                                    apply the decay once every 10 episodes
    sigma_min               float       0           the lowest value sigma can attain when decaying
    store_bprobs            bool        False       whether to store and use the behaviour policy probabilities
                                                    for the return function
    store_sigma             bool        False       whether to store sigma at every time step and use the stored
                                                    sigmas to compute the return. True = use the sigma from the
                                                    buffer, False = use the current sigma
    initial_rand_steps      int         0           number of random steps before decaying sigma
    rand_steps_count        int         0           number of random steps taken so far
    store_return            bool        True        save the computed return so that it can be reused
    """
    assert isinstance(config, Config)
    self.config = config
    self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
    self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
    self.env_state_dims = list(check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
    self.num_actions = check_attribute_else_default(self.config, 'num_actions', 2)
    self.obs_dtype = check_attribute_else_default(self.config, 'obs_dtype', np.uint8)
    self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
    self.sigma_decay = check_attribute_else_default(self.config, 'sigma_decay', 1.0)
    self.decay_type = check_attribute_else_default(self.config, 'decay_type', 'exp')
    self.decay_freq = check_attribute_else_default(self.config, 'decay_freq', 1)
    self.sigma_min = check_attribute_else_default(self.config, 'sigma_min', 0.0)
    self.store_bprobs = check_attribute_else_default(self.config, 'store_bprobs', False)
    self.store_sigma = check_attribute_else_default(self.config, 'store_sigma', False)
    self.initial_rand_steps = check_attribute_else_default(self.config, 'initial_rand_steps', 0)
    check_attribute_else_default(self.config, 'rand_steps_count', 0)
    self.store_return = check_attribute_else_default(self.config, 'store_return', True)

    """ Parameters for Return Function """
    assert isinstance(return_function, QSigmaReturnFunction)
    self.return_function = return_function
    self.n = return_function.n

    """ Termination or Timeout Count for Applying the Decay on Sigma """
    self.episodes_since_last_decay = 0

    """ Parameters to keep track of the current state of the buffer """
    self.current_index = 0
    self.full_buffer = False

    """ Circular Buffers """
    self.state = CircularBuffer(self.buff_sz, shape=tuple(self.env_state_dims), dtype=self.obs_dtype)
    self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
    self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
    self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
    if self.store_bprobs:
        self.bprobabilities = CircularBuffer(self.buff_sz, shape=(self.num_actions,), dtype=np.float64)
    if self.store_sigma:
        self.sigma_buffer = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.estimated_return = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
    self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
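# A hypothetical sketch of the sigma decay implied by decay_type, decay_freq
# and sigma_min (the method name and the exact update rules are assumptions;
# only the attribute names come from the code above):
#
#   def decay_sigma(self):
#       self.episodes_since_last_decay += 1
#       if self.episodes_since_last_decay >= self.decay_freq:
#           self.episodes_since_last_decay = 0
#           if self.decay_type == 'exp':
#               new_sigma = self.sigma * self.sigma_decay   # multiplicative decay
#           else:                                           # 'lin'
#               new_sigma = self.sigma - self.sigma_decay   # subtractive decay
#           self.sigma = max(new_sigma, self.sigma_min)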
def __init__(self, config, games_directory=None, rom_filename=None, summary=None):
    super().__init__()
    """
    Parameters:
    Name:                       Type:   Default:    Description: (omitted when self-explanatory)
    display_screen              bool    False       Display game screen
    agent_render                bool    False       Display current frame the way the agent sees it
    frame_skip                  int     4           See ALE Documentation
    repeat_action_probability   float   0.25        in [0,1], see ALE Documentation
    max_num_frames              int     18000       Max number of frames per episode
    color_averaging             bool    True        If true, it averages over the skipped frames.
                                                    Otherwise, it takes the maximum over the skipped frames.
    frame_stack                 int     4           Stack of frames for agent, see Mnih et al. (2015)
    save_summary                bool    False       Save the summary of the environment
    """
    assert isinstance(config, Config)
    self.display_screen = check_attribute_else_default(config, 'display_screen', False)
    self.agent_render = check_attribute_else_default(config, 'agent_render', False)
    self.frame_skip = check_attribute_else_default(config, 'frame_skip', 4)
    self.repeat_action_probability = check_attribute_else_default(config, 'repeat_action_probability', 0.25)
    max_num_frames = check_attribute_else_default(config, 'max_num_frames', 18000)
    self.color_averaging = check_attribute_else_default(config, 'color_averaging', True)
    if self.color_averaging:
        self.aggregate_func = np.average
    else:
        self.aggregate_func = np.amax
    self.frame_stack = check_attribute_else_default(config, 'frame_stack', 4)
    self.save_summary = check_attribute_else_default(config, 'save_summary', False)
    if self.save_summary:
        assert isinstance(summary, dict)
        self.summary = summary
        check_dict_else_default(self.summary, "frames_per_episode", [])

    " Environment variables "
    self.env = ALEInterface()
    self.env.setInt(b'frame_skip', 1)
    self.env.setInt(b'random_seed', 0)
    self.env.setFloat(b'repeat_action_probability', 0)
    self.env.setInt(b"max_num_frames_per_episode", max_num_frames)
    self.env.setBool(b"color_averaging", False)
    self.env.setBool(b'display_screen', self.display_screen)
    self.rom_file = str.encode(games_directory + rom_filename)
    self.frame_count = 0

    " Loading ROM "
    self.env.loadROM(self.rom_file)

    """
    Fixed Parameters:
    Frame Format: "NCHW" (batch_size, channels, height, width). Decided to adopt this format because it's the
    fastest to process in tensorflow with a gpu.
    Frame Height and Width: 84, the default value in the literature.
    """
    " Inner state of the environment "
    self.height = 84
    self.width = 84
    self.current_state = np.zeros([self.frame_stack, self.height, self.width], dtype=np.uint8)
    self.original_height = 210
    self.original_width = 160
    self.history = np.zeros([self.frame_skip, self.original_height, self.original_width], np.uint8)
    self.reset()
    self.observations_dimensions = self.current_state.shape
    self.frame_dims = self.current_state[0].shape
    self.actions = self.env.getLegalActionSet()
    self.previous_action = 0
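# A hypothetical sketch of how a single agent-visible frame could be produced
# from the raw ALE screen with the variables above. The cv2.resize call and
# the getScreenGrayscale usage are assumptions for illustration; the actual
# step/reset code is not shown in this file:
#
#   import cv2
#   for i in range(self.frame_skip):
#       self.env.act(action)
#       self.env.getScreenGrayscale(self.history[i])          # raw 210 x 160 frame
#   aggregated = self.aggregate_func(self.history, axis=0)    # average or max over skipped frames
#   frame = cv2.resize(aggregated.astype(np.uint8), (self.width, self.height))
#   self.current_state = np.concatenate(
#       (self.current_state[1:], frame[np.newaxis]), axis=0)  # shift the frame stack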