def __init__(self, environment, function_approximator, config=None, summary=None):
    """
    Parameters in config:
    Name:           Type:   Default:    Description: (Omitted when self-explanatory)
    store_summary   bool    False       store the summary of the agent (return per episode)
    """
    self.config = config or Config()
    assert isinstance(self.config, Config)

    self.store_summary = check_attribute_else_default(self.config, 'store_summary', False)
    if self.store_summary:
        assert isinstance(summary, dict)
        self.summary = summary
        check_dict_else_default(self.summary, 'return_per_episode', [])

    " Other Parameters "
    # Function Approximator: used to approximate the Q-Values
    self.fa = function_approximator
    # Environment that the agent is interacting with
    self.env = environment
    # Summaries
    self.cumulative_reward = 0
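# Usage sketch: one plausible way to construct this agent. `Agent`, `MountainCar`,
# and `TileCoderFA` are hypothetical stand-ins for this repo's actual classes;
# only `Config` and the summary-dict protocol come from the constructor above.
#
#   config = Config()
#   config.store_summary = True
#   summary = {}
#   agent = Agent(environment=MountainCar(config), function_approximator=TileCoderFA(config),
#                 config=config, summary=summary)
#   # after training, summary['return_per_episode'] holds one entry per episode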
def __init__(self, config=None, summary=None):
    """
    Parameters:
    Name:           Type    Default:    Description: (omitted when self-explanatory)
    max_actions     int     1000        The max number of actions executed before forcing a time out
    save_summary    bool    False       Whether to save a summary of the environment
    """
    assert isinstance(config, Config)
    self.max_actions = check_attribute_else_default(config, 'max_actions', default_value=1000)
    self.save_summary = check_attribute_else_default(config, 'save_summary', default_value=False)
    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, "steps_per_episode", [])

    " Inner state of the environment "
    self.step_count = 0
    self.actions = np.array([0, 1, 2], dtype=int)  # 0 = backward, 1 = coast, 2 = forward
    self.high = np.array([0.5, 0.07], dtype=np.float32)
    self.low = np.array([-1.2, -0.07], dtype=np.float32)
    self.action_dictionary = {0: -1,   # accelerate backwards
                              1: 0,    # coast
                              2: 1}    # accelerate forwards
    # reset only after the action and bound attributes above are defined
    self.current_state = self.reset()
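# Usage sketch: constructing the environment and reading its summary. The class
# name `MountainCar` is an assumption; `Config`, `max_actions`, and `save_summary`
# come from the constructor above.
#
#   config = Config()
#   config.max_actions = 1000
#   config.save_summary = True
#   summary = {'steps_per_episode': []}
#   env = MountainCar(config=config, summary=summary)
#   # env.current_state is the initial (position, velocity) observation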
def __init__(self, config, summary=None):
    """
    Parameters:
    Name:               Type    Default:    Description: (omitted when self-explanatory)
    # environment parameters
    max_episode_length  int     500000      The max number of actions executed before forcing a time out
    norm_state          bool    True        Normalize the state to [-1, 1]
    # summary parameters
    store_summary       bool    False       Whether to store the summary of the environment
    number_of_steps     int     500000      Total number of environment steps
    """
    assert isinstance(config, Config)
    check_attribute(config, 'current_step', 0)
    self.config = config

    # environment related variables
    self.max_episode_length = check_attribute(config, 'max_episode_length', default_value=500000)
    self.norm_state = check_attribute(config, 'norm_state', default_value=True)

    # summary related variables
    self.store_summary = check_attribute(config, 'store_summary', default_value=False)
    self.number_of_steps = check_attribute(config, 'number_of_steps', default_value=500000)
    self.summary = summary
    if self.store_summary:
        assert isinstance(self.summary, dict)
        self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64)
        check_dict_else_default(self.summary, "steps_per_episode", [])
        check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step)

    # internal state of the environment
    self.episode_step_count = 0
    position = -0.6 + np.random.random() * 0.2
    velocity = 0.0
    self.current_state = np.array((position, velocity), dtype=np.float64)
    self.actions = np.array([0, 1, 2], dtype=int)  # 0 = backward, 1 = coast, 2 = forward
    self.high = np.array([0.5, 0.07], dtype=np.float64)
    self.low = np.array([-1.2, -0.07], dtype=np.float64)
    self.action_dictionary = {0: -1,   # accelerate backwards
                              1: 0,    # coast
                              2: 1}    # accelerate forwards
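# Usage sketch: this variant records a per-step reward buffer. `MountainCar` is a
# hypothetical class name; everything else comes from the constructor above.
#
#   config = Config()
#   config.store_summary = True
#   config.number_of_steps = 500000
#   summary = {}
#   env = MountainCar(config, summary=summary)
#   # if the key was absent, summary['reward_per_step'] now aliases the
#   # preallocated np.zeros buffer, so it is filled in place as the env steps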
def __init__(self, config, summary=None): """ Parameters: Name: Type Default: Description(omitted when self-explanatory): max_episode_length int 200000 The max number of steps executed in an episoe before forcing a time out norm_state bool True Normalize the state to [-1,1] store_summary bool False Whether to store the summary of the environment number_of_steps int 200000 Total number of environment steps """ check_attribute(config, 'current_step', 0) self.config = config # environment parameters self.max_episode_length = check_attribute(config, 'max_episode_length', 200000) self.norm_state = check_attribute(config, 'norm_state', True) # summary parameters self.store_summary = check_attribute(config, 'store_summary', default_value=False) self.summary = summary self.number_of_steps = check_attribute(config, 'number_of_steps', 200000) if self.store_summary: assert isinstance(self.summary, dict) self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64) check_dict_else_default(self.summary, "steps_per_episode", []) check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step) self.num_action = 4 self.num_state = 2 """ Inner state of the environment """ self.episode_step_count = 0 self.state = np.float64( np.random.uniform(low=0.0, high=0.1, size=(2, ))) self.puddle1 = Puddle(0.45, 0.75, 0.10, 0.75, 0.1, 0.35) self.puddle2 = Puddle(0.45, 0.80, 0.45, 0.40, 0.1, 0.4) self.pworld_min_x = 0.0 self.pworld_max_x = 1.0 self.pworld_min_y = 0.0 self.pworld_max_y = 1.0 self.goalDimension = 0.05 self.defDisplacement = 0.05 self.goalXCoor = self.pworld_max_x - self.goalDimension self.goalYCoor = self.pworld_max_y - self.goalDimension
def __init__(self, environment, function_approximator, behaviour_policy, er_buffer,
             config=None, summary=None, reshape=True):
    """
    Summary Name: return_per_episode
    """
    self.config = config or Config()
    assert isinstance(self.config, Config)
    """
    Parameters in config:
    Name:                   Type:   Default:    Description: (Omitted when self-explanatory)
    save_summary            bool    False       save the summary of the agent (return per episode)
    er_start_size           int     0           number of steps sampled before training starts
    er_init_steps_count     int     0           number of initial steps taken so far
    fixed_tpolicy           bool    False       whether the policy is fixed (e.g., a function of the state)
                                                or changes over time (e.g., epsilon-greedy or a function
                                                of the q-values)
    """
    self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)
    self.er_start_size = check_attribute_else_default(self.config, 'er_start_size', 0)
    check_attribute_else_default(self.config, 'er_init_steps_count', 0)
    self.fixed_tpolicy = check_attribute_else_default(self.config, 'fixed_tpolicy', False)
    if self.save_summary:
        assert isinstance(summary, dict)
        self.summary = summary
        check_dict_else_default(self.summary, 'return_per_episode', [])

    " Other Parameters "
    # Behaviour Policy
    self.bpolicy = behaviour_policy
    # Experience Replay Buffer
    self.er_buffer = er_buffer
    # Function Approximator: used to approximate the Q-Values
    self.fa = function_approximator
    # Environment that the agent is interacting with
    self.env = environment
    # Summaries
    self.cumulative_reward = 0
    # Whether to reshape the mountain car observations
    self.reshape = reshape
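# Usage sketch: wiring the agent together. All class names other than `Config`
# are hypothetical stand-ins for this repo's environment, function approximator,
# behaviour policy, and replay buffer classes.
#
#   config = Config()
#   config.save_summary = True
#   config.er_start_size = 1000     # collect 1000 steps before training starts
#   summary = {}
#   agent = Agent(environment=MountainCar(config),
#                 function_approximator=NeuralNetworkFA(config),
#                 behaviour_policy=EpsilonGreedyPolicy(config),
#                 er_buffer=ExperienceReplayBuffer(config),
#                 config=config, summary=summary, reshape=True)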
def __init__(self, config=None, summary=None):
    """
    Parameters:
    Name:           Type    Default:    Description: (omitted when self-explanatory)
    max_actions     int     500         The max number of actions executed before forcing a time out
    save_summary    bool    False       Whether to save a summary of the environment
    """
    assert isinstance(config, Config)
    self.max_actions = check_attribute_else_default(config, 'max_actions', default_value=500)
    self.save_summary = check_attribute_else_default(config, 'save_summary', default_value=False)
    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, "steps_per_episode", [])

    " Inner state of the environment "
    self.step_count = 0
    self.openai_env = gym.make('Acrobot-v1')
    self.actions = np.array([0, 1, 2], dtype=np.int8)
    self.high = np.array([np.pi * 2, np.pi * 2, 12.56637096, 28.27433395], dtype=np.float64)
    self.low = np.array([0.0, 0.0, -12.56637096, -28.27433395], dtype=np.float64)
    self.current_state = self.reset()
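# Usage sketch: this wrapper owns a gym 'Acrobot-v1' instance, so gym must be
# importable. `Acrobot` is a hypothetical name for the wrapper class. The bounds
# cover a 4D observation: two joint angles in [0, 2*pi] and two angular
# velocities in roughly [-4*pi, 4*pi] and [-9*pi, 9*pi].
#
#   config = Config()
#   config.max_actions = 500
#   env = Acrobot(config=config, summary=None)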
def __init__(self, config, summary=None): """ Parameters: Name: Type Default: Description(omitted when self-explanatory): max_actions int 1000 The max number of actions executed before forcing a time out norm_state bool True Normalize the state to [-1,1] store_summary bool False Whether to store the summary of the environment """ self.norm_state = check_attribute_else_default(config, 'norm_state', True) self.max_actions = check_attribute_else_default(config, 'max_actions', 1000) self.store_summary = check_attribute_else_default(config, 'store_summary', False) self.summary = summary if self.store_summary: assert isinstance(self.summary, dict) check_dict_else_default(self.summary, "steps_per_episode", []) self.num_actions = 3 self.state_dims = 4 " Inner state of the environment " self.step_count = 0 self.current_state = np.float64(np.random.uniform(low=-0.5, high=0.5, size=(4,))) self.MAX_VEL_1 = 4 * np.pi self.MAX_VEL_2 = 9 * np.pi self.MAX_THETA_1 = np.pi self.MAX_THETA_2 = np.pi self.m1 = 1.0 self.m2 = 1.0 self.l1 = 1.0 self.l2 = 1.0 self.lc1 = 0.5 self.lc2 = 0.5 self.I1 = 1.0 self.I2 = 1.0 self.g = 9.8 self.dt = 0.05 self.acrobotGoalPosition = 1.0
def __init__(self, config, summary=None):
    """
    Parameters:
    Name:               Type    Default:    Description: (omitted when self-explanatory)
    max_episode_length  int     500000      The max number of steps executed in an episode before forcing a time out
    norm_state          bool    True        Normalize the state to [-1, 1]
    display             bool    False       Whether to display the screen of the game
    init_lives          int     3           Number of lives at the start of the game
    store_summary       bool    False       Whether to store the summary of the environment
    number_of_steps     int     500000      Total number of environment steps
    """
    assert isinstance(config, Config)
    check_attribute(config, 'current_step', 0)
    self.config = config

    # environment parameters
    self.max_episode_length = check_attribute(config, 'max_episode_length', default_value=500000)
    self.norm_state = check_attribute(config, 'norm_state', default_value=True)
    self.display = False
    self.init_lives = 3
    # self.display = check_attribute(config, 'display', default_value=False)
    # self.init_lives = check_attribute(config, 'init_lives', default_value=3)

    # summary parameters
    self.store_summary = check_attribute(config, 'store_summary', default_value=False)
    self.summary = summary
    self.number_of_steps = check_attribute(config, 'number_of_steps', 500000)
    if self.store_summary:
        assert isinstance(self.summary, dict)
        self.reward_per_step = np.zeros(self.number_of_steps, dtype=np.float64)
        check_dict_else_default(self.summary, "steps_per_episode", [])
        check_dict_else_default(self.summary, "reward_per_step", self.reward_per_step)

    # setting up the original catcher environment with the specified parameters
    self.catcherOb = Catcher(init_lives=self.init_lives)
    if not self.display:
        # do not open a pygame window
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ["SDL_VIDEODRIVER"] = "dummy"
    if self.norm_state:
        self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob_normalize,
                       display_screen=self.display)
    else:
        self.pOb = PLE(self.catcherOb, fps=30, state_preprocessor=get_ob,
                       display_screen=self.display)
    self.pOb.init()

    # environment internal state
    self.actions = [97, None, 100]  # self.pOb.getActionSet() (left = 97, do nothing = None, right = 100)
    self.num_action = 3
    self.num_state = 4
    self.episode_step_count = 0
    self.pOb.reset_game()
    self.current_state = self.pOb.getGameState()
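# Usage sketch: this wrapper requires the PyGame Learning Environment (PLE) and
# its Catcher game. `CatcherEnv` is a hypothetical name for the wrapper class.
#
#   config = Config()
#   config.store_summary = True
#   config.number_of_steps = 500000
#   summary = {}
#   env = CatcherEnv(config, summary=summary)
#   # env.actions maps action indices to PLE key codes:
#   # 97 (left), None (do nothing), 100 (right)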
def __init__(self, optimizer, target_network, update_network, er_buffer,
             config=None, tf_session=None, summary=None):
    """
    Summary Names:
        cumulative_loss
        training_steps
    """
    assert isinstance(config, Config)
    self.config = config
    """
    Parameters in config:
    Name:                   Type:   Default:    Description: (Omitted when self-explanatory)
    alpha                   float   0.00025     step size parameter
    obs_dims                list    [4,84,84]   the dimensions of the observations
    tnetwork_update_freq    int     10,000      number of updates before updating the target network
    update_count            int     0           number of updates performed
    save_summary            bool    False       indicates whether to save a summary of training
    """
    self.alpha = check_attribute_else_default(self.config, 'alpha', 0.00025)
    self.obs_dims = check_attribute_else_default(self.config, 'obs_dims', [4, 84, 84])
    self.tnetwork_update_freq = check_attribute_else_default(self.config, 'tnetwork_update_freq', 10000)
    self.save_summary = check_attribute_else_default(self.config, 'save_summary', False)
    check_attribute_else_default(self.config, 'update_count', 0)
    self.summary = summary
    if self.save_summary:
        assert isinstance(self.summary, dict)
        check_dict_else_default(self.summary, 'cumulative_loss', [])
        check_dict_else_default(self.summary, 'training_steps', [])
        self.training_steps = 0
        self.cumulative_loss = 0

    """ Other Parameters """
    " Experience Replay Buffer and Return Function "
    self.er_buffer = er_buffer

    " Neural Network Models "
    self.target_network = target_network    # Target Network
    self.update_network = update_network    # Update Network

    " Training and Learning Evaluation: Tensorflow and variables initializer "
    self.optimizer = optimizer(self.alpha)
    self.sess = tf_session or tf.Session()

    " Train step "
    self.train_step = self.optimizer.minimize(self.update_network.train_loss,
                                              var_list=self.update_network.train_vars[0])

    " Initializing variables in the graph "
    for var in tf.global_variables():
        self.sess.run(var.initializer)

    " Copy Weights to Target Network Operator "
    unetwork_vars = tf.get_collection(self.update_network.name)
    tnetwork_vars = tf.get_collection(self.target_network.name)
    copy_ops = [target_var.assign(update_var)
                for target_var, update_var in zip(tnetwork_vars, unetwork_vars)]
    self.copy_to_target = tf.group(*copy_ops)
    self.sess.run(self.copy_to_target)
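# Usage sketch: this class targets the TensorFlow 1.x graph API (tf.Session,
# tf.global_variables, tf.get_collection). `Model`, `tnet`, `unet`, and `buffer`
# are hypothetical names; the networks are assumed to expose .name, .train_loss,
# and .train_vars, as used in the constructor above.
#
#   config = Config()
#   config.alpha = 0.00025
#   config.save_summary = True
#   summary = {}
#   model = Model(optimizer=tf.train.RMSPropOptimizer,  # any callable taking a learning rate
#                 target_network=tnet, update_network=unet,
#                 er_buffer=buffer, config=config, summary=summary)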