# NOTE: these agents assume the surrounding project provides `util`
# (get_param / get_timestamp), `BaseAgent`, the `Actor` / `Critic` model
# wrappers, `OUNoise` and `ReplayBuffer`; only the standard imports are
# added here.
import os

import numpy as np
import pandas as pd


class DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.state_range = self.task.observation_space.high[2] - self.task.observation_space.low[2]
        self.action_size = 1  # force only
        self.action_range = self.task.action_space.high[2] - self.task.action_space.low[2]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)
        #print('Noise generated')

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)
        print('Replay Buffer initialized')

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Episode variables
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.episode_num = 1

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def step(self, state, reward, done):
        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low[2]) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector
        #print('Transform state vector')

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1

            # Learn from saved experiences
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
            #self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        #print('Action taken!')
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        #print('State preprocessed')
        return state[2]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2] = action  # linear force only
        #print('State postprocessed')
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=['episode', 'total_reward'])  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only
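# The agents in this collection rely on an OUNoise helper that is not shown
# here. The following is a minimal sketch of a temporally correlated
# Ornstein-Uhlenbeck noise process, assuming the constructor signature
# OUNoise(size) used above; the mu, theta and sigma defaults are illustrative
# assumptions only. The OrnsteinUhlenbeckActionNoise / OrnsteinUhlenbeckProcess
# variants used by later agents follow the same idea.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu * np.ones(size)
        self.theta = theta  # pull strength back towards the mean
        self.sigma = sigma  # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state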
class Task01_DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        #self.state_size = np.prod(self.task.observation_space.shape)
        # self.task.observation_space.high = self.task.observation_space.high[2:3]
        # self.task.observation_space.low = self.task.observation_space.low[2:3]
        self.state_range = self.task.observation_space.high - self.task.observation_space.low
        #self.action_size = np.prod(self.task.action_space.shape)
        self.action_range = self.task.action_space.high - self.task.action_space.low

        self.task.observation_space.high = self.task.observation_space.high[2:3]
        self.task.observation_space.low = self.task.observation_space.low[2:3]
        #self.state_range = self.state_range[2:3]
        #self.action_range = self.action_range[2:3]

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.action_size = 1  # force only
        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # # Policy parameters
        # self.w = np.random.normal(
        #     size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
        #     scale=(self.action_range / (2 * self.state_size)).reshape(1, -1))  # start producing actions in a decent range

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Actor (Policy) Model
        #self.action_low = self.task.action_space.low
        #self.action_high = self.task.action_space.high
        self.state_range = self.state_range[2:3]
        self.action_range = self.action_range[2:3]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Episode variables
        #self.reset_episode_vars()

        #---------------------------------------
        # Saving data
        self.stats_filename = os.path.join(
            util.get_param('out') + '/task01/',
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 1  # save weights every n episodes, None to disable
        self.model_dir = util.get_param('out') + '/task01'  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "my-model_" + util.get_timestamp()
        self.model_ext = ".h5"
        # if self.load_weights or self.save_weights_every:
        #     self.actor_filename_local = os.path.join(self.model_dir,
        #         "{}_actor_local{}".format(self.model_name, self.model_ext))
        #     self.critic_filename_local = os.path.join(self.model_dir,
        #         "{}_critic_local{}".format(self.model_name, self.model_ext))
        #     self.actor_filename_target = os.path.join(self.model_dir,
        #         "{}_actor_target{}".format(self.model_name, self.model_ext))
        #     self.critic_filename_target = os.path.join(self.model_dir,
        #         "{}_critic_target{}".format(self.model_name, self.model_ext))
        #     print("Actor local filename :", self.actor_filename_local)  # [debug]
        #     print("Critic local filename:", self.critic_filename_local)  # [debug]
        #     print("Actor target filename :", self.actor_filename_target)  # [debug]
        #     print("Critic target filename:", self.critic_filename_target)  # [debug]

        # Load pre-trained model weights, if available
        #if self.load_weights and os.path.isfile(self.actor_filename_local):
        if self.load_weights:
            try:
                date_of_file = '2018-02-20_11-28-13'
                #date_of_file = '2018-02-20_11-22-27'
                self.actor_filename_local = os.path.join(
                    self.model_dir, 'my-model_{}_actor_local.h5'.format(date_of_file))
                self.critic_filename_local = os.path.join(
                    self.model_dir, 'my-model_{}_critic_local.h5'.format(date_of_file))
                self.actor_filename_target = os.path.join(
                    self.model_dir, 'my-model_{}_actor_target.h5'.format(date_of_file))
                self.critic_filename_target = os.path.join(
                    self.model_dir, 'my-model_{}_critic_target.h5'.format(date_of_file))
                self.actor_local.model.load_weights(self.actor_filename_local)
                self.critic_local.model.load_weights(self.critic_filename_local)
                self.actor_target.model.load_weights(self.actor_filename_target)
                self.critic_target.model.load_weights(self.critic_filename_target)
                print("Model weights loaded from file: {}, {}, {}, {}".format(
                    self.actor_filename_local, self.critic_filename_local,
                    self.actor_filename_target, self.critic_filename_target))  # [debug]
            except Exception as e:
                print("Unable to load model weights from file: {}, {}, {}, {}".format(
                    self.actor_filename_local, self.critic_filename_local,
                    self.actor_filename_target, self.critic_filename_target))
                print("{}: {}".format(e.__class__.__name__, str(e)))

        # Set the name of the weight files to the current timestamp, even if loaded from another timestamp.
        self.actor_filename_local = os.path.join(
            self.model_dir, "{}_actor_local{}".format(self.model_name, self.model_ext))
        self.critic_filename_local = os.path.join(
            self.model_dir, "{}_critic_local{}".format(self.model_name, self.model_ext))
        self.actor_filename_target = os.path.join(
            self.model_dir, "{}_actor_target{}".format(self.model_name, self.model_ext))
        self.critic_filename_target = os.path.join(
            self.model_dir, "{}_critic_target{}".format(self.model_name, self.model_ext))

        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Episode variables
        self.episode = 0
        self.reset_episode_vars()
        #---------------------------------------

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def step(self, state, reward, done):
        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        #----------------------
        # Learn, if at end of episode
        if done:
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename_local)
                self.critic_local.model.save_weights(self.critic_filename_local)
                self.actor_target.model.save_weights(self.actor_filename_target)
                self.critic_target.model.save_weights(self.critic_filename_target)
                print("Model weights saved at episode {}. Model files: {}, {}, {}, {}".format(
                    self.episode, self.actor_filename_local, self.critic_filename_local,
                    self.actor_filename_target, self.critic_filename_target))  # [debug]

            # Extra learning step at the end of the episode (only once a batch can be sampled)
            if len(self.memory) > self.batch_size:
                self.learn(experiences)
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action

        #return action
        # Return complete action vector
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[2:3]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2:3] = action  # linear force only
        return complete_action
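# The replay memory used throughout (ReplayBuffer(buffer_size), .add(...),
# .sample(batch_size), len(...)) is likewise not shown in this collection.
# A minimal sketch, assuming experiences are stored as namedtuples with the
# fields accessed in learn() (e.state, e.action, e.reward, e.next_state, e.done):
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size circular buffer of experience tuples."""

    def __init__(self, buffer_size=100000):
        self.memory = deque(maxlen=buffer_size)  # oldest entries are dropped first

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=64):
        """Return a random batch of stored transitions."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)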
class DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 5  # save weights every n episodes, None to disable
        self.model_dir = util.get_param('out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "ddpg_takeoff"
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.state_range = self.task.observation_space.high[2] - self.task.observation_space.low[2]
        self.action_size = 1  # force only
        self.action_range = self.task.action_space.high[2] - self.task.action_space.low[2]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))
        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)
        print('Replay Buffer initialized')

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Episode variables
        self.episode_num = 0
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "ddpg_takeoff_stats_{}.csv".format(util.get_timestamp()))  # path to CSV file

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0  ## THIS IS NOT USED
        self.episode_num += 1

    def step(self, state, reward, done):
        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low[2]) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1  ## THIS IS NOT USED

        # Learn, if enough samples are available in memory
        #print('length memory: {}, batch size: {}'.format(len(self.memory), self.batch_size))
        if len(self.memory) > self.batch_size:
            # Learn from saved experiences
            experiences = self.memory.sample(self.batch_size)
            #print('learning')
            self.learn(experiences)
            #print('learned')

        if done:
            # Save model weights at regular intervals
            if self.save_weights_every and self.episode_num % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode", self.episode_num)  # [debug]

            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            print('Amount of steps in this episode:', self.count)
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[2]  # z-position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2] = action  # linear force only
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=['episode', 'total_reward'])  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only
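# Every Keras-based agent above calls self.actor_local.train_fn([states,
# action_gradients, 1]): the Actor wrapper is expected to expose a custom
# training function that ascends the action-value gradient supplied by the
# critic. A minimal sketch of such a wrapper, assuming Keras 2.1+; the layer
# sizes here are illustrative assumptions, not the original authors' choices.
from keras import layers, models, optimizers
from keras import backend as K


class Actor:
    """Deterministic policy network mapping states to bounded actions."""

    def __init__(self, state_size, action_size, action_low, action_high):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_range = action_high - action_low
        self.build_model()

    def build_model(self):
        states = layers.Input(shape=(self.state_size,), name='states')
        net = layers.Dense(units=32, activation='relu')(states)
        net = layers.Dense(units=64, activation='relu')(net)
        raw_actions = layers.Dense(units=self.action_size, activation='sigmoid',
                                   name='raw_actions')(net)
        # Rescale the sigmoid output from [0, 1] to the task's action range
        actions = layers.Lambda(lambda x: (x * self.action_range) + self.action_low,
                                name='actions')(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Loss = -Q, with dQ/da supplied externally by the critic
        action_gradients = layers.Input(shape=(self.action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        # train_fn([states, action_gradients, learning_phase]), as called in learn() above
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)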
class DDPG(BaseAgent):

    def __init__(self, task):
        print('start DDPG')
        self.task = task
        self.state_size = 1
        self.action_size = 1
        self.space_low = self.task.observation_space.low[2:3]

        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save

        # Episode variables
        self.reset_episode_vars()

        self.actor_learning_rate = 0.0001
        self.tau = 0.99
        self.mini_batch_size = 64
        self.buffer_size = 100000
        self.critic_learning_rate = 0.001
        self.gamma = 0.88
        self.episode = 0

        # Load/save parameters
        self.load_weights = False  # try to load weights from previously saved models
        self.save_weights_every = 50  # save weights every n episodes, None to disable
        self.model_dir = util.get_param('out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "my-model4"  #my-model3
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        self.memory = ReplayBuffer(self.buffer_size)

        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))
        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def postprocess_action(self, action):
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2:3] = action  # linear force only
        return complete_action

    def step(self, state, reward, done):
        # Transform state vector
        old_height = state[2:3]
        state = (old_height - self.space_low) / self.state_size  # shift by the lower bound (state_size == 1, so no real rescaling happens)
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        if len(self.memory) > self.mini_batch_size:
            self.learn(self.memory.sample(self.mini_batch_size))

        if done:
            print('reward', self.total_reward, "height", old_height)
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                #print("Model weights saved at episode", self.episode)
            self.write_stats([self.episode, self.total_reward])
            self.episode += 1
            self.reset_episode_vars()

        final_action = self.actor_target.model.predict_on_batch(state)
        self.last_state = state
        self.last_action = final_action
        return self.postprocess_action(final_action)

    def act(self, states):
        actions = self.actor_local.model.predict(states)
        return actions + self.actor_noise.sample()

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG(BaseAgent):

    def __init__(self, task):
        self.task = task
        self.state_size = 3  # position only
        self.action_size = 3  # force only
        self.action_low = self.task.action_space.low[0:3]
        self.action_high = self.task.action_space.high[0:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 100  # None to disable
        self.model_dir = util.get_param('out')
        self.model_name = "my-model"
        self.model_ext = ".h5"
        self.episode = 0
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename:", self.actor_filename)
            print("Critic filename:", self.critic_filename)

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))
        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        self.rewards_list = []
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]
        #print("init complete")  # [debug]

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode += 1

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[0:3]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[0:3] = action  # linear force only
        return complete_action

    def step(self, state, reward, done):
        #print("take a step")  # [debug]
        # Reduce state vector
        state = self.preprocess_state(state)

        # Choose an action (get action through local actor network)
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        # Learn, if the replay buffer is large enough to sample experiences (online learning)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        if done:
            #print("Done")  # [debug]
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode", self.episode)  # [debug]
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        #print("end of step")  # [debug]
        return self.postprocess_action(action)

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only
        print(stats)  # [debug]

    def learn(self, experiences):
        #print("start learn")  # [debug]
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        #   learning_phase() = 0 -> test mode
        #   learning_phase() = 1 -> train mode
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.actor_local, self.actor_target)
        self.soft_update(self.critic_local, self.critic_target)

    def act(self, states):
        #print("act")  # [debug]
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.model.get_weights())
        target_weights = np.array(target_model.model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.model.set_weights(new_weights)
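# The matching Critic wrapper is expected to expose
# get_action_gradients([states, actions, learning_phase]), returning dQ/da for
# the actor update in learn() above. A minimal sketch under the same Keras 2.1+
# assumption; the layer sizes are again illustrative, not the authors' choices.
from keras import layers, models, optimizers
from keras import backend as K


class Critic:
    """Action-value network Q(s, a) with a gradient hook for the actor."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.build_model()

    def build_model(self):
        states = layers.Input(shape=(self.state_size,), name='states')
        actions = layers.Input(shape=(self.action_size,), name='actions')
        net_states = layers.Dense(units=32, activation='relu')(states)
        net_actions = layers.Dense(units=32, activation='relu')(actions)
        net = layers.Add()([net_states, net_actions])  # merge the two pathways
        net = layers.Activation('relu')(net)
        Q_values = layers.Dense(units=1, name='q_values')(net)

        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(), loss='mse')

        # Gradient of Q with respect to the action input, exposed as a callable
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)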
class DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""

    def setup_weights(self):
        # Save/load weights
        self.load_weights = True
        self.save_weights_every = 50
        self.model_dir = util.get_param('out')
        self.model_name = "ddpg"
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)
            print("Critic filename:", self.critic_filename)

        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))
        else:
            self.critic_target.set_weights(self.critic_local)
            self.actor_target.set_weights(self.actor_local)

    def __init__(self, task):
        self.task = task
        self.state_size = 3
        self.action_size = 3

        # Set action space limits
        self.action_low = self.task.action_space.low[0:3]
        self.action_high = self.task.action_space.high[0:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))
        action = [self.action_size, self.action_low, self.action_high]

        # Initialize networks
        # Actor
        self.actor_local = Actor(self.state_size, action)
        self.actor_target = Actor(self.state_size, action)
        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.setup_weights()

        # Noise
        self.noise = OUNoise(self.action_size)

        # Replay buffer
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size)

        # Hyperparameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Log file
        self.stats = os.path.join(
            util.get_param('out'), "stats_{}.csv".format(util.get_timestamp()))
        self.episode_no = 1
        self.stats_columns = ['episodes', 'total_reward']
        print("Saving stats {} to {}".format(self.stats_columns, self.stats))

        # Episode variables
        self.reset_episode_vars()

    def preprocess_state(self, state):
        return state[0:3]

    def postprocess_action(self, action):
        constrained_action = np.zeros(self.task.action_space.shape)
        constrained_action[0:3] = action
        return constrained_action

    def write(self, data):
        df_stats = pd.DataFrame([data], columns=self.stats_columns)
        df_stats.to_csv(self.stats, mode='a', index=False,
                        header=not os.path.isfile(self.stats))

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def step(self, state, reward, done):
        state = self.preprocess_state(state)

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.total_reward += reward
            self.count += 1
            self.memory.add_experience(state, action, reward, self.last_state, done)

        # Learn, if enough samples are available in memory
        if self.memory.len() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
            self.episode_no += 1

        if done:
            if self.save_weights_every and self.episode_no % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode", self.episode_no)
            self.write([self.episode_no, self.total_reward])
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, state):
        # Choose action based on given state and policy
        states = np.reshape(state, [-1, self.state_size])
        actions = self.actor_local.predict(states)
        return actions + self.noise.sample()

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.state_next for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        '''Update target model parameters towards the local model.'''
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.update_weights(new_weights)
class BaseAgent:
    """Advanced base agent that lets you limit the action and state space."""

    def __init__(self, task, action_min, action_max, state_min, state_max):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        self.min_action = action_min  # define minimum and maximum action
        self.max_action = action_max
        self.min_stat = state_min  # define minimum and maximum state
        self.max_stat = state_max
        self.learn_when_done = False  # defines if the agent shall only learn at the end of each episode

        # Constrain state and action spaces
        self.state_size = self.max_stat - self.min_stat + 1  # position only
        self.action_size = self.max_action - self.min_action + 1  # force only
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Calculate state space minimum and range
        self.state_low = self.task.observation_space.low[self.min_stat:self.max_stat + 1]
        self.state_range = self.task.observation_space.high[self.min_stat:self.max_stat + 1] - self.state_low
        # self.action_size = np.prod(self.task.action_space.shape)

        # Calculate action space minimum, maximum and range
        self.action_low = self.task.action_space.low[self.min_action:self.max_action + 1]
        self.action_high = self.task.action_space.high[self.min_action:self.max_action + 1]
        self.action_range = self.action_high - self.action_low

        # Replay memory
        self.epsilon = 0.0
        self.batch_size = 64
        self.buffer_size = 100000
        self.memory = ReplayBuffer(self.buffer_size)

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward', 'learning_rate']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]

        # Episode variables
        self.reset_episode_vars()

    def reset_episode_vars(self):
        """Reset the current episode's stats."""
        self.last_state = None
        self.org_last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[self.min_stat:self.max_stat + 1]  # limit to the desired state range

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[self.min_action:self.max_action + 1] = action  # extend to original size again
        return complete_action

    def handle_step_index(self, done):
        """Called once each turn for periodic events."""
        pass

    def step(self, state, reward, done):
        """Handle a single step:
        - Convert the input state to a simpler one
        - Estimate the best action
        - Learn every x rounds
        - Write stats to the log
        - Convert the internal action back to an external one and return it"""
        org_state = state
        # print("Shape: {}".format(state.shape))

        # Transform state vector
        state = self.preprocess_state(state)
        # print("PP Shape: {}".format(state.shape))
        # print("{} {} {}".format(state.shape, self.state_low.shape, self.state_range.shape))
        state = (state - self.state_low) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            # print("Action shape {}".format(self.last_action.shape))
            if len(self.memory) == self.batch_size - 1:
                print("Buffer filled, starting learning")
            self.memory.add(self.last_state, self.last_action, reward, state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size and (not self.learn_when_done or done):
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        # Sum rewards
        if self.last_state is not None and self.last_action is not None:
            self.total_reward += reward
            self.count += 1

        # Convert the action from the restricted space back to the full space
        pp_action = self.postprocess_action(action)

        # At the end of an episode, write stats and reset
        if done:
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward, self.epsilon])
            print("Reward: {} Exploration rate: {}".format(self.total_reward, self.epsilon))
            self.episode_num += 1
            self.reset_episode_vars()

        # Remember this round's data
        self.last_state = state
        self.org_last_state = org_state
        self.last_action = action

        # Notify the high-level handler
        self.handle_step_index(done)

        return pp_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        pass

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        pass
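# A minimal sketch of how a concrete agent could plug into the BaseAgent above:
# the constructor fixes the constrained slices, act() returns a random action
# inside the constrained range, and learn() is a no-op. The class name and its
# random-policy behaviour are illustrative assumptions, not original code.
import numpy as np


class RandomConstrainedAgent(BaseAgent):
    """Baseline agent acting uniformly at random in the constrained space."""

    def __init__(self, task):
        # Constrain state and action to dimension 2 (z position / vertical force) only
        super().__init__(task, action_min=2, action_max=2, state_min=2, state_max=2)

    def act(self, states):
        # Uniform random action within the constrained bounds
        return np.random.uniform(self.action_low, self.action_high,
                                 size=(1, self.action_size))

    def learn(self, experiences):
        # A random policy has nothing to learn
        pass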
class DDPG(BaseAgent):
    '''Agent that searches for an optimal policy using Deep Deterministic Policy Gradients.'''

    def __init__(self, task):
        '''
        Initializes variables

        :param task: Should be able to access the following (OpenAI Gym spaces):
            task.observation_space  # i.e. state space
            task.action_space
        '''
        super(DDPG, self).__init__(task)

        self.use_gpu = torch.cuda.is_available()
        self.task = task

        # Hyperparameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Constrained states
        self.state_size = np.prod(self.task.observation_space.shape).item()
        # Constrained actions
        self.action_size = 1
        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]

        # Actor model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, self.use_gpu)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, self.use_gpu)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), 1e-4)

        # Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), 1e-3)

        # Load the models and sync weights into the target models
        self.best_model_loaded = self.load_models(self.actor_local, self.critic_local)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)
        print('Best model loaded: {}'.format(self.best_model_loaded))

        # Use GPU?
        if self.use_gpu:
            self.actor_local.cuda()
            self.actor_target.cuda()
            self.critic_local.cuda()
            self.critic_target.cuda()

        # Ornstein-Uhlenbeck noise for action sampling
        self.noise = OrnsteinUhlenbeckProcess(size=self.action_size, theta=0.15, sigma=0.02)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size)

        # Score tracker and learning parameters
        self.best_score = -np.inf

        # Episode variables
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num = 1
        # Actions to return from step(): all set to 0 except the one for vertical forces
        self.acts = np.zeros(shape=self.task.action_space.shape)

    def reset_episode_vars(self):
        '''Resets episode variables'''
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num += 1
        self.acts = np.zeros(shape=self.task.action_space.shape)

    def step(self, state, reward, done):
        '''Process state, reward, done flag, and return an action.

        :param state: current state vector as NumPy array, compatible with task's state space
        :param reward: last reward received
        :param done: whether this episode is complete
        :return: desired action vector as NumPy array, compatible with task's action space
        '''
        # Choose an action
        state = state[0:self.state_size]
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        # Learn, if we have enough samples
        if len(self.memory) > self.batch_size and not self.best_model_loaded:
            experience = self.memory.sample(self.batch_size)
            self.learn(experience)

        # Write statistics and save the model when the episode is done
        if done:
            score = self.total_reward / float(self.count) if self.count else 0.0
            if score > self.best_score:
                self.best_score = score
                self.save_models(self.episode_num, self.actor_target, self.critic_target, True)
            print("DDPG.learn(): t = {:4d}, score = {:7.3f} (best = {:7.3f}), total reward = {:7.3f}, episode = {}".format(
                self.count, score, self.best_score, self.total_reward, self.episode_num))
            if self.episode_num % 10 == 0:
                self.save_models(self.episode_num, self.actor_target, self.critic_target, False)
            self.write_episode_stats(self.episode_num, self.total_reward)
            self.reset_episode_vars()

        # Save the last state and action
        self.last_state = state
        self.last_action = action

        self.acts[2] = action  # change only vertical forces
        return self.acts

    def act(self, state):
        '''
        Predict actions for a state

        :param state: NumPy array, environment state
        :return: NumPy array, predicted actions
        '''
        state = self.to_var(torch.from_numpy(state).float())
        self.actor_local.eval()
        action = self.actor_local.forward(state).detach()
        return action.data.cpu().numpy() + self.noise.sample()

    def to_var(self, x_numpy):
        '''
        Helper to convert a NumPy array to a PyTorch tensor

        :param x_numpy: NumPy array to convert
        :return: PyTorch tensor
        '''
        x_var = Variable(x_numpy)
        if self.use_gpu:
            x_var = x_var.cuda()
        return x_var

    def learn(self, experiences):
        '''
        Trains the networks

        :param experiences: tuple of the experience - (states, actions, rewards, next_states, dones)
        '''
        # -------------------- get data from batch --------------------
        # Get experiences from the replay buffer
        states = np.vstack(experiences[0])
        states = self.to_var(torch.from_numpy(states).float())
        actions = np.vstack(experiences[1])
        actions = self.to_var(torch.from_numpy(actions).float())
        rewards = np.float32(experiences[2])
        rewards = self.to_var(torch.from_numpy(rewards))
        rewards = torch.unsqueeze(rewards, 1)
        next_states = np.vstack(experiences[3])
        next_states = self.to_var(torch.from_numpy(next_states).float())
        dones = np.float32(experiences[4])
        not_dones = self.to_var(torch.from_numpy(1 - dones))
        not_dones = torch.unsqueeze(not_dones, 1)

        # ---------------------- optimize critic ----------------------
        next_actions = self.actor_target.forward(next_states).detach()
        Q_targets_next = self.critic_target.forward(next_states, next_actions).detach()
        Q_targets_next = not_dones * Q_targets_next
        Q_targets = rewards + (self.gamma * Q_targets_next)
        Q_predicted = self.critic_local.forward(states, actions)

        # Compute the critic model loss and train it
        value_loss = nn.SmoothL1Loss()(Q_predicted, Q_targets)
        self.critic_local.zero_grad()
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # ---------------------- optimize actor -----------------------
        predicted_actions = self.actor_local.forward(states)
        policy_loss = torch.mean(-self.critic_local.forward(states, predicted_actions))
        self.actor_local.zero_grad()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft update of target models
        self.soft_update(self.actor_target, self.actor_local)
        self.soft_update(self.critic_target, self.critic_local)

    def hard_update(self, target_model, local_model):
        '''
        Hard update of the target model weights - just copy them from the local model

        :param target_model: Destination, target model
        :param local_model: Source, local model
        '''
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target_model, local_model):
        '''
        Soft update of the target model weights as prescribed by the DDPG algorithm

        :param target_model: Destination, target model
        :param local_model: Source, local model
        '''
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
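# The PyTorch agent above expects Actor / Critic nn.Module classes with the
# constructor signatures used in its __init__ and plain forward() calls.
# A minimal sketch under those assumptions; the hidden sizes are illustrative
# and the use_gpu flag is accepted only for signature compatibility.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network mapping states to bounded actions."""

    def __init__(self, state_size, action_size, action_low, action_high, use_gpu=False):
        super(Actor, self).__init__()
        # Registered as buffers so .cuda() moves them along with the weights
        self.register_buffer('action_low',
                             torch.from_numpy(np.asarray(action_low, dtype=np.float32)))
        self.register_buffer('action_range',
                             torch.from_numpy(np.asarray(action_high - action_low,
                                                         dtype=np.float32)))
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.out = nn.Linear(64, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        raw = torch.sigmoid(self.out(x))  # squashed to [0, 1]
        return raw * self.action_range + self.action_low


class Critic(nn.Module):
    """Action-value network Q(s, a)."""

    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_size + action_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.out = nn.Linear(64, 1)

    def forward(self, state, action):
        x = torch.cat([state, action], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)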
class BaseAgentDDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task

        # Load/save parameters
        self.load_weights = False  # try to load weights from previously saved models
        self.save_weights_every = None  # save weights every n episodes, None to disable
        self.model_dir = util.get_param('out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "ddpg-{}".format(self.task.__class__.__name__)
        self.model_ext = ".h5"

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}_{}.csv".format(self.model_name, util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]

        # Constrain state and action spaces
        self.state_start = 2
        self.state_end = 3
        self.action_start = 2
        self.action_end = 3

        # Noise process
        self.theta = 0.15
        self.sigma = 0.3

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.005  # for soft update of target parameters

        # Episode variables
        self.episode = 0
        self.episode_duration = 0
        self.total_reward = 0
        self.last_state = None
        self.last_action = None
        self.reset_episode_vars()

        # Override params in child classes
        self.init_params()

        self.state_size = self.state_end - self.state_start
        self.action_size = self.action_end - self.action_start
        self.action_low = self.task.action_space.low[self.action_start:self.action_end]
        self.action_high = self.task.action_space.high[self.action_start:self.action_end]
        self.noise = OrnsteinUhlenbeckProcess(size=self.action_size,
                                              theta=self.theta, sigma=self.sigma)

        # Model file names (defined before they are needed for loading weights below)
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        # Actor (Policy) Model
        self.actor_learning_rate = 0.0001
        self.actor_local = None
        self.actor_target = None
        self.init_actor_models()

        # Critic (Value) Model
        self.critic_learning_rate = 0.001
        self.critic_local = None
        self.critic_target = None
        self.init_critic_models()

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            self.load_weights_from_file()
        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

    def reset_episode_vars(self):
        self.total_reward = 0
        self.episode_duration = 0
        self.last_state = None
        self.last_action = None

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[self.state_start:self.state_end]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[self.action_start:self.action_end] = action  # linear force only
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def step(self, state, reward, done):
        state = self.preprocess_state(state)
        self.total_reward += reward

        # Choose an action
        action = self.act(state)
        self.episode_duration += 1

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
        self.last_state = state
        self.last_action = action

        if done:
            # Write episode stats
            self.write_stats([self.episode, self.total_reward])
            print('episode={}, reward={:8.3f}, duration={}'.format(
                self.episode, self.total_reward, self.episode_duration))

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.save_weights()
            self.episode += 1
            self.reset_episode_vars()

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.predict_actions(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def soft_update(self, local_model, target_model):
        raise NotImplementedError("{} must override soft_update()".format(self.__class__.__name__))

    def init_params(self):
        raise NotImplementedError("{} must override init_params()".format(self.__class__.__name__))

    def init_actor_models(self):
        raise NotImplementedError("{} must override init_actor_models()".format(self.__class__.__name__))

    def init_critic_models(self):
        raise NotImplementedError("{} must override init_critic_models()".format(self.__class__.__name__))

    def load_weights_from_file(self):
        raise NotImplementedError("{} must override load_weights_from_file()".format(self.__class__.__name__))

    def save_weights(self):
        raise NotImplementedError("{} must override save_weights()".format(self.__class__.__name__))

    def predict_actions(self, states):
        raise NotImplementedError("{} must override predict_actions(states)".format(self.__class__.__name__))

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        raise NotImplementedError("{} must override learn(experiences)".format(self.__class__.__name__))
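# BaseAgentDDPG defers all framework-specific work to hooks. A minimal sketch
# of a Keras-backed subclass wiring those hooks to the Actor / Critic wrappers
# sketched earlier; the class name, layer choices and the decision to reuse
# those wrappers are assumptions, not part of the original code.
import os

import numpy as np


class KerasDDPG(BaseAgentDDPG):

    def init_params(self):
        # Keep the base-class defaults (z-axis slice, gamma, tau, noise settings)
        pass

    def init_actor_models(self):
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

    def init_critic_models(self):
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

    def predict_actions(self, states):
        return self.actor_local.model.predict(states)

    def load_weights_from_file(self):
        self.actor_local.model.load_weights(self.actor_filename)
        self.critic_local.model.load_weights(self.critic_filename)

    def save_weights(self):
        self.actor_local.model.save_weights(self.actor_filename)
        self.critic_local.model.save_weights(self.critic_filename)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        target_model.set_weights(self.tau * local_weights + (1 - self.tau) * target_weights)

    def learn(self, experiences):
        # Mirrors the Keras learn() steps used by the agents above:
        # critic train_on_batch, actor train_fn, then soft target updates.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)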