def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

    # Score
    self.score = -np.inf
    self.best_score = -np.inf
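# The constructors in this section all assume an OUNoise class implementing an
# Ornstein-Uhlenbeck process for temporally correlated exploration noise. A
# minimal sketch matching the (size, mu, theta, sigma) call signature used
# above; the exact class in each project may differ:

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: noise that drifts back toward a mean."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta  # strength of the pull back toward mu
        self.sigma = sigma  # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state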
def __init__(self, task, exp_mu, exp_theta, exp_sigma, gamma, tau):
    self.task = task
    self.s_size = task.s_size
    self.a_size = task.a_size
    self.a_low = task.a_low
    self.a_high = task.a_high

    # Actor Model
    self.actor_local = Actor(self.s_size, self.a_size, self.a_low, self.a_high)
    self.actor_target = Actor(self.s_size, self.a_size, self.a_low, self.a_high)

    # Critic Model
    self.critic_local = Critic(self.s_size, self.a_size)
    self.critic_target = Critic(self.s_size, self.a_size)

    # Initialize target model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Initialize noise
    self.exp_mu = exp_mu
    self.exp_theta = exp_theta
    self.exp_sigma = exp_sigma
    self.noise = OUNoise(self.a_size, self.exp_mu, self.exp_theta,
                         self.exp_sigma)

    # Replay buffer
    self.buff_size = 1024 * 1024
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buff_size, self.batch_size)

    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters
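# Most of these agents also assume a fixed-size ReplayBuffer constructed with
# (buffer_size, batch_size). A minimal sketch, assuming add() and sample()
# methods and the same Experience field names used by the twin-critic agent
# below; the method names are an assumption, not a shared interface:

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples sampled uniformly at random."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest entries drop off
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store one transition."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a uniformly random minibatch of experiences."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)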
def __init__(self, task, lra, lrc, db):
    self.task = task
    self.s_sz = task.state_size
    self.a_sz = task.action_size
    self.a_max = task.max_action

    # Actor (Policy) Model
    self.actor_local = Actor(self.s_sz, self.a_sz, lra)
    self.actor_target = Actor(self.s_sz, self.a_sz, lra)

    # First Critic (Value) Model
    self.critic_local_1 = Critic(self.s_sz, self.a_sz, lrc)
    self.critic_target_1 = Critic(self.s_sz, self.a_sz, lrc)

    # Second Critic (Value) Model
    self.critic_local_2 = Critic(self.s_sz, self.a_sz, lrc)
    self.critic_target_2 = Critic(self.s_sz, self.a_sz, lrc)

    # Initialize target model parameters with local model parameters
    self.critic_target_1.model.set_weights(
        self.critic_local_1.model.get_weights())
    self.critic_target_2.model.set_weights(
        self.critic_local_2.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.noise = GaussianNoise(self.a_sz)

    # Replay memory
    self.num_exp = 0
    self.batch = 32
    self.buffer = 10000
    labels = ["state", "action", "reward", "next_state", "done"]
    self.experience = namedtuple("Experience", field_names=labels)
    self.memory = PrioritizedReplayBuffer(self.buffer, self.batch, db)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.005   # for soft update of target parameters
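# The twin-critic (TD3-style) constructor above swaps the OU process for
# uncorrelated Gaussian exploration noise. A minimal sketch of a GaussianNoise
# class matching the single-argument call above; the sigma default is an
# assumption:

import numpy as np


class GaussianNoise:
    """Zero-mean, uncorrelated Gaussian exploration noise."""

    def __init__(self, size, sigma=0.1):  # sigma default is an assumption
        self.size = size
        self.sigma = sigma

    def sample(self):
        # Unlike the OU process, no state is carried between calls
        return self.sigma * np.random.randn(self.size)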
def __init__(self, task): """Initialize models""" self.env = task self.state_size = task.state_size self.action_size = task.action_size self.action_high = task.action_high self.action_low = task.action_low # Initialize Actor (policy) models self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Initialize Critic (value) models self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay buffer self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.9 # discount factor self.tau = 0.001 # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Noise process parameters
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.001

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size)

    # Algorithm parameters
    self.gamma = 0.99            # discount factor
    self.tau = 0.1               # for soft update of target parameters
    self.learning_rate = 0.0005

    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             learning_rate=self.learning_rate)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              learning_rate=self.learning_rate)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               learning_rate=self.learning_rate)
    self.critic_target = Critic(self.state_size, self.action_size,
                                learning_rate=self.learning_rate)

    # Initialize target model parameters with local model parameters
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.3
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.3
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters
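# The tau parameter above drives DDPG's soft target updates:
#   theta_target <- tau * theta_local + (1 - tau) * theta_target
# A minimal Keras-style sketch of that update, assuming the .model attribute
# these agents expose; written as a free function here, though most of these
# agents define it as a method:

def soft_update(local_model, target_model, tau):
    """Blend local network weights into the target network."""
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)

# Typical use after each learning step:
#   soft_update(self.critic_local.model, self.critic_target.model, self.tau)
#   soft_update(self.actor_local.model, self.actor_target.model, self.tau)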
def create_models(self, hidden_sizes_actor=(512, 256),
                  hidden_sizes_critic=(512, 256, 256)):
    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             hidden_sizes=hidden_sizes_actor)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              hidden_sizes=hidden_sizes_actor)
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               hidden_sizes=hidden_sizes_critic)
    self.critic_target = Critic(self.state_size, self.action_size,
                                hidden_sizes=hidden_sizes_critic)
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Critic
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())

    # Exploration noise
    self.exploration_mu = 0.1
    self.exploration_sigma = 0.1
    self.exploration_theta = 0.1
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Experience
    self.buffer_size = 100000000
    self.batch_size = 64
    self.buffer = ReplayBuffer(self.buffer_size)

    # Parameters
    self.gamma = 0.99
    self.tau = 0.001
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model params with local params
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Initialize noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory initialization
    self.buffer_size, self.batch_size = 2000000, 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Initialize algorithm parameters
    self.gamma, self.tau = 0.95, 0.001

    # Initialize scores
    self.score, self.best_score = -np.inf, -np.inf
def __init__(self, task, single_rotor_control=False, prioritised_replay=False):
    tf.reset_default_graph()
    self.task = task
    self.state_size = self.task.state_size
    self.action_size = self.task.action_size
    self.action_low = self.task.action_low
    self.action_high = self.task.action_high
    self.action_range = self.action_high - self.action_low
    self.prioritised_replay = prioritised_replay

    with tf.variable_scope("local"):
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 single_rotor_control, is_training=False)
        self.critic_local = Critic(self.state_size, self.action_size,
                                   is_training=False)
    with tf.variable_scope("target"):
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  single_rotor_control)
        self.critic_target = Critic(self.state_size, self.action_size)

    # Target model parameters are synced with local model parameters via the
    # graph ops built in soft_update(), not via Keras set_weights calls.
    self.tau = tf.placeholder(tf.float32, name='tau')
    self.target_update_ops = self.soft_update()

    with tf.name_scope('summary'):
        self.reward_log = tf.Variable(0., False, name='reward_log',
                                      dtype=tf.float32)
        self.eps_length_log = tf.Variable(0., False, name='eps_length_log',
                                          dtype=tf.float32)
        tf.summary.scalar('reward_log', self.reward_log)
        tf.summary.scalar('eps_length_log', self.eps_length_log)
        self.summary_op = tf.summary.merge_all()

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = .3   # (self.action_high - self.action_low) * .05
    self.exploration_sigma = .4   # (self.action_high - self.action_low) * .05
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
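# The TF1 agent above builds its target updates as graph ops (soft_update())
# instead of the Keras set_weights pattern used elsewhere in this section. A
# plausible sketch of such a method, assuming the "local"/"target" variable
# scopes declared in __init__ contain matching architectures; this is an
# illustration, not the original implementation:

import tensorflow as tf


def soft_update(self):
    """Build ops for target <- tau * local + (1 - tau) * target."""
    local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope="local")
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope="target")
    # Pair variables by name order; assumes both scopes define the same layers
    ops = [tf.assign(tv, self.tau * lv + (1.0 - self.tau) * tv)
           for lv, tv in zip(sorted(local_vars, key=lambda v: v.name),
                             sorted(target_vars, key=lambda v: v.name))]
    return tf.group(*ops)

# Typical use: sess.run(self.target_update_ops, feed_dict={self.tau: 0.001})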
def __init__(self, task, basename):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Learning rates
    self.actor_learning_rate = 0.0001
    self.critic_learning_rate = 0.001

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             self.actor_learning_rate)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              self.actor_learning_rate)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               self.critic_learning_rate)
    self.critic_target = Critic(self.state_size, self.action_size,
                                self.critic_learning_rate)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 128
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

    # Keep track of the best run
    self.nEpisode = 0
    self.bestEpisode = []
    self.bestEpisodeAt = -1

    # Logging
    self.state_labels = self.task.get_state_labels()
    self.action_labels = ['ac{}'.format(i) for i in range(self.action_size)]
    self.df_columns = ['t'] + self.state_labels.tolist() \
        + self.action_labels + ['R']
    self.basename = os.path.join('log', basename)
    self.currentEpisode = []
    self.bestCumReward = -np.inf
def __init__(self, task):
    '''DDPG agent.

    Params
    ======
        task (object): environment

    References
    ==========
        Continuous Control With Deep Reinforcement Learning (2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C)
        with tf.keras

    Hyperparameters
    ===============
        gamma: 0.99
        tau: 0.001
        buffer_size (ReplayBuffer): 1e6
        batch_size (ReplayBuffer): 64
        theta (Ornstein-Uhlenbeck process): 0.15
        sigma (Ornstein-Uhlenbeck process): 0.2
    '''
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (policy) model: two copies, one for updating and one for
    # producing targets
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (value) model: two copies, one for updating and one for
    # producing targets
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

    # Reward history
    self.best_avg_score = -np.inf
    self.accumulated_reward = 0
    self.count = 0
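# For context, the pieces initialized above come together in DDPG's learn
# step. A minimal sketch following the common Keras scaffold these agents
# resemble; get_action_gradients and train_fn are assumed helper hooks on the
# Critic and Actor wrappers, not confirmed parts of this code:

import numpy as np


def learn(self, experiences):
    """One DDPG update from a sampled minibatch of experience tuples."""
    states = np.vstack([e.state for e in experiences if e is not None])
    actions = np.array([e.action for e in experiences if e is not None]) \
        .astype(np.float32).reshape(-1, self.action_size)
    rewards = np.array([e.reward for e in experiences if e is not None]) \
        .astype(np.float32).reshape(-1, 1)
    dones = np.array([e.done for e in experiences if e is not None]) \
        .astype(np.uint8).reshape(-1, 1)
    next_states = np.vstack(
        [e.next_state for e in experiences if e is not None])

    # Bootstrapped Q targets come from the *target* networks
    actions_next = self.actor_target.model.predict_on_batch(next_states)
    Q_targets_next = self.critic_target.model.predict_on_batch(
        [next_states, actions_next])
    Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

    # Fit the local critic toward the bootstrapped targets
    self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

    # Move the local actor along the critic's action-gradient direction
    action_gradients = np.reshape(
        self.critic_local.get_action_gradients([states, actions, 0]),
        (-1, self.action_size))
    self.actor_local.train_fn([states, action_gradients, 1])

    # Soft-update both target networks (see soft_update earlier in this section)
    soft_update(self.critic_local.model, self.critic_target.model, self.tau)
    soft_update(self.actor_local.model, self.actor_target.model, self.tau)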
def __init__(self, task, mu=0.02, theta=0.16, sigma=0.21, buffer=500000,
             batch=64, gamma=0.98, tau=0.02, learning=0.001, dropout=0.2):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high,
                             learning, dropout)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high,
                              learning, dropout)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size,
                               learning, dropout)
    self.critic_target = Critic(self.state_size, self.action_size,
                                learning, dropout)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(
        self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(
        self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = mu
    self.exploration_theta = theta
    self.exploration_sigma = sigma
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = buffer
    self.batch_size = batch
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters

    # Score tracker and learning parameters
    self.score = 0
    self.best_score = -np.inf
    self.noise_scale = 0.1

    # Episode variables
    self.reset_episode()
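# The constructor above ends by calling reset_episode(), which is not shown.
# A plausible sketch, assuming task.reset() returns the initial state and that
# last_state is what the agent's step/act methods consume; the exact
# bookkeeping is an assumption:

def reset_episode(self):
    """Reset noise, environment, and per-episode score; return initial state."""
    self.noise.reset()
    state = self.task.reset()
    self.last_state = state
    self.score = 0
    return state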