def __init__(self, obs_space, act_space, sess, n_agents, name):
    self.act_space = act_space
    self.n_agents = n_agents
    self.ped_dqn = DQN(sess, obs_space, sup_len, act_space, n_agents, name)
    self.action_rb = ReplayBuffer(capacity=rb_capacity)
    self.mission_rb = ReplayBuffer(capacity=mrb_capacity)
    self.train_cnt = 0
    self.mission_train_cnt = 0
    self.sns_q = None
def __init__(self, task, seed=None, render=False):
    self.env = task.env
    self.total_reward = 0
    self.steps = 0
    self.action_repeat = 3
    self.render = render

    # Score tracker and learning parameters
    self.score = -np.inf
    self.best_w = None
    self.best_score = -np.inf
    self.noise_scale = 0.1

    # Counter
    self.count = 0

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(1, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
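# Nearly every constructor in this collection builds OUNoise(size, mu, theta, sigma),
# but the class itself never appears. Below is a minimal Ornstein-Uhlenbeck sketch
# consistent with those calls (the reset()/sample() method names are assumptions,
# not confirmed by these snippets): theta pulls the state back toward mu, and sigma
# scales the random kick, giving temporally correlated exploration noise.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state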
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    self.exploration_mu = 0
    self.exploration_theta = 0.10  # same direction
    self.exploration_sigma = 0.001  # random noise
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    self.gamma = 0.90  # discount factor
    self.tau = 0.1  # for soft update of target parameters

    self.best_score = -np.inf
    self.score = 0
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    self.gamma = 0.99
    self.tau = 0.001
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.01
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor (0.99)
    self.tau = 0.1  # for soft update of target parameters (0.01)
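# Each agent above pairs every local model with a target model and keeps a tau
# "for soft update of target parameters". Below is a sketch of that update for
# Keras models; the helper name soft_update is an assumption, but the blend
# target = tau * local + (1 - tau) * target is the standard DDPG rule that the
# tau comments in these constructors refer to.
def soft_update(local_model, target_model, tau):
    """Blend local model weights into the target model in place."""
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_model.get_weights(),
                                     target_model.get_weights())]
    target_model.set_weights(new_weights)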
def __init__(self, task, buffer_size=100000, batch_size=64, gamma=0.99, tau=0.01):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, learning_rate=1e-3)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, learning_rate=1e-3)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, learning_rate=1e-4)
    self.critic_target = Critic(self.state_size, self.action_size, learning_rate=1e-4)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = OUNoise(size=self.action_size)

    # Replay memory
    self.buffer_size = buffer_size
    self.batch_size = batch_size  # 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = gamma  # 0.99, discount factor
    self.tau = tau  # 0.01, for soft update of target parameters

    # Initialization
    self.last_state = None
    self.total_reward = 0.0

    # Score tracker and learning parameters
    self.score = 0
    self.best_score = -np.inf
    self.count = 0
@classmethod
def load_model(cls, filename):
    # Open the pickle in binary mode; the Keras models are reloaded separately
    # because they do not pickle cleanly.
    with open(filename + '.ddpg_agent', 'rb') as f:
        m = pickle.load(f)
    m.actor_local = load_model(filename + '.actor_local')
    m.actor_target = load_model(filename + '.actor_target')
    m.critic_local = load_model(filename + '.critic_local')
    m.critic_target = load_model(filename + '.critic_target')
    # The replay buffer is rebuilt empty rather than restored from disk.
    m.replay_buffer = ReplayBuffer(m.buffer_size, m.batch_size)
    return m
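# The loader above implies a matching saver that writes each Keras model to its
# own file and pickles the rest of the agent (the replay buffer is rebuilt empty
# on load, so it need not be saved). A hypothetical counterpart, with the method
# name and attribute handling assumed from load_model rather than shown in the source:
import pickle

def save_model(self, filename):
    """Persist the agent; Keras models are saved separately from the pickle."""
    self.actor_local.save(filename + '.actor_local')
    self.actor_target.save(filename + '.actor_target')
    self.critic_local.save(filename + '.critic_local')
    self.critic_target.save(filename + '.critic_target')
    # Detach unpicklable members, pickle the shell, then reattach them.
    models = (self.actor_local, self.actor_target,
              self.critic_local, self.critic_target, self.replay_buffer)
    self.actor_local = self.actor_target = None
    self.critic_local = self.critic_target = self.replay_buffer = None
    with open(filename + '.ddpg_agent', 'wb') as f:
        pickle.dump(self, f)
    (self.actor_local, self.actor_target,
     self.critic_local, self.critic_target, self.replay_buffer) = models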
def __init__(self, obs_space, act_space, sess, n_agents, name):
    self.obs_space = obs_space
    self.act_space = act_space
    self.n_agents = n_agents
    self.dqn = DQN(sess, obs_space, sup_len, act_space, n_agents, name)
    self.rb = ReplayBuffer(capacity=rb_capacity)
    self.train_cnt = 0
def __init__(self, task, sess, stats):
    self.sess = sess
    self.task = task
    self.stats = stats

    tau = 0.01
    learning_rate = 2e-4
    self.critic_local = QNetwork(sess, task, stats, name='critic_local', hidden_units=64, dropout_rate=0.2)
    self.critic_target = QNetwork(sess, task, stats, name='critic_target', hidden_units=64, dropout_rate=0.2)
    self.actor_local = Policy(sess, task, stats, name='actor_local', hidden_units=32, dropout_rate=0.2)
    self.actor_target = Policy(sess, task, stats, name='actor_target', hidden_units=32, dropout_rate=0.2)

    soft_copy_critic_ops = self._create_soft_copy_op('critic_local', 'critic_target', tau=tau)
    soft_copy_actor_ops = self._create_soft_copy_op('actor_local', 'actor_target', tau=tau)
    self._soft_copy_ops = []
    self._soft_copy_ops.extend(soft_copy_critic_ops)
    self._soft_copy_ops.extend(soft_copy_actor_ops)

    self.gamma = 0.99  # reward discount rate

    # Exploration noise process
    exploration_mu = 0
    exploration_theta = 0.15
    exploration_sigma = 0.15
    self.noise = OUNoise(task.action_size, exploration_mu, exploration_theta, exploration_sigma)

    # Replay memory
    self.batch_size = 256
    self.memory = ReplayBuffer(buffer_size=10000, decay_steps=1000)

    self.sess.run(tf.global_variables_initializer())
def __init__(self, task, params={}):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, params=params)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, params=params)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, params)
    self.critic_target = Critic(self.state_size, self.action_size, params)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15  # same direction
    self.exploration_sigma = 0.001  # random noise
    if params.get("sigma"):
        self.exploration_sigma = params.get("sigma")
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    if params.get("batch_size"):
        self.batch_size = params.get("batch_size")
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.1  # for soft update of target parameters
    # self.gamma = 0.9
    # self.tau = 0.05

    # Statistics
    self.best_score = -np.inf
    self.score = 0
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # AE: Although OUNoise gives me a convenient set of randomness for each of the rotors,
    # AE: I still need to decide how to apply the randomness and how to manage its magnitude
    # AE: (i.e. my explore vs. exploit strategy). These variables do that.
    self.explore_start = 1.0  # AE: exploration probability at start
    self.explore_stop = 0.001  # AE: minimum exploration probability
    self.decay_rate = 0.003  # AE: exponential decay rate for exploration probability
    self.magnitude_coeff = 0.1  # AE: a coefficient to limit randomness

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0  # AE: additive to the noise; mu * theta is directly added
    self.exploration_theta = 0.15  # AE: old noise is multiplied by this
    self.exploration_sigma = 0.2  # AE: new noise is multiplied by this
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    # AE: The learning rate: how much we trust the new values compared to the old ones.
    self.tau = 0.0001  # for soft update of target parameters

    # AE: current reward in learning procedure (for statistics)
    self.score = -np.inf

    # Episode variables
    self.reset_episode()
def __init__(self, env, actor_model, critic_model, gamma=0.99, tau=1e-3,
             critic_lr=1e-3, actor_lr=1e-4, critic_decay=0.):
    # Changed this to use generic env instead of Task
    super().__init__(env)
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.shape[0]
    self.action_low = env.action_space.low
    self.action_high = env.action_space.high

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau  # for soft update of target parameters
    self.critic_lr = critic_lr
    self.actor_lr = actor_lr

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.actor_lr)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
    self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
def setup_replay_buffer_(self):
    """Set up a replay buffer.

    :return: None.
    """
    if self.prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.pr_alpha)
        self.beta_schedule = LinearSchedule(self.max_timesteps, initial_p=self.pr_beta, final_p=self.final_explore)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size)
        self.beta_schedule = None
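# setup_replay_buffer_ pairs the prioritized buffer with a LinearSchedule that
# anneals the importance-sampling exponent beta over max_timesteps. Below is a
# minimal schedule matching the keyword arguments used above; value(t) is the
# conventional baselines-style accessor and is assumed here rather than shown.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        """Linearly interpolate from initial_p to final_p over schedule_timesteps."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)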
def __init__(self, task, seed=None, render=False):
    self.env = task.env
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    self.total_reward = 0
    self.steps = 0
    self.action_repeat = 3
    self.render = render

    # Score tracker and learning parameters
    self.score = -np.inf
    self.best_w = None
    self.best_score = -np.inf
    self.noise_scale = 0.1

    # Counter
    self.count = 0

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(1, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01  # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01  # for soft update of target parameters

    # From policy search
    self.action_range = self.action_high - self.action_low
    # Weights for a simple linear policy (state_space x action_space),
    # scaled so the policy starts producing actions in a decent range
    self.w = np.random.normal(
        size=(self.state_size, self.action_size),
        scale=(self.action_range / (2 * self.state_size)))

    # Score tracker and learning parameters
    self.score = -np.inf
    self.best_w = None
    self.best_score = -np.inf
    self.noise_scale = 0.1

    # Counter
    self.count = 0
def __init__(self, task, verbose=False):
    self.verbose = verbose
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # log_path = '/tmp/logs'
    # self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
    #                                       write_images=False, write_grads=True, write_graph=False)
    # self.callback.set_model(self.critic_local.model)
    # self.writer = tf.summary.FileWriter(log_path)
    # self.learn_counter = 0

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.1
    self.exploration_theta = 0.2
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 512
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.015  # for soft update of target parameters
def __init__(self, actor_model, tgt_actor_model, critic_model, tgt_critic_model,
             action_limits, actor_lr=1e-4, critic_lr=1e-3, critic_decay=1e-2,
             tau=1e-3, gamma=0.99, process=None, rb_size=1e6, minibatch_size=64,
             warmup_episodes=0, episodes_trained=0, train_scores=None,
             test_scores=None, best_train_score=-np.inf):
    # Changed this to use generic env instead of Task
    super().__init__(warmup_episodes, episodes_trained, train_scores,
                     test_scores, best_train_score)
    self.actor = Actor(actor_model, critic_model, lr=actor_lr)
    self.tgt_actor = Actor(tgt_actor_model, tgt_critic_model, lr=actor_lr)
    self.tgt_actor.set_weights(self.actor.get_weights())
    self.critic = Critic(critic_model, lr=critic_lr, decay=critic_decay)
    self.tgt_critic = Critic(tgt_critic_model, lr=critic_lr, decay=critic_decay)
    self.tgt_critic.set_weights(self.critic.get_weights())
    self.action_limits = action_limits
    self.minibatch_size = minibatch_size
    self.buffer = ReplayBuffer(int(rb_size), self.minibatch_size)
    self.tau = tau
    self.gamma = gamma
    self.state_space = K.int_shape(critic_model.inputs[0])[1]
    self.action_space = K.int_shape(critic_model.inputs[1])[1]
    self.learning_phase = 1
    if process is None:
        self.process = OUNoise(size=self.action_space, theta=0.15, mu=0, sigma=0.2)
    else:
        self.process = process
def __init__(self, task): """Initialize DDPG Agent instance.""" self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_high = task.action_high self.action_low = task.action_low # Initializing local and target Actor Models # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_high, self.action_low) self.actor_target = Actor(self.state_size, self.action_size, self.action_high, self.action_low) # Initializing local and target Critic Models # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay Memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.01 # for soft update of target parameters # Additional Parameters self.best_score = -np.inf self.total_reward = 0.0 self.count = 0 self.score = 0
def __init__(self, env_reset, state_size, action_size, action_low, action_high):
    """Params:
        env_reset: callback function to reset environment at end of episode
        state_size: dimension of state space
        action_size: dimension of action space
        action_low: float - minimum action value
        action_high: float - maximum action value
    """
    self.training_steps = 0  # number of training steps run so far
    self.env_reset = env_reset
    self.state_size = state_size
    self.action_size = action_size
    self.action_low = action_low
    self.action_high = action_high

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 1e-3  # for soft update of target parameters
    self.critic_decay = 1e-2  # L2 weight decay for critic (regularization)
    self.critic_lr = 1e-3  # learning rate for critic
    self.critic_alpha = 1e-2  # leaky ReLU alpha for critic
    self.actor_lr = 1e-4  # learning rate for actor
    self.actor_alpha = 1e-2  # leaky ReLU alpha for actor

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                             self.actor_lr, self.actor_alpha)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                              self.actor_lr, self.actor_alpha)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr,
                               self.critic_decay, self.critic_alpha)
    self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr,
                                self.critic_decay, self.critic_alpha)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = int(1e6)
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    # Y.W.: changed sigma (also tried 0.2)
    self.exploration_sigma = 0.3
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    # Y.W.: extended buffer_size from 100000
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    # Y.W.: lowered tau from 0.01
    self.tau = 0.001  # for soft update of target parameters

    # Simple reward cache
    self.total_reward = 0.0
    self.best_total_reward = -np.inf
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01  # for soft update of target parameters

    # ==================== Custom amendments: score tracker and learning parameters ====================
    self.best_score = -np.inf
    self.score = 0
    self.total_reward = 0.0
    self.count = 0
    self.best_position = np.zeros(3)
def __init__(self, action_size):
    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15  # same direction
    self.exploration_sigma = 0.001  # random noise
    self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
    # self.memory = Memory(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.1  # for soft update of target parameters
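# Most of these constructors call ReplayBuffer(buffer_size, batch_size) without
# showing the class. Below is a minimal deque-based version consistent with those
# calls; the add()/sample() names follow the common DDPG convention and are
# assumptions, and batch_size is given a default so the single-argument
# ReplayBuffer(buffer_size) calls elsewhere in this collection also work.
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size=64):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences drop off automatically
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Return a uniform random batch of experiences."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)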
def set_params(self, mu=0.1, sigma=0.1, theta=0.1, buffer_size=1e+8, batch_size=128, gamma=0.99, tau=1e-3):
    self.exploration_mu = mu
    self.exploration_sigma = sigma
    self.exploration_theta = theta
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
    self.buffer_size = int(buffer_size)
    self.batch_size = int(batch_size)
    self.buffer = ReplayBuffer(self.buffer_size)
    self.gamma = gamma
    self.tau = tau
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.001

    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size)

    self.gamma = 0.99
    self.tau = 0.1
    self.learning_rate = 0.0005

    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                             learning_rate=self.learning_rate)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                              learning_rate=self.learning_rate)
    self.critic_local = Critic(self.state_size, self.action_size, learning_rate=self.learning_rate)
    self.critic_target = Critic(self.state_size, self.action_size, learning_rate=self.learning_rate)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model params with local params
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Initialize noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory initialization
    self.buffer_size, self.batch_size = 2000000, 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Initialize algorithm parameters
    self.gamma, self.tau = 0.95, 0.001

    # Initialize scores
    self.score, self.best_score = -np.inf, -np.inf
def __init__(self, task, mu=0.02, theta=0.16, sigma=0.21, buffer=500000, batch=64,
             gamma=0.98, tau=0.02, learning=0.001, dropout=0.2):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                             learning, dropout)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high,
                              learning, dropout)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, learning, dropout)
    self.critic_target = Critic(self.state_size, self.action_size, learning, dropout)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = mu
    self.exploration_theta = theta
    self.exploration_sigma = sigma
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = buffer
    self.batch_size = batch
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau  # for soft update of target parameters

    # Score tracker and learning parameters
    # self.best_w = None
    self.score = 0
    self.best_score = -np.inf
    self.noise_scale = 0.1

    # Episode variables
    self.reset_episode()
def __init__(self, state_size, action_size, num_agents, random_seed,
             lr_actor=1e-4, lr_critic=1e-3, fc1_units=400, fc2_units=300,
             buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3,
             max_norm=1.0, learn_period=20, learn_sampling_num=10):
    """Initialize an Agent object.

    Args:
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        max_norm (float): value of clip_grad_norm for critic optimizer
    """
    super().__init__()
    self.state_size = state_size
    self.num_agents = num_agents
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.max_norm = max_norm
    self.learn_period = learn_period
    self.learn_sampling_num = learn_sampling_num

    # Actor Network (w/ Target Network)
    self.actor_local = DDPGActor(state_size, action_size, random_seed,
                                 fc1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.actor_target = DDPGActor(state_size, action_size, random_seed,
                                  fc1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Critic Network (w/ Target Network)
    self.critic_local = DDPGCritic(state_size, action_size, random_seed,
                                   fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.critic_target = DDPGCritic(state_size, action_size, random_seed,
                                    fcs1_units=fc1_units, fc2_units=fc2_units).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

    # Noise process for actions
    self.exploration_mu = 0
    self.exploration_theta = 0.15  # (Lillicrap et al., 2016)
    self.exploration_sigma = 0.2  # (Lillicrap et al., 2016)
    # self.noise = OUNoise(action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)
    self.noise = OUNoiseMultivariate((num_agents, action_size), random_seed,
                                     mu=self.exploration_mu,
                                     theta=self.exploration_theta,
                                     sigma=self.exploration_sigma)

    # Replay memory
    self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)

    # Discount factor for rewards
    self.gamma = gamma
    # Soft update parameter
    self.tau = tau
    self.batch_size = batch_size
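# The PyTorch agent above keeps local/target pairs with tau = 1e-3 as its "soft
# update parameter". A sketch of the corresponding update follows; the method name
# is an assumption, but the rule theta_target <- tau*theta_local + (1 - tau)*theta_target
# is the standard DDPG soft update the tau comment refers to.
def soft_update(self, local_model, target_model, tau):
    """Blend local network parameters into the target network in place."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)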
def __init__(self, task):
    """
    Params
    ======
        task (object): environment

    References
    ==========
        Continuous Control with Deep Reinforcement Learning (2016)
        Playing CartPole through Asynchronous Advantage Actor Critic (A3C) with tf.keras

        gamma: 0.99
        tau: 0.001
        buffer_size (ReplayBuffer): 1e6
        batch_size (ReplayBuffer): 64
        theta (Ornstein-Uhlenbeck process): 0.15
        sigma (Ornstein-Uhlenbeck process): 0.2
    """
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (policy) model - two copies: one for updating, one for producing targets
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (value) model - two copies: one for updating, one for producing targets
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001  # for soft update of target parameters

    # Reward history
    self.best_avg_score = -np.inf
    self.accumulated_reward = 0
    self.count = 0
def __init__(self, task, prioritized_replay=True):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15  # also tried 0.1
    self.exploration_sigma = 0.2  # also tried 0.1
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    self.buffer_size = 100000
    self.batch_size = 64
    self.prioritized_replay = prioritized_replay
    self.prioritized_replay_alpha = 0.6
    self.prioritized_replay_beta0 = 0.4
    self.prioritized_replay_beta_iters = None
    self.prioritized_replay_eps = 1e-6
    self.max_timesteps = 100000

    # Replay buffer
    if self.prioritized_replay:
        self.memory = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
        if self.prioritized_replay_beta_iters is None:
            self.prioritized_replay_beta_iters = self.max_timesteps
        self.beta_schedule = LinearSchedule(self.prioritized_replay_beta_iters,
                                            initial_p=self.prioritized_replay_beta0,
                                            final_p=1.0)
    else:
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01  # for soft update of target parameters (0.001 per the paper)

    self.td_errors_list = []
    self.actor_loss_list = []
    self.critic_loss_list = []
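# With prioritized_replay enabled, a learn step built on the constructor above
# would anneal beta through beta_schedule and feed TD errors back as fresh
# priorities. Below is a hedged sketch assuming the baselines-style
# PrioritizedReplayBuffer API implied by the calls above: sample(batch_size, beta)
# returning importance-sampling weights and indices, plus update_priorities(idxes,
# priorities). The two method names are hypothetical, not from the source.
import numpy as np

def sample_prioritized(self, t):
    """Sample a batch with annealed beta; returns experiences, IS weights, and indices."""
    beta = self.beta_schedule.value(t)
    states, actions, rewards, next_states, dones, weights, idxes = \
        self.memory.sample(self.batch_size, beta=beta)
    return (states, actions, rewards, next_states, dones), weights, idxes

def update_priorities_from_td(self, idxes, td_errors):
    """After a learning step, refresh sampled priorities from absolute TD errors."""
    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
    self.memory.update_priorities(idxes, new_priorities)
    self.td_errors_list.append(np.mean(np.abs(td_errors)))  # matches the stats lists above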