def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.eps = eps_start
    self.t_step = 0

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size, action_size, random_seed, num_agents):
    """Initialize an Agent object."""
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed, sigma=0.1)

    # Replay buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    self.num_agents = num_agents
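# The agents above and below read several module-level constants (device,
# BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY,
# EPS, eps_start, UPDATE_EVERY, LEARN_NUM) that the snippets do not define.
# A minimal sketch of plausible values is given below; the specific numbers
# are assumptions for illustration, not taken from the original code.
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
EPS = eps_start = 1.0    # initial scale for exploration noise
UPDATE_EVERY = 20        # environment steps between learning phases
LEARN_NUM = 10           # gradient updates per learning phase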
def __init__(
        self,
        state_size=24,
        action_size=2,
        BATCH_SIZE=128,
        BUFFER_SIZE=int(1e6),
        discount_factor=1,
        tau=1e-2,
        noise_coefficient_start=5,
        noise_coefficient_decay=0.99,
        LR_ACTOR=1e-3,
        LR_CRITIC=1e-3,
        WEIGHT_DECAY=1e-3,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
    """
    state_size (int): dimension of each state
    action_size (int): dimension of each action
    BATCH_SIZE (int): minibatch size
    BUFFER_SIZE (int): experience storage length, keep it as high as possible
    discount_factor (float): discount factor for calculating Q_target
    tau (float): interpolation parameter for updating the target network
    noise_coefficient_start (float): value multiplied with the OUNoise sample
    noise_coefficient_decay (float): exponential decay factor for the value multiplied with the OUNoise sample
    LR_ACTOR (float): learning rate for the actor network
    LR_CRITIC (float): learning rate for the critic network
    WEIGHT_DECAY (float): weight decay for the critic network optimizer
    device: "cuda:0" if torch.cuda.is_available() else "cpu"
    """
    self.state_size = state_size
    print(device)
    self.action_size = action_size
    self.BATCH_SIZE = BATCH_SIZE
    self.BUFFER_SIZE = BUFFER_SIZE
    self.discount_factor = discount_factor
    self.tau = tau
    self.noise_coefficient = noise_coefficient_start
    self.noise_coefficient_decay = noise_coefficient_decay
    self.steps_completed = 0
    self.device = device

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size).to(self.device)
    self.actor_target = Actor(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size).to(self.device)
    self.critic_target = Critic(state_size, action_size).to(self.device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise((1, action_size))

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE)
def create_critic(self, alpha, hidden_layers):
    params = {
        'input_shape': self.env.observation_space.shape,
        'output_shape': self.env.action_space.shape,
        'hidden_layers': hidden_layers
    }
    self.critic = OpenStruct()
    self.critic.online = Critic("{}.critic.online".format(self.name), **params)
    self.critic.target = Critic("{}.critic.target".format(self.name), **params)
def test_critic(self):
    Actor_obj = Actor(1, 16, 4)
    Critic_obj = Critic(4, 16, 1)
    # critic_optimizer = optim.SGD(Critic_obj.parameters(), lr=C_learning_rate)

    # Forward propagation: the actor's output feeds the critic
    y = Actor_obj.forward(torch.FloatTensor([1]))
    y_pred = Critic_obj.forward(y)
    self.assertTrue(len(y_pred) == 1)
def __init__(self, n, state_size, action_size, random_seed, params):
    """Initialize an Agent object.

    Params
    ======
        n (int): number of agents in env
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        params (dict): dictionary with hyperparameter name-value pairs
    """
    self.n = n
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.BUFFER_SIZE = params["BUFFER_SIZE"]
    self.BATCH_SIZE = params["BATCH_SIZE"]
    self.GAMMA = params["GAMMA"]
    self.TAU = params["TAU"]
    self.LR_ACTOR = params["LR_ACTOR"]
    self.LR_CRITIC = params["LR_CRITIC"]
    self.WEIGHT_DECAY = params["WEIGHT_DECAY"]
    self.N_UPDATES = params["N_UPDATES"]
    self.UPDATE_STEP = params["UPDATE_STEP"]

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.LR_CRITIC, weight_decay=self.WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(self.n, action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed)

    # Count timesteps
    self.timestep = 0
def __init__(self, init_pose=None, init_velocities=None,
             init_angle_velocities=None, runtime=5., target_pos=None,
             buffer_size=150000, batch_size=32, gamma=0.99,
             replay_alpha=0.5, beta_limit=10000):
    self.task = Task(init_pose, init_velocities, init_angle_velocities,
                     runtime, target_pos)
    self.state_size = self.task.state_size
    self.action_size = self.task.action_size
    self.state = self.task.reset()

    self.memory = PrioritizedReplay(buffer_size, batch_size, replay_alpha, beta_limit)

    self.actor = Actor(self.state_size, self.action_size,
                       self.task.action_low, self.task.action_high)
    self.actor_weights = self.actor.model.trainable_weights
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.task.action_low, self.task.action_high)

    self.critic = Critic(self.state_size, self.action_size)
    self.critic_weights = self.critic.model.trainable_weights
    self.critic_target = Critic(self.state_size, self.action_size)

    self.gamma = gamma
    # how much influence older weights have when updating the target
    self.tau = 0.03

    # noise
    # GENTLE LANDING
    # self.mu = 0
    # self.theta = 0.1
    # self.sigma = 25
    self.mu = 0
    self.theta = 0.1
    self.sigma = 9
    self.noise = Noise(self.action_size, self.mu, self.theta, self.sigma)

    self.episodes = 0
    self.training_step = 0
def __init__(self, env, act_dim, state_dim, goal_dim, act_range,
             buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
    """ Initialization """
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = state_dim + goal_dim
    self.gamma = gamma
    self.lr = lr
    self.tau = tau
    self.env = env

    # Create actor and critic networks
    self.actor_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())

    self.critic_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())

    sync_networks(self.actor_network)
    sync_networks(self.critic_network)

    # Optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=lr)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=lr)

    # Replay buffer
    # self.buffer = MemoryBuffer(buffer_size)
    self.buffer = ReplayMemory(buffer_size)

    # Normalizers (clip between [-5, 5])
    self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)
    self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
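# The HER-style agent above works on a concatenated state-goal input
# (env_dim = state_dim + goal_dim) and normalizes both parts before they
# reach the networks. A minimal preprocessing sketch is given below; the
# function name and the Normalizer.normalize interface are assumptions for
# illustration, not part of the original code.
import numpy as np

def preprocess_inputs(state, goal, state_normalizer, goal_normalizer):
    # Normalize state and goal separately, then concatenate along the
    # feature axis to form the env_dim-sized network input.
    s = state_normalizer.normalize(state)
    g = goal_normalizer.normalize(goal)
    return np.concatenate([s, g], axis=-1)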
def test_actor(self):
    Actor_obj = Actor(1, 16, 4)
    Critic_obj = Critic(4, 16, 1)
    # actor_optimizer = optim.SGD(Actor_obj.parameters(), lr=0.1, momentum=0.5)

    # Forward propagation
    y = Actor_obj.forward(torch.FloatTensor([1]))
    self.assertTrue(len(y) == 4)
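# The two unit tests above assume small MLP Actor/Critic modules built as
# (input_dim, hidden_dim, output_dim). A minimal sketch that would satisfy
# both tests is shown here; the layer sizes and activations are assumptions,
# not the original implementation.
import torch
import torch.nn as nn

class Actor(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, action_dim), nn.Tanh())

    def forward(self, state):
        return self.net(state)

class Critic(nn.Module):
    def __init__(self, action_dim, hidden_dim, value_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(action_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, value_dim))

    def forward(self, action):
        return self.net(action)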
class ActorCritic(object):
    def __init__(self, env):
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01   # learning rate for critic

        num_features = env.observation_space.shape[0]
        # num_features = 14
        num_actions = env.action_space.shape[0]
        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        # we need a good teacher, so the teacher should learn faster than the actor
        self.critic = Critic(sess, n_features=num_features, lr=LR_C)
        sess.run(tf.global_variables_initializer())

    def get_action(self, state, episode_percentage):
        # state = state[0:14]
        # Sometimes pick a random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        # if (episode_percentage > .8):
        #     epsilon = 0.3
        # else:
        epsilon = -1 * (episode_percentage**2) + 1
        # epsilon = -1 * (episode_percentage - 1) ** 3
        # epsilon = -0.8 * (episode_percentage - 1) ** 3 + 0.2
        # epsilon = -0.8 * episode_percentage + 1
        return epsilon

    def update(self, state, action, reward, new_state):
        # state = state[0:14]
        # new_state = new_state[0:14]
        # gradient = grad[r + gamma * V(s_) - V(s)]
        td_error = self.critic.learn(state, reward, new_state)
        # true_gradient = grad[logPi(s,a) * td_error]
        self.actor.learn(state, action, td_error)

    def get_name(self):
        return 'ActorCritic'
def __init__(self, n_state, n_action, a_limit, model_folder=None,
             memory_size=10000, batch_size=32, tau=0.01, gamma=0.99, var=3.0):
    # Record the parameters
    self.n_state = n_state
    self.n_action = n_action
    self.a_limit = a_limit
    self.memory_size = memory_size
    self.model_folder = model_folder
    self.batch_size = batch_size
    self.tau = tau
    self.gamma = gamma
    self.var = var

    # Create the network and related objects
    self.memory = np.zeros(
        [self.memory_size, 2 * self.n_state + self.n_action + 1],
        dtype=np.float32)
    self.memory_counter = 0
    self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
    self.eval_critic = Critic(self.n_state, self.n_action)
    self.target_actor = Actor(self.n_state, self.n_action, self.a_limit,
                              trainable=False)
    self.target_critic = Critic(self.n_state, self.n_action, trainable=False)

    self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
    self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
    self.criterion = nn.MSELoss()

    # Make sure the target networks start with the same parameters as the evaluation networks
    self.hardCopy()
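# The constructor above ends with self.hardCopy(), which is not shown in
# this snippet. A minimal sketch of what such a hard copy typically does
# (plus a matching Polyak soft update) is given below as methods of the same
# class; the softCopy name and the exact form are assumptions, assuming the
# eval_/target_ attribute names used above.
def hardCopy(self):
    # Overwrite the target networks with the evaluation networks' weights.
    self.target_actor.load_state_dict(self.eval_actor.state_dict())
    self.target_critic.load_state_dict(self.eval_critic.state_dict())

def softCopy(self):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    for target_net, eval_net in ((self.target_actor, self.eval_actor),
                                 (self.target_critic, self.eval_critic)):
        for t_param, param in zip(target_net.parameters(), eval_net.parameters()):
            t_param.data.copy_(self.tau * param.data + (1.0 - self.tau) * t_param.data)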
def MountainCar():
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    env.reset()
    env.render()

    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n

    sess = tf.Session()
    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    game = Game(env, actor, critic)
    game.run_mountain_car()
def main(args):
    with tf.device(args['device']):
        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'], args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict, a_dim, noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
def CartPoleAC():
    env1 = gym.make('CartPole-v0')
    # env2 = gym.make('CartPole-v0')
    env1.seed(10)
    # env2.seed(2)
    env1 = env1.unwrapped
    env1.reset()
    # env2 = env2.unwrapped
    # env2.reset()

    n_features = env1.observation_space.shape[0]
    n_actions = env1.action_space.n

    sess = tf.Session()
    actor = Actor(sess, n_features, n_actions, lr=LR_A)
    critic = Critic(sess, n_features, lr=LR_C)
    sess.run(tf.global_variables_initializer())

    g = Game(env1, actor, critic)
    g.run()
class Agent():
    def __init__(self, env, memory_size=1000000, batch=128, sigma=0.2,
                 noise_clip=0.5, gamma=0.99, update_frequency=2):
        self.states = env.observation_space
        self.state_size = env.observation_space.shape[0]
        self.actions = env.action_space
        self.action_size = env.action_space.shape[0]
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size, self.action_size).to(device)
        self.target_critic1 = Critic(self.state_size, self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, step, epsilon=True):
        state = torch.from_numpy(np.asarray(state)).float().to(device)
        action = self.actor.forward(state)
        action = action.detach().cpu().numpy()
        if epsilon:
            noise = np.random.normal(0, 0.1, action.shape[0])
            action += noise
        return action

    def update(self, step):
        state, action, reward, next_state, done = self.memory.sample()

        # Target policy smoothing: add clipped noise to the target action
        next_state_action = self.target_actor(next_state)
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device)
        next_state_action += noise

        # Clipped double-Q target
        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)
        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)
        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # Delayed policy and target updates
        if step % self.update_frequency == 0:
            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
class DDPG:
    def __init__(self, env=gym.make('Pendulum-v0'), s_dim=2, a_dim=1, gamma=0.99,
                 episodes=100, tau=0.001, buffer_size=1e06, minibatch_size=64,
                 actor_lr=0.001, critic_lr=0.001, save_name='final_weights',
                 render=False):
        self.save_name = save_name
        self.render = render
        self.env = env
        self.upper_bound = env.action_space.high[0]
        self.lower_bound = env.action_space.low[0]
        self.EPISODES = episodes
        self.MAX_TIME_STEPS = 200
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.GAMMA = gamma
        self.TAU = tau
        self.buffer_size = buffer_size
        self.minibatch_size = minibatch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.ou_noise = OUNoise(mean=np.zeros(1))

        self.actor = Actor(self.s_dim, self.a_dim).model()
        self.target_actor = Actor(self.s_dim, self.a_dim).model()
        self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.target_actor.set_weights(self.actor.get_weights())

        self.critic = Critic(self.s_dim, self.a_dim).model()
        self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
        self.target_critic = Critic(self.s_dim, self.a_dim).model()
        self.target_critic.set_weights(self.critic.get_weights())

        self.replay_buffer = ReplayBuffer(self.buffer_size)

    def update_target(self):
        # Two methods to update the target networks
        # Method 1:
        self.target_actor.set_weights(
            np.array(self.actor.get_weights()) * self.TAU +
            np.array(self.target_actor.get_weights()) * (1 - self.TAU))
        self.target_critic.set_weights(
            np.array(self.critic.get_weights()) * self.TAU +
            np.array(self.target_critic.get_weights()) * (1 - self.TAU))
        """
        # Method 2:
        new_weights = []
        target_variables = self.target_critic.weights
        for i, variable in enumerate(self.critic.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_critic.set_weights(new_weights)

        new_weights = []
        target_variables = self.target_actor.weights
        for i, variable in enumerate(self.actor.weights):
            new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU))
        self.target_actor.set_weights(new_weights)
        """

    def train_step(self):
        s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch(
            self.minibatch_size)
        """
        mu_prime = self.target_actor(s2_batch)              # predictions by target actor
        Q_prime = self.target_critic([s2_batch, mu_prime])  # predictions by target critic
        y = np.zeros_like(Q_prime)
        for k in range(self.minibatch_size):
            if d_batch[k]:
                y[k] = r_batch[k]
            else:
                y[k] = r_batch[k] + self.GAMMA * Q_prime[k]
        # y = r_batch + gamma * Q_prime

        checkpoint_path = "training/cp_critic.ckpt"
        checkpoint_dir = os.path.dirname(checkpoint_path)
        # Create a callback that saves the model's weights
        cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir,
                                                          save_weights_only=True,
                                                          verbose=1)
        self.critic.train_on_batch([s_batch, a_batch], y)
        # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1])

        with tf.GradientTape(persistent=True) as tape:
            a = self.actor(s_batch)
            tape.watch(a)
            theta = self.actor.trainable_variables
            q = self.critic([s_batch, a])
        dq_da = tape.gradient(q, a)
        da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da)
        self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables))
        """
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(s2_batch)
            y = r_batch + self.GAMMA * self.target_critic([s2_batch, target_actions])
            critic_value = self.critic([s_batch, a_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_opt.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(s_batch)
            q = self.critic([s_batch, actions])  # critic_value
            # Use `-value` because we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(q)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_opt.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

        self.update_target()
        return np.mean(q)

    def policy(self, s):
        # since batch normalization is done on self.actor, it is multiplied with upper_bound
        if s.ndim == 1:
            s = s[None, :]
        action = self.actor(s) * self.upper_bound + self.ou_noise()
        action = np.clip(action, self.lower_bound, self.upper_bound)
        return action

    def train(self):
        # To store the reward history of each episode
        ep_reward_list = []
        # To store the average reward history of the last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler() as interruption:  # to properly save even if Ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                # an env created with "gym.make" terminates after 200 steps
                for t in range(self.MAX_TIME_STEPS):
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )), r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of the last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)

    def save_weights(self, save_name='final_weights'):
        self.actor.save_weights("training/%s_actor.h5" % save_name)
        self.critic.save_weights("training/%s_critic.h5" % save_name)
        self.target_actor.save_weights("training/%s_target_actor.h5" % save_name)
        self.target_critic.save_weights("training/%s_target_critic.h5" % save_name)
        # to save in the TensorFlow checkpoint format as well
        self.target_actor.save_weights('training/%s_actor_weights' % save_name,
                                       save_format='tf')
        self.target_critic.save_weights('training/%s_critic_weights' % save_name,
                                        save_format='tf')
        print('Training completed and network weights saved')

    # For evaluation of the learned policy
    def collect_data(self, act_net, iterations=1000):
        a_all, states_all = [], []
        obs = self.env.reset()
        for t in range(iterations):
            obs = np.squeeze(obs)
            if obs.ndim == 1:
                a = act_net(obs[None, :])
            else:
                a = act_net(obs)
            obs, _, done, _ = self.env.step(a)
            states_all.append(obs)
            a_all.append(a)
            # self.env.render()  # Uncomment this to see the actor in action (but not in a notebook)
            # if done:
            #     break
        states = np.squeeze(np.array(states_all))  # cos(theta), sin(theta), theta_dot
        a_all = np.squeeze(np.array(a_all))
        return states, a_all

    def plot_results(self, avg_reward=None, actions=None, states=None,
                     train=False, title=None):
        # An additional way to visualize the average episode rewards
        if train:
            plt.figure()
            plt.plot(avg_reward)
            plt.xlabel("Episode")
            plt.ylabel("Avg. Episodic Reward")
            plt.show()
        else:  # works only for the Pendulum-v0 environment
            fig, ax = plt.subplots(3, sharex=True)
            theta = np.arctan2(states[:, 1], states[:, 0])
            ax[0].set_ylabel('u')
            ax[0].plot(np.squeeze(actions))
            ax[1].set_ylabel(u'$\\theta$')
            ax[1].plot(theta)
            # ax[1].plot(states[:, 0])
            ax[2].set_ylabel(u'$\\omega$')
            ax[2].plot(states[:, 2])  # angular velocity
            fig.canvas.set_window_title(title)
class DDPG:
    def __init__(
        self,
        obs_dim,
        action_dim,
        action_gain,
        actor_learning_rate=0.0001,
        critic_learning_rate=0.001,
        gamma=0.99,
        tau=0.001,
    ):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.tau = tau

        # make main networks
        self.actor = Actor(obs_dim, action_dim, action_gain, actor_learning_rate)
        self.critic = Critic(obs_dim, action_dim, critic_learning_rate)

        # make target networks
        self.target_actor = Actor(obs_dim, action_dim, action_gain)
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic = Critic(obs_dim, action_dim)
        self.target_critic.model.set_weights(self.critic.model.get_weights())

    def act(self, obs):
        return self.actor.act(obs)[0]

    @tf.function
    def update_networks(self, batch):
        """ runs all updates from the provided training batch """
        s, a, r, t, next_s = (tf.cast(i, tf.float32) for i in batch)
        self.update_critic(s, a, r, next_s)
        self.update_actor(s, a, r, next_s)
        self.update_target(self.actor.model, self.target_actor.model)
        self.update_target(self.critic.model, self.target_critic.model)

    @tf.function
    def update_critic(self, s, a, r, next_s):
        """ minimize td-loss from target """
        # td estimate based on the targets' behavior
        target_future_actions = self.target_actor.act(next_s)
        target_future_qs = self.target_critic.estimate_q(next_s, target_future_actions)
        target_current_qs = r + self.gamma * target_future_qs

        # update main critic
        main_current_qs = self.critic.model([s, a])
        loss = keras.losses.mse(target_current_qs, main_current_qs)
        model_vars = self.critic.model.trainable_variables
        dloss_dcrit = tf.gradients(loss, model_vars)
        self.critic.optimizer.apply_gradients(zip(dloss_dcrit, model_vars))

    @tf.function
    def update_actor(self, s, a, r, next_s):
        """ dq_dtheta = dq_da * da_dtheta """
        # first, find dq_da
        proposed_a = self.actor.model(s)
        q = self.critic.model([s, proposed_a])
        dq_da = tf.gradients(q, proposed_a)[0]

        # second, find dq_da * da_dtheta
        model_vars = self.actor.model.trainable_variables
        dq_dtheta = tf.gradients(proposed_a, model_vars, grad_ys=-dq_da)

        # update the model
        self.actor.optimizer.apply_gradients(zip(dq_dtheta, model_vars))

    @tf.function
    def update_target(self, main_model, target_model):
        """ target = tau*main + (1-tau)*target """
        for model_weight, target_weight in zip(main_model.weights, target_model.weights):
            target_weight.assign(self.tau * model_weight +
                                 (1 - self.tau) * target_weight)

    def save_model(self, save_dir):
        """ saves the main and target networks """
        self.actor.model.save_weights(os.path.join(save_dir, "actor"))
        self.critic.model.save_weights(os.path.join(save_dir, "critic"))
        self.target_actor.model.save_weights(os.path.join(save_dir, "target_actor"))
        self.target_critic.model.save_weights(os.path.join(save_dir, "target_critic"))
class DDPG: """docstring for DDPG""" def __init__(self, env, time_steps, hidden_dim): self.name = 'DDPG' # name for uploading results self.scale = env.asset self.unit = env.unit self.seed = env.rd_seed self.time_dim = time_steps self.state_dim = env.observation_space.shape[1] self.action_dim = env.action_space.shape[0] self.batch_size = 64 self.memory_size = self.time_dim + self.batch_size * 10 self.start_size = self.time_dim + self.batch_size * 2 # Initialise actor & critic networks self.actor_network = Actor(self.time_dim, self.state_dim, self.action_dim, hidden_dim) self.critic_network = Critic(self.time_dim, self.state_dim, self.action_dim, hidden_dim) # Initialize replay buffer self.replay_state = torch.zeros( (self.start_size - 1, 3, self.state_dim), device=cuda) self.replay_next_state = torch.zeros( (self.start_size - 1, 3, self.state_dim), device=cuda) self.replay_action = torch.zeros( (self.start_size - 1, 1, self.state_dim), device=cuda) self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim) self.initial() def initial(self): self.steps = 0 self.action = torch.zeros(self.action_dim, device=cuda) self.replay_state = torch.zeros( (self.start_size - 1, 3, self.state_dim), device=cuda) self.replay_next_state = torch.zeros( (self.start_size - 1, 3, self.state_dim), device=cuda) self.replay_action = torch.zeros((self.start_size - 1, self.state_dim), device=cuda) self.replay_reward = torch.zeros((self.start_size - 1, ), device=cuda) def train_on_batch(self): # Sample a random minibatch of N transitions from replay buffer sample = torch.randint(self.time_dim, self.replay_reward.shape[0], [self.batch_size], device=cuda) index = torch.stack([sample - i for i in range(self.time_dim, 0, -1) ]).t().reshape(-1) state_data = min_max_scale(self.replay_state[:, 0, :]) amount_data = min_max_scale(self.replay_state[:, 2, :]) next_state_data = min_max_scale(self.replay_next_state[:, 0, :]) next_amount_data = min_max_scale(self.replay_next_state[:, 2, :]) state_batch = torch.index_select(state_data, 0, index).view(self.batch_size, -1) amount_data = torch.index_select(amount_data, 0, sample).view(self.batch_size, -1) state_batch = torch.cat([state_batch, amount_data], dim=1) next_state_batch = torch.index_select(next_state_data, 0, index).view(self.batch_size, -1) next_amount_data = torch.index_select(next_amount_data, 0, sample).view( self.batch_size, -1) next_state_batch = torch.cat([next_state_batch, next_amount_data], dim=1) action_batch = torch.index_select(self.replay_action / self.unit, 0, sample) reward_batch = torch.index_select(self.replay_reward, 0, sample) # Calculate y_batch next_action_batch = self.actor_network.target_action(next_state_batch) q_batch = self.critic_network.target_q(next_action_batch, next_state_batch) y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1) # train actor-critic by target loss self.actor_network.train( self.critic_network.train(y_batch, action_batch, state_batch)) # Update target networks by soft update self.actor_network.update_target() self.critic_network.update_target() def perceive(self, state, action, reward, next_state, done): if self.steps < self.start_size - 1: self.replay_state[self.steps] = state self.replay_next_state[self.steps] = next_state self.replay_action[self.steps] = action self.replay_reward[self.steps] = reward else: if self.steps >= 
self.memory_size: self.replay_state = self.replay_state[1:] self.replay_next_state = self.replay_next_state[1:] self.replay_action = self.replay_action[1:] self.replay_reward = self.replay_reward[1:] self.replay_state = torch.cat( (self.replay_state, state.unsqueeze(0)), dim=0) self.replay_next_state = torch.cat( (self.replay_next_state, next_state.unsqueeze(0)), dim=0) self.replay_action = torch.cat( (self.replay_action, action.unsqueeze(0)), dim=0) self.replay_reward = torch.cat( (self.replay_reward, reward.unsqueeze(0)), dim=0) self.steps += 1 def act(self, next_state, portfolio): if self.steps > self.start_size: next_state_data = min_max_scale( self.replay_next_state[:, 0, :])[-self.time_dim:].view(1, -1) next_amount_data = min_max_scale( self.replay_next_state[:, 2, :])[-1].view(1, -1) next_state_data = torch.cat([next_state_data, next_amount_data], dim=1) self.train_on_batch() allocation = self.actor_network.target_action( next_state_data).data.view(-1) allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda) allocation[allocation < 0] = 0 allocation /= sum(allocation) allocation = torch.floor(portfolio * allocation / next_state[1, :] / self.unit) * self.unit self.action = allocation return self.action.clone()
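# The portfolio DDPG class above calls a min_max_scale helper that is not
# shown in the snippet. A plausible sketch is given below (an assumption,
# not the original implementation): it rescales each column of a 2-D tensor
# to the [0, 1] range.
def min_max_scale(x, eps=1e-8):
    # Column-wise min-max normalization of a 2-D tensor.
    x_min = x.min(dim=0, keepdim=True).values
    x_max = x.max(dim=0, keepdim=True).values
    return (x - x_min) / (x_max - x_min + eps)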
class Agent():
    def __init__(self, state_size, action_size, action_sigma=0.1, memory_size=1000000,
                 batch=128, sigma=0.2, noise_clip=0.5, gamma=0.99, update_frequency=2,
                 seed=0):
        '''
        TD3 Agent
        :param state_size: state dimension
        :param action_size: action dimension
        :param action_sigma: standard deviation of the noise added to the action
        :param memory_size:
        :param batch:
        :param sigma: standard deviation of the noise added to the target function
                      (Section 5.3 of the TD3 paper)
        :param noise_clip: how much noise to allow
        :param gamma:
        :param update_frequency:
        :param seed:
        '''
        self.state_size = state_size
        self.action_size = action_size
        self.action_sigma = action_sigma
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.gamma = gamma
        self.update_frequency = update_frequency
        self.seed = seed

        self.actor = Actor(self.state_size, self.action_size).to(device)
        self.critic0 = Critic(self.state_size, self.action_size).to(device)
        # second critic as described in the paper https://arxiv.org/pdf/1802.09477.pdf
        self.critic1 = Critic(self.state_size, self.action_size).to(device)

        self.target_actor = Actor(self.state_size, self.action_size).to(device)
        self.target_critic0 = Critic(self.state_size, self.action_size).to(device)
        # second target critic as described in the paper
        self.target_critic1 = Critic(self.state_size, self.action_size).to(device)

        self.memory = ReplayBuffer(memory_size, batch, seed=seed)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.critic0_optimizer = Adam(self.critic0.parameters(), lr=VALUE0_LR)
        self.critic1_optimizer = Adam(self.critic1.parameters(), lr=VALUE1_LR)

        self.soft_update(self.actor, self.target_actor, 1)
        self.soft_update(self.critic0, self.target_critic0, 1)
        self.soft_update(self.critic1, self.target_critic1, 1)

    def act(self, state, epsilon=True):
        state = torch.from_numpy(np.asarray(state)).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor.forward(state).cpu().data.numpy()
        self.actor.train()
        if epsilon:
            # inject exploration noise
            noise = np.random.normal(0, self.action_sigma, action.shape[0])
            action += noise
        return action

    def update(self, step):
        '''
        https://arxiv.org/pdf/1802.09477.pdf
        Very similar to the typical DDPG update, except that
        1) there are two critics to update,
        2) the minimum of the two critics' outputs is used as the target,
        3) the target action is smoothed with clipped noise (Section 5.3 of the paper),
        4) the actor update is delayed by a certain number of steps.
        :param step: how often to update the actor
        :return:
        '''
        state, action, reward, next_state, done = self.memory.sample()

        # ---------------------------- update critics ---------------------------- #
        # Get predicted next-state actions and Q values from the target models
        next_state_action = self.target_actor(next_state)
        # sample random noise for target policy smoothing
        noise = Normal(torch.zeros(self.action_size), self.sigma).sample()
        noise = torch.clamp(noise, -self.noise_clip, self.noise_clip).to(device)
        next_state_action += noise

        target_Q0 = self.target_critic0(next_state, next_state_action)
        target_Q1 = self.target_critic1(next_state, next_state_action)
        target_Q = torch.min(target_Q0, target_Q1)
        target_value = reward + self.gamma * target_Q * (1.0 - done)

        expected_Q0 = self.critic0(state, action)
        expected_Q1 = self.critic1(state, action)
        critic_0_loss = F.mse_loss(expected_Q0, target_value.detach())
        critic_1_loss = F.mse_loss(expected_Q1, target_value.detach())

        self.critic0_optimizer.zero_grad()
        critic_0_loss.backward()
        self.critic0_optimizer.step()

        self.critic1_optimizer.zero_grad()
        critic_1_loss.backward()
        self.critic1_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # As described in the paper, the actor network update is delayed.
        if step % self.update_frequency == 0:
            actor_loss = self.critic0.forward(state, self.actor.forward(state))
            actor_loss = -actor_loss.mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks -------------------- #
            self.soft_update(self.critic0, self.target_critic0, TRANSFER_RATE)
            self.soft_update(self.critic1, self.target_critic1, TRANSFER_RATE)
            self.soft_update(self.actor, self.target_actor, TRANSFER_RATE)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add_to_memory(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
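# Both TD3 agents above rely on module-level constants that are not defined
# in these snippets. A plausible sketch of those globals is given below; the
# specific values are assumptions for illustration.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ACTOR_LR = 1e-3        # actor learning rate
VALUE0_LR = 1e-3       # first critic learning rate
VALUE1_LR = 1e-3       # second critic learning rate
TRANSFER_RATE = 5e-3   # tau used for the soft target updates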
class ActorCriticExperienceReplay(object):
    def __init__(self, env):
        self.MEMORY_SIZE = 200
        self.BATCH_SIZE = 10
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01   # learning rate for critic

        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]
        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        # we need a good teacher, so the teacher should learn faster than the actor
        self.critic = Critic(sess, n_features=num_features, lr=LR_C)
        sess.run(tf.global_variables_initializer())

        self.replay_memory = []

    def get_action(self, state, episode_percentage):
        # Sometimes pick a random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        return -1 * (episode_percentage**2) + 1
        # return -1 * (episode_percentage - 1) ** 3

    def update(self, state, action, reward, new_state):
        # gradient = grad[r + gamma * V(s_) - V(s)]
        td_error = self.critic.learn(state, reward, new_state)
        # true_gradient = grad[logPi(s,a) * td_error]
        self.actor.learn(state, action, td_error)

        # Add to replay memory
        self.replay_memory.append((state, action, reward, new_state))
        if len(self.replay_memory) >= self.MEMORY_SIZE:
            self.replay_memory.pop(0)

        # Learn from replayed memories
        if np.random.random() < 0.5 and len(self.replay_memory) > self.BATCH_SIZE:
            minibatch = random.sample(self.replay_memory, self.BATCH_SIZE)
            for (batch_state, batch_action, batch_reward, batch_new_state) in minibatch:
                td_error = self.critic.learn(batch_state, batch_reward, batch_new_state)
                self.actor.learn(batch_state, batch_action, td_error)

    def get_name(self):
        return 'ActorCritic_ExperienceReplay'
class DDPGHedgingAgent:
    """DDPG agent interacting with the environment.

    Attributes:
        env (gym.Env): openAI Gym environment
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training the actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training the critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): number of initial random episodes
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporary storage for the recent transition
        total_step (int): total step count
        is_test (bool): flag to show the current mode (train / test)
    """

    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 1e4,
                 name_cases='myproject'):
        """Initialize."""
        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0

        # mode: train / test
        self.is_test = False

        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """Carry out several random episodes in the environment to initially
        fill up the replay buffer with experiences.

        Args:
            eps: number of random episodes to populate the buffer with
        """
        if not self.is_test:
            print("Populate Replay Buffer... ")
            kbar = pkbar.Kbar(target=eps, width=20)
            state = self.env.reset()
            for i in range(eps):
                while True:
                    # Sample an action from the action space
                    selected_action = self.env.action_space.sample()
                    # selected_action = 0
                    noise = self.noise.sample()
                    selected_action = np.clip(selected_action + noise, -1.0, 1.0)

                    next_state, reward, done, _ = self.env.step(selected_action)
                    self.transition = [state, selected_action, reward, next_state, int(done)]
                    self.memory.append(Experience(*self.transition))

                    state = next_state
                    if done:
                        state = self.env.reset()
                        break
                kbar.add(1)
            # self.scaler = self.memory.standar_scaler()

    @torch.no_grad()
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # requires self.scaler to have been fit (see the commented-out line in populate)
        state_s = self.scaler.transform([state])
        selected_action = self.actor(torch.FloatTensor(state_s).to(self.device)).item()

        # add noise for exploration during training
        if not self.is_test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        self.transition = [state, selected_action]
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action)
        if not self.is_test:
            self.transition += [reward, next_state, int(done)]
            self.memory.append(Experience(*self.transition))
        return next_state, reward, done

    def update_model(self) -> torch.Tensor:
        """Update the model by gradient descent.

        Change the loss into mean-variance optimization.
        """
        device = self.device  # for shortening the following lines

        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size, self.device)
        state = torch.FloatTensor(self.scaler.transform(state)).to(device)
        next_state = torch.FloatTensor(self.scaler.transform(next_state)).to(device)
        # state = state.to(device)
        # next_state = next_state.to(device)
        action = action.to(device)
        reward = reward.to(device)
        done = done.to(device)

        masks = 1 - done
        next_action = self.actor_target(next_state)
        next_value = self.critic_target(next_state, next_action)
        curr_return = reward.reshape(-1, 1) + self.gamma * next_value * masks.reshape(-1, 1)

        # train critic
        values = self.critic(state, action)
        critic_loss = F.mse_loss(values, curr_return)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Freeze the Q-network so we don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.critic.parameters():
            p.requires_grad = False

        # train actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for p in self.critic.parameters():
            p.requires_grad = True

        # target update
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # if the episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0
                self._plot(self.total_step, scores, actor_losses, critic_losses)

            # if training is ready
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)
            state = next_state
            score += reward

        self.env.close()
        return score

    def _target_soft_update(self):
        """Soft-update: target = tau*local + (1-tau)*target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(self, frame_idx: int, scores: List[float],
              actor_losses: List[float], critic_losses: List[float]):
        """Plot the training progress."""
        self.wandb.log({
            'frame': frame_idx,
            'score': scores[-1],
            'actor_loss': actor_losses[-1],
            'critic_loss': critic_losses[-1]
        })
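# DDPGHedgingAgent.select_action and update_model use self.scaler, but the
# line that would create it in populate() is commented out. A minimal sketch
# of fitting a scaler from the replay buffer with scikit-learn is shown
# below; the buffer access pattern (iterating Experience objects with a
# .state field) is an assumption for illustration.
from sklearn.preprocessing import StandardScaler
import numpy as np

def fit_state_scaler(memory) -> StandardScaler:
    # Fit a StandardScaler on all states currently stored in the buffer.
    states = np.array([exp.state for exp in memory])
    scaler = StandardScaler()
    scaler.fit(states)
    return scaler

# e.g. at the end of populate():  self.scaler = fit_state_scaler(self.memory)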
class DDPGAGENT:
    def __init__(self, state_size, action_size, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPS

        # --- actor ----- #
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=1e-3)

        # ---- critic ----- #
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=1e-3, weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
        # self.timestep = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add_experience(state, action, reward, next_state, done)
        # self.timestep = (self.timestep + 1) % UPDATE_EVERY
        if len(self.memory) > BATCH_SIZE and timestep % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                xp = self.memory.sample()
                self.learn(xp, GAMMA)  # GAMMA value 0.99

    def act(self, state, noise_accumulate=True):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        # Epsilon-scaled exploration noise
        if noise_accumulate:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_internal_state()

    def learn(self, xp, gamma):
        states, actions, rewards, next_states, dones = xp

        # --- critic update: compute the TD target and minimize the MSE loss
        actions_nxt = self.actor_target(next_states)
        q_target_next = self.critic_target(next_states, actions_nxt)
        q_target = rewards + (gamma * q_target_next * (1 - dones))
        q_expected = self.critic_local(states, actions)

        # MSE loss
        critic_loss = F.mse_loss(q_expected, q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip the gradient norm of the critic parameters
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- actor update: maximize the critic's value of the actor's actions
        actor_predicted = self.actor_local(states)
        actor_loss = -self.critic_local(states, actor_predicted).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        self.epsilon -= 1e-6
        self.noise.reset_internal_state()

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
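# Nearly every agent in this section constructs an OUNoise exploration
# process, but none of the snippets define it. A minimal Ornstein-Uhlenbeck
# noise sketch is given below; the constructor signature used here
# (size, seed, mu, theta, sigma) is an assumption and differs slightly
# between the implementations above.
import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1);  x <- x + dx
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state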