def do_q_learning(env, reward_function, train_episodes, figure=False):
    alpha = 0.01
    gamma = 0.9
    epsilon = 0.1
    policy = DQNPolicy(env, lr=alpha, gamma=gamma, input=2, output=4)  # 4 output actions: up, right, down, left
    replay_buffer = ReplayBuffer()

    # Play with a random policy and see
    # run_current_policy(env.env, policy)

    agg_interval = 100
    avg_history = {'episodes': [], 'timesteps': [], 'reward': []}

    # Train the network to predict actions for each of the states
    for episode_i in range(train_episodes):
        episode_timestep = 0
        episode_reward = 0.0
        env.__init__()  # todo: the first current state should be 0
        cur_state = env.cur_state
        counter = 0
        done = False
        while not done:
            # Let each episode be of 30 steps
            counter += 1
            done = counter >= 30  # todo: check if this line is working

            action = policy.select_action(cur_state.reshape(1, -1), epsilon)

            # take action in the environment
            next_state = env.step(action)
            reward = reward_function(next_state)

            # add the transition to the replay buffer
            replay_buffer.add(cur_state, action, next_state, reward, done)

            # sample a minibatch of transitions from the replay buffer
            # the sampling is done every timestep and not every episode
            sample_transitions = replay_buffer.sample()

            # update the policy using the sampled transitions
            policy.update_policy(**sample_transitions)

            episode_reward += reward
            episode_timestep += 1
            cur_state = next_state

        avg_history['episodes'].append(episode_i + 1)
        avg_history['timesteps'].append(episode_timestep)
        avg_history['reward'].append(episode_reward)
        learning_policy_progress.update()

    if figure:
        plt.plot(avg_history['episodes'], avg_history['reward'])
        plt.title('Reward')
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.show()

    return policy.q_model
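# do_q_learning only assumes that ReplayBuffer exposes add() and a sample() whose result can
# be splatted into policy.update_policy. A minimal sketch of such a buffer, assuming uniform
# sampling and keyword names matching an update_policy(cur_states, actions, next_states,
# rewards, dones) signature; this is an illustration, not the original implementation:
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal uniform-sampling replay buffer (sketch)."""

    def __init__(self, capacity=10000, batch_size=32):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def add(self, cur_state, action, next_state, reward, done):
        self.buffer.append((cur_state, action, next_state, reward, done))

    def sample(self):
        # Sample at most batch_size transitions, without replacement.
        n = min(self.batch_size, len(self.buffer))
        batch = random.sample(self.buffer, n)
        cur_states, actions, next_states, rewards, dones = map(np.array, zip(*batch))
        # Keys must match the parameter names of policy.update_policy(**sample_transitions).
        return {'cur_states': cur_states, 'actions': actions, 'next_states': next_states,
                'rewards': rewards, 'dones': dones}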
def test_buffer_replace():
    shape = (2, 2)
    capacity = 2
    buffer = ReplayBuffer(capacity)
    for i in range(10):
        x = onp.ones(shape) * i
        a, r = i, i
        discount = 1.0
        timestep = dm_env.TimeStep(dm_env.StepType.FIRST, r, discount, x)
        buffer.add(timestep, a, timestep)
        logging.debug("i: {}, r: {}, len(buffer): {}".format(i, r, len(buffer)))

    # make sure the buffer recycles if adding more elements than its capacity
    assert len(buffer) == capacity
    # make sure the oldest elements are recycled
    assert onp.array_equal(
        onp.array([buffer[i].s for i in range(len(buffer))]),
        onp.array([[[8.0, 8.0], [8.0, 8.0]], [[9.0, 9.0], [9.0, 9.0]]], dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].r for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )
    assert onp.array_equal(
        onp.array([buffer[i].a for i in range(len(buffer))]),
        onp.array([8.0, 9.0], dtype=onp.float32),
    )

    # try sampling with n < len(buffer)
    batch = buffer.sample(1)
    assert len(batch[0]) == 1
    logging.debug(batch)

    # try sampling with n == len(buffer)
    batch = buffer.sample(2)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)

    # try sampling with n > len(buffer)
    batch = buffer.sample(3)
    assert len(batch[0]) == len(buffer)
    logging.debug(batch)
    return
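# The test above assumes a fixed-capacity, indexable buffer whose entries expose .s, .a and .r,
# whose add() takes (timestep, action, new_timestep) built from dm_env.TimeStep, and whose
# sample(n) clamps n to the buffer length. A sketch consistent with those assertions; the field
# names and internals are inferred from the test, not taken from the real implementation:
import collections
import random

import numpy as onp

# Transition fields as the test accesses them: buffer[i].s, buffer[i].a, buffer[i].r, buffer[i].s_next
Transition = collections.namedtuple("Transition", ["s", "a", "r", "s_next"])


class ReplayBuffer:
    """Fixed-capacity ring buffer over (s, a, r, s') transitions (sketch)."""

    def __init__(self, capacity):
        self._storage = collections.deque(maxlen=capacity)

    def add(self, timestep, action, new_timestep):
        # dm_env.TimeStep carries the observation in .observation and the reward in .reward.
        self._storage.append(
            Transition(onp.asarray(timestep.observation, dtype=onp.float32),
                       action,
                       new_timestep.reward,
                       onp.asarray(new_timestep.observation, dtype=onp.float32)))

    def __len__(self):
        return len(self._storage)

    def __getitem__(self, idx):
        return self._storage[idx]

    def sample(self, n):
        # Clamp n so oversampling returns at most the whole buffer, as the test expects.
        batch = random.sample(self._storage, min(n, len(self._storage)))
        return tuple(onp.stack(column) for column in zip(*batch))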
from numpy import savetxt

USE_CUDA = torch.cuda.is_available()
from dqn import QLearner, compute_td_loss, ReplayBuffer

env_id = "PongNoFrameskip-v4"  # the environment that will be played
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000  # total number of frames to learn from
batch_size = 32       # number of samples provided to the model per update
gamma = 0.99          # discount factor on future rewards
record_idx = 10000

replay_initial = 10000  # number of frames to collect in the buffer before learning starts
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(
    torch.load("model_pretrained.pth", map_location='cpu'))  # load the pretrained model

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)  # target network
target_model.copy_from(model)

optimizer = optim.Adam(model.parameters(), lr=0.0001)  # set the learning rate and optimizer

if USE_CUDA:
    model = model.cuda()  # send the models to the GPU
    target_model = target_model.cuda()
    print("Using cuda")
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
from dqn import QLearner, compute_td_loss, ReplayBuffer

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000
batch_size = 32
gamma = 0.99
record_idx = 10000

replay_initial = 10000
replay_buffer = ReplayBuffer(100000)
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

optimizer = optim.Adam(model.parameters(), lr=0.00001)

if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
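# The three constants above are typically consumed by an exponentially decaying exploration
# schedule. A sketch of such a schedule; the name epsilon_by_frame is an assumption, not
# necessarily what this script calls it:
import math


def epsilon_by_frame(frame_idx):
    # Decays epsilon from epsilon_start towards epsilon_final with time constant epsilon_decay.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)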
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each hidden layer
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size,
                                   seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Calculate target value
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_dash = self.qnetwork_target(next_states)
            Q_dash_max = torch.max(Q_dash, dim=1, keepdim=True)[0]
            y = rewards + gamma * Q_dash_max * (1 - dones)
        self.qnetwork_target.train()

        # Predict Q-value
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error
        loss = torch.sum((y - y_pred) ** 2)

        # Optimize
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
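# The agent above constructs QNetwork(state_size, action_size, seed, hidden_layers), but the
# network itself is defined elsewhere. A minimal fully connected sketch consistent with that
# constructor; an illustrative assumption, not the original model:
import torch
import torch.nn as nn
import torch.nn.functional as F


class QNetwork(nn.Module):
    """Simple MLP mapping a state to one Q-value per action (sketch)."""

    def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64]):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        sizes = [state_size] + list(hidden_layers)
        self.hidden = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:]))
        self.out = nn.Linear(sizes[-1], action_size)

    def forward(self, state):
        x = state
        for layer in self.hidden:
            x = F.relu(layer(x))
        return self.out(x)  # one Q-value per action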
total_step, total_reward))

# In[]:
cp_alpha = 0.001
cp_gamma = 0.95
cp_epsilon = 0.05
cp_avg_history = {'episodes': [], 'timesteps': [], 'reward': []}
agg_interval = 1
avg_reward = 0.0
avg_timestep = 0

# initialize policy and replay buffer
cp_policy = DQNPolicy(cp_env, lr=cp_alpha, gamma=cp_gamma)
replay_buffer = ReplayBuffer()
cp_start_episode = 0

# Play with a random policy and see
# run_current_policy(cp_env.env, cp_policy)

cp_train_episodes = 200
pbar_cp = tqdm(total=cp_train_episodes)

# In[]:
# Train the network to predict actions for each of the states
for episode_i in range(cp_start_episode, cp_start_episode + cp_train_episodes):
    episode_timestep = 0
    episode_reward = 0.0
update_episode = 5
weight_decay = 10
avg_history = {'episodes': [], 'timesteps_unweighted': [], 'timesteps_weighted': [],
               'unweighted_reward': [], 'weighted_reward': [],
               'loss_unweighted': [], 'loss_weighted': []}
agg_interval = 10

# initialize policies and replay buffers
# TODO: check whether a smaller hidden size performs better
policy = DQN(input_size=env.observation_space.shape[0],
             output_size=env.action_space.n,
             hidden_size=24)
policy_weighted = DQN(input_size=env.observation_space.shape[0],
                      output_size=env.action_space.n,
                      hidden_size=24)
optimizer_weighted = Adam(policy_weighted.parameters(), lr=learning_rate_weighted)
replay_buffer = ReplayBuffer()
replay_buffer_weighted = ReplayBuffer()
optimizer = Adam(policy.parameters(), lr=learning_rate_unweighted)
start_episode = 0

# Play with a random policy and see
# run_current_policy(env.env, policy)

train_episodes = 2000

# In[]:
# Update the policy with a TD(0) update
def update_policy(cur_states, actions, next_states, rewards, dones):
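# The body of update_policy is truncated at this point. For reference, a generic TD(0)
# Q-learning update consistent with that signature could look like the sketch below; it borrows
# the `policy` and `optimizer` objects from the setup above, takes gamma as an argument, and is
# an assumed illustration, not the author's actual update.
import torch
import torch.nn.functional as F


def update_policy_sketch(cur_states, actions, next_states, rewards, dones, gamma=0.99):
    # Q-values of the actions that were actually taken
    q_taken = policy(cur_states).gather(1, actions.long().unsqueeze(-1)).squeeze(-1)
    # Bootstrapped TD(0) targets; no gradient flows through the bootstrap term
    with torch.no_grad():
        next_q_max = policy(next_states).max(dim=1)[0]
        targets = rewards + gamma * next_q_max * (1 - dones)
    loss = F.mse_loss(q_taken, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()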
warnings.filterwarnings("ignore")

USE_CUDA = torch.cuda.is_available()
from dqn import QLearner, compute_td_loss, ReplayBuffer

env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

num_frames = 1000000
batch_size = 32
gamma = 0.99
replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load(sys.argv[1], map_location='cpu'))
model.eval()

if USE_CUDA:
    model = model.cuda()
    print("Using cuda")

model.load_state_dict(torch.load(pthname, map_location='cpu'))

env.seed(1)
state = env.reset()
done = False
games_won = 0
env_id = "PongNoFrameskip-v4" env = make_atari(env_id) env = wrap_deepmind(env, frame_stack=USE_FRAME_STACK) env = wrap_pytorch(env) train_num_frames = 5000000 sample_num_frames = 50000 batch_size = 32 gamma = 0.99 target_update = 50000 epsilon_start = 1.0 epsilon_final = 0.01 epsilon_decay = 1000000 replay_initial = 10000 learning_rate = 1e-5 train_replay_buffer = ReplayBuffer(100000) analysis_replay_buffer = ReplayBuffer(100000) policy_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer) target_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer) target_model.load_state_dict(policy_model.state_dict()) target_model.eval() optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if USE_CUDA: policy_model = policy_model.to(device) target_model = target_model.to(device)
plt.figure(2)
plt.plot([reward[0] for reward in all_rewards],
         [reward[1] for reward in all_rewards])
plt.xlabel('Frame #')
plt.ylabel('Episode Reward')
plt.savefig(f'rewards_lr={lr}.pdf')

USE_CUDA = torch.cuda.is_available()

# Set up game
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env)
env = wrap_pytorch(env)

replay_buffer = ReplayBuffer(replay_buff_size)  # buffer size
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)  # create model
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))
model.eval()

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)  # create target model
target_model.copy_from(model)

# Optimize the model's parameters
optimizer = optim.Adam(model.parameters(), lr=lr)

if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

# Negative-exponential schedule: start by exploring, then exploit more as frame_indx grows
class Agent():
    """Basic experience replay agent."""

    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 checkpoint_file='checkpoint.pth'):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma: discount factor
            tau: for soft update of target parameters
            lr: learning rate
            update_every: how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.checkpoint_file = checkpoint_file
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, self.device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for the given state as per the current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using the given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from the target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from the local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self, env, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01,
              eps_decay=0.995):
        """Train the Agent by playing in the simulator.

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []                        # list containing scores from each episode
        moving_avgs = []                   # list of moving averages
        scores_window = deque(maxlen=100)  # last 100 scores
        brain_name = env.brain_names[0]    # get the env's default brain name
        env_info = env.reset(train_mode=False)[brain_name]  # initialize the environment
        eps = eps_start                    # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]  # get the initial state
            score = 0
            for t in range(max_t):
                action = self.act(state, eps).astype(int)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]  # get the next state
                reward = env_info.rewards[0]                  # get the reward
                done = env_info.local_done[0]                 # see if the episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)          # save most recent score
            scores.append(score)                 # save most recent score
            moving_avg = np.mean(scores_window)  # calculate moving average
            moving_avgs.append(moving_avg)       # save most recent moving average
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, moving_avg))
            if moving_avg >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                    i_episode - 100, moving_avg))
                self.save()
                break
        return scores, moving_avgs

    def test(self, env, num_episodes=10):
        brain_name = env.brain_names[0]
        scores = []      # list of scores
        avg_scores = []  # list of average scores
        for i_episode in range(1, num_episodes + 1):
            env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]             # get the current state
            score = 0                                           # initialize the score
            t = 1
            while True:
                action = self.act(state, eps=0)          # select an action
                env_info = env.step(action)[brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[0]  # get the next state
                reward = env_info.rewards[0]                  # get the reward
                done = env_info.local_done[0]                 # see if the episode has finished
                score += reward                               # update the score
                state = next_state                            # roll over the state to the next time step
                # print('episode: {}, step: {}, reward: {}, score: {}, scores: {}'.format(i_episode, t, reward, score, scores))
                t += 1
                if done:  # exit the loop if the episode finished
                    scores.append(score)
                    avg_scores.append(np.mean(scores))
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)))
                    break
        return scores, avg_scores

    def save(self):
        """Save the model to the checkpoint file."""
        torch.save(self.qnetwork_local.state_dict(), self.checkpoint_file)

    def load(self):
        """Load the model from the checkpoint file."""
        self.qnetwork_local.load_state_dict(torch.load(self.checkpoint_file))
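# A typical way to drive the Agent class above, assuming `env` is a Unity ML-Agents environment
# handle of the kind train()/test() expect; the state/action sizes below are placeholders, not
# values from the original project.
agent = Agent(state_size=37, action_size=4, seed=0, checkpoint_file='checkpoint.pth')

scores, moving_avgs = agent.train(env, n_episodes=2000)    # train until solved or the budget runs out
agent.load()                                               # reload the saved checkpoint
test_scores, test_avgs = agent.test(env, num_episodes=10)  # evaluate greedily (eps=0)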
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# device = torch.device('cpu')
print(device)

env = make_env(seed)
state_shape = env.observation_space.shape
n_actions = env.action_space.n
state = env.reset()

agent = DQNAgent(state_shape, n_actions, epsilon=0.9).to(device)
# agent.load_state_dict(torch.load('dqn.weights'))

target_network = DQNAgent(state_shape, n_actions).to(device)
target_network.load_state_dict(agent.state_dict())

opt = torch.optim.Adam(agent.parameters(), lr=1e-4)

exp_replay = ReplayBuffer(buffer_size)

print('test_buffer')
for i in range(100):
    play_and_record(state, agent, env, exp_replay, n_steps=10**2)
    if len(exp_replay) == buffer_size:
        break
print(len(exp_replay))

state = env.reset()
for step in trange(step, total_steps + 1):
    agent.epsilon = linear_decay(init_epsilon, final_epsilon, step, decay_steps)

    # play
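# The loop above relies on helpers such as linear_decay. A minimal version matching the call
# linear_decay(init_epsilon, final_epsilon, step, decay_steps); an assumed implementation,
# shown only to make the exploration schedule explicit:
def linear_decay(init_val, final_val, step, total_steps):
    # Linearly anneal from init_val to final_val over total_steps, then hold final_val.
    if step >= total_steps:
        return final_val
    return init_val + (final_val - init_val) * step / total_steps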
def train(env_id, lr=1e-4, gamma=0.99, memory_size=1000, batch_size=32,
          train_timesteps=10000, train_start_time=1000,
          target_update_frequency=1000, init_epsilon=1, final_epsilon=0.1,
          epsilon_decay=300, model_path=None):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    LOG_PATH = f'logs/dqn_log_{env_id}.txt'

    if get_env_type(env_id) == 'atari':
        env = make_atari(env_id)
        env = wrap_deepmind(env)
        env = wrap_pytorch(env)
        model_type = 'conv'
    else:
        env = gym.make(env_id)
        model_type = 'linear'

    obs_shape = env.observation_space.shape
    num_actions = env.action_space.n

    memory = ReplayBuffer(memory_size)
    agent = DQN(obs_shape, num_actions, lr, gamma, device, model_type)
    policy = EpsilonGreedy(agent, num_actions, init_epsilon, final_epsilon, epsilon_decay)

    # populate replay memory
    obs = env.reset()
    for t in range(train_start_time):
        # uniform random policy
        action = random.randrange(num_actions)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)
        obs = next_obs
        if done:
            # start a new episode
            obs = env.reset()

    # for monitoring
    ep_num = 1
    ep_start_time = 1
    episode_reward = 0
    reward_list = []

    # train start
    obs = env.reset()
    for t in tqdm.tqdm(range(1, train_timesteps + 1)):
        # choose action
        action = policy.act(obs, t)
        next_obs, reward, done, _ = env.step(action)
        memory.add(obs, action, reward, next_obs, done)
        obs = next_obs

        # sample batch transitions from memory
        transitions = memory.sample(batch_size)
        # train
        loss = agent.train(transitions)

        # record reward
        episode_reward += reward

        # update target network every C timesteps
        if t % target_update_frequency == 0:
            agent.update_target()

        if done:
            # start a new episode
            obs = env.reset()

            # write log
            with open(LOG_PATH, 'a') as f:
                f.write(f'{ep_num}\t{episode_reward}\t{ep_start_time}\t{t}\n')

            if model_path is not None:
                # save model
                info = {
                    'epoch': ep_num,
                    'timesteps': t,
                }
                agent.save(model_path, info)

            ep_num += 1
            ep_start_time = t + 1
            reward_list.append(episode_reward)
            episode_reward = 0
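# A hypothetical invocation of train() as defined above; the environment id and model path are
# examples, not values taken from the original script.
if __name__ == '__main__':
    # Train on Atari Pong with mostly default hyperparameters and checkpoint the model.
    train('PongNoFrameskip-v4', train_timesteps=100000, model_path='models/dqn_pong.pth')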
gamma = 1
# gamma = 0 because the action taken in the current state does not affect what has to be done
# in the next state
avg_history = {
    'episodes': [],
    'timesteps': [],
    'reward': [],
    'hits percentage': []
}
agg_interval = 10
running_loss1_mean = 0
running_loss2_mean = 0
loss1_history = []
loss2_history = []

# initialize policy and replay buffers
replay_buffer = ReplayBuffer()
actor_replay_buffer = ActorReplayBuffer()

beta = 0.001  # beta is the momentum in the variance updates of the TD error
running_variance = 1

# In[]:
def update_critic(critic_old, cur_states, actions, next_states, rewards, dones):
    # when the transition is terminal the target is just the reward, hence the (1 - done) factor
    targets = rewards + torch.mul(1 - dones,
                                  gamma * critic_old(next_states).squeeze(-1))
    # expanded_targets are the Q values of all the actions for the sampled current states