def train(agent, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning with shared autonomy.

    Params
    ======
        agent: agent to train
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    # pretrained assistant ("star") policy used to override poor actions
    qnetwork_star = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork_star.load_state_dict(torch.load('dqn_16.pth'))
    qnetwork_star.eval()

    Q_threshold = 0.0
    savenumber = 0

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)

            # shared autonomy: override the agent when the assistant's Q-value
            # advantage exceeds the current threshold
            to_add = True
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float()
                Q_values = qnetwork_star(state_tensor).data.numpy()
            action_star = np.argmax(Q_values)
            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                to_add = False
                action = action_star

            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, to_add)
            state = next_state
            score += reward
            if done:
                break

        Q_threshold += 0.1                   # relax the intervention threshold each episode
        scores_window.append(score)          # save most recent score
        scores.append(score)                 # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            savename = "assisted_" + str(savenumber) + ".pkl"
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), savename)
            savenumber += 1
    return scores
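# A hedged usage sketch (not part of the original scripts): how the assisted
# training loop above might be driven, assuming the custom LanderCustom-v0
# environment is registered, `dqn_16.pth` exists on disk, and the Agent class
# below (whose step() accepts the extra `to_add` flag) is in scope. train()
# reads a module-level `env`, so it is created here at module scope.
import gym
import matplotlib.pyplot as plt

env = gym.make("LanderCustom-v0")
agent = Agent(state_size=8, action_size=4, seed=0)

scores = train(agent, n_episodes=500)    # shorter run than the default 2000

plt.plot(scores)                         # quick look at the learning curve
plt.xlabel("Episode")
plt.ylabel("Score")
plt.show()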
def main(): env = gym.make("LanderCustom-v0") qnetwork = QNetwork(state_size=8, action_size=4, seed=0) qnetwork.load_state_dict(torch.load('basic_lander.pth')) qnetwork.eval() human = MLP() human.load_state_dict(torch.load('expert_bc.pt')) human.eval() softmax = torch.nn.Softmax(dim=0) episodes = 30 scores = [] Q_threshold = 1e-2 for episode in range(episodes): if episode < 10: force_x = 0.0 elif episode < 20: force_x = +500.0 else: force_x = -500.0 env.start_state(force_x, 0.0) state = env.reset() score = 0 while True: with torch.no_grad(): state = torch.from_numpy(state).float() Q_values = qnetwork(state).data.numpy() action_pred_dist = softmax(human(state).data).numpy() action_star = np.argmax(Q_values) action = np.random.choice(np.arange(4), p=action_pred_dist) loss = Q_values[action_star] - Q_values[action] # if loss > Q_threshold: # action = action_star # env.render() state, reward, done, _ = env.step(action) score += reward if done: print("episode: ", episode, "score: ", score) break scores.append(score) env.close() print("The average score is: ", np.mean(np.array(scores)))
def rollout(force_x, Q_threshold, modelname):
    env = gym.make("LanderCustom-v0")

    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=0)

    if modelname is not None:
        human = MLP()
        human.load_state_dict(torch.load(modelname))
        human.eval()

    env.start_state(force_x, 0.0)
    state = env.reset()
    score = 0
    dataset = []

    while True:

        # get robot and human actions
        with torch.no_grad():
            state = torch.from_numpy(state).float()
            Q_values = qnetwork(state).data.numpy()
        action_star = np.argmax(Q_values)
        action = np.random.choice(np.arange(4))
        if modelname is not None:
            action_pred_dist = softmax(human(state).data).numpy()
            action = np.random.choice(np.arange(4), p=action_pred_dist)

        # save data
        loss = Q_values[action_star] - Q_values[action]
        dataset.append(list(state.numpy()) + [action, loss, action_star])

        # shared autonomy
        if loss > Q_threshold:
            action = action_star

        # update environment
        # env.render()
        state, reward, done, _ = env.step(action)
        score += reward
        if done:
            # print("score: ", score)
            break

    env.close()
    return score, dataset
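# A hedged example (not from the original experiments) of sweeping the
# intervention threshold with rollout(); the force value, thresholds, and the
# use of the 'expert_bc.pt' behavior-cloning checkpoint are illustrative assumptions.
import numpy as np

for q_thr in [0.0, 0.1, 0.5, 1.0]:
    scores = [rollout(force_x=0.0, Q_threshold=q_thr, modelname='expert_bc.pt')[0]
              for _ in range(5)]
    print("Q_threshold =", q_thr, "mean score =", np.mean(scores))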
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64],
                 buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3,
                 learning_rate=5e-4, update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each hidden layer
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device)

        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Calculate target value
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_dash = self.qnetwork_target(next_states)
            Q_dash_max = torch.max(Q_dash, dim=1, keepdim=True)[0]
            y = rewards + gamma * Q_dash_max * (1 - dones)
        self.qnetwork_target.train()

        # Predict Q-value
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # TD-error
        loss = torch.sum((y - y_pred)**2)

        # Optimize
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
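# A minimal sketch (not part of the original code) of driving DQNAgent with a
# plain Gym loop, assuming the standard LunarLander-v2 environment and the old
# gym reset()/step() API used throughout this repo.
import gym

env = gym.make('LunarLander-v2')
agent = DQNAgent(state_size=8, action_size=4, seed=0)

eps = 1.0
for i_episode in range(1, 501):
    state = env.reset()
    score = 0
    for t in range(1000):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    eps = max(0.01, 0.995 * eps)            # decay exploration per episode
    print('\rEpisode {}\tScore: {:.2f}'.format(i_episode, score), end="")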
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done, to_add): # Save experience in replay memory if to_add: self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Basic experinece replay agent.""" def __init__(self, state_size, action_size, seed, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, update_every=4, checkpoint_file='checkpoint.pth'): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed buffer_size(int): replay buffer size batch_size(int): minibatch size gamma: discount factor tau: for soft update of target parameters lr: learning rate update_every: how often to update the network """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every self.checkpoint_file = checkpoint_file self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, self.device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def train(self, env, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        """Train Agent by playing simulator

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []                        # list containing scores from each episode
        moving_avgs = []                   # list of moving averages
        scores_window = deque(maxlen=100)  # last 100 scores
        brain_name = env.brain_names[0]    # get env default brain name
        env_info = env.reset(train_mode=False)[brain_name]  # initialize the environment
        eps = eps_start                    # initialize epsilon

        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]  # get the initial state
            score = 0
            for t in range(max_t):
                action = self.act(state, eps).astype(int)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[0]  # get the next state
                reward = env_info.rewards[0]                  # get the reward
                done = env_info.local_done[0]                 # see if episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            scores_window.append(score)          # save most recent score
            scores.append(score)                 # save most recent score
            moving_avg = np.mean(scores_window)  # calculate moving average
            moving_avgs.append(moving_avg)       # save most recent moving average
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, moving_avg))
            if moving_avg >= 13.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100, moving_avg))
                self.save()
                break
        return scores, moving_avgs

    def test(self, env, num_episodes=10):
        brain_name = env.brain_names[0]
        scores = []      # list of scores
        avg_scores = []  # list of average scores
        for i_episode in range(1, num_episodes + 1):
            env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]             # get the current state
            score = 0                                           # initialize the score
            t = 1
            while True:
                action = self.act(state, eps=0)               # select an action (greedy)
                env_info = env.step(action)[brain_name]       # send the action to the environment
                next_state = env_info.vector_observations[0]  # get the next state
                reward = env_info.rewards[0]                  # get the reward
                done = env_info.local_done[0]                 # see if episode has finished
                score += reward                               # update the score
                state = next_state                            # roll over the state to next time step
                # print('episode: {}, step: {}, reward: {}, score: {}, scores: {}'.format(i_episode, t, reward, score, scores))
                t += 1
                if done:                                      # exit loop if episode finished
                    scores.append(score)
                    avg_scores.append(np.mean(scores))
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores)))
                    break
        return scores, avg_scores

    def save(self):
        """Save the model to self.checkpoint_file."""
        torch.save(self.qnetwork_local.state_dict(), self.checkpoint_file)

    def load(self):
        """Load the model from self.checkpoint_file."""
        self.qnetwork_local.load_state_dict(torch.load(self.checkpoint_file))
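# A hedged example (not in the original repo) of this agent's train/save/load
# cycle against a Unity ML-Agents environment. The 13.0 solve threshold above
# suggests the Udacity Banana navigation task (37-dim state, 4 actions), but
# the executable path and sizes here are placeholders, not confirmed values.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Banana_Linux/Banana.x86_64")  # placeholder path

agent = Agent(state_size=37, action_size=4, seed=0, checkpoint_file='checkpoint.pth')
scores, moving_avgs = agent.train(env, n_episodes=2000)

agent.load()                                        # reload the saved weights
test_scores, avg_scores = agent.test(env, num_episodes=10)
env.close()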
def main(): env = gym.make("LanderCustom-v0") fx_init = float(sys.argv[1]) Q_threshold = float(sys.argv[2]) savename = 'test1.pkl' joystick = Joystick() qnetwork = QNetwork(state_size=8, action_size=4, seed=0) qnetwork.load_state_dict(torch.load('basic_lander.pth')) qnetwork.eval() human = MLP() human.load_state_dict(torch.load('mlp_model.pt')) human.eval() episodes = 10 scores = [] data = [] env.start_state(fx_init, 0.0) for episode in range(episodes): state = env.reset() env.render() score = 0 while True: action, start, stop = joystick.input() if start: break while True: action, start, stop = joystick.input() data.append(list(state) + [action]) with torch.no_grad(): state = torch.from_numpy(state).float() Q_values = qnetwork(state).data.numpy() action_pred_dist = human(state).data.numpy() action_star = np.argmax(Q_values) action_pred = np.argmax(action_pred_dist) # action = action_pred loss = Q_values[action_star] - Q_values[action] if loss > Q_threshold: action = action_star env.render() state, reward, done, _ = env.step(action) score += reward if done or stop: print(episode, score) # pickle.dump(data, open(savename, "wb" )) break time.sleep(0.025) scores.append(score) env.close() print(scores)
        score_train, experiences = rollout(sa, mydata)
        score_test, _ = rollout(1e3, mydata)
        mytrainscore.append(score_train)
        mytestscore.append(score_test)

        learned_experiences = []
        corrections = 0
        for item in experiences:
            state, loss, accepted, action, action_star = \
                item[0:4], item[4], item[5], item[6], item[7]
            if not accepted:
                corrections += 1
            if loss > 0.5:
                learned_experiences.append(state + [action_star])
        mydata = mydata + learned_experiences

        print(count, " TrainScore: ", score_train, " TestScore: ",
              score_test, " Correction: ", corrections, " Data: ", len(mydata))

    return mytrainscore, mytestscore


if __name__ == "__main__":

    env = gym.make("CartPole-v0")
    env.seed(0)

    qnetwork = QNetwork(state_size=4, action_size=2, seed=0)
    qnetwork.load_state_dict(torch.load('models/dqn_cartpole.pth'))
    qnetwork.eval()

    scores = []
    for idx in range(25):
        mytrainscore, mytestscore = experience_matching()
        scores.append(mytestscore)
    pickle.dump(scores, open("results/failure/SomeSA.pkl", "wb"))
def play(n_episodes=100, max_t=1000):
    # for the first training run eps_start is 1
    # for the second run eps_start is 0.5
    # max_t was also changed to 500, because it would take a long time to land
    possible_actions = [
        # don't move
        [0, 0],
        # up
        [0.1, 0], [0.2, 0], [0.3, 0], [0.5, 0], [0.6, 0], [0.7, 0], [0.8, 0], [0.9, 0], [1, 0],
        # left
        [0, -0.6], [0, -0.7], [0, -0.8], [0, -0.9],
        # right
        [0, 0.6], [0, 0.7], [0, 0.8], [0, 0.9],
        # up-left
        # [0.8, -0.8],
        # [0.8, -0.65],
        # [0.6, -0.8],
        # [0.6, -0.65],
        # [0.7, -0.8],
        # [0.7, -0.65],
        # # up-right
        # [0.8, 0.8],
        # [0.8, 0.65],
        # [0.6, 0.8],
        # [0.6, 0.65],
        # [0.7, 0.8],
        # [0.7, 0.65]
    ]

    env = gym.make('LunarLanderContinuous-v2')
    # env = gym.make('LunarLander-v2')
    # action_space = env.action_space
    # print("action size=", action_space)
    n_actions = len(possible_actions)

    the_ship = QNetwork(state_size=8, action_size=n_actions)
    if loaded_model:
        the_ship.load_state_dict(torch.load(loaded_model))
        the_ship.eval()

    scores_window = deque(maxlen=100)
    state_arr = []
    scores = []

    # run n games
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0

        # sample random transitions
        for t in range(max_t):
            state_arr = [state]
            input_s = torch.tensor(state_arr)
            qs = the_ship(input_s)
            the_ship.train()
            _, action = torch.max(qs.detach(), 1)
            action = action.detach().numpy()[0]

            env.render()
            action_continious = discrete_action_to_continious_array(possible_actions, action)
            next_state, reward, done, _ = env.step(action_continious)
            score += reward
            state = next_state
            if done:
                break

        scores_window.append(score)
        scores.append(score)
        mean_score = np.mean(scores_window)
        print('\rEpisode {}\tAverage Score: {:.2f}, Last reward: {}'.format(i_episode, mean_score, reward), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}, Last reward: {}'.format(i_episode, mean_score, reward))

    return scores
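# discrete_action_to_continious_array() is referenced above but not defined in
# this section. A plausible one-line implementation, assuming it simply indexes
# the possible_actions table, might look like this (name kept as spelled above).
import numpy as np

def discrete_action_to_continious_array(possible_actions, action):
    # Map a discrete action index to the corresponding [main, lateral] thrust pair.
    return np.array(possible_actions[action], dtype=np.float32)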