def main():
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    working_dir = "logs/" + args.dir
    if not os.path.isdir(working_dir):
        raise NameError(args.dir + " does not exist in dir logs")
    print(args)

    env = QubeSwingupEnv(use_simulator=args.sim, batch_size=2048 * 4)
    num_inputs = env.observation_space.shape[0]
    num_actions = NUMBER_OF_ACTIONS
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions) if not args.new_net else QNet_more_layers(num_inputs, num_actions)
    net.load_state_dict(torch.load(working_dir + "/best_model.pth", map_location=torch.device(device)))
    net.to(device)
    net.eval()

    running_score = 0
    steps = 0

    for e in range(1):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            steps += 1
            action = get_continuous_action(get_action(state, net))
            # Hand control to the PD policy once the pendulum is near upright
            if np.abs(state[0][1].item()) < deg2rad(25):
                action = pd_control_policy(state.cpu().numpy()[0])[0]

            next_state, reward, done, info = env.step(action)
            reward = give_me_reward(info["alpha"], info["theta"])
            if args.sim:
                env.render()
            if done:
                print(info)
                print("theta:", info["theta"] * 180 / np.pi)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            score += reward
            state = next_state

        running_score = 0.99 * running_score + 0.01 * score
        print('{} episode | running_score: {:.2f} | score: {:.2f} | steps: {}'.format(
            e, running_score, score, steps))

    env.close()
class Agent():
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.qnetwork_local = QNet(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps, once enough samples are buffered
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.1):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # For vanilla DQN:
        # Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # For double DQN: the local net selects the next action, the target net evaluates it
        next_actions = self.qnetwork_local(next_states).detach().argmax(dim=1).unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
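# A minimal sketch of how this Agent might be driven from a training loop. This
# is illustrative, not taken from the original code: `env` is assumed to be any
# Gym-style environment with a vector observation space, and the epsilon decay
# constants are assumptions.
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.n, seed=0)
eps = 1.0
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store + maybe learn
        state = next_state
    eps = max(0.01, eps * 0.995)  # decay exploration over episodes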
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0

    for e in range(5):
        done = False
        score = 0

        state = env.reset()
        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        # Warm up the frame stack with a few random actions
        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()
            steps += 1

            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)
            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = QNet(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))
    net.to(device)
    net.eval()

    steps = 0

    for e in range(5):
        done = False
        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()
            steps += 1

            qvalue = net(state)
            action = get_action(qvalue)
            next_state, reward, done, _ = env.step(action)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
def __init__(self, run):
    self.run = run
    ckpt_dir = os.path.join(run, 'ckpt')
    ckpts = glob2.glob(os.path.join(ckpt_dir, '*.pth'))
    assert ckpts, "No checkpoints to resume from!"

    def get_epoch(ckpt_url):
        # Extract the epoch number from a filename like ckpt_e12.pth
        s = re.findall(r"ckpt_e(\d+).pth", ckpt_url)
        epoch = int(s[0]) if s else -1
        return epoch, ckpt_url

    # Pick the checkpoint with the highest epoch number
    start_epoch, ckpt = max(get_epoch(c) for c in ckpts)
    print('Checkpoint:', ckpt)

    if torch.cuda.is_available():
        model = QNet().cuda()
    else:
        model = QNet()

    ckpt = torch.load(ckpt)
    model.load_state_dict(ckpt['model'])
    model.eval()
    self.model = model
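# Hypothetical usage of this resume helper. `Evaluator` is an assumed name for
# the class that owns the __init__ above, and the save-side call shows the
# checkpoint layout ({'model': state_dict}) that load_state_dict expects.
torch.save({'model': model.state_dict()},
           os.path.join('runs/experiment_1', 'ckpt', 'ckpt_e12.pth'))

evaluator = Evaluator('runs/experiment_1')
with torch.no_grad():
    q_values = evaluator.model(obs_batch)  # obs_batch: a batch of observations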
class Agent():
    def __init__(self, args, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.per = args.per
        self.dueling = args.dueling
        self.buffer_size = args.buffer_size
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.tau = args.tau
        self.lr = args.learning_rate
        self.update_freq = args.update_every

        # Q-Network
        if self.dueling:
            self.local_qnet = DuelingQNet(state_size, action_size, seed).to(device)
            self.target_qnet = DuelingQNet(state_size, action_size, seed).to(device)
        else:
            self.local_qnet = QNet(state_size, action_size, seed).to(device)
            self.target_qnet = QNet(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.local_qnet.parameters(), lr=self.lr)

        # Replay memory
        if self.per:
            self.memory = PrioritizedReplayMemory(args, self.buffer_size)
        else:
            self.memory = ReplayMemory(action_size, self.buffer_size, self.batch_size, seed)
        self.t_step = 0  # init time step for updating every update_freq steps

    def step(self, state, action, reward, next_state, done):
        # Save experience to replay memory
        if self.per:
            self.memory.append(state, action, reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_freq time steps
        self.t_step = (self.t_step + 1) % self.update_freq
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                if self.dueling:
                    self.learn_DDQN(self.gamma)
                else:
                    self.learn(self.gamma)

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.local_qnet.eval()
        with torch.no_grad():
            action_values = self.local_qnet(state)
        self.local_qnet.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # Get max predicted Q values for next states from the target model
        Q_targets_next = self.target_qnet(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.local_qnet(states).gather(1, actions)

        # With PER, keep the element-wise squared errors so each sample can be
        # importance-weighted; otherwise reduce to a scalar directly
        if self.per:
            loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            # Backpropagate the importance-weighted minibatch loss
            (weights * loss).mean().backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            # Refresh priorities with the new TD errors
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)

        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def learn_DDQN(self, gamma):
        if self.per:
            idxs, states, actions, rewards, next_states, dones, weights = self.memory.sample(
                self.batch_size)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample()

        # Double DQN: the local net selects the next action...
        Q_argmax = self.local_qnet(next_states).detach()
        _, a_prime = Q_argmax.max(1)
        # ...and the target net evaluates it
        Q_targets_next = self.target_qnet(next_states).detach().gather(1, a_prime.unsqueeze(1))
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from the local model
        Q_expected = self.local_qnet(states).gather(1, actions)

        # As in learn(): per-sample losses for PER, scalar loss otherwise
        if self.per:
            loss = F.mse_loss(Q_expected, Q_targets, reduction='none')
        else:
            loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize loss
        self.optimizer.zero_grad()
        if self.per:
            # Backpropagate the importance-weighted minibatch loss
            (weights * loss).mean().backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.per:
            # Refresh priorities with the new TD errors
            errors = np.abs((Q_expected - Q_targets).detach().cpu().numpy())
            self.memory.update_priorities(idxs, errors)

        # Update target network
        self.soft_update(self.local_qnet, self.target_qnet, self.tau)

    def soft_update(self, local_model, target_model, tau):
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
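# Side-by-side toy illustration of the two target rules used in learn() and
# learn_DDQN() above. The tensors stand in for network outputs; shapes and
# values are made up (batch of 4 states, 3 actions).
import torch

q_local_next = torch.randn(4, 3)   # stand-in for local_qnet(next_states)
q_target_next = torch.randn(4, 3)  # stand-in for target_qnet(next_states)

# Vanilla DQN: the target net both selects and evaluates the next action,
# which is known to overestimate Q values.
dqn_target = q_target_next.max(1)[0].unsqueeze(1)

# Double DQN: the local net selects the action, the target net evaluates it.
a_prime = q_local_next.max(1)[1].unsqueeze(1)
ddqn_target = q_target_next.gather(1, a_prime)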
    else:
        # Greedy branch: pick the action with the highest Q value
        state_t = torch.from_numpy(state).to(device, torch.float32).unsqueeze(0)
        score = net(state_t)
        action = score.max(dim=1)[1].to(torch.int64).item()
    return action


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)
env = gym.wrappers.Monitor(env, directory='./movie', force=True,
                           video_callable=lambda x: True)
number_actions = env.action_space.n

# Separate target net & policy net
input_shape = env.reset().shape
net = QNet(input_shape, number_actions)
net.load_state_dict(torch.load(model))
net.eval().to(device)

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        # env.render()
        action = select_action(state, number_actions=number_actions)
        next_state, reward, done, _ = env.step(action)
        state = next_state

env.close()
    return loss


# Build environment
env = make_atari('PongNoFrameskip-v4', stack=2)
env = wrap_pytorch(env)
number_actions = env.action_space.n
replay_buffer = ReplayBuffer(replay_memory_size)

# Separate target net & policy net
input_shape = env.reset().shape
current_net = QNet(input_shape, number_actions).to(device)
target_net = QNet(input_shape, number_actions).to(device)  # with older weights
target_net.load_state_dict(current_net.state_dict())
target_net.eval()

optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []

state = env.reset()
for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state, current_net, eps, number_action=number_actions)
    next_state, reward, done, _ = env.step(action)
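# get_epsilon is not shown in this fragment. A common choice, given as an
# assumption rather than what this code necessarily uses, is an exponential
# decay from an initial to a final exploration rate; the constants below are
# illustrative.
import math

eps_start, eps_final, eps_decay = 1.0, 0.01, 30000

def get_epsilon(step):
    # Decays from eps_start toward eps_final over roughly eps_decay steps
    return eps_final + (eps_start - eps_final) * math.exp(-step / eps_decay)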
class Agent(): """Agent definition for interacting with environment""" def __init__(self, state_size, action_size, seed): """ Params ====== state_size (int): state dimension action_size (int): action dimension seed (int): random seed for replicating experiment """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.QNet_local = QNet(state_size, action_size, seed).to(device) self.QNet_target = QNet(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.QNet_local.parameters(), lr=LR) self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Add current experience to replay memory self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Get favored action Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.QNet_local.eval() with torch.no_grad(): action_values = self.QNet_local(state) self.QNet_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Perform learning on experiences Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences Q_targets_next = self.QNet_target(next_states).detach().max( 1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.QNet_local(states).gather(1, actions) loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.QNet_local, self.QNet_target, TAU) def soft_update(self, local_model, target_model, tau): """ θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): model to copy weights from target_model (PyTorch model): copy to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)