def __init__(self, params, model_path):
    self.params = params
    self.model_path = model_path
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
    self.current_q_net.to(self.device)
    self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
    self.target_q_net.to(self.device)
    self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.params.lr)
    self.replay_memory = ReplayMemory(self.params.memory_capacity)
    env = gym.make('CarRacing-v0')
    self.environment = EnvironmentWrapper(env, self.params.skip_steps)
def get(name, *args):
    if name == "dqn":
        return DQN(*args)
    elif name == "a3c":
        return A3C(*args)
    elif name == "dynaq":
        return DynaQ(*args)
    else:
        raise ValueError(name + " is not a valid agent")
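# Usage sketch for the factory above (the positional arguments are hypothetical
# and must match whatever the chosen agent class's constructor expects):
state_shape, num_actions = 1, 4  # placeholder values for illustration only
agent = get("dqn", state_shape, num_actions)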
def evaluate_dqn(path):
    model = DQN(input_shape=1, num_of_actions=get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, 1)

    total_reward = 0
    num_of_episodes = 100
    for episode in range(num_of_episodes):
        state = env_wrapper.reset()
        state = torch.tensor(state, dtype=torch.float32)
        done = False
        score = 0
        while not done:
            q_value = model(torch.stack([state]))
            _, action = get_action(q_value, train=False)
            print(action)
            state, reward, done = env_wrapper.step(action)
            state = torch.tensor(state, dtype=torch.float32)
            score += reward
            env_wrapper.render()
        print('Episode: {0} Score: {1:.2f}'.format(episode, score))
        total_reward += score
    return total_reward / num_of_episodes
def train_tetris():
    num_epochs = 2000
    max_steps = None
    history = []
    agent = DQN(4)
    env = Tetris()
    done = True
    epoch = 0
    while epoch < num_epochs:
        current_state = env.reset()
        done = False
        next_steps = env.get_next_states()
        next_actions, next_states = zip(*next_steps.items())
        steps = 0
        # Play one episode, storing transitions in the agent's replay memory.
        while not done and (not max_steps or steps < max_steps):
            next_steps = env.get_next_states()
            next_actions, next_states = zip(*next_steps.items())
            best_state_ind = agent.predict_move(next_states)
            action = next_actions[best_state_ind]
            reward, done = env.step(action, render=False)
            agent.add_memory(current_state, reward, next_states[best_state_ind], done)
            current_state = next_states[best_state_ind]
            steps += 1
        # Skip training until the replay memory holds enough samples.
        if len(agent.memory) < 1000:
            continue
        agent.train()
        if epoch % 50 == 0:
            agent.save_model(epoch)
        history.append(current_state)
        print(epoch)
        epoch += 1
    np.savetxt("states.csv", history, delimiter=",", fmt="%s")
def main():
    """
    The main test drive for this project.
    It constructs the simulator environment, then calls the modified Multi-DQN
    for training or testing as requested.

    Note: To test and debug what happens in the simulator during an episode,
    make sure "test" is the chosen mode along with "verbose: true" in the JSON file.
    :return:
    """
    args = parse_args()
    env = StdSimulatorEnv(args)
    dqn = DQN(env, args)
    if args.mode == 'train':
        dqn.train()
    elif args.mode == 'test':
        dqn.test()
    else:
        raise ValueError("Modes are only train and test")
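# Standard entry-point guard (assumed; not part of the original snippet):
if __name__ == '__main__':
    main()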
def dqn_inference(path):
    model = DQN(input_shape=1, num_of_actions=get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, 1)

    state = env_wrapper.reset()
    state = torch.tensor(state, dtype=torch.float32)
    done = False
    total_score = 0
    while not done:
        q_value = model(torch.stack([state]))
        _, action = get_action(q_value, train=False)
        print(action)
        state, reward, done = env_wrapper.step(action)
        state = torch.tensor(state, dtype=torch.float32)
        total_score += reward
        env_wrapper.render()
    return total_score
import gym

from af.dqn import AF_DQN
from dqn.dqn import DQN

env = gym.make('CartPole-v1')
model = DQN(
    'MlpPolicy',
    env,
    # delta=0.1,
    # forecast_horizon=11,
    # dynamics_layers=[32, 32],
    # dynamics_lr=1e-4,
    verbose=2,
    learning_rate=1e-3,
    buffer_size=5000,
    batch_size=32,
    learning_starts=0,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    exploration_initial_eps=0.8,
    tau=0.1,
    gamma=0.9,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=10,
    tensorboard_log='runs')
model.learn(total_timesteps=int(1e5), tb_log_name='DQN')

from stable_baselines3.common.evaluation import evaluate_policy

eval_env = gym.make('CartPole-v1')
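# A possible evaluation step with the imported helper; n_eval_episodes is an
# assumed value, not taken from the original snippet.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print('mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))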
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.lr = self.params.lr  # NEW
        self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.lr)  # CHANGE
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)
        self.loss_log = []  # NEW
        self.score_log = []  # NEW

    def run(self):
        episode_score = 0  # NEW
        episode_score_short_array = np.array([])  # NEW
        loss_short_array = np.array([])  # NEW
        episode = 0  # NEW
        state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
        self._update_target_q_net()
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value, train=True, step=step,
                                              params=self.params, device=self.device)
            next_state, reward, done = self.environment.step(action)
            episode_score += reward  # NEW
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                episode += 1  # NEW
                print('***************Episode: {}. Score: {}'.format(episode, episode_score))  # NEW
                episode_score_short_array = np.append(episode_score_short_array, episode_score)  # NEW
                episode_score = 0  # NEW
                state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                loss_short_array = np.append(loss_short_array, loss.cpu().detach().numpy())  # NEW
                print('Update: {}. Loss: {}'.format(step, loss))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
            if step % int(self.params.num_of_steps / 50) == 0:  # NEW: decay lr, checkpoint, and log
                self.lr *= 0.8
                self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.lr)
                torch.save(self.target_q_net.state_dict(), "models/dqn{}.pt".format(step))
                self.score_log.append(np.mean(episode_score_short_array))
                self.loss_log.append(np.mean(loss_short_array))
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]
        # Bellman target: r + gamma * max_a' Q_target(s', a') for non-terminal transitions.
        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        # Copy the online network's weights into the target network.
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
from dqn.dqn_scrabble_environment import DQNScrabbleEnvironment

# inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

# initialize global variables
num_episodes = DQNConstants.EPISODES
batch_size = DQNConstants.BATCH_SIZE
target_update = DQNConstants.TARGET_UPDATE
gamma = DQNConstants.GAMMA
epsilon_start = DQNConstants.EPSILON_START
epsilon_end = DQNConstants.EPSILON_END
epsilon_decay = DQNConstants.EPSILON_DECAY

# initialize action-replay memory
memory = ReplayMemory(DQNConstants.REPLAY_MEMORY_SIZE)

# initialize DQNs
policy_net = DQN(DQNScrabbleHelpers.calculate_input_size(4), DQNConstants.HIDDEN_LAYER_SIZE, 20)
target_net = DQN(DQNScrabbleHelpers.calculate_input_size(4), DQNConstants.HIDDEN_LAYER_SIZE, 20)

# initialize optimizer
optimizer = DQNConstants.OPTIMIZER(policy_net.parameters(), lr=DQNConstants.LEARNING_RATE)

# keep track of results
results = []
# keep track of losses
losses = []
# keep track of rewards
rewards = []
# keep track of total steps taken
total_steps = 0

# initialize environment
env = DQNScrabbleEnvironment()
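# Following the referenced PyTorch tutorial pattern, the target network is usually
# initialized from the policy network and kept in eval mode. This step is assumed
# here; it is not shown in the snippet above.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()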
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.optimizer = RMSprop(self.current_q_net.parameters(), lr=self.params.lr)
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        game = "Breakout-ram-v0"
        env = gym.make(game)
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)

    def run(self):
        state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
        self._update_target_q_net()
        total_reward = 0
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value, train=True, step=step,
                                              params=self.params, device=self.device)
            next_state, reward, done = self.environment.step(action)
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                state = torch.tensor(self.environment.reset(), device=self.device, dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                print('Update: {}. Loss: {}. Score: {}'.format(step, loss, total_reward))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]
        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
from collections import deque

from gym_maze.envs.maze import MazeGame
from dqn.dqn import DQN

if __name__ == '__main__':
    width = 4
    height = 4
    env = MazeGame(width,
                   height,
                   no_random=True,
                   change_map_after=9999999999,
                   state_representation="image",
                   funny=False)
    env.render()

    agent = DQN(env.state_space.shape, env.action_space.shape)
    batch_size = 32

    # Temporary memory
    temporary_memory = []
    temporary_memory_max_steps = 30
    timeout = 50
    # 7x7
    # temporary_memory_max_steps = 200
    # timeout = 2000

    # Failure compensation
    recent_games = deque(maxlen=10)
    maximum_loss_rate = .5
# Assumed Keras imports (could equally be tensorflow.keras); not shown in the original snippet.
from keras.models import Sequential
from keras.layers import Dense

from dqn.dqn import DQN
from tictactoe_env import TicTacToe


def model_constructor():
    model = Sequential()
    # hidden layers
    model.add(Dense(18, input_shape=(18,)))
    model.add(Dense(12))
    model.add(Dense(9))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return model


dqnX = DQN(model_constructor, 9)
dqnO = DQN(model_constructor, 9)


def ai_ai_game():
    false_moves_X = 0
    false_moves_O = 0
    env = TicTacToe()
    winner = env.winner()
    while winner == 0:
        ai_reward = 0
        if env.get_turn() == 'X':
            res = -1
            while res == -1:
                move = dqnX.determine_action(env.get_state(), ai_reward)
                res = env.place(move)
# Assumed Keras and project imports (could equally be tensorflow.keras); not shown in the original snippet.
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Reshape

import blackjack_env
from dqn.dqn import DQN


def model_constructor():
    model = Sequential()
    # model.add(Conv2D(filters=32, kernel_size=2, padding="same", input_shape=(3, 3, 3)))
    # model.add(Conv2D(filters=64, kernel_size=3, padding="same"))
    # model.add(Reshape((27,)))
    # hidden layers
    model.add(Dense(100, input_shape=(13, len(blackjack_env.card_set))))
    model.add(Dense(100))
    model.add(Reshape((100 * 13,)))
    model.add(Dense(2))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    # model.summary()
    return model


dqn = DQN(model_constructor, 2)
game = blackjack_env.BlackJack()


def play_game():
    winner = game.start_game()
    while winner == -2:
        move = dqn.determine_action(game.get_state(), 0)
        if move == 0:
            winner = game.play_pass()
        else:
            winner = game.play_hit()
    dqn.determine_action(game.get_state(), winner, terminal_state=True)
    return winner
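# Minimal sketch of running several games and tallying outcomes; num_games and the
# use of collections.Counter are assumptions, not part of the original snippet.
from collections import Counter

num_games = 1000
outcomes = Counter(play_game() for _ in range(num_games))
print(outcomes)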