Example #1
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.params.lr)
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)
Example #2
def get(name, *args):
    if name == "dqn":
        return DQN(*args)
    elif name == "a3c":
        return A3C(*args)
    elif name == "dynaq":
        return DynaQ(*args)
    else:
        raise Exception(name + " is not a valid agent")
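A hedged usage sketch for the factory above; the constructor arguments passed through *args are hypothetical and depend on each agent's actual signature.

# Hypothetical usage of the agent factory; the arguments forwarded to DQN(*args)
# are placeholders, not taken from the examples on this page.
agent = get("dqn", 4, 2)
try:
    get("sarsa")          # unknown names raise an Exception
except Exception as err:
    print(err)            # prints: sarsa is not a valid agent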
Example #3
def evaluate_dqn(path):
    model = DQN(input_shape=1, num_of_actions=get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, 1)

    total_reward = 0
    num_of_episodes = 100

    for episode in range(num_of_episodes):
        state = env_wrapper.reset()
        state = torch.tensor(state, dtype=torch.float32)
        done = False
        score = 0
        while not done:
            q_value = model(torch.stack([state]))
            _, action = get_action(q_value, train=False)
            print(action)
            state, reward, done = env_wrapper.step(action)
            state = torch.tensor(state, dtype=torch.float32)
            score += reward
            env_wrapper.render()
        print('Episode: {0} Score: {1:.2f}'.format(episode, score))
        total_reward += score
    return total_reward / num_of_episodes
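A hedged usage example for evaluate_dqn; the checkpoint path is hypothetical.

# Hypothetical checkpoint path; evaluate_dqn returns the average score over 100 episodes.
average_score = evaluate_dqn('models/dqn.pt')
print('Average score over 100 episodes: {:.2f}'.format(average_score))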
Example #4
def train_tetris():
    num_epochs = 2000
    max_steps = None
    history = []
    agent = DQN(4)
    
    env = Tetris()
    
    done = True
    epoch = 0
    while epoch < num_epochs:
        current_state = env.reset()
        done = False
        next_steps = env.get_next_states()
        next_actions, next_states = zip(*next_steps.items())
        steps = 0

        while not done and (not max_steps or steps < max_steps):
            next_steps = env.get_next_states()
            next_actions, next_states = zip(*next_steps.items())
            best_state_ind = agent.predict_move(next_states)

            action = next_actions[best_state_ind]
            reward, done = env.step(action, render=False)
            agent.add_memory(current_state, reward, next_states[best_state_ind], done)
            current_state = next_states[best_state_ind]
            steps += 1

        # Keep playing games, without advancing the epoch counter, until the
        # replay memory holds at least 1000 transitions to train on.
        if len(agent.memory) < 1000:
            continue
        agent.train()
        if epoch % 50 == 0:
            agent.save_model(epoch)
        history.append(current_state)
        print(epoch)
        epoch += 1

    np.savetxt("states.csv",history,delimiter=",",fmt="% s")
Example #5
def main():
    """
    Main entry point for this project. It constructs the simulator environment, then runs the modified Multi-DQN in training or testing mode as requested.
    Note: to inspect and debug what happens in the simulator during an episode, choose the "test" mode and set "verbose: true" in the JSON file.
    :return:
    """
    args = parse_args()
    env = StdSimulatorEnv(args)
    dqn = DQN(env, args)

    if args.mode == 'train':
        dqn.train()
    elif args.mode == 'test':
        dqn.test()
    else:
        raise ValueError("Modes are only train and test")
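The snippet relies on a parse_args helper that is not shown. A minimal sketch of what it might look like, assuming an argparse interface with a --mode flag plus a JSON config file for the simulator settings; the flag names and the JSON handling are assumptions, and whether mode really comes from the command line or from the JSON file is not visible above.

import argparse
import json

def parse_args():
    # Hypothetical reconstruction; the real parse_args is not part of the listing.
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'test'], default='train')
    parser.add_argument('--config', default='config.json',
                        help='JSON file with simulator settings, e.g. "verbose": true')
    args = parser.parse_args()
    # Merge the JSON settings onto the namespace so env and dqn can read them as attributes.
    with open(args.config) as fh:
        for key, value in json.load(fh).items():
            setattr(args, key, value)
    return args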
Example #6
def dqn_inference(path):
    model = DQN(input_shape=1, num_of_actions=get_action_space())
    model.load_state_dict(torch.load(path))
    model.eval()

    env = gym.make('CarRacing-v0')
    env_wrapper = EnvironmentWrapper(env, 1)

    state = env_wrapper.reset()
    state = torch.tensor(state, dtype=torch.float32)
    done = False
    total_score = 0
    while not done:
        q_value = model(torch.stack([state]))
        _, action = get_action(q_value, train=False)
        print(action)
        state, reward, done = env_wrapper.step(action)
        state = torch.tensor(state, dtype=torch.float32)
        total_score += reward
        env_wrapper.render()
    return total_score
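Both CarRacing snippets depend on an EnvironmentWrapper that is not included in the listing. A minimal sketch of the interface they rely on, assuming a frame-skipping, grayscale-preprocessing wrapper around the old gym API (the preprocessing details are assumptions):

import numpy as np

class EnvironmentWrapper:
    # Hypothetical reconstruction of the wrapper used by the snippets above.
    def __init__(self, env, skip_steps):
        self.env = env
        self.skip_steps = skip_steps  # how many frames each chosen action is repeated for

    def reset(self):
        return self._preprocess(self.env.reset())

    def step(self, action):
        # Repeat the action for skip_steps frames and accumulate the reward.
        total_reward, done = 0.0, False
        for _ in range(self.skip_steps):
            frame, reward, done, _ = self.env.step(action)
            total_reward += reward
            if done:
                break
        return self._preprocess(frame), total_reward, done

    def render(self):
        self.env.render()

    def _preprocess(self, frame):
        # Collapse the RGB frame to a single grayscale channel (matching input_shape=1 above).
        gray = np.dot(frame[..., :3], [0.299, 0.587, 0.114])
        return gray[np.newaxis, ...].astype(np.float32)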
Example #7
import gym

from af.dqn import AF_DQN
from dqn.dqn import DQN

env = gym.make('CartPole-v1')

model = DQN(
    'MlpPolicy',
    env,
    # delta=0.1,
    # forecast_horizon=11,
    # dynamics_layers=[32, 32],
    # dynamics_lr=1e-4,
    verbose=2,
    learning_rate=1e-3,
    buffer_size=5000,
    batch_size=32,
    learning_starts=0,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    exploration_initial_eps=0.8,
    tau=0.1,
    gamma=0.9,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=10,
    tensorboard_log='runs')

model.learn(total_timesteps=int(1e5), tb_log_name='DQN')

from stable_baselines3.common.evaluation import evaluate_policy
eval_env = gym.make('CartPole-v1')
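The listing imports evaluate_policy and builds eval_env but stops there. Assuming the local DQN subclasses the stable-baselines3 algorithm (which its constructor arguments suggest), a typical follow-up would be:

# Evaluate the trained policy over a handful of episodes (episode count is arbitrary).
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print('mean_reward={:.2f} +/- {:.2f}'.format(mean_reward, std_reward))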
Example #8
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1, num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.lr = self.params.lr #NEW
        self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.lr) # CHANGE
                                 
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        env = gym.make('CarRacing-v0')
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)
        self.loss_log = [] # NEW
        self.score_log = [] # NEW

    def run(self):
        episode_score = 0 # NEW
        episode_score_short_array = np.array([]) # NEW
        loss_short_array = np.array([]) # NEW
        episode = 0 # NEW
        state = torch.tensor(self.environment.reset(),
                             device=self.device,
                             dtype=torch.float32)
        self._update_target_q_net()
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value,
                                              train=True,
                                              step=step,
                                              params=self.params,
                                              device=self.device)
            next_state, reward, done = self.environment.step(action)
            episode_score += reward # NEW
            next_state = torch.tensor(next_state,
                                      device=self.device,
                                      dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state, done)
            state = next_state
            if done:
                episode += 1 # NEW
                print('***************Episode: {}. Score: {}'.format(episode, episode_score)) # NEW
                episode_score_short_array = np.append(episode_score_short_array, episode_score) # NEW
                episode_score = 0 # NEW
                
                state = torch.tensor(self.environment.reset(),
                                     device=self.device,
                                     dtype=torch.float32)

            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                loss_short_array = np.append(loss_short_array, loss.cpu().detach().numpy()) # NEW
                print('Update: {}. Loss: {}'.format(step, loss))

            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
                
            if step % int(self.params.num_of_steps / 50) == 0:  # NEW
                self.lr *= 0.8
                self.optimizer = RMSprop(self.current_q_net.parameters(),
                                         lr=self.lr)
                torch.save(self.target_q_net.state_dict(), "models/dqn{}.pt".format(step))

                self.score_log.append(np.mean(episode_score_short_array))
                self.loss_log.append(np.mean(loss_short_array))


        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]

        expected_q_values = rewards + self.params.discount_factor * next_q_values * (1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
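The trainers on this page use a ReplayMemory with an add method, a sample method, and a memory attribute, but its definition is not included. A minimal sketch consistent with how it is called (the internal layout is an assumption):

import random
from collections import deque

class ReplayMemory:
    # Hypothetical reconstruction of the buffer used by the trainers above.
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # the trainers check len(self.memory) directly

    def add(self, state, action_index, reward, next_state, done):
        self.memory.append((state, action_index, reward, next_state, done))

    def sample(self, batch_size):
        # Return five parallel tuples (states, actions, rewards, next_states, dones),
        # matching the unpacking in _update_current_q_net.
        batch = random.sample(list(self.memory), batch_size)
        return tuple(zip(*batch))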
Example #9
from dqn.dqn_scrabble_environment import DQNScrabbleEnvironment

# inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

# initialize global variables
num_episodes = DQNConstants.EPISODES
batch_size = DQNConstants.BATCH_SIZE
target_update = DQNConstants.TARGET_UPDATE
gamma = DQNConstants.GAMMA
epsilon_start = DQNConstants.EPSILON_START
epsilon_end = DQNConstants.EPSILON_END
epsilon_decay = DQNConstants.EPSILON_DECAY
# initialize action-replay memory
memory = ReplayMemory(DQNConstants.REPLAY_MEMORY_SIZE)
# initialize DQNs
policy_net = DQN(DQNScrabbleHelpers.calculate_input_size(4),
                 DQNConstants.HIDDEN_LAYER_SIZE, 20)
target_net = DQN(DQNScrabbleHelpers.calculate_input_size(4),
                 DQNConstants.HIDDEN_LAYER_SIZE, 20)
# initialize optimizer
optimizer = DQNConstants.OPTIMIZER(policy_net.parameters(),
                                   lr=DQNConstants.LEARNING_RATE)
# keep track of results
results = []
# keep track of losses
losses = []
# keep track of rewards
rewards = []
# keep track of total steps taken
total_steps = 0
# initialize environment
env = DQNScrabbleEnvironment()
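The listing stops before the training loop, but epsilon_start, epsilon_end and epsilon_decay are the knobs that drive epsilon-greedy action selection in the PyTorch tutorial it cites. A hedged sketch along those lines, using the globals defined above (the batching of state and the 20-way action head are assumptions read off the network construction):

import math
import random

import torch

def select_action(state, steps_done):
    # Exponentially decay epsilon from epsilon_start towards epsilon_end,
    # as in the referenced tutorial.
    eps_threshold = epsilon_end + (epsilon_start - epsilon_end) * \
        math.exp(-1.0 * steps_done / epsilon_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the highest policy-net Q-value.
            return policy_net(state).argmax(dim=1).view(1, 1)
    # Exploratory action: uniformly random over the 20 network outputs.
    return torch.tensor([[random.randrange(20)]], dtype=torch.long)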
Example #10
class DQNTrainer:
    def __init__(self, params, model_path):
        self.params = params
        self.model_path = model_path
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.current_q_net = DQN(input_shape=1,
                                 num_of_actions=get_action_space())
        self.current_q_net.to(self.device)
        self.target_q_net = DQN(input_shape=1,
                                num_of_actions=get_action_space())
        self.target_q_net.to(self.device)
        self.optimizer = RMSprop(self.current_q_net.parameters(),
                                 lr=self.params.lr)
        self.replay_memory = ReplayMemory(self.params.memory_capacity)
        game = "Breakout-ram-v0"
        env = gym.make(game)
        self.environment = EnvironmentWrapper(env, self.params.skip_steps)

    def run(self):
        state = torch.tensor(self.environment.reset(),
                             device=self.device,
                             dtype=torch.float32)
        self._update_target_q_net()
        total_reward = 0
        for step in range(int(self.params.num_of_steps)):
            q_value = self.current_q_net(torch.stack([state]))
            action_index, action = get_action(q_value,
                                              train=True,
                                              step=step,
                                              params=self.params,
                                              device=self.device)
            next_state, reward, done = self.environment.step(action)
            next_state = torch.tensor(next_state,
                                      device=self.device,
                                      dtype=torch.float32)
            self.replay_memory.add(state, action_index, reward, next_state,
                                   done)
            state = next_state
            total_reward += reward
            if done:
                state = torch.tensor(self.environment.reset(),
                                     device=self.device,
                                     dtype=torch.float32)
            if len(self.replay_memory.memory) > self.params.batch_size:
                loss = self._update_current_q_net()
                print('Update: {}. Loss: {}. Score: {}'.format(
                    step, loss, total_reward))
            if step % self.params.target_update_freq == 0:
                self._update_target_q_net()
        torch.save(self.target_q_net.state_dict(), self.model_path)

    def _update_current_q_net(self):
        batch = self.replay_memory.sample(self.params.batch_size)
        states, actions, rewards, next_states, dones = batch

        states = torch.stack(states)
        next_states = torch.stack(next_states)
        actions = torch.stack(actions).view(-1, 1)
        rewards = torch.tensor(rewards, device=self.device)
        dones = torch.tensor(dones, device=self.device, dtype=torch.float32)

        q_values = self.current_q_net(states).gather(1, actions)
        next_q_values = self.target_q_net(next_states).max(1)[0]

        expected_q_values = rewards + self.params.discount_factor * next_q_values * (
            1 - dones)
        loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _update_target_q_net(self):
        self.target_q_net.load_state_dict(self.current_q_net.state_dict())
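A hedged usage sketch for the Breakout trainer above. The params object and checkpoint path are hypothetical; only the attributes the trainer itself reads are listed, and get_action presumably needs its own exploration settings on params as well.

from types import SimpleNamespace

# Hypothetical hyper-parameters; names mirror the attributes DQNTrainer accesses.
params = SimpleNamespace(
    lr=1e-4,
    memory_capacity=100_000,
    skip_steps=4,
    num_of_steps=1e6,
    batch_size=32,
    discount_factor=0.99,
    target_update_freq=1_000,
)

trainer = DQNTrainer(params, model_path='models/breakout_dqn.pt')
trainer.run()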
Example #11
from collections import deque

from gym_maze.envs.maze import MazeGame
from dqn.dqn import DQN

if __name__ == '__main__':
    width = 4
    height = 4

    env = MazeGame(width,
                   height,
                   no_random=True,
                   change_map_after=9999999999,
                   state_representation="image",
                   funny=False)
    env.render()

    agent = DQN(env.state_space.shape, env.action_space.shape)
    batch_size = 32

    # Temporary memory
    temporary_memory = []

    temporary_memory_max_steps = 30
    timeout = 50

    # 7x7
    #temporary_memory_max_steps = 200
    #timeout = 2000

    # Failure compensation
    recent_games = deque(maxlen=10)
    maximum_loss_rate = .5
Example #12
from tensorflow.keras.models import Sequential   # assumption: the Keras pieces come from tf.keras
from tensorflow.keras.layers import Dense, Reshape

import blackjack_env
from dqn.dqn import DQN
from tictactoe_env import TicTacToe


def model_constructor():
    model = Sequential()
    # hidden layer
    model.add(Dense(18, input_shape=(18, )))
    model.add(Dense(12))
    model.add(Dense(9))

    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return model


dqnX = DQN(model_constructor, 9)
dqnO = DQN(model_constructor, 9)


def ai_ai_game():
    false_moves_X = 0
    false_moves_O = 0
    env = TicTacToe()
    winner = env.winner()
    while winner == 0:
        ai_reward = 0
        if env.get_turn() == 'X':
            res = -1
            while res == -1:
                move = dqnX.determine_action(env.get_state(), ai_reward)
                res = env.place(move)


def model_constructor():
    model = Sequential()
    # model.add(Conv2D(filters=32, kernel_size=2, padding="same", input_shape=(3, 3, 3)))
    # model.add(Conv2D(filters=64, kernel_size=3, padding="same"))
    # model.add(Reshape((27,)))
    # hidden layer
    model.add(Dense(100, input_shape=(13, len(blackjack_env.card_set))))
    model.add(Dense(100))
    model.add(Reshape((100 * 13,)))
    model.add(Dense(2))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    # model.summary()
    return model


dqn = DQN(model_constructor, 2)

game = blackjack_env.BlackJack()


def play_game():
    winner = game.start_game()
    while winner == -2:
        move = dqn.determine_action(game.get_state(), 0)
        if move == 0:
            winner = game.play_pass()
        else:
            winner = game.play_hit()
    dqn.determine_action(game.get_state(), winner, terminal_state=True)
    return winner
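A hedged sketch of how play_game could be driven over many rounds to tally outcomes. Apart from -2 meaning the round is still running, the meaning of the returned winner codes is an assumption, so they are only counted here.

from collections import Counter

# Hypothetical evaluation driver: play many rounds and count the terminal codes
# returned by the environment (the codes themselves are not interpreted).
outcomes = Counter()
for _ in range(10_000):
    outcomes[play_game()] += 1
print(outcomes)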