Example #1
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims)
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 memory_size, batch_size, algo, env_name, checkpoint_dir,
                 epsilon_min=0.01, epsilon_decay=5e-7, replace_target_count=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.algo = algo
        self.env_name = env_name
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_target_count = replace_target_count
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(memory_size, input_dims, n_actions)

        self.q_net = DeepQNetwork(self.lr, self.n_actions, 
                                  name=self.env_name+'_'+self.algo+'_q_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)
        self.target_net = DeepQNetwork(self.lr, self.n_actions, 
                                  name=self.env_name+'_'+self.algo+'_target_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)
Example #3
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_eval',
                                    chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_next',
                                    chkpt_dir=self.chkpt_dir)
class DDQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DDQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   checkpoint_dir=self.checkpoint_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(new_states)
        q_eval = self.q_eval.forward(new_states)

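        # Double DQN: the online network (q_eval) selects the greedy action for
        # the next state, while the target network (q_next) provides its value.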
        max_actions = T.argmax(q_eval, dim=1)

        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()

        self.q_eval.optimizer.step()

        self.learn_step_counter += 1

        self.decrement_epsilon()
def run_dqn(config, gym_wrapper, summaries_collector_traj,
            summaries_collector):
    q_network = DeepQNetwork(config, gym_wrapper, trajectory=1)
    initial_time = round(time(), 3)
    q_network.train(summaries_collector)
    reward = q_network.test(summaries_collector, episodes=10, render=True)
    summaries_collector.read_summaries('test')
    total_time_traj = round(time(), 3) - initial_time
    print("tested avg reward: {0} in: {1}".format(reward, total_time_traj))
Example #6
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_eval',
                                    chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    name=self.env_name+'_'+self.algo+'_q_next',
                                    chkpt_dir=self.chkpt_dir)
    def crossover(self, crossover_mode, selection_mode):
        print("Crossver")
        #2. Fitness
        sum_fitnesses = np.sum(self.old_fitnesses)
        probs = [
            self.old_fitnesses[i] / sum_fitnesses for i in range(self.size)
        ]

        # Sorting descending NNs according to their fitnesses
        #3. Parents selection
        sort_indices = np.argsort(probs)[::-1]
        for i in range(self.size):
            if i < self.size * elitism_pct:
                # Add the top-performing children (parent selection)
                model_c = self.old_models[sort_indices[i]]
            else:
                #ranking selection
                if selection_mode == "ranking":
                    a = sort_indices[0]
                    b = sort_indices[1]
                # sum_parent = self.old_fitnesses[a] + self.old_fitnesses[b]

                model_a, model_b = self.old_models[a], self.old_models[b]
                model_c = DeepQNetwork()

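                # Gather the convolutional layers of both parents and of the
                # freshly created child so their weights can be recombined.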
                conv_a = [model_a.conv1, model_a.conv2, model_a.conv3]
                conv_b = [model_b.conv1, model_b.conv2, model_b.conv3]
                conv_c = [model_c.conv1, model_c.conv2, model_c.conv3]

                c_i = 0
                for c in conv_c:
                    #4. Crossover
                    for i in range(c[0].weight.size()[0]):
                        for j in range(c[0].weight.size()[1]):
                            if crossover_mode == "mean":
                                # mean crossover: average the corresponding
                                # weights of the two parents
                                c[0].weight.data[i][j] = (
                                    conv_a[c_i][0].weight.data[i][j] +
                                    conv_b[c_i][0].weight.data[i][j]) / 2
                            if crossover_mode == "two_point":
                                # two-point crossover: draw two integer cut
                                # points along the output-channel dimension
                                point_one = np.random.randint(
                                    0, c[0].weight.size()[0])
                                point_two = np.random.randint(
                                    0, c[0].weight.size()[0])

                                if point_one > point_two:
                                    point_one, point_two = point_two, point_one

                                # outer segments come from parent b, the middle
                                # segment from parent a
                                c[0].weight.data[0:point_one, j] = conv_b[c_i][
                                    0].weight.data[0:point_one, j]
                                c[0].weight.data[point_one:point_two,
                                                 j] = conv_a[c_i][
                                                     0].weight.data[
                                                         point_one:point_two, j]
                                c[0].weight.data[point_two:, j] = conv_b[c_i][
                                    0].weight.data[point_two:, j]

                    c_i += 1

                self.models.append(model_c)
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 chkpt_dir,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None):
        self.gamma = gamma  # 0.99
        self.epsilon = epsilon  # 1.0
        self.lr = lr  # 0.0001
        self.n_actions = n_actions  # 6
        self.input_dims = input_dims  # (4, 84, 84)
        self.batch_size = batch_size  # 32
        self.eps_min = eps_min  # 0.1
        self.eps_dec = eps_dec  # 1e-05
        self.replace_target_cnt = replace  # 1000
        self.algo = algo  # 'DQNAgent'
        self.env_name = env_name  #  'PongNoFrameskip-v4'
        self.chkpt_dir = chkpt_dir  #  .\\models\\
        self.action_space = [i for i in range(self.n_actions)
                             ]  # [0, 1, 2, 3, 4, 5]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)
Example #9
class DQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.n_actions, input_dims=self.input_dims, name=Config.get_name('_q_eval'))
        self.q_next = DeepQNetwork(self.n_actions, input_dims=self.input_dims, name=Config.get_name('_q_next'))

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.memory.mem_cntr < Config.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(Config.batch_size)

        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + Config.gamma*q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decay_epsilon()
    def __init__(self,
                 size=50,
                 old_population=None,
                 crossover_mode="mean",
                 selection_mode="ranking"):
        self.size = size
        if old_population is None:
            self.models = [DeepQNetwork() for i in range(size)]
        else:
            #1. Population
            self.old_models = old_population.models
            self.old_fitnesses = old_population.fitnesses
            self.models = []
            self.crossover_mode = crossover_mode
            self.selection_mode = selection_mode
            self.crossover(crossover_mode, selection_mode)
            self.mutate()
        self.fitnesses = np.zeros(self.size)
Example #11
    def __init__(self, *args, **kwargs):
        super(DQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DeepQNetwork(self.n_actions, input_dims=self.input_dims, name=Config.get_name('_q_eval'))
        self.q_next = DeepQNetwork(self.n_actions, input_dims=self.input_dims, name=Config.get_name('_q_next'))
Example #12
        os.mkdir(RES_DIR)

    # Configure the GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory on demand
    sess = tf.compat.v1.Session(config=config)
    tf.compat.v1.keras.backend.set_session(sess)

    # Set up gym-related parameters
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, scale=False, frame_stack=True)
    num_actions = env.action_space.n

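    # Build the online network and a separate target network with the same
    # architecture; the target network supplies stable Q-value targets.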
    dqn = DeepQNetwork(input_shape=(WIDTH, HEIGHT, NUM_FRAMES),
                       num_actions=num_actions,
                       name='dqn',
                       learning_rate=LR)
    target_dqn = DeepQNetwork(input_shape=(WIDTH, HEIGHT, NUM_FRAMES),
                              num_actions=num_actions,
                              name='target_dqn',
                              learning_rate=LR)
    buf = MemoryBuffer(memory_size=BUFFER_SIZE)

    total_episode_rewards = []
    step = 0
    for episode in range(MAX_EPISODE + 1):
        frame = env.reset()  # LazyFrames
        state = np.array(frame)  # ndarray of shape (84, 84, 4)
        done = False
        cur_episode_reward = 0
        while not done:  # run until the episode ends
Example #13
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    env = Tetris(width=opt.width, height=opt.height, block_size=opt.block_size)
    model = DeepQNetwork()
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()

    state = env.reset()
    if torch.cuda.is_available():
        model.cuda()
        state = state.cuda()

    replay_memory = deque(maxlen=opt.replay_memory_size)
    epoch = 0
    while epoch < opt.num_epochs:
        next_steps = env.get_next_states()
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            max(opt.num_decay_epochs - epoch, 0) *
            (opt.initial_epsilon - opt.final_epsilon) / opt.num_decay_epochs)
        u = random()
        random_action = u <= epsilon
        next_actions, next_states = zip(*next_steps.items())
        next_states = torch.stack(next_states)
        if torch.cuda.is_available():
            next_states = next_states.cuda()
        model.eval()
        with torch.no_grad():
            predictions = model(next_states)[:, 0]
        model.train()
        if random_action:
            index = randint(0, len(next_steps) - 1)
        else:
            index = torch.argmax(predictions).item()

        next_state = next_states[index, :]
        action = next_actions[index]

        reward, done = env.step(action, render=True)

        if torch.cuda.is_available():
            next_state = next_state.cuda()
        replay_memory.append([state, reward, next_state, done])
        if done:
            final_score = env.score
            final_tetrominoes = env.tetrominoes
            final_cleared_lines = env.cleared_lines
            state = env.reset()
            if torch.cuda.is_available():
                state = state.cuda()
        else:
            state = next_state
            continue
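        # Only start training once the replay memory holds at least a tenth of
        # its capacity; until then keep collecting transitions.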
        if len(replay_memory) < opt.replay_memory_size / 10:
            continue
        epoch += 1
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        state_batch = torch.stack(tuple(state for state in state_batch))
        reward_batch = torch.from_numpy(
            np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.stack(
            tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()

        q_values = model(state_batch)
        model.eval()
        with torch.no_grad():
            next_prediction_batch = model(next_state_batch)
        model.train()

        y_batch = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      reward_batch, done_batch, next_prediction_batch)))[:,
                                                                         None]

        optimizer.zero_grad()
        loss = criterion(q_values, y_batch)
        loss.backward()
        optimizer.step()

        print(
            "Epoch: {}/{}, Action: {}, Score: {}, Tetrominoes {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_score,
                    final_tetrominoes, final_cleared_lines))
        writer.add_scalar('Train/Score', final_score, epoch - 1)
        writer.add_scalar('Train/Tetrominoes', final_tetrominoes, epoch - 1)
        writer.add_scalar('Train/Cleared lines', final_cleared_lines,
                          epoch - 1)

        if epoch > 0 and epoch % opt.save_interval == 0:
            torch.save(model, "{}/tetris_{}".format(opt.saved_path, epoch))

    torch.save(model, "{}/tetris2".format(opt.saved_path))
Example #14
Tensorflow: 1.0
gym: 0.8.0
"""

import gym
from deep_q_network import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=3, n_features=2, learning_rate=0.001, gamma=0.9)

total_steps = 0
for i_episode in range(10):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        position, velocity = observation_
Example #15
import numpy as np

env = gym.make('LunarLander-v2')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# Initialize DQN
DQN = DeepQNetwork(n_y=env.action_space.n,
                   n_x=env.observation_space.shape[0],
                   learning_rate=0.01,
                   replace_target_iter=100,
                   memory_size=500,
                   batch_size=32,
                   epsilon_max=0.9,
                   epsilon_greedy_increment=0.001)


RENDER_ENV = False
EPISODES = 500
rewards = []
RENDER_REWARD_MIN = 0
total_steps_counter = 0

for episode in range(EPISODES):

    observation = env.reset()
class DQNAgent():
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 chkpt_dir,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None):
        self.gamma = gamma  # 0.99
        self.epsilon = epsilon  # 1.0
        self.lr = lr  # 0.0001
        self.n_actions = n_actions  # 6
        self.input_dims = input_dims  # (4, 84, 84)
        self.batch_size = batch_size  # 32
        self.eps_min = eps_min  # 0.1
        self.eps_dec = eps_dec  # 1e-05
        self.replace_target_cnt = replace  # 1000
        self.algo = algo  # 'DQNAgent'
        self.env_name = env_name  #  'PongNoFrameskip-v4'
        self.chkpt_dir = chkpt_dir  #  .\\models\\
        self.action_space = [i for i in range(self.n_actions)
                             ]  # [0, 1, 2, 3, 4, 5]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict(
            ))  # load_state_dict and state_dict are built-in torch nn.Module methods

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        # self.q_eval.forward(states).shape = (32, 6); q_pred.shape = (32,)
        q_pred = self.q_eval.forward(states)[indices, actions]
        # self.q_next.forward(states_).shape = (32, 6); q_next.shape = (32,)
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        temp_dones = dones.bool()
        q_next[temp_dones] = 0.0  # the future value of terminal states is zero
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
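# A minimal usage sketch for the DQNAgent class above (an illustrative
# assumption, not part of the original example): it presumes an Atari-style
# gym environment that has already been wrapped to produce stacked 84x84
# frames matching input_dims, plus the ReplayBuffer and DeepQNetwork classes
# the example imports.
if __name__ == '__main__':
    import gym

    env = gym.make('PongNoFrameskip-v4')  # assumed to be wrapped for frame stacking
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001,
                     n_actions=env.action_space.n, input_dims=(4, 84, 84),
                     mem_size=50000, batch_size=32, chkpt_dir='models/',
                     algo='DQNAgent', env_name='PongNoFrameskip-v4')

    for episode in range(10):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()
            observation = observation_
        print('episode', episode, 'score', score)
    agent.save_models()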
# Reset environment
env.reset()

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
init_screen = InputExtractor.get_screen(env=env, device=device)
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

# Only use defined parameters if there is no previous output being loaded
if RUN_TO_LOAD is not None:
    # Initialize and load policy net
    policy_net = DeepQNetwork(screen_height, screen_width, n_actions)
    policy_net.load_state_dict(NET_STATE_DICT)
    policy_net.to(device)
    policy_net.eval()
else:
    # Initialize policy net
    policy_net = DeepQNetwork(screen_height, screen_width, n_actions)
    policy_net.to(device)

# Copy target net from policy net
target_net = DeepQNetwork(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only use defined parameters if there is no previous output being loaded
if RUN_TO_LOAD is not None:
Example #18
class DQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
Example #19
test_dataset = MNIST('./data',
                     train=False,
                     transform=img_transform,
                     download=True)
# print(len(test_dataset))
test_1, test_2 = random_split(test_dataset, [2000, 8000])
test_dataloader = DataLoader(
    test_1,
    batch_size=BATCH_SIZE,
    shuffle=True)
#===========================================================

cnn = CNN().cuda()
cnn.load_state_dict(
    torch.load("/home/user/liuhongxing/Mnist_RL/cnn_mnist.pth"))
Q = DeepQNetwork()
Q.load_state_dict(
    torch.load(
        '/home/user/liuhongxing/Mnist_RL/Q_network_exploitation_600_act5.pth'))
optimizer = torch.optim.Adam(Q.parameters(), lr=1e-3)
criterion = nn.MSELoss()
threshold = 0.1
iterations = 70
gamma = 0.9

# writer = SummaryWriter('./log/Q_network_600_act5')
# # training and validation
# print('start Q_network_600_act5 training----------------------')
# for st, data in enumerate(train_dataloader):
#         imgs, labels = data
#         D, pre_label, _ = genenrate_D(imgs,labels)            # compute the distance D between images and the cluster centers
Example #20
if __name__ == '__main__':
    LR = 2.5e-4
    HEIGHT = 84
    WIDTH = 84
    NUM_FRAMES = 4
    TEST_EPISODE = 5

    par_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    MODEL_DIR = os.path.join(par_dir, 'model')
    model_file = os.listdir(MODEL_DIR)[-1]  # the most recently saved model

    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, scale=False, frame_stack=True)
    num_actions = env.action_space.n

    dqn = DeepQNetwork(input_shape=(WIDTH, HEIGHT, NUM_FRAMES), num_actions=num_actions, name='dqn', learning_rate=LR)
    dqn.load(MODEL_DIR, model_file)

    ep_reward = []
    for _ in range(TEST_EPISODE):
        frame = env.reset()  # LazyFrames
        state = np.array(frame)  # ndarray of shape (84, 84, 4)
        done = False
        cur_episode_reward = 0
        while not done:  # run until the episode ends
            action = dqn.get_action(state / 255.0)
            env.render()
            next_frame, reward, done, _ = env.step(action)
            state = np.array(next_frame)
            cur_episode_reward += reward
            time.sleep(0.005)
class DoubleDQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 memory_size, batch_size, algo, env_name, checkpoint_dir,
                 epsilon_min=0.01, epsilon_decay=5e-7, replace_target_count=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.algo = algo
        self.env_name = env_name
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_target_count = replace_target_count
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(memory_size, input_dims, n_actions)

        self.q_net = DeepQNetwork(self.lr, self.n_actions, 
                                  name=self.env_name+'_'+self.algo+'_q_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)
        self.target_net = DeepQNetwork(self.lr, self.n_actions, 
                                  name=self.env_name+'_'+self.algo+'_target_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            observation = T.tensor([observation], dtype=T.float).to(self.q_net.device)
            actions = self.q_net(observation)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def remember(self, state, action, reward, next_state, done):
        self.memory.remember(state, action, reward, next_state, done)

    def sample_memory(self):
        states, actions, rewards, next_states, dones = \
                                self.memory.sample_buffer(self.batch_size)
        
        states = T.tensor(states).to(self.q_net.device)
        actions = T.tensor(actions).to(self.q_net.device)
        rewards = T.tensor(rewards).to(self.q_net.device)
        next_states = T.tensor(next_states).to(self.q_net.device)
        dones = T.tensor(dones).to(self.q_net.device)

        return states, actions, rewards, next_states, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_count == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.epsilon_decay \
                            if self.epsilon > self.epsilon_min else self.epsilon_min
            
    def learn(self):
        if self.memory.memory_counter < self.batch_size:
            return
        
        self.q_net.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, next_states, dones = self.sample_memory()

        q_prediction = self.q_net(states) # (batch_size, *n_actions)
        target_predictions = self.target_net(next_states) # (batch_size, *n_actions)
        target_predictions[dones] = 0.0
        
        indices = np.arange(self.batch_size)
        q_value = q_prediction[indices, actions]

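        # Double DQN: the online network (q_net) picks the greedy next action,
        # while the target network supplies its value estimate.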
        t_actions = T.argmax(self.q_net(next_states), dim=1)
        target_value = rewards + self.gamma * target_predictions[indices, t_actions]

        loss = self.q_net.loss(q_value, target_value).to(self.q_net.device)
        loss.backward()
        self.q_net.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
            
    def save_models(self):
        self.q_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        self.q_net.load_checkpoint()
        self.target_net.load_checkpoint()
Example #22
def runner(node_num):
    # Load checkpoint
    load_path = "weights/weights.ckpt"
    save_path = "weights/weights.ckpt"

    # set seed
    seed = 42
    np.random.seed(seed)
    random.seed(seed)

    # Generate graph for training...
    resources = 1
    # G, reward_save, num_nodes = generate_graph(nodes=node_num, type='gnp_adversarial')
    # G, reward_save, num_nodes = generate_graph(load_dir='../gml/ibm.gml', type='gml')
    G, reward_save, num_nodes = generate_graph(nodes=node_num,
                                               type='random_graph',
                                               seed=42)

    # Pick an arbitrary node to be the root
    root = 0
    # Try plotting. If on ssh, don't bother since there are some necessary plt.draw() commands
    # to plot a networkx graph.
    try:
        plot_graph(G, root, 'rl_graph.png')
    except Exception:
        print('No display')

    # We may want to include the graph laplacian in the observation space
    # Graph laplacian is D - A
    # laplacian_matrix = nx.laplacian_matrix(G).toarray()
    # flat_laplacian = laplacian_matrix.flatten()

    # Build the learning environment
    env = environment(G, [root], resources)
    print('num_edges:', G.number_of_edges())
    print("Ratio Heuristic", ratio_heuristic(G, [root], resources), '\n')

    # Size of our action space (the number of action permutations)
    n_y = len(env.actions_permutations)

    # Initialize DQN
    DQN = DeepQNetwork(
        n_y=n_y,
        n_x=num_nodes,
        resources=resources,
        env=env,
        learning_rate=0.01,
        replace_target_iter=20,
        memory_size=20000,
        batch_size=256,
        reward_decay=0.6,
        epsilon_min=0.1,
        epsilon_greedy_decrement=5e-5,
        # load_path=load_path,
        # save_path=save_path,
        # laplacian=flat_laplacian,
        inner_act_func='leaky_relu',
        output_act_func='leaky_relu')

    episodes = 600
    rewards = []
    total_steps_counter = 0
    episodes_since_max = 0

    optimal_action_sequences = []
    overall_start = time.time()
    # DQN.epsilon = 0.5

    for episode in range(episodes):

        observation, done = env.reset()
        episode_reward = 0
        action_sequence = []
        start = time.time()
        train_time = 0

        while not done:
            # 1. Choose an action based on observation
            action = DQN.choose_action(observation)

            # check for random action
            if action == -1:
                # action = env.random_action()
                # now choose between truly random action and a ratio action
                r = random.random()
                if r < 0.6:
                    action = env.random_action()
                else:
                    action = env.ratio_action()

            # save the taken action
            action_sequence.append(action)

            # print('Chosen action', action)
            # 2. Take the chosen action in the environment
            observation_, reward, done = env.step(action, neg=False)
            # print(observation_, reward, done)
            # 3. Store transition
            DQN.store_transition(observation, action, reward, observation_)

            episode_reward += reward

            if total_steps_counter > 2000:
                # 4. Train
                s = time.time()
                DQN.learn()
                e = time.time()
                train_time += (e - s)

            if done:
                rewards.append(episode_reward)
                max_reward_so_far = np.amax(rewards)

                # if maximum reward so far, save the action sequence
                if episode_reward == max_reward_so_far:
                    optimal_action_sequences.append(
                        (action_sequence, episode_reward))
                    episodes_since_max = 0
                    # DQN.epsilon = 1

                print("==========================================")
                print("Episode: ", episode)
                print("Reward: ", round(episode_reward, 2))
                print("Epsilon: ", round(DQN.epsilon, 2))
                print("Max reward so far: ", max_reward_so_far)

                end = time.time()
                print('Episode time:', end - start)
                start = time.time()
                break

            # Save observation
            observation = observation_

            # Increase total steps
            total_steps_counter += 1

            # if episode == 700:
            #     DQN.epsilon_min = .1
            #     DQN.epsilon = 0.5

        episodes_since_max += 1
        print('train time across episode', train_time)

    overall_end = time.time()

    # TEST Q-Learning
    DQN.epsilon = 0
    DQN.epsilon_min = 0
    observation, done = env.reset()
    final_reward = 0
    action_sequence = []
    while not done:
        action = DQN.choose_action(observation)
        action_sequence.append(action)
        observation_, reward, done = env.step(action, neg=False)

        final_reward += reward
        if done:
            rewards.append(final_reward)
            max_reward_so_far = np.amax(rewards)

            # if maximum reward so far, save the action sequence
            if final_reward == max_reward_so_far:
                optimal_action_sequences.append(
                    (action_sequence, final_reward))
                episodes_since_max = 0
            break

        # Save observation
        observation = observation_

    print('final epsilon=0 reward', final_reward, '\n')

    # TESTING
    # convert our 'best' optimal action sequence to the vector representation, test it for correctness
    opt = optimal_action_sequences[-1][0]
    reward = optimal_action_sequences[-1][1]

    print()
    # print('RL action sequence:')
    env.reset()
    true_r = 0
    for action in opt:
        # print('action index', action)
        # debug will print the action at each step as a vector
        _, r, d = env.step(action, debug=True)
        true_r += r

    results = []
    # if we have a reasonable number of nodes (< 24), we can compute optimal using DP
    if num_nodes < 24:
        dp_time = time.time()
        results.append(DP_optimal(G, [root], resources))
        print('DP Opt: ', results[0])
        dp_time_end = time.time()
        results.append(dp_time_end - dp_time)
        print('DP time: ', results[1])
    else:
        results.append('n/a')
        results.append('n/a')

    random_result = random_heuristic(G, [root], resources)
    print('\n Random Heuristic', random_result, '\n')
    results.append(random_result)

    # Only works on trees
    # print('\n Tree Heuristic:', simulate_tree_recovery(G, resources, root, clean=False), '\n')

    ratio_time_start = time.time()
    print('\n Ratio Heuristic', ratio_heuristic(G, [root], resources))
    ratio_time_end = time.time()
    print('Ratio time:', ratio_time_end - ratio_time_start)
    results.append(ratio_heuristic(G, [root], resources))
    results.append(ratio_time_end - ratio_time_start)

    print('\n reward during training:', reward)
    results.append(reward)
    print('RL method time (s): ', overall_end - overall_start, '\n')
    results.append(overall_end - overall_start)

    plot_bar_x(rewards, 'episode', 'reward_graph.png')
    with open(reward_save, 'w') as f:
        for item in rewards:
            f.write('%s\n' % item)

    return results
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_eval',
                                   chkpt_dir=self.chkpt_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   '_q_next',
                                   chkpt_dir=self.chkpt_dir)

        #self.dim_bechir = self.q_eval.calculate_output_bechir(self.input_dims)

    def choose_action(self, observation):
        """
        Choose an action through an epsilon-greedy approach.
        :param observation: state features as provided by gym environment.
        :return: action
        """
        if np.random.random() > self.epsilon:
            # Convert state to Pytorch tensor and send to q_eval.device
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            # Get actions values from q_eval network
            actions = self.q_eval.forward(state)
            # Get action with highest value
            action = T.argmax(actions).item()
        else:
            # Select random action from action space
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
                           if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        # If memory counter has not reached batch size simply return
        if self.memory.mem_cntr < self.batch_size:
            return

        # reset gradients for the main network's optimizer
        self.q_eval.optimizer.zero_grad()

        # Call function to update target network weights every n steps
        self.replace_target_network()

        # Sample environment transitions from the replay buffer
        states, actions, rewards, states_, dones = self.sample_memory()

        # Get Q(s,a) for the actions performed by the agent.
        # Because we processed a batch of states, we need to index the result of the forward function by the indices of
        # the states (from 0 to batch_size - 1) followed by the index of the action performed by the agent.
        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]

        # Get max Q(s', a') from target network
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        # Set Q(s', a') to zero for terminal states
        q_next[dones] = 0.0
        # Compute the q_target as r + gamma * Q(s',a')
        q_target = rewards + self.gamma * q_next

        # Compute the loss tensor and move it to q_eval.device
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)

        # Backpropagate loss and optimize network parameters
        loss.backward()
        self.q_eval.optimizer.step()

        # Increment training counter
        self.learn_step_counter += 1

        # Decrement epsilon for epsilon-greedy action selection
        self.decrement_epsilon()
Example #24
def train(opt):
    '''This function runs the training loop.'''
    if torch.cuda.is_available(
    ):  # torch.cuda.is_available() shows whether the system supports CUDA
        torch.cuda.manual_seed(
            125
        )  # the CUDA manual seed is set in order to have reproducible results
    else:
        torch.manual_seed(125)  # sets the random number generator from pytorch
    if os.path.isdir(
            opt.log_path):  # check whether the log_path directory already exists
        shutil.rmtree(
            opt.log_path)  # deletes all the content from the log_path directory
    os.makedirs(opt.log_path
                )  # create a new directory at log_path
    new_writer2 = SummaryWriter(
        opt.log_path)  # create a new summary writer with the log path
    environment = Tetris(
        width=opt.width, height=opt.height, block_size=opt.block_size
    )  # sets the environment to the Tetris environment created earlier, with the width, height and block size from the parser
    deepQ_model = DeepQNetwork(
    )  # the model is set to the deep Q network that was created before
    my_optim = torch.optim.Adam(
        deepQ_model.parameters(), lr=opt.lr
    )  # sets up the Adam optimizer with the deep Q model parameters and the learning rate from the parser
    cn = nn.MSELoss(
    )  # mean squared error, equivalent to ((input - target) ** 2).mean()
    state = environment.reset(
    )  # the state is equal to a freshly reset environment
    if torch.cuda.is_available(
    ):  # torch.cuda.is_available() shows whether the system supports CUDA
        deepQ_model.cuda(
        )  # moves the deep Q model to the GPU
        state = state.cuda(
        )  # moves the state tensor to the GPU
    r_memory = deque(
        maxlen=opt.mem_size
    )  # replay memory as a bounded deque with maximum length mem_size from the parser
    epoch = 0  # the epoch counter starts at 0
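    # video writer that records the rendered training episodes; the codec (FMP4),
    # fps and frame size are taken from the parser options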
    output_training_video = cv2.VideoWriter(
        opt.result, cv2.VideoWriter_fourcc(*'FMP4'), opt.fps,
        (int(1.5 * opt.width * opt.block_size), opt.height * opt.block_size))
    while epoch < opt.num_epochs:  # loops while the epoch is less than the number of epochs from the parser

        next_steps = environment.get_next_states(
        )  # the next steps are set to the environment's next states
        epsilon = opt.finalEpsilon + (
            max(opt.decay_epochs - epoch, 0) *
            (opt.initialEpsilon - opt.finalEpsilon) / opt.decay_epochs
        )  # this is for exploration: epsilon anneals linearly from the initial epsilon to the final epsilon over decay_epochs epochs
        pp = random()  # pp is a random number in [0, 1)
        rand_action = pp <= epsilon  # take a random action when pp is less than or equal to epsilon
        nextActions, next_states = zip(
            *next_steps.items()
        )  # unzip the next_steps dict into the possible actions and their resulting states
        next_states = torch.stack(
            next_states
        )  # concatenate the next states into a single tensor along a new dimension
        if torch.cuda.is_available(
        ):  # torch.cuda.is_available() shows whether the system supports CUDA
            next_states = next_states.cuda(
            )  # moves the next states to the GPU
        deepQ_model.eval(
        )  # this pytorch function sets the model to evaluation mode for inference
        with torch.no_grad(
        ):  # torch.no_grad() deactivates the autograd engine, which reduces memory usage and speeds things up
            dqm_p = deepQ_model(
                next_states
            )[:, 0]  # dqm_p holds the model's predicted value for each next state
        deepQ_model.train()  # set the deep Q model back to training mode
        if rand_action:  # if a random action should be taken
            idx = randint(
                0,
                len(next_steps) - 1
            )  # pick a random index among the possible next steps
        else:
            idx = torch.argmax(
                dqm_p).item()  # pick the index with the highest predicted value
        next_state = next_states[
            idx, :]  # the next state is the state at the chosen index
        action = nextActions[idx]  # the action is the one at the chosen index
        reward, done = environment.make_step(
            action, cv2_rend=True
        )  # take the chosen action in the environment, with OpenCV rendering enabled, and get the reward and done flag
        if torch.cuda.is_available(
        ):  # torch.cuda.is_available() shows whether the system supports CUDA
            next_state = next_state.cuda(
            )  # moves the next state to the GPU
        r_memory.append([
            state, reward, next_state, done
        ])  # append the transition (state, reward, next_state, done) to the replay memory
        if done:  # if the episode is done
            output_training_video.release()
            episode_durations.append(epoch + 1)
            #plot_durations()
            final_total_score = environment.player_score  # the final total score is the environment's player score
            tot_reward.append(final_total_score)
            plot_reward()
            final_total_blocks = environment.tetris_blocks  # the final total blocks are the environment's tetris blocks
            final_total_completed_lines = environment.completed_lines  # the final total completed lines are the environment's completed lines
            state = environment.reset(
            )  # state is set to a freshly reset environment
            if torch.cuda.is_available(
            ):  # torch.cuda.is_available() shows whether the system supports CUDA
                state = state.cuda(
                )  # moves the state to the GPU
        else:
            state = next_state  # the state becomes the next state
            continue
        if len(
                r_memory
        ) < opt.mem_size / 10:  # if the replay memory holds less than a tenth of its capacity
            continue  # keep collecting transitions before training
        epoch += 1  # increment the epoch
        batch = sample(
            r_memory, min(len(r_memory), opt.mini_batch_size)
        )  # sample a mini-batch from the replay memory, at most mini_batch_size transitions
        stateBatch, batchReward, nextB_state, completed_batch = zip(
            *batch
        )  # unzip the batch into state, reward, next-state and done tuples
        stateBatch = torch.stack(
            tuple(state for state in stateBatch)
        )  # stack the states of the batch into a single tensor
        batchReward = torch.from_numpy(
            np.array(batchReward, dtype=np.float32)[:, None]
        )  # convert the rewards to a float32 tensor with an extra column dimension
        nextB_state = torch.stack(
            tuple(state for state in nextB_state)
        )  # stack the next states of the batch into a single tensor
        if torch.cuda.is_available(
        ):  # torch.cuda.is_available() shows whether the system supports CUDA
            stateBatch = stateBatch.cuda(
            )  # moves the state batch to the GPU
            batchReward = batchReward.cuda(
            )  # moves the reward batch to the GPU
            nextB_state = nextB_state.cuda(
            )  # moves the next-state batch to the GPU
        q_values = deepQ_model(
            stateBatch)  # the Q-values the model predicts for the state batch
        deepQ_model.eval()  # sets the model to evaluation mode for inference
        with torch.no_grad(
        ):  # torch.no_grad() deactivates the autograd engine, which reduces memory usage and speeds things up
            nextPred_batch = deepQ_model(
                nextB_state
            )  # the model's predictions for the next-state batch
        deepQ_model.train()  # sets the model back to training mode
        batch_Y = torch.cat(
            tuple(reward if done else reward + opt.gamma * prediction
                  for reward, done, prediction in zip(
                      batchReward, completed_batch, nextPred_batch))
        )[:,
          None]  # the target batch_Y is the reward for terminal transitions, otherwise the reward plus gamma times the predicted value of the next state
        my_optim.zero_grad()  # reset the gradients before processing this mini batch
        loss = cn(q_values, batch_Y)  # loss between predicted Q-values and Bellman targets
        loss.backward()  # backpropagate: compute d(loss)/dx for every parameter with requires_grad=True
        my_optim.step()  # update the parameters using the current gradients
        print(
            "Epoch Num: {}/{}, Action: {}, Score: {}, TPieces {}, Cleared lines: {}"
            .format(epoch, opt.num_epochs, action, final_total_score,
                    final_total_blocks, final_total_completed_lines)
        )  # log progress for every training epoch
        # log TensorBoard summary scalars (value, global step = epoch - 1)
        new_writer2.add_scalar('Train/Score', final_total_score, epoch - 1)
        new_writer2.add_scalar('Train/TPieces', final_total_blocks, epoch - 1)
        new_writer2.add_scalar('Train/Cleared lines', final_total_completed_lines,
                               epoch - 1)
        if epoch > 0 and epoch % opt.store_interval == 0:  # periodically checkpoint the model
            torch.save(deepQ_model,
                       "{}/tetris_{}".format(opt.saved_path, epoch))  # intermediate checkpoint tagged with the epoch
    torch.save(deepQ_model,
               "{}/tetris".format(opt.saved_path))  # save the final trained model
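
The target construction above is the standard Bellman update: for a terminal transition the target is just the reward, otherwise it is the reward plus gamma times the value predicted for the next state. A minimal, self-contained sketch of the same computation on toy tensors (variable names here are illustrative, not the Tetris script's own):

import torch

def bellman_targets(rewards, dones, next_values, gamma=0.99):
    # rewards and next_values: float tensors of shape (batch, 1); dones: bool tensor of shape (batch,)
    targets = rewards + gamma * next_values   # non-terminal transitions bootstrap from the next state
    targets[dones] = rewards[dones]           # terminal transitions: the target is the reward alone
    return targets

# toy usage
rewards = torch.tensor([[1.0], [0.5], [2.0]])
dones = torch.tensor([False, True, False])
next_values = torch.tensor([[3.0], [4.0], [1.0]])
print(bellman_targets(rewards, dones, next_values))  # -> [[3.97], [0.50], [2.99]]
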
Example #25
0
            RL.save_experience(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        gamma=0.9,
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_loss()
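
Example #25 follows the usual DQN pattern: every transition is stored with save_experience, and learning starts only after a warm-up of 200 steps and then runs every fifth step, so the network trains on a reasonably full buffer. The replay memory behind save_experience is not shown here; below is a generic ring-buffer sketch of the kind such agents typically use (illustrative only, not this repo's implementation):

import numpy as np

class SimpleReplayBuffer:
    """Fixed-size ring buffer for (state, action, reward, next_state) transitions."""
    def __init__(self, capacity, n_features):
        self.capacity = capacity
        self.counter = 0
        # each row stores [state, action, reward, next_state]
        self.storage = np.zeros((capacity, n_features * 2 + 2), dtype=np.float32)

    def store(self, s, a, r, s_):
        row = np.hstack((s, [a, r], s_))
        self.storage[self.counter % self.capacity] = row  # overwrite the oldest entry when full
        self.counter += 1

    def sample(self, batch_size):
        upper = min(self.counter, self.capacity)
        idx = np.random.choice(upper, size=batch_size)
        return self.storage[idx]
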
Example #26
0
from config_utils import read_main_config
from deep_q_network import DeepQNetwork
from gym_wrapper import GymWrapper


from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

config = read_main_config()
gym_wrapper = GymWrapper(config['general']['scenario'])
deep_q_network = DeepQNetwork(config, gym_wrapper)
deep_q_network.train()
deep_q_network.test(episodes=3)
Example #27
0
gamma = 0.999  # Is the discount factor used in the Bellman equation
eps_start = SaveLoadModule.get_epsilon_start_point()  # Starting value of epsilon
eps_end = 0.2  # Ending value of epsilon
eps_decay = 0.0001  # Decay rate we’ll use to decay epsilon over time
target_update = 10  # How frequently, in terms of episodes, we’ll update the target network weights with the policy network weights.
memory_size = 300  # Capacity of the replay memory
lr = 0.001  # Learning rate
num_episodes = 500  # Number of episodes we want to play
last_training_episode = SaveLoadModule.get_most_advanced_episode()
environment_manager = EnvManager('SpaceInvaders-v0')
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, environment_manager.num_actions_available())
memory = ReplayMemory(memory_size)
policy_net = DeepQNetwork(
    input_shape=(environment_manager.get_input_shape(), ),
    action_space=environment_manager.num_actions_available(),
    batch_size=batch_size)
target_net = DeepQNetwork(
    input_shape=(environment_manager.get_input_shape(), ),
    action_space=environment_manager.num_actions_available(),
    batch_size=batch_size)
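
# Note: the EpsilonGreedyStrategy used above is defined elsewhere in this project and is
# not shown in the snippet. As an illustrative sketch only (the class name, the
# get_exploration_rate() method and the exponential schedule below are assumptions, not
# this repo's confirmed API), such a strategy commonly looks like:
import math

class EpsilonGreedyStrategySketch:
    def __init__(self, start, end, decay):
        self.start = start   # initial exploration rate
        self.end = end       # floor that epsilon never drops below
        self.decay = decay   # how quickly epsilon shrinks per step

    def get_exploration_rate(self, current_step):
        # exponential decay from `start` towards `end` as training progresses
        return self.end + (self.start - self.end) * math.exp(-current_step * self.decay)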

max_reward = 0
# Episode loop
for episode in range(last_training_episode, num_episodes):
    max_episode_reward = 0
    environment_manager.reset()
    state = environment_manager.get_state()
    environment_manager.done = False

    # Steps loop
Example #28
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze(arg.mazeSize, arg.mazeSize)
    RL = DeepQNetwork(n_actions=len(env.action_space),
                      n_features=1,  # len(env.position) ==> when state = (x, y) representation
                      # env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      e_greedy_increment=0.01,
                      hidden_layers=[10, 10],
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #29
0
class Agent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_counter = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)

    def store_transition(self, state, action, reward, resulted_state, done):
        self.memory.store_transition(state, action, reward, resulted_state,
                                     done)

    def sample_memory(self):
        state, action, reward, resulted_state, done = self.memory.sample_buffer(
            self.batch_size)
        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        resulted_states = T.tensor(resulted_state).to(self.q_eval.device)

        return states, actions, rewards, resulted_states, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # wrap the observation in a list to add a batch dimension: the convolutional
            # network expects an input tensor of shape (batch_size, *input_dims)
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            _, advantages = self.q_eval.forward(state)
            action = T.argmax(advantages).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def replace_target_network(self):
        if self.replace_target_counter is not None and \
            self.learn_step_counter % self.replace_target_counter == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, resulted_states, dones = self.sample_memory()

        indexes = np.arange(self.batch_size)

        # dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
        # evaluated with the online network for the actions actually taken
        V_states, A_states = self.q_eval.forward(states)
        q_pred = T.add(
            V_states, (A_states - A_states.mean(dim=1, keepdim=True)))[indexes,
                                                                       actions]

        # the same aggregation with the target network, maximised over actions
        V_resulted_states, A_resulted_states = self.q_next.forward(
            resulted_states)
        q_next = T.add(
            V_resulted_states,
            (A_resulted_states -
             A_resulted_states.mean(dim=1, keepdim=True))).max(dim=1)[0]
        q_next[dones] = 0.0  # terminal next states contribute no future value

        target = rewards + self.gamma * q_next  # Bellman target

        loss = self.q_eval.loss(target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
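
The Agent in Example #29 exposes a small surface: choose_action, store_transition, learn and save_models. A minimal sketch of how it would typically be driven with a Gym-style environment, assuming the classic 4-tuple gym step API; the environment id, shapes and hyperparameters below are placeholders, not values taken from this repo:

import gym

env = gym.make('CartPole-v1')  # placeholder environment id
agent = Agent(gamma=0.99, epsilon=1.0, lr=1e-4,
              n_actions=env.action_space.n,
              input_dims=env.observation_space.shape,
              mem_size=50000, batch_size=32,
              algo='DuelingDQN', env_name='CartPole-v1')

for episode in range(500):
    observation = env.reset()
    done, score = False, 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()  # returns immediately until the buffer holds batch_size transitions
        observation = observation_
        score += reward
    if episode % 50 == 0:
        agent.save_models()  # checkpoints go to checkpoint_dir ('tmp/dqn' by default)
        print('episode', episode, 'score', score, 'epsilon', agent.epsilon)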