def testReplayMemory(self):
    od = [84, 84, 4]   # observation shape
    ad = [8, 10]       # action shape
    rd = [5]           # reward shape
    s = 10000          # buffer capacity
    b = 32             # batch size
    rm = ReplayMemory(obs_dim=od, act_dim=ad, r_dim=rd, size=s)

    o = self.get_rand(od)
    a = self.get_rand(ad)
    r = self.get_rand(rd)
    d = 0
    for _ in range(1000):
        rm.store(o, a, r, o, d)

    o_s, a_s, r_s, on_s, d_s = rm.sample(b)
    self.assertEqual(o_s.shape, combined_shape(b, od))
    self.assertEqual(a_s.shape, combined_shape(b, ad))
    self.assertEqual(r_s.shape, combined_shape(b, rd))
    self.assertEqual(on_s.shape, combined_shape(b, od))
    self.assertEqual(d_s.shape, combined_shape(b))
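# The test above assumes a NumPy-backed buffer with store()/sample() and a
# combined_shape() helper, neither of which is shown here. The following is a
# minimal sketch of plausible implementations, with names and signatures
# inferred from the test rather than taken from the original code.
import numpy as np


def combined_shape(length, shape=None):
    # (length,) if no shape is given, else (length, *shape)
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)


class ReplayMemory:
    def __init__(self, obs_dim, act_dim, r_dim, size):
        self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(combined_shape(size, r_dim), dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # overwrite the oldest entries once the buffer is full
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = next_obs
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return (self.obs_buf[idxs], self.act_buf[idxs], self.rew_buf[idxs],
                self.obs2_buf[idxs], self.done_buf[idxs])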
import copy
import math
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# DQN, QMixer, ReplayMemory, Transition and PairAlgorithm are project-local
# modules not shown in this snippet.


class Agent:
    def __init__(self,
                 env,
                 input_size,
                 output_size,
                 hidden_size,
                 max_cars=10,
                 max_passengers=10,
                 mix_hidden=32,
                 batch_size=128,
                 lr=0.001,
                 gamma=.999,
                 eps_start=0.9,
                 eps_end=0.05,
                 eps_decay=750,
                 replay_capacity=10000,
                 num_save=200,
                 num_episodes=10000,
                 mode="random",
                 training=False,
                 load_file=None):
        self.env = env
        self.orig_env = copy.deepcopy(env)
        self.grid_map = env.grid_map
        self.cars = env.grid_map.cars
        self.num_cars = len(self.cars)
        self.passengers = env.grid_map.passengers
        self.num_passengers = len(self.passengers)
        self.max_cars = max_cars
        self.max_passengers = max_passengers
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.replay_capacity = replay_capacity
        self.num_episodes = num_episodes
        self.steps_done = 0
        self.lr = lr
        self.mode = mode
        self.num_save = num_save
        self.training = training
        self.algorithm = PairAlgorithm()

        self.episode_durations = []
        self.duration_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.count_matrix = np.zeros((self.max_passengers, self.max_cars))
        self.loss_history = []
        self.memory = ReplayMemory(self.replay_capacity)

        self.device = torch.device("cpu")
        print("Device being used:", self.device)

        self.policy_net = DQN(self.input_size, self.output_size,
                              self.hidden_size).to(self.device)
        self.params = list(self.policy_net.parameters())

        if self.mode == "qmix":
            self.mixer = QMixer(self.input_size, self.max_passengers,
                                mix_hidden).to(self.device)
            self.params += list(self.mixer.parameters())

        if load_file:
            self.policy_net.load_state_dict(torch.load(load_file))
            if self.mode == "qmix":
                self.mixer.load_state_dict(torch.load("mixer_" + load_file))
                self.mixer.eval()
            self.policy_net.eval()
            self.load_file = "Pretrained_" + load_file
            print("Checkpoint loaded")
        else:
            self.load_file = self.mode + "_model_num_cars_" + str(self.num_cars) + \
                "_num_passengers_" + str(self.num_passengers) + \
                "_num_episodes_" + str(self.num_episodes) + \
                "_hidden_size_" + str(self.hidden_size) + ".pth"

        self.optimizer = optim.RMSprop(self.params, lr=self.lr)
        #self.optimizer = optim.Adam(self.params, lr=self.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 1500, gamma=0.1)

    def select_action(self, state):
        # Select action with epsilon greedy
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        print(eps_threshold)
        self.steps_done += 1
        if not self.training:
            eps_threshold = 0.0

        if sample > eps_threshold:
            # Choose best action
            with torch.no_grad():
                self.policy_net.eval()
                action = self.policy_net(state).view(
                    self.max_passengers,
                    self.max_cars)[:, :self.num_cars].max(1)[1].view(
                        1, self.max_passengers)
                action[0, self.num_passengers:] = self.max_cars
                return action
        else:
            # Choose random action
            action = torch.tensor(
                [[random.randrange(self.num_cars)
                  for car in range(self.max_passengers)]],
                device=self.device,
                dtype=torch.long)
            action[0, self.num_passengers:] = self.max_cars
            return action

    def random_action(self, state):
        return torch.tensor(
            [[random.randrange(self.num_cars)
              for car in range(self.num_passengers)]],
            device=self.device,
            dtype=torch.long)

    def get_state(self):
        # Cars (px, py, 1=matched), Passengers (pickup_x, pickup_y, dest_x, dest_y, 1=matched)
        # Vector size = 3*C + 5*P
        cars = self.cars
        passengers = self.passengers
        indicator_cars_vec = np.zeros(self.max_cars)
        indicator_passengers_vec = np.zeros(self.max_passengers)

        # Encode information about cars
        cars_vec = np.array([0] * (2 * self.max_cars))
        for i, car in enumerate(cars):
            cars_vec[2 * i:2 * i + 2] = [car.position[0], car.position[1]]
            indicator_cars_vec[i] = 1

        # Encode information about passengers
        passengers_vec = np.array([0] * (4 * self.max_passengers))
        for i, passenger in enumerate(passengers):
            passengers_vec[4 * i:4 * i + 4] = [
                passenger.pick_up_point[0], passenger.pick_up_point[1],
                passenger.drop_off_point[0], passenger.drop_off_point[1]
            ]
            indicator_passengers_vec[i] = 1

        return torch.tensor(np.concatenate(
            (cars_vec, indicator_cars_vec, passengers_vec,
             indicator_passengers_vec)),
                            device=self.device,
                            dtype=torch.float).unsqueeze(0)

    def train(self):
        duration_sum = 0.0
        for episode in range(self.num_episodes):
            self.reset_different_num()
            #self.reset()
            #self.reset_orig_env()
            state = self.get_state()

            if self.mode == "dqn" or self.mode == "qmix":
                action = self.select_action(state)
            elif self.mode == "random":
                action = self.random_action([state])
            elif self.mode == "greedy":
                action = [self.algorithm.greedy_fcfs(self.grid_map)]
                action = torch.tensor(action,
                                      device=self.device,
                                      dtype=torch.long)

            #print(action.size())
            #print(action[:,:self.num_passengers])
            reward, duration = self.env.step(action[:, :self.num_passengers],
                                             self.mode)
            if self.mode == "dqn":
                reward.extend([0] * (self.max_passengers - self.num_passengers))

            self.episode_durations.append(duration)
            count = self.count_matrix[self.num_passengers - 1, self.num_cars - 1]
            self.duration_matrix[self.num_passengers - 1, self.num_cars - 1] = \
                self.duration_matrix[self.num_passengers - 1, self.num_cars - 1] * \
                (count / (count + 1)) + duration / (count + 1)
            self.count_matrix[self.num_passengers - 1, self.num_cars - 1] += 1
            duration_sum += duration

            if self.training:
                self.memory.push(
                    state, action,
                    torch.tensor(reward, device=self.device,
                                 dtype=torch.float).unsqueeze(0))
                self.optimize_model()
                self.plot_durations(self.mode)
                self.plot_loss_history(self.mode)

            if self.training and episode % self.num_save == 0:
                torch.save(self.policy_net.state_dict(),
                           "episode_" + str(episode) + "_" + self.load_file)
                if self.mode == "qmix":
                    torch.save(
                        self.mixer.state_dict(),
                        "mixer_episode_" + str(episode) + "_" + self.load_file)
                print("Checkpoint saved")

            print("Episode: ", episode)

        if self.training:
            torch.save(self.policy_net.state_dict(), self.load_file)
            if self.mode == "qmix":
                torch.save(self.mixer.state_dict(), "mixer_" + self.load_file)
            print("Checkpoint saved")
print("Average duration was ", duration_sum / self.num_episodes) print("Finished") np.save("Duration_matrix", self.duration_matrix) np.save("Count_matrix", self.count_matrix) print(self.duration_matrix) print(self.count_matrix) def reset(self): self.env.reset() self.grid_map = self.env.grid_map self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers def reset_different_num(self): self.env.grid_map.cars = [] self.env.grid_map.passengers = [] self.env.grid_map.num_passengers = random.randint( 1, self.max_passengers) self.env.grid_map.num_cars = random.randint(1, self.max_cars) self.env.grid_map.add_passenger(self.env.grid_map.num_passengers) self.env.grid_map.add_cars(self.env.grid_map.num_cars) self.grid_map = self.env.grid_map self.num_passengers = self.env.grid_map.num_passengers self.num_cars = self.env.grid_map.num_cars self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers def reset_orig_env(self): self.env = copy.deepcopy(self.orig_env) self.grid_map = self.env.grid_map self.cars = self.env.grid_map.cars self.passengers = self.env.grid_map.passengers self.grid_map.init_zero_map_cost() def optimize_model(self): if len(self.memory) < self.batch_size: return transitions = self.memory.sample(self.batch_size) batch = Transition(*zip(*transitions)) state_batch = torch.cat(batch.state) action_batch = torch.cat(batch.action) reward_batch = torch.cat(batch.reward) self.policy_net.train() q_values = self.policy_net(state_batch).view(self.batch_size, self.max_passengers, self.max_cars) q_values = torch.cat((q_values, torch.zeros( (self.batch_size, self.max_passengers, 1), device=self.device)), 2) state_action_values = q_values.gather( 2, action_batch.unsqueeze(2)).squeeze() # Compute the expected Q values expected_state_action_values = reward_batch # Compute Huber loss if self.mode == "dqn": loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) elif self.mode == "qmix": self.mixer.train() chosen_action_qvals = self.mixer(state_action_values, state_batch) loss = F.smooth_l1_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1)) #loss = F.mse_loss(chosen_action_qvals, reward_batch.view(-1, 1, 1)) self.loss_history.append(loss.item()) # Optimize the model self.optimizer.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optimizer.step() def plot_durations(self, filename): print("Saving durations plot ...") plt.figure(2) plt.clf() total_steps = np.array(self.episode_durations) N = len(total_steps) window_size = 200 if N < window_size: total_steps_smoothed = total_steps else: total_steps_smoothed = np.zeros(N - window_size) for i in range(N - window_size): window_steps = total_steps[i:i + window_size] total_steps_smoothed[i] = np.average(window_steps) plt.title('Episode Duration history') plt.xlabel('Episode') plt.ylabel('Duration') plt.plot(total_steps_smoothed) np.save("Duration_" + filename, total_steps_smoothed) #plt.savefig("Durations_history_" + filename) def plot_loss_history(self, filename): print("Saving loss history ...") plt.figure(2) plt.clf() #loss = torch.tensor(self.loss_history, dtype=torch.float) total_loss = np.array(self.loss_history) N = len(total_loss) window_size = 50 if N < window_size: total_loss_smoothed = total_loss else: total_loss_smoothed = np.zeros(N - window_size) for i in range(N - window_size): window_steps = total_loss[i:i + window_size] total_loss_smoothed[i] = np.average(window_steps) plt.title('Loss history') plt.xlabel('Episodes') 
        plt.ylabel('Loss')
        plt.plot(self.loss_history)
        np.save("Loss_" + filename, total_loss_smoothed)
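# The epsilon-greedy schedule in Agent.select_action() decays exponentially
# from eps_start to eps_end with time constant eps_decay. A self-contained
# sketch of the same formula, handy for checking how fast exploration anneals
# under the defaults above (the step values below are purely illustrative):
import math


def eps_threshold(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=750):
    return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)


if __name__ == "__main__":
    for step in (0, 500, 1000, 2000, 5000):
        print(step, round(eps_threshold(step), 3))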
class RLAgent(Player):
    def __init__(self,
                 name,
                 others=None,
                 last_n=10,
                 load_path=None,
                 checkpoint=5000,
                 fixed_strategy=False,
                 eps_decay=0.00005):
        if others is None:
            others = [1, 2]
        self.others = others
        self.last_n = last_n
        self.prev_points = 0
        self.batch_size = 32
        self.gamma = 0.9
        self.eps_start = 1
        self.eps_end = 0.01
        self.eps_decay = eps_decay
        self.target_update = 100
        self.plot_at = 1000
        self.q_max = []
        self.q_list = []
        self.checkpoint = checkpoint
        self.memory_size = 1000
        self.lr = 0.00001
        self.train = True
        self.input_dim = len(others) * 6
        self.output_dim = 3
        self.current_step = 1
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.memory = ReplayMemory(self.memory_size)

        # Initialize the policy and target networks
        self.policy_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net = DQN(self.input_dim, self.output_dim).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        if load_path is not None:
            checkpoint = torch.load(load_path)
            self.policy_net.load_state_dict(checkpoint['model_state_dict'])
            self.policy_net.eval()
            self.eps_start = 0
            self.eps_end = 0
            self.train = False

        if fixed_strategy:
            self.strategy = FixedStrategy()
        else:
            self.strategy = EpsilonGreedyStrategy(self.eps_start, self.eps_end,
                                                  self.eps_decay)

        # Set the optimizer
        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=self.lr)
        self.loss = None

        # Push to replay memory
        self.prev_state = None
        self.action = None
        self.reward = None
        self.current_state = None

        super().__init__(name)

    def select_action(self, valid_actions, history):
        # print(self.memory.can_provide_sample(self.batch_size))
        if self.memory.can_provide_sample(self.batch_size) and self.train:
            self.train_model()

        if len(history) > self.last_n + 1:
            self.prev_state, self.current_state = self.get_states(history)
            self.reward = self.get_reward()
            if self.action is not None and self.train:
                self.memory.push(
                    Experience(self.prev_state, self.action,
                               self.current_state, self.reward))
            self.action = self.get_action(valid_actions)
            return self.action.item()
        else:
            return np.random.choice(valid_actions)

    def get_states(self, history):
        prev_state, current_state = [], []
        if len(history) > self.last_n + 1:
            for other in self.others:
                other_history = [i[other] for i in history]
                other_last_n = other_history[-self.last_n:]
                other_last_n_p = other_history[-self.last_n - 1:-1]
                other_policy_total = get_policy(other_history)
                other_policy_last_n = get_policy(other_last_n)
                other_policy_total_p = get_policy(other_history[:-1])
                other_policy_last_n_p = get_policy(other_last_n_p)
                prev_state.extend(other_policy_total_p + other_policy_last_n_p)
                current_state.extend(other_policy_total + other_policy_last_n)
            return torch.as_tensor(prev_state).unsqueeze(-2), torch.as_tensor(
                current_state).unsqueeze(-2)

    def get_reward(self):
        reward = self.points - self.prev_points
        self.prev_points = self.points
        return torch.tensor([reward])

    def get_action(self, valid_actions):
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            # For random, we can pass the allowable_moves vector and choose from it randomly
            action = np.random.choice(valid_actions)
            return torch.tensor([action]).to(self.device)  # explore
        else:
            with torch.no_grad():
                self.q_max.append(
                    self.policy_net(self.current_state).max().item())
                return self.policy_net(self.current_state).max(1)[1].to(
                    self.device)  # exploit

    def train_model(self):
        experiences = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states = extract_tensors(experiences)

        if self.current_step % self.target_update == 0:
            print('UPDATE TARGET NET', self.current_step)
            self.q_list.extend(self.q_max)
            print('Q Max', sum(self.q_max) / self.target_update)
            q_max_list.append(sum(self.q_max) / self.target_update)
            self.q_max = []
            self.target_net.load_state_dict(self.policy_net.state_dict())

        if self.current_step % self.plot_at == 0:
            e_ = self.memory.memory[-100:]
            batch = Experience(*zip(*e_))
            print('\n', '*' * 42)
            print('EXPLORATION RATE',
                  self.strategy.get_exploration_rate(self.current_step))
            print('REWARD', sum(batch.reward).item())
            print('POLICY', get_policy([i.item() for i in batch.action]))
            print('*' * 42, '\n')
            plt.plot(range(len(q_max_list)), q_max_list)
            plt.show()

        if self.current_step % self.checkpoint == 0:
            print('SAVE CHECKPOINT AT', self.current_step)
            # q_max_list, checkpoint_folder, checkpoint_prefix and
            # checkpoint_suffix are module-level names defined elsewhere
            checkpoint_path = checkpoint_folder + checkpoint_prefix + str(
                self.current_step) + checkpoint_suffix
            torch.save({'model_state_dict': self.policy_net.state_dict()},
                       checkpoint_path)

        current_q_values = QValues.get_current(self.policy_net, states, actions)
        next_q_values = QValues.get_next(self.policy_net, self.target_net,
                                         next_states)
        target_q_values = (next_q_values * self.gamma) + rewards

        self.loss = F.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
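# RLAgent relies on an Experience tuple and an extract_tensors() helper that
# are not shown in this snippet. A minimal sketch of what they might look like,
# matching how select_action() pushes (state, action, next_state, reward) and
# how train_model() unpacks states, actions, rewards, next_states; these are
# assumptions, not the original implementations.
from collections import namedtuple

import torch

Experience = namedtuple('Experience',
                        ('state', 'action', 'next_state', 'reward'))


def extract_tensors(experiences):
    # transpose a list of Experiences into one Experience of batched tensors
    batch = Experience(*zip(*experiences))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    next_states = torch.cat(batch.next_state)
    return states, actions, rewards, next_states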
class DQNAgent:
    def __init__(self, inputs, n_actions):
        self.brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain = DeepQNetwork(inputs, 16, 16, outputNum=n_actions)
        self.target_brain.load_state_dict(self.brain.state_dict())
        self.target_brain.eval()
        self.set_params()
        self.optimizer = torch.optim.Adam(self.brain.parameters())
        self.memory = ReplayMemory(50000)
        self.action_space = [0, 1]

    def set_params(self):
        self.batch_size = 64
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.05
        self.exploration_decay_rate = 0.0005
        self.steps_done = 0

    def select_action(self, state):
        sample = np.random.random()
        exploration_rate = self.min_exploration_rate + (
            self.max_exploration_rate - self.min_exploration_rate) * np.exp(
                -self.steps_done * self.exploration_decay_rate)
        self.steps_done += 1
        if sample > exploration_rate:
            with torch.no_grad():
                actions = self.brain(state)
                return torch.argmax(actions).item()
        else:
            return np.random.choice(self.action_space)

    def learn(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch.next_state)),
            dtype=torch.bool,
        )
        non_final_next_states = torch.tensor(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.tensor(batch.state)
        action_batch = torch.tensor(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float)

        state_action_values = self.brain(state_batch).gather(
            1, action_batch.unsqueeze(-1))

        # target-network bootstrap; terminal states keep a value of zero
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_brain(
            non_final_next_states).max(1)[0].detach()

        gamma = 0.99
        expected_state_action_values = (gamma * next_state_values +
                                        reward_batch / reward_batch.max())

        self.loss = torch.nn.MSELoss()(
            expected_state_action_values.unsqueeze(-1), state_action_values)

        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
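# DQNAgent.learn() assumes a Transition namedtuple and a capacity-bounded
# ReplayMemory with push()/sample()/__len__, in the style of the standard
# PyTorch DQN tutorial. Neither is defined in this snippet, so the following is
# a plausible sketch rather than the original implementation.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        # saves a transition, discarding the oldest once capacity is reached
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)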
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 p=1.0,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 target_update=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        q = (1.0 - p) / 2
        self.stochastic_actions = {
            '←': [[0, 2, 3], [p, q, q]],
            '→': [[1, 2, 3], [p, q, q]],
            '↑': [[2, 0, 1], [p, q, q]],
            '↓': [[3, 0, 1], [p, q, q]]
        }
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(in_features=self.state_len,
                        hidden=[],
                        out_features=len(Agent.actions))
        self.target_nn = Model(in_features=self.state_len,
                               hidden=[],
                               out_features=len(Agent.actions))
        self.target_nn.load_state_dict(self.nn.state_dict())
        self.target_nn.eval()
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.05)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size
        self.target_update = target_update

    def step(self, state, action):
        # simulating Markov Process, desired action happens with probability p
        # but with the probability (1-p) / 2 the agent goes sideways
        sa = self.stochastic_actions[action]
        mp_action = np.random.choice(sa[0], p=sa[1])
        action = Agent.actions[mp_action]
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q_policy(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.state_len)
        z[s] = 1
        return torch.tensor(z, dtype=torch.float)

    def _predict_q_policy(self, s):
        return self.nn(self._encode_state(s))

    def _predict_q_target(self, s):
        return self.target_nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q_policy(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q_target(s1).max().item()
            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        episode_number = len(self.rewards)
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q_policy(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
        if episode_number % self.target_update == 0:
            self.target_nn.load_state_dict(self.nn.state_dict())
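# The gridworld agents above and below construct Model(in_features=...,
# hidden=[...], out_features=...), but the class itself is not shown. A minimal
# sketch of an MLP with that constructor signature (an assumption, not the
# original code): an empty hidden list yields a single linear layer.
import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, in_features, hidden, out_features):
        super().__init__()
        layers = []
        last = in_features
        for h in hidden:
            layers += [nn.Linear(last, h), nn.ReLU()]
            last = h
        layers.append(nn.Linear(last, out_features))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)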
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=25,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.state_len = env.width * env.height
        self.nn = Model(in_features=2,
                        hidden=[self.state_len, self.state_len],
                        out_features=len(Agent.actions))
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.nn.parameters(), lr=0.01)
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self._predict_q(s)
                a = torch.argmax(q_predicted, 0).item()
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        # one-hot alternative:
        # z = np.zeros(self.state_len)
        # z[s] = 1
        # return torch.tensor(z, dtype=torch.float)
        w = self.env.width
        x, y = s % w, s // w
        return torch.tensor([x, y], dtype=torch.float)

    def _predict_q(self, s):
        return self.nn(self._encode_state(s))

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self._predict_q(s)
            q_target = q_predicted.clone().detach()
            q_target[a] = r + self.y * self._predict_q(s1).max().item()
            loss = self.criterion(q_predicted, q_target)
            self.losses.append(loss.item())
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self._predict_q(s)
            a = torch.argmax(q_predicted, 0).item()
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
class Agent(AbstractAgent):
    actions = ['←', '→', '↑', '↓']

    def __init__(self,
                 env,
                 model,
                 lr=0.8,
                 y=0.95,
                 step_cost=.0,
                 living_cost=.0,
                 episode_length=100,
                 memory_capacity=100,
                 batch_size=10,
                 eps=0.5,
                 eps_decay=0.999):
        AbstractAgent.__init__(self, eps, eps_decay)
        self.env = env
        self.model = model
        self.lr = lr
        self.y = y
        self.step_cost = step_cost
        self.living_cost = living_cost
        self.s0 = env.field.index('s')
        self.episode_length = episode_length
        self.rewards = []
        self.losses = []
        self.memory = ReplayMemory(memory_capacity)
        self.batch_size = batch_size

    def step(self, state, action):
        return self.env.step(state, action)

    def print_policy(self):
        for y in range(self.env.height):
            for x in range(self.env.width):
                s = y * self.env.width + x
                cell = self.env.field[s]
                if not (cell == '.' or cell == 's'):
                    print(cell, end='')
                    continue
                q_predicted = self.predict_q(s)
                a = np.argmax(q_predicted)
                print(Agent.actions[a], end='')
            print()

    def _encode_state(self, s):
        z = np.zeros(self.env.length)
        z[s] = 1.0
        return np.array([z])

    def predict_q(self, s):
        return self.model.predict(self._encode_state(s))[0]

    def optimize(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        for s, a, s1, r in transitions:
            q_predicted = self.predict_q(s)
            q_target = q_predicted
            q_target[a] = r + self.y * self.predict_q(s1).max()
            history = self.model.fit(x=self._encode_state(s),
                                     y=np.array([q_target]),
                                     epochs=1,
                                     verbose=False)
            self.losses.append(history.history["loss"][-1])

    def run_episode(self):
        AbstractAgent.run_episode(self)
        s = self.s0
        self.rewards.append(.0)
        for j in range(self.episode_length):
            q_predicted = self.predict_q(s)
            a = np.argmax(q_predicted)
            a = self.select_action(a)
            s1, r, over = self.step(s, Agent.actions[a])
            if s != s1:
                r -= self.step_cost
            r -= self.living_cost
            self.memory.push(s, a, s1, r)
            s = s1
            self.optimize()
            self.rewards[-1] += r
            if over:
                break
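# The Keras-style Agent above expects a model with predict()/fit() to be passed
# in by the caller; the model itself is not part of this snippet. A sketch of a
# compatible network for an environment with `length` states and four actions;
# the layer sizes, optimizer, and helper name build_model are illustrative
# assumptions, not the original code.
from tensorflow import keras


def build_model(num_states, num_actions=4, lr=0.001):
    model = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_shape=(num_states,)),
        keras.layers.Dense(num_actions, activation='linear'),
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
                  loss='mse')
    return model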
def train(args):
    device = torch.device("cuda" if args.gpu else "cpu")

    env = Environment(draw=False,
                      fps=args.fps,
                      debug=args.debug,
                      dist_to_pipe=args.dist_to_pipe,
                      dist_between_pipes=args.dist_between_pipes,
                      obs_this_pipe=args.obs_this_pipe)
    observation_space = env.get_observation_size_buffer()
    action_space = env.get_action_size()

    policy_network = DQN(observation_space, action_space).to(device)
    target_network = DQN(observation_space, action_space).to(device)
    optimizer = torch.optim.Adam(policy_network.parameters(), lr=args.lr)
    replay_buffer = ReplayMemory(args.replay_capacity)
    writer = SummaryWriter()

    if args.inference:
        target_network.load_checkpoint()

    best_reward = None
    iteration = 0
    total_reward = 0.0
    rewards = []
    state = env.reset()

    while True:
        epsilon = max(args.final_eps,
                      args.start_eps - iteration / args.eps_decay_final_step)
        iteration += 1
        episode_reward = None

        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.get_action_random()
        else:
            state_v = torch.tensor(np.array([state], copy=False)).to(device)
            q_vals_v = policy_network(state_v.float())
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        next_state, reward, done = env.step(action)
        total_reward += reward
        replay_buffer.push(state, action, next_state, reward, done)
        state = next_state

        if done:
            episode_reward = total_reward
            state = env.reset()
            total_reward = 0.0

        if episode_reward is not None:
            rewards.append(episode_reward)
            mean_reward = np.mean(rewards[-80:])
            print(
                f"Episode {iteration}: eps {epsilon} mean reward {mean_reward} episode reward {episode_reward}"
            )
            writer.add_scalar("epsilon", epsilon, iteration)
            writer.add_scalar("mean_reward", mean_reward, iteration)
            writer.add_scalar("reward", episode_reward, iteration)

            if best_reward is None or best_reward < mean_reward:
                torch.save(policy_network.state_dict(),
                           f"./models/checkpoint_{iteration}")
                print(f"New best reward found: {best_reward} -> {mean_reward}")
                best_reward = mean_reward

            if mean_reward > args.goal_reward:
                print(f"Achieved in {iteration} steps.")
                break

        if len(replay_buffer) < args.replay_start_step:
            continue

        if iteration % args.target_update_iterations == 0:
            target_network.load_state_dict(policy_network.state_dict())

        optimizer.zero_grad()
        batch = replay_buffer.sample(args.batch_size)
        loss = calculate_loss(batch,
                              policy_network,
                              target_network,
                              args.gamma,
                              device=device)
        loss.backward()
        optimizer.step()

    writer.close()
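# calculate_loss() is called in train() but not defined in this snippet. Below
# is a sketch of a standard DQN TD-error loss, under the assumption that
# replay_buffer.sample() returns a list of (state, action, next_state, reward,
# done) tuples in the same order they are pushed above; the original helper may
# differ.
import numpy as np
import torch
import torch.nn as nn


def calculate_loss(batch, policy_network, target_network, gamma, device="cpu"):
    states, actions, next_states, rewards, dones = zip(*batch)

    states_v = torch.tensor(np.array(states), dtype=torch.float32).to(device)
    next_states_v = torch.tensor(np.array(next_states),
                                 dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) from the online network for the actions actually taken
    state_action_values = policy_network(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)

    # max_a' Q_target(s', a'), with terminal states zeroed out
    with torch.no_grad():
        next_state_values = target_network(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)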