Code example #1
File: DQN_Agent_TwoReplay.py Project: tabzraz/RL
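
The snippets below are excerpts from the tabzraz/RL project and omit their file-level imports. A minimal reconstruction of what they rely on is sketched here; the in-repo module paths for ExpReplay, ExpReplaySet and get_models are assumptions, not confirmed against the repository layout.

# Assumed imports for the excerpts below. The tensor idioms used throughout
# (Variable, volatile=True, .data[0]) target the pre-0.4 PyTorch API.
import os
import numpy as np
import torch
from torch.autograd import Variable
from torch.optim import Adam, RMSprop
from torch.nn.utils import clip_grad_norm

from Replay.ExpReplay import ExpReplay        # hypothetical module path
from Replay.ExpReplaySet import ExpReplaySet  # hypothetical module path
from Models.Models import get_models          # hypothetical module path
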
    def __init__(self, args, exp_model, logging_funcs):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_funcs["log"]
        self.env = logging_funcs["env"]

        self.replay = ExpReplay(args.exp_replay_size,
                                args.stale_limit,
                                exp_model,
                                args,
                                priority=self.args.prioritized)
        self.bonus_replay = ExpReplay(args.bonus_replay_size,
                                      args.stale_limit,
                                      exp_model,
                                      args,
                                      priority=self.args.prioritized)

        self.bonus_replay_stuff = 0

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions)
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

        self.max_bonus = 0

        self.player_bonus_positions = []
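
The parameter-counting loop in this constructor multiplies out each weight tensor's shape by hand. A standalone sketch of the same computation, runnable against any torch.nn.Module (the two-layer net is purely illustrative):

import torch.nn as nn

def count_parameters(module):
    # Same arithmetic as the loop above: the product of each weight's
    # dimensions, summed over all parameters; numel() does the product
    return sum(weight.numel() for weight in module.parameters())

net = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
print("Net has {:,} parameters.".format(count_parameters(net)))  # 114
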
Code example #2
File: TabQ_Agent.py Project: tabzraz/RL
    def __init__(self, args, exp_model):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # Tabular Q-values (in place of a DQN and target DQN)
        self.q_table = {}
        self.actions = args.actions

        self.T = 0
Code example #3
File: DDQN_Agent.py Project: tabzraz/RL
    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size,
                                    args.stale_limit,
                                    exp_model,
                                    args,
                                    priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions)
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max
Code example #4
File: TabQ_Agent.py Project: tabzraz/RL
    def __init__(self, args, exp_model):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size,
                                    args.stale_limit,
                                    exp_model,
                                    args,
                                    priority=self.args.prioritized)

        # Tabular Q-values (in place of a DQN and target DQN)
        self.q_table = {}
        self.actions = args.actions

        self.T = 0
Code example #5
File: DDQN_Agent.py Project: tabzraz/RL
    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions)
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max
Code example #6
    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]
        self.log_image = logging_func["image"]
        os.makedirs("{}/transition_model".format(args.log_path))

        # Experience Replay
        self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        print("\n\nDQN")
        self.dqn = model(actions=args.actions)
        print("Target DQN")
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("Model DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

        # Action sequences
        self.actions_to_take = []
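
Initialising target_sync_T to -t_max is a small scheduling trick: the first call to train() then satisfies T - target_sync_T > target, so the target network is synced before the first gradient update. A toy sketch of that schedule (interval values are illustrative):

T_MAX, TARGET_INTERVAL = 1000000, 10000
target_sync_T = -T_MAX

def should_sync(T):
    # Mirrors the check at the top of train() in the agents below
    global target_sync_T
    if T - target_sync_T > TARGET_INTERVAL:
        target_sync_T = T
        return True
    return False

print(should_sync(0))  # True: the first training step always syncs
print(should_sync(1))  # False until TARGET_INTERVAL more steps pass
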
Code example #7
File: TabQ_Agent.py Project: tabzraz/RL
class TabQ_Agent:

    def __init__(self, args, exp_model):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # Tabular Q-values (in place of a DQN and target DQN)
        self.q_table = {}
        self.actions = args.actions

        self.T = 0

    def state_to_tuple(self, state):
        return tuple(np.argwhere(state > 0.9)[0])
        # Unreachable alternative left over from development:
        # return tuple([tuple([tuple(y) for y in x]) for x in state])

    def act(self, state, epsilon, exp_model):
        self.T += 1

        tuple_state = self.state_to_tuple(state)
        q_values = []
        for a in range(self.actions):
            key = (tuple_state, a)
            if key in self.q_table:
                q_value = self.q_table[key]
            else:
                q_value = 1 # HACK for optimistic init
                # q_value = np.random.random() / 1000
            q_values.append(q_value)

        q_values_numpy = np.array(q_values)

        extra_info = {}
        extra_info["Q_Values"] = q_values_numpy

        if self.args.optimistic_init:
            for a in range(self.args.actions):
                _, info = exp_model.bonus(state, a, dont_remember=True)
                action_pseudo_count = info["Pseudo_Count"]
                # TODO: Log the optimism bonuses
                q_values_numpy[a] += self.args.optimistic_scaler / np.sqrt(action_pseudo_count + 0.01)

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = q_values_numpy.argmax()

        extra_info["Action"] = action

        return action, extra_info

    def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1):
        self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        for _ in range(self.args.iters):

            # TODO: Use a named tuple for experience replay
            n_step_sample = 1
            if np.random.random() < self.args.n_step_mixing:
                n_step_sample = self.args.n_step
            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)

            # (state_now, action_now, accum_reward, state_then, steps, terminate)
            td_errors = []
            for index, batch_stuff in enumerate(batch):

                state, action, reward, next_state, step, terminal_state = batch_stuff
                # Work out q_value target
                q_value_target = reward
                if not terminal_state:

                    next_state_max_qval = -100000
                    tuple_next_state = self.state_to_tuple(next_state)
                    for a in range(self.actions):
                        key = (tuple_next_state, a)
                        if key in self.q_table:
                            q_value = self.q_table[key]
                        else:
                            # q_value = 0
                            q_value = np.random.random() / 1000
                        next_state_max_qval = max(next_state_max_qval, q_value)

                    q_value_target += (self.args.gamma ** step) * next_state_max_qval

                # Update our estimate
                tuple_state = self.state_to_tuple(state)
                q_value_prediction = 0
                key = (tuple_state, action)
                if key in self.q_table:
                    q_value_prediction = self.q_table[key]

                td_error = q_value_prediction - q_value_target
                td_errors.append(td_error)

                if self.args.prioritized and self.args.prioritized_is:
                    # Weight the TD error used for the update by the
                    # importance-sampling weight
                    td_error = td_error * is_weights[index]

                updated_prediction = q_value_prediction - self.args.lr * (td_error)

                self.q_table[key] = updated_prediction

            info = {}

            info["TD_Error"] = sum(td_errors) / len(td_errors)

            # Update the priorities
            self.replay.Update_Indices(indices, td_errors, no_pseudo_in_priority=self.args.count_td_priority)

            info["Loss"] = info["TD_Error"]

        return info
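
The train() loop above is an n-step tabular Q-learning update on a dict keyed by (state, action) tuples. Here is the same update rule as a self-contained sketch, stripped of the replay machinery (the state tuples and hyperparameters are illustrative):

import numpy as np

q_table = {}  # maps (state, action) -> Q-value estimate
ACTIONS, LR, GAMMA = 4, 0.1, 0.99

def max_next_q(next_state):
    # Missing entries fall back to a small random value, as in train() above
    return max(q_table.get((next_state, a), np.random.random() / 1000)
               for a in range(ACTIONS))

def q_update(state, action, reward, next_state, steps, terminal):
    target = reward
    if not terminal:
        target += (GAMMA ** steps) * max_next_q(next_state)
    prediction = q_table.get((state, action), 0)
    td_error = prediction - target
    q_table[(state, action)] = prediction - LR * td_error
    return td_error

q_update(state=(0, 1), action=2, reward=1.0, next_state=(0, 2), steps=1, terminal=False)
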
Code example #8
class DQN_Model_Agent:

    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]
        self.log_image = logging_func["image"]
        os.makedirs("{}/transition_model".format(args.log_path))

        # Experience Replay
        self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        print("\n\nDQN")
        self.dqn = model(actions=args.actions)
        print("Target DQN")
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("Model DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

        # Action sequences
        self.actions_to_take = []

    def sync_target_network(self):
        for target, source in zip(self.target_dqn.parameters(), self.dqn.parameters()):
            # Copy values rather than rebinding .data; rebinding would alias
            # the two networks so optimizer steps on dqn also mutate target_dqn
            target.data.copy_(source.data)


    def get_pc_estimates(self, root_state, depth=0, starts=None):
        state = root_state
        bonuses = []
        for action in range(self.args.actions):

            # Current pc estimates
            if depth == 0 or not self.args.only_leaf:
                numpy_state = state[0].numpy().swapaxes(0, 2)
                _, info = self.exp_model.bonus(numpy_state, action, dont_remember=True)
                action_pseudo_count = info["Pseudo_Count"]
                action_bonus = self.args.optimistic_scaler / np.power(action_pseudo_count + 0.01, self.args.bandit_p)
                if starts is not None:
                    action_bonus += starts[action]

            # If the depth is 0 we don't want to look any further ahead
            if depth == 0:
                bonuses.append(action_bonus)
                continue

            one_hot_action = torch.zeros(1, self.args.actions)
            one_hot_action[0, action] = 1
            _, next_state_prediction = self.dqn(Variable(state, volatile=True), Variable(one_hot_action, volatile=True))
            next_state_prediction = next_state_prediction.cpu().data

            next_state_pc_estimates = self.get_pc_estimates(next_state_prediction, depth=depth - 1)

            if self.args.only_leaf:
                bonuses += next_state_pc_estimates
            else:
                ahead_pc_estimates = [action_bonus + self.args.gamma * n for n in next_state_pc_estimates]
                bonuses += ahead_pc_estimates

        return bonuses

    def act(self, state, epsilon, exp_model, evaluation=False):
        # self.T += 1
        if not evaluation:
            if len(self.actions_to_take) > 0:
                action_to_take = self.actions_to_take[0]
                self.actions_to_take = self.actions_to_take[1:]
                return action_to_take, {"Action": action_to_take, "Q_Values": self.prev_q_vals}

        self.dqn.eval()
        # orig_state = state[:, :, -1:]
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        q_values = self.dqn(Variable(state, volatile=True)).cpu().data[0]
        q_values_numpy = q_values.numpy()
        self.prev_q_vals = q_values_numpy

        extra_info = {}

        if self.args.optimistic_init and not evaluation and len(self.actions_to_take) == 0:

            # 2 action lookahead
            action_bonuses = self.get_pc_estimates(state, depth=self.args.lookahead_depth, starts=q_values_numpy)

            # Find the maximum sequence 
            max_so_far = -100000
            best_index = 0
            best_seq = []

            for ii, bonus in enumerate(action_bonuses):
                if bonus > max_so_far:
                    best_index = ii
                    max_so_far = bonus

            for depth in range(self.args.lookahead_depth):
                last_action = best_index % self.args.actions
                best_index = best_index // self.args.actions
                best_seq = best_seq + [last_action]

            # print(best_seq)
            self.actions_to_take = best_seq

        extra_info["Q_Values"] = q_values_numpy

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = q_values.max(0)[1][0]  # Torch...

        extra_info["Action"] = action

        return action, extra_info

    def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1, exploring=False):
        if not exploring:
            self.T += 1
        self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward, density)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            # TODO: Use a named tuple for experience replay
            n_step_sample = self.args.n_step
            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_targets = q_value_targets * torch.pow(inter, steps)
            if self.args.double:
                # Double Q Learning
                new_states_qvals = self.dqn(new_states).cpu()
                new_states_qvals_data = Variable(new_states_qvals.data)
                q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1])
            else:
                q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0]
            q_value_targets = q_value_targets + rewards

            self.dqn.train()

            one_hot_actions = torch.zeros(self.args.batch_size, self.args.actions)

            for i in range(self.args.batch_size):
                one_hot_actions[i][actions[i].data] = 1

            if self.args.gpu:
                actions = actions.cuda()
                one_hot_actions = one_hot_actions.cuda()
                q_value_targets = q_value_targets.cuda()
                new_states = new_states.cuda()
            model_predictions_q_vals, model_predictions_state = self.dqn(states, Variable(one_hot_actions))
            model_predictions = model_predictions_q_vals.gather(1, actions.view(-1, 1))

            # info = {}

            td_error = model_predictions - q_value_targets
            info["TD_Error"] = td_error.mean().data[0]

            # Update the priorities
            if not self.args.density_priority:
                self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority)

            # If using prioritised we need to weight the td_error
            if self.args.prioritized and self.args.prioritized_is:
                # print(td_error)
                weights_tensor = torch.from_numpy(is_weights).float()
                weights_tensor = Variable(weights_tensor)
                if self.args.gpu:
                    weights_tensor = weights_tensor.cuda()
                # print(weights_tensor)
                td_error = td_error * weights_tensor

            # Model 1 step state transition error

            # Save them every x steps
            if self.T % self.args.model_save_image == 0:
                os.makedirs("{}/transition_model/{}".format(self.args.log_path, self.T))
                for ii, image, action, next_state, current_state in zip(range(self.args.batch_size), model_predictions_state.cpu().data, actions.data, new_states.cpu().data, states.cpu().data):
                    image = image.numpy()[0]
                    image = np.clip(image, 0, 1)
                    # print(next_state)
                    next_state = next_state.numpy()[0]
                    current_state = current_state.numpy()[0]

                    black_bars = np.zeros_like(next_state[:1, :])
                    # print(black_bars.shape)

                    joined_image = np.concatenate((current_state, black_bars, image, black_bars, next_state), axis=0)
                    joined_image = np.transpose(joined_image)
                    self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), joined_image * 255)

                    # self.log_image("{}/transition_model/{}/{}_____Action_{}".format(self.args.log_path, self.T, ii + 1, action), image * 255)
                    # self.log_image("{}/transition_model/{}/{}_____Correct".format(self.args.log_path, self.T, ii + 1), next_state * 255)

            # print(model_predictions_state)

            # Cross Entropy Loss
            # TODO

            # Regression loss
            state_error = model_predictions_state - new_states
            # state_error_val = state_error.mean().data[0]

            info["State_Error"] = state_error.mean().data[0]
            self.log("DQN/State_Loss", state_error.mean().data[0], step=self.T)
            self.log("DQN/State_Loss_Squared", state_error.pow(2).mean().data[0], step=self.T)
            self.log("DQN/State_Loss_Max", state_error.abs().max().data[0], step=self.T)
            # self.log("DQN/Action_Matrix_Norm", self.dqn.action_matrix.weight.norm().cpu().data[0], step=self.T)

            combined_loss = (1 - self.args.model_loss) * td_error.pow(2).mean() + (self.args.model_loss) * state_error.pow(2).mean()
            l2_loss = combined_loss
            # l2_loss = (combined_loss).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = list(info["States"])
            # Repeat the first state to pad the list (multiplying the numpy
            # array itself would scale its values, not replicate it)
            padding = [old_states[0]] * (self.args.batch_size - len(old_states))
            info["States"] = old_states + padding

        return info
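
The sync_target_network() method above copies the online network into the target network one parameter tensor at a time. The same hard update can be written with state_dict, which also covers buffers such as batch-norm statistics; a minimal sketch (the toy network is illustrative):

import torch.nn as nn

dqn = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))
target_dqn = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2))

def sync_target_network(target, source):
    # Hard update: overwrite every target parameter (and buffer) with a
    # copy of the online network's current values
    target.load_state_dict(source.state_dict())

sync_target_network(target_dqn, dqn)
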
Code example #9
File: DQN_Distribution_Agent.py Project: tabzraz/RL
class DQN_Distribution_Agent:

    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]

        # Experience Replay
        self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions, atoms=args.atoms)
        self.target_dqn = model(actions=args.actions, atoms=args.atoms)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("Distrib DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        # self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

    def sync_target_network(self):
        for target, source in zip(self.target_dqn.parameters(), self.dqn.parameters()):
            # Copy values rather than rebinding .data, so optimizer steps on
            # the online network do not silently mutate the target network
            target.data.copy_(source.data)

    def act(self, state, epsilon, exp_model, evaluation=False):
        # self.T += 1
        self.dqn.eval()
        orig_state = state[:, :, -1:]
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        q_values_distributions = self.dqn(Variable(state, volatile=True)).cpu().data[0]
        # TODO: Log Q-Value distributions
        # print(q_values_distributions)
        values = torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms)
        values = values.view(1, self.args.atoms)
        values = values.expand(self.args.actions, self.args.atoms)

        # print(values, q_values_distributions, torch.sum(q_values_distributions * values, dim=1))
        q_value_expectations = torch.sum(q_values_distributions * values, dim=1)
        q_values_numpy = q_value_expectations.numpy()

        extra_info = {}

        if self.args.optimistic_init and not evaluation:
            raise NotImplementedError
            # NOTE: the rest of this branch is unreachable dead code; it also
            # references q_values, which this agent never defines
            q_values_pre_bonus = np.copy(q_values_numpy)
            if not self.args.ucb:
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state, a, dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler / np.sqrt(action_pseudo_count + 0.01)
                    self.log("Bandit/Action_{}".format(a), optimism_bonus, step=self.T)
                    q_values[a] += optimism_bonus
            else:
                action_counts = []
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state, a, dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    action_counts.append(action_pseudo_count)
                total_count = sum(action_counts)
                for ai, a in enumerate(action_counts):
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler * np.sqrt(2 * np.log(max(1, total_count)) / (a + 0.01))
                    self.log("Bandit/UCB/Action_{}".format(ai), optimism_bonus, step=self.T)
                    q_values[ai] += optimism_bonus

            extra_info["Action_Bonus"] = q_values_numpy - q_values_pre_bonus

        extra_info["Q_Values"] = q_values_numpy

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = int(np.argmax(q_values_numpy))
            # action = q_values.max(0)[1][0]  # Torch...

        extra_info["Action"] = action

        return action, extra_info

    def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1, exploring=False):
        if not exploring:
            self.T += 1
        self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward, density)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, self.args.n_step, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_gammas = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_gammas = q_value_gammas * torch.pow(inter, steps)

            values = torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms)
            values = Variable(values)
            values = values.view(1, 1, self.args.atoms)
            values = values.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(values)

            q_value_gammas = q_value_gammas.view(self.args.batch_size, 1, 1)
            q_value_gammas = q_value_gammas.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(q_value_gammas)
            gamma_values = q_value_gammas * values
            # print(gamma_values)
            rewards = rewards.view(self.args.batch_size, 1, 1)
            rewards = rewards.expand(self.args.batch_size, self.args.actions, self.args.atoms)
            # print(rewards)
            operator_q_values = rewards + gamma_values
            # print(operator_q_values)

            clipped_operator_q_values = torch.clamp(operator_q_values, self.args.v_min, self.args.v_max)

            delta_z = (self.args.v_max - self.args.v_min) / (self.args.atoms - 1)
            # Using the notation from the categorical paper
            b_j = (clipped_operator_q_values - self.args.v_min) / delta_z
            # print(b_j)
            lower_bounds = torch.floor(b_j)
            upper_bounds = torch.ceil(b_j)

            # Work out the max action
            atom_values = Variable(torch.linspace(self.args.v_min, self.args.v_max, steps=self.args.atoms))
            atom_values = atom_values.view(1, 1, self.args.atoms)
            atom_values = atom_values.expand(self.args.batch_size, self.args.actions, self.args.atoms)

            # Sum over the atoms dimension
            target_expected_qvalues = torch.sum(target_dqn_qvals_data * atom_values, dim=2)
            # Get the maximum actions index across the batch size
            max_actions = target_expected_qvalues.max(dim=1)[1].view(-1)

            # Project back onto the original support for the max actions
            q_value_distribution_targets = torch.zeros(self.args.batch_size, self.args.atoms)

            # Distributions for the max actions
            # print(target_dqn_qvals_data, max_actions)
            # Select each sample's own argmax action with gather (index_select
            # over a batch-length index would give every sample the
            # distribution of max_actions[0])
            max_actions_index = max_actions.view(-1, 1, 1).repeat(1, 1, self.args.atoms)
            q_value_max_actions_distribs = target_dqn_qvals_data.gather(1, max_actions_index)[:, 0, :]
            # print(q_value_max_actions_distribs)

            # Lower_bounds_actions
            lower_bounds_actions = lower_bounds.gather(1, max_actions_index)[:, 0, :]
            upper_bounds_actions = upper_bounds.gather(1, max_actions_index)[:, 0, :]
            b_j_actions = b_j.gather(1, max_actions_index)[:, 0, :]

            lower_bound_values_to_add = q_value_max_actions_distribs * (upper_bounds_actions - b_j_actions)
            upper_bound_values_to_add = q_value_max_actions_distribs * (b_j_actions - lower_bounds_actions)
            # print(lower_bounds_actions)
            # print(lower_bound_values_to_add)
            # Naive looping
            for b in range(self.args.batch_size):
                for l, pj in zip(lower_bounds_actions.data.type(torch.LongTensor)[b], lower_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][l] += pj
                for u, pj in zip(upper_bounds_actions.data.type(torch.LongTensor)[b], upper_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][u] += pj

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                # q_value_targets = q_value_targets.cuda()
                q_value_distribution_targets = q_value_distribution_targets.cuda()
            actions_index = actions.view(-1, 1, 1).repeat(1, 1, self.args.atoms)
            model_predictions = self.dqn(states).gather(1, actions_index)[:, 0, :]
            q_value_distribution_targets = Variable(q_value_distribution_targets)
            # print(q_value_distribution_targets)
            # print(model_predictions) 

            # Cross entropy loss
            ce_loss = -torch.sum(q_value_distribution_targets * torch.log(model_predictions), dim=1)
            ce_batch_loss = ce_loss.mean()

            info = {}

            self.log("DQN/X_Entropy_Loss", ce_batch_loss.data[0], step=self.T)

            # Update
            self.optimizer.zero_grad()
            ce_batch_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = list(info["States"])
            # Repeat the first state to pad the list (multiplying the numpy
            # array itself would scale its values, not replicate it)
            padding = [old_states[0]] * (self.args.batch_size - len(old_states))
            info["States"] = old_states + padding

        return info
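
The "naive looping" block above implements the categorical projection from distributional RL (Bellemare et al., 2017): each Bellman-updated atom r + gamma * z_j is clipped to [v_min, v_max] and its probability mass is split between the two neighbouring support points. A self-contained NumPy sketch of that projection; it also handles the edge case where floor(b_j) == ceil(b_j), in which the split formula would assign zero mass to both neighbours:

import numpy as np

def categorical_projection(probs, rewards, gammas, v_min, v_max, atoms):
    # probs: (batch, atoms) distributions for the chosen next actions
    # gammas: per-sample discount, i.e. (1 - terminal) * gamma ** steps
    z = np.linspace(v_min, v_max, atoms)
    delta_z = (v_max - v_min) / (atoms - 1)
    m = np.zeros_like(probs)
    tz = np.clip(rewards[:, None] + gammas[:, None] * z[None, :], v_min, v_max)
    b = (tz - v_min) / delta_z
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    for i in range(probs.shape[0]):  # naive looping, as above
        for j in range(atoms):
            if lower[i, j] == upper[i, j]:  # b_j landed exactly on an atom
                m[i, lower[i, j]] += probs[i, j]
            else:
                m[i, lower[i, j]] += probs[i, j] * (upper[i, j] - b[i, j])
                m[i, upper[i, j]] += probs[i, j] * (b[i, j] - lower[i, j])
    return m

probs = np.full((2, 11), 1.0 / 11)
m = categorical_projection(probs, rewards=np.array([1.0, -0.5]),
                           gammas=np.array([0.99, 0.0]),
                           v_min=-10.0, v_max=10.0, atoms=11)
print(m.sum(axis=1))  # each projected distribution still sums to 1
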
Code example #10
class DQN_Distribution_Agent:
    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]

        # Experience Replay
        self.replay = ExpReplay(args.exp_replay_size,
                                args.stale_limit,
                                exp_model,
                                args,
                                priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions, atoms=args.atoms)
        self.target_dqn = model(actions=args.actions, atoms=args.atoms)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("Distrib DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        # self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

    def sync_target_network(self):
        for target, source in zip(self.target_dqn.parameters(),
                                  self.dqn.parameters()):
            # Copy values rather than rebinding .data, so optimizer steps on
            # the online network do not silently mutate the target network
            target.data.copy_(source.data)

    def act(self, state, epsilon, exp_model, evaluation=False):
        # self.T += 1
        self.dqn.eval()
        orig_state = state[:, :, -1:]
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        q_values_distributions = self.dqn(Variable(
            state, volatile=True)).cpu().data[0]
        # TODO: Log Q-Value distributions
        # print(q_values_distributions)
        values = torch.linspace(self.args.v_min,
                                self.args.v_max,
                                steps=self.args.atoms)
        values = values.view(1, self.args.atoms)
        values = values.expand(self.args.actions, self.args.atoms)

        # print(values, q_values_distributions, torch.sum(q_values_distributions * values, dim=1))
        q_value_expectations = torch.sum(q_values_distributions * values,
                                         dim=1)
        q_values_numpy = q_value_expectations.numpy()

        extra_info = {}

        if self.args.optimistic_init and not evaluation:
            raise NotImplementedError
            # NOTE: the rest of this branch is unreachable dead code; it also
            # references q_values, which this agent never defines
            q_values_pre_bonus = np.copy(q_values_numpy)
            if not self.args.ucb:
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state,
                                              a,
                                              dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler / np.sqrt(
                        action_pseudo_count + 0.01)
                    self.log("Bandit/Action_{}".format(a),
                             optimism_bonus,
                             step=self.T)
                    q_values[a] += optimism_bonus
            else:
                action_counts = []
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state,
                                              a,
                                              dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    action_counts.append(action_pseudo_count)
                total_count = sum(action_counts)
                for ai, a in enumerate(action_counts):
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler * np.sqrt(
                        2 * np.log(max(1, total_count)) / (a + 0.01))
                    self.log("Bandit/UCB/Action_{}".format(ai),
                             optimism_bonus,
                             step=self.T)
                    q_values[ai] += optimism_bonus

            extra_info["Action_Bonus"] = q_values_numpy - q_values_pre_bonus

        extra_info["Q_Values"] = q_values_numpy

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = int(np.argmax(q_values_numpy))
            # action = q_values.max(0)[1][0]  # Torch...

        extra_info["Action"] = action

        return action, extra_info

    def experience(self,
                   state,
                   action,
                   reward,
                   state_next,
                   steps,
                   terminated,
                   pseudo_reward=0,
                   density=1,
                   exploring=False):
        if not exploring:
            self.T += 1
        self.replay.Add_Exp(state, action, reward, state_next, steps,
                            terminated, pseudo_reward, density)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            batch, indices, is_weights = self.replay.Sample_N(
                self.args.batch_size, self.args.n_step, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(
                torch.from_numpy(np.array(columns[0])).float().transpose_(
                    1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(
                torch.from_numpy(np.array(columns[3])).float().transpose_(
                    1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_gammas = (Variable(torch.ones(terminal_states.size()[0])) -
                              terminal_states)
            inter = Variable(
                torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_gammas = q_value_gammas * torch.pow(inter, steps)

            values = torch.linspace(self.args.v_min,
                                    self.args.v_max,
                                    steps=self.args.atoms)
            values = Variable(values)
            values = values.view(1, 1, self.args.atoms)
            values = values.expand(self.args.batch_size, self.args.actions,
                                   self.args.atoms)
            # print(values)

            q_value_gammas = q_value_gammas.view(self.args.batch_size, 1, 1)
            q_value_gammas = q_value_gammas.expand(self.args.batch_size,
                                                   self.args.actions,
                                                   self.args.atoms)
            # print(q_value_gammas)
            gamma_values = q_value_gammas * values
            # print(gamma_values)
            rewards = rewards.view(self.args.batch_size, 1, 1)
            rewards = rewards.expand(self.args.batch_size, self.args.actions,
                                     self.args.atoms)
            # print(rewards)
            operator_q_values = rewards + gamma_values
            # print(operator_q_values)

            clipped_operator_q_values = torch.clamp(operator_q_values,
                                                    self.args.v_min,
                                                    self.args.v_max)

            delta_z = (self.args.v_max - self.args.v_min) / (self.args.atoms -
                                                             1)
            # Using the notation from the categorical paper
            b_j = (clipped_operator_q_values - self.args.v_min) / delta_z
            # print(b_j)
            lower_bounds = torch.floor(b_j)
            upper_bounds = torch.ceil(b_j)

            # Work out the max action
            atom_values = Variable(
                torch.linspace(self.args.v_min,
                               self.args.v_max,
                               steps=self.args.atoms))
            atom_values = atom_values.view(1, 1, self.args.atoms)
            atom_values = atom_values.expand(self.args.batch_size,
                                             self.args.actions,
                                             self.args.atoms)

            # Sum over the atoms dimension
            target_expected_qvalues = torch.sum(target_dqn_qvals_data *
                                                atom_values,
                                                dim=2)
            # Get the maximum actions index across the batch size
            max_actions = target_expected_qvalues.max(dim=1)[1].view(-1)

            # Project back onto the original support for the max actions
            q_value_distribution_targets = torch.zeros(self.args.batch_size,
                                                       self.args.atoms)

            # Distributions for the max actions
            # print(target_dqn_qvals_data, max_actions)
            # Select each sample's own argmax action with gather (index_select
            # over a batch-length index would give every sample the
            # distribution of max_actions[0])
            max_actions_index = max_actions.view(-1, 1, 1).repeat(
                1, 1, self.args.atoms)
            q_value_max_actions_distribs = target_dqn_qvals_data.gather(
                1, max_actions_index)[:, 0, :]
            # print(q_value_max_actions_distribs)

            # Lower_bounds_actions
            lower_bounds_actions = lower_bounds.gather(
                1, max_actions_index)[:, 0, :]
            upper_bounds_actions = upper_bounds.gather(
                1, max_actions_index)[:, 0, :]
            b_j_actions = b_j.gather(1, max_actions_index)[:, 0, :]

            lower_bound_values_to_add = q_value_max_actions_distribs * (
                upper_bounds_actions - b_j_actions)
            upper_bound_values_to_add = q_value_max_actions_distribs * (
                b_j_actions - lower_bounds_actions)
            # print(lower_bounds_actions)
            # print(lower_bound_values_to_add)
            # Naive looping
            for b in range(self.args.batch_size):
                for l, pj in zip(
                        lower_bounds_actions.data.type(torch.LongTensor)[b],
                        lower_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][l] += pj
                for u, pj in zip(
                        upper_bounds_actions.data.type(torch.LongTensor)[b],
                        upper_bound_values_to_add[b].data):
                    q_value_distribution_targets[b][u] += pj

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                # q_value_targets = q_value_targets.cuda()
                q_value_distribution_targets = q_value_distribution_targets.cuda()
            actions_index = actions.view(-1, 1, 1).repeat(
                1, 1, self.args.atoms)
            model_predictions = self.dqn(states).gather(
                1, actions_index)[:, 0, :]
            q_value_distribution_targets = Variable(
                q_value_distribution_targets)
            # print(q_value_distribution_targets)
            # print(model_predictions)

            # Cross entropy loss
            ce_loss = -torch.sum(
                q_value_distribution_targets * torch.log(model_predictions),
                dim=1)
            ce_batch_loss = ce_loss.mean()

            info = {}

            self.log("DQN/X_Entropy_Loss", ce_batch_loss.data[0], step=self.T)

            # Update
            self.optimizer.zero_grad()
            ce_batch_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(),
                                           self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = list(info["States"])
            # Repeat the first state to pad the list (multiplying the numpy
            # array itself would scale its values, not replicate it)
            padding = [old_states[0]] * (self.args.batch_size -
                                         len(old_states))
            info["States"] = old_states + padding

        return info
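
The optimistic-initialization branches in these agents add a count-based exploration bonus to each Q-value before taking the argmax. The two bonus formulas that appear above, factored out as plain functions (the scaler and pseudo-counts are illustrative):

import numpy as np

def count_bonus(pseudo_count, scaler=1.0, p=0.5):
    # scaler / (n + 0.01)^p; p = 0.5 recovers the 1/sqrt(n) bonus above
    return scaler / np.power(pseudo_count + 0.01, p)

def ucb_bonus(action_count, total_count, scaler=1.0):
    # UCB1-style bonus with the same +0.01 smoothing and max(1, N) clamp
    # used in the UCB branch above
    return scaler * np.sqrt(2 * np.log(max(1, total_count)) / (action_count + 0.01))

pseudo_counts = np.array([0.2, 3.0, 10.0])
q_values = np.zeros(3) + count_bonus(pseudo_counts)
print(q_values.argmax())  # 0: the least-visited action gets the biggest bonus
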
Code example #11
File: DDQN_Agent.py Project: tabzraz/RL
class DDQN_Agent:

    def __init__(self, args, exp_model, logging_func):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_func["log"]

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size, args.stale_limit, exp_model, args, priority=self.args.prioritized)

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions)
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

    def sync_target_network(self):
        for target, source in zip(self.target_dqn.parameters(), self.dqn.parameters()):
            # Copy values rather than rebinding .data, so optimizer steps on
            # the online network do not silently mutate the target network
            target.data.copy_(source.data)

    def act(self, state, epsilon, exp_model, evaluation=False):
        # self.T += 1
        self.dqn.eval()
        orig_state = state[:, :, -1:]
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        q_values = self.dqn(Variable(state, volatile=True)).cpu().data[0]
        q_values_numpy = q_values.numpy()

        extra_info = {}

        if self.args.optimistic_init and not evaluation:
            q_values_pre_bonus = np.copy(q_values_numpy)
            if not self.args.ucb:
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state, a, dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler / np.power(action_pseudo_count + 0.01, self.args.bandit_p)
                    if self.args.tb and self.T % self.args.tb_interval == 0:
                        self.log("Bandit/Action_{}".format(a), optimism_bonus, step=self.T)
                    q_values[a] += optimism_bonus
            else:
                action_counts = []
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state, a, dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    action_counts.append(action_pseudo_count)
                total_count = sum(action_counts)
                for ai, a in enumerate(action_counts):
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler * np.sqrt(2 * np.log(max(1, total_count)) / (a + 0.01))
                    self.log("Bandit/UCB/Action_{}".format(ai), optimism_bonus, step=self.T)
                    q_values[ai] += optimism_bonus

            extra_info["Action_Bonus"] = q_values_numpy - q_values_pre_bonus

        extra_info["Q_Values"] = q_values_numpy

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = q_values.max(0)[1][0]  # Torch...

        extra_info["Action"] = action

        return action, extra_info

    def experience(self, state, action, reward, state_next, steps, terminated, pseudo_reward=0, density=1, exploring=False):
        if not exploring:
            self.T += 1
        self.replay.Add_Exp(state, action, reward, state_next, steps, terminated, pseudo_reward, density)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        for _ in range(self.args.iters):
            self.dqn.eval()

            # TODO: Use a named tuple for experience replay
            n_step_sample = 1
            if np.random.random() < self.args.n_step_mixing:
                n_step_sample = self.args.n_step
            batch, indices, is_weights = self.replay.Sample_N(self.args.batch_size, n_step_sample, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(torch.from_numpy(np.array(columns[0])).float().transpose_(1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(torch.from_numpy(np.array(columns[3])).float().transpose_(1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_targets = (Variable(torch.ones(terminal_states.size()[0])) - terminal_states)
            inter = Variable(torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_targets = q_value_targets * torch.pow(inter, steps)
            if self.args.double:
                # Double Q Learning
                new_states_qvals = self.dqn(new_states).cpu()
                new_states_qvals_data = Variable(new_states_qvals.data)
                q_value_targets = q_value_targets * target_dqn_qvals_data.gather(1, new_states_qvals_data.max(1)[1])
            else:
                q_value_targets = q_value_targets * target_dqn_qvals_data.max(1)[0]
            q_value_targets = q_value_targets + rewards

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                q_value_targets = q_value_targets.cuda()
            model_predictions = self.dqn(states).gather(1, actions.view(-1, 1))

            # info = {}

            td_error = model_predictions - q_value_targets
            info["TD_Error"] = td_error.mean().data[0]

            # Update the priorities
            if not self.args.density_priority:
                self.replay.Update_Indices(indices, td_error.cpu().data.numpy(), no_pseudo_in_priority=self.args.count_td_priority)

            # If using prioritised we need to weight the td_error
            if self.args.prioritized and self.args.prioritized_is:
                # print(td_error)
                weights_tensor = torch.from_numpy(is_weights).float()
                weights_tensor = Variable(weights_tensor)
                if self.args.gpu:
                    weights_tensor = weights_tensor.cuda()
                # print(weights_tensor)
                td_error = td_error * weights_tensor
            l2_loss = (td_error).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(), self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size
        if len(info["States"]) < self.args.batch_size:
            old_states = list(info["States"])
            # Repeat the first state to pad the list (multiplying the numpy
            # array itself would scale its values, not replicate it)
            padding = [old_states[0]] * (self.args.batch_size - len(old_states))
            info["States"] = old_states + padding

        return info
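
When args.double is set, train() above selects the next action with the online network and evaluates it with the target network. A compact sketch of that target computation in current PyTorch (these excerpts use the legacy Variable API, which modern torch no longer needs):

import torch

def double_q_targets(dqn, target_dqn, rewards, next_states, terminals, gammas):
    # y = r + gamma * Q_target(s', argmax_a Q_online(s', a));
    # gammas folds in gamma ** n_step, as in the agents above
    with torch.no_grad():
        best_actions = dqn(next_states).argmax(dim=1, keepdim=True)
        next_values = target_dqn(next_states).gather(1, best_actions).squeeze(1)
    return rewards + (1 - terminals) * gammas * next_values

net, tgt = torch.nn.Linear(4, 3), torch.nn.Linear(4, 3)
y = double_q_targets(net, tgt, rewards=torch.ones(2),
                     next_states=torch.randn(2, 4),
                     terminals=torch.zeros(2),
                     gammas=torch.full((2,), 0.99))
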
Code example #12
File: TabQ_Agent.py Project: tabzraz/RL
class TabQ_Agent:
    def __init__(self, args, exp_model):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        # Experience Replay
        if self.args.set_replay:
            self.replay = ExpReplaySet(10, 10, exp_model, args, priority=False)
        else:
            self.replay = ExpReplay(args.exp_replay_size,
                                    args.stale_limit,
                                    exp_model,
                                    args,
                                    priority=self.args.prioritized)

        # DQN and Target DQN
        self.q_table = {}
        self.actions = args.actions

        self.T = 0

    def state_to_tuple(self, state):
        # Hashable key: coordinates of the first cell with activation above 0.9
        return tuple(np.argwhere(state > 0.9)[0])

    def act(self, state, epsilon, exp_model):
        self.T += 1

        tuple_state = self.state_to_tuple(state)
        q_values = []
        for a in range(self.actions):
            key = (tuple_state, a)
            if key in self.q_table:
                q_value = self.q_table[key]
            else:
                q_value = 1  # HACK for optimistic init
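                # Optimistic initialisation: unseen (state, action) pairs get a
                # value intended to sit above the learned ones, so the greedy
                # choice favours untried actions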
                # q_value = np.random.random() / 1000
            q_values.append(q_value)

        q_values_numpy = np.array(q_values)

        extra_info = {}
        extra_info["Q_Values"] = q_values_numpy

        if self.args.optimistic_init:
            for a in range(self.args.actions):
                _, info = exp_model.bonus(state, a, dont_remember=True)
                action_pseudo_count = info["Pseudo_Count"]
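                # Count-based optimism: the bonus below scales as
                # 1 / sqrt(pseudo-count), so rarely-tried actions get inflated
                # Q-values (the 0.01 avoids division by zero)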
                # TODO: Log the optimism bonuses
                q_values_numpy[a] += self.args.optimistic_scaler / np.sqrt(
                    action_pseudo_count + 0.01)

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = q_values_numpy.argmax()

        extra_info["Action"] = action

        return action, extra_info

    def experience(self,
                   state,
                   action,
                   reward,
                   state_next,
                   steps,
                   terminated,
                   pseudo_reward=0,
                   density=1):
        self.replay.Add_Exp(state, action, reward, state_next, steps,
                            terminated, pseudo_reward)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()

    def train(self):

        for _ in range(self.args.iters):

            # TODO: Use a named tuple for experience replay
            n_step_sample = 1
            if np.random.random() < self.args.n_step_mixing:
                n_step_sample = self.args.n_step
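            # With probability n_step_mixing sample n-step returns, otherwise
            # plain 1-step transitions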
            batch, indices, is_weights = self.replay.Sample_N(
                self.args.batch_size, n_step_sample, self.args.gamma)

            # (state_now, action_now, accum_reward, state_then, steps, terminate)
            td_errors = []
            for index, batch_stuff in enumerate(batch):

                state, action, reward, next_state, step, terminal_state = batch_stuff
                # Work out q_value target
                q_value_target = reward
                if not terminal_state:

                    next_state_max_qval = -100000
                    tuple_next_state = self.state_to_tuple(next_state)
                    for a in range(self.actions):
                        key = (tuple_next_state, a)
                        if key in self.q_table:
                            q_value = self.q_table[key]
                        else:
                            # q_value = 0
                            q_value = np.random.random() / 1000
                        next_state_max_qval = max(next_state_max_qval, q_value)

                    q_value_target += (self.args.gamma**
                                       step) * next_state_max_qval

                # Update our estimate
                tuple_state = self.state_to_tuple(state)
                q_value_prediction = 0
                key = (tuple_state, action)
                if key in self.q_table:
                    q_value_prediction = self.q_table[key]

                td_error = q_value_prediction - q_value_target
                td_errors.append(td_error)

                if self.args.prioritized and self.args.prioritized_is:
                    # Weight the update by the importance-sampling weight
                    td_error = td_error * is_weights[index]

                updated_prediction = q_value_prediction - self.args.lr * (
                    td_error)

                self.q_table[key] = updated_prediction

            info = {}

            info["TD_Error"] = sum(td_errors) / len(td_errors)

            # Update the priorities
            self.replay.Update_Indices(
                indices,
                td_errors,
                no_pseudo_in_priority=self.args.count_td_priority)

            info["Loss"] = info["TD_Error"]

        return info
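
The loop above is plain n-step tabular Q-learning: Q(s, a) <- Q(s, a) - lr * (Q(s, a) - [r + gamma^k * max_a' Q(s', a')]). A self-contained sketch of a single such update on a dict-backed table follows; names like tabular_q_update and alpha are illustrative, not from the repository:

def tabular_q_update(q_table, s, a, reward, s_next, k, terminal,
                     actions=4, alpha=0.1, gamma=0.99):
    # n-step target: accumulated reward plus a discounted bootstrap from the
    # best action k steps later (skipped when the episode terminated)
    target = reward
    if not terminal:
        target += (gamma ** k) * max(
            q_table.get((s_next, b), 0.0) for b in range(actions))
    prediction = q_table.get((s, a), 0.0)
    q_table[(s, a)] = prediction - alpha * (prediction - target)

q = {}
tabular_q_update(q, s=(1, 2), a=0, reward=1.0, s_next=(1, 3), k=1, terminal=False)
print(q[((1, 2), 0)])  # 0.1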
Code example #13
File: DQN_Agent_TwoReplay.py Project: tabzraz/RL
class DQN_Agent_TwoReplay:
    def __init__(self, args, exp_model, logging_funcs):
        self.args = args

        # Exploration Model
        self.exp_model = exp_model

        self.log = logging_funcs["log"]
        self.env = logging_funcs["env"]

        self.replay = ExpReplay(args.exp_replay_size,
                                args.stale_limit,
                                exp_model,
                                args,
                                priority=self.args.prioritized)
        self.bonus_replay = ExpReplay(args.bonus_replay_size,
                                      args.stale_limit,
                                      exp_model,
                                      args,
                                      priority=self.args.prioritized)

        self.bonus_replay_stuff = 0

        # DQN and Target DQN
        model = get_models(args.model)
        self.dqn = model(actions=args.actions)
        self.target_dqn = model(actions=args.actions)

        dqn_params = 0
        for weight in self.dqn.parameters():
            weight_params = 1
            for s in weight.size():
                weight_params *= s
            dqn_params += weight_params
        print("DQN has {:,} parameters.".format(dqn_params))

        self.target_dqn.eval()

        if args.gpu:
            print("Moving models to GPU.")
            self.dqn.cuda()
            self.target_dqn.cuda()

        # Optimizer
        self.optimizer = RMSprop(self.dqn.parameters(), lr=args.lr)
        # self.optimizer = Adam(self.dqn.parameters(), lr=args.lr)

        self.T = 0
        self.target_sync_T = -self.args.t_max

        self.max_bonus = 0

        self.player_bonus_positions = []

    def sync_target_network(self):
        for target, source in zip(self.target_dqn.parameters(),
                                  self.dqn.parameters()):
            # Copy the weights rather than aliasing the tensors, so in-place
            # optimiser updates to the online network do not leak into the target
            target.data.copy_(source.data)

    def act(self, state, epsilon, exp_model, evaluation=False):
        # self.T += 1
        self.dqn.eval()
        orig_state = state[:, :, -1:]
        state = torch.from_numpy(state).float().transpose_(0, 2).unsqueeze(0)
        q_values = self.dqn(Variable(state, volatile=True)).cpu().data[0]
        q_values_numpy = np.copy(q_values.numpy())

        extra_info = {}
        extra_info["Q_Values"] = q_values_numpy

        if self.args.optimistic_init and not evaluation:
            if not self.args.ucb:
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state,
                                              a,
                                              dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    # TODO: Log the optimism bonuses
                    optimism_bonus = self.args.optimistic_scaler / np.sqrt(
                        action_pseudo_count + 0.01)
                    self.log("Bandit/Action_{}".format(a),
                             optimism_bonus,
                             step=self.T)
                    q_values[a] += optimism_bonus
            else:
                action_counts = []
                for a in range(self.args.actions):
                    _, info = exp_model.bonus(orig_state,
                                              a,
                                              dont_remember=True)
                    action_pseudo_count = info["Pseudo_Count"]
                    action_counts.append(action_pseudo_count)
                total_count = sum(action_counts)
                for ai, count in enumerate(action_counts):
                    # UCB1-style bonus: sqrt(2 ln(total count) / action count),
                    # treating pseudo-counts as bandit pull counts
                    optimism_bonus = self.args.optimistic_scaler * np.sqrt(
                        2 * np.log(max(1, total_count)) / (count + 0.01))
                    self.log("Bandit/UCB/Action_{}".format(ai),
                             optimism_bonus,
                             step=self.T)
                    q_values[ai] += optimism_bonus

            extra_info["Action_Bonus"] = q_values.numpy() - q_values_numpy

        if np.random.random() < epsilon:
            action = np.random.randint(low=0, high=self.args.actions)
        else:
            action = q_values.max(0)[1][0]  # max returns (values, indices)

        extra_info["Action"] = action

        return action, extra_info

    def experience(self,
                   state,
                   action,
                   reward,
                   state_next,
                   steps,
                   terminated,
                   pseudo_reward=0,
                   density=1,
                   exploring=False):
        if not exploring:
            self.T += 1
        self.replay.Add_Exp(state, action, reward, state_next, steps,
                            terminated, 0, density)
        self.max_bonus = max(self.max_bonus, pseudo_reward)
        if pseudo_reward > self.max_bonus * self.args.bonus_replay_threshold:
            self.player_bonus_positions.append(self.env.log_visitation())
            self.bonus_replay.Add_Exp(state, action, reward, state_next, steps,
                                      terminated, pseudo_reward, density)
            self.bonus_replay_stuff += 1
            if not exploring:
                self.log("Bonus_Replay_Experience",
                         self.bonus_replay_stuff,
                         step=self.T)
                self.log("Bonus_Replay_Bonus", pseudo_reward, step=self.T)

    def end_of_trajectory(self):
        self.replay.end_of_trajectory()
        if self.bonus_replay_stuff > 0:
            self.bonus_replay.end_of_trajectory()

    def train(self, bonus_replay=False):

        # First make one recursive pass over the bonus replay, then fall
        # through to train on the main replay
        if bonus_replay is False:
            self.train(bonus_replay=True)

        if self.T - self.target_sync_T > self.args.target:
            self.sync_target_network()
            self.target_sync_T = self.T

        info = {}

        if self.args.exp_replay_size <= 1000 and not bonus_replay:
            return info

        for _ in range(self.args.iters):
            self.dqn.eval()

            # TODO: Use a named tuple for experience replay
            if not bonus_replay:
                batch, indices, is_weights = self.replay.Sample_N(
                    self.args.batch_size, self.args.n_step, self.args.gamma)
            else:
                batch, indices, is_weights = self.bonus_replay.Sample_N(
                    self.args.batch_size, 1, self.args.gamma)
            columns = list(zip(*batch))

            states = Variable(
                torch.from_numpy(np.array(columns[0])).float().transpose_(
                    1, 3))
            actions = Variable(torch.LongTensor(columns[1]))
            terminal_states = Variable(torch.FloatTensor(columns[5]))
            rewards = Variable(torch.FloatTensor(columns[2]))
            # Have to clip rewards for DQN
            rewards = torch.clamp(rewards, -1, 1)
            steps = Variable(torch.FloatTensor(columns[4]))
            new_states = Variable(
                torch.from_numpy(np.array(columns[3])).float().transpose_(
                    1, 3))

            target_dqn_qvals = self.target_dqn(new_states).cpu()
            # Make a new variable with those values so that these are treated as constants
            target_dqn_qvals_data = Variable(target_dqn_qvals.data)

            q_value_targets = (
                Variable(torch.ones(terminal_states.size()[0])) -
                terminal_states)
            inter = Variable(
                torch.ones(terminal_states.size()[0]) * self.args.gamma)
            # print(steps)
            q_value_targets = q_value_targets * torch.pow(inter, steps)
            if self.args.double:
                # Double Q-Learning: pick the greedy action with the online
                # network, but evaluate it with the target network
                new_states_qvals = self.dqn(new_states).cpu()
                new_states_qvals_data = Variable(new_states_qvals.data)
                greedy_actions = new_states_qvals_data.max(1)[1].view(-1, 1)
                q_value_targets = q_value_targets * target_dqn_qvals_data.gather(
                    1, greedy_actions).view(-1)
            else:
                q_value_targets = q_value_targets * target_dqn_qvals_data.max(
                    1)[0].view(-1)
            q_value_targets = q_value_targets + rewards

            self.dqn.train()
            if self.args.gpu:
                actions = actions.cuda()
                q_value_targets = q_value_targets.cuda()
            # .view(-1) flattens the gathered column so the TD error below has
            # the same shape as q_value_targets
            model_predictions = self.dqn(states).gather(1, actions.view(-1, 1)).view(-1)

            # info = {}

            td_error = model_predictions - q_value_targets
            info["TD_Error"] = td_error.mean().data[0]

            # Update the priorities
            if not self.args.density_priority:
                if bonus_replay:
                    self.bonus_replay.Update_Indices(
                        indices,
                        td_error.cpu().data.numpy(),
                        no_pseudo_in_priority=self.args.count_td_priority)
                else:
                    self.replay.Update_Indices(
                        indices,
                        td_error.cpu().data.numpy(),
                        no_pseudo_in_priority=self.args.count_td_priority)

            # If using prioritised we need to weight the td_error
            if self.args.prioritized and self.args.prioritized_is:
                # print(td_error)
                weights_tensor = torch.from_numpy(is_weights).float()
                weights_tensor = Variable(weights_tensor)
                if self.args.gpu:
                    weights_tensor = weights_tensor.cuda()
                # print(weights_tensor)
                td_error = td_error * weights_tensor
            l2_loss = (td_error).pow(2).mean()
            info["Loss"] = l2_loss.data[0]

            # Update
            self.optimizer.zero_grad()
            l2_loss.backward()

            # Taken from pytorch clip_grad_norm
            # Remove once the pip version is up to date with source
            gradient_norm = clip_grad_norm(self.dqn.parameters(),
                                           self.args.clip_value)
            if gradient_norm is not None:
                info["Norm"] = gradient_norm

            self.optimizer.step()

            if "States" in info:
                states_trained = info["States"]
                info["States"] = states_trained + columns[0]
            else:
                info["States"] = columns[0]

        # Pad out the states to be of size batch_size by repeating the first one
        if len(info["States"]) < self.args.batch_size:
            old_states = list(info["States"])
            padding = [old_states[0]] * (self.args.batch_size - len(old_states))
            info["States"] = old_states + padding

        if bonus_replay:
            if self.args.tb and self.T % self.args.tb_interval == 0:
                self.log("DQN/Bonus/Gradient_Norm", info["Norm"], step=self.T)
                self.log("DQN/Bonus/Loss", info["Loss"], step=self.T)
                self.log("DQN/Bonus/TD_Error", info["TD_Error"], step=self.T)

        return info
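
The distinguishing piece of this agent is the second, bonus-filtered replay: in experience() a transition is copied into bonus_replay only when its exploration bonus exceeds a fixed fraction of the largest bonus seen so far. A stripped-down sketch of that admission rule; the BonusFilter class and its names are illustrative, not from the repository:

class BonusFilter:
    """Keeps only transitions whose exploration bonus is close to the best seen."""

    def __init__(self, threshold=0.5):
        self.threshold = threshold  # fraction of the running maximum
        self.max_bonus = 0.0

    def admit(self, pseudo_reward):
        # Track the largest bonus observed, then compare against a fraction
        # of it; early in training almost everything is admitted
        self.max_bonus = max(self.max_bonus, pseudo_reward)
        return pseudo_reward > self.max_bonus * self.threshold

f = BonusFilter(threshold=0.5)
print(f.admit(0.10))  # True  (0.10 > 0.10 * 0.5)
print(f.admit(0.02))  # False (0.02 <= 0.10 * 0.5)
print(f.admit(0.30))  # True  (new maximum)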