Example #1
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match those of the policy network
        self.policy_net.cloneWeightsTo(self.target_net)
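
The Network class used by these examples is defined elsewhere. As a point of reference only, here is a minimal sketch that matches the call sites (two hidden layers, a forward pass returning both the action values and the penultimate-layer features, and a cloneWeightsTo method); the layer names and ReLU activations are assumptions, not the original implementation.

import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, features, h1, h2, actions):
        super().__init__()
        self.fc1 = nn.Linear(features, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.out = nn.Linear(h2, actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        phi = F.relu(self.fc2(x))
        # return both the action values and the penultimate-layer features,
        # matching the `Qs, x = net(states)` call sites in these examples
        return self.out(phi), phi

    def cloneWeightsTo(self, other):
        # hard-copy every parameter into the other (target) network
        other.load_state_dict(self.state_dict())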
Example #2
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match those of the policy network
        self.policy_net.cloneWeightsTo(self.target_net)
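
ReplayBuffer is likewise defined outside these snippets. A minimal sketch consistent with how it is used here (fixed capacity with FIFO eviction, and sample returning both the transitions and their indices) might look like this; the deque-based implementation and uniform sampling with replacement are assumptions.

import numpy as np
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        # fixed-capacity FIFO: appending past `capacity` evicts the oldest transition
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random sample; return the indices as well, to match the
        # `samples, idcs = buffer.sample(n)` call sites
        idcs = np.random.randint(0, len(self.buffer), size=batch_size)
        return [self.buffer[i] for i in idcs], idcs

    def __len__(self):
        return len(self.buffer)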
Example #3
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2,
                                   actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match those of the policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # act greedily with respect to the behaviour policy network; when the
        # greedy action is STAY (1), replace it with a random BACK/FORWARD
        # action about epsilon percent of the time
        q_s, _ = self.bpolicy_net(x)

        # if the network returned a single un-batched action-value vector,
        # add a batch dimension so the argmax can be taken along dim 1
        if q_s.shape[0] == 3:
            q_s = q_s.unsqueeze(0)

        act = torch.max(q_s, 1).indices.detach().numpy()

        for i in range(act.shape[0]):
            action = act[i]
            if action == 1:
                if np.random.rand() < self.epsilon:
                    act[i] = np.random.choice([0, 2])

        act_tensor = torch.from_numpy(act).detach().to(device)

        return act_tensor

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, sp, r, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 200:
            samples, idcs = self.buffer.sample(200)
            self.updateNetwork(samples)
class DQN(BaseAgent):
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(
            features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(), lr=self.alpha, betas=(0.9, 0.999))

        self.back_values = []
        self.stay_values = []
        self.forward_values = []

        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []

        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []

        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        # organize the mini-batch so that we can request "columns" from the data
        # e.g. we can get all of the actions, or all of the states with a single call
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()

        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next states are non-terminal

        Qspap = torch.zeros(batch.size, device=device)

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        self.td_loss.append(td_loss.detach().numpy())

        Qs_state_array, _ = self.policy_net(self.state_array)

        Qsa_mean_states = torch.mean(Qs_state_array, 0)

        self.back_values.append(Qsa_mean_states[0].detach().numpy())
        self.stay_values.append(Qsa_mean_states[1].detach().numpy())
        self.forward_values.append(Qsa_mean_states[2].detach().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        batch = getBatchColumns(samples)
        Qs, x = q_net(batch.states)

        # each per-action network has a single output head; squeeze to shape (batch,)
        Qsa = Qs.squeeze()
        Qspap = torch.zeros(batch.size, device=device)

        if batch.nterm_sp.shape[0] > 0:
            # bootstrap from the max over all three per-action target networks,
            # not just the target network passed in for this action
            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)

            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff to optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous update
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        Qs_state_array, _ = q_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].detach().numpy())

        # update this action's Q network using the gradients from the TD loss
        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        if a.cpu().numpy() == 0:
            self.buffer_BACK.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 1:
            self.buffer_STAY.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 2:
            self.buffer_FORWARD.add((s, a, sp, r, gamma))

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(
            self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(
            self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(
            self.ratioMap.forward_ratio * self.sampleSize)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer_BACK) > back_sample_count \
                and len(self.buffer_STAY) > stay_sample_count \
                and len(self.buffer_FORWARD) > forward_sample_count:

            samplesBack, idcs = self.buffer_BACK.sample(back_sample_count)
            samplesStay, idcs = self.buffer_STAY.sample(stay_sample_count)
            samplesForward, idcs = self.buffer_FORWARD.sample(forward_sample_count)
            self.updateActionNet(samplesBack, self.back_q_net, self.back_target_q_net, self.optimizerBack,
                                 self.back_values_baseline)
            self.updateActionNet(samplesStay, self.stay_q_net, self.stay_target_q_net, self.optimizerStay,
                                 self.stay_values_baseline)
            self.updateActionNet(samplesForward, self.forward_q_net, self.forward_target_q_net, self.optimizerForward,
                                 self.forward_values_baseline)
            samples = samplesBack + samplesStay + samplesForward

            self.updateNetwork(samples)
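
The updateNetwork and updateActionNet methods above depend on a getBatchColumns helper that is not shown on this page. The sketch below is only an inference from the call sites: it assumes transitions are (s, a, sp, r, gamma) tuples with 1 x features state tensors and integer actions, and that gamma == 0 marks a terminal transition, so nterm and nterm_sp select the non-terminal next states.

from collections import namedtuple
import torch

Batch = namedtuple('Batch', 'states actions rewards gamma nterm nterm_sp size')

def getBatchColumns(samples):
    # unzip the (s, a, sp, r, gamma) transition tuples into columns
    s, a, sp, r, g = zip(*samples)
    states = torch.cat(s)
    actions = torch.tensor([int(a_i) for a_i in a]).unsqueeze(1)  # (B, 1) for gather()
    rewards = torch.tensor(r, dtype=torch.float32)
    gamma = torch.tensor(g, dtype=torch.float32)
    nterm = gamma != 0  # boolean mask over the batch: True where sp is non-terminal
    if nterm.any():
        nterm_sp = torch.cat([sp_i for sp_i, g_i in zip(sp, g) if g_i != 0])
    else:
        nterm_sp = torch.empty((0, states.shape[1]))
    return Batch(states, actions, rewards, gamma, nterm, nterm_sp, len(samples))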
Example #6
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']

        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2,
                                  actions).to(device)
        self.target_net = Network(features, self.h1, self.h2,
                                  actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha,
                                    betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match those of the policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, r, sp, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1
        # track per-state action counts for diagnostics (assumes integer grid coordinates)
        a = a.cpu().numpy()
        s = s.cpu().numpy()
        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 32:
            samples, idcs = self.buffer.sample(32)
            self.updateNetwork(samples)
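
For context, a rough driving loop for the Example #6 agent might look like the sketch below. Everything here is illustrative rather than taken from the original project: the environment object env, the hyperparameter values, the discount of 0.99, and the gym-style reset()/step() interface are all assumptions, and states are assumed to arrive as 1 x features float tensors.

params = {
    'alpha': 1e-3, 'epsilon': 0.1, 'target_refresh': 100,
    'buffer_size': 10000, 'h1': 64, 'h2': 64,
}
agent = BaseAgent(features=4, actions=3, params=params)

def to_tensor(obs):
    # hypothetical helper: wrap an observation as a 1 x features float tensor
    return torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)

s = to_tensor(env.reset())
for step in range(10000):
    a = agent.selectAction(s)
    obs, r, done, info = env.step(a.item())
    sp = to_tensor(obs)
    gamma = 0.0 if done else 0.99      # gamma == 0 signals a terminal transition
    agent.update(s, a, r, sp, gamma)   # (s, a, r, sp, gamma) ordering used in this example
    s = to_tensor(env.reset()) if done else sp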