import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm

# QNet, S2VGraph, ReplayBuffer, cmd_args, GLOBAL_EPISODE_STEPS and
# GLOBAL_NUM_STEPS are assumed to be provided by the surrounding project.


class Agent(object):
    def __init__(self, g_list, test_g_list, env):
        self.g_list = g_list
        if test_g_list is None:
            self.test_g_list = g_list
        else:
            self.test_g_list = test_g_list

        self.env = env
        self.net = QNet()
        self.old_net = QNet()
        self.optimizer = optim.Adam(self.net.parameters(),
                                    lr=cmd_args.learning_rate)

        if cmd_args.ctx == 'gpu':
            self.net = self.net.cuda()
            self.old_net = self.old_net.cuda()

        self.eps_start = 1.0
        self.eps_end = 1.0
        self.eps_step = 10000
        self.burn_in = 100  # number of initial ("burn-in") simulation iterations before learning
        self.step = 0

        self.best_eval = None
        self.pos = 0
        self.sample_idxes = list(range(len(g_list)))
        random.shuffle(self.sample_idxes)
        self.take_snapshot()

    def take_snapshot(self):
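        # Copy the online network's weights into the frozen snapshot network.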
        self.old_net.load_state_dict(self.net.state_dict())

    # type = 0 for add, 1 for subtract
    def make_actions(self, greedy=True, _type=0):
        self.eps = self.eps_end + max(
            0., (self.eps_start - self.eps_end) *
            (self.eps_step - max(0., self.step)) / self.eps_step)

        cur_state = self.env.getStateRef()

        actions, q_arrs = self.net(cur_state,
                                   None,
                                   greedy_acts=True,
                                   _type=_type)

        q_vals = []

        # Extract the Q-value of the chosen action for each graph
        for i in range(len(q_arrs)):
            tmp = q_arrs[i].cpu().detach().numpy()
            tmp = tmp[actions[i]][0]
            q_vals.append(tmp)

        return actions, q_vals

    def run_simulation(self):
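        # One simulation pass: at each step, patch isolated nodes, pick actions
        # with the online net, compute rewards, advance the environment, and do
        # a one-step TD update of the Q-network.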

        self.env.setup(self.g_list)
        avg_rewards = []

        t_a, t_s = 0, 0

        for t in range(GLOBAL_EPISODE_STEPS):

            if t % 2 == 0:
                assert self.env.first_nodes is None

            for i in range(len(self.g_list)):

                g = self.g_list[i].to_networkx()

                # Connect every isolated node to a randomly chosen node so that
                # each node has degree >= 1 (the graphs are assumed to have 20
                # nodes, matching the hard-coded range below).
                con_nodes = set(sum(g.edges, ()))
                for j in range(20):
                    if j not in con_nodes:
                        rand_num = np.random.randint(0, 20)
                        g.add_edge(j, rand_num)
                        self.env.added_edges.append((j, rand_num))

                self.g_list[i] = S2VGraph(g, label=self.g_list[i].label)

            # Steps come in pairs (pick first node, then second node); every two
            # steps the action type flips: steps 0-1 add an edge (type 0),
            # steps 2-3 remove one (type 1).
            action_type = (t % 4) // 2

            # get Actions
            list_at, _ = self.make_actions(_type=action_type)

            # save State
            list_st = self.env.cloneState()

            cur_state = self.env.getStateRef()

            _, predicted_Q = self.net(cur_state,
                                      None,
                                      greedy_acts=False,
                                      _type=action_type)

            # get Rewards
            if self.env.first_nodes is not None:
                rewards = self.env.get_rewards(list_at, _type=action_type)
                avg_rewards.append(sum(rewards) / len(rewards))
            else:
                rewards = [0] * len(self.g_list)

            # Update graph to get S'
            self.env.step(list_at, _type=action_type)

            # get next state
            if self.env.isTerminal():
                s_prime = None
            else:
                s_prime = self.env.cloneState()

            # get S' and A' values
            try:
                sprime_at, q_primes = self.make_actions(_type=action_type)
            except Exception:
                continue

            # One-step target: r + Q(S', A') (no discount factor, i.e. gamma = 1)
            actual_Q = torch.Tensor(rewards) + torch.Tensor(q_primes)

            # Pass loss to network
            loss = F.mse_loss(predicted_Q, actual_Q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return avg_rewards

    def train(self):

        # set up progress bar
        pbar = tqdm(range(GLOBAL_NUM_STEPS), unit='steps')
        avgs = []
        # for each iteration
        for self.step in pbar:
            # run one simulation pass; self.step also drives the epsilon schedule
            avgs += self.run_simulation()
        print("avgs: ", avgs)
        mov_avg = np.convolve(np.array(avgs), np.ones(4), 'valid') / 4
        print("mov avg: ", list(mov_avg))
        print(type(mov_avg))
        print(mov_avg.shape)
        plt.clf()
        plt.plot(list(mov_avg))
        plt.title('running average of average rewards')
        plt.savefig("Results.png")
        plt.show()
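
A minimal wiring sketch for the Agent above; load_graphs and GraphEnv are placeholder names for however the surrounding project builds its S2VGraph lists and environment, and cmd_args plus the GLOBAL_* constants are assumed to be configured elsewhere.

# Hypothetical setup -- the loader and environment class below are stand-ins,
# not part of the code above.
train_graphs = load_graphs('train.pkl')   # -> list of S2VGraph
env = GraphEnv(train_graphs)

agent = Agent(train_graphs, test_g_list=None, env=env)  # test set defaults to the training graphs
agent.train()   # runs GLOBAL_NUM_STEPS passes of run_simulation() and saves Results.png
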
Example #2
class DqnAgent():
    def __init__(self,
                 obs_dims,
                 act_dim,
                 lr=1e-3,
                 gamma=0.99,
                 replay_buffer_size=10000,
                 batch_size=64,
                 epsilon_min=0.01,
                 epsilon_dec=5e-5,
                 target_update_frequency=64):
        self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
        self.batch_size = batch_size
        self.q_eval = QNet(obs_dims, act_dim)
        self.q_target = QNet(obs_dims, act_dim)
        self.obs_dims = obs_dims
        self.act_dim = act_dim
        self.learn_ctr = 0
        self.target_update_frequency = target_update_frequency
        self.gamma = gamma
        self.epsilon = 1
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()

    def update_target(self):
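        # Sync the target network with the online network every
        # target_update_frequency learning steps; in between, it stays frozen.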
        if self.learn_ctr % self.target_update_frequency == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec

    def choose_action(self, obs):
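        # Epsilon-greedy: explore with probability epsilon, otherwise act
        # greedily with respect to the online network.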
        if np.random.sample() < self.epsilon:
            return np.random.randint(self.act_dim)
        else:
            obs = torch.tensor(np.expand_dims(obs, axis=0), dtype=torch.float)
            return torch.argmax(self.q_eval(obs)).item()

    def store_transition(self, obs, act, rew, _obs, done):
        self.buffer.push(obs, act, rew, _obs, done)

    def sample_replay_buffer(self):
        return self.buffer.sample(self.batch_size)

    def learn(self):
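        # One DQN update: sample a minibatch from the replay buffer, build the
        # bootstrapped target r + gamma * max_a' Q_target(s', a') for
        # non-terminal transitions, and regress Q_eval(s, a) towards it.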
        self.optimizer.zero_grad()
        obs, act, rew, _obs, done = self.sample_replay_buffer()
        obs = torch.tensor(obs, dtype=torch.float)
        act = torch.tensor(act, dtype=torch.long)
        rew = torch.tensor(rew, dtype=torch.float)  # rewards are real-valued; casting to long would truncate them
        _obs = torch.tensor(_obs, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.long)
        idxs = torch.tensor(np.arange(self.batch_size), dtype=torch.long)
        q_pred = self.q_eval(obs)[idxs, act]
        q_next = self.q_target(_obs).max(dim=1)[0].detach()  # no gradient through the target network
        q_target = rew + (1 - done) * self.gamma * q_next
        loss = self.loss_fn(q_target, q_pred)
        loss.backward()
        self.optimizer.step()
        self.learn_ctr += 1  # count learning steps so the target net syncs periodically, not on every call
        self.update_target()
        self.decrement_epsilon()
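
A sketch of a training loop around DqnAgent, assuming a classic Gym-style environment whose step() returns (obs, reward, done, info); the environment, episode count, and dimensions below are illustrative, not part of the example above.

# Illustrative training loop; `env` is any Gym-style environment whose
# observation shape matches obs_dims and whose action space has act_dim
# discrete actions.
agent = DqnAgent(obs_dims=(4,), act_dim=2)
total_steps = 0

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        act = agent.choose_action(obs)
        next_obs, rew, done, _ = env.step(act)
        agent.store_transition(obs, act, rew, next_obs, done)
        total_steps += 1
        if total_steps >= agent.batch_size:   # wait until one full batch is stored
            agent.learn()
        obs = next_obs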