Example 1
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
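
A minimal driver sketch for this function, with `QLearning` standing in for any tabular algorithm class accepted as `algorithm_class`. The import path is an assumption about the old `mushroom` library these snippets appear to use, and the decay exponent value is arbitrary:

import numpy as np
from mushroom.algorithms.value import QLearning  # assumed import path

n_runs = 10
results = [experiment(QLearning, decay_exp=.51) for _ in range(n_runs)]
rewards, max_qs = zip(*results)

mean_reward = np.mean(np.stack(rewards), axis=0)   # per-step reward, averaged over runs
mean_max_q = np.mean(np.stack(max_qs), axis=0)     # max Q at the start state, averaged over runs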
Example 2
    def _fit_boosted(self, x):
        """
        Single fit iteration for boosted FQI.

        Args:
            x (list): the dataset.

        """
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        if self._target is None:
            self._target = reward
        else:
            self._next_q += self.approximator.predict(next_state,
                                                      idx=self._idx - 1)
            if np.any(absorbing):
                self._next_q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(self._next_q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self._target -= self._prediction
        self._prediction += self._target

        self.approximator.fit(state,
                              action,
                              self._target,
                              idx=self._idx,
                              **self._fit_params)

        self._idx += 1
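
The subtraction of `self._prediction` before fitting and the accumulation afterwards are the boosting step: each new regressor is trained on the residual left by the sum of the previous ones. A small standalone numpy sketch of that residual bookkeeping (all values are made up for illustration):

import numpy as np

target = np.array([1., 2., 3.])        # stands in for a Bellman target r + gamma * max Q
prediction = np.zeros_like(target)     # running sum of all regressors fitted so far

for _ in range(5):
    residual = target - prediction     # what the next regressor has to explain
    fitted = .5 * residual             # an imperfect "regressor", for illustration
    prediction += fitted               # the ensemble prediction approaches the target

print(np.abs(target - prediction).max())   # the residual shrinks at every boosting step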
Example 3
    def fit(self, dataset):
        phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
            dataset, self.phi)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)

        norm = np.inf
        while norm > self._epsilon:
            q = self.approximator.predict(phi_next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            next_action = np.argmax(q, axis=1).reshape(-1, 1)
            phi_next_state_next_action = get_action_features(
                phi_next_state, next_action, self.mdp_info.action_space.n)

            tmp = phi_state_action - self.mdp_info.gamma *\
                phi_next_state_next_action
            self._A += phi_state_action.T.dot(tmp)
            self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

            old_w = self.approximator.get_weights()
            if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
                w = np.linalg.solve(self._A, self._b).ravel()
            else:
                w = np.linalg.pinv(self._A).dot(self._b).ravel()
            self.approximator.set_weights(w)

            norm = np.linalg.norm(w - old_w)
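
The `get_action_features` call builds per-action feature blocks so that a single linear weight vector can represent Q(s, a) for every discrete action. A hedged numpy sketch of that construction (an assumption about its behaviour, not the library code):

import numpy as np

def action_block_features(phi, action, n_actions):
    # Place the state features in the block of the selected action,
    # zeros elsewhere: shape (n_samples, n_features * n_actions).
    n_samples, n_features = phi.shape
    out = np.zeros((n_samples, n_features * n_actions))
    for i in range(n_samples):
        a = int(action[i])
        out[i, a * n_features:(a + 1) * n_features] = phi[i]
    return out

phi = np.array([[1., 2.], [3., 4.]])
action = np.array([[0], [1]])
print(action_block_features(phi, action, n_actions=2))
# [[1. 2. 0. 0.]
#  [0. 0. 3. 4.]]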
Example 4
    def _fit(self, x):
        state = list()
        action = list()
        reward = list()
        next_state = list()
        absorbing = list()

        half = len(x) // 2
        for i in range(2):
            s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(ss)
            absorbing.append(ab)

        if self._target is None:
            self._target = reward
        else:
            for i in range(2):
                q_i = self.approximator.predict(next_state[i], idx=i)

                amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
                max_q = self.approximator.predict(next_state[i], amax_q,
                                                  idx=1 - i)
                if np.any(absorbing[i]):
                    max_q *= 1 - absorbing[i]
                self._target[i] = reward[i] + self.mdp_info.gamma * max_q

        for i in range(2):
            self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                                  **self._fit_params)
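
The key step above is the cross-estimation: greedy actions are selected with one approximator (`idx=i`) and evaluated with the other (`idx=1 - i`), which is what reduces the maximization bias of plain FQI. A minimal standalone sketch of that selection/evaluation split (illustrative arrays only):

import numpy as np

# Two independent Q estimates over 3 actions for the same batch of states.
q_a = np.array([[1.0, 2.5, 0.3],
                [0.2, 0.1, 0.9]])
q_b = np.array([[0.8, 1.9, 0.5],
                [0.3, 0.2, 0.7]])

# Select the greedy action with one estimate...
amax = np.argmax(q_a, axis=1)
# ...but evaluate it with the other, instead of taking max(q_a, axis=1).
double_q = q_b[np.arange(len(amax)), amax]
print(double_q)   # [1.9 0.7]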
Example 5
    def fit(self, dataset):
        phi_state, action, reward, phi_next_state, absorbing, _ = parse_dataset(
            dataset, self.phi)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)

        norm = np.inf
        while norm > self._epsilon:
            q = self.approximator.predict(phi_next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            next_action = np.argmax(q, axis=1).reshape(-1, 1)
            phi_next_state_next_action = get_action_features(
                phi_next_state,
                next_action,
                self.mdp_info.action_space.n
            )

            tmp = phi_state_action - self.mdp_info.gamma *\
                phi_next_state_next_action
            self._A += phi_state_action.T.dot(tmp)
            self._b += (phi_state_action.T.dot(reward)).reshape(-1, 1)

            old_w = self.approximator.get_weights()
            if np.linalg.matrix_rank(self._A) == self._A.shape[1]:
                w = np.linalg.solve(self._A, self._b).ravel()
            else:
                w = np.linalg.pinv(self._A).dot(self._b).ravel()
            self.approximator.set_weights(w)

            norm = np.linalg.norm(w - old_w)
Example 6
    def _fit(self, x):
        state = list()
        action = list()
        reward = list()
        next_state = list()
        absorbing = list()

        half = len(x) // 2
        for i in range(2):
            s, a, r, ss, ab, _ = parse_dataset(x[i * half:(i + 1) * half])
            state.append(s)
            action.append(a)
            reward.append(r)
            next_state.append(ss)
            absorbing.append(ab)

        if self._target is None:
            self._target = reward
        else:
            for i in range(2):
                q_i = self.approximator.predict(next_state[i], idx=i)
                if np.any(absorbing[i]):
                    q_i *= 1 - absorbing[i].reshape(-1, 1)

                amax_q = np.expand_dims(np.argmax(q_i, axis=1), axis=1)
                max_q = self.approximator.predict(next_state[i], amax_q,
                                                  idx=1 - i)
                self._target[i] = reward[i] + self.mdp_info.gamma * max_q

        for i in range(2):
            self.approximator.fit(state[i], action[i], self._target[i], idx=i,
                                  **self.params['fit_params'])
Example 7
    def _fit_boosted(self, x):
        """
        Single fit iteration for boosted FQI.

        Args:
            x (list): the dataset.

        """
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        if self._target is None:
            self._target = reward
        else:
            self._next_q += self.approximator.predict(next_state,
                                                      idx=self._idx - 1)
            if np.any(absorbing):
                self._next_q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(self._next_q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self._target -= self._prediction
        self._prediction += self._target

        self.approximator.fit(state, action, self._target, idx=self._idx,
                              **self._fit_params)

        self._idx += 1
Example 8
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = torch.tensor(x, dtype=torch.float)
        act = torch.tensor(u, dtype=torch.float)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = torch.tensor(np_adv, dtype=torch.float)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
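
`compute_gae` is expected to return the critic targets and the generalized advantage estimates. A generic sketch of the standard GAE recursion, written against precomputed state values rather than the critic itself (an assumption about its semantics, not the library implementation):

import numpy as np

def gae(v, v_next, r, absorbing, last, gamma, lam):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - absorbing_t) - V(s_t)
    # A_t = delta_t + gamma * lam * A_{t+1}, with the recursion reset at episode ends.
    delta = r + gamma * v_next * (1. - absorbing) - v
    adv = np.zeros_like(delta)
    running = 0.
    for t in reversed(range(len(delta))):
        running = delta[t] + gamma * lam * (1. - last[t]) * running
        adv[t] = running
    return v + adv, adv    # critic targets, advantages

v = np.array([.5, .4, .3])
v_next = np.array([.4, .3, 0.])
r = np.array([1., 0., 1.])
absorbing = np.array([0., 0., 1.])
last = np.array([0., 0., 1.])
print(gae(v, v_next, r, absorbing, last, gamma=.99, lam=.95))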
Example 9
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP

    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp, size=mdp.info.size)

    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example 10
    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(
            dataset)

        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        self._V.fit(state, v, **self._critic_fit_params)

        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)
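
`compute_advantage_montecarlo` presumably builds Monte Carlo targets (discounted returns-to-go) and subtracts the critic's estimate to obtain advantages. A generic sketch of that idea over precomputed state values (an assumption about its semantics, not the library code):

import numpy as np

def montecarlo_advantage(v, r, absorbing, gamma):
    # Discounted returns-to-go: G_t = r_t + gamma * G_{t+1}, reset on absorbing steps.
    g = np.zeros_like(r)
    running = 0.
    for t in reversed(range(len(r))):
        running = r[t] + gamma * (1. - absorbing[t]) * running
        g[t] = running
    return g, g - v        # critic targets, advantages against the critic

v = np.array([.5, .4, .3])
r = np.array([1., 0., 1.])
absorbing = np.array([0., 0., 1.])
print(montecarlo_advantage(v, r, absorbing, gamma=.99))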
Example 11
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1,
                                        decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1,
                                      decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol,
                                                   window=50)
    else:
        beta = VarianceIncreasingParameter(value=1,
                                           size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
Example 12
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Policy update
        self._old_policy = deepcopy(self.policy)
        old_pol_dist = self._old_policy.distribution_t(obs)
        old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

        zero_grad(self.policy.parameters())
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward()
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        self._line_search(obs, act, adv, old_log_prob, old_pol_dist, prev_loss,
                          stepdir)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
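
`self._conjugate_gradient(g, obs, old_pol_dist)` solves the natural-gradient system F x = g using only Fisher-vector products. A generic conjugate-gradient sketch of that pattern, with a plain symmetric positive-definite matrix standing in for the Fisher-vector product:

import numpy as np

def conjugate_gradient(matvec, b, n_iters=10, tol=1e-10):
    # Solve A x = b using only matrix-vector products A @ p.
    x = np.zeros_like(b)
    r = b.copy()            # residual b - A x (x is zero initially)
    p = r.copy()
    rs_old = r.dot(r)
    for _ in range(n_iters):
        Ap = matvec(p)
        alpha = rs_old / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        rs_new = r.dot(r)
        if rs_new < tol:
            break
        p = r + (rs_new / rs_old) * p
        rs_old = rs_new
    return x

A = np.array([[4., 1.], [1., 3.]])   # stands in for the Fisher matrix
b = np.array([1., 2.])               # stands in for the policy gradient
x = conjugate_gradient(lambda v: A.dot(v), b)
print(x, A.dot(x))                   # x approximately solves A x = b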
Example 13
    def _fit(self, x):
        """
        Single fit iteration.

        Args:
            x (list): the dataset.

        """
        state, action, reward, next_state, absorbing, _ = parse_dataset(x)
        if self._target is None:
            self._target = reward
        else:
            q = self.approximator.predict(next_state)
            if np.any(absorbing):
                q *= 1 - absorbing.reshape(-1, 1)

            max_q = np.max(q, axis=1)
            self._target = reward + self.mdp_info.gamma * max_q

        self.approximator.fit(state, action, self._target, **self._fit_params)
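
The target above is the one-step Bellman optimality backup, r + gamma * max_a Q(s', a), with the bootstrap zeroed out on absorbing transitions. A small numpy illustration of that masking (arrays are made up):

import numpy as np

reward = np.array([1., 0., 1.])
q_next = np.array([[0.5, 2.0],      # Q(s', a) for each next state
                   [1.0, 0.2],
                   [3.0, 1.5]])
absorbing = np.array([0, 0, 1])     # the last transition ends the episode

q_next = q_next * (1 - absorbing.reshape(-1, 1))   # no bootstrap from terminal states
target = reward + 0.99 * np.max(q_next, axis=1)
print(target)   # [2.98 0.99 1.  ]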
Example 14
    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = torch.tensor(x, dtype=torch.float)
        act = torch.tensor(u, dtype=torch.float)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = torch.tensor(np_adv, dtype=torch.float)

        # Policy update
        old_pol_dist = self.policy.distribution_t(obs)
        old_log_prob = self.policy.log_prob_t(obs, act).detach()

        self._zero_grad()
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward(retain_graph=True)
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        shs = .5 * stepdir.dot(self._fisher_vector_product(
            torch.from_numpy(stepdir), obs, old_pol_dist)
        )
        lm = np.sqrt(shs / self._max_kl)
        fullstep = stepdir / lm
        stepsize = 1.

        theta_old = self.policy.get_weights()

        violation = True

        for _ in range(self._n_epochs_line_search):
            theta_new = theta_old + fullstep * stepsize
            self.policy.set_weights(theta_new)

            new_loss = self._compute_loss(obs, act, adv, old_log_prob)
            kl = self._compute_kl(obs, old_pol_dist)
            improve = new_loss - prev_loss
            if kl <= self._max_kl * 1.5 or improve >= 0:
                violation = False
                break
            stepsize *= .5

        if violation:
            self.policy.set_weights(theta_old)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1
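
The loop above is a backtracking line search: the natural-gradient step is halved until the KL and improvement checks pass, and the old weights are restored if no step qualifies. A generic sketch of that backtracking pattern, with a toy acceptance rule standing in for the KL / improvement test:

import numpy as np

def backtracking_line_search(theta_old, fullstep, accept, n_trials=10):
    # Try theta_old + stepsize * fullstep, halving the step until accepted.
    stepsize = 1.
    for _ in range(n_trials):
        theta_new = theta_old + stepsize * fullstep
        if accept(theta_new):
            return theta_new
        stepsize *= .5
    return theta_old          # no acceptable step: keep the old parameters

theta = np.array([0., 0.])
step = np.array([4., 4.])
accepted = backtracking_line_search(theta, step,
                                    lambda th: np.linalg.norm(th) <= 1.5)
print(accepted)   # [1. 1.]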
Example 15
def experiment(mdp, test_states, test_actions, test_q, names):
    np.random.seed()

    n_games = len(mdp)
    input_shape = [(m.info.observation_space.shape[0], ) for m in mdp]
    n_actions_per_head = [(m.info.action_space.n, ) for m in mdp]

    test_states = np.array([test_states]).repeat(len(mdp), 0).reshape(-1, 2)
    test_actions = np.array([test_actions]).repeat(len(mdp), 0).reshape(-1, 1)
    test_idxs = np.ones(len(test_states), dtype=int) * np.arange(
        len(mdp)).repeat(len(test_states) // len(mdp), 0)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedyMultiple(parameter=epsilon,
                           n_actions_per_head=n_actions_per_head)

    # Approximator
    optimizer = {'class': optim.Adam, 'params': dict()}
    loss = LossFunction(n_games)

    approximator_params = dict(network=Network,
                               input_shape=input_shape,
                               output_shape=n_actions_per_head,
                               optimizer=optimizer,
                               loss=loss,
                               features='sigmoid',
                               n_features=30,
                               use_cuda=True,
                               quiet=False)

    approximator = TorchApproximator

    dataset = list()
    len_datasets = list()
    for i in range(len(mdp)):
        d = pickle.load(open('dataset_%s.pkl' % names[i], 'rb'))
        len_datasets.append(len(d))
        dataset += d

    # Agent
    algorithm_params = dict(n_iterations=1,
                            n_actions_per_head=n_actions_per_head,
                            fit_params=dict(patience=100, epsilon=1e-6))
    agent = FQI(approximator,
                pi,
                mdp[0].info,
                approximator_params=approximator_params,
                **algorithm_params)

    qs = list()
    scores = list()

    idxs = list()
    for i, l in enumerate(len_datasets):
        idxs += (np.ones(l, dtype=int) * i).tolist()
    idxs = np.array(idxs)

    state, action, reward, next_state, absorbing, _ = parse_dataset(dataset)
    for _ in trange(50, dynamic_ncols=True, disable=False, leave=False):
        agent._fit(state, action, reward, next_state, absorbing, idxs)
        # Algorithm
        core = Core(agent, mdp)
        test_epsilon = Parameter(0.)
        pi.set_parameter(test_epsilon)
        dataset = core.evaluate(n_steps=100)

        qs.append(
            agent.approximator.predict(test_states,
                                       test_actions,
                                       idx=test_idxs))
        scores.append(np.mean(compute_J(dataset, mdp[0].info.gamma)))

    qs_hat = np.array(qs)
    avi_diff = list()
    for i in range(len(qs_hat)):
        avi_diff.append(
            np.linalg.norm(qs_hat[i] - test_q, ord=1) / len(test_q))

    print(avi_diff, scores)

    return avi_diff, scores
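
`compute_J` is used here to score the evaluation rollouts; it is expected to return the discounted return of each episode, whose mean is reported. A minimal sketch of that quantity for a single episode (assumed semantics, made-up rewards):

def discounted_return(rewards, gamma):
    # J = sum_t gamma^t * r_t for one episode.
    return sum(gamma ** t * r for t, r in enumerate(rewards))

rewards = [0., 0., 1.]
print(discounted_return(rewards, gamma=0.99))   # 0.9801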