Example 1
def _kifdd_common(
    agent_class,
    domain,
    kernel_resolution,
    threshold=1.0,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
    kernel="gaussian",
):
    kernel_width = (domain.statespace_limits[:, 1] -
                    domain.statespace_limits[:, 0]) / kernel_resolution
    kifdd = KernelizediFDD(
        domain,
        sparsify=True,
        kernel=getattr(representations, kernel),
        kernel_args=[kernel_width],
        active_threshold=0.01,
        discover_threshold=threshold,
        normalization=True,
        max_active_base_feat=10,
        max_base_feat_sim=0.5,
    )
    return agent_class(
        eGreedy(kifdd, epsilon=0.1),
        kifdd,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
Example 2
def _make_experiment(exp_id=1, path="./Results/Tmp/test_PST"):
    """
    Each file specifying an experimental setup should contain a
    make_experiment function which returns an instance of the Experiment
    class with everything set up.

    @param exp_id: number used to seed the random number generators
    @param path: output directory where logs and results are stored
    """
    # Domain:
    NUM_UAV = 3
    domain = PST(NUM_UAV=NUM_UAV)

    # Representation
    # discretization only needed for continuous state spaces, discarded otherwise
    representation = IncrementalTabular(domain)

    # Policy
    policy = eGreedy(representation, epsilon=0.1)

    # Agent
    agent = SARSA(
        representation=representation,
        policy=policy,
        discount_factor=domain.discount_factor,
        initial_learn_rate=0.1,
    )
    checks_per_policy = 2
    max_steps = 30
    num_policy_checks = 2
    experiment = Experiment(**locals())
    return experiment
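For illustration, the Experiment returned by _make_experiment is normally driven by its run/save methods. A minimal usage sketch, assuming the rlpy Experiment API exposes run() and save() as in the library's example scripts:

if __name__ == "__main__":
    # Usage sketch (assumption): Experiment.run() executes the learning loop
    # and Experiment.save() writes results under the path passed above.
    experiment = _make_experiment(exp_id=1)
    experiment.run()
    experiment.save()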
Example 3
def _rbf_common(
    agent_class,
    domain,
    seed=1,
    num_rbfs=96,
    resolution=21,
    initial_learn_rate=0.1,
    lambda_=0.3,
    boyan_N0=100,
):
    rbf = RBF(
        domain,
        num_rbfs=num_rbfs,
        resolution_max=resolution,
        resolution_min=resolution,
        const_feature=False,
        normalize=True,
        seed=seed,
    )
    return agent_class(
        eGreedy(rbf, epsilon=0.1),
        rbf,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
Example 4
    def _solve_impl(self):
        """Solve the domain MDP."""
        self.bellman_updates = 0
        self.policy_improvement_iteration = 0
        self.start_time = clock()

        # Initialize the policy
        # Copy the representation so that weight changes during evaluation
        # do not affect the policy
        policy = eGreedy(deepcopy(self.representation),
                         epsilon=0,
                         deterministic=True)

        # Set policy_changes to True so the while loop runs at least once
        policy_changes = True

        while policy_changes and self.has_time():
            # Evaluate the policy
            if self.policy_evaluation(policy):
                self.logger.info("Converged!")

            # Improve the policy
            self.policy_improvement_iteration += 1
            policy, policy_changes = self.policy_improvement(policy)

        self.log_value()
Example 5
def _ifddk_common(
    agent_class,
    domain,
    epsilon=0.1,
    discretization=20,
    threshold=1.0,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
):
    ifddk = iFDDK(
        domain,
        discovery_threshold=threshold,
        initial_representation=IndependentDiscretization(
            domain, discretization=discretization),
        sparsify=True,
        useCache=True,
        lazy=True,
        lambda_=lambda_,
    )
    return agent_class(
        eGreedy(ifddk, epsilon=epsilon),
        ifddk,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
Example 6
def tabular_q(
    domain,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    discretization=20,
    lambda_=0.3,
    initial_learn_rate=0.1,
    boyan_N0=100,
    incremental=False,
):
    if incremental:
        tabular = IncrementalTabular(domain, discretization=discretization)
    else:
        tabular = Tabular(domain, discretization=discretization)
    return Q_Learning(
        eGreedy(
            tabular,
            epsilon=epsilon,
            epsilon_decay=epsilon_decay,
            epsilon_min=epsilon_min,
        ),
        tabular,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
Example 7
def _fourier_common(
    agent_class,
    domain,
    order=3,
    scaling=False,
    initial_learn_rate=0.1,
    lambda_=0.3,
    boyan_N0=100,
):
    fourier = Fourier(domain, order=order, scaling=scaling)
    return agent_class(
        eGreedy(fourier, epsilon=0.1),
        fourier,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        learn_rate_decay_mode="boyan",
        boyan_N0=boyan_N0,
    )
Example 8
def tabular_ucbvi(
    domain,
    seed,
    show_reward=False,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(tabular,
                     epsilon=epsilon,
                     epsilon_decay=epsilon_decay,
                     epsilon_min=epsilon_min)
    return UCBVI(policy,
                 tabular,
                 domain.discount_factor,
                 seed=seed,
                 show_reward=show_reward)
Example 9
def tile_ggq(domain,
             res_mat,
             lambda_=0.3,
             initial_learn_rate=0.1,
             boyan_N0=100):
    tile = TileCoding(
        domain,
        memory=2000,
        num_tilings=[1] * res_mat.shape[0],
        resolution_matrix=res_mat,
        safety="none",
    )
    return GreedyGQ(
        eGreedy(tile, epsilon=0.1),
        tile,
        discount_factor=domain.discount_factor,
        lambda_=lambda_,
        initial_learn_rate=initial_learn_rate,
        boyan_N0=boyan_N0,
    )
Example 10
def _make_experiment(domain,
                     exp_id=1,
                     path="./Results/Tmp/test_InfTrackCartPole"):
    ## Representation
    # discretization only needed for continuous state spaces, discarded otherwise
    representation = Tabular(domain)

    ## Policy
    policy = eGreedy(representation, epsilon=0.2)

    ## Agent
    agent = SARSA(
        representation=representation,
        policy=policy,
        discount_factor=domain.discount_factor,
        initial_learn_rate=0.1,
    )
    checks_per_policy = 3
    max_steps = 50
    num_policy_checks = 3
    experiment = Experiment(**locals())
    return experiment
Example 11
def test_qlearn_valfun_chain():
    """
    Check if Q-Learning computes the value function of a simple Markov chain correctly.
    This only tests value-function estimation; only one action is possible.
    """
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = Q_Learning(pol, rep, 0.9, lambda_=0.0)
    for i in range(1000):
        if i % 4 == 3:
            continue
        agent.learn(
            np.array([i % 4]),
            [0],
            0,
            1.0,
            np.array([(i + 1) % 4]),
            [0],
            0,
            (i + 2) % 4 == 0,
        )
    V_true = np.array([2.71, 1.9, 1, 0])
    np.testing.assert_allclose(rep.weight_vec, V_true)
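The expected values in V_true follow directly from the chain used above: every transition yields reward 1.0, the transition out of state 2 terminates the episode, and the discount factor is 0.9, so the values can be backed up from the terminal state. A small sanity-check sketch (standalone, not part of the test suite):

import numpy as np

# Back up the values of the 4-state chain: reward 1.0 per step,
# termination after leaving state 2, discount factor 0.9.
gamma = 0.9
V = np.zeros(4)
V[3] = 0.0                 # terminal state, never updated
V[2] = 1.0 + gamma * V[3]  # 1.0
V[1] = 1.0 + gamma * V[2]  # 1.9
V[0] = 1.0 + gamma * V[1]  # 2.71
np.testing.assert_allclose(V, [2.71, 1.9, 1.0, 0.0])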
Example 12
def tabular_mbie_eb(
    domain,
    seed,
    show_reward=False,
    beta=0.1,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(tabular,
                     epsilon=epsilon,
                     epsilon_decay=epsilon_decay,
                     epsilon_min=epsilon_min)
    return MBIE_EB(
        policy,
        tabular,
        domain.discount_factor,
        beta=beta,
        seed=seed,
        show_reward=show_reward,
    )
Example 13
def tabular_opt_psrl(
    domain,
    seed,
    show_reward=False,
    epsilon=0.1,
    epsilon_decay=0.0,
    epsilon_min=0.0,
    n_samples=10,
    vi_threshold=1e-6,
):
    tabular = Tabular(domain, discretization=20)
    policy = eGreedy(tabular,
                     epsilon=epsilon,
                     epsilon_decay=epsilon_decay,
                     epsilon_min=epsilon_min)
    return OptimisticPSRL(
        policy,
        tabular,
        domain.discount_factor,
        seed=seed,
        show_reward=show_reward,
        n_samples=n_samples,
    )
Example 14
def test_ggq_valfun_chain():
    """
    Check if Greedy-GQ computes the value function of a simple Markov chain correctly.
    This only tests value-function estimation; only one action is possible.
    """
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = GreedyGQ(pol, rep, lambda_=0.0, discount_factor=0.9)
    for i in range(1000):
        if i % 4 == 3:
            agent.episode_terminated()
            continue
        agent.learn(
            np.array([i % 4]),
            [0],
            0,
            1.0,
            np.array([(i + 1) % 4]),
            [0],
            0,
            (i + 2) % 4 == 0,
        )
    V_true = np.array([2.71, 1.9, 1, 0])
    np.testing.assert_allclose(rep.weight_vec, V_true)
Example 15
    def solve_in_matrix_format(self):
        # While delta_weight_vec > threshold:
        #  1. Gather data following an e-greedy policy
        #  2. Calculate the A and b estimates
        #  3. Calculate new_weight_vec and delta_weight_vec
        # Return a policy greedy w.r.t. the last weight_vec
        self.policy = eGreedy(self.representation, epsilon=self.epsilon)

        # Number of samples to be used for each policy evaluation phase. L1 in
        # the Geramifard et al. FTML 2012 paper
        self.samples_num = 1000

        self.start_time = clock()  # Used to track the total time for solving
        samples = 0
        converged = False
        iteration = 0
        while self.has_time() and not converged:

            #  1. Gather samples following an e-greedy policy
            S, Actions, NS, R, T = self.collect_samples(self.samples_num)
            samples += self.samples_num

            #  2. Calculate A and b estimates
            a_num = self.domain.num_actions
            n = self.representation.features_num
            discount_factor = self.domain.discount_factor

            self.A = np.zeros((n * a_num, n * a_num))
            self.b = np.zeros((n * a_num, 1))
            for i in range(self.samples_num):
                phi_s_a = self.representation.phi_sa(S[i], T[i],
                                                     Actions[i, 0]).reshape(
                                                         (-1, 1))
                E_phi_ns_na = self.calculate_expected_phi_ns_na(
                    S[i], Actions[i, 0], self.ns_samples).reshape((-1, 1))
                d = phi_s_a - discount_factor * E_phi_ns_na
                self.A += np.outer(phi_s_a, d.T)
                self.b += phi_s_a * R[i, 0]

            #  3. Calculate new_weight_vec and delta_weight_vec
            new_weight_vec, solve_time = solveLinear(regularize(self.A),
                                                     self.b)
            iteration += 1
            if solve_time > 1:
                self.logger.info(
                    "#%d: Finished Policy Evaluation. Solve Time = %0.2f(s)" %
                    (iteration, solve_time))
            weight_diff = l_norm(new_weight_vec -
                                 self.representation.weight_vec)
            converged = weight_diff < self.convergence_threshold
            self.representation.weight_vec = new_weight_vec
            (
                perf_return,
                perf_steps,
                perf_term,
                perf_disc_return,
            ) = self.performance_run()
            self.logger.info(
                "#%d [%s]: Samples=%d, ||weight-Change||=%0.4f, Return = %0.4f"
                % (
                    iteration,
                    hhmmss(deltaT(self.start_time)),
                    samples,
                    weight_diff,
                    perf_return,
                ))
            if self._visualize_mode:
                self.domain.show_learning(self.representation)

            # store stats
            self.result["samples"].append(samples)
            self.result["return"].append(perf_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(perf_steps)
            self.result["terminated"].append(perf_term)
            self.result["discounted_return"].append(perf_disc_return)
            self.result["iteration"].append(iteration)

        if converged:
            self.logger.info("Converged!")

        self.log_value()
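The A and b accumulated in the loop above are the least-squares fixed-point estimates A = sum_i phi(s_i, a_i) (phi(s_i, a_i) - gamma * E[phi(s'_i, a'_i)])^T and b = sum_i phi(s_i, a_i) * r_i, after which the new weights solve A w = b. A minimal standalone sketch of that solve step, assuming the per-sample features have already been stacked into NumPy arrays (the names lstd_q_solve, Phi, PhiNext, and rewards are illustrative, not part of the codebase):

import numpy as np

def lstd_q_solve(Phi, PhiNext, rewards, discount_factor, reg=1e-6):
    """Least-squares fixed-point solve sketch for the A w = b step above.

    Phi      -- (N, d) array of features phi(s_i, a_i) for the sampled transitions
    PhiNext  -- (N, d) array of expected next features E[phi(s'_i, a'_i)]
    rewards  -- (N,) array of sampled rewards
    reg      -- small ridge term standing in for regularize(self.A)
    """
    A = Phi.T @ (Phi - discount_factor * PhiNext)  # sum of outer products
    b = Phi.T @ rewards                            # sum of phi(s_i, a_i) * r_i
    return np.linalg.solve(A + reg * np.eye(A.shape[0]), b)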
Example 16
def test_deepcopy():
    rep = MockRepresentation()
    pol = eGreedy(rep)
    agent = SARSA(pol, rep, 0.9, lambda_=0.0)
    copied_agent = copy.deepcopy(agent)
    assert agent.lambda_ == copied_agent.lambda_
Example 17
def tabular_sarsa(domain, discretization=20, lambda_=0.3):
    tabular = Tabular(domain, discretization=discretization)
    policy = eGreedy(tabular, epsilon=0.1)
    return SARSA(policy, tabular, domain.discount_factor, lambda_=lambda_)
Example 18
    def _solve_impl(self):
        """Solve the domain MDP."""

        self.start_time = clock()  # Used to track the total time for solving
        self.bellman_updates = 0
        converged = False
        PI_iteration = 0

        # The policy is maintained as a separate copy of the representation.
        # This way, as the representation is updated, the policy remains intact.
        policy = eGreedy(deepcopy(self.representation),
                         epsilon=0,
                         deterministic=True)
        a_num = self.domain.num_actions

        while self.has_time() and not converged:

            # Policy evaluation: update the value representation under the current policy
            self.traj_based_policy_evaluation(policy)
            PI_iteration += 1

            # The weight vector (theta) can grow if the representation is
            # expanded, hence pad the policy's weights with zeros before comparing
            additional_dim = (self.representation.features_num -
                              policy.representation.features_num)
            padded_theta = np.hstack(
                (policy.representation.weight, np.zeros(
                    (a_num, additional_dim))))

            # Calculate the change in the weight_vec as L2-norm
            weight_diff = np.linalg.norm(padded_theta -
                                         self.representation.weight)
            converged = weight_diff < self.convergence_threshold

            # Update the underlying value function of the policy
            policy.representation = deepcopy(
                self.representation)  # self.representation

            (
                perf_return,
                perf_steps,
                perf_term,
                perf_disc_return,
            ) = self.performance_run()
            self.logger.info(
                "PI #%d [%s]: BellmanUpdates=%d, ||delta-weight_vec||=%0.4f, "
                "Return=%0.3f, steps=%d, features=%d" % (
                    PI_iteration,
                    hhmmss(deltaT(self.start_time)),
                    self.bellman_updates,
                    weight_diff,
                    perf_return,
                    perf_steps,
                    self.representation.features_num,
                ))

            if self._visualize_mode:
                self.domain.show_learning(self.representation)

            # store stats
            self.result["bellman_updates"].append(self.bellman_updates)
            self.result["return"].append(perf_return)
            self.result["planning_time"].append(deltaT(self.start_time))
            self.result["num_features"].append(
                self.representation.features_num)
            self.result["steps"].append(perf_steps)
            self.result["terminated"].append(perf_term)
            self.result["discounted_return"].append(perf_disc_return)
            self.result["policy_improvemnt_iteration"].append(PI_iteration)

        if converged:
            self.logger.info("Converged!")
        self.log_value()
Example 19
def tabular_lspi(domain, max_steps, discretization=20):
    tabular = Tabular(domain, discretization=discretization)
    policy = eGreedy(tabular, epsilon=0.1)
    return LSPI(policy, tabular, domain.discount_factor, max_steps, 1000)