Example 1
def day_pass(k, v, d):
    # Roll out the stored policy greedily on day d, recording per-timestep
    # actions, rewards and Q-values
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]

    task.starting_day_index = d
    s = task.reset()

    rewards = np.zeros((len(task.prices[0])))
    actions = np.zeros((len(task.prices[0])))

    state_value_list = []

    done = False

    while not done:

        a_list = Q._q_values(s)

        state_value_list.append([s[0], a_list])
        a = np.argmax(a_list)
        s, r, done, _ = task.step([a])

        r = r[0]
        done = done[0]

        actions[task.current_timestep] = a - 1  # [0, 2] -> [-1, 1]
        rewards[task.current_timestep] = r

    print("{0:s} - Day: {1:4d}, Cumulative reward: {2:8.6f}".format(
        k, d, np.sum(rewards)))

    return (d, rewards, actions, state_value_list)
Example 2
def year_pass(k, v):
    # Run day_pass over every day of the year, optionally in parallel with joblib,
    # and save the collected state values, actions and rewards
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]

    task.starting_day_index = 0
    task.reset()
    num_days = task.n_days

    if n_jobs == 1:
        outputs = [day_pass(k, v, d) for d in range(num_days)]
    elif n_jobs > 1:
        outputs = Parallel(n_jobs=n_jobs,
                           max_nbytes=None)(delayed(day_pass)(k, v, d)
                                            for d in range(num_days))
    else:
        raise ValueError("n_jobs must be >= 1")

    days = []
    actions = np.zeros((num_days, len(task.prices[0])))
    rewards = np.zeros((num_days, len(task.prices[0])))
    state_value_list = []

    for (d, r, a, svl) in outputs:

        days.append(d)
        rewards[d, :] = r
        actions[d, :] = a

        state_value_list.extend(svl)

    print("Days:", len(days))
    print("Rewards sum:", np.sum(rewards))
    print("State values list length:", len(state_value_list))

    utils.save_object(state_value_list, save_dataset_path + k)
    utils.save_object([days, actions, rewards], save_actions_path + k)
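year_pass fans day_pass out with joblib's Parallel/delayed. As a minimal, self-contained illustration of that pattern (a toy stand-in function, not the trading task), the results come back as an ordered list that can be unpacked exactly as above:

import numpy as np
from joblib import Parallel, delayed


def toy_day_pass(d):
    # Stand-in for day_pass: fabricate a per-timestep reward vector for day d
    rewards = np.full(5, 0.1 * d)
    return d, rewards


outputs = Parallel(n_jobs=2, max_nbytes=None)(
    delayed(toy_day_pass)(d) for d in range(4))

for d, rewards in outputs:  # results are returned in submission order
    print(d, rewards.sum())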
Example 3
def plot_actions(dataset_path, qw, index, task, n_actions, save_path):
    # Plot, for each of the three actions, the Q-values stored in the dataset (ETR)
    # against those of an MLP Q-function built from the weights qw (NN)
    dataset = utils.load_object(dataset_path)
    dataset = np.array(dataset)

    actions_etr = np.zeros((n_actions, 3))
    for i in range(n_actions):
        for j in range(3):
            actions_etr[i, j] = dataset[i, 1][j]

    actions_nn = np.zeros((n_actions, 3))

    q = MLPQFunction(task.state_dim,
                     task.action_space.n,
                     layers=layers,
                     initial_params=qw)

    task.starting_day_index = 0
    task.reset()

    actions_counter = 0

    for di in range(task.n_days):

        task.starting_day_index = di
        s = task.reset()

        done = False
        while not done:
            a_list = q.value_actions(s)
            actions_nn[actions_counter, :] = a_list
            a = np.argmax(a_list)
            s, r, done, _ = task.step([a])

            done = done[0]

            actions_counter += 1
            if actions_counter >= n_actions:
                break

            percentage = actions_counter * 100 / n_actions
            if percentage % 10 == 0:
                print("Actions evaluation: {0:3d}%".format(int(percentage)))

        if actions_counter >= n_actions:
            break

    fig, ax = plt.subplots(3, sharex=True, figsize=(16, 9))

    for i in range(3):
        ax[i].plot(actions_etr[:10000, i], label="ETR")
        ax[i].plot(actions_nn[:10000, i], label="NN")
        ax[i].set_title("Action " + str(i - 1))
        ax[i].legend()

    plt.savefig(save_path + '.pdf', format='pdf')
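MLPQFunction is defined elsewhere in the repo; purely as an illustration of the interface used above (a state vector in, one Q-value per action out, greedy action by argmax), here is a small torch stand-in. Names, sizes and architecture are made up and are not the repo's implementation.

import numpy as np
import torch
import torch.nn as nn


class TinyQNet(nn.Module):
    # Hypothetical stand-in for MLPQFunction: maps a state to one Q-value per action
    def __init__(self, state_dim, n_actions, hidden=32):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, n_actions))

    def value_actions(self, s):
        with torch.no_grad():
            return self.net(torch.as_tensor(s, dtype=torch.float32)).numpy()


q = TinyQNet(state_dim=3, n_actions=3)
a_list = q.value_actions(np.zeros(3))  # one Q-value per action
a = int(np.argmax(a_list))             # greedy action index in {0, 1, 2}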
Example 4
def transfer(dataset_path, mdp, save_path, iterations, year, seed=0):
    # Distill the dataset of (state, Q-values) pairs into an MLP Q-function,
    # taking one Adam step per action per iteration on the regression loss
    np.random.seed(seed)

    data = utils.load_object(dataset_path)
    data = np.array(data)

    state_dim = mdp.state_dim
    n_actions = mdp.action_space.n
    mdp.starting_day_index = 0
    mdp.reset()
    day_length = len(mdp.prices[0])

    Q = MLPQFunction(state_dim, n_actions, layers=layers)
    Q.init_weights()

    m_t = 0
    v_t = 0
    t = 0

    utils.save_object([], save_path)

    losses = [[], [], []]

    for i in range(iterations):

        # sample time of day
        time = int(np.random.uniform(low=0, high=day_length))
        datapoints = np.arange(0, len(data) - day_length, day_length)
        datapoints += time
        datapoints = data[datapoints]
        np.random.shuffle(datapoints)
        datapoints = datapoints[:batch_size]

        for a in range(n_actions):
            with torch.autograd.set_detect_anomaly(True):
                train_loss, grad = compute_gradient_single_action(
                    Q, datapoints, a)

            losses[a].append(train_loss)

            print(
                "Y: {0}, I: {1:5d}, Time: {2:4d}, A: {3:1d}, Grad: {4:8.6f}, Train Loss: {5:8.6f}"
                .format(year, i, time, a, np.linalg.norm(grad), train_loss))

            Q._w, t, m_t, v_t = utils.adam(Q._w,
                                           grad,
                                           t,
                                           m_t,
                                           v_t,
                                           alpha=alpha)

        if save_freq > 0 and i % save_freq == 0:
            past_Qs = utils.load_object(save_path)
            past_Qs.append(np.array(Q._w))
            utils.save_object(past_Qs, save_path)
            plot_actions(dataset_path, Q._w, i, mdp, n_actions_plot,
                         path + "/plot-" + year + "-" + str(i))

    print(
        "Model selected index: {0:4d}, Train Loss: [{1:8.6f}, {2:8.6f}, {3:8.6f}]"
        .format(i, losses[0][i], losses[1][i], losses[2][i]))

    return [mdp.get_info(), np.array(Q._w), losses]
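utils.adam is not included in these excerpts. Below is a minimal NumPy sketch of one Adam step with a compatible signature (w, grad, t, m_t, v_t, alpha), assuming a descent step and that alpha may be either a scalar or a per-parameter array (as with the adam_mask used in learn). This is an assumption about the helper, not its actual code.

import numpy as np


def adam_step(w, grad, t, m_t, v_t, alpha=0.001,
              beta1=0.9, beta2=0.999, eps=1e-8):
    t += 1
    m_t = beta1 * m_t + (1 - beta1) * grad          # first-moment estimate
    v_t = beta2 * v_t + (1 - beta2) * grad ** 2     # second-moment estimate
    m_hat = m_t / (1 - beta1 ** t)                  # bias correction
    v_hat = v_t / (1 - beta2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)  # masked step if alpha is an array
    return w, t, m_t, v_t


w, t, m_t, v_t = np.zeros(3), 0, 0.0, 0.0
w, t, m_t, v_t = adam_step(w, np.array([0.1, -0.2, 0.3]), t, m_t, v_t)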
Example 5
def learn(
        mdp,
        Q,
        operator,
        max_iter=5000,
        buffer_size=10000,
        batch_size=50,
        alpha_adam=0.001,
        alpha_sgd=0.1,
        lambda_=0.001,
        n_weights=10,
        train_freq=1,
        eval_freq=50,
        random_episodes=0,
        eval_states=None,
        eval_episodes=1,
        mean_episodes=50,
        preprocess=lambda x: x,
        cholesky_clip=0.0001,
        bandwidth=0.00001,
        post_components=1,
        max_iter_ukl=60,
        eps=0.001,
        eta=1e-6,
        time_coherent=False,
        source_file=None,
        seed=None,
        render=False,
        verbose=True,
        ukl_tight_freq=1,
        sources=None,
        # Lambda function to calculate the weights
        weights_calculator=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The Gaussian mixture weights are uniform if not provided
    c_bar = (np.ones(timesteps) / timesteps
             if weights_calculator is None else weights_calculator(ws))

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis],
                            (timesteps, 1, 1))

    # Initialize the posterior parameters to the best approximation, within the
    # posterior family, of the prior
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)

    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c,
                         mu,
                         Sigma,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         phi,
                         psi,
                         max_iter=max_iter_ukl,
                         eps=eps)
    params, phi, psi = init_posterior(c,
                                      mu,
                                      Sigma,
                                      c_bar,
                                      mu_bar,
                                      Sigma_bar,
                                      phi,
                                      psi,
                                      C,
                                      K,
                                      cholesky_clip,
                                      max_iter_ukl,
                                      max_iter=max_iter_ukl * 10,
                                      precision=Sigma_bar_inv,
                                      eta=eta,
                                      eps=eps,
                                      verbose=verbose)

    # Add random episodes if needed
    init_samples = list()
    if random_episodes > 0:
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C),
                     np.ones((C, K)) * alpha_adam, np.zeros(
                         (C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)),
                    np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size),
                         params,
                         Q,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         operator,
                         i + 1,
                         phi,
                         psi,
                         n_weights,
                         lambda_,
                         max_iter_ukl,
                         C,
                         K,
                         precision=Sigma_bar_inv,
                         t_step=i,
                         ukl_tight_freq=ukl_tight_freq)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp,
                                             pi_g,
                                             render=render,
                                             initial_states=eval_states,
                                             n_episodes=eval_episodes,
                                             preprocess=preprocess)[0]
            rew /= C

            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size),
                             params,
                             Q,
                             c_bar,
                             mu_bar,
                             Sigma_bar,
                             operator,
                             i + 1,
                             phi,
                             psi,
                             n_weights,
                             lambda_,
                             C,
                             K,
                             precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " +
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]
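sample_posterior, pack and unpack are not shown in these excerpts. As a rough sketch of what drawing Q-weights from the C-component Gaussian-mixture posterior presumably involves, given mixture weights c, means mu and Cholesky factors L (layout and names here are assumptions, not the repo's code):

import numpy as np


def sample_gmm_weights(c, mu, L, rng=np.random):
    j = rng.choice(len(c), p=c)   # pick a mixture component
    eps = rng.randn(mu.shape[1])  # standard normal noise
    return mu[j] + L[j] @ eps     # w ~ N(mu_j, L_j L_j^T)


C, K = 2, 4
c = np.ones(C) / C                               # uniform mixture weights
mu = np.zeros((C, K))
L = np.stack([np.eye(K) * 0.1 for _ in range(C)])
w = sample_gmm_weights(c, mu, L)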
Example 6
l1 = int(args.l1)
l2 = int(args.l2)
alpha = float(args.alpha)
env = str(args.env)
cart_mass = float(args.cart_mass)
pole_mass = float(args.pole_mass)
pole_length = float(args.pole_length)
n_jobs = int(args.n_jobs)
n_runs = int(args.n_runs)
file_name = str(args.file_name)
dqn = bool(args.dqn)
source_file = str(args.source_file)

# load weights

weights = utils.load_object(source_file)
ws = np.array([w[1] for w in weights])
np.random.shuffle(ws)
params = np.array([w[0][1:] for w in weights])
n_runs = min((n_runs, len(weights)))

# Generate tasks
mc = [
    np.random.uniform(0.5, 1.5) if cart_mass < 0 else cart_mass
    for _ in range(n_runs)
]
mp = [
    np.random.uniform(0.1, 0.3) if pole_mass < 0 else pole_mass
    for _ in range(n_runs)
]
l = [
Example 7
# Learn optimal policies just for one timestep
if just_one_timestep in range(0, len(tasks_data) - 1):
    print("Timestep", just_one_timestep)
    if n_jobs == 1:
        timestep_results = [
            run(tasks_data[just_one_timestep], seeds[j])
            for j in range(seeds_per_task)
        ]
    elif n_jobs > 1:
        timestep_results = Parallel(n_jobs=n_jobs)(
            delayed(run)(tasks_data[just_one_timestep], seeds[j])
            for j in range(seeds_per_task))

    results = utils.load_object(
        sources_file_name)  # sources must already exist.
    results[just_one_timestep] = timestep_results  # overwrite
    utils.save_object(results, sources_file_name)

else:  # Learn optimal policies for all sources
    for i in range(len(tasks_data) - 1):
        print("Timestep", i)
        if n_jobs == 1:
            timestep_results = [
                run(tasks_data[i], seeds[j]) for j in range(seeds_per_task)
            ]
        elif n_jobs > 1:
            timestep_results = Parallel(n_jobs=n_jobs)(
                delayed(run)(tasks_data[i], seeds[j])
                for j in range(seeds_per_task))
Example 8
        c.set_color(0, 0, 0)
        if a == 3:
            c.set_color(0.8, 0.8, 0)
        c.add_attr(rendering.Transform(translation=(0.5, self.size[1] - 1)))

        goal = self.viewer.draw_circle(radius=self.goal_radius)
        goal.set_color(0, 0.8, 0)
        goal.add_attr(rendering.Transform(translation=(self.goal[0], self.goal[1])))

        agent = self.viewer.draw_circle(radius=0.1)
        orientation = self.viewer.draw_line(
            [0., 0.], [.1 * np.cos(self.current_state[2]),
                       .1 * np.sin(self.current_state[2])])
        agent.set_color(.8, 0, 0)
        transform = rendering.Transform(translation=(self.current_state[0], self.current_state[1]))
        agent.add_attr(transform)
        orientation.add_attr(transform)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

if __name__ == '__main__':
    from misc import utils

    mazes = utils.load_object("../scripts/mazes10x10")
    for maze in mazes:
        m = Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2], start_pos=maze[3], walls=maze[4])
        print(maze[4][3])
        for i in range(1000):
            a = np.random.randint(0, 3)
            s, _, _, _ = m.step(a)
            print("Iter {} State {} A {}".format(i,s,a))
            m._render(a=a)
            time.sleep(.1)
Example 9
filenames = [
    #"mgvt_1c",
    #"t2vt_1c",
    #"source_2014",
    "source_2015",
    "source_2016",
    "source_2017",
    #"2015",
    #"2016",
    #"2017",
    #"2018"
]

for filename in filenames:
    results = utils.load_object("visualize-actions/" + filename)
    days = results[0]
    actions = results[1]
    rewards = results[2]

    # transpose actions matrix
    #actions = np.transpose(actions)

    # cumulative sum of the daily rewards
    rewards = np.sum(rewards, axis=1)
    rewards = np.cumsum(rewards)

    def format_time(value, tick_number):
        hours = str(int(2 + value // 60))
        minutes = int(value % 60)
        if minutes < 10:
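The snippet is cut off inside format_time, but its (value, tick_number) signature matches matplotlib's FuncFormatter protocol, so it is presumably registered as an axis tick formatter. A minimal sketch under that assumption (the zero-padding branch is completed here and is not the original code):

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter


def format_time(value, tick_number):
    hours = int(2 + value // 60)  # trading day assumed to start at 02:00
    minutes = int(value % 60)
    return "{0:d}:{1:02d}".format(hours, minutes)


fig, ax = plt.subplots()
ax.plot(range(0, 600, 10), range(60))
ax.xaxis.set_major_formatter(FuncFormatter(format_time))
plt.show()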
Example 10
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior, using a smaller regularization (just enough to make sure Sigma_bar is positive definite)
    params = clip(
        pack(mu_bar,
             np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip**2)),
        cholesky_clip, K)

    # Add random episodes if needed
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            Q._w = sample_posterior(params, K)
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam, np.zeros(
        (K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K),
                    np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for Variance
    v_t_var = 0.

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar,
                         Sigma_bar_inv, operator, i + 1, lambda_, n_weights)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params,v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp,
                                        pi_g,
                                        render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             mu_bar, Sigma_bar_inv, operator, i + 1, lambda_,
                             n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]
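This variant packs a single Gaussian posterior N(mu, L L^T) over the K weights into one flat parameter vector. The pack/unpack/sample_posterior helpers are not shown in these excerpts; the sketch below is one plausible convention consistent with the calls above, not the repo's actual implementation.

import numpy as np


def pack(mu, L):
    # Hypothetical layout: mean first, then the flattened Cholesky factor
    return np.concatenate([mu, L.ravel()])


def unpack(params, K):
    return params[:K], params[K:].reshape(K, K)


def sample_posterior(params, K, rng=np.random):
    mu, L = unpack(params, K)
    return mu + L @ rng.randn(K)  # w ~ N(mu, L L^T)


K = 3
params = pack(np.zeros(K), np.eye(K) * 0.1)
w = sample_posterior(params, K)
mu, L = unpack(params, K)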
Example 11
                 eps_start=eps_start,
                 eps_end=eps_end,
                 exploration_fraction=exploration_fraction,
                 random_episodes=random_episodes,
                 eval_episodes=n_eval_episodes,
                 mean_episodes=mean_episodes,
                 seed=seed,
                 verbose=verbose)


last_rewards = 5

results = []

if just_one_timestep in envs.keys():
    results = utils.load_object(sources_file_name)
    index = list(envs.keys()).index(just_one_timestep)
    mdp = mdps[index]
    print(mdp.get_info())
    if index >= len(results):
        results.append([run(mdp, seed)])
    else:
        results[index] = [run(mdp, seed)]
    print("Last learning rewards:",
          np.around(results[index][0][2][3][-last_rewards:], decimals=5))
    utils.save_object(results, sources_file_name)
else:
    for mdp in mdps:
        print(mdp.get_info())
        results.append([run(mdp, seed)])
        print("Last learning rewards:",
Example 12
        "3c-lambda-0.9": "t2vt_3c_l=0.9",
        "3c-lambda-1.0": "t2vt_3c_l=1.0",
        "3c-likelihood": "t2vt_3c_l=likelihood"
    }

data_index = 3

print(results_path)

out = {"i": []}

for name, file in experiments.items():
    fs = glob.glob(results_path + file + "*.pkl")
    if len(fs) == 0:
        continue
    r = utils.load_object(fs[0][:-4])  # strip the ".pkl" extension

    if not out["i"]:
        out["i"] = r[0][2][0][1:]

    data = [r[i][2][data_index][1:] for i in range(len(r))]

    mean = np.mean(data, axis=0)
    # Two standard errors of the mean (~95% confidence half-width)
    std = 2 * np.std(data, axis=0, ddof=1) / np.sqrt(np.array(data).shape[0])

    out["mean-" + name] = mean
    out["std-" + name] = std

keys = list(out.keys())

with open(results_path + "results.csv", "w", newline='') as outfile:
names = [
    "NT", "GVT", "GVT (TC)", "1-MGVT", "1-MGVT (TC)", "2-MGVT", "2-MGVT (TC)"
]

x = []
y_mean = []
y_std = []
y2_mean = []
y2_std = []
y3_mean = []
y3_std = []
y4_mean = []
y4_std = []

for file in files:
    results = [r[2] for r in utils.load_object(file)]
    iterations = []
    episodes = []
    n_samples = []
    lear_rew = []
    eval_rew = []
    l_2 = []
    l_inf = []
    for result in results:
        iterations.append(result[0])
        episodes.append(result[1])
        n_samples.append(result[2])
        lear_rew.append(result[3])
        eval_rew.append(result[4])
        l_2.append(result[5])
        l_inf.append(result[6])
Example 14
        "task": gym.make("VecTradingPrices2016-v2"),
        "filepath": "additions/experiments/trading/",
        "filename": "sources",
        "source_index": 1
    },
    "source_2017": {
        "task": gym.make("VecTradingPrices-v3"),
        "filepath": "additions/experiments/trading/",
        "filename": "sources",
        "source_index": 2
    }
}

for k, v in w_dict.items():
    weights = utils.load_object(v["filepath"] + v["filename"])
    if "source_index" in v:
        weights = weights[v["source_index"]][0][1]
    else:
        weights = weights[0][1]
    v["weights"] = weights


def year_pass(Q, task):

    days = []
    rewards = np.zeros((task.n_days, len(task.prices[0])))
    actions = np.zeros((task.n_days, len(task.prices[0])))
    state_value_list = []
Example 15
        task.starting_day_index = di
        s = task.reset()
        s = [s]

        print("Day index:", di)

        days.append(task.selected_day)

        done = False
        while not done:

            a = np.argmax(Q._q_values(s))
            s, r, done, _ = task.step(a)
            s = [s]

            actions[di, task.current_timestep] = a - 1  # [0, 2] -> [-1, 1]
            rewards[di, task.current_timestep] = r

        print("Cumulative reward:", np.sum(rewards))

    return [days, actions, rewards]


for k, v in etrs.items():
    print(k)
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = 0
    task.reset()
    output = year_pass(Q, task)
    utils.save_object(output, "visualize-actions/" + k)
Example 16
eval_freq = int(args.eval_freq)
mean_episodes = int(args.mean_episodes)
alpha = float(args.alpha)
maze = int(args.maze)
l1 = int(args.l1)
l2 = int(args.l2)
n_jobs = int(args.n_jobs)
n_runs = int(args.n_runs)
file_name = str(args.file_name)
render = bool(args.render)
dqn = bool(args.dqn)
eval_episodes = 10
mazes_file = args.mazes_file
# Generate tasks

mazes = utils.load_object(mazes_file)

mdps = [Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2], start_pos=maze[3], walls=maze[4]) \
            for maze in mazes]
if maze == -1:
    shuffle(mdps)
    mdps = [mdps[i] for i in range(min(n_runs,len(mdps)))]
else:
    mdps = [mdps[maze]]

state_dim = mdps[0].state_dim
action_dim = 1
n_actions = mdps[0].action_space.n

# Create Q Function
layers = [l1]
Example 17
            "%Y-%m-%d_%H-%M-%S")

# Seed to get reproducible results
seed = 1
np.random.seed(seed)

como_data = pd.read_csv(path + '/../../lake/data/como_data.csv')
demand = np.loadtxt(path + '/../../lake/data/comoDemand.txt')
min_env_flow = np.loadtxt(path + '/../../lake/data/MEF_como.txt')

temp_lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed)
temp_inflow = list(como_data.loc[como_data['year'] == 1946, 'in'])
temp_mdp = LakeEnv(temp_inflow, demand, temp_lake)

# Load tasks
tasks_data = utils.load_object(tasks_file)

n_eval_episodes = 5

state_dim = temp_mdp.observation_space.shape[0]
action_dim = 1
n_actions = temp_mdp.N_DISCRETE_ACTIONS

# Create BellmanOperator
operator = MellowBellmanOperator(kappa, tau, xi, temp_mdp.gamma, state_dim,
                                 action_dim)
# Create Q Function
layers = [l1]
if l2 > 0:
    layers.append(l2)
Q = MLPQFunction(state_dim, n_actions, layers=layers, activation=activation)
Example 18
alpha_sgd = float(args.alpha_sgd)
lambda_ = float(args.lambda_)
n_weights = int(args.n_weights)
sigma_reg = float(args.sigma_reg)
cholesky_clip = float(args.cholesky_clip)
n_source = int(args.n_source)
source_file = str(args.source_file)
time_coherent = bool(args.time_coherent)
fixed_seed = int(args.fixed_seed)


# Generate tasks

np.random.seed(485)

mazes = utils.load_object(mazes_file)
weights = utils.load_object(source_file)

mdps = [Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2], start_pos=maze[3], walls=maze[4]) \
            for maze in mazes]

envs = list()
sources = list()

if maze == -1:
    for i in range(min(n_runs, len(mdps))):
        envs.append(mdps[i % len(mdps)])
        # Keep as sources only the weights learned on tasks whose walls and goal
        # both differ from the target maze
        sources.append([
            w for w in weights
            if not np.array_equal(w[0][-1], envs[-1].walls)
            and not np.array_equal(w[0][-2], envs[-1].goal)
        ])
else:
    envs = [mdps[maze] for _ in range(n_runs)]
    sources = [
        w for w in weights
        if not np.array_equal(w[0][-1], envs[-1].walls)
        and not np.array_equal(w[0][-2], envs[-1].goal)
    ]