def universal_approximation(f, x):
    [train_x, test_x] = split_data(x, ratio=0.75, random=True)
    train_y = f(train_x)
    test_x = np.sort(test_x, axis=0)
    test_y = f(test_x)

    # build simple FNN
    model = Sequential()
    model.add(Dense(50, input_shape=(1, ), activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mse', optimizer='adam')

    # training process
    model.fit(train_x, train_y, batch_size=100, epochs=1000)
    layer = model.get_layer(index=0)

    plt.plot(model.history.history['loss'])
    plt.show()

    # predict
    y_hat = model.predict(test_x)
    plt.plot(test_x, test_y, 'b-', label='original')
    plt.plot(test_x, y_hat, 'r-', label='predicted')
    plt.legend()
    plt.show()
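
# The split_data helper used by these FNN examples is not shown in this listing.
# A minimal sketch of the assumed behaviour (hypothetical implementation):
# optionally shuffle, then split by `ratio`; called with X only it returns
# [train_x, test_x], called with X and Y it returns (train_x, train_y), (test_x, test_y).
import numpy as np

def split_data(X, Y=None, ratio=0.75, random=True):
    n = len(X)
    idx = np.random.permutation(n) if random else np.arange(n)
    cut = int(n * ratio)
    train_idx, test_idx = idx[:cut], idx[cut:]
    if Y is None:
        return [X[train_idx], X[test_idx]]
    return (X[train_idx], Y[train_idx]), (X[test_idx], Y[test_idx])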
def linear_regression(a=1.0, b=0.0):
    X = np.linspace(-100, 100, 200)
    X = X.reshape((-1, 1))
    [train_x, test_x] = split_data(X, ratio=0.8, random=True)
    train_y = a * train_x + b
    test_y = a * test_x + b

    i = Input(1)
    x = Dense(1)(i)

    # define trainer
    trainer = Trainer(loss='mse',
                      optimizer=Adam(learning_rate=0.2),
                      batch_size=50,
                      epochs=50)

    # create model
    model = Sequential(i, x, trainer)

    model.summary()

    # training process
    model.fit(train_x, train_y)

    # predict
    y_hat = model.predict(test_x)
    plt.plot(test_x, test_y, 'b')
    plt.plot(test_x, y_hat, 'r')
    plt.show()
def linear_classification(a=1.0, b=0.0):
    x = np.linspace(-100, 100, 200)
    y = a * x + b
    X = np.array(list(zip(x, y))) + np.random.randn(200, 2) * 100
    Y = np.where(a * X[:, 0] + b > X[:, 1], 1, 0)
    (train_x, train_y), (test_x, test_y) = split_data(X,
                                                      Y,
                                                      ratio=0.8,
                                                      random=True)
    train_y = to_one_hot(train_y)
    test_y = np.where(a * test_x[:, 0] + b > test_x[:, 1], 1, 0)

    # build simple FNN
    i = Input(2)
    x = Dense(2, activation='softmax')(i)

    # create model
    model = Model(i, x)
    model.compile(optimizer=Adam(learning_rate=0.1),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # training process
    model.fit(train_x, train_y, batch_size=50, epochs=50)

    # predict
    y_hat = model.predict(test_x)
    y_hat = np.argmax(y_hat, axis=1)
    simple_plot(test_x, y_hat, a, b)
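
# to_one_hot and simple_plot are helpers not shown in this listing. Minimal
# sketches of what they are assumed to do (hypothetical implementations):
# one-hot encode integer labels, and plot the points coloured by predicted
# class together with the true decision boundary y = a * x + b.
import numpy as np
import matplotlib.pyplot as plt

def to_one_hot(y, num_classes=None):
    y = np.asarray(y, dtype=int).ravel()
    num_classes = num_classes or int(y.max()) + 1
    return np.eye(num_classes)[y]

def simple_plot(X, y_hat, a, b):
    plt.scatter(X[:, 0], X[:, 1], c=y_hat, s=50, marker='o')
    xs = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
    plt.plot(xs, a * xs + b, 'k--')
    plt.show()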
def multi_classification(csv_file_path):
    """assuming the csv file has columns: x, y, class"""
    df = pd.read_csv(csv_file_path)
    X = df[['x', 'y']].to_numpy()
    Y = df['class'].to_numpy().reshape((-1, 1))

    plt.scatter(X[:, 0], X[:, 1], c=Y, s=100, marker='o')
    plt.show()

    (train_x, train_y), (test_x, test_y) = split_data(X,
                                                      Y,
                                                      ratio=0.75,
                                                      random=True)
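
    # The listing truncates here; a plausible continuation in the same Keras-style
    # functional API as linear_classification above (assumed, not in the source):
    train_y = to_one_hot(train_y)
    i = Input(2)
    x = Dense(train_y.shape[1], activation='softmax')(i)
    model = Model(i, x)
    model.compile(optimizer=Adam(learning_rate=0.1),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_x, train_y, batch_size=50, epochs=50)
    y_hat = np.argmax(model.predict(test_x), axis=1)
    plt.scatter(test_x[:, 0], test_x[:, 1], c=y_hat, s=100, marker='o')
    plt.show()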
Example #5
    def _bellman_residual_surrogate(self, Q, samples, weights=None):
        _, _, _, r, s_prime, absorbing, sa = utils.split_data(samples, self._state_dim, self._action_dim)
        if weights is None:
            amax = torch.argmax(Q.value_actions(s_prime, absorbing, grad_required=True), dim=1)
            amax = amax.detach().numpy()    # ensure it is not used in the derivative
            maxQ = self._q_target.value(np.concatenate((s_prime, amax[:,np.newaxis]), axis=1), grad_required=True).detach()
            r = torch.from_numpy(r)
            absorbing = torch.from_numpy(absorbing)
            qval = Q.value(sa, grad_required=True)
        else:
            qprime = Q.value_actions_weights(s_prime, weights=weights, done=absorbing, grad_required=True).detach()
            amax = torch.argmax(qprime, dim=1).long()  # best actions
            maxQ = self._q_target.value_actions_weights(s_prime, weights=weights, done=absorbing, grad_required=True)

            state = np.repeat(np.arange(s_prime.shape[0], dtype="int64"), weights.shape[0])
            amax = amax.view(-1)        # flattens the tensor
            maxQ = maxQ[state, amax].view(s_prime.shape[0], weights.shape[0]).detach()
            r = torch.from_numpy(r).unsqueeze(1)
            absorbing = torch.from_numpy(absorbing).unsqueeze(1)
            qval = Q.value_weights(sa, grad_required=True)
        return smooth_l1_loss(qval, r + self._gamma * maxQ * (1-absorbing), reduce=False)
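
# utils.split_data is assumed (not shown here) to slice a batch of flat transition
# rows back into their components. From how samples are packed later in this listing
# (t, s, a, r, s_prime, absorbing concatenated along axis=1, with sa = [s, a]),
# a minimal sketch could look like this (hypothetical, column layout assumed):
import numpy as np

def split_data(samples, state_dim, action_dim):
    t = samples[:, 0]
    s = samples[:, 1:1 + state_dim]
    a = samples[:, 1 + state_dim:1 + state_dim + action_dim]
    r = samples[:, 1 + state_dim + action_dim]
    s_prime = samples[:, 2 + state_dim + action_dim:2 + 2 * state_dim + action_dim]
    absorbing = samples[:, -1]
    sa = np.concatenate((s, a), axis=1)
    return t, s, a, r, s_prime, absorbing, sa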
Example #6
def linear_classification(a=1.0, b=0.0, graph=False):

    # prepare data
    x = np.linspace(-100, 100, 200)
    y = a * x + b
    X = np.array(list(zip(x, y))) + np.random.randn(200, 2) * 100
    Y = to_one_hot(np.where(a * X[:, 0] + b > X[:, 1], 1, 0))
    (train_x, train_y), (test_x, test_y) = split_data(X,
                                                      Y,
                                                      ratio=0.8,
                                                      random=True)

    # build simple FNN
    i = Input(2)
    x = Dense(2, activation='softmax')(i)

    # define trainer
    trainer = Trainer(loss='cross_entropy',
                      optimizer=Adam(learning_rate=0.05),
                      batch_size=50,
                      epochs=50,
                      metrics=['accuracy'])

    # create model
    model = Sequential(i, x, trainer)

    model.summary()

    # training process
    model.fit(train_x, train_y)
    print(model.evaluate(test_x, test_y))

    if graph:
        plt.plot(model.history['loss'])
        plt.show()

        # predict
        y_hat = model.predict(test_x)
        y_hat = np.argmax(y_hat, axis=1)
        simple_plot(test_x, y_hat, a, b)
Example #7
def universal_approximation(f, x):
    [train_x, test_x] = split_data(x, ratio=0.8, random=True)
    train_y = f(train_x)

    test_x = np.sort(test_x, axis=0)
    test_y = f(test_x)

    # build simple FNN
    i = Input(1)
    x = Dense(50, activation='relu')(i)
    x = Dense(1)(x)

    # define trainer
    schedule = ExponentialDecay(initial_learning_rate=0.01, decay_rate=0.75)
    trainer = Trainer(loss='mse',
                      optimizer=Adam(learning_rate=schedule),
                      batch_size=50,
                      epochs=750)

    # create model
    model = Sequential(i, x, trainer)

    model.summary()

    # training process
    start = time.time()
    model.fit(train_x, train_y)
    print(time.time() - start)

    plt.plot(range(len(model.history['loss'])), model.history['loss'])
    plt.show()

    # predict
    y_hat = model.predict(test_x)
    plt.plot(test_x, test_y, 'b-', label='original')
    plt.plot(test_x, y_hat, 'r-', label='predicted')
    plt.legend()
    plt.show()
Example #8
 def _bellman_residual_surrogate(self, Q, samples, weights=None):
     _, _, _, r, s_prime, absorbing, sa = utils.split_data(
         samples, self._state_dim, self._action_dim)
     if weights is None:
         Qs_prime = Q.value_actions(s_prime, absorbing, grad_required=True)
         mmQs = mellow_max(Qs_prime, self._kappa, axis=1)
         r = torch.from_numpy(r)
         absorbing = torch.from_numpy(absorbing)
         qval = Q.value(sa, grad_required=True)
     else:
         Qs_prime = Q.value_actions_weights(s_prime,
                                            weights=weights,
                                            done=absorbing,
                                            grad_required=True)
         mmQs = mellow_max(Qs_prime, self._kappa, axis=1)
         r = torch.from_numpy(r).unsqueeze(1)
         absorbing = torch.from_numpy(absorbing).unsqueeze(1)
         qval = Q.value_weights(sa, grad_required=True)
     mean_weight = (r + self._gamma * mmQs * (1 - absorbing) -
                    qval).detach()
     return 2 * mean_weight * (
         r + self._xi * self._gamma * mmQs - qval
     )  # TODO: does the (1 - done) factor go into the derivative?
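
# mellow_max is not defined in this listing; it is assumed to be the mellow-max
# operator mm_kappa(q) = log(mean(exp(kappa * q))) / kappa, reduced over the action
# axis. A minimal PyTorch sketch using logsumexp for numerical stability
# (hypothetical implementation):
import math
import torch

def mellow_max(q, kappa, axis=1):
    n = q.shape[axis]
    return (torch.logsumexp(kappa * q, dim=axis) - math.log(n)) / kappa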
def linear_regression(a=1.0, b=0.0):
    X = np.linspace(-100, 100, 200)
    X = X.reshape((-1, 1))
    [train_x, test_x] = split_data(X, ratio=0.8, random=True)
    train_y = a * train_x + b
    test_y = a * test_x + b

    # build simple FNN
    i = Input(1)
    x = Dense(1)(i)

    # create model
    model = Model(i, x)

    # training process
    model.compile(optimizer=Adam(learning_rate=0.1), loss='mse')
    model.fit(train_x, train_y, batch_size=50, epochs=50)

    # predict
    y_hat = model.predict(test_x)
    plt.plot(test_x, test_y, 'b')
    plt.plot(test_x, y_hat, 'r')
    plt.show()
Example #10
def learn(Q,
          operator,
          data,
          demand,
          min_env_flow,
          actions_report_file="",
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):

    leap_year_demand = np.insert(demand, 60, demand[59])

    if seed is not None:
        np.random.seed(seed)

    # mdp creation
    lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed)
    years = data.year.unique()
    description = str(int(years[0])) + "-" + str(int(years[-1]))
    sampled_year = np.random.choice(years)
    inflow = list(data.loc[data['year'] == sampled_year, 'in'])
    if sampled_year % 4 == 0:  # leap years between 1946 and 2011 satisfy this check, even though it is not the full leap-year rule
        mdp = LakeEnv(inflow, leap_year_demand, lake)
    else:
        mdp = LakeEnv(inflow, demand, lake)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledGibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), schedule)
    pi_u = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=0)
    pi_g = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=np.inf)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.observation_space.shape[0], mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.observation_space.shape[0])).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    if actions_report_file:
        actions_executed = []

        columns = list(range(mdp.N_DISCRETE_ACTIONS))
        actions_report_df = pd.DataFrame(columns=columns)
        actions_report_df.to_csv(actions_report_file, index=False)

    done_counter = 0

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        if actions_report_file:
            actions_executed.append(a)

        # Step
        s_prime, r, done, _ = mdp.step(a)

        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime

        h += 1
        if done or h >= mdp.horizon:

            if actions_report_file:
                actions_counts = np.bincount(actions_executed,
                                             minlength=len(columns))
                actions_freqs = list(actions_counts / sum(actions_counts))
                new_row = dict(zip(columns, actions_freqs))
                actions_report_df = pd.concat(
                    [actions_report_df, pd.DataFrame([new_row])],
                    ignore_index=True)
                actions_report_df.to_csv(actions_report_file, index=False)

                actions_executed = []

            episode_rewards.append(0.0)

            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])
            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)

            s = mdp.reset()

            h = 0
            episode_t.append(i)

            done_counter += 1

        # Evaluate model
        if done_counter == eval_freq:

            # Evaluate greedy policy
            scores = []
            for _ in range(eval_episodes):
                sampled_year = np.random.choice(years)
                inflow = list(data.loc[data['year'] == sampled_year, 'in'])
                if sampled_year % 4 == 0:
                    mdp = LakeEnv(inflow, leap_year_demand, lake)
                else:
                    mdp = LakeEnv(inflow, demand, lake)

                scores.append(_single_year_eval(mdp, pi_g))

            rew = np.mean(scores)

            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])

            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)

            s = mdp.reset()

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))

            done_counter = 0

        if (i * 100 / max_iter) % 10 == 0:
            print("years:", description, "- Progress:",
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    last_rewards = 5
    print("years:", description, "- Last evaluation rewards:",
          np.around(evaluation_rewards[-last_rewards:], decimals=3))

    return [[], weights, run_info]
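
# utils.adam is assumed (not shown) to perform a single Adam update and return the
# updated weights together with the step counter and running moments; `alpha` may be
# a scalar or a per-parameter array of step sizes (see the adam_mask used in the
# later examples). A minimal sketch of standard Adam with bias correction
# (hypothetical implementation):
import numpy as np

def adam(w, g, t, m_t, v_t, alpha=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8):
    t = t + 1
    m_t = beta_1 * m_t + (1 - beta_1) * g
    v_t = beta_2 * v_t + (1 - beta_2) * g ** 2
    m_hat = m_t / (1 - beta_1 ** t)
    v_hat = v_t / (1 - beta_2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, t, m_t, v_t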
Example #11
def learn(
        mdp,
        Q,
        operator,
        max_iter=5000,
        buffer_size=10000,
        batch_size=50,
        alpha_adam=0.001,
        alpha_sgd=0.1,
        lambda_=0.001,
        n_weights=10,
        train_freq=1,
        eval_freq=50,
        random_episodes=0,
        eval_states=None,
        eval_episodes=1,
        mean_episodes=50,
        preprocess=lambda x: x,
        cholesky_clip=0.0001,
        bandwidth=0.00001,
        post_components=1,
        max_iter_ukl=60,
        eps=0.001,
        eta=1e-6,
        time_coherent=False,
        source_file=None,
        seed=None,
        render=False,
        verbose=True,
        ukl_tight_freq=1,
        sources=None,
        # Lambda function to calculate the weights
        weights_calculator=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The gaussian mixture weights are uniform if not provided.
    c_bar = (np.ones(timesteps) / timesteps if weights_calculator is None
             else weights_calculator(ws))

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis],
                            (timesteps, 1, 1))

    # Initialize the posterior parameters to the best approximation of the prior within the posterior family
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)

    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c,
                         mu,
                         Sigma,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         phi,
                         psi,
                         max_iter=max_iter_ukl,
                         eps=eps)
    params, phi, psi = init_posterior(c,
                                      mu,
                                      Sigma,
                                      c_bar,
                                      mu_bar,
                                      Sigma_bar,
                                      phi,
                                      psi,
                                      C,
                                      K,
                                      cholesky_clip,
                                      max_iter_ukl,
                                      max_iter=max_iter_ukl * 10,
                                      precision=Sigma_bar_inv,
                                      eta=eta,
                                      eps=eps,
                                      verbose=verbose)

    # Add random episodes if needed
    init_samples = list()
    if random_episodes > 0:
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C),
                     np.ones((C, K)) * alpha_adam, np.zeros(
                         (C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)),
                    np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size),
                         params,
                         Q,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         operator,
                         i + 1,
                         phi,
                         psi,
                         n_weights,
                         lambda_,
                         max_iter_ukl,
                         C,
                         K,
                         precision=Sigma_bar_inv,
                         t_step=i,
                         ukl_tight_freq=ukl_tight_freq)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            #Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp,
                                             pi_g,
                                             render=render,
                                             initial_states=eval_states,
                                             n_episodes=eval_episodes,
                                             preprocess=preprocess)[0]
            rew /= C

            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size),
                             params,
                             Q,
                             c_bar,
                             mu_bar,
                             Sigma_bar,
                             operator,
                             i + 1,
                             phi,
                             psi,
                             n_weights,
                             lambda_,
                             C,
                             K,
                             precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " +
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]
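
# utils.sgd is assumed (not shown) to take a single plain gradient step; with the
# sgd_mask above as `alpha`, only the Cholesky-factor block of `params` receives a
# nonzero step size. A one-line sketch (hypothetical implementation):
def sgd(params, g, alpha=0.1):
    return params - alpha * g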
Example #12
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):
    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        # Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledEpsilonGreedy(Q, np.arange(mdp.action_space.n), schedule)
    pi_u = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=1)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Evaluate greedy policy
            rew = utils.evaluate_policy(mdp,
                                        pi_g,
                                        render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Make sure we restart from s
            mdp.reset(s)

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))
        # if np.mean(episode_rewards[-mean_episodes - 1:-1]) > -80:
        #     render=True

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    return [mdp.get_info(), weights, run_info]
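
# EpsilonGreedy and ScheduledEpsilonGreedy are assumed (not shown) to wrap Q in an
# exploration policy: with probability epsilon take a random action, otherwise the
# greedy action under Q; the scheduled variant reads epsilon from a per-step
# schedule. Minimal sketches (hypothetical implementations):
import numpy as np

class EpsilonGreedy:
    def __init__(self, Q, actions, epsilon=0.1):
        self._Q, self._actions, self._epsilon = Q, actions, epsilon

    def sample_action(self, s):
        if np.random.rand() < self._epsilon:
            return np.random.choice(self._actions)
        return self._actions[np.argmax(self._Q.value_actions(s))]

class ScheduledEpsilonGreedy(EpsilonGreedy):
    def __init__(self, Q, actions, schedule):
        super().__init__(Q, actions, epsilon=schedule[0] if len(schedule) else 0.0)
        self._schedule, self._step = schedule, 0

    def sample_action(self, s):
        if self._step < len(self._schedule):
            self._epsilon = self._schedule[self._step]
        self._step += 1
        return super().sample_action(s)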
Example #13
 def bellman_residual(self, Q, samples, weights=None):
     """General function for computing Bellman residuals"""
     _, _, _, r, s_prime, absorbing, sa = utils.split_data(
         samples, self._state_dim, self._action_dim)
     return self._bellman_residual_single(Q, r, s_prime, absorbing, sa) if weights is None else \
         self._bellman_residual_multi(Q, r, s_prime, absorbing, sa, weights)
Example #14
 def gradient_be(self, Q, samples, weights=None):
     """General function for gradients of the Bellman error"""
     _, _, _, _, _, _, sa = utils.split_data(samples, self._state_dim, self._action_dim)
     return self._gradient_be_single(Q, samples, sa) if weights is None else \
         self._gradient_be_multi(Q, samples, sa, weights)
Example #15
 def gradient_mm(self, Q, samples, weights=None):
     """General function for computing mellow-max gradients"""
     _, _, _, _, s_prime, absorbing, _ = utils.split_data(samples, self._state_dim, self._action_dim)
     return self._gradient_mm_single(Q, s_prime, absorbing) if weights is None else \
         self._gradient_mm_multi(Q, s_prime, absorbing, weights)
Example #16
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior with smaller regularization (just enough to make sure Sigma_bar is positive definite)
    params = clip(
        pack(mu_bar,
             np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip**2)),
        cholesky_clip, K)

    # Add random episodes if needed
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            Q._w = sample_posterior(params, K)
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam, np.zeros(
        (K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K),
                    np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for Variance
    v_t_var = 0.

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar,
                         Sigma_bar_inv, operator, i + 1, lambda_, n_weights)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params,v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            #Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp,
                                        pi_g,
                                        render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = (np.mean(episode_rewards[-mean_episodes - 1:-1])
                            if len(episode_rewards) > 1 else 0.0)
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             mu_bar, Sigma_bar_inv, operator, i + 1, lambda_,
                             n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]
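
# pack, unpack and sample_posterior for the single-Gaussian posterior are assumed
# (not shown): params is taken to be a flat vector holding the mean mu (K entries)
# followed by the K x K Cholesky factor L, and sampling draws w = mu + L @ eps with
# eps ~ N(0, I). Minimal sketches under that packing assumption (hypothetical):
import numpy as np

def pack(mu, L):
    return np.concatenate((np.ravel(mu), np.ravel(L)))

def unpack(params, K):
    return params[:K], params[K:].reshape(K, K)

def sample_posterior(params, K):
    mu, L = unpack(params, K)
    return mu + L.dot(np.random.randn(K))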