Example #1
    reward_function = 'simple'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
    ce.randomize(1)

    # Create actor-critic models
    n_hidden = 50
    activation_func = 'gaussian'
    t = 10
    gamma = 0.99
    lambda_ = 0.3  # default: 0.7
    N = 5000
    M = 8  # Total movements

    policy_model = PolicyModel(ce, LBPFeatureTransformer(), t=t, lambda_=lambda_, gamma=gamma)
    value_model = ValueModel(ce, LBPFeatureTransformer(), n_hidden=n_hidden, activation_func=activation_func,
                             t=t, lambda_=lambda_, gamma=gamma)


    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    total_rewards = np.empty(N)
    total_iters = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0/(0.001*n+1)
        #eps = 0.7
        prev_max_movements_for_cur_iter, prev_max_movements_for_random = max_movements_for_cur_iter, max_movements_for_random
        max_movements_for_cur_iter, max_movements_for_random = max_movements(n, N, M)
        if prev_max_movements_for_cur_iter != max_movements_for_cur_iter:
            print "Now playing for a max of %i movements..." % max_movements_for_cur_iter
Example #2
    def __init__(self, env):
        self.env = env
        self.models = {}
        self.feature_transformer = LBPFeatureTransformer()
        # One lookup table per available action
        for a in env.actions_available:
            self.models[a] = dict()
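Only the constructor is shown here; the agent's read/update path is not part of the snippet. A minimal sketch of how a per-action dictionary like self.models is typically used in a tabular agent follows. The method names, the alpha step size, and the feature_transformer.transform call are assumptions, not the repository's API:

    def predict(self, s):
        # Encode the state once (assuming transform returns a flat feature vector),
        # then look up the current estimate for every action
        x = tuple(self.feature_transformer.transform(s))
        return {a: self.models[a].get(x, 0.0) for a in self.env.actions_available}

    def update(self, s, a, G, alpha=0.1):
        # Nudge the stored estimate for (state, action) toward the return G
        x = tuple(self.feature_transformer.transform(s))
        old = self.models[a].get(x, 0.0)
        self.models[a][x] = old + alpha * (G - old)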
Example #3
from rl3.environment.base import CubeEnvironment

import matplotlib.pyplot as plt
import numpy as np

if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = 39  # np.random.randint(0, 100)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeRegAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed in each iteration)

    gamma = 0.99
    N = 3000
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(model, ce, eps, gamma, max_iters=100)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print "episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", \
                totalrewards[max(0, n-100):(n+1)].mean()
        # print "Algorithm followed: %s" % ce.actions_taken
Example #4
    sum_movements = sum(movements)
    probabilities = [m / float(sum_movements) for m in movements]
    return int(np.random.choice(movements, p=probabilities))
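# The helper above draws a movement count at random, weighted by its value:
# for movements = [1, 2, 3, 4] the probabilities become [0.1, 0.2, 0.3, 0.4],
# so a scramble of 4 moves is sampled four times as often as one of 1 move.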


if __name__ == '__main__':
    reward_function = 'simple'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBoltzmannRegAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed in each iteration)
    # TODO: experience replay, but mostly from the states that led to a positive reward

    gamma = 0.99
    lambda_ = 0.3  # default: 0.7
    N = 5000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
        prev_max_movements_for_cur_iter, prev_max_movements_for_random = max_movements_for_cur_iter, max_movements_for_random
    #max_movs = int(2.4270509831248424 ** max_movs_for_random)  # golden ratio * n_faces / n_vertices
    max_movs = min(max_movs, 100)
    return max_movs, max_movs_for_random


if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBloomPARAgent(ce, LBPFeatureTransformer())
    #model = QubeBloomAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed in each iteration)

    gamma = 0.99
    lambda_ = 0.7  # default: 0.7
    N = 10000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
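Several snippets keep alternative exploration schedules commented out (eps = 1.0/np.sqrt(n+1) and a constant 0.7) next to the eps = 1.0/(0.001*n+1) that is actually used. A quick standalone comparison of how fast each decays over the episode counts used above (a sketch, not part of the repository):

import numpy as np

# Compare the two decaying exploration schedules that appear in the snippets
for n in [0, 100, 1000, 5000, 10000]:
    eps_sqrt = 1.0 / np.sqrt(n + 1)    # reaches about 0.01 by n=10000
    eps_slow = 1.0 / (0.001 * n + 1)   # reaches about 0.09 by n=10000, keeping more exploration
    print("n=%5i  1/sqrt(n+1)=%.3f  1/(0.001*n+1)=%.3f" % (n, eps_sqrt, eps_slow))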