Example #1
def lbph_reward(cube, reward_positive=100, reward_negative=-1):
    if cube.is_solved():
        reward = reward_positive
    else:
        actions_taken = cube.actions_taken
        # penalize regrets (i.e. the agent undoing its previous move and oscillating between two states)
        if len(actions_taken) >= 2 and are_inverse_actions(actions_taken[-2], actions_taken[-1]):
            reward = 10 * reward_negative
        # penalize loops (i.e. the agent returning to the same state via a full loop,
        #                 or repeating the same move three times when a single inverse move would do)
        elif len(actions_taken) >= 3 and len(set(actions_taken[-3:])) == 1:
            reward = 10 * reward_negative
        else:
            state = cube.get_state()
            lbp_code = LBPFeatureTransformer.transform(state, normalize=False)
            hist_lbp = LBPFeatureTransformer.hist_lbp_code(lbp_code)
            coefficients = np.linspace(-1.0, 1.0, len(hist_lbp))
            #coefficients[coefficients > 0.0] = 0.0
            reward = sum(c * h for c, h in zip(coefficients, hist_lbp))
            reward += reward_negative  # small penalty for taking a step
    return reward
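The sign of the histogram term depends on where the mass of hist_lbp sits: np.linspace(-1.0, 1.0, len(hist_lbp)) weights the lower bins negatively and the upper bins positively. A small self-contained sketch of just that term (the 8-bin histograms below are made up purely for illustration):

import numpy as np

coefficients = np.linspace(-1.0, 1.0, 8)
hist_scrambled = [6, 5, 4, 3, 2, 1, 0, 0]      # mass in the low bins -> negative term
hist_nearly_solved = [0, 0, 1, 2, 3, 4, 5, 6]  # mass in the high bins -> positive term
for name, hist in [("scrambled", hist_scrambled), ("nearly solved", hist_nearly_solved)]:
    print "%s: %.3f" % (name, sum(c * h for c, h in zip(coefficients, hist)))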
Example #2
class QubeTabularAgent(object):
    def __init__(self, env):
        self.env = env
        self.models = {}
        self.feature_transformer = LBPFeatureTransformer()
        for a in env.actions_available:
            self.models[a] = dict()

    def predict_from_action(self, s, a):
        X = self.feature_transformer.transform(s, normalize=False)
        X = tuple(X)
        prediction = 0.0
        if X in self.models[a]:
            prediction = self.models[a][X]
        return prediction

    def predict(self, s):
        return np.array(
            [self.predict_from_action(s, a) for a in self.models.keys()])

    def update(self, s, a, G):
        X = self.feature_transformer.transform(s, normalize=False)
        X = tuple(X)
        if X in self.models[a]:
            # accumulate the increment G onto the current estimate
            self.models[a][X] += G
        else:
            # unseen (state, action) pair: initialise with a small random value
            self.models[a][X] = np.random.uniform(0.0, 1.0)

    def sample_action(self, s, eps):
        if np.random.random() < eps:
            return self.env.sample_action()
        else:
            actions = list(self.models.keys())
            G = [self.predict_from_action(s, a) for a in actions]
            # if every estimate is still zero, break ties at random
            if max(G) == 0.0:
                action = np.random.choice(actions)
            else:
                action = actions[np.argmax(G)]
            return action
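Note that update() accumulates G onto the stored value instead of replacing it, so the caller is expected to pass an increment that is already scaled, not a raw return. A hypothetical two-line illustration of how a one-step TD update could feed it (model, alpha, gamma, r, s, a and s2 would come from the surrounding training loop and are not part of the class):

# hypothetical caller-side update; alpha is an assumed learning rate
td_error = r + gamma * np.max(model.predict(s2)) - model.predict_from_action(s, a)
model.update(s, a, alpha * td_error)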
Example #3
    reward_function = 'simple'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
    ce.randomize(1)

    # Create actor-critic models
    n_hidden = 50
    activation_func = 'gaussian'
    t = 10
    gamma = 0.99
    lambda_ = 0.3  # default: 0.7
    N = 5000
    M = 8  # Total movements

    policy_model = PolicyModel(ce, LBPFeatureTransformer(), t=t, lambda_=lambda_, gamma=gamma)
    value_model = ValueModel(ce, LBPFeatureTransformer(), n_hidden=n_hidden, activation_func=activation_func,
                             t=t, lambda_=lambda_, gamma=gamma)


    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    total_rewards = np.empty(N)
    total_iters = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0/(0.001*n+1)
        #eps = 0.7
        prev_max_movements_for_cur_iter, prev_max_movements_for_random = max_movements_for_cur_iter, max_movements_for_random
        max_movements_for_cur_iter, max_movements_for_random = max_movements(n, N, M)
        if prev_max_movements_for_cur_iter != max_movements_for_cur_iter:
            print "Now playing for a max of %i movements..." % max_movements_for_cur_iter
Example #4
 def __init__(self, env):
     self.env = env
     self.models = {}
     self.feature_transformer = LBPFeatureTransformer()
     for a in env.actions_available:
         self.models[a] = dict()
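Assuming this is the same constructor as QubeTabularAgent in Example #2, the environment only needs to expose actions_available (plus sample_action() for the epsilon-greedy branch). A hypothetical stub is enough to instantiate the agent in isolation; the action names below are made up:

import random

class _StubEnv(object):
    # hypothetical stand-in for CubeEnvironment, for isolated testing only
    actions_available = ["F", "F'", "U", "U'"]

    def sample_action(self):
        return random.choice(self.actions_available)

agent = QubeTabularAgent(_StubEnv())
print agent.models  # one empty table per action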
Example #5
if __name__ == "__main__":
    """
    Functional testing.
    """
    # Taking all of the supported actions

    seed = np.random.randint(0, 100)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False)
    print "---o- ---------------------------- -o---"
    print "---o- Taking all supported actions -o---"
    print "---o- ---------------------------- -o---"
    state = ce.cube.get_state()
    print "State: %s" % str(state)
    lbp_code = LBPFeatureTransformer.transform(state)
    print "LBP code: %s" % str(lbp_code)
    lbp_hist = LBPFeatureTransformer.hist_lbp_code(lbp_code)
    print "LBP hist: %s" % str(lbp_hist)
    print "It's solved!" if ce.is_solved() else "Not solved!"
    for a in ce.actions_available:
        print "Taking the following action: %s" % a
        ce.take_action(a)
        state = ce.cube.get_state()
        print "State: %s" % str(state)
        lbp_code = LBPFeatureTransformer.transform(state)
        print "LBP code: %s" % str(lbp_code)
        lbp_hist = LBPFeatureTransformer.hist_lbp_code(lbp_code)
        print "LBP hist: %s" % str(lbp_hist)
        print "It's solved!" if ce.is_solved() else "Not solved!"
        #ce.render(flat=False)#.savefig("test%02d.png" % m, dpi=865 / c.N)
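A natural extension of this functional test, using the are_inverse_actions helper from Example #1 (assuming it is importable here), is to check that each move followed by its inverse restores the previous state. A sketch, assuming the state returned by get_state() is array-like so np.array_equal applies:

    print "---o- --------------------------- -o---"
    print "---o- Checking inverse-action pairs -o---"
    print "---o- --------------------------- -o---"
    for a in ce.actions_available:
        for b in ce.actions_available:
            if are_inverse_actions(a, b):
                before = ce.cube.get_state()
                ce.take_action(a)
                ce.take_action(b)
                after = ce.cube.get_state()
                print "%s then %s restores the state: %s" % (a, b, np.array_equal(before, after))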
Example #6
    sum_movements = sum(movements)
    probabilities = [m / float(sum_movements) for m in movements]
    return int(np.random.choice(movements, p=probabilities))


if __name__ == '__main__':
    reward_function = 'simple'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBoltzmannRegAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed on each iteration)
    # TODO: experience replay, but mostly from the states that led to a positive reward

    gamma = 0.99
    lambda_ = 0.3  # default: 0.7
    N = 5000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
        prev_max_movements_for_cur_iter, prev_max_movements_for_random = max_movements_for_cur_iter, max_movements_for_random
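The loop switches between two exploration schedules: 1.0/np.sqrt(n+1) (commented out) decays quickly, while the active 1.0/(0.001*n+1) keeps eps high for much longer. A standalone sketch to visualize the difference over the N=5000 episodes, using only numpy and matplotlib (the latter already imported in Example #7):

import numpy as np
import matplotlib.pyplot as plt

n = np.arange(5000)
plt.plot(n, 1.0 / np.sqrt(n + 1), label="1/sqrt(n+1)")      # fast decay
plt.plot(n, 1.0 / (0.001 * n + 1), label="1/(0.001*n+1)")   # slow decay, more exploration late in training
plt.xlabel("episode")
plt.ylabel("eps")
plt.legend()
plt.show()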
Example #7
from rl3.environment.base import CubeEnvironment

import matplotlib.pyplot as plt
import numpy as np

if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = 39  # np.random.randint(0, 100)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeRegAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed on each iteration)

    gamma = 0.99
    N = 3000
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(model, ce, eps, gamma, max_iters=100)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print "episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", \
                totalrewards[max(0, n-100):(n+1)].mean()
        # print "Algorithm followed: %s" % ce.actions_taken


    #max_movs = int(2.4270509831248424 ** max_movs_for_random)  # golden ratio * number of faces / number of vertices
    max_movs = min(max_movs, 100)
    return max_movs, max_movs_for_random


if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3,
                         seed=seed,
                         whiteplastic=False,
                         reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBloomPARAgent(ce, LBPFeatureTransformer())
    #model = QubeBloomAgent(ce, LBPFeatureTransformer())

    # TODO: create an attachment mechanism for the env (e.g., to monitor the algorithms followed on each iteration)

    gamma = 0.99
    lambda_ = 0.7  # default: 0.7
    N = 10000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
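Example #7 calls play_one(model, ce, eps, gamma, max_iters=100) without ever showing it. A minimal sketch of such an episode loop for the regression-style agents, under the assumptions that take_action() returns the step reward and that is_solved() marks the terminal state (neither is confirmed by the fragments above):

def play_one(model, env, eps, gamma, max_iters=100):
    # one episode: epsilon-greedy action selection and one-step TD targets
    totalreward = 0.0
    s = env.cube.get_state()
    for _ in range(max_iters):
        a = model.sample_action(s, eps)
        r = env.take_action(a)            # assumed to return the step reward
        s2 = env.cube.get_state()
        G = r + gamma * np.max(model.predict(s2))  # one-step return estimate
        model.update(s, a, G)
        totalreward += r
        s = s2
        if env.is_solved():
            break
    return totalreward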