def lbph_reward(cube, reward_positive=100, reward_negative=-1):
    if cube.is_solved():
        reward = reward_positive
    else:
        actions_taken = cube.actions_taken
        # Punish regrets (i.e. the agent moving back and forth between two states)
        if len(actions_taken) >= 2 and are_inverse_actions(actions_taken[-2], actions_taken[-1]):
            reward = 10 * reward_negative
        # Punish loops (i.e. the agent doing a complete loop back to the same state,
        # or making three equal steps that amount to a single inverse step)
        elif len(actions_taken) >= 3 and len(set(actions_taken[-3:])) == 1:
            reward = 10 * reward_negative
        else:
            state = cube.get_state()
            lbp_code = LBPFeatureTransformer.transform(state, normalize=False)
            hist_lbp = LBPFeatureTransformer.hist_lbp_code(lbp_code)
            coefficients = np.linspace(-1.0, 1.0, len(hist_lbp))
            #coefficients[coefficients > 0.0] = 0.0
            reward = sum([c * h for (c, h) in zip(coefficients, hist_lbp)])
            reward += reward_negative  # penalty for taking a step
    return reward
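# are_inverse_actions is used above but not defined in this excerpt. A minimal
# sketch, assuming actions are face-turn strings like "U" and "U'" (the real
# action encoding may differ):
def are_inverse_actions(a, b):
    # Two actions are inverses when one is the primed version of the other.
    return a == b + "'" or b == a + "'"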
class QubeTabularAgent(object):

    def __init__(self, env):
        self.env = env
        self.models = {}
        self.feature_transformer = LBPFeatureTransformer()
        for a in env.actions_available:
            self.models[a] = dict()

    def predict_from_action(self, s, a):
        X = self.feature_transformer.transform(s, normalize=False)
        X = tuple(X)
        prediction = 0.0
        if X in self.models[a]:
            prediction = self.models[a][X]
        return prediction

    def predict(self, s):
        return np.array(
            [self.predict_from_action(s, a) for a in self.models.keys()])

    def update(self, s, a, G):
        X = self.feature_transformer.transform(s, normalize=False)
        X = tuple(X)
        if X in self.models[a]:
            self.models[a][X] += G
        else:
            # First visit to this (state, action): random optimistic init.
            self.models[a][X] = np.random.uniform(0.0, 1.0)

    def sample_action(self, s, eps):
        # Epsilon-greedy: explore with probability eps, otherwise act greedily.
        if np.random.random() < eps:
            return self.env.sample_action()
        else:
            actions = list(self.models.keys())
            G = [self.predict_from_action(s, a) for a in actions]
            if max(G) == 0.0:
                action = np.random.choice(actions)
            else:
                action = actions[np.argmax(G)]
            return action
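# A minimal sketch of how this tabular agent could be driven for one episode,
# assuming the environment API used elsewhere in this repo (cube.get_state(),
# take_action(), is_solved()) and that take_action returns the step reward,
# which is an assumption here:
def play_one_tabular(agent, env, eps, gamma, max_iters=100):
    total_reward = 0.0
    s = env.cube.get_state()
    for _ in range(max_iters):
        a = agent.sample_action(s, eps)
        r = env.take_action(a)  # assumed to return the step reward
        s2 = env.cube.get_state()
        # One-step Q-learning target used as the update signal.
        G = r + gamma * np.max(agent.predict(s2))
        agent.update(s, a, G)
        total_reward += r
        s = s2
        if env.is_solved():
            break
    return total_reward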
reward_function = 'simple'  # simple or lbph
seed = random.randint(0, 1000)
print "Using seed=%i" % seed
ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
ce.randomize(1)

# Create actor-critic models
n_hidden = 50
activation_func = 'gaussian'
t = 10
gamma = 0.99
lambda_ = 0.3  # default: 0.7
N = 5000
M = 8  # Total movements
policy_model = PolicyModel(ce, LBPFeatureTransformer(), t=t, lambda_=lambda_, gamma=gamma)
value_model = ValueModel(ce, LBPFeatureTransformer(), n_hidden=n_hidden,
                         activation_func=activation_func, t=t, lambda_=lambda_, gamma=gamma)

max_movements_for_cur_iter, max_movements_for_random = 0, 0
total_rewards = np.empty(N)
total_iters = np.empty(N)
for n in range(N):
    #eps = 1.0/np.sqrt(n+1)
    eps = 1.0 / (0.001 * n + 1)
    #eps = 0.7
    prev_max_movements_for_cur_iter, prev_max_movements_for_random = \
        max_movements_for_cur_iter, max_movements_for_random
    max_movements_for_cur_iter, max_movements_for_random = max_movements(n, N, M)
    if prev_max_movements_for_cur_iter != max_movements_for_cur_iter:
        print "Now playing for a max of %i movements..." % max_movements_for_cur_iter
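# max_movements(n, N, M) is called above but only its tail appears in a later
# excerpt (it caps the result at 100 moves). A plausible sketch of the
# curriculum it implements -- the exact schedule is an assumption:
def max_movements(n, N, M):
    # Scramble depth grows linearly from 1 to M as training progresses.
    max_movs_for_random = 1 + int(float(n) / N * (M - 1))
    # Give the agent a few extra moves per scramble move, capped at 100.
    max_movs = min(4 * max_movs_for_random, 100)
    return max_movs, max_movs_for_random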
if __name__ == "__main__":
    """
    Functional testing.
    """
    # Taking all of the supported actions
    seed = np.random.randint(0, 100)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False)
    print "---o- ---------------------------- -o---"
    print "---o- Taking all supported actions -o---"
    print "---o- ---------------------------- -o---"
    state = ce.cube.get_state()
    print "State: %s" % str(state)
    lbp_code = LBPFeatureTransformer.transform(state)
    print "LBP code: %s" % str(lbp_code)
    lbp_hist = LBPFeatureTransformer.hist_lbp_code(lbp_code)
    print "LBP hist: %s" % str(lbp_hist)
    print "It's solved!" if ce.is_solved() else "Not solved!"
    for a in actions_available:
        print "Taking the following action: %s" % a
        ce.take_action(a)
        state = ce.cube.get_state()
        print "State: %s" % str(state)
        lbp_code = LBPFeatureTransformer.transform(state)
        print "LBP code: %s" % str(lbp_code)
        lbp_hist = LBPFeatureTransformer.hist_lbp_code(lbp_code)
        print "LBP hist: %s" % str(lbp_hist)
        print "It's solved!" if ce.is_solved() else "Not solved!"
        #ce.render(flat=False)#.savefig("test%02d.png" % m, dpi=865 / c.N)
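# Tying this test output back to lbph_reward above: the printed LBP histogram
# is exactly what gets weighted by np.linspace(-1.0, 1.0, len(hist_lbp)). A
# worked micro-example with made-up histogram values:
hist_lbp = [4, 2, 0, 1]
coefficients = np.linspace(-1.0, 1.0, len(hist_lbp))  # [-1.0, -1/3, 1/3, 1.0]
shaped = sum(c * h for (c, h) in zip(coefficients, hist_lbp))  # -4 - 2/3 + 0 + 1 = -3.667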
    sum_movements = sum(movements)
    probabilities = [m / float(sum_movements) for m in movements]
    return int(np.random.choice(movements, p=probabilities))


if __name__ == '__main__':
    reward_function = 'simple'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBoltzmannRegAgent(ce, LBPFeatureTransformer())
    # TODO: create an attachments mechanism for the env (e.g. to monitor the algorithms followed in each iter)
    # TODO: experience replay, but mostly from the states that led to a positive reward
    gamma = 0.99
    lambda_ = 0.3  # default: 0.7
    N = 5000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
        prev_max_movements_for_cur_iter, prev_max_movements_for_random = \
            max_movements_for_cur_iter, max_movements_for_random
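# The function whose tail opens this excerpt samples a scramble length with
# probability proportional to its value, biasing episodes toward longer
# scrambles. A quick worked example with a made-up candidate list:
movements = [1, 2, 3, 4]
probabilities = [m / float(sum(movements)) for m in movements]
# -> [0.1, 0.2, 0.3, 0.4]: length 4 is drawn four times as often as length 1.
sampled = int(np.random.choice(movements, p=probabilities))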
from rl3.environment.base import CubeEnvironment

import matplotlib.pyplot as plt
import numpy as np


if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = 39  # np.random.randint(0, 100)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeRegAgent(ce, LBPFeatureTransformer())
    # TODO: create an attachments mechanism for the env (e.g. to monitor the algorithms followed in each iter)
    gamma = 0.99
    N = 3000
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(model, ce, eps, gamma, max_iters=100)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print "episode:", n, "total reward:", totalreward, "eps:", eps, \
                "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean()
            # print "Algorithm followed: %s" % ce.actions_taken
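# matplotlib is imported above but no plot appears in this excerpt. A small
# sketch of the usual running-average reward curve (an addition, not part of
# the original script):
def plot_running_avg(totalrewards):
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = totalrewards[max(0, t - 100):(t + 1)].mean()
    plt.plot(running_avg)
    plt.title("Running Average Reward (window=100)")
    plt.show()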
    #max_movs = int(2.4270509831248424 ** max_movs_for_random)  # golden ratio * n_faces / n_vertices
    max_movs = min(max_movs, 100)
    return max_movs, max_movs_for_random


if __name__ == '__main__':
    reward_function = 'lbph'  # simple or lbph
    seed = random.randint(0, 1000)
    print "Using seed=%i" % seed
    ce = CubeEnvironment(n=3, seed=seed, whiteplastic=False, reward_function=reward_function)
    ce.randomize(1)
    #model = QubeTabularAgent(ce)
    model = QubeBloomPARAgent(ce, LBPFeatureTransformer())
    #model = QubeBloomAgent(ce, LBPFeatureTransformer())
    # TODO: create an attachments mechanism for the env (e.g. to monitor the algorithms followed in each iter)
    gamma = 0.99
    lambda_ = 0.7  # default: 0.7
    N = 10000
    M = 10  # Total movements
    max_movements_for_cur_iter, max_movements_for_random = 0, 0
    totalrewards = np.empty(N)
    for n in range(N):
        #eps = 1.0/np.sqrt(n+1)
        eps = 1.0 / (0.001 * n + 1)
        #eps = 0.7
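# The two epsilon schedules that recur in these scripts decay very
# differently. A tiny comparison at a few hypothetical episode indices, just
# to illustrate the trade-off:
for n in [0, 100, 1000, 10000]:
    print "n=%5i  1/(0.001n+1)=%.3f  1/sqrt(n+1)=%.3f" % (
        n, 1.0 / (0.001 * n + 1), 1.0 / np.sqrt(n + 1))
# The hyperbolic schedule keeps exploration high far longer than 1/sqrt(n+1),
# which matches the long curricula these scripts run.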