def decompose(self):
    """Return a list of partial decompositions using parameters given in the constructor."""
    # XXX SIGALRM works only on some platforms
    if self.time_limit:
        signal.signal(signal.SIGALRM, self._on_timeout)
        signal.alarm(self.time_limit)
    try:
        self.do_eliminations()
    except Exception:
        log.debug("Decomposing aborted due to timeout")
    if self.time_limit:
        signal.alarm(0)

    for td in self.td_roots:
        td.remove_subset_children()
    self.td_roots = [td.move_superset_children() for td in self.td_roots]
    self.connect_roots(self.graph.vertices)

    for i, td in enumerate(self.td_roots):
        self.normalize(td)
        if self.minimize_roots:
            intersection_with_remainder = td.node & self.remainder()
            if len(intersection_with_remainder) < len(td.node):
                new_root = TD(intersection_with_remainder)
                new_root.add_child(td)
                self.td_roots[i] = new_root
    # TD.canonize_root()?
    # TD.sort()?
    return self.td_roots
def eliminate(self, vertex):
    new_bag = frozenset(self.graph.neighborhood(vertex))
    new_subtree = TD(new_bag)

    # Eliminate vertex and connect neighbors
    for (x, y) in [(x, y)
                   for x in self.graph.neighbors[vertex]
                   for y in self.graph.neighbors[vertex]
                   if x < y]:
        self.graph.add_edge(x, y)
    self.graph.remove_vertex(vertex)

    # Add children to this new subtree
    for subtree in self.bags_containing[vertex]:
        if not subtree.parent:
            new_subtree.add_child(subtree)
            self.td_roots.remove(subtree)

    # The new subtree has no parent yet
    self.td_roots.append(new_subtree)

    # For each bag element, remember it's contained in this new node
    for x in new_bag:
        self.bags_containing[x].append(new_subtree)
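Methods like eliminate are normally driven by an elimination ordering chosen with a heuristic such as min-degree. The sketch below is a minimal, self-contained illustration of that heuristic on a plain adjacency-dict graph; it does not use the TD or graph classes above, and the function name min_degree_order is made up for illustration.

from itertools import combinations

def min_degree_order(adj):
    """Greedy min-degree elimination ordering on {vertex: set_of_neighbors}.
    Returns (order, width), where width is an upper bound on the treewidth
    (largest bag minus one)."""
    adj = {v: set(ns) for v, ns in adj.items()}  # work on a copy
    order, width = [], 0
    while adj:
        v = min(adj, key=lambda u: len(adj[u]))  # vertex with fewest remaining neighbors
        nbrs = adj[v]
        width = max(width, len(nbrs))            # the bag would be {v} | nbrs
        for x, y in combinations(nbrs, 2):       # add fill-in edges between neighbors
            adj[x].add(y)
            adj[y].add(x)
        for u in nbrs:                           # remove v from the remaining graph
            adj[u].discard(v)
        del adj[v]
        order.append(v)
    return order, width

# Example: a 4-cycle has treewidth 2.
print(min_degree_order({1: {2, 4}, 2: {1, 3}, 3: {2, 4}, 4: {1, 3}}))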
def estimate_values_by_td(env, policy, features, true_values, lambda_, alpha):
    r"""Computes value estimates using the TD(\lambda) algorithm.

    Args:
        env (ChainWorld): The RL environment.
        policy (Policy): The policy that needs to be evaluated.
        features (StateFeatures): Feature function for states of the env.
        true_values (numpy.ndarray): The true state-values for the given policy.
        lambda_ (float): \lambda parameter of TD(\lambda).
        alpha (float): Learning rate.

    Returns:
        list, list: Final value estimates of all states and MSVE errors of
            the algorithm's estimates over the course of its execution.
    """
    td = TD(policy=policy, features=features, gamma=0.95, alpha=alpha,
            lambda_=lambda_, true_values=true_values)
    td.run(num_episodes)  # num_episodes is a module-level constant (not shown here)

    v = []
    for state in env.state_iterator():
        feature_vector = features.vector(state)
        v.append(utils.state_value(feature_vector, td.theta))
    return v, td.msve
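The TD class used here is not shown; for reference, the core of a linear TD(lambda) evaluator is the trace-decayed update sketched below. This is a minimal, self-contained sketch with accumulating eligibility traces; the argument names (phi, phi_next, done) are assumptions, not the codebase's interface.

import numpy as np

def td_lambda_update(theta, z, phi, reward, phi_next, done,
                     gamma=0.95, lambda_=0.9, alpha=0.1):
    """One TD(lambda) step for linear values v(s) = theta @ phi(s),
    with accumulating eligibility traces z."""
    v = theta @ phi
    v_next = 0.0 if done else theta @ phi_next
    delta = reward + gamma * v_next - v   # TD error
    z = gamma * lambda_ * z + phi         # decay traces, add current feature gradient
    theta = theta + alpha * delta * z     # credit all recently visited features
    return theta, z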
import copy
import unittest
# TD is the tree-decomposition class under test (project import not shown)


class TestTD(unittest.TestCase):
    def setUp(self):
        # TD:
        # 1
        # 2
        self.td1 = TD({1})
        self.td1.add_child(TD({2}))

        # 123
        # 14
        # 25
        # 26
        self.td2 = TD({1, 2, 3})
        self.td2.add_child(TD({1, 4}))
        self.td2.add_child(TD({2, 5}))
        self.td2.add_child(TD({2, 6}))

    def test_weakly_normalize(self):
        td1 = copy.deepcopy(self.td1)
        assert td1 == self.td1  # deliberately not self.assertEqual
        td1.weakly_normalize()
        self.assertEqual(td1, self.td1)

        td2 = copy.deepcopy(self.td2)
        assert td2 == self.td2  # deliberately not self.assertEqual
        td2.weakly_normalize()

        # Expected result:
        join_node = TD({1, 2})
        child1 = TD({1, 2})
        child2 = TD({1, 2})
        child3 = TD({1, 2})
        child1.add_child(TD({1, 4}))
        child2.add_child(TD({2, 5}))
        child3.add_child(TD({2, 6}))
        for c in [child1, child2, child3]:
            join_node.add_child(c)
        expected = TD({1, 2, 3})
        expected.add_child(join_node)
        self.assertEqual(td2, expected)
def experiment():
    plotting = True
    if plotting:
        d = DynamicPlot(window_x=100, title='On-Policy Predictions',
                        xlabel='Time_Step', ylabel='Value')
        d.add_line('Prediction')
        d.add_line('State')

    # init problem
    num_state = 25
    alpha = 0.5
    lam = 0.95
    gamma = 0.97

    # init state, action, and time step
    state = 0
    t = 0

    # init the solution
    soul = TD(num_state)

    # TD(lambda) algorithm main loop
    while True:
        state_prime, stim = next_state(state, num_state)
        if state_prime == 0:
            soul.reset_et()
        else:
            delta = soul.update(feature_vector(state, num_state), stim,
                                feature_vector(state_prime, num_state),
                                alpha, gamma, gamma, lam)
        if plotting:  # guard so the loop also runs with plotting disabled
            d.update(t, [soul.get_value(feature_vector(state, num_state)), 0])
        state = state_prime
        t += 1
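next_state and feature_vector are not defined in this snippet. A plausible minimal version, assuming tabular one-hot features and a simple random-walk chain that returns (next_state, stimulus), might look like the following; treat these as illustrative stand-ins rather than the original helpers.

import numpy as np

def feature_vector(state, num_state):
    """One-hot (tabular) encoding of a state index."""
    phi = np.zeros(num_state)
    phi[state] = 1.0
    return phi

def next_state(state, num_state):
    """Random walk along a chain; wraps back to state 0 at either end.
    Returns (next_state, stimulus), with a stimulus of 1 only at the last state."""
    step = 1 if np.random.rand() < 0.5 else -1
    state_prime = state + step
    if state_prime < 0 or state_prime >= num_state:
        state_prime = 0
    stim = 1.0 if state_prime == num_state - 1 else 0.0
    return state_prime, stim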
def add_parent_to_roots(self, bag):
    new_root = TD(bag)
    for node in self.td_roots:
        new_root.add_child(node)
    self.td_roots = [new_root]
""" Author: Jeremy M. Stober Program: TD_EXAMPLE.PY Date: Friday, February 24 2012 Description: Examples using TD algorithms to learn value functions. """ from gridworld.boyan import Boyan from gridworld.chainwalk import Chainwalk from cartpole import CartPole from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac # a simple environment env = Boyan() learner = TD(13, 0.1, 1.0, 0.8) learner.learn(1000,env,env.random_policy) print learner.V env = Chainwalk() learnerq = TDQ(2,4, 0.1, 0.9, 0.8) import pdb env = CartPole() #learnerq = SarsaCmac(2,0.01,0.95,0.9,0.01) #learnerq = Sarsa(2,170,0.001,0.95,0.5,0.01) #learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9) # From an old Sutton paper -- seems to work quite well. learnerq = ActorCriticCmac(2, 0.5, 1.0, 0.95, 0.8, 0.9) # Clearly does some learning, but not nearly as well. Policy not as stable. learnerq.learn(1000,env)
import copy
import numpy as np
# AlgPlugin, TD and common are project-local modules (imports not shown)


class TDLearning(AlgPlugin):
    def __init__(self, alpha, gamma, eligibility, epsilon, next_action_considered):
        super().__init__()

        # store the hyperparameters
        self.alpha = alpha
        self.gamma = gamma
        self.eligibility = eligibility
        self.epsilon = epsilon
        self.next_action_considered = next_action_considered

        # use 'epsilon greedy' to choose an action while the agent is in a state
        #self.action_selection = common.epsilon_greedy
        # __experiment
        self.action_selection = common.explore
        self.steps = 0

        # use the TD class to do the actual algorithm
        self.td = TD(alpha, gamma, eligibility,
                     self.value_callback, self.update_callback)

        # filled in when we know the environment layout
        self.n_features = None
        self.action_space = None

        # underlying storage for the value per action per state;
        # it's called a 'q-table' by convention, but it can back any TD learning
        # as long as the environment has a finite number of states
        self.qtable = {}

        # delayed learning
        self.qtable_future = None
        self._delayed_learning = False

    @property
    def delayed_learning(self):
        return self._delayed_learning

    @delayed_learning.setter
    def delayed_learning(self, onoff):
        assert isinstance(onoff, bool)
        if onoff != self._delayed_learning:
            if onoff:
                self.qtable_future = copy.deepcopy(self.qtable)
            else:
                assert self.qtable_future is not None
                self.qtable = self.qtable_future
                self.qtable_future = None
            self._delayed_learning = onoff

    def delayed_learning_catchup(self):
        if self._delayed_learning:
            self.qtable = copy.deepcopy(self.qtable_future)

    def value_callback(self, state, action):
        """The TD algorithm calls this function to query the action-value of a state."""
        #print("value_callback(): state: {}, action: {}".format(state, action))
        if action is None:
            return np.max(self.qtable[state])
        if self.delayed_learning:
            if self.qtable_future.get(state) is None:
                self.qtable_future[state] = [np.float32(0)] * self.action_space.n_actions
            return self.qtable_future[state][action]
        return self.qtable[state][action]

    def update_callback(self, state, action, delta):
        """The TD algorithm calls this function to update the action-value of a state."""
        # wk_debug
        if delta != 0:
            #if delta < 0.0000000000000001 and delta > -0.000000000000001:
            #    print("update_callback(): state: {}, action: {}, delta: {:.24f}".format(state, action, delta))
            if delta > 10000000000 or delta < -100000000000:
                print("update_callback(): state: {}, action: {}, delta: {:.24f}".format(state, action, delta))
        if self.delayed_learning:
            if self.qtable_future.get(state) is None:
                self.qtable_future[state] = [np.float32(0)] * self.action_space.n_actions
            self.qtable_future[state][action] += np.float32(delta)
        else:
            self.qtable[state][action] += np.float32(delta)

    ##############################################################
    #                                                            #
    #    Below is the implementation of 'AlgPlugin' interface    #
    #                                                            #
    ##############################################################

    def layout(self, n_features, action_space, preset_states_list):
        # __experiment
        self.steps = 0

        self.n_features = n_features
        self.action_space = action_space
        self.qtable = {}
        self.qtable_future = None
        self._delayed_learning = False
        for (state, value, is_terminal) in preset_states_list:
            self.qtable[state] = [np.float32(value)] * self.action_space.n_actions

    def episode_start(self, episode, state):
        #super().episode_start(episode, state)
        if self.qtable.get(state) is None:
            self.qtable[state] = [np.float32(0)] * self.action_space.n_actions
        self.td.episode_start(state)
        return self.next_action(state)

    def one_step(self, state, action, reward, state_next):
        if self.qtable.get(state) is None:
            self.qtable[state] = [np.float32(0)] * self.action_space.n_actions
        if self.qtable.get(state_next) is None:
            self.qtable[state_next] = [np.float32(0)] * self.action_space.n_actions

        next_action_index = self._next_action_index(state_next)
        if self.next_action_considered:
            use_this_action = next_action_index
        else:
            use_this_action = None

        # translate the action to an action index; the underlying TD algorithm
        # assumes that actions are non-negative integers
        action_index = self.action_space.action_index(action)
        self.td.step(state, action_index, reward, state_next, use_this_action)
        return self.action_space.action_at(next_action_index)

    def episode_end(self):
        # __experiment
        self.steps += 1

        self.td.episode_end()

    def _next_action_index(self, state):
        # __experiment
        action_index = self.action_selection(self.steps, self.qtable[state])
        #print("next action index:", action_index)
        return action_index
        #return self.action_selection(self.epsilon, self.qtable[state])

    def next_action(self, state):
        """Given the current state, select the agent's next action using the
        configured selection algorithm."""
        action_index = self._next_action_index(state)
        return self.action_space.action_at(action_index)

    def best_action(self, state):
        """Select the action that has the maximum value in the given state."""
        action_index = np.argmax(self.qtable[state])
        return self.action_space.action_at(action_index)

    def get_action_values(self, state):
        return self.qtable.get(state)

    def get_action_values_dict(self, state):
        action_values = self.qtable.get(state)
        if action_values is None:
            return None
        return {self.action_space.action_at(i): v
                for i, v in enumerate(action_values)}

    def whole_episode(self, one_episode):
        self.episode_start(one_episode[0][0])
        for state, action, reward, state_next in one_episode:
            self.one_step(state, action, reward, state_next)  # was self.step(); one_step is this class's per-transition method
        self.episode_end()
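The plugin delegates the actual bootstrapping to the TD object via the two callbacks above; that class is not shown here. The sketch below is a guess at the minimal shape such a callback-driven core could take (one-step SARSA when a next action is supplied, Q-learning otherwise), omitting eligibility traces for brevity; the name MinimalTD and its interface are illustrative assumptions, not the project's TD class.

class MinimalTD:
    """Callback-driven one-step TD core: the caller owns the value table and
    exposes it via value_fn(state, action) and update_fn(state, action, delta)."""

    def __init__(self, alpha, gamma, value_fn, update_fn):
        self.alpha = alpha
        self.gamma = gamma
        self.value_fn = value_fn
        self.update_fn = update_fn

    def step(self, state, action, reward, state_next, next_action=None):
        # next_action given -> SARSA target; next_action None -> Q-learning target
        if next_action is None:
            bootstrap = self.value_fn(state_next, None)         # max over actions
        else:
            bootstrap = self.value_fn(state_next, next_action)  # value of the chosen action
        delta = self.alpha * (reward + self.gamma * bootstrap - self.value_fn(state, action))
        self.update_fn(state, action, delta)


# toy usage with a dict-of-lists q-table and two actions per state
q = {0: [0.0, 0.0], 1: [0.0, 0.0]}
td = MinimalTD(0.5, 0.9,
               lambda s, a: max(q[s]) if a is None else q[s][a],
               lambda s, a, d: q[s].__setitem__(a, q[s][a] + d))
td.step(0, 1, reward=1.0, state_next=1)
print(q[0])  # [0.0, 0.5]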
#! /usr/bin/env python
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
#learnerq = SarsaCmac(2, 0.01, 0.95, 0.9, 0.01)
#learnerq = Sarsa(2, 170, 0.001, 0.95, 0.5, 0.01)
#learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9)  # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(2, 0.5, 1.0, 0.95, 0.8, 0.9)  # Clearly does some learning, but not nearly as well. Policy not as stable.
learnerq.learn(1000, env)