Code Example #1
File: decomposer.py Project: bbliem/decomp
    def decompose(self):
        """Return a list of partial decompositions using parameters given in
        the constructor."""
        # XXX SIGALRM works only on some platforms
        if self.time_limit:
            signal.signal(signal.SIGALRM, self._on_timeout)
            signal.alarm(self.time_limit)

        try:
            self.do_eliminations()
        except Exception:
            log.debug("Decomposing aborted due to timeout")

        if self.time_limit:
            signal.alarm(0)

        for td in self.td_roots:
            td.remove_subset_children()
        self.td_roots = [td.move_superset_children() for td in self.td_roots]
        self.connect_roots(self.graph.vertices)
        for i, td in enumerate(self.td_roots):
            self.normalize(td)
            if self.minimize_roots:
                intersection_with_remainder = td.node & self.remainder()
                if len(intersection_with_remainder) < len(td.node):
                    new_root = TD(intersection_with_remainder)
                    new_root.add_child(td)
                    self.td_roots[i] = new_root

        # TD.canonize_root()?
        # TD.sort()?
        return self.td_roots
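
The timeout handling above relies on SIGALRM, which (as the XXX comment notes) is only available on Unix-like platforms. A minimal, self-contained sketch of the same pattern, using hypothetical names (DecompositionTimeout, run_with_time_limit, work) that are not part of the project, could look like this:

import signal

class DecompositionTimeout(Exception):
    """Raised by the alarm handler when the time limit expires."""

def _on_timeout(signum, frame):
    raise DecompositionTimeout()

def run_with_time_limit(work, seconds):
    # SIGALRM only exists on Unix-like platforms
    signal.signal(signal.SIGALRM, _on_timeout)
    signal.alarm(seconds)
    try:
        work()
    except DecompositionTimeout:
        pass  # keep whatever partial result was computed before the alarm fired
    finally:
        signal.alarm(0)  # cancel any pending alarm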
Code Example #2
	def __init__(self, alpha, gamma, eligibility, epsilon, next_action_considered):
		super().__init__()

		# store the hyperparameters
		self.alpha = alpha
		self.gamma = gamma
		self.eligibility = eligibility
		self.epsilon = epsilon
		self.next_action_considered = next_action_considered

		# use 'epsilon greedy' to choose an action while the agent is in a state
		#self.action_selection = common.epsilon_greedy

		# __experiment
		self.action_selection = common.explore
		self.steps = 0

		# use the TD class to run the actual algorithm
		self.td = TD(alpha, gamma, eligibility, self.value_callback, self.update_callback)

		# filled in when we know environment layout
		self.n_features = None
		self.action_space = None

		# underlying storage for the value of each action in each state
		# it is called a 'q-table' by convention, but it can be used for any TD learning
		# as long as the environment has a finite number of states
		self.qtable = {}  

		# delayed learning
		self.qtable_future = None
		self._delayed_learning = False
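
The q-table described in the comments above is just a dict mapping a hashable state to a list with one value per action; rows are created lazily when a state is first seen (as the callbacks in Code Example #11 below do). A tiny illustration with made-up states and action count, not code from the project:

qtable = {}
n_actions = 4
state = (2, 3)                               # any hashable state works as a key
qtable.setdefault(state, [0.0] * n_actions)  # lazily create the row for this state
qtable[state][1] += 0.25                     # update the value of action index 1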
Code Example #3
File: decomposer.py Project: bbliem/decomp
    def eliminate(self, vertex):
        new_bag = frozenset(self.graph.neighborhood(vertex))
        new_subtree = TD(new_bag)

        # Eliminate vertex and connect neighbors
        for (x, y) in [(x, y) for x in self.graph.neighbors[vertex]
                       for y in self.graph.neighbors[vertex] if x < y]:
            self.graph.add_edge(x, y)
        self.graph.remove_vertex(vertex)

        # Add children to this new subtree
        for subtree in self.bags_containing[vertex]:
            if not subtree.parent:
                new_subtree.add_child(subtree)
                self.td_roots.remove(subtree)

        # The new subtree has no parent yet
        self.td_roots.append(new_subtree)

        # For each bag element, remember it's contained in this new node
        for x in new_bag:
            self.bags_containing[x].append(new_subtree)
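
eliminate() performs one step of vertex elimination: the vertex's current neighborhood becomes the bag of a new decomposition node, the neighbors are completed into a clique, and the vertex is removed from the graph. A minimal sketch of just the graph part, on a plain dict-of-sets adjacency structure (a hypothetical neighbors dict, not the project's Graph class):

from itertools import combinations

def eliminate_vertex(neighbors, vertex):
    # neighbors: dict mapping each vertex to the set of its adjacent vertices
    bag = frozenset(neighbors[vertex])     # bag of the new decomposition node
    for x, y in combinations(bag, 2):      # connect the former neighbors into a clique
        neighbors[x].add(y)
        neighbors[y].add(x)
    for x in bag:                          # detach and delete the eliminated vertex
        neighbors[x].discard(vertex)
    del neighbors[vertex]
    return bag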
Code Example #4
    def setUp(self):
        # TD:
        # 1
        #   2
        self.td1 = TD({1})
        self.td1.add_child(TD({2}))

        # 123
        #   14
        #   25
        #   26
        self.td2 = TD({1, 2, 3})
        self.td2.add_child(TD({1, 4}))
        self.td2.add_child(TD({2, 5}))
        self.td2.add_child(TD({2, 6}))
Code Example #5
def estimate_values_by_td(env, policy, features, true_values, lambda_, alpha):
    """Computes value-estimates using TD(\lambda) algorithm.

    Args:
        env (ChainWorld): The RL environment.
        policy (Policy): The policy that needs to be evaluated.
        features (StateFeatures): Feature function for states of the env.
        true_values (numpy.ndarray): The true state-values for given policy.
        lambda_ (float): \lambda parameter of TD(\lambda).
        alpha (float): Learning rate.

    Returns:
        list, list: Final value estimates of all states and the MSVE of the
            algorithm's estimates over the course of its execution.
    """
    td = TD(policy=policy, features=features, gamma=0.95,
            alpha=alpha, lambda_=lambda_, true_values=true_values)
    td.run(num_episodes)

    v = []
    for state in env.state_iterator():
        feature_vector = features.vector(state)
        v.append(utils.state_value(feature_vector, td.theta))
    return v, td.msve
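
The TD class used here is not shown. As a reference for what a linear TD(\lambda) evaluator with accumulating eligibility traces computes, here is a compact, self-contained sketch using the standard update rules (delta = r + gamma*v(s') - v(s), e <- gamma*lambda*e + phi(s), theta <- theta + alpha*delta*e); the function and argument names are hypothetical:

import numpy as np

def td_lambda_episode(transitions, phi, theta, alpha, gamma, lambda_):
    """Run one episode of linear TD(lambda) with accumulating traces.

    transitions: iterable of (state, reward, next_state, done) tuples
    phi: feature function mapping a state to a numpy vector
    """
    e = np.zeros_like(theta)                   # eligibility trace
    for state, reward, next_state, done in transitions:
        v = phi(state) @ theta                 # current value estimate
        v_next = 0.0 if done else phi(next_state) @ theta
        delta = reward + gamma * v_next - v    # TD error
        e = gamma * lambda_ * e + phi(state)   # accumulate the trace
        theta = theta + alpha * delta * e      # move the weights along the trace
    return theta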
Code Example #6
class TestTD(unittest.TestCase):
    def setUp(self):
        # TD:
        # 1
        #   2
        self.td1 = TD({1})
        self.td1.add_child(TD({2}))

        # 123
        #   14
        #   25
        #   26
        self.td2 = TD({1, 2, 3})
        self.td2.add_child(TD({1, 4}))
        self.td2.add_child(TD({2, 5}))
        self.td2.add_child(TD({2, 6}))

    def test_weakly_normalize(self):
        td1 = copy.deepcopy(self.td1)
        assert td1 == self.td1  # deliberately not self.assertEqual
        td1.weakly_normalize()
        self.assertEqual(td1, self.td1)

        td2 = copy.deepcopy(self.td2)
        assert td2 == self.td2  # deliberately not self.assertEqual
        td2.weakly_normalize()
        # Expected result:
        join_node = TD({1, 2})
        child1 = TD({1, 2})
        child2 = TD({1, 2})
        child3 = TD({1, 2})
        child1.add_child(TD({1, 4}))
        child2.add_child(TD({2, 5}))
        child3.add_child(TD({2, 6}))
        for c in [child1, child2, child3]:
            join_node.add_child(c)
        expected = TD({1, 2, 3})
        expected.add_child(join_node)
        self.assertEqual(td2, expected)
Code Example #7
def experiment():

    plotting = True
    if plotting:
        d = DynamicPlot(window_x=100,
                        title='On-Policy Predictions',
                        xlabel='Time_Step',
                        ylabel='Value')
        d.add_line('Prediction')
        d.add_line('State')

    # init problem
    num_state = 25

    alpha = 0.5
    lam = 0.95
    gamma = 0.97

    # init state, action, and time step
    state = 0
    t = 0

    # init the solution
    soul = TD(num_state)
    # TD lambda algorithm main loop
    while True:
        state_prime, stim = next_state(state, num_state)
        if state_prime == 0:
            soul.reset_et()
        else:
            delta = soul.update(feature_vector(state, num_state), stim,
                                feature_vector(state_prime, num_state), alpha,
                                gamma, gamma, lam)
        if plotting:
            d.update(t, [soul.get_value(feature_vector(state, num_state)), 0])
        state = state_prime
        t += 1
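
next_state and feature_vector are helpers defined elsewhere in that project and are not shown here. For a tabular chain of num_state states, feature_vector is plausibly a one-hot encoding; a guess at such a helper, purely illustrative:

import numpy as np

def feature_vector(state, num_state):
    # one-hot (tabular) feature encoding for a chain of num_state states
    phi = np.zeros(num_state)
    phi[state] = 1.0
    return phi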
Code Example #8
File: decomposer.py Project: bbliem/decomp
    def add_parent_to_roots(self, bag):
        new_root = TD(bag)
        for node in self.td_roots:
            new_root.add_child(node)
        self.td_roots = [new_root]
Code Example #9
    def test_weakly_normalize(self):
        td1 = copy.deepcopy(self.td1)
        assert td1 == self.td1  # deliberately not self.assertEqual
        td1.weakly_normalize()
        self.assertEqual(td1, self.td1)

        td2 = copy.deepcopy(self.td2)
        assert td2 == self.td2  # deliberately not self.assertEqual
        td2.weakly_normalize()
        # Expected result:
        join_node = TD({1, 2})
        child1 = TD({1, 2})
        child2 = TD({1, 2})
        child3 = TD({1, 2})
        child1.add_child(TD({1, 4}))
        child2.add_child(TD({2, 5}))
        child3.add_child(TD({2, 6}))
        for c in [child1, child2, child3]:
            join_node.add_child(c)
        expected = TD({1, 2, 3})
        expected.add_child(join_node)
        self.assertEqual(td2, expected)
Code Example #10
File: td_example.py Project: stober/td
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""


from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
#learnerq = SarsaCmac(2,0.01,0.95,0.9,0.01)
#learnerq = Sarsa(2,170,0.001,0.95,0.5,0.01)
#learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9) # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(2, 0.5, 1.0, 0.95, 0.8, 0.9) # Clearly does some learning, but not nearly as well. Policy not as stable.
learnerq.learn(1000,env)
Code Example #11
class TDLearning(AlgPlugin):
	def __init__(self, alpha, gamma, eligibility, epsilon, next_action_considered):
		super().__init__()

		# store the hyperparameters
		self.alpha = alpha
		self.gamma = gamma
		self.eligibility = eligibility
		self.epsilon = epsilon
		self.next_action_considered = next_action_considered

		# use 'epsilon greedy' to choose an action while the agent is in a state
		#self.action_selection = common.epsilon_greedy

		# __experiment
		self.action_selection = common.explore
		self.steps = 0

		# use the TD class to run the actual algorithm
		self.td = TD(alpha, gamma, eligibility, self.value_callback, self.update_callback)

		# filled in when we know environment layout
		self.n_features = None
		self.action_space = None

		# underlying storage for the value of each action in each state
		# it is called a 'q-table' by convention, but it can be used for any TD learning
		# as long as the environment has a finite number of states
		self.qtable = {}  

		# delayed learning
		self.qtable_future = None
		self._delayed_learning = False

	@property
	def delayed_learning(self):
		return self._delayed_learning

	@delayed_learning.setter
	def delayed_learning(self, onoff):
		assert onoff == True or onoff == False

		if onoff != self._delayed_learning:
			if onoff == True:
				self.qtable_future = copy.deepcopy(self.qtable)
			else:
				assert self.qtable_future != None
				self.qtable = self.qtable_future
				self.qtable_future = None

			self._delayed_learning = onoff

	def delayed_learning_catchup(self):
		if self._delayed_learning == True:
			self.qtable = copy.deepcopy(self.qtable_future)

	def value_callback(self, state, action):
		"""TD algorithm call this function to query action-value of a state"""

		#print("value_callback(): state: {}, action: {}".format(state, action))

		if action == None:
			return np.max(self.qtable[state])
		else:
			if self.delayed_learning:
				if self.qtable_future.get(state) == None:
					self.qtable_future[state] = [np.float32(0)] * self.action_space.n_actions
				return self.qtable_future[state][action]
			else:
				return self.qtable[state][action]

	def update_callback(self, state, action, delta):
		"""TD algorithm call this function to update action-value of a state"""

		# wk_debug
		if delta != 0:
			#if delta < 0.0000000000000001 and delta > -0.000000000000001:
				#print("update_callback(): state: {}, action: {}, delta: {:.24f}".format(state, action, delta))
			if delta > 10000000000 or delta < -100000000000:
				print("update_callback(): state: {}, action: {}, delta: {:.24f}".format(state, action, delta))

		if self.delayed_learning:
			if self.qtable_future.get(state) == None:
				self.qtable_future[state] = [np.float32(0)] * self.action_space.n_actions
			self.qtable_future[state][action] += np.float32(delta)
		else:
			self.qtable[state][action] += np.float32(delta)

	##############################################################
	#                                                            #
	#    Below is the implementation of 'AlgPlugin' interface    #
	#                                                            #
	##############################################################
	def layout(self, n_features, action_space, preset_states_list):
		# __experiment
		self.steps = 0

		self.n_features = n_features
		self.action_space = action_space
		self.qtable = {}
		self.qtable_future = None
		self._delayed_learning = False

		for (state, value, is_terminal) in preset_states_list:
			self.qtable[state] = [np.float32(value)] * self.action_space.n_actions

	def episode_start(self, episode, state):
		#super().episode_start(episode, state)
		if self.qtable.get(state) == None:
			self.qtable[state] = [np.float32(0)] * self.action_space.n_actions
		self.td.episode_start(state)
		return self.next_action(state)

	def one_step(self, state, action, reward, state_next):
		if self.qtable.get(state) == None:
			self.qtable[state] = [np.float32(0)] * self.action_space.n_actions

		if self.qtable.get(state_next) == None:
			self.qtable[state_next] = [np.float32(0)] * self.action_space.n_actions

		next_action_index = self._next_action_index(state_next)
		if self.next_action_considered == True:
			use_this_action = next_action_index
		else:
			use_this_action = None

		# need to translate from action to action_index, underlying TD algorithm
		# assume that actions are non-negative integer
		action_index = self.action_space.action_index(action)

		self.td.step(state, action_index, reward, state_next, use_this_action)
		return self.action_space.action_at(next_action_index)

	def episode_end(self):
		# __experiment
		self.steps += 1

		self.td.episode_end()

	def _next_action_index(self, state):
		# __experiment
		action_index = self.action_selection(self.steps, self.qtable[state])
		#print("next action index:", action_index)
		return action_index
		#return self.action_selection(self.epsilon, self.qtable[state])

	def next_action(self, state):
		"""Given the current state, based on selection algorithm select next action for agent"""
		action_index = self._next_action_index(state)
		return self.action_space.action_at(action_index)

	def best_action(self, state):
		"""Select the action that has max value in a given state"""
		action_index = np.argmax(self.qtable[state])
		return self.action_space.action_at(action_index)

	def get_action_values(self, state):
		return self.qtable.get(state)

	def get_action_values_dict(self, state):
		action_values = self.qtable.get(state)
		if action_values == None:
			return None
		else:
			action_values_dict = {self.action_space.action_at(i):v for i, v in enumerate(action_values)}
			return action_values_dict

	def whole_episode(self, one_episode):
		# replay a recorded episode; episode_start() expects an episode number,
		# so 0 is passed here as a placeholder
		self.episode_start(0, one_episode[0][0])
		for state, action, reward, state_next in one_episode:
			self.one_step(state, action, reward, state_next)
		self.episode_end()
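
The epsilon-greedy selector that is commented out above (common.epsilon_greedy) is not included in this excerpt. As a reference, a typical epsilon-greedy choice over a row of action values might look like the following sketch; the helper is hypothetical, not the project's common module:

import random
import numpy as np

def epsilon_greedy(epsilon, action_values):
    # with probability epsilon explore a random action index, otherwise exploit the greedy one
    if random.random() < epsilon:
        return random.randrange(len(action_values))
    return int(np.argmax(action_values))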
Code Example #12
File: td_example.py Project: tanduong/td
#! /usr/bin/env python
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
#learnerq = SarsaCmac(2,0.01,0.95,0.9,0.01)
#learnerq = Sarsa(2,170,0.001,0.95,0.5,0.01)
#learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9) # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(
    2, 0.5, 1.0, 0.95, 0.8, 0.9
)  # Clearly does some learning, but not nearly as well. Policy not as stable.