Code example #1
class UniformTiledAgent(LinearTDAgent):
    """
    A LinearTDAgent subclass for continuous state spaces that
    automatically tiles the input space.  For high-dimensional inputs,
    the input can be separated into several uniformly distributed
    'receptive fields' (rfs) that may overlap, and each rf is tiled
    separately.

    Parameters:
      num_rfs -- The number of receptive fields to use (default=1)
      rf_width -- The width of the receptive fields
                  (default=[D/num_rfs] where D = input dimensionality)
      num_tilings -- The number of tilings to use for each rf.
      tile_width  -- The width of each tile.
      num_features -- The total combined memory size for all rfs.

    Each separate rf is assumed to use the same tiling parameters.

    Examples:

    D = 9 , num_rfs = 3, rf_width = <default> will give the following

               |-rf0-|     |-rf2-| 
    Features: [ 0 1 2 3 4 5 6 7 8 ]
                     |-rf1-|     

    D = 10 , num_rfs = 3, rf_width = 4 will give the following

               |--rf0--|   |--rf2--| 
    Features: [ 0 1 2 3 4 5 6 7 8 9 ]
                     |--rf1--|     


    RF placements are determined by the function place_rfs.
    
    """

    num_rfs = Integer(default=1, bounds=(1, None))
    rf_width = Parameter(None)
    num_tilings = Integer(default=1, bounds=(1, None))
    tile_width = Number(default=1)

    def __init__(self, **args):
        super(UniformTiledAgent, self).__init__(**args)
        if not self.rf_width:
            self.rf_width = self.num_features / self.num_rfs

    def __call__(self, sensation, reward=None):
        if not is_terminal(sensation):
            sensation = tile_uniform_rfs(
                array(sensation) / self.tile_width, self.num_rfs,
                self.rf_width, self.num_tilings,
                self.num_features / self.num_rfs)
        return super(UniformTiledAgent, self).__call__(sensation, reward)
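
The receptive-field layouts sketched in the docstring can be reproduced with a small stand-alone helper. This is only an illustration of the uniform placement idea; uniform_rf_slices is a hypothetical function, not plastk's place_rfs or tile_uniform_rfs:

# Illustrative only: uniformly place num_rfs receptive fields of width
# rf_width over a D-dimensional input, the first starting at index 0 and
# the last ending at D.  Fields may overlap, as in the D=10 example above.
def uniform_rf_slices(D, num_rfs, rf_width=None):
    if rf_width is None:
        rf_width = D // num_rfs
    if num_rfs == 1:
        starts = [0]
    else:
        step = float(D - rf_width) / (num_rfs - 1)
        starts = [int(round(i * step)) for i in range(num_rfs)]
    return [(s, s + rf_width) for s in starts]

print(uniform_rf_slices(9, 3))      # [(0, 3), (3, 6), (6, 9)]   -- D=9 example
print(uniform_rf_slices(10, 3, 4))  # [(0, 4), (3, 7), (6, 10)]  -- D=10 example
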
Code example #2
File: som.py  Project: QianShiDex/pendulum
class NoveltySOM(SOM):

    alpha_gain =  Number(default=2.0,bounds=(0.0,None))
    radius_gain = Number(default=2.0,bounds=(0.0,None))


    def __init__(self,**params):

        SOM.__init__(self,**params)
        self.error_ratio = 1.0

    def present_input(self,X):
        
        SOM.present_input(self,X)        
        dist = norm( self.get_model_vector(self.winner()) - X )
        self.error_ratio = dist / norm(X)
    
    def alpha(self):
        return SOM.alpha(self) * tanh(self.error_ratio * self.alpha_gain)

    def radius(self):
        return SOM.radius(self) * tanh(self.error_ratio * self.radius_gain)
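
A quick way to see what the gain parameters do: the base learning rate and radius are both scaled by tanh(error_ratio * gain), so a winner that already matches the input (small error_ratio) nearly freezes the map, while novel inputs keep it plastic. A minimal stand-alone sketch of that factor, assuming the default alpha_gain of 2.0:

from math import tanh

alpha_gain = 2.0
for error_ratio in (0.0, 0.1, 0.5, 1.0):
    # multiplier applied to SOM.alpha() by NoveltySOM.alpha()
    print(error_ratio, tanh(error_ratio * alpha_gain))
# 0.0 -> 0.00, 0.1 -> ~0.20, 0.5 -> ~0.76, 1.0 -> ~0.96
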
Code example #3
class EquilibriumGNG(GNG):
    error_threshold = Number(default=1.0, bounds=(0, None))

    def time_to_grow(self):
        from numpy import average, sqrt
        e = average(self.error * self.beta)

        result = (e > self.error_threshold
                  and super(EquilibriumGNG, self).time_to_grow())
        if result:
            self.verbose("average error = %.4e" % e, " -- Time to grow.")
        else:
            self.debug("average error = %.4e" % e, " -- Not growing.")

        return result
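
In other words, growth is suppressed once the beta-scaled average error has settled below error_threshold. A minimal numpy sketch of the test, with made-up values standing in for GNG.error:

from numpy import array, average

beta, error_threshold = 0.0005, 1.0
error = array([[2500.0], [1500.0], [3000.0]])   # hypothetical accumulated errors
e = average(error * beta)
print(e, e > error_threshold)                    # ~1.167, True -> still time to grow
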
Code example #4
class TabularTDAgent(TDAgent):
    """
    A TDAgent for environments with discrete states and actions.
    Sensations/states can be any hashable Python object, and the
    universe of sensations need not be specified in advance. The agent
    stores and updates a separate Q estimate for every (s,a) pair.

    Parameters:

    initial_q -- The initial Q estimate for each (s,a) pair. (default = 0.0)
    
    """

    initial_q = Number(default=0.0)

    def __init__(self, **params):
        super(TabularTDAgent, self).__init__(**params)
        self.reset_q()
        self.reset_e()

    def _start_episode(self, sensation):
        self.reset_e()
        return super(TabularTDAgent, self)._start_episode(sensation)

    def reset_q(self):
        self.q_table = {}

    def reset_e(self):
        self.e = {}

    def Q(self, s, a=None):
        if a is None:
            result = [self.Q(s, a) for a in range(len(self.actions))]
        else:
            result = self.q_table.get((s, a), self.initial_q)
        self.debug('Q(', s, ',', a, ') = ', result)
        return result

    def update_Q(self, s, a, delta, on_policy=True):
        if not on_policy:
            self.reset_e()

        if (s, a) not in self.q_table:
            self.q_table[(s, a)] = self.initial_q

        if self.lambda_:
            to_be_deleted = []
            for x in self.e:
                self.e[x] *= self.lambda_
                if self.e[x] < self.prune_eligibility:
                    to_be_deleted.append(x)
            for x in to_be_deleted:
                del self.e[x]

        if self.replacing_traces:
            self.e[(s, a)] = 1
        else:
            # accumulate, starting from 0 for (s, a) pairs not yet traced
            self.e[(s, a)] = self.e.get((s, a), 0) + 1

        for x, e in self.e.iteritems():
            self.q_table[x] += self.alpha * e * delta
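
The tabular storage is just a dict keyed by (sensation, action) pairs, which is why any hashable sensation works and why unseen pairs read back as initial_q. A tiny stand-alone sketch of that lookup rule:

initial_q = 0.0
q_table = {}
q_table[(('room', 3), 1)] = 0.5            # the state key can be any hashable object

def Q(s, a):
    return q_table.get((s, a), initial_q)  # unseen pairs default to initial_q

print(Q(('room', 3), 1))   # 0.5
print(Q('never-seen', 0))  # 0.0
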
Code example #5
class LinearTDAgent(TDAgent):
    """
    A TD agent that takes a sensation as a 1D numpy vector of
    features and computes Q as a linear function of that sensation,
    using simple gradient descent.  The function is stored in the
    weight matrix self.w, such that Q(s) can be computed as w*s.
    Assumes a discrete set of actions.  Uses replacing eligibility
    traces.

    Parameters:

    num_features -- The number of input features (default = 1)
    initial_w    -- A scalar value with which to initialize the weight
                    matrix (default = 0.0).
    """
    num_features = Integer(default=1, bounds=(1, None))
    initial_w = Number(default=0.0)

    def __init__(self, **params):
        super(LinearTDAgent, self).__init__(**params)
        self.reset_w()
        self.reset_e()

    def _start_episode(self, sensation):
        self.reset_e()
        return super(LinearTDAgent, self)._start_episode(sensation)

    def reset_w(self):
        """
        Reset the weight matrix to self.initial_w.
        """
        self.w = zeros(
            (len(self.actions), self.num_features), 'f') + self.initial_w

    def reset_e(self):
        """
        Reset the eligibility traces for self.w to all zeros.
        """
        self.e = zeros((len(self.actions), self.num_features), 'f') + 0.0

    def Q(self, state, action=None):
        """
        Compute Q(s,a) from W*s.
        """
        if action is None:
            return dot(self.w, state)
        else:
            return dot(self.w[action], state)

    def update_Q(self, sensation, action, delta, on_policy=True):
        """
        Do a linear update of the weights.  
        """
        if self.lambda_ and on_policy:
            self.e *= self.lambda_
            if self.prune_eligibility > 0.0:
                self.e *= (self.e > self.prune_eligibility)
        else:
            self.e *= 0.0

        self.e[action] += sensation

        if self.replacing_traces:
            putmask(self.e, self.e > 1, 1)

        self.w += self.e * (self.alpha / (sum(sensation))) * delta
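
A single update can be traced by hand with plain numpy. The sketch below (toy numbers, not part of the class) follows the same steps as update_Q with replacing traces: decay e, add the feature vector to the chosen action's row, cap at 1, then move the weights by alpha/sum(s) times the TD error:

from numpy import zeros, dot

num_actions, num_features = 2, 4
alpha, lambda_ = 0.1, 0.9
w = zeros((num_actions, num_features))
e = zeros((num_actions, num_features))

sensation = zeros(num_features)
sensation[1] = 1.0
sensation[3] = 1.0
action, delta = 0, 1.0

e *= lambda_                              # decay all traces
e[action] += sensation                    # accumulate on the chosen action's row
e[e > 1] = 1.0                            # replacing traces: cap at 1
w += e * (alpha / sensation.sum()) * delta

print(dot(w[action], sensation))          # 0.1 = alpha * delta
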
Code example #6
class TDAgent(Agent):
    """
    A generic temporal-difference (TD) agent with discrete actions.
    To create a new TD agent, subclass this class and implement the methods
    .Q(sensation,action=None) and .update_Q(sensation,action,delta,on_policy=True).

    Parameters:

    alpha  -- The learning rate, default = 0.1
    gamma  -- The discount factor, default = 1.0
    lambda_ -- The eligibility discount factor, default = 0.0.

    step_method -- The method for doing TD updates: 'sarsa' or 'q_learning'.
                     default = 'sarsa'

    action_selection -- The action selection method, default 'epsilon_greedy'.
                        To change action selection, set this to the name of the new method,
                        e.g. 'softmax'.

    initial_epsilon -- The starting epsilon for epsilon_greedy selection. (default=0.1)
    min_epsilon     -- The minimum (final) epsilon. (default = 0.0)
    epsilon_half_life -- The half-life for epsilon annealing. (default = 1)
    
    initial_temperature -- The starting temperature for softmax (Boltzmann distribution)
                           selection. (default = 1.0)
    min_temperature     -- The min (final) temperature for softmax selection.
                           (default = 0.01)
    temperature_half_life -- The temperature half-life for softmax selection
                           (default = 1)

    actions -- The list of available actions.  Each action can be any Python
               object that the environment understands as an action.
    """

    alpha = Magnitude(default=0.1)
    gamma = Magnitude(default=1.0)
    lambda_ = Magnitude(default=0.0)

    step_method = Parameter(default="sarsa")

    action_selection = Parameter(default="epsilon_greedy")

    # epsilon-greedy selection parameters
    initial_epsilon = Magnitude(default=0.1)
    min_epsilon = Magnitude(default=0.0)
    epsilon_half_life = Number(default=1, bounds=(0, None))

    # softmax selection parameters
    initial_temperature = Number(default=1.0, bounds=(0, None))
    min_temperature = Number(default=0.01, bounds=(0, None))
    temperature_half_life = Number(default=1, bounds=(0, None))

    actions = Parameter(default=[])

    prune_eligibility = Magnitude(default=0.001)
    replacing_traces = Parameter(default=True)

    history_log = Parameter(default=None)
    allow_learning = Parameter(default=True)

    def __init__(self, **args):
        from plastk.misc.utils import LogFile

        super(TDAgent, self).__init__(**args)
        self.nopickle.append('policy_fn')
        self.policy_fn = getattr(self, self.action_selection)

        self.total_steps = 0

        if isinstance(self.history_log, str):
            self._history_file = LogFile(self.history_log)
        elif isinstance(self.history_log, file) or isinstance(
                self.history_log, LogFile):
            self._history_file = self.history_log

    def unpickle(self):
        """
        Called automatically when the agent is unpickled.  Sets
        the action-selection function to its appropriate value.
        """
        super(TDAgent, self).unpickle()
        self.policy_fn = getattr(self, self.action_selection)

    def __call__(self, sensation, reward=None):
        """
        Do a step.  Calls the function selected in self.step_method
        and returns the action.
        """
        training_fn = getattr(self, '_' + self.step_method + '_training')

        action_index = self.learning_step(training_fn, sensation, reward)
        if self.history_log:
            if reward is None:
                self._history_file.write('start\n')
            self._history_file.write(repr(sensation) + '\n')
            self._history_file.write(repr(reward) + '\n')
            if not is_terminal(sensation):
                self._history_file.write(repr(action_index) + '\n')
        return self.actions[action_index]

    def Q(self, sensation, action=None):
        """
        Return Q(s,a).  If action is None, return an array
        of Q-values for each action in self.actions
        with the given sensation.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def update_Q(self, sensation, action, delta, on_policy=True):
        """
        Update Q(sensation,action) by delta.  on_policy indicates
        whether the step that produced the update was on- or
        off-policy.  Any eligibility trace updates should be done from
        within this method.

        You must override this method to implement a TDAgent subclass.
        """
        raise NYI

    def learning_step(self, training_fn, sensation, reward=None):
        """
        Do a step using the learning algorithm specified. Selects an
        action, computes the update and calls the appropriate training
        routine.  Returns the agent's next action.
        """
        if reward is None:
            return self._start_episode(sensation)

        next_action = self.policy(sensation)

        if self.allow_learning:
            training_fn(self.last_sensation, self.last_action, reward,
                        sensation, next_action)

        self.last_sensation = sensation
        self.last_action = next_action
        if isinstance(reward, list):
            self.total_steps += len(reward)
        else:
            self.total_steps += 1
        return self.last_action

    def _sarsa_training(self, sensation, action, reward, next_sensation,
                        next_action):
        """
        Perform a single SARSA training step given (s,a,r,s',a'). 
        """
        rho = self.rho(reward)

        if is_terminal(next_sensation):
            value = 0
        else:
            value = self.Q(next_sensation, next_action)

        last_value = self.Q(sensation, action)
        delta = rho + (self.gamma * value - last_value)

        self.verbose("controller step = %d, rho = %.2f" %
                     (self.total_steps, rho))
        self.verbose(("Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f," +
                      "delta = %.5f, terminal? = %d") %
                     (last_value, value, value - last_value, delta,
                      is_terminal(next_sensation)))

        self.update_Q(sensation, action, delta)

    def _q_learning_training(self,
                             sensation,
                             action,
                             reward,
                             next_sensation,
                             next_action=None):
        """
        Do a single Q-lambda training step given (s,a,r,s').  Can be
        called from outside the normal learning step for off-policy
        training, experience replay, etc.
        """
        rho = self.rho(reward)

        last_Q = self.Q(sensation)
        last_value = last_Q[action]

        if is_terminal(next_sensation):
            value = 0
        else:
            value = max(self.Q(next_sensation))

        delta = rho + (self.gamma * value - last_value)

        self.verbose(
            "r = %.5f, Q(t-1) = %.5f, Q(t) = %.5f, diff = %.5f, delta = %.5f, terminal? = %d"
            % (rho, last_value, value, value - last_value, delta,
               is_terminal(next_sensation)))

        self.update_Q(sensation,
                      action,
                      delta,
                      on_policy=(last_Q[action] == max(last_Q)))

        if delta:
            assert (self.Q(sensation, action) - last_value) / delta < 1.0

    def _start_episode(self, sensation):
        """
        Start a new episode.  Called from self.__call__ when the reward is None.
        """
        self.last_sensation = sensation
        self.last_action = self.policy(sensation)
        return self.last_action

    def policy(self, sensation):
        """
        Given a sensation, return an action.  Uses
        self.action_selection to get a distribution over the agent's
        actions.  Uses self.applicable_actions to prevent selecting
        inapplicable actions.

        Returns 0 if is_terminal(sensation).
        """
        if not is_terminal(sensation):
            actions = self.applicable_actions(sensation)
            return actions[weighted_sample(self.policy_fn(sensation, actions))]
        else:
            # In the terminal state, the action is irrelevant
            return 0

    def epsilon_greedy(self, sensation, applicable_actions):
        """
        Given self.epsilon() and self.Q(), return a distribution over
        applicable_actions as an array where each element contains the
        probability mass for the corresponding action.  I.e. the
        action with the highest Q gets p = 1 - self.epsilon() and the
        others share self.epsilon(), uniformly distributed.
        """
        Q = array([self.Q(sensation, action) for action in applicable_actions])

        # simple epsilon-greedy policy
        # get a vector with a 1 where each max element is, zero elsewhere
        mask = (Q == mmax(Q))

        num_maxes = len(nonzero(mask)[0])
        num_others = len(mask) - num_maxes

        if num_others == 0: return mask

        e0 = self.epsilon() / num_maxes
        e1 = self.epsilon() / num_others

        result = zeros(len(mask)) + 0.0
        putmask(result, mask, 1 - e0)
        putmask(result, mask == 0, e1)
        return result

    def softmax(self, sensation, applicable_actions):
        """
        Given self.temperature() and self.Q(), return a Boltzmann
        distribution over applicable_actions as an array where each
        element contains the probability mass for the corresponding
        action.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(Q, temp)

    def normalized_softmax(self, sensation, applicable_actions):
        """
        Like softmax, except that the Q values are scaled into the
        range [0,1].  May make setting the initial temperature easier than with softmax.
        """
        temp = self.temperature()
        self.verbose("softmax, temperature = %.3f" % temp)
        Q = array([self.Q(sensation, action) for action in applicable_actions])
        return softmax(normalize_minmax(Q), temp)

    def temperature(self):
        """
        Using initial_temperature, min_temperature, and temperature_half_life,
        compute the temperature after self.total_steps steps.
        """
        Ti = self.initial_temperature
        Tm = self.min_temperature
        decay = log(2) / self.temperature_half_life
        return Tm + (Ti - Tm) * exp(-decay * self.total_steps)

    def epsilon(self):
        """
        Using initial_epsilon, min_epsilon, and epsilon_half_life,
        compute epsilon after self.total_steps steps.
        """
        Ei = self.initial_epsilon
        Em = self.min_epsilon
        decay = log(2) / self.epsilon_half_life
        return Em + (Ei - Em) * exp(-decay * self.total_steps)

    def rho(self, reward):
        """
        Compute the reward since the last step.
        
        If the reward is a scalar, it is returned unchanged.

        If reward is a list, it is assumed to be a list of rewards
        accrued at a constant time step, and the discounted sum is
        returned.
        """
        if isinstance(reward, list):
            result = 0
            for r in reward:
                result = self.gamma * result + r
        else:
            result = reward
        return result

    def applicable(self, action, sensation):
        """
        If the given action has a method called 'applicable' return
        the value of action.applicable(sensation), otherwise return True.
        """
        if 'applicable' in dir(action):
            return action.applicable(sensation)
        else:
            return True

    def applicable_actions(self, sensation):
        """
        Return a list of the actions that are applicable to the given
        sensation.
        """
        return [
            a for a in range(len(self.actions))
            if self.applicable(self.actions[a], sensation)
        ]
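
Both annealing schedules follow the same half-life rule: the gap between the initial and minimum value is multiplied by exp(-ln(2)/half_life * t), so it halves every half_life steps. A stand-alone sketch of epsilon() with illustrative settings:

from math import log, exp

initial_epsilon, min_epsilon, half_life = 0.1, 0.0, 100

def epsilon(total_steps):
    decay = log(2) / half_life
    return min_epsilon + (initial_epsilon - min_epsilon) * exp(-decay * total_steps)

print(epsilon(0))    # 0.1
print(epsilon(100))  # ~0.05   (one half-life later)
print(epsilon(200))  # ~0.025
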
Code example #7
File: som.py  Project: QianShiDex/pendulum
class SOM(VQ):

    dim = Integer(default=2,bounds=(0,None))
    xdim = Integer(default=1,bounds=(0,None))
    ydim = Integer(default=1,bounds=(0,None))

    rmin = Number(default=0.0)
    rmax = Number(default=1.0)
    
    alpha_0 = Magnitude(default=0.5)
    radius_0 = Number(default=1.0,bounds=(0.0,None))
    
    response_exponent = Number(default=2)
    
    def __init__(self,**params):

        super(SOM,self).__init__(**params)

        self.weights = rand.uniform(self.rmin,self.rmax,
                                           (self.ydim,self.xdim,self.dim))

        self.activation = numpy.zeros( (self.ydim,self.xdim), 'f')
        self.count = 0

    ###########################################
    def init_training(self,alpha_0=None,
                      radius_0=None,
                      training_length=None):

        self.count = 0

        if alpha_0:
            self.alpha_0 = alpha_0
        if radius_0:
            self.radius_0 = radius_0

        self.half_life = training_length/8

    def alpha(self):
        return self.alpha_0 * decay(self.count,self.half_life)
    def radius(self):
        return self.radius_0 * decay(self.count,self.half_life)


    ##########################################
    def present_input(self,X):

        for y in range(self.ydim):
            for x in range(self.xdim):                
                self.activation[y,x] = norm(X - self.weights[y,x])**self.response_exponent

        self.activation = 1/self.activation
        
        if inf in self.activation:
            win = self.winner()
            self.activation.flat[win] = 0
            self.activation -= self.activation
            self.activation.flat[win] = 1.0
        else:
            self.activation /= sum(self.activation.flat)


    def train(self,X):

        self.present_input(X)

        wy,wx = self.winner_coords()

        self.debug("Winner coords = "+`(wy,wx)`)

        int_radius = int(numpy.floor(self.radius()))

        self.debug("Training radius = %.2f" % self.radius())

        ymin = max(0,wy-int_radius)
        ymax = min(wy+int_radius,self.ydim-1)
        xmin = max(0,wx-int_radius)
        xmax = min(wx+int_radius,self.xdim-1)

        self.debug('y range = ' + repr((ymin,ymax)))
        self.debug('x range = ' + repr((xmin,xmax)))

        for y in range(ymin,ymax+1):
            for x in range(xmin,xmax+1):
                lattice_dist = sqrt((wx-x)**2 + (wy-y)**2)
                self.debug("Trying cell %d,%d"%(x,y))
                if  lattice_dist <= self.radius():
                    self.debug("Training cell %d,%d"%(x,y))
                    rate = self.alpha() * gaussian(lattice_dist,self.radius())
                    self.weights[y,x] += rate * (X - self.weights[y,x])
                                   
        self.count += 1 

    def train_batch(self,data,epochs):

        self.init_training(training_length=len(data)*epochs)

        for i in xrange(epochs):
            self.message("Starting epoch",i)
            for x in rand.shuffle(data):
                self.train(x)
            
    def winner(self):
        return numpy.argmax(self.activation.flat)

    def winner_coords(self):
        pos = numpy.argmax(self.activation.flat)
        # row-major flat index: pos = y*xdim + x
        return (pos//self.xdim, pos%self.xdim)

    def get_model_vector(self,index):
        if type(index) == int:
            y = index//self.xdim
            x = index%self.xdim
        else:
            # assume it's a tuple
            x,y = index
        return self.weights[y,x]

    def num_model_vectors(self):
        return len(self.activation.flat)
    def get_activation(self):
        return self.activation.flat
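
The winner bookkeeping relies on row-major index arithmetic: for a (ydim, xdim) activation grid the flat index is pos = y*xdim + x, so the row is pos // xdim and the column is pos % xdim, as used in winner_coords above. A two-line stand-alone check:

ydim, xdim = 3, 5
pos = 7
y, x = divmod(pos, xdim)        # row = pos // xdim, column = pos % xdim
print((y, x))                   # (1, 2)
print(y * xdim + x == pos)      # True
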
Code example #8
class GNG(VQ):

    dim = Integer(default=2, bounds=(1, None))
    rmin = Number(default=0.0)
    rmax = Number(default=1.0)

    e_b = Magnitude(default=0.05)
    e_n = Magnitude(default=0.0006)
    lambda_ = Integer(default=200, bounds=(1, None))
    beta = Magnitude(default=0.0005)
    alpha = Magnitude(default=0.5)
    max_age = Integer(default=100, bounds=(1, None))

    response_exponent = Number(default=2)

    activation_function = Parameter(default='reciprocal')

    grow_callback = Parameter(default=None)
    shrink_callback = Parameter(default=None)

    initial_num_units = Integer(default=2, bounds=(1, None))
    initial_connections_per_unit = Integer(default=0, bounds=(0, None))

    normalize_error = Parameter(default=True)

    def __init__(self, **params):
        from plastk.base.rand import uniform
        from numpy import zeros
        super(GNG, self).__init__(**params)

        N = self.initial_num_units

        self.weights = uniform(self.rmin, self.rmax, (N, self.dim))
        self.dists = zeros((N, 1)) * 0.0
        self.error = zeros((N, 1)) * 0.0

        self.connections = [{} for i in range(N)]

        self.last_input = zeros(self.dim)

        self.count = 0

        if self.initial_connections_per_unit > 0:
            for w in self.weights:
                self.present_input(w)
                ww = self.winners(self.initial_connections_per_unit + 1)
                i = ww[0]
                for j in ww[1:]:
                    self.add_connection(i, j)

        self.nopickle += ['_activation_fn']
        self.unpickle()

    def unpickle(self):
        self._activation_fn = getattr(self,
                                      self.activation_function + '_activation')

        if hasattr(self, 'units'):
            # if the gng has a units attrib, it's the old version,
            # so convert it to the new version.
            self.weights = array([u.weights for u in self.units])
            self.error = array([u.error for u in self.units])
            self.dists = array([u.distance for u in self.units])
            self.connections = []
            for u in self.units:
                conn_dict = {}
                for k, v in u.connections.iteritems():
                    conn_dict[self.units.index(k)] = v
                self.connections.append(conn_dict)
            del self.units

    def get_model_vector(self, i):
        return self.weights[i]

    def num_model_vectors(self):
        return len(self.weights)

    def add_connection(self, a, b):
        if b not in self.connections[a]:
            self.verbose("Adding connection between", a, "and", b)

        self.connections[b][a] = 0
        self.connections[a][b] = 0

    def del_connection(self, a, b):

        self.verbose("Deleting connection between", a, "and", b)

        del (self.connections[b][a])
        del (self.connections[a][b])

    def del_unit(self, x):

        self.verbose("Deleting unit", x)

        if self.shrink_callback:
            self.shrink_callback(x)

        # remove the connections for unit x
        del self.connections[x]

        # iterate through the connection dictionaries decrementing
        # all the connection numbers greater than x
        for i, conn_dict in enumerate(self.connections):
            new_dict = {}
            for k, v in conn_dict.items():
                assert x != k
                if k > x:
                    new_dict[k - 1] = v
                else:
                    new_dict[k] = v
            self.verbose("old connections for unit", i, "=", conn_dict)
            self.verbose("new connections for unit", i, "=", new_dict)
            self.connections[i] = new_dict

        # set up slices for the items before and after
        # item x
        before = slice(0, x)
        after = slice(x + 1, len(self.weights))

        # remove the weights for unit x
        self.weights = join((self.weights[before], self.weights[after]))

        # remove the error accumulator for unit x
        self.error = join((self.error[before], self.error[after]))

        # remove the distance value for unit x
        self.dists = join((self.dists[before], self.dists[after]))

    def present_input(self, X):
        self.dists = matrixnorm(self.weights - X)
        self.last_input = X
        self.new_input = True

    def get_activation(self):
        if self.new_input:
            self._activation_fn()
            self.new_input = False
        return self.__activation

    def reciprocal_activation(self):
        self.__activation = 1 / self.dists

        if inf in self.__activation:
            win = self.winner()
            self.__activation.flat[win] = 0
            self.__activation -= self.__activation
            self.__activation.flat[win] = 1.0
        else:
            self.__activation /= sum(self.__activation.flat)
        return self.__activation

    def gaussian_activation(self):
        x = self.dists
        radii = zeros(self.dists.shape) * 0.0

        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            radii[u] = average(matrixnorm(neighbors - self.weights[u]))

        self.__activation = gaussian(x, radii / 2)

    def uniform_gaussian_activation(self):
        x = self.dists

        total = 0.0
        count = 0

        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            total += sum(matrixnorm(neighbors - self.weights[u]))
            count += len(conn_dict)

        self.__activation = gaussian(x, (total / count) / 2)

    def exp_abs_activation(self):
        x = self.dists

        total = 0.0
        count = 0

        for u, conn_dict in enumerate(self.connections):
            neighbors = take(self.weights, conn_dict.keys())
            total += sum(matrixnorm(neighbors - self.weights[u]))
            count += len(conn_dict)

        stddev = total / count
        self.__activation = exp(-abs(x / stddev))

    def winner_take_all_activation(self):
        self.__activation = zeros(len(self.dists))
        self.__activation[argmin(self.dists)] = 1.0

    def dot_activation(self):
        self.__activation = dot(self.weights, self.last_input)

    def train(self, X, error=None):

        self.debug("Training on input:", X)
        self.present_input(X)
        self.count += 1

        # (roman numeral comments from fritzke's algorithm in
        # B. Fritzke, Unsupervised ontogenetic networks, in Handbook
        # of Neural Computation, IOP Publishing and Oxford University
        # Press, 1996)  [ replacing \zeta with X ]

        # (iii) Determine units s_1 and s_2 (s_1,s_2 \in A) such that
        #       |w_{s_1} - X| <= |w_c - X| (\forall c \in A)
        #   and
        #       |w_{s_2} - X| <= |w_c - X| (\forall c \in A\\s_1)

        s_1, s_2 = self.winners(2)

        # (iv) If it does not already exist, insert a connection between s1 and s2
        #   in any case, set the age of the connection to zero

        self.add_connection(s_1, s_2)

        # (v) Add the squared distance between the input signal and the
        # nearest unit in input space to a local error variable

        if error is None:
            error = self.dists[s_1]**2
            if self.normalize_error:
                error = sqrt(error) / norm(X)

        self.error[s_1] += error

        # (vi) Move s_1 and its direct topological neighbors towards
        # X by fractions e_b and e_n, respectively, of the total
        # distance.

        self.weights[s_1] += self.e_b * (X - self.weights[s_1])
        for n in self.connections[s_1]:
            self.weights[n] += self.e_n * (X - self.weights[n])

        # (vii) Increment the age of all edges emanating from s_1
        for n in self.connections[s_1]:
            self.connections[n][s_1] += 1
            self.connections[s_1][n] += 1

        # (viii) Remove edges with an age larger than max_age....
        for a, connection_dict in enumerate(self.connections):
            for b, age in connection_dict.items():
                if age > self.max_age:
                    self.del_connection(a, b)

        # (viii) ... If this results in units having no emanating
        # edges, remove them as well.
        to_be_deleted = [a for a, d in enumerate(self.connections) if not d]
        #   sort the list in descending order, so deleting lower numbered
        #   units doesn't screw up the connections
        to_be_deleted.sort(reverse=True)
        if to_be_deleted:
            self.verbose("Deleting units", to_be_deleted)
        for a in to_be_deleted:
            self.del_unit(a)

        # (ix) if the number of input signals so far is an integer
        # multiple of a parameter \lambda, insert a new unit as
        # follows.
        if self.time_to_grow():
            # o Determine the unit q with the maximum accumulated error.
            # o Interpolate a new unit r from q and its neighbor f with the largest
            #   error variable

            q, f = self.growth_pair()
            r = len(self.weights)

            new_weights = 0.5 * (self.weights[q] + self.weights[f])
            new_weights.shape = (1, self.dim)
            self.weights = join((self.weights, new_weights), axis=0)

            new_distance = norm(X - new_weights)
            self.dists = join((self.dists, new_distance), axis=0)

            self.connections.append({})

            # o Insert edges connecting the new unit r with units q and f and
            #   remove the original edge between q and f.
            self.verbose("Adding unit", r, "between", q, "and", f,
                         "--- count =", self.count)
            self.add_connection(q, r)
            self.add_connection(r, f)
            self.del_connection(q, f)

            # o Decrease the error variables of q and f
            self.error[q] += -self.alpha * self.error[q]
            self.error[f] += -self.alpha * self.error[f]

            # o Interpolate the error variable of r from q and f
            new_error = array(0.5 * (self.error[q] + self.error[f]))
            new_error.shape = (1, 1)
            self.error = join((self.error, new_error))

            if self.grow_callback:
                self.grow_callback(q, f)

        # (x) Decrease the error variables of all units
        self.error += -self.beta * self.error
        return

    def winners(self, N=1):
        N = min(N, len(self.dists))
        indices = argsort(self.dists)
        return tuple(indices[:N])

    def winner(self):
        return argmin(self.dists)

    def time_to_grow(self):
        return (self.count % self.lambda_) == 0

    def growth_pair(self):
        def max_error(a, b):
            if self.error[a] > self.error[b]:
                return a
            else:
                return b

        q = reduce(max_error, range(len(self.error)))
        f = reduce(max_error, self.connections[q])
        return q, f

    def neighbors(self, i):
        return self.connections[i].keys()
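
The growth step (Fritzke's step ix, quoted in the comments above) can be followed on toy numbers: the new unit r sits halfway between the worst unit q and its worst neighbor f, both of their errors shrink by the factor alpha, and r inherits the average of the shrunken errors. A minimal numpy sketch with made-up values:

from numpy import array

alpha = 0.5                                  # GNG.alpha default
w_q, w_f = array([0.0, 0.0]), array([1.0, 0.0])
err_q, err_f = 4.0, 2.0

w_r = 0.5 * (w_q + w_f)                      # interpolate the new unit
err_q *= (1 - alpha)                         # decrease q's error
err_f *= (1 - alpha)                         # decrease f's error
err_r = 0.5 * (err_q + err_f)                # interpolate r's error

print(w_r, err_q, err_f, err_r)              # [0.5 0.] 2.0 1.0 1.5
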
Code example #9
class GridWorld(rl.Environment):

    grid = Parameter(default=[
        "############", "#G.........#", "#..........#", "#..........#",
        "#..........#", "#..........#", "#..........#", "#..........#",
        "#..........#", "#..........#", "#.........S#", "############"
    ])

    random_start_pos = Parameter(default=False)

    timeout = Integer(default=0, bounds=(0, None))

    actions = Parameter(default=['N', 'S', 'E', 'W'])
    action_map = Parameter(default={
        'N': (-1, 0),
        'S': (1, 0),
        'E': (0, 1),
        'W': (0, -1)
    })

    correct_action_probability = Magnitude(default=1.0)
    step_reward = Number(default=-1)
    goal_reward = Number(default=0)

    start_pos = Parameter(default=None)
    goal_pos = Parameter(default=None)
    crumbs = Parameter(default=False)
    clear_crumbs_on_pose_set = Parameter(default=True)
    recolor_crumbs_on_pose_set = Parameter(default=False)

    count_wall_states = Boolean(default=False)

    def __init__(self, **args):
        super(GridWorld, self).__init__(**args)

        if self.crumbs:
            self.clear_crumbs = False
            self.recolor_crumbs = False
            self.connect_crumbs = True

        for r, row in enumerate(self.grid):
            if len(row) != len(self.grid[0]):
                raise "GridWorld error: grid rows must all be the same length."

            for c, cell in enumerate(row):
                if cell == START:
                    if self.start_pos:
                        raise "GridWorld error: grid has more than one start position."
                    self.start_pos = (r, c)
                elif cell == GOAL:
                    if self.goal_pos:
                        raise "GridWorld error: grid has more than one goal position."
                    self.goal_pos = (r, c)

        self.start_episode()
        if self.count_wall_states:
            self.num_states = sum([len(row) for row in self.grid])
        else:
            self.num_states = sum([
                row.count(FREE) + row.count(START) + row.count(GOAL)
                for row in self.grid
            ])

    def __call__(self, action=None):
        if action is None:
            self.curr_pos = self.start_pos
            self.episode_steps = 0
            self.start_episode()
            return self.state()
        else:
            self.episode_steps += 1
            assert action in self.actions
            r, c = self.curr_pos
            p = self.correct_action_probability
            N = len(self.actions)
            distr = array([(1 - p) / (N - 1)] * N)
            distr[self.actions.index(action)] = p
            a = utils.weighted_sample(distr)

            dr, dc = self.action_map[self.actions[a]]

            if self.move_okay(r + dr, c + dc):
                r, c = self.curr_pos = (r + dr, c + dc)

        if (r, c) == self.goal_pos:
            self.verbose("!!! GOAL !!!")
            return rl.TERMINAL_STATE, self.goal_reward
        elif self.timeout and self.episode_steps > self.timeout:
            return rl.TERMINAL_STATE, self.step_reward
        else:
            return self.state(), self.step_reward

    def reset_crumbs(self):
        if not self.crumbs: return
        if self.clear_crumbs_on_pose_set:
            self.clear_crumbs = True
        if self.recolor_crumbs_on_pose_set:
            self.recolor_crumbs = True
        self.connect_crumbs = False

    def start_episode(self):
        if self.random_start_pos:
            while True:
                r = rand.randint(len(self.grid))
                c = rand.randint(len(self.grid[0]))
                g = self.grid[r][c]
                if g != WALL and g != GOAL:
                    self.curr_pos = self.start_pos = r, c
                    break
        else:
            self.curr_pos = self.start_pos

        self.episode_steps = 0
        self.reset_crumbs()

    def set_route(self, start_pos=None, goal_pos=None):
        if start_pos: self.start_pos = start_pos
        if goal_pos: self.goal_pos = goal_pos
        self.start_episode()

    def move_okay(self, r, c):
        rbound = len(self.grid)
        cbound = len(self.grid[0])
        return (0 <= r < rbound and 0 <= c < cbound
                and self.grid[r][c] != WALL)

    def state(self):
        r, c = self.curr_pos
        return r * len(self.grid[0]) + c
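
The action-noise model in __call__ is worth spelling out: the commanded action is executed with probability p = correct_action_probability, and the other N-1 actions share the remaining 1-p evenly. A stand-alone sketch of the distribution that gets sampled, with an illustrative p of 0.9:

from numpy import array

actions = ['N', 'S', 'E', 'W']
p = 0.9                                    # correct_action_probability
N = len(actions)
distr = array([(1 - p) / (N - 1)] * N)     # the N-1 "slip" actions
distr[actions.index('E')] = p              # the commanded action
print(distr)                               # approx [0.033 0.033 0.9 0.033]
print(distr.sum())                         # 1.0 (up to float rounding)
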