Example #1
class Feather:
    '''Internal logger used by Rollout. Due for a rewrite.'''
    def __init__(self):
        self.expMap = set()
        self.blob = Blob()

    def scrawl(self, iden):
        '''Write logs from one time step

      Args:
         iden: The unique ID used in serialization
      '''
        world, annID, entID, _ = iden
        self.blob.entID = entID
        self.blob.annID = annID
        self.blob.world = world

        #tile = self.tile(stim)
        #self.move(tile, ent.pos)
        #self.action(arguments, atnArgs)

    def tile(self, stim):
        '''Return the tile at the center of the agent's stimulus crop'''
        R, C = stim.shape
        rCent, cCent = R // 2, C // 2
        tile = stim[rCent, cCent]
        return tile

    def action(self, arguments, atnArgs):
        '''Unpack move/attack logits and sampled indices (not yet logged)'''
        move, attk = arguments
        moveArgs, attkArgs, _ = atnArgs
        moveLogits, moveIdx = moveArgs
        attkLogits, attkIdx = attkArgs

    def move(self, tile, pos):
        '''Update exploration and tile-visit statistics for one move'''
        tile = type(tile.state)
        if pos not in self.expMap:
            self.expMap.add(pos)
            if tile in self.blob.unique:
                self.blob.unique[tile] += 1
        if tile in self.blob.counts:
            self.blob.counts[tile] += 1

    def reward(self, reward):
        self.blob.reward.append(reward)

    def value(self, value):
        self.blob.value.append(float(value))

    def finish(self):
        self.blob.finish()
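
Blob itself does not appear in any of these snippets; it is the project's per-agent log record. As a rough sketch, a minimal stand-in covering only the fields the Feather above touches (everything beyond those field names is an assumption) might look like this:

class Blob:
    '''Sketch of the log record Feather writes into; the real class
    carries more state and aggregation logic.'''
    def __init__(self):
        self.entID = None
        self.annID = None
        self.world = None
        self.unique = {}   # tile type -> first-visit counts (pre-populated in the real Blob)
        self.counts = {}   # tile type -> total-visit counts (pre-populated in the real Blob)
        self.reward = []   # per-timestep rewards
        self.value  = []   # per-timestep value estimates

    def finish(self):
        # The real implementation aggregates these buffers; the stub only
        # records how long the trajectory was.
        self.lifetime = len(self.reward)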
Example #2
class Feather:
    def __init__(self):
        self.blob = Blob()

    def scrawl(self, apple, ent, val, reward, lmPunishment):
        self.blob.annID = ent.annID
        self.stats(val, reward, lmPunishment, apple)

    def stats(self, value, reward, lmPunishment, apple):
        self.blob.reward.append(reward)
        self.blob.apples.append(apple)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))

    def finish(self):
        self.blob.finish()
Example #3
class Feather:
    def __init__(self, config):
        #self.expMap = set()
        self.blob = Blob(config)

    def scrawl(self, annID, val, reward, apples, lmPunishment):
        self.blob.annID = annID
        self.stats(val, reward, apples, lmPunishment)

    def stats(self, value, reward, apples, lmPunishment):
        self.blob.reward.append(reward)
        self.blob.apples.append(apples)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))

    def finish(self):
        self.blob.finish()
Example #4
class Feather:
    def __init__(self, config):
        self.blob = Blob(config)

    def scrawl(self, ent, val, reward, lmPunishment, attack, contact):
        self.blob.annID = ent.annID
        self.stats(val, reward, lmPunishment, attack, contact)

    def stats(self, value, reward, lmPunishment, attack, contact):
        self.blob.reward.append(reward)
        self.blob.value.append(float(value))
        self.blob.lmPunishment.append(float(lmPunishment))
        self.blob.contact.append(float(contact))
        if attack is not None:
            self.blob.attack.append(float(attack))

    def finish(self):
        self.blob.finish()
Example #5
class Feather:
    def __init__(self):
        self.expMap = set()
        self.blob = Blob()

    def scrawl(self, stim, ent, val, reward):
        self.blob.annID = ent.annID
        tile = self.tile(stim)
        self.move(tile, ent.pos)
        # self.action(arguments, atnArgs)
        self.stats(val, reward)

    def tile(self, stim):
        R, C = stim.shape
        rCent, cCent = R // 2, C // 2
        tile = stim[rCent, cCent]
        return tile

    def action(self, arguments, atnArgs):
        move, attk = arguments
        moveArgs, attkArgs, _ = atnArgs
        moveLogits, moveIdx = moveArgs
        attkLogits, attkIdx = attkArgs

    def move(self, tile, pos):
        tile = type(tile.state)
        r, c = pos
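        # Unlike Example #1, this Blob also keeps a 2D visitation grid
        # indexed by map position.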
        self.blob.expMap[r][c] += 1
        if pos not in self.expMap:
            self.expMap.add(pos)
            self.blob.unique[tile] += 1
        self.blob.counts[tile] += 1

    def stats(self, value, reward):
        self.blob.reward.append(reward)
        self.blob.value.append(float(value))

    def finish(self):
        self.blob.finish()
Example #6
from collections import defaultdict

import numpy as np

# Blob (the per-agent log record) and Output (the per-action record) are
# project-local helpers not shown in this snippet.


class Rollout:
    def __init__(self, config):
        '''Rollout object used internally by RolloutManager

      Args:
         config: A configuration object
      '''
        self.actions = defaultdict(list)
        self.values = []
        self.rewards = []

        self.done = False
        self.time = -1

        #Logger
        self.config = config
        self.blob = None

    def __len__(self):
        '''Length of a rollout

      Returns:
         lifetime: Number of timesteps the agent has survived
      '''
        return self.blob.lifetime

    def inputs(self, reward, key):
        '''Collects input data to internal buffers

      Args:
         reward : The reward received by the agent for its last action
         key    : The ID associated with the agent
      '''
        #Also check if blob is not none. This prevents
        #recording the first reward of a partial trajectory
        if reward is not None and self.blob is not None:
            self.rewards.append(reward)

        if self.blob is None:
            annID, entID = key
            self.blob = Blob(entID, annID)

        self.time += 1
        self.blob.inputs(reward)

    def outputs(self, atnArgKey, atnLogits, atnIdx, value):
        '''Collects output data to internal buffers

      Args:
         atnArgKey : Action-Argument formatted string
         atnLogits : Action logits
         atnIdx    : Argument indices sampled from the logits
         value     : Value function prediction
      '''
        if len(self.actions[self.time]) == 0:
            self.blob.outputs(float(value))
            self.values.append(value)

        output = Output(atnArgKey, atnLogits, atnIdx, value)
        self.actions[self.time].append(output)

    def finish(self):
        '''Called internally once the full rollout has been collected'''
        self.rewards.append(-1)
        self.blob.inputs(-1)

        #self.returns     = self.gae(self.config.GAMMA, self.config.LAMBDA, self.config.HORIZON)
        self.returns = self.discount(self.config.GAMMA)
        self.lifespan = len(self.rewards)

        self.blob.finish()

    def gae(self, gamma, lamb, H):
        '''Applies generalized advantage estimation to the given trajectory

      Args:
         gamma: Reward discount factor
         lamb:  GAE discount factor
         H:     Maximum horizon over which advantages are accumulated

      Returns:
         returns: Generalized advantage estimate for each timestep
      '''
        r = self.rewards
        V = self.values

        L = len(r)
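        # The loop below implements truncated generalized advantage estimation:
        #   delta_t = r_t + gamma * V_{t+1} - V_t
        #   A_t     = sum_{i=0}^{T-1} (gamma * lamb)**i * delta_{t+i},  T = min(L - t - 1, H)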
        returns = []
        for t in range(L):
            At, T = 0, min(L - t - 1, H)
            for i in range(T):
                tt = t + i
                deltaT = r[tt] + gamma * V[tt + 1] - V[tt]
                At += deltaT * (gamma * lamb)**i

            for out in self.actions[t]:
                out.returns = At

            returns.append(At)

        return returns

    def discount(self, gamma):
        '''Applies standard gamma discounting to the given trajectory
      
      Args:
         gamma: Reward discount factor

      Returns:
         returns: Discounted return for each timestep
      '''
        rets, N = [], len(self.rewards)
        discounts = np.array([gamma**i for i in range(N)])
        rewards = np.array(self.rewards)

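        # Each entry is the gamma-discounted sum of future rewards:
        #   R_idx = sum_{i=0}^{N-idx-1} gamma**i * r_{idx+i}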
        for idx in range(N):
            R_i = sum(rewards[idx:] * discounts[:N - idx])
            for out in self.actions[idx]:
                out.returns = R_i

            rets.append(R_i)

        return rets
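
To make the control flow concrete, here is a rough, self-contained usage sketch that drives Rollout for a few timesteps. The Blob, Output, and Config stand-ins below, the key, and all numbers are illustrative assumptions, not part of the project:

class Blob:
    '''Stand-in log record: only the calls Rollout makes are stubbed.'''
    def __init__(self, entID, annID):
        self.entID, self.annID = entID, annID
        self.lifetime = 0

    def inputs(self, reward):
        self.lifetime += 1

    def outputs(self, value):
        pass

    def finish(self):
        pass


class Output:
    '''Stand-in per-action record; discount()/gae() set .returns on it.'''
    def __init__(self, atnArgKey, atnLogits, atnIdx, value):
        self.atnArgKey, self.atnLogits = atnArgKey, atnLogits
        self.atnIdx, self.value = atnIdx, value
        self.returns = None


class Config:
    GAMMA, LAMBDA, HORIZON = 0.99, 0.95, 16


rollout = Rollout(Config())
for t in range(3):
    reward = None if t == 0 else 0.1    # no reward precedes the first action
    rollout.inputs(reward, key=(0, 7))  # key = (annID, entID)
    rollout.outputs('Move', atnLogits=None, atnIdx=0, value=0.0)
rollout.finish()
print(rollout.returns, len(rollout))    # discounted returns, blob lifetime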