Example #1
class Pantheon:
    def __init__(self, config, args):
        self.start, self.tick, self.nANN = time.time(), 0, config.NPOP
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)
        self.net.nParams #Property with a side effect: prints the parameter count

        self.period = 1

    @property
    def model(self):
        return self.net.model

    def step(self, recvs):
        recvs, logs = list(zip(*recvs))

        # Write logs
        self.quill.scrawl(logs)
        self.tick += 1

        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.net.stepOpt(recvs)
            self.net.checkpoint(lifetime)
            self.net.saver.print()
        else:
            self.quill.print()

        return self.model
Example #2
    def __init__(self, config, args):
        self.start, self.tick, self.nANN = time.time(), 0, config.NPOP
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)
        self.net.nParams

        self.period = 1
Example #3
class Pantheon(Ascend):
    '''Cluster level Pantheon API demo

   This cluster level module aggregates
   gradients across all server level optimizer
   nodes and updates model weights using Adam.

   Also demonstrates logging and snapshotting
   functionality through the Quill and Model
   libraries, respectively.'''
    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps
      track of a copy of the weights for the optimizer.'''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.config = config

        self.net = Model(projekt.ANN, config)

        #Have been experimenting with population based
        #training. Nothing stable yet -- advise avoiding
        if config.POPOPT:
            self.opt = PopulationOptimizer(self.net, config)
        else:
            self.opt = GradientOptimizer(self.net, config)

        if config.LOAD or config.BEST:
            self.net.load(self.opt, config.BEST)

        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)

        self.tick = 0
        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level
      God optimizer nodes. Performs an Adam step
      once optimizers return a batch of gradients.'''

        recvs = super().step(self.net.weights)

        #Write logs using Quill
        recvs, logs = list(zip(*recvs))
        logs = BlobLogs.merge(logs)

        self.quill.scrawl(logs)
        self.tick += 1

        self.quill.print()
        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.opt.step(recvs, logs)
            self.net.checkpoint(self.opt, lifetime)
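The aggregate-gradients-then-Adam pattern described in the docstring reduces to a few lines of plain PyTorch. A minimal sketch, assuming each optimizer node returns one gradient tensor per parameter (the toy model and the recvs layout below are hypothetical, not the project's actual data format):

import torch

model = torch.nn.Linear(4, 2)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

#Stand-in for per-node gradients: one list of tensors per optimizer node
recvs = [[torch.randn_like(p) for p in model.parameters()] for _ in range(3)]

opt.zero_grad()
for p, *gs in zip(model.parameters(), *recvs):
    p.grad = torch.stack(gs).mean(0)   #Mean gradient across nodes
opt.step()                             #Single Adam update on the aggregate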
Example #4
    def __init__(self, trinity, config, args):
        '''Initializes a copy of the model, which keeps
      track of a copy of the weights for the optimizer.'''
        super().__init__(trinity, config, args)
        self.config, self.args = config, args

        self.net = Model(projekt.ANN, config, args)
        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)

        self.tick = 0
        self.net.nParams
Example #5
    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps
      track of the weights for the optimizer.

      Args:
         trinity : A Trinity object as shown in __main__
         config  : A Config object as shown in __main__
         idx     : Unused hardware index
      '''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.quill = Quill(config)
        self.config = config

        self.net = Model(projekt.Policy, config)
        self.net.printParams()
Example #6
class Pantheon(trinity.Pantheon):
    '''Cluster level Pantheon API demo

   This cluster level module aggregates
   gradients across all server level optimizer
   nodes and updates model weights using Adam.

   Also demonstrates logging and snapshotting
   functionality through the Quill and Model
   libraries, respectively.'''
    def __init__(self, trinity, config, args):
        '''Initializes a copy of the model, which keeps
      track of a copy of the weights for the optimizer.'''
        super().__init__(trinity, config, args)
        self.config, self.args = config, args

        self.net = Model(projekt.ANN, config, args)
        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)

        self.tick = 0
        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level
      God optimizer nodes. Performs an Adam step
      once optimizers return a batch of gradients.'''

        recvs = super().step(self.net.model)

        #Write logs using Quill
        recvs, logs, nUpdates, nRollouts = list(zip(*recvs))
        nUpdates = sum(nUpdates)
        nRollouts = sum(nRollouts)
        self.quill.scrawl(logs, nUpdates, nRollouts)
        self.tick += 1

        self.quill.print()
        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.net.stepOpt(recvs)
            self.net.checkpoint(lifetime)
            self.net.saver.print()
Example #7
    def plot(self, ylabel, blob, plot, colors, idx, n):
        config = self.config
        Plot = Quill.plot(plot)
        title = Plot.__name__

        #Correct dimensions
        width = ((config.VIS_WIDTH - config.VIS_LEGEND_WIDTH -
                  config.VIS_LEGEND_OFFSET - config.VIS_TITLE_OFFSET) // n)
        height = config.VIS_HEIGHT
        aspect = height / width

        #Interactive plot element
        TOOLTIPS = [
            ("Track", "$name"),
            ("(x,y)", "($x, $y)"),
        ]

        #Make figure
        fig = bokeh.plotting.figure(
            plot_width=width,
            plot_height=height,
            x_range=bokeh.models.ranges.DataRange1d(range_padding=0.1 *
                                                    aspect),
            y_range=bokeh.models.ranges.DataRange1d(range_padding=0.1),
            tools='hover, pan, box_zoom, wheel_zoom, reset, save',
            active_drag='pan',
            active_scroll='wheel_zoom',
            active_inspect='hover',
            tooltips=TOOLTIPS,
            title_location='above',
            y_axis_label=ylabel,
            x_axis_location='below',
            min_border_right=config.VIS_BORDER_WIDTH,
            min_border_left=config.VIS_BORDER_WIDTH,
            min_border_top=config.VIS_BORDER_HEIGHT,
            min_border_bottom=config.VIS_BORDER_HEIGHT)

        #Extra options
        fig.axis.axis_label_text_font_style = 'bold'

        #Toolbars
        fig.toolbar.logo = None
        if not config.VIS_TOOLS:
            fig.toolbar_location = None

        #Draw glyphs
        legend = Plot(config, fig, ylabel, len(blob)).render(blob, colors)

        #Adjust for theme
        if config.VIS_THEME == 'web':
            fig.title.text = '{} vs. {}'.format(fig.yaxis.axis_label,
                                                fig.xaxis.axis_label)
            fig.yaxis.axis_label = None
            fig.xaxis.axis_label = None

        return fig, legend
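The figures returned by plot() are ordinary Bokeh objects, so they can be arranged and written out with Bokeh's standard layout/IO calls. A minimal sketch (the two toy figures are hypothetical; only gridplot, output_file, and save are assumed from Bokeh itself):

import bokeh.io
import bokeh.layouts
import bokeh.plotting

fig1 = bokeh.plotting.figure(plot_width=300, plot_height=300)
fig1.line([0, 1, 2], [0, 1, 4], name='Track A')
fig2 = bokeh.plotting.figure(plot_width=300, plot_height=300)
fig2.line([0, 1, 2], [0, 2, 3], name='Track B')

grid = bokeh.layouts.gridplot([[fig1, fig2]])  #One row of panels
bokeh.io.output_file('quill_plots.html')
bokeh.io.save(grid)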
Example #8
    def run_actors(self, sharedReplay, sharedReplayLm, sharedStateDict):
        env = NativeServer(self.config, self.args)
        quill = Quill(self.config.MODELDIR)

        while True:
            if not ray.get(sharedStateDict.getFlag.remote()):
                sleep(5)
                continue

            idx, buffer, bufferLm, logs = env.run()

            env.append(idx, ray.get(sharedStateDict.send.remote()))

            if buffer is not None:
                sharedReplay.update.remote(buffer)
            if bufferLm is not None and self.args.lm:
                sharedReplayLm.update.remote(bufferLm, idx)

            if logs is not None:
                quill.scrawl([logs])

            if buffer is not None: #buffer may be None (see the check above)
                sharedStateDict.increaseCounter.remote(len(buffer[0][0]))
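The sharedStateDict handle above follows Ray's actor pattern: mutable state lives in a remote actor and every worker reads or mutates it through .remote() calls. A minimal, self-contained sketch of that pattern (the SharedState class is a hypothetical stand-in for RemoteStateDict):

import ray

@ray.remote
class SharedState:
    def __init__(self):
        self.flag, self.counter = True, 0

    def getFlag(self):
        return self.flag

    def increaseCounter(self, n):
        self.counter += n

ray.init()
state = SharedState.remote()
state.increaseCounter.remote(10)          #Fire-and-forget mutation
print(ray.get(state.getFlag.remote()))    #Blocking read -> True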
Example #9
    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps
      track of a copy of the weights for the optimizer.'''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.config = config

        self.net = Model(projekt.ANN, config)

        #Have been experimenting with population based
        #training. Nothing stable yet -- advise avoiding
        if config.POPOPT:
            self.opt = PopulationOptimizer(self.net, config)
        else:
            self.opt = GradientOptimizer(self.net, config)

        if config.LOAD or config.BEST:
            self.net.load(self.opt, config.BEST)

        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)

        self.tick = 0
        self.net.nParams
Example #10
    def run(self):
        sharedReplay = ReplayMemoryMaster.remote(self.args, self.config)
        sharedReplayLm = ReplayMemoryLmMaster.remote(
            self.args, self.config) if self.args.lm else None
        sharedQuill = Quill.remote(self.config.MODELDIR)
        sharedStateDict = RemoteStateDict.remote(self.config)

        self.run_actors.remote(self, sharedReplay, sharedReplayLm, sharedQuill,
                               sharedStateDict)

        pantheonProcessId = self.run_pantheon.remote(self, sharedReplay,
                                                     sharedQuill,
                                                     sharedStateDict)
        if self.args.lm:
            self.run_pantheon_lm.remote(self, sharedReplayLm, sharedStateDict)
        ray.get(pantheonProcessId)
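The final ray.get(pantheonProcessId) is what keeps the driver alive: .remote() calls return immediately with a handle, and ray.get blocks until that task finishes. A minimal sketch of the same launch-then-block pattern (the worker task is hypothetical):

import time
import ray

@ray.remote
def worker(seconds):
    time.sleep(seconds)
    return seconds

ray.init()
handle = worker.remote(1)   #Launches asynchronously, returns a handle
print(ray.get(handle))      #Blocks here until the task completes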
Example #11
class Pantheon(Ascend):
    '''Cluster level infrastructure layer

   This module aggregates gradients across all server level 
   environments and updates model weights using Adam.

   It also demonstrates logging and snapshotting functionality 
   through the Quill and Model libraries, respectively.'''
    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps
      track of the weights for the optimizer.

      Args:
         trinity : A Trinity object as shown in __main__
         config  : A Config object as shown in __main__
         idx     : Unused hardware index
      '''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.quill = Quill(config)
        self.config = config

        self.net = Model(projekt.Policy, config)
        self.net.printParams()

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level God optimizer nodes.
      Performs an Adam step once optimizers return a batch of gradients.

      Returns:
         perf  : Log message describing agent performance
         stats : Log message describing data collected
         log   : Dictionary of logs containing infrastructure usage data
      '''
        #Aggregate Blob logs as a BlobSummary
        recvs = super().step(self.net.weights)
        recvs, blobs, log = list(zip(*recvs))
        blobs = BlobSummary().add(blobs)

        #Update/checkpoint model and write logs
        stats, lifetime = self.quill.scrawl(blobs)
        perf = self.net.step(recvs, blobs, log, lifetime)

        return perf, stats, log
Example #12
    def __init__(self, config, args):
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)
Example #13
class Pantheon:
    def __init__(self, config, args):
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)

    def gatherTrajectory(self, states, rewards, policy, actions, lmActions, lmPolicy, annID):
        trajectory = defaultdict(list)
        trajectoryLm = defaultdict(list)

        for i in range(0, len(states), self.config.LSTM_PERIOD):
            stim = torch.from_numpy(states[i: i + self.config.LSTM_PERIOD]).float().to(self.config.DEVICE_OPTIMIZER)
            ret = torch.from_numpy(rewards[i: i + self.config.LSTM_PERIOD]).float().to(self.config.DEVICE_OPTIMIZER)
            oldPolicy = torch.from_numpy(policy[i: i + self.config.LSTM_PERIOD]).to(
                self.config.DEVICE_OPTIMIZER).float()
            lmOldPolicy = torch.from_numpy(lmPolicy[i: i + self.config.LSTM_PERIOD]).float().to(
                self.config.DEVICE_OPTIMIZER)
            lmAction = torch.from_numpy(lmActions[i: i + self.config.LSTM_PERIOD]).float().to(
                self.config.DEVICE_OPTIMIZER)
            action = torch.tensor(actions[i: i + self.config.LSTM_PERIOD]).to(self.config.DEVICE_OPTIMIZER)
            oldJointPolicy = F.softmax((1 - self.config.LM_LAMBDA) * oldPolicy + self.config.LM_LAMBDA * lmAction,
                                       dim=1).gather(1, action.view(-1, 1))

            outsLm = self.net.lawmaker(stim, annID)
            annReturns = self.net.anns[annID](stim, outsLm, (i + 1) % self.config.LSTM_PERIOD == 0)

            outsLm = self.net.lawmaker.get_punishment(
                {'action': (outsLm['action'][0], lmAction,
                            outsLm['action'][-2], outsLm['action'][-1])},
                action, annReturns['outputs']['action'][0].detach())
            if self.args.lm:
                entropy, pi, Q = outsLm['action']
                trajectoryLm['QVals'].append(Q)
                trajectoryLm['policy'].append(pi)
                trajectoryLm['oldPolicy'].append(lmOldPolicy.gather(1, action.view(-1, 1)))
                trajectoryLm['correction'].append((lmOldPolicy.gather(1, action.view(-1, 1)) / oldJointPolicy).clamp(0.5, 2))
                trajectoryLm['entropy'].append(entropy)

            trajectory['vals'].append(annReturns['val'])
            trajectory['returns'].append(ret)
            trajectory['oldPolicy'].append(
                F.softmax(oldPolicy, dim=1).gather(1, action.view(-1, 1)))
            trajectory['policy'].append(
                F.softmax(annReturns['outputs']['action'][0],
                          dim=1).gather(1, action.view(-1, 1)))
            trajectory['actions'].append(action)
            trajectory['correction'].append(
                (F.softmax(oldPolicy, dim=1).gather(1, action.view(-1, 1)) /
                 oldJointPolicy).clamp(0.5, 2))

        return trajectory, trajectoryLm

    def offPolicyTrain(self, batch):
        step = 500
        for i in range(0, self.config.HORIZON * self.config.EPOCHS, step):
            trajectories = []
            trajectoriesLm = []
            start = i
            for annID, agentBatch in batch.items():
                trajectory, trajectoryLm = self.gatherTrajectory(
                    agentBatch['state'][start:i + step],
                    agentBatch['reward'][start:i + step],
                    agentBatch['policy'][start:i + step],
                    agentBatch['action'][start:i + step],
                    agentBatch['lmAction'][start:i + step],
                    agentBatch['lmPolicy'][start:i + step],
                    annID)
                trajectoriesLm.append(trajectoryLm)
                trajectories.append(trajectory)
            loss, outs = optim.backwardAgentOffPolicy(trajectories, entWeight=self.net.agentEntropies[0],
                                                      device=self.config.DEVICE_OPTIMIZER)
            if self.args.lm:
                lmLoss, outsLm = optim.backwardLawmaker(trajectoriesLm, outs['vals'], outs['rets'],
                                                        entWeight=self.net.agentEntropies[0],
                                                        device=self.config.DEVICE_OPTIMIZER,
                                                        mode=self.config.LM_MODE)
                lmLoss.backward()
                nn.utils.clip_grad_norm_(self.net.lawmaker.parameters(), 0.5)
                self.net.lmOpt.step()
                self.net.lmScheduler.step()
                self.net.lmOpt.zero_grad()
            loss.backward()
            [nn.utils.clip_grad_norm_(ann.parameters(), 0.5) for ann in self.net.anns]
            [opt.step() for opt in self.net.opt]
            self.net.annealEntropy(0)
            [scheduler.step() for scheduler in self.net.scheduler]
            [opt.zero_grad() for opt in self.net.opt]
        return

    def model(self):
        return self.net.model()

    def step(self, batch, logs):
        # Write logs
        reward = self.quill.scrawl(logs)

        for i in range(self.config.EPOCHS_PPO):
            self.offPolicyTrain(batch)

        self.net.checkpoint(reward)
        self.net.saver.print()

        return self.model()
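The 'correction' entries built in gatherTrajectory are clamped importance ratios between the behavior policy and the mixed agent/lawmaker policy. A standalone sketch of that computation with hypothetical tensors (LM_LAMBDA mirrors config.LM_LAMBDA):

import torch
import torch.nn.functional as F

oldLogits = torch.randn(8, 4)            #Behavior policy logits
lmLogits = torch.randn(8, 4)             #Lawmaker logits
actions = torch.randint(0, 4, (8, 1))    #Actions actually taken
LM_LAMBDA = 0.5

#Probability of the taken action under the mixed (joint) behavior policy
joint = F.softmax((1 - LM_LAMBDA) * oldLogits + LM_LAMBDA * lmLogits, dim=1)
oldJointPolicy = joint.gather(1, actions)

#Clamped importance-sampling correction, as appended to trajectory['correction']
oldPolicy = F.softmax(oldLogits, dim=1).gather(1, actions)
correction = (oldPolicy / oldJointPolicy).clamp(0.5, 2)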
Example #14
class Pantheon:
    def __init__(self, config, args):
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)

    def gatherTrajectory(self, flat_states, ents_states, returns, policy,
                         actions, lmActions, lmPolicy, annID):
        trajectory = {
            'vals': [],
            'returns': [],
            'lmRewards': defaultdict(list),
            'correction': defaultdict(list),
            'oldPolicy': defaultdict(list),
            'policy': defaultdict(list),
            'actions': defaultdict(list)
        }

        trajectoryLm = defaultdict(lambda: defaultdict(list))

        for i in range(0, len(flat_states), self.config.LSTM_PERIOD):
            flat = torch.from_numpy(
                flat_states[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            ents = torch.from_numpy(
                ents_states[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            ret = torch.from_numpy(
                returns[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            oldPolicy = {
                k:
                torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in policy.items()
            }
            lmOldPolicy = {
                k:
                torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in lmPolicy.items()
            }
            lmAction = {
                k:
                torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in lmActions.items()
            }
            action = {
                k:
                torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).long().to(
                    self.config.DEVICE_OPTIMIZER).view(-1, 1)
                for k, v in actions.items()
            }
            oldJointPolicy = {
                k:
                1e-5 + F.softmax((1 - self.config.LM_LAMBDA) * oldPolicy[k] +
                                 self.config.LM_LAMBDA * lmAction[k],
                                 dim=1).gather(1, action[k])
                for k in oldPolicy.keys()
            }
            outsLm = self.net.lawmaker(flat, ents, annID)
            annReturns = self.net.anns[annID](flat, ents, {
                'actions': lmAction
            })

            outsLm = self.net.lawmaker.get_punishment(
                {
                    'actions': lmAction,
                    'policy': outsLm['policy'],
                    'entropy': outsLm['entropy'],
                    'Qs': outsLm['Qs']
                }, action)
            if self.args.lm:
                entropy, pi, Qs = outsLm['entropy'], outsLm['policy'], outsLm[
                    'Qs']
                for k in pi.keys():
                    trajectoryLm['Qs'][k].append(Qs[k])
                    trajectoryLm['policy'][k].append(pi[k])
                    trajectoryLm['oldPolicy'][k].append(lmOldPolicy[k])
                    trajectoryLm['entropy'][k].append(entropy[k])
                    corr = (lmOldPolicy[k] / oldJointPolicy[k]).clamp(0.5, 2)
                    trajectoryLm['correction'][k].append(corr)

            trajectory['vals'].append(annReturns['val'])
            trajectory['returns'].append(ret)
            for k in oldPolicy.keys():
                trajectory['oldPolicy'][k].append(oldPolicy[k])
                trajectory['policy'][k].append(annReturns['policy'][k])
                trajectory['actions'][k].append(action[k])
                corr = (F.softmax(oldPolicy[k], dim=1).gather(
                    1, action[k]).detach() / oldJointPolicy[k]).clamp(0.5, 2)
                trajectory['correction'][k].append(corr)

        return trajectory, trajectoryLm

    def offPolicyTrain(self, batch):
        step = 500
        for i in range(0, len(batch[0]['flat']), step):
            trajectories = []
            trajectoriesLm = []
            start = i
            for annID, agentBatch in batch.items():
                trajectory, trajectoryLm = self.gatherTrajectory(
                    agentBatch['flat'][start:i + step],
                    agentBatch['ents'][start:i + step],
                    agentBatch['return'][start:i + step], {
                        k: v[start:i + step]
                        for k, v in agentBatch['policy'].items()
                    }, {
                        k: v[start:i + step]
                        for k, v in agentBatch['action'].items()
                    }, {
                        k: v[start:i + step]
                        for k, v in agentBatch['lmAction'].items()
                    }, {
                        k: v[start:i + step]
                        for k, v in agentBatch['lmPolicy'].items()
                    }, annID)
                trajectoriesLm.append(trajectoryLm)
                trajectories.append(trajectory)
            loss, outs = optim.backwardAgentOffPolicy(
                trajectories,
                entWeight=self.net.agentEntropies[0],
                device=self.config.DEVICE_OPTIMIZER)
            if self.args.lm:
                lmLoss, outsLm = optim.backwardLawmaker(
                    trajectoriesLm,
                    outs['vals'],
                    outs['rets'],
                    entWeight=self.net.agentEntropies[0],
                    device=self.config.DEVICE_OPTIMIZER,
                    mode=self.config.LM_MODE)
                lmLoss.backward()
                nn.utils.clip_grad_norm_(self.net.lawmaker.parameters(), 0.5)
                self.net.lmOpt.step()
                self.net.lmScheduler.step()
                self.net.lmOpt.zero_grad()
            loss.backward()
            [
                nn.utils.clip_grad_norm_(ann.parameters(), 0.5)
                for ann in self.net.anns
            ]
            [opt.step() for opt in self.net.opt]
            self.net.annealEntropy(0)
            [scheduler.step() for scheduler in self.net.scheduler]
            [opt.zero_grad() for opt in self.net.opt]
        return

    def model(self):
        return self.net.model()

    def step(self, batch, logs):
        # Write logs
        lifetime = self.quill.scrawl(logs)

        for i in range(self.config.EPOCHS_PPO):
            self.offPolicyTrain(batch)

        self.net.checkpoint(lifetime)
        self.net.saver.print()

        return self.model()
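The per-network updates in offPolicyTrain (clip gradients, step the optimizer and scheduler, zero the gradients) follow the standard PyTorch sequence; the list comprehensions simply apply it to every ANN. A minimal sketch of one such update on a single hypothetical network:

import torch
import torch.nn as nn

net = nn.Linear(8, 4)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=100)

loss = net(torch.randn(2, 8)).sum()              #Stand-in for the PPO loss
loss.backward()
nn.utils.clip_grad_norm_(net.parameters(), 0.5)  #Clip before stepping
opt.step()
sched.step()
opt.zero_grad()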