class Pantheon:
    def __init__(self, config, args):
        self.start, self.tick, self.nANN = time.time(), 0, config.NPOP
        self.config, self.args = config, args

        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)

        self.log = defaultdict(list)
        self.net.nParams
        self.period = 1

    @property
    def model(self):
        return self.net.model

    def step(self, recvs):
        recvs, logs = list(zip(*recvs))

        # Write logs
        self.quill.scrawl(logs)
        self.tick += 1

        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.net.stepOpt(recvs)
            self.net.checkpoint(lifetime)
            self.net.saver.print()
        else:
            self.quill.print()

        return self.model
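
# Illustrative, self-contained sketch (not part of the original source): the
# `recvs, logs = list(zip(*recvs))` idiom in Pantheon.step above expects each
# worker to return a (payload, log) pair; zip(*...) transposes that list into
# one tuple of payloads and one tuple of logs. The payload/log values below are
# hypothetical placeholders used only to show the unpacking contract.
def _split_recvs_demo():
    recvs = [('grads_from_worker_0', {'lifetime': 12}),
             ('grads_from_worker_1', {'lifetime': 17})]
    payloads, logs = list(zip(*recvs))
    assert payloads == ('grads_from_worker_0', 'grads_from_worker_1')
    assert logs == ({'lifetime': 12}, {'lifetime': 17})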
class Pantheon(Ascend):
    '''Cluster level Pantheon API demo

    This cluster level module aggregates gradients across all
    server level optimizer nodes and updates model weights using
    Adam. Also demonstrates logging and snapshotting functionality
    through the Quill and Model libraries, respectively.'''

    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps track of
        a copy of the weights for the optimizer.'''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.config = config
        self.net = Model(projekt.ANN, config)

        # Have been experimenting with population based
        # training. Nothing stable yet -- advise avoiding
        if config.POPOPT:
            self.opt = PopulationOptimizer(self.net, config)
        else:
            self.opt = GradientOptimizer(self.net, config)

        if config.LOAD or config.BEST:
            self.net.load(self.opt, config.BEST)

        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)
        self.tick = 0

        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level God optimizer
        nodes. Performs an Adam step once optimizers return a batch
        of gradients.'''
        recvs = super().step(self.net.weights)

        # Write logs using Quill
        recvs, logs = list(zip(*recvs))
        logs = BlobLogs.merge(logs)

        self.quill.scrawl(logs)
        self.tick += 1

        self.quill.print()
        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.opt.step(recvs, logs)
            self.net.checkpoint(self.opt, lifetime)
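
# Illustrative sketch (an assumption, not the project's GradientOptimizer):
# one way to realize the docstring above, "aggregates gradients across all
# server level optimizer nodes and updates model weights using Adam". Each
# worker returns a list of gradient tensors aligned with model.parameters();
# the gradients are averaged, written into param.grad, and an Adam step is
# applied. Names and shapes here are placeholders.
import torch

def apply_averaged_grads(model, worker_grads, opt):
    '''model: torch.nn.Module; worker_grads: list of per-worker lists of
    tensors aligned with model.parameters(); opt: torch.optim.Adam.'''
    opt.zero_grad()
    for param, grads in zip(model.parameters(), zip(*worker_grads)):
        # Average the per-worker gradients for this parameter
        param.grad = torch.stack(list(grads)).mean(dim=0)
    opt.step()

# Example usage with throwaway shapes:
#   model = torch.nn.Linear(4, 2)
#   opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
#   grads = [[torch.randn_like(p) for p in model.parameters()] for _ in range(8)]
#   apply_averaged_grads(model, grads, opt)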
class Pantheon(trinity.Pantheon):
    '''Cluster level Pantheon API demo

    This cluster level module aggregates gradients across all
    server level optimizer nodes and updates model weights using
    Adam. Also demonstrates logging and snapshotting functionality
    through the Quill and Model libraries, respectively.'''

    def __init__(self, trinity, config, args):
        '''Initializes a copy of the model, which keeps track of
        a copy of the weights for the optimizer.'''
        super().__init__(trinity, config, args)
        self.config, self.args = config, args

        self.net = Model(projekt.ANN, config, args)
        self.quill = Quill(config.MODELDIR)

        self.log = defaultdict(list)
        self.tick = 0

        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level God optimizer
        nodes. Performs an Adam step once optimizers return a batch
        of gradients.'''
        recvs = super().step(self.net.model)

        # Write logs using Quill
        recvs, logs, nUpdates, nRollouts = list(zip(*recvs))
        nUpdates = sum(nUpdates)
        nRollouts = sum(nRollouts)

        self.quill.scrawl(logs, nUpdates, nRollouts)
        self.tick += 1

        self.quill.print()
        if not self.config.TEST:
            lifetime = self.quill.latest()
            self.net.stepOpt(recvs)
            self.net.checkpoint(lifetime)
            self.net.saver.print()
def plot(self, ylabel, blob, plot, colors, idx, n):
    config = self.config
    Plot = Quill.plot(plot)
    title = Plot.__name__

    # Correct dimensions
    width = ((config.VIS_WIDTH - config.VIS_LEGEND_WIDTH
              - config.VIS_LEGEND_OFFSET - config.VIS_TITLE_OFFSET) // n)
    height = config.VIS_HEIGHT
    aspect = height / width

    # Interactive plot element
    TOOLTIPS = [
        ("Track", "$name"),
        ("(x,y)", "($x, $y)"),
    ]

    # Make figure
    fig = bokeh.plotting.figure(
        plot_width=width,
        plot_height=height,
        x_range=bokeh.models.ranges.DataRange1d(range_padding=0.1 * aspect),
        y_range=bokeh.models.ranges.DataRange1d(range_padding=0.1),
        tools='hover, pan, box_zoom, wheel_zoom, reset, save',
        active_drag='pan',
        active_scroll='wheel_zoom',
        active_inspect='hover',
        tooltips=TOOLTIPS,
        title_location='above',
        y_axis_label=ylabel,
        x_axis_location='below',
        min_border_right=config.VIS_BORDER_WIDTH,
        min_border_left=config.VIS_BORDER_WIDTH,
        min_border_top=config.VIS_BORDER_HEIGHT,
        min_border_bottom=config.VIS_BORDER_HEIGHT)

    # Extra options
    fig.axis.axis_label_text_font_style = 'bold'

    # Toolbars
    fig.toolbar.logo = None
    if not config.VIS_TOOLS:
        fig.toolbar_location = None

    # Draw glyphs
    legend = Plot(config, fig, ylabel, len(blob)).render(blob, colors)

    # Adjust for theme
    if config.VIS_THEME == 'web':
        fig.title.text = '{} vs. {}'.format(
            fig.yaxis.axis_label, fig.xaxis.axis_label)
        fig.yaxis.axis_label = None
        fig.xaxis.axis_label = None

    return fig, legend
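
# Illustrative sketch (an assumption, not the project's Config class): a
# minimal config object carrying the attributes that plot() above actually
# reads. The field names are taken directly from the attribute accesses in the
# function; the default values are placeholders, not the project's defaults.
from dataclasses import dataclass

@dataclass
class VisConfig:
    VIS_WIDTH: int = 1920
    VIS_HEIGHT: int = 270
    VIS_LEGEND_WIDTH: int = 180
    VIS_LEGEND_OFFSET: int = 20
    VIS_TITLE_OFFSET: int = 60
    VIS_BORDER_WIDTH: int = 20
    VIS_BORDER_HEIGHT: int = 20
    VIS_TOOLS: bool = False
    VIS_THEME: str = 'web'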
def run_actors(self, sharedReplay, sharedReplayLm, sharedStateDict):
    env = NativeServer(self.config, self.args)
    quill = Quill(self.config.MODELDIR)

    while True:
        if not ray.get(sharedStateDict.getFlag.remote()):
            sleep(5)
            continue

        idx, buffer, bufferLm, logs = env.run()
        env.append(idx, ray.get(sharedStateDict.send.remote()))

        if buffer is not None:
            sharedReplay.update.remote(buffer)
        if bufferLm is not None and self.args.lm:
            sharedReplayLm.update.remote(bufferLm, idx)
        if logs is not None:
            quill.scrawl([logs])
        if buffer is not None:
            # Only count collected samples when a rollout buffer was produced
            sharedStateDict.increaseCounter.remote(len(buffer[0][0]))
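
# Illustrative, self-contained sketch (not the project's RemoteStateDict): the
# Ray calling pattern used in run_actors above. Method calls on an actor handle
# go through .remote() and return ObjectRefs; ray.get() blocks only where the
# result is needed (like the getFlag check), while fire-and-forget updates
# (like the replay and counter writes) are left un-fetched.
import ray

@ray.remote
class CounterDemo:
    def __init__(self):
        self.count = 0

    def increase(self, n):
        self.count += n

    def value(self):
        return self.count

def _ray_pattern_demo():
    ray.init(ignore_reinit_error=True)
    counter = CounterDemo.remote()
    counter.increase.remote(3)                     # fire-and-forget write
    assert ray.get(counter.value.remote()) == 3    # blocking read
    ray.shutdown()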
def run(self):
    sharedReplay = ReplayMemoryMaster.remote(self.args, self.config)
    sharedReplayLm = ReplayMemoryLmMaster.remote(
        self.args, self.config) if self.args.lm else None
    sharedQuill = Quill.remote(self.config.MODELDIR)
    sharedStateDict = RemoteStateDict.remote(self.config)

    self.run_actors.remote(self, sharedReplay, sharedReplayLm, sharedQuill,
                           sharedStateDict)
    pantheonProcessId = self.run_pantheon.remote(self, sharedReplay,
                                                 sharedQuill, sharedStateDict)
    if self.args.lm:
        self.run_pantheon_lm.remote(self, sharedReplayLm, sharedStateDict)

    ray.get(pantheonProcessId)
class Pantheon(Ascend):
    '''Cluster level infrastructure layer

    This module aggregates gradients across all server level
    environments and updates model weights using Adam. It also
    demonstrates logging and snapshotting functionality through
    the Quill and Model libraries, respectively.'''

    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps track of
        the weights for the optimizer.

        Args:
            trinity : A Trinity object as shown in __main__
            config  : A Config object as shown in __main__
            idx     : Unused hardware index
        '''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.quill = Quill(config)
        self.config = config

        self.net = Model(projekt.Policy, config)
        self.net.printParams()

    @runtime
    def step(self):
        '''Broadcasts updated weights to server level God optimizer
        nodes. Performs an Adam step once optimizers return a batch
        of gradients.

        Returns:
            perf  : Log message describing agent performance
            stats : Log message describing data collected
            log   : Dictionary of logs containing infrastructure usage data
        '''
        # Aggregate Blob logs as a BlobSummary
        recvs = super().step(self.net.weights)
        recvs, blobs, log = list(zip(*recvs))
        blobs = BlobSummary().add(blobs)

        # Update/checkpoint model and write logs
        stats, lifetime = self.quill.scrawl(blobs)
        perf = self.net.step(recvs, blobs, log, lifetime)

        return perf, stats, log
class Pantheon:
    def __init__(self, config, args):
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)

    def gatherTrajectory(self, states, rewards, policy, actions, lmActions,
                         lmPolicy, annID):
        trajectory = defaultdict(list)
        trajectoryLm = defaultdict(list)

        for i in range(0, len(states), self.config.LSTM_PERIOD):
            stim = torch.from_numpy(
                states[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            ret = torch.from_numpy(
                rewards[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            oldPolicy = torch.from_numpy(
                policy[i:i + self.config.LSTM_PERIOD]).to(
                    self.config.DEVICE_OPTIMIZER).float()
            lmOldPolicy = torch.from_numpy(
                lmPolicy[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            lmAction = torch.from_numpy(
                lmActions[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            action = torch.tensor(
                actions[i:i + self.config.LSTM_PERIOD]).to(
                    self.config.DEVICE_OPTIMIZER)

            oldJointPolicy = F.softmax(
                (1 - self.config.LM_LAMBDA) * oldPolicy +
                self.config.LM_LAMBDA * lmAction,
                dim=1).gather(1, action.view(-1, 1))

            outsLm = self.net.lawmaker(stim, annID)
            annReturns = self.net.anns[annID](
                stim, outsLm, (i + 1) % self.config.LSTM_PERIOD == 0)
            outsLm = self.net.lawmaker.get_punishment(
                {'action': (outsLm['action'][0], lmAction,
                            outsLm['action'][-2], outsLm['action'][-1])},
                action, annReturns['outputs']['action'][0].detach())

            if self.args.lm:
                entropy, pi, Q = outsLm['action']
                trajectoryLm['QVals'].append(Q)
                trajectoryLm['policy'].append(pi)
                trajectoryLm['oldPolicy'].append(
                    lmOldPolicy.gather(1, action.view(-1, 1)))
                trajectoryLm['correction'].append(
                    (lmOldPolicy.gather(1, action.view(-1, 1)) /
                     oldJointPolicy).clamp(0.5, 2))
                trajectoryLm['entropy'].append(entropy)

            trajectory['vals'].append(annReturns['val'])
            trajectory['returns'].append(ret)
            trajectory['oldPolicy'].append(
                F.softmax(oldPolicy, dim=1).gather(1, action.view(-1, 1)))
            trajectory['policy'].append(
                F.softmax(annReturns['outputs']['action'][0],
                          dim=1).gather(1, action.view(-1, 1)))
            trajectory['actions'].append(action)
            trajectory['correction'].append(
                (F.softmax(oldPolicy, dim=1).gather(1, action.view(-1, 1)) /
                 oldJointPolicy).clamp(0.5, 2))

        return trajectory, trajectoryLm

    def offPolicyTrain(self, batch):
        step = 500
        for i in range(0, self.config.HORIZON * self.config.EPOCHS, step):
            trajectories = []
            trajectoriesLm = []
            start = i
            for annID, agentBatch in batch.items():
                trajectory, trajectoryLm = self.gatherTrajectory(
                    agentBatch['state'][start:i + step],
                    agentBatch['reward'][start:i + step],
                    agentBatch['policy'][start:i + step],
                    agentBatch['action'][start:i + step],
                    agentBatch['lmAction'][start:i + step],
                    agentBatch['lmPolicy'][start:i + step],
                    annID)
                trajectoriesLm.append(trajectoryLm)
                trajectories.append(trajectory)

            loss, outs = optim.backwardAgentOffPolicy(
                trajectories,
                entWeight=self.net.agentEntropies[0],
                device=self.config.DEVICE_OPTIMIZER)

            if self.args.lm:
                lmLoss, outsLm = optim.backwardLawmaker(
                    trajectoriesLm, outs['vals'], outs['rets'],
                    entWeight=self.net.agentEntropies[0],
                    device=self.config.DEVICE_OPTIMIZER,
                    mode=self.config.LM_MODE)
                lmLoss.backward()
                nn.utils.clip_grad_norm_(self.net.lawmaker.parameters(), 0.5)
                self.net.lmOpt.step()
                self.net.lmScheduler.step()
                self.net.lmOpt.zero_grad()

            loss.backward()
            [nn.utils.clip_grad_norm_(ann.parameters(), 0.5)
             for ann in self.net.anns]
            [opt.step() for opt in self.net.opt]
            self.net.annealEntropy(0)
            [scheduler.step() for scheduler in self.net.scheduler]
            [opt.zero_grad() for opt in self.net.opt]

    def model(self):
        return self.net.model()

    def step(self, batch, logs):
        # Write logs
        reward = self.quill.scrawl(logs)

        for i in range(self.config.EPOCHS_PPO):
            self.offPolicyTrain(batch)

        self.net.checkpoint(reward)
        self.net.saver.print()

        return self.model()
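
# Illustrative, self-contained sketch (an assumption, not the project's optim
# module): the clamped importance-weight correction built in gatherTrajectory
# above. The probability of the taken action under the behaviour (old) policy
# is divided by its probability under the mixed agent/lawmaker policy, and the
# ratio is clamped to [0.5, 2] to bound the off-policy correction, mirroring
# the .clamp(0.5, 2) calls in the trajectory code.
import torch
import torch.nn.functional as F

def clamped_correction(old_logits, lm_logits, actions, lm_lambda=0.5):
    '''old_logits, lm_logits: [batch, n_actions] tensors; actions: [batch] int64.'''
    old_prob = F.softmax(old_logits, dim=1).gather(1, actions.view(-1, 1))
    joint_prob = F.softmax(
        (1 - lm_lambda) * old_logits + lm_lambda * lm_logits,
        dim=1).gather(1, actions.view(-1, 1))
    return (old_prob / joint_prob).clamp(0.5, 2)

# Example usage with random logits:
#   corr = clamped_correction(torch.randn(4, 6), torch.randn(4, 6),
#                             torch.randint(0, 6, (4,)))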
class Pantheon:
    def __init__(self, config, args):
        self.config, self.args = config, args
        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)

    def gatherTrajectory(self, flat_states, ents_states, returns, policy,
                         actions, lmActions, lmPolicy, annID):
        trajectory = {
            'vals': [],
            'returns': [],
            'lmRewards': defaultdict(list),
            'correction': defaultdict(list),
            'oldPolicy': defaultdict(list),
            'policy': defaultdict(list),
            'actions': defaultdict(list)
        }
        trajectoryLm = defaultdict(lambda: defaultdict(list))

        for i in range(0, len(flat_states), self.config.LSTM_PERIOD):
            flat = torch.from_numpy(
                flat_states[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            ents = torch.from_numpy(
                ents_states[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            ret = torch.from_numpy(
                returns[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER)
            oldPolicy = {
                k: torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in policy.items()
            }
            lmOldPolicy = {
                k: torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in lmPolicy.items()
            }
            lmAction = {
                k: torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).float().to(
                    self.config.DEVICE_OPTIMIZER).squeeze(1)
                for k, v in lmActions.items()
            }
            action = {
                k: torch.from_numpy(v[i:i + self.config.LSTM_PERIOD]).long().to(
                    self.config.DEVICE_OPTIMIZER).view(-1, 1)
                for k, v in actions.items()
            }
            oldJointPolicy = {
                k: 1e-5 + F.softmax(
                    (1 - self.config.LM_LAMBDA) * oldPolicy[k] +
                    self.config.LM_LAMBDA * lmAction[k],
                    dim=1).gather(1, action[k])
                for k in oldPolicy.keys()
            }

            outsLm = self.net.lawmaker(flat, ents, annID)
            annReturns = self.net.anns[annID](flat, ents,
                                              {'actions': lmAction})
            outsLm = self.net.lawmaker.get_punishment(
                {
                    'actions': lmAction,
                    'policy': outsLm['policy'],
                    'entropy': outsLm['entropy'],
                    'Qs': outsLm['Qs']
                }, action)

            if self.args.lm:
                entropy, pi, Qs = (outsLm['entropy'], outsLm['policy'],
                                   outsLm['Qs'])
                for k in pi.keys():
                    trajectoryLm['Qs'][k].append(Qs[k])
                    trajectoryLm['policy'][k].append(pi[k])
                    trajectoryLm['oldPolicy'][k].append(lmOldPolicy[k])
                    trajectoryLm['entropy'][k].append(entropy[k])
                    corr = (lmOldPolicy[k] / oldJointPolicy[k]).clamp(0.5, 2)
                    trajectoryLm['correction'][k].append(corr)

            trajectory['vals'].append(annReturns['val'])
            trajectory['returns'].append(ret)
            for k in oldPolicy.keys():
                trajectory['oldPolicy'][k].append(oldPolicy[k])
                trajectory['policy'][k].append(annReturns['policy'][k])
                trajectory['actions'][k].append(action[k])
                corr = (F.softmax(oldPolicy[k], dim=1).gather(
                    1, action[k]).detach() / oldJointPolicy[k]).clamp(0.5, 2)
                trajectory['correction'][k].append(corr)

        return trajectory, trajectoryLm

    def offPolicyTrain(self, batch):
        step = 500
        for i in range(0, len(batch[0]['flat']), step):
            trajectories = []
            trajectoriesLm = []
            start = i
            for annID, agentBatch in batch.items():
                trajectory, trajectoryLm = self.gatherTrajectory(
                    agentBatch['flat'][start:i + step],
                    agentBatch['ents'][start:i + step],
                    agentBatch['return'][start:i + step],
                    {k: v[start:i + step] for k, v in agentBatch['policy'].items()},
                    {k: v[start:i + step] for k, v in agentBatch['action'].items()},
                    {k: v[start:i + step] for k, v in agentBatch['lmAction'].items()},
                    {k: v[start:i + step] for k, v in agentBatch['lmPolicy'].items()},
                    annID)
                trajectoriesLm.append(trajectoryLm)
                trajectories.append(trajectory)

            loss, outs = optim.backwardAgentOffPolicy(
                trajectories,
                entWeight=self.net.agentEntropies[0],
                device=self.config.DEVICE_OPTIMIZER)

            if self.args.lm:
                lmLoss, outsLm = optim.backwardLawmaker(
                    trajectoriesLm, outs['vals'], outs['rets'],
                    entWeight=self.net.agentEntropies[0],
                    device=self.config.DEVICE_OPTIMIZER,
                    mode=self.config.LM_MODE)
                lmLoss.backward()
                nn.utils.clip_grad_norm_(self.net.lawmaker.parameters(), 0.5)
                self.net.lmOpt.step()
                self.net.lmScheduler.step()
                self.net.lmOpt.zero_grad()

            loss.backward()
            [nn.utils.clip_grad_norm_(ann.parameters(), 0.5)
             for ann in self.net.anns]
            [opt.step() for opt in self.net.opt]
            self.net.annealEntropy(0)
            [scheduler.step() for scheduler in self.net.scheduler]
            [opt.zero_grad() for opt in self.net.opt]

    def model(self):
        return self.net.model()

    def step(self, batch, logs):
        # Write logs
        lifetime = self.quill.scrawl(logs)

        for i in range(self.config.EPOCHS_PPO):
            self.offPolicyTrain(batch)

        self.net.checkpoint(lifetime)
        self.net.saver.print()

        return self.model()
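
# Illustrative sketch (an assumption): the batch layout that offPolicyTrain
# above appears to expect, inferred from its key accesses. The action key
# names ('move', 'attack') and all shapes are hypothetical placeholders; only
# the nesting (per-agent dict of flat arrays plus per-action-key sub-dicts for
# 'policy', 'action', 'lmAction', and 'lmPolicy') is taken from the code above.
import numpy as np

def make_dummy_batch(n_agents=2, horizon=1000, n_flat=16, n_ents=8, n_actions=5):
    keys = ['move', 'attack']
    batch = {}
    for annID in range(n_agents):
        batch[annID] = {
            'flat':   np.zeros((horizon, n_flat), dtype=np.float32),
            'ents':   np.zeros((horizon, n_ents), dtype=np.float32),
            'return': np.zeros((horizon,), dtype=np.float32),
            'policy':   {k: np.zeros((horizon, 1, n_actions), np.float32) for k in keys},
            'lmPolicy': {k: np.zeros((horizon, 1, n_actions), np.float32) for k in keys},
            'lmAction': {k: np.zeros((horizon, 1, n_actions), np.float32) for k in keys},
            'action':   {k: np.zeros((horizon,), np.int64) for k in keys},
        }
    return batch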