class Pantheon:
    '''Top-level trainer node.

    Consumes (gradients, log) pairs returned by workers, records the logs
    through Quill, and -- outside of test mode -- applies an optimizer step
    and writes a model checkpoint.'''

    def __init__(self, config, args):
        '''Set up the model, logger, and bookkeeping counters.'''
        self.start = time.time()
        self.tick = 0
        self.nANN = config.NPOP

        self.config = config
        self.args = args

        self.net = Model(config, args)
        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)

        # Bare property access -- NOTE(review): presumably a side-effecting
        # property that computes/prints the parameter count; confirm
        self.net.nParams
        self.period = 1

    @property
    def model(self):
        '''Current model held by the network wrapper.'''
        return self.net.model

    def step(self, recvs):
        '''Process one batch of worker returns: log, optimize, checkpoint.

        recvs: iterable of (gradient-batch, log) pairs from workers.
        Returns the current model.'''
        grads, logs = zip(*recvs)

        # Record logs before any optimizer work
        self.quill.scrawl(logs)
        self.tick += 1

        if self.config.TEST:
            self.quill.print()
        else:
            lifetime = self.quill.latest()
            self.net.stepOpt(grads)
            self.net.checkpoint(lifetime)
            self.net.saver.print()

        return self.model
class Pantheon(Ascend):
    '''Cluster-level Pantheon API demo.

    Aggregates gradients across all server-level optimizer nodes and
    updates model weights using Adam. Also demonstrates logging and
    snapshotting functionality through the Quill and Model libraries,
    respectively.'''

    def __init__(self, trinity, config, idx):
        '''Initializes a copy of the model, which keeps track of a copy
        of the weights for the optimizer.'''
        super().__init__(trinity.god, config.NGOD, trinity, config)
        self.config = config
        self.net = Model(projekt.ANN, config)

        # Population-based training is experimental -- nothing stable yet;
        # advise avoiding POPOPT
        optCls = PopulationOptimizer if config.POPOPT else GradientOptimizer
        self.opt = optCls(self.net, config)

        if config.LOAD or config.BEST:
            self.net.load(self.opt, config.BEST)

        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)
        self.tick = 0

        # Bare property access -- NOTE(review): presumably a side-effecting
        # property that computes/prints the parameter count; confirm
        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server-level God optimizer nodes.
        Performs an Adam step once optimizers return a batch of gradients.'''
        recvs = super().step(self.net.weights)

        # Merge and write logs using Quill
        recvs, logs = zip(*recvs)
        merged = BlobLogs.merge(logs)
        self.quill.scrawl(merged)

        self.tick += 1
        self.quill.print()

        if self.config.TEST:
            return

        lifetime = self.quill.latest()
        self.opt.step(recvs, merged)
        self.net.checkpoint(self.opt, lifetime)
class Pantheon(trinity.Pantheon):
    '''Cluster-level Pantheon API demo.

    Aggregates gradients across all server-level optimizer nodes and
    updates model weights using Adam. Also demonstrates logging and
    snapshotting functionality through the Quill and Model libraries,
    respectively.'''

    def __init__(self, trinity, config, args):
        '''Initializes a copy of the model, which keeps track of a copy
        of the weights for the optimizer.'''
        super().__init__(trinity, config, args)
        self.config, self.args = config, args
        self.net = Model(projekt.ANN, config, args)

        self.quill = Quill(config.MODELDIR)
        self.log = defaultdict(list)
        self.tick = 0

        # Bare property access -- NOTE(review): presumably a side-effecting
        # property that computes/prints the parameter count; confirm
        self.net.nParams

    @runtime
    def step(self):
        '''Broadcasts updated weights to server-level God optimizer nodes.
        Performs an Adam step once optimizers return a batch of gradients.'''
        recvs = super().step(self.net.model)

        # Unpack worker returns and write logs using Quill
        recvs, logs, nUpdates, nRollouts = zip(*recvs)
        self.quill.scrawl(logs, sum(nUpdates), sum(nRollouts))

        self.tick += 1
        self.quill.print()

        if self.config.TEST:
            return

        lifetime = self.quill.latest()
        self.net.stepOpt(recvs)
        self.net.checkpoint(lifetime)
        self.net.saver.print()