def notice(self, message, type=None):
    """Emit a user-facing message unless this object is in silent mode.

    message: text to show.
    type: optional message category forwarded to tb.notice (e.g. 'run',
          'passed'); when None the message is printed directly.
          NOTE(review): parameter shadows the builtin `type`; kept for
          interface compatibility with existing callers.
    """
    if self._silent:
        return
    if type is None:
        # Parenthesized single-argument print is valid and behaves
        # identically in both Python 2 and Python 3 (the original
        # `print '...'` statement was Python-2-only syntax).
        # The '%s\n' format means the message is followed by a blank line.
        print('%s\n' % message)
    else:
        tb.notice(message, type)
def clean(self):
    """Run clean() on every (method, bidirectional entry) pair of every
    dataset and report how many entries were actually cleaned."""
    removed = 0
    for method in self._methods:
        for dataset in self._datasets:
            for entry in dataset.bents():
                if entry.bind(method).clean():
                    removed += 1
    tb.notice("cleaned %d entries" % (removed), "passed")
def clean(self):
    """Clean every bound (method, entry) pair across all datasets and
    print a summary count of the entries that were cleaned."""
    # The generator calls clean() on each bound entry; sum() counts the
    # ones that reported doing work.
    nDone = sum(
        1
        for m in self._methods
        for ds in self._datasets
        for ent in ds.bents()
        if ent.bind(m).clean()
    )
    tb.notice('cleaned %d entries' % (nDone), 'passed')
def archive(self, src, target, iter_step=-1):
    """Shrink the environment rooted at *src* and move it into *target*.

    src: path of the environment directory to archive.
    target: directory that will receive the archived environment (a child
            named after src's basename is created there).
    iter_step: forwarded to Environment.shrink(); controls which snapshot
               iterations are kept (-1 presumably keeps only the latest —
               see shrink()).
    Raises Exception if the destination already exists.
    """
    import shutil  # local import so the file-level import block is untouched

    basename = os.path.basename(src)
    targetPath = os.path.join(target, basename)
    if os.path.exists(targetPath):
        raise Exception("target path %s already exists" % targetPath)
    env = Environment(src, backend=self._backend,
                      unattended=self._unattended, silent=self._silent)
    env.init()
    env.shrink(iter_step=iter_step)
    tb.notice('archiving %s to %s' % (src, targetPath), 'run')
    # shutil.move replaces the original `os.system('mv %s %s')`: it does not
    # go through a shell (so paths with spaces/metacharacters work) and it
    # raises on failure instead of silently ignoring a nonzero exit status.
    shutil.move(src, targetPath)
def check(self):
    """Check the outputs of every method on every dataset and print a
    pass/fail summary with the (ok/total) counts."""
    passed = 0
    total = 0
    for m in self._methods:
        for ds in self._datasets:
            # Undirected methods iterate unidirectional entries,
            # directed ones the bidirectional entries.
            entries = ds.uents() if m.direction() == "" else ds.bents()
            for ent in entries:
                if ent.bind(m).checkOut(self._args.verbose):
                    passed += 1
                total += 1
    status = "passed" if passed == total else "failed"
    tb.notice("(%d/%d) passed" % (passed, total), status)
def update(self):
    """Build one update job per (method, entry) pair, grouped into a packet
    per dataset, and submit the whole queue."""
    queue = tb.Queue()
    for method in self._methods:
        for dataset in self._datasets:
            tb.notice('creating jobs for <%s> on <%s>' % (method, dataset))
            entries = dataset.uents() if method.direction() == '' else dataset.bents()
            for entry in entries:
                job = tb.Job()
                entry.bind(method).makeUpdateJob(job)
                queue.postJob(job)
            # one packet per dataset
            queue.finishPacket()
    queue.submit(local=self._args.local, cores=self._args.cores)
def check(self):
    """Verify every method's outputs on every dataset; report (ok/total)
    and whether the whole run passed."""
    nOk = 0
    nTotal = 0
    for method in self._methods:
        for dataset in self._datasets:
            if method.direction() == '':
                candidates = dataset.uents()
            else:
                candidates = dataset.bents()
            for entry in candidates:
                nTotal += 1
                if entry.bind(method).checkOut(self._args.verbose):
                    nOk += 1
    if nOk == nTotal:
        tb.notice('(%d/%d) passed' % (nOk, nTotal), 'passed')
    else:
        tb.notice('(%d/%d) passed' % (nOk, nTotal), 'failed')
def update(self):
    """Create and submit update jobs for all method/dataset combinations.

    Jobs for one dataset form one packet; the queue is submitted once at
    the end (locally or on the configured number of cores).
    """
    jobQueue = tb.Queue()
    for m in self._methods:
        for ds in self._datasets:
            tb.notice("creating jobs for <%s> on <%s>" % (m, ds))
            for ent in (ds.uents() if m.direction() == "" else ds.bents()):
                newJob = tb.Job()
                ent.bind(m).makeUpdateJob(newJob)
                jobQueue.postJob(newJob)
            jobQueue.finishPacket()
    jobQueue.submit(local=self._args.local, cores=self._args.cores)
def archive(self, src, target, iter_step=-1):
    """Shrink the environment at *src*, then move it under *target*.

    Raises Exception when the destination path already exists.
    iter_step is forwarded to Environment.shrink().
    """
    destination = os.path.join(target, os.path.basename(src))
    if os.path.exists(destination):
        raise Exception("target path %s already exists" % destination)
    environment = Environment(src, backend=self._backend,
                              unattended=self._unattended,
                              silent=self._silent)
    environment.init()
    environment.shrink(iter_step=iter_step)
    tb.notice('archiving %s to %s' % (src, destination), 'run')
    os.system('mv %s %s' % (src, destination))
def prototxt(self, inFile, outDir, defs=None):
    """Produce a .prototxt for *inFile* inside *outDir* and return its path.

    Supported inputs: .prototxt (copied as-is, original path returned),
    .prototmp (preprocessed via tb.preprocessFile), .py (executed with
    `python -B`, stdout captured as the prototxt).
    defs: optional dict of substitution variables; 'name' is always set to
          this object's name.
    Raises Exception for a missing input file, a failed .py conversion, or
    an unsupported extension.
    """
    # Copy instead of mutating: the original used a mutable default
    # argument (defs={}) and wrote into it, leaking 'name' into the shared
    # default dict and into any dict the caller passed in.
    defs = dict(defs) if defs is not None else {}
    defs['name'] = self._name
    if not os.path.isfile(inFile):
        raise Exception('input file %s not file' % inFile)
    if inFile.endswith('.prototxt'):
        os.system('cp %s %s' % (inFile, outDir))
        return '%s' % (inFile)
    elif inFile.endswith('.prototmp'):
        prototxt = '%s/%s.prototxt' % (
            outDir, os.path.basename(inFile).replace('.prototmp', ''))
        if not self._silent:
            tb.notice('preprocessing %s' % inFile, 'run')
        tb.preprocessFile(inFile, prototxt, defs)
        return prototxt
    elif inFile.endswith('.py'):
        prototxt = '%s/%s.prototxt' % (
            outDir, os.path.basename(inFile).replace('.py', ''))
        # items() replaces Python-2-only iteritems(); ' '.join replaces
        # the manual space-separated accumulation loop.
        args = ' '.join('%s=%s' % (k, v) for k, v in defs.items())
        if not self._silent:
            if not len(defs):
                tb.notice('converting %s' % inFile, 'run')
            else:
                tb.notice('converting %s (%s)' % (inFile, args), 'run')
        if os.system('python -B %s %s > %s' % (inFile, args, prototxt)) != 0:
            raise Exception('conversion of %s failed' % inFile)
        return prototxt
    else:
        raise Exception('don\'t know how to convert file %s to prototxt' % inFile)
def prototxt(self, inFile, outDir, defs={}):
    """Convert *inFile* into a .prototxt under *outDir*; return its path.

    .prototxt files are copied verbatim (the original path is returned),
    .prototmp files are preprocessed, .py files are executed with
    `python -B` and their stdout captured.
    NOTE(review): the mutable default `defs={}` is written to ('name' key)
    and so is shared across calls and mutates caller-supplied dicts —
    preserved here for behavioral compatibility; confirm before changing.
    """
    defs['name'] = self._name
    if not os.path.isfile(inFile):
        raise Exception('input file %s not file' % inFile)
    if inFile.endswith('.prototxt'):
        os.system('cp %s %s' % (inFile, outDir))
        return '%s' % (inFile)
    if inFile.endswith('.prototmp'):
        produced = '%s/%s.prototxt' % (
            outDir, os.path.basename(inFile).replace('.prototmp', ''))
        if not self._silent:
            tb.notice('preprocessing %s' % inFile, 'run')
        tb.preprocessFile(inFile, produced, defs)
        return produced
    if inFile.endswith('.py'):
        produced = '%s/%s.prototxt' % (
            outDir, os.path.basename(inFile).replace('.py', ''))
        pieces = []
        for key, value in defs.items():
            pieces.append('%s=%s' % (key, value))
        args = ' '.join(pieces)
        if not self._silent:
            if not len(defs):
                tb.notice('converting %s' % inFile, 'run')
            else:
                tb.notice('converting %s (%s)' % (inFile, args), 'run')
        if os.system('python -B %s %s > %s' % (inFile, args, produced)) != 0:
            raise Exception('conversion of %s failed' % inFile)
        return produced
    raise Exception('don\'t know how to convert file %s to prototxt' % inFile)
def shrink(self, iter_step):
    """Reclaim disk space: drop .pyc files, scratch data and the job dir,
    sweep, then prune training snapshots.

    iter_step: keep snapshots whose iteration is a multiple of this value;
    -1 means keep only the most recent snapshot. A snapshot's paired model
    file (same iteration) is kept or deleted together with it.
    """
    self.notice('removing *.pyc', 'del')
    os.system('rm -f %s/*.pyc' % (self._path))
    self.notice('removing scratch', 'del')
    os.system('rm -rf %s/scratch' % self._path)
    if self.haveJobDir():
        self.notice('removing jobs', 'del')
        os.system('rm -rf %s' % self._jobDir)
    self.sweep()
    if not self.haveTrainDir():
        return
    for snapshot in self._stateFiles:
        # keep the newest snapshot, plus every iter_step-th iteration
        keep = snapshot == self._stateFiles[-1]
        if iter_step != -1 and snapshot.iteration() % iter_step == 0:
            keep = True
        paired = None
        for candidate in self._modelFiles:
            if candidate.iteration() == snapshot.iteration():
                paired = candidate
        if keep:
            if paired is not None:
                tb.notice('keeping file %s' % (os.path.basename(snapshot.filename())), 'passed')
                tb.notice('keeping file %s' % (os.path.basename(paired.filename())), 'passed')
            else:
                tb.notice('keeping file %s' % snapshot, 'passed')
        else:
            if paired is not None:
                paired.delete(True)
            snapshot.delete(True)
def _callCopiedBin(self, cmd):
    """Copy the caffe binary and its libcaffe.so into the current directory,
    then run the local copy with *cmd* (LD_LIBRARY_PATH pointed at '.').

    Raises Exception when ldd does not report a libcaffe.so dependency.
    """
    localBin = './' + os.path.basename(caffeBin())
    tb.notice('making a local copy of %s' % caffeBin())
    os.system('cp %s .' % caffeBin())
    # Locate the libcaffe.so the binary links against via ldd output.
    sharedLib = None
    for depLine in tb.run('ldd %s' % caffeBin()).split('\n'):
        hit = re.match(r'\s*libcaffe.so => (.*\.so)', depLine)
        if hit:
            sharedLib = hit.group(1)
            break
    if sharedLib is None:
        raise Exception('cannot find libcaffe.so dependency')
    tb.notice('making a local copy of %s' % sharedLib)
    os.system('cp %s .' % sharedLib)
    fullCmd = 'GLOG_logtostderr=%d LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH %s %s' % (
        not self._quiet, localBin, cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCmd, 'run')
    tb.system(fullCmd)
def shrink(self, iter_step):
    """Delete caches, scratch, jobs, then prune training state files.

    iter_step: snapshots at multiples of this iteration count are kept
    (-1 keeps only the latest snapshot); each kept/deleted state file
    drags its same-iteration model file along with it.
    """
    self.notice('removing *.pyc', 'del')
    os.system('rm -f %s/*.pyc' % (self._path))
    self.notice('removing scratch', 'del')
    os.system('rm -rf %s/scratch' % self._path)
    if self.haveJobDir():
        self.notice('removing jobs', 'del')
        os.system('rm -rf %s' % self._jobDir)
    self.sweep()
    if self.haveTrainDir():
        for stateFile in self._stateFiles:
            wanted = False
            if iter_step != -1 and stateFile.iteration() % iter_step == 0:
                wanted = True
            if stateFile == self._stateFiles[-1]:
                # the most recent snapshot is always preserved
                wanted = True
            companion = None
            for modelFile in self._modelFiles:
                if modelFile.iteration() == stateFile.iteration():
                    companion = modelFile
            if wanted and companion is not None:
                tb.notice('keeping file %s' % (os.path.basename(stateFile.filename())), 'passed')
                tb.notice('keeping file %s' % (os.path.basename(companion.filename())), 'passed')
            elif wanted:
                tb.notice('keeping file %s' % stateFile, 'passed')
            else:
                if companion is not None:
                    companion.delete(True)
                stateFile.delete(True)
def _callCopiedBin(self, cmd):
    """Run *cmd* against a locally-copied caffe binary.

    The binary and the libcaffe.so it links against (found by parsing ldd
    output) are copied into the working directory first, and the local
    copies are used via LD_LIBRARY_PATH. Raises Exception when no
    libcaffe.so dependency can be found.
    """
    localName = './' + os.path.basename(caffeBin())
    tb.notice('making a local copy of %s' % caffeBin())
    os.system('cp %s .' % caffeBin())
    lddOutput = tb.run('ldd %s' % caffeBin())
    libPath = None
    for entry in lddOutput.split('\n'):
        found = re.match('\\s*libcaffe.so => (.*\.so)', entry)
        if found is not None:
            libPath = found.group(1)
            break
    if libPath is None:
        raise Exception('cannot find libcaffe.so dependency')
    tb.notice('making a local copy of %s' % libPath)
    os.system('cp %s .' % libPath)
    # GLOG_logtostderr mirrors the quiet flag; '.' first on the lib path
    # so the copied libcaffe.so wins over any installed one.
    cmd = 'GLOG_logtostderr=%d LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH %s %s' % (
        not self._quiet, localName, cmd)
    if not self._silent:
        tb.notice('running "%s"' % cmd, 'run')
    tb.system(cmd)
def _callBin(self, cmd):
    """Invoke the caffe binary with *cmd*, logging to stderr unless quiet;
    echo the full command line unless silent."""
    fullCmd = 'GLOG_logtostderr=%d %s %s' % (not self._quiet, caffeBin(), cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCmd, 'run')
    tb.system(fullCmd)
def delete(self, verbose=False):
    """Remove this object's backing file from disk.

    verbose: when True, announce the removal via tb.notice first.
    """
    target = self._filename
    if verbose:
        tb.notice('removing %s' % target, 'del')
    os.remove(target)
def runOnCluster(env, node, gpus, background, insertLocal=True, trackJob=True):
    # Re-submit the current command line as a Torque (qsub) job on the
    # 'lmbtorque' cluster, then exit this process.
    #   env: training environment (supplies path, name, job dir, params)
    #   node: specific cluster node to target, or None for any node
    #   gpus: number of GPUs to request
    #   background: True -> submit detached (-j oe -o <jobdir>); False ->
    #               interactive (-I -x) and the job id file is cleaned up
    #   insertLocal: insert '--execute' into argv so the resubmitted command
    #               runs locally on the cluster node instead of re-forwarding
    #   trackJob: write/track jobs/current_id to prevent double submission
    gpuArch = env.params().gpuArch()
    if node is not None:
        tb.notice("Forwarding job to cluster node %s with %d gpu(s) which are of type %s" % (node, gpus, gpuArch), "info")
    else:
        tb.notice("Forwarding job to cluster with %d gpu(s) which are of type %s" % (gpus, gpuArch), "info")
    env.makeJobDir()
    currentId = "%s/current_id" % env.jobDir()
    # refuse to submit twice: current_id marks a job in flight
    if trackJob and os.path.exists(currentId):
        raise Exception("%s exists, there seems to be a job already running" % currentId)
    # NOTE(review): sysargs aliases sys.argv, so the insert mutates the
    # process-global argv — confirm no later code re-reads sys.argv.
    sysargs = sys.argv
    if insertLocal:
        sysargs.insert(1, "--execute")
    cmd = " ".join(sysargs)
    # NOTE(review): `home` is unused below; `args` is a module-level global
    # (presumably the parsed CLI arguments) — verify against the full file.
    home = os.environ["HOME"]
    if args.backend == "python":
        training = os.path.abspath("training")
        cmd = "LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH PYTHONPATH=%s:$PYTHONPATH %s" % (training, training, cmd)
    # one timestamped submission script per call, stored in the job dir
    qsubCommandFile = "%s/%s-%s.sh" % (env.jobDir(), env.name().replace("/", "_"), time.strftime("%d.%m.%Y-%H:%M:%S"))
    epilogueScript = "%s/epilogue.sh" % env.jobDir()
    open(epilogueScript, "w").write("#!/bin/bash\ncd $path\nrm -f jobs/current_id\n")
    if trackJob:
        # '$$' escapes '$' for string.Template: the script sees $PBS_JOBID
        saveIdCommand = "echo $$PBS_JOBID > jobs/current_id"
    else:
        saveIdCommand = ""
    script = Template(
        "#!/bin/bash\n"
        "\n"
        "umask 0002\n"
        'echo -e "\e[30;42m --- running on" `hostname` "--- \e[0m"\n'
        'cd "$path"\n'
        "$saveIdCommand\n"
        'trap "echo got SIGHUP" SIGHUP\n'
        'trap "echo got SIGUSR1" USR1\n'
        "$command\n"
        "echo done\n"
        "rm -f jobs/current_id\n"
    ).substitute(path=env.path(), command=cmd, saveIdCommand=saveIdCommand)
    open(qsubCommandFile, "w").write(script)
    tb.system('chmod a+x "%s"' % qsubCommandFile)
    qsub = "qsub -l nodes=%s:gpus=%d%s,mem=%dmb,walltime=240:00:00 %s -q gpujob -d %s %s -N %s -T %s" % (
        node if node is not None else "1",
        gpus,
        (":" + gpuArch) if gpuArch != "any" else "",  # arch constraint only when specific
        env.params().requiredMemory(),
        "-I -x" if not background else "",  # interactive mode for foreground runs
        env.path(),
        qsubCommandFile,
        env.name(),
        epilogueScript,
    )
    if background:
        print "job name: %s" % os.path.basename(qsubCommandFile)
        # merge stderr into stdout and write the log into the job dir
        qsub += " -j oe -o %s" % (env.jobDir())
    tb.notice("lmbtorque: running %s" % qsub, "run")
    if not background:
        tb.system('ssh lmbtorque "umask 0002; cd %s; %s; rm -f jobs/current_id"' % (env.path(), qsub))
    else:
        tb.system('ssh lmbtorque "umask 0002; %s"' % (qsub))
    sys.exit(0)
def runOnCluster(env, node, gpus, background, insertLocal=True, trackJob=True):
    # Forward the current invocation to the 'lmbtorque' cluster via qsub
    # and terminate this process.
    #   env: environment object (path, name, job dir, resource params)
    #   node: explicit node name, or None to let the scheduler pick
    #   gpus: GPU count to request
    #   background: detached submission when True, interactive otherwise
    #   insertLocal: add '--execute' to argv so the job runs locally there
    #   trackJob: maintain jobs/current_id as a double-submission guard
    gpuArch = env.params().gpuArch()
    if node is not None:
        tb.notice(
            'Forwarding job to cluster node %s with %d gpu(s) which are of type %s'
            % (node, gpus, gpuArch), 'info')
    else:
        tb.notice(
            'Forwarding job to cluster with %d gpu(s) which are of type %s'
            % (gpus, gpuArch), 'info')
    env.makeJobDir()
    currentId = '%s/current_id' % env.jobDir()
    # an existing current_id means a job is (or appears to be) running
    if trackJob and os.path.exists(currentId):
        raise Exception('%s exists, there seems to be a job already running' % currentId)
    # NOTE(review): this aliases and mutates sys.argv in place — confirm
    # nothing after this call depends on the original argv.
    sysargs = sys.argv
    if insertLocal:
        sysargs.insert(1, '--execute')
    cmd = ' '.join(sysargs)
    # NOTE(review): `home` is never used below; `args` is a module global
    # (presumably parsed CLI options) — verify against the rest of the file.
    home = os.environ['HOME']
    if args.backend == 'python':
        training = os.path.abspath('training')
        cmd = 'LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH PYTHONPATH=%s:$PYTHONPATH %s' % (
            training, training, cmd)
    # timestamped shell script for this submission, kept in the job dir
    qsubCommandFile = '%s/%s-%s.sh' % (env.jobDir(), env.name().replace(
        '/', '_'), time.strftime('%d.%m.%Y-%H:%M:%S'))
    epilogueScript = '%s/epilogue.sh' % env.jobDir()
    open(epilogueScript, 'w').write("#!/bin/bash\ncd $path\nrm -f jobs/current_id\n")
    if trackJob:
        # double '$' escapes for string.Template -> script gets $PBS_JOBID
        saveIdCommand = 'echo $$PBS_JOBID > jobs/current_id'
    else:
        saveIdCommand = ''
    script = Template(
        '#!/bin/bash\n'
        '\n'
        'umask 0002\n'
        'echo -e "\e[30;42m --- running on" `hostname` "--- \e[0m"\n'
        'cd "$path"\n'
        '$saveIdCommand\n'
        'trap "echo got SIGHUP" SIGHUP\n'
        'trap "echo got SIGUSR1" USR1\n'
        '$command\n'
        'echo done\n'
        'rm -f jobs/current_id\n').substitute(path=env.path(),
                                              command=cmd,
                                              saveIdCommand=saveIdCommand)
    open(qsubCommandFile, 'w').write(script)
    tb.system('chmod a+x "%s"' % qsubCommandFile)
    qsub = 'qsub -l nodes=%s:gpus=%d%s,mem=%dmb,walltime=240:00:00 %s -q gpujob -d %s %s -N %s -T %s' % (
        node if node is not None else '1',
        gpus,
        (':' + gpuArch) if gpuArch != 'any' else '',  # constrain arch only when specific
        env.params().requiredMemory(),
        '-I -x' if not background else '',  # interactive flags for foreground
        env.path(),
        qsubCommandFile,
        env.name(),
        epilogueScript)
    if background:
        print 'job name: %s' % os.path.basename(qsubCommandFile)
        # merge stderr into stdout; log file lands in the job dir
        qsub += ' -j oe -o %s' % (env.jobDir())
    tb.notice("lmbtorque: running %s" % qsub, 'run')
    if not background:
        tb.system(
            'ssh lmbtorque "umask 0002; cd %s; %s; rm -f jobs/current_id"'
            % (env.path(), qsub))
    else:
        tb.system('ssh lmbtorque "umask 0002; %s"' % (qsub))
    sys.exit(0)