def viewFilters(self, iter):
    """Open the external weight viewer on the model file resolved for `iter`.

    Calls `self.prepareTraining()`, asks `self.getModelFile(iter)` for the
    model file and then runs `/home/ilge/bin/weight-viewer` with the training
    prototxt and that model file as arguments.

    Side effect: `LD_LIBRARY_PATH` and `PATH` in `os.environ` are replaced
    (not extended) with the hard-coded values below before the viewer starts.
    """
    self.prepareTraining()
    netDefinition = self._trainDir + '/train.prototxt'
    weightsFile, _ = self.getModelFile(iter)

    # Hard-coded search paths that the viewer process is started with.
    libraryPath = "/misc/lmbraid17/sceneflownet/common/programs/torch/install/lib:/usr/lib/x86_64-linux-gnu:/misc/lmbraid17/sceneflownet/common/software-root/lib:/home/ilge/dev/hackathon-caffe2/build/lib:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/lib:/misc/lmbraid17/sceneflownet/common/programs/torch/install/lib:/usr/lib/x86_64-linux-gnu:/misc/lmbraid17/sceneflownet/common/software-root/lib:/home/ilge/dev/hackathon-caffe2/build/lib:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/lib::/home/ilge/lib:/misc/software-lin/lmbsoft/openni-1.5.2.23-x86_64/usr/lib:/misc/software-lin/lmbsoft/glog/lib:/misc/software-lin/lmbsoft/mkl/lib:/misc/software-lin/lmbsoft/mkl/lib/intel64:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/lib64:/misc/software-lin/lmbsoft/cuda-6.0.37-x86_64/lib64:/misc/student/mayern/OpenNI-Bin-Dev-Linux-x64-v1.5.4.0/Lib:/home/ilge/lib:/misc/software-lin/lmbsoft/openni-1.5.2.23-x86_64/usr/lib:/misc/software-lin/lmbsoft/glog/lib:/misc/software-lin/lmbsoft/mkl/lib:/misc/software-lin/lmbsoft/mkl/lib/intel64:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/lib64:/misc/software-lin/lmbsoft/cuda-6.0.37-x86_64/lib64:/misc/student/mayern/OpenNI-Bin-Dev-Linux-x64-v1.5.4.0/Lib"
    binaryPath = "/home/ilge/bin:/home/ilge/dev/pymill/bin:/misc/lmbraid17/sceneflownet/common/programs/torch/install/bin:/misc/lmbraid17/sceneflownet/common/software-root/bin:/misc/lmbraid17/sceneflownet/ilge/hackathon-caffe2/python/pymill/bin:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/bin:/home/ilge/bin:/home/ilge/dev/pymill/bin:/misc/lmbraid17/sceneflownet/common/programs/torch/install/bin:/misc/lmbraid17/sceneflownet/common/software-root/bin:/misc/lmbraid17/sceneflownet/ilge/hackathon-caffe2/python/pymill/bin:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/bin:/misc/software-lin/matlabR2013a/bin:/home/ilge/data/caffe/matching/bin:/misc/lmbraid15/hackathon/common/flo-results/bin:/misc/lmbraid17/sceneflownet/common/data_tools:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/bin:/misc/software-lin/matlabR2013a/bin:/home/ilge/data/caffe/matching/bin:/misc/lmbraid15/hackathon/common/flo-results/bin:/misc/lmbraid17/sceneflownet/common/data_tools"
    os.environ['LD_LIBRARY_PATH'] = libraryPath
    os.environ['PATH'] = binaryPath

    viewerCommand = '/home/ilge/bin/weight-viewer %s %s' % (netDefinition, weightsFile)
    tb.system(viewerCommand)
def viewFilters(self, iter):
    """Open the external weight viewer on the model file resolved for `iter`.

    Calls `self.prepareTraining()`, asks `self.getModelFile(iter)` for the
    model file and then runs `/home/ilge/bin/weight-viewer` with the training
    prototxt and that model file as arguments.

    Side effect: `LD_LIBRARY_PATH` and `PATH` in `os.environ` are replaced
    (not extended) with the hard-coded values below before the viewer starts.
    """
    self.prepareTraining()
    netDefinition = self._trainDir + '/train.prototxt'
    weightsFile, _ = self.getModelFile(iter)

    # Hard-coded search paths that the viewer process is started with.
    libraryPath = "/misc/lmbraid17/sceneflownet/common/programs/torch/install/lib:/usr/lib/x86_64-linux-gnu:/misc/lmbraid17/sceneflownet/common/software-root/lib:/home/ilge/dev/hackathon-caffe2/build/lib:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/lib:/misc/lmbraid17/sceneflownet/common/programs/torch/install/lib:/usr/lib/x86_64-linux-gnu:/misc/lmbraid17/sceneflownet/common/software-root/lib:/home/ilge/dev/hackathon-caffe2/build/lib:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/lib::/home/ilge/lib:/misc/software-lin/lmbsoft/openni-1.5.2.23-x86_64/usr/lib:/misc/software-lin/lmbsoft/glog/lib:/misc/software-lin/lmbsoft/mkl/lib:/misc/software-lin/lmbsoft/mkl/lib/intel64:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/lib64:/misc/software-lin/lmbsoft/cuda-6.0.37-x86_64/lib64:/misc/student/mayern/OpenNI-Bin-Dev-Linux-x64-v1.5.4.0/Lib:/home/ilge/lib:/misc/software-lin/lmbsoft/openni-1.5.2.23-x86_64/usr/lib:/misc/software-lin/lmbsoft/glog/lib:/misc/software-lin/lmbsoft/mkl/lib:/misc/software-lin/lmbsoft/mkl/lib/intel64:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/lib64:/misc/software-lin/lmbsoft/cuda-6.0.37-x86_64/lib64:/misc/student/mayern/OpenNI-Bin-Dev-Linux-x64-v1.5.4.0/Lib"
    binaryPath = "/home/ilge/bin:/home/ilge/dev/pymill/bin:/misc/lmbraid17/sceneflownet/common/programs/torch/install/bin:/misc/lmbraid17/sceneflownet/common/software-root/bin:/misc/lmbraid17/sceneflownet/ilge/hackathon-caffe2/python/pymill/bin:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/bin:/home/ilge/bin:/home/ilge/dev/pymill/bin:/misc/lmbraid17/sceneflownet/common/programs/torch/install/bin:/misc/lmbraid17/sceneflownet/common/software-root/bin:/misc/lmbraid17/sceneflownet/ilge/hackathon-caffe2/python/pymill/bin:/misc/software-lin/Qt-5.3.2/5.3/gcc_64/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/bin:/misc/software-lin/matlabR2013a/bin:/home/ilge/data/caffe/matching/bin:/misc/lmbraid15/hackathon/common/flo-results/bin:/misc/lmbraid17/sceneflownet/common/data_tools:/misc/software-lin/lmbsoft/cuda-6.5.14-x86_64/bin:/misc/software-lin/matlabR2013a/bin:/home/ilge/data/caffe/matching/bin:/misc/lmbraid15/hackathon/common/flo-results/bin:/misc/lmbraid17/sceneflownet/common/data_tools"
    os.environ['LD_LIBRARY_PATH'] = libraryPath
    os.environ['PATH'] = binaryPath

    viewerCommand = '/home/ilge/bin/weight-viewer %s %s' % (netDefinition, weightsFile)
    tb.system(viewerCommand)
def runProto(self, proto):
    """Run `caffe test` on the network definition `proto`.

    The weights file is obtained from `self._env.getModelFile(self._iter)`.
    When `self._output` is set, a directory `output_<name>_<iter>` is created
    and exposed to the prototxt through the `TEST_OUTPUT` and
    `TEST_OUTPUT_DIR` entries of `self._variables`. The definition is then
    passed through `self._env.prototxt(proto, 'scratch', self._variables)`
    and the result is tested on GPU 0 for `self._iterations` iterations.
    """
    weightsFile, _ = self._env.getModelFile(self._iter)
    print('testing for iteration %d ...' % self._iter)

    if self._output:
        outputDir = 'output_%s_%d' % (self._name, self._iter)
        tb.system('mkdir -p %s' % outputDir)
        self._variables['TEST_OUTPUT'] = 1
        # The directory name is wrapped in escaped double quotes for the prototxt.
        self._variables['TEST_OUTPUT_DIR'] = '"\\"%s\\""' % outputDir

    self._env.makeScratchDir()
    renderedProto = self._env.prototxt(proto, 'scratch', self._variables)
    print('%s %s' % (proto, renderedProto))

    testCommand = '%s test -weights %s -model %s -gpu 0 -iterations %d 2>&1' % (
        Environment.caffeBin(), weightsFile, renderedProto, self._iterations)
    tb.system(testCommand)
def runProto(self, proto):
    """Run `caffe test` on the network definition `proto`.

    The weights file is obtained from `self._env.getModelFile(self._iter)`.
    When `self._output` is set, a directory `output_<name>_<iter>` is created
    and exposed to the prototxt through the `TEST_OUTPUT` and
    `TEST_OUTPUT_DIR` entries of `self._variables`. The definition is then
    passed through `self._env.prototxt(proto, 'scratch', self._variables)`
    and the result is tested on GPU 0 for `self._iterations` iterations.
    """
    weightsFile, _ = self._env.getModelFile(self._iter)
    print('testing for iteration %d ...' % self._iter)

    if self._output:
        outputDir = 'output_%s_%d' % (self._name, self._iter)
        tb.system('mkdir -p %s' % outputDir)
        self._variables['TEST_OUTPUT'] = 1
        # The directory name is wrapped in escaped double quotes for the prototxt.
        self._variables['TEST_OUTPUT_DIR'] = '"\\"%s\\""' % outputDir

    self._env.makeScratchDir()
    renderedProto = self._env.prototxt(proto, 'scratch', self._variables)
    print('%s %s' % (proto, renderedProto))

    testCommand = '%s test -weights %s -model %s -gpu 0 -iterations %d 2>&1' % (
        Environment.caffeBin(), weightsFile, renderedProto, self._iterations)
    tb.system(testCommand)
def _callCopiedBin(self, cmd):
    """Run `cmd` with a copy of the caffe binary placed in the current directory.

    The binary returned by `caffeBin()` and the `libcaffe.so` that `ldd`
    reports for it are copied into `.`; the copy is then started with `.`
    prepended to `LD_LIBRARY_PATH`. `GLOG_logtostderr` is 1 unless
    `self._quiet` is set, and the final command line is announced through
    `tb.notice` unless `self._silent` is set.

    Raises:
        Exception: if no `libcaffe.so => ...` line is found in the ldd output.
    """
    localBin = './' + os.path.basename(caffeBin())

    tb.notice('making a local copy of %s' % caffeBin())
    os.system('cp %s .' % caffeBin())

    # Locate the shared library the binary is linked against.
    lddOutput = tb.run('ldd %s' % caffeBin())
    for lddLine in lddOutput.split('\n'):
        found = re.match(r'\s*libcaffe.so => (.*\.so)', lddLine)
        if found:
            caffeLib = found.group(1)
            break
    else:
        raise Exception('cannot find libcaffe.so dependency')

    tb.notice('making a local copy of %s' % caffeLib)
    os.system('cp %s .' % caffeLib)

    fullCommand = 'GLOG_logtostderr=%d LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH %s %s' % (
        not self._quiet, localBin, cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCommand, 'run')
    tb.system(fullCommand)
def _callCopiedBin(self, cmd):
    """Run `cmd` with a copy of the caffe binary placed in the current directory.

    The binary returned by `caffeBin()` and the `libcaffe.so` that `ldd`
    reports for it are copied into `.`; the copy is then started with `.`
    prepended to `LD_LIBRARY_PATH`. `GLOG_logtostderr` is 1 unless
    `self._quiet` is set, and the final command line is announced through
    `tb.notice` unless `self._silent` is set.

    Raises:
        Exception: if no `libcaffe.so => ...` line is found in the ldd output.
    """
    localBin = './' + os.path.basename(caffeBin())

    tb.notice('making a local copy of %s' % caffeBin())
    os.system('cp %s .' % caffeBin())

    # Locate the shared library the binary is linked against.
    lddOutput = tb.run('ldd %s' % caffeBin())
    for lddLine in lddOutput.split('\n'):
        found = re.match(r'\s*libcaffe.so => (.*\.so)', lddLine)
        if found:
            caffeLib = found.group(1)
            break
    else:
        raise Exception('cannot find libcaffe.so dependency')

    tb.notice('making a local copy of %s' % caffeLib)
    os.system('cp %s .' % caffeLib)

    fullCommand = 'GLOG_logtostderr=%d LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH %s %s' % (
        not self._quiet, localBin, cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCommand, 'run')
    tb.system(fullCommand)
def _callBin(self, cmd):
    """Run the binary returned by `caffeBin()` with `cmd` as its arguments.

    `GLOG_logtostderr` is set to 1 unless `self._quiet` is set; the full
    command line is announced through `tb.notice` unless `self._silent` is set.
    """
    logToStderr = not self._quiet
    fullCommand = 'GLOG_logtostderr=%d %s %s' % (logToStderr, caffeBin(), cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCommand, 'run')
    tb.system(fullCommand)
def makeJobDir(self):
    """Create `self._jobDir` (and missing parents) with `mkdir -p`."""
    mkdirCommand = 'mkdir -p %s' % self._jobDir
    tb.system(mkdirCommand)
def makeScratchDir(self):
    """Create `self._scratchDir` and point `<path>/scratch/current` at it.

    Any existing `scratch/current` entry is removed before the new symlink
    is created.
    """
    tb.system('mkdir -p %s' % self._scratchDir)

    currentLink = '%s/scratch/current' % self._path
    tb.system('rm -f %s' % currentLink)

    linkTarget = self._scratchDir
    tb.system('ln -s %s %s/scratch/current' % (linkTarget, self._path))
def makeTrainDir(self):
    """Create `self._trainDir` and then `self._logDir` with `mkdir -p`."""
    trainDirCommand = 'mkdir -p %s' % self._trainDir
    tb.system(trainDirCommand)
    logDirCommand = 'mkdir -p %s' % self._logDir
    tb.system(logDirCommand)
def runOnCluster(env, node, gpus, background, insertLocal=True, trackJob=True):
    """Re-submit the current command line as a Torque job via `ssh lmbtorque`, then exit.

    A job script and an epilogue script are written into `env.jobDir()`, a
    `qsub` command line is assembled and executed on `lmbtorque` over ssh.

    Args:
        env: environment object; `params()`, `makeJobDir()`, `jobDir()`,
            `name()` and `path()` are used.
        node: value for `-l nodes=`; `None` requests `1` node.
        gpus: number of GPUs to request.
        background: if true, submit a batch job whose output is joined and
            written to the job directory; otherwise run interactively (`-I -x`).
        insertLocal: if true, `--execute` is inserted as the first argument
            of the forwarded command line.
        trackJob: if true, refuse to start while `<jobDir>/current_id`
            exists, and let the job script record `$PBS_JOBID` in
            `jobs/current_id`.

    Raises:
        Exception: if `trackJob` is set and the `current_id` file already exists.

    This function does not return; it ends with `sys.exit(0)`.
    """
    gpuArch = env.params().gpuArch()
    if node is not None:
        tb.notice(
            'Forwarding job to cluster node %s with %d gpu(s) which are of type %s'
            % (node, gpus, gpuArch), 'info')
    else:
        tb.notice(
            'Forwarding job to cluster with %d gpu(s) which are of type %s'
            % (gpus, gpuArch), 'info')

    env.makeJobDir()
    currentId = '%s/current_id' % env.jobDir()
    if trackJob and os.path.exists(currentId):
        raise Exception('%s exists, there seems to be a job already running' % currentId)

    # Work on a copy so the process-wide sys.argv is not modified.
    sysargs = list(sys.argv)
    if insertLocal:
        sysargs.insert(1, '--execute')
    cmd = ' '.join(sysargs)

    if args.backend == 'python':
        training = os.path.abspath('training')
        cmd = 'LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH PYTHONPATH=%s:$PYTHONPATH %s' % (
            training, training, cmd)

    qsubCommandFile = '%s/%s-%s.sh' % (
        env.jobDir(), env.name().replace('/', '_'), time.strftime('%d.%m.%Y-%H:%M:%S'))
    epilogueScript = '%s/epilogue.sh' % env.jobDir()

    # The epilogue must change into the job's path; a literal "$path" would be
    # an unset shell variable when the script runs.
    epilogue = Template(
        '#!/bin/bash\n'
        'cd "$path"\n'
        'rm -f jobs/current_id\n').substitute(path=env.path())
    with open(epilogueScript, 'w') as epilogueFile:
        epilogueFile.write(epilogue)

    # Values handed to Template.substitute() are inserted verbatim (no "$$"
    # unescaping), so the shell variable is written with a single "$".
    if trackJob:
        saveIdCommand = 'echo $PBS_JOBID > jobs/current_id'
    else:
        saveIdCommand = ''

    script = Template(
        '#!/bin/bash\n'
        '\n'
        'umask 0002\n'
        'echo -e "\\e[30;42m --- running on" `hostname` "--- \\e[0m"\n'
        'cd "$path"\n'
        '$saveIdCommand\n'
        'trap "echo got SIGHUP" SIGHUP\n'
        'trap "echo got SIGUSR1" USR1\n'
        '$command\n'
        'echo done\n'
        'rm -f jobs/current_id\n').substitute(
            path=env.path(), command=cmd, saveIdCommand=saveIdCommand)
    with open(qsubCommandFile, 'w') as scriptFile:
        scriptFile.write(script)
    tb.system('chmod a+x "%s"' % qsubCommandFile)

    qsub = 'qsub -l nodes=%s:gpus=%d%s,mem=%dmb,walltime=240:00:00 %s -q gpujob -d %s %s -N %s -T %s' % (
        node if node is not None else '1',
        gpus,
        (':' + gpuArch) if gpuArch != 'any' else '',
        env.params().requiredMemory(),
        '-I -x' if not background else '',
        env.path(),
        qsubCommandFile,
        env.name(),
        epilogueScript)

    if background:
        print('job name: %s' % os.path.basename(qsubCommandFile))
        qsub += ' -j oe -o %s' % (env.jobDir())

    tb.notice("lmbtorque: running %s" % qsub, 'run')
    if not background:
        # Interactive run: clean up the id file once qsub returns.
        tb.system(
            'ssh lmbtorque "umask 0002; cd %s; %s; rm -f jobs/current_id"'
            % (env.path(), qsub))
    else:
        tb.system('ssh lmbtorque "umask 0002; %s"' % (qsub))
    sys.exit(0)
def _callBin(self, cmd):
    """Run the binary returned by `caffeBin()` with `cmd` as its arguments.

    `GLOG_logtostderr` is set to 1 unless `self._quiet` is set; the full
    command line is announced through `tb.notice` unless `self._silent` is set.
    """
    logToStderr = not self._quiet
    fullCommand = 'GLOG_logtostderr=%d %s %s' % (logToStderr, caffeBin(), cmd)
    if not self._silent:
        tb.notice('running "%s"' % fullCommand, 'run')
    tb.system(fullCommand)
def makeJobDir(self):
    """Create `self._jobDir` (and missing parents) with `mkdir -p`."""
    mkdirCommand = 'mkdir -p %s' % self._jobDir
    tb.system(mkdirCommand)
def makeScratchDir(self):
    """Create `self._scratchDir` and point `<path>/scratch/current` at it.

    Any existing `scratch/current` entry is removed before the new symlink
    is created.
    """
    tb.system('mkdir -p %s' % self._scratchDir)

    currentLink = '%s/scratch/current' % self._path
    tb.system('rm -f %s' % currentLink)

    linkTarget = self._scratchDir
    tb.system('ln -s %s %s/scratch/current' % (linkTarget, self._path))
def makeTrainDir(self):
    """Create `self._trainDir` and then `self._logDir` with `mkdir -p`."""
    trainDirCommand = 'mkdir -p %s' % self._trainDir
    tb.system(trainDirCommand)
    logDirCommand = 'mkdir -p %s' % self._logDir
    tb.system(logDirCommand)
def copy(self, source, target, copySnapshot, iter):
    """Copy the contents of directory `source` into `target` using shell `cp`.

    Entries named `scratch` or `jobs`, files ending in `.pyc`, and
    directories whose names start with `test_` or `output` are skipped.
    The `training` entry is never copied wholesale: when `copySnapshot` is
    true, only selected `.caffemodel` / `.solverstate` files and `log.txt`
    are copied into `<target>/training`.

    Args:
        source: directory to copy from.
        target: directory to copy to (created with `mkdir -p`).
        copySnapshot: whether snapshot files from `training` are copied.
        iter: iteration whose snapshot files are copied; `-1` selects the
            last element of each list returned by `iterFiles`.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line; confirm that the log.txt copy belongs under `copySnapshot`.
    tb.system('mkdir -p %s' % target)
    for f in os.listdir(source):
        if f == '.': continue
        if f == '..': continue
        if f == 'training':
            if copySnapshot:
                os.system('mkdir -p %s/training' % target)
                modelFiles = iterFiles('.caffemodel', '%s/training' % source)
                stateFiles = iterFiles('.solverstate', '%s/training' % source)
                if iter != -1:
                    # Copy only the files whose iteration matches the request.
                    for m in modelFiles:
                        if m.iteration() == iter: tb.system('cp -v %s %s/training' % (m.filename(), target))
                    for s in stateFiles:
                        if s.iteration() == iter: tb.system('cp -v %s %s/training' % (s.filename(), target))
                else:
                    # No iteration requested: take the last entry of each list.
                    tb.system('cp -v %s %s/training' % (modelFiles[-1].filename(), target))
                    tb.system('cp -v %s %s/training' % (stateFiles[-1].filename(), target))
                tb.system('cp %s %s/training/log.txt %s/training' % ('' if self._silent else '-v', source, target))
            # The training directory is handled above and never copied recursively.
            continue
        if f == 'scratch': continue
        if f == 'jobs': continue
        if f.endswith('.pyc'): continue
        if os.path.isdir('%s/%s' % (source, f)) and f.startswith('test_'): continue
        if os.path.isdir('%s/%s' % (source, f)) and f.startswith('output'): continue
        # Everything else is copied recursively; `-v` is dropped when silent.
        tb.system('cp -r %s %s/%s %s' % ('' if self._silent else '-v', source, f, target))
def runOnCluster(env, node, gpus, background, insertLocal=True, trackJob=True):
    """Re-submit the current command line as a Torque job via `ssh lmbtorque`, then exit.

    A job script and an epilogue script are written into `env.jobDir()`, a
    `qsub` command line is assembled and executed on `lmbtorque` over ssh.

    Args:
        env: environment object; `params()`, `makeJobDir()`, `jobDir()`,
            `name()` and `path()` are used.
        node: value for `-l nodes=`; `None` requests `1` node.
        gpus: number of GPUs to request.
        background: if true, submit a batch job whose output is joined and
            written to the job directory; otherwise run interactively (`-I -x`).
        insertLocal: if true, `--execute` is inserted as the first argument
            of the forwarded command line.
        trackJob: if true, refuse to start while `<jobDir>/current_id`
            exists, and let the job script record `$PBS_JOBID` in
            `jobs/current_id`.

    Raises:
        Exception: if `trackJob` is set and the `current_id` file already exists.

    This function does not return; it ends with `sys.exit(0)`.
    """
    gpuArch = env.params().gpuArch()
    if node is not None:
        tb.notice(
            "Forwarding job to cluster node %s with %d gpu(s) which are of type %s"
            % (node, gpus, gpuArch), "info"
        )
    else:
        tb.notice(
            "Forwarding job to cluster with %d gpu(s) which are of type %s"
            % (gpus, gpuArch), "info"
        )

    env.makeJobDir()
    currentId = "%s/current_id" % env.jobDir()
    if trackJob and os.path.exists(currentId):
        raise Exception("%s exists, there seems to be a job already running" % currentId)

    # Work on a copy so the process-wide sys.argv is not modified.
    sysargs = list(sys.argv)
    if insertLocal:
        sysargs.insert(1, "--execute")
    cmd = " ".join(sysargs)

    if args.backend == "python":
        training = os.path.abspath("training")
        cmd = "LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH PYTHONPATH=%s:$PYTHONPATH %s" % (
            training, training, cmd)

    qsubCommandFile = "%s/%s-%s.sh" % (
        env.jobDir(), env.name().replace("/", "_"), time.strftime("%d.%m.%Y-%H:%M:%S"))
    epilogueScript = "%s/epilogue.sh" % env.jobDir()

    # The epilogue must change into the job's path; a literal "$path" would be
    # an unset shell variable when the script runs.
    epilogue = Template(
        "#!/bin/bash\n"
        'cd "$path"\n'
        "rm -f jobs/current_id\n"
    ).substitute(path=env.path())
    with open(epilogueScript, "w") as epilogueFile:
        epilogueFile.write(epilogue)

    # Values handed to Template.substitute() are inserted verbatim (no "$$"
    # unescaping), so the shell variable is written with a single "$".
    if trackJob:
        saveIdCommand = "echo $PBS_JOBID > jobs/current_id"
    else:
        saveIdCommand = ""

    script = Template(
        "#!/bin/bash\n"
        "\n"
        "umask 0002\n"
        'echo -e "\\e[30;42m --- running on" `hostname` "--- \\e[0m"\n'
        'cd "$path"\n'
        "$saveIdCommand\n"
        'trap "echo got SIGHUP" SIGHUP\n'
        'trap "echo got SIGUSR1" USR1\n'
        "$command\n"
        "echo done\n"
        "rm -f jobs/current_id\n"
    ).substitute(path=env.path(), command=cmd, saveIdCommand=saveIdCommand)
    with open(qsubCommandFile, "w") as scriptFile:
        scriptFile.write(script)
    tb.system('chmod a+x "%s"' % qsubCommandFile)

    qsub = "qsub -l nodes=%s:gpus=%d%s,mem=%dmb,walltime=240:00:00 %s -q gpujob -d %s %s -N %s -T %s" % (
        node if node is not None else "1",
        gpus,
        (":" + gpuArch) if gpuArch != "any" else "",
        env.params().requiredMemory(),
        "-I -x" if not background else "",
        env.path(),
        qsubCommandFile,
        env.name(),
        epilogueScript,
    )

    if background:
        print("job name: %s" % os.path.basename(qsubCommandFile))
        qsub += " -j oe -o %s" % (env.jobDir())

    tb.notice("lmbtorque: running %s" % qsub, "run")
    if not background:
        # Interactive run: clean up the id file once qsub returns.
        tb.system('ssh lmbtorque "umask 0002; cd %s; %s; rm -f jobs/current_id"' % (env.path(), qsub))
    else:
        tb.system('ssh lmbtorque "umask 0002; %s"' % (qsub))
    sys.exit(0)
def copy(self, source, target, copySnapshot, iter):
    """Copy the contents of directory `source` into `target` using shell `cp`.

    Entries named `scratch` or `jobs`, files ending in `.pyc`, and
    directories whose names start with `test_` or `output` are skipped.
    The `training` entry is never copied wholesale: when `copySnapshot` is
    true, only selected `.caffemodel` / `.solverstate` files and `log.txt`
    are copied into `<target>/training`.

    Args:
        source: directory to copy from.
        target: directory to copy to (created with `mkdir -p`).
        copySnapshot: whether snapshot files from `training` are copied.
        iter: iteration whose snapshot files are copied; `-1` selects the
            last element of each list returned by `iterFiles`.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source line; confirm that the log.txt copy belongs under `copySnapshot`.
    tb.system('mkdir -p %s' % target)
    for f in os.listdir(source):
        if f == '.': continue
        if f == '..': continue
        if f == 'training':
            if copySnapshot:
                os.system('mkdir -p %s/training' % target)
                modelFiles = iterFiles('.caffemodel', '%s/training' % source)
                stateFiles = iterFiles('.solverstate', '%s/training' % source)
                if iter != -1:
                    # Copy only the files whose iteration matches the request.
                    for m in modelFiles:
                        if m.iteration() == iter: tb.system('cp -v %s %s/training' % (m.filename(), target))
                    for s in stateFiles:
                        if s.iteration() == iter: tb.system('cp -v %s %s/training' % (s.filename(), target))
                else:
                    # No iteration requested: take the last entry of each list.
                    tb.system('cp -v %s %s/training' % (modelFiles[-1].filename(), target))
                    tb.system('cp -v %s %s/training' % (stateFiles[-1].filename(), target))
                tb.system('cp %s %s/training/log.txt %s/training' % ('' if self._silent else '-v', source, target))
            # The training directory is handled above and never copied recursively.
            continue
        if f == 'scratch': continue
        if f == 'jobs': continue
        if f.endswith('.pyc'): continue
        if os.path.isdir('%s/%s' % (source,f)) and f.startswith('test_'): continue
        if os.path.isdir('%s/%s' % (source,f)) and f.startswith('output'): continue
        # Everything else is copied recursively; `-v` is dropped when silent.
        tb.system('cp -r %s %s/%s %s' % ('' if self._silent else '-v', source, f, target))