def runNode(node, job, opts):
    """Launch a non-blocking decode of a single file on ``node``.

    Builds a ``runDecode.py`` command line for file index ``job``, prefixes
    it with the shell setup (``cd`` into the utils directory and source
    ``~/.bashrc``), and hands the result to ``run_cpu_job`` with stdout
    redirected to a per-job log file under ``/tmp``.

    Args:
        node: Passed through to ``run_cpu_job`` as its first argument.
        job: File index. Must be a number (it is formatted with ``%d``); it
            is used as ``--start_file`` and as the suffix of both the output
            file name and the log file name.
        opts: Options object providing ``dataDir``, ``alisDir`` and
            ``out_file``.

    Returns:
        None.
    """
    # Fall back to dataDir when opts.alisDir is empty or unset.
    alisDir = opts.alisDir if opts.alisDir else opts.dataDir

    # Create decoding command for each file
    cmd = ('%s ../runDecode.py --dataDir %s --alisDir %s --numFiles 1 '
           '--start_file %d --out_file %s.%d') % (
        PYTHON_CMD, opts.dataDir, alisDir, job, opts.out_file, job)
    print(cmd)

    # The setup prefix already ends in '; ', so the command is appended
    # directly. Adding another '; ' here would yield an empty statement
    # ('; ;'), which bash rejects as a syntax error before running anything.
    full_cmd = 'cd %s/../%s-utils; source ~/.bashrc; ' % (CLUSTER_DIR, DATASET)
    full_cmd += cmd
    print(full_cmd)

    log_file = '/tmp/%s_decode%s.log' % (DATASET, job)
    # NOTE(review): the log handle is never closed here. With blocking=False
    # the job may still be writing when this function returns, so confirm
    # how run_cpu_job treats ``stdout`` before closing it.
    run_cpu_job(node, full_cmd, stdout=open(log_file, 'w'), blocking=False)
    return None
def runNode(node, job, opts):
    """Start a background decode job for one file on ``node``.

    Assembles the ``runDecode.py`` invocation for file index ``job``, wraps
    it in the shell setup commands, and submits it through ``run_cpu_job``
    without blocking. Standard output goes to ``/tmp/<DATASET>_decode<job>.log``.

    NOTE(review): another ``runNode`` definition precedes this one in the
    file; being later, this is the definition that takes effect. Consider
    removing the duplicate.

    Args:
        node: Passed through to ``run_cpu_job`` as its first argument.
        job: Numeric file index (formatted with ``%d``); used for
            ``--start_file`` and as the suffix of the output and log files.
        opts: Options object providing ``dataDir``, ``alisDir`` and
            ``out_file``.

    Returns:
        None.
    """
    # Use dataDir whenever opts.alisDir is empty or unset.
    alisDir = opts.alisDir if opts.alisDir else opts.dataDir

    # Create decoding command for each file
    decode_args = (PYTHON_CMD, opts.dataDir, alisDir, job, opts.out_file, job)
    cmd = ('%s ../runDecode.py --dataDir %s --alisDir %s --numFiles 1 '
           '--start_file %d --out_file %s.%d') % decode_args
    print(cmd)

    # The prefix ends with '; ' already. Joining with a further '; ' would
    # create an empty statement ('; ;'), which bash treats as a syntax error,
    # so the decode command is concatenated as-is.
    setup = 'cd %s/../%s-utils; source ~/.bashrc; ' % (CLUSTER_DIR, DATASET)
    full_cmd = setup + cmd
    print(full_cmd)

    log_file = '/tmp/%s_decode%s.log' % (DATASET, job)
    # NOTE(review): this file handle is left open on purpose for now. The job
    # is non-blocking and may still be writing; verify run_cpu_job's handling
    # of ``stdout`` before adding a close.
    run_cpu_job(node, full_cmd, stdout=open(log_file, 'w'), blocking=False)
    return None
# Walk every run directory, remove ones with no config, and kill runs whose
# training log is no longer considered alive.
for d in run_dirs:
    alive = False
    log_file = pjoin(d, 'train.log')
    cfg_file = pjoin(d, 'cfg.json')

    # Without a cfg.json there is nothing to inspect: definitely delete it.
    if not os.path.exists(cfg_file):
        shutil.rmtree(d)
        continue

    # file_alive is an external helper; presumably it reports whether
    # train.log changed within max_dur_sec (one hour here) -- confirm.
    alive = file_alive(log_file, max_dur_sec=60 * 60)

    # Leave the run alone if it is alive or carries a 'sentinel' file.
    if alive or os.path.exists(pjoin(d, 'sentinel')):
        continue

    run = os.path.basename(d)
    print('loading config')
    print(cfg_file)
    cfg = load_config(cfg_file)
    print('loaded config')
    host, pid = cfg['host'], cfg['pid']
    print('Killing run %s, PID %s on %s' % (run, cfg['pid'], cfg['host']))

    # Kill children first (due to async data loader), then the process itself.
    run_cpu_job(host, 'pkill -TERM -P %s' % pid)
    run_cpu_job(host, 'kill -9 %s' % pid)

    # Optionally remove the killed run's directory as well.
    if args.clear_dirs:
        print('Clearing %s' % d)
        shutil.rmtree(d)
# Clean up run directories: drop those lacking a config and terminate runs
# whose training log is no longer reported alive.
# NOTE(review): an equivalent loop over run_dirs appears directly above this
# one in the file -- confirm the repetition is intentional.
for d in run_dirs:
    alive = False
    log_file = pjoin(d, 'train.log')
    cfg_file = pjoin(d, 'cfg.json')

    if not os.path.exists(cfg_file):
        # No cfg.json, so the directory is definitely deleted.
        shutil.rmtree(d)
        continue

    # One-hour window passed to the external file_alive helper; its exact
    # liveness criterion is not visible here.
    alive = file_alive(log_file, max_dur_sec=60 * 60)

    # Runs that are alive, or that contain a 'sentinel' file, are skipped.
    if alive or os.path.exists(pjoin(d, 'sentinel')):
        continue

    run = os.path.basename(d)
    print('loading config')
    print(cfg_file)
    cfg = load_config(cfg_file)
    print('loaded config')
    host = cfg['host']
    pid = cfg['pid']
    kill_msg = 'Killing run %s, PID %s on %s' % (run, cfg['pid'], cfg['host'])
    print(kill_msg)

    # Kill children (due to async data loader)
    run_cpu_job(host, 'pkill -TERM -P %s' % pid)
    # Kill process
    run_cpu_job(host, 'kill -9 %s' % pid)

    # When requested, also delete the directory of the run just killed.
    if args.clear_dirs:
        print('Clearing %s' % d)
        shutil.rmtree(d)