# Script body: walk every run directory and clean up the ones that look dead.
#   * a directory without cfg.json is removed outright;
#   * a run that file_alive() rejects and that has no 'sentinel' file gets its
#     recorded process (and that process's children) killed on its host.
args = parser.parse_args()
print('Parsed args')
run_dirs = get_run_dirs(args.run_dir)

for run_path in run_dirs:
    config_path = pjoin(run_path, 'cfg.json')
    if not os.path.exists(config_path):
        # Definitely delete it
        shutil.rmtree(run_path)
        continue

    # NOTE(review): file_alive presumably checks that train.log was touched
    # within max_dur_sec (one hour here) -- confirm against its definition.
    train_log = pjoin(run_path, 'train.log')
    if file_alive(train_log, max_dur_sec=60 * 60):
        continue
    if os.path.exists(pjoin(run_path, 'sentinel')):
        continue

    run_name = os.path.basename(run_path)
    print('loading config')
    print(config_path)
    run_cfg = load_config(config_path)
    print('loaded config')

    target_host = run_cfg['host']
    target_pid = run_cfg['pid']
    print('Killing run %s, PID %s on %s' % (run_name, target_pid, target_host))

    # Children first (the async data loader spawns them), then the process.
    for kill_cmd in ('pkill -TERM -P %s', 'kill -9 %s'):
        run_cpu_job(target_host, kill_cmd % target_pid)
def _read_text(path):
    """Return the full contents of the file at ``path``, closing it afterwards."""
    with open(path, 'r') as f:
        return f.read()


def process_run_dir(run_dir, figs=False):
    """Collect the summary data for a single run directory.

    Args:
        run_dir: Path of the run directory to inspect.
        figs: When true (and ``params.pk`` exists in ``run_dir``), regenerate
            ``plot.png`` if it is missing or older than ``params.pk``, and copy
            it into ``<args.viewer_dir>/plots`` when ``args.viewer_dir`` is set.

    Returns:
        ``None`` if ``run_dir`` has no ``cfg.json``. Otherwise a dict with:
          * ``'epoch'``: int read from the ``epoch`` file, or -1 if absent;
          * ``'cost'``: float read from ``last_cost`` (only if the file exists);
          * ``'complete'``: whether a ``sentinel`` file exists;
          * ``'alive'``: an HTML ``<span>`` string describing liveness;
          * ``'run'``: base name of ``run_dir``;
          * ``'num_files'``: raw contents of ``num_files`` (only if it exists);
        plus whatever ``read_cfg(cfg_file, run_data)`` adds.

    Note:
        Reads the module-level ``args`` object (``args.viewer_dir``).
    """
    print(run_dir)
    run_data = dict()

    # Config file
    cfg_file = pjoin(run_dir, 'cfg.json')
    if not fexists(cfg_file):
        print('No config file in %s' % run_dir)
        return None

    # Get epoch
    epoch_file = pjoin(run_dir, 'epoch')
    if os.path.exists(epoch_file):
        epoch = int(_read_text(epoch_file).strip())
    else:
        epoch = -1
    run_data['epoch'] = epoch

    last_cost_file = pjoin(run_dir, 'last_cost')
    if os.path.exists(last_cost_file):
        run_data['cost'] = float(_read_text(last_cost_file))

    # Alive / not
    log_file = pjoin(run_dir, 'train.log')
    run_data['alive'] = file_alive(log_file, max_dur_sec=60*60)

    # Complete / not
    run_data['complete'] = os.path.exists(pjoin(run_dir, 'sentinel'))

    # Replace the boolean with the HTML cell shown for this run; a completed
    # run is reported as "False" on a neutral background.
    if run_data['complete']:
        run_data['alive'] = "<span style='background:#ccc;'>False</span>"
    elif run_data['alive']:
        run_data['alive'] = "<span style='background:#6d6;color:#fff'>True</span>"
    else:
        run_data['alive'] = "<span style='background:#d66;color:#fff'>False</span>"

    run_data['run'] = os.path.basename(run_dir)

    num_files_file = pjoin(run_dir, 'num_files')
    if os.path.exists(num_files_file):
        run_data['num_files'] = _read_text(num_files_file)

    read_cfg(cfg_file, run_data)

    # TODO Load CER and WER

    params_file = pjoin(run_dir, 'params.pk')
    if figs and os.path.exists(params_file):
        plot_file = pjoin(run_dir, 'plot.png')

        # Check if params file has been modified after the plot image file
        if (not os.path.exists(plot_file)) or (last_modified(plot_file) < last_modified(params_file)):
            print('%s modified, generating plot' % params_file)
            try:
                # Argument list instead of a shell string, so run directories
                # containing spaces or shell metacharacters are passed intact.
                check_call(['python', 'plot_results.py', run_dir,
                            '--out_file', plot_file])
            except Exception as e:
                # Plotting is best-effort: report the failure and carry on.
                print('Failed to generate plot for %s: %s' % (run_dir, e))

        # NOTE(review): the copy step is assumed to belong to the figs branch;
        # confirm against the original indentation.
        if args.viewer_dir:
            plot_dir = pjoin(args.viewer_dir, 'plots')
            if not os.path.exists(plot_dir):
                os.makedirs(plot_dir)
            if os.path.exists(plot_file):
                shutil.copyfile(plot_file,
                                pjoin(plot_dir, '%s.png' % run_data['run']))

    return run_data
def _kill_if_stalled(run_dir, cfg_file):
    """Kill the process recorded in ``cfg_file`` unless the run is alive or done.

    A run is left untouched when file_alive() accepts its train.log or when a
    'sentinel' file is present in ``run_dir``.
    """
    log_file = pjoin(run_dir, 'train.log')
    alive = file_alive(log_file, max_dur_sec=60*60)
    if alive or os.path.exists(pjoin(run_dir, 'sentinel')):
        return

    run = os.path.basename(run_dir)
    print('loading config')
    print(cfg_file)
    cfg = load_config(cfg_file)
    print('loaded config')
    host, pid = cfg['host'], cfg['pid']
    print('Killing run %s, PID %s on %s' % (run, pid, host))

    # Kill children (due to async data loader)
    run_cpu_job(host, 'pkill -TERM -P %s' % pid)
    # Kill process
    run_cpu_job(host, 'kill -9 %s' % pid)


# Script body: remove run directories that have no config file and kill the
# processes of runs that have stalled.
args = parser.parse_args()
print('Parsed args')
run_dirs = get_run_dirs(args.run_dir)

for d in run_dirs:
    cfg_file = pjoin(d, 'cfg.json')
    if os.path.exists(cfg_file):
        _kill_if_stalled(d, cfg_file)
    else:
        # Definitely delete it
        shutil.rmtree(d)