def reboot_run(run_dir, used_gpus):
    """Relaunch the run described by ``run_dir`` on the next free GPU.

    Loads ``cfg.json`` from ``run_dir``, derives the cluster name by dropping
    every digit from the config's ``host`` entry, and polls until
    ``get_next_free_gpu_sequential`` returns a node.  The command is then
    handed to ``run_gpu_job`` with ``blocking=False`` and stdout pointed at a
    freshly opened log file.

    Args:
        run_dir: Directory holding the run's ``cfg.json``.
        used_gpus: Set of ``'<node>_<gpu>'`` strings; the GPU chosen here is
            added to it in place.
    """
    cfg_file = pjoin(run_dir, 'cfg.json')

    # The cluster name is the configured host with all digits removed.
    config = load_config(cfg_file)
    host = config['host']
    cluster_name = ''.join([ch for ch in host if not ch.isdigit()])

    run_args = '--cfg_file %s' % cfg_file
    cmd = 'cd %s; source ~/.bashrc; nohup %s runNNet.py %s' % (
        CTC_DIR, PYTHON_CMD, run_args)
    # Single-argument print(...) is equivalent to the Python 2 print statement.
    print(cmd)

    # Poll until a GPU is handed back, sleeping between attempts.
    while True:
        free_gpus = get_all_free_gpus(cluster_name)
        print(free_gpus)
        gpu_node, gpu = get_next_free_gpu_sequential(
            free_gpus, used_gpus, FLAGGED_GPUS)
        if gpu_node:
            break
        print('No free GPUs, waiting for a bit')
        time.sleep(SLEEP_SEC)

    # Log to a file for debugging.
    # NOTE(review): the log lands under the module-level RUN_DIR, not the
    # run_dir argument -- confirm this is intended.
    log_path = pjoin(RUN_DIR, '%s.txt' % str(TimeString()))
    print('Logging to %s' % log_path)
    # The handle is deliberately left open: the job is started non-blocking
    # and is given this file as its stdout.
    log_handle = open(log_path, 'w')
    run_gpu_job(gpu_node, gpu, cmd, blocking=False, stdout=log_handle)

    gpu_key = gpu_node + '_' + str(gpu)
    used_gpus.add(gpu_key)
    time.sleep(SLEEP_SEC)
def reboot_run(run_dir, used_gpus):
    """Restart the run stored in ``run_dir`` on a free GPU.

    Reads ``cfg.json`` from ``run_dir``, strips the digits from the config's
    ``host`` value to get the cluster name, waits for
    ``get_next_free_gpu_sequential`` to return a node, and passes the launch
    command to ``run_gpu_job`` (``blocking=False``) with stdout redirected to
    a newly opened log file.

    Args:
        run_dir: Directory that contains the run's ``cfg.json``.
        used_gpus: Set of ``'<node>_<gpu>'`` identifiers; updated in place
            with the GPU selected for this run.
    """
    cfg_file = pjoin(run_dir, 'cfg.json')

    # Cluster name = host entry of the config minus any digits.
    config = load_config(cfg_file)
    host = config['host']
    cluster_name = ''.join([ch for ch in host if not ch.isdigit()])

    run_args = '--cfg_file %s' % cfg_file
    cmd = 'cd %s; source ~/.bashrc; nohup %s runNNet.py %s' % (
        CTC_DIR, PYTHON_CMD, run_args)
    # Single-argument print(...) matches the Python 2 print statement output.
    print(cmd)

    # Keep asking for a GPU; sleep between unsuccessful attempts.
    while True:
        free_gpus = get_all_free_gpus(cluster_name)
        print(free_gpus)
        gpu_node, gpu = get_next_free_gpu_sequential(
            free_gpus, used_gpus, FLAGGED_GPUS)
        if gpu_node:
            break
        print('No free GPUs, waiting for a bit')
        time.sleep(SLEEP_SEC)

    # Log to a file for debugging.
    # NOTE(review): uses the module-level RUN_DIR rather than the run_dir
    # parameter -- verify that is the desired location.
    log_path = pjoin(RUN_DIR, '%s.txt' % str(TimeString()))
    print('Logging to %s' % log_path)
    # Left open on purpose: the non-blocking job receives it as stdout.
    log_handle = open(log_path, 'w')
    run_gpu_job(gpu_node, gpu, cmd, blocking=False, stdout=log_handle)

    gpu_key = gpu_node + '_' + str(gpu)
    used_gpus.add(gpu_key)
    time.sleep(SLEEP_SEC)
# Launch one job per entry in PARAM_SETTING_LIST.  Each job is given the next
# GPU returned by get_next_free_gpu_sequential; GPUs already handed out are
# tracked in used_gpus as '<node>_<gpu>' strings.
used_gpus = set()
for setting in PARAM_SETTING_LIST:
    # Turn the setting dict into '--key value' command-line flags.
    flag_parts = []
    for key, val in setting.iteritems():
        flag_parts.append('--%s %s' % (key, val))
    run_args = ' '.join(flag_parts)
    # Single-argument print(...) is equivalent to the Python 2 print statement.
    print(run_args)

    cmd = 'cd %s; source ~/.bashrc; nohup %s runNNet.py %s' % (
        CTC_DIR, PYTHON_CMD, run_args)
    print(cmd)

    # With args.view set, only show the commands; launch nothing.
    if args.view:
        continue

    # Poll until a GPU is handed back, sleeping between attempts.
    while True:
        all_free_gpus = get_all_free_gpus(args.cluster)
        print(all_free_gpus)
        gpu_node, gpu = get_next_free_gpu_sequential(
            all_free_gpus, used_gpus, FLAGGED_GPUS)
        if gpu_node:
            break
        print('No free GPUs, waiting for a bit')
        time.sleep(SLEEP_SEC)

    # Log to a file for debugging.
    log_file = pjoin(RUN_DIR, '%s.txt' % str(TimeString()))
    print('Logging to %s' % log_file)
    # The handle stays open: the job is started non-blocking with this file
    # as its stdout.
    log_handle = open(log_file, 'w')
    run_gpu_job(gpu_node, gpu, cmd, blocking=False, stdout=log_handle)

    used_gpus.add(gpu_node + '_' + str(gpu))
    time.sleep(SLEEP_SEC)