Esempio n. 1
0
def reboot_run(run_dir, used_gpus):
    """Relaunch the run stored in ``run_dir`` on the next free GPU.

    The run's ``cfg.json`` is loaded to find the host it was configured
    for; the cluster name is that host string with every digit character
    removed. The cluster is then polled, sleeping ``SLEEP_SEC`` between
    attempts, until ``get_next_free_gpu_sequential`` returns a node.
    ``runNNet.py`` is started there as a non-blocking job whose stdout is
    sent to a log file named after ``TimeString()``.

    Args:
        run_dir: Directory containing the run's ``cfg.json``.
        used_gpus: Set-like collection of ``"<node>_<gpu>"`` strings. It is
            passed to the GPU picker and updated in place with the GPU
            chosen here.

    Returns:
        None.
    """
    config_path = pjoin(run_dir, 'cfg.json')

    # Cluster name = configured host with its digit characters dropped
    host = load_config(config_path)['host']
    cluster = ''.join(ch for ch in host if not ch.isdigit())

    cfg_arg = '--cfg_file %s' % config_path
    launch_cmd = 'cd %s; source ~/.bashrc; nohup %s runNNet.py %s' % (CTC_DIR, PYTHON_CMD, cfg_arg)
    print(launch_cmd)

    # Keep polling until the picker hands back a usable node
    while True:
        free_gpus = get_all_free_gpus(cluster)
        print(free_gpus)
        node, gpu_id = get_next_free_gpu_sequential(free_gpus, used_gpus, FLAGGED_GPUS)
        if node:
            break
        print('No free GPUs, waiting for a bit')
        time.sleep(SLEEP_SEC)

    # Send the job's stdout to a log file for debugging.
    # NOTE(review): the log lands under the module-level RUN_DIR, not the
    # run_dir argument -- confirm that is intended.
    log_path = pjoin(RUN_DIR, '%s.txt' % str(TimeString()))
    print('Logging to %s' % log_path)
    # The log handle is handed to the job and is not closed here.
    run_gpu_job(node, gpu_id, launch_cmd, blocking=False,
                stdout=open(log_path, 'w'))

    # Record the GPU just used in the caller's set
    used_gpus.add(node + '_' + str(gpu_id))

    # NOTE(review): presumably gives the job time to claim the GPU before
    # the caller polls again -- confirm.
    time.sleep(SLEEP_SEC)
Esempio n. 2
0
    args = parser.parse_args()

    # GPUs already used during this sweep, stored as "<node>_<gpu>" strings
    used_gpus = set()
    # One runNNet.py command is built (and, unless args.view, launched) per setting
    for setting in PARAM_SETTING_LIST:
        # Render the setting's key/value pairs as "--key value" flags
        # (iteritems() is Python 2 only)
        run_args = ' '.join(['--%s %s' % (key, val) for key, val in setting.iteritems()])
        print run_args
        cmd = 'cd %s; source ~/.bashrc; nohup %s runNNet.py %s' % (CTC_DIR, PYTHON_CMD, run_args)
        print cmd

        # With args.view set, only print the commands; nothing is launched
        if args.view:
            continue

        gpu_node = None

        # Poll until the picker returns a node, sleeping SLEEP_SEC between attempts
        while not gpu_node:
            all_free_gpus = get_all_free_gpus(args.cluster)
            print all_free_gpus
            gpu_node, gpu = get_next_free_gpu_sequential(all_free_gpus, used_gpus, FLAGGED_GPUS)
            if not gpu_node:
                print 'No free GPUs, waiting for a bit'
                time.sleep(SLEEP_SEC)

        # Log to a file for debugging
        log_file = pjoin(RUN_DIR, '%s.txt' % str(TimeString()))
        print 'Logging to %s' % log_file
        # The log handle is handed to the non-blocking job and is not closed here
        run_gpu_job(gpu_node, gpu, cmd, blocking=False,
                stdout=open(log_file, 'w'))

        # Record the GPU just used; the set is passed back to the picker above
        used_gpus.add(gpu_node + '_' + str(gpu))

        # NOTE(review): presumably gives the job time to claim the GPU before
        # the next poll -- confirm
        time.sleep(SLEEP_SEC)