Ejemplo n.º 1
0
def create_and_initialize_graph(model_tag, num_classes, num_features, save_loc, compute=None):
    """Queue the graph-initialization job for ``model_tag`` and watch it.

    Assembles the ``AVERAGE_INITIALIZATION_CMD`` command line for iteration 0,
    wraps it in a GPU queue job and passes that job (with the current working
    directory appended to the Python path) to ``submit_and_watch_job``.

    Args:
        model_tag: Tag used in the job name and passed as ``--model-tag``.
        num_classes: Value passed as ``--num-classes``.
        num_features: Value passed as ``--num-features``.
        save_loc: Location forwarded to ``get_model_path`` and ``get_log_path``.
        compute: Optional ``(host_name, gpu)`` pair. When ``None`` the job is
            built with host ``'*'`` and gpu ``-1``.

    Returns:
        None. The result of ``submit_and_watch_job`` is not returned.
    """
    target_path = get_model_path(0, model_tag, save_loc)

    if compute is None:
        host_name, gpu = '*', -1
    else:
        host_name, gpu = compute

    init_log = get_log_path(0, model_tag, save_loc, operation='initialize')

    # Flag/value tokens in the exact order they appear on the command line.
    tokens = [
        AVERAGE_INITIALIZATION_CMD,
        '--gpu', gpu,
        '--model-path', target_path,
        '--model-tag', model_tag,
        '--num-classes', num_classes,
        '--num-features', num_features,
    ]
    command = ' '.join('{}'.format(token) for token in tokens)

    queued = get_gpu_queue_job(host_name, '{}.initialize'.format(model_tag), init_log, command)
    submit_and_watch_job(append_cwd_to_python_path(queued))
Ejemplo n.º 2
0
def submit_train_worker_job(model_tag, iteration, worker_id, batch_size, lr, egs_index, num_classes, num_features, save_loc, compute=None):
    """Submit one training-worker job and return what ``submit_job`` returns.

    Builds the ``AVERAGE_TRAIN_CMD`` command line for the given iteration and
    worker, wraps it in a GPU queue job named
    ``<model_tag>.<iteration>.<worker_id>`` and submits it without watching.

    Args:
        model_tag: Tag used in the job name and passed as ``--model-tag``.
        iteration: Value passed as ``--iteration`` and used for the log path.
        worker_id: Value passed as ``--worker-id`` and used for the log path.
        batch_size: Value passed as ``--batch-size``.
        lr: Value passed as ``--lr``.
        egs_index: Value passed as ``--egs-index``.
        num_classes: Value passed as ``--num-classes``.
        num_features: Value passed as ``--num-features``.
        save_loc: Value passed as ``--save``; also forwarded to
            ``get_meta_graph`` and ``get_log_path``.
        compute: Optional ``(host_name, gpu)`` pair. When ``None`` the job is
            built with host ``'*'`` and gpu ``-1``.

    Returns:
        The value returned by ``submit_job``.
    """
    graph_file = get_meta_graph(model_tag, save_loc)

    if compute is None:
        host_name, gpu = '*', -1
    else:
        host_name, gpu = compute

    worker_log = get_log_path(iteration, model_tag, save_loc, worker_id)

    # The double space before --num-classes is kept so the emitted command
    # string stays identical.
    template = (
        '{} --batch-size {} --egs-index {} --gpu {} --iteration {} --lr {}'
        ' --meta-graph {} --model-tag {}  --num-classes {} --num-features {}'
        ' --save {} --worker-id {}'
    )
    command = template.format(
        AVERAGE_TRAIN_CMD,
        batch_size,
        egs_index,
        gpu,
        iteration,
        lr,
        graph_file,
        model_tag,
        num_classes,
        num_features,
        save_loc,
        worker_id,
    )

    queued = get_gpu_queue_job(
        host_name,
        '{}.{}.{}'.format(model_tag, iteration, worker_id),
        worker_log,
        command,
    )
    return submit_job(append_cwd_to_python_path(queued))
Ejemplo n.º 3
0
def average_parameters(model_tag, iteration, num_workers, save_loc, compute=None):
    """Submit the checkpoint-averaging job for one iteration and watch it.

    Collects the per-worker model paths for ``iteration``, builds the
    ``AVERAGE_CHECKPOINTS_CMD`` command line that averages them into the
    iteration's final model path, and passes the resulting GPU queue job to
    ``submit_and_watch_job``.

    Args:
        model_tag: Tag used in the job name and to look up paths.
        iteration: Iteration whose worker checkpoints are averaged.
        num_workers: Number of workers; worker ids ``0 .. num_workers - 1``
            are used to look up the checkpoint paths.
        save_loc: Location forwarded to ``get_meta_graph``,
            ``get_model_path`` and ``get_log_path``.
        compute: Optional ``(host_name, gpu)`` pair. When ``None`` the job is
            built with host ``'*'`` and gpu ``-1``. Defaults to ``None`` so
            the signature matches the other job-submitting helpers, whose
            bodies handle ``None`` the same way.

    Returns:
        None. The result of ``submit_and_watch_job`` is not returned.
    """
    meta_graph = get_meta_graph(model_tag, save_loc)
    checkpoint_dirs_ = [
        get_model_path(iteration, model_tag, save_loc, worker_id=w)
        for w in range(num_workers)
    ]
    final_path = get_model_path(iteration, model_tag, save_loc)

    host_name, gpu = compute if compute is not None else ('*', -1)
    job_name = '{}.average.{}'.format(model_tag, iteration)
    log_path = get_log_path(iteration, model_tag, save_loc, operation='average')
    # Checkpoint directories are passed as a single comma-separated argument.
    cmd = '{} --checkpoint-dirs {} --final-path {} --gpu {} --meta-graph {}' \
        .format(AVERAGE_CHECKPOINTS_CMD, ','.join(checkpoint_dirs_), final_path, gpu, meta_graph)
    job = get_gpu_queue_job(host_name, job_name, log_path, cmd)
    submit_and_watch_job(append_cwd_to_python_path(job))
Ejemplo n.º 4
0
def make_workers(workers, cmd, model_tag, log_loc):
    """Build one GPU queue job per worker and return them as a list.

    For each ``(address, gpu)`` entry a job is created whose command is
    ``cmd`` extended with ``--gpu``, ``--model-tag``, ``--task-index`` and
    ``--type worker``. A line announcing each worker is printed as it is
    prepared.

    Args:
        workers: Mapping of worker address to gpu; iterated via ``.items()``.
        cmd: Base command that the worker flags are appended to.
        model_tag: Tag used in job names, log file names and ``--model-tag``.
        log_loc: Directory prefix for the per-worker log files.

    Returns:
        List of jobs, each passed through ``append_cwd_to_python_path``, in
        the iteration order of ``workers``.
    """
    prepared = []
    for task_index, (address, gpu) in enumerate(workers.items()):
        # Job names, log names and the printed message are 1-based, while
        # --task-index is 0-based.
        ordinal = task_index + 1

        # NOTE(review): only the LAST CHARACTER of the part before ':' is used
        # as the host name -- confirm this is intended for multi-character
        # node identifiers.
        host_part = address.split(':')[0]
        node = host_part[-1]

        worker_cmd = '{} --gpu {} --model-tag {} --task-index {} --type worker'.format(
            cmd, gpu, model_tag, task_index)
        job = get_gpu_queue_job(
            host_name=node,
            name='worker_{}_{}'.format(ordinal, model_tag),
            log_file='{}/{}_worker_{}.log'.format(log_loc, model_tag, ordinal),
            cmd=worker_cmd,
        )
        prepared.append(append_cwd_to_python_path(job))
        print('Worker Node {} - {}'.format(ordinal, address))
    return prepared