Example #1
import os

import pytest

from gradient_sdk import get_tf_config
# Assumption: ConfigError is exported by the same package; the original
# snippet does not show its import.
from gradient_sdk import ConfigError


def test_get_tf_config_default():
    assert not os.environ.get("PS_CONFIG")
    with pytest.raises(ConfigError) as ce:
        get_tf_config()

    assert "TF Config" in str(ce.value)
    assert "Something went wrong. " in str(ce.value)
Example #2
def test_get_tf_config():
    assert not os.environ.get("TF_CONFIG")

    os.environ["PS_CONFIG"] = (
        "eyJjbHVzdGVyIjogeyJtYXN0ZXIiOiBbImxvY2FsaG9zdDo1MDAwIl0sICJ3b3JrZXIiOiBbImxvY2FsaG9zdDo1MDAwIiwgImxvY2FsaG9zdDo1MDAxIl0sICJwcyI6IFsibG9jYWxob3N0OjUwMDIiXX0sICJ0YXNrIjogeyJ0eXBlIjogIm1hc3RlciIsICJpbmRleCI6IDB9LCAiZW52aXJvbm1lbnQiOiAiY2xvdWQifQ=="
    )
    get_tf_config()

    assert os.environ.get("TF_CONFIG")

    os.environ.pop("PS_CONFIG")
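
Note: the PS_CONFIG value set above is plain base64-encoded JSON. A minimal sketch to decode and inspect it (standard library only); the decoded layout is what get_tf_config() republishes as TF_CONFIG:

import base64
import json

ps_config = (
    "eyJjbHVzdGVyIjogeyJtYXN0ZXIiOiBbImxvY2FsaG9zdDo1MDAwIl0sICJ3b3JrZXIiOiBbImxvY2FsaG9zdDo1MDAwIiwgImxvY2FsaG9zdDo1MDAxIl0sICJwcyI6IFsibG9jYWxob3N0OjUwMDIiXX0sICJ0YXNrIjogeyJ0eXBlIjogIm1hc3RlciIsICJpbmRleCI6IDB9LCAiZW52aXJvbm1lbnQiOiAiY2xvdWQifQ=="
)
# Decodes to: {"cluster": {"master": ["localhost:5000"],
#                          "worker": ["localhost:5000", "localhost:5001"],
#                          "ps": ["localhost:5002"]},
#              "task": {"type": "master", "index": 0}, "environment": "cloud"}
print(json.dumps(json.loads(base64.b64decode(ps_config)), indent=2))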
Example #3
        tf.logging.debug('Starting to Export model to {}'.format(str(flags_obj.export_dir)))
        image = tf.placeholder(tf.float32, [None, 28, 28])
        input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
            'image': image,
        })
        mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn,
                                           strip_default_attrs=True)
        tf.logging.debug('Model Exported')


def main(_):
    run_mnist(flags.FLAGS)


if __name__ == '__main__':

    tf.logging.set_verbosity(tf.logging.DEBUG)

    if gradient_sdk:
        try:
            get_tf_config()
        except Exception:
            # Fall back to single-node mode when no cluster config is present.
            pass
    define_mnist_flags()
    # Print ENV Variables
    tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20)
    for k, v in os.environ.items():
        tf.logging.debug('{}: {}'.format(k, v))

    absl_app.run(main)
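
To sanity-check an export like the one in Example #3, a minimal sketch using the TF 1.x contrib predictor API; export_dir here is a hypothetical timestamped directory of the kind export_savedmodel() creates, and the 'image' key matches the placeholder registered in the serving input receiver:

import numpy as np
import tensorflow as tf

export_dir = 'export/1577836800'  # hypothetical path; export_savedmodel() returns the real one
predict_fn = tf.contrib.predictor.from_saved_model(export_dir)
# Run one all-zeros 28x28 image through the exported graph.
print(predict_fn({'image': np.zeros((1, 28, 28), dtype=np.float32)}))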
Example #4
    # The opening of this call is assumed; the scraped snippet began mid-statement.
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=opts.max_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=1,
                                      start_delay_secs=0,
                                      throttle_secs=opts.eval_secs)

    # Train and evaluate!
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


if __name__ == "__main__":
    args = parse_args()
    tf.logging.set_verbosity(args.verbosity)

    tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20)
    for k, v in os.environ.items():
        tf.logging.debug('{}: {}'.format(k, v))

    tf.logging.debug('=' * 20 + ' Arguments ' + '=' * 20)
    for k, v in sorted(args.__dict__.items()):
        if v is not None:
            tf.logging.debug('{}: {}'.format(k, v))

    try:
        gradient_sdk.get_tf_config()
    except Exception:
        tf.logging.debug("Single node mode")

    tf.logging.info('=' * 20 + ' Train starting ' + '=' * 20)
    main(args)
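
tf.estimator.train_and_evaluate() picks each node's role from the TF_CONFIG environment variable that gradient_sdk.get_tf_config() fills in. For reference, a hand-built sketch of that variable for a hypothetical two-node layout (addresses made up):

import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['localhost:5000'],
        'worker': ['localhost:5001'],
    },
    'task': {'type': 'master', 'index': 0},  # this node's role in the cluster
})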
Example #5
# Imports implied by the snippet; `parser` and `main_worker` are defined
# elsewhere in the original file.
import json
import os
import random
import warnings

import torch
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp

from gradient_sdk import get_tf_config


def main():
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

#     if args.dist_url == "env://" and args.world_size == -1:
#         args.world_size = int(os.environ["WORLD_SIZE"])
###########prod##################

    get_tf_config()
    metadata = json.loads(os.environ["TF_CONFIG"])
    cluster = metadata.get('cluster')
    workers = cluster.get("worker", [])
    # master_worker=workers[0]
    master_worker = cluster.get("master", [])[0]
    job_type = metadata.get('task', {}).get('type')
    task_index = metadata.get('task', {}).get('index')
    if job_type == "master":
        args.rank = 0
    else:
        args.rank = task_index + 1


    ###########test##################
    # cluster = {}
    # workers = ["127.0.0.0:5000"]  # cluster.get("worker", [])
    # master_worker = workers[0]
    # job_type = "master"  # metadata.get('task', {}).get('type')
    # task_index = "0"  # metadata.get('task', {}).get('index')
    # args.rank = 0  # task_index
    #############################

    os.environ['MASTER_ADDR'] = master_worker.split(":")[0]
    os.environ['MASTER_PORT'] = master_worker.split(":")[1]

    # print("parameter server",ps)
    print("master_worker", master_worker)
    print("job_type", job_type)

    # One process per worker plus the master.
    args.world_size = len(workers) + 1

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
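
The rank bookkeeping in Example #5 (master -> rank 0, worker i -> rank i + 1, world_size = number of workers + 1) can be factored into a small helper; a sketch assuming the same TF_CONFIG layout the snippet parses:

import json
import os

def torch_rank_from_tf_config():
    # Returns (rank, world_size) for torch.distributed, mapping the
    # TF_CONFIG task to master -> 0 and worker i -> i + 1.
    metadata = json.loads(os.environ['TF_CONFIG'])
    task = metadata.get('task', {})
    workers = metadata.get('cluster', {}).get('worker', [])
    rank = 0 if task.get('type') == 'master' else task.get('index', 0) + 1
    return rank, len(workers) + 1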