def test_get_tf_config_default():
    """Without PS_CONFIG in the environment, get_tf_config() raises ConfigError."""
    assert not os.environ.get("PS_CONFIG")
    with pytest.raises(ConfigError) as excinfo:
        get_tf_config()
    # The error message should identify both the subsystem and the failure.
    message = str(excinfo.value)
    assert "TF Config" in message
    assert "Something went wrong. " in message
def test_get_tf_config():
    """With a valid base64 PS_CONFIG set, get_tf_config() populates TF_CONFIG.

    Fix: the original literal ended with "\\=\\=" — "\\=" is not a recognized
    Python escape, so the string kept literal backslashes that corrupt the
    base64 padding. Valid base64 ends with "==".
    """
    assert not os.environ.get("TF_CONFIG")
    # base64 of a minimal cluster spec: master/worker/ps hosts plus task info.
    os.environ["PS_CONFIG"] = "eyJjbHVzdGVyIjogeyJtYXN0ZXIiOiBbImxvY2FsaG9zdDo1MDAwIl0sICJ3b3JrZXIiOiBbImxvY2FsaG9zdDo1MDAwIiwgImxvY2FsaG9zdDo1MDAxIl0sICJwcyI6IFsibG9jYWxob3N0OjUwMDIiXX0sICJ0YXNrIjogeyJ0eXBlIjogIm1hc3RlciIsICJpbmRleCI6IDB9LCAiZW52aXJvbm1lbnQiOiAiY2xvdWQifQ=="
    try:
        get_tf_config()
        assert os.environ.get("TF_CONFIG")
    finally:
        # Always clean up so later tests see an unpolluted environment,
        # even if get_tf_config() raises.
        os.environ.pop("PS_CONFIG")
tf.logging.debug('Starting to Export model to {}'.format(str(flags_obj.export_dir))) image = tf.placeholder(tf.float32, [None, 28, 28]) input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 'image': image, }) mnist_classifier.export_savedmodel(flags_obj.export_dir, input_fn, strip_default_attrs=True) tf.logging.debug('Model Exported') def main(_): run_mnist(flags.FLAGS) if __name__ == '__main__': tf.logging.set_verbosity(tf.logging.DEBUG) if gradient_sdk: try: get_tf_config() except: pass define_mnist_flags() # Print ENV Variables tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20) for k, v in os.environ.items(): tf.logging.debug('{}: {}'.format(k, v)) absl_app.run(main)
max_steps=opts.max_steps) eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=1, start_delay_secs=0, throttle_secs=opts.eval_secs) # Train and evaluate! tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) if __name__ == "__main__": args = parse_args() tf.logging.set_verbosity(args.verbosity) tf.logging.debug('=' * 20 + ' Environment Variables ' + '=' * 20) for k, v in os.environ.items(): tf.logging.debug('{}: {}'.format(k, v)) tf.logging.debug('=' * 20 + ' Arguments ' + '=' * 20) for k, v in sorted(args.__dict__.items()): if v is not None: tf.logging.debug('{}: {}'.format(k, v)) try: gradient_sdk.get_tf_config() except: tf.logging.debug("Single node mode") tf.logging.info('=' * 20 + ' Train starting ' + '=' * 20) main(args)
def main():
    """Entry point: parse args, derive the torch.distributed topology from
    the Paperspace-provided TF_CONFIG, and launch the worker process(es).

    Improvements over the original: removed large blocks of commented-out
    dead code, dropped a redundant int() around len(), and hoisted the
    duplicated master_worker.split(":") — behavior is unchanged.
    """
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # Populate TF_CONFIG from the platform's PS_CONFIG, then read the
    # cluster layout out of it. NOTE(review): assumes TF_CONFIG holds a
    # JSON object with "cluster" (master/worker host lists) and "task"
    # (type/index) keys, and that exactly one master entry exists —
    # cluster.get("master", [])[0] raises IndexError otherwise.
    get_tf_config()
    metadata = json.loads(os.environ["TF_CONFIG"])
    cluster = metadata.get('cluster')
    workers = cluster.get("worker", [])
    master_worker = cluster.get("master", [])[0]
    job_type = metadata.get('task', {}).get('type')
    task_index = metadata.get('task', {}).get('index')

    # Rank 0 is the master; workers follow in task-index order.
    if job_type == "master":
        args.rank = 0
    else:
        args.rank = task_index + 1

    # torch.distributed env:// rendezvous reads these two variables.
    master_addr, master_port = master_worker.split(":")
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = master_port
    print("master_worker", master_worker)
    print("job_type", job_type)

    # World = every worker plus the master node.
    args.world_size = len(workers) + 1
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total
        # world_size needs to be adjusted accordingly.
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes:
        # the main_worker process function.
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function.
        main_worker(args.gpu, ngpus_per_node, args)