@classmethod
def setUpClass(cls):
    cls.logdir = tempfile.mkdtemp()
    cmd = [
        'python3', 'train.py',
        '--dataset=cifar-10',
        '--synthetic-data',
        '--model-size=20',
        '--batch-size=1',
        '--iterations=100',
        '--batches-per-step=1',
        '--pipeline-depth=4',
        '--pipeline-splits', 'b2/0/relu',
        '--xla-recompute',
        '--shards=2',
        '--distributed',
        '--no-validation',
        '--no-stochastic-rounding',
        '--log-dir', cls.logdir,
    ]

    cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    worker_ports = cls._pick_unique_unused_ports(cls.NUM_WORKERS)
    cluster_spec = {
        'worker': ['localhost:%s' % port for port in worker_ports]
    }

    processes = cls._start_processes_with_tf_config(cmd, cwd, cluster_spec)
    cls._wait_for_processes(processes, cls.WORKER_TIMEOUT_SECONDS)

    cls.worker_log_dirs = cls._find_worker_log_dirs()
    cls.training_logs = [
        parse_csv(os.path.join(d, "training.csv"))
        for d in cls.worker_log_dirs
    ]
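# NOTE: _pick_unique_unused_ports and _start_processes_with_tf_config are
# referenced here but defined elsewhere in the repository. The following is
# a minimal, hypothetical sketch of how they could be implemented, inferred
# from the call sites; it is not the actual implementation. The call sites
# in this file pass arguments in slightly different orders, so this sketch
# makes extra_env an optional keyword argument.
import json
import socket


@classmethod
def _pick_unique_unused_ports(cls, num_ports):
    # Bind each socket to port 0 so the OS assigns a free port. Keeping all
    # sockets open until every port has been recorded guarantees the ports
    # are unique; they are closed before the workers start listening.
    sockets = [socket.socket(socket.AF_INET, socket.SOCK_STREAM)
               for _ in range(num_ports)]
    for s in sockets:
        s.bind(('localhost', 0))
    ports = [s.getsockname()[1] for s in sockets]
    for s in sockets:
        s.close()
    return ports


@classmethod
def _start_processes_with_tf_config(cls, cmd, cwd, cluster_spec, extra_env=None):
    # Start one subprocess per worker in the cluster spec, each with a
    # TF_CONFIG environment variable carrying the cluster layout and the
    # worker's own task index, as expected by TensorFlow's distributed
    # runtime.
    processes = []
    for index in range(len(cluster_spec['worker'])):
        env = os.environ.copy()
        if extra_env:
            env.update(extra_env)
        env['TF_CONFIG'] = json.dumps({
            'cluster': cluster_spec,
            'task': {'type': 'worker', 'index': index},
        })
        processes.append(subprocess.Popen(cmd, cwd=cwd, env=env))
    return processes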
def test_resnet_50_from_readme(self):
    NUM_WORKERS = 2
    WORKER_TIMEOUT_SECONDS = 30 * 60

    with tempfile.TemporaryDirectory() as logdir:
        cmd = [
            'python3', 'train.py',
            '--dataset=imagenet',
            '--generated-data',
            '--model-size=50',
            '--batch-size=4',
            '--batches-per-step=1',
            '--shards=4',
            '--pipeline',
            '--gradient-accumulation-count=64',
            '--pipeline-splits', 'b1/2/relu', 'b2/3/relu', 'b3/5/relu',
            '--xla-recompute',
            '--replicas=2',  # Instead of 4, so that two processes fit on one machine.
            '--distributed',
            '--no-stochastic-rounding',
            '--no-validation',
            '--iterations=100',
            '--learning-rate-schedule=1',
            '--base-learning-rate=-14',
            '--log-dir', logdir,
            '--ckpt-all-instances', 'true',
            '--log-all-instances', 'true',
        ]

        extra_env = {
            'POPLAR_ENGINE_OPTIONS': '{"opt.maxCopyMergeSize": 8388608}',
        }

        cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        worker_ports = self._pick_unique_unused_ports(NUM_WORKERS)
        cluster_spec = {
            'worker': ['localhost:%s' % port for port in worker_ports]
        }

        processes = self._start_processes_with_tf_config(
            cmd, cwd, extra_env, cluster_spec)
        self._wait_for_processes(processes, WORKER_TIMEOUT_SECONDS)

        worker_log_dirs = self._find_worker_log_dirs(NUM_WORKERS, logdir)
        training_logs = [
            parse_csv(os.path.join(d, "training.csv"))
            for d in worker_log_dirs
        ]

        # The final training accuracy should be the same on all workers.
        for i in range(1, NUM_WORKERS):
            self.assertEqual(
                training_logs[0]['train_acc_avg'][-1],
                training_logs[i]['train_acc_avg'][-1])

        # The final training loss should be the same on all workers.
        for i in range(1, NUM_WORKERS):
            self.assertEqual(
                training_logs[0]['loss_avg'][-1],
                training_logs[i]['loss_avg'][-1])

        # The final weights should be the same on all workers.
        var_names_and_shapes = tf.train.list_variables(worker_log_dirs[0])
        for var_name, _ in var_names_and_shapes:
            value_worker_0 = tf.train.load_variable(worker_log_dirs[0], var_name)
            for i in range(1, NUM_WORKERS):
                value_worker_i = tf.train.load_variable(worker_log_dirs[i], var_name)
                self.assertListEqual(
                    value_worker_0.tolist(), value_worker_i.tolist())
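# NOTE: _wait_for_processes and _find_worker_log_dirs are likewise defined
# elsewhere in the repository. A minimal sketch of their assumed behaviour:
# each worker exits with code 0 on success, and each writes its logs into
# its own subdirectory of the log dir. The directory layout assumed below
# is hypothetical.
import time


@classmethod
def _wait_for_processes(cls, processes, timeout):
    # Wait for every worker against a single shared deadline, so the total
    # wait never exceeds the given timeout. A non-zero exit or a timeout
    # fails the test.
    deadline = time.monotonic() + timeout
    for p in processes:
        remaining = max(0.0, deadline - time.monotonic())
        returncode = p.wait(timeout=remaining)
        assert returncode == 0, f"Worker exited with code {returncode}"


@classmethod
def _find_worker_log_dirs(cls, num_workers, logdir):
    # Each worker is expected to have created exactly one log directory.
    # Sorting gives a deterministic order for the per-worker comparisons.
    worker_log_dirs = sorted(glob.glob(os.path.join(logdir, '*')))
    assert len(worker_log_dirs) == num_workers
    return worker_log_dirs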
def test_resnet8(self):
    TIMEOUT_SECONDS = 5 * 60
    NUM_TOTAL_REPLICAS = 4
    NUM_INSTANCES = 2
    NUM_LOCAL_REPLICAS = NUM_TOTAL_REPLICAS // NUM_INSTANCES

    with tempfile.TemporaryDirectory() as logdir:
        # The buildbot runs as root, so let's allow that.
        cmd = [
            'poprun',
            '--mpi-global-args=--tag-output --allow-run-as-root',
            '--num-replicas=' + str(NUM_TOTAL_REPLICAS),
            '--num-instances=' + str(NUM_INSTANCES),
            sys.executable, 'train.py',
            '--dataset=cifar-10',
            '--generated-data',
            '--model-size=8',
            '--batch-size=1',
            '--batches-per-step=10',
            '--gradient-accumulation-count=10',
            '--no-validation',
            '--no-stochastic-rounding',
            '--iterations=100',
            '--log-dir', logdir,
            '--name-suffix', 'popdist_instance',
            '--ckpt-all-instances', 'true',
            '--log-all-instances', 'true',
        ]

        # Add some debug logging.
        extra_env = {
            'POPRUN_LOG_LEVEL': 'TRACE',
            'TF_CPP_VMODULE': 'poplar_compiler=1,poplar_executor=1',
        }

        cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        env = os.environ.copy()
        env.update(extra_env)
        subprocess.check_call(cmd, cwd=cwd, env=env, timeout=TIMEOUT_SECONDS)

        instance_logdirs = glob.glob(f"{logdir}/*_popdist_instance_*")
        self.assertEqual(len(instance_logdirs), NUM_INSTANCES)

        training_logs = []

        for instance_logdir in instance_logdirs:
            # Check that each instance got the correct number of replicas
            # from popdist.
            with open(os.path.join(instance_logdir, 'arguments.json'), 'r') as f:
                argument_log = json.load(f)
            self.assertEqual(argument_log['replicas'], NUM_LOCAL_REPLICAS)

            # Check that the final accuracy is decent.
            training_log = parse_csv(os.path.join(instance_logdir, 'training.csv'))
            self.assertGreater(training_log['train_acc_avg'][-1], 95)
            training_logs.append(training_log)

        # The final training accuracy should be the same on all instances.
        for i in range(1, NUM_INSTANCES):
            self.assertEqual(
                training_logs[0]['train_acc_avg'][-1],
                training_logs[i]['train_acc_avg'][-1])

        # The final training loss should be the same on all instances.
        for i in range(1, NUM_INSTANCES):
            self.assertEqual(
                training_logs[0]['loss_avg'][-1],
                training_logs[i]['loss_avg'][-1])

        # The final weights should be the same on all instances.
        var_names_and_shapes = tf.train.list_variables(instance_logdirs[0])
        for var_name, _ in var_names_and_shapes:
            value_instance_0 = tf.train.load_variable(instance_logdirs[0], var_name)
            for i in range(1, NUM_INSTANCES):
                value_instance_i = tf.train.load_variable(instance_logdirs[i], var_name)
                self.assertListEqual(
                    value_instance_0.tolist(), value_instance_i.tolist())
# Renamed from test_resnet8 to avoid shadowing the test above: two methods
# with the same name in one class would leave only the second one runnable.
def test_resnet8_with_exec_cache(self):
    TIMEOUT_SECONDS = 5 * 60
    NUM_TOTAL_REPLICAS = 4
    NUM_INSTANCES = 2
    NUM_LOCAL_REPLICAS = NUM_TOTAL_REPLICAS // NUM_INSTANCES

    with tempfile.TemporaryDirectory() as logdir:
        # The buildbot runs as root, so let's allow that.
        cmd = [
            'poprun',
            '--mpi-global-args=--tag-output --allow-run-as-root',
            '--num-replicas=' + str(NUM_TOTAL_REPLICAS),
            '--num-instances=' + str(NUM_INSTANCES),
            sys.executable, 'train.py',
            '--dataset=cifar-10',
            '--synthetic-data',
            '--model-size=8',
            '--batch-size=1',
            '--batches-per-step=10',
            '--gradient-accumulation-count=10',
            '--no-validation',
            '--no-stochastic-rounding',
            '--iterations=100',
            '--log-dir', logdir,
            '--name-suffix', 'popdist_instance',
        ]

        # Add the MPI library dirs to LD_LIBRARY_PATH, as these are not
        # always searched by default (e.g. on CentOS). A user would normally
        # do "module load mpi/openmpi-x86_64" on CentOS to achieve this
        # instead.
        libdirs = subprocess.check_output(["mpic++", "--showme:libdirs"])
        libdirs = libdirs.decode().strip().replace(" ", ":")
        ld_library_path = "{}:{}".format(os.environ["LD_LIBRARY_PATH"], libdirs)

        # Add some debug logging, and use an executable cache that is shared
        # between the instances.
        extra_env = {
            'POPRUN_LOG_LEVEL': 'TRACE',
            'TF_CPP_VMODULE': 'poplar_compiler=1,poplar_executor=1',
            'TF_POPLAR_FLAGS': f"--executable_cache_path={logdir}/exec_cache",
            'LD_LIBRARY_PATH': ld_library_path,
        }

        cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        env = os.environ.copy()
        env.update(extra_env)
        subprocess.check_call(cmd, cwd=cwd, env=env, timeout=TIMEOUT_SECONDS)

        instance_logdirs = glob.glob(f"{logdir}/*_popdist_instance_*")
        self.assertEqual(len(instance_logdirs), NUM_INSTANCES)

        training_logs = []

        for instance_logdir in instance_logdirs:
            # Check that each instance got the correct number of replicas
            # from popdist.
            with open(os.path.join(instance_logdir, 'arguments.json'), 'r') as f:
                argument_log = json.load(f)
            self.assertEqual(argument_log['replicas'], NUM_LOCAL_REPLICAS)

            # Check that the final accuracy is decent.
            training_log = parse_csv(os.path.join(instance_logdir, 'training.csv'))
            self.assertGreater(training_log['train_acc_avg'][-1], 95)
            training_logs.append(training_log)

        # The final training accuracy should be the same on all instances.
        for i in range(1, NUM_INSTANCES):
            self.assertEqual(
                training_logs[0]['train_acc_avg'][-1],
                training_logs[i]['train_acc_avg'][-1])

        # The final training loss should be the same on all instances.
        for i in range(1, NUM_INSTANCES):
            self.assertEqual(
                training_logs[0]['loss_avg'][-1],
                training_logs[i]['loss_avg'][-1])

        # The final weights should be the same on all instances.
        var_names_and_shapes = tf.train.list_variables(instance_logdirs[0])
        for var_name, _ in var_names_and_shapes:
            value_instance_0 = tf.train.load_variable(instance_logdirs[0], var_name)
            for i in range(1, NUM_INSTANCES):
                value_instance_i = tf.train.load_variable(instance_logdirs[i], var_name)
                self.assertListEqual(
                    value_instance_0.tolist(), value_instance_i.tolist())
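# NOTE: parse_csv is imported from the repository's test utilities. A minimal
# sketch of its assumed behaviour, inferred from how it is used above:
# training.csv has a header row, and each column (e.g. 'train_acc_avg',
# 'loss_avg') is returned as a list of per-step values keyed by column name.
import csv


def parse_csv(path):
    # Map each column name to the list of its values, converted to float
    # where possible so the tests can compare final metrics numerically.
    columns = {}
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            for name, value in row.items():
                try:
                    value = float(value)
                except ValueError:
                    pass
                columns.setdefault(name, []).append(value)
    return columns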