Example 1
    @classmethod
    def setUpClass(cls):
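        # Launches distributed training of ResNet-20 on synthetic CIFAR-10
        # data, one process per worker in the cluster spec, and collects each
        # worker's training.csv for the assertions in the test methods.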
        cls.logdir = tempfile.mkdtemp()

        cmd = [
            'python3',
            'train.py',
            '--dataset=cifar-10',
            '--synthetic-data',
            '--model-size=20',
            '--batch-size=1',
            '--iterations=100',
            '--batches-per-step=1',
            '--pipeline-depth=4',
            '--pipeline-splits',
            'b2/0/relu',
            '--xla-recompute',
            '--shards=2',
            '--distributed',
            '--no-validation',
            '--no-stochastic-rounding',
            '--log-dir',
            cls.logdir,
        ]

        cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

        worker_ports = cls._pick_unique_unused_ports(cls.NUM_WORKERS)
        cluster_spec = {
            'worker': ['localhost:%s' % port for port in worker_ports]
        }

        processes = cls._start_processes_with_tf_config(cmd, cwd, cluster_spec)
        cls._wait_for_processes(processes, cls.WORKER_TIMEOUT_SECONDS)

        cls.worker_log_dirs = cls._find_worker_log_dirs()
        cls.training_logs = [
            parse_csv(os.path.join(d, "training.csv"))
            for d in cls.worker_log_dirs
        ]
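
Example 1 relies on class helpers (_pick_unique_unused_ports, _start_processes_with_tf_config, _wait_for_processes) that are defined elsewhere in the test class. Below is a minimal sketch of plausible implementations using the standard TF_CONFIG mechanism for distributed TensorFlow; the bodies are assumptions, not the repository's actual code. (Example 2's variant of _start_processes_with_tf_config also takes an extra_env dict that would be merged into each worker's environment.)

import json
import os
import socket
import subprocess


class DistributedTestHelpersSketch:
    @classmethod
    def _pick_unique_unused_ports(cls, num_ports):
        # Bind to port 0 so the OS assigns free ports, then release them.
        socks = [socket.socket() for _ in range(num_ports)]
        for sock in socks:
            sock.bind(('localhost', 0))
        ports = [sock.getsockname()[1] for sock in socks]
        for sock in socks:
            sock.close()
        return ports

    @classmethod
    def _start_processes_with_tf_config(cls, cmd, cwd, cluster_spec):
        # Start one training process per worker, each with a TF_CONFIG
        # describing the whole cluster and its own task index.
        processes = []
        for index in range(len(cluster_spec['worker'])):
            env = os.environ.copy()
            env['TF_CONFIG'] = json.dumps({
                'cluster': cluster_spec,
                'task': {'type': 'worker', 'index': index},
            })
            processes.append(subprocess.Popen(cmd, cwd=cwd, env=env))
        return processes

    @classmethod
    def _wait_for_processes(cls, processes, timeout_seconds):
        # Wait for every worker and fail if any exited with an error.
        for process in processes:
            return_code = process.wait(timeout=timeout_seconds)
            assert return_code == 0, f"worker exited with code {return_code}"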
Example 2
    def test_resnet_50_from_readme(self):
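        # Trains the ResNet-50 configuration from the README on generated
        # ImageNet-like data across two distributed worker processes, then
        # checks that both workers end with identical accuracy, loss and weights.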

        NUM_WORKERS = 2
        WORKER_TIMEOUT_SECONDS = 30 * 60

        with tempfile.TemporaryDirectory() as logdir:
            cmd = [
                'python3', 'train.py',
                '--dataset=imagenet',
                '--generated-data',
                '--model-size=50',
                '--batch-size=4',
                '--batches-per-step=1',
                '--shards=4',
                '--pipeline',
                '--gradient-accumulation-count=64',
                '--pipeline-splits', 'b1/2/relu', 'b2/3/relu', 'b3/5/relu',
                '--xla-recompute',
                '--replicas=2',  # Instead of 4 to make two processes fit on one machine.
                '--distributed',
                '--no-stochastic-rounding',
                '--no-validation',
                '--iterations=100',
                '--learning-rate-schedule=1',
                '--base-learning-rate=-14',
                '--log-dir', logdir,
                '--ckpt-all-instances', "true",
                '--log-all-instances', "true"
            ]

            extra_env = {
                'POPLAR_ENGINE_OPTIONS': '{"opt.maxCopyMergeSize": 8388608}',
            }

            cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))

            worker_ports = self._pick_unique_unused_ports(NUM_WORKERS)
            cluster_spec = {
                'worker': ['localhost:%s' % port for port in worker_ports]
            }

            processes = self._start_processes_with_tf_config(cmd, cwd, extra_env, cluster_spec)
            self._wait_for_processes(processes, WORKER_TIMEOUT_SECONDS)

            worker_log_dirs = self._find_worker_log_dirs(NUM_WORKERS, logdir)
            training_logs = [parse_csv(os.path.join(d, "training.csv")) for d in worker_log_dirs]

            # The final training accuracy should be the same on all workers.
            for i in range(1, NUM_WORKERS):
                self.assertEqual(
                    training_logs[0]['train_acc_avg'][-1],
                    training_logs[i]['train_acc_avg'][-1])

            # The final training loss should be the same on all workers.
            for i in range(1, NUM_WORKERS):
                self.assertEqual(
                    training_logs[0]['loss_avg'][-1],
                    training_logs[i]['loss_avg'][-1])

            # The final weights should be the same on all workers.
            var_names_and_shapes = tf.train.list_variables(worker_log_dirs[0])

            for var_name, _ in var_names_and_shapes:
                value_worker_0 = tf.train.load_variable(worker_log_dirs[0], var_name)

                for i in range(1, NUM_WORKERS):
                    value_worker_i = tf.train.load_variable(worker_log_dirs[i], var_name)
                    self.assertListEqual(value_worker_0.tolist(), value_worker_i.tolist())
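
All of these examples read back each worker's training.csv with a parse_csv helper imported from the test utilities. A minimal sketch of what it might do, assuming it returns a mapping from column name to the list of values in that column:

import csv


def parse_csv(path):
    # Read a training.csv written by train.py and return a dict mapping each
    # column name to the list of values in that column, converted to float
    # where possible so entries like train_acc_avg can be compared numerically.
    columns = {}
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            for name, value in row.items():
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    pass
                columns.setdefault(name, []).append(value)
    return columns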
Example 3
    def test_resnet8(self):
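        # Runs ResNet-8 on generated CIFAR-10 data through poprun with four
        # replicas split across two instances, then checks the per-instance
        # replica count, the final accuracy, and that accuracy, loss and
        # weights agree across instances.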

        TIMEOUT_SECONDS = 5 * 60
        NUM_TOTAL_REPLICAS = 4
        NUM_INSTANCES = 2
        NUM_LOCAL_REPLICAS = NUM_TOTAL_REPLICAS // NUM_INSTANCES

        with tempfile.TemporaryDirectory() as logdir:
            # The buildbot runs as root, so let's allow that.
            cmd = [
                'poprun',
                '--mpi-global-args=--tag-output --allow-run-as-root',
                '--num-replicas=' + str(NUM_TOTAL_REPLICAS),
                '--num-instances=' + str(NUM_INSTANCES),
                sys.executable,
                'train.py',
                '--dataset=cifar-10',
                '--generated-data',
                '--model-size=8',
                '--batch-size=1',
                '--batches-per-step=10',
                '--gradient-accumulation-count=10',
                '--no-validation',
                '--no-stochastic-rounding',
                '--iterations=100',
                '--log-dir', logdir,
                '--name-suffix', 'popdist_instance',
                '--ckpt-all-instances', "true",
                '--log-all-instances', "true"
            ]

            # Add some debug logging.
            extra_env = {
                'POPRUN_LOG_LEVEL': 'TRACE',
                'TF_CPP_VMODULE': 'poplar_compiler=1,poplar_executor=1',
            }

            cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
            env = os.environ.copy()
            env.update(extra_env)
            subprocess.check_call(cmd, cwd=cwd, env=env, timeout=TIMEOUT_SECONDS)

            instance_logdirs = glob.glob(f"{logdir}/*_popdist_instance_*")
            self.assertEqual(len(instance_logdirs), NUM_INSTANCES)

            training_logs = []

            for instance_logdir in instance_logdirs:
                # Check that each instance got the correct number of replicas from popdist.
                with open(os.path.join(instance_logdir, 'arguments.json'), 'r') as f:
                    argument_log = json.load(f)
                self.assertEqual(argument_log['replicas'], NUM_LOCAL_REPLICAS)

                # Check that the final accuracy is decent.
                training_log = parse_csv(os.path.join(instance_logdir, 'training.csv'))
                self.assertGreater(training_log['train_acc_avg'][-1], 95)
                training_logs.append(training_log)

            # The final training accuracy should be the same on all instances.
            for i in range(1, NUM_INSTANCES):
                self.assertEqual(
                    training_logs[0]['train_acc_avg'][-1],
                    training_logs[i]['train_acc_avg'][-1])

            # The final training loss should be the same on all instances.
            for i in range(1, NUM_INSTANCES):
                self.assertEqual(
                    training_logs[0]['loss_avg'][-1],
                    training_logs[i]['loss_avg'][-1])

            # The final weights should be the same on all instances.
            var_names_and_shapes = tf.train.list_variables(instance_logdirs[0])

            for var_name, _ in var_names_and_shapes:
                value_instance_0 = tf.train.load_variable(instance_logdirs[0], var_name)

                for i in range(1, NUM_INSTANCES):
                    value_instance_i = tf.train.load_variable(instance_logdirs[i], var_name)
                    self.assertListEqual(value_instance_0.tolist(), value_instance_i.tolist())
Example 4
    def test_resnet8(self):
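        # Runs ResNet-8 on synthetic CIFAR-10 data through poprun with four
        # replicas split across two instances; also adds the MPI library
        # directories to LD_LIBRARY_PATH and shares an executable cache
        # between instances before making the same cross-instance checks.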

        TIMEOUT_SECONDS = 5 * 60
        NUM_TOTAL_REPLICAS = 4
        NUM_INSTANCES = 2
        NUM_LOCAL_REPLICAS = NUM_TOTAL_REPLICAS // NUM_INSTANCES

        with tempfile.TemporaryDirectory() as logdir:
            # The buildbot runs as root, so let's allow that.
            cmd = [
                'poprun',
                '--mpi-global-args=--tag-output --allow-run-as-root',
                '--num-replicas=' + str(NUM_TOTAL_REPLICAS),
                '--num-instances=' + str(NUM_INSTANCES),
                sys.executable,
                'train.py',
                '--dataset=cifar-10',
                '--synthetic-data',
                '--model-size=8',
                '--batch-size=1',
                '--batches-per-step=10',
                '--gradient-accumulation-count=10',
                '--no-validation',
                '--no-stochastic-rounding',
                '--iterations=100',
                '--log-dir',
                logdir,
                '--name-suffix',
                'popdist_instance',
            ]

            # Add the MPI library dirs on the LD_LIBRARY_PATH, as these are not
            # always searched by default (e.g. on CentOS). A user would normally
            # do "module load mpi/openmpi-x86_64" on CentOS to achieve this instead.
            libdirs = subprocess.check_output(["mpic++", "--showme:libdirs"])
            libdirs = libdirs.decode().strip().replace(" ", ":")
            ld_library_path = "{}:{}".format(
                os.environ.get("LD_LIBRARY_PATH", ""), libdirs)

            # Add some debug logging, and use executable cache which is shared between instances.
            extra_env = {
                'POPRUN_LOG_LEVEL': 'TRACE',
                'TF_CPP_VMODULE': 'poplar_compiler=1,poplar_executor=1',
                'TF_POPLAR_FLAGS':
                f"--executable_cache_path={logdir}/exec_cache",
                'LD_LIBRARY_PATH': ld_library_path,
            }

            cwd = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
            env = os.environ.copy()
            env.update(extra_env)
            subprocess.check_call(cmd,
                                  cwd=cwd,
                                  env=env,
                                  timeout=TIMEOUT_SECONDS)

            instance_logdirs = glob.glob(f"{logdir}/*_popdist_instance_*")
            self.assertEqual(len(instance_logdirs), NUM_INSTANCES)

            training_logs = []

            for instance_logdir in instance_logdirs:
                # Check that each instance got the correct number of replicas from popdist.
                with open(os.path.join(instance_logdir, 'arguments.json'),
                          'r') as f:
                    argument_log = json.load(f)
                self.assertEqual(argument_log['replicas'], NUM_LOCAL_REPLICAS)

                # Check that the final accuracy is decent.
                training_log = parse_csv(
                    os.path.join(instance_logdir, 'training.csv'))
                self.assertGreater(training_log['train_acc_avg'][-1], 95)
                training_logs.append(training_log)

            # The final training accuracy should be the same on all instances.
            for i in range(1, NUM_INSTANCES):
                self.assertEqual(training_logs[0]['train_acc_avg'][-1],
                                 training_logs[i]['train_acc_avg'][-1])

            # The final training loss should be the same on all instances.
            for i in range(1, NUM_INSTANCES):
                self.assertEqual(training_logs[0]['loss_avg'][-1],
                                 training_logs[i]['loss_avg'][-1])

            # The final weights should be the same on all instances.
            var_names_and_shapes = tf.train.list_variables(instance_logdirs[0])

            for var_name, _ in var_names_and_shapes:
                value_instance_0 = tf.train.load_variable(
                    instance_logdirs[0], var_name)

                for i in range(1, NUM_INSTANCES):
                    value_instance_i = tf.train.load_variable(
                        instance_logdirs[i], var_name)
                    self.assertListEqual(value_instance_0.tolist(),
                                         value_instance_i.tolist())