Example no. 1
def wait_for_training_examples(state, num_games):
    """Wait for training examples to be generated by the latest model.

    Args:
        state: the RL loop State instance.
        num_games: number of games to wait for.
    """

    model_dir = os.path.join(FLAGS.selfplay_dir, state.selfplay_model_name)
    pattern = os.path.join(model_dir, '*', '*', '*.tfrecord.zz')
    for i in itertools.count():
        try:
            paths = sorted(tf.io.gfile.glob(pattern))
        except tf.errors.OpError:
            paths = []
        if len(paths) >= num_games:
            mllogger = mllog.get_mllogger()
            mllog.config(filename="train.log")

            mllogger.event(key='actual_selfplay_games_per_generation',
                           value=len(paths))
            break
        if i % 30 == 0:
            logging.info('Waiting for %d games in %s (found %d)', num_games,
                         model_dir, len(paths))
        time.sleep(1)
def main(unused_argv):
    models = load_train_times()

    # Skip all models earlier than start and apply step.
    models = [x for x in models if int(x[1]) >= FLAGS.start][::FLAGS.step]

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for i, (timestamp, name, path) in enumerate(models):
        epoch_num = FLAGS.start + i
        mllogger.start(key=mllog.constants.EVAL_START, value=epoch_num)
        winrate = evaluate_model(path, epoch_num)
        mllogger.end(key=mllog.constants.EVAL_STOP, value=epoch_num)
        if winrate >= FLAGS.winrate:
            print('Model {} beat target after {}s'.format(name, timestamp))
            break

    mllogger.event(key='eval_games', value=len(models))
    mllogger.event(key='gating_win_rate', value=FLAGS.winrate)

    mllogger.end(key=mllog.constants.RUN_STOP, value="success")
Example no. 3
def mlperf_submission_log(benchmark):

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
        )

    log_event(
        key=constants.SUBMISSION_ORG,
        value='NVIDIA')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
Example no. 4
def export_model(model_path):
    """Take the latest checkpoint and copy it to model_path.

    Assumes that all relevant model files are prefixed by the same name.
    (For example, foo.index, foo.meta and foo.data-00000-of-00001).

    Args:
        model_path: The path (can be a gs:// path) to export model
    """
    FLAGS.use_bfloat16 = False
    estimator = tf.estimator.Estimator(model_fn,
                                       model_dir=FLAGS.work_dir,
                                       params=FLAGS.flag_values_dict())
    latest_checkpoint = estimator.latest_checkpoint()
    all_checkpoint_files = tf.io.gfile.glob(latest_checkpoint + '*')
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for filename in all_checkpoint_files:
        suffix = filename.partition(latest_checkpoint)[2]
        destination_path = model_path + suffix
        logging.info('Copying {} to {}'.format(filename, destination_path))
        tf.io.gfile.copy(filename, destination_path)
def mlperf_submission_log(benchmark):
    required_dist_init = ['RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT']

    if all(var in os.environ for var in required_dist_init):
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )

    log_event(key=constants.SUBMISSION_ORG, value='Fujitsu')

    log_event(key=constants.SUBMISSION_DIVISION, value='closed')

    log_event(key=constants.SUBMISSION_STATUS, value='onprem')

    log_event(key=constants.SUBMISSION_PLATFORM, value='1xGX2570M5')
Example no. 6
def main(argv):
    """Train on examples and export the updated model weights."""
    tf_records = argv[1:]
    logging.info("Training on %s records: %s to %s", len(tf_records),
                 tf_records[0], tf_records[-1])

    if FLAGS.dist_train:
        hvd.init()

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    with utils.logged_timer("Training"):
        train(*tf_records)
    if (not FLAGS.dist_train) or hvd.rank() == 0:
        if FLAGS.export_path:
            dual_net.export_model(FLAGS.export_path)
            epoch = int(os.path.basename(FLAGS.export_path))
            mllogger.event(key="save_model", value={"Iteration": epoch})
        if FLAGS.freeze:
            dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt,
                                  FLAGS.trt_max_batch_size,
                                  FLAGS.trt_precision,
                                  FLAGS.selfplay_precision)
Example no. 7
    def __init__(self, filename, benchmark, organization):
        self.mllogger = mllog.get_mllogger()
        self.comm_rank = comm.get_rank()
        self.comm_size = comm.get_size()
        self.constants = constants

        # create logging dir if it does not exist
        logdir = os.path.dirname(filename)
        if self.comm_rank == 0:
            if not os.path.isdir(logdir):
                os.makedirs(logdir)
        if torch.distributed.is_available(
        ) and torch.distributed.is_initialized():
            torch.distributed.barrier()

        # create config
        mllog.config(filename=filename)
        self.mllogger.logger.propagate = False
        self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)

        self.log_event(key=constants.SUBMISSION_ORG, value=organization)

        self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')

        self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')

        self.log_event(
            key=constants.SUBMISSION_PLATFORM,
            value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
Example no. 8
def maybe_set_seed():

    if FLAGS.training_seed != 0:
        random.seed(FLAGS.training_seed)
        tf.set_random_seed(FLAGS.training_seed)
        np.random.seed(FLAGS.training_seed)
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.SEED, value=FLAGS.training_seed)
Example no. 9
def log_submission_info(benchmark='cosmoflow',
                        org='UNDEFINED',
                        division='UNDEFINED',
                        status='UNDEFINED',
                        platform='UNDEFINED'):
    """Log general MLPerf submission details from config"""
    mllogger = mllog.get_mllogger()
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value=benchmark)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value=org)
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value=division)
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value=status)
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value=platform)
Example no. 10
def build_model(input_shape,
                target_size,
                conv_size=32,
                kernel_size=3,
                n_conv_layers=5,
                fc1_size=128,
                fc2_size=64,
                l2=0,
                hidden_activation='LeakyReLU',
                pooling_type='MaxPool3D',
                dropout=0.5):
    """Construct the CosmoFlow 3D CNN model"""

    if have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY, value=l2)
        mllogger.event(key='dropout', value=dropout)

    conv_args = dict(kernel_size=kernel_size, padding='same')
    hidden_activation = getattr(layers, hidden_activation)
    pooling_type = getattr(layers, pooling_type)

    model = tf.keras.models.Sequential()

    # First convolutional layer
    model.add(layers.Conv3D(conv_size, input_shape=input_shape, **conv_args))
    model.add(hidden_activation())
    model.add(pooling_type(pool_size=2))

    # Additional conv layers
    for i in range(1, n_conv_layers):
        # Double conv channels at every layer
        model.add(layers.Conv3D(conv_size * 2**i, **conv_args))
        model.add(hidden_activation())
        model.add(pooling_type(pool_size=2))
    model.add(layers.Flatten())

    # Fully-connected layers
    model.add(layers.Dense(fc1_size, kernel_regularizer=regularizers.l2(l2)))
    model.add(hidden_activation())
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(fc2_size, kernel_regularizer=regularizers.l2(l2)))
    model.add(hidden_activation())
    model.add(layers.Dropout(dropout))

    # Output layers
    model.add(layers.Dense(target_size, activation='tanh'))
    model.add(layers.Lambda(scale_1p2))

    return model
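

# Illustrative usage sketch (not part of the original snippet): the input shape
# and target size below are placeholder values, and the call assumes the
# module-level names used inside build_model (layers, regularizers, scale_1p2)
# are available as in the original CosmoFlow code.
# example_model = build_model(input_shape=(128, 128, 128, 4), target_size=4)
# example_model.summary()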
Example no. 11
def get_optimizer(name, distributed=False, **opt_args):
    """Configure the optimizer"""

    # MLPerf logging
    if utils.distributed.rank() == 0 and have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_NAME, value=name)

    # Construct the optimizer
    OptType = getattr(keras.optimizers, name)
    opt = OptType(**opt_args)

    # Distributed optimizer wrapper
    if distributed:
        opt = hvd.DistributedOptimizer(opt)

    return opt
Example no. 12
def get_mllog_mlloger():
    from mlperf_logging import mllog
    from mlperf_compliance import tf_mlperf_log

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    filenames = "resnet50v1.5.log-" + str_hvd_rank
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog, tf_mlperf_log
Example no. 13
    def test_mllog_end_simple(self):
        prefix = ":::MLLOG"
        expected_log_json = json.dumps(
            json.loads(r'''
        {
          "namespace": "",
          "time_ms": 1234567890123,
          "event_type": "INTERVAL_END",
          "key": "run_stop",
          "value": null,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''',
                       object_pairs_hook=collections.OrderedDict))
        expected_output = " ".join([prefix, expected_log_json])
        with _captured_stdout() as out:
            mllogger = mllog.get_mllogger()
            mllogger.end(mllog.constants.RUN_STOP, None)
            self.assertEqual(out.getvalue().splitlines()[0], expected_output)
Example no. 14
    def test_mllog_event_simple(self):
        prefix = ":::MLLOG"
        expected_log_json = json.dumps(
            json.loads(r'''
        {
          "namespace": "",
          "time_ms": 1234567890123,
          "event_type": "POINT_IN_TIME",
          "key": "eval_accuracy",
          "value": 0.99,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''',
                       object_pairs_hook=collections.OrderedDict))
        expected_output = " ".join([prefix, expected_log_json])
        with _captured_stdout() as out:
            mllogger = mllog.get_mllogger()
            mllogger.event(mllog.constants.EVAL_ACCURACY, 0.99)
            self.assertEqual(out.getvalue().splitlines()[0], expected_output)
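

# Companion sketch (not from the original test file): a tiny helper showing how
# the ":::MLLOG <json>" lines asserted above can be parsed back into a dict.
# The prefix and payload layout are taken from the expected_output strings in
# these tests; anything beyond that is an assumption.
def parse_mllog_line(line):
    prefix = ":::MLLOG"
    assert line.startswith(prefix), "not an MLLOG record"
    return json.loads(line[len(prefix):].strip(),
                      object_pairs_hook=collections.OrderedDict)

# e.g. parse_mllog_line(out.getvalue().splitlines()[0])["key"] == "eval_accuracy"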
Example no. 15
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    mllogger.event(key=mllog.constants.OPT_BASE_LR, value=FLAGS.lr_rates)
    mllogger.event(key='lr_rates', value=FLAGS.lr_rates)
    mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                   value=FLAGS.lr_boundaries[1])
    mllogger.event(key='lr_boundaries', value=FLAGS.lr_boundaries[1])
    mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY,
                   value=FLAGS.l2_strength)
    mllogger.event(key='opt_learning_rate_decay_boundary_steps',
                   value=FLAGS.lr_boundaries)
    mllogger.event(key='train_batch_size', value=FLAGS.train_batch_size)
Example no. 16
def evaluate_model(eval_model_path, epoch):
    processes = []
    for i, device in enumerate(FLAGS.devices):
        a = i * FLAGS.num_games // len(FLAGS.devices)
        b = (i + 1) * FLAGS.num_games // len(FLAGS.devices)
        num_games = b - a

        env = os.environ.copy()
        env['CUDA_VISIBLE_DEVICES'] = device
        processes.append(
            checked_run([
                'numactl', '--physcpubind={}'.format(i), 'bazel-bin/cc/eval',
                '--flagfile={}'.format(
                    os.path.join(FLAGS.flags_dir, 'eval.flags')),
                '--eval_model={}'.format(eval_model_path),
                '--target_model={}'.format(
                    FLAGS.target), '--sgf_dir={}'.format(FLAGS.sgf_dir),
                '--parallel_games={}'.format(num_games), '--eval_device=cpu',
                '--target_device=cpu', '--verbose=false'
            ], env, False))
    all_output = wait(processes)

    total_wins = 0
    total_num_games = 0
    for output in all_output:
        lines = output.split('\n')

        eval_stats, target_stats = parse_win_stats_table(lines[-7:])
        num_games = eval_stats.total_wins + target_stats.total_wins
        total_wins += eval_stats.total_wins
        total_num_games += num_games

    mllogger = mllog.get_mllogger()
    mllogger.event(key=mllog.constants.EVAL_SAMPLES, value=total_num_games)

    win_rate = total_wins / total_num_games
    logging.info('Win rate %s vs %s: %.3f', eval_stats.model_name,
                 target_stats.model_name, win_rate)

    mllogger.event(key=mllog.constants.EVAL_ACCURACY,
                   value=win_rate,
                   metadata={"epoch_num": epoch})

    return win_rate
Example no. 17
def mx_resnet_print(key, val=None, metadata=None, deferred=False, stack_offset=1,
                    sync=False, uniq=True):
    rank = mpiwrapper.rank()
    if sync:
        mpiwrapper.barrier()

    if (uniq and rank == 0) or (not uniq):
        mllogger = mllog.get_mllogger()
        if key == mlperf_constants.RUN_START:
            mllogger.start(key=key, value=val, metadata=metadata)
        elif key == mlperf_constants.RUN_STOP:
            mllogger.end(key=key, value=val, metadata=metadata)
        else:
            mllogger.event(key=key, value=val, metadata=metadata)

    if sync:
        mpiwrapper.barrier()

    return
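

# Illustrative calls (not from the original file), assuming the surrounding
# module's mlperf_constants import; the value and metadata shown are
# placeholders, not measurements from a real run.
# mx_resnet_print(mlperf_constants.RUN_START, sync=True)
# mx_resnet_print(mlperf_constants.EVAL_ACCURACY, val=0.759,
#                 metadata={'epoch_num': 3}, uniq=True)
# mx_resnet_print(mlperf_constants.RUN_STOP, metadata={'status': 'success'},
#                 sync=True)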
Example no. 18
def get_mllog_mlloger(output_dir=None):
    from mlperf_logging import mllog

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    if output_dir is None:
        output_dir = './log'
    filenames = os.path.normpath(output_dir) + "/result_rank_" + str_hvd_rank + ".txt"
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath("/tmp/"))

    mllogger.event(key='num_readouts', value=FLAGS.num_readouts)
    mllogger.event(key='value_init_penalty', value=FLAGS.value_init_penalty)
    mllogger.event(key='holdout_pct', value=FLAGS.holdout_pct)
    mllogger.event(key='disable_resign_pct', value=FLAGS.disable_resign_pct)
    mllogger.event(key='min_resign_threshold',
                   value=FLAGS.min_resign_threshold)
    mllogger.event(key='max_resign_threshold',
                   value=FLAGS.max_resign_threshold)
    mllogger.event(key='selfplay_threads', value=FLAGS.selfplay_threads)
    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)
    mllogger.event(key='virtual_losses', value=FLAGS.virtual_losses)
Example no. 20
def main(argv):
    """Entry point for running one selfplay game."""
    del argv  # Unused
    flags.mark_flag_as_required('load_file')
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)

    run_game(
        load_file=FLAGS.load_file,
        selfplay_dir=FLAGS.selfplay_dir,
        holdout_dir=FLAGS.holdout_dir,
        holdout_pct=FLAGS.holdout_pct,
        sgf_dir=FLAGS.sgf_dir)
Example no. 21
    def test_mllog_event_override_param(self):
        prefix = ":::MLLOG"
        expected_log_json = json.dumps(
            json.loads(r'''
        {
          "namespace": "worker1",
          "time_ms": 1231231230123,
          "event_type": "POINT_IN_TIME",
          "key": "eval_accuracy",
          "value": 0.99,
          "metadata": {"file": "mybenchmark/file.py", "lineno": 42}
        }''',
                       object_pairs_hook=collections.OrderedDict))
        expected_output = "\n" + " ".join([prefix, expected_log_json]) + "\n"
        with _captured_stdout() as out:
            mllogger = mllog.get_mllogger()
            mllogger.event(mllog.constants.EVAL_ACCURACY,
                           0.99,
                           namespace="worker1",
                           time_ms=1231231230123,
                           clear_line=True)
            self.assertEqual(out.getvalue(), expected_output)
Example no. 22
def get_lr_schedule(base_lr,
                    global_batch_size,
                    base_batch_size=None,
                    scaling=None,
                    n_warmup_epochs=0,
                    decay_schedule=None):
    """Get the learning rate schedule function"""
    if decay_schedule is None:
        decay_schedule = {}
    if scaling == 'linear':
        scale_factor = global_batch_size / base_batch_size
    elif scaling == 'sqrt':
        scale_factor = math.sqrt(global_batch_size / base_batch_size)
    else:
        scale_factor = 1.
    peak_lr = base_lr * scale_factor

    # MLPerf logging
    # NOTE: there is currently a confusing mismatch between the parameter
    # naming convention in this implementation and MLPerf's hyperparameter
    # conventions. Here we define base LR to be the LR at a baseline batch
    # size and the "peak" LR to be the value scaled according to current batch
    # size. We will leave things as-is for now.
    if utils.distributed.rank() == 0 and have_mlperf_logging:
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.OPT_BASE_LR, value=peak_lr)
        mllogger.event(key=mllog.constants.OPT_LR_WARMUP_EPOCHS,
                       value=n_warmup_epochs)
        mllogger.event(key=mllog.constants.OPT_LR_WARMUP_FACTOR,
                       value=scale_factor)
        mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                       value=sorted(decay_schedule.keys()))
        mllogger.event(key=mllog.constants.OPT_LR_DECAY_FACTOR,
                       value=max(decay_schedule.values())
                       if len(decay_schedule) > 0 else 1)
    return partial(_lr_schedule,
                   base_lr=base_lr,
                   peak_lr=peak_lr,
                   n_warmup_epochs=n_warmup_epochs,
                   decay_schedule=decay_schedule)
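

# Worked example of the scaling logic above (values are illustrative only):
# with base_lr=0.001, base_batch_size=64 and global_batch_size=512,
#   'linear' scaling gives scale_factor = 512 / 64 = 8       -> peak_lr = 0.008
#   'sqrt' scaling gives scale_factor = sqrt(512 / 64) ~ 2.83 -> peak_lr ~ 0.00283
# so OPT_BASE_LR above is logged as the scaled (peak) value, per the NOTE.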
Example no. 23
def configure_logger(benchmark):
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
Example no. 24
#           http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import torch

from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()


def configure_logger(benchmark):
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False


def log_start(*args, **kwargs):
    _log(mllogger.start, *args, **kwargs)


def log_end(*args, **kwargs):
    _log(mllogger.end, *args, **kwargs)
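

# The _log helper used by log_start/log_end is not included in this excerpt.
# Below is a minimal sketch of what such a wrapper typically does (rank-0
# gating with an optional barrier); the 'sync' keyword and the distributed
# checks are assumptions, not code from the original file.
def log_event(*args, **kwargs):
    _log(mllogger.event, *args, **kwargs)


def _log(log_fn, *args, sync=False, **kwargs):
    distributed = (torch.distributed.is_available()
                   and torch.distributed.is_initialized())
    if sync and distributed:
        torch.distributed.barrier()
    rank = torch.distributed.get_rank() if distributed else 0
    if rank == 0:
        log_fn(*args, **kwargs)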
Example no. 25
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert (len(args.lr) == 1)
    log_event(key=constants.OPT_BASE_LR,
              value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert (args.max_source_positions == args.max_target_positions)
    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_target_positions,
              metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(
        ctypes.c_int(0x05), ctypes.c_int(128))
    result = ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(
        pValue, ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(
        f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)

    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )

        trainer = Trainer(args,
                          task,
                          model,
                          criterion,
                          allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe='@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)

    log_start(key=constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])

    log_event(key=constants.TRAIN_SAMPLES,
              value=len(task.dataset(args.train_subset)),
              sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(task.dataset(args.gen_subset)),
              sync=False)

    ctr = 0

    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update and current_bleu < tgt_bleu:
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={
                      'first_epoch_num': first_epoch,
                      'epoch_count': 1
                  },
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': first_epoch},
                  sync=False)

        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': first_epoch},
                sync=False)

        # Eval BLEU score
        if args.online_eval or tgt_bleu is not math.inf:
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': first_epoch},
                sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
    def __init__(self, metric='val_mae', log_key='eval_error'):
        self.mllogger = mllog.get_mllogger()
        self.metric = metric
        self.log_key = log_key
Example no. 27
def train(*tf_records: "Records to train on"):
    """Train on examples."""

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    estimator = dual_net.get_estimator(FLAGS.num_intra_threads,
                                       FLAGS.num_inter_threads)

    if FLAGS.dist_train:
        effective_batch_size = int(FLAGS.train_batch_size / hvd.size())
        global_batch_size = effective_batch_size * hvd.size()
        mllogger = mllog.get_mllogger()
        mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE,
                       value=global_batch_size)
    else:
        effective_batch_size = FLAGS.train_batch_size
        global_batch_size = FLAGS.train_batch_size

    logging.info("Real global batch size = {}, local batch size = {}.".format(
        global_batch_size, effective_batch_size))

    if FLAGS.use_tpu:
        effective_batch_size *= FLAGS.num_tpu_cores

    if FLAGS.use_tpu:
        if FLAGS.use_bt:

            def _input_fn(params):
                games = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                 FLAGS.cbt_instance,
                                                 FLAGS.cbt_table)
                games_nr = bigtable_input.GameQueue(FLAGS.cbt_project,
                                                    FLAGS.cbt_instance,
                                                    FLAGS.cbt_table + '-nr')
                return preprocessing.get_tpu_bt_input_tensors(
                    games,
                    games_nr,
                    params['batch_size'],
                    params['input_layout'],
                    number_of_games=FLAGS.window_size,
                    random_rotation=True)
        else:

            def _input_fn(params):
                return preprocessing.get_tpu_input_tensors(
                    params['batch_size'],
                    params['input_layout'],
                    tf_records,
                    filter_amount=FLAGS.filter_amount,
                    shuffle_examples=FLAGS.shuffle_examples,
                    shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                    random_rotation=True)

        # Hooks are broken with TPUEstimator at the moment.
        hooks = []
    else:

        def _input_fn():
            return preprocessing.get_input_tensors(
                effective_batch_size,
                FLAGS.input_layout,
                tf_records,
                filter_amount=FLAGS.filter_amount,
                shuffle_examples=FLAGS.shuffle_examples,
                shuffle_buffer_size=FLAGS.shuffle_buffer_size,
                random_rotation=True,
                seed=FLAGS.training_seed,
                dist_train=FLAGS.dist_train,
                use_bf16=FLAGS.use_bfloat16)

        hooks = [
            UpdateRatioSessionHook(FLAGS.work_dir),
            EchoStepCounterHook(output_dir=FLAGS.work_dir)
        ]
        if FLAGS.dist_train:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    steps = FLAGS.steps_to_train
    if not steps and FLAGS.num_examples:
        batch_size = effective_batch_size
        if FLAGS.use_tpu:
            batch_size *= FLAGS.num_tpu_cores
        steps = math.floor(FLAGS.num_examples / batch_size)

    logging.info("Training, steps = %s, batch = %s -> %s examples", steps
                 or '?', effective_batch_size,
                 (steps * effective_batch_size) if steps else '?')

    if FLAGS.use_bt:
        games = bigtable_input.GameQueue(FLAGS.cbt_project, FLAGS.cbt_instance,
                                         FLAGS.cbt_table)
        if not games.read_wait_cell():
            games.require_fresh_games(20000)
        latest_game = games.latest_game_number
        index_from = max(latest_game, games.read_wait_cell())
        print("== Last game before training:", latest_game, flush=True)
        print("== Wait cell:", games.read_wait_cell(), flush=True)

    try:
        estimator.train(_input_fn, steps=steps, hooks=hooks)
        if FLAGS.use_bt:
            bigtable_input.set_fresh_watermark(games, index_from,
                                               FLAGS.window_size)
    except:
        if FLAGS.use_bt:
            games.require_fresh_games(0)
        raise
Example no. 28
"""
Utilities for MLPerf logging
"""
import collections
import os
import subprocess

from mlperf_logging import mllog
from mlperf_logging.mllog import constants

import torch

_MLLOGGER = mllog.get_mllogger()


def log_start(*args, **kwargs):
    "log with start tag"
    _log_print(_MLLOGGER.start, *args, **kwargs)


def log_end(*args, **kwargs):
    "log with end tag"
    _log_print(_MLLOGGER.end, *args, **kwargs)


def log_event(*args, **kwargs):
    "log with event tag"
    _log_print(_MLLOGGER.event, *args, **kwargs)


def _log_print(logger, *args, **kwargs):
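    # The original body is cut off in this excerpt. The indented sketch below
    # is a hedged reconstruction based on similar MLPerf reference utilities;
    # the 'sync' / 'log_all_ranks' keywords and the rank handling are
    # assumptions.
    if kwargs.pop('sync', False) and torch.distributed.is_initialized():
        torch.distributed.barrier()
    if kwargs.pop('log_all_ranks', False):
        log = True
    else:
        rank = (torch.distributed.get_rank()
                if torch.distributed.is_initialized() else 0)
        log = (rank == 0)
    if log:
        logger(*args, **kwargs)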
Example no. 29
def main():
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'unet3d.log'))
    mllog.config(filename=os.path.join("/results", 'unet3d.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    mllog_start(key=constants.INIT_START)

    flags = PARSER.parse_args()
    dllogger = get_dllogger(flags)
    local_rank = flags.local_rank
    device = get_device(local_rank)
    is_distributed = init_distributed()
    world_size = get_world_size()
    local_rank = get_rank()
    worker_seeds, shuffling_seeds = setup_seeds(flags.seed, flags.epochs,
                                                device)
    worker_seed = worker_seeds[local_rank]
    seed_everything(worker_seed)
    mllog_event(key=constants.SEED,
                value=flags.seed if flags.seed != -1 else worker_seed,
                sync=False)

    if is_main_process and flags.verbose:
        mlperf_submission_log()
        mlperf_run_param_log(flags)

    callbacks = get_callbacks(flags, dllogger, local_rank, world_size)
    flags.seed = worker_seed
    model = Unet3D(1,
                   3,
                   normalization=flags.normalization,
                   activation=flags.activation)

    mllog_end(key=constants.INIT_STOP, sync=True)
    mllog_start(key=constants.RUN_START, sync=True)
    train_dataloader, val_dataloader = get_data_loaders(flags,
                                                        num_shards=world_size)
    mllog_event(key=constants.GLOBAL_BATCH_SIZE,
                value=flags.batch_size * world_size,
                sync=False)
    loss_fn = DiceCELoss(to_onehot_y=True,
                         use_softmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)
    score_fn = DiceScore(to_onehot_y=True,
                         use_argmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)

    if flags.exec_mode == 'train':
        train(flags,
              model,
              train_dataloader,
              val_dataloader,
              loss_fn,
              score_fn,
              device=device,
              callbacks=callbacks,
              is_distributed=is_distributed)

    elif flags.exec_mode == 'evaluate':
        eval_metrics = evaluate(flags,
                                model,
                                val_dataloader,
                                loss_fn,
                                score_fn,
                                device=device,
                                is_distributed=is_distributed)
        if local_rank == 0:
            for key in eval_metrics.keys():
                print(key, eval_metrics[key])
    else:
        print("Invalid exec_mode.")
Example no. 30
def dummy_example():
    """Example usage of mllog"""

    # Get the mllogger instance, this needs to be called in every module that
    # needs logging
    mllogger = mllog.get_mllogger()

    # Customize mllogger configuration
    # These configurations only need to be set Once in your entire program.
    # Try tweaking the following configurations to see the difference.
    #   logger: Customize the underlying logger to change the logging behavior.
    #   filename: a log file to use. If set, a default file handler will be added
    #     to the logger so it can log to the specified file. For more advanced
    #     customizations, please set the 'logger' parameter instead.
    #   default_namespace: the default namespace to use if one isn't provided.
    #   default_stack_offset: the default depth to go into the stack to find
    #     the call site.
    #   default_clear_line: the default behavior of line clearing (i.e. print
    #     an extra new line to clear any pre-existing text in the log line).
    #   root_dir: directory prefix which will be trimmed when reporting calling
    #     file for logging.

    # Customize the underlying logger to use a file in addition to stdout.
    # 1. Simple way
    # Provide a filename, this adds a log file with default behavior.
    mllog.config(filename="example_simple.log")
    # 2. Advanced way
    # You may pass a logging.Logger instance to mllog.config().
    # To use the advanced way, comment out the "Simple way" above and uncomment
    # the followings:
    #
    # # Notice that proper log level needs to be set for both logger and handler.
    # logger = logging.getLogger("custom_logger")
    # logger.propagate = False
    # logger.setLevel(logging.DEBUG)
    # # add file handler for file logging
    # _file_handler = logging.FileHandler("example_advanced.log")
    # _file_handler.setLevel(logging.DEBUG)
    # logger.addHandler(_file_handler)
    # # add stream handler for stdout logging
    # _stream_handler = logging.StreamHandler(stream=sys.stdout)
    # _stream_handler.setLevel(logging.INFO)
    # logger.addHandler(_stream_handler)
    # mllog.config(logger=logger)

    # Set other logger configurations
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath(
                     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..")))

    # Example log messages
    # The methods to use are "start", "end", and "event".
    # You may check out the detailed APIs in mllog.mllog.
    # Try to use the keys from mllog.constants to avoid wrong keys.
    mllogger.start(key=mllog.constants.INIT_START)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM,
                   value="1 node x 8s CPX")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem")
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_NAME,
                   value="Wei Wang, Christine Cheng")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_EMAIL,
                   value="[email protected], [email protected]")
    mllogger.event(key=mllog.constants.TRAIN_SAMPLES, value=1281167)
    mllogger.event(key="lars_opt_momentum", value=0.9)
    mllogger.end(key=mllog.constants.INIT_STOP)
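
    # For reference (not part of the original example): each call above writes
    # one line to stdout and to example_simple.log in the ":::MLLOG <json>"
    # format shown in the tests earlier, e.g. roughly
    #   :::MLLOG {"namespace": "worker1", "event_type": "POINT_IN_TIME",
    #             "key": "submission_org", "value": "Intel", ...}
    # with time_ms and the file/lineno metadata depending on the actual run.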