Example #1
def wait_for_training_examples(state, num_games):
    """Wait for training examples to be generated by the latest model.

    Args:
        state: the RL loop State instance.
        num_games: number of games to wait for.
    """

    model_dir = os.path.join(FLAGS.selfplay_dir, state.selfplay_model_name)
    pattern = os.path.join(model_dir, '*', '*', '*.tfrecord.zz')
    for i in itertools.count():
        try:
            paths = sorted(tf.io.gfile.glob(pattern))
        except tf.errors.OpError:
            paths = []
        if len(paths) >= num_games:
            mllogger = mllog.get_mllogger()
            mllog.config(filename="train.log")

            mllogger.event(key='actual_selfplay_games_per_generation',
                           value=len(paths))
            break
        if i % 30 == 0:
            logging.info('Waiting for %d games in %s (found %d)', num_games,
                         model_dir, len(paths))
        time.sleep(1)
Example #2
    def __init__(self, filename, benchmark, organization):
        self.mllogger = mllog.get_mllogger()
        self.comm_rank = comm.get_rank()
        self.comm_size = comm.get_size()
        self.constants = constants

        # create logging dir if it does not exist
        logdir = os.path.dirname(filename)
        if self.comm_rank == 0:
            if not os.path.isdir(logdir):
                os.makedirs(logdir)
        if (torch.distributed.is_available()
                and torch.distributed.is_initialized()):
            torch.distributed.barrier()

        # create config
        mllog.config(filename=filename)
        self.mllogger.logger.propagate = False
        self.log_event(key=constants.SUBMISSION_BENCHMARK, value=benchmark)

        self.log_event(key=constants.SUBMISSION_ORG, value=organization)

        self.log_event(key=constants.SUBMISSION_DIVISION, value='closed')

        self.log_event(key=constants.SUBMISSION_STATUS, value='onprem')

        self.log_event(
            key=constants.SUBMISSION_PLATFORM,
            value=f'{self.comm_size}xSUBMISSION_PLATFORM_PLACEHOLDER')
Example #3
def export_model(model_path):
    """Take the latest checkpoint and copy it to model_path.

    Assumes that all relevant model files are prefixed by the same name.
    (For example, foo.index, foo.meta and foo.data-00000-of-00001).

    Args:
        model_path: The path (can be a gs:// path) to export model
    """
    FLAGS.use_bfloat16 = False
    estimator = tf.estimator.Estimator(model_fn,
                                       model_dir=FLAGS.work_dir,
                                       params=FLAGS.flag_values_dict())
    latest_checkpoint = estimator.latest_checkpoint()
    all_checkpoint_files = tf.io.gfile.glob(latest_checkpoint + '*')
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for filename in all_checkpoint_files:
        suffix = filename.partition(latest_checkpoint)[2]
        destination_path = model_path + suffix
        logging.info('Copying {} to {}'.format(filename, destination_path))
        tf.io.gfile.copy(filename, destination_path)
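
The suffix-copy step above can be illustrated on its own. A minimal local-filesystem sketch of the same idea, using plain glob and shutil instead of tf.io.gfile (the helper name is illustrative, not part of the example above):

import glob
import shutil


def copy_checkpoint_files(latest_checkpoint, model_path):
    # All files belonging to one checkpoint share the same prefix,
    # e.g. foo.index, foo.meta and foo.data-00000-of-00001.
    for filename in glob.glob(latest_checkpoint + '*'):
        # partition() keeps only the part after the prefix ('.index', ...).
        suffix = filename.partition(latest_checkpoint)[2]
        # The destination directory is assumed to exist already.
        shutil.copyfile(filename, model_path + suffix)
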
def main(unused_argv):
    models = load_train_times()

    # Skip all models earlier than start and apply step.
    models = [x for x in models if int(x[1]) >= FLAGS.start][::FLAGS.step]

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    for i, (timestamp, name, path) in enumerate(models):
        epoch_num = FLAGS.start + i
        mllogger.start(key=mllog.constants.EVAL_START, value=epoch_num)
        winrate = evaluate_model(path, epoch_num)
        mllogger.end(key=mllog.constants.EVAL_STOP, value=epoch_num)
        if winrate >= FLAGS.winrate:
            print('Model {} beat target after {}s'.format(name, timestamp))
            break

    mllogger.event(key='eval_games', value=len(models))
    mllogger.event(key='gating_win_rate', value=FLAGS.winrate)

    mllogger.end(key=mllog.constants.RUN_STOP, value="success")
Example #5
def main(argv):
    """Train on examples and export the updated model weights."""
    tf_records = argv[1:]
    logging.info("Training on %s records: %s to %s", len(tf_records),
                 tf_records[0], tf_records[-1])

    if FLAGS.dist_train:
        hvd.init()

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    with utils.logged_timer("Training"):
        train(*tf_records)
    if (not FLAGS.dist_train) or hvd.rank() == 0:
        if FLAGS.export_path:
            dual_net.export_model(FLAGS.export_path)
            epoch = int(os.path.basename(FLAGS.export_path))
            mllogger.event(key="save_model", value={"Iteration": epoch})
        if FLAGS.freeze:
            dual_net.freeze_graph(FLAGS.export_path, FLAGS.use_trt,
                                  FLAGS.trt_max_batch_size,
                                  FLAGS.trt_precision,
                                  FLAGS.selfplay_precision)
def mlperf_submission_log(benchmark):
    required_dist_init = ['RANK', 'WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT']

    if all(var in os.environ for var in required_dist_init):
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
    )

    log_event(key=constants.SUBMISSION_ORG, value='Fujitsu')

    log_event(key=constants.SUBMISSION_DIVISION, value='closed')

    log_event(key=constants.SUBMISSION_STATUS, value='onprem')

    log_event(key=constants.SUBMISSION_PLATFORM, value='1xGX2570M5')
Example #7
def mlperf_submission_log(benchmark):

    num_nodes = os.environ.get('SLURM_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark,
        )

    log_event(
        key=constants.SUBMISSION_ORG,
        value='NVIDIA')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='closed')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER')
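
The log_event helper called in the two submission-log functions above is defined elsewhere in those repositories. A minimal sketch of what such a wrapper typically looks like (an assumption about its shape, not the exact upstream implementation):

from mlperf_logging import mllog

mllogger = mllog.get_mllogger()


def log_event(*args, **kwargs):
    # Forward directly to the shared mllogger instance so that call sites
    # do not need to hold the logger themselves.
    mllogger.event(*args, **kwargs)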
Example #8
def main(_):
    if not tf.gfile.Exists(FLAGS.mlperf_log_dir):
        print("Creating directory %s" % FLAGS.mlperf_log_dir)
        tf.gfile.MakeDirs(FLAGS.mlperf_log_dir)
    mllog.config(filename=os.path.join(FLAGS.mlperf_log_dir,
                                       "mlperf_compliance.log"),
                 root_dir=os.path.normpath(
                     os.path.dirname(os.path.realpath(__file__))))

    mllogger.start(key=mllog_const.INIT_START)
    # Set logging level to INFO to display training progress (logged by the
    # estimator)
    tf.logging.set_verbosity(tf.logging.INFO)

    # Set random seed.
    if FLAGS.random_seed is None:
        raise Exception('No random seed given')
    print('Setting random seed = ', FLAGS.random_seed)
    seed = FLAGS.random_seed
    random.seed(seed)
    tf.set_random_seed(seed)
    numpy.random.seed(seed)

    # Determine training schedule based on flags.
    if FLAGS.train_steps is not None and FLAGS.train_epochs is not None:
        raise ValueError(
            "Both --train_steps and --train_epochs were set. Only one "
            "may be defined.")
    if FLAGS.train_steps is None and FLAGS.train_epochs is None:
        FLAGS.train_epochs = mlbox_const.DEFAULT_TRAIN_EPOCHS

    params = mlbox_model_params.MLBoxTransformerParams(FLAGS)

    # Make sure that the BLEU source and ref files exist, if they are set.
    if FLAGS.bleu_source is not None and FLAGS.bleu_ref is not None:
        if not tf.gfile.Exists(FLAGS.bleu_source):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_source)
        if not tf.gfile.Exists(FLAGS.bleu_ref):
            raise ValueError("BLEU source file %s does not exist" %
                             FLAGS.bleu_ref)

    mllogger.end(key=mllog_const.INIT_STOP)
    mllogger.start(key=mllog_const.RUN_START)

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=FLAGS.model_dir,
                                       params=params)
    train_schedule(estimator, params.train_eval_iterations,
                   params.single_iteration_train_steps,
                   params.single_iteration_train_epochs, FLAGS.bleu_source,
                   FLAGS.bleu_ref, FLAGS.bleu_threshold)

    mllogger.end(key=mllog_const.RUN_STOP)
Example #9
def get_mllog_mlloger():
    from mlperf_logging import mllog
    from mlperf_compliance import tf_mlperf_log

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    filenames = "resnet50v1.5.log-" + str_hvd_rank
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog, tf_mlperf_log
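
A short usage sketch for the helper above, assuming Horovod (when enabled) has already been initialized; the keys are standard mllog constants:

mllogger, mllog, _ = get_mllog_mlloger()
mllogger.event(key=mllog.constants.CACHE_CLEAR)
mllogger.start(key=mllog.constants.RUN_START)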
Example #10
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    mllogger.event(key=mllog.constants.OPT_BASE_LR, value=FLAGS.lr_rates)
    mllogger.event(key='lr_rates', value=FLAGS.lr_rates)
    mllogger.event(key=mllog.constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
                   value=FLAGS.lr_boundaries[1])
    mllogger.event(key='lr_boundaries', value=FLAGS.lr_boundaries[1])
    mllogger.event(key=mllog.constants.OPT_WEIGHT_DECAY,
                   value=FLAGS.l2_strength)
    mllogger.event(key='opt_learning_rate_decay_boundary_steps',
                   value=FLAGS.lr_boundaries)
    mllogger.event(key='train_batch_size', value=FLAGS.train_batch_size)
Example #11
def get_mllog_mlloger(output_dir=None):
    from mlperf_logging import mllog

    str_hvd_rank = str(hvd.rank()) if horovod_enabled() else "0"
    mllogger = mllog.get_mllogger()
    mllogger.propagate = False
    mllog.propagate = False
    if output_dir is None:
        output_dir = './log'
    filenames = os.path.normpath(output_dir) + "/result_rank_" + str_hvd_rank + ".txt"
    mllog.config(filename=filenames)
    workername = "worker" + str_hvd_rank
    mllog.config(
        default_namespace=workername,
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "..", "..")))

    return mllogger, mllog
def main(argv):
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath("/tmp/"))

    mllogger.event(key='num_readouts', value=FLAGS.num_readouts)
    mllogger.event(key='value_init_penalty', value=FLAGS.value_init_penalty)
    mllogger.event(key='holdout_pct', value=FLAGS.holdout_pct)
    mllogger.event(key='disable_resign_pct', value=FLAGS.disable_resign_pct)
    mllogger.event(key='min_resign_threshold',
                   value=FLAGS.min_resign_threshold)
    mllogger.event(key='max_resign_threshold',
                   value=FLAGS.max_resign_threshold)
    mllogger.event(key='selfplay_threads', value=FLAGS.selfplay_threads)
    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)
    mllogger.event(key='virtual_losses', value=FLAGS.virtual_losses)
Example #13
def main(argv):
    """Entry point for running one selfplay game."""
    del argv  # Unused
    flags.mark_flag_as_required('load_file')
    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    mllogger.event(key='parallel_games', value=FLAGS.parallel_inference)

    run_game(
        load_file=FLAGS.load_file,
        selfplay_dir=FLAGS.selfplay_dir,
        holdout_dir=FLAGS.holdout_dir,
        holdout_pct=FLAGS.holdout_pct,
        sgf_dir=FLAGS.sgf_dir)
Example #14
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import numpy as np
import os
from mlperf_logging import mllog
from mlperf_logging.mllog import constants as mllog_const
mllogger = mllog.get_mllogger()
mllog.config(filename=(os.getenv("COMPLIANCE_FILE")
                       or "mlperf_compliance.log"),
             root_dir=os.path.normpath(
                 os.path.dirname(os.path.realpath(__file__))))


def ssd_print(*args, sync=True, **kwargs):
    use_cuda = os.getenv('USE_CUDA')
    if sync and use_cuda == 'True':
        barrier()
    if get_rank() == 0:
        kwargs['stack_offset'] = 2
        mllogger.event(*args, **kwargs)


def barrier():
    """
    Works as a temporary distributed barrier, currently pytorch
Example #15
def main(unused_argv):
    """Run the reinforcement learning loop."""
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')

    for handler in logger.handlers:
        handler.setFormatter(formatter)

    mllogger = mllog.get_mllogger()
    mllog.config(filename="train.log")

    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False)

    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM,
                   value="8 nodes x 4s CPX")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem")
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="minigo")

    mllogger.event(key='cache_clear', value=True)

    mllogger.event(key="filter_amount", value=FLAGS.train_filter)

    # The training loop must be bootstrapped; either by running bootstrap.sh
    # to generate training data from random games, or by running
    # copy_checkpoint.sh to copy an already generated checkpoint.
    model_dirs = list_selfplay_dirs(FLAGS.selfplay_dir)
    if not model_dirs:
        raise RuntimeError(
            'Couldn\'t find any selfplay games under %s. Either bootstrap.sh '
            'or init_from_checkpoint.sh must be run before the train loop is '
            'started' % FLAGS.selfplay_dir)
    model_num = int(os.path.basename(model_dirs[0]))

    mllogger.end(key=mllog.constants.INIT_STOP)
    mllogger.start(key=mllog.constants.RUN_START)
    with logged_timer('Total time'):
        try:
            state = State(model_num)
            wait(
                checked_run([
                    'python3', 'parse_flags_train.py', '--flagfile={}'.format(
                        os.path.join(FLAGS.flags_dir, 'train.flags'))
                ]))
            wait(
                checked_run([
                    'python3', 'parse_flags_selfplay.py',
                    '--flagfile={}'.format(
                        os.path.join(FLAGS.flags_dir, 'selfplay.flags'))
                ]))

            mllogger.event(key="window_size", value=FLAGS.window_size)

            while state.iter_num <= FLAGS.iterations:
                mllogger.event(key=mllog.constants.EPOCH_START,
                               value=None,
                               metadata={"epoch_num": state.iter_num})
                state.iter_num += 1
                train(state)
                mllogger.event(key=mllog.constants.EPOCH_STOP,
                               value=None,
                               metadata={"epoch_num": state.iter_num})
                if (FLAGS.precision == 'int8'):
                    post_train(state)
        finally:
            asyncio.get_event_loop().close()
Example #16
def configure_mllogger(log_dir):
    """Setup the MLPerf logger"""
    if not have_mlperf_logging:
        raise RuntimeError('mlperf_logging package unavailable')
    mllog.config(filename=os.path.join(log_dir, 'mlperf.log'))
    return mllog.get_mllogger()
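
The have_mlperf_logging flag checked above is set at import time elsewhere in that project. A typical guarded import looks roughly like this (a sketch under that assumption, not the project's exact code):

try:
    from mlperf_logging import mllog
    have_mlperf_logging = True
except ImportError:
    have_mlperf_logging = False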
Example #17
def get_mlperf_logger(path, filename='mlperf.log'):
    mllog.config(filename=os.path.join(path, filename))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    return mllogger
Example #18
def main(async_executor=None):
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, 'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None

    # Initialize horovod
    hvd.init()

    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)

    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"

    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info(
            "No results folder was provided. The script will not write logs or save weight to disk"
        )

    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    setup_logger(level=args.log_level
                 if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)

    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained_backbone are "
        "mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported "
        "with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning((
                "WARNING: fused batch norm relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning((
                "WARNING: fused batch norm add relu is only supported with NHWC layout. "
                "A non fused version will be forced."))
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning(
            "WARNING: hvd.size() > 1, so must IGNORE requested --profile-no-horovod"
        )
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape,
                                     clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        if args.use_tfrecord:
            tfrecord_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.tfrecord'))
            index_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.idx'))
            tfrecords = [(tfrecord, index) for tfrecord, index in zip(
                tfrecord_files, index_files)]
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=min(
                (100 * 1024 + hvd.size() - 1) // hvd.size(), 12 *
                1024) if args.input_jpg_decode == 'cache' else 0,
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter - HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator
        tfrecord_files = glob.glob(
            os.path.join(args.tfrecord_root, 'val.*.tfrecord'))
        index_files = glob.glob(os.path.join(args.tfrecord_root, 'val.*.idx'))
        tfrecords = [(tfrecord, index)
                     for tfrecord, index in zip(tfrecord_files, index_files)]
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net,
                                  ltrb=False,
                                  scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do training and validation dry runs on fake data.
    # This will set layer shapes (needed before loading the pre-trained
    # backbone), allocate tensors and cache the optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images,
                         box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')
    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()
    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)
    status = 'success' if (model_map
                           and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
Example #19
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
from npu_bridge.estimator.npu import npu_compile

from npu_bridge.helper import helper
gen_npu_ops = helper.get_gen_ops();
###### npu ######
rank_size = int(os.getenv('RANK_SIZE'))
rank_id = int(os.getenv('RANK_ID').split("-")[-1])
device_id = int(os.getenv('DEVICE_ID')) + rank_id * 8
###############################

# MLperf log
if device_id == 0:
    mllogger = mllog.get_mllogger()
    mllog.config(filename='resnet_close.log')
    mllog.config(
        default_namespace='worker1',
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(os.path.dirname(os.path.realpath(__file__))))
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="open")
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="SIAT")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value="Ascend 910")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="cloud")
    mllogger.event(key=mllog.constants.CACHE_CLEAR)

params = { 
  #  'data_dir': '/opt/dataset/imagenet_TF',
Example #20
def main():
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'unet3d.log'))
    mllog.config(filename=os.path.join("/results", 'unet3d.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    mllog_start(key=constants.INIT_START)

    flags = PARSER.parse_args()
    dllogger = get_dllogger(flags)
    local_rank = flags.local_rank
    device = get_device(local_rank)
    is_distributed = init_distributed()
    world_size = get_world_size()
    local_rank = get_rank()
    worker_seeds, shuffling_seeds = setup_seeds(flags.seed, flags.epochs,
                                                device)
    worker_seed = worker_seeds[local_rank]
    seed_everything(worker_seed)
    mllog_event(key=constants.SEED,
                value=flags.seed if flags.seed != -1 else worker_seed,
                sync=False)

    if is_main_process and flags.verbose:
        mlperf_submission_log()
        mlperf_run_param_log(flags)

    callbacks = get_callbacks(flags, dllogger, local_rank, world_size)
    flags.seed = worker_seed
    model = Unet3D(1,
                   3,
                   normalization=flags.normalization,
                   activation=flags.activation)

    mllog_end(key=constants.INIT_STOP, sync=True)
    mllog_start(key=constants.RUN_START, sync=True)
    train_dataloader, val_dataloader = get_data_loaders(flags,
                                                        num_shards=world_size)
    mllog_event(key=constants.GLOBAL_BATCH_SIZE,
                value=flags.batch_size * world_size,
                sync=False)
    loss_fn = DiceCELoss(to_onehot_y=True,
                         use_softmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)
    score_fn = DiceScore(to_onehot_y=True,
                         use_argmax=True,
                         layout=flags.layout,
                         include_background=flags.include_background)

    if flags.exec_mode == 'train':
        train(flags,
              model,
              train_dataloader,
              val_dataloader,
              loss_fn,
              score_fn,
              device=device,
              callbacks=callbacks,
              is_distributed=is_distributed)

    elif flags.exec_mode == 'evaluate':
        eval_metrics = evaluate(flags,
                                model,
                                val_dataloader,
                                loss_fn,
                                score_fn,
                                device=device,
                                is_distributed=is_distributed)
        if local_rank == 0:
            for key in eval_metrics.keys():
                print(key, eval_metrics[key])
    else:
        print("Invalid exec_mode.")
Example #21
def dummy_example():
    """Example usage of mllog"""

    # Get the mllogger instance, this needs to be called in every module that
    # needs logging
    mllogger = mllog.get_mllogger()

    # Customize the mllogger configuration.
    # These configurations only need to be set once in your entire program.
    # Try tweaking the following configurations to see the difference.
    #   logger: Customize the underlying logger to change the logging behavior.
    #   filename: a log file to use. If set, a default file handler will be added
    #     to the logger so it can log to the specified file. For more advanced
    #     customizations, please set the 'logger' parameter instead.
    #   default_namespace: the default namespace to use if one isn't provided.
    #   default_stack_offset: the default depth to go into the stack to find
    #     the call site.
    #   default_clear_line: the default behavior of line clearing (i.e. print
    #     an extra new line to clear any pre-existing text in the log line).
    #   root_dir: directory prefix which will be trimmed when reporting calling
    #     file for logging.

    # Customize the underlying logger to use a file in addition to stdout.
    # 1. Simple way
    # Provide a filename, this adds a log file with default behavior.
    mllog.config(filename="example_simple.log")
    # 2. Advanced way
    # You may pass a logging.Logger instance to mllog.config().
    # To use the advanced way, comment out the "Simple way" above and uncomment
    # the following:
    #
    # # Notice that proper log level needs to be set for both logger and handler.
    # logger = logging.getLogger("custom_logger")
    # logger.propagate = False
    # logger.setLevel(logging.DEBUG)
    # # add file handler for file logging
    # _file_handler = logging.FileHandler("example_advanced.log")
    # _file_handler.setLevel(logging.DEBUG)
    # logger.addHandler(_file_handler)
    # # add stream handler for stdout logging
    # _stream_handler = logging.StreamHandler(stream=sys.stdout)
    # _stream_handler.setLevel(logging.INFO)
    # logger.addHandler(_stream_handler)
    # mllog.config(logger=logger)

    # Set other logger configurations
    mllog.config(default_namespace="worker1",
                 default_stack_offset=1,
                 default_clear_line=False,
                 root_dir=os.path.normpath(
                     os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  "..", "..")))

    # Example log messages
    # The methods to use are "start", "end", and "event".
    # You may check out the detailed APIs in mllog.mllog.
    # Try to use the keys from mllog.constants to avoid wrong keys.
    mllogger.start(key=mllog.constants.INIT_START)
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="Intel")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM,
                   value="1 node x 8s CPX")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="closed")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="onprem")
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_NAME,
                   value="Wei Wang, Christine Cheng")
    mllogger.event(key=mllog.constants.SUBMISSION_POC_EMAIL,
                   value="[email protected], [email protected]")
    mllogger.event(key=mllog.constants.TRAIN_SAMPLES, value=1281167)
    mllogger.event(key="lars_opt_momentum", value=0.9)
    mllogger.end(key=mllog.constants.INIT_STOP)
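
For reference, the "advanced way" described in the comments above, written out as a self-contained sketch (the file name and log levels are illustrative):

import logging
import sys

from mlperf_logging import mllog

# A proper log level needs to be set for both the logger and each handler.
logger = logging.getLogger("custom_logger")
logger.propagate = False
logger.setLevel(logging.DEBUG)

# File handler for file logging.
file_handler = logging.FileHandler("example_advanced.log")
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)

# Stream handler for stdout logging.
stream_handler = logging.StreamHandler(stream=sys.stdout)
stream_handler.setLevel(logging.INFO)
logger.addHandler(stream_handler)

# Hand the customized logger to mllog instead of a filename.
mllog.config(logger=logger)

mllogger = mllog.get_mllogger()
mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")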
Example #22
def config_logger(benchmark):
    "initiates mlperf logger"
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    _MLLOGGER.logger.propagate = False
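
_MLLOGGER is a module-level global in that project; a plausible definition, assumed here for completeness, is simply:

from mlperf_logging import mllog

_MLLOGGER = mllog.get_mllogger()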
Example #23
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert (len(args.lr) == 1)
    log_event(key=constants.OPT_BASE_LR,
              value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert (args.max_source_positions == args.max_target_positions)
    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_target_positions,
              metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(
        ctypes.c_int(0x05), ctypes.c_int(128))
    result = ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(
        pValue, ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(
        f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)

    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print(
                '| NOTICE: your device may support faster training with --fp16'
            )

        trainer = Trainer(args,
                          task,
                          model,
                          criterion,
                          allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe='@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert (torch.distributed.is_initialized())
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)

    log_start(key=constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])

    log_event(key=constants.TRAIN_SAMPLES,
              value=len(task.dataset(args.train_subset)),
              sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(task.dataset(args.gen_subset)),
              sync=False)

    ctr = 0

    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while lr >= args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates(
    ) < max_update and current_bleu < tgt_bleu:
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={
                      'first_epoch_num': first_epoch,
                      'epoch_count': 1
                  },
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': first_epoch},
                  sync=False)

        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': first_epoch},
                sync=False)

        # Eval BLEU score
        if args.online_eval or tgt_bleu is not math.inf:
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': first_epoch},
                sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
Example #24
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


if __name__ == '__main__':
    context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True, parameter_broadcast=True)
    auto_parallel_context().set_all_reduce_fusion_split_indices([43], "hccl_world_groupsum1")
    auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum2")
    auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum3")
    auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum4")
    auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum5")

    # add mllog
    mllog.config(filename=log_filename)
    mllog.config(
        default_namespace="mindspore",
        default_stack_offset=1,
        default_clear_line=False,
        root_dir=os.path.normpath(
            os.path.dirname(os.path.realpath(__file__))))
    mllogger = mllog.get_mllogger()
    # submission
    mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value="resnet")
    mllogger.event(key=mllog.constants.SUBMISSION_DIVISION, value="open")
    mllogger.event(key=mllog.constants.SUBMISSION_ORG, value="SIAT")
    mllogger.event(key=mllog.constants.SUBMISSION_PLATFORM, value="Ascend 910")
    mllogger.event(key=mllog.constants.SUBMISSION_STATUS, value="cloud")
    mllogger.event(key=mllog.constants.CACHE_CLEAR)
Example #25
def set_defaults(opts):
    # Logs and checkpoint paths
    # Must be run last
    opts['summary_str'] += "Logging\n"
    name = opts['name']

    if opts["name_suffix"]:
        name = name + "_" + opts["name_suffix"]

    if opts.get("poplar_version"):
        name += "_v" + _extract_poplar_version(opts['poplar_version'])

    # We want this to be random even if random seeds have been set so that we don't overwrite
    # when re-running with the same seed
    random_state = random.getstate()
    random.seed()
    rnd_str = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(3))
    random.setstate(random_state)
    name += "_{}".format(rnd_str)
    opts['summary_str'] += " Name: {name}\n"

    # only instance 0 creates a log dir and logs to disk
    # a log dir is also created when using validation.py (aka opts['training']==False)
    # using train.py with --restore-path logs training results into that folder
    if ((not opts['no_logs'])
            and (not opts['restore_path'] or not opts.get('training')) and
        (opts['distributed_worker_index'] == 0 or opts['log_all_instances'])):
        if "logs_path" not in opts or opts["logs_path"] is None:
            opts["logs_path"] = os.path.join(opts["log_dir"],
                                             '{}'.format(name))

        opts["checkpoint_path"] = os.path.join(opts["logs_path"], "ckpt")

        if not os.path.isdir(opts["logs_path"]):
            os.makedirs(opts["logs_path"])

        opts['summary_str'] += " Saving to {logs_path}\n"

        fname = os.path.join(opts["logs_path"], 'arguments.json')
        if os.path.isfile(fname):
            fname = os.path.join(opts["logs_path"], 'arguments_restore.json')
        with open(fname, 'w') as fp:
            json.dump(opts,
                      fp,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))
    elif (
            opts['restore_path'] and
        (opts['distributed_worker_index'] == 0 or opts['log_all_instances'])):
        opts['logs_path'] = opts['restore_path']
        opts['checkpoint_path'] = os.path.join(opts['logs_path'], 'ckpt')
    else:
        opts["logs_path"] = None
        opts["log_dir"] = None
        opts["mlperf_logging"] = False
        opts["checkpoint_path"] = os.path.join('/tmp/', '{}/ckpt'.format(name))
        if not os.path.isdir(
                os.path.dirname(os.path.abspath(opts["checkpoint_path"]))):
            os.makedirs(
                os.path.dirname(os.path.abspath(opts["checkpoint_path"])))

    global MLPERF_LOGGING
    if opts["mlperf_logging"] and MLPERF_LOGGING and opts[
            'distributed_worker_index'] == 0:
        MLPERF_LOGGING = True
        seed = opts.get("seed", "None")
        try:
            mllog.config(default_namespace=mllog.constants.RESNET,
                         default_stack_offset=2,
                         default_clear_line=False,
                         root_dir=os.path.split(os.path.abspath(__file__))[0],
                         filename=os.path.join(opts["logs_path"],
                                               "result_{}.txt".format(seed)))
        except NameError:
            pass
    else:
        MLPERF_LOGGING = False

    return opts
Example #26
from mlperf_utils.logs import hooks_helper
from mlperf_utils.logs import logger
from mlperf_utils.misc import model_helpers

global is_mpi
try:
    import horovod.tensorflow as hvd
    hvd.init()
    is_mpi = hvd.size()
except ImportError:
    is_mpi = 0
    print("No MPI horovod support, this is running in no-MPI mode!")

mllogger = mllog.get_mllogger()
filenames = "resnet50v1.5.log-" + str(hvd.rank())
mllog.config(filename=filenames)
workername = "worker" + str(hvd.rank())
mllog.config(
    default_namespace=workername,
    default_stack_offset=1,
    default_clear_line=False,
    root_dir=os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))
mllogger.event(key=mllog.constants.CACHE_CLEAR)
mllogger.start(key=mllog.constants.RUN_START)

_NUM_EXAMPLES_NAME = "num_examples"
_NUM_IMAGES = {
        'train': 1281167,
        'validation': 50000
}
Example #27
def configure_logger(benchmark):
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
Example #28
def main():
    """
    Launches data-parallel multi-gpu training.
    """
    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'gnmt.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    gnmt_start(key=constants.INIT_START)

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)

    args.rank = utils.get_rank()

    if args.rank == 0:
        mlperf_submission_log(benchmark=constants.GNMT)

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        assert global_bs % (bs * world_size) == 0
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info('Global batch size was set in the config; '
                     f'setting train_iter_size to {args.train_iter_size}')

    gnmt_event(key='seed', value=args.seed)
    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding
    }
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor
    }

    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    gnmt_end(key=constants.INIT_STOP, sync=True)
    gnmt_start(key=constants.RUN_START, sync=True)

    # build datasets
    gnmt_event(key=constants.MAX_SEQUENCE_LENGTH,
               value=args.max_length_train,
               sync=False,
               metadata={'method': 'discard'})

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir,
                                                      config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir,
                                                      config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir,
                                                   config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=True)

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets
    }
    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

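    # The effective global batch size is the per-worker batch size multiplied
    # by the world size.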
    gnmt_event(key=constants.GLOBAL_BATCH_SIZE,
               value=args.train_batch_size * utils.get_world_size(),
               sync=False)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

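    # Note: len() of a loader is the number of batches it yields, which is the
    # value logged here.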
    gnmt_event(key='training_samples', value=len(train_loader), sync=False)
    gnmt_event(key='evaluation_samples', value=len(val_loader), sync=False)

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state()
    }
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           iter_size=args.train_iter_size,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info=save_info,
                           opt_config=opt_config,
                           scheduler_config=scheduler_config,
                           train_iterations=total_train_iters,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed,
                           intra_epoch_eval=args.intra_epoch_eval,
                           translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    break_training = False
    test_bleu = None
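    # Each epoch is wrapped in BLOCK_START/BLOCK_STOP (one-epoch blocks) plus
    # EPOCH_START/EPOCH_STOP markers in the MLPerf log.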
    for epoch in range(args.start_epoch, args.epochs):
        gnmt_start(key=constants.BLOCK_START,
                   metadata={
                       'first_epoch_num': epoch + 1,
                       'epoch_count': 1
                   },
                   sync=True)
        gnmt_start(key=constants.EPOCH_START,
                   metadata={'epoch_num': epoch + 1},
                   sync=True)

        logging.info(f'Starting epoch {epoch}')

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        gnmt_end(key=constants.EPOCH_STOP,
                 metadata={'epoch_num': epoch + 1},
                 sync=True)

        # evaluate on validation set
        if args.eval:
            logging.info('Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # remember best prec@1 and save checkpoint
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

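        # BLEU evaluation on the test set is bracketed by EVAL_START/EVAL_STOP,
        # with EVAL_ACCURACY reporting the score achieved in this epoch.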
        if args.eval:
            gnmt_start(key=constants.EVAL_START,
                       value=epoch,
                       metadata={'epoch_num': epoch + 1},
                       sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            gnmt_event(key=constants.EVAL_ACCURACY,
                       value={
                           "epoch": epoch,
                           "value": round(test_bleu, 2)
                       },
                       metadata={'epoch_num': epoch + 1},
                       sync=False)
            gnmt_end(key=constants.EVAL_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        gnmt_end(key=constants.BLOCK_STOP,
                 metadata={
                     'first_epoch_num': epoch + 1,
                     'epoch_count': 1
                 },
                 sync=True)

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

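    # RUN_STOP closes the timed region; the run is marked a success only if
    # training stopped early (break_training is set by the translator once the
    # target BLEU is reached).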
    gnmt_end(key=constants.RUN_STOP,
             metadata={'status': 'success' if break_training else 'aborted'},
             sync=True)