Example #1
def setup_logger(args):
    aggregator_dict = OrderedDict([
        ('loss', 'average'),
        ('weighted_loss', 'average'),
        ('tokens', ('average', 'performance')),
        ('updates', 'performance'),
        ('gnorm', 'average')
    ])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if os.path.exists(log_path):
        for i in itertools.count():
            s_fname = args.stat_file.split('.')
            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}'
            log_path = os.path.join(args.save_dir, fname)
            if not os.path.exists(log_path):
                break

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
                                AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
                                TensorBoardBackend(verbosity=1, log_dir=args.save_dir)])
    else:
        dllogger.init(backends=[])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = get_framework_env_vars()
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f', 'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f', 'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})
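
The suffix loop above keeps probing stat_file_0, stat_file_1, ... until it finds an unused name, so an existing log is never overwritten (later examples wrap the same idea in a repo helper called unique_log_fpath). A minimal standalone sketch of that logic; the helper name unique_log_path is hypothetical, not part of dllogger:

import itertools
import os


def unique_log_path(save_dir, fname):
    # Return save_dir/fname, adding a _0, _1, ... suffix while the file exists.
    path = os.path.join(save_dir, fname)
    if not os.path.exists(path):
        return path
    stem, dot, ext = fname.rpartition('.')
    for i in itertools.count():
        # 'log.json' -> 'log_0.json'; extension-less names get '.0', '.1', ...
        candidate = f'{stem}_{i}.{ext}' if dot else f'{fname}.{i}'
        path = os.path.join(save_dir, candidate)
        if not os.path.exists(path):
            return path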
Example #2
def main():

    # setup params
    arguments = PARSER.parse_args()
    params = Namespace(**{**vars(CONFIG), **vars(arguments)})

    # setup logging
    # noinspection PyArgumentList
    logging.basicConfig(
        level=logging.DEBUG if params.verbose else logging.INFO,
        format='{asctime} {levelname:.1} {name:15} {message}',
        style='{'
    )

    # remove custom tf handler that logs to stderr
    logging.getLogger('tensorflow').setLevel(logging.WARNING)
    logging.getLogger('tensorflow').handlers.clear()

    # setup dllogger
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=params.log_file),
        LoggingBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(step='PARAMETER', data=vars(params))

    # setup dataset
    dataset = Dataset(params)

    if params.mode in ['train', 'train_and_eval']:
        run_training(dataset, params)
    if params.mode == 'eval':
        run_evaluation(dataset, params)
    if params.mode == 'infer':
        run_inference(dataset, params)
Example #3
def setup_training(args):
    assert torch.cuda.is_available()

    if args.local_rank == -1:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='smddp',
                                             init_method='env://')
        args.n_gpu = 1

    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError("`do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and any(
                [i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
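
Note the pattern this and most other examples share: only the main process gets real backends, while every other rank calls dllogger.init(backends=[]), which makes later dllogger.log calls no-ops so each record is written exactly once. A minimal sketch of that pattern, assuming torch.distributed is already initialized:

import dllogger
import torch.distributed as dist
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity


def init_rank0_logging(log_file):
    # Rank 0 writes JSON and stdout records; other ranks log into the void.
    rank = dist.get_rank() if dist.is_initialized() else 0
    if rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(Verbosity.VERBOSE, log_file),
            StdOutBackend(Verbosity.VERBOSE),
        ])
    else:
        dllogger.init(backends=[])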
Example #4
def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw):

    if enabled:
        backends = [JSONStreamBackend(Verbosity.DEFAULT,
                                      unique_log_fpath(log_fpath)),
                    StdOutBackend(Verbosity.VERBOSE,
                                  step_format=stdout_step_format,
                                  metric_format=stdout_metric_format)]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('val', '  avg val '), ('val_ema', '  EMA val ')]:

        dllogger.metadata(f"{id_}_loss",
                          {"name": f"{pref}loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_mel_loss",
                          {"name": f"{pref}mel loss", "format": ":>5.2f"})

        dllogger.metadata(f"{id_}_frames/s",
                          {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>3.2f"})

    global tb_loggers
    tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw)
                  for s in tb_subsets}
Example #5
def init_dllogger(log_fpath=None, dummy=False):
    if dummy:
        DLLogger.init(backends=[])
        return
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, step_format=stdout_step_format,
                      metric_format=stdout_metric_format)
        ]
    )
    DLLogger.metadata("train_loss", {"name": "loss", "format": ":>5.2f"})
    DLLogger.metadata("train_mel_loss", {"name": "mel loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_loss", {"name": "avg train loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_mel_loss", {"name": "avg train mel loss", "format": ":>5.2f"})
    DLLogger.metadata("val_loss", {"name": "  avg val loss", "format": ":>5.2f"})
    DLLogger.metadata("val_mel_loss", {"name": "  avg val mel loss", "format": ":>5.2f"})
    DLLogger.metadata(
        "val_ema_loss",
        {"name": "  EMA val loss", "format": ":>5.2f"})
    DLLogger.metadata(
        "val_ema_mel_loss",
        {"name": "  EMA val mel loss", "format": ":>5.2f"})
    DLLogger.metadata(
        "train_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata(
        "avg_train_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata(
        "val_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata(
        "val_ema_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata(
        "took", {"name": "took", "unit": "s", "format": ":>3.2f"})
    DLLogger.metadata("lrate_change", {"name": "lrate"})
Example #6
	def __init__(self, log_file, global_batch_size, warmup_steps: int = 0, profile: bool = False):
		logger.init(backends=[JSONStreamBackend(Verbosity.VERBOSE, log_file), StdOutBackend(Verbosity.VERBOSE)])
		self.warmup_steps = warmup_steps
		self.global_batch_size = global_batch_size
		self.step = 0
		self.profile = profile
		self.timestamps = []
Example #7
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch FastPitch Inference Benchmark')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = args.log_file
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    model = load_and_setup_model('FastPitch',
                                 parser,
                                 None,
                                 args.amp_run,
                                 'cuda',
                                 unk_args=[],
                                 forward_is_infer=True,
                                 ema=False,
                                 jitable=True)

    # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03
    # model = torch.jit.script(model)

    warmup_iters = 3
    iters = 1
    gen_measures = MeasureTime()
    all_frames = 0
    for i in range(-warmup_iters, iters):
        text_padded = torch.randint(low=0,
                                    high=148,
                                    size=(args.batch_size, 128),
                                    dtype=torch.long).to('cuda')
        input_lengths = torch.IntTensor([text_padded.size(1)] *
                                        args.batch_size).to('cuda')
        durs = torch.ones_like(text_padded).mul_(4).to('cuda')

        with torch.no_grad(), gen_measures:
            mels, *_ = model(text_padded, input_lengths, dur_tgt=durs)
        num_frames = mels.size(0) * mels.size(2)

        if i >= 0:
            all_frames += num_frames
            DLLogger.log(step=(i, ), data={"latency": gen_measures[-1]})
            DLLogger.log(step=(i, ),
                         data={"frames/s": num_frames / gen_measures[-1]})

    measures = gen_measures[warmup_iters:]
    DLLogger.log(step=(), data={'avg latency': np.mean(measures)})
    DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)})
    DLLogger.flush()
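
This benchmark leans on a MeasureTime helper from the FastPitch repository; judging purely from its use above (with ... gen_measures:, gen_measures[-1], slicing), it acts as a list of elapsed times that doubles as a context manager. A rough sketch of that behavior, an assumption inferred from usage rather than the repo's exact implementation:

import time


class MeasureTime(list):
    # Collects wall-clock durations; behaves like a plain list of floats.
    def __enter__(self):
        self.t0 = time.perf_counter()

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.append(time.perf_counter() - self.t0)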
Example #8
def get_logger(params):
    backends = []
    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)]
    logger.init(backends=backends)
    return logger
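
This Horovod variant assumes hvd.init() has already been called; hvd.rank() raises otherwise. A hypothetical caller-side sketch (the '/tmp/log.json' value for params.log_dir is made up, and note it is used as a file path despite the name):

from types import SimpleNamespace

import horovod.tensorflow as hvd

hvd.init()  # must run before hvd.rank() is queried in get_logger
params = SimpleNamespace(log_dir='/tmp/log.json')  # hypothetical path
logger = get_logger(params)
logger.log(step=(), data={'workers': hvd.size()})
logger.flush()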
Example #9
 def _initialize_dllogger(self, log_dir, filename, append):
     backends = [
         JSONStreamBackend(Verbosity.VERBOSE,
                           os.path.join(log_dir, filename),
                           append=append),
         StdOutBackend(Verbosity.VERBOSE),
     ]
     logger.init(backends=backends)
Example #10
 def __init__(self, print_interval, backends, verbose=False):
     self.epoch = -1
     self.iteration = -1
     self.val_iteration = -1
     self.metrics = OrderedDict()
     self.backends = backends
     self.print_interval = print_interval
     self.verbose = verbose
     dllogger.init(backends)
Example #11
 def __init__(self, log_dir, global_batch_size, mode, warmup, dim, profile):
     logger.init(backends=[JSONStreamBackend(Verbosity.VERBOSE, log_dir), StdOutBackend(Verbosity.VERBOSE)])
     self.warmup_steps = warmup
     self.global_batch_size = global_batch_size
     self.step = 0
     self.dim = dim
     self.mode = mode
     self.profile = profile
     self.timestamps = []
Example #12
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])

    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users,
                  nb_items=args.n_items,
                  mf_dim=args.factors,
                  mlp_layer_sizes=args.layers,
                  dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model,
                               opt_level=args.opt_level,
                               keep_batchnorm_fp32=False,
                               loss_scale='dynamic')
    model.eval()

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    dllogger.log(data={
        'batch_size': args.batch_size,
        'best_inference_throughput': args.batch_size / min(latencies),
        'best_inference_latency': min(latencies),
        'mean_inference_throughput': args.batch_size / np.mean(latencies),
        'mean_inference_latency': np.mean(latencies),
        'inference_latencies': latencies,
    }, step=tuple())
    dllogger.flush()
    return
Example #13
def setup_logger(args):
    aggregator_dict = OrderedDict([('loss', 'average'),
                                   ('weighted_loss', 'average'),
                                   ('tokens', ('average', 'performance')),
                                   ('updates', 'performance'),
                                   ('gnorm', 'average')])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)
    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
            TensorBoardBackend(verbosity=1, log_dir=args.save_dir)
        ])
    else:
        dllogger.init(backends=[])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {
        'unit': 'nat',
        'GOAL': 'MINIMIZE',
        'STAGE': 'TRAIN'
    })
    dllogger.metadata('val_loss', {
        'unit': 'nat',
        'GOAL': 'MINIMIZE',
        'STAGE': 'VAL'
    })
    dllogger.metadata('speed', {
        'unit': 'tokens/s',
        'format': ':.3f',
        'GOAL': 'MAXIMIZE',
        'STAGE': 'TRAIN'
    })
    dllogger.metadata('accuracy', {
        'unit': 'bleu',
        'format': ':.2f',
        'GOAL': 'MAXIMIZE',
        'STAGE': 'VAL'
    })
Example #14
def get_logger(params):
    backends = []
    if params.worker_id == 0 or params.log_all_workers:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            log_file = f"{params.log_dir}/log.json"
            backends += [JSONStreamBackend(Verbosity.VERBOSE, log_file)]
    logger.init(backends=backends)
    return logger
Example #15
def get_logger(params):
    backends = []
    worker_id = hvd_rank() if horovod_enabled() else 0
    if worker_id == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            log_file = f"{params.log_dir}/log.json"
            backends += [JSONStreamBackend(Verbosity.VERBOSE, log_file)]
    logger.init(backends=backends)
    return logger
Example #16
def setup_logger(args):
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        for i in itertools.count():
            s_fname = args.log_file.split('.')
            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    def metric_format(metric, metadata, value):
        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
                                TensorBoardBackend(verbosity=1, log_dir=args.results),
                                StdOutBackend(verbosity=2,
                                              step_format=step_format,
                                              prefix_format=lambda x: "")
                                              # metric_format=metric_format
                                ])
    else:
        dllogger.init(backends=[])
    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format':':5f'})
    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
Example #17
def setup_dllogger(enabled=True, filename=os.devnull, rank=0):
    if enabled and rank == 0:
        backends = [
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                filename,
            ),
        ]
        dllogger.init(backends)
    else:
        dllogger.init([])
Example #18
def init_logging(log_path):
    json_backend = dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                              filename=log_path)
    stdout_backend = dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)

    stdout_backend._metadata['best_auc'].update({'format': '0:.5f'})
    stdout_backend._metadata['best_epoch'].update({'format': '0:.2f'})
    stdout_backend._metadata['average_train_throughput'].update({'format': ':.2e'})
    stdout_backend._metadata['average_test_throughput'].update({'format': ':.2e'})

    dllogger.init(backends=[json_backend, stdout_backend])
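
Unlike the other examples, this one reaches into the backend's private _metadata mapping to set display formats before init. dllogger also exposes the public metadata() call used throughout this page, which forwards the same hints to every registered backend; a sketch of the equivalent public-API version:

import dllogger

dllogger.init(backends=[
    dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename='log.json'),
    dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE),
])
# Same effect as updating stdout_backend._metadata directly, without
# depending on a private attribute.
dllogger.metadata('best_auc', {'format': '0:.5f'})
dllogger.metadata('best_epoch', {'format': '0:.2f'})
dllogger.metadata('average_train_throughput', {'format': ':.2e'})
dllogger.metadata('average_test_throughput', {'format': ':.2e'})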
Example #19
def get_dllogger(params):
    backends = []
    if is_main_process():
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            backends += [
                JSONStreamBackend(Verbosity.VERBOSE,
                                  os.path.join(params.log_dir, "log.json"))
            ]
    logger.init(backends=backends)
    return logger
Example #20
def init_log(args):

    enabled = not dist.is_initialized() or dist.get_rank() == 0
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        backends = [
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(fpath)),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format)
        ]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('dev_ema', '  dev ema ')]:

        dllogger.metadata(f"{id_}_loss", {
            "name": f"{pref}loss",
            "format": ":>7.2f"
        })

        dllogger.metadata(f"{id_}_wer", {
            "name": f"{pref}wer",
            "format": ":>6.2f"
        })

        dllogger.metadata(f"{id_}_pplx", {
            "name": f"{pref}pplx",
            "format": ":>6.2f"
        })

        dllogger.metadata(f"{id_}_throughput", {
            "name": f"{pref}utts/s",
            "format": ":>5.0f"
        })

        dllogger.metadata(f"{id_}_took", {
            "name": "took",
            "unit": "s",
            "format": ":>5.2f"
        })

    tb_subsets = ['train', 'dev_ema']
    global tb_loggers
    tb_loggers = {
        s: TBLogger(enabled, args.output_dir, name=s)
        for s in tb_subsets
    }

    log_parameters(vars(args), tb_subset='train')
Example #21
def setup_dllogger(rank, enabled=True, filename='log.json'):
    if enabled and rank == 0:
        backends = [
            StdOutBackend(Verbosity.DEFAULT),
            JSONStreamBackend(
                Verbosity.VERBOSE,
                filename,
            ),
        ]
        DLLogger.init(backends)
    else:
        DLLogger.init([])
Example #22
def get_logger(params):
    """ Get logger object

    :param params: Dict with additional parameters
    :return: logger
    """
    backends = []
    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)]
    logger.init(backends=backends)
    return logger
Example #23
def setup_dllogger(enabled=True, filename=os.devnull):
    rank = utils.distributed.get_rank()

    if enabled and rank == 0:
        backends = [
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                filename,
            ),
        ]
        dllogger.init(backends)
    else:
        dllogger.init([])
Example #24
def setup_logging(args):
    logging.basicConfig(level=logging.DEBUG,
                        format='{asctime}:{levelname}: {message}',
                        style='{')
    if hvd.rank() == 0:
        dllogger.init(backends=[
            dllogger.StdOutBackend(dllogger.Verbosity.DEFAULT,
                                   step_format=format_step),
            dllogger.JSONStreamBackend(
                dllogger.Verbosity.VERBOSE,
                os.path.join(args.workspace, args.dllogger_log)),
        ])
    else:
        dllogger.init([])
Example #25
def setup_training(args):

    assert torch.cuda.is_available()

    global ort_supplement
    import ort_supplement.ort_supplement as ort_supplement
    device = ort_supplement.setup_onnxruntime_with_mpi(args)

    if is_main_process(args):
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError("`do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(
            args.output_dir) and (os.listdir(args.output_dir) and any(
                [i.startswith('ckpt') for i in os.listdir(args.output_dir)])):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    if (not args.resume_from_checkpoint
            or not os.path.exists(args.output_dir)) and is_main_process(args):
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
Example #26
def main():
    args = parse_args()
    dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                       filename=args.log_path),
                            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])

    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items, mf_dim=args.factors,
                  mlp_layer_sizes=args.layers, dropout=args.dropout)

    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    model.eval()
    
    batch_sizes = args.batch_sizes.split(',')
    batch_sizes = [int(s) for s in batch_sizes]

    result_data = {}
    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()
    return
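
Note that np.percentile expects q on a 0-100 scale, so the p90/p95/p99 keys above must pass 90, 95, and 99; passing 0.90 instead would compute the 0.9th percentile, essentially the fastest batch rather than the tail latency. A quick check:

import numpy as np

latencies = [0.010, 0.011, 0.012, 0.030]
print(np.percentile(latencies, 90))    # 0.0246  -> true 90th percentile
print(np.percentile(latencies, 0.90))  # ~0.0100 -> 0.9th percentile, near the minimum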
Example #27
    def __init__(self, name, json_output=None, print_freq=20):
        self.name = name
        self.train_loss_logger = IterationAverageMeter("Training loss")
        self.train_epoch_time_logger = EpochMeter("Training 1 epoch time")
        self.val_acc_logger = EpochMeter("Validation accuracy")
        self.print_freq = print_freq

        backends = [DLLogger.StdOutBackend(DLLogger.Verbosity.DEFAULT)]
        if json_output:
            backends.append(
                DLLogger.JSONStreamBackend(DLLogger.Verbosity.VERBOSE,
                                           json_output))

        DLLogger.init(backends)

        self.epoch = 0
        self.train_iter = 0
        self.summary = {}
Example #28
def setup_logger(args):
    if not args.no_dllogger:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=args.stat_file)
        ])
        for k, v in vars(args).items():
            dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)
        container_setup_info = log_helper.get_framework_env_vars()
        dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
        dllogger.metadata(
            'throughput', {
                'unit': 'tokens/s',
                'format': ':.3f',
                'GOAL': 'MAXIMIZE',
                'STAGE': 'INFER'
            })
    else:
        dllogger.init(backends=[])
Example #29
def init_logger(args, full, logger):
    if full:
        logger.setLevel(logging.INFO)
        log_path = os.path.join(args.results_dir, args.log_filename)
        os.makedirs(args.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
        logger.warning('command line arguments: {}'.format(json.dumps(vars(args))))
        if not os.path.exists(args.results_dir):
            os.mkdir(args.results_dir)

        with open('{}/args.json'.format(args.results_dir), 'w') as f:
            json.dump(vars(args), f, indent=4)
    else:
        logger.setLevel(logging.ERROR)
        dllogger.init(backends=[])

    dllogger.log(data=vars(args), step='PARAMETER')
Example #30
def setup_logger(args):
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=1, filename=args.stat_file)
    ])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)
    container_setup_info = {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION')
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
    dllogger.metadata('throughput', {
        'unit': 'tokens/s',
        'format': ':.3f',
        'GOAL': 'MAXIMIZE',
        'STAGE': 'INFER'
    })