def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw):

    if enabled:
        backends = [JSONStreamBackend(Verbosity.DEFAULT,
                                      unique_log_fpath(log_fpath)),
                    StdOutBackend(Verbosity.VERBOSE,
                                  step_format=stdout_step_format,
                                  metric_format=stdout_metric_format)]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('val', '  avg val '), ('val_ema', '  EMA val ')]:
        dllogger.metadata(f"{id_}_loss",
                          {"name": f"{pref}loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_mel_loss",
                          {"name": f"{pref}mel loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_frames/s",
                          {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>3.2f"})

    global tb_loggers
    tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw)
                  for s in tb_subsets}
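# Hedged usage sketch for init() above: the names and values below are
# illustrative only, and assume the module-level imports this snippet relies
# on (dllogger, TBLogger, unique_log_fpath, the stdout_* formatters).
#
#     init(log_fpath='output/nvlog.json', log_dir='output',
#          tb_subsets=['train', 'val'])
#     dllogger.log(step=(epoch, iters), data={'train_loss': loss.item()})
#     dllogger.flush()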
def setup_logger(args):
    aggregator_dict = OrderedDict([
        ('loss', 'average'),
        ('weighted_loss', 'average'),
        ('tokens', ('average', 'performance')),
        ('updates', 'performance'),
        ('gnorm', 'average')
    ])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if os.path.exists(log_path):
        # do not overwrite an existing log; append a numeric suffix instead
        for i in itertools.count():
            s_fname = args.stat_file.split('.')
            fname = ('.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1]
                     if len(s_fname) > 1 else args.stat_file + f'.{i}')
            log_path = os.path.join(args.save_dir, fname)
            if not os.path.exists(log_path):
                break

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
            TensorBoardBackend(verbosity=1, log_dir=args.save_dir)
        ])
    else:
        dllogger.init(backends=[])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = get_framework_env_vars()
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f',
                                'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f',
                                   'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})
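# get_framework_env_vars() is called above but not defined in this snippet.
# A minimal sketch, mirroring the container-info dict that the later
# setup_logger variants build inline; it assumes `os` is imported, as the
# snippets above already do, and the exact variable list may differ:
def get_framework_env_vars():
    # report NGC container / framework build info when present
    return {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
    }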
def init_inference_metadata():

    modalities = [('latency', 's', ':>10.5f'),
                  ('RTF', 'x', ':>10.2f'),
                  ('frames/s', None, ':>10.2f'),
                  ('samples/s', None, ':>10.2f'),
                  ('letters/s', None, ':>10.2f')]

    for perc in ['', 'avg', '90%', '95%', '99%']:
        for model in ['fastpitch', 'waveglow', '']:
            for mod, unit, fmt in modalities:
                # collapse the double space left by an empty perc/model part
                name = f'{perc} {model} {mod}'.strip().replace('  ', ' ')
                dllogger.metadata(name.replace(' ', '_'),
                                  {'name': f'{name: <26}',
                                   'unit': unit,
                                   'format': fmt})
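# For reference, the nested loops above register names such as:
#   'latency'              (perc='', model='')
#   'fastpitch_latency'    (perc='', model='fastpitch')
#   'avg_waveglow_RTF'     (perc='avg', model='waveglow')
#   '90%_frames/s'         (perc='90%', model='')
# i.e. '<percentile>_<model>_<modality>' with empty parts dropped.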
def init_log(args):
    enabled = (args.local_rank == 0)
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        backends = [
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(fpath)),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format)
        ]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('dev', '  avg dev '), ('dev_ema', '  EMA dev ')]:
        dllogger.metadata(f"{id_}_loss",
                          {"name": f"{pref}loss", "format": ":>7.2f"})
        dllogger.metadata(f"{id_}_wer",
                          {"name": f"{pref}wer", "format": ":>6.2f"})
        dllogger.metadata(f"{id_}_throughput",
                          {"name": f"{pref}utts/s", "format": ":>5.0f"})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>5.2f"})

    tb_subsets = ['train', 'dev', 'dev_ema'] if args.ema else ['train', 'dev']
    global tb_loggers
    tb_loggers = {s: TBLogger(enabled, args.output_dir, name=s)
                  for s in tb_subsets}

    log_parameters(vars(args), tb_subset='train')
def setup_logger(args):
    aggregator_dict = OrderedDict([
        ('loss', 'average'),
        ('weighted_loss', 'average'),
        ('tokens', ('average', 'performance')),
        ('updates', 'performance'),
        ('gnorm', 'average')
    ])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
            TensorBoardBackend(verbosity=1, log_dir=args.save_dir)
        ])
    else:
        dllogger.init(backends=[])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f',
                                'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f',
                                   'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})
def setup_logger(args):
    if not args.no_dllogger:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=1, filename=args.stat_file)
        ])
        for k, v in vars(args).items():
            dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)
        container_setup_info = log_helper.get_framework_env_vars()
        dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)
        dllogger.metadata('throughput', {'unit': 'tokens/s',
                                         'format': ':.3f',
                                         'GOAL': 'MAXIMIZE',
                                         'STAGE': 'INFER'})
    else:
        dllogger.init(backends=[])
def setup_logger(args):
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=1, filename=args.stat_file)
    ])
    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION')
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('throughput', {'unit': 'tokens/s',
                                     'format': ':.3f',
                                     'GOAL': 'MAXIMIZE',
                                     'STAGE': 'INFER'})
def after_create_session(self, session, coord):
    params_count = tf.get_default_graph().get_tensor_by_name(
        "trainable_parameters_count_ref:0")
    _params_count = session.run(params_count)

    Logger._stage = stage = "train" if self._is_training else "eval"

    Logger.log(
        step='PARAMETER',
        data={"# Total Trainable Parameters": int(_params_count)},
        verbosity=Logger.Verbosity.DEFAULT
    )

    Logger.metadata(
        metric=f"{stage}.avg_ips",
        metadata={"unit": "imgs/s", "format": ":.3f",
                  "GOAL": "MAXIMIZE", "STAGE": stage.upper()}
    )

    for ths in [0.05, 0.125, 0.25, 0.5, 0.75, 0.85, 0.95, 0.99]:
        Logger.metadata(
            metric=f"{stage}.IoU_THS_{ths}",
            metadata={"format": ":.3f", "GOAL": "MAXIMIZE",
                      "STAGE": stage.upper()}
        )

    if self._is_training:
        Logger.metadata(
            metric=f"{stage}.learning_rate",
            metadata={"format": ":.3e", "GOAL": "NONE", "STAGE": stage.upper()}
        )
        Logger.metadata(
            metric=f"{stage}.weight_decay",
            metadata={"format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": stage.upper()}
        )

    Logger.metadata(
        metric=f"{stage}.reconstruction_loss",
        metadata={"format": ":.3f", "GOAL": "MINIMIZE", "STAGE": stage.upper()}
    )
    Logger.metadata(
        metric=f"{stage}.total_loss",
        metadata={"format": ":.3f", "GOAL": "MINIMIZE", "STAGE": stage.upper()}
    )

    # confusion-matrix metrics all share the same stage-only metadata
    for metric in ['true_positives', 'true_negatives',
                   'false_positives', 'false_negatives',
                   'true_positive_rate', 'true_negative_rate']:
        Logger.metadata(metric=f"{stage}.{metric}",
                        metadata={"STAGE": stage.upper()})

    self._start_training_time = time.time()
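# Hedged usage sketch: this method matches the tf.train.SessionRunHook
# interface, so the enclosing hook class (its name is not shown in this
# snippet; 'DLLoggerHook' below is hypothetical) would typically be attached
# to a monitored session:
#
#     hooks = [DLLoggerHook(is_training=True)]
#     with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
#         ...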
def register_metric(self, metric_name, meter, verbosity=0, metadata=None):
    if self.verbose:
        print("Registering metric: {}".format(metric_name))
    self.metrics[metric_name] = {"meter": meter, "level": verbosity}
    # avoid a shared mutable default argument for the metadata dict
    dllogger.metadata(metric_name, metadata or {})
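# Hypothetical usage sketch for register_metric(); 'AverageMeter' and the
# metric name are illustrative and not defined in this snippet:
#
#     logger.register_metric('train.loss', AverageMeter(),
#                            verbosity=0,
#                            metadata={'format': ':.5f',
#                                      'GOAL': 'MINIMIZE',
#                                      'STAGE': 'TRAIN'})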
def main():
    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)
    ])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # register keys that match the f'{stage}_latency_{kk}' names
            # logged in the timing report below
            dllogger.metadata(
                f"{step.lower()}_latency_{str(c).replace('.', '_')}",
                {'name': f'{step} latency {cs}',
                 'format': ':>7.2f',
                 'unit': 'ms'})
    dllogger.metadata('eval_wer',
                      {'name': 'WER', 'format': ':>3.3f', 'unit': '%'})

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)

    if args.max_duration is not None:
        cfg['input_val']['audio_dataset']['max_duration'] = args.max_duration
        cfg['input_val']['filterbank_features']['max_duration'] = \
            args.max_duration

    if args.pad_to_max_duration:
        assert cfg['input_val']['audio_dataset']['max_duration'] > 0
        cfg['input_val']['audio_dataset']['pad_to_max_duration'] = True
        cfg['input_val']['filterbank_features']['pad_to_max_duration'] = True

    use_dali = args.dali_device in ('cpu', 'gpu')

    (dataset_kw, features_kw, splicing_kw, _, _) = config.input(cfg, 'val')

    tokenizer_kw = config.tokenizer(cfg)
    tokenizer = Tokenizer(**tokenizer_kw)

    optim_level = 3 if args.amp else 0

    feature_proc = torch.nn.Sequential(
        torch.nn.Identity(),
        torch.nn.Identity(),
        features.FrameSplicing(optim_level=optim_level, **splicing_kw),
        features.FillPadding(optim_level=optim_level),
    )

    # dataset
    data_loader = DaliDataLoader(gpu_id=args.local_rank or 0,
                                 dataset_path=args.dataset_dir,
                                 config_data=dataset_kw,
                                 config_features=features_kw,
                                 json_names=[args.val_manifest],
                                 batch_size=args.batch_size,
                                 sampler=dali_sampler.SimpleSampler(),
                                 pipeline_type="val",
                                 device_type=args.dali_device,
                                 tokenizer=tokenizer)

    model = RNNT(n_classes=tokenizer.num_labels + 1, **config.rnnt(cfg))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feature_proc is not None:
        feature_proc.to(device)
        feature_proc.eval()

    if args.amp:
        model = amp.initialize(model, opt_level='O3')

    if multi_gpu:
        model = DistributedDataParallel(model)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    rep_loader = chain(*repeat(data_loader, args.repeats))
    rep_len = args.repeats * len(data_loader)

    blank_idx = tokenizer.num_labels
    greedy_decoder = RNNTGreedyDecoder(blank_idx=blank_idx)

    def sync_time():
        torch.cuda.synchronize() if device.type == 'cuda' else None
        return time.perf_counter()

    sz = []
    with torch.no_grad():
        for it, batch in enumerate(tqdm.tqdm(rep_loader, total=rep_len)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feature_proc is not None:
                    feats, feat_lens = feature_proc([feats, feat_lens])
            else:
                batch = [t.cuda(non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feature_proc([audio, audio_lens])

            feats = feats.permute(2, 0, 1)
            if args.amp:
                feats = feats.half()

            sz.append(feats.size(0))

            t1 = sync_time()
            log_probs, log_prob_lens = model(feats, feat_lens, txt, txt_lens)
            t2 = sync_time()

            # burn-in period; wait for a new loader due to num_workers
            # (t0 is set at the end of the previous iteration, so it is
            # defined whenever this guard passes)
            if it >= 1 and (args.repeats == 1 or it >= len(data_loader)):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts(
                    [txt], [txt_lens], tokenizer.detokenize)

            preds = greedy_decoder.decode(model, feats, feat_lens)
            agg['preds'] += helpers.gather_predictions(
                [preds], tokenizer.detokenize)

            if 0 < args.steps < it:
                break

            t0 = sync_time()

    # communicate the results
    if args.transcribe_wav:
        for idx, p in enumerate(agg['preds']):
            print_once(f'Prediction {idx+1: >3}: {p}')
    elif args.transcribe_filelist:
        pass
    else:
        wer, loss = process_evaluation_epoch(agg)
        if not multi_gpu or distrib.get_rank() == 0:
            dllogger.log(step=(), data={'eval_wer': 100 * wer})

    if args.save_predictions:
        with open(args.save_predictions, 'w') as f:
            f.write('\n'.join(agg['preds']))

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]
        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(),
                             data={f'{stage.lower()}_latency_{kk}': lat[k]})
    else:
        # TODO measure at least avg latency
        print_once('Not enough samples to measure latencies.')
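# durs_to_percentiles() is used above but not defined in these snippets.
# A minimal sketch under two assumptions: it returns a dict keyed by the
# requested ratios plus 0.5, and it converts perf_counter deltas (seconds)
# to the 'ms' unit declared in the metadata. The 0.5 bucket is labeled
# 'avg' above, so a mean is used here; the real helper may differ.
import numpy as np

def durs_to_percentiles(durations, ratios):
    durs_ms = np.asarray(durations) * 1000.0  # seconds -> milliseconds
    out = {0.5: float(durs_ms.mean())}
    for r in ratios:
        out[r] = float(np.percentile(durs_ms, r * 100))
    return out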
def main():
    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)
    ])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # register keys that match the f'{stage}_latency_{kk}' names
            # logged in the timing report below
            dllogger.metadata(
                f"{step.lower()}_latency_{str(c).replace('.', '_')}",
                {'name': f'{step} latency {cs}',
                 'format': ':>7.2f',
                 'unit': 'ms'})
    dllogger.metadata('eval_wer',
                      {'name': 'WER', 'format': ':>3.2f', 'unit': '%'})

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)
    config.apply_config_overrides(cfg, args)

    symbols = helpers.add_ctc_blank(cfg['labels'])

    use_dali = args.dali_device in ('cpu', 'gpu')
    dataset_kw, features_kw = config.input(cfg, 'val')

    measure_perf = args.steps > 0

    # dataset
    if args.transcribe_wav or args.transcribe_filelist:
        if use_dali:
            print("DALI supported only with input .json files; disabling")
            use_dali = False

        assert not args.pad_to_max_duration
        assert not (args.transcribe_wav and args.transcribe_filelist)

        if args.transcribe_wav:
            dataset = SingleAudioDataset(args.transcribe_wav)
        else:
            dataset = FilelistDataset(args.transcribe_filelist)

        data_loader = get_data_loader(dataset,
                                      batch_size=1,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=0,
                                      drop_last=measure_perf)

        _, features_kw = config.input(cfg, 'val')
        feat_proc = FilterbankFeatures(**features_kw)

    elif use_dali:
        # pad_to_max_duration is not supported by DALI - have simple padders
        if features_kw['pad_to_max_duration']:
            feat_proc = BaseFeatures(
                pad_align=features_kw['pad_align'],
                pad_to_max_duration=True,
                max_duration=features_kw['max_duration'],
                sample_rate=features_kw['sample_rate'],
                window_size=features_kw['window_size'],
                window_stride=features_kw['window_stride'])
            features_kw['pad_to_max_duration'] = False
        else:
            feat_proc = None

        data_loader = DaliDataLoader(
            gpu_id=args.local_rank or 0,
            dataset_path=args.dataset_dir,
            config_data=dataset_kw,
            config_features=features_kw,
            json_names=args.val_manifests,
            batch_size=args.batch_size,
            pipeline_type=("train" if measure_perf else "val"),  # no drop_last
            device_type=args.dali_device,
            symbols=symbols)

    else:
        dataset = AudioDataset(args.dataset_dir,
                               args.val_manifests,
                               symbols,
                               **dataset_kw)
        data_loader = get_data_loader(dataset,
                                      args.batch_size,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=4,
                                      drop_last=False)
        feat_proc = FilterbankFeatures(**features_kw)

    model = QuartzNet(encoder_kw=config.encoder(cfg),
                      decoder_kw=config.decoder(cfg, n_classes=len(symbols)))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feat_proc is not None:
        feat_proc.to(device)
        feat_proc.eval()

    if args.amp:
        model = model.half()

    if args.torchscript:
        greedy_decoder = GreedyCTCDecoder()
        feat_proc, model, greedy_decoder = torchscript_export(
            data_loader, feat_proc, model, greedy_decoder, args.output_dir,
            use_amp=args.amp, use_conv_masks=True, model_toml=args.model_toml,
            device=device, save=args.torchscript_export)

    if multi_gpu:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    looped_loader = chain.from_iterable(repeat(data_loader))
    greedy_decoder = GreedyCTCDecoder()

    sync = lambda: torch.cuda.synchronize() if device.type == 'cuda' else None

    steps = (args.steps + args.warmup_steps) or len(data_loader)
    with torch.no_grad():
        for it, batch in enumerate(tqdm(looped_loader, initial=1, total=steps)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feat_proc is not None:
                    feats, feat_lens = feat_proc(feats, feat_lens)
            else:
                batch = [t.to(device, non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feat_proc(audio, audio_lens)

            sync()
            t1 = time.perf_counter()

            if args.amp:
                feats = feats.half()

            if model.encoder.use_conv_masks:
                log_probs, log_prob_lens = model(feats, feat_lens)
            else:
                log_probs = model(feats, feat_lens)

            preds = greedy_decoder(log_probs)

            sync()
            t2 = time.perf_counter()

            # burn-in period; wait for a new loader due to num_workers
            if it >= 1 and (args.steps == 0 or it >= args.warmup_steps):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
                                                          symbols)
            agg['preds'] += helpers.gather_predictions([preds], symbols)
            agg['logits'].append(log_probs)

            if it + 1 == steps:
                break

            sync()
            t0 = time.perf_counter()

    # communicate the results
    if args.transcribe_wav:
        for idx, p in enumerate(agg['preds']):
            print_once(f'Prediction {idx+1: >3}: {p}')
    elif args.transcribe_filelist:
        pass
    elif not multi_gpu or distrib.get_rank() == 0:
        wer, _ = process_evaluation_epoch(agg)
        dllogger.log(step=(), data={'eval_wer': 100 * wer})

    if args.save_predictions:
        with open(args.save_predictions, 'w') as f:
            f.write('\n'.join(agg['preds']))

    if args.save_logits:
        logits = torch.cat(agg['logits'], dim=0).cpu()
        torch.save(logits, args.save_logits)

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]
        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(),
                             data={f'{stage.lower()}_latency_{kk}': lat[k]})
    else:
        print_once('Not enough samples to measure latencies.')
def init_dllogger(log_fpath=None, dummy=False):
    if dummy:
        DLLogger.init(backends=[])
        return
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE,
                      step_format=stdout_step_format,
                      metric_format=stdout_metric_format)
    ])
    DLLogger.metadata("train_loss",
                      {"name": "loss", "format": ":>5.2f"})
    DLLogger.metadata("train_mel_loss",
                      {"name": "mel loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_loss",
                      {"name": "avg train loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_mel_loss",
                      {"name": "avg train mel loss", "format": ":>5.2f"})
    DLLogger.metadata("val_loss",
                      {"name": "  avg val loss", "format": ":>5.2f"})
    DLLogger.metadata("val_mel_loss",
                      {"name": "  avg val mel loss", "format": ":>5.2f"})
    DLLogger.metadata("val_ema_loss",
                      {"name": "  EMA val loss", "format": ":>5.2f"})
    DLLogger.metadata("val_ema_mel_loss",
                      {"name": "  EMA val mel loss", "format": ":>5.2f"})
    for prefix in ["train", "avg_train", "val", "val_ema"]:
        DLLogger.metadata(f"{prefix}_frames/s",
                          {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata("took",
                      {"name": "took", "unit": "s", "format": ":>3.2f"})
    DLLogger.metadata("lrate_change", {"name": "lrate"})
def setup_logger(args):
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        # do not overwrite an existing log; append a numeric suffix instead
        for i in itertools.count():
            s_fname = args.log_file.split('.')
            fname = ('.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1]
                     if len(s_fname) > 1 else args.log_file + f'.{i}')
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    def metric_format(metric, metadata, value):
        return "{}: {}".format(
            metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if (not dist.is_initialized() or not args.distributed_world_size > 1
            or args.distributed_rank == 0):
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            TensorBoardBackend(verbosity=1, log_dir=args.results),
            StdOutBackend(verbosity=2,
                          step_format=step_format,
                          prefix_format=lambda x: "",
                          # metric_format=metric_format,
                          )
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
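# With step_format above, StdOutBackend lines render roughly as follows
# (illustrative; metric_format is commented out, so metric rendering falls
# back to the backend's default and depends on what is actually logged):
#   Step 100   | loss: 0.12345 items/s: 1024.0
#   Finished | throughput: ... latency_p90: ...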
def main():
    args = parse_args()
    init_distributed(args)

    if args.local_rank == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.metadata('train_throughput',
                      {"name": 'train_throughput', 'format': ":.3e"})
    dllogger.metadata('hr@10', {"name": 'hr@10', 'format': ":.5f"})
    dllogger.metadata('train_epoch_time',
                      {"name": 'train_epoch_time', 'format': ":.3f"})
    dllogger.metadata('validation_epoch_time',
                      {"name": 'validation_epoch_time', 'format': ":.3f"})
    dllogger.metadata('eval_throughput',
                      {"name": 'eval_throughput', 'format': ":.3e"})

    dllogger.log(data=vars(args), step='PARAMETER')

    if args.seed is not None:
        torch.manual_seed(args.seed)

    if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir:
        print("Saving results to {}".format(args.checkpoint_dir))
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    # sync workers before timing
    if args.distributed:
        torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
        torch.cuda.synchronize()

    main_start_time = time.time()

    feature_spec_path = os.path.join(args.data, args.feature_spec_file)
    feature_spec = FeatureSpec.from_yaml(feature_spec_path)
    trainset = dataloading.TorchTensorDataset(feature_spec,
                                              mapping_name='train', args=args)
    testset = dataloading.TorchTensorDataset(feature_spec,
                                             mapping_name='test', args=args)
    train_loader = dataloading.TrainDataloader(trainset, args)
    test_loader = dataloading.TestDataLoader(testset, args)

    # make pytorch memory behavior more consistent later
    torch.cuda.empty_cache()

    # Create model
    user_feature_name = feature_spec.channel_spec[USER_CHANNEL_NAME][0]
    item_feature_name = feature_spec.channel_spec[ITEM_CHANNEL_NAME][0]
    label_feature_name = feature_spec.channel_spec[LABEL_CHANNEL_NAME][0]
    model = NeuMF(
        nb_users=feature_spec.feature_spec[user_feature_name]['cardinality'],
        nb_items=feature_spec.feature_spec[item_feature_name]['cardinality'],
        mf_dim=args.factors,
        mlp_layer_sizes=args.layers,
        dropout=args.dropout)

    optimizer = FusedAdam(model.parameters(),
                          lr=args.learning_rate,
                          betas=(args.beta1, args.beta2),
                          eps=args.eps)

    # use torch.mean() with dim later to avoid copy to host
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # Move model and loss to GPU
    model = model.cuda()
    criterion = criterion.cuda()

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level="O2",
                                          keep_batchnorm_fp32=False,
                                          loss_scale='dynamic')

    if args.distributed:
        model = DDP(model)

    local_batch = args.batch_size // args.world_size
    traced_criterion = torch.jit.trace(
        criterion.forward,
        (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        state_dict = {k.replace('module.', ''): v
                      for k, v in state_dict.items()}
        model.load_state_dict(state_dict)

    if args.mode == 'test':
        start = time.time()
        hr, ndcg = val_epoch(model, test_loader, args.topk,
                             distributed=args.distributed)
        val_time = time.time() - start
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        dllogger.log(step=tuple(),
                     data={'best_eval_throughput': eval_throughput,
                           'hr@10': hr})
        return

    # max_hr should always be overridden if hr > 0. It is theoretically
    # possible for the hit rate to be zero in the first epoch, which would
    # otherwise result in referring to an uninitialized variable.
    max_hr = 0
    best_epoch = 0
    best_model_timestamp = time.time()
    train_throughputs, eval_throughputs = [], []

    for epoch in range(args.epochs):
        begin = time.time()
        batch_dict_list = train_loader.get_epoch_data()
        num_batches = len(batch_dict_list)
        for i in range(num_batches // args.grads_accumulated):
            # accumulate gradients over args.grads_accumulated batches,
            # then take a single optimizer step
            for j in range(args.grads_accumulated):
                batch_idx = (args.grads_accumulated * i) + j
                batch_dict = batch_dict_list[batch_idx]

                user_features = batch_dict[USER_CHANNEL_NAME]
                item_features = batch_dict[ITEM_CHANNEL_NAME]
                user_batch = user_features[user_feature_name]
                item_batch = item_features[item_feature_name]

                label_features = batch_dict[LABEL_CHANNEL_NAME]
                label_batch = label_features[label_feature_name]

                outputs = model(user_batch, item_batch)
                loss = traced_criterion(outputs,
                                        label_batch.view(-1, 1)).float()
                loss = torch.mean(loss.view(-1), 0)

                if args.amp:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
            optimizer.step()

            for p in model.parameters():
                p.grad = None

        del batch_dict_list
        train_time = time.time() - begin
        begin = time.time()

        epoch_samples = train_loader.length_after_augmentation
        train_throughput = epoch_samples / train_time
        train_throughputs.append(train_throughput)

        hr, ndcg = val_epoch(model, test_loader, args.topk,
                             distributed=args.distributed)

        val_time = time.time() - begin
        eval_size = test_loader.raw_dataset_length
        eval_throughput = eval_size / val_time
        eval_throughputs.append(eval_throughput)

        dllogger.log(step=(epoch,),
                     data={'train_throughput': train_throughput,
                           'hr@10': hr,
                           'train_epoch_time': train_time,
                           'validation_epoch_time': val_time,
                           'eval_throughput': eval_throughput})

        if hr > max_hr and args.local_rank == 0:
            max_hr = hr
            best_epoch = epoch
            print("New best hr!")
            if args.checkpoint_dir:
                save_checkpoint_path = os.path.join(args.checkpoint_dir,
                                                    'model.pth')
                print("Saving the model to: ", save_checkpoint_path)
                torch.save(model.state_dict(), save_checkpoint_path)
            best_model_timestamp = time.time()

        if args.threshold is not None:
            if hr >= args.threshold:
                print("Hit threshold of {}".format(args.threshold))
                break

    if args.local_rank == 0:
        dllogger.log(step=tuple(),
                     data={'best_train_throughput': max(train_throughputs),
                           'best_eval_throughput': max(eval_throughputs),
                           'mean_train_throughput': np.mean(train_throughputs),
                           'mean_eval_throughput': np.mean(eval_throughputs),
                           'best_accuracy': max_hr,
                           'best_epoch': best_epoch,
                           'time_to_target': time.time() - main_start_time,
                           'time_to_best_model': best_model_timestamp - main_start_time})