def init_dllogger(log_fpath=None, dummy=False):
    if dummy:
        DLLogger.init(backends=[])
        return
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE,
                      step_format=stdout_step_format,
                      metric_format=stdout_metric_format)
    ])
    DLLogger.metadata("train_loss", {"name": "loss", "format": ":>5.2f"})
    DLLogger.metadata("train_mel_loss", {"name": "mel loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_loss", {"name": "avg train loss", "format": ":>5.2f"})
    DLLogger.metadata("avg_train_mel_loss", {"name": "avg train mel loss", "format": ":>5.2f"})
    DLLogger.metadata("val_loss", {"name": " avg val loss", "format": ":>5.2f"})
    DLLogger.metadata("val_mel_loss", {"name": " avg val mel loss", "format": ":>5.2f"})
    DLLogger.metadata("val_ema_loss", {"name": " EMA val loss", "format": ":>5.2f"})
    DLLogger.metadata("val_ema_mel_loss", {"name": " EMA val mel loss", "format": ":>5.2f"})
    DLLogger.metadata("train_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata("avg_train_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata("val_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata("val_ema_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
    DLLogger.metadata("took", {"name": "took", "unit": "s", "format": ":>3.2f"})
    DLLogger.metadata("lrate_change", {"name": "lrate"})
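# Usage sketch (illustrative, not from the original source): init_dllogger()
# above assumes DLLogger, JSONStreamBackend, StdOutBackend, Verbosity, and the
# stdout_step_format/stdout_metric_format helpers are importable in this
# module. Metric keys passed to DLLogger.log should match the metadata
# registered above so the StdOut backend formats them.
def _example_init_dllogger_usage(epoch, step):
    init_dllogger(log_fpath="nvlog.json")
    DLLogger.log(step=(epoch, step),
                 data={"train_loss": 5.17, "train_mel_loss": 4.02})
    DLLogger.log(step=(epoch, step),
                 data={"train_frames/s": 85000.0, "took": 0.31})
    DLLogger.flush()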
def setup_logger(args):
    aggregator_dict = OrderedDict([
        ('loss', 'average'),
        ('weighted_loss', 'average'),
        ('tokens', ('average', 'performance')),
        ('updates', 'performance'),
        ('gnorm', 'average')
    ])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if os.path.exists(log_path):
        for i in itertools.count():
            s_fname = args.stat_file.split('.')
            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.stat_file + f'.{i}'
            log_path = os.path.join(args.save_dir, fname)
            if not os.path.exists(log_path):
                break

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
                                AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
                                TensorBoardBackend(verbosity=1, log_dir=args.save_dir)])
    else:
        dllogger.init(backends=[])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = get_framework_env_vars()
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f', 'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f', 'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})
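# Illustrative driver for setup_logger() above, assuming the surrounding
# module's imports (argparse, dllogger, etc.). The Namespace fields mirror the
# attributes the function reads; the values are placeholders. Logged keys stay
# within the aggregator_dict so the AggregatorBackend can consume them.
def _example_setup_logger_usage():
    args = argparse.Namespace(save_dir='results', stat_file='run_log.json',
                              distributed_world_size=1, distributed_rank=0)
    setup_logger(args)
    dllogger.log(step=(0, 100), data={'loss': 5.61, 'gnorm': 0.82}, verbosity=0)
    dllogger.flush()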
def __init__(self, log_file, global_batch_size, warmup_steps: int = 0,
             profile: bool = False):
    logger.init(backends=[JSONStreamBackend(Verbosity.VERBOSE, log_file),
                          StdOutBackend(Verbosity.VERBOSE)])
    self.warmup_steps = warmup_steps
    self.global_batch_size = global_batch_size
    self.step = 0
    self.profile = profile
    self.timestamps = []
def get_dllogger(results):
    return Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, os.path.join(results, "logs.json")),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: f"Epoch: {step} "),
    ])
def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw):
    if enabled:
        backends = [JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(log_fpath)),
                    StdOutBackend(Verbosity.VERBOSE,
                                  step_format=stdout_step_format,
                                  metric_format=stdout_metric_format)]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('val', ' avg val '), ('val_ema', ' EMA val ')]:
        dllogger.metadata(f"{id_}_loss", {"name": f"{pref}loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_mel_loss", {"name": f"{pref}mel loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_frames/s", {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(f"{id_}_took", {"name": "took", "unit": "s", "format": ":>3.2f"})

    global tb_loggers
    tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw) for s in tb_subsets}
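# Illustrative call pattern for init() above: one JSON/stdout logger plus a
# TensorBoard logger per subset. Assumes unique_log_fpath, TBLogger, and the
# stdout_* formatters exist in this module, as the function requires.
def _example_init_usage():
    init('nvlog.json', 'output_dir', enabled=True, tb_subsets=['train', 'val'])
    # Keys follow the f"{id_}_..." scheme registered in the metadata loop.
    dllogger.log(step=(1, 20), data={'train_loss': 4.93, 'train_mel_loss': 3.85})
    dllogger.flush()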
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch FastPitch Inference Benchmark') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'}) model = load_and_setup_model('FastPitch', parser, None, args.amp_run, 'cuda', unk_args=[], forward_is_infer=True, ema=False, jitable=True) # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03 # model = torch.jit.script(model) warmup_iters = 3 iters = 1 gen_measures = MeasureTime() all_frames = 0 for i in range(-warmup_iters, iters): text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 128), dtype=torch.long).to('cuda') input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).to('cuda') durs = torch.ones_like(text_padded).mul_(4).to('cuda') with torch.no_grad(), gen_measures: mels, *_ = model(text_padded, input_lengths, dur_tgt=durs) num_frames = mels.size(0) * mels.size(2) if i >= 0: all_frames += num_frames DLLogger.log(step=(i, ), data={"latency": gen_measures[-1]}) DLLogger.log(step=(i, ), data={"frames/s": num_frames / gen_measures[-1]}) measures = gen_measures[warmup_iters:] DLLogger.log(step=(), data={'avg latency': np.mean(measures)}) DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)}) DLLogger.flush()
def get_logger(params):
    backends = []
    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)]
    logger.init(backends=backends)
    return logger
def _initialize_dllogger(self, log_dir, filename, append):
    backends = [
        JSONStreamBackend(Verbosity.VERBOSE, os.path.join(log_dir, filename),
                          append=append),
        StdOutBackend(Verbosity.VERBOSE),
    ]
    logger.init(backends=backends)
def __init__(self, log_dir, global_batch_size, mode, warmup, dim, profile):
    logger.init(backends=[JSONStreamBackend(Verbosity.VERBOSE, log_dir),
                          StdOutBackend(Verbosity.VERBOSE)])
    self.warmup_steps = warmup
    self.global_batch_size = global_batch_size
    self.step = 0
    self.dim = dim
    self.mode = mode
    self.profile = profile
    self.timestamps = []
def setup_logger(args):
    aggregator_dict = OrderedDict([('loss', 'average'),
                                   ('weighted_loss', 'average'),
                                   ('tokens', ('average', 'performance')),
                                   ('updates', 'performance'),
                                   ('gnorm', 'average')])
    os.makedirs(args.save_dir, exist_ok=True)
    log_path = os.path.join(args.save_dir, args.stat_file)

    if not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            AggregatorBackend(verbosity=0, agg_dict=aggregator_dict),
            TensorBoardBackend(verbosity=1, log_dir=args.save_dir)
        ])
    else:
        dllogger.init(backends=[])

    for k, v in vars(args).items():
        dllogger.log(step='PARAMETER', data={k: v}, verbosity=0)

    container_setup_info = {
        'NVIDIA_PYTORCH_VERSION': os.environ.get('NVIDIA_PYTORCH_VERSION'),
        'PYTORCH_VERSION': os.environ.get('PYTORCH_VERSION'),
        'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'),
        'NCCL_VERSION': os.environ.get('NCCL_VERSION'),
        'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'),
        'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'),
        'CUDA_VERSION': os.environ.get('CUDA_VERSION'),
        'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'),
        'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'),
        'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'),
    }
    dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('val_loss', {'unit': 'nat', 'GOAL': 'MINIMIZE', 'STAGE': 'VAL'})
    dllogger.metadata('speed', {'unit': 'tokens/s', 'format': ':.3f',
                                'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN'})
    dllogger.metadata('accuracy', {'unit': 'bleu', 'format': ':.2f',
                                   'GOAL': 'MAXIMIZE', 'STAGE': 'VAL'})
def log(logname, dice, results="/results"):
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, os.path.join(results, logname)),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])
    metrics = {}
    metrics.update({"Mean dice": round(dice.mean().item(), 2)})
    metrics.update({f"L{j+1}": round(m.item(), 2) for j, m in enumerate(dice)})
    dllogger.log(step=(), data=metrics)
    dllogger.flush()
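# Illustrative call to log() above with a per-class dice tensor (torch is
# assumed to be imported at module level, as in the surrounding code). The
# mean is written as "Mean dice" and each class as "L1", "L2", ... to
# /results/eval_log.json and to stdout without a step prefix.
def _example_log_usage():
    dice = torch.tensor([0.89, 0.91, 0.86])
    log("eval_log.json", dice)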
def get_logger(params):
    backends = []
    if params.worker_id == 0 or params.log_all_workers:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            log_file = f"{params.log_dir}/log.json"
            backends += [JSONStreamBackend(Verbosity.VERBOSE, log_file)]
    logger.init(backends=backends)
    return logger
def get_logger(params):
    backends = []
    worker_id = hvd_rank() if horovod_enabled() else 0
    if worker_id == 0:
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            log_file = f"{params.log_dir}/log.json"
            backends += [JSONStreamBackend(Verbosity.VERBOSE, log_file)]
    logger.init(backends=backends)
    return logger
def setup_logger(args):
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        for i in itertools.count():
            s_fname = args.log_file.split('.')
            fname = '.'.join(s_fname[:-1]) + f'_{i}.' + s_fname[-1] if len(s_fname) > 1 else args.log_file + f'.{i}'
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    def metric_format(metric, metadata, value):
        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            TensorBoardBackend(verbosity=1, log_dir=args.results),
            StdOutBackend(verbosity=2,
                          step_format=step_format,
                          prefix_format=lambda x: "",
                          # metric_format=metric_format,  # defined above; currently disabled
                          ),
        ])
    else:
        dllogger.init(backends=[])
    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    dllogger.metadata('loss', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TRAIN', 'format': ':5f'})
    dllogger.metadata('items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'TRAIN', 'format': ':1f'})
    dllogger.metadata('val_loss', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'VAL', 'format': ':5f'})
    dllogger.metadata('val_items/s', {'GOAL': 'MAXIMIZE', 'STAGE': 'VAL', 'format': ':1f'})
    dllogger.metadata('test_P10', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P50', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('test_P90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('throughput', {'GOAL': 'MAXIMIZE', 'STAGE': 'TEST', 'format': ':1f'})
    dllogger.metadata('latency_p90', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p95', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
    dllogger.metadata('latency_p99', {'GOAL': 'MINIMIZE', 'STAGE': 'TEST', 'format': ':5f'})
def init_log(args):
    enabled = not dist.is_initialized() or dist.get_rank() == 0
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        backends = [JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(fpath)),
                    StdOutBackend(Verbosity.VERBOSE,
                                  step_format=stdout_step_format,
                                  metric_format=stdout_metric_format)]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('dev_ema', ' dev ema ')]:
        dllogger.metadata(f"{id_}_loss", {"name": f"{pref}loss", "format": ":>7.2f"})
        dllogger.metadata(f"{id_}_wer", {"name": f"{pref}wer", "format": ":>6.2f"})
        dllogger.metadata(f"{id_}_pplx", {"name": f"{pref}pplx", "format": ":>6.2f"})
        dllogger.metadata(f"{id_}_throughput", {"name": f"{pref}utts/s", "format": ":>5.0f"})
        dllogger.metadata(f"{id_}_took", {"name": "took", "unit": "s", "format": ":>5.2f"})

    tb_subsets = ['train', 'dev_ema']
    global tb_loggers
    tb_loggers = {s: TBLogger(enabled, args.output_dir, name=s) for s in tb_subsets}

    log_parameters(vars(args), tb_subset='train')
def get_dllogger(params):
    backends = []
    if is_main_process():
        backends += [StdOutBackend(Verbosity.VERBOSE)]
        if params.log_dir:
            backends += [JSONStreamBackend(Verbosity.VERBOSE,
                                           os.path.join(params.log_dir, "log.json"))]
    logger.init(backends=backends)
    return logger
def setup_dllogger(rank, enabled=True, filename='log.json'):
    if enabled and rank == 0:
        backends = [
            StdOutBackend(Verbosity.DEFAULT),
            JSONStreamBackend(Verbosity.VERBOSE, filename),
        ]
        DLLogger.init(backends)
    else:
        DLLogger.init([])
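# Illustrative multi-process use of setup_dllogger() above: every rank calls
# it, but only rank 0 gets real backends, so DLLogger.log becomes a safe no-op
# on the other ranks. Reading the RANK environment variable is an assumption
# about the launcher; os is assumed imported at module level.
def _example_setup_dllogger_usage():
    rank = int(os.environ.get("RANK", "0"))
    setup_dllogger(rank, enabled=True, filename="train_log.json")
    DLLogger.log(step=(0,), data={"loss": 3.21})
    DLLogger.flush()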
def get_logger(params): """ Get logger object :param params: Dict with additional parameters :return: logger """ backends = [] if hvd.rank() == 0: backends += [StdOutBackend(Verbosity.VERBOSE)] if params.log_dir: backends += [JSONStreamBackend(Verbosity.VERBOSE, params.log_dir)] logger.init(backends=backends) return logger
def log(logname, dice, epoch=None, dice_tta=None):
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, os.path.join(args.results, logname)),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])
    metrics = {}
    if epoch is not None:
        metrics.update({"Epoch": epoch})
    metrics.update({"Mean dice": round(dice.mean().item(), 2)})
    if dice_tta is not None:
        metrics.update({"Mean TTA dice": round(dice_tta.mean().item(), 2)})
    metrics.update({f"L{j+1}": round(m.item(), 2) for j, m in enumerate(dice)})
    if dice_tta is not None:
        metrics.update({f"TTA_L{j+1}": round(m.item(), 2) for j, m in enumerate(dice_tta)})
    dllogger.log(step=(), data=metrics)
    dllogger.flush()
def __init__(self, args):
    super(Model, self).__init__()
    self.save_hyperparameters()
    self.args = args
    self.f1_score = F1(args)
    self.model = UNetLoc(args) if args.type == "pre" else get_dmg_unet(args)
    self.loss = Loss(args)
    self.best_f1 = torch.tensor(0)
    self.best_epoch = 0
    self.tta_flips = [[2], [3], [2, 3]]
    self.lr = args.lr
    self.n_class = 2 if self.args.type == "pre" else 5
    self.softmax = nn.Softmax(dim=1)
    self.test_idx = 0
    self.dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE,
                          os.path.join(args.results, f"{args.logname}.json")),
        StdOutBackend(Verbosity.VERBOSE,
                      step_format=lambda step: f"Epoch: {step} "),
    ])
def setup_logger(config):
    log_path = config.get("log_path", os.getcwd())
    if is_main_process():
        backends = [
            TensorBoardBackend(verbosity=dllogger.Verbosity.VERBOSE, log_dir=log_path),
            JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                              filename=os.path.join(log_path, "log.json")),
            AggregatorBackend(verbosity=dllogger.Verbosity.VERBOSE,
                              agg_dict={"loss": AverageMeter}),
            StdOutBackend(
                verbosity=dllogger.Verbosity.DEFAULT,
                step_format=empty_step_format,
                metric_format=no_string_metric_format,
                prefix_format=empty_prefix_format,
            ),
        ]
        logger = Logger(backends=backends)
    else:
        logger = Logger(backends=[])

    container_setup_info = get_framework_env_vars()
    logger.log(step="PARAMETER", data=container_setup_info,
               verbosity=dllogger.Verbosity.DEFAULT)

    logger.metadata("loss", {"unit": "nat", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    logger.metadata("val_loss", {"unit": "nat", "GOAL": "MINIMIZE", "STAGE": "VAL"})
    return logger
def __init__(self, args):
    super(NNUnet, self).__init__()
    self.args = args
    self.save_hyperparameters()
    self.build_nnunet()
    self.loss = Loss()
    self.dice = Dice(self.n_class)
    self.best_sum = 0
    self.eval_dice = 0
    self.best_sum_epoch = 0
    self.best_dice = self.n_class * [0]
    self.best_epoch = self.n_class * [0]
    self.best_sum_dice = self.n_class * [0]
    self.learning_rate = args.learning_rate
    if self.args.exec_mode in ["train", "evaluate"]:
        self.dllogger = Logger(backends=[
            JSONStreamBackend(Verbosity.VERBOSE,
                              os.path.join(args.results, "logs.json")),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=lambda step: f"Epoch: {step} "),
        ])
    self.tta_flips = ([[2], [3], [2, 3]] if self.args.dim == 2
                      else [[2], [3], [4], [2, 3], [2, 4], [3, 4], [2, 3, 4]])
def __init__(self, log_path="bert_dllog.json"):
    self.logger = Logger([
        StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step),
        JSONStreamBackend(Verbosity.VERBOSE, log_path),
    ])
    self.logger.metadata("mlm_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    self.logger.metadata("nsp_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    self.logger.metadata("avg_loss_step", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    self.logger.metadata("total_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    self.logger.metadata("loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"})
    # Validation-quality metrics are maximized.
    self.logger.metadata("f1", {"format": ":.4f", "GOAL": "MAXIMIZE", "STAGE": "VAL"})
    self.logger.metadata("precision", {"format": ":.4f", "GOAL": "MAXIMIZE", "STAGE": "VAL"})
    self.logger.metadata("recall", {"format": ":.4f", "GOAL": "MAXIMIZE", "STAGE": "VAL"})
    self.logger.metadata("mcc", {"format": ":.4f", "GOAL": "MAXIMIZE", "STAGE": "VAL"})
    self.logger.metadata("exact_match", {"format": ":.4f", "GOAL": "MAXIMIZE", "STAGE": "VAL"})
    self.logger.metadata(
        "throughput_train",
        {"unit": "seq/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "TRAIN"},
    )
    self.logger.metadata(
        "throughput_inf",
        {"unit": "seq/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "VAL"},
    )
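# Usage sketch for the wrapper class whose __init__ appears above. The class
# name is not shown in this excerpt, so "DllLoggerWrapper" is a stand-in;
# format_step is assumed to be defined on the class.
def _example_bert_logger_usage():
    wrapper = DllLoggerWrapper(log_path="bert_dllog.json")  # hypothetical name
    wrapper.logger.log(step=(1, 100),
                       data={"loss": 2.31, "throughput_train": 410.2})
    wrapper.logger.flush()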
def main():
    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size
    distributed_run = world_size > 1

    if local_rank == 0:
        log_file = os.path.join(args.output, args.log_file)
        DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                                StdOutBackend(Verbosity.VERBOSE)])
    else:
        DLLogger.init(backends=[])

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    model_name = args.model_name
    parser = models.model_parser(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config, cpu_run=False,
                             uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)

    if distributed_run:
        model = DDP(model, device_ids=[local_rank], output_device=local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    start_epoch = [0]

    if args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)

    if args.checkpoint_path != "":
        load_checkpoint(model, optimizer, start_epoch, model_config,
                        args.amp, args.checkpoint_path, local_rank)

    start_epoch = start_epoch[0]

    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(model_name, n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    if distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler, batch_size=args.batch_size,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    train_epoch_items_per_sec = 0.0
    val_loss = 0.0
    num_iters = 0

    model.train()

    for epoch in range(start_epoch, args.epochs):
        torch.cuda.synchronize()
        epoch_start_time = time.perf_counter()
        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0
        train_epoch_items_per_sec = 0.0
        num_iters = 0
        reduced_loss = 0
        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            torch.cuda.synchronize()
            iter_start_time = time.perf_counter()
            DLLogger.log(step=(epoch, i),
                         data={'glob_iter/iters_per_epoch': str(iteration) + "/" + str(len(train_loader))})

            adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor, local_rank)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            # AMP upstream autocast
            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            DLLogger.log(step=(epoch, i), data={'train_loss': reduced_loss})

            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.amp:
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)
                optimizer.step()

            torch.cuda.synchronize()
            iter_stop_time = time.perf_counter()
            iter_time = iter_stop_time - iter_start_time
            items_per_sec = reduced_num_items / iter_time
            train_epoch_items_per_sec += items_per_sec

            DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec})
            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
            iteration += 1

        torch.cuda.synchronize()
        epoch_stop_time = time.perf_counter()
        epoch_time = epoch_stop_time - epoch_start_time

        DLLogger.log(step=(epoch,),
                     data={'train_items_per_sec': (train_epoch_items_per_sec / num_iters if num_iters > 0 else 0.0)})
        DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
        DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})

        val_loss, val_items_per_sec = validate(model, criterion, valset, epoch,
                                               iteration, args.batch_size,
                                               world_size, collate_fn,
                                               distributed_run, local_rank,
                                               batch_to_gpu)

        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, scaler, epoch, model_config,
                            args.amp, args.output, args.model_name,
                            local_rank, world_size)
        if local_rank == 0:
            DLLogger.flush()

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    DLLogger.log(step=tuple(), data={'run_time': run_time})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    DLLogger.log(step=tuple(),
                 data={'train_items_per_sec': (train_epoch_items_per_sec / num_iters if num_iters > 0 else 0.0)})
    DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})

    if local_rank == 0:
        DLLogger.flush()
def main():
    parser = argparse.ArgumentParser(description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2', parser, args.tacotron2_checkpoint, amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False, ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    filenames = [Path(l.split('|')[0]).stem
                 for l in open(args.wav_text_filelist, 'r')]

    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1
    dataset = FilenamedLoader(filenames, args.dataset_path, args.wav_text_filelist,
                              args, load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
                             sampler=None, num_workers=0,
                             collate_fn=TextMelCollate(1),
                             pin_memory=False, drop_last=False)

    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}

    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher', fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions', fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)

        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                dur = torch.histc(torch.argmax(ali, dim=1), min=0,
                                  max=text_len - 1, bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)

        if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar:
            for j, dur in enumerate(durations):
                fpath = Path(args.dataset_path, 'pitch_char', fnames[j] + '.pt')
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(str(wav), dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)', data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
def main():
    parser = argparse.ArgumentParser(description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt, True,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE)])

    texts = []
    try:
        f = open(args.input, 'r')
        texts = f.readlines()
    except OSError:
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context,
                                               postnet_context,
                                               sequences, sequence_lengths,
                                               measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                    measurements, args.fp16)

    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(args.output, "audio_" + str(i) + "_trt.wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1) / latency
    log_data = "1," + str(sequence_lengths[0].item()) + "," + prec + "," + \
               str(latency) + "," + str(throughput) + "," + \
               str(mel_lengths[0].item()) + "\n"
    with open("log_bs1_" + prec + ".log", 'a') as f:
        f.write(log_data)
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU or CPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, unknown_args = parser.parse_known_args() DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE)]) for k,v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k:v}) DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'}) measurements_all = {"pre_processing": [], "tacotron2_latency": [], "waveglow_latency": [], "latency": [], "type_conversion": [], "data_transfer": [], "storage": [], "tacotron2_items_per_sec": [], "waveglow_items_per_sec": [], "num_mels_per_audio": [], "throughput": []} print("args:", args, unknown_args) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, args.cpu_run, forward_is_infer=True) waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, args.cpu_run) if args.cpu_run: denoiser = Denoiser(waveglow, args.cpu_run) else: denoiser = Denoiser(waveglow, args.cpu_run).cuda() jitted_tacotron2 = torch.jit.script(tacotron2) texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."] texts = [texts[0][:args.input_length]] texts = texts*args.batch_size warmup_iters = 3 for iter in range(args.num_iters): measurements = {} with MeasureTime(measurements, "pre_processing", args.cpu_run): sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run) with torch.no_grad(): with MeasureTime(measurements, "latency", args.cpu_run): with MeasureTime(measurements, "tacotron2_latency", args.cpu_run): mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths) with MeasureTime(measurements, "waveglow_latency", args.cpu_run): audios = waveglow.infer(mel, sigma=args.sigma_infer) audios = audios.float() audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) num_mels = mel.size(0)*mel.size(2) num_samples = audios.size(0)*audios.size(1) with MeasureTime(measurements, "type_conversion", args.cpu_run): audios = audios.float() with MeasureTime(measurements, "data_transfer", args.cpu_run): audios = audios.cpu() with MeasureTime(measurements, "storage", args.cpu_run): audios = audios.numpy() for i, audio in enumerate(audios): audio_path = "audio_"+str(i)+".wav" write(audio_path, args.sampling_rate, audio[:mel_lengths[i]*args.stft_hop_length]) measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency'] measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency'] measurements['num_mels_per_audio'] = mel.size(2) measurements['throughput'] = num_samples/measurements['latency'] if iter >= warmup_iters: for k,v in measurements.items(): measurements_all[k].append(v) DLLogger.log(step=(iter-warmup_iters), data={k: v}) DLLogger.flush() print_stats(measurements_all)
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, forward_is_infer=True) waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, forward_is_infer=True) denoiser = Denoiser(waveglow).cuda() jitted_tacotron2 = torch.jit.script(tacotron2) texts = [] try: f = open(args.input, 'r') texts = f.readlines() except: print("Could not read file") sys.exit(1) if args.include_warmup: sequence = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda() input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long() for i in range(3): with torch.no_grad(): mel, mel_lengths = jitted_tacotron2(sequence, input_lengths) _ = waveglow(mel) measurements = {} sequences_padded, input_lengths = prepare_input_sequence(texts) with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths) with torch.no_grad(), MeasureTime(measurements, "waveglow_time"): audios = waveglow(mel, sigma=args.sigma_infer) audios = audios.float() audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) print("Stopping after", mel.size(2), "decoder steps") tacotron2_infer_perf = mel.size(0) * mel.size( 2) / measurements['tacotron2_time'] waveglow_infer_perf = audios.size(0) * audios.size( 1) / measurements['waveglow_time'] DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf}) DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']}) DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf}) DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']}) DLLogger.log(step=0, data={ "latency": (measurements['tacotron2_time'] + measurements['waveglow_time']) }) for i, audio in enumerate(audios): audio = audio[:mel_lengths[i] * args.stft_hop_length] audio = audio / torch.max(torch.abs(audio)) audio_path = args.output + "audio_" + str(i) + ".wav" write(audio_path, args.sampling_rate, audio.cpu().numpy()) DLLogger.flush()
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', allow_abbrev=False) parser = parse_args(parser) args, unk_args = parser.parse_known_args() if args.p_arpabet > 0.0: cmudict.initialize(args.cmudict_path, keep_ambiguous=True) torch.backends.cudnn.benchmark = args.cudnn_benchmark if args.output is not None: Path(args.output).mkdir(parents=False, exist_ok=True) log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json')) log_fpath = unique_log_fpath(log_fpath) DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath), StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)]) init_inference_metadata() [DLLogger.log("PARAMETER", {k: v}) for k, v in vars(args).items()] device = torch.device('cuda' if args.cuda else 'cpu') if args.fastpitch != 'SKIP': generator = load_and_setup_model( 'FastPitch', parser, args.fastpitch, args.amp, device, unk_args=unk_args, forward_is_infer=True, ema=args.ema, jitable=args.torchscript) if args.torchscript: generator = torch.jit.script(generator) else: generator = None if args.waveglow != 'SKIP': with warnings.catch_warnings(): warnings.simplefilter("ignore") waveglow = load_and_setup_model( 'WaveGlow', parser, args.waveglow, args.amp, device, unk_args=unk_args, forward_is_infer=True, ema=args.ema) denoiser = Denoiser(waveglow).to(device) waveglow = getattr(waveglow, 'infer', waveglow) else: waveglow = None if len(unk_args) > 0: raise ValueError(f'Invalid options {unk_args}') fields = load_fields(args.input) batches = prepare_input_sequence( fields, device, args.symbol_set, args.text_cleaners, args.batch_size, args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet) # Use real data rather than synthetic - FastPitch predicts len for _ in tqdm(range(args.warmup_steps), 'Warmup'): with torch.no_grad(): if generator is not None: b = batches[0] mel, *_ = generator(b['text']) if waveglow is not None: audios = waveglow(mel, sigma=args.sigma_infer).float() _ = denoiser(audios, strength=args.denoising_strength) gen_measures = MeasureTime(cuda=args.cuda) waveglow_measures = MeasureTime(cuda=args.cuda) gen_kw = {'pace': args.pace, 'speaker': args.speaker, 'pitch_tgt': None, 'pitch_transform': build_pitch_transformation(args)} if args.torchscript: gen_kw.pop('pitch_transform') print('NOTE: Pitch transforms are disabled with TorchScript') all_utterances = 0 all_samples = 0 all_letters = 0 all_frames = 0 reps = args.repeats log_enabled = reps == 1 log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)): for b in batches: if generator is None: log(rep, {'Synthesizing from ground truth mels'}) mel, mel_lens = b['mel'], b['mel_lens'] else: with torch.no_grad(), gen_measures: mel, mel_lens, *_ = generator(b['text'], **gen_kw) gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1] all_letters += b['text_lens'].sum().item() all_frames += mel.size(0) * mel.size(2) log(rep, {"fastpitch_frames/s": gen_infer_perf}) log(rep, {"fastpitch_latency": gen_measures[-1]}) if args.save_mels: for i, mel_ in enumerate(mel): m = mel_[:, :mel_lens[i].item()].permute(1, 0) fname = b['output'][i] if 'output' in b else f'mel_{i}.npy' mel_path = Path(args.output, Path(fname).stem + '.npy') np.save(mel_path, m.cpu().numpy()) if waveglow is not None: with torch.no_grad(), waveglow_measures: audios = waveglow(mel, 
sigma=args.sigma_infer) audios = denoiser(audios.float(), strength=args.denoising_strength ).squeeze(1) all_utterances += len(audios) all_samples += sum(audio.size(0) for audio in audios) waveglow_infer_perf = ( audios.size(0) * audios.size(1) / waveglow_measures[-1]) log(rep, {"waveglow_samples/s": waveglow_infer_perf}) log(rep, {"waveglow_latency": waveglow_measures[-1]}) if args.output is not None and reps == 1: for i, audio in enumerate(audios): audio = audio[:mel_lens[i].item() * args.stft_hop_length] if args.fade_out: fade_len = args.fade_out * args.stft_hop_length fade_w = torch.linspace(1.0, 0.0, fade_len) audio[-fade_len:] *= fade_w.to(audio.device) audio = audio / torch.max(torch.abs(audio)) fname = b['output'][i] if 'output' in b else f'audio_{i}.wav' audio_path = Path(args.output, fname) write(audio_path, args.sampling_rate, audio.cpu().numpy()) if generator is not None and waveglow is not None: log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])}) log_enabled = True if generator is not None: gm = np.sort(np.asarray(gen_measures)) rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate) log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()}) log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()}) log((), {"avg_fastpitch_latency": gm.mean()}) log((), {"avg_fastpitch_RTF": rtf}) log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()}) log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()}) log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()}) if waveglow is not None: wm = np.sort(np.asarray(waveglow_measures)) rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate) log((), {"avg_waveglow_samples/s": all_samples / wm.sum()}) log((), {"avg_waveglow_latency": wm.mean()}) log((), {"avg_waveglow_RTF": rtf}) log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()}) log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()}) log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()}) if generator is not None and waveglow is not None: m = gm + wm rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate) log((), {"avg_samples/s": all_samples / m.sum()}) log((), {"avg_letters/s": all_letters / m.sum()}) log((), {"avg_latency": m.mean()}) log((), {"avg_RTF": rtf}) log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()}) log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()}) log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()}) DLLogger.flush()
def main(_):
    # get e2e training time
    begin = time.time()
    logging.info("Training started at: {}".format(time.asctime()))

    hvd.init()

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs
    if FLAGS.lr:
        config.learning_rate = FLAGS.lr
    if FLAGS.warmup_value:
        config.lr_warmup_init = FLAGS.warmup_value
    if FLAGS.warmup_epochs:
        config.lr_warmup_epoch = FLAGS.warmup_epochs
    config.backbone_init = FLAGS.backbone_init
    config.mixed_precision = FLAGS.amp
    config.image_size = model_utils.parse_image_size(config.image_size)

    # get eval config
    eval_config = hparams_config.get_detection_config(FLAGS.model_name)
    eval_config.override(FLAGS.hparams)
    eval_config.val_json_file = FLAGS.val_json_file
    eval_config.val_file_pattern = FLAGS.val_file_pattern
    eval_config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    eval_config.drop_remainder = False  # eval all examples w/o drop.
    eval_config.image_size = model_utils.parse_image_size(eval_config['image_size'])

    # setup
    setup.set_flags(FLAGS, config, training=True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Check data path
    if FLAGS.training_file_pattern is None or FLAGS.val_file_pattern is None or FLAGS.val_json_file is None:
        raise RuntimeError('You must specify --training_file_pattern, '
                           '--val_file_pattern and --val_json_file for training.')

    steps_per_epoch = (FLAGS.num_examples_per_epoch +
                       (FLAGS.batch_size * get_world_size()) - 1) // \
                      (FLAGS.batch_size * get_world_size())
    if FLAGS.benchmark:
        # For ci perf training runs, run for a fixed number of iterations per epoch
        steps_per_epoch = FLAGS.benchmark_steps

    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  checkpoint_period=FLAGS.checkpoint_period,
                  batch_size=FLAGS.batch_size,
                  num_shards=get_world_size(),
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode='train')
    logging.info('Training params: {}'.format(params))

    # make output dir if it does not exist
    tf.io.gfile.makedirs(FLAGS.model_dir)

    # dllogger setup
    backends = []
    if is_main_process():
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        backends += [JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
                     StdOutBackend(verbosity=Verbosity.DEFAULT)]
    DLLogger.init(backends=backends)

    def get_dataset(is_training, params):
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')
        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            enable_map_parallelization=FLAGS.enable_map_parallelization)(params)

    num_samples = (FLAGS.eval_samples + get_world_size() - 1) // get_world_size()
    num_samples = (num_samples + FLAGS.eval_batch_size - 1) // FLAGS.eval_batch_size
    eval_config.num_samples = num_samples

    def get_eval_dataset(eval_config):
        dataset = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=eval_config.max_instances_per_image)(
                eval_config, batch_size=FLAGS.eval_batch_size)
        dataset = dataset.shard(get_world_size(), get_rank())
        dataset = dataset.take(num_samples)
        return dataset

    eval_dataset = get_eval_dataset(eval_config)

    # pick focal loss implementation
    focal_loss = train_lib.StableFocalLoss(
        params['alpha'],
        params['gamma'],
        label_smoothing=params['label_smoothing'],
        reduction=tf.keras.losses.Reduction.NONE)

    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.build((None, *config.image_size, 3))
    model.compile(
        optimizer=optimizer_builder.get_optimizer(params),
        loss={
            'box_loss':
                train_lib.BoxLoss(params['delta'],
                                  reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
                train_lib.BoxIouLoss(params['iou_loss_type'],
                                     params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'],
                                     reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
                focal_loss,
            'seg_loss':
                tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        })

    train_from_epoch = util_keras.restore_ckpt(model,
                                               params['model_dir'],
                                               config.moving_average_decay,
                                               steps_per_epoch=steps_per_epoch)

    print("training_mode: {}".format(FLAGS.training_mode))
    callbacks = callback_builder.get_callbacks(params, FLAGS.training_mode,
                                               eval_config, eval_dataset,
                                               DLLogger, FLAGS.time_history,
                                               FLAGS.log_steps, FLAGS.lr_tb,
                                               FLAGS.benchmark)

    history = model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        initial_epoch=train_from_epoch,
        callbacks=callbacks,
        verbose=1 if is_main_process() else 0,
        validation_data=get_dataset(False, params=params) if FLAGS.validate else None,
        validation_steps=(FLAGS.eval_samples // FLAGS.eval_batch_size) if FLAGS.validate else None)

    if is_main_process():
        model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))

    # log final stats
    stats = {}
    for callback in callbacks:
        if isinstance(callback, callback_builder.TimeHistory):
            if callback.epoch_runtime_log:
                stats['avg_fps_training'] = callback.average_examples_per_second
                stats['avg_fps_training_per_GPU'] = (
                    callback.average_examples_per_second / get_world_size())
                stats['avg_latency_training'] = callback.average_time_per_iteration

    if history and history.history:
        train_hist = history.history
        # Gets final loss from training.
        stats['training_loss'] = float(
            hvd.allreduce(tf.constant(train_hist['loss'][-1], dtype=tf.float32),
                          average=True))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'ema_weights')):
        ckpt_epoch = "%02d" % sorted(set([
            int(f.rsplit('.')[0].rsplit('-')[1])
            for f in os.listdir(os.path.join(FLAGS.model_dir, 'ema_weights'))
            if 'emackpt' in f
        ]), reverse=True)[0]
        ckpt = os.path.join(FLAGS.model_dir, 'ema_weights',
                            'emackpt-' + str(ckpt_epoch))
        util_keras.restore_ckpt(model, ckpt, eval_config.moving_average_decay,
                                steps_per_epoch=0, skip_mismatch=False,
                                expect_partial=True)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'emackpt-final'))
    else:
        ckpt_epoch = 'final'
        ckpt = os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch))

    # Start evaluation of final ema checkpoint
    logging.set_verbosity(logging.WARNING)

    @tf.function
    def model_fn(images, labels):
        cls_outputs, box_outputs = model(images, training=False)
        detections = postprocess.generate_detections(eval_config, cls_outputs,
                                                     box_outputs,
                                                     labels['image_scales'],
                                                     labels['source_ids'])
        tf.numpy_function(evaluator.update_state,
                          [labels['groundtruth_data'],
                           postprocess.transform_detections(detections)], [])

    if not FLAGS.benchmark and FLAGS.training_mode == 'train':
        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(eval_config.label_map)
        evaluator = coco_metric.EvaluationMetric(
            filename=eval_config.val_json_file, label_map=label_map)
        evaluator.reset_states()

        # evaluate all images.
        pbar = tf.keras.utils.Progbar(num_samples)
        for i, (images, labels) in enumerate(eval_dataset):
            model_fn(images, labels)
            if is_main_process():
                pbar.update(i)

        # gather detections from all ranks
        evaluator.gather()

        if is_main_process():
            # compute the final eval results.
            metrics = evaluator.result()
            metric_dict = {}
            for i, name in enumerate(evaluator.metric_names):
                metric_dict[name] = metrics[i]

            if label_map:
                for i, cid in enumerate(sorted(label_map.keys())):
                    name = 'AP_/%s' % label_map[cid]
                    metric_dict[name] = metrics[i + len(evaluator.metric_names)]

            # csv format
            csv_metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
            csv_format = ",".join(
                [str(ckpt_epoch)] +
                [str(round(metric_dict[key] * 100, 2)) for key in csv_metrics])
            print(FLAGS.model_name, metric_dict, "csv format:", csv_format)

    MPI.COMM_WORLD.Barrier()

    if is_main_process():
        stats['e2e_training_time'] = time.time() - begin
        DLLogger.log(step=(), data=stats)