def get_logger(params):
    """Initialise the shared ``logger`` and hand it back.

    Backends are attached only on the process whose ``hvd.rank()`` is 0:
    a verbose stdout backend, plus a verbose JSON stream backend when
    ``params.log_dir`` is truthy. Other ranks initialise the logger with
    an empty backend list.

    :param params: object exposing a ``log_dir`` attribute; its value is
        passed unchanged as the second argument of ``JSONStreamBackend``
    :return: the module-level ``logger`` after ``logger.init``
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the JSON backend is assumed to be rank-0 only -- confirm.
    is_root = hvd.rank() == 0
    sinks = [StdOutBackend(Verbosity.VERBOSE)] if is_root else []
    if is_root and params.log_dir:
        sinks.append(JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))

    logger.init(backends=sinks)
    return logger
def __init__(self, log_dir, global_batch_size, mode, warmup, dim, profile):
    """Initialise the global ``logger`` and reset this object's counters.

    :param log_dir: passed as the second argument of ``JSONStreamBackend``
    :param global_batch_size: stored as ``self.global_batch_size``
    :param mode: stored as ``self.mode``
    :param warmup: stored as ``self.warmup_steps``
    :param dim: stored as ``self.dim``
    :param profile: stored as ``self.profile``
    """
    sinks = [
        JSONStreamBackend(Verbosity.VERBOSE, log_dir),
        StdOutBackend(Verbosity.VERBOSE),
    ]
    logger.init(backends=sinks)

    # Configuration copied verbatim from the constructor arguments.
    self.global_batch_size = global_batch_size
    self.warmup_steps = warmup
    self.profile = profile
    self.mode = mode
    self.dim = dim

    # Mutable progress state, empty for every new instance.
    self.step = 0
    self.timestamps = []
def log(logname, dice, results="/results"):
    """Log the mean and per-entry dice scores to a JSON file and stdout.

    :param logname: file name of the JSON log, joined onto ``results``
    :param dice: object supporting ``.mean().item()`` and iteration over
        elements that each support ``.item()``
    :param results: directory that receives the JSON log
    """
    json_path = os.path.join(results, logname)
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])

    # The mean goes first, then one "L<k>" entry per element (1-based).
    metrics = {"Mean dice": round(dice.mean().item(), 2)}
    for idx, score in enumerate(dice):
        metrics[f"L{idx+1}"] = round(score.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
def get_logger(params):
    """Initialise the shared ``logger`` for this worker and return it.

    Backends are attached only when ``params.worker_id == 0`` or
    ``params.log_all_workers`` is truthy. Stdout logging is then always
    enabled, and a JSON stream is written to ``<params.log_dir>/log.json``
    when ``params.log_dir`` is set (the directory is created if missing).

    :param params: object exposing ``worker_id``, ``log_all_workers`` and
        ``log_dir``
    :return: the module-level ``logger`` after ``logger.init``
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the JSON backend is assumed to sit under the worker check.
    sinks = []
    should_log = params.worker_id == 0 or params.log_all_workers
    if should_log:
        sinks.append(StdOutBackend(Verbosity.VERBOSE))
    if should_log and params.log_dir:
        os.makedirs(params.log_dir, exist_ok=True)
        json_path = f"{params.log_dir}/log.json"
        sinks.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))

    logger.init(backends=sinks)
    return logger
def get_logger(params):
    """Initialise the shared ``logger`` and return it.

    The worker id is ``hvd_rank()`` when ``horovod_enabled()`` is true and
    0 otherwise. Only worker 0 receives backends: verbose stdout, plus a
    JSON stream at ``<params.log_dir>/log.json`` when ``params.log_dir``
    is set (the directory is created if missing).

    :param params: object exposing a ``log_dir`` attribute
    :return: the module-level ``logger`` after ``logger.init``
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the JSON backend is assumed to be worker-0 only -- confirm.
    if horovod_enabled():
        worker_id = hvd_rank()
    else:
        worker_id = 0

    sinks = []
    if worker_id == 0:
        sinks.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            json_path = f"{params.log_dir}/log.json"
            sinks.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))

    logger.init(backends=sinks)
    return logger
def setup_logger(args):
    """Initialise the global ``dllogger`` and register metric metadata.

    Creates ``args.results`` if needed and picks a log file path inside it
    that does not exist yet: ``args.log_file`` itself, or the same name
    with a numeric counter inserted before the extension (appended when
    the name has no extension). Backends are attached only when torch
    distributed is not initialised, the world size is not above 1, or this
    is distributed rank 0; otherwise the logger gets no backends.

    After initialisation the parsed arguments and the framework/system
    environment are logged, and GOAL/STAGE/format metadata is registered
    for the train, validation and test metrics.

    :param args: namespace exposing ``results``, ``log_file``,
        ``distributed_world_size`` and ``distributed_rank``
    """
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        # The name pieces are loop-invariant, so split once on the last dot.
        stem, dot, ext = args.log_file.rpartition('.')
        for i in itertools.count():
            # Without an extension the counter is appended to the log file
            # name itself (this branch previously used ``args.stat_file``,
            # which named the log after an unrelated file).
            fname = f'{stem}_{i}.{ext}' if dot else f'{args.log_file}.{i}'
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    # NOTE: not currently passed to StdOutBackend; kept so that it can be
    # wired in as ``metric_format=`` without being rewritten.
    def metric_format(metric, metadata, value):
        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[
            JSONStreamBackend(verbosity=1, filename=log_path),
            TensorBoardBackend(verbosity=1, log_dir=args.results),
            StdOutBackend(verbosity=2,
                          step_format=step_format,
                          prefix_format=lambda x: ""),
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    # (metric name, GOAL, STAGE, format), registered in this order.
    metric_specs = [
        ('loss', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P10', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P50', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P90', 'MINIMIZE', 'TRAIN', ':5f'),
        ('items/s', 'MAXIMIZE', 'TRAIN', ':1f'),
        ('val_loss', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P10', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P50', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P90', 'MINIMIZE', 'VAL', ':5f'),
        ('val_items/s', 'MAXIMIZE', 'VAL', ':1f'),
        ('test_P10', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P50', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P90', 'MINIMIZE', 'TEST', ':5f'),
        ('throughput', 'MAXIMIZE', 'TEST', ':1f'),
        # These three were previously registered with the typo 'MIMIMIZE'.
        ('latency_p90', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p95', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p99', 'MINIMIZE', 'TEST', ':5f'),
    ]
    for name, goal, stage, fmt in metric_specs:
        dllogger.metadata(name, {'GOAL': goal, 'STAGE': stage, 'format': fmt})
def get_dllogger(params):
    """Initialise the shared ``logger`` and return it.

    When ``is_main_process()`` is true the logger writes verbosely to
    stdout and, if ``params.log_dir`` is set, to ``log.json`` inside that
    directory. Otherwise it is initialised without backends.

    :param params: object exposing a ``log_dir`` attribute
    :return: the module-level ``logger`` after ``logger.init``
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the JSON backend is assumed to be main-process only.
    sinks = []
    if is_main_process():
        sinks.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            json_path = os.path.join(params.log_dir, "log.json")
            sinks.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))

    logger.init(backends=sinks)
    return logger
def init_log(args):
    """Initialise DLLogger, metric metadata and the TensorBoard loggers.

    Logging is enabled when torch distributed is not initialised or this
    process is rank 0. The JSON log goes to ``args.log_file`` or, when
    that is falsy, to ``nvlog.json`` under ``args.output_dir``; the path
    is passed through ``unique_log_fpath`` first.

    Rebinds the module-level ``tb_loggers`` to one ``TBLogger`` per subset
    ('train', 'dev_ema') and finally logs the parsed arguments.

    :param args: namespace exposing ``log_file`` and ``output_dir``
    """
    global tb_loggers

    # Equivalent to "not initialised, or rank 0"; get_rank() is still only
    # called once distributed is initialised.
    enabled = not (dist.is_initialized() and dist.get_rank() != 0)

    backends = []
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        backends = [
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(fpath)),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format),
        ]
    dllogger.init(backends=backends)

    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    # (metric suffix, display label, format) shared by every subset prefix.
    per_subset = (
        ("loss", "loss", ":>7.2f"),
        ("wer", "wer", ":>6.2f"),
        ("pplx", "pplx", ":>6.2f"),
        ("throughput", "utts/s", ":>5.0f"),
    )
    subsets = [('train', ''), ('train_avg', 'avg train '), ('dev_ema', ' dev ema ')]
    for id_, pref in subsets:
        for suffix, label, fmt in per_subset:
            dllogger.metadata(f"{id_}_{suffix}",
                              {"name": f"{pref}{label}", "format": fmt})
        # "took" has no prefix in its display name and carries a unit.
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>5.2f"})

    tb_loggers = {
        subset: TBLogger(enabled, args.output_dir, name=subset)
        for subset in ['train', 'dev_ema']
    }

    log_parameters(vars(args), tb_subset='train')
def setup_dllogger(rank, enabled=True, filename='log.json'):
    """Initialise ``DLLogger``, with backends only on an enabled rank 0.

    :param rank: rank of the calling process; backends are attached only
        when it equals 0
    :param enabled: when falsy, the logger is initialised without backends
    :param filename: path handed to the JSON stream backend
    """
    backends = []
    if enabled and rank == 0:
        backends = [
            StdOutBackend(Verbosity.DEFAULT),
            JSONStreamBackend(Verbosity.VERBOSE, filename),
        ]
    DLLogger.init(backends)
def init(log_fpath, log_dir, enabled=True, tb_subsets=(), **tb_kw):
    """Initialise DLLogger, metric metadata and the TensorBoard loggers.

    Rebinds the module-level ``tb_loggers`` to one ``TBLogger`` per entry
    of ``tb_subsets``.

    :param log_fpath: JSON log path; passed through ``unique_log_fpath``
        before use
    :param log_dir: directory handed to every ``TBLogger``
    :param enabled: when falsy, DLLogger is initialised without backends;
        the flag is also forwarded to every ``TBLogger``
    :param tb_subsets: iterable of subset names. The default is an
        immutable empty tuple rather than a shared mutable list.
    :param tb_kw: extra keyword arguments forwarded to ``TBLogger``
    """
    global tb_loggers

    if enabled:
        backends = [
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(log_fpath)),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format),
        ]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('val', ' avg val '), ('val_ema', ' EMA val ')]:
        dllogger.metadata(f"{id_}_loss",
                          {"name": f"{pref}loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_mel_loss",
                          {"name": f"{pref}mel loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_kl_loss",
                          {"name": f"{pref}kl loss", "format": ":>5.5f"})
        dllogger.metadata(f"{id_}_kl_weight",
                          {"name": f"{pref}kl weight", "format": ":>5.5f"})
        dllogger.metadata(f"{id_}_frames/s",
                          {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>3.2f"})

    tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw)
                  for s in tb_subsets}
def get_logger(params):
    """ Build the backend list for the shared logger and initialise it

    Only the process with ``hvd.rank() == 0`` gets backends: verbose
    stdout, plus a JSON stream when ``params.log_dir`` is truthy.

    :param params: Object with a ``log_dir`` attribute, passed unchanged
        to ``JSONStreamBackend``
    :return: the module-level logger, after initialisation
    """
    # NOTE(review): nesting was reconstructed from whitespace-collapsed
    # source; the JSON backend is assumed to be rank-0 only -- confirm.
    backends = []
    if hvd.rank() == 0:
        backends.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            backends.append(JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))

    logger.init(backends=backends)
    return logger
def log(logname, dice, epoch=None, dice_tta=None):
    """Log dice scores (and optional TTA dice scores) to JSON and stdout.

    The JSON log is written to ``logname`` inside the module-level
    ``args.results`` directory.

    :param logname: file name of the JSON log
    :param dice: object supporting ``.mean().item()`` and iteration over
        elements that each support ``.item()``
    :param epoch: when not ``None``, logged under "Epoch"
    :param dice_tta: when not ``None``, handled like ``dice`` and logged
        under the "Mean TTA dice" / "TTA_L<k>" keys
    """
    json_path = os.path.join(args.results, logname)
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])

    has_tta = dice_tta is not None

    # Insertion order matters for the printed line: epoch, the means, then
    # the per-entry values.
    metrics = {}
    if epoch is not None:
        metrics["Epoch"] = epoch
    metrics["Mean dice"] = round(dice.mean().item(), 2)
    if has_tta:
        metrics["Mean TTA dice"] = round(dice_tta.mean().item(), 2)
    for idx, score in enumerate(dice):
        metrics[f"L{idx+1}"] = round(score.item(), 2)
    if has_tta:
        for idx, score in enumerate(dice_tta):
            metrics[f"TTA_L{idx+1}"] = round(score.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
def main():
    """Drive the ``LOGGER`` API through a dummy run.

    Configures three backends, parses a single ``--dummy`` argument, logs
    hardware and arguments, registers three metrics, then runs the setup,
    pre-processing and run phases inside timed blocks before finishing.
    """
    LOGGER.set_model_name('ResNet')
    backends = [
        StdOutBackend(log_file='std.out', logging_scope=Scope.TRAIN_ITER),
        CompactBackend(log_file=None, logging_scope=Scope.TRAIN_ITER,
                       iteration_interval=5),
        JsonBackend(log_file='dummy.json', logging_scope=Scope.TRAIN_ITER,
                    iteration_interval=4),
    ]
    LOGGER.set_backends(backends)

    arg_parser = ArgumentParser()
    arg_parser.add_argument('--dummy', type=str, default='default_dummy_value')
    args = arg_parser.parse_args()

    LOGGER.log_hardware()
    LOGGER.log_args(args)
    LOGGER.log(tags.RUN_INIT)

    LOGGER.register_metric('loss', meter=AverageMeter(),
                           metric_scope=Scope.TRAIN_ITER)
    LOGGER.register_metric('epoch_nr', metric_scope=Scope.EPOCH)
    LOGGER.register_metric('epochs2')

    # The two print-only phases have the same shape.
    print_only_phases = (
        (tags.SETUP_BLOCK, "This is setup."),
        (tags.PREPROC_BLOCK, "This is preprocessing."),
    )
    for tag, message in print_only_phases:
        with LOGGER.timed_block(tag):
            print(message)

    # NOTE(review): source indentation was lost; train() and the final
    # print are assumed to belong to the RUN block -- confirm.
    with LOGGER.timed_block(tags.RUN_BLOCK):
        print("This is run.")
        train()
        print("This is the end.")

    LOGGER.log(tags.RUN_FINAL)
    LOGGER.finish()
def __init__(self, args):
    """Build the network, loss, metric and logger from ``args``.

    :param args: namespace read for ``type``, ``lr``, ``results`` and
        ``logname``; also forwarded to ``F1``, ``UNetLoc`` /
        ``get_dmg_unet`` and ``Loss``
    """
    super(Model, self).__init__()
    self.save_hyperparameters()
    self.args = args

    self.f1_score = F1(args)
    # The "pre" type uses UNetLoc; every other type uses the damage UNet.
    if args.type == "pre":
        self.model = UNetLoc(args)
    else:
        self.model = get_dmg_unet(args)
    self.loss = Loss(args)

    self.best_f1 = torch.tensor(0)
    self.best_epoch = 0
    self.tta_flips = [[2], [3], [2, 3]]
    self.lr = args.lr
    self.n_class = 2 if self.args.type == "pre" else 5
    self.softmax = nn.Softmax(dim=1)
    self.test_idx = 0

    json_path = os.path.join(args.results, f"{args.logname}.json")
    self.dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE,
                      step_format=lambda step: f"Epoch: {step} "),
    ])
def setup_logger(config):
    """Create a ``Logger``, log the framework environment, return it.

    On the main process the logger gets TensorBoard, JSON stream,
    aggregator and stdout backends rooted at ``config["log_path"]``
    (default: the current working directory). Other processes get a
    logger without backends.

    :param config: mapping supporting ``.get("log_path", default)``
    :return: the configured ``Logger`` with "loss" and "val_loss"
        metadata registered
    """
    log_path = config.get("log_path", os.getcwd())

    backends = []
    if is_main_process():
        verbose = dllogger.Verbosity.VERBOSE
        backends = [
            TensorBoardBackend(verbosity=verbose, log_dir=log_path),
            JSONStreamBackend(verbosity=verbose,
                              filename=os.path.join(log_path, "log.json")),
            AggregatorBackend(verbosity=verbose,
                              agg_dict={"loss": AverageMeter}),
            StdOutBackend(
                verbosity=dllogger.Verbosity.DEFAULT,
                step_format=empty_step_format,
                metric_format=no_string_metric_format,
                prefix_format=empty_prefix_format,
            ),
        ]
    logger = Logger(backends=backends)

    logger.log(step="PARAMETER",
               data=get_framework_env_vars(),
               verbosity=dllogger.Verbosity.DEFAULT)

    for metric, stage in (("loss", "TRAIN"), ("val_loss", "VAL")):
        logger.metadata(metric,
                        {"unit": "nat", "GOAL": "MINIMIZE", "STAGE": stage})
    return logger
def __init__(self, args):
    """Build the network, loss, dice metric and bookkeeping state.

    :param args: namespace read for ``learning_rate``, ``exec_mode``,
        ``results`` and ``dim``
    """
    super(NNUnet, self).__init__()
    self.args = args
    self.save_hyperparameters()
    self.build_nnunet()
    self.loss = Loss()
    self.dice = Dice(self.n_class)

    # Best-so-far trackers: scalar sums plus one slot per class.
    self.best_sum = 0
    self.eval_dice = 0
    self.best_sum_epoch = 0
    self.best_dice = self.n_class * [0]
    self.best_epoch = self.n_class * [0]
    self.best_sum_dice = self.n_class * [0]
    self.learning_rate = args.learning_rate

    # A logger is only created for the train and evaluate modes.
    if self.args.exec_mode in ["train", "evaluate"]:
        json_path = os.path.join(args.results, "logs.json")
        self.dllogger = Logger(backends=[
            JSONStreamBackend(Verbosity.VERBOSE, json_path),
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=lambda step: f"Epoch: {step} "),
        ])

    if self.args.dim == 2:
        self.tta_flips = [[2], [3], [2, 3]]
    else:
        self.tta_flips = [[2], [3], [4], [2, 3], [2, 4], [3, 4], [2, 3, 4]]
def main():
    """Extract TTS training targets using a Tacotron 2 checkpoint.

    Parses the command line (unknown options raise ``ValueError``),
    initialises DLLogger, loads the model and runs it over every entry of
    ``args.wav_text_filelist``. For each batch the padded input mels are
    saved under ``mels``; teacher mels, attentions and durations are saved
    when the matching ``extract_*`` flag is set. Pitch vectors are
    collected per file, normalised after the loop and then saved.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2', parser, args.tacotron2_checkpoint, amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False, ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    # Read the file list inside a context manager so that the handle is
    # closed deterministically (it was previously never closed).
    with open(args.wav_text_filelist, 'r') as filelist:
        filenames = [Path(line.split('|')[0]).stem for line in filelist]

    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1
    dataset = FilenamedLoader(filenames, args.dataset_path,
                              args.wav_text_filelist, args,
                              load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset, batch_size=args.batch_size,
                             shuffle=False, sampler=None, num_workers=0,
                             collate_fn=TextMelCollate(1),
                             pin_memory=False, drop_last=False)
    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}

    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        # Each mel is trimmed to its own length before being saved.
        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher', fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)
        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions', fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)

        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                # Histogram of the arg-max text position of every frame,
                # with one bin per text position.
                dur = torch.histc(torch.argmax(ali, dim=1), min=0,
                                  max=text_len - 1, bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)

        if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar:
            # Pitch is derived from the durations computed above, so
            # nothing is collected when durations were not extracted.
            for j, dur in enumerate(durations):
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(
                    str(wav), dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)', data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
def main():
    """Run Tacotron 2 + WaveGlow inference through TensorRT engines.

    Loads the four engines named on the command line, optionally builds a
    denoiser from a WaveGlow PyTorch checkpoint, synthesises audio for
    every line of ``args.input``, writes one wav file per utterance, logs
    the measured latencies through DLLogger and appends a CSV summary line
    to ``log_bs1_<precision>.log``.

    Exits with status 1 when the input file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt, True,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output+'/'+args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])

    texts = []
    try:
        # The context manager closes the handle; it was previously leaked.
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        # Only read/decode failures are handled here; a bare ``except``
        # would also swallow KeyboardInterrupt and programming errors.
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)

    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(
            encoder, decoder_iter, postnet,
            encoder_context, decoder_context, postnet_context,
            sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel,
                                    measurements, args.fp16)

    # Enter and immediately leave the execution contexts.
    # NOTE(review): presumably done to release them -- confirm.
    with encoder_context, decoder_context, postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        # NOTE(review): no separator is inserted here, unlike the log path
        # above, so ``args.output`` must end with one -- confirm intent.
        audio_path = args.output + "audio_"+str(i)+"_trt.wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1)/latency
    # CSV columns: batch size, input length, precision, latency,
    # throughput, number of mel frames.
    log_data = "1,"+str(sequence_lengths[0].item())+","+prec+","+str(latency)+","+str(throughput)+","+str(mel_lengths[0].item())+"\n"
    with open("log_bs1_"+prec+".log", 'a') as f:
        f.write(log_data)
def main():
    """
    Benchmarks text to speech (inference) and reports timing statistics.
    Inference is executed on a single GPU or CPU.

    Runs ``args.num_iters`` iterations over a fixed prompt; the first
    three iterations are warm-up and are not recorded.
    """
    parser = parse_args(argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference'))
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])
    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # One list of samples per measurement, filled after warm-up.
    measurements_all = {key: [] for key in (
        "pre_processing", "tacotron2_latency", "waveglow_latency",
        "latency", "type_conversion", "data_transfer", "storage",
        "tacotron2_items_per_sec", "waveglow_items_per_sec",
        "num_mels_per_audio", "throughput")}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run)
    denoiser = Denoiser(waveglow, args.cpu_run)
    if not args.cpu_run:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    prompt = ("The forms of printed letters should be beautiful, and that "
              "their arrangement on the page should be reasonable and a "
              "help to the shapeliness of the letters themselves. "
              "The forms of printed letters should be beautiful, and that "
              "their arrangement on the page should be reasonable and a "
              "help to the shapeliness of the letters themselves.")
    # Truncate to the requested length, then replicate to fill the batch.
    texts = [prompt[:args.input_length]] * args.batch_size

    warmup_iters = 3

    for iteration in range(args.num_iters):
        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(
                texts, args.cpu_run)

        # NOTE(review): source indentation was lost; the denoiser call is
        # assumed to sit inside the "waveglow_latency" timer -- confirm.
        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu_run):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                    mel, mel_lengths, _ = jitted_tacotron2(sequences_padded,
                                                           input_lengths)
                with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)
                    audios = audios.float()
                    audios = denoiser(
                        audios, strength=args.denoising_strength).squeeze(1)

        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        # NOTE(review): the wav writes are assumed to be part of the
        # "storage" timer -- confirm.
        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for idx, audio in enumerate(audios):
                audio_path = "audio_"+str(idx)+".wav"
                n_valid = mel_lengths[idx]*args.stft_hop_length
                write(audio_path, args.sampling_rate, audio[:n_valid])

        measurements.update({
            'tacotron2_items_per_sec': num_mels/measurements['tacotron2_latency'],
            'waveglow_items_per_sec': num_samples/measurements['waveglow_latency'],
            'num_mels_per_audio': mel.size(2),
            'throughput': num_samples/measurements['latency'],
        })

        # Iterations before ``warmup_iters`` are discarded.
        recorded_step = iteration - warmup_iters
        if recorded_step >= 0:
            for key, value in measurements.items():
                measurements_all[key].append(value)
                DLLogger.log(step=recorded_step, data={key: value})

    DLLogger.flush()

    print_stats(measurements_all)
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Reads the texts from ``args.input`` (exits with status 1 when the file
    cannot be read), optionally runs three warm-up passes on random input,
    synthesises audio with Tacotron 2 + WaveGlow + denoiser, logs the
    throughput and latency of both models and writes one wav per
    utterance.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file),
        StdOutBackend(Verbosity.VERBOSE),
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, forward_is_infer=True)
    denoiser = Denoiser(waveglow).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    texts = []
    try:
        # The context manager closes the handle; it was previously leaked.
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        # Only read/decode failures are handled here; a bare ``except``
        # would also swallow KeyboardInterrupt and programming errors.
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0, high=148, size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for _ in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    # NOTE(review): source indentation was lost; the float conversion and
    # the denoiser are assumed to be timed as part of "waveglow_time".
    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")

    tacotron2_infer_perf = mel.size(0) * mel.size(2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(1) / measurements['waveglow_time']

    DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time'] + measurements['waveglow_time'])})

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        # NOTE(review): no separator is inserted here, unlike the log path
        # above, so ``args.output`` must end with one -- confirm intent.
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Either stage can be skipped by passing 'SKIP' as its checkpoint: with
    FastPitch skipped, mels are loaded from the dataset; with WaveGlow
    skipped, no audio is produced. Per-batch metrics are logged only when
    ``args.repeats == 1``; the aggregate statistics are always logged.

    :raises ValueError: when unrecognised command-line options remain
        after the models have consumed theirs
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format),
    ])
    init_inference_metadata()
    # A plain loop: the calls are made for their side effect only.
    for k, v in vars(args).items():
        DLLogger.log("PARAMETER", {k: v})

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch != 'SKIP':
        generator = load_and_setup_model(
            'FastPitch', parser, args.fastpitch, args.amp, device,
            unk_args=unk_args, forward_is_infer=True, ema=args.ema,
            jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow != 'SKIP':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model(
                'WaveGlow', parser, args.waveglow, args.amp, device,
                unk_args=unk_args, forward_is_infer=True, ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        # Use the ``infer`` attribute when present, else the model itself.
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(
        fields, device, args.symbol_set, args.text_cleaners, args.batch_size,
        args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet)

    # Use real data rather than synthetic - FastPitch predicts len
    for _ in tqdm(range(args.warmup_steps), 'Warmup'):
        with torch.no_grad():
            if generator is not None:
                b = batches[0]
                mel, *_ = generator(b['text'])
            elif waveglow is not None:
                # Without a generator the mels come from the batch, as in
                # the main loop below; previously ``mel`` was undefined
                # here and the warm-up raised NameError.
                mel = batches[0]['mel']
            if waveglow is not None:
                audios = waveglow(mel, sigma=args.sigma_infer).float()
                _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime(cuda=args.cuda)
    waveglow_measures = MeasureTime(cuda=args.cuda)

    gen_kw = {'pace': args.pace,
              'speaker': args.speaker,
              'pitch_tgt': None,
              'pitch_transform': build_pitch_transformation(args)}

    if args.torchscript:
        gen_kw.pop('pitch_transform')
        print('NOTE: Pitch transforms are disabled with TorchScript')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    log_enabled = reps == 1

    def log(s, d):
        # Reads ``log_enabled`` at call time, so the re-enable after the
        # main loop takes effect for the summary metrics.
        if log_enabled:
            DLLogger.log(step=s, data=d)

    for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                # ``data`` is now a dict; the original passed a set
                # literal here, which is not a valid metrics mapping.
                log(rep, {'note': 'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(rep, {"fastpitch_frames/s": gen_infer_perf})
                log(rep, {"fastpitch_latency": gen_measures[-1]})

                if args.save_mels:
                    for i, mel_ in enumerate(mel):
                        m = mel_[:, :mel_lens[i].item()].permute(1, 0)
                        fname = b['output'][i] if 'output' in b else f'mel_{i}.npy'
                        mel_path = Path(args.output, Path(fname).stem + '.npy')
                        np.save(mel_path, m.cpu().numpy())

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(audios.float(),
                                      strength=args.denoising_strength
                                      ).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (
                    audios.size(0) * audios.size(1) / waveglow_measures[-1])

                log(rep, {"waveglow_samples/s": waveglow_infer_perf})
                log(rep, {"waveglow_latency": waveglow_measures[-1]})

                # Audio is written only for single-repetition runs.
                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() * args.stft_hop_length]

                        if args.fade_out:
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate, audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    # The summary statistics are logged regardless of the repeat count.
    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate)
        log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()})
        log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()})
        log((), {"avg_fastpitch_latency": gm.mean()})
        log((), {"avg_fastpitch_RTF": rtf})
        log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()})
        log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()})
        log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()})
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate)
        log((), {"avg_waveglow_samples/s": all_samples / wm.sum()})
        log((), {"avg_waveglow_latency": wm.mean()})
        log((), {"avg_waveglow_RTF": rtf})
        log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()})
        log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()})
        log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()})
    if generator is not None and waveglow is not None:
        # NOTE(review): gm and wm are each sorted independently before
        # being added element-wise -- confirm this pairing is intended.
        m = gm + wm
        rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate)
        log((), {"avg_samples/s": all_samples / m.sum()})
        log((), {"avg_letters/s": all_letters / m.sum()})
        log((), {"avg_latency": m.mean()})
        log((), {"avg_RTF": rtf})
        log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
def _initialize_dllogger(self, log_dir, filename, append):
    """Initialise the global ``logger`` with JSON and stdout backends.

    :param log_dir: directory of the JSON log file
    :param filename: name of the JSON log file inside ``log_dir``
    :param append: forwarded to ``JSONStreamBackend`` as ``append=``
    """
    json_path = os.path.join(log_dir, filename)
    json_sink = JSONStreamBackend(Verbosity.VERBOSE, json_path, append=append)
    stdout_sink = StdOutBackend(Verbosity.VERBOSE)
    logger.init(backends=[json_sink, stdout_sink])
def main(_):
    """Train a detection model, save the final checkpoint and evaluate it.

    The function reads everything from the module-level ``FLAGS`` object:
    it builds the training and evaluation configs, initialises Horovod and
    DLLogger, compiles the Keras model, runs ``model.fit`` and, when not in
    benchmark mode and ``FLAGS.training_mode == 'train'``, runs a COCO-style
    evaluation over ``eval_dataset``. Final statistics are logged through
    DLLogger by the main process only.

    Args:
        _: Unused positional argument.

    Raises:
        RuntimeError: If any of ``--training_file_pattern``,
            ``--val_file_pattern`` or ``--val_json_file`` is ``None``.
    """
    # get e2e training time
    begin = time.time()
    logging.info("Training started at: {}".format(time.asctime()))

    hvd.init()

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs
    if FLAGS.lr:
        config.learning_rate = FLAGS.lr
    if FLAGS.warmup_value:
        config.lr_warmup_init = FLAGS.warmup_value
    if FLAGS.warmup_epochs:
        config.lr_warmup_epoch = FLAGS.warmup_epochs
    config.backbone_init = FLAGS.backbone_init
    config.mixed_precision = FLAGS.amp
    config.image_size = model_utils.parse_image_size(config.image_size)

    # get eval config
    eval_config = hparams_config.get_detection_config(FLAGS.model_name)
    eval_config.override(FLAGS.hparams)
    eval_config.val_json_file = FLAGS.val_json_file
    eval_config.val_file_pattern = FLAGS.val_file_pattern
    eval_config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    eval_config.drop_remainder = False  # eval all examples w/o drop.
    eval_config.image_size = model_utils.parse_image_size(
        eval_config['image_size'])

    # setup
    setup.set_flags(FLAGS, config, training=True)

    if FLAGS.debug:
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Check data path
    if FLAGS.training_file_pattern is None or FLAGS.val_file_pattern is None or FLAGS.val_json_file is None:
        raise RuntimeError(
            'You must specify --training_file_pattern, --val_file_pattern and --val_json_file for training.'
        )

    # Ceiling division of the examples per epoch by the global batch size
    # (per-process batch size times the number of processes).
    steps_per_epoch = (FLAGS.num_examples_per_epoch + (FLAGS.batch_size * get_world_size()) - 1) // (FLAGS.batch_size * get_world_size())
    if FLAGS.benchmark == True:
        # For ci perf training runs, run for a fixed number of iterations per epoch
        steps_per_epoch = FLAGS.benchmark_steps
    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  checkpoint_period=FLAGS.checkpoint_period,
                  batch_size=FLAGS.batch_size,
                  num_shards=get_world_size(),
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode='train')
    logging.info('Training params: {}'.format(params))

    # make output dir if it does not exist
    tf.io.gfile.makedirs(FLAGS.model_dir)

    # dllogger setup: only the main process gets backends, every other
    # process initialises DLLogger with an empty backend list.
    backends = []
    if is_main_process():
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]

    DLLogger.init(backends=backends)

    def get_dataset(is_training, params):
        # Builds the input pipeline from the training or validation file
        # pattern, depending on ``is_training``.
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            enable_map_parallelization=FLAGS.enable_map_parallelization)(
                params)

    # Two successive ceiling divisions: first by the number of processes,
    # then by the evaluation batch size. The result is used both as
    # ``eval_config.num_samples`` and as the ``take`` count below.
    num_samples = (FLAGS.eval_samples + get_world_size() - 1) // get_world_size()
    num_samples = (num_samples + FLAGS.eval_batch_size - 1) // FLAGS.eval_batch_size
    eval_config.num_samples = num_samples

    def get_eval_dataset(eval_config):
        # Validation pipeline sharded across processes and truncated to
        # ``num_samples`` elements (captured from the enclosing scope).
        dataset = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=eval_config.max_instances_per_image)(
                eval_config, batch_size=FLAGS.eval_batch_size)
        dataset = dataset.shard(get_world_size(), get_rank())
        dataset = dataset.take(num_samples)
        return dataset

    eval_dataset = get_eval_dataset(eval_config)

    # pick focal loss implementation
    focal_loss = train_lib.StableFocalLoss(
        params['alpha'],
        params['gamma'],
        label_smoothing=params['label_smoothing'],
        reduction=tf.keras.losses.Reduction.NONE)

    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.build((None, *config.image_size, 3))
    model.compile(
        optimizer=optimizer_builder.get_optimizer(params),
        loss={
            'box_loss':
                train_lib.BoxLoss(params['delta'],
                                  reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
                train_lib.BoxIouLoss(params['iou_loss_type'],
                                     params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'],
                                     reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
                focal_loss,
            'seg_loss':
                tf.keras.losses.SparseCategoricalCrossentropy(
                    from_logits=True,
                    reduction=tf.keras.losses.Reduction.NONE)
        })
    # The value returned here is passed to ``model.fit`` as ``initial_epoch``.
    train_from_epoch = util_keras.restore_ckpt(model,
                                               params['model_dir'],
                                               config.moving_average_decay,
                                               steps_per_epoch=steps_per_epoch)

    print("training_mode: {}".format(FLAGS.training_mode))
    callbacks = callback_builder.get_callbacks(params, FLAGS.training_mode,
                                               eval_config, eval_dataset,
                                               DLLogger, FLAGS.time_history,
                                               FLAGS.log_steps, FLAGS.lr_tb,
                                               FLAGS.benchmark)

    history = model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        initial_epoch=train_from_epoch,
        callbacks=callbacks,
        verbose=1 if is_main_process() else 0,
        validation_data=get_dataset(False, params=params)
        if FLAGS.validate else None,
        validation_steps=(FLAGS.eval_samples // FLAGS.eval_batch_size)
        if FLAGS.validate else None)

    if is_main_process():
        model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))

    # log final stats
    stats = {}
    for callback in callbacks:
        if isinstance(callback, callback_builder.TimeHistory):
            if callback.epoch_runtime_log:
                stats[
                    'avg_fps_training'] = callback.average_examples_per_second
                stats[
                    'avg_fps_training_per_GPU'] = callback.average_examples_per_second / get_world_size(
                    )
                stats[
                    'avg_latency_training'] = callback.average_time_per_iteration

    if history and history.history:
        train_hist = history.history
        # Final training loss, averaged over processes with hvd.allreduce.
        stats['training_loss'] = float(
            hvd.allreduce(tf.constant(train_hist['loss'][-1],
                                      dtype=tf.float32),
                          average=True))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'ema_weights')):
        # Pick the highest epoch number found among the 'emackpt' files; the
        # number is parsed from the text between the first '-' and the
        # first '.' of each file name.
        ckpt_epoch = "%02d" % sorted(set([
            int(f.rsplit('.')[0].rsplit('-')[1])
            for f in os.listdir(os.path.join(FLAGS.model_dir, 'ema_weights'))
            if 'emackpt' in f
        ]),
                                     reverse=True)[0]
        ckpt = os.path.join(FLAGS.model_dir, 'ema_weights',
                            'emackpt-' + str(ckpt_epoch))
        util_keras.restore_ckpt(model,
                                ckpt,
                                eval_config.moving_average_decay,
                                steps_per_epoch=0,
                                skip_mismatch=False,
                                expect_partial=True)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'emackpt-final'))
    else:
        ckpt_epoch = 'final'
        # NOTE(review): ``ckpt`` is assigned but not read again in this branch.
        ckpt = os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch))

    # Start evaluation of final ema checkpoint
    logging.set_verbosity(logging.WARNING)

    @tf.function
    def model_fn(images, labels):
        # ``evaluator`` is resolved from the enclosing scope at call time; it
        # is only bound inside the evaluation branch below, which is also the
        # only place this function is called.
        cls_outputs, box_outputs = model(images, training=False)
        detections = postprocess.generate_detections(eval_config, cls_outputs,
                                                     box_outputs,
                                                     labels['image_scales'],
                                                     labels['source_ids'])

        tf.numpy_function(evaluator.update_state, [
            labels['groundtruth_data'],
            postprocess.transform_detections(detections)
        ], [])

    if FLAGS.benchmark == False and FLAGS.training_mode == 'train':

        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(eval_config.label_map)
        evaluator = coco_metric.EvaluationMetric(
            filename=eval_config.val_json_file, label_map=label_map)

        evaluator.reset_states()

        # evaluate all images.
        pbar = tf.keras.utils.Progbar(num_samples)
        for i, (images, labels) in enumerate(eval_dataset):
            model_fn(images, labels)
            if is_main_process():
                # NOTE(review): the zero-based index is passed, so the bar
                # shows ``i`` after ``i + 1`` batches - confirm intended.
                pbar.update(i)

        # gather detections from all ranks
        evaluator.gather()

        if is_main_process():
            # compute the final eval results.
            metrics = evaluator.result()
            metric_dict = {}
            for i, name in enumerate(evaluator.metric_names):
                metric_dict[name] = metrics[i]

            if label_map:
                # Per-class entries are read from ``metrics`` right after the
                # named metrics, in sorted class-id order.
                for i, cid in enumerate(sorted(label_map.keys())):
                    name = 'AP_/%s' % label_map[cid]
                    metric_dict[name] = metrics[i +
                                                len(evaluator.metric_names)]

            # csv format
            csv_metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
            csv_format = ",".join(
                [str(ckpt_epoch)] +
                [str(round(metric_dict[key] * 100, 2)) for key in csv_metrics])
            print(FLAGS.model_name, metric_dict, "csv format:", csv_format)

    # Every process waits here before the main process logs the total time.
    MPI.COMM_WORLD.Barrier()

    if is_main_process():
        stats['e2e_training_time'] = time.time() - begin
        DLLogger.log(step=(), data=stats)
def __init__(self, log_path="bert_dllog.json"):
    """Create the DLLogger instance and register metadata for all metrics.

    A ``Logger`` with a stdout backend (using ``self.format_step`` to
    render steps) and a JSON backend writing to ``log_path`` is stored in
    ``self.logger``. Metadata is then registered for every metric name this
    class reports.

    Args:
        log_path: Destination file of the JSON stream backend.
    """
    self.logger = Logger([
        StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step),
        JSONStreamBackend(Verbosity.VERBOSE, log_path),
    ])

    # Training losses: lower is better.
    train_losses = ("mlm_loss", "nsp_loss", "avg_loss_step", "total_loss",
                    "loss")
    for name in train_losses:
        self.logger.metadata(name, {
            "format": ":.4f",
            "GOAL": "MINIMIZE",
            "STAGE": "TRAIN"
        })

    # Validation quality scores: higher is better. They were previously
    # registered with GOAL=MINIMIZE, which contradicts what they measure.
    val_scores = ("f1", "precision", "recall", "mcc", "exact_match")
    for name in val_scores:
        self.logger.metadata(name, {
            "format": ":.4f",
            "GOAL": "MAXIMIZE",
            "STAGE": "VAL"
        })

    # Throughput for training and for inference/validation.
    for name, stage in (("throughput_train", "TRAIN"),
                        ("throughput_inf", "VAL")):
        self.logger.metadata(name, {
            "unit": "seq/s",
            "format": ":.3f",
            "GOAL": "MAXIMIZE",
            "STAGE": stage
        })
def main():
    """
    Run the inference benchmark for the model named by ``args.model_name``.

    The model is fed synthetic input for three warm-up iterations followed
    by one measured iteration. The measured latency and item throughput are
    written through DLLogger, once per measured step and once as a summary.
    Inputs are moved to the GPU with ``.cuda()``.
    """
    arg_parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    arg_parser = parse_args(arg_parser)
    args, _ = arg_parser.parse_known_args()

    json_backend = JSONStreamBackend(Verbosity.DEFAULT,
                                     args.output + '/' + args.log_file)
    DLLogger.init(backends=[json_backend, StdOutBackend(Verbosity.VERBOSE)])
    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    model_name = args.model_name
    model = load_and_setup_model(model_name, arg_parser, None, args.amp_run,
                                 forward_is_infer=True)
    if model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    total_iters = warmup_iters + 1

    for iter_idx in range(total_iters):
        # A fresh dict per iteration; MeasureTime stores the elapsed time
        # under the given key.
        timings = {}

        if model_name == 'Tacotron2':
            # Random token ids: a batch of fixed-length (140) sequences.
            text_padded = torch.randint(low=0, high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            input_lengths = torch.IntTensor(
                [text_padded.size(1)] * args.batch_size).cuda().long()
            with torch.no_grad(), MeasureTime(timings, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)
        elif model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            # Random mel input drawn from a normal distribution.
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()
            with torch.no_grad(), MeasureTime(timings, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        # Warm-up iterations are executed but not reported.
        if iter_idx < warmup_iters:
            continue
        DLLogger.log(step=(iter_idx - warmup_iters, ),
                     data={"latency": timings['inference_time']})
        DLLogger.log(step=(iter_idx - warmup_iters, ),
                     data={
                         "items_per_sec":
                         num_items / timings['inference_time']
                     })

    # Summary entries reuse the measurements of the last iteration.
    DLLogger.log(step=tuple(),
                 data={'infer_latency': timings['inference_time']})
    DLLogger.log(step=tuple(),
                 data={
                     'infer_items_per_sec':
                     num_items / timings['inference_time']
                 })

    DLLogger.flush()
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.

    The input text file (``args.input``) is read line by line, every line is
    synthesised with Tacotron 2 (conditioned on a reference mel and an
    emotion id), vocoded with WaveGlow and denoised. For each utterance an
    alignment plot and a ``.wav`` file are written to ``args.output``, and
    timing metrics are logged through DLLogger.

    The process exits with status 1 if the input file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # forward_is_infer=True makes calling the model run its infer path.
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # Read the input with a context manager so the handle is always closed.
    # Only failures that mean "the file could not be read" are handled, so
    # Ctrl-C and SystemExit are no longer swallowed.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError, TypeError):
        print("Could not read file")
        sys.exit(1)

    ref_mel = load_mel(args.ref_mel)
    emotion_id = torch.LongTensor([args.emotion_id])
    # Follow the same device rule as every other tensor in this function;
    # previously this tensor was moved to CUDA even when --cpu was given.
    if not args.cpu:
        emotion_id = emotion_id.cuda()
    print(emotion_id)

    if args.include_warmup:
        # Three untimed passes over a random sequence.
        sequence = torch.randint(low=0, high=80, size=(1, 50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(
                    sequence, input_lengths, ref_mel, emotion_id)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time",
                                      args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(
            sequences_padded, input_lengths, ref_mel, emotion_id)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time",
                                      args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
    with torch.no_grad(), MeasureTime(measurements, "denoiser_time",
                                      args.cpu):
        audios = denoiser(audios,
                          strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")

    # Throughput = number of produced items divided by the measured time.
    tacotron2_infer_perf = (mel.size(0) * mel.size(2)
                            / measurements['tacotron2_time'])
    waveglow_infer_perf = (audios.size(0) * audios.size(1)
                           / measurements['waveglow_time'])

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0,
                 data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0,
                 data={"latency": (measurements['tacotron2_time']
                                   + measurements['waveglow_time']
                                   + measurements['denoiser_time'])})

    for i, audio in enumerate(audios):
        plt.imshow(alignments[i].float().data.cpu().numpy().T,
                   aspect="auto", origin="lower")
        figure_path = os.path.join(
            args.output, "alignment_" + str(i) + args.suffix + ".png")
        plt.savefig(figure_path)

        # Trim to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(
            args.output, "audio_" + str(i) + args.suffix + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
def main():
    """Train the model selected by ``args.model_name``.

    Rank and world size are taken from the ``LOCAL_RANK`` / ``WORLD_SIZE``
    environment variables when both are set, otherwise from ``args.rank`` /
    ``args.world_size``. Only local rank 0 gets DLLogger backends. The
    function optionally resumes from a checkpoint, runs the epoch loop with
    AMP-aware optimisation, validates after every epoch, saves checkpoints
    periodically and logs per-iteration, per-epoch and final metrics.

    Raises:
        Exception: If the (reduced) training loss becomes NaN.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size

    distributed_run = world_size > 1

    if local_rank == 0:
        log_file = os.path.join(args.output, args.log_file)
        DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                                StdOutBackend(Verbosity.VERBOSE)])
    else:
        DLLogger.init(backends=[])

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Re-parse now that the model-specific arguments have been registered.
    model_name = args.model_name
    parser = models.model_parser(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(
        model_name, model_config, cpu_run=False,
        uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)

    if distributed_run:
        model = DDP(model, device_ids=[local_rank], output_device=local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    # Not every model defines ``sigma``.
    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    # A one-element list so that load_checkpoint can update it in place.
    start_epoch = [0]

    if args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(args.output,
                                                            model_name)

    # Compare by value: ``is not ""`` tests object identity, which is not a
    # reliable way to detect an empty string.
    if args.checkpoint_path != "":
        load_checkpoint(model, optimizer, start_epoch, model_config,
                        args.amp, args.checkpoint_path, local_rank)

    start_epoch = start_epoch[0]

    criterion = loss_functions.get_loss_function(model_name, sigma)

    # Not every model defines ``n_frames_per_step``.
    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(
        model_name, n_frames_per_step)
    trainset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.training_files, args)
    if distributed_run:
        # The sampler shuffles, so the loader itself must not.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)

    valset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    train_epoch_items_per_sec = 0.0
    val_loss = 0.0
    # Initialised so the final log below works even when the epoch loop
    # does not execute (start_epoch >= args.epochs).
    val_items_per_sec = 0.0
    num_iters = 0

    model.train()

    for epoch in range(start_epoch, args.epochs):
        torch.cuda.synchronize()
        epoch_start_time = time.perf_counter()
        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        train_epoch_items_per_sec = 0.0

        num_iters = 0
        reduced_loss = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            torch.cuda.synchronize()
            iter_start_time = time.perf_counter()
            DLLogger.log(step=(epoch, i),
                         data={'glob_iter/iters_per_epoch':
                               str(iteration) + "/" + str(len(train_loader))})

            adjust_learning_rate(iteration, epoch, optimizer,
                                 args.learning_rate, args.anneal_steps,
                                 args.anneal_factor, local_rank)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            # AMP upstream autocast
            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            DLLogger.log(step=(epoch, i), data={'train_loss': reduced_loss})

            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.amp:
                # Unscale before clipping so the threshold applies to the
                # true gradient values.
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)
                optimizer.step()

            torch.cuda.synchronize()
            iter_stop_time = time.perf_counter()
            iter_time = iter_stop_time - iter_start_time
            items_per_sec = reduced_num_items / iter_time
            train_epoch_items_per_sec += items_per_sec

            DLLogger.log(step=(epoch, i),
                         data={'train_items_per_sec': items_per_sec})
            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
            iteration += 1

        torch.cuda.synchronize()
        epoch_stop_time = time.perf_counter()
        epoch_time = epoch_stop_time - epoch_start_time

        # Epoch throughput is the mean of the per-iteration throughputs.
        DLLogger.log(step=(epoch,),
                     data={'train_items_per_sec':
                           (train_epoch_items_per_sec / num_iters
                            if num_iters > 0 else 0.0)})
        # The loss of the last iteration of the epoch.
        DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
        DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})

        val_loss, val_items_per_sec = validate(model, criterion, valset,
                                               epoch, iteration,
                                               args.batch_size, world_size,
                                               collate_fn, distributed_run,
                                               local_rank, batch_to_gpu)

        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, scaler, epoch, model_config,
                            args.amp, args.output, args.model_name,
                            local_rank, world_size)
        if local_rank == 0:
            DLLogger.flush()

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    DLLogger.log(step=tuple(), data={'run_time': run_time})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    DLLogger.log(step=tuple(),
                 data={'train_items_per_sec':
                       (train_epoch_items_per_sec / num_iters
                        if num_iters > 0 else 0.0)})
    DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})

    if local_rank == 0:
        DLLogger.flush()
def main():
    """Run RNN-T inference/evaluation and report WER and latencies.

    Parses the command line, initialises DLLogger, builds the DALI data
    loader, the feature post-processing pipeline and the model (optionally
    loading a checkpoint), then iterates over the data ``args.repeats``
    times. Transcripts and greedy-decoded predictions are aggregated; the
    word error rate is logged by rank 0 (or by the only process), and
    latency percentiles are logged when at least 20 timed iterations were
    collected.
    """
    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)
    ])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # Build the key exactly as the timing report below does
            # ('.' replaced by '_'); otherwise the metadata is registered
            # under a name that is never logged.
            cc = str(c).replace('.', '_')
            dllogger.metadata(f'{step.lower()}_latency_{cc}', {
                'name': f'{step} latency {cs}',
                'format': ':>7.2f',
                'unit': 'ms'
            })
    dllogger.metadata('eval_wer', {
        'name': 'WER',
        'format': ':>3.3f',
        'unit': '%'
    })

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        # Offset by the local rank so each process gets a different seed.
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)

    # Command-line overrides of the validation input config.
    if args.max_duration is not None:
        cfg['input_val']['audio_dataset']['max_duration'] = args.max_duration
        cfg['input_val']['filterbank_features'][
            'max_duration'] = args.max_duration

    if args.pad_to_max_duration:
        assert cfg['input_val']['audio_dataset']['max_duration'] > 0
        cfg['input_val']['audio_dataset']['pad_to_max_duration'] = True
        cfg['input_val']['filterbank_features']['pad_to_max_duration'] = True

    use_dali = args.dali_device in ('cpu', 'gpu')

    (dataset_kw, features_kw, splicing_kw, _, _) = config.input(cfg, 'val')

    tokenizer_kw = config.tokenizer(cfg)
    tokenizer = Tokenizer(**tokenizer_kw)

    optim_level = 3 if args.amp else 0

    feature_proc = torch.nn.Sequential(
        torch.nn.Identity(),
        torch.nn.Identity(),
        features.FrameSplicing(optim_level=optim_level, **splicing_kw),
        features.FillPadding(optim_level=optim_level, ),
    )

    # dataset
    data_loader = DaliDataLoader(gpu_id=args.local_rank or 0,
                                 dataset_path=args.dataset_dir,
                                 config_data=dataset_kw,
                                 config_features=features_kw,
                                 json_names=[args.val_manifest],
                                 batch_size=args.batch_size,
                                 sampler=dali_sampler.SimpleSampler(),
                                 pipeline_type="val",
                                 device_type=args.dali_device,
                                 tokenizer=tokenizer)

    model = RNNT(n_classes=tokenizer.num_labels + 1, **config.rnnt(cfg))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feature_proc is not None:
        feature_proc.to(device)
        feature_proc.eval()

    if args.amp:
        model = amp.initialize(model, opt_level='O3')

    if multi_gpu:
        model = DistributedDataParallel(model)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    rep_loader = chain(*repeat(data_loader, args.repeats))
    rep_len = args.repeats * len(data_loader)

    blank_idx = tokenizer.num_labels
    greedy_decoder = RNNTGreedyDecoder(blank_idx=blank_idx)

    def sync_time():
        # Wait for pending CUDA work so the timestamp reflects finished work.
        if device.type == 'cuda':
            torch.cuda.synchronize()
        return time.perf_counter()

    sz = []

    with torch.no_grad():

        for it, batch in enumerate(tqdm.tqdm(rep_loader, total=rep_len)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feature_proc is not None:
                    feats, feat_lens = feature_proc([feats, feat_lens])
            else:
                batch = [t.cuda(non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feature_proc([audio, audio_lens])
            feats = feats.permute(2, 0, 1)
            if args.amp:
                feats = feats.half()

            sz.append(feats.size(0))

            t1 = sync_time()
            log_probs, log_prob_lens = model(feats, feat_lens, txt, txt_lens)
            t2 = sync_time()

            # burn-in period; wait for a new loader due to num_workers
            # (t0 is taken at the end of the previous iteration, so it is
            # only read when it >= 1)
            if it >= 1 and (args.repeats == 1 or it >= len(data_loader)):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts(
                    [txt], [txt_lens], tokenizer.detokenize)

            preds = greedy_decoder.decode(model, feats, feat_lens)

            agg['preds'] += helpers.gather_predictions(
                [preds], tokenizer.detokenize)

            if 0 < args.steps < it:
                break

            t0 = sync_time()

        # communicate the results
        if args.transcribe_wav:
            for idx, p in enumerate(agg['preds']):
                print_once(f'Prediction {idx+1: >3}: {p}')

        elif args.transcribe_filelist:
            pass

        else:
            wer, loss = process_evaluation_epoch(agg)

            if not multi_gpu or distrib.get_rank() == 0:
                dllogger.log(step=(), data={'eval_wer': 100 * wer})

        if args.save_predictions:
            with open(args.save_predictions, 'w') as f:
                f.write('\n'.join(agg['preds']))

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]

        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(),
                             data={f'{stage.lower()}_latency_{kk}': lat[k]})

    else:
        # TODO measure at least avg latency
        print_once('Not enough samples to measure latencies.')
def main():
    """Run QuartzNet inference/evaluation and report WER and latencies.

    Parses the command line, initialises DLLogger, builds one of three
    input pipelines (single wav / file list, DALI, or the plain PyTorch
    loader), loads the model (optionally from a checkpoint, optionally
    exported through TorchScript) and loops over the data for
    ``args.steps + args.warmup_steps`` iterations, or one pass over the
    loader when that sum is zero. Predictions, transcripts and logits are
    aggregated; WER is logged by rank 0 (or the only process) and latency
    percentiles are logged when at least 20 timed iterations were collected.
    """
    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)
    ])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # Build the key exactly as the timing report below does
            # ('.' replaced by '_'); otherwise the metadata is registered
            # under a name that is never logged.
            cc = str(c).replace('.', '_')
            dllogger.metadata(f'{step.lower()}_latency_{cc}',
                              {'name': f'{step} latency {cs}',
                               'format': ':>7.2f', 'unit': 'ms'})
    dllogger.metadata(
        'eval_wer', {'name': 'WER', 'format': ':>3.2f', 'unit': '%'})

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        # Offset by the local rank so each process gets a different seed.
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)
    config.apply_config_overrides(cfg, args)

    symbols = helpers.add_ctc_blank(cfg['labels'])

    use_dali = args.dali_device in ('cpu', 'gpu')
    dataset_kw, features_kw = config.input(cfg, 'val')

    measure_perf = args.steps > 0

    # dataset
    if args.transcribe_wav or args.transcribe_filelist:

        if use_dali:
            print("DALI supported only with input .json files; disabling")
            use_dali = False

        assert not args.pad_to_max_duration
        assert not (args.transcribe_wav and args.transcribe_filelist)

        if args.transcribe_wav:
            dataset = SingleAudioDataset(args.transcribe_wav)
        else:
            dataset = FilelistDataset(args.transcribe_filelist)

        data_loader = get_data_loader(dataset,
                                      batch_size=1,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=0,
                                      drop_last=measure_perf)

        _, features_kw = config.input(cfg, 'val')
        feat_proc = FilterbankFeatures(**features_kw)

    elif use_dali:
        # pad_to_max_duration is not supported by DALI - have simple padders
        if features_kw['pad_to_max_duration']:
            feat_proc = BaseFeatures(
                pad_align=features_kw['pad_align'],
                pad_to_max_duration=True,
                max_duration=features_kw['max_duration'],
                sample_rate=features_kw['sample_rate'],
                window_size=features_kw['window_size'],
                window_stride=features_kw['window_stride'])
            features_kw['pad_to_max_duration'] = False
        else:
            feat_proc = None

        data_loader = DaliDataLoader(
            gpu_id=args.local_rank or 0,
            dataset_path=args.dataset_dir,
            config_data=dataset_kw,
            config_features=features_kw,
            json_names=args.val_manifests,
            batch_size=args.batch_size,
            pipeline_type=("train" if measure_perf else "val"),  # no drop_last
            device_type=args.dali_device,
            symbols=symbols)

    else:
        dataset = AudioDataset(args.dataset_dir,
                               args.val_manifests,
                               symbols,
                               **dataset_kw)

        data_loader = get_data_loader(dataset,
                                      args.batch_size,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=4,
                                      drop_last=False)

        feat_proc = FilterbankFeatures(**features_kw)

    model = QuartzNet(encoder_kw=config.encoder(cfg),
                      decoder_kw=config.decoder(cfg, n_classes=len(symbols)))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feat_proc is not None:
        feat_proc.to(device)
        feat_proc.eval()

    if args.amp:
        model = model.half()

    if args.torchscript:
        greedy_decoder = GreedyCTCDecoder()

        feat_proc, model, greedy_decoder = torchscript_export(
            data_loader, feat_proc, model, greedy_decoder, args.output_dir,
            use_amp=args.amp, use_conv_masks=True, model_toml=args.model_toml,
            device=device, save=args.torchscript_export)

    if multi_gpu:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    # Cycle over the loader indefinitely; the loop below stops itself.
    looped_loader = chain.from_iterable(repeat(data_loader))
    # NOTE(review): this replaces the decoder returned by torchscript_export
    # above when args.torchscript is set - confirm that is intended.
    greedy_decoder = GreedyCTCDecoder()

    def sync():
        # Wait for pending CUDA work before a timestamp is taken.
        if device.type == 'cuda':
            torch.cuda.synchronize()

    # ``+`` binds tighter than ``or``: fall back to one pass over the loader
    # only when steps + warmup_steps is zero.
    steps = args.steps + args.warmup_steps or len(data_loader)
    with torch.no_grad():

        for it, batch in enumerate(tqdm(looped_loader, initial=1,
                                        total=steps)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feat_proc is not None:
                    feats, feat_lens = feat_proc(feats, feat_lens)
            else:
                batch = [t.to(device, non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feat_proc(audio, audio_lens)

            sync()
            t1 = time.perf_counter()

            if args.amp:
                feats = feats.half()

            if model.encoder.use_conv_masks:
                log_probs, log_prob_lens = model(feats, feat_lens)
            else:
                log_probs = model(feats, feat_lens)

            preds = greedy_decoder(log_probs)

            sync()
            t2 = time.perf_counter()

            # burn-in period; wait for a new loader due to num_workers
            # (t0 is taken at the end of the previous iteration, so it is
            # only read when it >= 1)
            if it >= 1 and (args.steps == 0 or it >= args.warmup_steps):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
                                                          symbols)
            agg['preds'] += helpers.gather_predictions([preds], symbols)
            agg['logits'].append(log_probs)

            if it + 1 == steps:
                break

            sync()
            t0 = time.perf_counter()

        # communicate the results
        if args.transcribe_wav:
            for idx, p in enumerate(agg['preds']):
                print_once(f'Prediction {idx+1: >3}: {p}')

        elif args.transcribe_filelist:
            pass

        elif not multi_gpu or distrib.get_rank() == 0:
            wer, _ = process_evaluation_epoch(agg)

            dllogger.log(step=(), data={'eval_wer': 100 * wer})

        if args.save_predictions:
            with open(args.save_predictions, 'w') as f:
                f.write('\n'.join(agg['preds']))

        if args.save_logits:
            logits = torch.cat(agg['logits'], dim=0).cpu()
            torch.save(logits, args.save_logits)

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]
        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(),
                             data={f'{stage.lower()}_latency_{kk}': lat[k]})

    else:
        print_once('Not enough samples to measure latencies.')
def main(_):
    """Run the training / evaluation / prediction phases selected by FLAGS.

    Initialises Horovod and DLLogger, builds the ``MSDDataset`` and a
    ``tf.estimator.Estimator`` around ``vnet_v2``, and then executes every
    phase whose name appears in ``FLAGS.exec_mode`` ('train', 'evaluate',
    'predict'), in that order.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: On rank 0, when evaluation is requested while
            ``FLAGS.train_split >= 1.0``.
    """
    tf.get_logger().setLevel(logging.ERROR)
    hvd.init()

    FLAGS = PARSER.parse_args()

    # Logging backends are attached on rank 0 only; every other rank
    # initialises DLLogger with an empty backend list.
    # NOTE(review): the JSON backend is nested under the rank-0 check here —
    # confirm against the intended layout.
    backends = []
    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.DEFAULT)]
        if FLAGS.log_dir:
            backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]

    DLLogger.init(backends=backends)

    # Record every parsed flag as its own PARAMETER entry.
    for key, value in vars(FLAGS).items():
        DLLogger.log(step="PARAMETER", data={str(key): value})

    # Runtime environment switches, set before the session config and
    # estimator are created.
    os.environ.update({
        'CUDA_CACHE_DISABLE': '0',
        'HOROVOD_GPU_ALLREDUCE': 'NCCL',
        'TF_CPP_MIN_LOG_LEVEL': '3',
        'TF_GPU_THREAD_MODE': 'gpu_private',
        'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT': '1',
        'TF_ADJUST_HUE_FUSED': '1',
        'TF_ADJUST_SATURATION_FUSED': '1',
        'TF_ENABLE_WINOGRAD_NONFUSED': '1',
        'TF_SYNC_ON_FINISH': '0',
        'TF_AUTOTUNE_THRESHOLD': '2',
        'TF_DISABLE_NVTX_RANGES': '1',
    })

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir, 'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    # The estimator receives FLAGS as `params`, so the dataset's labels are
    # attached to it before the estimator is built.
    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.allow_growth = True
    # Each process is restricted to the GPU matching its Horovod local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.use_amp:
        config.graph_options.rewrite_options.auto_mixed_precision = 1

    # No step-based checkpoint interval in benchmark mode; otherwise the
    # interval equals the total number of training steps.
    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=None if FLAGS.benchmark else dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    # Only rank 0 is given a model directory.
    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs
        if FLAGS.benchmark:
            # Benchmark runs use twice the warm-up step count.
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(),
                                  DLLogger)
                ]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(
            input_fn=lambda: dataset.train_fn(FLAGS.augment),
            steps=steps,
            hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Missing argument: --train_split < 1.0")
            result = estimator.evaluate(
                input_fn=dataset.eval_fn,
                steps=dataset.eval_steps,
                hooks=[])
            DLLogger.log(step=tuple(), data={
                'background_dice': str(result['background dice']),
                'anterior_dice': str(result['Anterior dice']),
                'posterior_dice': str(result['Posterior dice'])
            })

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []
        if hvd.rank() == 0:
            if FLAGS.benchmark:
                # `count` is forwarded to dataset.test_fn; presumably a
                # repeat factor so that 2 * warmup_steps batches are
                # available — confirm against MSDDataset.test_fn.
                count = math.ceil(
                    (FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(),
                                  DLLogger,
                                  training=False)
                ]
            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count),
                hooks=hooks)
            pred = [p['prediction'] for p in predictions]

            # Start from an empty predictions directory on every run.
            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)
            os.makedirs(predict_path)

            # Context manager guarantees the pickle is flushed and the
            # handle closed even if serialisation raises.
            with open(os.path.join(predict_path, 'predictions.pkl'), 'wb') as pred_file:
                pickle.dump(pred, pred_file)