コード例 #1
0
def init(log_fpath, log_dir, enabled=True, tb_subsets=(), **tb_kw):
    """Initialise DLLogger and create one TensorBoard logger per subset.

    Args:
        log_fpath: Path of the JSON log; it is passed through
            ``unique_log_fpath`` before being handed to the JSON backend.
        log_dir: Directory forwarded to every ``TBLogger``.
        enabled: When false, DLLogger is initialised without backends. The
            flag is also forwarded to each ``TBLogger``.
        tb_subsets: Iterable of subset names; one ``TBLogger`` is created per
            name. Defaults to an (immutable) empty tuple.
        **tb_kw: Extra keyword arguments forwarded to ``TBLogger``.

    Side effects:
        Rebinds the module-level ``tb_loggers`` dictionary.
    """
    global tb_loggers

    if enabled:
        backends = [JSONStreamBackend(Verbosity.DEFAULT,
                                      unique_log_fpath(log_fpath)),
                    StdOutBackend(Verbosity.VERBOSE,
                                  step_format=stdout_step_format,
                                  metric_format=stdout_metric_format)]
    else:
        backends = []

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    # Register display metadata for every (subset, metric) combination.
    for id_, pref in [('train', ''), ('train_avg', 'avg train '),
                      ('val', '  avg val '), ('val_ema', '  EMA val ')]:

        dllogger.metadata(f"{id_}_loss",
                          {"name": f"{pref}loss", "format": ":>5.2f"})
        dllogger.metadata(f"{id_}_mel_loss",
                          {"name": f"{pref}mel loss", "format": ":>5.2f"})

        dllogger.metadata(f"{id_}_frames/s",
                          {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>3.2f"})

    tb_loggers = {s: TBLogger(enabled, log_dir, name=s, **tb_kw)
                  for s in tb_subsets}
コード例 #2
0
	def __init__(self, log_file, global_batch_size, warmup_steps: int = 0, profile: bool = False):
		"""Initialise the logger backends and reset the step/timing state.

		Args:
			log_file: File name handed to the JSON stream backend.
			global_batch_size: Stored on the instance as-is.
			warmup_steps: Stored on the instance as-is.
			profile: Stored on the instance as-is.
		"""
		json_backend = JSONStreamBackend(Verbosity.VERBOSE, log_file)
		stdout_backend = StdOutBackend(Verbosity.VERBOSE)
		logger.init(backends=[json_backend, stdout_backend])

		# Configuration supplied by the caller.
		self.global_batch_size = global_batch_size
		self.warmup_steps = warmup_steps
		self.profile = profile

		# Mutable run state, starts empty.
		self.step = 0
		self.timestamps = []
コード例 #3
0
def init_dllogger(log_fpath=None, dummy=False):
    """Initialise DLLogger and register the display metadata of all metrics.

    Args:
        log_fpath: File name handed to the JSON stream backend.
        dummy: When true, DLLogger is initialised without any backend and no
            metadata is registered.
    """
    if dummy:
        DLLogger.init(backends=[])
        return

    json_backend = JSONStreamBackend(Verbosity.DEFAULT, log_fpath)
    stdout_backend = StdOutBackend(Verbosity.VERBOSE,
                                   step_format=stdout_step_format,
                                   metric_format=stdout_metric_format)
    DLLogger.init(backends=[json_backend, stdout_backend])

    # (metric key prefix, human readable prefix) for each reported subset.
    subsets = (
        ("train", ""),
        ("avg_train", "avg train "),
        ("val", "  avg val "),
        ("val_ema", "  EMA val "),
    )

    for key, label in subsets:
        DLLogger.metadata(f"{key}_loss",
                          {"name": f"{label}loss", "format": ":>5.2f"})
        DLLogger.metadata(f"{key}_mel_loss",
                          {"name": f"{label}mel loss", "format": ":>5.2f"})

    for key, _ in subsets:
        DLLogger.metadata(
            f"{key}_frames/s",
            {"name": None, "unit": "frames/s", "format": ":>10.2f"})

    DLLogger.metadata("took", {"name": "took", "unit": "s", "format": ":>3.2f"})
    DLLogger.metadata("lrate_change", {"name": "lrate"})
コード例 #4
0
def main():
    """
    Benchmark FastPitch inference on a single GPU.

    Parses the command line, initialises DLLogger and logs every parameter,
    then runs the model on random inputs. The first ``warmup_iters``
    iterations are warm-up and are excluded from the report; for the
    remaining ones latency and frames/s are logged, followed by averages.
    """
    parser = parse_args(argparse.ArgumentParser(
        description='PyTorch FastPitch Inference Benchmark'))
    args, _ = parser.parse_known_args()

    log_file = args.log_file
    json_backend = JSONStreamBackend(Verbosity.DEFAULT, log_file)
    stdout_backend = StdOutBackend(Verbosity.VERBOSE)
    DLLogger.init(backends=[json_backend, stdout_backend])

    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'})

    model = load_and_setup_model(
        'FastPitch', parser, None, args.amp_run, 'cuda',
        unk_args=[], forward_is_infer=True, ema=False, jitable=True)

    # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03
    # model = torch.jit.script(model)

    warmup_iters = 3
    iters = 1
    timer = MeasureTime()
    total_frames = 0

    # Negative indices are warm-up iterations; only i >= 0 is reported.
    for i in range(-warmup_iters, iters):
        batch_shape = (args.batch_size, 128)
        text_padded = torch.randint(
            low=0, high=148, size=batch_shape, dtype=torch.long).to('cuda')
        seq_len = text_padded.size(1)
        input_lengths = torch.IntTensor(
            [seq_len] * args.batch_size).to('cuda')
        durs = torch.ones_like(text_padded).mul_(4).to('cuda')

        with torch.no_grad(), timer:
            mels, *_ = model(text_padded, input_lengths, dur_tgt=durs)
        num_frames = mels.size(0) * mels.size(2)

        if i < 0:
            continue

        total_frames += num_frames
        latency = timer[-1]
        DLLogger.log(step=(i, ), data={"latency": latency})
        DLLogger.log(step=(i, ), data={"frames/s": num_frames / latency})

    # Drop the warm-up timings before averaging.
    reported = timer[warmup_iters:]
    DLLogger.log(step=(), data={'avg latency': np.mean(reported)})
    DLLogger.log(step=(),
                 data={'avg frames/s': total_frames / np.sum(reported)})
    DLLogger.flush()
コード例 #5
0
def get_logger(params):
    """Initialise the module-level ``logger`` and return it.

    Backends are attached only when ``hvd.rank()`` is 0: a stdout backend
    always, plus a JSON stream backend writing to ``params.log_dir`` when
    that attribute is truthy. Other ranks get no backends.
    """
    is_root = hvd.rank() == 0

    backends = []
    if is_root:
        backends.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            backends.append(
                JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))

    logger.init(backends=backends)
    return logger
コード例 #6
0
ファイル: logger.py プロジェクト: seitak/DeepLearningExamples
 def _initialize_dllogger(self, log_dir, filename, append):
     """Initialise ``logger`` with a JSON file backend and a stdout backend.

     Args:
         log_dir: Directory of the JSON log.
         filename: File name joined onto ``log_dir``.
         append: Forwarded to ``JSONStreamBackend`` as ``append``.
     """
     json_path = os.path.join(log_dir, filename)
     json_backend = JSONStreamBackend(Verbosity.VERBOSE,
                                      json_path,
                                      append=append)
     stdout_backend = StdOutBackend(Verbosity.VERBOSE)
     logger.init(backends=[json_backend, stdout_backend])
コード例 #7
0
def log(logname, dice, results="/results"):
    """Log the mean and the per-entry dice scores, rounded to two decimals.

    Args:
        logname: File name of the JSON log, joined onto ``results``.
        dice: Iterable object exposing ``mean()``; its mean and each element
            must provide ``item()``.
        results: Directory of the JSON log.
    """
    json_path = os.path.join(results, logname)
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])

    metrics = {"Mean dice": round(dice.mean().item(), 2)}
    # Entries are reported 1-based: L1, L2, ...
    for idx, value in enumerate(dice, start=1):
        metrics[f"L{idx}"] = round(value.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
コード例 #8
0
ファイル: setup.py プロジェクト: HabanaAI/Model-References
def get_logger(params):
    """Initialise the module-level ``logger`` and return it.

    Worker 0 (or the only worker when Horovod is not enabled) logs to stdout
    and, when ``params.log_dir`` is truthy, to ``<log_dir>/log.json``; the
    directory is created if needed. Other workers get no backends.
    """
    if horovod_enabled():
        worker_id = hvd_rank()
    else:
        worker_id = 0

    backends = []
    if worker_id == 0:
        backends.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            backends.append(
                JSONStreamBackend(Verbosity.VERBOSE,
                                  f"{params.log_dir}/log.json"))

    logger.init(backends=backends)
    return logger
コード例 #9
0
def setup_logger(args):
    """Initialise DLLogger for a run and register metric metadata.

    The JSON log is written to ``args.results/args.log_file``. If that file
    already exists, a numbered sibling is used instead (``name_0.ext``,
    ``name_1.ext``, ... or ``name.0``, ``name.1``, ... when the name has no
    extension), so earlier logs are not overwritten.

    Backends are attached only when torch.distributed is not initialised,
    the world size is not greater than one, or this is rank 0; otherwise
    DLLogger is initialised without backends.

    Args:
        args: Parsed arguments; reads ``results``, ``log_file``,
            ``distributed_world_size`` and ``distributed_rank``.
    """
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        name_parts = args.log_file.split('.')
        for i in itertools.count():
            if len(name_parts) > 1:
                # Insert the counter in front of the last extension.
                fname = '.'.join(name_parts[:-1]) + f'_{i}.' + name_parts[-1]
            else:
                # No extension: append the counter to the log file name
                # itself (previously this used the unrelated `stat_file`).
                fname = args.log_file + f'.{i}'
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    # NOTE: currently not passed to StdOutBackend; kept for re-enabling.
    def metric_format(metric, metadata, value):
        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
                                TensorBoardBackend(verbosity=1, log_dir=args.results),
                                StdOutBackend(verbosity=2,
                                              step_format=step_format,
                                              prefix_format=lambda x: "")
                                ])
    else:
        dllogger.init(backends=[])
    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    # (metric, GOAL, STAGE, format); registered in this order.
    metric_metadata = [
        ('loss', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P10', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P50', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P90', 'MINIMIZE', 'TRAIN', ':5f'),
        ('items/s', 'MAXIMIZE', 'TRAIN', ':1f'),
        ('val_loss', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P10', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P50', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P90', 'MINIMIZE', 'VAL', ':5f'),
        ('val_items/s', 'MAXIMIZE', 'VAL', ':1f'),
        ('test_P10', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P50', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P90', 'MINIMIZE', 'TEST', ':5f'),
        ('throughput', 'MAXIMIZE', 'TEST', ':1f'),
        # Latency goals were misspelled 'MIMIMIZE' before.
        ('latency_p90', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p95', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p99', 'MINIMIZE', 'TEST', ':5f'),
    ]
    for name, goal, stage, fmt in metric_metadata:
        dllogger.metadata(name, {'GOAL': goal, 'STAGE': stage, 'format': fmt})
コード例 #10
0
def get_dllogger(params):
    """Initialise the module-level ``logger`` and return it.

    Only the main process gets backends: stdout always, plus a JSON stream
    written to ``<params.log_dir>/log.json`` when ``params.log_dir`` is
    truthy.
    """
    backends = []
    if is_main_process():
        backends.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            json_path = os.path.join(params.log_dir, "log.json")
            backends.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))

    logger.init(backends=backends)
    return logger
コード例 #11
0
def init_log(args):
    """Initialise DLLogger and the TensorBoard loggers, then log the arguments.

    Logging is enabled when torch.distributed is not initialised or this
    process has rank 0. The JSON log goes to ``args.log_file`` or, when that
    is falsy, to ``<args.output_dir>/nvlog.json`` (made unique through
    ``unique_log_fpath``).

    Side effects:
        Rebinds the module-level ``tb_loggers`` dictionary.
    """
    global tb_loggers

    enabled = (not dist.is_initialized()) or dist.get_rank() == 0

    backends = []
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        json_backend = JSONStreamBackend(Verbosity.DEFAULT,
                                         unique_log_fpath(fpath))
        stdout_backend = StdOutBackend(Verbosity.VERBOSE,
                                       step_format=stdout_step_format,
                                       metric_format=stdout_metric_format)
        backends = [json_backend, stdout_backend]

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    # (key suffix, display suffix, format) of the metrics whose display name
    # carries the subset prefix.
    prefixed_metrics = (
        ("loss", "loss", ":>7.2f"),
        ("wer", "wer", ":>6.2f"),
        ("pplx", "pplx", ":>6.2f"),
        ("throughput", "utts/s", ":>5.0f"),
    )
    subsets = [('train', ''), ('train_avg', 'avg train '),
               ('dev_ema', '  dev ema ')]

    for id_, pref in subsets:
        for key_suffix, label, fmt in prefixed_metrics:
            dllogger.metadata(f"{id_}_{key_suffix}",
                              {"name": f"{pref}{label}", "format": fmt})
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>5.2f"})

    tb_loggers = {
        subset: TBLogger(enabled, args.output_dir, name=subset)
        for subset in ['train', 'dev_ema']
    }

    log_parameters(vars(args), tb_subset='train')
コード例 #12
0
def setup_dllogger(rank, enabled=True, filename='log.json'):
    """Initialise DLLogger; only an enabled rank 0 gets backends.

    Args:
        rank: Rank of the calling process.
        enabled: When false, no backend is attached on any rank.
        filename: File name handed to the JSON stream backend.
    """
    backends = []
    if enabled and rank == 0:
        stdout_backend = StdOutBackend(Verbosity.DEFAULT)
        json_backend = JSONStreamBackend(Verbosity.VERBOSE, filename)
        backends = [stdout_backend, json_backend]
    DLLogger.init(backends)
コード例 #13
0
def get_logger(params):
    """ Initialise and return the module-level logger

    Backends are attached only on Horovod rank 0: stdout always, plus a JSON
    stream written to ``params.log_dir`` when that attribute is truthy.

    :param params: Object with additional parameters; ``log_dir`` is read
    :return: logger
    """
    active = []
    if hvd.rank() == 0:
        active.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            active.append(JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))
    logger.init(backends=active)
    return logger
コード例 #14
0
ファイル: main.py プロジェクト: zachwe/DeepLearningExamples
def log(logname, dice, epoch=None, dice_tta=None):
    """Log dice scores (and optional TTA dice scores), rounded to two decimals.

    The JSON log is written to ``logname`` inside the directory given by the
    module-level ``args.results``.

    Args:
        logname: File name of the JSON log.
        dice: Iterable object exposing ``mean()``; its mean and each element
            must provide ``item()``.
        epoch: Optional epoch number, reported as ``"Epoch"`` when given.
        dice_tta: Optional scores of the same kind as ``dice``, reported
            with a ``TTA`` marker when given.
    """
    json_path = os.path.join(args.results, logname)
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])

    has_tta = dice_tta is not None

    # Insertion order defines the order in which metrics are reported.
    metrics = {}
    if epoch is not None:
        metrics["Epoch"] = epoch
    metrics["Mean dice"] = round(dice.mean().item(), 2)
    if has_tta:
        metrics["Mean TTA dice"] = round(dice_tta.mean().item(), 2)
    for idx, value in enumerate(dice, start=1):
        metrics[f"L{idx}"] = round(value.item(), 2)
    if has_tta:
        for idx, value in enumerate(dice_tta, start=1):
            metrics[f"TTA_L{idx}"] = round(value.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
コード例 #15
0
def main():
    """Demonstrate the LOGGER API: backends, metrics and timed blocks.

    Configures three backends, parses a dummy command-line argument, logs
    hardware and arguments, registers three metrics and runs ``train()``
    inside the run block.
    """
    LOGGER.set_model_name('ResNet')

    stdout_backend = StdOutBackend(log_file='std.out',
                                   logging_scope=Scope.TRAIN_ITER)
    compact_backend = CompactBackend(log_file=None,
                                     logging_scope=Scope.TRAIN_ITER,
                                     iteration_interval=5)
    json_backend = JsonBackend(log_file='dummy.json',
                               logging_scope=Scope.TRAIN_ITER,
                               iteration_interval=4)
    LOGGER.set_backends([stdout_backend, compact_backend, json_backend])

    arg_parser = ArgumentParser()
    arg_parser.add_argument('--dummy', type=str, default='default_dummy_value')
    args = arg_parser.parse_args()

    LOGGER.log_hardware()
    LOGGER.log_args(args)

    LOGGER.log(tags.RUN_INIT)

    # Metrics registered with an explicit scope, and one without.
    LOGGER.register_metric('loss',
                           meter=AverageMeter(),
                           metric_scope=Scope.TRAIN_ITER)
    LOGGER.register_metric('epoch_nr', metric_scope=Scope.EPOCH)
    LOGGER.register_metric('epochs2')

    with LOGGER.timed_block(tags.SETUP_BLOCK):
        print("This is setup.")

    with LOGGER.timed_block(tags.PREPROC_BLOCK):
        print("This is preprocessing.")

    with LOGGER.timed_block(tags.RUN_BLOCK):
        print("This is run.")
        train()
        print("This is the end.")

    LOGGER.log(tags.RUN_FINAL)
    LOGGER.finish()
コード例 #16
0
 def __init__(self, args):
     """Build the model, loss, metric and logger from ``args``.

     Args:
         args: Configuration object; reads ``type``, ``lr``, ``results``
             and ``logname`` and is forwarded to ``F1``, ``Loss`` and the
             network constructor.
     """
     super(Model, self).__init__()
     # Kept directly after the super call, as in the original ordering.
     self.save_hyperparameters()
     self.args = args

     # Metric, network and loss are assigned in this fixed order.
     self.f1_score = F1(args)
     if args.type == "pre":
         self.model = UNetLoc(args)
     else:
         self.model = get_dmg_unet(args)
     self.loss = Loss(args)

     # Best-score bookkeeping.
     self.best_f1 = torch.tensor(0)
     self.best_epoch = 0

     self.tta_flips = [[2], [3], [2, 3]]
     self.lr = args.lr
     if self.args.type == "pre":
         self.n_class = 2
     else:
         self.n_class = 5
     self.softmax = nn.Softmax(dim=1)
     self.test_idx = 0

     json_path = os.path.join(args.results, f"{args.logname}.json")
     self.dllogger = Logger(backends=[
         JSONStreamBackend(Verbosity.VERBOSE, json_path),
         StdOutBackend(Verbosity.VERBOSE,
                       step_format=lambda step: f"Epoch: {step} "),
     ])
コード例 #17
0
def setup_logger(config):
    """Create a ``Logger``, log the framework environment and return it.

    The main process gets TensorBoard, JSON (``<log_path>/log.json``),
    aggregator and stdout backends; every other process gets a logger
    without backends. ``log_path`` is read from ``config`` and defaults to
    the current working directory.

    Args:
        config: Mapping queried with ``get("log_path", ...)``.

    Returns:
        The configured ``Logger`` instance.
    """
    log_path = config.get("log_path", os.getcwd())

    backends = []
    if is_main_process():
        verbose = dllogger.Verbosity.VERBOSE
        tensorboard_backend = TensorBoardBackend(verbosity=verbose,
                                                 log_dir=log_path)
        json_backend = JSONStreamBackend(
            verbosity=verbose,
            filename=os.path.join(log_path, "log.json"))
        aggregator_backend = AggregatorBackend(
            verbosity=verbose, agg_dict={"loss": AverageMeter})
        stdout_backend = StdOutBackend(
            verbosity=dllogger.Verbosity.DEFAULT,
            step_format=empty_step_format,
            metric_format=no_string_metric_format,
            prefix_format=empty_prefix_format,
        )
        backends = [tensorboard_backend, json_backend,
                    aggregator_backend, stdout_backend]

    logger = Logger(backends=backends)

    logger.log(step="PARAMETER",
               data=get_framework_env_vars(),
               verbosity=dllogger.Verbosity.DEFAULT)

    for metric, stage in (("loss", "TRAIN"), ("val_loss", "VAL")):
        logger.metadata(metric, {
            "unit": "nat",
            "GOAL": "MINIMIZE",
            "STAGE": stage
        })
    return logger
コード例 #18
0
    def __init__(self, args):
        """Build the network, loss and metric and reset best-score tracking.

        Args:
            args: Configuration object; reads ``learning_rate``,
                ``exec_mode``, ``results`` and ``dim``.
        """
        super(NNUnet, self).__init__()
        self.args = args
        self.save_hyperparameters()
        self.build_nnunet()
        self.loss = Loss()
        self.dice = Dice(self.n_class)

        # Scalar bookkeeping of the best results seen so far.
        self.best_sum = 0
        self.eval_dice = 0
        self.best_sum_epoch = 0

        # One slot per class.
        self.best_dice = self.n_class * [0]
        self.best_epoch = self.n_class * [0]
        self.best_sum_dice = self.n_class * [0]

        self.learning_rate = args.learning_rate

        # The logger is only created for the train and evaluate modes.
        if self.args.exec_mode in ("train", "evaluate"):
            json_path = os.path.join(args.results, "logs.json")
            self.dllogger = Logger(backends=[
                JSONStreamBackend(Verbosity.VERBOSE, json_path),
                StdOutBackend(Verbosity.VERBOSE,
                              step_format=lambda step: f"Epoch: {step} "),
            ])

        if self.args.dim == 2:
            self.tta_flips = [[2], [3], [2, 3]]
        else:
            self.tta_flips = [[2], [3], [4], [2, 3], [2, 4], [3, 4],
                              [2, 3, 4]]
コード例 #19
0
def main():
    """Run Tacotron 2 + WaveGlow inference through TensorRT engines.

    Loads the four TensorRT engines named on the command line, optionally
    builds a denoiser from a WaveGlow PyTorch checkpoint, synthesises audio
    for the lines of ``args.input``, writes one ``audio_<i>_trt.wav`` per
    input line into ``args.output`` and logs the measured latencies. A CSV
    line with the run summary is appended to ``log_bs1_<precision>.log``.

    Exits with status 1 when the input file cannot be read.
    """
    import os  # function-scope import: only needed to build audio paths

    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    use_denoiser = args.waveglow_ckpt != ""
    if use_denoiser:
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
                                             True, forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
                                              args.output+'/'+args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])

    # Read the input with a context manager so the handle is always closed,
    # and only treat I/O / decoding failures as "could not read".
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context, postnet_context,
                                               sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)

    # Entering and leaving the contexts here runs their exit handlers.
    with encoder_context, decoder_context,  postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if use_denoiser:
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        # Trim to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        audio = audio/torch.max(torch.abs(audio))
        # os.path.join works whether or not args.output ends with a separator.
        audio_path = os.path.join(args.output, "audio_"+str(i)+"_trt.wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if use_denoiser:
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1)/latency
    # CSV columns: batch size, input length, precision, latency,
    # throughput, number of mel frames.
    fields = ("1", sequence_lengths[0].item(), prec, latency, throughput,
              mel_lengths[0].item())
    log_data = ",".join(str(field) for field in fields) + "\n"
    with open("log_bs1_"+prec+".log", 'a') as f:
        f.write(log_data)
コード例 #20
0
def main():
    """
    Benchmark Tacotron 2 + WaveGlow text-to-speech inference.

    Runs ``args.num_iters`` iterations on a fixed prompt, on GPU or CPU
    depending on ``args.cpu_run``. Measurements of the first
    ``warmup_iters`` iterations are discarded; the rest are logged through
    DLLogger and summarised with ``print_stats``.
    """
    parser = parse_args(argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference'))
    args, unknown_args = parser.parse_known_args()

    json_backend = JSONStreamBackend(Verbosity.DEFAULT, args.log_file)
    DLLogger.init(backends=[json_backend, StdOutBackend(Verbosity.VERBOSE)])
    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    # One list of collected values per reported metric.
    metric_names = ("pre_processing",
                    "tacotron2_latency",
                    "waveglow_latency",
                    "latency",
                    "type_conversion",
                    "data_transfer",
                    "storage",
                    "tacotron2_items_per_sec",
                    "waveglow_items_per_sec",
                    "num_mels_per_audio",
                    "throughput")
    measurements_all = {name: [] for name in metric_names}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run)

    denoiser = Denoiser(waveglow, args.cpu_run)
    if not args.cpu_run:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # Fixed prompt, truncated to the requested length and replicated to
    # fill the batch.
    full_text = "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    texts = [full_text[:args.input_length]] * args.batch_size

    warmup_iters = 3

    for iteration in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)

        with torch.no_grad():
            # "latency" covers both networks; the nested timers split it up.
            with MeasureTime(measurements, "latency", args.cpu_run):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                    mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)
                    audios = audios.float()
                    audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                trimmed = audio[:mel_lengths[i]*args.stft_hop_length]
                write(audio_path, args.sampling_rate, trimmed)

        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        if iteration < warmup_iters:
            # Warm-up iteration: measured but not reported.
            continue

        for name, value in measurements.items():
            measurements_all[name].append(value)
            DLLogger.log(step=(iteration-warmup_iters), data={name: value})

    DLLogger.flush()

    print_stats(measurements_all)
コード例 #21
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Reads the input lines from ``args.input``, optionally runs three warm-up
    passes on a random sequence, synthesises audio with Tacotron 2 and
    WaveGlow (followed by the denoiser), logs throughput and latency, and
    writes one ``audio_<i>.wav`` per input line into ``args.output``.

    Exits with status 1 when the input file cannot be read.
    """
    import os  # function-scope import: only needed to build audio paths

    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.amp_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.amp_run,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # Read the input with a context manager so the handle is always closed,
    # and only treat I/O / decoding failures as "could not read".
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        # Untimed passes on a random sequence before the measured run.
        sequence = torch.randint(low=0,
                                 high=148,
                                 size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for _ in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={
                     "latency": (measurements['tacotron2_time'] +
                                 measurements['waveglow_time'])
                 })

    for i, audio in enumerate(audios):
        # Trim to the generated length, then peak-normalise.
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        # os.path.join works whether or not args.output ends with a separator.
        audio_path = os.path.join(args.output, "audio_" + str(i) + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
コード例 #22
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single device: CUDA when ``args.cuda`` is
    set, otherwise CPU.

    Steps:
      1. Parse CLI arguments and initialise DLLogger (JSON file + stdout).
      2. Load the FastPitch generator and/or the WaveGlow vocoder; passing
         ``'SKIP'`` as a checkpoint disables that stage.
      3. Run ``args.warmup_steps`` untimed warmup iterations on the first
         batch.
      4. Synthesize every batch ``args.repeats`` times, timing each stage,
         optionally saving mels (``args.save_mels``) and audio files
         (only when ``args.output`` is set and ``args.repeats == 1``).
      5. Log aggregate throughput / latency statistics and flush the logger.

    Raises:
        ValueError: if unrecognised command-line options are still left in
            ``unk_args`` after model setup.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
                            StdOutBackend(Verbosity.VERBOSE,
                                          metric_format=stdout_metric_format)])
    init_inference_metadata()
    # Plain loop: the calls are made purely for their side effect.
    for k, v in vars(args).items():
        DLLogger.log("PARAMETER", {k: v})

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch != 'SKIP':
        generator = load_and_setup_model(
            'FastPitch', parser, args.fastpitch, args.amp, device,
            unk_args=unk_args, forward_is_infer=True, ema=args.ema,
            jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow != 'SKIP':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model(
                'WaveGlow', parser, args.waveglow, args.amp, device,
                unk_args=unk_args, forward_is_infer=True, ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        # Prefer the model's `infer` method when it has one.
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    if unk_args:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(
        fields, device, args.symbol_set, args.text_cleaners, args.batch_size,
        args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet)

    # Use real data rather than synthetic - FastPitch predicts len
    for _ in tqdm(range(args.warmup_steps), 'Warmup'):
        with torch.no_grad():
            if generator is not None:
                mel, *_ = generator(batches[0]['text'])
            elif waveglow is not None:
                # No generator: batches were built with load_mels=True, so
                # warm the vocoder up on the ground-truth mel (previously
                # `mel` was undefined here and raised NameError).
                mel = batches[0]['mel']
            if waveglow is not None:
                audios = waveglow(mel, sigma=args.sigma_infer).float()
                _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime(cuda=args.cuda)
    waveglow_measures = MeasureTime(cuda=args.cuda)

    gen_kw = {'pace': args.pace,
              'speaker': args.speaker,
              'pitch_tgt': None,
              'pitch_transform': build_pitch_transformation(args)}

    if args.torchscript:
        gen_kw.pop('pitch_transform')
        print('NOTE: Pitch transforms are disabled with TorchScript')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    # Per-step logging is only emitted for a single pass; it is switched
    # back on below for the final summary.
    log_enabled = reps == 1

    def log(step, data):
        """Forward to DLLogger while `log_enabled` (read at call time) is set."""
        if log_enabled:
            DLLogger.log(step=step, data=data)

    for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                # Must be a dict like every other log payload; the original
                # passed a set literal here.
                log(rep, {'message': 'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(rep, {"fastpitch_frames/s": gen_infer_perf})
                log(rep, {"fastpitch_latency": gen_measures[-1]})

                if args.save_mels:
                    for i, mel_ in enumerate(mel):
                        m = mel_[:, :mel_lens[i].item()].permute(1, 0)
                        fname = b['output'][i] if 'output' in b else f'mel_{i}.npy'
                        mel_path = Path(args.output, Path(fname).stem + '.npy')
                        np.save(mel_path, m.cpu().numpy())

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(audios.float(),
                                      strength=args.denoising_strength
                                      ).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (
                    audios.size(0) * audios.size(1) / waveglow_measures[-1])

                log(rep, {"waveglow_samples/s": waveglow_infer_perf})
                log(rep, {"waveglow_latency": waveglow_measures[-1]})

                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        # Trim to the utterance length implied by its mel length.
                        audio = audio[:mel_lens[i].item() * args.stft_hop_length]

                        if args.fade_out:
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        # Peak-normalise before writing.
                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate, audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    def log_rtf(key, latencies):
        """Log the real-time factor; skipped when no audio was produced."""
        if all_utterances > 0:
            rtf = all_samples / (
                all_utterances * latencies.mean() * args.sampling_rate)
            log((), {key: rtf})

    def log_percentiles(prefix, latencies):
        """Log normal-approximation upper latency bounds at 90/95/99%."""
        mean, std = latencies.mean(), latencies.std()
        for label, conf in ((90, 0.90), (95, 0.95), (99, 0.99)):
            bound = mean + norm.ppf((1.0 + conf) / 2) * std
            log((), {f"{label}%_{prefix}latency": bound})

    # The summary is always logged, even when per-step logging was off.
    log_enabled = True
    # Measurements are kept in recording order (not sorted) so that the
    # element-wise sum below pairs each batch's generator time with the same
    # batch's vocoder time; sum/mean/std do not depend on ordering.
    if generator is not None:
        gm = np.asarray(gen_measures)
        log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()})
        log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()})
        log((), {"avg_fastpitch_latency": gm.mean()})
        log_rtf("avg_fastpitch_RTF", gm)
        log_percentiles("fastpitch_", gm)
    if waveglow is not None:
        wm = np.asarray(waveglow_measures)
        log((), {"avg_waveglow_samples/s": all_samples / wm.sum()})
        log((), {"avg_waveglow_latency": wm.mean()})
        log_rtf("avg_waveglow_RTF", wm)
        log_percentiles("waveglow_", wm)
    if generator is not None and waveglow is not None:
        m = gm + wm
        log((), {"avg_samples/s": all_samples / m.sum()})
        log((), {"avg_letters/s": all_letters / m.sum()})
        log((), {"avg_latency": m.mean()})
        log_rtf("avg_RTF", m)
        log_percentiles("", m)
    DLLogger.flush()
コード例 #23
0
ファイル: train.py プロジェクト: paulsok/ML
def main(_):
    """Train an EfficientDet model with Keras under Horovod, then evaluate it.

    Builds the training and evaluation configs from ``FLAGS``, initialises
    DLLogger on the main process, compiles and fits the model, saves the
    final (or latest EMA) checkpoint, optionally runs COCO evaluation on the
    evaluation dataset, and finally logs aggregate statistics.

    Args:
        _: Unused positional argument (presumably the argv list passed by
            the application runner -- confirm against the caller).

    Raises:
        RuntimeError: if any of ``--training_file_pattern``,
            ``--val_file_pattern`` or ``--val_json_file`` is missing.
    """

    # get e2e training time
    begin = time.time()
    logging.info("Training started at: {}".format(time.asctime()))

    hvd.init()

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs
    if FLAGS.lr:
        config.learning_rate = FLAGS.lr
    if FLAGS.warmup_value:
        config.lr_warmup_init = FLAGS.warmup_value
    if FLAGS.warmup_epochs:
        config.lr_warmup_epoch = FLAGS.warmup_epochs
    config.backbone_init = FLAGS.backbone_init
    config.mixed_precision = FLAGS.amp
    config.image_size = model_utils.parse_image_size(config.image_size)

    # get eval config
    # Built separately from `config` so the eval-only overrides below do not
    # touch the training configuration.
    eval_config = hparams_config.get_detection_config(FLAGS.model_name)
    eval_config.override(FLAGS.hparams)
    eval_config.val_json_file = FLAGS.val_json_file
    eval_config.val_file_pattern = FLAGS.val_file_pattern
    eval_config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    eval_config.drop_remainder = False  # eval all examples w/o drop.
    eval_config.image_size = model_utils.parse_image_size(
        eval_config['image_size'])

    # setup
    setup.set_flags(FLAGS, config, training=True)

    if FLAGS.debug:
        # Debug mode: eager execution, device-placement logging, fixed seed.
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Check data path
    if FLAGS.training_file_pattern is None or FLAGS.val_file_pattern is None or FLAGS.val_json_file is None:
        raise RuntimeError(
            'You must specify --training_file_pattern, --val_file_pattern and --val_json_file  for training.'
        )

    # Ceiling division of examples per epoch by the global batch size
    # (per-process batch size times the number of processes).
    steps_per_epoch = (FLAGS.num_examples_per_epoch +
                       (FLAGS.batch_size * get_world_size()) -
                       1) // (FLAGS.batch_size * get_world_size())
    if FLAGS.benchmark == True:
        # For ci perf training runs, run for a fixed number of iterations per epoch
        steps_per_epoch = FLAGS.benchmark_steps
    # Flatten the config and add the run-specific settings used below.
    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  checkpoint_period=FLAGS.checkpoint_period,
                  batch_size=FLAGS.batch_size,
                  num_shards=get_world_size(),
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode='train')
    logging.info('Training params: {}'.format(params))

    # make output dir if it does not exist
    tf.io.gfile.makedirs(FLAGS.model_dir)

    # dllogger setup
    # Only the main process gets real backends; every other process
    # initialises DLLogger with an empty backend list.
    backends = []
    if is_main_process():
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]

    DLLogger.init(backends=backends)

    def get_dataset(is_training, params):
        """Build the training or validation input pipeline for `model.fit`.

        Raises:
            ValueError: if the selected file pattern flag is empty.
        """
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            enable_map_parallelization=FLAGS.enable_map_parallelization)(
                params)

    # Two ceiling divisions: eval samples per process, then eval batches per
    # process. After the second step `num_samples` counts batches.
    num_samples = (FLAGS.eval_samples + get_world_size() -
                   1) // get_world_size()
    num_samples = (num_samples + FLAGS.eval_batch_size -
                   1) // FLAGS.eval_batch_size
    eval_config.num_samples = num_samples

    def get_eval_dataset(eval_config):
        """Build this process's shard of the evaluation dataset.

        The dataset is sharded by world size / rank and truncated to
        `num_samples` elements taken from the enclosing scope.
        """
        dataset = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=eval_config.max_instances_per_image)(
                eval_config, batch_size=FLAGS.eval_batch_size)
        dataset = dataset.shard(get_world_size(), get_rank())
        dataset = dataset.take(num_samples)
        return dataset

    eval_dataset = get_eval_dataset(eval_config)

    # pick focal loss implementation
    focal_loss = train_lib.StableFocalLoss(
        params['alpha'],
        params['gamma'],
        label_smoothing=params['label_smoothing'],
        reduction=tf.keras.losses.Reduction.NONE)

    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.build((None, *config.image_size, 3))
    # All losses use Reduction.NONE, i.e. they are left unreduced here.
    model.compile(
        optimizer=optimizer_builder.get_optimizer(params),
        loss={
            'box_loss':
            train_lib.BoxLoss(params['delta'],
                              reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
            train_lib.BoxIouLoss(params['iou_loss_type'],
                                 params['min_level'],
                                 params['max_level'],
                                 params['num_scales'],
                                 params['aspect_ratios'],
                                 params['anchor_scale'],
                                 params['image_size'],
                                 reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
            focal_loss,
            'seg_loss':
            tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        })
    # The value returned by restore_ckpt is used as the initial epoch for
    # `model.fit` below.
    train_from_epoch = util_keras.restore_ckpt(model,
                                               params['model_dir'],
                                               config.moving_average_decay,
                                               steps_per_epoch=steps_per_epoch)

    print("training_mode: {}".format(FLAGS.training_mode))
    callbacks = callback_builder.get_callbacks(params, FLAGS.training_mode,
                                               eval_config, eval_dataset,
                                               DLLogger, FLAGS.time_history,
                                               FLAGS.log_steps, FLAGS.lr_tb,
                                               FLAGS.benchmark)

    # Keras progress output is shown on the main process only; validation
    # runs inside fit() only when --validate is set.
    history = model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        initial_epoch=train_from_epoch,
        callbacks=callbacks,
        verbose=1 if is_main_process() else 0,
        validation_data=get_dataset(False, params=params)
        if FLAGS.validate else None,
        validation_steps=(FLAGS.eval_samples //
                          FLAGS.eval_batch_size) if FLAGS.validate else None)

    if is_main_process():
        model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))

    # log final stats
    # Throughput/latency figures are read from the TimeHistory callback, if
    # one is present and has recorded epoch runtimes.
    stats = {}
    for callback in callbacks:
        if isinstance(callback, callback_builder.TimeHistory):
            if callback.epoch_runtime_log:
                stats[
                    'avg_fps_training'] = callback.average_examples_per_second
                stats[
                    'avg_fps_training_per_GPU'] = callback.average_examples_per_second / get_world_size(
                    )
                stats[
                    'avg_latency_training'] = callback.average_time_per_iteration

    if history and history.history:
        train_hist = history.history
        # Final training loss, averaged across all Horovod processes.
        stats['training_loss'] = float(
            hvd.allreduce(tf.constant(train_hist['loss'][-1],
                                      dtype=tf.float32),
                          average=True))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'ema_weights')):
        # Pick the highest epoch number parsed from the 'emackpt' file names
        # (text between the first '-' and the first '.'), zero-padded to two
        # digits.
        ckpt_epoch = "%02d" % sorted(set([
            int(f.rsplit('.')[0].rsplit('-')[1])
            for f in os.listdir(os.path.join(FLAGS.model_dir, 'ema_weights'))
            if 'emackpt' in f
        ]),
                                     reverse=True)[0]
        ckpt = os.path.join(FLAGS.model_dir, 'ema_weights',
                            'emackpt-' + str(ckpt_epoch))
        # Load those EMA weights into the model before saving/evaluating.
        util_keras.restore_ckpt(model,
                                ckpt,
                                eval_config.moving_average_decay,
                                steps_per_epoch=0,
                                skip_mismatch=False,
                                expect_partial=True)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'emackpt-final'))
    else:
        # No EMA directory: keep the weights currently in the model.
        ckpt_epoch = 'final'
        ckpt = os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch))

    # Start evaluation of final ema checkpoint
    logging.set_verbosity(logging.WARNING)

    @tf.function
    def model_fn(images, labels):
        """Run inference on one batch and feed detections to `evaluator`.

        `evaluator` is resolved from the enclosing scope at call time; it is
        only assigned inside the evaluation branch below, which is also the
        only place this function is called.
        """
        cls_outputs, box_outputs = model(images, training=False)
        detections = postprocess.generate_detections(eval_config, cls_outputs,
                                                     box_outputs,
                                                     labels['image_scales'],
                                                     labels['source_ids'])

        # update_state is a Python callable, so it is wrapped in
        # numpy_function to be invoked from inside the tf.function.
        tf.numpy_function(evaluator.update_state, [
            labels['groundtruth_data'],
            postprocess.transform_detections(detections)
        ], [])

    if FLAGS.benchmark == False and FLAGS.training_mode == 'train':

        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(eval_config.label_map)
        evaluator = coco_metric.EvaluationMetric(
            filename=eval_config.val_json_file, label_map=label_map)

        evaluator.reset_states()

        # evaluate all images.
        pbar = tf.keras.utils.Progbar(num_samples)
        for i, (images, labels) in enumerate(eval_dataset):
            model_fn(images, labels)
            if is_main_process():
                # NOTE(review): `i` is zero-based, so the bar is updated with
                # the index of the batch just processed, not the count.
                pbar.update(i)

        # gather detections from all ranks
        evaluator.gather()

        if is_main_process():
            # compute the final eval results.
            metrics = evaluator.result()
            metric_dict = {}
            for i, name in enumerate(evaluator.metric_names):
                metric_dict[name] = metrics[i]

            if label_map:
                # Per-class entries follow the named metrics in `metrics`,
                # in sorted class-id order.
                for i, cid in enumerate(sorted(label_map.keys())):
                    name = 'AP_/%s' % label_map[cid]
                    metric_dict[name] = metrics[i +
                                                len(evaluator.metric_names)]

            # csv format
            # Checkpoint epoch followed by the six metrics as percentages
            # rounded to two decimals.
            csv_metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
            csv_format = ",".join(
                [str(ckpt_epoch)] +
                [str(round(metric_dict[key] * 100, 2)) for key in csv_metrics])
            print(FLAGS.model_name, metric_dict, "csv format:", csv_format)

        # All processes wait here until the main process has finished
        # computing and printing the results.
        MPI.COMM_WORLD.Barrier()

    if is_main_process():
        stats['e2e_training_time'] = time.time() - begin
        DLLogger.log(step=(), data=stats)
コード例 #24
0
def main(_):
    """Run the `vnet_v2` estimator in train / evaluate / predict modes.

    Initialises Horovod and DLLogger (backends only on rank 0), sets the
    TensorFlow/Horovod environment variables, builds the `MSDDataset` and a
    `tf.estimator.Estimator`, then executes every mode whose name appears in
    ``FLAGS.exec_mode``:

    * ``train``    -- trains on all ranks; rank 0 adds a profiling or
      logging hook.
    * ``evaluate`` -- rank 0 only; logs the three dice scores.
    * ``predict``  -- rank 0 only; pickles the predictions to
      ``<model_dir>/predictions/predictions.pkl``, replacing any existing
      ``predictions`` directory.

    Args:
        _: Unused positional argument (presumably the argv list passed by
            the application runner -- confirm against the caller).

    Raises:
        ValueError: in ``evaluate`` mode on rank 0 when
            ``FLAGS.train_split >= 1.0``.
    """
    tf.get_logger().setLevel(logging.ERROR)

    hvd.init()

    FLAGS = PARSER.parse_args()

    backends = []

    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.DEFAULT)]

        if FLAGS.log_dir:
            backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]

    DLLogger.init(backends=backends)

    # Iterate items directly instead of re-looking each key up in vars(FLAGS).
    for key, value in vars(FLAGS).items():
        DLLogger.log(step="PARAMETER", data={str(key): value})

    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir,
                                                'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    # FLAGS is handed to the estimator as `params`, so the label set is
    # attached to it here.
    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.allow_growth = True
    # Restrict each process to the GPU matching its local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.use_amp:
        config.graph_options.rewrite_options.auto_mixed_precision = 1

    # In benchmark mode no checkpoints are saved; otherwise one is saved
    # after the total number of training steps.
    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=None
        if FLAGS.benchmark else dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    # Only rank 0 is given the model directory.
    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs

        if FLAGS.benchmark:
            # Benchmark run length is twice the warmup length.
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(), DLLogger)
                ]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(input_fn=lambda: dataset.train_fn(FLAGS.augment),
                        steps=steps,
                        hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Missing argument: --train_split < 1.0")

            result = estimator.evaluate(input_fn=dataset.eval_fn,
                                        steps=dataset.eval_steps,
                                        hooks=[])

            DLLogger.log(step=tuple(),
                         data={
                             'background_dice': str(result['background dice']),
                             'anterior_dice': str(result['Anterior dice']),
                             'posterior_dice': str(result['Posterior dice'])
                         })

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []
        if hvd.rank() == 0:
            if FLAGS.benchmark:
                # Repeat the test set enough times to cover 2 * warmup steps.
                count = math.ceil(
                    (FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(),
                                  DLLogger,
                                  training=False)
                ]

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count), hooks=hooks)

            pred = [p['prediction'] for p in predictions]

            # Start from an empty predictions directory on every run.
            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)

            os.makedirs(predict_path)

            # Context manager guarantees the file is flushed and closed; the
            # original passed an anonymous open() handle that was never closed.
            with open(os.path.join(predict_path, 'predictions.pkl'),
                      'wb') as pred_file:
                pickle.dump(pred, pred_file)