コード例 #1
0
def get_logger(params):
    """Initialise ``logger`` with rank-dependent backends and return it.

    Only the process for which ``hvd.rank()`` equals 0 receives backends: a
    verbose stdout backend, plus a verbose JSON stream backend when
    ``params.log_dir`` is truthy. All other ranks initialise the logger with
    an empty backend list.

    :param params: object exposing a ``log_dir`` attribute; its value is
        passed unchanged as the second argument of ``JSONStreamBackend``
    :return: the ``logger`` object after ``init`` has been called on it
    """
    is_root = hvd.rank() == 0
    sinks = [StdOutBackend(Verbosity.VERBOSE)] if is_root else []
    # ``params.log_dir`` is only consulted on the root rank.
    if is_root and params.log_dir:
        sinks.append(JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))
    logger.init(backends=sinks)
    return logger
コード例 #2
0
 def __init__(self, log_dir, global_batch_size, mode, warmup, dim, profile):
     """Initialise the logger and remember the run configuration.

     The logger is initialised with a verbose JSON stream backend (given
     ``log_dir`` as its second argument) followed by a verbose stdout
     backend. The remaining arguments are stored on the instance, the step
     counter starts at 0 and the timestamp list starts empty.

     :param log_dir: value handed to ``JSONStreamBackend``
     :param global_batch_size: stored as ``self.global_batch_size``
     :param mode: stored as ``self.mode``
     :param warmup: stored as ``self.warmup_steps``
     :param dim: stored as ``self.dim``
     :param profile: stored as ``self.profile``
     """
     json_sink = JSONStreamBackend(Verbosity.VERBOSE, log_dir)
     stdout_sink = StdOutBackend(Verbosity.VERBOSE)
     logger.init(backends=[json_sink, stdout_sink])

     # Configuration and counters, in the same order as before.
     self.warmup_steps, self.global_batch_size = warmup, global_batch_size
     self.step = 0
     self.dim, self.mode, self.profile = dim, mode, profile
     self.timestamps = []
コード例 #3
0
def log(logname, dice, results="/results"):
    """Log dice scores once to a JSON file and to stdout.

    A fresh ``Logger`` is created with a verbose JSON stream backend writing
    to ``os.path.join(results, logname)`` and a verbose stdout backend whose
    step prefix is empty. The logged record contains ``"Mean dice"`` and one
    ``"L<k>"`` entry (k starting at 1) per element of ``dice``, each rounded
    to two decimals.

    :param logname: file name of the JSON log inside ``results``
    :param dice: object supporting ``.mean().item()`` and iteration over
        elements that support ``.item()``
    :param results: directory the JSON log is written to
    """
    json_path = os.path.join(results, logname)
    sinks = [
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ]
    dllogger = Logger(backends=sinks)

    metrics = {"Mean dice": round(dice.mean().item(), 2)}
    for position, score in enumerate(dice, start=1):
        metrics[f"L{position}"] = round(score.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
コード例 #4
0
def get_logger(params):
    """Initialise ``logger`` for this worker and return it.

    Backends are attached when ``params.worker_id`` is 0 or
    ``params.log_all_workers`` is truthy: always a verbose stdout backend,
    and, when ``params.log_dir`` is truthy, a verbose JSON stream backend
    writing to ``<log_dir>/log.json`` (the directory is created first).
    Otherwise the logger is initialised without backends.

    :param params: object exposing ``worker_id``, ``log_all_workers`` and
        ``log_dir``
    :return: the ``logger`` object after ``init`` has been called on it
    """
    if not (params.worker_id == 0 or params.log_all_workers):
        # Silent worker: no output sinks at all.
        logger.init(backends=[])
        return logger

    sinks = [StdOutBackend(Verbosity.VERBOSE)]
    if params.log_dir:
        os.makedirs(params.log_dir, exist_ok=True)
        sinks.append(
            JSONStreamBackend(Verbosity.VERBOSE, f"{params.log_dir}/log.json"))
    logger.init(backends=sinks)
    return logger
コード例 #5
0
ファイル: setup.py プロジェクト: HabanaAI/Model-References
def get_logger(params):
    """Initialise ``logger`` on worker 0 only and return it.

    The worker id is ``hvd_rank()`` when ``horovod_enabled()`` is truthy and
    0 otherwise. Worker 0 gets a verbose stdout backend and, when
    ``params.log_dir`` is truthy, a verbose JSON stream backend writing to
    ``<log_dir>/log.json`` (the directory is created first). Every other
    worker initialises the logger with no backends.

    :param params: object exposing a ``log_dir`` attribute
    :return: the ``logger`` object after ``init`` has been called on it
    """
    if horovod_enabled():
        rank = hvd_rank()
    else:
        rank = 0

    sinks = []
    if rank == 0:
        sinks.append(StdOutBackend(Verbosity.VERBOSE))
        if params.log_dir:
            os.makedirs(params.log_dir, exist_ok=True)
            json_path = f"{params.log_dir}/log.json"
            sinks.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))

    logger.init(backends=sinks)
    return logger
コード例 #6
0
def setup_logger(args):
    """Initialise ``dllogger`` for this run and register metric metadata.

    Creates ``args.results`` if needed and chooses a log path inside it that
    does not exist yet, so the log of an earlier run is not reused. Backends
    (JSON stream, TensorBoard, stdout) are attached when the process group is
    not initialised, when ``args.distributed_world_size`` is not greater
    than 1, or when ``args.distributed_rank`` is 0; otherwise the logger is
    initialised with no backends. The parsed arguments and the
    framework/system information are then logged, followed by metadata for
    every tracked metric.

    :param args: namespace providing ``results``, ``log_file``,
        ``distributed_world_size`` and ``distributed_rank``
    """
    os.makedirs(args.results, exist_ok=True)
    log_path = os.path.join(args.results, args.log_file)

    if os.path.exists(log_path):
        # Probe "<stem>_<i>.<ext>" (or "<name>.<i>" when the name has no
        # extension) until an unused file name is found.
        name_parts = args.log_file.split('.')
        for i in itertools.count():
            if len(name_parts) > 1:
                fname = '.'.join(name_parts[:-1]) + f'_{i}.' + name_parts[-1]
            else:
                # Derive the fallback from the log file name itself; this
                # branch previously used the unrelated ``args.stat_file``.
                fname = args.log_file + f'.{i}'
            log_path = os.path.join(args.results, fname)
            if not os.path.exists(log_path):
                break

    def metric_format(metric, metadata, value):
        # Defined for the stdout backend but currently not passed to it
        # (the corresponding argument was commented out in the call below).
        return "{}: {}".format(metric, f'{value:.5f}' if isinstance(value, float) else value)

    def step_format(step):
        if step == ():
            return "Finished |"
        elif isinstance(step, int):
            return "Step {0: <5} |".format(step)
        return "Step {} |".format(step)

    if not dist.is_initialized() or not args.distributed_world_size > 1 or args.distributed_rank == 0:
        dllogger.init(backends=[JSONStreamBackend(verbosity=1, filename=log_path),
                                TensorBoardBackend(verbosity=1, log_dir=args.results),
                                StdOutBackend(verbosity=2,
                                              step_format=step_format,
                                              prefix_format=lambda x: "")
                                ])
    else:
        dllogger.init(backends=[])
    dllogger.log(step='PARAMETER', data=vars(args), verbosity=0)

    container_setup_info = {**get_framework_env_vars(), **get_system_info()}
    dllogger.log(step='ENVIRONMENT', data=container_setup_info, verbosity=0)

    # (metric name, goal, stage, format) in registration order.
    metric_table = (
        ('loss', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P10', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P50', 'MINIMIZE', 'TRAIN', ':5f'),
        ('P90', 'MINIMIZE', 'TRAIN', ':5f'),
        ('items/s', 'MAXIMIZE', 'TRAIN', ':1f'),
        ('val_loss', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P10', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P50', 'MINIMIZE', 'VAL', ':5f'),
        ('val_P90', 'MINIMIZE', 'VAL', ':5f'),
        ('val_items/s', 'MAXIMIZE', 'VAL', ':1f'),
        ('test_P10', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P50', 'MINIMIZE', 'TEST', ':5f'),
        ('test_P90', 'MINIMIZE', 'TEST', ':5f'),
        ('throughput', 'MAXIMIZE', 'TEST', ':1f'),
        # The latency goals were previously misspelled 'MIMIMIZE'.
        ('latency_p90', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p95', 'MINIMIZE', 'TEST', ':5f'),
        ('latency_p99', 'MINIMIZE', 'TEST', ':5f'),
    )
    for metric, goal, stage, fmt in metric_table:
        dllogger.metadata(metric, {'GOAL': goal, 'STAGE': stage, 'format': fmt})
コード例 #7
0
def get_dllogger(params):
    """Initialise ``logger`` on the main process and return it.

    When ``is_main_process()`` is truthy the logger gets a verbose stdout
    backend and, if ``params.log_dir`` is truthy, a verbose JSON stream
    backend writing to ``log.json`` inside that directory. Other processes
    initialise the logger with no backends.

    :param params: object exposing a ``log_dir`` attribute
    :return: the ``logger`` object after ``init`` has been called on it
    """
    if not is_main_process():
        logger.init(backends=[])
        return logger

    sinks = [StdOutBackend(Verbosity.VERBOSE)]
    if params.log_dir:
        json_path = os.path.join(params.log_dir, "log.json")
        sinks.append(JSONStreamBackend(Verbosity.VERBOSE, json_path))
    logger.init(backends=sinks)
    return logger
コード例 #8
0
def init_log(args):
    """Initialise ``dllogger``, metric metadata and the TensorBoard loggers.

    Logging is enabled when the process group is not initialised or when
    ``dist.get_rank()`` is 0. An enabled process writes a JSON stream (to
    ``args.log_file``, or ``nvlog.json`` inside ``args.output_dir`` when that
    is falsy, passed through ``unique_log_fpath``) and to stdout; a disabled
    one gets no backends. Metadata is registered for the learning rate and
    for the per-subset metrics, the module-level ``tb_loggers`` dict is
    rebuilt for the ``train`` and ``dev_ema`` subsets, and finally the
    arguments are logged through ``log_parameters``.

    :param args: namespace providing ``log_file`` and ``output_dir``
    """
    global tb_loggers

    enabled = (dist.get_rank() == 0) if dist.is_initialized() else True

    backends = []
    if enabled:
        fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
        backends.append(
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(fpath)))
        backends.append(
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format))

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    # (key suffix, display label, format) for metrics named after the subset.
    prefixed_metrics = (
        ('loss', 'loss', ":>7.2f"),
        ('wer', 'wer', ":>6.2f"),
        ('pplx', 'pplx', ":>6.2f"),
        ('throughput', 'utts/s', ":>5.0f"),
    )
    subsets = (('train', ''), ('train_avg', 'avg train '),
               ('dev_ema', '  dev ema '))
    for id_, pref in subsets:
        for suffix, label, fmt in prefixed_metrics:
            dllogger.metadata(f"{id_}_{suffix}",
                              {"name": f"{pref}{label}", "format": fmt})
        # The elapsed-time entry carries a unit and no subset prefix.
        dllogger.metadata(f"{id_}_took",
                          {"name": "took", "unit": "s", "format": ":>5.2f"})

    # Build the mapping completely before publishing it as the global.
    new_tb_loggers = {}
    for subset in ['train', 'dev_ema']:
        new_tb_loggers[subset] = TBLogger(enabled, args.output_dir,
                                          name=subset)
    tb_loggers = new_tb_loggers

    log_parameters(vars(args), tb_subset='train')
コード例 #9
0
def setup_dllogger(rank, enabled=True, filename='log.json'):
    """Initialise ``DLLogger`` with backends on rank 0 only.

    When ``enabled`` is truthy and ``rank`` equals 0, the logger gets a
    default-verbosity stdout backend and a verbose JSON stream backend
    writing to ``filename``. In every other case it is initialised with an
    empty backend list.

    :param rank: rank of the calling process
    :param enabled: set to a falsy value to initialise without backends
    :param filename: target passed to ``JSONStreamBackend``
    """
    if not (enabled and rank == 0):
        DLLogger.init([])
        return

    stdout_sink = StdOutBackend(Verbosity.DEFAULT)
    json_sink = JSONStreamBackend(Verbosity.VERBOSE, filename)
    DLLogger.init([stdout_sink, json_sink])
コード例 #10
0
def init(log_fpath, log_dir, enabled=True, tb_subsets=[], **tb_kw):
    """Initialise ``dllogger``, metric metadata and the TensorBoard loggers.

    When ``enabled`` is truthy the logger writes a JSON stream to
    ``unique_log_fpath(log_fpath)`` and to stdout; otherwise it gets no
    backends. Metadata is registered for the learning rate and for the
    per-subset metrics, and the module-level ``tb_loggers`` dict is rebuilt
    with one ``TBLogger`` per entry of ``tb_subsets``.

    :param log_fpath: requested path of the JSON log
    :param log_dir: directory handed to every ``TBLogger``
    :param enabled: forwarded to ``TBLogger`` and used to select backends
    :param tb_subsets: names of the TensorBoard subsets; only iterated here
    :param tb_kw: extra keyword arguments forwarded to ``TBLogger``
    """
    global tb_loggers

    backends = []
    if enabled:
        backends.append(
            JSONStreamBackend(Verbosity.DEFAULT, unique_log_fpath(log_fpath)))
        backends.append(
            StdOutBackend(Verbosity.VERBOSE,
                          step_format=stdout_step_format,
                          metric_format=stdout_metric_format))

    dllogger.init(backends=backends)
    dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})

    # (key suffix, display label, format) for metrics named after the subset.
    prefixed_metrics = (
        ('loss', 'loss', ":>5.2f"),
        ('mel_loss', 'mel loss', ":>5.2f"),
        ('kl_loss', 'kl loss', ":>5.5f"),
        ('kl_weight', 'kl weight', ":>5.5f"),
    )
    subsets = (('train', ''), ('train_avg', 'avg train '),
               ('val', '  avg val '), ('val_ema', '  EMA val '))
    for id_, pref in subsets:
        for suffix, label, fmt in prefixed_metrics:
            dllogger.metadata(f"{id_}_{suffix}",
                              {"name": f"{pref}{label}", "format": fmt})
        # Throughput and elapsed time carry a unit instead of a prefix.
        dllogger.metadata(
            f"{id_}_frames/s",
            {"name": None, "unit": "frames/s", "format": ":>10.2f"})
        dllogger.metadata(
            f"{id_}_took",
            {"name": "took", "unit": "s", "format": ":>3.2f"})

    # Build the mapping completely before publishing it as the global.
    new_tb_loggers = {}
    for subset in tb_subsets:
        new_tb_loggers[subset] = TBLogger(enabled, log_dir, name=subset,
                                          **tb_kw)
    tb_loggers = new_tb_loggers
コード例 #11
0
def get_logger(params):
    """Set up ``logger`` for the current Horovod rank and return it.

    Rank 0 logs verbosely to stdout and, when ``params.log_dir`` is truthy,
    to a JSON stream backend that receives ``params.log_dir`` as its second
    argument. All other ranks initialise the logger without backends.

    :param params: object exposing a ``log_dir`` attribute
    :return: the ``logger`` object after ``init`` has been called on it
    """
    if hvd.rank() != 0:
        # Non-root ranks stay silent.
        logger.init(backends=[])
        return logger

    active = [StdOutBackend(Verbosity.VERBOSE)]
    if params.log_dir:
        active.append(JSONStreamBackend(Verbosity.VERBOSE, params.log_dir))
    logger.init(backends=active)
    return logger
コード例 #12
0
ファイル: main.py プロジェクト: zachwe/DeepLearningExamples
def log(logname, dice, epoch=None, dice_tta=None):
    """Log dice scores (and optional TTA dice scores) once.

    A fresh ``Logger`` writes to ``logname`` inside ``args.results`` (``args``
    comes from the enclosing scope) and to stdout with an empty step prefix.
    The record holds, in this order: ``"Epoch"`` when ``epoch`` is given,
    ``"Mean dice"``, ``"Mean TTA dice"`` when ``dice_tta`` is given, one
    ``"L<k>"`` entry per element of ``dice`` and one ``"TTA_L<k>"`` entry per
    element of ``dice_tta``. All scores are rounded to two decimals.

    :param logname: file name of the JSON log
    :param dice: object supporting ``.mean().item()`` and iteration over
        elements that support ``.item()``
    :param epoch: optional epoch number to include in the record
    :param dice_tta: optional second score container, same interface as
        ``dice``
    """
    json_path = os.path.join(args.results, logname)
    dllogger = Logger(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, json_path),
        StdOutBackend(Verbosity.VERBOSE, step_format=lambda step: ""),
    ])

    with_tta = dice_tta is not None
    metrics = {}
    if epoch is not None:
        metrics["Epoch"] = epoch
    metrics["Mean dice"] = round(dice.mean().item(), 2)
    if with_tta:
        metrics["Mean TTA dice"] = round(dice_tta.mean().item(), 2)
    for position, score in enumerate(dice, start=1):
        metrics[f"L{position}"] = round(score.item(), 2)
    if with_tta:
        for position, score in enumerate(dice_tta, start=1):
            metrics[f"TTA_L{position}"] = round(score.item(), 2)

    dllogger.log(step=(), data=metrics)
    dllogger.flush()
コード例 #13
0
def main():
    """Exercise ``LOGGER`` end to end around a call to ``train()``.

    Configures the model name and three backends, parses a single
    ``--dummy`` command-line option, logs hardware and arguments, registers
    three metrics, then runs the setup, preprocessing and run timed blocks
    (``train()`` is called inside the run block) before finishing the logger.
    """
    LOGGER.set_model_name('ResNet')

    stdout_backend = StdOutBackend(log_file='std.out',
                                   logging_scope=Scope.TRAIN_ITER)
    compact_backend = CompactBackend(log_file=None,
                                     logging_scope=Scope.TRAIN_ITER,
                                     iteration_interval=5)
    json_backend = JsonBackend(log_file='dummy.json',
                               logging_scope=Scope.TRAIN_ITER,
                               iteration_interval=4)
    LOGGER.set_backends([stdout_backend, compact_backend, json_backend])

    cli = ArgumentParser()
    cli.add_argument('--dummy', type=str, default='default_dummy_value')
    args = cli.parse_args()

    LOGGER.log_hardware()
    LOGGER.log_args(args)

    LOGGER.log(tags.RUN_INIT)
    LOGGER.register_metric('loss',
                           meter=AverageMeter(),
                           metric_scope=Scope.TRAIN_ITER)
    LOGGER.register_metric('epoch_nr', metric_scope=Scope.EPOCH)
    LOGGER.register_metric('epochs2')

    # The two blocks that only print a message share one loop.
    for block_tag, message in ((tags.SETUP_BLOCK, "This is setup."),
                               (tags.PREPROC_BLOCK, "This is preprocessing.")):
        with LOGGER.timed_block(block_tag):
            print(message)

    with LOGGER.timed_block(tags.RUN_BLOCK):
        print("This is run.")
        train()
        print("This is the end.")

    LOGGER.log(tags.RUN_FINAL)

    LOGGER.finish()
コード例 #14
0
 def __init__(self, args):
     """Build the model, loss, metric and logger from ``args``.

     ``args.type == "pre"`` selects ``UNetLoc`` with 2 classes; any other
     value selects ``get_dmg_unet`` with 5 classes. The logger writes a
     verbose JSON stream to ``<args.logname>.json`` inside ``args.results``
     and prints to stdout with an ``Epoch: <step> `` prefix.

     :param args: namespace providing ``type``, ``lr``, ``results`` and
         ``logname``; it is also passed to ``F1``, ``Loss`` and the model
         constructor
     """
     super(Model, self).__init__()
     self.save_hyperparameters()
     self.args = args
     self.f1_score = F1(args)
     if args.type == "pre":
         self.model = UNetLoc(args)
     else:
         self.model = get_dmg_unet(args)
     self.loss = Loss(args)
     self.best_f1 = torch.tensor(0)
     self.best_epoch = 0
     self.tta_flips = [[2], [3], [2, 3]]
     self.lr = args.lr
     if self.args.type == "pre":
         self.n_class = 2
     else:
         self.n_class = 5
     self.softmax = nn.Softmax(dim=1)
     self.test_idx = 0

     json_path = os.path.join(args.results, f"{args.logname}.json")
     json_sink = JSONStreamBackend(Verbosity.VERBOSE, json_path)
     stdout_sink = StdOutBackend(Verbosity.VERBOSE,
                                 step_format=lambda step: f"Epoch: {step} ")
     self.dllogger = Logger(backends=[json_sink, stdout_sink])
コード例 #15
0
def setup_logger(config):
    """Create a ``Logger`` for this process and register loss metadata.

    The log directory is ``config["log_path"]`` when present and the current
    working directory otherwise. On the main process the logger gets
    TensorBoard, JSON stream (``log.json`` in the log directory), aggregator
    and stdout backends; elsewhere it gets none. The framework environment
    variables are logged under the ``PARAMETER`` step, and metadata is
    registered for ``loss`` and ``val_loss``.

    :param config: mapping supporting ``.get``; only ``"log_path"`` is read
    :return: the configured ``Logger`` instance
    """
    log_path = config.get("log_path", os.getcwd())

    backends = []
    if is_main_process():
        verbose = dllogger.Verbosity.VERBOSE
        backends = [
            TensorBoardBackend(verbosity=verbose, log_dir=log_path),
            JSONStreamBackend(verbosity=verbose,
                              filename=os.path.join(log_path, "log.json")),
            AggregatorBackend(verbosity=verbose,
                              agg_dict={"loss": AverageMeter}),
            StdOutBackend(
                verbosity=dllogger.Verbosity.DEFAULT,
                step_format=empty_step_format,
                metric_format=no_string_metric_format,
                prefix_format=empty_prefix_format,
            ),
        ]
    logger = Logger(backends=backends)

    logger.log(step="PARAMETER",
               data=get_framework_env_vars(),
               verbosity=dllogger.Verbosity.DEFAULT)

    # Both losses share unit and goal and differ only in the stage.
    for metric, stage in (("loss", "TRAIN"), ("val_loss", "VAL")):
        logger.metadata(metric, {
            "unit": "nat",
            "GOAL": "MINIMIZE",
            "STAGE": stage,
        })
    return logger
コード例 #16
0
    def __init__(self, args):
        """Build the network, loss, dice metric and bookkeeping state.

        ``self.n_class`` is read right after ``self.build_nnunet()`` returns,
        so it is presumably set there (not visible in this block). A logger
        writing ``logs.json`` into ``args.results`` and printing to stdout is
        created only for the ``train`` and ``evaluate`` execution modes. The
        flip list used for test-time augmentation depends on ``args.dim``.

        :param args: namespace providing ``learning_rate``, ``exec_mode``,
            ``results`` and ``dim``
        """
        super(NNUnet, self).__init__()
        self.args = args
        self.save_hyperparameters()
        self.build_nnunet()
        self.loss = Loss()
        self.dice = Dice(self.n_class)
        self.best_sum = 0
        self.eval_dice = 0
        self.best_sum_epoch = 0
        # Three independent per-class lists, all starting at zero.
        self.best_dice = [0] * self.n_class
        self.best_epoch = [0] * self.n_class
        self.best_sum_dice = [0] * self.n_class
        self.learning_rate = args.learning_rate
        if self.args.exec_mode in ["train", "evaluate"]:
            json_sink = JSONStreamBackend(
                Verbosity.VERBOSE, os.path.join(args.results, "logs.json"))
            stdout_sink = StdOutBackend(
                Verbosity.VERBOSE, step_format=lambda step: f"Epoch: {step} ")
            self.dllogger = Logger(backends=[json_sink, stdout_sink])

        if self.args.dim == 2:
            self.tta_flips = [[2], [3], [2, 3]]
        else:
            self.tta_flips = [[2], [3], [4], [2, 3], [2, 4], [3, 4],
                              [2, 3, 4]]
コード例 #17
0
def main():
    """Run a Tacotron2 model over a file list and save extracted features.

    Parses the command line (unknown options raise ``ValueError``), sets up
    ``DLLogger``, loads the Tacotron2 checkpoint and iterates over the data
    set in batches. For every batch it saves the ground-truth mels and,
    depending on the ``extract_*`` flags, teacher mels, attention matrices,
    durations and pitch vectors below ``args.dataset_path``. Pitch vectors
    are normalised after the last batch and saved per file name.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch TTS Data Pre-processing')
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    if args.extract_pitch_char:
        assert args.extract_durations, "Durations required for pitch extraction"

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})

    model = load_and_setup_model(
        'Tacotron2',
        parser,
        args.tacotron2_checkpoint,
        amp=False,
        device=torch.device('cuda' if args.cuda else 'cpu'),
        forward_is_infer=False,
        ema=False)

    if args.train_mode:
        model.train()

    # n_mel_channels arg has been consumed by model's arg parser
    args.n_mel_channels = model.n_mel_channels

    for datum in ('mels', 'mels_teacher', 'attentions', 'durations',
                  'pitch_mel', 'pitch_char', 'pitch_trichar'):
        if getattr(args, f'extract_{datum}'):
            Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True)

    # Read the file list inside a context manager so the handle is closed
    # as soon as the names have been collected.
    with open(args.wav_text_filelist, 'r') as filelist:
        filenames = [Path(line.split('|')[0]).stem for line in filelist]
    # Compatibility with Tacotron2 Data loader
    args.n_speakers = 1
    dataset = FilenamedLoader(filenames,
                              args.dataset_path,
                              args.wav_text_filelist,
                              args,
                              load_mel_from_disk=False)
    # TextMelCollate supports only n_frames_per_step=1
    data_loader = DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             sampler=None,
                             num_workers=0,
                             collate_fn=TextMelCollate(1),
                             pin_memory=False,
                             drop_last=False)
    pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}}
    for i, batch in enumerate(data_loader):
        tik = time.time()
        fnames = batch[-1]
        x, _, _ = batch_to_gpu(batch[:-1])
        _, text_lens, mels_padded, _, mel_lens = x

        # NOTE(review): mels are saved regardless of args.extract_mels, while
        # the 'mels' directory is only created when that flag is set -
        # confirm this is intended.
        for j, mel in enumerate(mels_padded):
            fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt')
            torch.save(mel[:, :mel_lens[j]].cpu(), fpath)

        with torch.no_grad():
            out_mels, out_mels_postnet, _, alignments = model.forward(x)

        if args.extract_mels_teacher:
            for j, mel in enumerate(out_mels_postnet):
                fpath = Path(args.dataset_path, 'mels_teacher',
                             fnames[j] + '.pt')
                torch.save(mel[:, :mel_lens[j]].cpu(), fpath)
        if args.extract_attentions:
            for j, ali in enumerate(alignments):
                ali = ali[:mel_lens[j], :text_lens[j]]
                fpath = Path(args.dataset_path, 'attentions',
                             fnames[j] + '.pt')
                torch.save(ali.cpu(), fpath)
        durations = []
        if args.extract_durations:
            for j, ali in enumerate(alignments):
                text_len = text_lens[j]
                ali = ali[:mel_lens[j], :text_len]
                # Count how many mel frames attend most strongly to each
                # input position.
                dur = torch.histc(torch.argmax(ali, dim=1),
                                  min=0,
                                  max=text_len - 1,
                                  bins=text_len)
                durations.append(dur)
                fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt')
                torch.save(dur.cpu().int(), fpath)
        if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar:
            # Pitch needs the durations computed above; the list is empty
            # when args.extract_durations is off, so nothing is collected.
            for j, dur in enumerate(durations):
                wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav')
                p_mel, p_char, p_trichar = calculate_pitch(
                    str(wav),
                    dur.cpu().numpy())
                pitch_vecs['mel'][fnames[j]] = p_mel
                pitch_vecs['char'][fnames[j]] = p_char
                pitch_vecs['trichar'][fnames[j]] = p_trichar

        nseconds = time.time() - tik
        DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)',
                     data={})

    if args.extract_pitch_mel:
        normalize_pitch_vectors(pitch_vecs['mel'])
        for fname, pitch in pitch_vecs['mel'].items():
            fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    if args.extract_pitch_char:
        mean, std = normalize_pitch_vectors(pitch_vecs['char'])
        for fname, pitch in pitch_vecs['char'].items():
            fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)
        save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char',
                   mean, std)

    if args.extract_pitch_trichar:
        normalize_pitch_vectors(pitch_vecs['trichar'])
        for fname, pitch in pitch_vecs['trichar'].items():
            fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt')
            torch.save(torch.from_numpy(pitch), fpath)

    DLLogger.flush()
コード例 #18
0
def main():
    """Synthesise audio from a text file with TensorRT engines.

    Loads the encoder, decoder, postnet and WaveGlow engines, optionally
    builds a denoiser from a WaveGlow PyTorch checkpoint, runs Tacotron 2 and
    WaveGlow inference on the lines of ``args.input``, writes one wav file
    per utterance and logs the measured latencies. A summary line is also
    appended to ``log_bs1_<precision>.log``. Exits with status 1 when the
    input file cannot be read.
    """

    parser = argparse.ArgumentParser(
        description='TensorRT Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # initialize CUDA state
    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
                                             True, forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
                                              args.output+'/'+args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])

    # Read the input inside a context manager so the handle is closed, and
    # catch only read/decoding failures so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, ValueError):
        print("Could not read file")
        sys.exit(1)

    measurements = {}

    sequences, sequence_lengths = prepare_input_sequence(texts)
    sequences = sequences.to(torch.int32)
    sequence_lengths = sequence_lengths.to(torch.int32)
    with MeasureTime(measurements, "latency"):
        mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                               encoder_context, decoder_context, postnet_context,
                                               sequences, sequence_lengths, measurements, args.fp16)
        audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16)

    # Entering and leaving the contexts runs their exit handlers.
    with encoder_context, decoder_context,  postnet_context, waveglow_context:
        pass

    audios = audios.float()
    if args.waveglow_ckpt != "":
        with MeasureTime(measurements, "denoiser"):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i]*args.stft_hop_length]
        # Peak-normalise each utterance before writing it.
        audio = audio/torch.max(torch.abs(audio))
        # NOTE(review): unlike the log path above, no separator is inserted
        # here - presumably args.output is expected to end with '/'; confirm.
        audio_path = args.output + "audio_"+str(i)+"_trt.wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())


    DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']})
    DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']})
    DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"latency": measurements['latency']})

    if args.waveglow_ckpt != "":
        DLLogger.log(step=0, data={"denoiser": measurements['denoiser']})
    DLLogger.flush()

    prec = "fp16" if args.fp16 else "fp32"
    latency = measurements['latency']
    throughput = audios.size(1)/latency
    # Fields: batch size, first input length, precision, latency,
    # throughput, first mel length.
    log_data = "1,"+str(sequence_lengths[0].item())+","+prec+","+str(latency)+","+str(throughput)+","+str(mel_lengths[0].item())+"\n"
    with open("log_bs1_"+prec+".log", 'a') as f:
        f.write(log_data)
コード例 #19
0
def main():
    """
    Benchmark text-to-speech inference (Tacotron 2 followed by WaveGlow).

    The same fixed text, cut to ``args.input_length`` characters and repeated
    ``args.batch_size`` times, is synthesised ``args.num_iters`` times on
    GPU or CPU (``args.cpu_run``). The first three iterations are warm-up;
    the timings of the remaining ones are logged and summarised with
    ``print_stats``.
    """
    parser = parse_args(argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference'))
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for name, value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={name: value})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    # One result list per measurement key collected below.
    measurements_all = {
        key: []
        for key in ("pre_processing",
                    "tacotron2_latency",
                    "waveglow_latency",
                    "latency",
                    "type_conversion",
                    "data_transfer",
                    "storage",
                    "tacotron2_items_per_sec",
                    "waveglow_items_per_sec",
                    "num_mels_per_audio",
                    "throughput")
    }

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, args.cpu_run, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, args.cpu_run)

    denoiser = Denoiser(waveglow, args.cpu_run)
    if not args.cpu_run:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    prompt = "The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    texts = [prompt[:args.input_length]] * args.batch_size

    warmup_iters = 3

    for step_idx in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu_run):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run)

        # "latency" spans both models; the inner blocks time each one.
        with torch.no_grad(), MeasureTime(measurements, "latency", args.cpu_run):
            with MeasureTime(measurements, "tacotron2_latency", args.cpu_run):
                mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths)

            with MeasureTime(measurements, "waveglow_latency", args.cpu_run):
                audios = waveglow.infer(mel, sigma=args.sigma_infer)
                audios = audios.float()
                audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        mel_count = mel.size(0)*mel.size(2)
        sample_count = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu_run):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer", args.cpu_run):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu_run):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                write("audio_"+str(i)+".wav", args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = mel_count/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = sample_count/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = sample_count/measurements['latency']

        # Warm-up iterations are measured but not recorded.
        if step_idx < warmup_iters:
            continue
        for key, value in measurements.items():
            measurements_all[key].append(value)
            DLLogger.log(step=(step_idx-warmup_iters), data={key: value})

    DLLogger.flush()

    print_stats(measurements_all)
コード例 #20
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Reads the lines of ``args.input``, optionally runs three warm-up passes
    on a random sequence, synthesises mel spectrograms with a scripted
    Tacotron 2 and audio with WaveGlow plus a denoiser, logs throughput and
    latency, and writes one wav file per utterance. Exits with status 1 when
    the input file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    DLLogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' +
                          args.log_file),
        StdOutBackend(Verbosity.VERBOSE)
    ])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    tacotron2 = load_and_setup_model('Tacotron2',
                                     parser,
                                     args.tacotron2,
                                     args.amp_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow',
                                    parser,
                                    args.waveglow,
                                    args.amp_run,
                                    forward_is_infer=True)
    denoiser = Denoiser(waveglow).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # Read the input inside a context manager so the handle is closed, and
    # catch only read/decoding failures so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, ValueError):
        print("Could not read file")
        sys.exit(1)

    if args.include_warmup:
        sequence = torch.randint(low=0,
                                 high=148,
                                 size=(1, 50),
                                 dtype=torch.long).cuda()
        input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths = jitted_tacotron2(sequence, input_lengths)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time"):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = mel.size(0) * mel.size(
        2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(
        1) / measurements['waveglow_time']

    DLLogger.log(step=0,
                 data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0,
                 data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0,
                 data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0,
                 data={
                     "latency": (measurements['tacotron2_time'] +
                                 measurements['waveglow_time'])
                 })

    for i, audio in enumerate(audios):
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        # Peak-normalise each utterance before writing it.
        audio = audio / torch.max(torch.abs(audio))
        # NOTE(review): unlike the log path above, no separator is inserted
        # here - presumably args.output is expected to end with '/'; confirm.
        audio_path = args.output + "audio_" + str(i) + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
コード例 #21
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.

    Steps, in order:
      1. Parse CLI arguments and initialise DLLogger (JSON file + stdout).
      2. Load the FastPitch generator and/or the WaveGlow vocoder; either one
         is skipped when its checkpoint argument equals the string 'SKIP'.
      3. Run ``args.warmup_steps`` untimed warmup passes on the first batch.
      4. Run ``args.repeats`` timed passes over all batches, optionally saving
         mel-spectrograms and audio files under ``args.output``.
      5. Log aggregate throughput / latency statistics and flush the logger.

    Raises:
        ValueError: if unrecognised command-line options remain after the
            model-specific parsers have consumed theirs.
    """
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, unk_args = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.output is not None:
        Path(args.output).mkdir(parents=False, exist_ok=True)

    log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
                            StdOutBackend(Verbosity.VERBOSE,
                                          metric_format=stdout_metric_format)])
    init_inference_metadata()
    # Record every parsed argument so a run can be reproduced from its log.
    for k, v in vars(args).items():
        DLLogger.log("PARAMETER", {k: v})

    device = torch.device('cuda' if args.cuda else 'cpu')

    if args.fastpitch != 'SKIP':
        generator = load_and_setup_model(
            'FastPitch', parser, args.fastpitch, args.amp, device,
            unk_args=unk_args, forward_is_infer=True, ema=args.ema,
            jitable=args.torchscript)

        if args.torchscript:
            generator = torch.jit.script(generator)
    else:
        generator = None

    if args.waveglow != 'SKIP':
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            waveglow = load_and_setup_model(
                'WaveGlow', parser, args.waveglow, args.amp, device,
                unk_args=unk_args, forward_is_infer=True, ema=args.ema)
        denoiser = Denoiser(waveglow).to(device)
        # Prefer the model's `infer` attribute when it has one.
        waveglow = getattr(waveglow, 'infer', waveglow)
    else:
        waveglow = None

    # The loaders above receive `unk_args`; anything still left in it after
    # both have run is treated as an invalid option.
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    fields = load_fields(args.input)
    batches = prepare_input_sequence(
        fields, device, args.symbol_set, args.text_cleaners, args.batch_size,
        args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet)

    # Use real data rather than synthetic - FastPitch predicts len
    for _ in tqdm(range(args.warmup_steps), 'Warmup'):
        with torch.no_grad():
            if generator is not None:
                b = batches[0]
                mel, *_ = generator(b['text'])
            elif waveglow is not None:
                # Without a generator the batches carry ground-truth mels
                # (load_mels=True above); previously `mel` was undefined here
                # and the vocoder warmup raised NameError.
                mel = batches[0]['mel']
            if waveglow is not None:
                audios = waveglow(mel, sigma=args.sigma_infer).float()
                _ = denoiser(audios, strength=args.denoising_strength)

    gen_measures = MeasureTime(cuda=args.cuda)
    waveglow_measures = MeasureTime(cuda=args.cuda)

    gen_kw = {'pace': args.pace,
              'speaker': args.speaker,
              'pitch_tgt': None,
              'pitch_transform': build_pitch_transformation(args)}

    if args.torchscript:
        gen_kw.pop('pitch_transform')
        print('NOTE: Pitch transforms are disabled with TorchScript')

    all_utterances = 0
    all_samples = 0
    all_letters = 0
    all_frames = 0

    reps = args.repeats
    # Per-step logging is only emitted for single-repetition runs; it is
    # switched back on before the final summary below.
    log_enabled = reps == 1

    def log(step, data):
        """Forward to DLLogger.log unless per-step logging is disabled."""
        if log_enabled:
            DLLogger.log(step=step, data=data)

    for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)):
        for b in batches:
            if generator is None:
                # `data` must be a mapping like every other log call here;
                # the original passed a set literal.
                log(rep, {'info': 'Synthesizing from ground truth mels'})
                mel, mel_lens = b['mel'], b['mel_lens']
            else:
                with torch.no_grad(), gen_measures:
                    mel, mel_lens, *_ = generator(b['text'], **gen_kw)

                gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1]
                all_letters += b['text_lens'].sum().item()
                all_frames += mel.size(0) * mel.size(2)
                log(rep, {"fastpitch_frames/s": gen_infer_perf})
                log(rep, {"fastpitch_latency": gen_measures[-1]})

                if args.save_mels:
                    for i, mel_ in enumerate(mel):
                        # Trim to the predicted length and swap the two axes
                        # before saving.
                        m = mel_[:, :mel_lens[i].item()].permute(1, 0)
                        fname = b['output'][i] if 'output' in b else f'mel_{i}.npy'
                        mel_path = Path(args.output, Path(fname).stem + '.npy')
                        np.save(mel_path, m.cpu().numpy())

            if waveglow is not None:
                with torch.no_grad(), waveglow_measures:
                    audios = waveglow(mel, sigma=args.sigma_infer)
                    audios = denoiser(audios.float(),
                                      strength=args.denoising_strength
                                      ).squeeze(1)

                all_utterances += len(audios)
                all_samples += sum(audio.size(0) for audio in audios)
                waveglow_infer_perf = (
                    audios.size(0) * audios.size(1) / waveglow_measures[-1])

                log(rep, {"waveglow_samples/s": waveglow_infer_perf})
                log(rep, {"waveglow_latency": waveglow_measures[-1]})

                # Audio files are only written for single-repetition runs.
                if args.output is not None and reps == 1:
                    for i, audio in enumerate(audios):
                        audio = audio[:mel_lens[i].item() * args.stft_hop_length]

                        if args.fade_out:
                            # Linear ramp from 1 to 0 over the trailing
                            # `fade_len` samples.
                            fade_len = args.fade_out * args.stft_hop_length
                            fade_w = torch.linspace(1.0, 0.0, fade_len)
                            audio[-fade_len:] *= fade_w.to(audio.device)

                        # Peak-normalise to [-1, 1].
                        audio = audio / torch.max(torch.abs(audio))
                        fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
                        audio_path = Path(args.output, fname)
                        write(audio_path, args.sampling_rate, audio.cpu().numpy())

            if generator is not None and waveglow is not None:
                log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])})

    # Summary statistics are always logged, regardless of `reps`.
    # The "N%" latencies are computed as mean + z * std using `norm.ppf`.
    log_enabled = True
    if generator is not None:
        gm = np.sort(np.asarray(gen_measures))
        # NOTE(review): all_samples / all_utterances are only accumulated when
        # WaveGlow runs, so this RTF is not meaningful with WaveGlow skipped.
        rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate)
        log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()})
        log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()})
        log((), {"avg_fastpitch_latency": gm.mean()})
        log((), {"avg_fastpitch_RTF": rtf})
        log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()})
        log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()})
        log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()})
    if waveglow is not None:
        wm = np.sort(np.asarray(waveglow_measures))
        rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate)
        log((), {"avg_waveglow_samples/s": all_samples / wm.sum()})
        log((), {"avg_waveglow_latency": wm.mean()})
        log((), {"avg_waveglow_RTF": rtf})
        log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()})
        log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()})
        log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()})
    if generator is not None and waveglow is not None:
        # NOTE(review): gm and wm were sorted independently, so this pairs the
        # i-th fastest generator pass with the i-th fastest vocoder pass rather
        # than the two halves of the same batch. Mean and sum are unaffected;
        # std (and hence the percentile estimates) may be - confirm intent.
        m = gm + wm
        rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate)
        log((), {"avg_samples/s": all_samples / m.sum()})
        log((), {"avg_letters/s": all_letters / m.sum()})
        log((), {"avg_latency": m.mean()})
        log((), {"avg_RTF": rtf})
        log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()})
        log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()})
        log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()})
    DLLogger.flush()
コード例 #22
0
 def _initialize_dllogger(self, log_dir, filename, append):
     """Initialise the shared dllogger with a JSON-file and a stdout backend.

     Both backends are created at VERBOSE verbosity. The JSON stream is
     written to ``log_dir/filename``; ``append`` is forwarded unchanged to
     the JSON backend.
     """
     json_path = os.path.join(log_dir, filename)
     json_backend = JSONStreamBackend(Verbosity.VERBOSE, json_path, append=append)
     stdout_backend = StdOutBackend(Verbosity.VERBOSE)
     logger.init(backends=[json_backend, stdout_backend])
コード例 #23
0
ファイル: train.py プロジェクト: paulsok/ML
def main(_):
    """Train an EfficientDet model under Horovod and optionally evaluate it.

    The function builds the training and evaluation configs from ``FLAGS``,
    initialises DLLogger (backends only on the main process), compiles and
    fits the Keras model, saves the final weights, and - when not in
    benchmark mode and ``FLAGS.training_mode == 'train'`` - evaluates the
    final checkpoint with ``coco_metric.EvaluationMetric``. Final statistics
    are logged through DLLogger on the main process.

    Args:
        _: unused positional argument (presumably the leftover argv handed
            over by the application launcher - confirm against the caller).

    Raises:
        RuntimeError: if any of ``--training_file_pattern``,
            ``--val_file_pattern`` or ``--val_json_file`` is ``None``.
    """

    # get e2e training time
    begin = time.time()
    logging.info("Training started at: {}".format(time.asctime()))

    hvd.init()

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)
    if FLAGS.num_epochs:  # NOTE: remove this flag after updating all docs.
        config.num_epochs = FLAGS.num_epochs
    if FLAGS.lr:
        config.learning_rate = FLAGS.lr
    if FLAGS.warmup_value:
        config.lr_warmup_init = FLAGS.warmup_value
    if FLAGS.warmup_epochs:
        config.lr_warmup_epoch = FLAGS.warmup_epochs
    config.backbone_init = FLAGS.backbone_init
    config.mixed_precision = FLAGS.amp
    config.image_size = model_utils.parse_image_size(config.image_size)

    # get eval config
    # A second, independent config object is built for evaluation so that the
    # eval-only overrides below do not leak into the training config.
    eval_config = hparams_config.get_detection_config(FLAGS.model_name)
    eval_config.override(FLAGS.hparams)
    eval_config.val_json_file = FLAGS.val_json_file
    eval_config.val_file_pattern = FLAGS.val_file_pattern
    eval_config.nms_configs.max_nms_inputs = anchors.MAX_DETECTION_POINTS
    eval_config.drop_remainder = False  # eval all examples w/o drop.
    eval_config.image_size = model_utils.parse_image_size(
        eval_config['image_size'])

    # setup
    setup.set_flags(FLAGS, config, training=True)

    if FLAGS.debug:
        # Debug mode: eager execution, device-placement logging, a fixed
        # seed and DEBUG-level logging.
        tf.config.experimental_run_functions_eagerly(True)
        tf.debugging.set_log_device_placement(True)
        tf.random.set_seed(111111)
        logging.set_verbosity(logging.DEBUG)

    # Check data path
    if FLAGS.training_file_pattern is None or FLAGS.val_file_pattern is None or FLAGS.val_json_file is None:
        raise RuntimeError(
            'You must specify --training_file_pattern, --val_file_pattern and --val_json_file  for training.'
        )

    # Ceiling division of examples-per-epoch by the global batch size
    # (per-worker batch size times world size).
    steps_per_epoch = (FLAGS.num_examples_per_epoch +
                       (FLAGS.batch_size * get_world_size()) -
                       1) // (FLAGS.batch_size * get_world_size())
    if FLAGS.benchmark == True:
        # For ci perf training runs, run for a fixed number of iterations per epoch
        steps_per_epoch = FLAGS.benchmark_steps
    # Flatten the config into a plain dict and add run-specific entries.
    params = dict(config.as_dict(),
                  model_name=FLAGS.model_name,
                  model_dir=FLAGS.model_dir,
                  steps_per_epoch=steps_per_epoch,
                  checkpoint_period=FLAGS.checkpoint_period,
                  batch_size=FLAGS.batch_size,
                  num_shards=get_world_size(),
                  val_json_file=FLAGS.val_json_file,
                  testdev_dir=FLAGS.testdev_dir,
                  mode='train')
    logging.info('Training params: {}'.format(params))

    # make output dir if it does not exist
    tf.io.gfile.makedirs(FLAGS.model_dir)

    # dllogger setup
    # Only the main process gets backends; other ranks initialise DLLogger
    # with an empty backend list.
    backends = []
    if is_main_process():
        log_path = os.path.join(FLAGS.model_dir, FLAGS.log_filename)
        backends += [
            JSONStreamBackend(verbosity=Verbosity.VERBOSE, filename=log_path),
            StdOutBackend(verbosity=Verbosity.DEFAULT)
        ]

    DLLogger.init(backends=backends)

    def get_dataset(is_training, params):
        """Build the training or validation input pipeline for `model.fit`."""
        file_pattern = (FLAGS.training_file_pattern
                        if is_training else FLAGS.val_file_pattern)
        if not file_pattern:
            raise ValueError('No matching files.')

        return dataloader.InputReader(
            file_pattern,
            is_training=is_training,
            use_fake_data=FLAGS.use_fake_data,
            max_instances_per_image=config.max_instances_per_image,
            enable_map_parallelization=FLAGS.enable_map_parallelization)(
                params)

    # Two successive ceiling divisions: eval samples per worker, then the
    # number of eval batches per worker.
    num_samples = (FLAGS.eval_samples + get_world_size() -
                   1) // get_world_size()
    num_samples = (num_samples + FLAGS.eval_batch_size -
                   1) // FLAGS.eval_batch_size
    eval_config.num_samples = num_samples

    def get_eval_dataset(eval_config):
        """Build this rank's shard of the evaluation dataset."""
        dataset = dataloader.InputReader(
            FLAGS.val_file_pattern,
            is_training=False,
            max_instances_per_image=eval_config.max_instances_per_image)(
                eval_config, batch_size=FLAGS.eval_batch_size)
        # Each rank reads a disjoint shard, truncated to `num_samples` elements.
        dataset = dataset.shard(get_world_size(), get_rank())
        dataset = dataset.take(num_samples)
        return dataset

    eval_dataset = get_eval_dataset(eval_config)

    # pick focal loss implementation
    focal_loss = train_lib.StableFocalLoss(
        params['alpha'],
        params['gamma'],
        label_smoothing=params['label_smoothing'],
        reduction=tf.keras.losses.Reduction.NONE)

    model = train_lib.EfficientDetNetTrain(params['model_name'], config)
    model.build((None, *config.image_size, 3))
    # Every loss is created with Reduction.NONE.
    model.compile(
        optimizer=optimizer_builder.get_optimizer(params),
        loss={
            'box_loss':
            train_lib.BoxLoss(params['delta'],
                              reduction=tf.keras.losses.Reduction.NONE),
            'box_iou_loss':
            train_lib.BoxIouLoss(params['iou_loss_type'],
                                 params['min_level'],
                                 params['max_level'],
                                 params['num_scales'],
                                 params['aspect_ratios'],
                                 params['anchor_scale'],
                                 params['image_size'],
                                 reduction=tf.keras.losses.Reduction.NONE),
            'class_loss':
            focal_loss,
            'seg_loss':
            tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        })
    # The value returned by restore_ckpt is used as `initial_epoch` below.
    train_from_epoch = util_keras.restore_ckpt(model,
                                               params['model_dir'],
                                               config.moving_average_decay,
                                               steps_per_epoch=steps_per_epoch)

    print("training_mode: {}".format(FLAGS.training_mode))
    callbacks = callback_builder.get_callbacks(params, FLAGS.training_mode,
                                               eval_config, eval_dataset,
                                               DLLogger, FLAGS.time_history,
                                               FLAGS.log_steps, FLAGS.lr_tb,
                                               FLAGS.benchmark)

    # Progress output (verbose=1) only on the main process; validation data
    # and steps are supplied only when FLAGS.validate is set.
    history = model.fit(
        get_dataset(True, params=params),
        epochs=params['num_epochs'],
        steps_per_epoch=steps_per_epoch,
        initial_epoch=train_from_epoch,
        callbacks=callbacks,
        verbose=1 if is_main_process() else 0,
        validation_data=get_dataset(False, params=params)
        if FLAGS.validate else None,
        validation_steps=(FLAGS.eval_samples //
                          FLAGS.eval_batch_size) if FLAGS.validate else None)

    if is_main_process():
        model.save_weights(os.path.join(FLAGS.model_dir, 'ckpt-final'))

    # log final stats
    stats = {}
    for callback in callbacks:
        # Throughput/latency figures come from the TimeHistory callback, and
        # only if it recorded at least one epoch runtime.
        if isinstance(callback, callback_builder.TimeHistory):
            if callback.epoch_runtime_log:
                stats[
                    'avg_fps_training'] = callback.average_examples_per_second
                stats[
                    'avg_fps_training_per_GPU'] = callback.average_examples_per_second / get_world_size(
                    )
                stats[
                    'avg_latency_training'] = callback.average_time_per_iteration

    if history and history.history:
        train_hist = history.history
        # Final training loss, averaged across ranks via Horovod allreduce.
        stats['training_loss'] = float(
            hvd.allreduce(tf.constant(train_hist['loss'][-1],
                                      dtype=tf.float32),
                          average=True))

    if os.path.exists(os.path.join(FLAGS.model_dir, 'ema_weights')):
        # Pick the highest epoch number among files whose name contains
        # 'emackpt' (parsed from the text between the first '-' and the
        # first '.'), zero-padded to two digits.
        ckpt_epoch = "%02d" % sorted(set([
            int(f.rsplit('.')[0].rsplit('-')[1])
            for f in os.listdir(os.path.join(FLAGS.model_dir, 'ema_weights'))
            if 'emackpt' in f
        ]),
                                     reverse=True)[0]
        ckpt = os.path.join(FLAGS.model_dir, 'ema_weights',
                            'emackpt-' + str(ckpt_epoch))
        util_keras.restore_ckpt(model,
                                ckpt,
                                eval_config.moving_average_decay,
                                steps_per_epoch=0,
                                skip_mismatch=False,
                                expect_partial=True)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'emackpt-final'))
    else:
        # No EMA weights directory: save the current model as the final one.
        ckpt_epoch = 'final'
        ckpt = os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch)
        if is_main_process():
            model.save(os.path.join(FLAGS.model_dir, 'ckpt-' + ckpt_epoch))

    # Start evaluation of final ema checkpoint
    logging.set_verbosity(logging.WARNING)

    @tf.function
    def model_fn(images, labels):
        """Run one eval batch and feed its detections to `evaluator`."""
        cls_outputs, box_outputs = model(images, training=False)
        detections = postprocess.generate_detections(eval_config, cls_outputs,
                                                     box_outputs,
                                                     labels['image_scales'],
                                                     labels['source_ids'])

        # `evaluator` is bound in the enclosing scope further below, before
        # this function is first called.
        tf.numpy_function(evaluator.update_state, [
            labels['groundtruth_data'],
            postprocess.transform_detections(detections)
        ], [])

    if FLAGS.benchmark == False and FLAGS.training_mode == 'train':

        # Evaluator for AP calculation.
        label_map = label_util.get_label_map(eval_config.label_map)
        evaluator = coco_metric.EvaluationMetric(
            filename=eval_config.val_json_file, label_map=label_map)

        evaluator.reset_states()

        # evaluate all images.
        pbar = tf.keras.utils.Progbar(num_samples)
        for i, (images, labels) in enumerate(eval_dataset):
            model_fn(images, labels)
            if is_main_process():
                pbar.update(i)

        # gather detections from all ranks
        evaluator.gather()

        if is_main_process():
            # compute the final eval results.
            metrics = evaluator.result()
            metric_dict = {}
            for i, name in enumerate(evaluator.metric_names):
                metric_dict[name] = metrics[i]

            if label_map:
                # Per-class entries follow the named metrics in `metrics`,
                # in sorted class-id order.
                for i, cid in enumerate(sorted(label_map.keys())):
                    name = 'AP_/%s' % label_map[cid]
                    metric_dict[name] = metrics[i +
                                                len(evaluator.metric_names)]

            # csv format
            csv_metrics = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl']
            csv_format = ",".join(
                [str(ckpt_epoch)] +
                [str(round(metric_dict[key] * 100, 2)) for key in csv_metrics])
            print(FLAGS.model_name, metric_dict, "csv format:", csv_format)

        # Barrier across all MPI ranks before the final stats are logged.
        MPI.COMM_WORLD.Barrier()

    if is_main_process():
        stats['e2e_training_time'] = time.time() - begin
        DLLogger.log(step=(), data=stats)
コード例 #24
0
 def __init__(self, log_path="bert_dllog.json"):
     """Create the DLLogger instance and register metadata for known metrics.

     Args:
         log_path: file path handed to the JSON stream backend.

     The logger writes to stdout (DEFAULT verbosity, steps formatted by
     ``self.format_step``) and to a JSON stream at ``log_path`` (VERBOSE).
     Metadata is registered in the same order as before: training losses,
     validation quality metrics, then throughput.
     """
     self.logger = Logger([
         StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step),
         JSONStreamBackend(Verbosity.VERBOSE, log_path),
     ])

     # Training losses: lower is better.
     for name in ("mlm_loss", "nsp_loss", "avg_loss_step", "total_loss", "loss"):
         self.logger.metadata(name, {
             "format": ":.4f",
             "GOAL": "MINIMIZE",
             "STAGE": "TRAIN"
         })

     # Validation quality metrics: higher is better. These were previously
     # tagged MINIMIZE, apparently copied from the loss entries above.
     for name in ("f1", "precision", "recall", "mcc", "exact_match"):
         self.logger.metadata(name, {
             "format": ":.4f",
             "GOAL": "MAXIMIZE",
             "STAGE": "VAL"
         })

     # Throughput in sequences per second, one entry per stage.
     for name, stage in (("throughput_train", "TRAIN"), ("throughput_inf", "VAL")):
         self.logger.metadata(
             name,
             {
                 "unit": "seq/s",
                 "format": ":.3f",
                 "GOAL": "MAXIMIZE",
                 "STAGE": stage
             },
         )
コード例 #25
0
def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.

    Runs three warmup iterations followed by one measured iteration on
    randomly generated input for the model named by ``args.model_name``
    ('Tacotron2' or 'WaveGlow'), and logs latency and items/sec.
    """
    parser = parse_args(
        argparse.ArgumentParser(description='PyTorch Tacotron 2 Inference'))
    args, _ = parser.parse_known_args()

    # JSON log goes to <output>/<log_file>; stdout gets everything.
    log_file = args.output + '/' + args.log_file
    json_backend = JSONStreamBackend(Verbosity.DEFAULT, log_file)
    stdout_backend = StdOutBackend(Verbosity.VERBOSE)
    DLLogger.init(backends=[json_backend, stdout_backend])

    for arg_name, arg_value in vars(args).items():
        DLLogger.log(step="PARAMETER", data={arg_name: arg_value})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    model = load_and_setup_model(args.model_name,
                                 parser,
                                 None,
                                 args.amp_run,
                                 forward_is_infer=True)

    model_name = args.model_name
    if model_name == "Tacotron2":
        model = torch.jit.script(model)

    warmup_iters = 3
    total_iters = warmup_iters + 1

    for iteration in range(total_iters):

        # Fresh timing dict each iteration; only the last one survives the loop.
        measurements = {}

        if model_name == 'Tacotron2':
            # Random token ids, every sequence at the full padded length.
            text_padded = torch.randint(low=0,
                                        high=148,
                                        size=(args.batch_size, 140),
                                        dtype=torch.long).cuda()
            lengths = [text_padded.size(1)] * args.batch_size
            input_lengths = torch.IntTensor(lengths).cuda().long()
            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                mels, _, _ = model(text_padded, input_lengths)
            num_items = mels.size(0) * mels.size(2)
        elif model_name == 'WaveGlow':
            n_mel_channels = model.upsample.in_channels
            num_mels = 895
            # Random mel input drawn from a normal distribution.
            mel_padded = torch.zeros(args.batch_size, n_mel_channels,
                                     num_mels).normal_(-5.62, 1.98).cuda()
            if args.amp_run:
                mel_padded = mel_padded.half()

            with torch.no_grad(), MeasureTime(measurements, "inference_time"):
                audios = model(mel_padded)
                audios = audios.float()
            num_items = audios.size(0) * audios.size(1)

        if iteration < warmup_iters:
            # Warmup iterations are not logged.
            continue

        step = (iteration - warmup_iters, )
        latency = measurements['inference_time']
        DLLogger.log(step=step, data={"latency": latency})
        DLLogger.log(step=step, data={"items_per_sec": num_items / latency})

    # Summary entries reuse the measurements of the final iteration.
    final_latency = measurements['inference_time']
    DLLogger.log(step=(), data={'infer_latency': final_latency})
    DLLogger.log(step=(),
                 data={'infer_items_per_sec': num_items / final_latency})

    DLLogger.flush()
コード例 #26
0
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.

    Reads one utterance per line from ``args.input``, synthesises
    mel-spectrograms with a TorchScript-compiled Tacotron 2 conditioned on a
    reference mel (``args.ref_mel``) and an emotion id (``args.emotion_id``),
    vocodes them with WaveGlow, denoises the result, and writes one alignment
    plot and one WAV file per utterance into ``args.output``. Timing of each
    stage is logged through DLLogger.

    Exits with status 1 if the input file cannot be read.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    log_file = os.path.join(args.output, args.log_file)
    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Per the original author's note: passing forward_is_infer makes the
    # Tacotron model's forward go to its infer method.
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    # Read all input lines; the context manager closes the file handle, and
    # only I/O and decoding failures are treated as "could not read".
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except (OSError, UnicodeError):
        print("Could not read file")
        sys.exit(1)

    #-------------------------------------------------------------------------------------------------------------------
    ref_mel = load_mel(args.ref_mel)
    # Keep the emotion id on the same device as the other inputs; it was
    # previously moved to CUDA even when running with args.cpu.
    emotion_id = torch.LongTensor([args.emotion_id])
    if not args.cpu:
        emotion_id = emotion_id.cuda()
    print(emotion_id)
    #-------------------------------------------------------------------------------------------------------------------

    if args.include_warmup:
        # Three untimed passes on a random 50-token sequence.
        sequence = torch.randint(low=0, high=80, size=(1, 50)).long()
        input_lengths = torch.IntTensor([sequence.size(1)]).long()
        if not args.cpu:
            sequence = sequence.cuda()
            input_lengths = input_lengths.cuda()
        for i in range(3):
            with torch.no_grad():
                mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths, ref_mel, emotion_id)
                _ = waveglow(mel)

    measurements = {}

    sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu):
        mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded, input_lengths, ref_mel, emotion_id)

    with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu):
        audios = waveglow(mel, sigma=args.sigma_infer)
        audios = audios.float()
    with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu):
        audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

    print("Stopping after", mel.size(2), "decoder steps")
    tacotron2_infer_perf = mel.size(0) * mel.size(2) / measurements['tacotron2_time']
    waveglow_infer_perf = audios.size(0) * audios.size(1) / measurements['waveglow_time']

    DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
    DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']})
    DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf})
    DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']})
    DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']})
    DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time'] + measurements['waveglow_time'] + measurements['denoiser_time'])})

    for i, audio in enumerate(audios):

        # Save the attention alignment for this utterance as an image.
        plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")
        figure_path = os.path.join(args.output, "alignment_" + str(i) + args.suffix + ".png")
        plt.savefig(figure_path)

        # Trim to mel length * hop length, then peak-normalise.
        audio = audio[:mel_lengths[i] * args.stft_hop_length]
        audio = audio / torch.max(torch.abs(audio))
        audio_path = os.path.join(args.output, "audio_" + str(i) + args.suffix + ".wav")
        write(audio_path, args.sampling_rate, audio.cpu().numpy())

    DLLogger.flush()
コード例 #27
0
def main():
    """Run the training loop for the model named by ``args.model_name``.

    Parses command-line arguments, initialises DLLogger (JSON + stdout
    backends on local rank 0, no backends on other ranks), builds the model,
    Adam optimizer, AMP gradient scaler and training data loader, optionally
    restores a checkpoint, and then trains from the restored epoch up to
    ``args.epochs``. After every epoch the model is validated; a checkpoint
    is saved every ``args.epochs_per_checkpoint`` epochs unless
    ``args.bench_class`` is non-empty.

    Raises:
        Exception: if the (reduced) training loss becomes NaN.
    """

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    # Environment variables take precedence over command-line values.
    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size

    distributed_run = world_size > 1

    # Only rank 0 gets real logging backends; other ranks log to nowhere.
    if local_rank == 0:
        log_file = os.path.join(args.output, args.log_file)
        DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                                StdOutBackend(Verbosity.VERBOSE)])
    else:
        DLLogger.init(backends=[])

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    # Re-parse after the model-specific options have been added to the parser.
    model_name = args.model_name
    parser = models.model_parser(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config,
                             cpu_run=False,
                             uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)

    if distributed_run:
        model = DDP(model, device_ids=[local_rank], output_device=local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    # `sigma` is only present for some models; fall back to None otherwise.
    sigma = getattr(args, 'sigma', None)

    # Wrapped in a list so load_checkpoint can update it in place.
    start_epoch = [0]

    if args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)

    # Compare by value: `is not ""` tests object identity, which is
    # implementation-dependent for strings (and a SyntaxWarning on 3.8+).
    if args.checkpoint_path != "":
        load_checkpoint(model, optimizer, start_epoch, model_config,
                        args.amp, args.checkpoint_path, local_rank)

    start_epoch = start_epoch[0]

    criterion = loss_functions.get_loss_function(model_name, sigma)

    # `n_frames_per_step` is only present for some models.
    n_frames_per_step = getattr(args, 'n_frames_per_step', None)

    collate_fn = data_functions.get_collate_function(
        model_name, n_frames_per_step)
    trainset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.training_files, args)
    if distributed_run:
        # The sampler handles shuffling, so the loader itself must not.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)

    valset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    train_epoch_items_per_sec = 0.0
    val_loss = 0.0
    # Initialised so the final summary below does not raise NameError when
    # the epoch loop runs zero times (e.g. start_epoch >= args.epochs).
    val_items_per_sec = 0.0
    num_iters = 0

    model.train()

    for epoch in range(start_epoch, args.epochs):
        torch.cuda.synchronize()
        epoch_start_time = time.perf_counter()
        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        train_epoch_items_per_sec = 0.0

        num_iters = 0
        reduced_loss = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            torch.cuda.synchronize()
            iter_start_time = time.perf_counter()
            DLLogger.log(step=(epoch, i),
                         data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))})

            adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor, local_rank)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            # Forward pass and loss under autocast (no-op when AMP is off).
            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            DLLogger.log(step=(epoch, i), data={'train_loss': reduced_loss})

            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.amp:
                scaler.scale(loss).backward()

                # Unscale before clipping so the threshold applies to the
                # true gradient magnitudes.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)

            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

                optimizer.step()

            torch.cuda.synchronize()
            iter_stop_time = time.perf_counter()
            iter_time = iter_stop_time - iter_start_time
            items_per_sec = reduced_num_items/iter_time
            train_epoch_items_per_sec += items_per_sec

            DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec})
            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
            iteration += 1

        torch.cuda.synchronize()
        epoch_stop_time = time.perf_counter()
        epoch_time = epoch_stop_time - epoch_start_time

        # Per-epoch summary: mean throughput, last-iteration loss, wall time.
        DLLogger.log(step=(epoch,), data={'train_items_per_sec':
                                          (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
        DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
        DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})

        val_loss, val_items_per_sec = validate(model, criterion, valset, epoch,
                                               iteration, args.batch_size,
                                               world_size, collate_fn,
                                               distributed_run, local_rank,
                                               batch_to_gpu)

        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, scaler, epoch, model_config,
                            args.amp, args.output, args.model_name,
                            local_rank, world_size)
        if local_rank == 0:
            DLLogger.flush()

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    # Whole-run summary; throughput refers to the last epoch that ran.
    DLLogger.log(step=tuple(), data={'run_time': run_time})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    DLLogger.log(step=tuple(), data={'train_items_per_sec':
                                     (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
    DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})

    if local_rank == 0:
        DLLogger.flush()
コード例 #28
0
def main():
    """Run RNN-T inference over a DALI validation loader and report results.

    Parses arguments, initialises dllogger, builds the feature pipeline,
    DALI data loader and ``RNNT`` model (optionally restoring a checkpoint),
    then iterates over the data ``args.repeats`` times, timing the data and
    model stages and decoding predictions greedily. Depending on the
    arguments it prints the predictions, logs the word error rate, writes
    predictions to ``args.save_predictions``, and logs latency percentiles
    when at least 20 timing samples were collected.
    """
    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
        StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)
    ])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # The metric is logged with '.' replaced by '_' (see the timing
            # report at the end), so the metadata must use the same key.
            kk = str(c).replace('.', '_')
            dllogger.metadata(f'{step.lower()}_latency_{kk}', {
                'name': f'{step} latency {cs}',
                'format': ':>7.2f',
                'unit': 'ms'
            })
    dllogger.metadata('eval_wer', {
        'name': 'WER',
        'format': ':>3.3f',
        'unit': '%'
    })

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        # Offset by local rank so each process gets a distinct seed.
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)

    if args.max_duration is not None:
        cfg['input_val']['audio_dataset']['max_duration'] = args.max_duration
        cfg['input_val']['filterbank_features'][
            'max_duration'] = args.max_duration

    if args.pad_to_max_duration:
        assert cfg['input_val']['audio_dataset']['max_duration'] > 0
        cfg['input_val']['audio_dataset']['pad_to_max_duration'] = True
        cfg['input_val']['filterbank_features']['pad_to_max_duration'] = True

    use_dali = args.dali_device in ('cpu', 'gpu')

    (dataset_kw, features_kw, splicing_kw, _, _) = config.input(cfg, 'val')

    tokenizer_kw = config.tokenizer(cfg)
    tokenizer = Tokenizer(**tokenizer_kw)

    optim_level = 3 if args.amp else 0

    feature_proc = torch.nn.Sequential(
        torch.nn.Identity(),
        torch.nn.Identity(),
        features.FrameSplicing(optim_level=optim_level, **splicing_kw),
        features.FillPadding(optim_level=optim_level, ),
    )

    # dataset

    data_loader = DaliDataLoader(gpu_id=args.local_rank or 0,
                                 dataset_path=args.dataset_dir,
                                 config_data=dataset_kw,
                                 config_features=features_kw,
                                 json_names=[args.val_manifest],
                                 batch_size=args.batch_size,
                                 sampler=dali_sampler.SimpleSampler(),
                                 pipeline_type="val",
                                 device_type=args.dali_device,
                                 tokenizer=tokenizer)

    # One extra class on top of the tokenizer labels (used as blank below).
    model = RNNT(n_classes=tokenizer.num_labels + 1, **config.rnnt(cfg))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feature_proc is not None:
        feature_proc.to(device)
        feature_proc.eval()

    if args.amp:
        model = amp.initialize(model, opt_level='O3')

    if multi_gpu:
        model = DistributedDataParallel(model)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    rep_loader = chain(*repeat(data_loader, args.repeats))
    rep_len = args.repeats * len(data_loader)

    blank_idx = tokenizer.num_labels
    greedy_decoder = RNNTGreedyDecoder(blank_idx=blank_idx)

    def sync_time():
        """Return a timestamp, waiting for pending CUDA work first."""
        if device.type == 'cuda':
            torch.cuda.synchronize()
        return time.perf_counter()

    with torch.no_grad():

        for it, batch in enumerate(tqdm.tqdm(rep_loader, total=rep_len)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feature_proc is not None:
                    feats, feat_lens = feature_proc([feats, feat_lens])
            else:
                batch = [t.cuda(non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feature_proc([audio, audio_lens])
            feats = feats.permute(2, 0, 1)
            if args.amp:
                feats = feats.half()

            t1 = sync_time()
            log_probs, log_prob_lens = model(feats, feat_lens, txt, txt_lens)
            t2 = sync_time()

            # burn-in period; wait for a new loader due to num_workers
            # (t0 is set at the end of the previous iteration, hence it >= 1)
            if it >= 1 and (args.repeats == 1 or it >= len(data_loader)):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
                                                          tokenizer.detokenize)

            preds = greedy_decoder.decode(model, feats, feat_lens)

            agg['preds'] += helpers.gather_predictions([preds],
                                                       tokenizer.detokenize)

            if 0 < args.steps < it:
                break

            t0 = sync_time()

        # communicate the results
        if args.transcribe_wav:
            for idx, p in enumerate(agg['preds']):
                print_once(f'Prediction {idx+1: >3}: {p}')

        elif args.transcribe_filelist:
            pass

        else:
            wer, _ = process_evaluation_epoch(agg)

            if not multi_gpu or distrib.get_rank() == 0:
                dllogger.log(step=(), data={'eval_wer': 100 * wer})

        if args.save_predictions:
            with open(args.save_predictions, 'w') as f:
                f.write('\n'.join(agg['preds']))

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]

        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(),
                             data={f'{stage.lower()}_latency_{kk}': lat[k]})

    else:
        # TODO measure at least avg latency
        print_once('Not enough samples to measure latencies.')
コード例 #29
0
def main():
    """Run QuartzNet inference with greedy CTC decoding and report results.

    Parses arguments, initialises dllogger, and builds one of three input
    pipelines (single wav / file list, DALI, or a plain ``AudioDataset``).
    It then loads the ``QuartzNet`` model (optionally from a checkpoint and
    optionally exported through TorchScript) and runs a timed inference loop.
    Afterwards it prints predictions or logs the word error rate, optionally
    saves predictions and logits, and logs latency percentiles when at least
    20 timing samples were collected.
    """

    parser = get_parser()
    args = parser.parse_args()

    log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
    log_fpath = unique_log_fpath(log_fpath)
    dllogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
                            StdOutBackend(Verbosity.VERBOSE,
                                          metric_format=stdout_metric_format)])

    for k, v in vars(args).items():
        dllogger.log("PARAMETER", {k: v})

    for step in ['DNN', 'data+DNN', 'data']:
        for c in [0.99, 0.95, 0.9, 0.5]:
            cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
            # The metric is logged with '.' replaced by '_' (see the timing
            # report at the end), so the metadata must use the same key.
            kk = str(c).replace('.', '_')
            dllogger.metadata(f'{step.lower()}_latency_{kk}',
                              {'name': f'{step} latency {cs}',
                               'format': ':>7.2f', 'unit': 'ms'})
    dllogger.metadata(
        'eval_wer', {'name': 'WER', 'format': ':>3.2f', 'unit': '%'})

    if args.cpu:
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if args.seed is not None:
        # Offset by local rank so each process gets a distinct seed.
        torch.manual_seed(args.seed + args.local_rank)
        np.random.seed(args.seed + args.local_rank)
        random.seed(args.seed + args.local_rank)

    # set up distributed training
    multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        distrib.init_process_group(backend='nccl', init_method='env://')
        print_once(f'Inference with {distrib.get_world_size()} GPUs')

    cfg = config.load(args.model_config)
    config.apply_config_overrides(cfg, args)

    symbols = helpers.add_ctc_blank(cfg['labels'])

    use_dali = args.dali_device in ('cpu', 'gpu')
    dataset_kw, features_kw = config.input(cfg, 'val')

    measure_perf = args.steps > 0

    # dataset
    if args.transcribe_wav or args.transcribe_filelist:

        if use_dali:
            print("DALI supported only with input .json files; disabling")
            use_dali = False

        assert not args.pad_to_max_duration
        assert not (args.transcribe_wav and args.transcribe_filelist)

        if args.transcribe_wav:
            dataset = SingleAudioDataset(args.transcribe_wav)
        else:
            dataset = FilelistDataset(args.transcribe_filelist)

        data_loader = get_data_loader(dataset,
                                      batch_size=1,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=0,
                                      drop_last=measure_perf)

        _, features_kw = config.input(cfg, 'val')
        feat_proc = FilterbankFeatures(**features_kw)

    elif use_dali:
        # pad_to_max_duration is not supported by DALI - have simple padders
        if features_kw['pad_to_max_duration']:
            feat_proc = BaseFeatures(
                pad_align=features_kw['pad_align'],
                pad_to_max_duration=True,
                max_duration=features_kw['max_duration'],
                sample_rate=features_kw['sample_rate'],
                window_size=features_kw['window_size'],
                window_stride=features_kw['window_stride'])
            features_kw['pad_to_max_duration'] = False
        else:
            feat_proc = None

        data_loader = DaliDataLoader(
            gpu_id=args.local_rank or 0,
            dataset_path=args.dataset_dir,
            config_data=dataset_kw,
            config_features=features_kw,
            json_names=args.val_manifests,
            batch_size=args.batch_size,
            pipeline_type=("train" if measure_perf else "val"),  # no drop_last
            device_type=args.dali_device,
            symbols=symbols)

    else:
        dataset = AudioDataset(args.dataset_dir,
                               args.val_manifests,
                               symbols,
                               **dataset_kw)

        data_loader = get_data_loader(dataset,
                                      args.batch_size,
                                      multi_gpu=multi_gpu,
                                      shuffle=False,
                                      num_workers=4,
                                      drop_last=False)

        feat_proc = FilterbankFeatures(**features_kw)

    model = QuartzNet(encoder_kw=config.encoder(cfg),
                      decoder_kw=config.decoder(cfg, n_classes=len(symbols)))

    if args.ckpt is not None:
        print(f'Loading the model from {args.ckpt} ...')
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        key = 'ema_state_dict' if args.ema else 'state_dict'
        state_dict = checkpoint[key]
        model.load_state_dict(state_dict, strict=True)

    model.to(device)
    model.eval()

    if feat_proc is not None:
        feat_proc.to(device)
        feat_proc.eval()

    if args.amp:
        model = model.half()

    if args.torchscript:
        greedy_decoder = GreedyCTCDecoder()

        feat_proc, model, greedy_decoder = torchscript_export(
            data_loader, feat_proc, model, greedy_decoder, args.output_dir,
            use_amp=args.amp, use_conv_masks=True, model_toml=args.model_toml,
            device=device, save=args.torchscript_export)

    # Read the flag before the optional DistributedDataParallel wrap: the
    # wrapper does not forward attribute access to the wrapped module, so
    # `model.encoder` would raise AttributeError afterwards.
    use_conv_masks = model.encoder.use_conv_masks

    if multi_gpu:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    agg = {'txts': [], 'preds': [], 'logits': []}
    dur = {'data': [], 'dnn': [], 'data+dnn': []}

    # Cycle over the loader indefinitely; the loop below stops after `steps`.
    looped_loader = chain.from_iterable(repeat(data_loader))
    # NOTE(review): this replaces the decoder returned by torchscript_export
    # above when args.torchscript is set - confirm whether that is intended.
    greedy_decoder = GreedyCTCDecoder()

    def sync():
        """Wait for pending CUDA work when running on a CUDA device."""
        if device.type == 'cuda':
            torch.cuda.synchronize()

    # Falls back to one pass over the loader when both counts are zero.
    steps = (args.steps + args.warmup_steps) or len(data_loader)
    with torch.no_grad():

        for it, batch in enumerate(tqdm(looped_loader, initial=1, total=steps)):

            if use_dali:
                feats, feat_lens, txt, txt_lens = batch
                if feat_proc is not None:
                    feats, feat_lens = feat_proc(feats, feat_lens)
            else:
                batch = [t.to(device, non_blocking=True) for t in batch]
                audio, audio_lens, txt, txt_lens = batch
                feats, feat_lens = feat_proc(audio, audio_lens)

            sync()
            t1 = time.perf_counter()

            if args.amp:
                feats = feats.half()

            if use_conv_masks:
                log_probs, log_prob_lens = model(feats, feat_lens)
            else:
                log_probs = model(feats, feat_lens)

            preds = greedy_decoder(log_probs)

            sync()
            t2 = time.perf_counter()

            # burn-in period; wait for a new loader due to num_workers
            # (t0 is set at the end of the previous iteration, hence it >= 1)
            if it >= 1 and (args.steps == 0 or it >= args.warmup_steps):
                dur['data'].append(t1 - t0)
                dur['dnn'].append(t2 - t1)
                dur['data+dnn'].append(t2 - t0)

            if txt is not None:
                agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
                                                          symbols)
            agg['preds'] += helpers.gather_predictions([preds], symbols)
            agg['logits'].append(log_probs)

            if it + 1 == steps:
                break

            sync()
            t0 = time.perf_counter()

        # communicate the results
        if args.transcribe_wav:
            for idx, p in enumerate(agg['preds']):
                print_once(f'Prediction {idx+1: >3}: {p}')

        elif args.transcribe_filelist:
            pass

        elif not multi_gpu or distrib.get_rank() == 0:
            wer, _ = process_evaluation_epoch(agg)

            dllogger.log(step=(), data={'eval_wer': 100 * wer})

        if args.save_predictions:
            with open(args.save_predictions, 'w') as f:
                f.write('\n'.join(agg['preds']))

        if args.save_logits:
            logits = torch.cat(agg['logits'], dim=0).cpu()
            torch.save(logits, args.save_logits)

    # report timings
    if len(dur['data']) >= 20:
        ratios = [0.9, 0.95, 0.99]
        for stage in dur:
            lat = durs_to_percentiles(dur[stage], ratios)
            for k in [0.99, 0.95, 0.9, 0.5]:
                kk = str(k).replace('.', '_')
                dllogger.log(step=(), data={f'{stage.lower()}_latency_{kk}': lat[k]})

    else:
        print_once('Not enough samples to measure latencies.')
コード例 #30
0
def main(_):
    """Train, evaluate and/or predict with the ``vnet_v2`` estimator.

    Initialises Horovod and DLLogger (backends only on rank 0), sets a number
    of TensorFlow/Horovod environment variables, builds the ``MSDDataset``
    and a ``tf.estimator.Estimator``, and then runs whichever of the
    'train', 'evaluate' and 'predict' stages appear in ``FLAGS.exec_mode``.
    Evaluation and prediction run on rank 0 only; predictions are pickled
    to ``<model_dir>/predictions/predictions.pkl``.

    Args:
        _: Ignored positional argument.

    Raises:
        ValueError: if evaluation is requested with ``train_split >= 1.0``.
    """
    tf.get_logger().setLevel(logging.ERROR)

    hvd.init()

    FLAGS = PARSER.parse_args()

    backends = []

    if hvd.rank() == 0:
        backends += [StdOutBackend(Verbosity.DEFAULT)]

        if FLAGS.log_dir:
            backends += [JSONStreamBackend(Verbosity.DEFAULT, FLAGS.log_dir)]

    DLLogger.init(backends=backends)

    for key, value in vars(FLAGS).items():
        DLLogger.log(step="PARAMETER", data={str(key): value})

    os.environ['CUDA_CACHE_DISABLE'] = '0'

    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'

    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

    dataset = MSDDataset(json_path=os.path.join(FLAGS.data_dir,
                                                'dataset.json'),
                         dst_size=FLAGS.input_shape,
                         seed=FLAGS.seed,
                         interpolator=FLAGS.resize_interpolator,
                         data_normalization=FLAGS.data_normalization,
                         batch_size=FLAGS.batch_size,
                         train_split=FLAGS.train_split,
                         split_seed=FLAGS.split_seed)

    # FLAGS is passed to the estimator as `params`, so attach the labels.
    FLAGS.labels = dataset.labels

    gpu_options = tf.GPUOptions()
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.allow_growth = True
    # Pin each process to the GPU matching its Horovod local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.use_amp:
        config.graph_options.rewrite_options.auto_mixed_precision = 1

    run_config = tf.estimator.RunConfig(
        save_summary_steps=None,
        save_checkpoints_steps=None
        if FLAGS.benchmark else dataset.train_steps * FLAGS.train_epochs,
        save_checkpoints_secs=None,
        tf_random_seed=None,
        session_config=config,
        keep_checkpoint_max=1)

    estimator = tf.estimator.Estimator(
        model_fn=vnet_v2,
        model_dir=FLAGS.model_dir if hvd.rank() == 0 else None,
        config=run_config,
        params=FLAGS)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    if 'train' in FLAGS.exec_mode:
        steps = dataset.train_steps * FLAGS.train_epochs

        if FLAGS.benchmark:
            # Benchmark runs last twice the warm-up length.
            steps = FLAGS.warmup_steps * 2
            if hvd.rank() == 0:
                train_hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(), DLLogger)
                ]
        else:
            if hvd.rank() == 0:
                train_hooks += [TrainHook(FLAGS.log_every, DLLogger)]

        estimator.train(input_fn=lambda: dataset.train_fn(FLAGS.augment),
                        steps=steps,
                        hooks=train_hooks)

    if 'evaluate' in FLAGS.exec_mode:
        if hvd.rank() == 0:
            if FLAGS.train_split >= 1.0:
                raise ValueError("Missing argument: --train_split < 1.0")

            result = estimator.evaluate(input_fn=dataset.eval_fn,
                                        steps=dataset.eval_steps,
                                        hooks=[])

            DLLogger.log(step=tuple(),
                         data={
                             'background_dice': str(result['background dice']),
                             'anterior_dice': str(result['Anterior dice']),
                             'posterior_dice': str(result['Posterior dice'])
                         })

    if 'predict' in FLAGS.exec_mode:
        count = 1
        hooks = []
        if hvd.rank() == 0:
            if FLAGS.benchmark:
                count = math.ceil(
                    (FLAGS.warmup_steps * 2) / dataset.test_steps)
                hooks += [
                    ProfilingHook(FLAGS.warmup_steps,
                                  FLAGS.batch_size * hvd.size(),
                                  DLLogger,
                                  training=False)
                ]

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=count), hooks=hooks)

            pred = [p['prediction'] for p in predictions]

            # Start from an empty predictions directory on every run.
            predict_path = os.path.join(FLAGS.model_dir, 'predictions')
            if os.path.exists(predict_path):
                shutil.rmtree(predict_path)

            os.makedirs(predict_path)

            # Context manager guarantees the file is flushed and closed.
            with open(os.path.join(predict_path, 'predictions.pkl'),
                      'wb') as pkl_file:
                pickle.dump(pred, pkl_file)