def end_iteration(self, val=False):
    it = self.val_iteration if val else self.iteration
    if (it % self.print_interval == 0):
        metrics = {
            n: m for n, m in self.metrics.items() if n.startswith('val') == val
        }
        step = (self.epoch, self.iteration) if not val else (self.epoch, self.iteration, self.val_iteration)

        verbositys = {m['level'] for _, m in metrics.items()}
        for ll in verbositys:
            llm = {n: m for n, m in metrics.items() if m['level'] == ll}
            dllogger.log(step=step,
                         data={n: m['meter'].get_iteration() for n, m in llm.items()},
                         verbosity=ll)

        for n, m in metrics.items():
            m['meter'].reset_iteration()
        dllogger.flush()
def log(self):
    if is_main_process():
        diffs = list(map(operator.sub, self.timestamps[1:], self.timestamps[:-1]))
        deltas = np.array(diffs)
        stats = self.process_performance_stats(deltas)
        logger.log(step=(), data=stats)
        logger.flush()
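# A minimal sketch (an assumption, not the original implementation) of the
# `process_performance_stats` helper called above: it reduces the per-step
# time deltas into summary latency/throughput values suitable for dllogger.
# `self.batch_size` and `self.mode` are assumed attributes of the caller.
def process_performance_stats(self, deltas):
    throughputs = self.batch_size / deltas
    stats = {
        f"throughput_{self.mode}": float(np.mean(throughputs)),
        f"latency_{self.mode}_mean": float(np.mean(deltas)),
    }
    # np.percentile expects percentages in [0, 100]
    for level in (90, 95, 99):
        stats[f"latency_{self.mode}_{level}"] = float(np.percentile(deltas, level))
    return stats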
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch FastPitch Inference Benchmark') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'FastPitch_PyT'}) model = load_and_setup_model('FastPitch', parser, None, args.amp_run, 'cuda', unk_args=[], forward_is_infer=True, ema=False, jitable=True) # FIXME Temporarily disabled due to nn.LayerNorm fp16 casting bug in pytorch:20.02-py3 and 20.03 # model = torch.jit.script(model) warmup_iters = 3 iters = 1 gen_measures = MeasureTime() all_frames = 0 for i in range(-warmup_iters, iters): text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 128), dtype=torch.long).to('cuda') input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).to('cuda') durs = torch.ones_like(text_padded).mul_(4).to('cuda') with torch.no_grad(), gen_measures: mels, *_ = model(text_padded, input_lengths, dur_tgt=durs) num_frames = mels.size(0) * mels.size(2) if i >= 0: all_frames += num_frames DLLogger.log(step=(i, ), data={"latency": gen_measures[-1]}) DLLogger.log(step=(i, ), data={"frames/s": num_frames / gen_measures[-1]}) measures = gen_measures[warmup_iters:] DLLogger.log(step=(), data={'avg latency': np.mean(measures)}) DLLogger.log(step=(), data={'avg frames/s': all_frames / np.sum(measures)}) DLLogger.flush()
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items,
                  mf_dim=args.factors, mlp_layer_sizes=args.layers,
                  dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.opt_level == "O2":
        model = amp.initialize(model, opt_level=args.opt_level,
                               keep_batchnorm_fp32=False, loss_scale='dynamic')
    model.eval()

    users = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_users)
    items = torch.cuda.LongTensor(args.batch_size).random_(0, args.n_items)

    latencies = []
    for _ in range(args.num_batches):
        torch.cuda.synchronize()
        start = time.time()
        predictions = model(users, items, sigmoid=True)
        torch.cuda.synchronize()
        latencies.append(time.time() - start)

    dllogger.log(data={
        'batch_size': args.batch_size,
        'best_inference_throughput': args.batch_size / min(latencies),
        'best_inference_latency': min(latencies),
        'mean_inference_throughput': args.batch_size / np.mean(latencies),
        'mean_inference_latency': np.mean(latencies),
        'inference_latencies': latencies
    }, step=tuple())
    dllogger.flush()

    return
def end(self):
    for n, m in self.metrics.items():
        m["meter"].reset_epoch()

    verbositys = {m["level"] for _, m in self.metrics.items()}
    for ll in verbositys:
        llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
        dllogger.log(step=tuple(),
                     data={n: m["meter"].get_run() for n, m in llm.items()})

    for n, m in self.metrics.items():
        m["meter"].reset_epoch()
    dllogger.flush()
def main():
    args = parse_args()
    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=args.log_path),
        dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
    ])
    dllogger.log(data=vars(args), step='PARAMETER')

    model = NeuMF(nb_users=args.n_users, nb_items=args.n_items,
                  mf_dim=args.factors, mlp_layer_sizes=args.layers,
                  dropout=args.dropout)
    model = model.cuda()

    if args.load_checkpoint_path:
        state_dict = torch.load(args.load_checkpoint_path)
        model.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    model.eval()

    batch_sizes = [int(s) for s in args.batch_sizes.split(',')]

    result_data = {}
    for batch_size in batch_sizes:
        print('benchmarking batch size: ', batch_size)
        users = torch.cuda.LongTensor(batch_size).random_(0, args.n_users)
        items = torch.cuda.LongTensor(batch_size).random_(0, args.n_items)

        latencies = []
        for _ in range(args.num_batches):
            torch.cuda.synchronize()
            start = time.time()
            _ = model(users, items, sigmoid=True)
            torch.cuda.synchronize()
            latencies.append(time.time() - start)

        result_data[f'batch_{batch_size}_mean_throughput'] = batch_size / np.mean(latencies)
        result_data[f'batch_{batch_size}_mean_latency'] = np.mean(latencies)
        # np.percentile takes percentages in [0, 100], not fractions
        result_data[f'batch_{batch_size}_p90_latency'] = np.percentile(latencies, 90)
        result_data[f'batch_{batch_size}_p95_latency'] = np.percentile(latencies, 95)
        result_data[f'batch_{batch_size}_p99_latency'] = np.percentile(latencies, 99)

    dllogger.log(data=result_data, step=tuple())
    dllogger.flush()

    return
def validate(args, trainer, datasets, subsets):
    """Evaluate the model on the validation set(s) and return the losses."""
    # Reset value iterations counter
    trainer._num_val_iterations = 0

    valid_losses = []
    for subset in subsets:
        if len(subsets) > 1:
            print('Validating on \'{}\' subset'.format(subset))

        # Initialize data iterator
        itr = data.EpochBatchIterator(
            dataset=datasets[subset],
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences_valid,
            max_positions=args.max_positions,
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=8,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
        ).next_epoch_itr(shuffle=False)

        # reset validation loss meters
        DLLogger.flush()

        subset_losses = []
        for sample in itr:
            loss = trainer.valid_step(sample)
            subset_losses.append(loss)
        subset_loss = sum(subset_losses) / len(subset_losses)

        DLLogger.flush()

        valid_losses.append(subset_loss)
        print(f'Validation loss on subset {subset}: {subset_loss}')

    return valid_losses
def end_iteration(self, mode='train'):
    if mode == 'val':
        it = self.val_iteration
    elif mode == 'train':
        it = self.iteration
    elif mode == 'calib':
        it = self.calib_iteration

    if it % self.print_interval == 0 or mode == 'calib':
        metrics = {
            n: m for n, m in self.metrics.items() if n.startswith(mode)
        }

        if mode == 'train':
            step = (self.epoch, self.iteration)
        elif mode == 'val':
            step = (self.epoch, self.iteration, self.val_iteration)
        elif mode == 'calib':
            step = ('Calibration', self.calib_iteration)

        verbositys = {m["level"] for _, m in metrics.items()}
        for ll in verbositys:
            llm = {n: m for n, m in metrics.items() if m["level"] == ll}
            dllogger.log(
                step=step,
                data={n: m["meter"].get_iteration() for n, m in llm.items()},
                verbosity=ll,
            )

        for n, m in metrics.items():
            m["meter"].reset_iteration()
        dllogger.flush()
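# Hedged sketch of the metric registry that end_iteration()/end() above assume.
# The method name and meter protocol (get_iteration/reset_iteration/get_run/
# reset_epoch) are inferred from how the dicts are used; treat this as an
# illustrative assumption rather than the original implementation.
def register_metric(self, name, meter, verbosity=0):
    # Each entry pairs a meter with the dllogger verbosity it is reported at.
    self.metrics[name] = {"meter": meter, "level": verbosity}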
def main(args):
    tts = TTS(args)
    tts.load_model()

    # Read the input texts; exit with an error message if the file is unreadable
    try:
        with open(args.input, 'r') as f:
            texts = f.readlines()
    except IOError:
        print("Could not read file")
        sys.exit(1)

    audios, alignments = tts.forward(texts)

    for i, audio in enumerate(audios):
        audio_path = args.output + "audio_" + str(i) + "_" + args.suffix + ".wav"
        write(audio_path, args.sampling_rate, audio.cpu().numpy())
        # plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")
        # figure_path = args.output+"alignment_"+str(i)+"_"+args.suffix+".png"
        # plt.savefig(figure_path)

    DLLogger.flush()
def train(args, trainer, epoch_itr):
    """Train the model for one epoch."""
    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr()

    # update parameters every N batches
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    max_update = args.max_update or math.inf
    num_batches = len(epoch_itr)
    begin = time.time()

    # reset meters
    DLLogger.flush()
    trainer.get_throughput_meter().reset()

    for i, sample in enumerate(itr):
        if i < num_batches - 1 and (i + 1) % update_freq > 0:
            # buffer updates according to --update-freq
            trainer.train_step(sample, update_params=False,
                               last_step=(i == len(itr) - 1))
            continue
        else:
            trainer.train_step(sample, update_params=True,
                               last_step=(i == len(itr) - 1))

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_throughput_meter().reset()
            reset_perf_meters()

        if (i + 1) % args.log_interval == 0:
            DLLogger.flush()

        if trainer.get_num_updates() >= max_update:
            break

    print('Epoch time:', time.time() - begin)

    # Print epoch stats and reset training meters
    DLLogger.log(step=trainer.get_num_updates(),
                 data={'speed': trainer.get_throughput_meter().avg},
                 verbosity=0)
    DLLogger.flush()
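# Hedged sketch of the reset_perf_meters() helper called above. It mirrors the
# inline backend loop used by the variant of train() later in this collection;
# the AggregatorBackend name and _reset_perf_meter calls are taken from that
# snippet, not independently defined here.
def reset_perf_meters():
    for backend in DLLogger.GLOBAL_LOGGER.backends:
        if isinstance(backend, AggregatorBackend):
            backend._reset_perf_meter('tokens')
            backend._reset_perf_meter('updates')
            break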
def main(): parser = argparse.ArgumentParser( description='TensorRT Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() # initialize CUDA state torch.cuda.init() TRT_LOGGER = trt.Logger(trt.Logger.WARNING) encoder = load_engine(args.encoder, TRT_LOGGER) decoder_iter = load_engine(args.decoder, TRT_LOGGER) postnet = load_engine(args.postnet, TRT_LOGGER) waveglow = load_engine(args.waveglow, TRT_LOGGER) if args.waveglow_ckpt != "": # setup denoiser using WaveGlow PyTorch checkpoint waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt, True, forward_is_infer=True) denoiser = Denoiser(waveglow_ckpt).cuda() # after initialization, we don't need WaveGlow PyTorch checkpoint # anymore - deleting del waveglow_ckpt torch.cuda.empty_cache() # create TRT contexts for each engine encoder_context = encoder.create_execution_context() decoder_context = decoder_iter.create_execution_context() postnet_context = postnet.create_execution_context() waveglow_context = waveglow.create_execution_context() DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.output+'/'+args.log_file), StdOutBackend(Verbosity.VERBOSE)]) texts = [] try: f = open(args.input, 'r') texts = f.readlines() except: print("Could not read file") sys.exit(1) measurements = {} sequences, sequence_lengths = prepare_input_sequence(texts) sequences = sequences.to(torch.int32) sequence_lengths = sequence_lengths.to(torch.int32) with MeasureTime(measurements, "latency"): mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet, encoder_context, decoder_context, postnet_context, sequences, sequence_lengths, measurements, args.fp16) audios = infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, args.fp16) with encoder_context, decoder_context, postnet_context, waveglow_context: pass audios = audios.float() if args.waveglow_ckpt != "": with MeasureTime(measurements, "denoiser"): audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) for i, audio in enumerate(audios): audio = audio[:mel_lengths[i]*args.stft_hop_length] audio = audio/torch.max(torch.abs(audio)) audio_path = args.output + "audio_"+str(i)+"_trt.wav" write(audio_path, args.sampling_rate, audio.cpu().numpy()) DLLogger.log(step=0, data={"tacotron2_encoder_latency": measurements['tacotron2_encoder_time']}) DLLogger.log(step=0, data={"tacotron2_decoder_latency": measurements['tacotron2_decoder_time']}) DLLogger.log(step=0, data={"tacotron2_postnet_latency": measurements['tacotron2_postnet_time']}) DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']}) DLLogger.log(step=0, data={"latency": measurements['latency']}) if args.waveglow_ckpt != "": DLLogger.log(step=0, data={"denoiser": measurements['denoiser']}) DLLogger.flush() prec = "fp16" if args.fp16 else "fp32" latency = measurements['latency'] throughput = audios.size(1)/latency log_data = "1,"+str(sequence_lengths[0].item())+","+prec+","+str(latency)+","+str(throughput)+","+str(mel_lengths[0].item())+"\n" with open("log_bs1_"+prec+".log", 'a') as f: f.write(log_data)
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU or CPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, unknown_args = parser.parse_known_args() DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE)]) for k,v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k:v}) DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'}) measurements_all = {"pre_processing": [], "tacotron2_latency": [], "waveglow_latency": [], "latency": [], "type_conversion": [], "data_transfer": [], "storage": [], "tacotron2_items_per_sec": [], "waveglow_items_per_sec": [], "num_mels_per_audio": [], "throughput": []} print("args:", args, unknown_args) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, args.cpu_run, forward_is_infer=True) waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, args.cpu_run) if args.cpu_run: denoiser = Denoiser(waveglow, args.cpu_run) else: denoiser = Denoiser(waveglow, args.cpu_run).cuda() jitted_tacotron2 = torch.jit.script(tacotron2) texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."] texts = [texts[0][:args.input_length]] texts = texts*args.batch_size warmup_iters = 3 for iter in range(args.num_iters): measurements = {} with MeasureTime(measurements, "pre_processing", args.cpu_run): sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu_run) with torch.no_grad(): with MeasureTime(measurements, "latency", args.cpu_run): with MeasureTime(measurements, "tacotron2_latency", args.cpu_run): mel, mel_lengths, _ = jitted_tacotron2(sequences_padded, input_lengths) with MeasureTime(measurements, "waveglow_latency", args.cpu_run): audios = waveglow.infer(mel, sigma=args.sigma_infer) audios = audios.float() audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) num_mels = mel.size(0)*mel.size(2) num_samples = audios.size(0)*audios.size(1) with MeasureTime(measurements, "type_conversion", args.cpu_run): audios = audios.float() with MeasureTime(measurements, "data_transfer", args.cpu_run): audios = audios.cpu() with MeasureTime(measurements, "storage", args.cpu_run): audios = audios.numpy() for i, audio in enumerate(audios): audio_path = "audio_"+str(i)+".wav" write(audio_path, args.sampling_rate, audio[:mel_lengths[i]*args.stft_hop_length]) measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency'] measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency'] measurements['num_mels_per_audio'] = mel.size(2) measurements['throughput'] = num_samples/measurements['latency'] if iter >= warmup_iters: for k,v in measurements.items(): measurements_all[k].append(v) DLLogger.log(step=(iter-warmup_iters), data={k: v}) DLLogger.flush() print_stats(measurements_all)
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.amp_run, forward_is_infer=True) waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.amp_run, forward_is_infer=True) denoiser = Denoiser(waveglow).cuda() jitted_tacotron2 = torch.jit.script(tacotron2) texts = [] try: f = open(args.input, 'r') texts = f.readlines() except: print("Could not read file") sys.exit(1) if args.include_warmup: sequence = torch.randint(low=0, high=148, size=(1, 50), dtype=torch.long).cuda() input_lengths = torch.IntTensor([sequence.size(1)]).cuda().long() for i in range(3): with torch.no_grad(): mel, mel_lengths = jitted_tacotron2(sequence, input_lengths) _ = waveglow(mel) measurements = {} sequences_padded, input_lengths = prepare_input_sequence(texts) with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"): mel, mel_lengths = jitted_tacotron2(sequences_padded, input_lengths) with torch.no_grad(), MeasureTime(measurements, "waveglow_time"): audios = waveglow(mel, sigma=args.sigma_infer) audios = audios.float() audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) print("Stopping after", mel.size(2), "decoder steps") tacotron2_infer_perf = mel.size(0) * mel.size( 2) / measurements['tacotron2_time'] waveglow_infer_perf = audios.size(0) * audios.size( 1) / measurements['waveglow_time'] DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf}) DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']}) DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf}) DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']}) DLLogger.log(step=0, data={ "latency": (measurements['tacotron2_time'] + measurements['waveglow_time']) }) for i, audio in enumerate(audios): audio = audio[:mel_lengths[i] * args.stft_hop_length] audio = audio / torch.max(torch.abs(audio)) audio_path = args.output + "audio_" + str(i) + ".wav" write(audio_path, args.sampling_rate, audio.cpu().numpy()) DLLogger.flush()
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU. """ parser = argparse.ArgumentParser(description='PyTorch FastPitch Inference', allow_abbrev=False) parser = parse_args(parser) args, unk_args = parser.parse_known_args() if args.p_arpabet > 0.0: cmudict.initialize(args.cmudict_path, keep_ambiguous=True) torch.backends.cudnn.benchmark = args.cudnn_benchmark if args.output is not None: Path(args.output).mkdir(parents=False, exist_ok=True) log_fpath = args.log_file or str(Path(args.output, 'nvlog_infer.json')) log_fpath = unique_log_fpath(log_fpath) DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath), StdOutBackend(Verbosity.VERBOSE, metric_format=stdout_metric_format)]) init_inference_metadata() [DLLogger.log("PARAMETER", {k: v}) for k, v in vars(args).items()] device = torch.device('cuda' if args.cuda else 'cpu') if args.fastpitch != 'SKIP': generator = load_and_setup_model( 'FastPitch', parser, args.fastpitch, args.amp, device, unk_args=unk_args, forward_is_infer=True, ema=args.ema, jitable=args.torchscript) if args.torchscript: generator = torch.jit.script(generator) else: generator = None if args.waveglow != 'SKIP': with warnings.catch_warnings(): warnings.simplefilter("ignore") waveglow = load_and_setup_model( 'WaveGlow', parser, args.waveglow, args.amp, device, unk_args=unk_args, forward_is_infer=True, ema=args.ema) denoiser = Denoiser(waveglow).to(device) waveglow = getattr(waveglow, 'infer', waveglow) else: waveglow = None if len(unk_args) > 0: raise ValueError(f'Invalid options {unk_args}') fields = load_fields(args.input) batches = prepare_input_sequence( fields, device, args.symbol_set, args.text_cleaners, args.batch_size, args.dataset_path, load_mels=(generator is None), p_arpabet=args.p_arpabet) # Use real data rather than synthetic - FastPitch predicts len for _ in tqdm(range(args.warmup_steps), 'Warmup'): with torch.no_grad(): if generator is not None: b = batches[0] mel, *_ = generator(b['text']) if waveglow is not None: audios = waveglow(mel, sigma=args.sigma_infer).float() _ = denoiser(audios, strength=args.denoising_strength) gen_measures = MeasureTime(cuda=args.cuda) waveglow_measures = MeasureTime(cuda=args.cuda) gen_kw = {'pace': args.pace, 'speaker': args.speaker, 'pitch_tgt': None, 'pitch_transform': build_pitch_transformation(args)} if args.torchscript: gen_kw.pop('pitch_transform') print('NOTE: Pitch transforms are disabled with TorchScript') all_utterances = 0 all_samples = 0 all_letters = 0 all_frames = 0 reps = args.repeats log_enabled = reps == 1 log = lambda s, d: DLLogger.log(step=s, data=d) if log_enabled else None for rep in (tqdm(range(reps), 'Inference') if reps > 1 else range(reps)): for b in batches: if generator is None: log(rep, {'Synthesizing from ground truth mels'}) mel, mel_lens = b['mel'], b['mel_lens'] else: with torch.no_grad(), gen_measures: mel, mel_lens, *_ = generator(b['text'], **gen_kw) gen_infer_perf = mel.size(0) * mel.size(2) / gen_measures[-1] all_letters += b['text_lens'].sum().item() all_frames += mel.size(0) * mel.size(2) log(rep, {"fastpitch_frames/s": gen_infer_perf}) log(rep, {"fastpitch_latency": gen_measures[-1]}) if args.save_mels: for i, mel_ in enumerate(mel): m = mel_[:, :mel_lens[i].item()].permute(1, 0) fname = b['output'][i] if 'output' in b else f'mel_{i}.npy' mel_path = Path(args.output, Path(fname).stem + '.npy') np.save(mel_path, m.cpu().numpy()) if waveglow is not None: with torch.no_grad(), waveglow_measures: audios = waveglow(mel, 
sigma=args.sigma_infer) audios = denoiser(audios.float(), strength=args.denoising_strength ).squeeze(1) all_utterances += len(audios) all_samples += sum(audio.size(0) for audio in audios) waveglow_infer_perf = ( audios.size(0) * audios.size(1) / waveglow_measures[-1]) log(rep, {"waveglow_samples/s": waveglow_infer_perf}) log(rep, {"waveglow_latency": waveglow_measures[-1]}) if args.output is not None and reps == 1: for i, audio in enumerate(audios): audio = audio[:mel_lens[i].item() * args.stft_hop_length] if args.fade_out: fade_len = args.fade_out * args.stft_hop_length fade_w = torch.linspace(1.0, 0.0, fade_len) audio[-fade_len:] *= fade_w.to(audio.device) audio = audio / torch.max(torch.abs(audio)) fname = b['output'][i] if 'output' in b else f'audio_{i}.wav' audio_path = Path(args.output, fname) write(audio_path, args.sampling_rate, audio.cpu().numpy()) if generator is not None and waveglow is not None: log(rep, {"latency": (gen_measures[-1] + waveglow_measures[-1])}) log_enabled = True if generator is not None: gm = np.sort(np.asarray(gen_measures)) rtf = all_samples / (all_utterances * gm.mean() * args.sampling_rate) log((), {"avg_fastpitch_letters/s": all_letters / gm.sum()}) log((), {"avg_fastpitch_frames/s": all_frames / gm.sum()}) log((), {"avg_fastpitch_latency": gm.mean()}) log((), {"avg_fastpitch_RTF": rtf}) log((), {"90%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.90) / 2) * gm.std()}) log((), {"95%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.95) / 2) * gm.std()}) log((), {"99%_fastpitch_latency": gm.mean() + norm.ppf((1.0 + 0.99) / 2) * gm.std()}) if waveglow is not None: wm = np.sort(np.asarray(waveglow_measures)) rtf = all_samples / (all_utterances * wm.mean() * args.sampling_rate) log((), {"avg_waveglow_samples/s": all_samples / wm.sum()}) log((), {"avg_waveglow_latency": wm.mean()}) log((), {"avg_waveglow_RTF": rtf}) log((), {"90%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.90) / 2) * wm.std()}) log((), {"95%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.95) / 2) * wm.std()}) log((), {"99%_waveglow_latency": wm.mean() + norm.ppf((1.0 + 0.99) / 2) * wm.std()}) if generator is not None and waveglow is not None: m = gm + wm rtf = all_samples / (all_utterances * m.mean() * args.sampling_rate) log((), {"avg_samples/s": all_samples / m.sum()}) log((), {"avg_letters/s": all_letters / m.sum()}) log((), {"avg_latency": m.mean()}) log((), {"avg_RTF": rtf}) log((), {"90%_latency": m.mean() + norm.ppf((1.0 + 0.90) / 2) * m.std()}) log((), {"95%_latency": m.mean() + norm.ppf((1.0 + 0.95) / 2) * m.std()}) log((), {"99%_latency": m.mean() + norm.ppf((1.0 + 0.99) / 2) * m.std()}) DLLogger.flush()
                         data=OrderedDict([
                             ('train_loss', reduced_loss),
                             ('train_mel_frames_per_sec',
                              (train_epoch_items_per_sec / num_iters if num_iters > 0 else 0.0)),
                             ('train_epoch_time', epoch_time)]))

        val_loss = validate(model, criterion, valset, epoch, total_iter,
                            args.batch_size, world_size, collate_fn,
                            distributed_run, local_rank, batch_to_gpu)
        val_tblogger.log_value(total_iter, 'loss', val_loss)

        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, epoch, model_config,
                            args.amp, args.output, args.model_name,
                            local_rank, world_size)
        if local_rank == 0:
            DLLogger.flush()

    # ---------- Finished training --------------------------------------------

    save_checkpoint(model, optimizer, epoch, model_config, args.amp,
                    args.output, args.model_name, local_rank, world_size)

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    DLLogger.log(step=tuple(), data={'run_time': int(run_time)})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    DLLogger.log(step=tuple(),
                 data={'train_items_per_sec':
                       (int(train_epoch_items_per_sec / num_iters) if num_iters > 0 else 0)})

    if local_rank == 0:
                del train_dataloader
                # thread.join()
                return args, final_loss, train_time_raw, global_step

            del train_dataloader
            # thread.join()
            # Make sure pool has finished and switch train_dataloader
            # NOTE: Will block until complete
            train_dataloader, data_file = dataset_future.result(timeout=None)

        epoch += 1


if __name__ == "__main__":
    now = time.time()
    args, final_loss, train_time_raw, global_step = main()
    gpu_count = args.n_gpu
    global_step += args.phase1_end_step if (args.phase2 and args.resume_step > 0) else 0
    if args.resume_step == -1:
        args.resume_step = 0
    if torch.distributed.is_initialized():
        gpu_count = get_world_size()
    if is_main_process():
        e2e_time = time.time() - now
        training_perf = args.train_batch_size * args.gradient_accumulation_steps * gpu_count \
            * (global_step - args.resume_step + skipped_steps) / train_time_raw
        dllogger.log(step=tuple(),
                     data={"e2e_train_time": e2e_time,
                           "training_sequences_per_second": training_perf,
                           "final_loss": final_loss,
                           "raw_train_time": train_time_raw})
    dllogger.flush()
def main(args): args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd logger.info("Waiting for debugger attach") ptvsd.enable_attach( address=(args.server_ip, args.server_port), redirect_output=True, ) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs. if not torch.distributed.is_initialized(): torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, " "16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16, )) if not args.do_train and not args.do_eval and not args.do_predict: raise ValueError("At least one of `do_train`, `do_eval` or " "`do_predict` must be True.") if is_main_process(): if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): logger.warning("Output directory ({}) already exists and is not " "empty.".format(args.output_dir)) mkdir_by_main_process(args.output_dir) if is_main_process(): dllogger.init(backends=[ dllogger.JSONStreamBackend( verbosity=dllogger.Verbosity.VERBOSE, filename=os.path.join(args.output_dir, 'dllogger.json'), ), dllogger.StdOutBackend( verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step, ), ]) else: dllogger.init(backends=[]) dllogger.log(step="PARAMETER", data={"Config": [str(args)]}) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( args.gradient_accumulation_steps)) if args.gradient_accumulation_steps > args.train_batch_size: raise ValueError("gradient_accumulation_steps ({}) cannot be larger " "train_batch_size ({}) - there cannot be a fraction " "of one sample.".format( args.gradient_accumulation_steps, args.train_batch_size, )) args.train_batch_size = (args.train_batch_size // args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) dllogger.log(step="PARAMETER", data={"SEED": args.seed}) processor = PROCESSORS[args.task_name]() num_labels = len(processor.get_labels()) #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer( args.vocab_file, do_lower_case=args.do_lower_case, max_len=512, ) # for bert large num_train_optimization_steps = None if args.do_train: train_features = get_train_features( args.data_dir, args.bert_model, args.max_seq_length, args.do_lower_case, args.local_rank, args.train_batch_size, args.gradient_accumulation_steps, args.num_train_epochs, tokenizer, processor, ) num_train_optimization_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = (num_train_optimization_steps // torch.distributed.get_world_size()) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) # modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = 
modeling.BertForSequenceClassification( config, num_labels=num_labels, ) logger.info("USING CHECKPOINT from {}".format(args.init_checkpoint)) checkpoint = torch.load(args.init_checkpoint, map_location='cpu') checkpoint = checkpoint["model"] if "model" in checkpoint.keys( ) else checkpoint model.load_state_dict(checkpoint, strict=False) logger.info("USED CHECKPOINT from {}".format(args.init_checkpoint)) dllogger.log( step="PARAMETER", data={ "num_parameters": sum([p.numel() for p in model.parameters() if p.requires_grad]), }, ) model.to(device) # Prepare optimizer model, optimizer, scheduler = init_optimizer_and_amp( model, args.learning_rate, args.loss_scale, args.warmup_proportion, num_train_optimization_steps, args.fp16, ) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex to use " "distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) loss_fct = torch.nn.CrossEntropyLoss() results = {} if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) train_data = gen_tensor_dataset(train_features) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size, ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 latency_train = 0.0 nb_tr_examples = 0 model.train() tic_train = time.perf_counter() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask) loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT # which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 latency_train = time.perf_counter() - tic_train tr_loss = tr_loss / nb_tr_steps results.update({ 'global_step': global_step, 'train:loss': tr_loss, 'train:latency': latency_train, 'train:num_samples_per_gpu': nb_tr_examples, 'train:num_steps': nb_tr_steps, 'train:throughput': get_world_size() * nb_tr_examples / latency_train, }) if is_main_process() and not args.skip_checkpoint: model_to_save = model.module if hasattr(model, 'module') else model torch.save( {"model": model_to_save.state_dict()}, os.path.join(args.output_dir, modeling.WEIGHTS_NAME), ) with open( os.path.join(args.output_dir, modeling.CONFIG_NAME), 'w', ) as f: f.write(model_to_save.config.to_json_string()) if (args.do_eval or args.do_predict) and is_main_process(): eval_examples = processor.get_dev_examples(args.data_dir) eval_features, label_map = convert_examples_to_features( eval_examples, processor.get_labels(), args.max_seq_length, tokenizer, ) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_data = gen_tensor_dataset(eval_features) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, ) model.eval() preds = None out_label_ids = None eval_loss = 0 nb_eval_steps, nb_eval_examples = 0, 0 cuda_events = [(torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)) for _ in range(len(eval_dataloader))] for i, (input_ids, input_mask, segment_ids, label_ids) in tqdm( enumerate(eval_dataloader), desc="Evaluating", ): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): cuda_events[i][0].record() logits = model(input_ids, segment_ids, input_mask) cuda_events[i][1].record() if args.do_eval: eval_loss += loss_fct( logits.view(-1, num_labels), label_ids.view(-1), ).mean().item() nb_eval_steps += 1 nb_eval_examples += input_ids.size(0) if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0, ) torch.cuda.synchronize() eval_latencies = [ event_start.elapsed_time(event_end) for event_start, event_end in cuda_events ] eval_latencies = list(sorted(eval_latencies)) def infer_latency_sli(threshold): index = int(len(eval_latencies) * threshold) - 1 index = min(max(index, 0), len(eval_latencies) - 1) return eval_latencies[index] eval_throughput = (args.eval_batch_size / (np.mean(eval_latencies) / 1000)) results.update({ 'eval:num_samples_per_gpu': nb_eval_examples, 'eval:num_steps': nb_eval_steps, 'infer:latency(ms):50%': infer_latency_sli(0.5), 'infer:latency(ms):90%': infer_latency_sli(0.9), 'infer:latency(ms):95%': infer_latency_sli(0.95), 'infer:latency(ms):99%': infer_latency_sli(0.99), 
'infer:latency(ms):100%': infer_latency_sli(1.0), 'infer:latency(ms):avg': np.mean(eval_latencies), 'infer:latency(ms):std': np.std(eval_latencies), 'infer:latency(ms):sum': np.sum(eval_latencies), 'infer:throughput(samples/s):avg': eval_throughput, }) preds = np.argmax(preds, axis=1) if args.do_predict: dump_predictions( os.path.join(args.output_dir, 'predictions.json'), label_map, preds, eval_examples, ) if args.do_eval: results['eval:loss'] = eval_loss / nb_eval_steps eval_result = compute_metrics(args.task_name, preds, out_label_ids) results.update(eval_result) if is_main_process(): logger.info("***** Results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) with open(os.path.join(args.output_dir, "results.txt"), "w") as writer: json.dump(results, writer) dllogger_queries_from_results = { 'exact_match': 'acc', 'F1': 'f1', 'e2e_train_time': 'train:latency', 'training_sequences_per_second': 'train:throughput', 'e2e_inference_time': ('infer:latency(ms):sum', lambda x: x / 1000), 'inference_sequences_per_second': 'infer:throughput(samples/s):avg', } for key, query in dllogger_queries_from_results.items(): results_key, convert = (query if isinstance(query, tuple) else (query, lambda x: x)) if results_key not in results: continue dllogger.log( step=tuple(), data={key: convert(results[results_key])}, ) dllogger.flush() return results
def score(args, trainer, dataset, src_dict, tgt_dict, ref_file): begin = time.time() src_dict = deepcopy( src_dict) # This is necessary, generation of translations tgt_dict = deepcopy( tgt_dict ) # alters target dictionary messing up with the rest of training model = trainer.get_model() # Initialize data iterator itr = data.EpochBatchIterator( dataset=dataset, max_tokens=None, max_sentences=max( 8, min(math.ceil(1024 / args.distributed_world_size), 128)), max_positions=args.max_positions, ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=8, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ).next_epoch_itr(shuffle=False) # Initialize generator gen_timer = StopwatchMeter() translator = SequenceGenerator( [model], tgt_dict.get_metadata(), maxlen=args.max_target_positions - 1, #do not include EOS token beam_size=args.beam, stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized), len_penalty=args.lenpen, unk_penalty=args.unkpen, sampling=args.sampling, sampling_topk=args.sampling_topk, minlen=args.min_len, ) # Generate and compute BLEU dict = dictionary.Dictionary() num_sentences = 0 predictions = [] translations = translator.generate_batched_itr( itr, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b, cuda=True, timer=gen_timer, prefix_size=args.prefix_size, ) for sample_id, src_tokens, target_tokens, hypos in translations: # Process input and grount truth target_tokens = target_tokens.int().cpu() src_str = src_dict.string(src_tokens, args.remove_bpe) target_str = tgt_dict.string(target_tokens, args.remove_bpe, escape_unk=True) # Process top predictions for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo['tokens'].int().cpu(), src_str=src_str, alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=args.remove_bpe) # Score only the top hypothesis if i == 0: if args.sentencepiece: hypo_str = hypo_str.replace(' ', '').replace('▁', ' ') target_str = target_str.replace(' ', '').replace('▁', ' ') sys_tok = tokenizer.Tokenizer.tokenize( (hypo_str.lower() if not args.test_cased_bleu else hypo_str), dict) ref_tok = tokenizer.Tokenizer.tokenize( (target_str.lower() if not args.test_cased_bleu else target_str), dict) if not args.sentencepiece: hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de') predictions.append('{}\t{}'.format(sample_id, hypo_str)) num_sentences += 1 if args.distributed_world_size > 1: predictions = _all_gather_predictions(predictions) with open(os.path.join(args.data, ref_file), 'r') as reference: refs = [reference.readlines()] #reducing indexed predictions as strings is more memory efficient than reducing tuples predictions = [tuple(item.split('\t')) for item in predictions] predictions = [(int(item[0]), item[1]) for item in predictions] predictions.sort(key=lambda tup: tup[0]) predictions = [ hypo[1] + ('\n' if hypo[1][-1] != '\n' else '') for hypo in predictions ] sacrebleu_score = sacrebleu.corpus_bleu( predictions, refs, lowercase=not args.test_cased_bleu).score if args.save_predictions: os.makedirs(os.path.join(args.save_dir, 'predictions'), exist_ok=True) with open( os.path.join( args.save_dir, 'predictions', ref_file + '.pred.update_{}'.format(trainer._num_updates)), 'w') as f: f.write(''.join(predictions)) DLLogger.log(step=trainer.get_num_updates(), data={ 'inference tokens/s': float(args.distributed_world_size) / 
gen_timer.avg }, verbosity=0) DLLogger.flush() if gen_timer.sum != 0: print( '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)' .format(len(predictions), gen_timer.n, gen_timer.sum, len(predictions) / gen_timer.sum, float(args.distributed_world_size) / gen_timer.avg)) print('| Eval completed in: {:.2f}s | {}CASED BLEU {:.2f}'.format( time.time() - begin, '' if args.test_cased_bleu else 'UN', sacrebleu_score)) return sacrebleu_score
def main(): hvd.init() parser = ArgumentParser(description="Train a Variational Autoencoder for Collaborative Filtering in TensorFlow") parser.add_argument('--train', action='store_true', help='Run training of VAE') parser.add_argument('--test', action='store_true', help='Run validation of VAE') parser.add_argument('--inference', action='store_true', help='Run inference on a single random example.' 'This can also be used to measure the latency for a batch size of 1') parser.add_argument('--inference_benchmark', action='store_true', help='Benchmark the inference throughput on a very large batch size') parser.add_argument('--use_tf_amp', action='store_true', help='Enable Automatic Mixed Precision') parser.add_argument('--epochs', type=int, default=400, help='Number of epochs to train') parser.add_argument('--batch_size_train', type=int, default=24576, help='Global batch size for training') parser.add_argument('--batch_size_validation', type=int, default=10000, help='Used both for validation and testing') parser.add_argument('--validation_step', type=int, default=50, help='Train epochs for one validation') parser.add_argument('--warm_up_epochs', type=int, default=5, help='Number of epochs to omit during benchmark') parser.add_argument('--total_anneal_steps', type=int, default=15000, help='Number of annealing steps') parser.add_argument('--anneal_cap', type=float, default=0.1, help='Annealing cap') parser.add_argument('--lam', type=float, default=1.00, help='Regularization parameter') parser.add_argument('--lr', type=float, default=0.004, help='Learning rate') parser.add_argument('--beta1', type=float, default=0.90, help='Adam beta1') parser.add_argument('--beta2', type=float, default=0.90, help='Adam beta2') parser.add_argument('--top_results', type=int, default=100, help='Number of results to be recommended') parser.add_argument('--xla', action='store_true', default=False, help='Enable XLA') parser.add_argument('--trace', action='store_true', default=False, help='Save profiling traces') parser.add_argument('--activation', type=str, default='tanh', help='Activation function') parser.add_argument('--log_path', type=str, default='./vae_cf.log', help='Path to the detailed training log to be created') parser.add_argument('--seed', type=int, default=0, help='Random seed for TensorFlow and numpy') parser.add_argument('--data_dir', default='/data', type=str, help='Directory for storing the training data') parser.add_argument('--checkpoint_dir', type=str, default=None, help='Path for saving a checkpoint after the training') args = parser.parse_args() if args.batch_size_train % hvd.size() != 0: raise ValueError('Global batch size should be a multiple of the number of workers') args.local_batch_size = args.batch_size_train // hvd.size() logger = logging.getLogger("VAE") if hvd.rank() == 0: logger.setLevel(logging.INFO) dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)]) else: dllogger.init(backends=[]) logger.setLevel(logging.ERROR) dllogger.log(data=vars(args), step='PARAMETER') np.random.seed(args.seed) tf.set_random_seed(args.seed) # Suppress TF warnings os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # set AMP os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1' if args.use_tf_amp else '0' # load dataset (train_data, validation_data_input, validation_data_true, test_data_input, test_data_true) = load_and_parse_ML_20M(args.data_dir) # make sure all dims and sizes are divisible by 8 
number_of_train_users, number_of_items = train_data.shape number_of_items = round_8(number_of_items) for data in [train_data, validation_data_input, validation_data_true, test_data_input, test_data_true]: number_of_users, _ = data.shape data.resize(number_of_users, number_of_items) number_of_users, number_of_items = train_data.shape encoder_dims = [number_of_items, 600, 200] vae = VAE(train_data, encoder_dims, total_anneal_steps=args.total_anneal_steps, anneal_cap=args.anneal_cap, batch_size_train=args.local_batch_size, batch_size_validation=args.batch_size_validation, lam=args.lam, lr=args.lr, beta1=args.beta1, beta2=args.beta2, activation=args.activation, xla=args.xla, checkpoint_dir=args.checkpoint_dir, trace=args.trace, top_results=args.top_results) metrics = {'ndcg@100': partial(ndcg, R=100), 'recall@20': partial(recall, R=20), 'recall@50': partial(recall, R=50)} if args.train: vae.train(n_epochs=args.epochs, validation_data_input=validation_data_input, validation_data_true=validation_data_true, metrics=metrics, validation_step=args.validation_step) if args.test and hvd.size() <= 1: test_results = vae.test(test_data_input=test_data_input, test_data_true=test_data_true, metrics=metrics) for k, v in test_results.items(): print("{}:\t{}".format(k, v)) elif args.test and hvd.size() > 1: print("Testing is not supported with horovod multigpu yet") if args.inference_benchmark and hvd.size() <= 1: # use the train data to get accurate throughput numbers for inference # the test and validation sets are too small to measure this accurately # vae.inference_benchmark() _ = vae.test(test_data_input=train_data, test_data_true=train_data, metrics={}) elif args.test and hvd.size() > 1: print("Testing is not supported with horovod multigpu yet") if args.inference: input_data = np.random.randint(low=0, high=10000, size=10) recommendations = vae.query(input_data=input_data) print('Recommended item indices: ', recommendations) vae.close_session() dllogger.flush()
def main(): """ Launches inference benchmark. Inference is executed on a single GPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = args.log_file DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) model = load_and_setup_model(args.model_name, parser, None, args.amp_run, forward_is_infer=True) if args.model_name == "Tacotron2": model = torch.jit.script(model) warmup_iters = 3 num_iters = 1 + warmup_iters for i in range(num_iters): measurements = {} if args.model_name == 'Tacotron2': text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 140), dtype=torch.long).cuda() input_lengths = torch.IntTensor([text_padded.size(1)] * args.batch_size).cuda().long() with torch.no_grad(), MeasureTime(measurements, "inference_time"): mels, _, _ = model(text_padded, input_lengths) num_items = mels.size(0) * mels.size(2) if args.model_name == 'WaveGlow': n_mel_channels = model.upsample.in_channels num_mels = 895 mel_padded = torch.zeros(args.batch_size, n_mel_channels, num_mels).normal_(-5.62, 1.98).cuda() if args.amp_run: mel_padded = mel_padded.half() with torch.no_grad(), MeasureTime(measurements, "inference_time"): audios = model(mel_padded) audios = audios.float() num_items = audios.size(0) * audios.size(1) if i >= warmup_iters: DLLogger.log(step=(i - warmup_iters, ), data={"latency": measurements['inference_time']}) DLLogger.log(step=(i - warmup_iters, ), data={ "items_per_sec": num_items / measurements['inference_time'] }) DLLogger.log(step=tuple(), data={'infer_latency': measurements['inference_time']}) DLLogger.log(step=tuple(), data={ 'infer_items_per_sec': num_items / measurements['inference_time'] }) DLLogger.flush()
def run_generate(verbose=True): """ Takes input text, generates output, and then using reference calculates the BLEU scores. The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed. Args: verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout Returns: a tuple: ``(scores, params}`` - ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}`` - ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}`` """ parser = argparse.ArgumentParser() parser.add_argument("model_path", type=str, help="like facebook/bart-large-cnn or path to ckpt") parser.add_argument("config_path", type=str, help="path to config") parser.add_argument("data_dir", type=str, help="like cnn_dm/test.source") parser.add_argument("save_path", type=str, help="where to save summaries") parser.add_argument("--type_path", type=str, required=False, default="test", help="like cnn_dm/test.target") parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.") parser.add_argument("--prefix", type=str, required=False, default=None, help="will be added to the begininng of src examples") parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics") parser.add_argument("--bs", type=int, default=8, required=False, help="batch size") parser.add_argument("--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all.") parser.add_argument("--num_return_sequences", type=int, default=1, required=False, help="How many sequences to return") parser.add_argument("--fp16", action="store_true") parser.add_argument("--dump-args", action="store_true", help="print the custom hparams with the results") parser.add_argument( "--info", nargs="?", type=str, const=datetime_now(), help= "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.", ) parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens") parser.add_argument( "--eval_beams", type=int, default=None, required=False, help="# beams to use. 0 corresponds to not using beam search.") parser.add_argument( "--max_source_length", default=1024, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--max_target_length", default=142, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument( "--sync_timeout", type=int, default=600, required=False, help= "How long should master process wait for other processes to finish.", ) parser.add_argument("--debug", action="store_true") parser.add_argument('--json-summary', type=str, default="results/dllogger.json", help='If provided, the json summary will be written to' 'the specified file.') parser.add_argument( '--distill', type=str, default=None, help="string indicating how model is distilled, only sft supported", choices=["sft", None]) parser.add_argument( '--layers', type=str, default=None, help= "string indicating which teacher layers remain, split by '-' (ex. 
0-6-11)" ) parser.add_argument('--do_encoder', action="store_true", default=False, help="if true encoder distilled") parser.add_argument('--do_decoder', action="store_true", default=False, help="if true decoder distilled") dist = parser.add_argument_group('distributed setup') dist.add_argument('--local_rank', type=int, default=os.getenv('LOCAL_RANK', 0), help='Used for multi-process training.') start_time = time.time() # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate args, rest = parser.parse_known_args() parsed_args = parse_numeric_n_bool_cl_kwargs(rest) if args.local_rank <= 0: print(args) print(rest) # Initialize device and distributed backend utils.distributed_utils.init_distributed(args.device == "cuda") if utils.distributed_utils.get_world_size() > 1: utils.distributed_utils.set_affinity(args.local_rank) torch.cuda.set_device(args.local_rank) if Path(args.json_summary).exists(): warnings.warn( f"json_summary {args.json_summary} will be overwritten unless you type ctrl-c." ) if utils.distributed_utils.get_rank() == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step) ]) else: dllogger.init(backends=[]) if parsed_args and verbose: print(f"parsed the following generate kwargs: {parsed_args}") Path(args.save_path).parent.mkdir(exist_ok=True) json_save_path = Path(args.save_path + "/tmp") Path(json_save_path).mkdir(exist_ok=True) # this handles locking. if args.layers: num_layers = len(args.layers.split('-')) else: num_layers = None results, num_replicas, runtime_metrics = generate_summaries_or_translations( args.data_dir, json_save_path, args.model_path, args.config_path, batch_size=args.bs, device=args.device, fp16=args.fp16, task=args.task, prefix=args.prefix, eval_beams=args.eval_beams, max_source_length=args.max_source_length, max_target_length=args.max_target_length, eval_max_gen_length=args.eval_max_gen_length, n_obs=args.n_obs, type_path=args.type_path, num_return_sequences=args.num_return_sequences, distill=args.distill, num_layers=num_layers, do_encoder=args.do_encoder, do_decoder=args.do_decoder, **parsed_args, ) if args.local_rank <= 0: save_path = Path(args.save_path) save_path.mkdir(exist_ok=True) partial_results = gather_results_from_each_node( num_replicas, json_save_path, args.sync_timeout) preds, time_list = combine_partial_results(partial_results) if args.num_return_sequences > 1: save_path = save_path.joinpath("pseudolabel_results.json") print( f"Saving aggregated results at {save_path}, intermediate in {json_save_path}/" ) save_json(preds, save_path) return tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target") labels = [x.rstrip() for x in open(tgt_file).readlines()][:len(preds)] # Calculate metrics, save metrics, and save _generations.txt calc_bleu = "translation" in args.task score_fn = calculate_bleu if calc_bleu else calculate_rouge metric_name = "bleu" if calc_bleu else "rouge" metrics: Dict = score_fn(preds, labels) metrics["n_obs"] = len(preds) runtime = time.time() - start_time metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4) metrics["n_gpus"] = num_replicas metrics.update(runtime_metrics) time_list.sort() metrics["inference_latency_mean"] = np.mean(time_list) metrics["inference_latency_conf_50"] = max( time_list[:int(len(time_list) * 0.50)]) metrics["inference_latency_conf_90"] = max( time_list[:int(len(time_list) * 0.90)]) 
metrics["inference_latency_conf_95"] = max( time_list[:int(len(time_list) * 0.95)]) metrics["inference_latency_conf_99"] = max( time_list[:int(len(time_list) * 0.99)]) metrics["inference_latency_conf_100"] = max( time_list[:int(len(time_list) * 1)]) metrics["inference_throughput_mean"] = len(preds) * 1.0 / sum( time_list) metrics_save_path = save_path.joinpath( f"{args.type_path}_{metric_name}.json") save_json(metrics, metrics_save_path, indent=None) dllogger.log(step=tuple(), data=metrics) print(metrics) write_txt_file(preds, save_path.joinpath(f"{args.type_path}_generations.txt")) if args.debug: write_txt_file(labels, save_path.joinpath(f"{args.type_path}.target")) else: shutil.rmtree(json_save_path) dllogger.flush()
def train(args, trainer, datasets, epoch_itr):
    """Train the model for one epoch."""
    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr()

    # update parameters every N batches
    if epoch_itr.epoch <= len(args.update_freq):
        update_freq = args.update_freq[epoch_itr.epoch - 1]
    else:
        update_freq = args.update_freq[-1]

    if args.enable_parallel_backward_allred_opt and update_freq > 1:
        raise RuntimeError(
            '--enable-parallel-backward-allred-opt is incompatible with --update-freq > 1'
        )

    first_valid = args.valid_subset.split(',')[0]
    max_update = args.max_update or math.inf
    num_batches = len(epoch_itr)
    begin = time.time()

    # reset meters
    DLLogger.flush()
    trainer.get_throughput_meter().reset()

    for i, sample in enumerate(itr):
        if i < num_batches - 1 and (i + 1) % update_freq > 0:
            # buffer updates according to --update-freq
            trainer.train_step(sample, update_params=False,
                               last_step=(i == len(itr) - 1))
            continue
        else:
            trainer.train_step(sample, update_params=True,
                               last_step=(i == len(itr) - 1))

        # ignore the first mini-batch in words-per-second calculation
        if i == 0:
            trainer.get_throughput_meter().reset()
            for backend in DLLogger.GLOBAL_LOGGER.backends:
                if isinstance(backend, AggregatorBackend):
                    backend._reset_perf_meter('tokens')
                    backend._reset_perf_meter('updates')
                    break

        # Mid epoch checkpoint
        num_updates = trainer.get_num_updates()
        if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0:
            valid_losses = validate(args, trainer, datasets, [first_valid])
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if (i + 1) % args.log_interval == 0:
            DLLogger.flush()

        if num_updates >= max_update:
            break

    print('Epoch time:', time.time() - begin)

    # Print epoch stats and reset training meters
    DLLogger.log(step=trainer.get_num_updates(),
                 data={'speed': trainer.get_throughput_meter().avg},
                 verbosity=0)
    DLLogger.flush()
def flush(): dllogger.flush() for tbl in tb_loggers.values(): if tbl.enabled: tbl.summary_writer.flush()
def main(): """ Launches text to speech (inference). Inference is executed on a single GPU or CPU. """ parser = argparse.ArgumentParser( description='PyTorch Tacotron 2 Inference') parser = parse_args(parser) args, _ = parser.parse_known_args() log_file = os.path.join(args.output, args.log_file) DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file), StdOutBackend(Verbosity.VERBOSE)]) for k,v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k:v}) DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'}) tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2, args.fp16, args.cpu, forward_is_infer=True) # forward is infer를 해줌으로써 tacotron model의 infer로 간다. waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow, args.fp16, args.cpu, forward_is_infer=True) denoiser = Denoiser(waveglow) if not args.cpu: denoiser.cuda() jitted_tacotron2 = torch.jit.script(tacotron2) texts = [] id_list = [] try: f = open(args.input, 'r') texts = f.readlines() except: print("Could not read file") sys.exit(1) #------------------------------------------------------------------------------------------------------------------- ref_mel = load_mel(args.ref_mel) id_list.append(args.emotion_id) emotion_id = torch.LongTensor(id_list).cuda() print(emotion_id) #------------------------------------------------------------------------------------------------------------------- if args.include_warmup: sequence = torch.randint(low=0, high=80, size=(1,50)).long() input_lengths = torch.IntTensor([sequence.size(1)]).long() if not args.cpu: sequence = sequence.cuda() input_lengths = input_lengths.cuda() for i in range(3): with torch.no_grad(): mel, mel_lengths, _ = jitted_tacotron2(sequence, input_lengths, ref_mel, emotion_id) _ = waveglow(mel) measurements = {} sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu) with torch.no_grad(), MeasureTime(measurements, "tacotron2_time", args.cpu): mel, mel_lengths, alignments = jitted_tacotron2(sequences_padded, input_lengths, ref_mel, emotion_id) with torch.no_grad(), MeasureTime(measurements, "waveglow_time", args.cpu): audios = waveglow(mel, sigma=args.sigma_infer) audios = audios.float() with torch.no_grad(), MeasureTime(measurements, "denoiser_time", args.cpu): audios = denoiser(audios, strength=args.denoising_strength).squeeze(1) print("Stopping after",mel.size(2),"decoder steps") tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time'] waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time'] DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf}) DLLogger.log(step=0, data={"tacotron2_latency": measurements['tacotron2_time']}) DLLogger.log(step=0, data={"waveglow_items_per_sec": waveglow_infer_perf}) DLLogger.log(step=0, data={"waveglow_latency": measurements['waveglow_time']}) DLLogger.log(step=0, data={"denoiser_latency": measurements['denoiser_time']}) DLLogger.log(step=0, data={"latency": (measurements['tacotron2_time']+measurements['waveglow_time']+measurements['denoiser_time'])}) for i, audio in enumerate(audios): plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower") figure_path = os.path.join(args.output,"alignment_"+str(i)+args.suffix+".png") plt.savefig(figure_path) audio = audio[:mel_lengths[i]*args.stft_hop_length] audio = audio/torch.max(torch.abs(audio)) audio_path = os.path.join(args.output,"audio_"+str(i)+args.suffix+".wav") write(audio_path, args.sampling_rate, audio.cpu().numpy()) 
DLLogger.flush()
def main(): parser = argparse.ArgumentParser( description='PyTorch TTS Data Pre-processing') parser = parse_args(parser) args, unk_args = parser.parse_known_args() if len(unk_args) > 0: raise ValueError(f'Invalid options {unk_args}') if args.extract_pitch_char: assert args.extract_durations, "Durations required for pitch extraction" DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) model = load_and_setup_model( 'Tacotron2', parser, args.tacotron2_checkpoint, amp=False, device=torch.device('cuda' if args.cuda else 'cpu'), forward_is_infer=False, ema=False) if args.train_mode: model.train() # n_mel_channels arg has been consumed by model's arg parser args.n_mel_channels = model.n_mel_channels for datum in ('mels', 'mels_teacher', 'attentions', 'durations', 'pitch_mel', 'pitch_char', 'pitch_trichar'): if getattr(args, f'extract_{datum}'): Path(args.dataset_path, datum).mkdir(parents=False, exist_ok=True) filenames = [ Path(l.split('|')[0]).stem for l in open(args.wav_text_filelist, 'r') ] # Compatibility with Tacotron2 Data loader args.n_speakers = 1 dataset = FilenamedLoader(filenames, args.dataset_path, args.wav_text_filelist, args, load_mel_from_disk=False) # TextMelCollate supports only n_frames_per_step=1 data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, sampler=None, num_workers=0, collate_fn=TextMelCollate(1), pin_memory=False, drop_last=False) pitch_vecs = {'mel': {}, 'char': {}, 'trichar': {}} for i, batch in enumerate(data_loader): tik = time.time() fnames = batch[-1] x, _, _ = batch_to_gpu(batch[:-1]) _, text_lens, mels_padded, _, mel_lens = x for j, mel in enumerate(mels_padded): fpath = Path(args.dataset_path, 'mels', fnames[j] + '.pt') torch.save(mel[:, :mel_lens[j]].cpu(), fpath) with torch.no_grad(): out_mels, out_mels_postnet, _, alignments = model.forward(x) if args.extract_mels_teacher: for j, mel in enumerate(out_mels_postnet): fpath = Path(args.dataset_path, 'mels_teacher', fnames[j] + '.pt') torch.save(mel[:, :mel_lens[j]].cpu(), fpath) if args.extract_attentions: for j, ali in enumerate(alignments): ali = ali[:mel_lens[j], :text_lens[j]] fpath = Path(args.dataset_path, 'attentions', fnames[j] + '.pt') torch.save(ali.cpu(), fpath) durations = [] if args.extract_durations: for j, ali in enumerate(alignments): text_len = text_lens[j] ali = ali[:mel_lens[j], :text_len] dur = torch.histc(torch.argmax(ali, dim=1), min=0, max=text_len - 1, bins=text_len) durations.append(dur) fpath = Path(args.dataset_path, 'durations', fnames[j] + '.pt') torch.save(dur.cpu().int(), fpath) if args.extract_pitch_mel or args.extract_pitch_char or args.extract_pitch_trichar: for j, dur in enumerate(durations): fpath = Path(args.dataset_path, 'pitch_char', fnames[j] + '.pt') wav = Path(args.dataset_path, 'wavs', fnames[j] + '.wav') p_mel, p_char, p_trichar = calculate_pitch( str(wav), dur.cpu().numpy()) pitch_vecs['mel'][fnames[j]] = p_mel pitch_vecs['char'][fnames[j]] = p_char pitch_vecs['trichar'][fnames[j]] = p_trichar nseconds = time.time() - tik DLLogger.log(step=f'{i+1}/{len(data_loader)} ({nseconds:.2f}s)', data={}) if args.extract_pitch_mel: normalize_pitch_vectors(pitch_vecs['mel']) for fname, pitch in pitch_vecs['mel'].items(): fpath = Path(args.dataset_path, 'pitch_mel', fname + '.pt') torch.save(torch.from_numpy(pitch), fpath) if args.extract_pitch_char: mean, std = normalize_pitch_vectors(pitch_vecs['char']) for fname, 
pitch in pitch_vecs['char'].items(): fpath = Path(args.dataset_path, 'pitch_char', fname + '.pt') torch.save(torch.from_numpy(pitch), fpath) save_stats(args.dataset_path, args.wav_text_filelist, 'pitch_char', mean, std) if args.extract_pitch_trichar: normalize_pitch_vectors(pitch_vecs['trichar']) for fname, pitch in pitch_vecs['trichar'].items(): fpath = Path(args.dataset_path, 'pitch_trichar', fname + '.pt') torch.save(torch.from_numpy(pitch), fpath) DLLogger.flush()
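The duration extraction above derives per-character durations from the attention alignment: each decoder frame is assigned to its most-attended input character (argmax over the text axis), and a histogram over those indices counts how many frames each character received. A small illustrative example on a random alignment (shapes and names are assumptions, not taken from the data loader):

import torch

mel_len, text_len = 40, 10
# dummy (mel_frames x text_len) attention matrix for a single utterance
alignment = torch.softmax(torch.randn(mel_len, text_len), dim=1)

frame_to_char = torch.argmax(alignment, dim=1)             # most-attended character per frame
durations = torch.histc(frame_to_char.float(),             # frames counted per character index
                        bins=text_len, min=0, max=text_len - 1)
assert int(durations.sum()) == mel_len                     # every frame is assigned exactly once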
def log_params(self, data): DLLogger.log("PARAMETER", data) DLLogger.flush()
def main(): script_start = time.time() hvd_init() mpi_comm = MPI.COMM_WORLD args = parse_args() if hvd.rank() == 0: dllogger.init(backends=[ dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, filename=args.log_path), dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE) ]) else: dllogger.init(backends=[]) dllogger.log(data=vars(args), step='PARAMETER') if args.seed is not None: tf.random.set_random_seed(args.seed) np.random.seed(args.seed) cp.random.seed(args.seed) if args.amp: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1" if "TF_ENABLE_AUTO_MIXED_PRECISION" in os.environ \ and os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] == "1": args.fp16 = False if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '': os.makedirs(args.checkpoint_dir, exist_ok=True) final_checkpoint_path = os.path.join(args.checkpoint_dir, 'model.ckpt') # Load converted data and get statistics train_df = pd.read_pickle(args.data + '/train_ratings.pickle') test_df = pd.read_pickle(args.data + '/test_ratings.pickle') nb_users, nb_items = train_df.max() + 1 # Extract train and test feature tensors from dataframe pos_train_users = train_df.iloc[:, 0].values.astype(np.int32) pos_train_items = train_df.iloc[:, 1].values.astype(np.int32) pos_test_users = test_df.iloc[:, 0].values.astype(np.int32) pos_test_items = test_df.iloc[:, 1].values.astype(np.int32) # Negatives indicator for negatives generation neg_mat = np.ones((nb_users, nb_items), dtype=np.bool) neg_mat[pos_train_users, pos_train_items] = 0 # Get the local training/test data train_users, train_items, train_labels = get_local_train_data( pos_train_users, pos_train_items, args.negative_samples) test_users, test_items = get_local_test_data(pos_test_users, pos_test_items) # Create and run Data Generator in a separate thread data_generator = DataGenerator( args.seed, hvd.rank(), nb_users, nb_items, neg_mat, train_users, train_items, train_labels, args.batch_size // hvd.size(), args.negative_samples, test_users, test_items, args.valid_users_per_batch, args.valid_negative, ) # Create tensorflow session and saver config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if args.xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 sess = tf.Session(config=config) # Input tensors users = tf.placeholder(tf.int32, shape=(None, )) items = tf.placeholder(tf.int32, shape=(None, )) labels = tf.placeholder(tf.int32, shape=(None, )) is_dup = tf.placeholder(tf.float32, shape=(None, )) dropout = tf.placeholder_with_default(args.dropout, shape=()) # Model ops and saver hit_rate, ndcg, eval_op, train_op = ncf_model_ops( users, items, labels, is_dup, params={ 'fp16': args.fp16, 'val_batch_size': args.valid_negative + 1, 'top_k': args.topk, 'learning_rate': args.learning_rate, 'beta_1': args.beta1, 'beta_2': args.beta2, 'epsilon': args.eps, 'num_users': nb_users, 'num_items': nb_items, 'num_factors': args.factors, 'mf_reg': 0, 'layer_sizes': args.layers, 'layer_regs': [0. 
for i in args.layers], 'dropout': dropout, 'sigmoid': True, 'loss_scale': args.loss_scale }, mode='TRAIN' if args.mode == 'train' else 'EVAL') saver = tf.train.Saver() # Accuracy metric tensors hr_sum = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/total:0') hr_cnt = tf.get_default_graph().get_tensor_by_name( 'neumf/hit_rate/count:0') ndcg_sum = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/total:0') ndcg_cnt = tf.get_default_graph().get_tensor_by_name('neumf/ndcg/count:0') # Prepare evaluation data data_generator.prepare_eval_data() if args.load_checkpoint_path: saver.restore(sess, args.load_checkpoint_path) else: # Manual initialize weights sess.run(tf.global_variables_initializer()) # If test mode, run one eval if args.mode == 'test': sess.run(tf.local_variables_initializer()) eval_start = time.time() for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) eval_duration = time.time() - eval_start # Report results hit_rate_sum = sess.run(hvd.allreduce(hr_sum, average=False)) hit_rate_cnt = sess.run(hvd.allreduce(hr_cnt, average=False)) ndcg_sum = sess.run(hvd.allreduce(ndcg_sum, average=False)) ndcg_cnt = sess.run(hvd.allreduce(ndcg_cnt, average=False)) hit_rate = hit_rate_sum / hit_rate_cnt ndcg = ndcg_sum / ndcg_cnt if hvd.rank() == 0: eval_throughput = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_duration dllogger.log(step=tuple(), data={ 'eval_throughput': eval_throughput, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) return # Performance Metrics train_times = list() eval_times = list() # Accuracy Metrics first_to_target = None time_to_train = 0.0 best_hr = 0 best_epoch = 0 # Buffers for global metrics global_hr_sum = np.ones(1) global_hr_count = np.ones(1) global_ndcg_sum = np.ones(1) global_ndcg_count = np.ones(1) # Buffers for local metrics local_hr_sum = np.ones(1) local_hr_count = np.ones(1) local_ndcg_sum = np.ones(1) local_ndcg_count = np.ones(1) # Begin training begin_train = time.time() for epoch in range(args.epochs): # Train for one epoch train_start = time.time() data_generator.prepare_train_data() for user_batch, item_batch, label_batch \ in zip(data_generator.train_users_batches, data_generator.train_items_batches, data_generator.train_labels_batches): sess.run(train_op, feed_dict={ users: user_batch.get(), items: item_batch.get(), labels: label_batch.get() }) train_duration = time.time() - train_start # Only log "warm" epochs if epoch >= 1: train_times.append(train_duration) # Evaluate if epoch > args.eval_after: eval_start = time.time() sess.run(tf.local_variables_initializer()) for user_batch, item_batch, dup_batch \ in zip(data_generator.eval_users, data_generator.eval_items, data_generator.dup_mask): sess.run(eval_op, feed_dict={ users: user_batch, items: item_batch, is_dup: dup_batch, dropout: 0.0 }) # Compute local metrics local_hr_sum[0] = sess.run(hr_sum) local_hr_count[0] = sess.run(hr_cnt) local_ndcg_sum[0] = sess.run(ndcg_sum) local_ndcg_count[0] = sess.run(ndcg_cnt) # Reduce metrics across all workers mpi_comm.Reduce(local_hr_count, global_hr_count) mpi_comm.Reduce(local_hr_sum, global_hr_sum) mpi_comm.Reduce(local_ndcg_count, global_ndcg_count) mpi_comm.Reduce(local_ndcg_sum, global_ndcg_sum) # Calculate metrics hit_rate = global_hr_sum[0] / global_hr_count[0] ndcg = global_ndcg_sum[0] / global_ndcg_count[0] eval_duration = 
time.time() - eval_start # Only log "warm" epochs if epoch >= 1: eval_times.append(eval_duration) if hvd.rank() == 0: dllogger.log(step=(epoch, ), data={ 'train_time': train_duration, 'eval_time': eval_duration, 'hr@10': hit_rate, 'ndcg': ndcg }) # Update summary metrics if hit_rate > args.target and first_to_target is None: first_to_target = epoch time_to_train = time.time() - begin_train if hit_rate > best_hr: best_hr = hit_rate best_epoch = epoch time_to_best = time.time() - begin_train if hit_rate > args.target: saver.save(sess, final_checkpoint_path) # Final Summary if hvd.rank() == 0: train_times = np.array(train_times) train_throughputs = pos_train_users.shape[0] * (args.negative_samples + 1) / train_times eval_times = np.array(eval_times) eval_throughputs = pos_test_users.shape[0] * (args.valid_negative + 1) / eval_times dllogger.log(step=tuple(), data={ 'average_train_time_per_epoch': np.mean(train_times), 'average_train_throughput': np.mean(train_throughputs), 'average_eval_time_per_epoch': np.mean(eval_times), 'average_eval_throughput': np.mean(eval_throughputs), 'first_epoch_to_hit': first_to_target, 'time_to_train': time_to_train, 'time_to_best': time_to_best, 'best_hr': best_hr, 'best_epoch': best_epoch }) dllogger.flush() sess.close() return
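The final summary converts epoch wall-clock times into throughput by dividing the number of processed samples (positives times one plus the negative-sampling ratio) by each epoch's duration. A tiny worked example with made-up numbers, only to show the arithmetic:

import numpy as np

train_times = np.array([110.0, 108.5, 109.2])        # seconds per "warm" training epoch (illustrative)
samples_per_epoch = 1_000_000 * (1 + 4)               # positives * (1 + negative_samples), illustrative
train_throughputs = samples_per_epoch / train_times   # samples per second for each epoch
print(train_throughputs.mean())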
def log(self, key, value): DLLogger.log(self.step(), {key: value}) DLLogger.flush()
def main(): parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') parser = parse_args(parser) args, _ = parser.parse_known_args() if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ: local_rank = int(os.environ['LOCAL_RANK']) world_size = int(os.environ['WORLD_SIZE']) else: local_rank = args.rank world_size = args.world_size distributed_run = world_size > 1 if local_rank == 0: log_file = os.path.join(args.output, args.log_file) DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file), StdOutBackend(Verbosity.VERBOSE)]) else: DLLogger.init(backends=[]) for k,v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k:v}) DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'}) model_name = args.model_name parser = models.model_parser(model_name, parser) args, _ = parser.parse_known_args() torch.backends.cudnn.enabled = args.cudnn_enabled torch.backends.cudnn.benchmark = args.cudnn_benchmark if distributed_run: init_distributed(args, world_size, local_rank, args.group_name) torch.cuda.synchronize() run_start_time = time.perf_counter() model_config = models.get_model_config(model_name, args) model = models.get_model(model_name, model_config, cpu_run=False, uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight) if distributed_run: model = DDP(model, device_ids=[local_rank], output_device=local_rank) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) scaler = torch.cuda.amp.GradScaler(enabled=args.amp) try: sigma = args.sigma except AttributeError: sigma = None start_epoch = [0] if args.resume_from_last: args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name) if args.checkpoint_path != "": load_checkpoint(model, optimizer, start_epoch, model_config, args.amp, args.checkpoint_path, local_rank) start_epoch = start_epoch[0] criterion = loss_functions.get_loss_function(model_name, sigma) try: n_frames_per_step = args.n_frames_per_step except AttributeError: n_frames_per_step = None collate_fn = data_functions.get_collate_function( model_name, n_frames_per_step) trainset = data_functions.get_data_loader( model_name, args.dataset_path, args.training_files, args) if distributed_run: train_sampler = DistributedSampler(trainset) shuffle = False else: train_sampler = None shuffle = True train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle, sampler=train_sampler, batch_size=args.batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) valset = data_functions.get_data_loader( model_name, args.dataset_path, args.validation_files, args) batch_to_gpu = data_functions.get_batch_to_gpu(model_name) iteration = 0 train_epoch_items_per_sec = 0.0 val_loss = 0.0 num_iters = 0 model.train() for epoch in range(start_epoch, args.epochs): torch.cuda.synchronize() epoch_start_time = time.perf_counter() # used to calculate avg items/sec over epoch reduced_num_items_epoch = 0 train_epoch_items_per_sec = 0.0 num_iters = 0 reduced_loss = 0 # if overflow at the last iteration then do not save checkpoint overflow = False if distributed_run: train_loader.sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): torch.cuda.synchronize() iter_start_time = time.perf_counter() DLLogger.log(step=(epoch, i), data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))}) adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate, args.anneal_steps, args.anneal_factor, local_rank) model.zero_grad() x, y, num_items = 
batch_to_gpu(batch) #AMP upstream autocast with torch.cuda.amp.autocast(enabled=args.amp): y_pred = model(x) loss = criterion(y_pred, y) if distributed_run: reduced_loss = reduce_tensor(loss.data, world_size).item() reduced_num_items = reduce_tensor(num_items.data, 1).item() else: reduced_loss = loss.item() reduced_num_items = num_items.item() if np.isnan(reduced_loss): raise Exception("loss is NaN") DLLogger.log(step=(epoch,i), data={'train_loss': reduced_loss}) num_iters += 1 # accumulate number of items processed in this epoch reduced_num_items_epoch += reduced_num_items if args.amp: scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) scaler.step(optimizer) scaler.update() optimizer.zero_grad(set_to_none=True) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) optimizer.step() torch.cuda.synchronize() iter_stop_time = time.perf_counter() iter_time = iter_stop_time - iter_start_time items_per_sec = reduced_num_items/iter_time train_epoch_items_per_sec += items_per_sec DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec}) DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time}) iteration += 1 torch.cuda.synchronize() epoch_stop_time = time.perf_counter() epoch_time = epoch_stop_time - epoch_start_time DLLogger.log(step=(epoch,), data={'train_items_per_sec': (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)}) DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss}) DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time}) val_loss, val_items_per_sec = validate(model, criterion, valset, epoch, iteration, args.batch_size, world_size, collate_fn, distributed_run, local_rank, batch_to_gpu) if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "": save_checkpoint(model, optimizer, scaler, epoch, model_config, args.amp, args.output, args.model_name, local_rank, world_size) if local_rank == 0: DLLogger.flush() torch.cuda.synchronize() run_stop_time = time.perf_counter() run_time = run_stop_time - run_start_time DLLogger.log(step=tuple(), data={'run_time': run_time}) DLLogger.log(step=tuple(), data={'val_loss': val_loss}) DLLogger.log(step=tuple(), data={'train_items_per_sec': (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)}) DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec}) if local_rank == 0: DLLogger.flush()
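The mixed-precision branch in the loop above follows the usual torch.cuda.amp recipe: scale the loss before backward, unscale before gradient clipping, then let the scaler perform the optimizer step and update its scale. A compact sketch of one such step; model, optimizer, scaler, criterion and batch are placeholder objects, not the ones built in this script:

import torch

def amp_train_step(model, optimizer, scaler, criterion, batch, grad_clip_thresh=1.0):
    x, y = batch
    optimizer.zero_grad(set_to_none=True)
    with torch.cuda.amp.autocast():
        loss = criterion(model(x), y)
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)          # unscale so the clip threshold applies to true gradient values
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
    scaler.step(optimizer)              # skips the update if any gradient is non-finite
    scaler.update()
    return loss.item()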
def flush(self): logger.flush()