class CometExperimentLogger(ExperimentLogger):
    """ExperimentLogger backed by Comet.ml.

    Wraps either a live ``Experiment`` (``online=True``) or an
    ``OfflineExperiment`` and forwards metric/image/figure/text/parameter
    logging calls to it.
    """

    def __init__(self, exp_name, online=True, **kwargs):
        super(CometExperimentLogger, self).__init__(exp_name, **kwargs)
        # Select the Comet backend up front; both take the same kwargs.
        backend = Experiment if online else OfflineExperiment
        self.comet = backend(project_name=exp_name, **kwargs)

    def log_metric(self, tag, value, step, **kwargs):
        """Log a scalar metric under *tag* at *step*."""
        self.comet.log_metric(tag, value, step=step, **kwargs)

    def log_image(self, tag, img, step, **kwargs):
        """Log an image under *tag* at *step*."""
        self.comet.log_image(img, name=tag, step=step, **kwargs)

    def log_plt(self, tag, plt, step, **kwargs):
        """Log a matplotlib figure under *tag* at *step*."""
        self.comet.log_figure(figure=plt, figure_name=tag, step=step, **kwargs)

    def log_text(self, tag, text, **kwargs):
        """Log free-form text (the *tag* argument is accepted but unused here)."""
        self.comet.log_text(text, **kwargs)

    def log_parameters(self, params, **kwargs):
        """Log a mapping of hyperparameters."""
        self.comet.log_parameters(params, **kwargs)

    def start_epoch(self, **kwargs):
        """Delegate epoch start to the base logger; Comet needs no call here."""
        super(CometExperimentLogger, self).start_epoch()

    def end_epoch(self, **kwargs):
        """Advance the base logger's epoch counter, then notify Comet."""
        super(CometExperimentLogger, self).end_epoch()
        self.comet.log_epoch_end(self.epoch, **kwargs)

    def end_experiment(self):
        """Finalize the Comet experiment (flushes pending data)."""
        self.comet.end()
verbose = 10, n_jobs = 2, n_points = 2, scoring = 'accuracy', ) checkpoint_callback = skopt.callbacks.CheckpointSaver(f'D:\\FINKI\\8_dps\\Project\\MODELS\\skopt_checkpoints\\{EXPERIMENT_ID}.pkl') hyperparameters_optimizer.fit(X_train, y_train, callback = [checkpoint_callback]) skopt.dump(hyperparameters_optimizer, f'saved_models\\{EXPERIMENT_ID}.pkl') y_pred = hyperparameters_optimizer.best_estimator_.predict(X_test) for i in range(len(hyperparameters_optimizer.cv_results_['params'])): exp = OfflineExperiment( api_key = 'A8Lg71j9LtIrsv0deBA0DVGcR', project_name = ALGORITHM, workspace = "8_dps", auto_output_logging = 'native', offline_directory = f'D:\\FINKI\\8_dps\\Project\\MODELS\\comet_ml_offline_experiments\\{EXPERIMENT_ID}' ) exp.set_name(f'{EXPERIMENT_ID}_{i + 1}') exp.add_tags([DS, SEGMENTS_LENGTH, ]) for k, v in hyperparameters_optimizer.cv_results_.items(): if k == "params": exp.log_parameters(dict(v[i])) else: exp.log_metric(k, v[i]) exp.end()
def main(args):
    """Train FastSpeech2, optionally logging to Comet.ml.

    Reads all hyperparameters from the module-level ``hp`` object and
    command-line ``args`` (``restore_step``, ``experiment_name``, ``m``).
    Comet logging is selected via the USE_COMET env var:
    0 = off, 1 = OfflineExperiment, 2 = online Experiment.
    """
    torch.manual_seed(0)

    # Get device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Get dataset
    dataset = Dataset("train.txt")
    # NOTE(review): batch_size**2 as the DataLoader batch size looks
    # intentional (each loaded "batch" is later iterated as sub-batches),
    # but confirm against Dataset.collate_fn.
    loader = DataLoader(dataset,
                        batch_size=hp.batch_size**2,
                        shuffle=True,
                        collate_fn=dataset.collate_fn,
                        drop_last=True,
                        num_workers=hp.num_workers)

    # Optional pretrained speaker encoder, frozen when loaded from disk.
    speaker_encoder = None
    if hp.speaker_encoder_path != "":
        speaker_encoder = load_speaker_encoder(Path(hp.speaker_encoder_path),
                                               device).to(device)
        for param in speaker_encoder.parameters():
            param.requires_grad = False
    else:
        # NOTE(review): speaker_encoder is still None on this branch, so
        # .train() will raise AttributeError — likely a bug; confirm intent.
        speaker_encoder.train()

    # Define model
    fastspeech_model = FastSpeech2(speaker_encoder).to(device)
    model = nn.DataParallel(fastspeech_model).to(device)
    print("Model Has Been Defined")
    num_param = utils.get_param_num(model)
    print('Number of FastSpeech2 Parameters:', num_param)

    # Optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-4,
                                 betas=hp.betas,
                                 eps=hp.eps,
                                 weight_decay=hp.weight_decay)
    # ScheduledOptim wraps the optimizer with warm-up LR scheduling.
    scheduled_optim = ScheduledOptim(optimizer, hp.decoder_hidden,
                                     hp.n_warm_up_step, args.restore_step)
    Loss = FastSpeech2Loss().to(device)
    print("Optimizer and Loss Function Defined.")

    # Load checkpoint if exists; otherwise start fresh and create the dir.
    checkpoint_path = os.path.join(hp.checkpoint_path)
    try:
        checkpoint = torch.load(
            os.path.join(checkpoint_path,
                         'checkpoint_{}.pth.tar'.format(args.restore_step)))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step {}---\n".format(args.restore_step))
    # NOTE(review): bare except also hides real errors (corrupt checkpoint,
    # OOM); narrowing to FileNotFoundError would be safer — confirm.
    except:
        print("\n---Start New Training---\n")
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

    # Load vocoder used only for synthesizing audio samples during logging.
    if hp.vocoder == 'melgan':
        vocoder = utils.get_melgan()
        vocoder_infer = utils.melgan_infer
    elif hp.vocoder == 'waveglow':
        vocoder = utils.get_waveglow()
        vocoder_infer = utils.waveglow_infer
    else:
        raise ValueError("Vocoder '%s' is not supported", hp.vocoder)

    # Comet experiment selection: 0 = disabled, 1 = offline, 2 = online.
    comet_experiment = None
    use_comet = int(os.getenv("USE_COMET", default=0))
    if use_comet != 0:
        if use_comet == 1:
            offline_dir = os.path.join(hp.models_path, "comet")
            os.makedirs(offline_dir, exist_ok=True)
            comet_experiment = OfflineExperiment(
                project_name="mlp-project",
                workspace="ino-voice",
                offline_directory=offline_dir,
            )
        elif use_comet == 2:
            # NOTE(review): hard-coded API key committed to source — should
            # be moved to an environment variable and revoked.
            comet_experiment = Experiment(
                api_key="BtyTwUoagGMh3uN4VZt6gMOn8",
                project_name="mlp-project",
                workspace="ino-voice",
            )
        # NOTE(review): for any USE_COMET value other than 1 or 2 this is
        # still None and the three calls below would raise — confirm callers
        # only ever set 0/1/2.
        comet_experiment.set_name(args.experiment_name)
        comet_experiment.log_parameters(hp)
        comet_experiment.log_html(args.m)

    start_time = time.perf_counter()
    # First-step losses, used to normalize later losses for logging.
    first_mel_train_loss, first_postnet_train_loss, first_d_train_loss, first_f_train_loss, first_e_train_loss = \
        None, None, None, None, None

    for epoch in range(hp.epochs):
        total_step = hp.epochs * len(loader) * hp.batch_size
        for i, batchs in enumerate(loader):
            # Each DataLoader item is itself a sequence of sub-batches.
            for j, data_of_batch in enumerate(batchs):
                model = model.train()
                current_step = i * hp.batch_size + j + args.restore_step + epoch * len(
                    loader) * hp.batch_size + 1

                # Get Data — move numpy arrays onto the training device.
                text = torch.from_numpy(
                    data_of_batch["text"]).long().to(device)
                mel_target = torch.from_numpy(
                    data_of_batch["mel_target"]).float().to(device)
                D = torch.from_numpy(data_of_batch["D"]).long().to(device)
                log_D = torch.from_numpy(
                    data_of_batch["log_D"]).float().to(device)
                f0 = torch.from_numpy(data_of_batch["f0"]).float().to(device)
                energy = torch.from_numpy(
                    data_of_batch["energy"]).float().to(device)
                src_len = torch.from_numpy(
                    data_of_batch["src_len"]).long().to(device)
                mel_len = torch.from_numpy(
                    data_of_batch["mel_len"]).long().to(device)
                max_src_len = np.max(data_of_batch["src_len"]).astype(np.int32)
                max_mel_len = np.max(data_of_batch["mel_len"]).astype(np.int32)

                # Forward
                mel_output, mel_postnet_output, log_duration_output, f0_output, energy_output, src_mask, mel_mask, _ = \
                    model(text, src_len, mel_target, mel_len, D, f0, energy,
                          max_src_len, max_mel_len)

                # Cal Loss (masks are inverted: ~mask marks valid positions).
                mel_loss, mel_postnet_loss, d_loss, f_loss, e_loss = Loss(
                    log_duration_output, log_D, f0_output, f0, energy_output,
                    energy, mel_output, mel_postnet_output, mel_target,
                    ~src_mask, ~mel_mask)
                total_loss = mel_loss + mel_postnet_loss + d_loss + f_loss + e_loss

                # Set initial values for scaling: losses are later reported
                # relative to their first observed value.
                if first_mel_train_loss is None:
                    first_mel_train_loss = mel_loss
                    first_postnet_train_loss = mel_postnet_loss
                    first_d_train_loss = d_loss
                    first_f_train_loss = f_loss
                    first_e_train_loss = e_loss

                # NOTE(review): .item() (a float) is divided by a tensor that
                # stays attached to the graph — presumably intended to be
                # first_*_train_loss.item(); confirm.
                mel_l = mel_loss.item() / first_mel_train_loss
                mel_postnet_l = mel_postnet_loss.item(
                ) / first_postnet_train_loss
                d_l = d_loss.item() / first_d_train_loss
                f_l = f_loss.item() / first_f_train_loss
                e_l = e_loss.item() / first_e_train_loss

                # Logger — per-step normalized training losses.
                if comet_experiment is not None:
                    comet_experiment.log_metric(
                        "total_loss", mel_l + mel_postnet_l + d_l + f_l + e_l,
                        current_step)
                    comet_experiment.log_metric("mel_loss", mel_l,
                                                current_step)
                    comet_experiment.log_metric("mel_postnet_loss",
                                                mel_postnet_l, current_step)
                    comet_experiment.log_metric("duration_loss", d_l,
                                                current_step)
                    comet_experiment.log_metric("f0_loss", f_l, current_step)
                    comet_experiment.log_metric("energy_loss", e_l,
                                                current_step)

                # Backward with gradient accumulation over hp.acc_steps.
                total_loss = total_loss / hp.acc_steps
                total_loss.backward()
                if current_step % hp.acc_steps != 0:
                    # Accumulate gradients; skip the optimizer step.
                    continue

                # Clipping gradients to avoid gradient explosion
                nn.utils.clip_grad_norm_(model.parameters(),
                                         hp.grad_clip_thresh)

                # Update weights
                scheduled_optim.step_and_update_lr()
                scheduled_optim.zero_grad()

                # Print progress every hp.log_step optimizer steps.
                if current_step % hp.log_step == 0:
                    now = time.perf_counter()
                    print("\nEpoch [{}/{}], Step [{}/{}]:".format(
                        epoch + 1, hp.epochs, current_step, total_step))
                    print(
                        "Total Loss: {:.4f}, Mel Loss: {:.5f}, Mel PostNet Loss: {:.5f}, Duration Loss: {:.5f}, "
                        "F0 Loss: {:.5f}, Energy Loss: {:.5f};".format(
                            mel_l + mel_postnet_l + d_l + f_l + e_l, mel_l,
                            mel_postnet_l, d_l, f_l, e_l))
                    print("Time Used: {:.3f}s".format(now - start_time))
                    start_time = now

                # Periodic checkpointing.
                if current_step % hp.checkpoint == 0:
                    file_path = os.path.join(
                        checkpoint_path,
                        'checkpoint_{}.pth.tar'.format(current_step))
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()
                        }, file_path)
                    print("saving model at to {}".format(file_path))

                # Periodic audio synthesis of the first item in the batch.
                if current_step % hp.synth_step == 0:
                    length = mel_len[0].item()
                    # NOTE(review): mel_target / f0 / energy are rebound
                    # in-place below, so this block must stay last-use of
                    # the full-batch tensors within the iteration.
                    mel_target_torch = mel_target[
                        0, :length].detach().unsqueeze(0).transpose(1, 2)
                    mel_target = mel_target[
                        0, :length].detach().cpu().transpose(0, 1)
                    mel_torch = mel_output[0, :length].detach().unsqueeze(
                        0).transpose(1, 2)
                    mel = mel_output[0, :length].detach().cpu().transpose(0, 1)
                    mel_postnet_torch = mel_postnet_output[
                        0, :length].detach().unsqueeze(0).transpose(1, 2)
                    mel_postnet = mel_postnet_output[
                        0, :length].detach().cpu().transpose(0, 1)

                    if comet_experiment is not None:
                        # Griffin-Lim reconstructions plus vocoder outputs
                        # for predicted, postnet and ground-truth mels.
                        comet_experiment.log_audio(
                            audiotools.inv_mel_spec(mel), hp.sampling_rate,
                            "step_{}_griffin_lim.wav".format(current_step))
                        comet_experiment.log_audio(
                            audiotools.inv_mel_spec(mel_postnet),
                            hp.sampling_rate,
                            "step_{}_postnet_griffin_lim.wav".format(
                                current_step))
                        comet_experiment.log_audio(
                            vocoder_infer(mel_torch, vocoder),
                            hp.sampling_rate,
                            'step_{}_{}.wav'.format(current_step, hp.vocoder))
                        comet_experiment.log_audio(
                            vocoder_infer(mel_postnet_torch, vocoder),
                            hp.sampling_rate,
                            'step_{}_postnet_{}.wav'.format(
                                current_step, hp.vocoder))
                        comet_experiment.log_audio(
                            vocoder_infer(mel_target_torch, vocoder),
                            hp.sampling_rate,
                            'step_{}_ground-truth_{}.wav'.format(
                                current_step, hp.vocoder))

                    f0 = f0[0, :length].detach().cpu().numpy()
                    energy = energy[0, :length].detach().cpu().numpy()
                    f0_output = f0_output[
                        0, :length].detach().cpu().numpy()
                    energy_output = energy_output[
                        0, :length].detach().cpu().numpy()

                    utils.plot_data(
                        [(mel_postnet.numpy(), f0_output, energy_output),
                         (mel_target.numpy(), f0, energy)], comet_experiment,
                        [
                            'Synthesized Spectrogram',
                            'Ground-Truth Spectrogram'
                        ])

                # Periodic validation; losses logged under Comet's
                # validate() context so they appear as validation metrics.
                if current_step % hp.eval_step == 0:
                    model.eval()
                    with torch.no_grad():
                        if comet_experiment is not None:
                            with comet_experiment.validate():
                                d_l, f_l, e_l, m_l, m_p_l = evaluate(
                                    model, current_step, comet_experiment)
                                t_l = d_l + f_l + e_l + m_l + m_p_l
                                comet_experiment.log_metric(
                                    "total_loss", t_l, current_step)
                                comet_experiment.log_metric(
                                    "mel_loss", m_l, current_step)
                                comet_experiment.log_metric(
                                    "mel_postnet_loss", m_p_l, current_step)
                                comet_experiment.log_metric(
                                    "duration_loss", d_l, current_step)
                                comet_experiment.log_metric(
                                    "F0_loss", f_l, current_step)
                                comet_experiment.log_metric(
                                    "energy_loss", e_l, current_step)
class Logger:
    """Comet.ml-backed logger for a CW/802.11 RL training loop.

    When ``send_logs`` is falsy no experiment object is created and all
    Comet calls are skipped; only the local running statistics
    (``sent_mb``, ``speed_window``, ``current_speed``, ``stations``)
    are maintained.
    """

    def __init__(self, send_logs, tags, parameters, experiment=None):
        self.stations = 5  # default station count until log_round updates it
        self.send_logs = send_logs
        if self.send_logs:
            if experiment is None:
                # Locate comet_token.json anywhere under CWD and use its
                # contents as OfflineExperiment kwargs.
                json_loc = glob.glob("./**/comet_token.json")[0]
                with open(json_loc, "r") as f:
                    kwargs = json.load(f)
                self.experiment = OfflineExperiment(**kwargs)
            else:
                self.experiment = experiment
        self.sent_mb = 0
        # Sliding window of per-round megabytes for throughput estimation.
        self.speed_window = deque(maxlen=100)
        self.step_time = None  # set by begin_logging
        self.current_speed = 0
        if self.send_logs:
            if tags is not None:
                self.experiment.add_tags(tags)
            if parameters is not None:
                self.experiment.log_parameters(parameters)

    def begin_logging(self, episode_count, steps_per_ep, sigma, theta, step_time):
        """Record run-level parameters and the per-step wall time."""
        self.step_time = step_time
        if self.send_logs:
            self.experiment.log_parameter("Episode count", episode_count)
            self.experiment.log_parameter("Steps per episode", steps_per_ep)
            self.experiment.log_parameter("theta", theta)
            self.experiment.log_parameter("sigma", sigma)

    def log_round(self, states, reward, cumulative_reward, info, loss, observations, step):
        """Log one environment round.

        *info* is a list of "|"-separated strings, averaged element-wise:
        [megabytes sent, CW (legacy), CW (802.11ax), station count, fairness].
        """
        # BUGFIX: the histogram call was previously unguarded, raising
        # AttributeError when send_logs is False (self.experiment never
        # created). Guard it like every other Comet call in this class.
        if self.send_logs:
            self.experiment.log_histogram_3d(states, name="Observations", step=step)
        info = [[j for j in i.split("|")] for i in info]
        info = np.mean(np.array(info, dtype=np.float32), axis=0)
        try:
            round_mb = info[0]
        except Exception as e:
            # Surface the malformed info/reward payload before re-raising.
            print(info)
            print(reward)
            raise e
        self.speed_window.append(round_mb)
        # Mean throughput (MB per second) over the sliding window.
        self.current_speed = np.mean(np.asarray(self.speed_window) / self.step_time)
        self.sent_mb += round_mb
        CW = info[1]
        CW_ax = info[2]
        self.stations = info[3]
        fairness = info[4]
        if self.send_logs:
            self.experiment.log_metric("Round reward", np.mean(reward), step=step)
            self.experiment.log_metric("Per-ep reward", np.mean(cumulative_reward), step=step)
            self.experiment.log_metric("Megabytes sent", self.sent_mb, step=step)
            self.experiment.log_metric("Round megabytes sent", round_mb, step=step)
            self.experiment.log_metric("Chosen CW for legacy devices", CW, step=step)
            self.experiment.log_metric("Chosen CW for 802.11ax devices", CW_ax, step=step)
            self.experiment.log_metric("Station count", self.stations, step=step)
            self.experiment.log_metric("Current throughput", self.current_speed, step=step)
            self.experiment.log_metric("Fairness index", fairness, step=step)
            for i, obs in enumerate(observations):
                self.experiment.log_metric(f"Observation {i}", obs, step=step)
            self.experiment.log_metrics(loss, step=step)

    def log_episode(self, cumulative_reward, speed, step):
        """Log end-of-episode totals and reset per-episode accumulators."""
        if self.send_logs:
            self.experiment.log_metric("Cumulative reward", cumulative_reward, step=step)
            self.experiment.log_metric("Speed", speed, step=step)
        self.sent_mb = 0
        self.last_speed = speed
        self.speed_window = deque(maxlen=100)
        self.current_speed = 0

    def end(self):
        """Finalize the Comet experiment, if any."""
        if self.send_logs:
            self.experiment.end()
iterations = 0 start = time.time() best_valid_loss = -1 header = ' Time Epoch Iteration Progress (%Epoch) Loss' dev_log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:8.6f}'.split(',')) log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f}'.split(',')) print(header) with experiment.train(): for epoch in range(config["training"]["epochs"]): for batch_idx, (X_batch, y_batch) in enumerate(training_generator): X_batch, y_batch = X_batch.to(device), y_batch.to(device) X_batch, y_batch = X_batch.permute(1, 0, 2), y_batch.permute(1, 0, 2) train_loss = train(X_batch, y_batch, model, opt, criterion, config["clip"]) experiment.log_metric("train_loss", train_loss, step=iterations) # checkpoint model periodically if iterations % config["every"]["save"] == 0: snapshot_prefix = os.path.join(config["result_directory"], 'snapshot') snapshot_path = snapshot_prefix + '_loss_{:.6f}_iter_{}_model.pt'.format(train_loss, iterations) torch.save({ 'model': model.state_dict(), 'opt': opt.state_dict(), }, snapshot_path) for f in glob.glob(snapshot_prefix + '*'): if f != snapshot_path: os.remove(f) # evaluate performance on validation set periodically if iterations % config["every"]["validate"] == 0:
class CometLogger(Logger):
    """Logger subclass that mirrors metrics and snapshots to Comet.ml.

    Falls back through: PrintExperiment (dry-run) -> ExistingExperiment
    (resume) -> Experiment (online) -> OfflineExperiment (no API key).
    """

    def __init__(
        self,
        batch_size: int,
        snapshot_dir: Optional[str] = None,
        snapshot_mode: str = "last",
        snapshot_gap: int = 1,
        exp_set: Optional[str] = None,
        use_print_exp: bool = False,
        saved_exp: Optional[str] = None,
        **kwargs,
    ):
        """
        :param kwargs: passed to comet's Experiment at init.
        """
        if use_print_exp:
            self.experiment = PrintExperiment()
        else:
            # Imported lazily so a dry run never touches comet_ml.
            from comet_ml import Experiment, ExistingExperiment, OfflineExperiment

            if saved_exp:
                # Resume a previously-created Comet experiment by key.
                self.experiment = ExistingExperiment(
                    previous_experiment=saved_exp, **kwargs
                )
            else:
                try:
                    self.experiment = Experiment(**kwargs)
                except ValueError:  # no API key
                    log_dir = Path.home() / "logs"
                    log_dir.mkdir(exist_ok=True)
                    self.experiment = OfflineExperiment(offline_directory=str(log_dir))

        # "complete" is flipped to True by shutdown() on a clean exit.
        self.experiment.log_parameter("complete", False)
        if exp_set:
            self.experiment.log_parameter("exp_set", exp_set)
        if snapshot_dir:
            # Namespace snapshots by experiment key to avoid collisions.
            snapshot_dir = Path(snapshot_dir) / self.experiment.get_key()
        # log_traj_window (int): How many trajectories to hold in deque for computing performance statistics.
        self.log_traj_window = 100
        # Running totals accumulated across the whole run; keys other than
        # cum_completed_trajs are summed out of incoming traj_infos.
        self._cum_metrics = {
            "n_unsafe_actions": 0,
            "constraint_used": 0,
            "cum_completed_trajs": 0,
            "logging_time": 0,
        }
        self._new_completed_trajs = 0
        self._last_step = 0
        self._start_time = self._last_time = time()
        self._last_snapshot_upload = 0
        # Minimum seconds between snapshot uploads (30 min).
        # NOTE(review): attribute name has a typo ("snaphot") — kept as-is.
        self._snaphot_upload_time = 30 * 60
        super().__init__(batch_size, snapshot_dir, snapshot_mode, snapshot_gap)

    def log_fast(
        self,
        step: int,
        traj_infos: Sequence[Dict[str, float]],
        opt_info: Optional[Tuple[Sequence[float], ...]] = None,
        test: bool = False,
    ) -> None:
        """Cheap per-iteration accounting; no Comet traffic.

        Accumulates trajectory counts and cumulative metric keys only.
        """
        if not traj_infos:
            return
        start = time()
        self._new_completed_trajs += len(traj_infos)
        self._cum_metrics["cum_completed_trajs"] += len(traj_infos)
        # TODO: do we need to support sum(t[k]) if key in k?
        # without that, this doesn't include anything from extra eval samplers
        for key in self._cum_metrics:
            if key == "cum_completed_trajs":
                continue
            self._cum_metrics[key] += sum(t.get(key, 0) for t in traj_infos)
        self._cum_metrics["logging_time"] += time() - start

    def log(
        self,
        step: int,
        traj_infos: Sequence[Dict[str, float]],
        opt_info: Optional[Tuple[Sequence[float], ...]] = None,
        test: bool = False,
    ):
        """Full logging pass: fast accounting plus Comet metric uploads.

        When ``test`` is True metrics are logged inside Comet's test()
        context and the train-only throughput stats are skipped.
        """
        self.log_fast(step, traj_infos, opt_info, test)
        start = time()
        with (self.experiment.test() if test else nullcontext()):
            # Convert iteration count to environment-step count.
            step *= self.batch_size
            if opt_info is not None:
                # grad norm is left on the GPU for some reason
                # https://github.com/astooke/rlpyt/issues/163
                self.experiment.log_metrics(
                    {
                        k: np.mean(v)
                        for k, v in opt_info._asdict().items()
                        if k != "gradNorm"
                    },
                    step=step,
                )
            if traj_infos:
                # Average non-cumulative keys across trajectories.
                agg_vals = {}
                for key in traj_infos[0].keys():
                    if key in self._cum_metrics:
                        continue
                    agg_vals[key] = sum(t[key] for t in traj_infos) / len(traj_infos)
                self.experiment.log_metrics(agg_vals, step=step)
            if not test:
                now = time()
                self.experiment.log_metrics(
                    {
                        "new_completed_trajs": self._new_completed_trajs,
                        "steps_per_second":
                        (step - self._last_step) / (now - self._last_time),
                    },
                    step=step,
                )
                self._last_time = now
                self._last_step = step
                self._new_completed_trajs = 0
            self.experiment.log_metrics(self._cum_metrics, step=step)
        self._cum_metrics["logging_time"] += time() - start

    def log_metric(self, name, val):
        """Forward a single scalar metric to Comet."""
        self.experiment.log_metric(name, val)

    def log_parameters(self, parameters):
        """Forward a mapping of parameters to Comet."""
        self.experiment.log_parameters(parameters)

    def log_config(self, config):
        """Log the run config as one JSON-encoded parameter string."""
        self.experiment.log_parameter("config", json.dumps(convert_dict(config)))

    def upload_snapshot(self):
        """Upload the most recent snapshot file as a Comet asset."""
        if self.snapshot_dir:
            self.experiment.log_asset(self._previous_snapshot_fname)

    def save_itr_params(
        self, step: int, params: Dict[str, Any], metric: Optional[float] = None
    ) -> None:
        """Save params via the base class, then rate-limit snapshot uploads."""
        super().save_itr_params(step, params, metric)
        now = time()
        if now - self._last_snapshot_upload > self._snaphot_upload_time:
            self._last_snapshot_upload = now
            self.upload_snapshot()

    def shutdown(self, error: bool = False) -> None:
        """Finish the experiment; on clean exit upload the final snapshot
        and mark the run complete."""
        if not error:
            self.upload_snapshot()
            self.experiment.log_parameter("complete", True)
        self.experiment.end()
labels = Variable(labels) # Forward + Backward + Optimize optimizer.zero_grad() outputs = rnn(images) loss = criterion(outputs, labels) loss.backward() optimizer.step() # Compute train accuracy _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += float((predicted == labels.data).sum()) # Log accuracy to Comet.ml experiment.log_metric("accuracy", correct / total, step=step) step += 1 if (i + 1) % 100 == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch + 1, hyper_params['num_epochs'], i + 1, len(train_dataset) // hyper_params['batch_size'], loss.data.item())) experiment.log_epoch_end(epoch) with experiment.test(): # Test the Model correct = 0 total = 0 for images, labels in test_loader: images = Variable( images.view(-1, hyper_params['sequence_length'],