class CometWriter:
    """Thin wrapper around a Comet experiment used to log a training run.

    Depending on the constructor arguments the writer creates either a
    ``CometExperiment`` (online) or a ``CometOfflineExperiment`` (offline)
    and forwards metrics, hyper-parameters, code, figures and histograms
    to it.

    NOTE: ``self.mode`` holds ``"online"``/``"offline"`` right after
    construction and is overwritten with the phase name (e.g. ``'train'``)
    by :meth:`set_step`; the phase name is what :meth:`add_scalar` uses as
    the metric tag suffix.
    """

    def __init__(self,
                 logger,
                 project_name: Optional[str] = None,
                 experiment_name: Optional[str] = None,
                 api_key: Optional[str] = None,
                 log_dir: Optional[str] = None,
                 offline: bool = False,
                 **kwargs):
        """Create the underlying Comet experiment.

        Args:
            logger: Object with a ``warning`` method, used to report a
                misconfiguration before raising.
            project_name: Forwarded to the experiment as ``project_name``.
            experiment_name: If truthy, set as the experiment name.
            api_key: Comet API key; required for online mode.
            log_dir: Directory for offline logs; required for offline mode.
            offline: Only consulted when both ``api_key`` and ``log_dir``
                are given; selects offline mode when true.
            **kwargs: Extra keyword arguments forwarded to the experiment
                constructor.

        Raises:
            ImportError: If ``comet_ml`` is not available.
            ValueError: If neither ``api_key`` nor ``log_dir`` is given.
        """
        if not _COMET_AVAILABLE:
            raise ImportError(
                "You want to use `comet_ml` logger which is not installed yet,"
                " install it with `pip install comet-ml`.")

        self.project_name = project_name
        self.experiment_name = experiment_name
        self.kwargs = kwargs
        self.timer = Timer()
        # Defined up front so add_scalar() is usable before the first
        # set_step() call instead of failing with AttributeError.
        self.step = None
        self.epoch = None

        if api_key is None and log_dir is None:
            # Previously this path only warned and then crashed on the
            # unset ``self.mode``; fail with an explicit, accurate message.
            message = ("CometWriter requires either api_key or log_dir "
                       "during initialization.")
            logger.warning(message)
            raise ValueError(message)

        self.api_key = api_key
        self.log_dir = log_dir
        if api_key is not None and log_dir is not None:
            # Both backends are possible: let the caller choose.
            self.mode = "offline" if offline else "online"
        elif api_key is not None:
            self.mode = "online"
        else:
            self.mode = "offline"

        if self.mode == "online":
            self.experiment = CometExperiment(
                api_key=self.api_key,
                project_name=self.project_name,
                **self.kwargs,
            )
        else:
            self.experiment = CometOfflineExperiment(
                offline_directory=self.log_dir,
                project_name=self.project_name,
                **self.kwargs,
            )

        if self.experiment_name:
            self.experiment.set_name(self.experiment_name)

    def set_step(self, step, epoch=None, mode='train') -> None:
        """Record the current step/epoch/phase and log the step rate.

        At ``step == 0`` the timer is reset; otherwise the time returned by
        ``self.timer.check()`` is converted to a ``steps_per_sec`` metric.

        Args:
            step: Current step.
            epoch: Current epoch, if any.
            mode: Phase name appended to metric tags by :meth:`add_scalar`.
        """
        self.mode = mode
        self.step = step
        self.epoch = epoch
        if step == 0:
            self.timer.reset()
            return
        duration = self.timer.check()
        # A zero duration would raise ZeroDivisionError; skip the rate
        # metric for that step rather than abort the training loop.
        if duration > 0:
            self.add_scalar({'steps_per_sec': 1 / duration})

    def log_hyperparams(self, params: Dict[str, Any]) -> None:
        """Forward a dictionary of hyper-parameters to the experiment."""
        self.experiment.log_parameters(params)

    def log_code(self, file_name=None, folder='models/') -> None:
        """Forward ``file_name`` and ``folder`` to the experiment's ``log_code``."""
        self.experiment.log_code(file_name=file_name, folder=folder)

    def add_scalar(self,
                   metrics: Dict[str, Union[torch.Tensor, float]],
                   step: Optional[int] = None,
                   epoch: Optional[int] = None) -> None:
        """Log metrics, tagging each key as ``"<key>/<self.mode>"``.

        Tensor values are moved to CPU and detached before logging.

        Args:
            metrics: Mapping of metric name to value.
            step: Step to log at. When omitted and ``epoch`` is also
                omitted, the step recorded by :meth:`set_step` is used.
            epoch: Epoch to log at. When omitted, the epoch recorded by
                :meth:`set_step` is used.
        """
        metrics_renamed = {}
        for key, val in metrics.items():
            tag = '{}/{}'.format(key, self.mode)
            metrics_renamed[tag] = val.cpu().detach() if is_tensor(val) else val

        if epoch is None:
            # Honour an explicit ``step`` (it used to be ignored), falling
            # back to the step stored by set_step().
            current_step = self.step if step is None else step
            self.experiment.log_metrics(metrics_renamed,
                                        step=current_step,
                                        epoch=self.epoch)
        else:
            self.experiment.log_metrics(metrics_renamed, step=step, epoch=epoch)

    def add_plot(self, figure_name, figure):
        """Log a figure; primarily used for logging gate plots."""
        self.experiment.log_figure(figure_name=figure_name, figure=figure)

    def add_hist3d(self, hist, name):
        """Log a 3D histogram; primarily used for logging gate plots."""
        self.experiment.log_histogram_3d(hist, name=name)

    def reset_experiment(self):
        """Drop the reference to the current experiment."""
        self.experiment = None

    def finalize(self) -> None:
        """End the experiment and drop it; safe to call more than once."""
        if self.experiment is not None:
            self.experiment.end()
        self.reset_experiment()
def train(config, weights, ntrain, ntest, nepochs, recreate, prefix, plot_freq, customize, comet_offline):
    """Train a model defined by config.

    Parses the configuration, sets up either Horovod or a
    ``tf.distribute`` strategy, optionally attaches a comet-ml experiment,
    builds the train/test/validation datasets and finally calls
    ``model.fit``.

    Args:
        config: Path to the configuration file.
        weights: Forwarded to ``parse_config`` and ``model_scope``.
        ntrain: If truthy, number of training steps to take per epoch.
        ntest: If truthy, number of test steps to take per epoch.
        nepochs: Forwarded to ``parse_config``.
        recreate: Not used in this function body.
        prefix: Prefix of the experiment directory name.
        plot_freq: If truthy, overrides ``config["callbacks"]["plot_freq"]``.
        customize: If truthy, key into ``customization_functions`` selecting
            a function that transforms the parsed config.
        comet_offline: Use a comet-ml ``OfflineExperiment`` instead of a
            streaming ``Experiment``.
    """
    # tf.debugging.enable_check_numerics()
    config_file_path = config
    config, config_file_stem = parse_config(config, nepochs=nepochs, weights=weights)

    if plot_freq:
        config["callbacks"]["plot_freq"] = plot_freq

    if customize:
        customize_fn = customization_functions[customize]
        config = customize_fn(config)

    # Decide tf.distribute.strategy depending on number of available GPUs
    horovod_enabled = config["setup"]["horovod_enabled"]
    if horovod_enabled:
        num_gpus = initialize_horovod()
    else:
        strategy, num_gpus = get_strategy()

    # Only the first Horovod rank (or a non-Horovod run) creates the
    # experiment directory; other ranks keep an empty outdir.
    outdir = ""
    if not horovod_enabled or hvd.rank() == 0:
        outdir = create_experiment_dir(prefix=prefix + config_file_stem + "_", suffix=platform.node())
        # Copy the config file to the train dir for later reference
        shutil.copy(config_file_path, outdir + "/config.yaml")

    # Options shared by the online and the offline comet-ml experiment.
    comet_options = {
        "project_name": "particleflow-tf",
        "auto_metric_logging": True,
        "auto_param_logging": True,
        "auto_histogram_weight_logging": True,
        "auto_histogram_gradient_logging": False,
        "auto_histogram_activation_logging": False,
    }
    try:
        if comet_offline:
            print("Using comet-ml OfflineExperiment, saving logs locally.")
            from comet_ml import OfflineExperiment

            experiment = OfflineExperiment(offline_directory=outdir + "/cometml", **comet_options)
        else:
            print("Using comet-ml Experiment, streaming logs to www.comet.ml.")
            from comet_ml import Experiment

            experiment = Experiment(**comet_options)
    except Exception as e:
        # Best effort: training proceeds without a dashboard.
        print("Failed to initialize comet-ml dashboard: {}".format(e))
        experiment = None

    if experiment:
        experiment.set_name(outdir)
        for code_path in ("mlpf/tfmodel/model.py", "mlpf/tfmodel/utils.py", config_file_path):
            experiment.log_code(code_path)

    train_test_cfg = config["train_test_datasets"]
    ds_train, num_train_steps = get_datasets(train_test_cfg, config, num_gpus, "train")
    ds_test, num_test_steps = get_datasets(train_test_cfg, config, num_gpus, "test")
    ds_val, ds_info = get_heptfds_dataset(
        config["validation_datasets"][0],
        config,
        num_gpus,
        "test",
        config["setup"]["num_events_validation"],
        supervised=False,
    )
    ds_val = ds_val.batch(5)

    if ntrain:
        ds_train = ds_train.take(ntrain)
        num_train_steps = ntrain
    if ntest:
        ds_test = ds_test.take(ntest)
        num_test_steps = ntest

    print("num_train_steps", num_train_steps)
    print("num_test_steps", num_test_steps)
    total_steps = num_train_steps * config["setup"]["num_epochs"]
    print("total_steps", total_steps)

    if horovod_enabled:
        model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights, horovod_enabled)
    else:
        with strategy.scope():
            model, optim_callbacks, initial_epoch = model_scope(config, total_steps, weights)

    callbacks = prepare_callbacks(
        config,
        outdir,
        ds_val,
        comet_experiment=experiment,
        horovod_enabled=config["setup"]["horovod_enabled"],
    )

    verbose = 1
    if horovod_enabled:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        # Only the first rank prints progress.
        verbose = 1 if hvd.rank() == 0 else 0

        world_size = hvd.size()
        num_train_steps /= world_size
        num_test_steps /= hvd.size()

    callbacks.append(optim_callbacks)

    model.fit(
        ds_train.repeat(),
        validation_data=ds_test.repeat(),
        epochs=initial_epoch + config["setup"]["num_epochs"],
        callbacks=callbacks,
        steps_per_epoch=num_train_steps,
        validation_steps=num_test_steps,
        initial_epoch=initial_epoch,
        verbose=verbose,
    )