Example #1
from comet_ml import Experiment, ExistingExperiment
import matplotlib.pyplot as plt
import numpy as np
import umap

# COMET_API_KEY, COMET_WORKSPACE, PROJECT_NAME, colormap and
# SpeakerVerificationDataset are defined elsewhere in the original module.
class CometLogger:
    def __init__(self, enabled, is_existing=False, prev_exp_key=None):
        """
        Handles logging of experiment to comet and also persistence to local file system.
        Supports resumption of stopped experiments.
        """
        disabled = not enabled

        if not is_existing:
            self.experiment = Experiment(api_key=COMET_API_KEY,
                                         workspace=COMET_WORKSPACE,
                                         project_name=PROJECT_NAME,
                                         disabled=disabled)
        else:
            if prev_exp_key is None:
                raise ValueError(
                    "Requested existing experiment, but no key provided")
            print("Continuing existing experiment with key: ", prev_exp_key)
            self.experiment = ExistingExperiment(
                api_key=COMET_API_KEY,
                workspace=COMET_WORKSPACE,
                project_name=PROJECT_NAME,
                disabled=disabled,
                previous_experiment=prev_exp_key)
        self.disabled = disabled

    def get_experiment_key(self):
        return self.experiment.get_key()[:9]

    def add_tag(self, tag):
        self.experiment.add_tag(tag)

    def log_metric(self, name, value, step=None):
        self.experiment.log_metric(name, value, step=step)

    def log_metrics(self, metrics_dict, prefix, step=None):
        self.experiment.log_metrics(metrics_dict, prefix=prefix, step=step)

    def log_params(self, params_dict):
        self.experiment.log_parameters(params_dict)

    def set_name(self, name_str):
        self.experiment.set_name(name_str)

    def log_dataset(self, dataset: SpeakerVerificationDataset):
        if self.disabled:
            return
        dataset_string = ""
        dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
        dataset_string += "\n" + dataset.get_logs()
        dataset_string = dataset_string.replace("\n", "<br>")
        # The original snippet wrote to a leftover visdom handle (self.vis),
        # which this class never creates; log the HTML through Comet instead.
        self.experiment.log_html("<h3>Dataset</h3>" + dataset_string)

    def log_implementation(self, params):
        if self.disabled:
            return
        implementation_string = ""
        for param, value in params.items():
            implementation_string += "<b>%s</b>: %s\n" % (param, value)
        implementation_string = implementation_string.replace("\n", "<br>")
        self.implementation_string = implementation_string
        # The original snippet wrote to a leftover visdom handle (self.vis),
        # which this class never creates; log the HTML through Comet instead.
        self.experiment.log_html(
            "<h3>Training implementation</h3>" + implementation_string)

    def draw_projections(self,
                         embeds,
                         utterances_per_speaker,
                         step,
                         out_fpath=None,
                         max_speakers=16):
        if self.disabled:
            return
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]

        n_speakers = len(embeds) // utterances_per_speaker
        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
        colors = [colormap[i] for i in ground_truth]

        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        plt.title("UMAP projection (step %d)" % step)
        if out_fpath is not None:
            plt.savefig(out_fpath)
            # Only log the image when a file was actually written; the original
            # called log_image(None) whenever out_fpath was omitted.
            self.experiment.log_image(out_fpath, step=step)
        plt.clf()
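A minimal usage sketch, assuming COMET_API_KEY, COMET_WORKSPACE and PROJECT_NAME are set in the module; the run name and hyperparameters below are illustrative:

logger = CometLogger(enabled=True)
logger.set_name("speaker-encoder-run")
logger.add_tag("baseline")
logger.log_params({"learning_rate": 1e-4, "utterances_per_speaker": 10})
logger.log_metric("loss", 0.42, step=100)

# Resuming later needs the full Comet experiment key; note that
# get_experiment_key() returns only a 9-character prefix.
# logger = CometLogger(enabled=True, is_existing=True, prev_exp_key="<full key>")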
Example #2
# Imports as in fairseq's train.py, from which this excerpt is taken (assumed);
# train() and validate() are defined later in the same file.
from comet_ml import ExistingExperiment

import math

import numpy as np
import torch
from fairseq import checkpoint_utils, distributed_utils, tasks, utils
from fairseq.meters import StopwatchMeter
from fairseq.trainer import Trainer


def main(args, config=None, init_distributed=False):
    utils.import_user_module(args)

    experiment = None
    if config:
        experiment = ExistingExperiment(
            api_key=config["api_key"],
            previous_experiment=config["experiment_key"],
            auto_output_logging=None,
        )

    assert (
        args.max_tokens is not None or args.max_sentences is not None
    ), "Must specify batch size either with --max-tokens or --max-sentences"

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)
    if experiment:
        experiment.log_parameters(vars(args),
                                  prefix="Device {} :: ".format(
                                      args.device_id))

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print("| model {}, criterion {}".format(args.arch,
                                            criterion.__class__.__name__))
    print("| num. model params: {} (num. trained: {})".format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    if experiment:
        experiment.log_parameters(
            {
                "criterion":
                criterion.__class__.__name__,
                "num. model params":
                sum(p.numel() for p in model.parameters()),
                "num. trained params":
                sum(p.numel() for p in model.parameters() if p.requires_grad),
            },
            prefix="Device {} :: ".format(args.device_id),
        )

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print("| training on {} GPUs".format(args.distributed_world_size))
    print("| max tokens per GPU = {} and max sentences per GPU = {}".format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(",")
    valid_losses = [None]  # defined up front so the final logging cannot hit a NameError
    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, experiment)

        if (not args.disable_validation
                and epoch_itr.epoch % args.validate_interval == 0):
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets, experiment)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        reload_dataset = ":" in getattr(args, "data", "")
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print("| done training in {:.1f} seconds".format(train_meter.sum))

    if experiment:
        experiment.log_metrics(
            {
                "valid_loss": valid_losses[0],
                "lr": lr
            },
            prefix="Device {} ".format(args.device_id),
        )
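One way the resume config for main might be produced, as a sketch; load_resume_config and the file name are hypothetical, but the dict keys match what main reads above:

import json

def load_resume_config(path="comet_resume.json"):
    """Return the saved {'api_key': ..., 'experiment_key': ...} dict, or None."""
    try:
        with open(path) as f:
            return json.load(f)
    except FileNotFoundError:
        return None

# config = load_resume_config()
# main(args, config=config)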
Example #3
from comet_ml import Experiment, ExistingExperiment

import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np

# COMET_API_KEY, COMET_WORKSPACE, PROJECT_NAME and colormap are defined
# elsewhere in the original module.
class CometLogger:
    def __init__(self, disabled, is_existing=False, prev_exp_key=None):
        """
        Handles logging of experiment to comet and also persistence to local file system.
        Supports resumption of stopped experiments.
        """

        if not is_existing:
            self.experiment = Experiment(api_key=COMET_API_KEY,
                                         workspace=COMET_WORKSPACE,
                                         project_name=PROJECT_NAME,
                                         disabled=disabled)
        else:
            if prev_exp_key is None:
                raise ValueError("Requested existing experiment, but no key provided")
            print("Continuing existing experiment with key: ", prev_exp_key)
            self.experiment = ExistingExperiment(api_key=COMET_API_KEY,
                                                 workspace=COMET_WORKSPACE,
                                                 project_name=PROJECT_NAME,
                                                 disabled=disabled,
                                                 previous_experiment=prev_exp_key)
        self.disabled = disabled
        self.name = None

    def get_experiment_key(self):
        return self.experiment.get_key()[:9]

    def add_tag(self, tag):
        self.experiment.add_tag(tag)

    def log_metric(self, name, value, step=None):
        self.experiment.log_metric(name, value, step=step)

    def log_metrics(self, metrics_dict, prefix, step=None):
        self.experiment.log_metrics(metrics_dict, prefix=prefix, step=step)

    def log_params(self, params_dict):
        self.experiment.log_parameters(params_dict)

    def set_name(self, name_str):
        self.experiment.set_name(name_str)
        self.name = name_str

    def save_act_grads(self, log_dict):
        """Save a dictionary of activation/gradients records to disk"""
        assert isinstance(log_dict, dict)
        if self.name is None:
            warnings.warn("Experiment name not set, not saving")
            return

        # Save the log dictionary
        file_name = f"./.{self.name}.record"
        with open(file_name, 'wb') as f:
            pickle.dump(log_dict, f)

    # TODO: need to rewrite before can be used for MNIST.
    def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
                         max_speakers=16):
        import umap
        if self.disabled:
            return
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]

        n_speakers = len(embeds) // utterances_per_speaker
        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
        colors = [colormap[i] for i in ground_truth]

        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        plt.title("UMAP projection (step %d)" % step)
        if out_fpath is not None:
            plt.savefig(out_fpath)
            # Only log the image when a file was actually written; the original
            # called log_image(None) whenever out_fpath was omitted.
            self.experiment.log_image(out_fpath, step=step)
        plt.clf()
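A sketch of the save_act_grads round trip, assuming the module constants (COMET_API_KEY and friends) are set; the run name and record contents are illustrative:

logger = CometLogger(disabled=True)
logger.set_name("mnist-baseline")
logger.save_act_grads({"layer1.grad_norm": [0.12, 0.08]})

# The record is a plain pickle in the working directory,
# named f"./.{name}.record" by save_act_grads:
with open("./.mnist-baseline.record", "rb") as f:
    record = pickle.load(f)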
Example #4
    valSSIM[epoch].append(ssim_val)
    valLoss[epoch].append(vLoss)
    # Tensorboard
    itr = int(trainIndex + epoch * (len(trainloader)))

    # writer.add_scalars(
    #     "Loss",
    #     {"trainLoss": iLoss / args.progress_iter, "validationLoss": vLoss},
    #     itr,
    # )
    # writer.add_scalar("PSNR", psnr, itr)

    # writer.add_image("Validation", valImg, itr)
    comet_exp.log_metrics(
        {"trainLoss": iLoss / args.progress_iter, "validationLoss": vLoss},
        step=itr,
        epoch=epoch,
    )
    comet_exp.log_metric("PSNR", psnr, step=itr, epoch=epoch)
    comet_exp.log_metric("SSIM", ssim_val, step=itr, epoch=epoch)
    # valImage = torch.movedim(valImg, 0, -1)
    # print(type(valImage))
    # print(valImage.shape)
    # print(valImage.max())
    # print(valImage.min())

    # comet_exp.log_image(
    #     valImage,
    #     name="iter: " + str(itr) + ";epoch: " + str(epoch),
    #     image_format="jpg",
    #     step=itr,
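This fragment runs inside a validation step; a minimal sketch of the state it assumes (the project name and loop skeleton are illustrative, not from the excerpt):

from comet_ml import Experiment

comet_exp = Experiment(project_name="video-interpolation")  # illustrative
valSSIM, valLoss = {}, {}

# for epoch in range(args.epochs):
#     valSSIM[epoch], valLoss[epoch] = [], []
#     for trainIndex, batch in enumerate(trainloader):
#         ...  # training accumulates iLoss; validation yields vLoss, psnr, ssim_val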
Example #5
from comet_ml import Experiment, ExistingExperiment

import json
import os

import numpy as np
from sklearn.metrics import classification_report

# COMET_KEY, PROJECT_NAME, COMET_SAVE_FILENAME, DATA_DIR and GEN_DIR are
# defined elsewhere in the original module.
class CometConnection:
    def __init__(self, comet_name=None, dataset_config=None, exp_key=None):
        self.experiment = None

        if comet_name is not None and dataset_config is not None:
            self._init_new_experiment(comet_name, dataset_config)
        elif exp_key is not None:
            self._init_continue_experiment(exp_key)

    def _init_new_experiment(self, comet_name, dataset_config):
        self.experiment = Experiment(api_key=COMET_KEY,
                                     project_name=PROJECT_NAME)
        self.experiment.set_name(comet_name)
        self.log_data_attributes(dataset_config)
        self.experiment.log_asset('datagen/spectra_generator.m')

    def _init_continue_experiment(self, exp_key):
        self.experiment = ExistingExperiment(api_key=COMET_KEY,
                                             previous_experiment=exp_key)

    def serialize(self):
        params = dict()
        params["comet_exp_key"] = self.experiment.get_key()

        return params

    def save(self, save_dir):
        info_dict = self.serialize()
        with open(os.path.join(save_dir, COMET_SAVE_FILENAME), "w") as f:
            json.dump(info_dict, f)

    def persist(self, config_path):
        # Re-initialize this object so it re-attaches to the experiment
        # whose key was saved at config_path.
        with open(config_path, "r") as f:
            info = json.load(f)
        self.__init__(exp_key=info["comet_exp_key"])

    def log_data_attributes(self, dataset_config):
        for key, value in dataset_config.items():
            self.experiment.log_parameter("SPECTRUM_" + key, value)

    def log_imgs(self, dataset_name):
        try:
            imgs_dir = os.path.join(DATA_DIR, dataset_name, 'imgs')
            self.experiment.log_asset_folder(imgs_dir)
        except Exception:
            print(f"No images found for dataset: {dataset_name}")

    def log_script(self, dataset_config):
        script_name = dataset_config['matlab_script']
        try:
            matlab_dir = os.path.join(GEN_DIR, script_name)
            self.experiment.log_asset(matlab_dir)
        except Exception:
            print(f"Could not find {script_name} under {GEN_DIR}.")

    def format_classification_report(self, classification_report):
        return {
            f'{k}_test_{metric}': metric_val
            for k, v in classification_report.items()
            for metric, metric_val in v.items()
        }

    def get_classification_report(self, y_test, preds):
        preds_formatted = np.argmax(preds, axis=1)
        test_formatted = np.argmax(y_test, axis=1)
        peak_labels = [
            f"n_peaks_{1 + num_peak}" for num_peak in range(y_test.shape[1])
        ]
        classif_report = classification_report(test_formatted,
                                               preds_formatted,
                                               target_names=peak_labels,
                                               output_dict=True)
        classif_report_str = classification_report(test_formatted,
                                                   preds_formatted,
                                                   target_names=peak_labels)

        if self.experiment is not None:
            formatted = self.format_classification_report(classif_report)
            self.experiment.log_metrics(formatted)
            self.experiment.log_text(classif_report_str)

        return classif_report
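A hypothetical save/resume round trip with CometConnection; the directory, run name and dataset_config values are illustrative:

conn = CometConnection(comet_name="spectra-run-1",
                       dataset_config={"n_points": 1024, "max_peaks": 4})
conn.save("./runs/spectra-run-1")  # writes COMET_SAVE_FILENAME with the key

resumed = CometConnection()
resumed.persist(os.path.join("./runs/spectra-run-1", COMET_SAVE_FILENAME))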