def convert_and_save_results(
    cfg: Config,
    cluster_label_path: Path,
    results: Tuple[Tensor, Tensor, Tensor],
    enc_path: Path,
    context_metrics: Optional[Dict[str, float]],
    test_metrics: Optional[Dict[str, float]] = None,
) -> Path:
    """Pack clustering outputs into a ``ClusterResults`` and save it.

    Args:
        cfg: full configuration; a flattened copy is stored with the results.
        cluster_label_path: forwarded to ``save_results`` as ``save_path``.
        results: triple ``(cluster_ids, s, y)``.
        enc_path: encoder path recorded alongside the results.
        context_metrics: metrics stored as ``context_metrics`` (may be None).
        test_metrics: metrics stored as ``test_metrics`` (may be None).

    Returns:
        The value returned by ``save_results``.
    """
    cluster_ids, s_labels, y_labels = results

    # an s dimension of 1 (or less) is counted as two classes
    s_dim = cfg.misc._s_dim
    num_s = s_dim if s_dim > 1 else 2

    true_class_ids = get_class_id(
        s=s_labels,
        y=y_labels,
        s_count=num_s,
        to_cluster=cfg.clust.cluster,
    )
    flat_cfg = flatten(
        OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))

    bundle = ClusterResults(
        flags=flat_cfg,
        cluster_ids=cluster_ids,
        class_ids=true_class_ids,
        enc_path=enc_path,
        context_metrics=context_metrics,
        test_metrics=test_metrics,
    )
    return save_results(save_path=cluster_label_path, cluster_results=bundle)
def solve_polynomials(order: int, num_range: int) -> None:
    """Solve a large number of polynomials and save the roots to a database.

    Every point of the Cartesian product of ``dimensions`` is handed to
    ``numpy.roots`` as a coefficient sequence.  Each result row consists of
    the comma-joined point followed by the flattened ``[real, imag]`` pairs
    of its roots.  Rows are written to ``./db.sqlite`` in batches.

    Args:
        order: first argument to ``utils.repeat_val`` when building the
            search space (presumably the number of coefficients -- confirm
            against ``utils.repeat_val``).
        num_range: the candidate values come from
            ``utils.sequence(-num_range, num_range)``.

    Insert errors are printed and otherwise ignored (best effort), exactly
    as before.  The database connection is now always closed, even when an
    unexpected exception interrupts the run.
    """
    dimensions = utils.repeat_val(order, utils.sequence(-num_range, num_range))
    space = itertools.product(*dimensions)
    total_count = utils.product(len(val) for val in dimensions)

    show_spash(total_count, dimensions)

    start = time.time()
    root_count = 0
    # argument shared by the CREATE TABLE and INSERT statement builders
    schema_arg = len(dimensions) - 1

    conn = sqlite3.connect('./db.sqlite')
    try:
        curse = conn.cursor()
        curse.execute('PRAGMA synchronous = OFF')

        curse.execute(create_table(schema_arg))
        conn.commit()

        def flush(batch):
            # write one batch; a failing insert is reported but not fatal
            try:
                curse.executemany(insert_row(schema_arg), batch)
            except Exception as err:
                print(err)
            conn.commit()

        solutions = []

        for point in space:
            root_count += 1

            point_id = ','.join(map(str, point))
            roots = [[root.real, root.imag] for root in numpy.roots(point)]

            solutions.append([point_id] + utils.flatten(roots))

            # -- write the solutions to a database occasionally
            if len(solutions) > constants['batch_size']:
                flush(solutions)
                solutions = []

            display_progress(root_count, total_count, start)

        # -- flush the remaining records
        flush(solutions)
    finally:
        # -- close the DB connection, also on failure
        conn.close()
def train(
    cfg: Config,
    encoder: Encoder,
    context_data: Dataset,
    num_clusters: int,
    s_count: int,
    enc_path: Path,
) -> ClusterResults:
    """Cluster the encoded context set with k-means (faiss) and report metrics.

    Args:
        cfg: full configuration (reads ``misc._device``, ``clust.epochs`` and
            ``clust.cluster``).
        encoder: encoder passed to ``encode_dataset``.
        context_data: dataset to encode and cluster.
        num_clusters: number of k-means clusters.
        s_count: number of ``s`` classes used when computing class IDs.
        enc_path: encoder path recorded in the returned results.

    Returns:
        A ``ClusterResults`` holding the predicted cluster IDs, the class IDs
        and the context metrics.
    """
    to_cluster = cfg.clust.cluster

    # encode everything, then pull the whole set out as a single batch
    encoded_set = encode_dataset(cfg, context_data, encoder)
    single_batch_loader = DataLoader(encoded_set,
                                     batch_size=len(encoded_set),
                                     shuffle=False)
    features, s, y = next(iter(single_batch_loader))

    on_gpu = str(cfg.misc._device) != "cpu"
    preds = run_kmeans_faiss(
        features,
        nmb_clusters=num_clusters,
        cuda=on_gpu,
        n_iter=cfg.clust.epochs,
        verbose=True,
    )
    cluster_ids = preds.cpu().numpy()

    empty_counts = np.zeros((num_clusters, num_clusters), dtype=np.int64)
    counts, class_ids = count_occurances(empty_counts, cluster_ids, s, y,
                                         s_count, to_cluster)
    _, context_metrics, logging_dict = cluster_metrics(
        cluster_ids=cluster_ids,
        counts=counts,
        true_class_ids=class_ids.numpy(),
        num_total=preds.size(0),
        s_count=s_count,
        to_cluster=to_cluster,
    )

    # floats are shortened to 5 significant digits in the log line
    entries = []
    for key, value in logging_dict.items():
        if isinstance(value, float):
            entries.append(f"{key}: {value:.5g}")
        else:
            entries.append(f"{key}: {value}")
    log.info(" | ".join(entries))
    wandb_log(cfg.misc, logging_dict, step=0)

    log.info("Context metrics:")
    prefixed_metrics = {}
    for key, value in context_metrics.items():
        prefixed_metrics[f"Context {key}"] = value
    print_metrics(prefixed_metrics)

    flat_cfg = flatten(
        OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))
    true_class_ids = get_class_id(s=s,
                                  y=y,
                                  s_count=s_count,
                                  to_cluster=to_cluster)
    return ClusterResults(
        flags=flat_cfg,
        cluster_ids=preds,
        class_ids=true_class_ids,
        enc_path=enc_path,
        context_metrics=context_metrics,
    )
def save_model(cfg: Config,
               save_dir: Path,
               model: Model,
               epoch: int,
               sha: str,
               best: bool = False) -> Path:
    """Write a checkpoint of ``model`` into ``save_dir``.

    Args:
        cfg: full configuration; stored flattened under the ``"args"`` key.
        save_dir: directory that receives the checkpoint file.
        model: model whose ``state_dict()`` is saved.
        epoch: epoch number, stored and used in the default file name.
        sha: commit hash stored with the checkpoint.
        best: if True, write to ``checkpt_best.pth`` instead of the
            per-epoch file name.

    Returns:
        Path of the file that was written.
    """
    basename = "checkpt_best.pth" if best else f"checkpt_epoch{epoch}.pth"
    filename = save_dir / basename

    flat_cfg = flatten(
        OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))
    checkpoint = {"args": flat_cfg, "sha": sha}
    checkpoint["model"] = model.state_dict()
    checkpoint["epoch"] = epoch

    torch.save(checkpoint, filename)
    return filename
Example #5
0
 def users(self):
     """Return the title text of every user entry whose login is not suspended.

     An entry is kept when ``entry.login.suspended`` equals the string
     ``'false'``.
     """
     feeds = self.apps_client.GetGeneratorForAllUsers()
     entries = utils.flatten([feed.entry for feed in feeds])
     active_titles = []
     for entry in entries:
         if entry.login.suspended == 'false':
             active_titles.append(entry.title.text)
     return active_titles
def main(cfg: Config,
         cluster_label_file: Optional[Path] = None) -> Tuple[Model, Path]:
    """Train the clustering model (or run k-means) and save the cluster labels.

    Args:
        cfg: full configuration.  Its ``clust``, ``data``, ``enc`` and
            ``misc`` sections are also published as the module globals
            ``ARGS``, ``DATA``, ``ENC`` and ``MISC`` (``cfg`` itself as
            ``CFG``).
        cluster_label_file: path to a pth file with cluster IDs; if given it
            overrides ``cfg.misc.cluster_label_file``.

    Returns:
        * ``(model, path)`` after training, or after restoring a checkpoint
          with ``MISC.evaluate`` set;
        * ``((), path)`` when ``ARGS.method`` is k-means;
        * ``None`` when the encoder was fitted here with ``ARGS.enc_wandb``
          enabled (the function stops after saving the encoder).

        ``path`` is the value returned by ``save_results`` /
        ``convert_and_save_results``.

    Raises:
        ValueError: if RotNet is requested for non-image data, or if
            ``ARGS.pseudo_labeler`` / ``ARGS.method`` hold a value that this
            function does not handle.
    """
    # ==== initialize globals ====
    global ARGS, CFG, DATA, ENC, MISC
    ARGS = cfg.clust
    CFG = cfg
    DATA = cfg.data
    ENC = cfg.enc
    MISC = cfg.misc

    # ==== current git commit ====
    if os.environ.get("STARTED_BY_GUILDAI", None) == "1":
        sha = ""
    else:
        repo = git.Repo(search_parent_directories=True)
        sha = repo.head.object.hexsha

    use_gpu = torch.cuda.is_available() and MISC.gpu >= 0
    random_seed(MISC.seed, use_gpu)
    if cluster_label_file is not None:
        MISC.cluster_label_file = str(cluster_label_file)

    run = None
    if MISC.use_wandb:
        group = ""
        if MISC.log_method:
            group += MISC.log_method
        if MISC.exp_group:
            group += "." + MISC.exp_group
        if cfg.bias.log_dataset:
            group += "." + cfg.bias.log_dataset
        run = wandb.init(
            entity="anonymous",
            project="fcm-hydra",
            config=flatten(
                OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
            group=group if group else None,
            reinit=True,
        )

    # every invocation gets its own timestamped output directory
    save_dir = Path(to_absolute_path(MISC.save_dir)) / str(time.time())
    save_dir.mkdir(parents=True, exist_ok=True)

    log.info(str(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True)))
    log.info(f"Save directory: {save_dir.resolve()}")
    # ==== check GPU ====
    MISC._device = f"cuda:{MISC.gpu}" if use_gpu else "cpu"
    device = torch.device(MISC._device)
    log.info(
        f"{torch.cuda.device_count()} GPUs available. Using device '{device}'")

    # ==== construct dataset ====
    datasets: DatasetTriplet = load_dataset(CFG)
    log.info("Size of context-set: {}, training-set: {}, test-set: {}".format(
        len(datasets.context),
        len(datasets.train),
        len(datasets.test),
    ))
    ARGS.test_batch_size = ARGS.test_batch_size or ARGS.batch_size
    # Scale the context batch size by the context/train size ratio.  The
    # rounded value can reach 0 for a small context set, which DataLoader
    # rejects, so it is clamped to at least 1.
    context_batch_size = max(
        1,
        round(ARGS.batch_size * len(datasets.context) / len(datasets.train)),
    )
    context_loader = DataLoader(
        datasets.context,
        shuffle=True,
        batch_size=context_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )
    enc_train_data = ConcatDataset([datasets.context, datasets.train])
    if ARGS.encoder == Enc.rotnet:
        enc_train_loader = DataLoader(
            RotationPrediction(enc_train_data, apply_all=True),
            shuffle=True,
            batch_size=ARGS.batch_size,
            num_workers=MISC.num_workers,
            pin_memory=True,
            collate_fn=adaptive_collate,
        )
    else:
        enc_train_loader = DataLoader(
            enc_train_data,
            shuffle=True,
            batch_size=ARGS.batch_size,
            num_workers=MISC.num_workers,
            pin_memory=True,
        )

    train_loader = DataLoader(
        datasets.train,
        shuffle=True,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )
    val_loader = DataLoader(
        datasets.test,
        shuffle=False,
        batch_size=ARGS.test_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )

    # ==== construct networks ====
    input_shape = get_data_dim(context_loader)
    s_count = datasets.s_dim if datasets.s_dim > 1 else 2
    y_count = datasets.y_dim if datasets.y_dim > 1 else 2
    if ARGS.cluster == CL.s:
        num_clusters = s_count
    elif ARGS.cluster == CL.y:
        num_clusters = y_count
    else:
        num_clusters = s_count * y_count
    log.info(
        f"Number of clusters: {num_clusters}, accuracy computed with respect to {ARGS.cluster.name}"
    )
    mappings: List[str] = []
    for i in range(num_clusters):
        if ARGS.cluster == CL.s:
            mappings.append(f"{i}: s = {i}")
        elif ARGS.cluster == CL.y:
            mappings.append(f"{i}: y = {i}")
        else:
            # class_id = y * s_count + s
            mappings.append(f"{i}: (y = {i // s_count}, s = {i % s_count})")
    log.info("class IDs:\n\t" + "\n\t".join(mappings))
    feature_group_slices = getattr(datasets.context, "feature_group_slices",
                                   None)

    # ================================= encoder =================================
    encoder: Encoder
    enc_shape: Tuple[int, ...]
    if ARGS.encoder in (Enc.ae, Enc.vae):
        encoder, enc_shape = build_ae(CFG, input_shape, feature_group_slices)
    else:
        if len(input_shape) < 2:
            raise ValueError("RotNet can only be applied to image data.")
        enc_optimizer_kwargs = {"lr": ARGS.enc_lr, "weight_decay": ARGS.enc_wd}
        enc_kwargs = {
            "pretrained": False,
            "num_classes": 4,
            "zero_init_residual": True
        }
        if DATA.dataset == DS.cmnist:
            net = resnet18(**enc_kwargs)
        else:
            net = resnet50(**enc_kwargs)

        encoder = SelfSupervised(model=net,
                                 num_classes=4,
                                 optimizer_kwargs=enc_optimizer_kwargs)
        enc_shape = (512, )
        encoder.to(device)

    log.info(f"Encoding shape: {enc_shape}")

    enc_path: Path
    if ARGS.enc_path:
        # load a previously trained encoder
        enc_path = Path(ARGS.enc_path)
        if ARGS.encoder == Enc.rotnet:
            assert isinstance(encoder, SelfSupervised)
            encoder = encoder.get_encoder()
        save_dict = torch.load(ARGS.enc_path,
                               map_location=lambda storage, loc: storage)
        encoder.load_state_dict(save_dict["encoder"])
        if "args" in save_dict:
            args_encoder = save_dict["args"]
            assert ARGS.encoder.name == args_encoder["encoder_type"]
            assert ENC.levels == args_encoder["levels"]
    else:
        # train the encoder from scratch and save it into the run directory
        encoder.fit(enc_train_loader,
                    epochs=ARGS.enc_epochs,
                    device=device,
                    use_wandb=ARGS.enc_wandb)
        if ARGS.encoder == Enc.rotnet:
            assert isinstance(encoder, SelfSupervised)
            encoder = encoder.get_encoder()
        # the args names follow the convention of the standalone VAE commandline args
        args_encoder = {
            "encoder_type": ARGS.encoder.name,
            "levels": ENC.levels
        }
        enc_path = save_dir.resolve() / "encoder"
        torch.save({
            "encoder": encoder.state_dict(),
            "args": args_encoder
        }, enc_path)
        log.info(f"To make use of this encoder:\n--enc-path {enc_path}")
        if ARGS.enc_wandb:
            log.info("Stopping here because W&B will be messed up...")
            if run is not None:
                # this allows multiple experiments in one python process
                run.finish()
            # NOTE: this path returns None, not the annotated tuple
            return

    cluster_label_path = get_cluster_label_path(MISC, save_dir)
    if ARGS.method == Meth.kmeans:
        kmeans_results = train_k_means(CFG, encoder, datasets.context,
                                       num_clusters, s_count, enc_path)
        pth = save_results(save_path=cluster_label_path,
                           cluster_results=kmeans_results)
        if run is not None:
            # this allows multiple experiments in one python process
            run.finish()
        # k-means has no model to return; callers receive an empty tuple
        return (), pth
    if ARGS.finetune_encoder:
        encoder.freeze_initial_layers(ARGS.freeze_layers, {
            "lr": ARGS.finetune_lr,
            "weight_decay": ARGS.weight_decay
        })

    # ================================= labeler =================================
    pseudo_labeler: PseudoLabeler
    if ARGS.pseudo_labeler == PL.ranking:
        pseudo_labeler = RankingStatistics(k_num=ARGS.k_num)
    elif ARGS.pseudo_labeler == PL.cosine:
        pseudo_labeler = CosineSimThreshold(
            upper_threshold=ARGS.upper_threshold,
            lower_threshold=ARGS.lower_threshold)
    else:
        # fail here instead of with an UnboundLocalError further down
        raise ValueError(
            f"{ARGS.pseudo_labeler} is not a supported pseudo labeler")

    # ================================= method =================================
    method: Method
    if ARGS.method == Meth.pl_enc:
        method = PseudoLabelEnc()
    elif ARGS.method == Meth.pl_output:
        method = PseudoLabelOutput()
    elif ARGS.method == Meth.pl_enc_no_norm:
        method = PseudoLabelEncNoNorm()
    else:
        # fail here instead of with an UnboundLocalError further down
        raise ValueError(f"{ARGS.method} is not a supported method")

    # ================================= classifier =================================
    clf_optimizer_kwargs = {"lr": ARGS.lr, "weight_decay": ARGS.weight_decay}
    clf_fn = FcNet(hidden_dims=ARGS.cl_hidden_dims)
    clf_input_shape = (prod(enc_shape), )  # FcNet first flattens the input

    classifier = build_classifier(
        input_shape=clf_input_shape,
        target_dim=s_count if ARGS.use_multi_head else num_clusters,
        model_fn=clf_fn,
        optimizer_kwargs=clf_optimizer_kwargs,
        num_heads=y_count if ARGS.use_multi_head else 1,
    )
    classifier.to(device)

    model: Union[Model, MultiHeadModel]
    if ARGS.use_multi_head:
        labeler_fn: ModelFn
        if DATA.dataset == DS.cmnist:
            labeler_fn = Mp32x23Net(batch_norm=True)
        elif DATA.dataset == DS.celeba:
            labeler_fn = Mp64x64Net(batch_norm=True)
        else:
            labeler_fn = FcNet(hidden_dims=ARGS.labeler_hidden_dims)

        labeler_optimizer_kwargs = {
            "lr": ARGS.labeler_lr,
            "weight_decay": ARGS.labeler_wd
        }
        labeler: Classifier = build_classifier(
            input_shape=input_shape,
            target_dim=s_count,
            model_fn=labeler_fn,
            optimizer_kwargs=labeler_optimizer_kwargs,
        )
        labeler.to(device)
        log.info("Fitting the labeler to the labeled data.")
        labeler.fit(
            train_loader,
            epochs=ARGS.labeler_epochs,
            device=device,
            use_wandb=ARGS.labeler_wandb,
        )
        labeler.eval()
        model = MultiHeadModel(
            encoder=encoder,
            classifiers=classifier,
            method=method,
            pseudo_labeler=pseudo_labeler,
            labeler=labeler,
            train_encoder=ARGS.finetune_encoder,
        )
    else:
        model = Model(
            encoder=encoder,
            classifier=classifier,
            method=method,
            pseudo_labeler=pseudo_labeler,
            train_encoder=ARGS.finetune_encoder,
        )

    start_epoch = 1  # start at 1 so that the val_freq works correctly
    # Resume from checkpoint
    if MISC.resume is not None:
        log.info("Restoring generator from checkpoint")
        model, start_epoch = restore_model(CFG, Path(MISC.resume), model)
        if MISC.evaluate:
            pth_path = convert_and_save_results(
                CFG,
                cluster_label_path,
                classify_dataset(CFG, model, datasets.context),
                enc_path=enc_path,
                context_metrics={},  # TODO: compute this
            )
            if run is not None:
                # this allows multiple experiments in one python process
                run.finish()
            return model, pth_path

    # Logging
    num_parameters = count_parameters(model)
    log.info(f"Number of trainable parameters: {num_parameters}")

    best_acc = 0.0
    n_vals_without_improvement = 0

    itr = 0
    # Train generator for N epochs
    for epoch in range(start_epoch, start_epoch + ARGS.epochs):
        # early stopping is only active when ARGS.early_stopping is positive
        if n_vals_without_improvement > ARGS.early_stopping > 0:
            break

        itr = train(model=model,
                    context_data=context_loader,
                    train_data=train_loader,
                    epoch=epoch)

        if epoch % ARGS.val_freq == 0:
            val_acc, _, val_log = validate(model, val_loader)

            if val_acc > best_acc:
                best_acc = val_acc
                save_model(CFG,
                           save_dir,
                           model,
                           epoch=epoch,
                           sha=sha,
                           best=True)
                n_vals_without_improvement = 0
            else:
                n_vals_without_improvement += 1

            prepare = (f"{k}: {v:.5g}" if isinstance(v, float) else f"{k}: {v}"
                       for k, v in val_log.items())
            log.info("[VAL] Epoch {:04d} | {} | "
                     "No improvement during validation: {:02d}".format(
                         epoch,
                         " | ".join(prepare),
                         n_vals_without_improvement,
                     ))
            wandb_log(MISC, val_log, step=itr)

    log.info("Training has finished.")
    _, test_metrics, _ = validate(model, val_loader)
    _, context_metrics, _ = validate(model, context_loader)
    log.info("Test metrics:")
    print_metrics({f"Test {k}": v for k, v in test_metrics.items()})
    log.info("Context metrics:")
    print_metrics({f"Context {k}": v for k, v in context_metrics.items()})
    pth_path = convert_and_save_results(
        CFG,
        cluster_label_path=cluster_label_path,
        results=classify_dataset(CFG, model, datasets.context),
        enc_path=enc_path,
        context_metrics=context_metrics,
        test_metrics=test_metrics,
    )
    if run is not None:
        run.finish()  # this allows multiple experiments in one python process
    return model, pth_path
Example #7
0
def main(cfg: Config, cluster_label_file: Optional[Path] = None) -> Generator:
    """Main function.

    Args:
        cluster_label_file: path to a pth file with cluster IDs
        initialize_wandb: if False, we assume that W&B has already been initialized

    Returns:
        the trained generator
    """
    # ==== initialize globals ====
    global ARGS, CFG, DATA, ENC, MISC
    ARGS = cfg.fdm
    CFG = cfg
    DATA = cfg.data
    ENC = cfg.enc
    MISC = cfg.misc

    # ==== current git commit ====
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha

    use_gpu = torch.cuda.is_available() and MISC.gpu >= 0
    random_seed(MISC.seed, use_gpu)
    if cluster_label_file is not None:
        MISC.cluster_label_file = str(cluster_label_file)

    run = None
    if MISC.use_wandb:
        project_suffix = f"-{DATA.dataset.name}" if DATA.dataset != DS.cmnist else ""
        group = ""
        if MISC.log_method:
            group += MISC.log_method
        if MISC.exp_group:
            group += "." + MISC.exp_group
        if cfg.bias.log_dataset:
            group += "." + cfg.bias.log_dataset
        run = wandb.init(
            entity="anonymous",
            project="fdm-hydra" + project_suffix,
            config=flatten(
                OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
            group=group if group else None,
            reinit=True,
        )

    save_dir = Path(to_absolute_path(MISC.save_dir)) / str(time.time())
    save_dir.mkdir(parents=True, exist_ok=True)

    log.info(str(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True)))
    log.info(f"Save directory: {save_dir.resolve()}")
    # ==== check GPU ====
    MISC._device = f"cuda:{MISC.gpu}" if use_gpu else "cpu"
    device = torch.device(MISC._device)
    log.info(
        f"{torch.cuda.device_count()} GPUs available. Using device '{device}'")

    # ==== construct dataset ====
    datasets: DatasetTriplet = load_dataset(CFG)
    log.info("Size of context-set: {}, training-set: {}, test-set: {}".format(
        len(datasets.context),
        len(datasets.train),
        len(datasets.test),
    ))
    ARGS.test_batch_size = ARGS.test_batch_size if ARGS.test_batch_size else ARGS.batch_size
    s_count = max(datasets.s_dim, 2)

    cluster_results = None
    cluster_test_metrics: Dict[str, float] = {}
    cluster_context_metrics: Dict[str, float] = {}
    if MISC.cluster_label_file:
        cluster_results = load_results(CFG)
        cluster_test_metrics = cluster_results.test_metrics or {}
        cluster_context_metrics = cluster_results.context_metrics or {}
        weights, n_clusters, min_count, max_count = weight_for_balance(
            cluster_results.cluster_ids,
            min_size=None if ARGS.oversample else ARGS.batch_size)
        # if ARGS.oversample, oversample the smaller clusters instead of undersample the larger ones
        num_samples = n_clusters * max_count if ARGS.oversample else n_clusters * min_count
        assert num_samples > ARGS.batch_size, "not enough samples for a batch"
        context_sampler = WeightedRandomSampler(weights,
                                                num_samples,
                                                replacement=ARGS.oversample)
        dataloader_kwargs = dict(sampler=context_sampler)
    elif ARGS.balanced_context:
        context_sampler = build_weighted_sampler_from_dataset(
            dataset=datasets.context,
            s_count=s_count,
            test_batch_size=ARGS.test_batch_size,
            batch_size=ARGS.batch_size,
            num_workers=0,  # can easily get stuck with more workers
            oversample=ARGS.oversample,
            balance_hierarchical=False,
        )
        dataloader_kwargs = dict(sampler=context_sampler, shuffle=False)
    else:
        dataloader_kwargs = dict(shuffle=True)

    context_loader = DataLoader(
        datasets.context,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
        drop_last=True,
        **dataloader_kwargs,
    )

    train_sampler = build_weighted_sampler_from_dataset(
        dataset=datasets.train,
        s_count=s_count,
        test_batch_size=ARGS.test_batch_size,
        batch_size=ARGS.batch_size,
        num_workers=0,  # can easily get stuck with more workers
        oversample=ARGS.oversample,
        balance_hierarchical=True,
    )
    train_loader = DataLoader(
        dataset=datasets.train,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        drop_last=True,
        shuffle=False,
        sampler=train_sampler,
        pin_memory=True,
    )
    test_loader = DataLoader(
        datasets.test,
        shuffle=False,
        batch_size=ARGS.test_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    context_data_itr = inf_generator(context_loader)
    train_data_itr = inf_generator(train_loader)
    # ==== construct networks ====
    input_shape = next(context_data_itr)[0][0].shape
    is_image_data = len(input_shape) > 2

    feature_group_slices = getattr(datasets.context, "feature_group_slices",
                                   None)

    if is_image_data:
        decoding_dim = input_shape[
            0] * 256 if ENC.recon_loss == RL.ce else input_shape[0]
        # if ARGS.recon_loss == "ce":
        decoder_out_act = None
        # else:
        #     decoder_out_act = nn.Sigmoid() if ARGS.dataset == "cmnist" else nn.Tanh()
        encoder, decoder, enc_shape = conv_autoencoder(
            input_shape,
            ENC.init_chans,
            encoding_dim=ENC.out_dim,
            decoding_dim=decoding_dim,
            levels=ENC.levels,
            decoder_out_act=decoder_out_act,
            variational=ARGS.vae,
        )
    else:
        encoder, decoder, enc_shape = fc_autoencoder(
            input_shape,
            ENC.init_chans,
            encoding_dim=ENC.out_dim,
            levels=ENC.levels,
            variational=ARGS.vae,
        )

    if ARGS.enc_snorm:

        def _snorm(_module: nn.Module) -> nn.Module:
            if hasattr(_module, "weight"):
                return torch.nn.utils.spectral_norm(_module)
            return _module

        encoder.apply(_snorm)

    recon_loss_fn_: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
    if ENC.recon_loss == RL.l1:
        recon_loss_fn_ = nn.L1Loss(reduction="sum")
    elif ENC.recon_loss == RL.l2:
        recon_loss_fn_ = nn.MSELoss(reduction="sum")
    elif ENC.recon_loss == RL.bce:
        recon_loss_fn_ = nn.BCELoss(reduction="sum")
    elif ENC.recon_loss == RL.huber:
        recon_loss_fn_ = lambda x, y: 0.1 * F.smooth_l1_loss(
            x * 10, y * 10, reduction="sum")
    elif ENC.recon_loss == RL.ce:
        recon_loss_fn_ = PixelCrossEntropy(reduction="sum")
    elif ENC.recon_loss == RL.mixed:
        assert feature_group_slices is not None, "can only do multi gen_loss with feature groups"
        recon_loss_fn_ = MixedLoss(feature_group_slices, reduction="sum")
    else:
        raise ValueError(
            f"{ENC.recon_loss} is an invalid reconstruction gen_loss")

    recon_loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
    if ARGS.vgg_weight != 0:
        vgg_loss = VGGLoss()
        vgg_loss.to(device)

        def recon_loss_fn(input_: torch.Tensor,
                          target: torch.Tensor) -> torch.Tensor:
            return recon_loss_fn_(
                input_, target) + ARGS.vgg_weight * vgg_loss(input_, target)

    else:
        recon_loss_fn = recon_loss_fn_

    generator: Generator
    if ARGS.use_inn:
        autoencoder = build_ae(CFG,
                               encoder,
                               decoder,
                               encoding_size=None,
                               feature_group_slices=feature_group_slices)
        if prod(enc_shape) == enc_shape[0]:
            is_enc_image_data = False
            log.info("Encoding will not be treated as image data.")
        else:
            is_enc_image_data = is_image_data
        generator = build_inn(
            cfg=CFG,
            autoencoder=autoencoder,
            ae_loss_fn=recon_loss_fn,
            is_image_data=is_enc_image_data,
            save_dir=save_dir,
            ae_enc_shape=enc_shape,
            context_loader=context_loader,
        )
        encoding_size = generator.encoding_size
    else:
        zs_dim = round(ARGS.zs_frac * enc_shape[0])
        zy_dim = enc_shape[0] - zs_dim
        encoding_size = EncodingSize(zs=zs_dim, zy=zy_dim)
        generator = build_ae(
            cfg=CFG,
            encoder=encoder,
            decoder=decoder,
            encoding_size=encoding_size,
            feature_group_slices=feature_group_slices,
        )
        # load pretrained encoder if one is provided
        if ARGS.use_pretrained_enc and cluster_results is not None:
            save_dict = torch.load(cluster_results.enc_path,
                                   map_location=lambda storage, loc: storage)
            generator.load_state_dict(save_dict["encoder"])
            if "args" in save_dict:
                args_encoder = save_dict["args"]
                assert args_encoder[
                    "encoder_type"] == "vae" if ARGS.vae else "ae"
                assert args_encoder["levels"] == ENC.levels

    log.info(f"Encoding shape: {enc_shape}, {encoding_size}")

    # ================================== Initialise Discriminator =================================

    disc_optimizer_kwargs = {"lr": ARGS.disc_lr}
    disc_input_shape: Tuple[
        int, ...] = input_shape if ARGS.train_on_recon else enc_shape
    # FIXME: Architectures need to be GAN specific (e.g. incorporate spectral norm)
    disc_fn: ModelFn
    if is_image_data and ARGS.train_on_recon:
        if DATA.dataset == DS.cmnist:
            disc_fn = Strided28x28Net(batch_norm=False)
        else:
            disc_fn = Residual64x64Net(batch_norm=False)
    else:
        disc_fn = FcNet(hidden_dims=ARGS.disc_hidden_dims)
        # FcNet first flattens the input
        disc_input_shape = ((prod(disc_input_shape), ) if isinstance(
            disc_input_shape, Sequence) else disc_input_shape)

    if ARGS.batch_wise_loss != BWLoss.none:
        final_proj = FcNet(ARGS.batch_wise_hidden_dims
                           ) if ARGS.batch_wise_hidden_dims else None
        aggregator: Aggregator
        if ARGS.batch_wise_loss == BWLoss.attention:
            aggregator = AttentionAggregator(ARGS.batch_wise_latent,
                                             final_proj=final_proj)
        elif ARGS.batch_wise_loss == BWLoss.simple:
            aggregator = SimpleAggregator(latent_dim=ARGS.batch_wise_latent,
                                          final_proj=final_proj)
        elif ARGS.batch_wise_loss == BWLoss.transposed:
            aggregator = SimpleAggregatorT(batch_dim=ARGS.batch_size,
                                           final_proj=final_proj)

        disc_fn = ModelAggregatorWrapper(disc_fn,
                                         aggregator,
                                         embed_dim=ARGS.batch_wise_latent)

    components: Union[AeComponents, InnComponents]
    disc: Classifier
    if not ARGS.use_inn:
        disc_list = []
        for k in range(ARGS.num_discs):
            disc = build_discriminator(
                input_shape=disc_input_shape,
                target_dim=1,  # real vs fake
                model_fn=disc_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            disc_list.append(disc)
        disc_ensemble = nn.ModuleList(disc_list)
        disc_ensemble.to(device)

        predictor_y = build_discriminator(
            input_shape=(
                prod(enc_shape), ),  # this is always trained on encodings
            target_dim=datasets.y_dim,
            model_fn=FcNet(hidden_dims=None),  # no hidden layers
            optimizer_kwargs=disc_optimizer_kwargs,
        )
        predictor_y.to(device)

        predictor_s = build_discriminator(
            input_shape=(
                prod(enc_shape), ),  # this is always trained on encodings
            target_dim=datasets.s_dim,
            model_fn=FcNet(hidden_dims=None),  # no hidden layers
            optimizer_kwargs=disc_optimizer_kwargs,
        )
        predictor_s.to(device)

        components = AeComponents(
            generator=generator,
            disc_ensemble=disc_ensemble,
            recon_loss_fn=recon_loss_fn,
            predictor_y=predictor_y,
            predictor_s=predictor_s,
        )
    else:
        disc_list = []
        for k in range(ARGS.num_discs):
            disc = build_discriminator(
                input_shape=disc_input_shape,
                target_dim=1,  # real vs fake
                model_fn=disc_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            disc_list.append(disc)
        disc_ensemble = nn.ModuleList(disc_list)
        disc_ensemble.to(device)

        # classifier for y
        class_fn: ModelFn
        if is_image_data:
            if DATA.dataset == DS.cmnist:
                class_fn = Strided28x28Net(batch_norm=False)
            else:
                class_fn = Residual64x64Net(batch_norm=False)
        else:
            class_fn = FcNet(hidden_dims=ARGS.disc_hidden_dims)
        predictor = None
        if ARGS.train_on_recon and ARGS.pred_y_weight > 0:
            predictor = build_discriminator(
                input_shape=input_shape,
                target_dim=datasets.y_dim,  # real vs fake
                model_fn=class_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            predictor.to(device)
            predictor.fit(Subset(datasets.context, np.arange(100)), 50, device,
                          test_loader)
        components = InnComponents(inn=generator,
                                   disc_ensemble=disc_ensemble,
                                   predictor=predictor)

    start_itr = 1  # start at 1 so that the val_freq works correctly
    # Resume from checkpoint
    if MISC.resume is not None:
        log.info("Restoring generator from checkpoint")
        generator, start_itr = restore_model(CFG, Path(MISC.resume), generator)
        if MISC.evaluate:
            log_metrics(
                CFG,
                generator,
                datasets,
                0,
                save_to_csv=Path(to_absolute_path(MISC.save_dir)),
                cluster_test_metrics=cluster_test_metrics,
                cluster_context_metrics=cluster_context_metrics,
            )
            if run is not None:
                run.finish(
                )  # this allows multiple experiments in one python process
            return generator

    # Logging
    log.info(f"Number of trainable parameters: {count_parameters(generator)}")

    itr = start_itr
    disc: nn.Module
    loss_meters: Optional[Dict[str, AverageMeter]] = None
    start_time = time.monotonic()

    for itr in range(start_itr, ARGS.iters + 1):

        logging_dict = train_step(
            components=components,
            context_data_itr=context_data_itr,
            train_data_itr=train_data_itr,
            itr=itr,
        )
        if loss_meters is None:
            loss_meters = {name: AverageMeter() for name in logging_dict}
        for name, value in logging_dict.items():
            loss_meters[name].update(value)

        if itr % ARGS.log_freq == 0:
            assert loss_meters is not None
            log_string = " | ".join(f"{name}: {loss.avg:.5g}"
                                    for name, loss in loss_meters.items())
            elapsed = time.monotonic() - start_time
            log.info(
                "[TRN] Iteration {:04d} | Elapsed: {} | Iterations/s: {:.4g} | {}"
                .format(
                    itr,
                    readable_duration(elapsed),
                    ARGS.log_freq / elapsed,
                    log_string,
                ))

            loss_meters = None
            start_time = time.monotonic()

        if ARGS.validate and itr % ARGS.val_freq == 0:
            if itr == ARGS.val_freq:  # first validation
                baseline_metrics(CFG,
                                 datasets,
                                 save_to_csv=Path(
                                     to_absolute_path(MISC.save_dir)))
            log_metrics(CFG, model=generator, data=datasets, step=itr)
            save_model(CFG, save_dir, model=generator, itr=itr, sha=sha)

        if ARGS.disc_reset_prob > 0:
            for k, discriminator in enumerate(components.disc_ensemble):
                if np.random.uniform() < ARGS.disc_reset_prob:
                    log.info(f"Reinitializing discriminator {k}")
                    discriminator.reset_parameters()

    log.info("Training has finished.")
    # path = save_model(args, save_dir, model=generator, epoch=epoch, sha=sha)
    # generator, _ = restore_model(args, path, model=generator)
    log_metrics(
        CFG,
        model=generator,
        data=datasets,
        save_to_csv=Path(to_absolute_path(MISC.save_dir)),
        step=itr,
        cluster_test_metrics=cluster_test_metrics,
        cluster_context_metrics=cluster_context_metrics,
    )
    if run is not None:
        run.finish()  # this allows multiple experiments in one python process
    return generator
Example #8
0
 def users(self):
     """Return the title text of every user entry that is not suspended.

     Collects the ``entry`` attribute of each item yielded by
     ``self.apps_client.GetGeneratorForAllUsers()``, flattens them with
     ``utils.flatten``, and keeps the entries whose ``login.suspended``
     equals the string ``'false'``.
     """
     feeds = self.apps_client.GetGeneratorForAllUsers()
     entries = utils.flatten([feed.entry for feed in feeds])

     active_titles = []
     for entry in entries:
         # The suspended flag is compared as the literal string 'false'.
         if entry.login.suspended == 'false':
             active_titles.append(entry.title.text)
     return active_titles