def convert_and_save_results(
    cfg: Config,
    cluster_label_path: Path,
    results: Tuple[Tensor, Tensor, Tensor],
    enc_path: Path,
    context_metrics: Optional[Dict[str, float]],
    test_metrics: Optional[Dict[str, float]] = None,
) -> Path:
    """Package clustering results together with the run config and save them.

    Args:
        cfg: full hydra config of this run.
        cluster_label_path: file the results should be written to.
        results: tuple of (cluster assignments, s labels, y labels).
        enc_path: path to the encoder checkpoint that produced the clusters.
        context_metrics: metrics computed on the context set, if any.
        test_metrics: metrics computed on the test set, if any.

    Returns:
        The path the results were saved to.
    """
    cluster_ids, s_labels, y_labels = results
    # a one-dimensional s is treated as binary
    s_count = max(cfg.misc._s_dim, 2)
    class_ids = get_class_id(
        s=s_labels, y=y_labels, s_count=s_count, to_cluster=cfg.clust.cluster
    )
    flags = flatten(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True))
    to_save = ClusterResults(
        flags=flags,
        cluster_ids=cluster_ids,
        class_ids=class_ids,
        enc_path=enc_path,
        context_metrics=context_metrics,
        test_metrics=test_metrics,
    )
    return save_results(save_path=cluster_label_path, cluster_results=to_save)
def solve_polynomials(order: int, num_range: int) -> None:
    """Solve a large number of polynomials and save their roots to a database.

    Every polynomial whose coefficients each lie in [-num_range, num_range] is
    solved with ``numpy.roots``, and the (real, imag) parts of its roots are
    written to a local SQLite database in batches.

    Args:
        order: polynomial order; fixes the number of coefficient dimensions.
        num_range: coefficients range over ``[-num_range, num_range]``.
    """
    dimensions = utils.repeat_val(order, utils.sequence(-num_range, num_range))
    space = itertools.product(*dimensions)
    total_count = utils.product(len(val) for val in dimensions)
    show_spash(total_count, dimensions)

    start = time.time()
    solved_count = 0
    num_roots = len(dimensions) - 1  # an order-n polynomial has n roots
    insert_stmt = insert_row(num_roots)  # hoisted: invariant across batches

    conn = sqlite3.connect('./db.sqlite')
    try:
        cursor = conn.cursor()
        cursor.execute('PRAGMA synchronous = OFF')
        cursor.execute(create_table(num_roots))
        conn.commit()

        solutions = []
        for point in space:
            solved_count += 1
            # the coefficient tuple doubles as the row's primary key
            row_id = ','.join(map(str, point))
            roots = [[root.real, root.imag] for root in numpy.roots(point)]
            solutions.append([row_id] + utils.flatten(roots))

            # -- write the solutions to a database occasionally
            if len(solutions) > constants['batch_size']:
                try:
                    cursor.executemany(insert_stmt, solutions)
                except sqlite3.Error as err:
                    # best-effort: report and drop the failed batch
                    print(err)
                conn.commit()
                solutions = []
                display_progress(solved_count, total_count, start)

        # -- flush the remaining records
        try:
            cursor.executemany(insert_stmt, solutions)
        except sqlite3.Error as err:
            print(err)
        conn.commit()
    finally:
        # -- close the DB connection even if solving or writing raised
        conn.close()
def train(
    cfg: Config,
    encoder: Encoder,
    context_data: Dataset,
    num_clusters: int,
    s_count: int,
    enc_path: Path,
) -> ClusterResults:
    """Cluster the encoded context set with k-means and report the metrics.

    Args:
        cfg: full hydra config of this run.
        encoder: trained encoder used to embed ``context_data``.
        context_data: dataset to be clustered.
        num_clusters: number of k-means clusters to fit.
        s_count: cardinality of the s label (used to form class IDs).
        enc_path: path of the encoder checkpoint, stored in the results.

    Returns:
        A ``ClusterResults`` bundle with the cluster assignments, true class
        IDs, the flattened config and the context-set metrics.
    """
    # encode the training set with the encoder
    encoded = encode_dataset(cfg, context_data, encoder)
    # create data loader with one giant batch so everything is clustered at once
    data_loader = DataLoader(encoded, batch_size=len(encoded), shuffle=False)
    encoded, s, y = next(iter(data_loader))
    preds = run_kmeans_faiss(
        encoded,
        nmb_clusters=num_clusters,
        cuda=str(cfg.misc._device) != "cpu",
        n_iter=cfg.clust.epochs,
        verbose=True,
    )
    cluster_ids = preds.cpu().numpy()
    # preds, _ = run_kmeans_torch(encoded, num_clusters, device=args._device, n_iter=args.epochs, verbose=True)
    # confusion-style matrix: cluster assignment vs. true class ID
    counts = np.zeros((num_clusters, num_clusters), dtype=np.int64)
    counts, class_ids = count_occurances(counts, cluster_ids, s, y, s_count, cfg.clust.cluster)
    _, context_metrics, logging_dict = cluster_metrics(
        cluster_ids=cluster_ids,
        counts=counts,
        true_class_ids=class_ids.numpy(),
        num_total=preds.size(0),
        s_count=s_count,
        to_cluster=cfg.clust.cluster,
    )
    # floats get 5 significant digits; everything else is printed verbatim
    prepared = (f"{k}: {v:.5g}" if isinstance(v, float) else f"{k}: {v}" for k, v in logging_dict.items())
    log.info(" | ".join(prepared))
    # step=0 because k-means is a one-shot procedure, not an iterative run
    wandb_log(cfg.misc, logging_dict, step=0)
    log.info("Context metrics:")
    print_metrics({f"Context {k}": v for k, v in context_metrics.items()})
    return ClusterResults(
        flags=flatten(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
        cluster_ids=preds,
        class_ids=get_class_id(s=s, y=y, s_count=s_count, to_cluster=cfg.clust.cluster),
        enc_path=enc_path,
        context_metrics=context_metrics,
    )
def save_model(cfg: Config, save_dir: Path, model: Model, epoch: int, sha: str, best: bool = False) -> Path:
    """Write a checkpoint containing the model weights, config and git SHA.

    Args:
        cfg: full hydra config of this run (flattened into the checkpoint).
        save_dir: directory the checkpoint file is placed in.
        model: model whose ``state_dict`` is serialized.
        epoch: current epoch, stored and used in the default filename.
        sha: git commit hash of the code that produced the checkpoint.
        best: if True, write to the fixed "best" filename instead.

    Returns:
        The path of the written checkpoint file.
    """
    name = "checkpt_best.pth" if best else f"checkpt_epoch{epoch}.pth"
    filename = save_dir / name
    checkpoint = {
        "args": flatten(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
        "sha": sha,
        "model": model.state_dict(),
        "epoch": epoch,
    }
    torch.save(checkpoint, filename)
    return filename
def users(self):
    """Return the titles of all user entries whose login is not suspended."""
    entries = utils.flatten(
        [u.entry for u in self.apps_client.GetGeneratorForAllUsers()])
    active = []
    for entry in entries:
        # the API reports suspension status as the string 'false'/'true'
        if entry.login.suspended == 'false':
            active.append(entry.title.text)
    return active
def main(cfg: Config, cluster_label_file: Optional[Path] = None) -> Tuple[Model, Path]:
    """Train the clustering model end-to-end and save the cluster labels.

    Args:
        cfg: full hydra config of this run.
        cluster_label_file: path to a pth file with cluster IDs; if given, it
            overrides ``cfg.misc.cluster_label_file``.

    Returns:
        The trained model and the path of the saved cluster-label file.
        NOTE(review): the k-means branch returns ``((), pth)`` and the
        ``enc_wandb`` branch returns None — callers must tolerate both;
        TODO confirm against call sites.
    """
    # ==== initialize globals ====
    global ARGS, CFG, DATA, ENC, MISC
    ARGS = cfg.clust
    CFG = cfg
    DATA = cfg.data
    ENC = cfg.enc
    MISC = cfg.misc

    # ==== current git commit ====
    # under guild.ai there may be no git checkout, so skip the SHA lookup
    if os.environ.get("STARTED_BY_GUILDAI", None) == "1":
        sha = ""
    else:
        repo = git.Repo(search_parent_directories=True)
        sha = repo.head.object.hexsha

    use_gpu = torch.cuda.is_available() and MISC.gpu >= 0
    random_seed(MISC.seed, use_gpu)
    if cluster_label_file is not None:
        MISC.cluster_label_file = str(cluster_label_file)

    run = None
    if MISC.use_wandb:
        # the W&B group name is assembled from method / experiment / dataset tags
        group = ""
        if MISC.log_method:
            group += MISC.log_method
        if MISC.exp_group:
            group += "." + MISC.exp_group
        if cfg.bias.log_dataset:
            group += "." + cfg.bias.log_dataset
        run = wandb.init(
            entity="anonymous",
            project="fcm-hydra",
            config=flatten(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
            group=group if group else None,
            reinit=True,
        )

    # timestamped save directory so repeated runs never collide
    save_dir = Path(to_absolute_path(MISC.save_dir)) / str(time.time())
    save_dir.mkdir(parents=True, exist_ok=True)

    log.info(str(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True)))
    log.info(f"Save directory: {save_dir.resolve()}")
    # ==== check GPU ====
    MISC._device = f"cuda:{MISC.gpu}" if use_gpu else "cpu"
    device = torch.device(MISC._device)
    log.info(f"{torch.cuda.device_count()} GPUs available. Using device '{device}'")

    # ==== construct dataset ====
    datasets: DatasetTriplet = load_dataset(CFG)
    log.info("Size of context-set: {}, training-set: {}, test-set: {}".format(
        len(datasets.context),
        len(datasets.train),
        len(datasets.test),
    ))
    ARGS.test_batch_size = ARGS.test_batch_size if ARGS.test_batch_size else ARGS.batch_size
    # scale the context batch size so both loaders finish an epoch together
    context_batch_size = round(ARGS.batch_size * len(datasets.context) / len(datasets.train))
    context_loader = DataLoader(
        datasets.context,
        shuffle=True,
        batch_size=context_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )
    # the encoder is (pre)trained on context + train data combined
    enc_train_data = ConcatDataset([datasets.context, datasets.train])
    if ARGS.encoder == Enc.rotnet:
        enc_train_loader = DataLoader(
            RotationPrediction(enc_train_data, apply_all=True),
            shuffle=True,
            batch_size=ARGS.batch_size,
            num_workers=MISC.num_workers,
            pin_memory=True,
            collate_fn=adaptive_collate,
        )
    else:
        enc_train_loader = DataLoader(
            enc_train_data,
            shuffle=True,
            batch_size=ARGS.batch_size,
            num_workers=MISC.num_workers,
            pin_memory=True,
        )
    train_loader = DataLoader(
        datasets.train,
        shuffle=True,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )
    val_loader = DataLoader(
        datasets.test,
        shuffle=False,
        batch_size=ARGS.test_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
    )

    # ==== construct networks ====
    input_shape = get_data_dim(context_loader)
    # one-dimensional s/y are treated as binary
    s_count = datasets.s_dim if datasets.s_dim > 1 else 2
    y_count = datasets.y_dim if datasets.y_dim > 1 else 2
    # number of clusters depends on what we cluster with respect to: s, y, or both
    if ARGS.cluster == CL.s:
        num_clusters = s_count
    elif ARGS.cluster == CL.y:
        num_clusters = y_count
    else:
        num_clusters = s_count * y_count
    log.info(f"Number of clusters: {num_clusters}, accuracy computed with respect to {ARGS.cluster.name}")
    # human-readable mapping from cluster index to (s, y) combination, for the log
    mappings: List[str] = []
    for i in range(num_clusters):
        if ARGS.cluster == CL.s:
            mappings.append(f"{i}: s = {i}")
        elif ARGS.cluster == CL.y:
            mappings.append(f"{i}: y = {i}")
        else:
            # class_id = y * s_count + s
            mappings.append(f"{i}: (y = {i // s_count}, s = {i % s_count})")
    log.info("class IDs:\n\t" + "\n\t".join(mappings))

    feature_group_slices = getattr(datasets.context, "feature_group_slices", None)

    # ================================= encoder =================================
    encoder: Encoder
    enc_shape: Tuple[int, ...]
    if ARGS.encoder in (Enc.ae, Enc.vae):
        encoder, enc_shape = build_ae(CFG, input_shape, feature_group_slices)
    else:
        if len(input_shape) < 2:
            raise ValueError("RotNet can only be applied to image data.")
        enc_optimizer_kwargs = {"lr": ARGS.enc_lr, "weight_decay": ARGS.enc_wd}
        # 4 classes = the four rotations the self-supervised task predicts
        enc_kwargs = {"pretrained": False, "num_classes": 4, "zero_init_residual": True}
        net = resnet18(**enc_kwargs) if DATA.dataset == DS.cmnist else resnet50(**enc_kwargs)
        encoder = SelfSupervised(model=net, num_classes=4, optimizer_kwargs=enc_optimizer_kwargs)
        enc_shape = (512, )
    encoder.to(device)
    log.info(f"Encoding shape: {enc_shape}")

    enc_path: Path
    if ARGS.enc_path:
        # load a pre-trained encoder from disk instead of training one
        enc_path = Path(ARGS.enc_path)
        if ARGS.encoder == Enc.rotnet:
            assert isinstance(encoder, SelfSupervised)
            encoder = encoder.get_encoder()
        save_dict = torch.load(ARGS.enc_path, map_location=lambda storage, loc: storage)
        encoder.load_state_dict(save_dict["encoder"])
        if "args" in save_dict:
            # sanity-check that the checkpoint matches the current config
            args_encoder = save_dict["args"]
            assert ARGS.encoder.name == args_encoder["encoder_type"]
            assert ENC.levels == args_encoder["levels"]
    else:
        encoder.fit(enc_train_loader, epochs=ARGS.enc_epochs, device=device, use_wandb=ARGS.enc_wandb)
        if ARGS.encoder == Enc.rotnet:
            assert isinstance(encoder, SelfSupervised)
            encoder = encoder.get_encoder()
        # the args names follow the convention of the standalone VAE commandline args
        args_encoder = {"encoder_type": ARGS.encoder.name, "levels": ENC.levels}
        enc_path = save_dir.resolve() / "encoder"
        torch.save({"encoder": encoder.state_dict(), "args": args_encoder}, enc_path)
        log.info(f"To make use of this encoder:\n--enc-path {enc_path}")
    if ARGS.enc_wandb:
        log.info("Stopping here because W&B will be messed up...")
        if run is not None:
            run.finish()  # this allows multiple experiments in one python process
        # NOTE(review): bare return — yields None despite the Tuple[Model, Path]
        # annotation; TODO confirm callers handle this
        return

    cluster_label_path = get_cluster_label_path(MISC, save_dir)
    if ARGS.method == Meth.kmeans:
        kmeans_results = train_k_means(CFG, encoder, datasets.context, num_clusters, s_count, enc_path)
        pth = save_results(save_path=cluster_label_path, cluster_results=kmeans_results)
        if run is not None:
            run.finish()  # this allows multiple experiments in one python process
        # NOTE(review): returns an empty tuple in place of a Model — callers
        # of the k-means path must not use the first element; TODO confirm
        return (), pth
    if ARGS.finetune_encoder:
        encoder.freeze_initial_layers(ARGS.freeze_layers, {"lr": ARGS.finetune_lr, "weight_decay": ARGS.weight_decay})

    # ================================= labeler =================================
    # NOTE(review): no else branch — an unexpected ARGS.pseudo_labeler leaves
    # pseudo_labeler unbound and raises NameError below; presumably the enum
    # only has these two members — verify
    pseudo_labeler: PseudoLabeler
    if ARGS.pseudo_labeler == PL.ranking:
        pseudo_labeler = RankingStatistics(k_num=ARGS.k_num)
    elif ARGS.pseudo_labeler == PL.cosine:
        pseudo_labeler = CosineSimThreshold(upper_threshold=ARGS.upper_threshold, lower_threshold=ARGS.lower_threshold)

    # ================================= method =================================
    method: Method
    if ARGS.method == Meth.pl_enc:
        method = PseudoLabelEnc()
    elif ARGS.method == Meth.pl_output:
        method = PseudoLabelOutput()
    elif ARGS.method == Meth.pl_enc_no_norm:
        method = PseudoLabelEncNoNorm()

    # ================================= classifier =================================
    clf_optimizer_kwargs = {"lr": ARGS.lr, "weight_decay": ARGS.weight_decay}
    clf_fn = FcNet(hidden_dims=ARGS.cl_hidden_dims)
    clf_input_shape = (prod(enc_shape), )  # FcNet first flattens the input

    classifier = build_classifier(
        input_shape=clf_input_shape,
        # with multiple heads, each head predicts s (one head per y value)
        target_dim=s_count if ARGS.use_multi_head else num_clusters,
        model_fn=clf_fn,
        optimizer_kwargs=clf_optimizer_kwargs,
        num_heads=y_count if ARGS.use_multi_head else 1,
    )
    classifier.to(device)

    model: Union[Model, MultiHeadModel]
    if ARGS.use_multi_head:
        # the labeler is a supervised network that provides the head selection
        labeler_fn: ModelFn
        if DATA.dataset == DS.cmnist:
            labeler_fn = Mp32x23Net(batch_norm=True)
        elif DATA.dataset == DS.celeba:
            labeler_fn = Mp64x64Net(batch_norm=True)
        else:
            labeler_fn = FcNet(hidden_dims=ARGS.labeler_hidden_dims)

        labeler_optimizer_kwargs = {"lr": ARGS.labeler_lr, "weight_decay": ARGS.labeler_wd}
        labeler: Classifier = build_classifier(
            input_shape=input_shape,
            target_dim=s_count,
            model_fn=labeler_fn,
            optimizer_kwargs=labeler_optimizer_kwargs,
        )
        labeler.to(device)
        log.info("Fitting the labeler to the labeled data.")
        labeler.fit(
            train_loader,
            epochs=ARGS.labeler_epochs,
            device=device,
            use_wandb=ARGS.labeler_wandb,
        )
        labeler.eval()
        model = MultiHeadModel(
            encoder=encoder,
            classifiers=classifier,
            method=method,
            pseudo_labeler=pseudo_labeler,
            labeler=labeler,
            train_encoder=ARGS.finetune_encoder,
        )
    else:
        model = Model(
            encoder=encoder,
            classifier=classifier,
            method=method,
            pseudo_labeler=pseudo_labeler,
            train_encoder=ARGS.finetune_encoder,
        )

    start_epoch = 1  # start at 1 so that the val_freq works correctly
    # Resume from checkpoint
    if MISC.resume is not None:
        log.info("Restoring generator from checkpoint")
        model, start_epoch = restore_model(CFG, Path(MISC.resume), model)
        # evaluate-only mode: classify the context set with the restored model and stop
        if MISC.evaluate:
            pth_path = convert_and_save_results(
                CFG,
                cluster_label_path,
                classify_dataset(CFG, model, datasets.context),
                enc_path=enc_path,
                context_metrics={},  # TODO: compute this
            )
            if run is not None:
                run.finish()  # this allows multiple experiments in one python process
            return model, pth_path

    # Logging
    # wandb.set_model_graph(str(generator))
    num_parameters = count_parameters(model)
    log.info(f"Number of trainable parameters: {num_parameters}")

    # best_loss = float("inf")
    best_acc = 0.0
    n_vals_without_improvement = 0
    # super_val_freq = ARGS.super_val_freq or ARGS.val_freq

    itr = 0
    # Train generator for N epochs
    for epoch in range(start_epoch, start_epoch + ARGS.epochs):
        # early stopping only active when ARGS.early_stopping > 0
        if n_vals_without_improvement > ARGS.early_stopping > 0:
            break

        itr = train(model=model, context_data=context_loader, train_data=train_loader, epoch=epoch)

        if epoch % ARGS.val_freq == 0:
            val_acc, _, val_log = validate(model, val_loader)

            # checkpoint only on improvement of validation accuracy
            if val_acc > best_acc:
                best_acc = val_acc
                save_model(CFG, save_dir, model, epoch=epoch, sha=sha, best=True)
                n_vals_without_improvement = 0
            else:
                n_vals_without_improvement += 1

            prepare = (f"{k}: {v:.5g}" if isinstance(v, float) else f"{k}: {v}" for k, v in val_log.items())
            log.info("[VAL] Epoch {:04d} | {} | "
                     "No improvement during validation: {:02d}".format(
                         epoch,
                         " | ".join(prepare),
                         n_vals_without_improvement,
                     ))
            wandb_log(MISC, val_log, step=itr)
        # if ARGS.super_val and epoch % super_val_freq == 0:
        #     log_metrics(ARGS, model=model.bundle, data=datasets, step=itr)
        #     save_model(args, save_dir, model=model.bundle, epoch=epoch, sha=sha)

    log.info("Training has finished.")
    # path = save_model(args, save_dir, model=model, epoch=epoch, sha=sha)
    # model, _ = restore_model(args, path, model=model)
    _, test_metrics, _ = validate(model, val_loader)
    _, context_metrics, _ = validate(model, context_loader)
    log.info("Test metrics:")
    print_metrics({f"Test {k}": v for k, v in test_metrics.items()})
    log.info("Context metrics:")
    print_metrics({f"Context {k}": v for k, v in context_metrics.items()})

    pth_path = convert_and_save_results(
        CFG,
        cluster_label_path=cluster_label_path,
        results=classify_dataset(CFG, model, datasets.context),
        enc_path=enc_path,
        context_metrics=context_metrics,
        test_metrics=test_metrics,
    )
    if run is not None:
        run.finish()  # this allows multiple experiments in one python process
    return model, pth_path
def main(cfg: Config, cluster_label_file: Optional[Path] = None) -> Generator:
    """Train the distribution-matching generator end-to-end.

    Args:
        cfg: full hydra config of this run.
        cluster_label_file: path to a pth file with cluster IDs; if given, it
            overrides ``cfg.misc.cluster_label_file``.

    Returns:
        the trained generator
    """
    # ==== initialize globals ====
    global ARGS, CFG, DATA, ENC, MISC
    ARGS = cfg.fdm
    CFG = cfg
    DATA = cfg.data
    ENC = cfg.enc
    MISC = cfg.misc

    # ==== current git commit ====
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha

    use_gpu = torch.cuda.is_available() and MISC.gpu >= 0
    random_seed(MISC.seed, use_gpu)
    if cluster_label_file is not None:
        MISC.cluster_label_file = str(cluster_label_file)

    run = None
    if MISC.use_wandb:
        # non-cmnist runs get their own per-dataset W&B project
        project_suffix = f"-{DATA.dataset.name}" if DATA.dataset != DS.cmnist else ""
        # group name is assembled from method / experiment / dataset tags
        group = ""
        if MISC.log_method:
            group += MISC.log_method
        if MISC.exp_group:
            group += "." + MISC.exp_group
        if cfg.bias.log_dataset:
            group += "." + cfg.bias.log_dataset
        run = wandb.init(
            entity="anonymous",
            project="fdm-hydra" + project_suffix,
            config=flatten(OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)),
            group=group if group else None,
            reinit=True,
        )

    # timestamped save directory so repeated runs never collide
    save_dir = Path(to_absolute_path(MISC.save_dir)) / str(time.time())
    save_dir.mkdir(parents=True, exist_ok=True)

    log.info(str(OmegaConf.to_yaml(cfg, resolve=True, sort_keys=True)))
    log.info(f"Save directory: {save_dir.resolve()}")
    # ==== check GPU ====
    MISC._device = f"cuda:{MISC.gpu}" if use_gpu else "cpu"
    device = torch.device(MISC._device)
    log.info(f"{torch.cuda.device_count()} GPUs available. Using device '{device}'")

    # ==== construct dataset ====
    datasets: DatasetTriplet = load_dataset(CFG)
    log.info("Size of context-set: {}, training-set: {}, test-set: {}".format(
        len(datasets.context),
        len(datasets.train),
        len(datasets.test),
    ))
    ARGS.test_batch_size = ARGS.test_batch_size if ARGS.test_batch_size else ARGS.batch_size
    # a one-dimensional s is treated as binary
    s_count = max(datasets.s_dim, 2)

    cluster_results = None
    cluster_test_metrics: Dict[str, float] = {}
    cluster_context_metrics: Dict[str, float] = {}
    if MISC.cluster_label_file:
        # balance the context set according to previously computed cluster IDs
        cluster_results = load_results(CFG)
        cluster_test_metrics = cluster_results.test_metrics or {}
        cluster_context_metrics = cluster_results.context_metrics or {}
        weights, n_clusters, min_count, max_count = weight_for_balance(
            cluster_results.cluster_ids, min_size=None if ARGS.oversample else ARGS.batch_size)
        # if ARGS.oversample, oversample the smaller clusters instead of undersample the larger ones
        num_samples = n_clusters * max_count if ARGS.oversample else n_clusters * min_count
        assert num_samples > ARGS.batch_size, "not enough samples for a batch"
        context_sampler = WeightedRandomSampler(weights, num_samples, replacement=ARGS.oversample)
        dataloader_kwargs = dict(sampler=context_sampler)
    elif ARGS.balanced_context:
        # balance using the ground-truth labels instead of cluster IDs
        context_sampler = build_weighted_sampler_from_dataset(
            dataset=datasets.context,
            s_count=s_count,
            test_batch_size=ARGS.test_batch_size,
            batch_size=ARGS.batch_size,
            num_workers=0,  # can easily get stuck with more workers
            oversample=ARGS.oversample,
            balance_hierarchical=False,
        )
        dataloader_kwargs = dict(sampler=context_sampler, shuffle=False)
    else:
        dataloader_kwargs = dict(shuffle=True)

    context_loader = DataLoader(
        datasets.context,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
        drop_last=True,
        **dataloader_kwargs,
    )

    train_sampler = build_weighted_sampler_from_dataset(
        dataset=datasets.train,
        s_count=s_count,
        test_batch_size=ARGS.test_batch_size,
        batch_size=ARGS.batch_size,
        num_workers=0,  # can easily get stuck with more workers
        oversample=ARGS.oversample,
        balance_hierarchical=True,
    )
    train_loader = DataLoader(
        dataset=datasets.train,
        batch_size=ARGS.batch_size,
        num_workers=MISC.num_workers,
        drop_last=True,
        shuffle=False,
        sampler=train_sampler,
        pin_memory=True,
    )
    test_loader = DataLoader(
        datasets.test,
        shuffle=False,
        batch_size=ARGS.test_batch_size,
        num_workers=MISC.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    # training runs on iteration count, not epochs, so wrap the loaders
    # in infinite iterators
    context_data_itr = inf_generator(context_loader)
    train_data_itr = inf_generator(train_loader)
    # ==== construct networks ====
    input_shape = next(context_data_itr)[0][0].shape
    is_image_data = len(input_shape) > 2

    feature_group_slices = getattr(datasets.context, "feature_group_slices", None)

    if is_image_data:
        # with a cross-entropy recon loss the decoder predicts a 256-way
        # distribution per channel
        decoding_dim = input_shape[0] * 256 if ENC.recon_loss == RL.ce else input_shape[0]
        # if ARGS.recon_loss == "ce":
        decoder_out_act = None
        # else:
        #     decoder_out_act = nn.Sigmoid() if ARGS.dataset == "cmnist" else nn.Tanh()
        encoder, decoder, enc_shape = conv_autoencoder(
            input_shape,
            ENC.init_chans,
            encoding_dim=ENC.out_dim,
            decoding_dim=decoding_dim,
            levels=ENC.levels,
            decoder_out_act=decoder_out_act,
            variational=ARGS.vae,
        )
    else:
        encoder, decoder, enc_shape = fc_autoencoder(
            input_shape,
            ENC.init_chans,
            encoding_dim=ENC.out_dim,
            levels=ENC.levels,
            variational=ARGS.vae,
        )

    if ARGS.enc_snorm:
        # apply spectral norm to every weight-bearing submodule of the encoder
        def _snorm(_module: nn.Module) -> nn.Module:
            if hasattr(_module, "weight"):
                return torch.nn.utils.spectral_norm(_module)
            return _module

        encoder.apply(_snorm)

    # select the base reconstruction loss from the config
    recon_loss_fn_: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
    if ENC.recon_loss == RL.l1:
        recon_loss_fn_ = nn.L1Loss(reduction="sum")
    elif ENC.recon_loss == RL.l2:
        recon_loss_fn_ = nn.MSELoss(reduction="sum")
    elif ENC.recon_loss == RL.bce:
        recon_loss_fn_ = nn.BCELoss(reduction="sum")
    elif ENC.recon_loss == RL.huber:
        recon_loss_fn_ = lambda x, y: 0.1 * F.smooth_l1_loss(x * 10, y * 10, reduction="sum")
    elif ENC.recon_loss == RL.ce:
        recon_loss_fn_ = PixelCrossEntropy(reduction="sum")
    elif ENC.recon_loss == RL.mixed:
        assert feature_group_slices is not None, "can only do multi gen_loss with feature groups"
        recon_loss_fn_ = MixedLoss(feature_group_slices, reduction="sum")
    else:
        raise ValueError(f"{ENC.recon_loss} is an invalid reconstruction gen_loss")

    # optionally augment the reconstruction loss with a VGG perceptual term
    recon_loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
    if ARGS.vgg_weight != 0:
        vgg_loss = VGGLoss()
        vgg_loss.to(device)

        def recon_loss_fn(input_: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
            return recon_loss_fn_(input_, target) + ARGS.vgg_weight * vgg_loss(input_, target)
    else:
        recon_loss_fn = recon_loss_fn_

    generator: Generator
    if ARGS.use_inn:
        autoencoder = build_ae(CFG, encoder, decoder, encoding_size=None, feature_group_slices=feature_group_slices)
        # a flat (1-d) encoding cannot be processed by image-specific INN layers
        if prod(enc_shape) == enc_shape[0]:
            is_enc_image_data = False
            log.info("Encoding will not be treated as image data.")
        else:
            is_enc_image_data = is_image_data
        generator = build_inn(
            cfg=CFG,
            autoencoder=autoencoder,
            ae_loss_fn=recon_loss_fn,
            is_image_data=is_enc_image_data,
            save_dir=save_dir,
            ae_enc_shape=enc_shape,
            context_loader=context_loader,
        )
        encoding_size = generator.encoding_size
    else:
        # split the encoding into an s-related part (zs) and the rest (zy)
        zs_dim = round(ARGS.zs_frac * enc_shape[0])
        zy_dim = enc_shape[0] - zs_dim
        encoding_size = EncodingSize(zs=zs_dim, zy=zy_dim)
        generator = build_ae(
            cfg=CFG,
            encoder=encoder,
            decoder=decoder,
            encoding_size=encoding_size,
            feature_group_slices=feature_group_slices,
        )
        # load pretrained encoder if one is provided
        if ARGS.use_pretrained_enc and cluster_results is not None:
            save_dict = torch.load(cluster_results.enc_path, map_location=lambda storage, loc: storage)
            generator.load_state_dict(save_dict["encoder"])
            if "args" in save_dict:
                args_encoder = save_dict["args"]
                # NOTE(review): precedence — this parses as
                # (x == "vae") if ARGS.vae else "ae", so the non-VAE branch
                # asserts a truthy string and always passes; likely missing
                # parentheses around the conditional — verify intent
                assert args_encoder["encoder_type"] == "vae" if ARGS.vae else "ae"
                assert args_encoder["levels"] == ENC.levels

    log.info(f"Encoding shape: {enc_shape}, {encoding_size}")

    # ================================== Initialise Discriminator =================================

    disc_optimizer_kwargs = {"lr": ARGS.disc_lr}
    # discriminate on reconstructions (input space) or directly on encodings
    disc_input_shape: Tuple[int, ...] = input_shape if ARGS.train_on_recon else enc_shape
    # FIXME: Architectures need to be GAN specific (e.g. incorporate spectral norm)
    disc_fn: ModelFn
    if is_image_data and ARGS.train_on_recon:
        if DATA.dataset == DS.cmnist:
            disc_fn = Strided28x28Net(batch_norm=False)
        else:
            disc_fn = Residual64x64Net(batch_norm=False)
    else:
        disc_fn = FcNet(hidden_dims=ARGS.disc_hidden_dims)
        # FcNet first flattens the input
        disc_input_shape = ((prod(disc_input_shape), ) if isinstance(disc_input_shape, Sequence) else disc_input_shape)

    if ARGS.batch_wise_loss != BWLoss.none:
        # wrap the discriminator so it aggregates over the whole batch
        final_proj = FcNet(ARGS.batch_wise_hidden_dims) if ARGS.batch_wise_hidden_dims else None
        aggregator: Aggregator
        if ARGS.batch_wise_loss == BWLoss.attention:
            aggregator = AttentionAggregator(ARGS.batch_wise_latent, final_proj=final_proj)
        elif ARGS.batch_wise_loss == BWLoss.simple:
            aggregator = SimpleAggregator(latent_dim=ARGS.batch_wise_latent, final_proj=final_proj)
        elif ARGS.batch_wise_loss == BWLoss.transposed:
            aggregator = SimpleAggregatorT(batch_dim=ARGS.batch_size, final_proj=final_proj)
        disc_fn = ModelAggregatorWrapper(disc_fn, aggregator, embed_dim=ARGS.batch_wise_latent)

    components: Union[AeComponents, InnComponents]
    disc: Classifier
    if not ARGS.use_inn:
        # --- autoencoder pathway: adversarial ensemble + y/s predictors ---
        disc_list = []
        for k in range(ARGS.num_discs):
            disc = build_discriminator(
                input_shape=disc_input_shape,
                target_dim=1,  # real vs fake
                model_fn=disc_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            disc_list.append(disc)
        disc_ensemble = nn.ModuleList(disc_list)
        disc_ensemble.to(device)

        predictor_y = build_discriminator(
            input_shape=(prod(enc_shape), ),  # this is always trained on encodings
            target_dim=datasets.y_dim,
            model_fn=FcNet(hidden_dims=None),  # no hidden layers
            optimizer_kwargs=disc_optimizer_kwargs,
        )
        predictor_y.to(device)

        predictor_s = build_discriminator(
            input_shape=(prod(enc_shape), ),  # this is always trained on encodings
            target_dim=datasets.s_dim,
            model_fn=FcNet(hidden_dims=None),  # no hidden layers
            optimizer_kwargs=disc_optimizer_kwargs,
        )
        predictor_s.to(device)

        components = AeComponents(
            generator=generator,
            disc_ensemble=disc_ensemble,
            recon_loss_fn=recon_loss_fn,
            predictor_y=predictor_y,
            predictor_s=predictor_s,
        )
    else:
        # --- INN pathway: adversarial ensemble + optional y classifier ---
        disc_list = []
        for k in range(ARGS.num_discs):
            disc = build_discriminator(
                input_shape=disc_input_shape,
                target_dim=1,  # real vs fake
                model_fn=disc_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            disc_list.append(disc)
        disc_ensemble = nn.ModuleList(disc_list)
        disc_ensemble.to(device)

        # classifier for y
        class_fn: ModelFn
        if is_image_data:
            if DATA.dataset == DS.cmnist:
                class_fn = Strided28x28Net(batch_norm=False)
            else:
                class_fn = Residual64x64Net(batch_norm=False)
        else:
            class_fn = FcNet(hidden_dims=ARGS.disc_hidden_dims)
        predictor = None
        if ARGS.train_on_recon and ARGS.pred_y_weight > 0:
            predictor = build_discriminator(
                input_shape=input_shape,
                target_dim=datasets.y_dim,  # real vs fake
                model_fn=class_fn,
                optimizer_kwargs=disc_optimizer_kwargs,
            )
            predictor.to(device)
            # pre-fit the y predictor on a small context subset
            predictor.fit(Subset(datasets.context, np.arange(100)), 50, device, test_loader)
        components = InnComponents(inn=generator, disc_ensemble=disc_ensemble, predictor=predictor)

    start_itr = 1  # start at 1 so that the val_freq works correctly
    # Resume from checkpoint
    if MISC.resume is not None:
        log.info("Restoring generator from checkpoint")
        generator, start_itr = restore_model(CFG, Path(MISC.resume), generator)
        # evaluate-only mode: log metrics for the restored model and stop
        if MISC.evaluate:
            log_metrics(
                CFG,
                generator,
                datasets,
                0,
                save_to_csv=Path(to_absolute_path(MISC.save_dir)),
                cluster_test_metrics=cluster_test_metrics,
                cluster_context_metrics=cluster_context_metrics,
            )
            if run is not None:
                run.finish()  # this allows multiple experiments in one python process
            return generator

    # Logging
    log.info(f"Number of trainable parameters: {count_parameters(generator)}")
    itr = start_itr
    disc: nn.Module
    loss_meters: Optional[Dict[str, AverageMeter]] = None
    start_time = time.monotonic()

    for itr in range(start_itr, ARGS.iters + 1):
        logging_dict = train_step(
            components=components,
            context_data_itr=context_data_itr,
            train_data_itr=train_data_itr,
            itr=itr,
        )
        # lazily (re)create the meters; they are reset after every log line
        if loss_meters is None:
            loss_meters = {name: AverageMeter() for name in logging_dict}
        for name, value in logging_dict.items():
            loss_meters[name].update(value)

        if itr % ARGS.log_freq == 0:
            assert loss_meters is not None
            log_string = " | ".join(f"{name}: {loss.avg:.5g}" for name, loss in loss_meters.items())
            elapsed = time.monotonic() - start_time
            log.info(
                "[TRN] Iteration {:04d} | Elapsed: {} | Iterations/s: {:.4g} | {}".format(
                    itr,
                    readable_duration(elapsed),
                    ARGS.log_freq / elapsed,
                    log_string,
                ))
            loss_meters = None
            start_time = time.monotonic()

        if ARGS.validate and itr % ARGS.val_freq == 0:
            if itr == ARGS.val_freq:  # first validation
                baseline_metrics(CFG, datasets, save_to_csv=Path(to_absolute_path(MISC.save_dir)))
            log_metrics(CFG, model=generator, data=datasets, step=itr)
            save_model(CFG, save_dir, model=generator, itr=itr, sha=sha)

        # occasionally reinitialize discriminators to keep them from overfitting
        if ARGS.disc_reset_prob > 0:
            for k, discriminator in enumerate(components.disc_ensemble):
                if np.random.uniform() < ARGS.disc_reset_prob:
                    log.info(f"Reinitializing discriminator {k}")
                    discriminator.reset_parameters()

    log.info("Training has finished.")
    # path = save_model(args, save_dir, model=generator, epoch=epoch, sha=sha)
    # generator, _ = restore_model(args, path, model=generator)
    log_metrics(
        CFG,
        model=generator,
        data=datasets,
        save_to_csv=Path(to_absolute_path(MISC.save_dir)),
        step=itr,
        cluster_test_metrics=cluster_test_metrics,
        cluster_context_metrics=cluster_context_metrics,
    )
    if run is not None:
        run.finish()  # this allows multiple experiments in one python process
    return generator
def users(self):
    """Return the titles of every entry whose login is not suspended."""
    all_entries = utils.flatten(
        [account.entry for account in self.apps_client.GetGeneratorForAllUsers()]
    )
    # the API reports suspension status as the string 'false'/'true'
    return [
        entry.title.text
        for entry in all_entries
        if entry.login.suspended == 'false'
    ]