def predict_with_models(
    predictions_path: pathlib.Path,
    y: pd.DataFrame,
    X_test: pd.DataFrame,
    models_configs: typing.Sequence[typing.Dict],
    datasets: typing.Sequence,
    output_dim: int,
    seed: int,
):
    """Train one skorch classifier per (model config, dataset fold) pair and
    persist each run's history.

    For every model config and its paired dataset generator, each yielded
    (X_train, X_test) fold trains a fresh net for up to 40 epochs with early
    stopping and LR reduction on validation accuracy; test-set predictions are
    captured per epoch by the ``PredictTest`` callback and saved via
    ``_save_predictions``.

    Args:
        predictions_path: Directory/file root passed to ``_save_predictions``.
        y: Training labels; ``y.values`` is fed to ``net.fit``.
        X_test: NOTE(review): this parameter is immediately shadowed by the
            fold's ``X_test`` from ``dataset_generator`` and is never used —
            confirm whether it should be removed or actually consumed.
        models_configs: One config dict per model, zipped with ``datasets``.
        datasets: Iterables yielding (X_train, X_test) DataFrame folds.
        output_dim: Number of output classes, forwarded to ``_create_model``.
        seed: Seed applied once via ``_set_seed`` before all training.
    """
    _set_seed(seed)
    for model_config, dataset_generator in zip(models_configs, datasets):
        for i, (X_train, X_test) in enumerate(dataset_generator):
            # Model input size is the number of training columns.
            name, model = _create_model(len(list(X_train)), model_config, output_dim)
            name = f"{i}_{name}"  # prefix with fold index to keep runs distinct
            net = skorch.NeuralNet(
                model,
                criterion=torch.nn.CrossEntropyLoss,
                max_epochs=40,
                optimizer=torch.optim.Adam,
                device="cuda",
                batch_size=64,
                # Stratified 20% validation split used by the callbacks below.
                train_split=skorch.dataset.CVSplit(0.2, stratified=True),
                iterator_train__shuffle=True,
                callbacks=[
                    # Accuracy from raw logits: argmax over the last axis.
                    skorch.callbacks.EpochScoring(
                        make_scorer(lambda y_true, y_pred: np.mean(
                            y_true == np.argmax(y_pred, axis=-1))),
                        name="validation_accuracy",
                        lower_is_better=False,
                    ),
                    skorch.callbacks.EarlyStopping(
                        monitor="validation_accuracy",
                        lower_is_better=False,
                        patience=8),
                    # Records test predictions each epoch (project callback).
                    ("PredictTest", PredictTest(X_test, monitor="validation_accuracy")),
                    skorch.callbacks.LRScheduler(
                        policy=torch.optim.lr_scheduler.ReduceLROnPlateau,
                        monitor="validation_accuracy",
                        mode="max",
                        factor=0.6,
                        patience=2,
                        verbose=True,
                    ),
                ],
            )
            net.fit(X_train.values.astype(np.float32), y.values)
            _save_predictions(predictions_path, name, net.history)
def _make_classifier(self, model):
    """Build the skorch net wrapping ``CnnLSTMAttention`` for *model*.

    The embedding dimensionality comes from ``model.vector_size()``; all
    other hyper-parameters are read from this instance's configuration.
    """
    net_kwargs = dict(
        module=CnnLSTMAttention,
        module__embeddings_size=model.vector_size(),
        module__out_classes=self.out_classes * 2,
        criterion=self.criterion,
        optimizer=torch.optim.Adam,
        lr=self.lr,
        max_epochs=self.num_epochs,
        use_cuda=self.use_cuda,
        gradient_clip_value=self.clip_grad_norm,
        verbose=self.verbose,
        batch_size=self.batch_size,
        # Dataset converts raw samples to embedded sequences of bounded length.
        dataset=EmbeddingsSeqDataset,
        dataset__model=model,
        dataset__max_len=self.max_seq_len,
    )
    return skorch.NeuralNet(**net_kwargs)
def train_fuzzy(model, X, y, show_plots=True):
    """Fit *model* on (X, y) with SGD + MSE via skorch.

    Membership functions are plotted before and after training when
    *show_plots* is true, so the effect of fitting is visible.
    """
    inputs = torch.tensor(X, dtype=torch.float)
    targets = torch.tensor(y, dtype=torch.float)
    trainer = skorch.NeuralNet(
        model,
        criterion=torch.nn.MSELoss,
        optimizer=torch.optim.SGD,
        optimizer__lr=1e-6,
        optimizer__momentum=0.99,
        max_epochs=50,
        callbacks=[FittingCallback()],
    )
    if show_plots:
        experimental.plot_all_mfs(model, inputs)
    trainer.fit(inputs, targets)
    if show_plots:
        experimental.plot_all_mfs(model, inputs)
def build_model():
    """Return a pipeline of the project's preprocessor followed by a
    skorch-wrapped MLP classifier (GPU if available, else CPU)."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    classifier = skorch.NeuralNet(
        module=MLPModule,
        criterion=torch.nn.BCEWithLogitsLoss,
        optimizer=torch.optim.Adam,
        max_epochs=5,
        batch_size=128,
        device=device,
        # No internal validation split; training uses all provided data.
        train_split=None,
        callbacks=[DynamicVariablesSetter()],
    )
    return make_pipeline(build_preprocessor(), classifier)
def test_vignette(show_plots=True):
    """Train the vignette example-3 ANFIS model on the large sinc dataset
    and, optionally, plot membership functions and predictions."""
    anfis_model = vignette_examples.vignette_ex3()
    X, y = jang_examples.make_sinc_xy_large().dataset.tensors
    trainer = skorch.NeuralNet(
        anfis_model,
        criterion=torch.nn.MSELoss,
        optimizer=torch.optim.SGD,
        optimizer__lr=1e-4,
        optimizer__momentum=0.99,
        max_epochs=50,
        callbacks=[FittingCallback()],
        # Previously experimented-with settings, kept for reference:
        # train_split=None, window_size=1024, criterion__reduction='sum'
    )
    trainer.fit(X, y)
    if show_plots:
        experimental.plot_all_mfs(anfis_model, X)
        predictions = anfis_model(X)
        experimental.plot_results(y, predictions)
def test_jang(show_plots=True):
    """Train Jang's example-1 ANFIS model on the sinc dataset and,
    optionally, plot membership functions and predictions."""
    anfis_model = jang_examples.ex1_model()
    X, y = jang_examples.make_sinc_xy().dataset.tensors
    trainer = skorch.NeuralNet(
        anfis_model,
        criterion=torch.nn.MSELoss,
        optimizer=torch.optim.SGD,
        optimizer__lr=1e-4,
        optimizer__momentum=0.99,
        max_epochs=100,
        # Use all data for training; no validation split.
        train_split=None,
        callbacks=[FittingCallback()],
        # criterion__reduction='sum'  # kept from earlier experiments
    )
    trainer.fit(X, y)
    if show_plots:
        experimental.plot_all_mfs(anfis_model, X)
        predictions = anfis_model(X)
        experimental.plot_results(y, predictions)
def load_protein_accessory_model(dirname: str) -> skorch.NeuralNet:
    """Loads the protein accessory model.

    Reconstructs the decoder skorch net from the hyper-parameters stored in
    ``params.json`` and the protein list in ``protein_proteins.txt`` inside
    *dirname*, then restores trained weights from the ``net_``-prefixed
    checkpoint files in the same directory.

    Args:
        dirname: Directory containing ``protein_proteins.txt``,
            ``params.json``, and the ``net_*`` checkpoint files.

    Returns:
        A CPU-bound ``skorch.NeuralNet`` with loaded parameters.
    """
    predicted_proteins = utils.read_delimited_file(
        os.path.join(dirname, "protein_proteins.txt"))
    with open(os.path.join(dirname, "params.json")) as source:
        model_params = json.load(source)
    # The architecture/training kwargs must mirror those used at training
    # time so that load_params finds matching parameter shapes.
    encoded_to_protein_skorch = skorch.NeuralNet(
        module=autoencoders.Decoder,
        module__num_units=16,  # size of the encoded input layer
        module__intermediate_dim=model_params["interdim"],
        module__num_outputs=len(predicted_proteins),
        module__final_activation=nn.Identity(),
        module__activation=ACT_DICT[model_params["act"]],
        # module__final_activation=nn.Linear(
        #     len(predicted_proteins), len(predicted_proteins), bias=True
        # ),  # Paper uses identity activation instead
        lr=model_params["lr"],
        criterion=LOSS_DICT[model_params["loss"]],  # Other works use L1 loss
        optimizer=OPTIM_DICT[model_params["optim"]],
        batch_size=model_params["bs"],
        max_epochs=500,
        callbacks=[
            skorch.callbacks.EarlyStopping(patience=25),
            skorch.callbacks.LRScheduler(
                policy=torch.optim.lr_scheduler.ReduceLROnPlateau,
                **model_utils.REDUCE_LR_ON_PLATEAU_PARAMS,
            ),
            skorch.callbacks.GradientNormClipping(gradient_clip_value=5),
        ],
        iterator_train__num_workers=8,
        iterator_valid__num_workers=8,
        device="cpu",
    )
    # Load trained weights from the checkpoint written during training.
    encoded_to_protein_skorch_cp = skorch.callbacks.Checkpoint(
        dirname=dirname, fn_prefix="net_")
    encoded_to_protein_skorch.load_params(
        checkpoint=encoded_to_protein_skorch_cp)
    return encoded_to_protein_skorch
def build_model(device=torch.device("cpu")):
    """Construct a skorch net around the VAE module.

    Uses the project's ``DataIterator`` for both train (shuffled) and
    validation (unshuffled) phases, an ELBO criterion, and callbacks that
    wire input shapes and per-epoch visualization.
    """
    callbacks = [
        ShapeSetter(),
        skorch.callbacks.EpochScoring(epoch_vis, on_train=True),
    ]
    return skorch.NeuralNet(
        module=VAE,
        module__image_shape=(2, 10, 10),
        module__hid_size=512,
        module__latent_size=2,
        criterion=ELBO,
        optimizer=torch.optim.Adam,
        optimizer__lr=0.0001,
        max_epochs=10,
        batch_size=128,
        iterator_train=DataIterator,
        iterator_train__shuffle=True,
        iterator_valid=DataIterator,
        iterator_valid__shuffle=False,
        # Worker counts left at default; enable if loading becomes a bottleneck:
        # iterator_train__num_workers=2, iterator_valid__num_workers=2
        device=device,
        callbacks=callbacks,
    )
def build_model(device=torch.device("cpu"), regularized=False):
    """Construct a skorch net around the AutoEncoder module.

    Args:
        device: Torch device the net trains on.
        regularized: Forwarded to the MSE criterion to toggle its
            regularization term.
    """
    return skorch.NeuralNet(
        module=AutoEncoder,
        module__image_shape=(2, 10, 10),
        module__hid_size=512,
        module__latent_size=2,
        criterion=MSE,
        criterion__regularized=regularized,
        optimizer=torch.optim.Adam,
        optimizer__lr=0.0001,
        max_epochs=20,
        batch_size=512,
        iterator_train=DataIterator,
        iterator_train__shuffle=True,
        iterator_valid=DataIterator,
        iterator_valid__shuffle=False,
        # Worker counts left at default; enable if loading becomes a bottleneck:
        # iterator_train__num_workers=2, iterator_valid__num_workers=2
        device=device,
        callbacks=[ShapeSetter()],
    )
def main(
    params_path: Path = "training/params.yml",
    viz: bool = False,
    toy: bool = False,
    model: str = "torch",
) -> None:
    """Train a contrastive embedding net on the toy spirals dataset and
    visualize the learned 1-D embedding over the input plane.

    Args:
        params_path: YAML hyper-parameter file loaded with ``dicto``.
        viz: When true, also visualize the training dataset pairs.
        toy: Unused in this function body — TODO confirm intent.
        model: NOTE(review): this parameter is shadowed below by the
            ``ContrastiveNet`` instance and never read — confirm removal.
    """
    torch.autograd.set_detect_anomaly(True)
    params = dicto.load(params_path)
    df_train, df_test = dataget.toy.spirals().get()
    X_train = df_train[["x0", "x1"]].to_numpy()
    y_train = df_train["y"].to_numpy()
    X_test = df_test[["x0", "x1"]].to_numpy()
    y_test = df_test["y"].to_numpy()
    # Scale features to [0, 1] using train statistics only.
    transform = MinMaxScaler()
    X_train = transform.fit_transform(X_train)
    X_test = transform.transform(X_test)
    ds_train = ContrastiveDataset(
        X_train,
        y_train,
        batch_size=params.batch_size,
        steps_per_epoch=params.steps_per_epoch,
        noise_std=params.noise_std,
        n_neighbors=params.n_neighbors,
        n_hops=params.n_hops,
        transform=torch.tensor,
        viz=viz,
    )
    ds_test = ContrastiveDataset(
        X_test,
        y_test,
        batch_size=32,
        steps_per_epoch=1,
        noise_std=params.noise_std,
        n_neighbors=params.n_neighbors,
        n_hops=params.n_hops,
        transform=torch.tensor,
        viz=False,
    )
    if viz:
        visualize(ds_train)
    # pytorch
    # Batch size doubles because contrastive pairs are concatenated.
    model = ContrastiveNet(
        batch_size=params.batch_size * 2,
        n_layers=params.n_layers,
        n_units=params.n_units,
        embedding_size=params.embedding_size,
    )
    net = skorch.NeuralNet(
        model,
        criterion=criterion,
        # batch_size=None: the dataset yields pre-batched samples.
        batch_size=None,
        max_epochs=params.epochs,
        lr=params.lr,
        optimizer=torch.optim.Adam,
        # train_split=lambda X, y: (X, ds_test),
        train_split=None,
        device="cuda",
    )
    net.fit(ds_train, y=None)
    # Extract embeddings in eval mode, project to 1-D, and color-plot them.
    net.module.eval()
    h = (net.module(
        torch.tensor(X_train, dtype=torch.float32, device="cuda"),
        return_embeddings=True,
    ).cpu().detach().numpy())
    h = PCA(1).fit_transform(h)
    px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=h[:, 0]).show()
def main(): """Train a protein predictor""" parser = build_parser() args = parser.parse_args() # Create output directory if not os.path.isdir(args.outdir): os.makedirs(args.outdir) # Specify output log file logger = logging.getLogger() fh = logging.FileHandler(os.path.join(args.outdir, "training.log")) fh.setLevel(logging.INFO) logger.addHandler(fh) # Log parameters for arg in vars(args): logging.info(f"Parameter {arg}: {getattr(args, arg)}") with open(os.path.join(args.outdir, "params.json"), "w") as sink: json.dump(vars(args), sink, indent=4) # Load the model pretrained_net = model_utils.load_model(args.encoder, device=args.device) # Load in some files rna_genes = utils.read_delimited_file( os.path.join(args.encoder, "rna_genes.txt")) atac_bins = utils.read_delimited_file( os.path.join(args.encoder, "atac_bins.txt")) # Read in the RNA rna_data_kwargs = copy.copy(sc_data_loaders.TENX_PBMC_RNA_DATA_KWARGS) rna_data_kwargs["cluster_res"] = args.clusterres rna_data_kwargs["fname"] = args.rnaCounts rna_data_kwargs["reader"] = lambda x: load_rna_files( x, args.encoder, transpose=not args.notrans) # Construct data folds full_sc_rna_dataset = sc_data_loaders.SingleCellDataset( valid_cluster_id=args.validcluster, test_cluster_id=args.testcluster, **rna_data_kwargs, ) full_sc_rna_dataset.data_raw.write_h5ad( os.path.join(args.outdir, "full_rna.h5ad")) train_valid_test_dsets = [] for mode in ["all", "train", "valid", "test"]: logging.info(f"Constructing {mode} dataset") sc_rna_dataset = sc_data_loaders.SingleCellDatasetSplit( full_sc_rna_dataset, split=mode) sc_rna_dataset.data_raw.write_h5ad( os.path.join(args.outdir, f"{mode}_rna.h5ad")) # Write RNA input sc_atac_dummy_dataset = sc_data_loaders.DummyDataset( shape=len(atac_bins), length=len(sc_rna_dataset)) # RNA and fake ATAC sc_dual_dataset = sc_data_loaders.PairedDataset( sc_rna_dataset, sc_atac_dummy_dataset, flat_mode=True, ) # encoded(RNA) as "x" and RNA + fake ATAC as "y" sc_rna_encoded_dataset = 
sc_data_loaders.EncodedDataset( sc_dual_dataset, model=pretrained_net, input_mode="RNA") sc_rna_encoded_dataset.encoded.write_h5ad( os.path.join(args.outdir, f"{mode}_encoded.h5ad")) sc_protein_dataset = sc_data_loaders.SingleCellProteinDataset( args.proteinCounts, obs_names=sc_rna_dataset.obs_names, transpose=not args.notrans, ) sc_protein_dataset.data_raw.write_h5ad( os.path.join(args.outdir, f"{mode}_protein.h5ad")) # Write protein # x = 16 dimensional encoded layer, y = 25 dimensional protein array sc_rna_protein_dataset = sc_data_loaders.SplicedDataset( sc_rna_encoded_dataset, sc_protein_dataset) _temp = sc_rna_protein_dataset[0] # ensure calling works train_valid_test_dsets.append(sc_rna_protein_dataset) # Unpack and do sanity checks _, sc_rna_prot_train, sc_rna_prot_valid, sc_rna_prot_test = train_valid_test_dsets x, y, z = sc_rna_prot_train[0], sc_rna_prot_valid[0], sc_rna_prot_test[0] assert (x[0].shape == y[0].shape == z[0].shape ), f"Got mismatched shapes: {x[0].shape} {y[0].shape} {z[0].shape}" assert (x[1].shape == y[1].shape == z[1].shape ), f"Got mismatched shapes: {x[1].shape} {y[1].shape} {z[1].shape}" protein_markers = list(sc_protein_dataset.data_raw.var_names) with open(os.path.join(args.outdir, "protein_proteins.txt"), "w") as sink: sink.write("\n".join(protein_markers) + "\n") assert len( utils.read_delimited_file( os.path.join(args.outdir, "protein_proteins.txt"))) == len(protein_markers) logging.info(f"Predicting on {len(protein_markers)} proteins") if args.preprocessonly: return protein_decoder_skorch = skorch.NeuralNet( module=autoencoders.Decoder, module__num_units=16, module__intermediate_dim=args.interdim, module__num_outputs=len(protein_markers), module__activation=ACT_DICT[args.act], module__final_activation=nn.Identity(), # module__final_activation=nn.Linear( # len(protein_markers), len(protein_markers), bias=True # ), # Paper uses identity activation instead lr=args.lr, criterion=LOSS_DICT[args.loss], # Other works use L1 loss 
optimizer=OPTIM_DICT[args.optim], batch_size=args.bs, max_epochs=args.epochs, callbacks=[ skorch.callbacks.EarlyStopping(patience=15), skorch.callbacks.LRScheduler( policy=torch.optim.lr_scheduler.ReduceLROnPlateau, patience=5, factor=0.1, min_lr=1e-6, # **model_utils.REDUCE_LR_ON_PLATEAU_PARAMS, ), skorch.callbacks.GradientNormClipping(gradient_clip_value=5), skorch.callbacks.Checkpoint( dirname=args.outdir, fn_prefix="net_", monitor="valid_loss_best", ), ], train_split=skorch.helper.predefined_split(sc_rna_prot_valid), iterator_train__num_workers=8, iterator_valid__num_workers=8, device=utils.get_device(args.device), ) protein_decoder_skorch.fit(sc_rna_prot_train, y=None) # Plot the loss history fig = plot_loss_history(protein_decoder_skorch.history, os.path.join(args.outdir, "loss.pdf"))
# # y = torch.cat((y, Y[i]), 0) # # xfin = x.unsqueeze(0) # # yfin = y.unsqueeze(0) # net.fit(X=X, y=Y) n_classes = 12 network = SegNet(in_channels=3, n_classes=n_classes) network.init_encoder() network.cuda() net = skorch.NeuralNet( module=network, criterion=torch.nn.CrossEntropyLoss, train_split=None, use_cuda=True, batch_size=10, ) params = {'lr': [0.01, 0.02], 'max_epochs': [5, 10]} # if only training # net.fit(X=X, y=y) image_indicators = np.hstack([np.repeat(i, len(x)) for i, x in enumerate(X)]) labels = image_indicators % n_classes # X, y = np.vstack(X), np.hstack(Y) cv = LeavePLabelOut(labels=labels, p=1)
def main(
    params_path: Path = Path("training/params.yml"),
    cache: bool = False,
    viz: bool = False,
    debug: bool = False,
    toy: bool = False,
):
    """Prepare the cat-in-the-dat-ii data, build a model, and (currently)
    stop before training.

    Loads/caches the transformed train/test frames, constructs datasets and
    a model from ``estimator``, then calls ``exit()`` — everything after
    that call is unreachable work-in-progress.

    Args:
        params_path: YAML hyper-parameter file loaded with ``dicto``.
        cache: Reuse cached CSVs/transformer when present.
        viz: Unused here — TODO confirm (``vizualize`` runs unconditionally
            in the dead tail).
        debug: Block until a debugpy client attaches on localhost:5678.
        toy: Subsample train/test to 1000 rows each for quick runs.
    """
    if debug:
        import debugpy
        print("Waiting debuger....")
        debugpy.listen(("localhost", 5678))
        debugpy.wait_for_client()
    params = dicto.load(params_path)
    train_cache = Path("cache/train.csv")
    test_cache = Path("cache/test.csv")
    transformer_cache = Path("cache/transformer.pkl")
    if cache and train_cache.exists():
        # Fast path: reuse previously transformed data + fitted transformer.
        df_train = pd.read_csv(train_cache)
        df_test = pd.read_csv(test_cache)
        transformer = pickle.load(transformer_cache.open("rb"))
    else:
        df, df_real = dataget.kaggle(competition="cat-in-the-dat-ii").get(
            files=["train.csv", "test.csv"])
        df.drop(columns=["id"], inplace=True)
        df_train, df_test = estimator.split(df, params)
        if toy:
            df_train = df_train.sample(n=1000)
            df_test = df_test.sample(n=1000)
        transformer = GenericTransformer(
            categorical=params.categorical,
            numerical=params.numerical,
        )
        # Fit on train only; apply the same transform to test.
        df_train = transformer.fit_transform(df_train)
        df_test = transformer.transform(df_test)
        df_train.to_csv(train_cache, index=False)
        df_test.to_csv(test_cache, index=False)
        pickle.dump(transformer, transformer_cache.open("wb"))
    print(df_train)
    print(df_test)
    ds_train = estimator.get_dataset(df_train, params, "train")
    ds_test = estimator.get_dataset(df_test, params, "test")
    print(ds_train[:10])
    print(ds_test[:10])
    model = estimator.get_model(params,
                                n_categories=transformer.n_categories,
                                numerical=[])
    print(model)
    # NOTE(review): deliberate early stop — the remainder of this function
    # (Keras-style fit/save plus an unused skorch net) is dead code.
    exit()
    net = skorch.NeuralNet(model, )
    model.summary()
    print(ds_train)
    model.fit(
        ds_train,
        epochs=params.epochs,
        steps_per_epoch=params.steps_per_epoch,
        validation_data=ds_test,
        callbacks=[
            tf.keras.callbacks.TensorBoard(log_dir=str(
                Path("summaries") / Path(model.name)),
                                           profile_batch=0)
        ],
    )
    # Export to saved model
    save_path = f"models/{model.name}"
    model.save(save_path)
    print(f"{save_path=}")
    vizualize(df_train, df_test, model)