def test_wandb_logger(wandb):
    """Verify that basic functionality of wandb logger works.

    Wandb doesn't work well with pytest so we have to mock it out here.
    """
    logger = WandbLogger(anonymous=True, offline=True)

    logger.log_metrics({'acc': 1.0})
    wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None)

    wandb.init().log.reset_mock()
    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init().log.assert_called_once_with({'acc': 1.0}, step=3)

    # continue training on same W&B run
    wandb.init().step = 3
    logger.finalize('success')
    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init().log.assert_called_with({'acc': 1.0}, step=6)

    logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]})
    wandb.init().config.update.assert_called_once_with(
        {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
        allow_val_change=True,
    )

    logger.watch('model', 'log', 10)
    wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)

    assert logger.name == wandb.init().project_name()
    assert logger.version == wandb.init().id
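# How tests like the one above receive the `wandb` argument is not shown in
# this section; a minimal sketch of the wiring, assuming the logger module
# references the client as `pytorch_lightning.loggers.wandb.wandb`, is to
# patch that import so pytest hands the test a MagicMock:
from unittest import mock

@mock.patch('pytorch_lightning.loggers.wandb.wandb')
def test_wandb_logger(wandb):
    ...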
def main(cfg: DictConfig = None):
    log.info("==> Training Configs:\n%s", OmegaConf.to_yaml(cfg))

    width, _, img_size, dropout_p, _, _ = compound_params(cfg.name)

    transforms = T.Compose(
        [
            T.Resize(size=(img_size, img_size)),
            T.ToTensor(),
            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ]
    )

    if cfg.pretrained:
        network = EfficientNet(
            name=cfg.name,
            num_classes=cfg.num_classes,
        ).from_pretrained(name=cfg.name)
        # freeze the pretrained backbone and replace the classifier head
        for params in network.parameters():
            params.requires_grad = False
        final_out_channels = round_filters(1280, 8, width)
        network.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(1),
            nn.Dropout(dropout_p),
            nn.Linear(final_out_channels, cfg.num_classes),
        )
    else:
        network = EfficientNet(name=cfg.name, num_classes=cfg.num_classes)

    gym = EfficientNetGym(network, cfg)
    dm = instantiate(
        cfg.dm,
        **{"train_transforms_conf": transforms, "test_transforms_conf": transforms},
    )

    with open(f"{cfg.name}.md", "w") as f:
        f.write(f"## {cfg.name}\n```py\n")
        f.write(str(network))
        f.write("\n```")
    with open(f"{cfg.name}-summary.md", "w") as f:
        f.write(f"## {cfg.name}-summary\n```py\n")
        f.write(str(ModelSummary(gym, "full")))
        f.write("\n```")

    if cfg.logger:
        logger_ = WandbLogger(
            name=f"{cfg.optim}",
            project=cfg.name,
        )
        logger_.watch(network, "all")
    else:
        logger_ = True

    ckpt = ModelCheckpoint("ckpt/{epoch}", prefix="-" + cfg.name) if cfg.ckpt else False
    trainer = Trainer(**cfg.pl, logger=logger_, checkpoint_callback=ckpt)
    trainer.fit(gym, datamodule=dm)
    if cfg.test:
        trainer.test(datamodule=dm)
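# `round_filters` is not defined in this section; below is a minimal sketch of
# the standard EfficientNet channel-rounding rule it presumably implements,
# matching the call round_filters(1280, 8, width) above (base filters, divisor,
# width multiplier). This is an assumption, not the project's actual code:
def round_filters(filters, divisor, width_mult):
    filters *= width_mult
    new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
    # never round down by more than 10% of the scaled value
    if new_filters < 0.9 * filters:
        new_filters += divisor
    return int(new_filters)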
def main(config):
    # ------------------------
    # 1 LIGHTNING MODEL
    # ------------------------
    model = SegModel(config)

    # ------------------------
    # 2 DATA PIPELINES
    # ------------------------
    kittiData = KittiDataModule(config)

    # ------------------------
    # 3 WANDB LOGGER
    # ------------------------
    wandb_logger = WandbLogger()
    # optional: log model topology
    wandb_logger.watch(model.net)

    # ------------------------
    # 4 TRAINER
    # ------------------------
    trainer = pl.Trainer(
        gpus=-1,
        logger=wandb_logger,
        max_epochs=config.epochs,
        accumulate_grad_batches=config.grad_batches,
    )

    # ------------------------
    # 5 START TRAINING
    # ------------------------
    trainer.fit(model, kittiData)
def train():
    hp = {
        "epochs": 10,
        "lr_initial": 0.001,
        "lr_decay_every": 30,
        "lr_decay_by": 0.3,
    }
    config = {
        "data_path": "../data",
        "val_split": 0.05,
        "batch_size": 64,
        "manual_seed": 2,
        "output_path": "./output",
        "model_save_frequency": 5,
        "dataloader_num_workers": 0,
    }

    dataset = MnistDataset(**config)
    model = MnistModel(**hp, **config)

    wandb_logger = WandbLogger(project="classification_test", log_model=True)
    trainer = pl.Trainer(
        gpus=0,
        max_epochs=hp["epochs"],
        default_root_dir=config["output_path"],
        logger=wandb_logger,
    )
    wandb_logger.watch(model)
    trainer.fit(model, datamodule=dataset)
def main(args) -> None:
    """Entry point for model training."""
    config = load_cfg(args.config)
    pretty_printer = pprint.PrettyPrinter(indent=2)
    pretty_printer.pprint(config)

    model = BaselineLearner(config)

    logger = False
    if args.use_logger:
        logger = WandbLogger(name=config.name)
        logger.watch(model.net)

    trainer = pl.Trainer(
        gpus=args.gpus,
        logger=logger,
        callbacks=[
            ModelCheckpoint(
                monitor='valid_loss',
                dirpath=config.sources.ckpt_path,
                filename=config.name,
            )
        ],
        max_epochs=config.training.epochs,
        distributed_backend=args.distributed_backend,
        precision=16 if args.use_amp else 32,
    )
    trainer.fit(model)
    print('Model training completed!')
def main(hparams):
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = SegModel(hparams)

    # ------------------------
    # 2 SET WANDB LOGGER
    # ------------------------
    wandb_logger = WandbLogger()
    # optional: log model topology
    wandb_logger.watch(model.net)

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = pl.Trainer(
        gpus=hparams.gpus,
        logger=wandb_logger,
        max_epochs=hparams.epochs,
        accumulate_grad_batches=hparams.grad_batches,
        checkpoint_callback=False,
    )

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def train(config):
    fix_seeds(seed=config.train.seed)

    crnn = CRNNEncoder(
        in_channels=config.model.get('in_channels', 42),
        hidden_size=config.model.get('hidden_size', 16),
        dropout=config.model.get('dropout', 0.1),
        cnn_layers=config.model.get('cnn_layers', 2),
        rnn_layers=config.model.get('rnn_layers', 2),
        kernel_size=config.model.get('kernel_size', 9),
    )
    model = AttentionNet(
        crnn,
        hidden_size=config.model.get('hidden_size', 16),
        num_classes=config.model.get('num_classes', 3),
    )
    pl_model = KWSModel(
        model,
        lr=config.train.get('lr', 4e-5),
        in_channels=config.model.get('in_channels', 42),
        batch_size=config.train.get('batch_size', 32),
    )

    wandb_logger = WandbLogger(
        name=config.train.get('experiment_name', 'final_run'),
        project='kws-attention',
        log_model=True,
    )
    wandb_logger.log_hyperparams(config)
    wandb_logger.watch(model, log='all', log_freq=100)

    trainer = pl.Trainer(
        max_epochs=config.train.get('max_epochs', 15),
        logger=wandb_logger,
        gpus=config.train.get('gpus', 1),
    )
    trainer.fit(pl_model)
def main(hparams: Namespace):
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = SegModel(**vars(hparams))

    # ------------------------
    # 2 SET LOGGER
    # ------------------------
    logger = False
    if hparams.log_wandb:
        logger = WandbLogger()
        # optional: log model topology
        logger.watch(model.net)

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = pl.Trainer(
        gpus=hparams.gpus,
        logger=logger,
        max_epochs=hparams.epochs,
        accumulate_grad_batches=hparams.grad_batches,
        distributed_backend=hparams.distributed_backend,
        precision=16 if hparams.use_amp else 32,
    )

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
def main():
    print("Running main")
    print(time.ctime())

    default_config_path = "default_config.yaml"
    with open(default_config_path) as file:
        default_configs = yaml.load(file, Loader=yaml.FullLoader)

    wandb.init(config=default_configs, project=default_configs["project"])
    config = wandb.config

    if "random_seed" in dict(config).keys():
        set_random_seed(dict(config)["random_seed"])

    print("Initialising model")
    print(time.ctime())
    model_name = eval(dict(config)["model"])
    model = model_name(dict(config))

    logger = WandbLogger(save_dir=default_configs["artifacts"])
    logger.watch(model, log="all")

    if default_configs["gpus"] == 1:
        trainer = Trainer(
            gpus=1,
            max_epochs=default_configs["max_epochs"],
            logger=logger,
        )  # , strategy=CustomDDPPlugin(find_unused_parameters=False))
    else:
        trainer = Trainer(
            gpus=default_configs["gpus"],
            max_epochs=default_configs["max_epochs"],
            logger=logger,
            strategy=CustomDDPPlugin(find_unused_parameters=False),
        )
    trainer.fit(model)
def main(hparams, network):
    # init module
    model = network(hparams)
    print(model.hparams)

    project_folder = 'audio_emotion_team'
    wandb_logger = WandbLogger(
        name='lflb_dropout_rnn',
        project=project_folder,
        entity='thesis',
        offline=False,
    )

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        patience=20,
        verbose=False,
        mode='min',
    )

    # most basic trainer, uses good defaults
    trainer = Trainer(
        max_nb_epochs=hparams.max_nb_epochs,
        gpus=hparams.gpus,
        nb_gpu_nodes=hparams.nodes,
        logger=wandb_logger,
        # weights_summary='full',
        early_stop_callback=early_stop_callback,
        # profiler=True,
        benchmark=True,
        # log_gpu_memory='all'
    )

    # touch the experiment property to force the W&B run to initialize
    wandb_logger.experiment
    wandb_logger.watch(model)
    trainer.fit(model)
def main(hparams: Namespace):
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = SegModel(**vars(hparams))

    # ------------------------
    # 2 SET LOGGER
    # ------------------------
    logger = False
    if hparams.log_wandb:
        logger = WandbLogger()
        # optional: log model topology
        logger.watch(model.net)

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    # pass the logger explicitly, otherwise the one built above is never used
    trainer = pl.Trainer.from_argparse_args(hparams, logger=logger)

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)
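# A hedged sketch (not part of the source) of the argparse wiring that
# `main(hparams)` above assumes. `Trainer.add_argparse_args` is the pre-2.0
# Lightning helper that registers all Trainer flags; the model-specific flag
# shown is an illustrative guess:
if __name__ == '__main__':
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument('--log_wandb', action='store_true')
    parser = pl.Trainer.add_argparse_args(parser)
    main(parser.parse_args())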
def train_classifier(logging=False, train=True):
    hparams = {
        'gpus': [1],
        'max_epochs': 25,
        'num_classes': 700,
        'feature_dimension': 512,
        'model_dimension': 1024,
        'pretrained_text': False,
        'num_modalities': 1,
        'batch_size': 32,
        'learning_rate': 1e-3,
        'model_path': "/home/sgurram/Projects/aai/aai/experimental/sgurram/lava/src/wandb/run-20210626_215155-yqwe58z7/files/lava/yqwe58z7/checkpoints/epoch=6-step=12529.ckpt",
        'model_descriptor': 'lava timesformer 1/3 kinetics data, unshuffled',
        'accumulate_grad_batches': 2,
        'overfit_batches': 0,
        'type_modalities': 'av',
        'modality_fusion': 'concat',
        'loss_functions': ['cross_entropy'],
        'metrics': None,
        'optimizer': 'adam',
        'scheduler': 'n/a',
        'profiler': 'simple',
        'default_root_dir': '/home/sgurram/Desktop/video_lava_classifer',
    }

    model = EvalLightning(
        num_classes=hparams['num_classes'],
        feature_dimension=hparams['feature_dimension'],
        model_dimension=hparams['model_dimension'],
        num_modalities=hparams['num_modalities'],
        batch_size=hparams['batch_size'],
        learning_rate=hparams['learning_rate'],
        model_path=hparams['model_path'],
        model=LAVALightning,
        pretrained_text=hparams['pretrained_text'],
    )

    if logging:
        wandb_logger = WandbLogger(name='run', project='lava')
        wandb_logger.log_hyperparams(hparams)
        wandb_logger.watch(model, log='gradients', log_freq=10)
    else:
        wandb_logger = None

    if not train:
        return model

    trainer = pl.Trainer(
        default_root_dir=hparams['default_root_dir'],
        gpus=hparams['gpus'],
        max_epochs=hparams['max_epochs'],
        accumulate_grad_batches=hparams['accumulate_grad_batches'],
        overfit_batches=hparams['overfit_batches'],
        logger=wandb_logger,
        profiler=hparams['profiler'],
    )
    trainer.fit(model)
def train(dataset_name: str, model_name: str, expt_dir: str, data_folder: str,
          num_workers: int = 0, is_test: bool = False, resume_from_checkpoint: str = None):
    seed_everything(SEED)
    dataset_main_folder = data_folder
    vocab = Vocabulary.load(join(dataset_main_folder, "vocabulary.pkl"))

    if model_name == "code2seq":
        config_function = get_code2seq_test_config if is_test else get_code2seq_default_config
        config = config_function(dataset_main_folder)
        model = Code2Seq(config, vocab, num_workers)
        model.half()
    # elif model_name == "code2class":
    #     config_function = get_code2class_test_config if is_test else get_code2class_default_config
    #     config = config_function(dataset_main_folder)
    #     model = Code2Class(config, vocab, num_workers)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project=f"{model_name}-{dataset_name}", log_model=True, offline=True)
    wandb_logger.watch(model)
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(expt_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.hyperparams.save_every_epoch,
        save_top_k=3,
    )
    # define early stopping callback
    early_stopping_callback = EarlyStopping(patience=config.hyperparams.patience, verbose=True, mode="min")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()

    trainer = Trainer(
        max_epochs=20,
        gradient_clip_val=config.hyperparams.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.hyperparams.val_every_epoch,
        row_log_interval=config.hyperparams.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)
    trainer.save_checkpoint(join(expt_dir, 'Latest.ckpt'))

    trainer.test()
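# Version note (an assumption about the pytorch_lightning release these older
# snippets target): LearningRateLogger is the pre-1.0 callback name; on
# pytorch_lightning >= 1.0 the equivalent, also used in a later snippet in
# this section, is:
from pytorch_lightning.callbacks import LearningRateMonitor

lr_logger = LearningRateMonitor(logging_interval="step")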
def test_wandb_logger_init(wandb, recwarn):
    """Verify that basic functionality of wandb logger works.

    Wandb doesn't work well with pytest so we have to mock it out here.
    """
    # test wandb.init called when there is no W&B run
    wandb.run = None
    logger = WandbLogger()
    logger.log_metrics({'acc': 1.0})
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None)

    # mock wandb step
    wandb.init().step = 0

    # test wandb.init not called if there is a W&B run
    wandb.init().log.reset_mock()
    wandb.init.reset_mock()
    wandb.run = wandb.init()
    logger = WandbLogger()
    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({'acc': 1.0}, step=3)

    # continue training on same W&B run and offset step
    wandb.init().step = 3
    logger.finalize('success')
    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init().log.assert_called_with({'acc': 1.0}, step=6)

    # log hyper parameters
    logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]})
    wandb.init().config.update.assert_called_once_with(
        {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
        allow_val_change=True,
    )

    # watch a model
    logger.watch('model', 'log', 10)
    wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)

    # verify warning for logging at a previous step
    assert 'Trying to log at a previous step' not in get_warnings(recwarn)
    # current step from wandb should be 6 (last logged step)
    logger.experiment.step = 6
    # logging at step 2 should raise a warning (step_offset is still 3)
    logger.log_metrics({'acc': 1.0}, step=2)
    assert 'Trying to log at a previous step' in get_warnings(recwarn)
    # logging again at step 2 should not display again the same warning
    logger.log_metrics({'acc': 1.0}, step=2)
    assert 'Trying to log at a previous step' not in get_warnings(recwarn)

    assert logger.name == wandb.init().project_name()
    assert logger.version == wandb.init().id
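# `get_warnings` is a test helper not shown in this section; a minimal sketch
# of what it plausibly does with pytest's `recwarn` fixture (an assumption,
# not necessarily the project's actual helper): join the recorded warning
# messages into one string and clear the recorder, so each assertion above
# only sees warnings raised since the previous check.
def get_warnings(recwarn):
    warnings_text = '\n'.join(str(w.message) for w in recwarn.list)
    recwarn.clear()
    return warnings_text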
def test_wandb_logger_init(wandb):
    """Verify that basic functionality of wandb logger works.

    Wandb doesn't work well with pytest so we have to mock it out here.
    """
    # test wandb.init called when there is no W&B run
    wandb.run = None
    logger = WandbLogger(
        name="test_name", save_dir="test_save_dir", version="test_id", project="test_project", resume="never"
    )
    logger.log_metrics({"acc": 1.0})
    wandb.init.assert_called_once_with(
        name="test_name", dir="test_save_dir", id="test_id", project="test_project", resume="never", anonymous=None
    )
    wandb.init().log.assert_called_once_with({"acc": 1.0})

    # test wandb.init and setting logger experiment externally
    wandb.run = None
    run = wandb.init()
    logger = WandbLogger(experiment=run)
    assert logger.experiment

    # test wandb.init not called if there is a W&B run
    wandb.init().log.reset_mock()
    wandb.init.reset_mock()
    wandb.run = wandb.init()
    logger = WandbLogger()

    # verify default resume value
    assert logger._wandb_init["resume"] == "allow"

    with pytest.warns(UserWarning, match="There is a wandb run already in progress"):
        _ = logger.experiment

    logger.log_metrics({"acc": 1.0}, step=3)
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({"acc": 1.0, "trainer/global_step": 3})

    # continue training on same W&B run and offset step
    logger.finalize("success")
    logger.log_metrics({"acc": 1.0}, step=6)
    wandb.init().log.assert_called_with({"acc": 1.0, "trainer/global_step": 6})

    # log hyper parameters
    logger.log_hyperparams({"test": None, "nested": {"a": 1}, "b": [2, 3, 4]})
    wandb.init().config.update.assert_called_once_with(
        {"test": "None", "nested/a": 1, "b": [2, 3, 4]}, allow_val_change=True
    )

    # watch a model
    logger.watch("model", "log", 10, False)
    wandb.init().watch.assert_called_once_with("model", log="log", log_freq=10, log_graph=False)

    assert logger.name == wandb.init().project_name()
    assert logger.version == wandb.init().id
def main(args):
    # Load the dataset
    movielens = MovielensDataModule(
        args.data_dir,
        args.filename,
        args.split,
        args.threshold,
        args.negatives,
        args.batch_size,
        args.num_workers,
    )

    args.num_items = movielens.dataset.num_items
    args.num_users = movielens.dataset.num_users

    # Set up the model and logger
    model = ImplicitMatrixFactorization(hparams=args)
    if th.cuda.is_available() and args.gpus > 0:
        model.cuda()

    wandb_logger = WandbLogger(project="torch-factorization-models")
    wandb_logger.watch(model, log="all", log_freq=100)

    if args.early_stopping:
        args.early_stopping = EarlyStopping(monitor="tuning_loss")

    # Most basic trainer, uses good defaults
    trainer = Trainer.from_argparse_args(
        args,
        check_val_every_n_epoch=1,
        logger=wandb_logger,
        early_stop_callback=args.early_stopping,
    )

    if args.use_lr_finder:
        movielens.setup()
        lr_finder = trainer.lr_find(
            model,
            train_dataloader=movielens.train_dataloader(),
            val_dataloaders=[movielens.val_dataloader()],
            early_stop_threshold=None,
            min_lr=1e-6,
            max_lr=5e-1,
        )
        lr_finder.plot(suggest=True)
        plt.show(block=True)
    else:
        trainer.fit(model, movielens)

    # Save the model
    th.save(model.state_dict(), Path(wandb_logger.experiment.dir) / "model.pt")
def train(split, band_type):
    # Model init
    model = Densenet()

    # "/content/drive/Shared drives/EEG_Aditya/data/EEG3DTIME_3SPLIT.pt"
    train_dataset, validation_dataset, test_dataset = model.datasets(
        "/content/drive/Shared drives/EEG_Aditya/data/EEG3DTIME_3SPLIT.pt",
        split, band_type, [45, 21])
    train_dataloader, validation_dataloader, test_dataloader = model.dataloaders(
        train_dataset, validation_dataset, test_dataset, batch_size=256)

    # Logging
    model.model_tags.append(split)
    model.model_tags.append(band_type)
    model.model_tags.append("train:" + str(len(train_dataset)))
    model.model_tags.append("validation:" + str(len(validation_dataset)))
    model.model_tags.append("test:" + str(len(test_dataset)))
    model.model_tags.append("seed:" + str(model.seed))

    wandb_logger = WandbLogger(
        name=model.model_name,
        tags=model.model_tags,
        project="eeg-connectome-analysis",
        save_dir="/content/drive/Shared drives/EEG_Aditya/model-results/wandb",
        log_model=True)
    wandb_logger.watch(model, log='gradients', log_freq=100)

    # Checkpoints
    val_loss_cp = pl.callbacks.ModelCheckpoint(monitor='validation-loss')

    trainer = pl.Trainer(max_epochs=1000,
                         gpus=1,
                         logger=wandb_logger,
                         precision=16,
                         fast_dev_run=False,
                         auto_lr_find=True,
                         auto_scale_batch_size=True,
                         log_every_n_steps=1,
                         checkpoint_callback=val_loss_cp)
    trainer.fit(model, train_dataloader, validation_dataloader)
    print("Done training.")

    print("Testing model on last epoch.")
    # save the last-epoch weights next to the best checkpoint
    model_path = val_loss_cp.best_model_path
    model_path = model_path[:model_path.rfind('/') + 1] + "lastModel.ckpt"
    trainer.save_checkpoint(model_path)

    print(f"Testing model with best validation loss\t{val_loss_cp.best_model_score}.")
    model = model.load_from_checkpoint(val_loss_cp.best_model_path)
    results = trainer.test(model, test_dataloader)
    # retrain from scratch if the run underperforms
    if results[0]["test-accuracy"] < 0.675:
        train(split, band_type)
    print("Done testing.")
def train(config: DictConfig):
    filter_warnings()
    print_config(config)
    seed_everything(config.seed)

    known_models = {"code2seq": get_code2seq, "code2class": get_code2class, "typed-code2seq": get_typed_code2seq}
    if config.name not in known_models:
        print(f"Unknown model: {config.name}, try one of {known_models.keys()}")
        return

    vocabulary = Vocabulary.load_vocabulary(join(config.data_folder, config.dataset.name, config.vocabulary_name))
    model, data_module = known_models[config.name](config, vocabulary)

    # define logger
    wandb_logger = WandbLogger(
        project=f"{config.name}-{config.dataset.name}", log_model=True, offline=config.log_offline
    )
    wandb_logger.watch(model)
    # define model checkpoint callback
    checkpoint_callback = ModelCheckpoint(
        dirpath=wandb_logger.experiment.dir,
        filename="{epoch:02d}-{val_loss:.4f}",
        period=config.save_every_epoch,
        save_top_k=-1,
    )
    upload_checkpoint_callback = UploadCheckpointCallback(wandb_logger.experiment.dir)
    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyper_parameters.patience, monitor="val_loss", verbose=True, mode="min"
    )
    # define callback for printing intermediate result
    print_epoch_result_callback = PrintEpochResultCallback("train", "val")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateMonitor("step")

    trainer = Trainer(
        max_epochs=config.hyper_parameters.n_epochs,
        gradient_clip_val=config.hyper_parameters.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        log_every_n_steps=config.log_every_epoch,
        logger=wandb_logger,
        gpus=gpu,
        progress_bar_refresh_rate=config.progress_bar_refresh_rate,
        callbacks=[
            lr_logger,
            early_stopping_callback,
            checkpoint_callback,
            upload_checkpoint_callback,
            print_epoch_result_callback,
        ],
        resume_from_checkpoint=config.resume_from_checkpoint,
    )

    trainer.fit(model=model, datamodule=data_module)

    trainer.test()
def test_wandb_logger_init(wandb):
    """Verify that basic functionality of wandb logger works.

    Wandb doesn't work well with pytest so we have to mock it out here.
    """
    # test wandb.init called when there is no W&B run
    wandb.run = None
    logger = WandbLogger(
        name='test_name', save_dir='test_save_dir', version='test_id', project='test_project', resume='never'
    )
    logger.log_metrics({'acc': 1.0})
    wandb.init.assert_called_once_with(
        name='test_name', dir='test_save_dir', id='test_id', project='test_project', resume='never', anonymous=None
    )
    wandb.init().log.assert_called_once_with({'acc': 1.0})

    # test wandb.init and setting logger experiment externally
    wandb.run = None
    run = wandb.init()
    logger = WandbLogger(experiment=run)
    assert logger.experiment

    # test wandb.init not called if there is a W&B run
    wandb.init().log.reset_mock()
    wandb.init.reset_mock()
    wandb.run = wandb.init()
    logger = WandbLogger()

    # verify default resume value
    assert logger._wandb_init['resume'] == 'allow'

    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({'acc': 1.0, 'trainer/global_step': 3})

    # continue training on same W&B run and offset step
    logger.finalize('success')
    logger.log_metrics({'acc': 1.0}, step=6)
    wandb.init().log.assert_called_with({'acc': 1.0, 'trainer/global_step': 6})

    # log hyper parameters
    logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]})
    wandb.init().config.update.assert_called_once_with(
        {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
        allow_val_change=True,
    )

    # watch a model
    logger.watch('model', 'log', 10)
    wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)

    assert logger.name == wandb.init().project_name()
    assert logger.version == wandb.init().id
def main():
    print("Running main")
    print(time.ctime())
    args = parse_args()

    with open(args.config) as file:
        print(f"Using config file: {args.config}")
        default_configs = yaml.load(file, Loader=yaml.FullLoader)

    if args.checkpoint is not None:
        default_configs = torch.load(args.checkpoint)["hyper_parameters"]

    # Set random seed
    if args.random_seed is not None:
        set_random_seed(args.random_seed)
        default_configs["random_seed"] = args.random_seed
    elif "random_seed" in default_configs.keys():
        set_random_seed(default_configs["random_seed"])

    print("Initialising model")
    print(time.ctime())
    model_name = eval(default_configs["model"])
    model = model_name(default_configs)

    checkpoint_callback = ModelCheckpoint(monitor="tot_auc", mode="max", save_top_k=2, save_last=True)

    logger = WandbLogger(
        project=default_configs["project"],
        save_dir=default_configs["artifacts"],
    )
    logger.watch(model, log="all")

    if args.root_dir is None:
        if "SLURM_JOB_ID" in os.environ:
            default_root_dir = os.path.join(".", os.environ["SLURM_JOB_ID"])
        else:
            default_root_dir = None
    else:
        default_root_dir = os.path.join(".", args.root_dir)

    trainer = Trainer(
        gpus=default_configs["gpus"],
        num_nodes=default_configs["nodes"],
        max_epochs=default_configs["max_epochs"],
        logger=logger,
        strategy=CustomDDPPlugin(find_unused_parameters=False),
        callbacks=[checkpoint_callback],
        default_root_dir=default_root_dir,
    )
    trainer.fit(model, ckpt_path=args.checkpoint)
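# `CustomDDPPlugin` is project-specific and not defined in this section; a
# minimal sketch under the assumption that it merely forwards
# DistributedDataParallel kwargs (such as find_unused_parameters) the way
# pytorch_lightning's own DDPPlugin does. The real class may override hooks
# like configure_ddp().
from pytorch_lightning.plugins import DDPPlugin

class CustomDDPPlugin(DDPPlugin):
    pass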
def main(arg):
    seed_everything(42)
    model = PLModel(arg)

    wandb_logger = WandbLogger(project="Bachelorarbeit", name=arg.name)
    wandb_logger.watch(model)
    wandb_logger.log_hyperparams(arg)

    trainer = Trainer(
        gpus=2,
        logger=wandb_logger,
        distributed_backend='ddp',
        deterministic=True,
        auto_select_gpus=True,
        num_sanity_val_steps=0,
    )
    trainer.fit(model)
def experiment(args):
    utils.seed_everything(seed=args.seed)
    qa_model = models.QAModel(hparams=args)
    train_dl, valid_dl, test_dl = data.prepare_data(args)

    wandb_logger = WandbLogger(project='qa', entity='nlp', tags=args.tags, offline=args.fast_dev_run)
    wandb_logger.watch(qa_model, log='all')
    args.logger = wandb_logger

    trainer = pl.Trainer.from_argparse_args(args)
    trainer.fit(qa_model, train_dataloader=train_dl, val_dataloaders=valid_dl)
    trainer.test(qa_model, test_dataloaders=test_dl)
def main(hparams):
    if hparams.supress_logs:
        import logging
        logger = logging.getLogger("wandb")
        logger.setLevel(logging.ERROR)

    colornet = model.ColorNet()

    wandb_logger = WandbLogger(project='video-colorization', tags=["colornet"])
    wandb_logger.watch(colornet, log_freq=hparams.log_frequency)

    early_stopping = EarlyStopping('val_loss', patience=hparams.patience)
    checkpoint_callback = ModelCheckpoint(filepath='checkpoints/checkpoint_{epoch:02d}-{val_loss:.2f}')

    trainer = pl.Trainer(
        max_epochs=hparams.epoch,
        gpus=hparams.gpus,
        logger=wandb_logger,
        early_stop_callback=early_stopping,
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(colornet)
def main():
    with open("../lightning_modules/GNNEmbedding/train_toy_gnn.yaml") as f:
        hparams = yaml.load(f, Loader=yaml.FullLoader)

    model = AttentionNodeEmbedding(hparams)

    wandb_logger = WandbLogger(project="End2End-ToyNodeEmbedding")
    wandb_logger.watch(model)

    trainer = Trainer(
        gpus=1,
        max_epochs=hparams["max_epochs"],
        logger=wandb_logger,
        num_sanity_val_steps=0,
        accumulate_grad_batches=1,
    )
    trainer.fit(model)
def train(model_name: str, n_cr: int, num_workers: int = 0, is_test: bool = False,
          resume_from_checkpoint: str = None):
    seed_everything(SEED)

    if model_name == "improved_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=True)
    elif model_name == "default_gan":
        config_function = get_gan_test_config if is_test else get_gan_default_config
        config = config_function(n_cr)
        model = GAN(config, num_workers, improved=False)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project="GAN", log_model=True, offline=is_test)
    wandb_logger.watch(model, log="all")
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(wandb.run.dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.save_every_epoch,
        save_top_k=3,
    )
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()

    trainer = Trainer(
        max_epochs=config.n_epochs,
        deterministic=True,
        check_val_every_n_epoch=config.val_every_epoch,
        row_log_interval=config.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)

    trainer.test()
def test_wandb_logger_init(wandb, recwarn):
    """Verify that basic functionality of wandb logger works.

    Wandb doesn't work well with pytest so we have to mock it out here.
    """
    # test wandb.init called when there is no W&B run
    wandb.run = None
    logger = WandbLogger()
    logger.log_metrics({'acc': 1.0})
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({'acc': 1.0})

    # test wandb.init not called if there is a W&B run
    wandb.init().log.reset_mock()
    wandb.init.reset_mock()
    wandb.run = wandb.init()
    logger = WandbLogger()
    logger.log_metrics({'acc': 1.0}, step=3)
    wandb.init.assert_called_once()
    wandb.init().log.assert_called_once_with({'acc': 1.0, 'trainer/global_step': 3})

    # continue training on same W&B run and offset step
    logger.finalize('success')
    logger.log_metrics({'acc': 1.0}, step=6)
    wandb.init().log.assert_called_with({'acc': 1.0, 'trainer/global_step': 6})

    # log hyper parameters
    logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]})
    wandb.init().config.update.assert_called_once_with(
        {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
        allow_val_change=True,
    )

    # watch a model
    logger.watch('model', 'log', 10)
    wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)

    assert logger.name == wandb.init().project_name()
    assert logger.version == wandb.init().id
def main(hparams):
    """
    Main testing routine specific for this project

    :param hparams: Namespace containing configuration values
    :type hparams: Namespace
    """
    # ------------------------
    # 1 INIT MODEL
    # ------------------------
    model = get_model(hparams)
    model.load_state_dict(torch.load(hparams.checkpoint_file)["state_dict"])
    model.eval()
    name = "-".join([hparams.model, hparams.out, "-test"])

    # ------------------------
    # 2 LOGGING SETUP
    # ------------------------
    tb_logger = TensorBoardLogger(save_dir="logs/tb_logs/", name=name)
    tb_logger.experiment.add_graph(model, model.data[0][0].unsqueeze(0))
    wandb_logger = WandbLogger(
        name=hparams.comment if hparams.comment else time.ctime(),
        project=name,
        save_dir="logs",
    )
    wandb_logger.watch(model, log="all", log_freq=200)
    wandb_logger.log_hyperparams(model.hparams)
    for file in [
        i for s in [glob(x) for x in ["*.py", "dataloader/*.py", "model/*.py"]] for i in s
    ]:
        shutil.copy(file, wandb.run.dir)

    trainer = pl.Trainer(gpus=hparams.gpus, logger=[wandb_logger])  # , tb_logger],

    # ------------------------
    # 3 START TESTING
    # ------------------------
    trainer.test(model)
def train(cfg):
    """
    Trains the classifier.
    """
    if cfg.name == "auto":
        cfg.name = Haikunator().haikunate()
    train_csv = Path(cfg.dataroot) / cfg.train_csv

    logger.info(f"Starting run {cfg.name}")

    model = HiggsClassifier(hp=cfg.hparams.model)
    data = HiggsDataModule(
        trainfile=train_csv,
        trainset_prop=cfg.train_val_split_frac,
        hp=cfg.hparams.trainer,
    )
    data.prepare()
    logger.info(f"Train set size: {data.trainsize}, Validation set size: {data.valsize}")

    os.makedirs(cfg.runroot, exist_ok=True)
    if cfg.logger == "wandb":
        ml_logger = WandbLogger(
            project="higgs",
            name=cfg.name,
            save_dir=cfg.runroot,
            log_model="all",
            id=cfg.name,
        )
        ml_logger.watch(model, log="all")
    elif cfg.logger == "csv":
        ml_logger = CSVLogger(save_dir=cfg.runroot, name="higgs", version=cfg.name)
    else:
        # fail fast instead of hitting an unbound ml_logger below
        raise ValueError(f"Unknown logger: {cfg.logger}")

    checkpoint = ModelCheckpoint(monitor="val_loss", mode="min")

    start = datetime.now()
    trainer = Trainer(
        default_root_dir=cfg.runroot,
        max_epochs=cfg.hparams.trainer.n_epochs,
        logger=ml_logger,
        callbacks=[checkpoint],
    )
    trainer.fit(model, data)
    end = datetime.now()
    logger.info(f"Took {end - start} to finish training.")
def main():
    print("Running main")
    print(time.ctime())
    args = parse_args()

    with open(args.config) as file:
        default_configs = yaml.load(file, Loader=yaml.FullLoader)

    print("Initialising model")
    print(time.ctime())
    model_name = eval(default_configs["model"])
    model = model_name(default_configs)

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss", mode="min", save_top_k=2, save_last=True
    )

    logger = WandbLogger(
        project=default_configs["project"],
        save_dir=default_configs["artifacts"],
    )
    logger.watch(model, log="all")

    if args.root_dir is None:
        if "SLURM_JOB_ID" in os.environ:
            default_root_dir = os.path.join(".", os.environ["SLURM_JOB_ID"])
        else:
            default_root_dir = None
    else:
        default_root_dir = os.path.join(".", args.root_dir)

    trainer = Trainer(
        gpus=default_configs["gpus"],
        max_epochs=default_configs["max_epochs"],
        logger=logger,
        strategy="ddp",
        num_sanity_val_steps=0,
        callbacks=[checkpoint_callback],
        default_root_dir=default_root_dir,
    )
    trainer.fit(model)
def main(hparams, network):
    # init module
    model = network(hparams)

    project_folder = 'audio_emotion_team'
    wandb_logger = WandbLogger(
        name='lflb_dropout_rnn',
        project=project_folder,
        entity='thesis',
        offline=False,
    )

    early_stop_callback = EarlyStopping(
        monitor='val_loss',
        min_delta=0.00,
        patience=20,
        verbose=False,
        mode='min',
    )

    # most basic trainer, uses good defaults
    trainer = Trainer(
        max_nb_epochs=hparams.max_nb_epochs,
        gpus=hparams.gpus,
        nb_gpu_nodes=hparams.nodes,
        logger=wandb_logger,
        weights_summary='full',
        early_stop_callback=early_stop_callback,
        profiler=True,
        benchmark=True,
        log_gpu_memory='all',
    )

    wandb_logger.experiment.config.update({'dataset': 'IEMOCAP_SPECT_GS_8s_512h_2048n'})
    wandb_logger.watch(model)
    trainer.fit(model)

    # load best model
    exp_folder = project_folder + '/version_' + wandb_logger.experiment.id
    model_file = os.listdir(exp_folder + '/checkpoints')[0]

    # eval and upload best model
    model = network.load_from_checkpoint(exp_folder + '/checkpoints/' + model_file)
    report(model, wandb_logger)
    copyfile(exp_folder + '/checkpoints/' + model_file, wandb_logger.experiment.dir + '/model.ckpt')
    wandb_logger.experiment.save('model.ckpt')