# per environment step for epsilon-greedy: linearly anneal epsilon
def on_step(self):
    k = max(self.eps_decay - self.num_timesteps, 0) / self.eps_decay
    self.eps = self.eps_final + k * (self.eps_init - self.eps_final)

# Inference and evaluation of our model; returns the action
def predict(self, x, deterministic=True):
    out = self.qnet(x)
    if deterministic:
        out = torch.max(out, dim=1)[1]
    else:
        eps = torch.rand_like(out[:, 0])
        eps = (eps < self.eps).float()
        # argmax over uniform noise picks a uniformly random action index
        out = eps * torch.rand_like(out).max(dim=1)[1] + \
            (1 - eps) * out.max(dim=1)[1]
    return out.long().cpu().numpy()

def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=3e-4)
    return optimizer

if __name__ == '__main__':
    model = Model(env='CartPole-v1', eval_env='CartPole-v1')
    trainer = pl.Trainer(max_epochs=20, gradient_clip_val=0.5)
    trainer.fit(model)
    rewards, lengths = model.evaluate(num_eval_episodes=10, render=True)
    print(np.mean(rewards), np.mean(lengths))
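# For intuition, a standalone sketch of the linear epsilon schedule implemented
# in on_step() above (eps_init, eps_final, and eps_decay are illustrative
# values, not the original configuration):
eps_init, eps_final, eps_decay = 1.0, 0.05, 10_000
for t in [0, 2_500, 5_000, 10_000, 20_000]:
    k = max(eps_decay - t, 0) / eps_decay
    print(t, eps_final + k * (eps_init - eps_final))
# -> 1.0, 0.7625, 0.525, 0.05, 0.05: linear decay, clamped at eps_final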
def main(args):
    # Seed everything first. seed=42 is recommended since, according to
    # Douglas Adams, it gives the best results.
    pl.seed_everything(args.seed)

    # Data augmentation pipeline.
    # Uses HSV color jitter so the network focuses on textural rather than
    # color features. Assumes the dataset yields tensors (there is no
    # ToTensor() step before Normalize).
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomHorizontalFlip(),  # random horizontal flip
            transforms.RandomRotation(degrees=360),  # orientation doesn't matter
            transforms.RandomCrop((512, 512)),  # random crop to a smaller size
            transforms.ColorJitter(hue=0.5),  # jitter the hue
            transforms.Resize((224, 224)),  # resize to ResNet/VGG input size
            transforms.Normalize(mean=MEAN_TRAIN, std=STD_TRAIN)  # ImageNet pixel statistics
        ]),
        'val': transforms.Compose([
            transforms.CenterCrop((512, 512)),  # center crop
            transforms.Resize((224, 224)),  # resize
            transforms.Normalize(mean=MEAN_TRAIN, std=STD_TRAIN)  # match pretrained ImageNet weights
        ])
    }

    # Set base working path
    base_path = Path("..")

    # Initialize the training and validation datasets
    train_dataset_base = ThinSectionDataset(base_path, args.labelset,
                                            preload_images=True,
                                            transform=data_transforms['train'],
                                            train=True,
                                            seed=args.seed)
    val_dataset = ThinSectionDataset(base_path, args.labelset,
                                     preload_images=True,
                                     transform=data_transforms['val'],
                                     train=False,
                                     seed=args.seed)

    train_loader = DataLoader(train_dataset_base,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=args.pin_memory)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers,
                            pin_memory=args.pin_memory)

    if args.plot:
        visualize_batch(train_loader)
        visualize_batch(val_loader)

    # Use the Weights & Biases logger
    wandb_logger = WandbLogger(name='lukas-mosser', project='neural-rock', entity='ccg')
    wandb_logger.experiment.config.update(args)

    # Checkpoint models based on validation F1 score
    checkpointer = ModelCheckpoint(monitor="val/f1", verbose=True, mode="max")

    # Initialize the PyTorch Lightning Trainer
    trainer = pl.Trainer(gpus=-1,
                         max_epochs=None,
                         logger=[wandb_logger],
                         callbacks=[checkpointer],
                         log_every_n_steps=args.log_every_n_steps,
                         distributed_backend=args.distributed_backend,
                         max_steps=args.steps,
                         benchmark=args.benchmark)

    # Build the model
    if args.model == 'vgg':
        feature_extractor, classifier = make_vgg11_model(
            num_classes=train_dataset_base.num_classes, dropout=args.dropout)
    elif args.model == 'resnet':
        feature_extractor, classifier = make_resnet18_model(
            num_classes=train_dataset_base.num_classes, dropout=args.dropout)

    # Initialize the NeuralRockModel with the feature extractor and classifier
    model = NeuralRockModel(feature_extractor,
                            classifier,
                            train_dataset_base.num_classes,
                            freeze_feature_extractor=args.freeze_feature_extractor)

    # Run the actual training
    trainer.fit(model, train_dataloader=train_loader, val_dataloaders=val_loader)
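# MEAN_TRAIN and STD_TRAIN above are assumed to be precomputed per-channel
# statistics. A minimal sketch of one way to derive them; `plain_loader` is a
# hypothetical DataLoader over the training images with only ToTensor() applied:
n_pixels = 0
channel_sum = torch.zeros(3)
channel_sq = torch.zeros(3)
for images, _ in plain_loader:
    b, c, h, w = images.shape
    n_pixels += b * h * w
    channel_sum += images.sum(dim=[0, 2, 3])
    channel_sq += (images ** 2).sum(dim=[0, 2, 3])
mean = channel_sum / n_pixels                     # per-channel mean
std = (channel_sq / n_pixels - mean ** 2).sqrt()  # per-channel std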
    args = parser.parse_args()
    return args


def infer_image_size(image_size):
    image_size = image_size.split(",")
    if len(image_size) > 2:
        image_size = image_size[:2]
    if len(image_size) == 1:
        image_size.append(image_size[0])
    image_size = list(map(int, image_size))
    return image_size


if __name__ == "__main__":
    args = parse_args()

    test_dataset = TestDataset(root=args.root,
                               image_size=infer_image_size(args.image_size))
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        pin_memory=True,
        shuffle=False,
    )

    model = RetinaNet.from_checkpoint(checkpoint_path=args.weights, name=args.name)
    logger = pl.loggers.TensorBoardLogger(args.output_dir, name="predictions")
    trainer = pl.Trainer(gpus=args.gpus, logger=logger)
    trainer.test(model, test_dataloaders=test_dataloader)
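# infer_image_size normalizes the --image_size CLI string into [height, width]:
#
#   infer_image_size("512")        # -> [512, 512] (single value is duplicated)
#   infer_image_size("640,480")    # -> [640, 480]
#   infer_image_size("640,480,3")  # -> [640, 480] (extra components are dropped)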
    return [optim], [scheduler]


# %%
# Train the MoCo model
# --------------------
#
# We can instantiate the model and train it using the lightning trainer.

# use a GPU if available
gpus = 1 if torch.cuda.is_available() else 0

model = MocoModel()
trainer = pl.Trainer(max_epochs=max_epochs, gpus=gpus,
                     progress_bar_refresh_rate=100)
trainer.fit(model, dataloader_train_moco)

# %%
# Train the Classifier
model.eval()
classifier = Classifier(model.backbone)
trainer = pl.Trainer(max_epochs=max_epochs, gpus=gpus,
                     progress_bar_refresh_rate=100)
trainer.fit(classifier, dataloader_train_classifier, dataloader_test)

# %%
# Check out the tensorboard logs while the model is training.
#
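# To follow training, point TensorBoard at Lightning's log directory (the
# default `lightning_logs/` location is assumed here):
#
#   tensorboard --logdir lightning_logs/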
def train(
    self,
    train_data: Union[str, TokenDataset],
    output_dir: str = "trained_model",
    fp16: bool = False,
    fp16_opt_level: str = "O1",
    n_gpu: int = -1,
    tpu_cores: int = 0,
    max_grad_norm: float = 0.5,
    gradient_accumulation_steps: int = 1,
    seed: int = None,
    learning_rate: float = 1e-3,
    weight_decay: float = 0.05,
    adam_epsilon: float = 1e-8,
    warmup_steps: int = 0,
    num_steps: int = 5000,
    save_every: int = 1000,
    generate_every: int = 1000,
    n_generate: int = 1,
    loggers: List = None,
    batch_size: int = 1,
    num_workers: int = None,
    benchmark: bool = True,
    avg_loss_smoothing: float = 0.01,
    save_gdrive: bool = False,
    run_id: str = f"ATG_{datetime.utcnow():%Y%m%d_%H%M%S}",
    progress_bar_refresh_rate: int = 20,
    freeze_layers: bool = False,
    num_layers_freeze: int = None,
    use_deepspeed: bool = False,
    **kwargs,
) -> None:
    """
    Trains/finetunes the model on the provided file/dataset using pytorch-lightning.

    :param train_data: Either a TokenDataset containing the samples to be trained,
    or a string containing the text to be trained (shortcut instead of dataset)
    :param output_dir: A string indicating where to store the resulting model file folder.
    :param fp16: Boolean whether to use fp16, assuming using a compatible GPU/TPU.
    :param fp16_opt_level: Option level for FP16/APEX training.
    :param n_gpu: Number of GPUs to use (-1 implies all available GPUs)
    :param tpu_cores: Number of TPU cores to use (should be a multiple of 8)
    :param max_grad_norm: Maximum gradient norm for gradient clipping
    :param gradient_accumulation_steps: Number of gradient accumulation steps
    :param seed: Integer representing the training seed.
    :param learning_rate: Learning rate for the default AdamW optimizer.
    :param weight_decay: Weight decay for the default AdamW optimizer.
    :param warmup_steps: Warmup steps for the default AdamW optimizer.
    :param num_steps: Number of samples through the dataset.
    :param save_every: Number of steps between each model save to disk
    :param generate_every: Number of steps between each sample-text generation
    :param n_generate: Number of texts to generate when generate_every occurs.
    :param loggers: pytorch-lightning logger(s) to log results.
    :param batch_size: Number of input samples per batch
    :param num_workers: Number of DataLoader workers
    :param benchmark: If using GPU, whether to use cudnn.benchmark
    :param avg_loss_smoothing: Smoothing factor for the average loss in the progress bar
    :param save_gdrive: If using Colab, whether to save the notebook to Google Drive
    at each save_every
    :param run_id: Run identifier; used for save_gdrive
    :param progress_bar_refresh_rate: How often to update the progress bar while training.
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if save_gdrive:
        assert (
            "google.colab" in sys.modules
        ), "You must be in Colaboratory to copy to your Google Drive"
        create_gdrive_folder(run_id)

    self.model = self.model.train()
    is_gpu_used = torch.cuda.is_available() and n_gpu != 0

    if isinstance(train_data, str):
        block_size = model_max_length(self.model.config)
        logger.info(
            f"Loading text from {train_data} with generation length of {block_size}."
        )
        train_data = TokenDataset(
            tokenizer=self.tokenizer,
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token=self.unk_token,
            file_path=train_data,
            block_size=block_size,
            **kwargs,
        )

    if freeze_layers or self.openai_tf_gpt2 == "1558M":
        logger.info("Layer freezing enabled for model training.")
        freeze_layers = True
        if num_layers_freeze:
            assert (
                num_layers_freeze < self.model.config.n_layer
            ), "You are freezing more Transformer layers than are in the model."

    if num_workers is None:
        # Use all CPU cores as workers if not training on the CPU
        if is_gpu_used or tpu_cores > 0:
            num_workers = os.cpu_count()
        # If training on the CPU, use half the CPUs
        else:
            num_workers = int(os.cpu_count() / 2)

    hparams = dict(
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        adam_epsilon=adam_epsilon,
        warmup_steps=warmup_steps,
        batch_size=batch_size,
        num_steps=num_steps,
        pin_memory=is_gpu_used,
        num_workers=num_workers,
        save_every=save_every,
        generate_every=generate_every,
        use_tpu=tpu_cores > 0,
    )

    # Wrap the model in a pytorch-lightning module
    train_model = ATGTransformer(self.model, train_data, hparams, self.tokenizer)

    # Begin training
    if seed:
        set_seed(seed)

    if os.path.exists(output_dir) and "pytorch_model.bin" in os.listdir(output_dir):
        logger.warning(
            f"pytorch_model.bin already exists in /{output_dir} and will be overwritten!"
        )

    # if a GPU was requested but CUDA is unavailable, fall back to the CPU
    if not is_gpu_used:
        n_gpu = 0

    # force single-GPU on Windows
    if platform.system() == "Windows" and is_gpu_used and n_gpu != 1:
        logger.warning(
            "Windows does not support multi-GPU training. Setting to 1 GPU."
        )
        n_gpu = 1

    # use the DeepSpeed plugin if installed and specified
    deepspeed_plugin = None
    if is_gpu_used and use_deepspeed:
        deepspeed_plugin = DeepSpeedPlugin()
        logger.info("Using DeepSpeed training.")
        if not fp16:
            logger.info("Setting FP16 to True for DeepSpeed ZeRO Training.")
            fp16 = True

    train_params = dict(
        accumulate_grad_batches=gradient_accumulation_steps,
        gpus=n_gpu,
        max_steps=num_steps,
        gradient_clip_val=max_grad_norm,
        checkpoint_callback=False,
        logger=loggers if loggers else False,
        weights_summary=None,
        progress_bar_refresh_rate=progress_bar_refresh_rate,  # ignored
        callbacks=[
            ATGProgressBar(
                save_every,
                generate_every,
                output_dir,
                n_generate,
                is_gpu_used,
                avg_loss_smoothing,
                run_id,
                save_gdrive,
                progress_bar_refresh_rate,
                freeze_layers,
                num_layers_freeze,
            )
        ],
        plugins=deepspeed_plugin,
    )

    if fp16:
        train_params["precision"] = 16
        train_params["amp_level"] = fp16_opt_level

    if tpu_cores > 0:
        train_params["tpu_cores"] = tpu_cores
        train_params["gpus"] = 0
        n_gpu = 0

    # benchmark gives a boost for GPUs if the input size is constant,
    # which will always be the case with aitextgen training
    if is_gpu_used and benchmark:
        train_params["benchmark"] = True

    if n_gpu > 1:
        train_params["distributed_backend"] = "ddp"

    trainer = pl.Trainer(**train_params)
    trainer.fit(train_model)

    logger.info(f"Saving trained model pytorch_model.bin to /{output_dir}")

    self.model.save_pretrained(output_dir)

    if save_gdrive:
        for pt_file in ["pytorch_model.bin", "config.json"]:
            shutil.copyfile(
                os.path.join(output_dir, pt_file),
                os.path.join("/content/drive/My Drive/", run_id, pt_file),
            )

    if seed:
        reset_seed()
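# A minimal usage sketch, assuming this is aitextgen's train() method
# ("input.txt" is a hypothetical training file):
#
#   from aitextgen import aitextgen
#
#   ai = aitextgen()
#   ai.train("input.txt", num_steps=5000, batch_size=1, generate_every=1000)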
def main():
    pl.seed_everything(42)  # set seed

    # Argument Setting -------------------------------------------------------
    parser = argparse.ArgumentParser()

    # mode specific ----------------------------------------------------------
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to train text classifier.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to predict on real dataset.")

    # model specific ---------------------------------------------------------
    parser.add_argument("--text_reader",
                        help="bert, kobert, koelectra, others, ...",
                        default="bert")

    # experiment settings ----------------------------------------------------
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")  # bert has 512 tokens.
    parser.add_argument("--batch_size", help="batch size", default=32, type=int)
    parser.add_argument("--gpu_id", help="gpu device id", default="0")

    parser = pl.Trainer.add_argparse_args(parser)
    parser = NER.add_model_specific_args(parser)
    args = parser.parse_args()
    # -------------------------------------------------------------------------

    # Dataset -----------------------------------------------------------------
    from dataset import NER_Data_Module
    dm = NER_Data_Module("ner", args.text_reader, args.max_seq_length, args.batch_size)
    dm.prepare_data()
    # -------------------------------------------------------------------------

    # Model Checkpoint --------------------------------------------------------
    from pytorch_lightning.callbacks import ModelCheckpoint
    model_name = '{}'.format(args.text_reader)
    model_folder = './model/{}/{}'.format("ner", model_name)
    checkpoint_callback = ModelCheckpoint(monitor='val_loss',
                                          dirpath=model_folder,
                                          filename='{epoch:02d}-{val_loss:.2f}')
    # -------------------------------------------------------------------------

    # Early Stopping ----------------------------------------------------------
    early_stop_callback = EarlyStopping(monitor="val_loss", patience=3, verbose=True)
    # -------------------------------------------------------------------------

    # Trainer -----------------------------------------------------------------
    trainer = pl.Trainer(
        gpus=args.gpu_id if platform.system() != 'Windows' else 1,  # <-- for dev. pc
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stop_callback])
    # -------------------------------------------------------------------------

    # Do train!
    if args.do_train:
        model = NER("ner", args.text_reader, dm.num_labels, dm.label_vocab)
        trainer.fit(model, dm)

    # Do predict!
    if args.do_predict:
        model_files = glob(os.path.join(model_folder, '*.ckpt'))
        best_fn = model_files[-1]  # assumes the last checkpoint alphabetically is the best
        model = NER.load_from_checkpoint(best_fn)
        trainer.test(model, test_dataloaders=[dm.test_dataloader()])
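# Example invocation (the script name is hypothetical):
#
#   python train_ner.py --do_train --text_reader bert --max_seq_length 128 --batch_size 32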
if args.cfg is None:
    raise Exception('Cfg not specified')
cfg_name = args.cfg.split('/')[-1]  # check for None before splitting

module = LyftModule(args.cfg)

if args.resume is not None:
    print("loading from", args.resume)
    # resume training from the given checkpoint
    module = LyftModule.load_from_checkpoint(args.resume)

default_root_dir = '/var/data/hdd1/{}/lyft_checkpoints/'.format(cfg_name)
checkpoint_callback = ModelCheckpoint(
    filepath=default_root_dir,
    save_top_k=5,
    verbose=True,
    monitor='avg_val_loss',
    mode='min',
    prefix='_')
early_stop = EarlyStopping(monitor='avg_val_loss', verbose=True, patience=10, mode='min')

print('using default root dir', default_root_dir)
trainer = pl.Trainer(gpus=1,
                     max_epochs=cfg['train_params']['epochs'],
                     default_root_dir=default_root_dir,
                     callbacks=[early_stop, checkpoint_callback])
trainer.fit(module)
def _integration(data_with_covariates, tmp_path, gpus, data_loader_kwargs=None,
                 train_only=False, **kwargs):
    # avoid a mutable default argument
    data_loader_kwargs = data_loader_kwargs or {}
    data_loader_default_kwargs = dict(
        target="target",
        time_varying_known_reals=["price_actual"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["agency"],
        add_relative_time_idx=True,
    )
    data_loader_default_kwargs.update(data_loader_kwargs)
    dataloaders_with_covariates = make_dataloaders(data_with_covariates,
                                                   **data_loader_default_kwargs)
    train_dataloader = dataloaders_with_covariates["train"]
    val_dataloader = dataloaders_with_covariates["val"]
    test_dataloader = dataloaders_with_covariates["test"]

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4,
                                        patience=1, verbose=False, mode="min",
                                        strict=False)

    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(
        max_epochs=3,
        gpus=gpus,
        gradient_clip_val=0.1,
        callbacks=[early_stop_callback],
        enable_checkpointing=True,
        default_root_dir=tmp_path,
        limit_train_batches=2,
        limit_val_batches=2,
        limit_test_batches=2,
        logger=logger,
    )

    net = DecoderMLP.from_dataset(train_dataloader.dataset,
                                  learning_rate=0.015,
                                  log_gradient_flow=True,
                                  log_interval=1000,
                                  hidden_size=10,
                                  **kwargs)
    net.size()  # smoke-check that parameter counting works
    try:
        if train_only:
            trainer.fit(net, train_dataloaders=train_dataloader)
        else:
            trainer.fit(
                net,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader,
            )
        # check loading
        net = DecoderMLP.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

        # check prediction
        net.predict(val_dataloader, fast_dev_run=True, return_index=True,
                    return_decoder_lengths=True)
        # check test dataloader
        test_outputs = trainer.test(net, dataloaders=test_dataloader)
        assert len(test_outputs) > 0
    finally:
        shutil.rmtree(tmp_path, ignore_errors=True)

    net.predict(val_dataloader, fast_dev_run=True, return_index=True,
                return_decoder_lengths=True)
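# Example invocation of the helper above; data_with_covariates and tmp_path
# mirror the pytest fixtures this helper appears to be written for:
#
#   _integration(data_with_covariates, tmp_path, gpus=0, train_only=True)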
else:
    resume_from = None

lit = LitModel(opt)
# warning: grad_clip_mode is ignored.
trainer = pl.Trainer(
    callbacks=[
        OnEpochStartCallback(),
        pl.callbacks.lr_logger.LearningRateLogger()
    ],
    default_root_dir=opt.checkpoint_path,
    resume_from_checkpoint=resume_from,
    distributed_backend='ddp',
    check_val_every_n_epoch=1,
    max_epochs=opt.max_epochs,
    gradient_clip_val=opt.grad_clip_value,
    gpus=torch.cuda.device_count(),
    checkpoint_callback=checkpoint_callback,
    log_gpu_memory='min_max',
    log_save_interval=opt.losses_log_every,
    profiler=True,
    row_log_interval=10,  # what is it?
    num_sanity_val_steps=0,
    # limit_train_batches=500,
    # progress_bar_refresh_rate=0,
    # fast_dev_run=True,
)

if os.getenv('EVALUATE', '0') == '1':
    trainer.test(lit)
else:
    trainer.fit(lit)
def main():
    # ------------
    # args
    # ------------
    torch.manual_seed(0)
    pl.seed_everything(0)
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-folder', type=str, dest='data_folder',
                        help='data folder mounting point')
    parser.add_argument('--batch-size', type=int, dest='batch_size', default=50,
                        help='mini batch size for training')
    parser.add_argument('--epoch', type=int, dest='epoch', default=10,
                        help='epoch size for training')
    parser.add_argument('--learning-rate', type=float, dest='learning_rate',
                        default=0.001, help='learning rate')
    parser.add_argument('--momentum', type=float, dest='momentum', default=0.9,
                        help='momentum')
    parser.add_argument('--model-name', type=str, dest='model_name',
                        default='resnet', help='fine-tuning model name')
    parser.add_argument('--optimizer', type=str, dest='optimizer', default='SGD',
                        help='optimizer to use for training')
    parser.add_argument('--criterion', type=str, dest='criterion',
                        default='cross_entropy',
                        help='loss function to use for training')
    # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('--feature_extract', type=bool, dest='feature_extract',
                        default=True,
                        help='Flag for feature extracting. When False, we finetune the whole model; '
                             'when True we only update the reshaped layer params')
    args = parser.parse_args()
    args.num_workers = 8

    data_folder = args.data_folder
    print('training dataset is stored here:', data_folder)

    input_size = 224
    if args.model_name == "inception":
        input_size = 299

    # ---------------------------
    # Azure Machine Learning
    # 1) get the Azure ML run context and log hyperparameters
    # ---------------------------
    run = Run.get_context()
    run.log('model_name', args.model_name)
    run.log('optimizer', args.optimizer)
    run.log('criterion', args.criterion)
    run.log('lr', float(args.learning_rate))
    run.log('momentum', float(args.momentum))
    # For your tagging
    # run.tag('description', 'xxx')

    # ------------
    # data
    # ------------
    transform = transforms.Compose([
        # Augmentation
        # transforms.RandomHorizontalFlip(),
        # transforms.RandomVerticalFlip(),
        transforms.RandomAffine(degrees=[-10, 10], translate=(0.1, 0.1), scale=(0.5, 1.5)),
        transforms.RandomRotation(degrees=10),
        # Resize
        transforms.Resize(int(input_size * 1.3)),
        transforms.CenterCrop(input_size),
        # Tensor
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    dataset = torchvision.datasets.ImageFolder(args.data_folder, transform)
    args.num_classes = len(dataset.classes)

    # 70/15/15 train/val/test split
    n_train = int(len(dataset) * 0.7)
    n_val = int(len(dataset) * 0.15)
    n_test = len(dataset) - n_train - n_val
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [n_train, n_val, n_test])

    train_loader = torch.utils.data.DataLoader(train_dataset, args.batch_size,
                                               shuffle=True, drop_last=True,
                                               num_workers=args.num_workers)
    val_loader = torch.utils.data.DataLoader(val_dataset, args.batch_size,
                                             num_workers=args.num_workers)
    test_loader = torch.utils.data.DataLoader(test_dataset, args.batch_size)

    # Initialize the model for this run
    model_ft, input_size = initialize_model(args.model_name, args.num_classes,
                                            feature_extract=args.feature_extract,
                                            use_pretrained=True)
    model = FineTurningModel(args, model_ft)

    # GPU configuration
    num_gpu = torch.cuda.device_count()
    print('num_gpu:', num_gpu)
    accelerator = None
    if num_gpu > 1:
        accelerator = 'ddp'  # only for a single machine

    # ------------
    # training
    # ------------
    trainer = pl.Trainer(max_epochs=args.epoch, gpus=num_gpu, accelerator=accelerator)
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # Test (not validation)
    # ------------
    test_result = trainer.test(test_dataloaders=test_loader)
    run.log('test_acc', [res["test_acc"] for res in test_result][0])
    run.log('test_loss', [res["test_loss"] for res in test_result][0])
    run.log('test_acc_epoch', [res["test_acc_epoch"] for res in test_result][0])
    run.log('test_loss_epoch', [res["test_loss_epoch"] for res in test_result][0])

    # ------------
    # save model
    # ------------
    outputdir = './outputs/model'
    os.makedirs(outputdir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(outputdir, 'model.dict'))
    torch.save(model, os.path.join(outputdir, 'model.pt'))
cfg = parse_config(args.cfg_file)
data_cfg = get_data_info(cfg['data'])
cfg['data'] = data_cfg
args.cfg = cfg

ckpt_fd = "{}".format(args.output_directory) + "/{epoch:02d}_{train_mAP:.3f}_{val_mAP:.3f}"
ckpt_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
    filepath=ckpt_fd,
    verbose=True,
    save_top_k=-1
)
es_cb = pl.callbacks.EarlyStopping("val_mAP", mode="max", verbose=True, patience=10)

mixer = mixers.BackgroundAddMixer()
args.tr_mixer = mixers.UseMixerWithProb(mixer, args.mixer_prob)

tr_tfs = get_transforms_fsd_chunks(True, 101)
val_tfs = get_transforms_fsd_chunks(False, 101)

args.tr_tfs = tr_tfs
args.val_tfs = val_tfs

net = FSD50k_Lightning(args)
precision = 16 if args.fp16 else 32
trainer = pl.Trainer(gpus=args.gpus,
                     max_epochs=args.epochs,
                     precision=precision,
                     accelerator="dp",
                     num_sanity_val_steps=4170,
                     callbacks=[ckpt_callback, es_cb],
                     resume_from_checkpoint=args.resume_from,
                     logger=TensorBoardLogger(args.log_directory))
trainer.fit(net)
def main():
    dm = DataModule(training_path=cfg['path']['training_path'],
                    validation_path=cfg['path']['validation_path'],
                    test_path=cfg['path']['test_path'],
                    num_workers=cfg['num_workers'],
                    size=cfg['size'],
                    batch_size=cfg['batch_size'],
                    means=cfg['means'],
                    std=cfg['std'])
    model = CustomTrainClass(model_train=cfg['model_train'],
                             num_classes=cfg['num_classes'],
                             diffaug_activate=cfg['diffaug_activate'],
                             policy=cfg['policy'],
                             aug=cfg['aug'])

    # validation is effectively skipped via check_val_every_n_epoch=9999999
    if not cfg['use_amp']:
        trainer = pl.Trainer(
            num_sanity_val_steps=0,
            stochastic_weight_avg=cfg['use_swa'],
            log_every_n_steps=50,
            resume_from_checkpoint=cfg['path']['checkpoint_path'],
            check_val_every_n_epoch=9999999,
            logger=None,
            gpus=cfg['gpus'],
            max_epochs=cfg['max_epochs'],
            progress_bar_refresh_rate=cfg['progress_bar_refresh_rate'],
            default_root_dir=cfg['default_root_dir'],
            callbacks=[
                CheckpointEveryNSteps(
                    save_step_frequency=cfg['save_step_frequency'],
                    save_path=cfg['path']['checkpoint_save_path'])
            ])
    else:
        trainer = pl.Trainer(
            num_sanity_val_steps=0,
            stochastic_weight_avg=cfg['use_swa'],
            log_every_n_steps=50,
            resume_from_checkpoint=cfg['path']['checkpoint_path'],
            check_val_every_n_epoch=9999999,
            logger=None,
            gpus=cfg['gpus'],
            precision=16,
            amp_level='O1',
            max_epochs=cfg['max_epochs'],
            progress_bar_refresh_rate=cfg['progress_bar_refresh_rate'],
            default_root_dir=cfg['default_root_dir'],
            callbacks=[
                CheckpointEveryNSteps(
                    save_step_frequency=cfg['save_step_frequency'],
                    save_path=cfg['path']['checkpoint_save_path'])
            ])

    if cfg['path']['pretrain']:
        import torch
        model.netD.load_state_dict(torch.load(cfg['path']['pretrain']), strict=False)
        print("Pretrain pth loaded!")

    #############################################
    # Loading a Model
    #############################################
    # For resuming training
    if cfg['path']['checkpoint_path'] is not None:
        # Option 1: load from checkpoint, using the model as a pretrain and
        # disregarding the other parameters:
        #   model = model.load_from_checkpoint(checkpoint_path)
        # Warning: when starting training from a checkpoint this way, global_step
        # is apparently reset to zero, which overwrites validation images; you
        # could manually add an offset.
        #
        # Option 2: continue training from the checkpoint (restores optimizer
        # state and counters). See:
        # https://github.com/PyTorchLightning/pytorch-lightning/issues/2613
        # https://pytorch-lightning.readthedocs.io/en/0.6.0/pytorch_lightning.trainer.training_io.html
        # https://github.com/PyTorchLightning/pytorch-lightning/issues/4333
        # Checkpoint keys: dict_keys(['epoch', 'global_step',
        # 'pytorch-lightning_version', 'callbacks', 'optimizer_states',
        # 'lr_schedulers', 'state_dict', 'hparams_name', 'hyper_parameters'])
        #
        # To use DDP for local multi-GPU training, add find_unused_parameters=True
        # inside the DDP command.
        model = model.load_from_checkpoint(cfg['path']['checkpoint_path'])
        checkpoint = torch.load(cfg['path']['checkpoint_path'])
        trainer.checkpoint_connector.restore(checkpoint, on_gpu=True)
        trainer.checkpoint_connector.restore_training_state(checkpoint)
        pl.Trainer.global_step = checkpoint['global_step']
        pl.Trainer.epoch = checkpoint['epoch']
        print("Checkpoint was loaded successfully.")

    #############################################
    trainer.fit(model, dm)
monitor="val_loss", mode="min", save_top_k=5) train_params = dict( accumulate_grad_batches=args.gradient_accumulation_steps, gpus=args.gpus, max_epochs=args.num_train_epochs, #early_stop_callback=True, gradient_clip_val=args.max_grad_norm, checkpoint_callback=checkpoint_callback, callbacks=[LoggingCallback()], ) if args.n_gpu > 1: train_params["distributed_backend"] = "dp" #tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path) # initialize model model = SeqGenSQL(args) # restore full training state # trainer = pl.Trainer(resume_from_checkpoint='t5_checkpoints/epoch=15.ckpt', gpus=1, ) # multi GPUs: #trainer = pl.Trainer(resume_from_checkpoint='t5_checkpoints/base_gated_e03_0.2470.ckpt', **train_params) trainer = pl.Trainer(**train_params) # Train trainer.fit(model)
a = SSAD(confs)

checkpoint_dir = os.path.join(confs["log_dir"], 'checkpoints/')
checkpoint = ModelCheckpoint(checkpoint_dir, monitor='val_loss', mode='min',
                             verbose=True, save_top_k=5)
early_stop_callback = EarlyStopping(monitor='val_loss', patience=20,
                                    verbose=True, mode='min')

with open(os.path.join(confs["log_dir"], "confs.yml"), "w") as f:
    yaml.dump(confs, f)

logger = TensorBoardLogger(os.path.dirname(confs["log_dir"]),
                           confs["log_dir"].split("/")[-1])

trainer = pl.Trainer(
    max_nb_epochs=confs["training"]["n_epochs"],
    gpus=confs["gpus"],
    checkpoint_callback=checkpoint,
    accumulate_grad_batches=confs["training"]["accumulate_batches"],
    early_stop_callback=early_stop_callback,
    logger=logger,
    gradient_clip=bool(confs["training"]["gradient_clip"]),
    gradient_clip_val=confs["training"]["gradient_clip"])
trainer.fit(a)
def evaluate(args):
    """Evaluates a detection model based on a configuration

    :param args: argparse arguments specifying input configuration and output folder
    """
    # create data paths dict
    data_paths = {
        'train_images': args.train_images,
        'train_labels': args.train_labels,
        'test_images': args.test_images,
        'test_labels': args.test_labels,
    }

    resume_ckpt = args.ckpt_path
    batch_size = args.batch_size
    minibatch_size = args.minibatch_size
    sampler = args.sampler
    lr = args.lr
    epochs = args.epochs
    no_classes = args.classes
    class_weights = args.weights
    input_size = args.input_size
    anchors = args.anchors
    output = args.output
    transform_norm_parameters = args.transform_norm_parameters

    logger = pl_loggers.TensorBoardLogger('{}/custom_ssd_ckpt/logs'.format(output))

    # add background class
    no_classes = no_classes + 1

    # initialize model
    model, bbox_encoder = create_detection_model(anchors,
                                                 input_size=input_size,
                                                 no_classes=no_classes)
    loss = ODLoss(minibatch_size, sampler, class_weights)

    # initialize detection model from the best checkpoint in the given directory
    train_model = None
    if os.path.isdir(resume_ckpt):
        tmodel_params = {
            'model': model,
            'input_size': input_size,
            'lr': lr,
            'epochs': epochs,
            'loss': loss,
        }
        resume_ckpt = io_ops.get_best_ckpt(resume_ckpt)
        train_model = DetectionModel.load_from_checkpoint(resume_ckpt, **tmodel_params)
    else:
        print("Checkpoint path is not a directory")
        exit(-1)

    # create checkpoint-creation callback
    checkpoint_callback = ModelCheckpoint(monitor='test_loss', save_top_k=3,
                                          save_last=True, mode='min')

    # initialize trainer; limit_train_batches=0 and limit_val_batches=0 skip
    # training and validation so that only testing runs
    trainer = pl.Trainer(logger=logger,
                         max_epochs=epochs,
                         gpus=1,
                         num_sanity_val_steps=0,
                         resume_from_checkpoint=resume_ckpt,
                         limit_train_batches=0,
                         limit_val_batches=0,
                         weights_save_path='{}/custom_rpn_ckpt'.format(output),
                         weights_summary='full',
                         callbacks=[checkpoint_callback])

    # initialize data module
    data_module = DataModule(batch_size, input_size, data_paths, no_classes,
                             bbox_encoder, transform_norm_parameters)

    # "train" model (a no-op here because train/val batches are limited to 0)
    trainer.fit(train_model, data_module)

    # test model
    trainer.test(test_dataloaders=data_module.test_dataloader())

    version_number = trainer.logger.version
    version_path = '{}/custom_ssd_ckpt/default/version_{}'.format(output, version_number)
    if not os.path.exists(version_path):
        os.mkdir(version_path)
    print('Version is: {}'.format(version_path))

    # save all metrics in json
    preds = train_model.get_test_preds()
    io_ops.save_dict(preds, os.path.join(version_path, 'test_preds.json'))

    # COCO evaluation
    cocoGt = COCO(data_paths['test_labels'])
    cocoDt = cocoGt.loadRes(os.path.join(version_path, 'test_preds.json'))
    annType = 'bbox'
    imgIds = sorted(cocoGt.getImgIds())

    cocoEval = COCOeval(cocoGt, cocoDt, annType)
    cocoEval.params.imgIds = imgIds
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

    train_model.log('mAP IoU=0.50:0.95', round(cocoEval.stats[0], 2))
    train_model.log('mAP IoU=0.50', round(cocoEval.stats[1], 2))
    batch_size=64, num_workers=4)

try:
    path = config["checkpoint"]
    experiment = Experiment.load_from_checkpoint(path)
except KeyError:
    model_hpparams = model_config(config)
    print(model_hpparams)
    experiment = Experiment(**model_hpparams)

trainer_config = config["trainer"]
logger = TensorBoardLogger(prefix)
if trainer_config == "tune":
    trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                         logger=logger,
                         callbacks=[checkpoint_callback],
                         auto_lr_find=True)
    trainer.tune(experiment, train_dataloader, validate_dataloader)
else:
    try:
        path = config["checkpoint"]
        trainer = pl.Trainer(resume_from_checkpoint=path,
                             gpus=1 if torch.cuda.is_available() else 0,
                             logger=logger,
                             callbacks=[checkpoint_callback],
                             max_epochs=4000)
    except KeyError:
        trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                             logger=logger,
                             callbacks=[checkpoint_callback],
                             max_epochs=4000)
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    model = HifiGanModel(cfg=cfg.model, trainer=trainer)
    model.maybe_init_from_pretrained_checkpoint(cfg=cfg)
    trainer.fit(model)
train_config["batch_size"] = 256
# train_config["accumulate_grad_batches"] = 12
train_config["gradient_clip_val"] = 1.5
train_config["learning_rate"] = 1e-4

pl.seed_everything(42)

wandb_logger = WandbLogger(name='PAN13_12-24-24_KUEP', project='AVDV_PAN13')
wandb_save(wandb_logger, train_config)

model = LightningLongformerCLS(train_config)
# model = LightningLongformerCLS.load_from_checkpoint("AVDV/2npel9bz/checkpoints/epoch=7-step=2639.ckpt", config=train_config)

cp_valloss = ModelCheckpoint(save_top_k=5, monitor='val_loss', mode='min')
trainer = pl.Trainer(
    max_epochs=train_config["epochs"],
    # accumulate_grad_batches=train_config["accumulate_grad_batches"],
    gradient_clip_val=train_config["gradient_clip_val"],
    gpus=[5],
    num_nodes=1,
    # accelerator='ddp',
    amp_backend='native',
    precision=16,
    logger=wandb_logger,
    log_every_n_steps=1,
    val_check_interval=0.5,
    limit_val_batches=40,
    checkpoint_callback=cp_valloss)

trainer.fit(model)
    return torch.optim.Adam(self.parameters(), lr=1e-3)

def train_dataloader(self):
    train_ds = ImageListDs(images=train.images.values,
                           labels=train.labels.values,
                           aug=train_aug)
    # note: `n` and `sampler` are computed but unused; the loader shuffles instead
    n = int(len(train_ds) / 10)
    sampler = RandomSampler(data_source=train_ds)
    train_loader = DataLoader(train_ds, shuffle=True, num_workers=12,
                              batch_size=BATCH_SIZE)
    return train_loader

def val_dataloader(self):
    valid_ds = ImageListDs(images=valid.images.values,
                           labels=valid.labels.values,
                           aug=valid_aug)
    valid_loader = DataLoader(valid_ds, shuffle=False, num_workers=12,
                              batch_size=BATCH_SIZE)
    return valid_loader

torch.multiprocessing.freeze_support()
deep_fake_module = DeepFakeModule()
trainer = pl.Trainer(gpus=2, distributed_backend='dp', max_epochs=2)
trainer.fit(deep_fake_module)
@torch.no_grad()
def get_resnet_layers(self, x):
    activations = self.resnet_extractor(x)
    # alternative: downsample 'early' instead of upsampling the deeper layers
    # activation_transform = {
    #     'early': nn.AvgPool2d(kernel_size=(2, 2), stride=(2, 2)),
    #     'middle': lambda x: x,
    #     'deep': nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
    # }
    activation_transform = {
        'early': lambda x: x,
        'middle': nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
        'deep': nn.Upsample(scale_factor=4, mode="bilinear", align_corners=True),
    }
    return {
        'early': activation_transform['early'](activations["layer2"]),
        'middle': activation_transform['middle'](activations["layer3"]),
        'deep': activation_transform['deep'](activations["layer4"])
    }

if __name__ == "__main__":
    autoencoder = CorrespondenceEncoder()
    tb_logger = TensorBoardLogger('tb_logs', name='correspondence_encoder_lr1e3')
    trainer = pytorch_lightning.Trainer(
        logger=tb_logger,
        gpus=1 if torch.cuda.is_available() else None)
    dm = CorrespondenceDataModule()
    trainer.fit(autoencoder, dm)
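# Why the upsampling factors in get_resnet_layers work: in a standard
# torchvision ResNet, layer2/layer3/layer4 have output strides 8/16/32, so
# upsampling layer3 by 2x and layer4 by 4x matches layer2's spatial size.
# A minimal shape check (assumes torchvision >= 0.11 for create_feature_extractor):
import torch
import torchvision
from torchvision.models.feature_extraction import create_feature_extractor

extractor = create_feature_extractor(
    torchvision.models.resnet18(),
    return_nodes=["layer2", "layer3", "layer4"])
feats = extractor(torch.randn(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in feats.items()})
# {'layer2': (1, 128, 28, 28), 'layer3': (1, 256, 14, 14), 'layer4': (1, 512, 7, 7)}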
tst = tvt_dataset(dataset.source_tst, dataset.wizard_tst, dataset.target_tst)

trn_loader = DataLoader(train, batch_size=hparams.batch_size, collate_fn=pad_sequence)
val_loader = DataLoader(val, batch_size=hparams.batch_size, collate_fn=pad_sequence)
tst_loader = DataLoader(tst, batch_size=hparams.batch_size, collate_fn=pad_sequence)

model = Task(hparams, dataset)
logger = TensorBoardLogger('lightning_logs', name='sync')
trainer = pl.Trainer(
    progress_bar_refresh_rate=10,
    logger=logger,
    max_epochs=hparams.max_epoch,
    auto_lr_find=True,
    # gpus='0',  # hparams.cuda,
    # accelerator='ddp',
    gradient_clip_val=hparams.clip,
)
trainer.fit(model, trn_loader, val_loader)

# test once with the final weights (ckpt_path=None) and once with the best checkpoint
trainer.test(model, tst_loader, ckpt_path=None)
trainer.test(model, tst_loader, ckpt_path='best')
print(f'Random seed = {hparams.seed}.')
def test_trainer_loggers(self, cleanup_local_folder, tmp_path):
    """ Test that a trainer with a logger errors out with a number of arguments.
    Test that it works with create_tensorboard_logger set to False.
    """
    test_trainer = pl.Trainer()  # Should create logger and modelcheckpoint

    with pytest.raises(LoggerMisconfigurationError):
        # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"exp_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):
        # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"explicit_log_dir": str(tmp_path)})
    with pytest.raises(LoggerMisconfigurationError):
        # Fails because exp_manager defaults to trainer
        exp_manager(test_trainer, {"resume_if_exists": True})

    # Check that exp_manager uses trainer.logger, its exp_dir, name, and version
    log_dir = exp_manager(
        test_trainer,
        {"create_tensorboard_logger": False, "create_checkpoint_callback": False})
    assert log_dir.resolve() == Path("./lightning_logs/version_0").resolve()
    assert Path("./lightning_logs").exists()
    assert Path("./lightning_logs/version_0").exists()

    # Check that a trainer without a logger gets a logger attached to it
    test_trainer = pl.Trainer(logger=False)
    log_dir = exp_manager(
        test_trainer,
        {
            "create_tensorboard_logger": True,
            "create_checkpoint_callback": False,
            "exp_dir": str(tmp_path),
        },
    )
    assert isinstance(test_trainer.logger, pl.loggers.TensorBoardLogger)

    test_trainer = pl.Trainer(logger=False)
    # Check that create_wandb_logger=True errors out unless wandb_logger_kwargs is passed.
    with pytest.raises(ValueError):
        log_dir = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path),
                "create_wandb_logger": True,
            },
        )
    # Check that a WandbLogger is attached to logger if create_wandb_logger=True
    # and wandb_logger_kwargs has name and project
    log_dir = exp_manager(
        test_trainer,
        {
            "create_tensorboard_logger": False,
            "create_checkpoint_callback": False,
            "exp_dir": str(tmp_path),
            "create_wandb_logger": True,
            "wandb_logger_kwargs": {"name": "", "project": ""},
        },
    )
    assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)
    return tg.loader.DataLoader(list(self.dataset),
                                batch_size=self.batch_size,
                                num_workers=self.num_workers,
                                pin_memory=False,
                                shuffle=True)

def val_dataloader(self):
    return tg.loader.DataLoader(list(self.dataset),
                                batch_size=self.batch_size,
                                num_workers=self.num_workers,
                                pin_memory=False,
                                shuffle=True)

if __name__ == '__main__':
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    data_dir = os.path.join('GraphCoAttention', 'data')

    wandb.init()
    wandb_logger = WandbLogger(project='flux', log_model='all')

    trainer = pl.Trainer(gpus=[0], max_epochs=2000,
                         check_val_every_n_epoch=500,
                         accumulate_grad_batches=1)
    trainer.fit(
        Learner(data_dir, bs=20, lr=0.0005, n_cycles=30, hidden_dim=25, n_head=4))
def test_resume(self, tmp_path):
    """ Tests the resume capabilities of exp_manager """
    test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)

    # Error because explicit_log_dir does not exist
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "exp_dir": str(tmp_path / "test_resume"),
                "resume_if_exists": True,
                "explicit_log_dir": "Does_not_exist",
            },
        )

    # Error because checkpoints folder does not exist
    with pytest.raises(NotFoundError):
        exp_manager(test_trainer,
                    {"resume_if_exists": True, "exp_dir": str(tmp_path / "test_resume")})

    # No error because we tell exp_manager to ignore the NotFoundError
    exp_manager(
        test_trainer,
        {
            "resume_if_exists": True,
            "exp_dir": str(tmp_path / "test_resume_2"),
            "resume_ignore_no_checkpoint": True,
        },
    )

    test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True)
    # Error because no checkpoints exist in the folder
    with pytest.raises(NotFoundError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").touch()
    # Error because *end.ckpt in the folder indicates that training has already finished
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end.ckpt").unlink()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt").touch()
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").touch()
    # Error because multiple *last.ckpt files are in the folder. If there is more
    # than one, we don't know which to restore
    with pytest.raises(ValueError):
        exp_manager(
            test_trainer,
            {
                "resume_if_exists": True,
                "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
            },
        )

    # Finally succeed
    Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last.ckpt").unlink()
    log_dir = exp_manager(
        test_trainer,
        {
            "resume_if_exists": True,
            "explicit_log_dir": str(tmp_path / "test_resume" / "default" / "version_0"),
        },
    )
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.resume_from_checkpoint).resolve() == checkpoint.resolve()

    # Succeed again and make sure that run_0 exists and previous log files were moved
    test_trainer = pl.Trainer(checkpoint_callback=False, logger=False)
    exp_manager(test_trainer, {"resume_if_exists": True, "explicit_log_dir": str(log_dir)})
    checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last.ckpt")
    assert Path(test_trainer.resume_from_checkpoint).resolve() == checkpoint.resolve()
    prev_run_dir = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0")
    assert prev_run_dir.exists()
    prev_log = Path(tmp_path / "test_resume" / "default" / "version_0" / "run_0" / "lightning_logs.txt")
    assert prev_log.exists()
    max_seq_length=128,
    num_workers=8,
    num_preprocess_processes=96,
    use_sentence_selection=True,
    best_k_sentences=5,
)

# checkpoint_callback = ModelCheckpoint(
#     dirpath='./result/checkpoints/',
#     filename='epoch{epoch:02d}',
#     save_top_k=-1,
# )

trainer = pl.Trainer(
    logger=tb_logger,
    gpus=-1 if torch.cuda.is_available() else None,
    # callbacks=[checkpoint_callback],
    amp_backend='native',
    amp_level='O2',
    precision=16,
    accelerator='ddp',
    gradient_clip_val=1.0,
    max_epochs=1,
    plugins='ddp_sharded',
    val_check_interval=0.2,
    # limit_train_batches=0.1,
    # limit_val_batches=0.1,
    # accumulate_grad_batches=2,
)

trainer.fit(model, dm)
torch.save(model.model.state_dict(), 'pytorch_model.bin')
trainer.test(datamodule=dm)
def train_pl():
    # Square linear
    dataset = MinatarDataset(name="dataset_random_3000_bullet_matched.json")
    # dataset = MinatarDataset(name="dataset_random_3000_new_matched.json")
    # dataset = MinatarDataset(name="dataset_random_3000_full_matched.json")
    # dataset = MinatarDataset(name="asterix_dataset_random_3000.json")

    dim_dict = dataset.get_dims()
    env_len = dim_dict["action_len"]
    obj_in_len = dim_dict["obj_len"]
    type_len = dim_dict["type_len"]

    # Prepare the dataloaders with an 80/20 train/validation split
    dataset_size = len(dataset)
    train_size = int(dataset_size * 0.8)
    train_set, val_set = torch.utils.data.random_split(
        dataset, [train_size, dataset_size - train_size])
    train_data_loader = DataLoader(train_set, batch_size=1, num_workers=8,
                                   shuffle=True)  # pin_memory=True,
    val_data_loader = DataLoader(val_set, batch_size=1, num_workers=8,
                                 pin_memory=True)

    # Initialize the model
    # model = SetDSPN(
    #     obj_in_len=obj_in_len,
    #     obj_reg_len=2,
    #     obj_type_len=type_len,
    #     env_len=env_len,
    #     latent_dim=64,
    #     out_set_size=3,
    #     n_iters=10,
    #     internal_lr=50,
    #     overall_lr=1e-3,
    #     loss_encoder_weight=1
    # )
    model = SetTransformer(obj_in_len=obj_in_len,
                           obj_reg_len=2,
                           obj_type_len=type_len,
                           env_len=env_len,
                           out_set_size=3,
                           learning_rate=1e-4)

    # Early stop callback
    # early_stop_callback = EarlyStopping(
    #     monitor='val_loss',
    #     min_delta=0.00,
    #     patience=3,
    #     verbose=False,
    #     mode='min'
    # )

    # Native training loop (kept for reference)
    # optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # for i, batch in enumerate(train_data_loader):
    #     print(i)
    #     s, a, sprime, sappear, r = batch
    #     s, a, sappear = s.to(model.device), a.to(model.device), sappear.to(model.device)
    #     pred = model(s, a)
    #     losses = model.loss_fn(pred, sappear)
    #     optimizer.zero_grad()
    #     losses['loss_encoder'].backward()
    #     optimizer.step()

    # Train
    gpus = torch.cuda.device_count()  # currently unused; the Trainer pins gpus=1
    trainer = pl.Trainer(
        gpus=1,
        precision=16,
        max_epochs=16,
        # check_val_every_n_epoch=4,
        accumulate_grad_batches=64,
        profiler="simple",
        auto_lr_find=True,
        log_every_n_steps=5,
        # callbacks=[early_stop_callback]
    )

    run_lr_finder = False
    if run_lr_finder:
        # Find the ideal learning rate
        lr_finder = trainer.tuner.lr_find(model,
                                          train_dataloader=train_data_loader,
                                          val_dataloaders=val_data_loader,
                                          max_lr=0.1,
                                          min_lr=1e-5)
        # Results can be found in lr_finder.results; plot with:
        fig = lr_finder.plot(suggest=True)
        fig.show()
        # Pick a point based on the plot, or take the suggestion
        new_lr = lr_finder.suggestion()
    else:
        trainer.fit(model, train_data_loader, val_data_loader)

    # Evaluate
    # trainer.test(model, test_dataloaders=val_data_loader)
    evaluate(model=model)
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
    accelerator: Optional[str] = "ddp",
) -> TrainOutput:
    """Main function for launching a training with the lightning trainer

    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        accelerator: Backend for distributed training. Only DDP and DDP_CPU are supported.
        eval_only: True if running evaluation only.
    """
    assert (num_processes == 1 or num_gpus == 0), "Only set num_processes > 1 when training on CPUs"
    maybe_override_output_dir(cfg, output_dir)

    task = build_task(cfg, task_cls)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)
    trainer_params = {
        # training loop is bounded by max steps; use a large max_epochs to make
        # sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD if cfg.TEST.EVAL_PERIOD > 0 else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": accelerator,
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
def train_mnist(config):
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(max_epochs=10, show_progress_bar=False)
    trainer.fit(model)
def main(conf):
    train_set = WhamDataset(
        conf["data"]["train_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        segment=conf["data"]["segment"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )
    val_set = WhamDataset(
        conf["data"]["valid_dir"],
        conf["data"]["task"],
        sample_rate=conf["data"]["sample_rate"],
        nondefault_nsrc=conf["data"]["nondefault_nsrc"],
    )

    train_loader = DataLoader(
        train_set,
        shuffle=True,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    val_loader = DataLoader(
        val_set,
        shuffle=False,
        batch_size=conf["training"]["batch_size"],
        num_workers=conf["training"]["num_workers"],
        drop_last=True,
    )
    # Update the number of sources (it depends on the task)
    conf["masknet"].update({"n_src": train_set.n_src})

    model = DPRNNTasNet(**conf["filterbank"], **conf["masknet"])
    optimizer = make_optimizer(model.parameters(), **conf["optim"])
    # Define scheduler
    scheduler = None
    if conf["training"]["half_lr"]:
        scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)
    # Just after instantiating, save the args. Easy loading in the future.
    exp_dir = conf["main_args"]["exp_dir"]
    os.makedirs(exp_dir, exist_ok=True)
    conf_path = os.path.join(exp_dir, "conf.yml")
    with open(conf_path, "w") as outfile:
        yaml.safe_dump(conf, outfile)

    # Define the loss function.
    loss_func = PITLossWrapper(pairwise_neg_sisdr, pit_from="pw_mtx")
    system = System(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        scheduler=scheduler,
        config=conf,
    )

    # Define callbacks
    callbacks = []
    checkpoint_dir = os.path.join(exp_dir, "checkpoints/")
    checkpoint = ModelCheckpoint(checkpoint_dir, monitor="val_loss", mode="min",
                                 save_top_k=5, verbose=True)
    callbacks.append(checkpoint)
    if conf["training"]["early_stop"]:
        callbacks.append(
            EarlyStopping(monitor="val_loss", mode="min", patience=30, verbose=True))

    # Don't ask for GPUs if they are not available.
    gpus = -1 if torch.cuda.is_available() else None
    distributed_backend = "ddp" if torch.cuda.is_available() else None
    trainer = pl.Trainer(
        max_epochs=conf["training"]["epochs"],
        callbacks=callbacks,
        default_root_dir=exp_dir,
        gpus=gpus,
        distributed_backend=distributed_backend,
        gradient_clip_val=conf["training"]["gradient_clipping"],
    )
    trainer.fit(system)

    best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()}
    with open(os.path.join(exp_dir, "best_k_models.json"), "w") as f:
        json.dump(best_k, f, indent=0)

    # Reload the best checkpoint and export the bare model for inference.
    state_dict = torch.load(checkpoint.best_model_path)
    system.load_state_dict(state_dict=state_dict["state_dict"])
    system.cpu()

    to_save = system.model.serialize()
    to_save.update(train_set.get_infos())
    torch.save(to_save, os.path.join(exp_dir, "best_model.pth"))
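# A minimal usage sketch for the artifact saved above: models serialized with
# Asteroid's model.serialize() can typically be reloaded via from_pretrained
# (the path matches the torch.save call above):
#
#   from asteroid.models import DPRNNTasNet
#   model = DPRNNTasNet.from_pretrained(os.path.join(exp_dir, "best_model.pth"))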
labels = torch.LongTensor([])
for _, y in dl:
    labels = torch.cat((labels, y))

# inverse-frequency class weights: down-weight common classes
weight = torch.ones((7,))
label, counts = torch.unique(labels, return_counts=True)
weight[label] = weight[label] - counts / torch.sum(counts)

model = MobileNetV2Lightning(num_classes=7,
                             optimizer_args=optimizer_args,
                             participant_name='1',
                             weights=weight)
trainer = pl.Trainer(max_epochs=3)
trainer.fit(model, dl)
models.append(model)

from mlmi.clustering import flatten_model_parameter

model_states = [m.state_dict() for m in models]
keys = list(model_states[0].keys())
model_parameter = np.array(
    [flatten_model_parameter(m, keys).numpy() for m in model_states],
    dtype=float)
global_parameter = flatten_model_parameter(server.state_dict(), keys).cpu().numpy()
euclidean_dist = np.array([
    ((model_parameter[participant_id] - global_parameter)**2).sum(axis=0)