def create_trainer(cfg, tags=None, trial=None, callbacks=None):
    if trial:
        checkpoint_callback = pl.callbacks.ModelCheckpoint(f'trial#{trial.number}')
        new_callbacks = [PyTorchLightningPruningCallback(trial, 'val_loss')]
        if callbacks:
            new_callbacks.extend(callbacks)
        trainer = pl.Trainer(logger=False,
                             callbacks=new_callbacks,
                             checkpoint_callback=checkpoint_callback,
                             max_epochs=400,
                             progress_bar_refresh_rate=0,
                             weights_summary=None)
    else:
        trainer = pl.Trainer(
            logger=NeptuneLogger(project_name='yoniosin/amygdala',
                                 tags=tags,
                                 params=flatten(cfg, reducer='path')),
            max_epochs=cfg.learner.max_epochs,
            # callbacks=[pl.callbacks.EarlyStopping('val_loss', patience=200)]
            # fast_dev_run=True
        )

    return trainer
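# Illustrative sketch (not part of the original script): how create_trainer
# plugs into an Optuna objective. `LitModel` and the tuned `lr` field are
# assumptions; the pruning callback above watches 'val_loss'.
import optuna

def objective(trial):
    cfg.learner.lr = trial.suggest_float('lr', 1e-5, 1e-1, log=True)
    trainer = create_trainer(cfg, trial=trial)
    trainer.fit(LitModel(cfg))  # hypothetical LightningModule
    return trainer.callback_metrics['val_loss'].item()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)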
def main(args: Namespace) -> None:
    if args.seed is not None:
        pl.seed_everything(args.seed)

    if args.distributed_backend == 'ddp':
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / max(1, args.gpus))
        args.workers = int(args.workers / max(1, args.gpus))

    args.logger = NeptuneLogger(project_name='YOUR_PROJ/hypermixup',
                                experiment_name="experiment_name",
                                params={
                                    'model': "ResNet18",
                                    'hypernet': True,
                                    'dataset': args.dataset,
                                    'base': args.base,
                                    'z_dim': args.z_dim,
                                    'learning_rate': args.lr,
                                })

    model = HyperResNetCIFAR(**vars(args))
    trainer = pl.Trainer.from_argparse_args(args)

    if args.evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)
def main(config):
    solver = Solver(config)
    logger = NeptuneLogger(project_name=config.neptune_project,
                           api_key=config.neptune_api_key)
    checkpoint_callback = ModelCheckpoint(filepath=config.model_save_path,
                                          save_top_k=1,
                                          verbose=True,
                                          monitor="map",
                                          mode="max",
                                          prefix="")
    if config.model_load_path != ".":  # resume
        trainer = Trainer(default_root_dir=config.model_save_path,
                          gpus=config.gpu_id,
                          logger=logger,
                          checkpoint_callback=checkpoint_callback,
                          resume_from_checkpoint=config.model_load_path,
                          max_epochs=config.n_epochs)
    else:
        trainer = Trainer(default_root_dir=config.model_save_path,
                          gpus=config.gpu_id,
                          logger=logger,
                          checkpoint_callback=checkpoint_callback,
                          max_epochs=config.n_epochs)

    if config.mode == 'TRAIN':
        trainer.fit(solver)
        trainer.save_checkpoint(os.path.join(config.model_save_path, 'last.ckpt'))
    elif config.mode == 'TEST':
        S = torch.load(config.model_load_path)['state_dict']
        SS = {key[6:]: S[key] for key in S.keys()}
        solver.model.load_state_dict(SS)
        trainer.test(solver)
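# Illustrative sketch (assumption: checkpoint keys carry a "model." prefix):
# `key[6:]` above strips exactly those six characters; a prefix-checked
# variant avoids mangling any key that lacks the prefix.
S = torch.load(config.model_load_path)['state_dict']
SS = {(key[len('model.'):] if key.startswith('model.') else key): value
      for key, value in S.items()}
solver.model.load_state_dict(SS)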
def main():
    parser = HfArgumentParser((ModelArguments, ParaphraseDataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=2,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        model_max_length=data_args.model_max_length
    )
    language_model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.neptune_logging:
        neptune_logger = NeptuneLogger(
            project_name=os.environ['NEPTUNE_PROJECT'],
            experiment_name=model_args.config_name if model_args.config_name else model_args.model_name_or_path
        )

    train_dataset = ParaphraseDetectionDataset(data_dir=os.path.join(data_args.data_dir, TRAIN_PATH),
                                               tokenizer=tokenizer,
                                               task_name="paraphrase_detection")
    val_datasets = [
        ParaphraseDetectionDataset(data_dir=os.path.join(data_args.data_dir, EVAL_PATH),
                                   tokenizer=tokenizer,
                                   name=EVAL_NAME)
        for (EVAL_PATH, EVAL_NAME) in zip(EVAL_PATHS, EVAL_NAMES)
    ]
    model = LMFinetuner(language_model, tokenizer, training_args.learning_rate,
                        model_args.batch_size, train_dataset, val_datasets,
                        data_args, freeze_backend=False)
    tb_logger = pl_loggers.TensorBoardLogger(os.path.join(training_args.output_dir, model_args.model_name_or_path))
    trainer = pl.Trainer(
        # auto_lr_find=True,
        # auto_scale_batch_size=True,
        max_epochs=int(training_args.num_train_epochs),
        accumulate_grad_batches=training_args.gradient_accumulation_steps,
        weights_save_path=training_args.output_dir,
        gpus=torch.cuda.device_count(),
        precision=16 if training_args.fp16 and torch.cuda.is_available() else 32,
        distributed_backend='ddp' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
        progress_bar_refresh_rate=training_args.logging_steps,
        logger=[neptune_logger, tb_logger] if data_args.neptune_logging else tb_logger,
    )
    trainer.fit(model)
    model.lm.save_pretrained(os.path.join(training_args.output_dir, model_args.model_name_or_path))
def main(cfg: TradeConfig):
    data = TradeDataModule(**dict(cfg.data))
    model = TradeModule(cfg.data.look_back, data.ds.data.shape[1],
                        data.ds.full_data.shape[1])
    trainer = pl.Trainer(logger=[NeptuneLogger(project_name='yoniosin/Trade')],
                         max_epochs=cfg.max_epochs,
                         fast_dev_run=True)
    trainer.fit(model, datamodule=data)
def main():
    parser = general_args()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = RationaleSystem.add_model_specific_args(parser)
    args = parser.parse_args()

    neptune_logger = None
    if args.neptune_project is not None:
        from pytorch_lightning.loggers.neptune import NeptuneLogger
        neptune_logger = NeptuneLogger(api_key=args.neptune_key,
                                       project_name=args.neptune_project,
                                       params=vars(args))

    data = IMDBDataModule(args.batch_size)
    data.prepare_data()

    # if args.no_generator:
    #     gen = None
    # else:
    #     gen = GeneratorModel(args,
    #                          embeddings=data.text_field.vocab.vectors,
    #                          padding_idx=data.text_field.vocab.stoi['<pad>'])
    # enc = Encoder(args,
    #               embeddings=data.text_field.vocab.vectors,
    #               num_classes=len(data.label_field.vocab),
    #               padding_idx=data.text_field.vocab.stoi['<pad>'])

    model = RationaleSystem(args,
                            embeddings=data.text_field.vocab.vectors,
                            num_classes=len(data.label_field.vocab),
                            padding_idx=data.text_field.vocab.stoi['<pad>'])
    checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=os.getcwd(),
                                                       save_top_k=3,
                                                       save_weights_only=True,
                                                       verbose=True,
                                                       monitor='val_acc',
                                                       mode='max',
                                                       prefix='')
    earlystop_callback = EarlyStopping(monitor='val_acc', patience=args.patience)
    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=[checkpoint_callback, earlystop_callback],
        logger=neptune_logger)
    trainer.fit(model, data)
def main(hparams):
    set_seed(hparams.seed)

    checkpoint_callback = None
    if hparams.checkpoint_path:
        checkpoint_dir = os.path.dirname(os.path.abspath(hparams.checkpoint_path))
        print(f'Checkpoints will be saved to {checkpoint_dir}')
        checkpoint_callback = ModelCheckpoint(
            dirpath=checkpoint_dir,
            prefix=hparams.checkpoint_prefix,
            monitor=hparams.checkpoint_monitor,
            mode=hparams.checkpoint_monitor_mode,
            save_top_k=hparams.checkpoint_save_top_k,
            verbose=True,
        )

    if hparams.resume_from_checkpoint:
        print(f'Restoring checkpoint: {hparams.resume_from_checkpoint}')

    logger = NeptuneLogger(
        api_key=None,  # read from NEPTUNE_API_TOKEN environment variable
        project_name=hparams.project_name,
        experiment_name=hparams.experiment_name,
        tags=hparams.experiment_tags,
        close_after_fit=False,
        params=vars(hparams))

    dm = DocVQADataModule(hparams)
    dm.setup()

    model = LitEffNetT5(hparams)
    trainer = Trainer.from_argparse_args(
        hparams,
        logger=logger,
        # checkpoint_callback is None when no checkpoint path was given
        callbacks=[checkpoint_callback] if checkpoint_callback else [],
    )
    if hparams.do_train:
        trainer.fit(model, dm)
    if hparams.do_test:
        trainer.test(datamodule=dm)

    logger.experiment.stop()
def __init__(self, exp_name: str, max_epochs: int, batch_size: int,
             learning_rate: float):
    self.neptune_logger = NeptuneLogger(
        api_key="ANONYMOUS",
        project_name="shared/pytorch-ae-trainer",
        close_after_fit=False,
        experiment_name=exp_name,
        params={
            "max_epochs": max_epochs,
            "batch_size": batch_size,
            "lr": learning_rate
        },  # Optional,
        tags=["pytorch-lightning", "mlp"],
        # upload_source_files=['*.py', '*.yaml'],
        upload_stderr=False,
        upload_stdout=False)
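# Illustrative companion sketch (the method and trainer wiring are
# assumptions, not part of the original class): because the logger above uses
# close_after_fit=False, the experiment stays open after fit() and should be
# stopped explicitly.
def train(self, model: pl.LightningModule) -> None:
    trainer = pl.Trainer(logger=self.neptune_logger)
    trainer.fit(model)
    self.neptune_logger.experiment.stop()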
def init_neptune(args, api_key, project_name, experiment_name,
                 experiment_tags=None):
    import neptune
    from pytorch_lightning.loggers.neptune import NeptuneLogger

    # avoid a mutable default argument
    experiment_tags = experiment_tags or []
    params = vars(args)
    neptune.init(project_qualified_name=project_name, api_token=api_key)
    neptune_logger = NeptuneLogger(api_key=api_key,
                                   project_name=project_name,
                                   experiment_name=experiment_name,
                                   tags=experiment_tags,
                                   params=params)
    return neptune_logger
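# Illustrative usage sketch; the parser, project, and experiment names are
# assumptions, not part of the original helper.
args = parser.parse_args()
neptune_logger = init_neptune(args,
                              api_key=os.environ['NEPTUNE_API_TOKEN'],
                              project_name='my-team/my-project',
                              experiment_name='baseline-run',
                              experiment_tags=['baseline'])
trainer = pl.Trainer(logger=neptune_logger)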
# model_name=hparams['model_name'],
# width=hparams['width'],
# size=hparams['size'])

checkpoint_callback = ModelCheckpoint(filepath=model_config.weights_folder,
                                      save_top_k=1,
                                      verbose=True,
                                      monitor='val_loss',
                                      mode='min',
                                      prefix=hparams['model_name'])

neptune_logger = NeptuneLogger(
    api_key='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIs'
            'ImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa'
            '2V5IjoiMTIyODQyZGUtNTdiMS00MDBlLWEzZmYtMzU0N2Q4MDViMjQ0In0=',
    project_name='vadbeg/birds',
    experiment_name=f'{hparams["model_name"]}, CrossEntropyLoss, width=2048',
    params=hparams,
    tags=['pytorch-lightning', 'birds'])

trainer = Trainer(gpus=1,
                  num_nodes=1,
                  checkpoint_callback=checkpoint_callback,
                  max_epochs=hparams['max_epochs'],
                  logger=neptune_logger)
trainer.fit(model=model)
def cli_main():
    argv = sys.argv[1:]
    # argv = ['--config', 'configs/base.yaml',
    #         '--exp_name', 'test',
    #         '--exp_dir', '../prj_ssl_ntu_exps/test']
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", default=None,
                        help="where to load YAML configuration", metavar="FILE")
    parser.add_argument('--exp_dir', type=str, default=None,
                        help='experiment output directory')
    parser.add_argument('--path_db', type=str, default='../dbs',
                        help='neptune project directory')
    args = parser.parse_args(argv)

    new_exp_dir = args.exp_dir
    new_path_db = args.path_db
    with open(args.config, 'r') as stream:
        config_vars = yaml.load(stream, Loader=yaml.FullLoader)
    args = argparse.Namespace()
    args.__dict__.update(config_vars)
    if new_exp_dir is not None:
        args.exp_dir = new_exp_dir
    if new_path_db is not None:
        args.path_db = new_path_db

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # get model and model args
    model_type = vars(modules)[args.model]

    # get dataset and dataset args
    dataset_type = vars(datasets)[args.dataset]

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__,  # Optional,
            tags=["pl"],  # Optional,
            # upload_stderr=False,
            # upload_stdout=False
        )
    else:
        logger = TensorBoardLogger(args.exp_dir)  # , name="my_model"

    # ckpt = list(filter(lambda x: '.ckpt' in x, os.listdir(args.exp_dir)))[-1]
    # ckpt = os.path.join(args.exp_dir, ckpt)
    ckpts = list(filter(lambda x: 'epoch=' in x, os.listdir(args.exp_dir)))
    best_epoch = max(
        [int(x.replace('epoch=', '').replace('.ckpt', '')) for x in ckpts])
    best_ckpt = os.path.join(args.exp_dir, 'epoch=' + str(best_epoch) + '.ckpt')

    model = model_type.load_from_checkpoint(best_ckpt)
    lincls_results = lincls(args, model)
    print(best_ckpt)
    print('test results')
    for k, v in lincls_results.items():
        print(k, v)

    db_path = os.path.join(args.path_db, args.exp_name + '_db.csv')
    if os.path.exists(db_path):
        df = pd.read_csv(db_path, index_col=0)
    else:
        df = pd.DataFrame()

    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }
    output_dict.update(lincls_results)

    if args.neptune_key != '':
        # `pretrain_result` is not computed in this script (compare the
        # pretraining variant, which obtains it via trainer.test), so only
        # the linear-classification metrics are logged here
        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)

    df = df.append(output_dict, ignore_index=True)
    df.to_csv(db_path)
"/data/lyft-motion-prediction-autonomous-vehicles/lyft-config-files/agent_motion_config.yaml" ) cfg = omegaconf.DictConfig(cfg) name_for_save = 'Big_training' epochs = cfg["model_params"]["epochs"] learning_rate = cfg["model_params"]["lr"] training_percentage = cfg["train_data_loader"]["training_percentage"] validation_percentage = cfg["val_data_loader"]["validation_percentage"] API_KEY = os.environ.get('NEPTUNE_API_KEY') neptune_logger = NeptuneLogger( api_key=API_KEY, project_name='hvergnes/KaggleResNet', params={ 'epoch_nr': epochs, 'learning_rate': learning_rate, 'train_size': training_percentage, 'test_size': validation_percentage }, # your hyperparameters, immutable tags=['ResNet'], # tags ) os.environ[ "L5KIT_DATA_FOLDER"] = "/data/lyft-motion-prediction-autonomous-vehicles" dm = LocalDataManager() cfg = load_config_data( "/data/lyft-motion-prediction-autonomous-vehicles/lyft-config-files/agent_motion_config.yaml" ) cfg = omegaconf.DictConfig(cfg) rasterizer = build_rasterizer(cfg, dm)
# Step 3: Implement Callbacks and Create Them
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

lr_logger = LearningRateMonitor(**LearningRateMonitor_Params)
model_checkpoint = ModelCheckpoint(**ModelCheckpoint_Params)

# Step 4: Create NeptuneLogger
from pytorch_lightning.loggers.neptune import NeptuneLogger

neptune_logger = NeptuneLogger(
    api_key="ANONYMOUS",
    project_name="shared/pytorch-lightning-integration",
    close_after_fit=False,
    experiment_name="train-on-MNIST",
    params=ALL_PARAMS,
    tags=['1.x', 'advanced'],
)

# Step 5: Pass NeptuneLogger and Callbacks to the Trainer
trainer = pl.Trainer(logger=neptune_logger,
                     checkpoint_callback=model_checkpoint,
                     callbacks=[lr_logger],
                     **Trainer_Params)

# Step 6: Run experiment
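# A minimal Step 6 sketch: `model` comes from the model/data initialization
# step not shown in this excerpt, and the extra log_text call is an
# illustrative assumption. Because close_after_fit=False above, the experiment
# stays open after fit() and must be stopped explicitly.
trainer.fit(model)
neptune_logger.experiment.log_text('run-note', 'trained on MNIST')
neptune_logger.experiment.stop()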
def train_text2title(config_file: str,
                     train_file: str,
                     val_file: str,
                     train_sample_rate: float,
                     val_sample_rate: float,
                     output_title_model_path: str,
                     output_text_model_path: str,
                     random_seed: int,
                     neptune_project: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")
    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Loading vectors...")
    ft_model_path = config.pop("ft_vector_model_path",
                               "models/fasttext/ru_vectors_v3.bin")
    ft_model = ft_load_model(ft_model_path)

    print("Fetching data...")
    train_records = [
        r for r in read_tg_jsonl(train_file)
        if random.random() <= train_sample_rate
    ]
    val_records = [
        r for r in read_tg_jsonl(val_file)
        if random.random() <= val_sample_rate
    ]

    print("Building datasets...")
    max_words = config.get("max_words", 150)
    batch_size = config.get("batch_size", 64)
    num_workers = config.get("num_workers", 5)
    train_data = Text2TitleDataset(train_records, ft_model, max_words=max_words)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              num_workers=num_workers)
    val_data = Text2TitleDataset(val_records, ft_model, max_words=max_words)
    val_loader = DataLoader(val_data, batch_size=batch_size,
                            num_workers=num_workers)

    print("Training model...")
    epochs = config.get("epochs", 100)
    patience = config.get("patience", 4)
    model = Text2TitleModel()
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=True,
                                        mode="min")

    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Fasttext text2title",
            tags=["training", "pytorch-lightning", "text2title"],
            params=params)

    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      progress_bar_refresh_rate=100,
                      deterministic=True,
                      logger=logger)
    trainer.fit(model, train_loader, val_loader)
    model.save(output_title_model_path, output_text_model_path)
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = Trainer(logger=neptune_logger,
                      gpus=hparams.gpus,
                      checkpoint_callback=model_checkpoint)
    trainer.test(model)

    # Save checkpoints folder
    neptune_logger.experiment.log_artifact(CHECKPOINTS_DIR)
    # You can stop the experiment
    neptune_logger.experiment.stop()


# -------------------------------------------------------------------------------------------------------------------
CHECKPOINTS_DIR = '/home/rachneet/thesis_results/res_mixed_recordings/'
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="res_mixed_recordings",  # change this for new runs
)


# ---------------------------------------MAIN FUNCTION TRAINER-------------------------------------------------------
def main(hparams):
    model = LightningResnet(hparams)
    # exp = Experiment(save_dir=os.getcwd())
    if not os.path.exists(CHECKPOINTS_DIR):
        os.makedirs(CHECKPOINTS_DIR)
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    early_stop_callback = pl.callbacks.EarlyStopping(monitor='val_loss',
                                                     min_delta=0.00,
def main():
    params = Params()

    # api key
    api_key = os.environ["NEPTUNE"]  # if this throws an error, you didn't set your env var

    # save directory
    save_dir = os.getcwd() if not params.SAVE_DIR else params.SAVE_DIR

    # root directory
    root = ROOT_PATH / "pytorch_faster_rcnn_tutorial" / "data" / "heads"

    # input and target files
    inputs = get_filenames_of_path(root / "input")
    targets = get_filenames_of_path(root / "target")
    inputs.sort()
    targets.sort()

    # mapping
    mapping = {
        "head": 1,
    }

    # training transformations and augmentations
    transforms_training = ComposeDouble(
        [
            Clip(),
            AlbumentationWrapper(albumentation=albu.HorizontalFlip(p=0.5)),
            AlbumentationWrapper(
                albumentation=albu.RandomScale(p=0.5, scale_limit=0.5)
            ),
            # AlbuWrapper(albu=A.VerticalFlip(p=0.5)),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # validation transformations
    transforms_validation = ComposeDouble(
        [
            Clip(),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # test transformations
    transforms_test = ComposeDouble(
        [
            Clip(),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # random seed
    seed_everything(params.SEED)

    # training validation test split
    inputs_train, inputs_valid, inputs_test = inputs[:12], inputs[12:16], inputs[16:]
    targets_train, targets_valid, targets_test = (
        targets[:12],
        targets[12:16],
        targets[16:],
    )

    # dataset training
    dataset_train = ObjectDetectionDataSet(
        inputs=inputs_train,
        targets=targets_train,
        transform=transforms_training,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataset validation
    dataset_valid = ObjectDetectionDataSet(
        inputs=inputs_valid,
        targets=targets_valid,
        transform=transforms_validation,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataset test
    dataset_test = ObjectDetectionDataSet(
        inputs=inputs_test,
        targets=targets_test,
        transform=transforms_test,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataloader training
    dataloader_train = DataLoader(
        dataset=dataset_train,
        batch_size=params.BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_double,
    )

    # dataloader validation
    dataloader_valid = DataLoader(
        dataset=dataset_valid,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_double,
    )

    # dataloader test
    dataloader_test = DataLoader(
        dataset=dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_double,
    )

    # neptune logger
    neptune_logger = NeptuneLogger(
        api_key=api_key,
        project_name=f"{params.OWNER}/{params.PROJECT}",  # use your neptune name here
        experiment_name=params.PROJECT,
        params=params.__dict__,
    )

    assert neptune_logger.name  # http GET request to check if the project exists

    # model init
    model = get_faster_rcnn_resnet(
        num_classes=params.CLASSES,
        backbone_name=params.BACKBONE,
        anchor_size=params.ANCHOR_SIZE,
        aspect_ratios=params.ASPECT_RATIOS,
        fpn=params.FPN,
        min_size=params.MIN_SIZE,
        max_size=params.MAX_SIZE,
    )

    # lightning init
    task = FasterRCNNLightning(
        model=model, lr=params.LR, iou_threshold=params.IOU_THRESHOLD
    )

    # callbacks
    checkpoint_callback = ModelCheckpoint(monitor="Validation_mAP", mode="max")
    learningrate_callback = LearningRateMonitor(
        logging_interval="step", log_momentum=False
    )
    early_stopping_callback = EarlyStopping(
        monitor="Validation_mAP", patience=params.PATIENCE, mode="max"
    )

    # trainer init
    trainer = Trainer(
        gpus=params.GPU,
        precision=params.PRECISION,  # try 16 with enable_pl_optimizer=False
        callbacks=[checkpoint_callback, learningrate_callback, early_stopping_callback],
        default_root_dir=save_dir,  # where checkpoints are saved to
        logger=neptune_logger,
        log_every_n_steps=1,
        num_sanity_val_steps=0,
        max_epochs=params.MAXEPOCHS,
    )

    # start training
    trainer.fit(
        model=task, train_dataloader=dataloader_train, val_dataloaders=dataloader_valid
    )

    # start testing
    trainer.test(ckpt_path="best", dataloaders=dataloader_test)

    # log packages
    log_packages_neptune(neptune_logger=neptune_logger)

    # log mapping as table
    log_mapping_neptune(mapping=mapping, neptune_logger=neptune_logger)

    # log model
    if params.LOG_MODEL:
        checkpoint_path = pathlib.Path(checkpoint_callback.best_model_path)
        log_model_neptune(
            checkpoint_path=checkpoint_path,
            save_directory=pathlib.Path.home(),
            name="best_model.pt",
            neptune_logger=neptune_logger,
        )

    # stop logger
    neptune_logger.experiment.stop()
    print("Finished")
    neptune_logger.experiment.set_property(
        'best_model_score', model_checkpoint.best_model_score.tolist())

    # Testing
    trainer.test()


if __name__ == "__main__":
    # Setup
    logger.setLevel(logging.INFO)
    pl.seed_everything(CONSTANTS['SEED'])
    api_token = getpass("Enter Neptune.ai API token: ")
    # `hparams` is the config dict below; in the full script it is defined
    # before this call
    neptune_logger = NeptuneLogger(
        api_key=api_token,
        project_name="username/projname",  # TODO
        close_after_fit=False,
        experiment_name="experiment-name",  # TODO
        params=hparams,
        tags=["pytorch-lightning"])

    # Config
    hparams = {
        'lr': 0.0001,
        'weight_decay': 0.0001,
        'batch_size': {
            'train': 8,
            'val': 4,
            'test': 4
        },
        'image_size': 256,
        'gradient_clip_val': 0.1,
def cli_main():
    # Arguments
    default_config = os.path.join(os.path.split(os.getcwd())[0], 'config.conf')
    print(default_config)

    parser = ArgumentParser(description='Pytorch BT',
                            default_config_files=[default_config])
    parser.add_argument('-c', '--my-config', required=False,
                        is_config_file=True, help='config file path')
    parser.add_argument('--finetune', dest='finetune', action='store_true',
                        help='Perform only finetuning (Default: False)')
    parser.set_defaults(finetune=False)
    parser.add_argument(
        '--transfer', dest='transfer', action='store_true',
        help='Perform transfer learning on linear eval (Default: False)')
    parser.set_defaults(transfer=False)
    parser.add_argument('--offline_log', dest='offline_log',
                        action='store_true',
                        help='Do not log online (Default: False)')
    parser.set_defaults(offline_log=False)
    parser.add_argument('--pt_checkpoint', type=str, default=None)
    parser.add_argument('--val_every_n', type=int, default=1)
    parser.add_argument('--tag', type=str, default=None)
    parser.add_argument('--resume_ckpt', type=str, default=None)
    parser.add_argument('--seed', type=int, default=222)
    parser.add_argument('--project_name', type=str, default=None)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # model args
    parser = BT.add_model_specific_args(parser)
    parser = SSLLinearEval.add_model_specific_args(parser)
    args = parser.parse_args()

    seed_everything(args.seed)
    args.status = 'Finetune'
    args.batch_size = args.ft_batch_size

    # Get DataModule
    dm, ft_dm, args = get_dm(args)

    neptune_logger = NeptuneLogger(
        offline_mode=args.offline_log,
        api_key=None,
        project_name=args.project_name,
        experiment_name='Testing',  # Optional,
        params=vars(args),  # Optional,
        tags=["Test", args.tag],  # Optional,
        upload_source_files=['src/*.py'],
        close_after_fit=False)

    # Define model
    model = BT(**args.__dict__)

    load_log_file = os.path.join(os.getcwd(), 'log_files.txt')
    log_dirs = np.genfromtxt(load_log_file, delimiter=" ", dtype='str')
    print("\n\n Log Dir: {}\n\n".format(log_dirs))

    ft_model_dir = log_dirs[1]
    checkpoint_path = log_dirs[2]
    print("Loading checkpoint: {}".format(checkpoint_path))

    ft_model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=(ft_model_dir + '/'),
                                                       save_top_k=1,
                                                       monitor='val_loss')

    encoder = BT.load_from_checkpoint(checkpoint_path, strict=False)

    if args.accelerator == 'ddp' or args.accelerator == 'ddp2':
        replace_sampler = True  # False
        if args.accelerator == 'ddp':
            args.effective_bsz = args.ft_batch_size * args.num_nodes * args.gpus
        elif args.accelerator == 'ddp2':
            args.effective_bsz = args.ft_batch_size * args.num_nodes
    else:
        replace_sampler = True
        args.effective_bsz = args.ft_batch_size

    ft_model = SSLLinearEval(encoder.encoder_online, **args.__dict__)

    trainer_ft = pl.Trainer.from_argparse_args(
        args,
        max_epochs=args.ft_epochs,
        logger=neptune_logger,
        callbacks=[FTPrintingCallback(ft_model_dir, args)],
        deterministic=True,
        checkpoint_callback=False,
        fast_dev_run=False,
        sync_batchnorm=True,
        track_grad_norm=-1,
        replace_sampler_ddp=replace_sampler,
        progress_bar_refresh_rate=args.print_freq)

    if trainer_ft.local_rank == 0:
        if not args.offline_log:
            print("Experiment: {}".format(str(trainer_ft.logger.experiment)))
            log_dirs = np.append(
                log_dirs, str(trainer_ft.logger.experiment).split('(')[1][:-1])
            save_log_file = os.path.join(os.getcwd(), 'log_files.txt')
            np.savetxt(save_log_file, log_dirs, delimiter=" ", fmt="%s")

    # Fit
    trainer_ft.fit(ft_model, ft_dm)

    if args.save_checkpoint:
        neptune_logger.experiment.log_artifact(
            os.path.join(ft_model_dir, os.listdir(ft_model_dir + '/')[-1]),
            os.path.join('finetune/', os.listdir(ft_model_dir + '/')[-1]))

    neptune_logger.experiment.stop()
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:
    """
    # 0 INIT TRACKER
    # https://docs.neptune.ai/integrations/pytorch_lightning.html
    try:
        import neptune
        NEPTUNE_AVAILABLE = True
    except ImportError:  # pragma: no-cover
        NEPTUNE_AVAILABLE = False

    USE_NEPTUNE = False
    if getattr(hparams, 'tracker', None) is not None:
        if getattr(hparams.tracker, 'neptune', None) is not None:
            USE_NEPTUNE = True

    if USE_NEPTUNE and not NEPTUNE_AVAILABLE:
        warnings.warn(
            'You want to use the `neptune` logger, which is not installed yet;'
            ' install it with `pip install neptune-client`.', UserWarning)
        time.sleep(5)

    tracker = None
    if NEPTUNE_AVAILABLE and USE_NEPTUNE:
        neptune_params = hparams.tracker.neptune
        fn_token = getattr(neptune_params, 'fn_token', None)
        if fn_token is not None:
            p = Path(neptune_params.fn_token).expanduser()
            if p.exists():
                with open(p, 'r') as f:
                    token = f.readline().splitlines()[0]
                os.environ['NEPTUNE_API_TOKEN'] = token
        hparams_flatten = dict_flatten(hparams, sep='.')
        experiment_name = hparams.tracker.get('experiment_name', None)
        tags = list(hparams.tracker.get('tags', []))
        offline_mode = hparams.tracker.get('offline', False)
        tracker = NeptuneLogger(
            project_name=neptune_params.project_name,
            experiment_name=experiment_name,
            params=hparams_flatten,
            tags=tags,
            offline_mode=offline_mode,
            upload_source_files=["../../../*.py"],  # because hydra changes the current dir
        )

    try:
        # log
        if tracker is not None:
            watermark_s = watermark(packages=[
                'python', 'nvidia', 'cudnn', 'hostname', 'torch',
                'sparseconvnet', 'pytorch-lightning', 'hydra-core', 'numpy',
                'plyfile'
            ])
            log_text_as_artifact(tracker, watermark_s, "versions.txt")

            # arguments_of_script
            sysargs_s = str(sys.argv[1:])
            log_text_as_artifact(tracker, sysargs_s, "arguments_of_script.txt")

            for key in ['overrides.yaml', 'config.yaml']:
                p = Path.cwd() / '.hydra' / key
                if p.exists():
                    tracker.log_artifact(str(p), f'hydra_{key}')

        callbacks = []
        if tracker is not None:
            lr_logger = LearningRateLogger()
            callbacks.append(lr_logger)

        # ------------------------
        # 1 INIT LIGHTNING MODEL
        # ------------------------
        model = LightningTemplateModel(hparams)
        if tracker is not None:
            s = str(model)
            log_text_as_artifact(tracker, s, "model_summary.txt")

        # ------------------------
        # 2 INIT TRAINER
        # ------------------------
        cfg = hparams.PL
        if tracker is None:
            tracker = cfg.logger  # True by default in PL
        kwargs = dict(cfg)
        kwargs.pop('logger')
        trainer = pl.Trainer(
            max_epochs=hparams.train.max_epochs,
            callbacks=callbacks,
            logger=tracker,
            **kwargs,
        )

        # ------------------------
        # 3 START TRAINING
        # ------------------------
        print()
        print("Start training")
        trainer.fit(model)
    except (Exception, KeyboardInterrupt) as ex:
        if tracker is not None:
            print_exc()
            tracker.experiment.stop(str(ex))
        raise
                        type=int,
                        nargs='+',
                        default=[3, 3, 3, 3, 3, 3])
    parser.add_argument('--pool_size', default=3)
    parser.add_argument('--fc_neurons', default=128)
    parser.add_argument('--n_classes', default=8)
    return parser


# =========================================NEPTUNE AI===============================================================
CHECKPOINTS_DIR = '/home/rachneet/thesis_results/tl_vsg_intf_16qam_rw/'  # change this
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="tl_vsg_intf_16qam_rw",  # change this for new runs
)
# ===================================================================================================================


def inference(hparams: argparse.Namespace):
    model = TransferLearningModel.load_from_checkpoint(
        CHECKPOINTS_DIR + 'epoch=14-step=61739.ckpt')
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = pl.Trainer(logger=neptune_logger,
                         gpus=hparams.gpus,
                         checkpoint_callback=True,
                         callbacks=[model_checkpoint])
from pytorch_lightning.loggers.neptune import NeptuneLogger
import os

logger = NeptuneLogger(
    api_key=os.environ["NEPTUNE_API_TOKEN"],
    project_name="jonasfrey96/asl",
    experiment_id='ASL-400',
    close_after_fit=False,
)
print(logger.experiment.id)
logger.experiment
print('Done')

import time
time.sleep(1)
def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--base_folders', nargs='+', default=[], required=True)
    parser.add_argument('--datasets', nargs='+', default=[], required=True)
    parser.add_argument('--shuffle', action="store_true", default=False)
    parser.add_argument('--use_tpu', action="store_true", default=False)
    parser.add_argument('--memory_profile', action="store_true", default=False)
    parser.add_argument('--tags', nargs='*', default=[])
    parser = UTWRS.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data path
    # ------------
    file_paths = []
    max_seq_length = 0
    max_summary_length = 0
    if "BBC" in args.datasets:
        i = args.datasets.index("BBC")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]), max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)
    if "OVSD" in args.datasets:
        i = args.datasets.index("OVSD")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]), max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)
    if file_paths == []:
        raise UnsupportedOperation("--dataset only support BBC or OVSD.")

    # ------------
    # data args
    # ------------
    # Add <START> and <END> token
    args.enc_seq_len = max_seq_length + 2
    args.dec_seq_len = max_summary_length + 2

    # ------------
    # Split train/test
    # ------------
    print(f"\nTotal number of videos: {sum([len(i) for i in file_paths])}")
    print(f"Max length of videos: {max_seq_length}")
    print(f"Max length of summary: {max_summary_length}\n")

    train_paths = []
    test_paths = []
    for dataset in file_paths:
        np.random.shuffle(dataset)
        train_paths.extend(dataset[:-2])
        test_paths.extend(dataset[-2:])

    # ------------
    # K-fold
    # ------------
    kfold = StratifiedKFold(n_splits=3, shuffle=False)

    # Generate data index for kfold
    X = [0] * len(train_paths)
    Y = []
    for i, dataset in enumerate(file_paths):
        Y += [i] * (len(dataset) - 2)

    train_paths = np.array(train_paths)
    for k, (train, val) in enumerate(
            tqdm(kfold.split(X, Y), total=kfold.get_n_splits())):
        print(f"Training data: {train_paths[train]}")
        print(f"Validation data: {train_paths[val]}")

        # ------------
        # data loader
        # ------------
        data_loader = OVSDBBCDataModule(max_seq_length,
                                        max_summary_length,
                                        args.d_model,
                                        train_paths[train],
                                        train_paths[val],
                                        shuffle=args.shuffle,
                                        use_tpu=args.use_tpu)

        # ------------
        # model
        # ------------
        model = UTWRS(args, SRC_PAD_TOKEN, TRG_PAD_TOKEN)

        # ------------
        # neptune logger
        # ------------
        neptune_logger = NeptuneLogger(project_name="guyleaf/UTWRS",
                                       params=vars(args),
                                       experiment_name=f"{k+1}-fold_logger",
                                       tags=args.tags)
        neptune_logger.experiment.log_text("training_data",
                                           ','.join(train_paths[train]))
        neptune_logger.experiment.log_text("validation_data",
                                           ','.join(train_paths[val]))

        # ------------
        # checkpoint
        # ------------
        model_checkpoint = ModelCheckpoint(
            dirpath="checkpoints",
            filename='{epoch:02d}_{test_loss:.2f}',
            save_top_k=3,
            monitor='test_loss',
            mode='min')

        # ------------
        # profiler
        # ------------
        profiler = PyTorchProfiler(
            output_filename=f"profiles/{k}-fold_profiler",
            profile_memory=True,
            sort_by_key="cuda_memory_usage",
            row_limit=50,
            enabled=args.memory_profile)

        # ------------
        # training
        # ------------
        trainer = pl.Trainer.from_argparse_args(
            args,
            logger=neptune_logger,
            profiler=profiler,
            checkpoint_callback=model_checkpoint,
            track_grad_norm=2,
            log_every_n_steps=100)
        trainer.fit(model, data_loader)

        # Log model checkpoints to Neptune
        for ckpt_path in model_checkpoint.best_k_models.keys():
            model_name = 'checkpoints/' + ckpt_path.split('/')[-1]
            neptune_logger.experiment.log_artifact(ckpt_path, model_name)

        # Log score of the best model checkpoint.
        neptune_logger.experiment.set_property(
            'best_model_loss', model_checkpoint.best_model_score.tolist())

        if args.profiler:
            neptune_logger.experiment.log_artifact('profiles')
    TRAIN_DATA=TensorDataset(TR_II, TR_AM, TR_LABEL),
    TRAIN_CODES=train_codes,
    DEV_DATA=TensorDataset(DE_II, DE_AM, DE_LABEL),
    DEV_CODES=dev_codes,
    TEST_DATA=TensorDataset(TE_II, TE_AM, TE_LABEL),
    TEST_CODES=test_codes,
    HIDDEN_UNIT1=model_parameters["HIDDEN_UNIT1"],
    BATCH_SIZE=model_parameters["BATCH_SIZE"],
    LR=model_parameters["LEARNING_RATE"],
    EPS=model_parameters["EPS"],
    EPOCHS=model_parameters["EPOCHS"],
    FREEZE_BERT=model_parameters["FREEZE_BERT"])

neptune_logger = NeptuneLogger(api_key=NEPTUNE_API,
                               project_name="fatihbeyhan/CASE21-SUBTASK3",
                               params={
                                   **model_parameters,
                                   **data_parameters
                               })

### initializing trainer
trainer = Trainer(
    max_epochs=model_parameters["EPOCHS"],
    gpus=1,
    # auto_lr_find=True,
    # auto_scale_batch_size='binsearch',
    # gradient_clip_val=GRADIENT_CLIP,
    # limit_train_batches=1,
    # limit_val_batches=2,
    # limit_test_batches=1,
    logger=neptune_logger,
    # accelerator='ddp',
def cli_main():
    argv = sys.argv[1:]
    # argv = ['--config', 'configs/NTU_BUTD_CON.yaml',
    #         '--exp_name', 'test',
    #         '--exp_dir', '../prj_ssl_exps/test']
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", default=None,
                        help="where to load YAML configuration", metavar="FILE")
    parser.add_argument('--exp_name', type=str, default='test',
                        help='experiment name')
    parser.add_argument('--exp_dir', type=str, default='../experiments/',
                        help='experiment output directory')
    parser.add_argument('--neptune_key', type=str, default='',
                        help='neptune user api key')
    parser.add_argument('--neptune_project', type=str, default='',
                        help='neptune project directory')
    parser.add_argument('--path_db', type=str, default='../dbs',
                        help='neptune project directory')
    parser.add_argument('--model', type=str, default='MocoV2',
                        help='self supervised training method')
    parser.add_argument('--dataset', type=str, default='NTU_SSL',
                        help='dataset to use for training')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument('--resume_training', action='store_true',
                        help='resume training from checkpoint training')
    args = parse_args(parser, argv)

    if args.seed is not None:
        pl.seed_everything(args.seed)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # get model and model args
    model_type = vars(modules)[args.model]
    parser = model_type.add_model_specific_args(parser)

    # get dataset and dataset args
    dataset_type = vars(datasets)[args.dataset]
    parser = dataset_type.add_dataset_specific_args(parser)

    args = parse_args(parser, argv)

    os.makedirs(args.exp_dir, exist_ok=True)
    os.makedirs(args.path_db, exist_ok=True)

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__,  # Optional,
            tags=["pl"],  # Optional,
            # upload_stderr=False,
            # upload_stdout=False
        )
    else:
        logger = TensorBoardLogger(args.exp_dir)  # , name="my_model"

    datamodule = dataset_type(**args.__dict__)
    model = model_type(**args.__dict__)

    if args.resume_training:
        ckpts = list(filter(lambda x: 'epoch=' in x, os.listdir(args.exp_dir)))
        latest_epoch = max(
            [int(x.replace('epoch=', '').replace('.ckpt', '')) for x in ckpts])
        latest_ckpt = os.path.join(args.exp_dir,
                                   'epoch=' + str(latest_epoch) + '.ckpt')
        print('resuming from checkpoint', latest_ckpt)
        args.__dict__.update({'resume_from_checkpoint': latest_ckpt})

    # model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=args.exp_dir, save_top_k=3, mode='max',
    #                                                 monitor='knn_acc', period=args.ckpt_period)  # , filename='{epoch}-{knn_acc}'
    model_checkpoint = pl.callbacks.ModelCheckpoint(
        save_top_k=3, mode='max', monitor='knn_acc',
        period=args.ckpt_period)  # , filename='{epoch}-{knn_acc}'
    trainer = pl.Trainer.from_argparse_args(
        args,
        logger=logger,
        checkpoint_callback=model_checkpoint,
        callbacks=[KNNEval(period=args.ckpt_period)])
    # print(len(datamodule.val_dataset()))
    trainer.fit(model, datamodule)

    best_ckpt = trainer.checkpoint_callback.best_model_path
    best_model = model_type.load_from_checkpoint(checkpoint_path=best_ckpt)
    # pretrain_result = trainer.test(model=best_model)[0]
    pretrain_result = trainer.test(model=best_model, datamodule=datamodule)[0]
    print(pretrain_result)

    lincls_results = lincls(args, best_model)
    print('test results')
    for k, v in lincls_results.items():
        print(k, v)

    df = pd.DataFrame()
    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }
    output_dict.update(pretrain_result)
    output_dict.update(lincls_results)

    if args.neptune_key != '':
        for k, v in pretrain_result.items():
            logger.experiment.log_metric(k, v)
        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)

    df = df.append(output_dict, ignore_index=True)
    df.to_csv(os.path.join(args.path_db, args.exp_name + '_db.csv'))
def cli_main():
    # Arguments
    default_config = os.path.join(os.path.split(os.getcwd())[0], 'config.conf')
    print(default_config)

    parser = ArgumentParser(description='Pytorch BYOL',
                            default_config_files=[default_config])
    parser.add_argument('-c', '--my-config', required=False,
                        is_config_file=True, help='config file path')
    parser.add_argument('--finetune', dest='finetune', action='store_true',
                        help='Perform only finetuning (Default: False)')
    parser.set_defaults(finetune=False)
    parser.add_argument(
        '--transfer', dest='transfer', action='store_true',
        help='Perform transfer learning on linear eval (Default: False)')
    parser.set_defaults(transfer=False)
    parser.add_argument('--offline_log', dest='offline_log',
                        action='store_true',
                        help='Do not log online (Default: False)')
    parser.set_defaults(offline_log=False)
    parser.add_argument('--pt_checkpoint', type=str, default=None)
    parser.add_argument('--val_every_n', type=int, default=1)
    parser.add_argument('--tag', type=str, default=None)
    parser.add_argument('--resume_ckpt', type=str, default=None)
    parser.add_argument('--seed', type=int, default=222)
    parser.add_argument('--project_name', type=str, default=None)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # model args
    parser = BYOL.add_model_specific_args(parser)
    parser = SSLLinearEval.add_model_specific_args(parser)
    args = parser.parse_args()

    seed_everything(args.seed)
    args.status = 'Pretrain'

    run_name = time.strftime("%Y-%m-%d_%H-%M-%S")
    save_dir = os.path.join(os.getcwd(), 'checkpoints')
    pt_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/pretrain'))
    ft_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/finetune'))
    reps_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/reps'))
    os.makedirs(pt_model_dir, exist_ok=True)
    os.makedirs(ft_model_dir, exist_ok=True)
    os.makedirs(reps_model_dir, exist_ok=True)

    # Get DataModule
    dm, ft_dm, args = get_dm(args)

    neptune_logger = NeptuneLogger(
        offline_mode=args.offline_log,
        api_key=None,
        project_name=args.project_name,
        experiment_name='Testing',  # Optional,
        params=vars(args),  # Optional,
        tags=["Test", args.tag],  # Optional,
        upload_source_files=['src/*.py'],
        close_after_fit=False)

    pt_model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=pt_model_dir,
                                                       save_top_k=1,
                                                       monitor='loss')

    if args.accelerator == 'ddp' or args.accelerator == 'ddp2':
        replace_sampler = True  # False
        if args.accelerator == 'ddp':
            args.effective_bsz = args.batch_size * args.num_nodes * args.gpus
        elif args.accelerator == 'ddp2':
            args.effective_bsz = args.batch_size * args.num_nodes
    else:
        replace_sampler = True
        args.effective_bsz = args.batch_size

    # Define trainer
    trainer = pl.Trainer.from_argparse_args(
        args,
        max_epochs=args.max_epochs,
        logger=neptune_logger,
        callbacks=[
            PTPrintingCallback(pt_model_dir, args),
            CheckpointSave(pt_model_dir)
        ],
        deterministic=True,
        fast_dev_run=False,
        sync_batchnorm=True,
        checkpoint_callback=False,
        replace_sampler_ddp=replace_sampler,
        resume_from_checkpoint=args.resume_ckpt,
        progress_bar_refresh_rate=args.print_freq,
        check_val_every_n_epoch=args.val_every_n)

    # Define model
    model = BYOL(**args.__dict__)

    # Fit
    trainer.fit(model, dm)

    # time.sleep(15)
    if trainer.local_rank == 0:
        print("os.listdir(pt_model_dir) :{}".format(os.listdir(pt_model_dir)))
        checkpoint_path = os.path.join(pt_model_dir,
                                       os.listdir(pt_model_dir)[-1])

        if args.save_checkpoint:
            neptune_logger.experiment.log_artifact(
                os.path.join(pt_model_dir, os.listdir(pt_model_dir)[-1]),
                os.path.join('pretrain/', os.listdir(pt_model_dir)[-1]))

        log_files = [pt_model_dir, ft_model_dir, checkpoint_path]
        save_log_file = os.path.join(os.getcwd(), 'log_files.txt')
        np.savetxt(save_log_file, log_files, delimiter=" ", fmt="%s")

    neptune_logger.experiment.stop()
def main(parser):
    parser.add_argument('-m', '--model', type=str, default='PredNet')
    parser.add_argument('-d', '--dataset', type=str,
                        default='SchapiroResnetEmbeddingDataset')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--ipy', action='store_true')
    parser.add_argument('--no_graphs', action='store_true')
    parser.add_argument('--no_test', action='store_true')
    parser.add_argument('--user', type=str, default='aprashedahmed')
    parser.add_argument('-p', '--project', type=str, default='sandbox')
    parser.add_argument('-t', '--tags', nargs='+')
    parser.add_argument('--no_checkpoints', action='store_true')
    parser.add_argument('--offline_mode', action='store_true')
    parser.add_argument('--save_weights_online', action='store_true')
    parser.add_argument('--test_checkpoints', action='store_true')
    parser.add_argument('--test_epochs', type=int, default=2)
    parser.add_argument('--test_n_paths', type=int, default=2)
    parser.add_argument('--test_online', action='store_true')
    parser.add_argument('--test_project', type=str, default='')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--n_workers', type=int, default=1)
    parser.add_argument('-e', '--epochs', type=int, default=50)
    parser.add_argument('--gpus', type=float, default=1)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('-s', '--seed', type=str, default='random')
    parser.add_argument('-b', '--batch_size', type=int, default=256 + 128)
    parser.add_argument('--n_val', type=int, default=1)
    parser.add_argument('--mapping', type=str, default='random')
    parser.add_argument('--dir_checkpoints', type=str,
                        default=str(index.DIR_CHECKPOINTS))
    parser.add_argument('--checkpoint_period', type=float, default=1.0)
    parser.add_argument('--val_check_interval', type=float, default=1.0)
    parser.add_argument('--save_top_k', type=float, default=1)
    parser.add_argument('--early_stop_mode', type=str, default='min')
    parser.add_argument('--early_stop_patience', type=int, default=10)
    parser.add_argument('--early_stop_min_delta', type=float, default=0.001)
    parser.add_argument('--name', type=str, default='')
    parser.add_argument('--exp_prefix', type=str, default='')
    parser.add_argument('--exp_suffix', type=str, default='')

    # Get Model and Dataset specific args
    temp_args, _ = parser.parse_known_args()

    # Make sure this is correct
    if hasattr(datasets, temp_args.dataset):
        Dataset = getattr(datasets, temp_args.dataset)
        parser = Dataset.add_dataset_specific_args(parser)
    else:
        raise Exception(
            f'Invalid dataset "{temp_args.dataset}" passed. Check it is '
            f'importable: "from prevseg.datasets import {temp_args.dataset}"')

    # Get temp args now with dataset args added
    temp_args, _ = parser.parse_known_args()

    # Check this is correct as well
    if hasattr(models, temp_args.model):
        Model = getattr(models, temp_args.model)
        parser = Model.add_model_specific_args(parser)
    else:
        raise Exception(
            f'Invalid model "{temp_args.model}" passed. Check it is importable:'
            f' "from prevseg.models import {temp_args.model}"')

    # Get the parser and turn into an omegaconf
    hparams = parser.parse_args()

    # If we are test-running, do a few things differently (scale down dataset,
    # send to sandbox project, etc.)
    if hparams.test_run:
        hparams.epochs = hparams.test_epochs
        hparams.n_paths = hparams.test_n_paths
        hparams.name = '_'.join(filter(None, ['test_run', hparams.exp_prefix]))
        hparams.project = hparams.test_project or 'sandbox'
        hparams.verbose = True
        hparams.ipdb = True
        hparams.no_checkpoints = not hparams.test_checkpoints
        hparams.offline_mode = not hparams.test_online

    # Seed is a string to allow for None/random as an input. Make it passable
    # to pl.seed_everything
    hparams.seed = None if 'None' in hparams.seed or hparams.seed == 'random' \
        else int(hparams.seed)

    # Get the hostname for book keeping
    hparams.hostname = socket.gethostname()

    # Set the seed
    hparams.seed = pl.seed_everything(hparams.seed)

    # Turn the string entry for mapping into a dict (that is also a str)
    if hparams.mapping == 'default':
        hparams.mapping = const.DEFAULT_MAPPING
    elif hparams.mapping == 'random':
        hparams.mapping = str(
            Dataset.random_mapping(n_pentagons=hparams.n_pentagons))
    else:
        raise ValueError(f'Invalid entry for mapping: {hparams.mapping}')

    # Set the validation path
    hparams.val_path = str(const.DEFAULT_PATH)

    # Create experiment name
    hparams.name = name_from_hparams(hparams)
    hparams.exp_name = name_from_hparams(hparams, short=True)
    if hparams.verbose:
        print(f'Beginning experiment: "{hparams.name}"')

    # Neptune Logger
    logger = NeptuneLogger(
        project_name=f"{hparams.user}/{hparams.project}",
        experiment_name=hparams.exp_name,
        params=vars(hparams),
        tags=hparams.tags,
        offline_mode=hparams.offline_mode,
        upload_source_files=[
            str(Path(__file__).resolve()),
            inspect.getfile(Model),
            inspect.getfile(Dataset)
        ],
        close_after_fit=False,
    )

    if not hparams.load_model:
        # Checkpoint callback
        if hparams.no_checkpoints:
            checkpoint = False
            if hparams.verbose:
                print('\nNot saving any checkpoints.\n', flush=True)
        else:
            dir_checkpoints_experiment = (Path(hparams.dir_checkpoints) /
                                          hparams.name)
            if not dir_checkpoints_experiment.exists():
                dir_checkpoints_experiment.mkdir(parents=True)
            checkpoint = pl.callbacks.ModelCheckpoint(
                filepath=str(
                    dir_checkpoints_experiment /
                    (f'seed={hparams.seed}' + '_{epoch}_{val_loss:.3f}')),
                verbose=hparams.verbose,
                save_top_k=hparams.save_top_k,
                period=hparams.checkpoint_period,
            )

        # Early stopping callback
        early_stop_callback = pl.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=hparams.early_stop_min_delta,
            patience=hparams.early_stop_patience,
            verbose=hparams.verbose,
            mode=hparams.early_stop_mode,
        )

        # Define the trainer
        trainer = pl.Trainer(
            checkpoint_callback=checkpoint,
            max_epochs=hparams.epochs,
            logger=logger,
            val_check_interval=hparams.val_check_interval,
            gpus=hparams.gpus,
            early_stop_callback=early_stop_callback,
        )

        # Verbose messaging
        if hparams.verbose:
            now = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            print(f'\nCurrent time: {now}', flush=True)
            print(f'\nRunning with following hparams:', flush=True)
            pprint(vars(hparams))

        # Define the model
        model = Model(hparams)
        if hparams.verbose:
            print(f'\nModel being used: \n{model}', flush=True)

        # Define the datamodule
        datamodule = datasets.DataModuleConstructor(hparams, Dataset)

        # Train the model
        print('\nBeginning training:', flush=True)
        now = datetime.datetime.now()
        trainer.fit(model, datamodule=datamodule)
        if hparams.verbose:
            elapsed = datetime.datetime.now() - now
            elapsed_fstr = time.strftime('%H:%M:%S', time.gmtime(elapsed.seconds))
            print(f'\nTraining completed! Time Elapsed: {elapsed_fstr}', flush=True)

        # Record the best checkpoint if we kept track of it
        if not hparams.no_checkpoints:
            logger.log_hyperparams(
                {'best_checkpoint_path': checkpoint.best_model_path})

        # Save the weights online if desired
        if hparams.save_weights_online:
            if hparams.verbose:
                print('\nSending weights to neptune servers...', flush=True)
            logger.log_artifact(checkpoint.best_model_path)
            if hparams.verbose:
                print('Finished.', flush=True)
    else:
        raise NotImplementedError
        # # Get all the experiments with the name hparams.name*
        # experiments = list(index.DIR_CHECKPOINTS.glob(
        #     f'{hparams.name}_{hparams.exp_name}*'))
        # # import pdb; pdb.set_trace()
        # if len(experiments) > 1:
        #     # Get the newest exp by v number
        #     experiment_newest = sorted(
        #         experiments,
        #         key=lambda path: int(path.stem.split('_')[-1][1:]))[-1]
        # # Get the model with the best (lowest) val_loss
        # else:
        #     experiment_newest = experiments[0]
        # experiment_newest_best_val = sorted(
        #     experiment_newest.iterdir(),
        #     key=lambda path: float(
        #         path.stem.split('val_loss=')[-1].split('_')[0]))[0]
        # model = Model.load_from_checkpoint(str(experiment_newest_best_val))
        # model.logger = logger
        # ## LOOK AT THIS LATER
        # model.prepare_data(val_path=const.DEFAULT_PATH)
        # # Define the trainer
        # trainer = pl.Trainer(
        #     logger=model.logger,
        #     gpus=hparams.gpus,
        #     max_epochs=1,
        # )

    if not hparams.no_test:
        # Ensure we are in cuda for testing if specified
        if 'cuda' in hparams.device and torch.cuda.is_available():
            model.cuda()

        # Create the test data
        test_data = np.array(
            [datamodule.ds.array_data[n] for n in const.DEFAULT_PATH]).reshape(
                (1, len(const.DEFAULT_PATH), 2048))
        torch_data = torch.Tensor(test_data)

        # Get the model outputs
        outs = model.forward(torch_data, output_mode='eval')
        outs.update({'errors': model.forward(torch_data, output_mode='error')})

        # Visualize the test data
        figs = model.visualize(outs, borders=const.DEFAULT_BORDERS)
        if not hparams.no_graphs:
            for name, fig in figs.items():
                # Doing logger.log_image(...) doesn't work for some reason
                model.logger.log_image(name, fig)

    # Close the neptune logger
    logger.experiment.stop()
weight_decay = parameters["weight_decay"]
pool_ratio = parameters["pool_ratio"]
nhid = parameters["nhid"]
epochs = parameters["epochs"]

LearningRateMonitor_Params = {'logging_interval': 'epoch'}

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: {}'.format(device))

model = LightningPAN(9, 1, nhid=nhid, ratio=pool_ratio, filter_size=filter_size)
lr_logger = LearningRateMonitor(**LearningRateMonitor_Params)

neptune_logger = NeptuneLogger(
    api_key=ANONYMOUS,
    project_name='hvergnes/PAN',
    close_after_fit=False,
    params=parameters,  # your hyperparameters, immutable
    tags=['PAN', 'best_model'],  # tags
    upload_source_files=["parameters.json", "lightning_model.py"]
)

trainer = Trainer(
    max_epochs=epochs,
    logger=neptune_logger,
    callbacks=[lr_logger],
    # fast_dev_run=True,
)
trainer.fit(model)
trainer.test(model)

test_loader = model.test_dataloader()
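# Illustrative continuation (the logged property is an assumption): because the
# logger was created with close_after_fit=False, the experiment is still open
# here, so extra information can be logged before stopping it explicitly.
neptune_logger.experiment.log_metric('n_test_batches', len(test_loader))
neptune_logger.experiment.stop()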
if __name__ == '__main__':
    try:
        import multiprocessing as mp
        __spec__ = None
        mp.set_start_method('spawn', force=True)
    except:
        pass
    matplotlib.use('Agg')

    all_tars = []
    neptune_logger = NeptuneLogger(
        api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJiMDM3MjFkYy1jNTE3LTQ4NTAtOTFlNC00ZGY1NGM3Y2M4YmEifQ====",
        project_name="erelon39/Line-colorize")

    if torch.cuda.is_available():
        decods = my_decoders(128)
        model = siggraph17_L(128, pretrained_path="model_e0_batch_19000_gn.pt")
        for root, dirs, files in os.walk("/home/erelon39/sftp/erelon/df66f8bf-85ef-4dec-aa8f-464dd02ad15c"):
            for file in files:
                if (file.endswith(".tar") and "out" not in root and "out" not in file
                        and "trash" not in root.lower() and "trash" not in file.lower()):
                    all_tars.append(os.path.join(root, file))

        dataset = wds.WebDataset(all_tars, length=float("inf")) \
            .decode(decods.my_decoder_GT).decode(decods.my_decoder_BW) \
            .to_tuple("gt.jpg", "train.jpg", "__key__", handler=dummy_func) \
            .batched(16)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=4)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=PARAMS['learning_rate'])


# DataLoader
train_loader = DataLoader(MNIST(os.getcwd(), download=True,
                                transform=transforms.ToTensor()),
                          batch_size=PARAMS['batch_size'])

# Step 4: Create NeptuneLogger
from pytorch_lightning.loggers.neptune import NeptuneLogger

neptune_logger = NeptuneLogger(
    api_key="ANONYMOUS",
    project_name="shared/pytorch-lightning-integration",
    params=PARAMS)

# Step 5: Pass NeptuneLogger to the Trainer
trainer = pl.Trainer(max_epochs=PARAMS['max_epochs'],
                     logger=neptune_logger)

# Step 6: Run experiment
model = LitModel()
trainer.fit(model, train_loader)

# Step 7: Stop Neptune logger at the end
neptune_logger.experiment.stop()
                                 sampler=test_sampler)

    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = Trainer(gpus=hparams.gpus, checkpoint_callback=True)
    trainer.test(model, test_dataloaders=test_dataset)

    # Save checkpoints folder
    neptune_logger.experiment.log_artifact(CHECKPOINTS_DIR)
    # You can stop the experiment
    neptune_logger.experiment.stop()


# -------------------------------------------------------------------------------------------------------------------
CHECKPOINTS_DIR = '/home/rachneet/thesis_results/mixed_impairments_cnn/'
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="mixed_impairments_cnn",  # change this for new runs
)


# ---------------------------------------MAIN FUNCTION TRAINER-------------------------------------------------------
def main(hparams):
    model = LightningCNN(hparams)
    # exp = Experiment(save_dir=os.getcwd())
    if not os.path.exists(CHECKPOINTS_DIR):
        os.makedirs(CHECKPOINTS_DIR)
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    early_stop_callback = pl.callbacks.EarlyStopping(monitor='val_loss',
                                                     min_delta=0.00,