Beispiel #1
0
def test_pytorch_profiler_logger_collection(tmpdir):
    """Regression test for issue #8157: the PyTorch profiler must still be
    able to write its trace locally when ``Trainer.logger`` is a
    ``LoggerCollection`` rather than a single logger.
    """
    def look_for_trace(trace_dir):
        """Return True if *trace_dir* contains a PyTorch trace file."""
        filenames = os.listdir(trace_dir)
        return any("trace.json" in name for name in filenames)

    # The temp dir must start out trace-free or the final assertion
    # would be meaningless.
    assert not look_for_trace(tmpdir)

    model = BoringModel()

    # Passing a list of loggers makes the Trainer wrap them in a
    # LoggerCollection — the configuration under test.
    trainer = Trainer(
        default_root_dir=tmpdir,
        profiler="pytorch",
        logger=[TensorBoardLogger(save_dir=tmpdir)],
        limit_train_batches=5,
        max_epochs=1,
    )

    assert isinstance(trainer.logger, LoggerCollection)
    trainer.fit(model)
    assert look_for_trace(tmpdir)
    def initialize_logger(self):
        """Create the TensorBoard logger for an SSL pre-training run.

        The version string packs the timestamp together with the dataset,
        model, loss, optimizer and scheduler settings so each run gets a
        unique, self-describing directory in TensorBoard.
        """
        save_dir = 'tensorboard_logs'
        self.logger_subdir = 'ssl_pretraining'

        cfg = self.configs
        # NOTE: 'modelNL' intentionally keeps the original underscore-less
        # spelling so run names stay comparable with older logs.
        run_name_parts = [
            f"{self.now}",
            f"epochs_{cfg['train_num_epochs']}",
            f"bs_{cfg['train_batch_size']}",
            f"dsWS_{cfg['dataset_window_size']}",
            f"dsSS_{cfg['dataset_shift_size']}",
            f"dsNE_{cfg['dataset_num_exclude']}",
            f"dsUS_{cfg['dataset_use_scaled']}",
            f"dsUTS_{cfg['dataset_use_tree_structure']}",
            f"model_{self.model.__class__.__name__}",
            f"modelHS_{cfg['bnlstm_hidden_size']}",
            f"modelNL{cfg['bnlstm_n_layers']}",
            f"modelDOUT_{cfg['bnlstm_dropout']}",
            f"loss_{self.criterion.__class__.__name__}",
            f"opt_{self.optimizer.__class__.__name__}",
            f"sched_{self.lr_scheduler.__class__.__name__}",
        ]
        self.logger_run_name = "-".join(run_name_parts)

        return TensorBoardLogger(save_dir=save_dir,
                                 name=self.logger_subdir,
                                 version=self.logger_run_name,
                                 default_hp_metric=False,
                                 log_graph=False)
Beispiel #3
0
    def initialize_logger(self):
        """Create the TensorBoard logger for a supervised training run.

        The version string encodes the timestamp plus the dataset, model,
        loss, optimizer and scheduler settings for a self-describing run
        directory in TensorBoard.
        """
        # Abbreviate the two most common class names to keep run names short.
        if self.model.__class__.__name__ == "LogisticRegression":
            model_name = "LOGREG"
        else:
            model_name = f"{self.model.__class__.__name__}"
        if self.criterion.__class__.__name__ == "BCEWithLogitsLoss":
            loss_name = "BCEWLL"
        else:
            loss_name = f"{self.criterion.__class__.__name__}"

        save_dir = 'tensorboard_logs'
        self.logger_subdir = 'supervised'

        cfg = self.configs
        # NOTE: 'modelNL' intentionally keeps the original underscore-less
        # spelling so run names stay comparable with older logs.
        run_name_parts = [
            f"{self.now}",
            f"epochs_{cfg['train_num_epochs']}",
            f"bs_{cfg['train_batch_size']}",
            f"dsWS_{cfg['dataset_window_size']}",
            f"dsSS_{cfg['dataset_shift_size']}",
            f"dsNE_{cfg['dataset_num_exclude']}",
            f"dsUS_{cfg['dataset_use_scaled']}",
            f"dsUTS_{cfg['dataset_use_tree_structure']}",
            f"dsUSSL_{cfg['dataset_use_ssl']}",
            f"model_{model_name}",
            f"modelHS_{cfg['bnlstm_hidden_size']}",
            f"modelNL{cfg['bnlstm_n_layers']}",
            f"modelDOUT_{cfg['bnlstm_dropout']}",
            f"loss_{loss_name}",
            f"lossUW_{cfg['loss_use_weight']}",
            f"opt_{self.optimizer.__class__.__name__}",
            f"sched_{self.lr_scheduler.__class__.__name__}",
        ]
        self.logger_run_name = "-".join(run_name_parts)

        return TensorBoardLogger(save_dir=save_dir,
                                 name=self.logger_subdir,
                                 version=self.logger_run_name,
                                 default_hp_metric=False,
                                 log_graph=False)
Beispiel #4
0
 def get_logger(cls,
                save_dir: str,
                version=0,
                log_graph=True,
                **__) -> LightningLoggerBase:
     """Build a TensorBoardLogger rooted at *save_dir*.

     Extra keyword arguments are accepted and ignored so callers with a
     uniform logger-factory signature can pass their kwargs through.
     """
     logger = TensorBoardLogger(save_dir=save_dir,
                                version=version,
                                log_graph=log_graph)
     return logger
Beispiel #5
0
def main(config, log):
    """Train the MNIST model, logging to both AzureML and TensorBoard."""
    azure_logger = AzureMlLogger()
    tb_logger = TensorBoardLogger('lightning_logs')

    trainer = pl.Trainer(logger=[azure_logger, tb_logger], **config.trainer)
    net = model.Mnist(**config.model)
    trainer.fit(net)
Beispiel #6
0
def main(args):
    """Train a GAN under an experiment name derived from its settings.

    The experiment name encodes dataset, adversarial-loss variant, batch
    size and learning rate so TensorBoard runs are self-describing.
    Removed: several large blocks of dead commented-out checkpoint/trainer
    code that obscured the live configuration.
    """
    ensure_reproducible()

    # FIXME: better way to decide image size by dataset name directly
    print(args)
    model = GAN(hparams=args)

    # Encode the adversarial-loss variant: multiple discriminator updates
    # ("nd<k>") takes precedence over weight clipping ("wc<v>").
    adv_type = args.adversarial_loss_type
    if args.num_discriminator_updates > 1:
        adv_type = f"{adv_type}_nd{args.num_discriminator_updates}"
    elif args.discriminator_weight_clip_value:
        adv_type = f"{adv_type}_wc{args.discriminator_weight_clip_value}"

    experiment_name = "_".join([
        args.dataset,
        adv_type,
        f"bs{args.batch_size}",
        f"lr{args.lr}",
    ])

    logger = TensorBoardLogger("lightning_logs", name=experiment_name)

    trainer = Trainer(
        logger=logger,
        gpus=args.gpus,
        track_grad_norm=2,
        log_gpu_memory=False,
        max_epochs=args.max_epochs,
    )

    trainer.fit(model)
Beispiel #7
0
def objective(trial):
    """Optuna objective: train a NODE classifier for one trial, return AUC.

    Hyper-parameters (batch size, tree count, depth, layer count) are
    drawn from the trial; the layer count is capped by a rough memory
    estimate so large configurations do not OOM.
    """
    # PyTorch Lightning will try to restore model parameters from previous
    # trials if checkpoint filenames match. Therefore, the filenames for
    # each trial must be made unique.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join(MODEL_DIR, "trial_{}".format(trial.number)),
        monitor="auc"
    )
    data_file = "training_data.pkl"

    logger = TensorBoardLogger("lightning_logs", name=f"node_classifier_{trial.number}")

    torch.cuda.empty_cache()

    gpu = find_free_gpu(trial)

    trainer = pl.Trainer(
        logger=logger,
        val_percent_check=PERCENT_TEST_EXAMPLES,
        checkpoint_callback=checkpoint_callback,
        max_epochs=EPOCHS,
        # FIX: 'gp' is not a valid Lightning backend name; 'dp'
        # (DataParallel) matches the single-GPU `gpus=[gpu]` setting below.
        distributed_backend='dp',
        gpus=[gpu] if torch.cuda.is_available() else None,
        early_stop_callback=PyTorchLightningPruningCallback(trial,
                                                            monitor="auc"),
    )

    batch_size = 64 * trial.suggest_int("batch_size", 2, 4)
    num_trees = 50 * trial.suggest_int("num_trees", 1, 20)
    depth = trial.suggest_int("depth", 3, 8)

    # Rough activation-memory estimate in GB
    # (4 bytes * batch * trees * depth * 2**depth); cap layers so the
    # total stays under ~2 GB, and never above 10 layers.
    size = 4 * batch_size * num_trees * depth * 2 ** depth / 10 ** 9
    max_layers = min(10, int(2 / size))

    num_layers = trial.suggest_int("num_layers", 1, max_layers)

    model = NodePlClassifier(data_file,
                             in_features=57,
                             train_fraction=1,
                             num_trees=num_trees,
                             batch_size=batch_size,
                             depth=depth,
                             num_layers=num_layers,
                             lr=0.001,
                             tree_dim=1,
                             gpu=gpu)
    trainer.fit(model)

    return model.metrics['auc']
Beispiel #8
0
def main(args: Namespace):
    """Train an MNIST model; optionally log/checkpoint to ``args.output_dir``
    and resume from its last checkpoint when ``args.resume`` is set.
    """
    print(args)

    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    # Model under training.
    mnist_model = MNISTModel()

    # Train/val DataLoaders over the MNIST dataset.
    def make_loader(train):
        ds = MNIST(args.data_dir, train=train, download=True,
                   transform=transforms.ToTensor())
        return DataLoader(ds, batch_size=32, num_workers=4)

    train_loader = make_loader(True)
    val_loader = make_loader(False)

    # Logging and checkpointing stay disabled unless an output dir is given.
    logger = False
    checkpoint_callback = False
    callbacks = []
    if args.output_dir:
        logger = TensorBoardLogger(
            args.output_dir,
            version=1,
            default_hp_metric=False)
        checkpoint_callback = True
        callbacks.append(ModelCheckpoint(
            dirpath=args.output_dir,
            save_last=True,
            verbose=True))

    resume_from_checkpoint = None
    if args.resume:
        resume_from_checkpoint = os.path.join(args.output_dir, 'last.ckpt')

    trainer = pl.Trainer(
        logger=logger,
        default_root_dir=args.output_dir,
        gpus=torch.cuda.device_count(),
        max_epochs=args.num_epochs,
        progress_bar_refresh_rate=20,
        resume_from_checkpoint=resume_from_checkpoint,
        checkpoint_callback=checkpoint_callback,
        callbacks=callbacks)

    # Run training with explicit train/val loaders.
    trainer.fit(
        mnist_model, train_loader, val_loader)
    def initialize_logger(self):
        """Create the TensorBoard logger for a siamese training run.

        The version string encodes the timestamp plus dataset, siamese /
        BN-LSTM model, and loss settings (including the optional auxiliary
        loss) for a self-describing run directory.
        """
        # Abbreviate common class names to keep the run name short.
        if self.model.__class__.__name__ == "LogisticRegression":
            model_name = "LOGREG"
        else:
            model_name = f"{self.model.__class__.__name__}"
        if self.criterion.__class__.__name__ == "BCEWithLogitsLoss":
            loss_name = "BCEWLL"
        else:
            loss_name = f"{self.criterion.__class__.__name__}"

        # Keep the `== True` comparison from the original: config values
        # may not be strict booleans.
        if self.configs['aux_criterion_use'] == True:
            if self.aux_criterion.__class__.__name__ == "CosineEmbeddingLoss":
                aux_loss_name = "Cos"
            else:
                aux_loss_name = "Contr"
            aux_loss_log = "-".join([
                f"auxLoss_{aux_loss_name}",
                f"auxLossM_{self.configs['aux_criterion_margin']}",
                f"auxLossW_{self.configs['aux_criterion_weight']}",
            ])
        else:
            aux_loss_log = "auxLoss_None"

        save_dir = 'tensorboard_logs'
        self.logger_subdir = 'siamese'

        cfg = self.configs
        run_name_parts = [
            f"{self.now}",
            f"epochs_{cfg['train_num_epochs']}",
            f"bs_{cfg['train_batch_size']}",
            f"dsWS_{cfg['dataset_window_size']}",
            f"dsSS_{cfg['dataset_shift_size']}",
            f"dsNE_{cfg['dataset_num_exclude']}",
            f"dsUS_{cfg['dataset_use_scaled']}",
            f"dsUTS_{cfg['dataset_use_tree_structure']}",
            f"model_{model_name}",
            f"modelLS_{cfg['siamese_latent_size']}",
            f"modelOFS_{cfg['siamese_out_fc_size']}",
            f"modelLDOUT_{cfg['siamese_latent_dropout']}",
            f"bnlstmHS_{cfg['bnlstm_hidden_size']}",
            f"bnlstmNL_{cfg['bnlstm_n_layers']}",
            f"bnlstmDOUT_{cfg['bnlstm_dropout']}",
            f"bnlstmPRE_{cfg['pretrained_use']}",
            f"loss_{loss_name}",
            f"{aux_loss_log}",
        ]
        self.logger_run_name = "-".join(run_name_parts)
        # Optimizer/scheduler fields were dropped from the run name in the
        # original (left there as a no-op triple-quoted string):
        #   optLR, optWD, sched, schedMS, schedGAM

        return TensorBoardLogger(save_dir=save_dir,
                                 name=self.logger_subdir,
                                 version=self.logger_run_name,
                                 default_hp_metric=False,
                                 log_graph=False)
Beispiel #10
0
def process(args):
    """Train one CV fold of the CTRP experiment and report wall-clock time.

    Picks the network class from ``args.exp``, builds a TensorBoard logger
    whose version string encodes the experiment (and pretrained model, if
    any), and runs ``trainer.fit`` with early stopping and checkpointing.
    """
    seed_everything(2299)
    dict_args = vars(args)

    # Version string identifies the experiment and, when fine-tuning,
    # the pretrained model it started from.
    if args.pretrained_model is not None:
        pm_name = Path(args.pretrained_model).parents[1].name
        version = f"{args.name}_{args.exp}_{pm_name}_fold-{args.fold}"
    else:
        version = f"{args.name}_{args.exp}_fold-{args.fold}"

    # Data
    dm = CTRPDataModule.from_argparse_args(args)

    # Model selection by experiment type.
    if args.exp == 'vanilla':
        model = StandardNetwork(**dict_args)
    elif args.exp == 'transformer':
        model = TransformerNetwork(**dict_args)
    else:
        model = ConditionalNetwork(**dict_args)

    # Callbacks
    logger = TensorBoardLogger(save_dir=args.default_root_dir,
                               version=version,
                               name='lightning_logs')
    early_stop = EarlyStopping(monitor='val_loss',
                               min_delta=0.001,
                               patience=20,
                               verbose=False,
                               mode='min')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min')
    # FIX: the original built an AdvancedProfiler(filename='profile') and
    # immediately discarded it by rebinding; the dead construction is gone.
    profiler = 'simple'

    # Trainer
    start = datetime.now()
    trainer = Trainer.from_argparse_args(
        args,
        default_root_dir=logger.log_dir,
        logger=logger,
        callbacks=[early_stop, checkpoint_callback],
        profiler=profiler)
    trainer.fit(model, dm)
    print("Completed fold {} in {}".format(args.fold,
                                           str(datetime.now() - start)))
Beispiel #11
0
def cli_main():
    """Run the full self-supervised experiment pipeline.

    Parses a two-stage argument set (base options first, then trainer /
    model / dataset specific options), pre-trains the chosen model, tests
    its best checkpoint, runs linear-classification evaluation, and
    appends all results to a per-experiment CSV under ``args.path_db``.
    """
    argv = sys.argv[1:]
    #argv = ['--config',     'configs/NTU_BUTD_CON.yaml',
     #        '--exp_name',   'test',
      #       '--exp_dir',    '../prj_ssl_exps/test']

    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--config", default=None, help="where to load YAML configuration", metavar="FILE")

    parser.add_argument('--exp_name', type=str, default='test', help='experiment name')
    parser.add_argument('--exp_dir', type=str, default='../experiments/', help='experiment output directory')
    parser.add_argument('--neptune_key', type=str, default='', help='neptune user api key')
    parser.add_argument('--neptune_project', type=str, default='', help='neptune project directory')
    parser.add_argument('--path_db', type=str, default='../dbs', help='neptune project directory')

    parser.add_argument('--model', type=str, default='MocoV2', help='self supervised training method')
    parser.add_argument('--dataset', type=str, default='NTU_SSL', help='dataset to use for training')

    parser.add_argument('--seed', type=int, default=None, help='random seed')

    parser.add_argument('--resume_training', action='store_true', help='resume training from checkpoint training')

    # First parse: only the base options, needed to select the model and
    # dataset classes below.
    args = parse_args(parser, argv)

    if args.seed is not None:
        pl.seed_everything(args.seed)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # get model and model args
    model_type = vars(modules)[args.model]
    parser = model_type.add_model_specific_args(parser)

    # get dataset and dataset args
    dataset_type = vars(datasets)[args.dataset]
    parser = dataset_type.add_dataset_specific_args(parser)

    # Second parse: now includes trainer/model/dataset specific options.
    args = parse_args(parser, argv)

    os.makedirs(args.exp_dir, exist_ok=True)
    os.makedirs(args.path_db, exist_ok=True)

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    # Prefer Neptune when an API key was supplied; otherwise fall back to
    # a local TensorBoard logger in the experiment directory.
    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__, # Optional,
            tags=["pl"],  # Optional,
            # upload_stderr=False,
            # upload_stdout=False
        )
    else:
        logger = TensorBoardLogger(args.exp_dir) #, name="my_model"

    datamodule = dataset_type(**args.__dict__)

    model = model_type(**args.__dict__)

    if args.resume_training:
        # Pick the checkpoint with the highest epoch number; assumes
        # filenames of the form 'epoch=<n>.ckpt' in exp_dir — TODO confirm
        # against the ModelCheckpoint filename template.
        ckpts = list(filter(lambda x:'epoch=' in x, os.listdir(args.exp_dir)))
        latest_epoch = max([int( x.replace('epoch=','').replace('.ckpt','')) for x in ckpts])
        latest_ckpt = os.path.join(args.exp_dir, 'epoch=' + str(latest_epoch) + '.ckpt')

        print('resuming from checkpoint', latest_ckpt)

        args.__dict__.update({'resume_from_checkpoint': latest_ckpt})


    #model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=args.exp_dir, save_top_k=3, mode='max', monitor='knn_acc', period=args.ckpt_period) # , filename='{epoch}-{knn_acc}'
    model_checkpoint = pl.callbacks.ModelCheckpoint(save_top_k=3, mode='max', monitor='knn_acc',
                                                    period=args.ckpt_period)  # , filename='{epoch}-{knn_acc}'

    trainer = pl.Trainer.from_argparse_args(args, logger=logger, checkpoint_callback=model_checkpoint, callbacks=[KNNEval(period=args.ckpt_period)])


    # print(len(datamodule.val_dataset()))


    trainer.fit(model, datamodule)

    best_ckpt = trainer.checkpoint_callback.best_model_path

    best_model = model_type.load_from_checkpoint(checkpoint_path=best_ckpt)

    # pretrain_result = trainer.test(model=best_model)[0]
    pretrain_result = trainer.test(model=best_model, datamodule=datamodule)[0]
    print(pretrain_result)
    lincls_results = lincls(args, best_model)


    print('test results')
    for k,v in lincls_results.items():
        print(k, v)

    df = pd.DataFrame()
    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }

    output_dict.update(pretrain_result)
    output_dict.update(lincls_results)

    # Mirror the key metrics to Neptune when it is the active logger.
    if args.neptune_key != '':
        for k, v in pretrain_result.items():
            logger.experiment.log_metric(k, v)

        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)


    df = df.append(output_dict, ignore_index=True)
    df.to_csv(os.path.join(args.path_db, args.exp_name + '_db.csv'))
Beispiel #12
0
def main(hparams):
    """
    Main training routine specific for this project.

    Builds the LitModel, a timestamp-named TensorBoard logger, and a
    best-by-train-loss checkpoint callback, runs the LR finder and tuner,
    then trains.

    :param hparams: parsed hyper-parameter namespace (converted to a dict
        for the model).
    """
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
#    print(hparams)
    model = LitModel(vars(hparams))
    # model.__build_model__()

    # ------------------------
    # 2 INIT TRAINER
    # ------------------------
    # Run name encodes the start timestamp, e.g. "runs/f_t-01-01-2021_12:00:00".
    # NOTE(review): ':' in the name is not a valid path character on Windows.
    name = "runs/f_t-"+"{date:%d-%m-%Y_%H:%M:%S}".format(
                                                date=datetime.datetime.now())

    logger = TensorBoardLogger("tb_logs", name=name)
    # logger.log_graph(model)
    # logger.log_hyperparams(hparams)
    # Keep only the single best checkpoint by training loss, plus 'last'.
    checkpoint_callback = ModelCheckpoint(
        # filename='{epoch}-{val_mse:.2f}.ckpt',
        filename='model',
        # dirpath=os.path.join(os.getcwd(), 'tb_logs/', name),  # "/"
        # filepath=os.path.join(os.getcwd(), 'tb_logs/', name + '_checkpoints'),
        # dirpath=os.path.join(os.getcwd(), 'tb_logs/', name + '_checkpoints'),
        save_last=True,
        save_top_k=1,
        verbose=True,
        monitor='loss',
        mode='min',
        )
    print([hparams.first_gpu+el for el in range(hparams.gpus)])

    trainer = pl.Trainer(
        max_epochs=1000000,
        callbacks=[checkpoint_callback],
        precision=64,
        logger=logger,
        # distributed_backend=hparams.distributed_backend,
        # gpus=1,  # [hparams.first_gpu+el for el in range(hparams.gpus)]
        # num_nodes=4
        # use_amp=hparams.use_16bit
        # check_val_every_n_epoch=2,
        # auto_scale_batch_size='binsearch',
        # accumulate_grad_batches=2,
        # fast_dev_run=True,
        # accumulate_grad_batches=hparams.acc_batches,
        # auto_lr_find=hparams.lr,
        # weights_summary="full"  ??????
        )

    # LR finder results are only printed here, not applied to the model.
    lr_finder = trainer.tuner.lr_find(model)
    lr_finder.results
    print(lr_finder.results)
    # fig = lr_finder.plot(suggest=True)  # Plot
    # fig.show()
    trainer.tune(model)



    # ------------------------
    # 3 START TRAINING
    # ------------------------
    trainer.fit(model)
Beispiel #13
0
import pytorch_lightning as pl
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from model import build_model
from data import PadSequence, IndicDataset
from config import replace, preEnc, preEncDec

def preproc_data():
    """Split the raw Hindi–English corpus into the dataset files.

    NOTE(review): relies on a `split_data` helper that is not imported in
    this file — confirm it is available at call time.
    """
    split_data('/content/drive/My Drive/Offnote labs/data/hin-eng/hin.txt', '/content/drive/My Drive/Offnote labs/data/hin-eng')

def gen_model_loaders(config):
    """Build the model, its tokenizers, and the train/eval DataLoaders.

    Both loaders keep dataset order (``shuffle=False``) and pad batches
    with the tokenizers' pad-token ids.
    """
    model, tokenizers = build_model(config)

    collate = PadSequence(tokenizers.src.pad_token_id, tokenizers.tgt.pad_token_id)

    def make_loader(is_train, batch_size):
        """One DataLoader over the train (True) or eval (False) split."""
        ds = IndicDataset(tokenizers.src, tokenizers.tgt, config.data, is_train)
        return DataLoader(ds,
                          batch_size=batch_size,
                          shuffle=False,
                          collate_fn=collate)

    train_loader = make_loader(True, config.batch_size)
    eval_loader = make_loader(False, config.eval_size)
    return model, tokenizers, train_loader, eval_loader

# Script driver: preprocess the corpus, build the model and loaders from
# the pre-encoder/decoder config, then train with TensorBoard logging.
rconf = preEncDec
preproc_data()
model, tokenizers, train_loader, eval_loader = gen_model_loaders(rconf)
logger = TensorBoardLogger("/content/drive/My Drive/Offnote labs/tb_logs")
# NOTE(review): `max_nb_epochs` is the pre-0.7 Lightning spelling (newer
# versions use `max_epochs`) — confirm the pinned pytorch_lightning version.
trainer = pl.Trainer( max_nb_epochs=10,gpus=[0], logger=logger)
trainer.fit(model, train_dataloader=train_loader)
Beispiel #14
0
def cv(name, exp, target, batch_size, learning_rate, epochs, path, logs,
       nfolds, gpus, subset):
    """Cross-validated training: train one model per fold in *nfolds*,
    logging each fold to TensorBoard under its own version name.

    `exp` selects the network class ('vanilla' -> StandardNetwork,
    anything else -> ConditionalNetwork).
    """
    seed_everything(2299)
    # Paths
    path = Path(path)
    logs = Path(logs)
    logs.mkdir(parents=True, exist_ok=True)

    for fold in nfolds:
        start = datetime.now()
        train_ds, val_ds, input_cols, cond_cols = read(path, exp, fold, subset)
        # Only materialize the columns the model actually needs.
        cols = list(np.concatenate((input_cols, cond_cols, [target])))
        train = train_ds.to_table(columns=cols).to_pandas()
        val = val_ds.to_table(columns=cols).to_pandas()
        # DataModule
        dm = CTRPDataModule(train, val, input_cols, cond_cols, target,
                            batch_size)
        # Remove data from CPU
        del train, val
        print("Completed dataloading in {}".format(str(datetime.now() -
                                                       start)))
        # Model
        if exp == 'vanilla':
            model = StandardNetwork(exp,
                                    len(input_cols),
                                    learning_rate=learning_rate,
                                    batch_size=batch_size)
        else:
            model = ConditionalNetwork(exp,
                                       len(input_cols),
                                       len(cond_cols),
                                       learning_rate=learning_rate,
                                       batch_size=batch_size)
        # Callbacks
        logger = TensorBoardLogger(save_dir=logs,
                                   version=f"{name}_{exp}_fold_{fold}",
                                   name='lightning_logs')
        early_stop = EarlyStopping(monitor='val_r2',
                                   min_delta=0.001,
                                   patience=5,
                                   verbose=False,
                                   mode='max')
        # Trainer
        start = datetime.now()
        trainer = Trainer(
            default_root_dir=logger.
            log_dir,  #in order to avoid lr_find_temp.ckpt conflicts
            auto_lr_find=False,
            auto_scale_batch_size=False,
            max_epochs=epochs,
            gpus=[gpus],
            logger=logger,
            distributed_backend=None,
            callbacks=[
                early_stop,
            ],
            flush_logs_every_n_steps=200,
            profiler=True)
        #trainer.tune(model=model, datamodule=dm) # for auto_lr_find
        trainer.fit(model, dm)
        print("Completed fold {} in {}".format(fold,
                                               str(datetime.now() - start)))

    return print("/done")
Beispiel #15
0
def main(project_config, hparams):
    """Train (and then test) the MonocularSemiSupDepth model.

    Sets up the output directory tree, picks a TensorBoard or W&B logger
    per ``hparams.logger``, wires checkpointing and LR logging, then runs
    ``trainer.fit`` followed by ``trainer.test``.
    """
    torch.manual_seed(0)
    np.random.seed(0)

    # init module
    model = MonocularSemiSupDepth(hparams)

    # tags associated to the run
    def shape_format(shape):
        # shape = [Height, Width]
        return f"{shape[1]}x{shape[0]}"

    #assert hparams.metrics.use_gt_scale != hparams.datasets.train.load_pose, f"Either velocity of gt scaled"

    base_output_dir = Path(
        project_config.output_dir) / project_config.project_name

    logs_dir = base_output_dir / 'logs'
    logs_dir.mkdir(parents=True, exist_ok=True)

    experiment_output_dir = base_output_dir / 'outputs' / project_config.experiment_name

    assert hparams.logger in ['wandb', 'tensorboard']

    if hparams.logger == 'tensorboard':
        experiment_logger = TensorBoardLogger(
            save_dir=logs_dir, name=project_config.experiment_name)

        run_output_dir = experiment_output_dir / f'version_{experiment_logger.version}'

    elif hparams.logger == 'wandb':

        # Human-readable tags describing the nets, optimizer/scheduler,
        # input format and the train/val/test image shapes.
        list_of_tags = [
            f"{hparams.model.depth_net.name} DepthNet",
            f"{hparams.model.pose_net.name} PoseNet",
            hparams.optimizer.name,
            hparams.scheduler.name,
            {
                1: 'gray',
                3: 'rgb'
            }[hparams.input_channels],
            f"train-{shape_format(hparams.datasets.train.data_transform_options.image_shape)}",
            f"val-{shape_format(hparams.datasets.val.data_transform_options.image_shape)}",
            f"test-{shape_format(hparams.datasets.test.data_transform_options.image_shape)}",
        ]
        if project_config.mixed_precision:
            list_of_tags.append('mixed_precision')

        losses = list(hparams.losses.keys())
        if 'supervised_loss_weight' in losses:
            losses.remove('supervised_loss_weight')
        list_of_tags += losses

        experiment_logger = WandbLogger(
            project=project_config.project_name,
            save_dir=
            logs_dir,  # the path to a directory where artifacts will be written
            log_model=True,
            tags=list_of_tags)
        #wandb_logger.watch(model, log='all', log_freq=5000) # watch model's gradients and params

        run_output_dir = experiment_output_dir / f'version_{experiment_logger.experiment.id}'

    else:
        # Unreachable given the assert above; kept as a safety net.
        run_output_dir = experiment_output_dir / 'no_version_system'

    run_output_dir.mkdir(parents=True, exist_ok=True)
    run_output_dir = str(run_output_dir)

    checkpoint_callback = ModelCheckpoint(
        filepath=run_output_dir +
        '/{epoch:04d}-{val-rmse_log:.5f}',  # saves a file like: my/path/epoch=2-abs_rel=0.0115.ckpt
        save_top_k=3,
        verbose=True,
        monitor='val-rmse_log',
        mode='min',
    )

    lr_logger = LearningRateLogger()

    # NOTE(review): these two assignments are currently unused — the
    # amp_level/precision Trainer arguments below are commented out.
    # '01' also looks like a typo for 'O1' (letter O); confirm before
    # re-enabling mixed precision.
    if project_config.mixed_precision:
        amp_level = '01'
        precision = 16

    if project_config.gpus > 1:
        distributed_backend = 'ddp'
    else:
        distributed_backend = None

    # Only profile fast dev runs.
    profiler = False
    if project_config.fast_dev_run:
        from pytorch_lightning.profiler import AdvancedProfiler
        profiler = AdvancedProfiler(output_filename='./profiler.log')

    trainer = Trainer(
        gpus=project_config.gpus,
        distributed_backend=distributed_backend,
        num_nodes=project_config.nodes,
        checkpoint_callback=checkpoint_callback,
        callbacks=[lr_logger],
        logger=experiment_logger,
        fast_dev_run=project_config.fast_dev_run,
        profiler=profiler,
        early_stop_callback=False,
        #amp_level='O1',
        #precision=16,
        **hparams.trainer)
    trainer.fit(model)
    trainer.test(model)
Beispiel #16
0
def process(args):
    """Self-supervised pretraining on a ZINC dataset, then embed the CTRP
    compound library with the trained network and write the node- and
    graph-level embeddings to CSV files.
    """
    torch.multiprocessing.set_sharing_strategy('file_system')
    # Pretraining data
    if args.dataset == "ZINC5k":
        dataset = ZINC5K("../data/torchdrug/molecule-datasets/",
                         node_feature="pretrain",
                         edge_feature="pretrain",
                         lazy=True)
    elif args.dataset == "ZINC250k":
        dataset = datasets.ZINC250k("../data/torchdrug/molecule-datasets/",
                                    node_feature="pretrain",
                                    edge_feature="pretrain",
                                    lazy=True)
    elif args.dataset == "ZINC2m":
        # defaults to lazy load
        dataset = datasets.ZINC2m("../data/torchdrug/molecule-datasets/",
                                  node_feature="pretrain",
                                  edge_feature="pretrain")

    # CTRP smiles to embed
    ctrp = pd.read_csv("../data/drug_screens/CTRP/v20.meta.per_compound.txt",
                       sep="\t")
    ctrp_ds = MoleculeDataset()
    ctrp_ds.load_smiles(smiles_list=ctrp['cpd_smiles'],
                        targets=dict(),
                        node_feature='pretrain',
                        edge_feature='pretrain')

    # Self-supervised pretraining
    dm = ChemGraphDataModule.from_argparse_args(args,
                                                train=dataset,
                                                predict=ctrp_ds)
    model = ChemGraphEmbeddingNetwork(task=args.task,
                                      input_dim=dataset.node_feature_dim,
                                      hidden_dims=[512] * 5,
                                      edge_input_dim=dataset.edge_feature_dim,
                                      batch_norm=True,
                                      readout="mean",
                                      mask_rate=0.15)
    # Callbacks
    fname = f"{args.name}_{args.task}_{args.dataset}"
    logger = TensorBoardLogger(save_dir=args.default_root_dir,
                               version=fname,
                               name='lightning_logs')
    early_stop = EarlyStopping(monitor='accuracy',
                               min_delta=0.001,
                               patience=5,
                               verbose=False,
                               mode='max')
    checkpoint_callback = ModelCheckpoint(monitor='accuracy', mode='max')
    trainer = Trainer.from_argparse_args(
        args,
        default_root_dir=logger.log_dir,
        logger=logger,
        callbacks=[early_stop, checkpoint_callback],
        strategy=DDPPlugin(find_unused_parameters=False),
        profiler='simple')
    trainer.fit(model, dm)

    # Generate CTRP embeddings
    # Inference runs on CPU in one full-dataset batch.
    model.to('cpu')
    model.eval()
    dl = DataLoader(ctrp_ds, batch_size=len(ctrp_ds))
    graph_embeds = []
    node_embeds = []
    for batch in dl:
        graph_feature, node_feature = model(batch)
        graph_embeds.append(graph_feature.detach())
        node_embeds.append(node_feature.detach())
    graph_embeds = torch.cat(graph_embeds).numpy()
    node_embeds = torch.cat(node_embeds).numpy()

    # Write out
    # Node rows are indexed by their compound id, repeated once per atom.
    node_cpd_ids = [
        np.repeat(cpd_id, n['graph'].num_node)
        for n, cpd_id in zip(ctrp_ds, ctrp['broad_cpd_id'])
    ]
    node_cpd_ids = np.concatenate(node_cpd_ids)
    node_embeds = pd.DataFrame(node_embeds, index=node_cpd_ids)
    node_embeds['atom_type'] = np.concatenate(
        [[ATOM_SYMBOL[a] for a in n['graph'].atom_type] for n in ctrp_ds])
    graph_embeds = pd.DataFrame(graph_embeds, index=ctrp['broad_cpd_id'])
    node_embeds.to_csv(
        f"../data/torchdrug/molecule-datasets/{fname}_ctrp_node_embeds.csv",
        sep=",")
    graph_embeds.to_csv(
        f"../data/torchdrug/molecule-datasets/{fname}_ctrp_graph_embeds.csv",
        sep=",")
Beispiel #17
0
def main(args):
    """Train the landmark-detection model end to end.

    Seeds all RNGs, builds the datamodule (full-dataset training when
    ``TrainParams.train_size == 1``, otherwise a train/test split),
    constructs the model, checkpoint/LR-monitor callbacks and a
    TensorBoard logger, then runs ``pl.Trainer.fit``.

    Args:
        args: parsed CLI namespace. Expects ``exp_dir``, ``data_dir``,
            ``annot_file``, ``ignore_images``, ``train_batch_size``,
            ``valid_batch_size``, ``train_num_workers``,
            ``valid_num_workers``, ``precompute_data`` and
            ``fast_dev_run`` attributes.
    """
    pl.seed_everything(RANDOM_STATE)

    exp_dir = pathlib.Path(args.exp_dir)
    exp_dir.mkdir(exist_ok=True, parents=True)

    train_params = TrainParams()

    # Augmentation pipelines for train and validation, sized to the
    # in-batch image size from the training hyper-parameters.
    train_tr, valid_tr = train_transform(
        train_params.img_size_in_batch), valid_transform(
            train_params.img_size_in_batch)

    ignore_train_images_list = None

    if args.ignore_images is not None:
        ignore_train_images_list = load_ignore_images(args.ignore_images)

    if train_params.train_size == 1:
        # train_size == 1 means every image goes into training (no held-out set).
        datamodule = FullLandmarkDataModule(
            path_to_dir=args.data_dir,
            annot_file=args.annot_file,
            ignore_train_images=ignore_train_images_list,
            train_batch_size=args.train_batch_size,
            val_batch_size=args.valid_batch_size,
            train_num_workers=args.train_num_workers,
            valid_num_workers=args.valid_num_workers,
            random_state=RANDOM_STATE,
            train_transforms=train_tr,
            precompute_data=args.precompute_data)
    else:
        datamodule = TrainTestLandmarkDataModule(
            path_to_dir=args.data_dir,
            annot_file=args.annot_file,
            ignore_train_images=ignore_train_images_list,
            train_batch_size=args.train_batch_size,
            val_batch_size=args.valid_batch_size,
            train_num_workers=args.train_num_workers,
            valid_num_workers=args.valid_num_workers,
            random_state=RANDOM_STATE,
            train_size=train_params.train_size,
            precompute_data=args.precompute_data,
            train_transforms=train_tr,
            val_transforms=valid_tr)

    model = get_model(train_params.num_landmarks, train_params.dropout_prob,
                      train_params.train_backbone)

    opt_params = OptimizerParams()
    # NOTE(review): "SchedulerPrams" typo lives in the class definition elsewhere.
    scheduler_params = SchedulerPrams()

    target_metric_name = "MSE loss"

    train_module = ModelTrain(
        model=model,
        optimizer_params=opt_params,
        scheduler_params=scheduler_params,
        train_backbone_after_epoch=train_params.train_full_model_after_epoch,
        target_metric_name=target_metric_name,
        save_img_every_train_batch=100)

    checkpoint_dir = exp_dir / "checkpoint"
    checkpoint_dir.mkdir(exist_ok=True, parents=True)

    # Keep the 2 best checkpoints by the epoch-level target metric
    # (lower is better) plus the most recent one (save_last).
    checkpoint_callback = callbacks.ModelCheckpoint(
        monitor=target_metric_name + '_epoch',
        dirpath=checkpoint_dir,
        filename=f"{{epoch}}-{{{target_metric_name}:.4f}}",
        verbose=True,
        save_last=True,
        save_top_k=2,
        mode="min",
        save_weights_only=False)

    lr_monitor = callbacks.LearningRateMonitor(logging_interval='step')

    log_dir = exp_dir / "logs"
    log_dir.mkdir(exist_ok=True, parents=True)

    logger = TensorBoardLogger(str(log_dir))

    # -1 selects all available GPUs; None falls back to CPU training.
    gpus = -1 if torch.cuda.is_available() else None

    if gpus is None:
        # BUGFIX: original warning text was garbled ("It may will bew very slow").
        logging.getLogger().warning(
            "GPU is not available. Trying to train on CPU. It may be very slow"
        )

    trainer = pl.Trainer(
        amp_backend="native",
        auto_scale_batch_size="binsearch",
        gpus=gpus,
        logger=logger,
        auto_select_gpus=True,
        benchmark=True,
        check_val_every_n_epoch=train_params.check_val_every_n_epoch,
        flush_logs_every_n_steps=train_params.flush_logs_every_n_steps,
        default_root_dir=str(exp_dir),
        deterministic=False,
        fast_dev_run=args.fast_dev_run,
        progress_bar_refresh_rate=10,
        precision=train_params.precision,
        max_epochs=train_params.max_epochs,
        callbacks=[checkpoint_callback, lr_monitor])

    trainer.fit(train_module, datamodule=datamodule)
Beispiel #18
0
    def __call__(self, trial):
        """Optuna objective: train one model on this fold and return its val_r2."""
        torch.cuda.empty_cache()

        trial.set_user_attr('fold', self.fold)
        hp = self.suggestions(trial)

        t_start = datetime.now()
        print(f"Training on fold {self.fold}")
        train_ds, val_ds, input_cols, cond_cols = read(self.path, self.exp,
                                                       self.fold, self.subset)
        keep_cols = list(np.concatenate((input_cols, cond_cols,
                                         [self.target])))
        train_frame = train_ds.to_table(columns=keep_cols).to_pandas()
        val_frame = val_ds.to_table(columns=keep_cols).to_pandas()
        # DataModule
        dm = CTRPDataModule(train_frame, val_frame, input_cols, cond_cols,
                            self.target, hp['batch_size'])
        # Free the raw frames; the datamodule now owns the data it needs.
        del train_frame, val_frame
        print(f"Time elapsed loading data: {datetime.now()-t_start}")
        # Model, fully parameterized by the trial's suggested hyper-parameters
        model = ConditionalNetwork(
            n_blocks=hp['n_blocks'],
            exp=self.exp,
            inputs_sz=len(dm.input_cols),
            conds_sz=len(dm.cond_cols),
            inputs_emb_layers=hp['inputs_emb_layers'],
            conds_emb_layers=hp['conds_emb_layers'],
            film_layers=hp['film_layers'],
            linear_layers=hp['linear_layers'],
            ps_emb=hp['ps_emb'],
            ps_film=hp['ps_film'],
            ps_linear=hp['ps_linear'],
            learning_rate=hp['learning_rate'],
            weight_decay=hp['weight_decay'],
            batch_size=hp['batch_size'])
        # Callbacks
        logger = TensorBoardLogger(
            save_dir=self.logs,
            version=f"trial{trial.number}_{self.exp}_fold_{self.fold}",
            name='model_logs')
        early_stop = EarlyStopping(monitor='val_r2',
                                   min_delta=0.0001,
                                   patience=12,
                                   verbose=False,
                                   mode='max')
        pruner = PyTorchLightningPruningCallback(trial, monitor="val_r2")
        # Trainer: root dir points at the log dir to avoid
        # lr_find_temp.ckpt conflicts between concurrent trials.
        trainer = Trainer(default_root_dir=logger.log_dir,
                          auto_lr_find=False,
                          auto_scale_batch_size=False,
                          max_epochs=self.epochs,
                          gpus=self.gpu,
                          accelerator=self.accelerator,
                          logger=logger,
                          callbacks=[pruner, early_stop],
                          flush_logs_every_n_steps=200,
                          precision=32,
                          profiler="simple",
                          deterministic=True)
        trainer.fit(model, dm)

        # Record the objective value, then release GPU memory before the next trial.
        r2 = trainer.callback_metrics["val_r2"].item()
        del dm, model, trainer
        torch.cuda.empty_cache()

        print("Completed fold {} in {}".format(self.fold,
                                               str(datetime.now() - t_start)))
        print(f'Fold val_r2: {r2}')

        return r2
Beispiel #19
0
def cli_main():
    """Evaluate the best pretraining checkpoint with a linear classifier.

    Loads a YAML config (CLI ``--exp_dir``/``--path_db`` override the file),
    picks the highest-epoch ``epoch=N.ckpt`` checkpoint in the experiment
    directory, runs ``lincls`` on the restored model, logs the metrics to
    Neptune or TensorBoard, and appends one result row to a CSV database.
    """
    argv = sys.argv[1:]

    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--config",
                        default=None,
                        help="where to load YAML configuration",
                        metavar="FILE")
    parser.add_argument('--exp_dir',
                        type=str,
                        default=None,
                        help='experiment output directory')
    parser.add_argument('--path_db',
                        type=str,
                        default='../dbs',
                        help='neptune project directory')

    args = parser.parse_args(argv)

    # Remember CLI overrides before args is rebuilt from the config file.
    new_exp_dir = args.exp_dir
    new_path_db = args.path_db

    with open(args.config, 'r') as stream:
        config_vars = yaml.load(stream, Loader=yaml.FullLoader)

    args = argparse.Namespace()
    args.__dict__.update(config_vars)

    if new_exp_dir is not None:
        args.exp_dir = new_exp_dir

    if new_path_db is not None:
        args.path_db = new_path_db

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # get model and model args
    model_type = vars(modules)[args.model]

    # get dataset and dataset args (kept for parity with the original script;
    # the dataset class itself is not used below)
    dataset_type = vars(datasets)[args.dataset]

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__,  # Optional,
            tags=["pl"],  # Optional,
        )
    else:
        logger = TensorBoardLogger(args.exp_dir)  #, name="my_model"

    # Find the checkpoint with the highest epoch number ("epoch=N.ckpt").
    ckpts = list(filter(lambda x: 'epoch=' in x, os.listdir(args.exp_dir)))

    best_epoch = max(
        [int(x.replace('epoch=', '').replace('.ckpt', '')) for x in ckpts])
    best_ckpt = os.path.join(args.exp_dir,
                             'epoch=' + str(best_epoch) + '.ckpt')
    model = model_type.load_from_checkpoint(best_ckpt)

    lincls_results = lincls(args, model)

    print(best_ckpt)

    print('test results')
    for k, v in lincls_results.items():
        print(k, v)

    db_path = os.path.join(args.path_db, args.exp_name + '_db.csv')

    if os.path.exists(db_path):
        df = pd.read_csv(db_path, index_col=0)
    else:
        df = pd.DataFrame()

    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }

    output_dict.update(lincls_results)

    if args.neptune_key != '':
        # BUGFIX: the original also iterated over an undefined name
        # `pretrain_result`, which raised NameError whenever Neptune
        # logging was enabled. Only the lincls metrics exist here.
        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)

    # BUGFIX: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat with a one-row frame is the supported equivalent.
    df = pd.concat([df, pd.DataFrame([output_dict])], ignore_index=True)
    df.to_csv(db_path)
Beispiel #20
0
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from torch.utils.tensorboard import SummaryWriter
import torch
import pytorch_lightning as pl
from config import preEncDec
from eng_utils import gen_model_loaders
import model as M


def preproc():
    """Split the raw Hindi-English parallel corpus into dataset files."""
    from data import split_data

    raw_corpus = '../data/hin-eng/hin.txt'
    output_dir = '../data/hin-eng'
    split_data(raw_corpus, output_dir)


if __name__ == '__main__':
    # BUGFIX: the original called pre_proc(), but the function defined above
    # is named preproc() — the script crashed with NameError at startup.
    preproc()
    # NOTE(review): the loaders are built but never passed to trainer.fit;
    # presumably the LightningModule creates its own dataloaders — confirm.
    train_loader, eval_loader = gen_model_loaders(preEncDec)
    hparams = preEncDec.lr
    writer = SummaryWriter()
    net = M.itr_Net(hparams)
    logger = TensorBoardLogger("tb_logs", name="translation_model")
    trainer = pl.Trainer(gpus=1, max_epochs=preEncDec.epochs, logger=logger)
    trainer.fit(net)
    writer.close()
Beispiel #21
0
import torchvision
import pytorch_lightning as pl
import torch
from torch import nn
from model import Generator, Discriminator
from dataloder import make_datapath_list, ImageTransform, GAN_Img_Dataset
from torch.utils.data import DataLoader
import argparse
from argparse import ArgumentParser
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from test_tube import Experiment
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger

# Module-level TensorBoard logger; writes to a hard-coded, machine-specific
# absolute path, grouping runs under the "DGGAN" experiment name.
logger = TensorBoardLogger('/home/higuchi/ssd/Desktop/Sandbox/pytorch_GAN/lightning_logs', 'DGGAN')

# Goal: implement the GAN from the PyTorch book using PyTorch Lightning.
class GAN(pl.LightningModule):
    def __init__(self, hparams):
        super(GAN, self).__init__()
        self.hparams = hparams
        # パラメータの保存
        # self.scale_factor = opt.scale_factor
        # self.batch_size = opt.batch_size
        # self.patch_size = opt.patch_size
        self.last_imgs = None

        self.z_dim = 20
        self.batch_size = 64

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Beispiel #22
0
# The data module must have been created by an earlier cell of this script.
assert data_module is not None
# Size the model's vocabulary from the prepared data.
cfg['model']['params']['vocab_size'] = data_module.vocab_size

# %% Load model
model = ETM(**cfg['model']['params'])

# Attach optimizer and LR-scheduler settings taken from the config.
model.set_optim_params(optim_params=cfg['optimizer'],
                       sched_params=cfg['scheduler'])

# %% Prepare training
# Output layout: <root_dir>/<dataset name>/<model folder>/<seed>/
save_dir = mkdir(
    os.path.join(cfg['root_dir'], cfg['dataset']['name'],
                 model.get_model_folder(), str(cfg['seed'])))
# trainer = pl.Trainer(**cfg['model'])
logger = TensorBoardLogger(save_dir=save_dir,
                           name='logs',
                           default_hp_metric=False)
# Record the flattened config as this run's hyperparameters.
logger.log_hyperparams(flatten_cfg(cfg))

save_dir_ckpt = mkdir(os.path.join(save_dir, 'ckpt'))

# Keep only the single best checkpoint by validation ELBO (higher is better).
# NOTE(review): `period` is a legacy Lightning argument (later renamed
# every_n_epochs) — confirm against the pinned pytorch-lightning version.
checkpoint = ModelCheckpoint(monitor='val_ELBO',
                             mode='max',
                             save_top_k=1,
                             period=5,
                             filename='checkpoint-{epoch:02d}',
                             dirpath=save_dir_ckpt)
# Stop training after 10 epochs without any val_ELBO improvement.
early_stopping = EarlyStopping('val_ELBO',
                               mode='max',
                               min_delta=0.0,
                               patience=10)