コード例 #1
0
def main(hparams):
    """Train the genomic transformer, or just restore it from a checkpoint.

    The random seed is fixed first and the model is built. When
    ``hparams.train`` is truthy a Lightning trainer that resumes from a fixed
    checkpoint file is created and fitted; otherwise only the ``state_dict``
    stored in that same file is loaded into the model.

    Args:
        hparams: Namespace-like object. This function reads ``seed``,
            ``train``, ``save_top_k`` and ``gpus`` from it.
    """
    ckpt_file = 'checkpoint/epoch=43.ckpt'

    pl.seed_everything(hparams.seed)
    model = TransformerGenomicModel(WORD_NUM, EMBEDDING_DIM, BATCH_SIZE)

    if not hparams.train:
        # Restore-only path: load the saved weights and stop here.
        saved = torch.load(ckpt_file)
        model.load_state_dict(saved['state_dict'])
        return

    tb_logger = pl_loggers.TensorBoardLogger(save_dir='logs',
                                             name='TensorBoard',
                                             version=3)
    checkpointing = pl.callbacks.ModelCheckpoint(
        filepath='checkpoint',
        verbose=True,
        save_top_k=hparams.save_top_k,
    )
    early_stopping = pl.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        verbose=True,
        mode='min',
    )

    trainer = pl.Trainer(
        resume_from_checkpoint=ckpt_file,
        logger=tb_logger,
        checkpoint_callback=checkpointing,
        early_stop_callback=early_stopping,
        default_root_dir=os.getcwd(),
        gpus=hparams.gpus,
        accumulate_grad_batches=2,
        distributed_backend='ddp',
        precision=16,
        log_gpu_memory='all',
    )
    trainer.fit(model)
コード例 #2
0
def train_unet(args):
    """Fit a ``DepthRegressorTrainer`` with checkpointing and TensorBoard logs.

    Args:
        args: Namespace-like options. Reads ``seed``, ``experiment``,
            ``save_epoch``, ``version``, ``gpu``, ``sanity_steps``,
            ``max_epoch``, ``val_check_percent``, ``val_check_interval``,
            ``resume`` and ``precision``.
    """
    seed_everything(args.seed)

    ckpt_dir = os.path.join("runs", args.experiment, 'checkpoints')
    checkpoint_callback = ModelCheckpoint(
        filepath=ckpt_dir,
        save_top_k=1,
        monitor='val_loss',
        verbose=False,
        period=args.save_epoch,
    )

    log_root = os.path.join("runs", 'logs/')
    tb_logger = pl_loggers.TensorBoardLogger(
        save_dir=log_root,
        name="UNet",
        version=args.version,
    )

    model = DepthRegressorTrainer(args)

    # A single option drives both validation settings: it is capped at 1.0
    # for ``val_check_interval`` and floored at 1 for
    # ``check_val_every_n_epoch``.
    trainer_options = {
        'gpus': [args.gpu],
        'num_sanity_val_steps': args.sanity_steps,
        'checkpoint_callback': checkpoint_callback,
        'max_epochs': args.max_epoch,
        'limit_val_batches': args.val_check_percent,
        'val_check_interval': min(args.val_check_interval, 1.0),
        'check_val_every_n_epoch': max(1, args.val_check_interval),
        'resume_from_checkpoint': args.resume,
        'logger': tb_logger,
        'benchmark': True,
        'precision': args.precision,
    }
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
コード例 #3
0
def cnn_main():
    """Train ``models.HybridSN`` on the data under ``data/aus_data/3d-expanded``.

    Seeds everything with 1234, logs to TensorBoard under ``cnn_logs/`` and
    trains for at most 10 epochs on a single auto-selected GPU.
    """
    pl.seed_everything(1234)

    tb_logger = pl_loggers.TensorBoardLogger(
        'cnn_logs/',
        default_hp_metric=False,
        log_graph=True,
    )
    trainer = pl.Trainer(
        logger=tb_logger,
        gpus=1,
        auto_select_gpus=True,
        max_epochs=10,
    )

    # Hyper-parameters shared by the data module and the model.
    batch_size = 512
    lr = 1e-4

    folder = Path("data/aus_data/3d-expanded")
    data = customdata.AusDataModule(
        folder,
        cube=True,
        normalise=True,
        batch_size=batch_size,
    )

    model = models.HybridSN(1, 2, 70, 64, lr, batch_size)
    trainer.fit(model, data)
コード例 #4
0
def build_logger(model_type, task=None):
    """Return a TensorBoard logger whose experiment name is ``<model_type>_<task>``.

    Args:
        model_type: First component of the experiment name.
        task: Second component of the experiment name; any falsy value is
            replaced by ``'general'``.

    Returns:
        A ``loggers.TensorBoardLogger`` writing under the directory returned
        by ``_get_log_dir()``.
    """
    log_dir = _get_log_dir()
    if not task:
        task = 'general'
    return loggers.TensorBoardLogger(log_dir, f'{model_type}_{task}')
コード例 #5
0
def train(experiment_path):
    """Train and test a text classifier configured by ``experiment.yaml``.

    The configuration is read from ``<experiment_path>/experiment.yaml``; its
    ``Training`` section supplies ``log_dir``, ``snapshot_dir``,
    ``snapshot_selection_scheme``, ``epochs`` and ``gpus``. After fitting,
    the model is tested with the best checkpoint recorded by the checkpoint
    callback.

    Args:
        experiment_path: Directory containing ``experiment.yaml``.

    Raises:
        AssertionError: If the configuration file is missing, or if the
            configured snapshot directory already exists.
    """
    config_path = os.path.join(experiment_path, "experiment.yaml")
    assert os.path.exists(config_path), \
        "No experiment configuration was found, please create an experiment.yaml"

    with open(config_path, mode="r") as config_file:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only safe as long as experiment.yaml is trusted input.
        configuration_dict = yaml.load(config_file, Loader=yaml.Loader)

    model = TextClassificationModel(configuration_dict)

    training_cfg = configuration_dict['Training']
    tb_logger = pl_loggers.TensorBoardLogger(training_cfg['log_dir'])

    # Refuse to overwrite the snapshots of an earlier run.
    assert not os.path.exists(
        training_cfg['snapshot_dir']), "Experiment already exists."

    snapshot_pattern = os.path.join(training_cfg['snapshot_dir'],
                                    'best_model_{epoch:02d}-{val_loss:.2f}')
    checkpoint = ModelCheckpoint(
        filepath=snapshot_pattern,
        verbose=True,
        monitor=training_cfg['snapshot_selection_scheme'],
        mode='min',
    )

    trainer = Trainer(
        logger=tb_logger,
        checkpoint_callback=checkpoint,
        max_epochs=training_cfg['epochs'],
        gpus=training_cfg['gpus'],
    )
    trainer.fit(model)

    best_path = trainer.checkpoint_callback.best_model_path
    trainer.test(ckpt_path=best_path)
コード例 #6
0
def main(args):
    """Train ``AEINet`` from an OmegaConf configuration file.

    Checkpoints are written to ``<hp.log.chkpt_dir>/<args.name>`` (created if
    missing) and TensorBoard logs to ``hp.log.log_dir``.

    Args:
        args: Parsed command-line options. Reads ``config``, ``name``,
            ``save_top_k``, ``gpus``, ``checkpoint_path``, ``fast_dev_run``
            and ``val_interval``.
    """
    hp = OmegaConf.load(args.config)
    model = AEINet(hp)

    # One directory serves as both checkpoint target and weights save path.
    save_path = os.path.join(hp.log.chkpt_dir, args.name)
    os.makedirs(save_path, exist_ok=True)

    checkpoint_callback = ModelCheckpoint(
        filepath=save_path,
        monitor='val_loss',
        verbose=True,
        save_top_k=args.save_top_k,
    )
    tb_logger = pl_loggers.TensorBoardLogger(hp.log.log_dir)

    # -1 is passed to the Trainer when no explicit GPU selection was given.
    if args.gpus is None:
        gpus = -1
    else:
        gpus = args.gpus

    trainer = Trainer(
        logger=tb_logger,
        early_stop_callback=None,
        checkpoint_callback=checkpoint_callback,
        weights_save_path=save_path,
        gpus=gpus,
        distributed_backend='ddp',
        num_sanity_val_steps=1,
        resume_from_checkpoint=args.checkpoint_path,
        gradient_clip_val=hp.model.grad_clip,
        fast_dev_run=args.fast_dev_run,
        val_check_interval=args.val_interval,
        progress_bar_refresh_rate=1,
        max_epochs=10000,
    )
    trainer.fit(model)
コード例 #7
0
ファイル: train.py プロジェクト: nvvaulin/medical_imaging
def main(exp_root, exp_name, version, _config, load_epoch=None):
    """Train, checkpoint and test a ``BasicClassifierModel``.

    Args:
        exp_root: Root directory for logs and checkpoints.
        exp_name: Experiment name (sub-directory of ``exp_root``).
        version: Experiment version, used for the logger and as a path
            component of the checkpoint directory.
        _config: Mapping with ``dataset``, ``backbone``, ``optimizer`` and
            ``scheduler`` entries and an optional ``trainer`` entry whose
            items are forwarded to ``pl.Trainer``.
        load_epoch: Passed to ``checkpointer.get_checkpoint_path`` to choose
            the checkpoint to resume from.
    """
    tb_logger = pl_loggers.TensorBoardLogger(exp_root, exp_name, version)

    loaders = load_train_val_test(_config['dataset'])
    label_names, train_loader, val_loader, test_loader = loaders

    backbone_cfg = _config.get('backbone', {})
    backbone = load_backbone(**backbone_cfg)
    model = BasicClassifierModel(backbone,
                                 label_names,
                                 _config['optimizer'],
                                 _config['scheduler'])

    ckpt_dir = os.path.join(exp_root, exp_name, version, 'checkpoints')
    ckpt_name = (_config['backbone']['name'] +
                 '-{epoch}-{val_loss:.3f}-{train_loss:.3f}')
    checkpointer = ExistedModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=5,
        dirpath=ckpt_dir,
        filename=ckpt_name,
    )
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)

    resume_path = checkpointer.get_checkpoint_path(load_epoch)
    trainer_cfg = _config.get('trainer', {})
    trainer = pl.Trainer(
        logger=tb_logger,
        resume_from_checkpoint=resume_path,
        callbacks=[checkpointer, early_stopping],
        **trainer_cfg,
    )
    trainer.fit(model, train_loader, val_loader)

    print('load best epoch ', checkpointer.best_model_path)
    # NOTE(review): the return value is discarded here, so this call does not
    # appear to change ``model`` -- confirm whether the result was meant to
    # be assigned and handed to ``trainer.test``.
    model.load_from_checkpoint(checkpointer.best_model_path)

    result = trainer.test(test_dataloaders=test_loader)
    print(result)
コード例 #8
0
ファイル: train.py プロジェクト: jasson31/vedo_visualizer
def train():
    """Resolve model, config and dataset modules by name, then train.

    Command-line arguments (via ``parse_args``) select
    ``models.<model>.<model>``, ``configs.<model>.<config>`` and
    ``datasets.<dataset>``. Logs and weights are written under
    ``logs/<model>_logs``.
    """
    args = parse_args()

    # Dynamic imports driven by the command-line selection.
    model_module = importlib.import_module(f'models.{args.model}.{args.model}')
    config_module = importlib.import_module(
        f'configs.{args.model}.{args.config}')
    data_module = importlib.import_module(f'datasets.{args.dataset}')

    model_cls = getattr(model_module, args.model)
    hparams = config_module.get_hparams(args.option)
    dataset_cls = getattr(data_module, args.dataset + 'DataModule')
    data_hparams = config_module.get_data_hparams(args.option)

    model = model_cls(hparams)
    dataset = dataset_cls(data_hparams)

    # The logger and the trainer's weight output share one directory.
    run_dir = os.path.join('logs', args.model + '_logs')
    tb_logger = loggers.TensorBoardLogger(
        save_dir=run_dir,
        name=args.config + '_' + args.option,
    )
    checkpoint_callback = ModelCheckpoint(save_top_k=-1)

    trainer = Trainer(
        gpus=-1,
        logger=tb_logger,
        checkpoint_callback=checkpoint_callback,
        log_every_n_steps=250,
        weights_save_path=run_dir,
        distributed_backend='ddp',
        replace_sampler_ddp=False,
    )
    trainer.fit(model, dataset)
コード例 #9
0
def train(
    dataset_name="cifar10",
    version="efficientnet-b0",
    batch_size=10,
    epochs=100,
    checkpoint=None,
    output_path=None,
    **model_params,
):
    """Train an ``EfficientNet`` variant on a CIFAR data module.

    Args:
        dataset_name: Dataset key, used for the data module, the class count
            lookup in ``cf.num_classes`` and the default output path.
        version: Model name handed to ``EfficientNet``.
        batch_size: Batch size for the data module.
        epochs: Maximum number of training epochs.
        checkpoint: Optional checkpoint path to resume from.
        output_path: Logger directory; defaults to
            ``lightning_logs/<dataset_name>/<version>`` when ``None``.
        **model_params: Extra keyword arguments forwarded to ``EfficientNet``.
    """
    cifar_dm = CifarDataModule(batch_size=batch_size,
                               dataset_name=dataset_name)

    n_classes = cf.num_classes[dataset_name]
    model = EfficientNet(model_name=version,
                         num_classes=n_classes,
                         image_size=32,
                         **model_params)

    log_dir = output_path
    if log_dir is None:
        log_dir = f"lightning_logs/{dataset_name}/{version}"
    logger = loggers.TensorBoardLogger(log_dir)

    trainer = pl.Trainer(progress_bar_refresh_rate=20,
                         max_epochs=epochs,
                         gpus=1,
                         logger=logger,
                         resume_from_checkpoint=checkpoint)
    trainer.fit(model, cifar_dm)
コード例 #10
0
def trainer_builder(args):
    """Build the Lightning ``Trainer`` for an experiment.

    TensorBoard logs and checkpoints are both written to ``args.exp_name``.
    The checkpoint callback is attached in every configuration; previously it
    was created unconditionally but only handed to the Trainer when GPUs were
    requested, so CPU runs ignored the configured checkpointing.

    Args:
        args: Parsed options. Always reads ``exp_name``, ``gpus``,
            ``max_grad_norm``, ``val_check_interval``,
            ``gradient_accumulation_steps``, ``logging_steps`` and
            ``num_train_epochs``. When ``args.gpus > 0`` it also reads
            ``gpu_list`` (comma-separated device ids), ``accelerator``,
            ``precision`` and ``plugins``.

    Returns:
        The configured ``pl.Trainer``.
    """
    logging.info("PyTorch Lighting Trainer constructing...")
    tb_logger = pl_loggers.TensorBoardLogger(save_dir=args.exp_name)

    check_point_dir = args.exp_name
    checkpoint_callback = ModelCheckpoint(
        monitor='valid_loss',
        mode='min',
        save_top_k=-1,  # keep every checkpoint
        dirpath=check_point_dir,
        filename='HGN_hotpotQA-{epoch:02d}-{valid_loss:.4f}')

    # Options shared by the CPU and GPU configurations.
    trainer_kwargs = dict(
        logger=tb_logger,
        gradient_clip_val=args.max_grad_norm,
        val_check_interval=args.val_check_interval,
        accumulate_grad_batches=args.gradient_accumulation_steps,
        callbacks=[checkpoint_callback],
        log_every_n_steps=args.logging_steps,
        max_epochs=int(args.num_train_epochs))

    if args.gpus > 0:
        # Device-specific options are only meaningful when GPUs are used.
        gpu_ids = [int(x) for x in args.gpu_list.split(',')]
        trainer_kwargs.update(gpus=gpu_ids,
                              accelerator=args.accelerator,
                              precision=args.precision,
                              plugins=args.plugins)

    return pl.Trainer(**trainer_kwargs)
コード例 #11
0
def train_model(model, model_dir):
    """Fit and then test ``model``, writing logs and checkpoints to ``model_dir``.

    Two checkpoint callbacks are installed: one keeps the single best
    checkpoint by minimum ``val_loss_mean``, the other keeps the last one.
    The Trainer configuration depends on ``Constants.n_gpus``; the epoch
    budget comes from the module-level ``n_epochs``.

    Args:
        model: The LightningModule to train.
        model_dir: Root directory for the Trainer; TensorBoard logs go to
            ``<model_dir>/logs/``.
    """
    best_ckpt = callbacks.ModelCheckpoint(filename='best-{epoch}',
                                          monitor='val_loss_mean',
                                          save_top_k=1,
                                          mode='min')
    last_ckpt = callbacks.ModelCheckpoint(filename='last-{epoch}',
                                          save_last=True)
    tb_logger = pl_loggers.TensorBoardLogger('{}/logs/'.format(model_dir))

    # Arguments common to the GPU and CPU configurations.
    shared = dict(callbacks=[best_ckpt, last_ckpt],
                  logger=tb_logger,
                  default_root_dir=model_dir,
                  max_epochs=n_epochs)

    if Constants.n_gpus == 0:
        trainer = Trainer(gpus=0, distributed_backend='ddp_spawn', **shared)
    else:
        trainer = Trainer(gpus=Constants.n_gpus,
                          plugins=DDPPlugin(find_unused_parameters=False),
                          accelerator='ddp_spawn',
                          precision=16,
                          **shared)

    trainer.fit(model)
    trainer.test()
コード例 #12
0
def main():
    """Train and test ``MBartDST``, logging to TensorBoard and Weights & Biases.

    Checkpoints are written under ``ckpts/<wandb run id>/``. The data module
    is set up for the ``"fit"`` stage before training and for the ``"test"``
    stage before testing.

    Raises:
        AssertionError: If the W&B experiment has no (truthy) run id.
    """
    args = parse_args()
    seed_everything(args.seed)

    tb_logger = loggers.TensorBoardLogger("logs/")
    wandb_logger = loggers.WandbLogger(save_dir="logs/", project="xldst")

    # The run id names the checkpoint directory, so it must exist.
    run_id = wandb_logger.experiment.id
    assert run_id

    ckpt_pattern = os.path.join("ckpts", run_id, "{epoch}-{val_loss:.4f}")
    checkpoint_callback = ModelCheckpoint(filepath=ckpt_pattern, verbose=True)
    early_stop_callback = EarlyStopping(patience=2, verbose=True)

    trainer = Trainer.from_argparse_args(
        args,
        logger=[tb_logger, wandb_logger],
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
    )

    dm = CldstMBartDataModule(args)
    dm.prepare_data()

    # Training stage.
    dm.setup("fit")
    model = MBartDST(args)
    trainer.fit(model, datamodule=dm)

    # Evaluation stage.
    dm.setup("test")
    trainer.test(datamodule=dm)
コード例 #13
0
ファイル: main.py プロジェクト: BeaLove/ImgColorization
def run_trainer():
    """Train the colorization model and save it under ``trained_models/``.

    Training runs for at most 300 epochs with validation every second epoch,
    early stopping on ``val_loss`` and per-epoch learning-rate logging. The
    loss type is taken from the module-level ``opt.loss``. After training the
    whole model object is saved with ``torch.save``.
    """
    # ``val_loss`` must decrease to improve, so early stopping has to run in
    # 'min' mode. With 'max' it would treat every improvement as a stall and
    # stop training after ``patience`` checks of a still-improving model.
    early_stop_call_back = EarlyStopping(monitor='val_loss',
                                         min_delta=0.00,
                                         patience=5,
                                         verbose=False,
                                         mode='min')
    # Log the learning rate once per epoch.
    lr_callback = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
    model = Colorization_model(
        loss=opt.loss)  # TODO set loss as RarityWeighted or L2, default: L2
    logger = loggers.TensorBoardLogger(save_dir='logs/')
    print("using GPU", torch.cuda.is_available())
    trainer = Trainer(
        max_epochs=300,
        # gpus=1,
        logger=logger,  # TensorBoard logger writing to logs/
        log_every_n_steps=20,  # log every 20 training steps
        limit_train_batches=1.0,
        limit_val_batches=1.0,
        check_val_every_n_epoch=2,
        callbacks=[early_stop_call_back, lr_callback])
    trainer.fit(model)
    # We may not need the below: a Lightning model can be loaded from its
    # last checkpoint instead.
    os.makedirs('trained_models', exist_ok=True)
    name = 'ColorizationModelOverfitTest.pth'
    torch.save(model, os.path.join('trained_models', name))
コード例 #14
0
def main():
    """Train a ``ResidualNetwork`` for up to five epochs.

    Logs go to TensorBoard under ``logs/``; the GPU selection comes from
    ``get_gpu()``.

    Returns:
        Always ``0``.
    """
    model = ResidualNetwork()

    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    gpu_spec = get_gpu()
    trainer = pl.Trainer(logger=tb_logger, gpus=gpu_spec, max_epochs=5)

    trainer.fit(model)
    return 0
コード例 #15
0
    def __init__(self, **kwargs):
        """Fill in experiment defaults, then defer to the parent initialiser.

        Defaults are only applied to options the caller did not pass.

        Args:
            **kwargs: Trainer options. ``name`` is required; ``version`` is
                optional and is forwarded to the default logger.

        Raises:
            KeyError: If ``name`` is not given.
        """
        # Experiment results of name 'foo' are placed in directory
        # results/foo/version_n/
        kwargs.setdefault('logger', loggers.TensorBoardLogger(
            'results/', name=kwargs['name'], version=kwargs.get('version')))

        # Early stopping is disabled
        kwargs.setdefault('early_stop_callback', False)

        # Create results/ and results/<name> if they don't exist. os.makedirs
        # replaces the previous shell `mkdir` calls: it is portable, does not
        # pass the experiment name through a shell, and also copes with
        # nested names.
        os.makedirs('results/' + kwargs['name'], exist_ok=True)

        # Checkpoints are saved in directory results/foo/version_n/
        kwargs.setdefault('checkpoint_callback', ModelCheckpoint(
            filepath=('results/' + kwargs['name'] + '/version_'
                      + str(kwargs['logger'].version) + '/c'),
            monitor='val_energy',
            prefix='',
            save_top_k=-1
        ))

        kwargs.setdefault('log_save_interval', 100)  # logs are written to disk every 100 episodes
        kwargs.setdefault('row_log_interval', 1)  # logs are created every episode
        kwargs.setdefault('progress_bar_refresh_rate', 1)

        super(Trainer, self).__init__(**kwargs)
コード例 #16
0
def main(hparams):
    """Train and test a ``Longformer`` model.

    When ``hparams.output_dir`` is set, a TensorBoard logger is created there
    and checkpoints go to ``<logger.log_dir>/checkpoints/``. Otherwise the
    Trainer's default logger and default root directory are used.

    Args:
        hparams: Namespace of options. Reads ``output_dir``, ``data_path``,
            ``amount_labels``, ``gpus``, ``tpu_cores``, ``fast_dev_run``,
            ``max_epochs``, ``auto_lr_find``, ``gradient_clip_val``,
            ``check_val_every_n_epoch``, ``amp_level`` and
            ``accumulate_grad_batches``.
    """
    model = Longformer(hparams)

    if hparams.output_dir is not None:
        name = "longformer_test" if "test" in hparams.data_path else "longformer"
        logger = loggers.TensorBoardLogger(save_dir=hparams.output_dir,
                                           name=name,
                                           version=str(hparams.amount_labels),
                                           log_graph=True)
        default_root_dir = logger.log_dir + "/checkpoints/"
    else:
        # ``logger=True`` requests the Trainer's default logger. A bool has
        # no ``log_dir`` (reading it raised AttributeError before), so the
        # root directory is left to the Trainer's default as well.
        logger = True
        default_root_dir = None

    trainer = Trainer(default_root_dir=default_root_dir,
                      logger=logger,
                      log_save_interval=10,
                      gpus=hparams.gpus,
                      tpu_cores=hparams.tpu_cores,
                      fast_dev_run=hparams.fast_dev_run,
                      max_epochs=hparams.max_epochs,
                      auto_lr_find=hparams.auto_lr_find,
                      gradient_clip_val=hparams.gradient_clip_val,
                      check_val_every_n_epoch=hparams.check_val_every_n_epoch,
                      amp_level=hparams.amp_level,
                      accumulate_grad_batches=hparams.accumulate_grad_batches)

    print("Hyperparameter:")
    print("_______________")
    print(json.dumps(vars(hparams), indent=4))
    trainer.fit(model)
    trainer.test(model)
    trainer.logger.save()
コード例 #17
0
def common_train(args, metric, model_class, build_method, task: str,
                 **model_kwargs):
    """Shared train-and-test routine for a task-specific model.

    Both early stopping and checkpointing monitor ``metric`` in ``'max'``
    mode. Logging goes to Weights & Biases when that logger can be created,
    and falls back to TensorBoard on any exception.

    Args:
        args: Parsed options; ``seed`` is read here and the whole object is
            forwarded to the model and to ``Trainer.from_argparse_args``.
        metric: Name of the monitored metric.
        model_class: Callable building the model from ``args`` and
            ``model_kwargs``.
        build_method: Callable invoked with the freshly built model.
        task: Task label, used as prefix of the run name.
        **model_kwargs: Extra keyword arguments for ``model_class``.
    """
    pl.seed_everything(args.seed)

    early_stop_callback = EarlyStopping(
        monitor=metric,
        min_delta=1e-5,
        patience=3,
        verbose=False,
        mode='max',
    )
    checkpoint_callback = ModelCheckpoint(
        monitor=metric,
        save_top_k=1,
        verbose=True,
        mode='max',
        save_last=True,
    )

    model = model_class(args, **model_kwargs)
    build_method(model)

    # Run name: "<task>_<month-day_hour-minute-second>" in local time.
    this_time = time.strftime("%m-%d_%H-%M-%S", time.localtime())
    run_name = f'{task}_{this_time}'
    try:
        # The import doubles as an availability check for wandb.
        import wandb
        logger = loggers.WandbLogger(save_dir='lightning_logs',
                                     name=run_name,
                                     project='ltp')
    except Exception:
        logger = loggers.TensorBoardLogger(save_dir='lightning_logs',
                                           name=run_name)

    trainer: Trainer = Trainer.from_argparse_args(
        args,
        logger=logger,
        callbacks=[early_stop_callback],
        checkpoint_callback=checkpoint_callback,
    )
    trainer.fit(model)
    trainer.test()
コード例 #18
0
ファイル: run_train.py プロジェクト: lruczu/learning
def main(args):
    """Fine-tune ``BioBERT`` on the question-answering data module.

    All settings come from ``TrainingConfig`` and ``PreprocessingConfig``;
    only the GPU selection is read from ``args``.

    Args:
        args: Parsed options; ``args.gpus`` is forwarded to the Trainer.
    """
    log_dir = os.path.join(TrainingConfig.EXPERIMENT_NAME, 'logs')
    tb_logger = pl_loggers.TensorBoardLogger(log_dir)

    qa_data_module = QADataModule(
        tokenizer_name_or_path=TrainingConfig.TOKENIZER_CHECKPOINT,
        train_path=TrainingConfig.TRAIN_DATA_PATH,
        valid_path=TrainingConfig.VALID_DATA_PATH,
        batch_size=TrainingConfig.BATCH_SIZE,
        max_seq_length=PreprocessingConfig.MAX_LENGTH,
        doc_stride=PreprocessingConfig.DOC_STRIDE,
        max_query_length=PreprocessingConfig.MAX_QUERY_LENGTH,
    )

    # Total step count: epochs times steps per epoch.
    total_steps = (TrainingConfig.N_EPOCHS *
                   qa_data_module.number_of_steps_per_epoch())
    biobert = BioBERT(
        model_name_or_path=TrainingConfig.MODEL_CHECKPOINT,
        n_steps=total_steps,
        lr=TrainingConfig.LR,
        weight_decay=TrainingConfig.WEIGHT_DECAY,
        warm_up_prop=TrainingConfig.WARM_UP_PROP,
        model_save_dir=TrainingConfig.EXPERIMENT_NAME,
    )

    save_callback = SaveCallback(TrainingConfig.EXPERIMENT_NAME)
    trainer = pl.Trainer(
        max_epochs=TrainingConfig.N_EPOCHS,
        gpus=args.gpus,
        callbacks=[save_callback],
        logger=tb_logger,
    )
    trainer.fit(biobert, qa_data_module)
コード例 #19
0
ファイル: train.py プロジェクト: dric2018/audioClassification
def run_fold(fold,
             train_df,
             args,
             size=(224, 224),
             arch='resnet18',
             pretrained=True,
             path='MODELS/',
             data_transforms=None):
    """Train one cross-validation fold and return the trainer's logged metrics.

    Rows of ``train_df`` whose ``fold`` column equals ``fold`` form the
    validation split; all other rows form the training split.

    Args:
        fold: Fold identifier; also used as the logger version and in the
            checkpoint filename.
        train_df: DataFrame with a ``fold`` column.
        args: Options; reads ``specs_images_path``, ``train_batch_size``,
            ``test_batch_size``, ``lr``, ``num_epochs`` and ``gpus``.
        size: Not used inside this function.
        arch: Architecture name handed to ``AudioClassifier``.
        pretrained: Forwarded to ``AudioClassifier``.
        path: Directory for the checkpoint callback.
        data_transforms: Mapping indexed with ``'train'``; the ``None``
            default cannot be indexed, so a mapping must be supplied.

    Returns:
        ``trainer.logged_metrics`` after fitting.
    """
    torch.cuda.empty_cache()

    fold_train = train_df[train_df.fold != fold].reset_index(drop=True)
    fold_val = train_df[train_df.fold == fold].reset_index(drop=True)

    train_ds = AudioDataset(images_path=args.specs_images_path,
                            df=fold_train,
                            transforms=data_transforms['train'])
    # NOTE(review): validation also uses the 'train' transforms -- confirm
    # whether a separate validation transform was intended.
    val_ds = AudioDataset(images_path=args.specs_images_path,
                          df=fold_val,
                          transforms=data_transforms['train'])

    trainloader = DataLoader(train_ds,
                             batch_size=args.train_batch_size,
                             shuffle=True,
                             num_workers=os.cpu_count())
    validloader = DataLoader(val_ds,
                             batch_size=args.test_batch_size,
                             shuffle=False,
                             num_workers=os.cpu_count())

    # Drop the local references; the loaders hold on to the datasets.
    del train_ds, val_ds, fold_train, fold_val

    model = AudioClassifier(arch_name=arch, lr=args.lr, pretrained=pretrained)

    tb_logger = loggers.TensorBoardLogger(save_dir='./runs',
                                          name='ZINDI-GIZ-NLP-AGRI-KEYWORDS',
                                          version=fold)
    ckpt_name = f'ZINDI-GIZ-NLP-AGRI-KEYWORDS-{model.hparams.arch_name}-{fold}-based'
    ckpt_callback = pl.callbacks.ModelCheckpoint(filename=ckpt_name,
                                                 dirpath=path,
                                                 monitor='val_logLoss',
                                                 mode='min')

    trainer = Trainer(max_epochs=args.num_epochs,
                      gpus=args.gpus,
                      logger=tb_logger,
                      callbacks=[ckpt_callback])
    trainer.fit(model, trainloader, validloader)

    gc.collect()

    return trainer.logged_metrics
コード例 #20
0
ファイル: train.py プロジェクト: tarepan/Scyclone-PyTorch
def train(args: Namespace, datamodule: LightningDataModule) -> None:
    """Train Scyclone with PyTorch Lightning.

    Uses one GPU when CUDA is available and the CPU otherwise. Checkpoint and
    logging locations are resolved by ``CheckpointAndLogging`` from
    ``args.dir_root``, ``args.name_exp`` and ``args.name_version``.

    Args:
        args: Options; also reads ``sampling_rate``, ``noiseless_d``,
            ``no_amp``, ``max_epochs``, ``val_interval_epoch`` and
            ``profiler``.
        datamodule: Data module handed to ``trainer.fit``.
    """
    ckptAndLogging = CheckpointAndLogging(args.dir_root, args.name_exp,
                                          args.name_version)

    # Single GPU if available, otherwise CPU.
    if torch.cuda.is_available():
        gpus: int = 1
    else:
        gpus = 0

    model = Scyclone(args.sampling_rate, args.noiseless_d)
    ckpt_cb = ModelCheckpoint(period=60,
                              save_last=True,
                              save_top_k=1,
                              monitor="val_loss")

    # Full precision when AMP is disabled, half precision otherwise.
    precision = 32 if args.no_amp else 16

    tb_logger = pl_loggers.TensorBoardLogger(ckptAndLogging.save_dir,
                                             ckptAndLogging.name,
                                             ckptAndLogging.version)
    trainer = pl.Trainer(
        gpus=gpus,
        auto_select_gpus=True,
        precision=precision,
        max_epochs=args.max_epochs,
        check_val_every_n_epoch=args.val_interval_epoch,
        resume_from_checkpoint=ckptAndLogging.resume_from_checkpoint,
        default_root_dir=ckptAndLogging.default_root_dir,
        checkpoint_callback=ckpt_cb,
        logger=tb_logger,
        profiler=args.profiler,
        progress_bar_refresh_rate=30,
    )

    trainer.fit(model, datamodule=datamodule)
コード例 #21
0
def main(hparams):
    """Train a ``RegressionModel`` from a namespace of hyper-parameters.

    The namespace is turned into a dict, passed through ``get_data`` and
    ``add_device_hparams``, logged to TensorBoard under
    ``hparams['log_path']`` and then used to configure the model and Trainer.

    Args:
        hparams: Namespace-like object; converted with ``vars``.
    """
    hparams = vars(hparams)
    hparams, loaderDict, normalizer, collate = get_data(hparams)
    add_device_hparams(hparams)

    # Logger: make sure the log directory exists before writing to it.
    Path(hparams['log_path']).mkdir(parents=True, exist_ok=True)
    logger = loggers.TensorBoardLogger(hparams['log_path'],
                                       version=hparams['version'])
    logger.log_hyperparams(params=hparams)

    # Model.
    model = RegressionModel(hparams,
                            loaderDict['train'],
                            loaderDict['valid'],
                            normalizer,
                            collate)

    # Optional checkpoint to resume from.
    load_target = hparams['load']
    if load_target is None:
        chkpt = None
    else:
        chkpt = get_checkpoint_path(load_target)

    trainer = pl.Trainer(
        gpus=hparams['gpus'],
        logger=logger,
        max_epochs=hparams['epochs'],
        distributed_backend=hparams['distributed_backend'],
        precision=16 if hparams['use_amp'] else 32,
        default_root_dir=hparams['log_path'],
        deterministic=True,
        resume_from_checkpoint=chkpt,
        auto_lr_find=hparams['auto_lr'],
        auto_scale_batch_size=hparams['auto_bsz'],
    )
    trainer.fit(model)
コード例 #22
0
ファイル: run_training.py プロジェクト: patmjen/V-Net
def main(hparams):
    """Build or restore ``hparams.Model`` and train it.

    Checkpoints go to ``<logger_save_dir>/<experiment_name>/ckpts`` with
    today's date in the filename. Without ``hparams.checkpoint_path`` a fresh
    model is built from all hyper-parameters; with it the model is loaded
    from the checkpoint and the arguments listed in ``hparams.seen_args_``
    override the stored ones.

    Args:
        hparams: Namespace of options, including the ``Model`` class.
    """
    today = datetime.datetime.now().strftime('%d.%m.%Y')

    ckpt_dir = join(hparams.logger_save_dir, hparams.experiment_name, 'ckpts')
    ckpt_pattern = 'ckpt-' + today + '-{epoch:02d}-{val_loss:2f}'
    checkpoint_callback = ModelCheckpoint(
        dirpath=ckpt_dir,
        filename=ckpt_pattern,
        save_top_k=hparams.save_top_k,
        verbose=True,
        monitor=hparams.monitor_loss,
        prefix='',
    )

    tb_logger = loggers.TensorBoardLogger(save_dir=hparams.logger_save_dir,
                                          name=hparams.experiment_name)

    if hparams.checkpoint_path is not None:
        # Arguments recorded in ``seen_args_`` take precedence over the
        # values stored in the checkpoint; the '==SUPPRESS==' entry is
        # skipped.
        seen_params = {}
        for arg_name in hparams.seen_args_:
            if arg_name != '==SUPPRESS==':
                seen_params[arg_name] = getattr(hparams, arg_name)
        model = hparams.Model.load_from_checkpoint(hparams.checkpoint_path,
                                                   **seen_params)
    else:
        model = hparams.Model(**vars(hparams))

    trainer = Trainer.from_argparse_args(hparams,
                                         callbacks=[checkpoint_callback],
                                         logger=tb_logger)
    trainer.fit(model)
def main(hparams, fold):
    """Train a ``DWSCClassifier`` on one fold and append its scores to a report.

    Early stopping monitors a dataset-dependent metric in ``"max"`` mode. The
    report is appended to ``<MAIN_DIR>/logs/report.txt``.

    Args:
        hparams: Options; reads ``seed`` and ``dataset`` and is forwarded to
            the model and to ``Trainer.from_argparse_args``.
        fold: Fold identifier passed to the model and, for every dataset
            other than ``"SONYCUST"``, written to the report.
    """
    seed_everything(hparams.seed)
    MAIN_DIR = os.path.join(config.path_to_summaries, "DWSCAllDatasets/")

    model = DWSCClassifier(hparams, fold)

    tb_logger = pl_loggers.TensorBoardLogger(os.path.join(MAIN_DIR, "logs"))

    # Monitored metric and patience depend on the dataset.
    if hparams.dataset == "SONYCUST":
        monitored, patience = "2_valid_coarse/1_auprc_macro", 30
    else:
        monitored, patience = "2_valid/1_accuracy0.5", 50
    early_stopping = EarlyStopping(monitored, patience=patience, mode="max")

    trainer = Trainer.from_argparse_args(
        hparams,
        default_root_dir=MAIN_DIR,
        logger=tb_logger,
        early_stop_callback=early_stopping,
        checkpoint_callback=None,
        gpus=1,
    )
    trainer.fit(model)

    report_path = os.path.join(MAIN_DIR, "logs/report.txt")
    with open(report_path, "a") as file:
        if hparams.dataset == "SONYCUST":
            header = hparams.dataset + "\n"
        else:
            header = hparams.dataset + " fold : " + str(fold) + "\n"
        file.write(header)
        file.write(str(model.best_scores) + "\n")
コード例 #24
0
ファイル: lightning_nn.py プロジェクト: jamesgmccarthy/jsmp
def train_cross_val(p):
    """Train one ``Classifier`` per purged time-series fold.

    Data is loaded from ``./data/`` and split into 5 folds with
    ``PurgedGroupTimeSeriesSplit`` (grouped by ``date``, group gap 5). For
    every fold a model is trained with early stopping and checkpointing on
    ``val_auc``, and its ``state_dict`` is saved to
    ``models/fold_<i>_state_dict.pth``.

    Args:
        p: Parameter mapping. This function reads ``p['activation']`` and
            ``p['batch_size']`` and passes the whole mapping to
            ``Classifier`` as ``params``.

    Returns:
        A tuple ``(models, features)``: the list of trained models (one per
        fold) and the ``features`` value returned by ``preprocess_data``.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)

    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)

    input_size = data_.shape[-1]
    output_size = 1
    # A single logger instance is shared by the trainers of all folds.
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        # Work on a private copy that holds only this fold's rows
        # (train rows first, then validation rows).
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(os.path.join(
            'models/', "fold_{}".format(i)),
                                                           monitor="val_auc",
                                                           mode='max',
                                                           save_top_k=1,
                                                           period=10)
        model = Classifier(input_size=input_size,
                           output_size=output_size,
                           params=p)
        # Weight initialisation is chosen to match the activation; other
        # activations keep the model's own initialisation.
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-base the indices so they address the fold-local `data` copy.
        # NOTE(review): this assumes the original train indices start at 0
        # and are contiguous, so that max(train_idx) + 1 == len(train_idx)
        # -- confirm against the splitter.
        train_idx = [i for i in range(0, max(train_idx) + 1)]
        val_idx = [i for i in range(len(train_idx), len(idx))]
        # Both splits go through calc_data_mean with the './cache' directory;
        # only the `train` flag differs between the two calls.
        data[train_idx] = calc_data_mean(data[train_idx],
                                         './cache',
                                         train=True,
                                         mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx],
                                       './cache',
                                       train=False,
                                       mode='mean')
        # NOTE(review): `date` is passed unsliced while data/target are
        # restricted to this fold -- confirm FinData tolerates that.
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={
                                             'train': train_idx,
                                             'val': val_idx
                                         },
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc',
                           patience=10,
                           min_delta=0.0005,
                           mode='max')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        # Save the weights of the model as it stands after fitting.
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
コード例 #25
0
ファイル: train.py プロジェクト: Ruixinhua/NRHA
def _prune_checkpoints(ckpt_dir):
    """Remove scored checkpoints in ``ckpt_dir`` whose AUC is below the best.

    A file counts as "scored" when its name contains ``==``; the text after
    the first ``==`` (with ``.ckpt`` stripped) is parsed as its AUC. Files
    whose name contains ``best_model`` are never deleted, and files without
    ``==`` are ignored instead of aborting the clean-up.

    Args:
        ckpt_dir: Directory holding the checkpoint files.

    Raises:
        FileNotFoundError: If ``ckpt_dir`` does not exist.
        ValueError: If the text after ``==`` is not a valid float.
    """
    scored = []
    with os.scandir(ckpt_dir) as entries:
        for entry in entries:
            if "==" not in entry.name:
                continue
            auc = float(entry.name.split("==")[1].replace(".ckpt", ""))
            scored.append((auc, entry.name, entry.path))
    if not scored:
        # No scored checkpoint was written, so there is nothing to compare
        # against and nothing to delete.
        return
    best_auc = max(auc for auc, _, _ in scored)
    for auc, name, path in scored:
        if "best_model" not in name and auc < best_auc:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already gone, e.g. removed by another process running the
                # same clean-up; that is the state we wanted anyway.
                pass


def run(option=None):
    """Train the model named by ``args.model_class`` and prune checkpoints.

    Reads the module-level ``args`` and ``hparams`` objects, builds the
    training data loader, fits the model with a PyTorch Lightning trainer and
    finally deletes every scored checkpoint that is worse than the best one.

    Args:
        option: Name of the checkpoint sub-directory for this run. Falls back
            to ``args.log`` when omitted or falsy.
    """
    pl.trainer.seed_everything(40)
    option = option or args.log
    # set path
    saved_dir = f"{args.mind_type}/{args.model_class}"
    ckpt_dir = f"saved/checkpoint/{saved_dir}/{option}"
    # Prefer an explicitly given, existing checkpoint; otherwise fall back to
    # this run's "best_model.ckpt".
    if args.resume and os.path.exists(args.resume):
        best_model_path = args.resume
    else:
        best_model_path = os.path.join(ckpt_dir, "best_model.ckpt")
    # Only resume when resuming was requested AND the chosen file exists.
    resume_path = best_model_path if os.path.exists(
        best_model_path) and args.resume else None
    train_news_file, train_behaviors_file = get_path("train",
                                                     mind_type=args.mind_type)
    valid_news_file, valid_behaviors_file = get_path("valid",
                                                     mind_type=args.mind_type)

    converter = Converter(hparams).converter
    train_dataset = TrainingDataset(train_news_file,
                                    train_behaviors_file,
                                    hparams,
                                    converter,
                                    npratio=hparams.npratio)
    train_dataloader = DataLoader(train_dataset,
                                  hparams.batch_size,
                                  num_workers=args.num_workers,
                                  pin_memory=True)
    # The number of users is only known once the training set is loaded.
    hparams.update(user_embedding_size=len(train_dataset.uid2index))
    # set validation interval (a third of an epoch) and max epochs
    interval, epochs = len(train_dataloader) // 3, hparams.epochs
    accelerator = "ddp" if int(args.gpus) > 1 else None
    # NOTE(review): ValidationCallback presumably writes the
    # "...==<auc>.ckpt" files that are pruned below - confirm against its
    # implementation.
    valid_callback = ValidationCallback(valid_news_file, valid_behaviors_file,
                                        hparams, converter, ckpt_dir, interval)
    tb_logger = pl_loggers.TensorBoardLogger(f"saved/logs/{saved_dir}")
    trainer = pl.Trainer(gpus=int(args.gpus),
                         accelerator=accelerator,
                         max_epochs=epochs,
                         deterministic=True,
                         logger=tb_logger,
                         callbacks=[valid_callback],
                         resume_from_checkpoint=resume_path,
                         profiler="simple")
    model_class = get_model_class(args.model_class)
    trainer.fit(model_class(hparams), train_dataloader)
    _prune_checkpoints(ckpt_dir)
コード例 #26
0
def main():
    """Fine-tune a sequence-classification model for paraphrase detection.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument or from regular command-line flags. The function
    loads config, tokenizer and pretrained model, builds the training and
    evaluation datasets, trains with PyTorch Lightning and saves the
    fine-tuned language model under ``output_dir/model_name_or_path``.
    """
    arg_parser = HfArgumentParser((ModelArguments, ParaphraseDataTrainingArguments, TrainingArguments))
    only_json_given = len(sys.argv) == 2 and sys.argv[1].endswith(".json")
    if not only_json_given:
        model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()
    else:
        # The script received exactly one argument and it is a json file:
        # read all three argument groups from it.
        json_path = os.path.abspath(sys.argv[1])
        model_args, data_args, training_args = arg_parser.parse_json_file(json_file=json_path)

    checkpoint_name = model_args.model_name_or_path
    # Config name doubles as the experiment name; fall back to the model name.
    config_source = model_args.config_name or checkpoint_name

    config = AutoConfig.from_pretrained(
        config_source,
        num_labels=2,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name or checkpoint_name,
        cache_dir=model_args.cache_dir,
        model_max_length=data_args.model_max_length,
    )

    language_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_name,
        from_tf=".ckpt" in checkpoint_name,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.neptune_logging:
        neptune_logger = NeptuneLogger(
            project_name=os.environ['NEPTUNE_PROJECT'],
            experiment_name=config_source,
        )

    train_dataset = ParaphraseDetectionDataset(
        data_dir=os.path.join(data_args.data_dir, TRAIN_PATH),
        tokenizer=tokenizer,
        task_name="paraphrase_detection",
    )
    # One evaluation dataset per (path, name) pair.
    val_datasets = []
    for eval_path, eval_name in zip(EVAL_PATHS, EVAL_NAMES):
        val_datasets.append(
            ParaphraseDetectionDataset(
                data_dir=os.path.join(data_args.data_dir, eval_path),
                tokenizer=tokenizer,
                name=eval_name,
            )
        )

    model = LMFinetuner(language_model, tokenizer, training_args.learning_rate, model_args.batch_size, train_dataset, val_datasets, data_args, freeze_backend=False)
    model_output_dir = os.path.join(training_args.output_dir, checkpoint_name)
    tb_logger = pl_loggers.TensorBoardLogger(model_output_dir)

    cuda_available = torch.cuda.is_available()
    gpu_count = torch.cuda.device_count()

    # Half precision only when requested and a GPU is actually present.
    if training_args.fp16 and cuda_available:
        precision = 16
    else:
        precision = 32

    # DDP only makes sense with more than one visible GPU.
    if cuda_available and gpu_count > 1:
        backend = 'ddp'
    else:
        backend = None

    if data_args.neptune_logging:
        run_loggers = [neptune_logger, tb_logger]
    else:
        run_loggers = tb_logger

    trainer = pl.Trainer(
        max_epochs=int(training_args.num_train_epochs),
        accumulate_grad_batches=training_args.gradient_accumulation_steps,
        weights_save_path=training_args.output_dir,
        gpus=gpu_count,
        precision=precision,
        distributed_backend=backend,
        progress_bar_refresh_rate=training_args.logging_steps,
        logger=run_loggers,
    )
    trainer.fit(model)
    model.lm.save_pretrained(model_output_dir)
コード例 #27
0
def train(args, custom_callbacks=None):
    """Build the data module, model and trainer from ``args`` and fit.

    Args:
        args: Parsed arguments carrying the data, model, logging and trainer
            settings read below.
        custom_callbacks: Optional callbacks forwarded to the trainer; when
            falsy, no ``callbacks`` argument is passed at all.

    Returns:
        The ``pl.Trainer`` instance after ``fit`` has finished.
    """
    dm = GwtDataModule(
        args.batch_size,
        args.num_dataset_workers,
        f'{args.dataset_base_path}/{args.split}/train.jsonl',
        f'{args.dataset_base_path}/{args.split}/validate.jsonl',
        f'{args.dataset_base_path}/{args.split}/test.jsonl',
        f'{args.dataset_base_path}/bpe_ast_vocab.txt',
    )
    if args.invalidate_line_caches:
        dm.invalidate_caches()

    vocab = dm.vocab
    network = GwtSectionPredictionTransformer(
        vocab.get_size(),
        vocab.get_index(vocab.PAD_TOKEN),
        args.max_sequence_length,
        args.embedding_size,
        args.learning_rate,
        args.num_attention_heads,
        args.num_encoder_layers,
        args.num_decoder_layers,
        args.feedforward_dimensions,
        args.positional_encoding_dropout,
        args.transformer_dropout,
        args.lr_warmup_steps,
        args.optimize_on_smoothed_loss,
    )

    tb_logger = loggers.TensorBoardLogger(
        args.tensorboard_dir,
        name=args.experiment_name,
        version=args.version,
    )
    tb_logger.log_hyperparams(args)

    # Checkpoints live next to this run's TensorBoard logs.
    checkpoint_dir = os.path.join(tb_logger.log_dir, 'checkpoints')

    # The monitored metric follows the loss that is being optimised.
    if args.optimize_on_smoothed_loss:
        loss_key = 'label_smoothed_val_loss'
    else:
        loss_key = 'val_loss'

    trainer_kwargs = {
        'resume_from_checkpoint': load_checkpoint_if_available(checkpoint_dir),
        'logger': tb_logger,
        'checkpoint_callback': callbacks.ModelCheckpoint(
            filepath=f'{checkpoint_dir}/{{epoch}}-{{{loss_key}}}',
            save_top_k=5,
            monitor=loss_key,
            mode='min',
        ),
    }
    if custom_callbacks:
        trainer_kwargs['callbacks'] = custom_callbacks

    trainer = pl.Trainer.from_argparse_args(args, **trainer_kwargs)
    trainer.fit(network, dm)
    return trainer
コード例 #28
0
ファイル: train_model.py プロジェクト: zbloss/Style-Transfer
def train_model(user_image: Image, style_image: Image):
    """Trains a style-transfer model and renders the stylised image.

    The styling is extracted from `style_image` and applied onto
    `user_image`. The result is written to ``output.png`` through
    `ImageProcessor.save_image`.

    Args:
        user_image (Image): Image you want to apply styles onto.
        style_image (Image): Image you want to extract styles from.

    Returns:
        Whatever `image_processor.save_image` returns for the stylised
        output (not the original `user_image` object).

    """

    image_processor = ImageProcessor(maximum_image_size=(512, 512))
    print(f"user_image.size: {user_image.size} | style_image.size: {style_image.size}")

    image_size = image_processor.get_common_image_size(user_image, style_image)
    # Training is pinned to the CPU, so the GPU branch below is currently
    # never taken.
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = "cpu"

    user_image = image_processor.prepare_images(user_image, image_size)
    style_image = image_processor.prepare_images(style_image, image_size)
    # NOTE(review): these tensors follow image_processor.device while the CNN
    # and the sample below follow `device` - confirm both always agree.
    normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(image_processor.device)
    normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(image_processor.device)

    datamodule = DataModule(user_image, style_image)
    model = StyleTransferModel(
        user_image=user_image,
        style_image=style_image,
        normalization_mean=normalization_mean,
        normalization_std=normalization_std,
    )

    # Pretrained VGG19 feature extractor, switched to eval mode.
    cnn = models.vgg19(pretrained=True).features.to(device).eval()

    model._build_model_and_loss_functions(base_cnn_model=cnn)

    print("Training...")
    tb_logger = pl_loggers.TensorBoardLogger("logs/")
    # The logger was previously created but never handed to the Trainer, so
    # nothing was written under "logs/"; pass it explicitly.
    trainer_params = {"max_epochs": 6000, "logger": tb_logger}

    if device != "cpu":
        print("Using a gpu!")
        trainer_params["gpus"] = 1

    trainer = pl.Trainer(**trainer_params)
    trainer.fit(model, datamodule)

    sample = user_image.to(device)
    model.to(device)

    image = model(sample)
    image = image_processor.save_image(
        image, "output.png", "Sample Output", display_image=True
    )
    return image
コード例 #29
0
ファイル: train.py プロジェクト: Sanger2000/nnue-pytorch
def main():
  """Parse the command line, build the NNUE model and data loaders, and train.

  Trainer options come from ``pl.Trainer.add_argparse_args``; the remaining
  flags select the architecture, data loader, batch size, thread count and
  seed. Logs go to ``--default_root_dir`` or ``logs/`` when it is not set.

  Raises:
    Exception: If ``--architecture`` is neither "normal" nor "leiser".
  """
  parser = argparse.ArgumentParser(description="Trains the network.")
  parser.add_argument("train", help="Training data (.bin or .binpack)")
  parser.add_argument("val", help="Validation data (.bin or .binpack)")
  parser.add_argument("--architecture", default='normal', help="architecture of model")
  parser = pl.Trainer.add_argparse_args(parser)
  parser.add_argument("--py-data", action="store_true", help="Use python data loader (default=False)")
  parser.add_argument("--lambda", default=1.0, type=float, dest='lambda_', help="lambda=1.0 = train on evaluations, lambda=0.0 = train on game results, interpolates between (default=1.0).")
  parser.add_argument("--num-workers", default=1, type=int, dest='num_workers', help="Number of worker threads to use for data loading. Currently only works well for binpack.")
  parser.add_argument("--batch-size", default=-1, type=int, dest='batch_size', help="Number of positions per batch / per iteration. Default on GPU = 8192 on CPU = 128.")
  parser.add_argument("--threads", default=-1, type=int, dest='threads', help="Number of torch threads to use. Default automatic (cores) .")
  parser.add_argument("--seed", default=42, type=int, dest='seed', help="torch seed to use.")
  parser.add_argument("--smart-fen-skipping", action='store_true', dest='smart_fen_skipping', help="If enabled positions that are bad training targets will be skipped during loading. Default: False")
  args = parser.parse_args()

  architecture = args.architecture.lower()
  if architecture == "leiser":
    data_name = halfkp.LEISER_NAME
    model_inputs = halfkp.LEISER_INPUTS
  elif architecture == "normal":
    data_name = halfkp.NAME
    model_inputs = halfkp.INPUTS
  else:
    raise Exception("Incorrect architecture name")

  # Seed BEFORE the model is built: weight initialisation draws from the
  # global RNG, so seeding afterwards left the initial weights unseeded.
  pl.seed_everything(args.seed)

  nnue = M.NNUE(num_inputs=model_inputs, lambda_=args.lambda_)

  print("Training with {} validating with {}".format(args.train, args.val))

  print("Seed {}".format(args.seed))

  batch_size = args.batch_size
  if batch_size <= 0:
    # `args.gpus` is None when --gpus is omitted, so comparing with `== 0`
    # picked the GPU default on CPU-only runs. Treat None and 0 alike.
    batch_size = 8192 if args.gpus else 128
  print('Using batch size {}'.format(batch_size))

  print('Smart fen skipping: {}'.format(args.smart_fen_skipping))

  if args.threads > 0:
    print('limiting torch to {} threads.'.format(args.threads))
    t_set_num_threads(args.threads)

  if args.py_data:
    print('Using python data loader')
    train, val = data_loader_py(args.train, args.val, batch_size)
  else:
    print('Using c++ data loader')
    train, val = data_loader_cc(args.train, args.val, data_name, args.num_workers, batch_size, args.smart_fen_skipping)

  logdir = args.default_root_dir if args.default_root_dir else 'logs/'
  print('Using log dir {}'.format(logdir), flush=True)

  tb_logger = pl_loggers.TensorBoardLogger(logdir)
  # Keep the single best checkpoint by validation loss plus the latest one.
  checkpoint_callback = pl.callbacks.ModelCheckpoint(save_top_k=1, save_last=True, monitor='val_loss', filename='best_model')
  trainer = pl.Trainer.from_argparse_args(args, callbacks=[checkpoint_callback], logger=tb_logger)
  trainer.fit(nnue, train, val)
コード例 #30
0
def main(config):
    """
    Main function for training LSTMs.
    After training, results on validation & test sets are recorded in the specified log_path.

    Args:
        config: Mapping of run settings. Keys read here: 'log_path',
            'version', 'load', 'gpus', 'epochs', 'use_amp', 'auto_lr',
            'auto_bsz' and 'seed'. The whole mapping is also passed to
            `get_data`, `Model`, the logger and `record_results`.
    """
    dataset, train_loader, subgraph_loader = get_data(config)

    # define logger (the log directory is created first if it is missing)
    Path(config['log_path']).mkdir(parents=True, exist_ok=True)
    logger = loggers.TensorBoardLogger(config['log_path'],
                                       version=config['version'])
    logger.log_hyperparams(params=config)

    # define model
    model = Model(config, dataset, train_loader, subgraph_loader)
    # Resume from a checkpoint only when config['load'] is set.
    chkpt = None if config['load'] is None else get_checkpoint_path(
        config['load'])

    trainer = pl.Trainer(gpus=config['gpus'],
                         logger=logger,
                         max_epochs=config['epochs'],
                         distributed_backend='dp',
                         precision=16 if config['use_amp'] else 32,
                         default_root_dir=config['log_path'],
                         deterministic=True,
                         resume_from_checkpoint=chkpt,
                         auto_lr_find=config['auto_lr'],
                         auto_scale_batch_size=config['auto_bsz'])
    trainer.fit(model)

    # Evaluate twice through trainer.test(): first as-is, then on the
    # validation split.
    for phase in ['test', 'valid']:
        if phase == 'valid':
            # NOTE(review): these attributes are presumably read by the
            # model's test logic to switch to the validation split - confirm
            # against Model.
            trainer.eval_split = 'val'
            trainer.eval_mask = dataset.data.val_mask
            print(phase, trainer.eval_split)
        ret = trainer.test()
        # trainer.test() may return a list; only the first entry is used.
        if isinstance(ret, list):
            ret = ret[0]

        # Per-node results are stored separately (pickle) from the summary
        # metrics (json).
        per_node = ret.pop('per_node')
        test_results = ret
        # NOTE(review): 'default' presumably mirrors the logger's default
        # experiment name - confirm.
        res_dir = Path(config['log_path']) / 'default'
        if config['version'] is not None:
            res_dir = res_dir / config['version']
        else:
            res_dir = res_dir / ('results_' + str(config['seed']))
        print(phase, ':', test_results)
        Path(res_dir).mkdir(parents=True, exist_ok=True)
        write_json(test_results,
                   res_dir / f'{phase}_results.json',
                   sort_keys=True,
                   verbose=True)
        write_pkl(per_node, res_dir / f'{phase}_per_node.pkl')

        # Record this run in a per-phase CSV under log_path.
        path_results = Path(config['log_path']) / f'all_{phase}_results.csv'
        record_results(path_results, config, test_results)