Example 1

def create_trainer(cfg, tags=None, trial=None, callbacks=None):
    if trial:
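        # Hyper-parameter search path: a silent trainer (no logger, no progress bar) with an
        # Optuna pruning callback on 'val_loss'; otherwise train normally with a Neptune logger.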
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            f'trial#{trial.number}')
        new_callbacks = [PyTorchLightningPruningCallback(trial, 'val_loss')]
        if callbacks:
            new_callbacks.extend(callbacks)

        trainer = pl.Trainer(logger=False,
                             callbacks=new_callbacks,
                             checkpoint_callback=checkpoint_callback,
                             max_epochs=400,
                             progress_bar_refresh_rate=0,
                             weights_summary=None)
    else:
        trainer = pl.Trainer(
            logger=NeptuneLogger(project_name='yoniosin/amygdala',
                                 tags=tags,
                                 params=flatten(cfg, reducer='path')),
            max_epochs=cfg.learner.max_epochs,
            # callbacks=[pl.callbacks.EarlyStopping('val_loss', patience=200)]
            # fast_dev_run=True
        )

    return trainer
Example 2

def main(args: Namespace) -> None:
    if args.seed is not None:
        pl.seed_everything(args.seed)

    if args.distributed_backend == 'ddp':
        # When using a single GPU per process and per
        # DistributedDataParallel, we need to divide the batch size
        # ourselves based on the total number of GPUs we have
        args.batch_size = int(args.batch_size / max(1, args.gpus))
        args.workers = int(args.workers / max(1, args.gpus))

    args.logger = NeptuneLogger(project_name='YOUR_PROJ/hypermixup',
                                experiment_name="experiment_name",
                                params={
                                    'model': "ResNet18",
                                    'hypernet': True,
                                    'dataset': args.dataset,
                                    'base': args.base,
                                    'z_dim': args.z_dim,
                                    'learning_rate': args.lr,
                                })

    model = HyperResNetCIFAR(**vars(args))
    trainer = pl.Trainer.from_argparse_args(args)

    if args.evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)
Example 3

def main(config):
    solver = Solver(config)
    logger = NeptuneLogger(project_name=config.neptune_project,
                           api_key=config.neptune_api_key)
    checkpoint_callback = ModelCheckpoint(filepath=config.model_save_path,
                                          save_top_k=1,
                                          verbose=True,
                                          monitor="map",
                                          mode="max",
                                          prefix="")

    if config.model_load_path != ".":
        # resume
        trainer = Trainer(default_root_dir=config.model_save_path,
                          gpus=config.gpu_id,
                          logger=logger,
                          checkpoint_callback=checkpoint_callback,
                          resume_from_checkpoint=config.model_load_path,
                          max_epochs=config.n_epochs)
    else:
        trainer = Trainer(default_root_dir=config.model_save_path,
                          gpus=config.gpu_id,
                          logger=logger,
                          checkpoint_callback=checkpoint_callback,
                          max_epochs=config.n_epochs)

    if config.mode == 'TRAIN':
        trainer.fit(solver)
        trainer.save_checkpoint(
            os.path.join(config.model_save_path, 'last.ckpt'))
    elif config.mode == 'TEST':
        S = torch.load(config.model_load_path)['state_dict']
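        # Strip the first six characters of every key (presumably a "model." prefix added
        # by the LightningModule wrapper) so the weights match the bare model below.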
        SS = {key[6:]: value for key, value in S.items()}
        solver.model.load_state_dict(SS)
        trainer.test(solver)
Example 4

def main():
    parser = HfArgumentParser((ModelArguments, ParaphraseDataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=2,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        model_max_length=data_args.model_max_length
    )
    
    language_model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.neptune_logging:
        neptune_logger = NeptuneLogger(
            project_name=os.environ['NEPTUNE_PROJECT'],
            experiment_name=model_args.config_name if model_args.config_name else model_args.model_name_or_path
        )

    train_dataset = ParaphraseDetectionDataset(data_dir=os.path.join(data_args.data_dir, TRAIN_PATH), tokenizer=tokenizer, task_name="paraphrase_detection")
    val_datasets = [
        ParaphraseDetectionDataset(data_dir=os.path.join(data_args.data_dir, EVAL_PATH), tokenizer=tokenizer, name=EVAL_NAME) for (EVAL_PATH, EVAL_NAME) in zip(EVAL_PATHS, EVAL_NAMES)
    ]
    
    model = LMFinetuner(language_model, tokenizer, training_args.learning_rate, model_args.batch_size, train_dataset, val_datasets, data_args, freeze_backend=False)
    tb_logger = pl_loggers.TensorBoardLogger(os.path.join(training_args.output_dir, model_args.model_name_or_path))

    trainer = pl.Trainer(
        # auto_lr_find=True,
        # auto_scale_batch_size=True,
        max_epochs=int(training_args.num_train_epochs),
        accumulate_grad_batches=training_args.gradient_accumulation_steps,
        weights_save_path=training_args.output_dir,
        gpus=torch.cuda.device_count(),
        precision=16 if training_args.fp16 and torch.cuda.is_available() else 32,
        distributed_backend='ddp' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None,
        progress_bar_refresh_rate=training_args.logging_steps,
        logger=[neptune_logger, tb_logger] if data_args.neptune_logging else tb_logger,
    )    
    trainer.fit(model)
    model.lm.save_pretrained(os.path.join(training_args.output_dir, model_args.model_name_or_path))
Example 5

def main(cfg: TradeConfig):
    data = TradeDataModule(**dict(cfg.data))
    model = TradeModule(cfg.data.look_back, data.ds.data.shape[1],
                        data.ds.full_data.shape[1])

    trainer = pl.Trainer(logger=[NeptuneLogger(project_name='yoniosin/Trade')],
                         max_epochs=cfg.max_epochs,
                         fast_dev_run=True)

    trainer.fit(model, datamodule=data)
Example 6

def main():
    parser = general_args()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = RationaleSystem.add_model_specific_args(parser)

    args = parser.parse_args()

    neptune_logger = None

    if args.neptune_project is not None:
        from pytorch_lightning.loggers.neptune import NeptuneLogger

        neptune_logger = NeptuneLogger(api_key=args.neptune_key,
                                       project_name=args.neptune_project,
                                       params=vars(args))

    data = IMDBDataModule(args.batch_size)
    data.prepare_data()

    # if args.no_generator:
    #     gen = None
    # else:
    #     gen = GeneratorModel(args,
    #     embeddings=data.text_field.vocab.vectors,
    #     padding_idx=data.text_field.vocab.stoi['<pad>'])

    # enc = Encoder(args,
    # embeddings=data.text_field.vocab.vectors,
    # num_classes=len(data.label_field.vocab),
    # padding_idx=data.text_field.vocab.stoi['<pad>'])

    model = RationaleSystem(args,
                            embeddings=data.text_field.vocab.vectors,
                            num_classes=len(data.label_field.vocab),
                            padding_idx=data.text_field.vocab.stoi['<pad>'])

    checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath=os.getcwd(),
                                                       save_top_k=3,
                                                       save_weights_only=True,
                                                       verbose=True,
                                                       monitor='val_acc',
                                                       mode='max',
                                                       prefix='')

    earlystop_callback = EarlyStopping(monitor='val_acc',
                                       patience=args.patience)

    trainer = pl.Trainer.from_argparse_args(
        args,
        callbacks=[checkpoint_callback, earlystop_callback],
        logger=neptune_logger)

    trainer.fit(model, data)
Example 7

def main(hparams):

    set_seed(hparams.seed)

    checkpoint_callback = None
    if hparams.checkpoint_path:
        checkpoint_dir = os.path.dirname(
            os.path.abspath(hparams.checkpoint_path))
        print(f'Checkpoints will be saved to {checkpoint_dir}')

        checkpoint_callback = ModelCheckpoint(
            dirpath=checkpoint_dir,
            prefix=hparams.checkpoint_prefix,
            monitor=hparams.checkpoint_monitor,
            mode=hparams.checkpoint_monitor_mode,
            save_top_k=hparams.checkpoint_save_top_k,
            verbose=True,
        )

    if hparams.resume_from_checkpoint:
        print(f'Restoring checkpoint: {hparams.resume_from_checkpoint}')

    logger = NeptuneLogger(
        api_key=None,  # read from NEPTUNE_API_TOKEN environment variable
        project_name=hparams.project_name,
        experiment_name=hparams.experiment_name,
        tags=hparams.experiment_tags,
        close_after_fit=False,
        params=vars(hparams))

    dm = DocVQADataModule(hparams)
    dm.setup()

    model = LitEffNetT5(hparams)

    trainer = Trainer.from_argparse_args(
        hparams,
        logger=logger,
        callbacks=[checkpoint_callback] if checkpoint_callback else None,
    )

    if hparams.do_train:
        trainer.fit(model, dm)

    if hparams.do_test:
        trainer.test(datamodule=dm)

    # close_after_fit=False above, so the experiment has to be stopped explicitly once done
    logger.experiment.stop()
Example 8

    def __init__(self, exp_name: str, max_epochs: int, batch_size: int,
                 learning_rate: float):
        self.neptune_logger = NeptuneLogger(
            api_key="ANONYMOUS",
            project_name="shared/pytorch-ae-trainer",
            close_after_fit=False,
            experiment_name=exp_name,
            params={
                "max_epochs": max_epochs,
                "batch_size": batch_size,
                "lr": learning_rate
            },  # Optional,
            tags=["pytorch-lightning", "mlp"],
            # upload_source_files=['*.py', '*.yaml'],
            upload_stderr=False,
            upload_stdout=False)
Example 9

def init_neptune(args,
                 api_key,
                 project_name,
                 experiment_name,
                 experiment_tags=[]):
    import neptune
    from pytorch_lightning.loggers.neptune import NeptuneLogger

    params = vars(args)

    neptune.init(project_qualified_name=project_name, api_token=api_key)

    neptune_logger = NeptuneLogger(api_key=api_key,
                                   project_name=project_name,
                                   experiment_name=experiment_name,
                                   tags=experiment_tags,
                                   params=params)
    return neptune_logger
Example 10

    #                                                  model_name=hparams['model_name'],
    #                                                  width=hparams['width'],
    #                                                  size=hparams['size'])

    checkpoint_callback = ModelCheckpoint(filepath=model_config.weights_folder,
                                          save_top_k=1,
                                          verbose=True,
                                          monitor='val_loss',
                                          mode='min',
                                          prefix=hparams['model_name'])

    neptune_logger = NeptuneLogger(
        api_key='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vdWkubmVwdHVuZS5haSIs'
        'ImFwaV91cmwiOiJodHRwczovL3VpLm5lcHR1bmUuYWkiLCJhcGlfa'
        '2V5IjoiMTIyODQyZGUtNTdiMS00MDBlLWEzZmYtMzU0N2Q4MDViMjQ0In0=',
        project_name='vadbeg/birds',
        experiment_name=f'{hparams["model_name"]}, CrossEntropyLoss, width=2048',
        params=hparams,
        tags=['pytorch-lightning', 'birds'])

    trainer = Trainer(gpus=1,
                      num_nodes=1,
                      checkpoint_callback=checkpoint_callback,
                      max_epochs=hparams['max_epochs'],
                      logger=neptune_logger)

    trainer.fit(model=model)

    trainer = Trainer()
Example 11

def cli_main():

    argv = sys.argv[1:]
    # argv = ['--config',     'configs/base.yaml',
    #         '--exp_name',   'test',
    #         '--exp_dir',    '../prj_ssl_ntu_exps/test']

    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--config",
                        default=None,
                        help="where to load YAML configuration",
                        metavar="FILE")
    parser.add_argument('--exp_dir',
                        type=str,
                        default=None,
                        help='experiment output directory')
    parser.add_argument('--path_db',
                        type=str,
                        default='../dbs',
                        help='neptune project directory')

    args = parser.parse_args(argv)

    new_exp_dir = args.exp_dir
    new_path_db = args.path_db

    with open(args.config, 'r') as stream:
        config_vars = yaml.load(stream, Loader=yaml.FullLoader)

    args = argparse.Namespace()
    args.__dict__.update(config_vars)

    if new_exp_dir is not None:
        args.exp_dir = new_exp_dir

    if new_path_db is not None:
        args.path_db = new_path_db

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # get model and model args
    model_type = vars(modules)[args.model]

    # get dataset and dataset args
    dataset_type = vars(datasets)[args.dataset]

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__,  # Optional,
            tags=["pl"],  # Optional,
            # upload_stderr=False,
            # upload_stdout=False
        )
    else:
        logger = TensorBoardLogger(args.exp_dir)  #, name="my_model"

    # ckpt = list(filter(lambda x:'.ckpt' in x, os.listdir(args.exp_dir)))[-1]
    # ckpt = os.path.join(args.exp_dir, ckpt)

    ckpts = list(filter(lambda x: 'epoch=' in x, os.listdir(args.exp_dir)))
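    # the checkpoint with the largest epoch number is treated as the best one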

    best_epoch = max(
        [int(x.replace('epoch=', '').replace('.ckpt', '')) for x in ckpts])
    best_ckpt = os.path.join(args.exp_dir,
                             'epoch=' + str(best_epoch) + '.ckpt')
    model = model_type.load_from_checkpoint(best_ckpt)

    lincls_results = lincls(args, model)

    print(best_ckpt)

    print('test results')
    for k, v in lincls_results.items():
        print(k, v)

    db_path = os.path.join(args.path_db, args.exp_name + '_db.csv')

    if os.path.exists(db_path):
        df = pd.read_csv(db_path, index_col=0)
    else:
        df = pd.DataFrame()

    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }

    output_dict.update(lincls_results)

    if args.neptune_key != '':
        for k, v in pretrain_result.items():
            logger.experiment.log_metric(k, v)

        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)

    df = df.append(output_dict, ignore_index=True)
    df.to_csv(db_path)
Example 12

    "/data/lyft-motion-prediction-autonomous-vehicles/lyft-config-files/agent_motion_config.yaml"
)
cfg = omegaconf.DictConfig(cfg)
name_for_save = 'Big_training'
epochs = cfg["model_params"]["epochs"]
learning_rate = cfg["model_params"]["lr"]
training_percentage = cfg["train_data_loader"]["training_percentage"]
validation_percentage = cfg["val_data_loader"]["validation_percentage"]

API_KEY = os.environ.get('NEPTUNE_API_KEY')
neptune_logger = NeptuneLogger(
    api_key=API_KEY,
    project_name='hvergnes/KaggleResNet',
    params={
        'epoch_nr': epochs,
        'learning_rate': learning_rate,
        'train_size': training_percentage,
        'test_size': validation_percentage
    },  # your hyperparameters, immutable
    tags=['ResNet'],  # tags
)

os.environ[
    "L5KIT_DATA_FOLDER"] = "/data/lyft-motion-prediction-autonomous-vehicles"
dm = LocalDataManager()

cfg = load_config_data(
    "/data/lyft-motion-prediction-autonomous-vehicles/lyft-config-files/agent_motion_config.yaml"
)
cfg = omegaconf.DictConfig(cfg)
rasterizer = build_rasterizer(cfg, dm)
Example 13

# Step 3: Implement Callbacks and Create Them

from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

lr_logger = LearningRateMonitor(**LearningRateMonitor_Params)

model_checkpoint = ModelCheckpoint(**ModelCheckpoint_Params)

# Step 4: Create NeptuneLogger

from pytorch_lightning.loggers.neptune import NeptuneLogger

neptune_logger = NeptuneLogger(
    api_key="ANONYMOUS",
    project_name="shared/pytorch-lightning-integration",
    close_after_fit=False,
    experiment_name="train-on-MNIST",
    params=ALL_PARAMS,
    tags=['1.x', 'advanced'],
)

# Step 5: Pass NeptuneLogger and Callbacks to the Trainer

trainer = pl.Trainer(logger=neptune_logger,
                     checkpoint_callback=model_checkpoint,
                     callbacks=[lr_logger],
                     **Trainer_Params)

# Step 6: Run experiment
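
# A minimal sketch for Step 6 (not part of the original snippet): assuming the model and
# data objects from Step 1 are called `model` and `dm` (hypothetical names), running the
# experiment and then closing the Neptune run could look like this:
trainer.fit(model, datamodule=dm)
neptune_logger.experiment.stop()  # needed because close_after_fit=False keeps the run open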

## 1: Initialize model and data objects
Example 14

def train_text2title(config_file: str, train_file: str, val_file: str,
                     train_sample_rate: float, val_sample_rate: float,
                     output_title_model_path: str, output_text_model_path: str,
                     random_seed: int, neptune_project: str):
    seed_everything(random_seed)

    train_file = get_true_file(train_file)
    val_file = get_true_file(val_file)
    assert train_file.endswith(".jsonl")
    assert val_file.endswith(".jsonl")

    config = json.loads(jsonnet_evaluate_file(config_file))

    print("Loading vectors...")
    ft_model_path = config.pop("ft_vector_model_path",
                               "models/fasttext/ru_vectors_v3.bin")
    ft_model = ft_load_model(ft_model_path)

    print("Fetching data...")
    train_records = [
        r for r in read_tg_jsonl(train_file)
        if random.random() <= train_sample_rate
    ]
    val_records = [
        r for r in read_tg_jsonl(val_file)
        if random.random() <= val_sample_rate
    ]

    print("Building datasets...")
    max_words = config.get("max_words", 150)
    batch_size = config.get("batch_size", 64)
    num_workers = config.get("num_workers", 5)
    train_data = Text2TitleDataset(train_records,
                                   ft_model,
                                   max_words=max_words)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              num_workers=num_workers)

    val_data = Text2TitleDataset(val_records, ft_model, max_words=max_words)
    val_loader = DataLoader(val_data,
                            batch_size=batch_size,
                            num_workers=num_workers)

    print("Training model...")
    epochs = config.get("epochs", 100)
    patience = config.get("patience", 4)
    model = Text2TitleModel()
    early_stop_callback = EarlyStopping(monitor="val_loss",
                                        min_delta=0.0,
                                        patience=patience,
                                        verbose=True,
                                        mode="min")
    logger = False
    neptune_api_token = os.getenv("NEPTUNE_API_TOKEN")
    if neptune_project and neptune_api_token:
        params = copy.copy(config)
        params["train_sample_rate"] = train_sample_rate
        params["val_sample_rate"] = val_sample_rate
        params["train_file"] = train_file
        params["val_file"] = val_file
        logger = NeptuneLogger(
            api_key=neptune_api_token,
            project_name=neptune_project,
            experiment_name="Fasttext text2title",
            tags=["training", "pytorch-lightning", "text2title"],
            params=params)
    trainer = Trainer(gpus=0,
                      checkpoint_callback=False,
                      accumulate_grad_batches=1,
                      max_epochs=epochs,
                      callbacks=[early_stop_callback],
                      val_check_interval=1.0,
                      progress_bar_refresh_rate=100,
                      deterministic=True,
                      logger=logger)
    trainer.fit(model, train_loader, val_loader)
    model.save(output_title_model_path, output_text_model_path)
Example 15

    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = Trainer(logger=neptune_logger,
                      gpus=hparams.gpus,
                      checkpoint_callback=model_checkpoint)
    trainer.test(model)
    # Save checkpoints folder
    neptune_logger.experiment.log_artifact(CHECKPOINTS_DIR)
    # You can stop the experiment
    neptune_logger.experiment.stop()


# -------------------------------------------------------------------------------------------------------------------
CHECKPOINTS_DIR = '/home/rachneet/thesis_results/res_mixed_recordings/'
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="res_mixed_recordings",  # change this for new runs
)

# ---------------------------------------MAIN FUNCTION TRAINER-------------------------------------------------------


def main(hparams):

    model = LightningResnet(hparams)
    # exp = Experiment(save_dir=os.getcwd())
    if not os.path.exists(CHECKPOINTS_DIR):
        os.makedirs(CHECKPOINTS_DIR)
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    early_stop_callback = pl.callbacks.EarlyStopping(monitor='val_loss',
                                                     min_delta=0.00,
Example 16

def main():
    params = Params()

    # api key
    api_key = os.environ[
        "NEPTUNE"
    ]  # if this throws an error, you didn't set your env var

    # save directory
    save_dir = os.getcwd() if not params.SAVE_DIR else params.SAVE_DIR

    # root directory
    root = ROOT_PATH / "pytorch_faster_rcnn_tutorial" / "data" / "heads"

    # input and target files
    inputs = get_filenames_of_path(root / "input")
    targets = get_filenames_of_path(root / "target")

    inputs.sort()
    targets.sort()

    # mapping
    mapping = {
        "head": 1,
    }

    # training transformations and augmentations
    transforms_training = ComposeDouble(
        [
            Clip(),
            AlbumentationWrapper(albumentation=albu.HorizontalFlip(p=0.5)),
            AlbumentationWrapper(
                albumentation=albu.RandomScale(p=0.5, scale_limit=0.5)
            ),
            # AlbuWrapper(albu=A.VerticalFlip(p=0.5)),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # validation transformations
    transforms_validation = ComposeDouble(
        [
            Clip(),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # test transformations
    transforms_test = ComposeDouble(
        [
            Clip(),
            FunctionWrapperDouble(np.moveaxis, source=-1, destination=0),
            FunctionWrapperDouble(normalize_01),
        ]
    )

    # random seed
    seed_everything(params.SEED)

    # training validation test split
    inputs_train, inputs_valid, inputs_test = inputs[:12], inputs[12:16], inputs[16:]
    targets_train, targets_valid, targets_test = (
        targets[:12],
        targets[12:16],
        targets[16:],
    )

    # dataset training
    dataset_train = ObjectDetectionDataSet(
        inputs=inputs_train,
        targets=targets_train,
        transform=transforms_training,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataset validation
    dataset_valid = ObjectDetectionDataSet(
        inputs=inputs_valid,
        targets=targets_valid,
        transform=transforms_validation,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataset test
    dataset_test = ObjectDetectionDataSet(
        inputs=inputs_test,
        targets=targets_test,
        transform=transforms_test,
        use_cache=True,
        convert_to_format=None,
        mapping=mapping,
    )

    # dataloader training
    dataloader_train = DataLoader(
        dataset=dataset_train,
        batch_size=params.BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_double,
    )

    # dataloader validation
    dataloader_valid = DataLoader(
        dataset=dataset_valid,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_double,
    )

    # dataloader test
    dataloader_test = DataLoader(
        dataset=dataset_test,
        batch_size=1,
        shuffle=False,
        num_workers=0,
        collate_fn=collate_double,
    )

    # neptune logger
    neptune_logger = NeptuneLogger(
        api_key=api_key,
        project_name=f"{params.OWNER}/{params.PROJECT}",  # use your neptune name here
        experiment_name=params.PROJECT,
        params=params.__dict__,
    )

    assert neptune_logger.name  # http GET request to check if the project exists

    # model init
    model = get_faster_rcnn_resnet(
        num_classes=params.CLASSES,
        backbone_name=params.BACKBONE,
        anchor_size=params.ANCHOR_SIZE,
        aspect_ratios=params.ASPECT_RATIOS,
        fpn=params.FPN,
        min_size=params.MIN_SIZE,
        max_size=params.MAX_SIZE,
    )

    # lightning init
    task = FasterRCNNLightning(
        model=model, lr=params.LR, iou_threshold=params.IOU_THRESHOLD
    )

    # callbacks
    checkpoint_callback = ModelCheckpoint(monitor="Validation_mAP", mode="max")
    learningrate_callback = LearningRateMonitor(
        logging_interval="step", log_momentum=False
    )
    early_stopping_callback = EarlyStopping(
        monitor="Validation_mAP", patience=params.PATIENCE, mode="max"
    )

    # trainer init
    trainer = Trainer(
        gpus=params.GPU,
        precision=params.PRECISION,  # try 16 with enable_pl_optimizer=False
        callbacks=[checkpoint_callback, learningrate_callback, early_stopping_callback],
        default_root_dir=save_dir,  # where checkpoints are saved to
        logger=neptune_logger,
        log_every_n_steps=1,
        num_sanity_val_steps=0,
        max_epochs=params.MAXEPOCHS,
    )

    # start training
    trainer.fit(
        model=task, train_dataloader=dataloader_train, val_dataloaders=dataloader_valid
    )

    # start testing
    trainer.test(ckpt_path="best", dataloaders=dataloader_test)

    # log packages
    log_packages_neptune(neptune_logger=neptune_logger)

    # log mapping as table
    log_mapping_neptune(mapping=mapping, neptune_logger=neptune_logger)

    # log model
    if params.LOG_MODEL:
        checkpoint_path = pathlib.Path(checkpoint_callback.best_model_path)
        log_model_neptune(
            checkpoint_path=checkpoint_path,
            save_directory=pathlib.Path.home(),
            name="best_model.pt",
            neptune_logger=neptune_logger,
        )

    # stop logger
    neptune_logger.experiment.stop()
    print("Finished")
Example 17

        'best_model_score', model_checkpoint.best_model_score.tolist())

    # Testing
    trainer.test()


if __name__ == "__main__":
    # Setup
    logger.setLevel(logging.INFO)
    pl.seed_everything(CONSTANTS['SEED'])

    api_token = getpass("Enter Neptune.ai API token: ")
    neptune_logger = NeptuneLogger(
        api_key=api_token,
        project_name="username/projname",  # TODO
        close_after_fit=False,
        experiment_name="experiment-name",  # TODO
        params=hparams,
        tags=["pytorch-lightning"])

    # Config
    hparams = {
        'lr': 0.0001,
        'weight_decay': 0.0001,
        'batch_size': {
            'train': 8,
            'val': 4,
            'test': 4
        },
        'image_size': 256,
        'gradient_clip_val': 0.1,
Example 18

def cli_main():

    # Arguments
    default_config = os.path.join(os.path.split(os.getcwd())[0], 'config.conf')

    print(default_config)

    parser = ArgumentParser(description='Pytorch BT',
                            default_config_files=[default_config])
    parser.add_argument('-c',
                        '--my-config',
                        required=False,
                        is_config_file=True,
                        help='config file path')
    parser.add_argument('--finetune',
                        dest='finetune',
                        action='store_true',
                        help='Perform only finetuning (Default: False)')
    parser.set_defaults(finetune=False)
    parser.add_argument(
        '--transfer',
        dest='transfer',
        action='store_true',
        help='Perform transfer learning on linear eval (Default: False)')
    parser.set_defaults(transfer=False)
    parser.add_argument('--offline_log',
                        dest='offline_log',
                        action='store_true',
                        help='Do not log online (Default:  False)')
    parser.set_defaults(offline_log=False)
    parser.add_argument('--pt_checkpoint', type=str, default=None)
    parser.add_argument('--val_every_n', type=int, default=1)
    parser.add_argument('--tag', type=str, default=None)
    parser.add_argument('--resume_ckpt', type=str, default=None)
    parser.add_argument('--seed', type=int, default=222)
    parser.add_argument('--project_name', type=str, default=None)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # model args
    parser = BT.add_model_specific_args(parser)

    parser = SSLLinearEval.add_model_specific_args(parser)

    args = parser.parse_args()

    seed_everything(args.seed)

    args.status = 'Finetune'

    args.batch_size = args.ft_batch_size

    # Get DataModule
    dm, ft_dm, args = get_dm(args)

    neptune_logger = NeptuneLogger(
        offline_mode=args.offline_log,
        api_key=None,
        project_name=args.project_name,
        experiment_name='Testing',  # Optional,
        params=vars(args),  # Optional,
        tags=["Test", args.tag],  # Optional,
        upload_source_files=['src/*.py'],
        close_after_fit=False)

    # Define model
    model = BT(**args.__dict__)

    load_log_file = os.path.join(os.getcwd(), 'log_files.txt')

    log_dirs = np.genfromtxt(load_log_file, delimiter=" ", dtype='str')

    print("\n\n Log Dir: {}\n\n".format(log_dirs))

    ft_model_dir = log_dirs[1]
    checkpoint_path = log_dirs[2]

    print("Loading checkpoint: {}".format(checkpoint_path))

    ft_model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=(ft_model_dir +
                                                                 '/'),
                                                       save_top_k=1,
                                                       monitor='val_loss')

    encoder = BT.load_from_checkpoint(checkpoint_path, strict=False)

    if args.accelerator == 'ddp' or args.accelerator == 'ddp2':
        replace_sampler = True  # False
        if args.accelerator == 'ddp':
            args.effective_bsz = args.ft_batch_size * args.num_nodes * args.gpus

        elif args.accelerator == 'ddp2':
            args.effective_bsz = args.ft_batch_size * args.num_nodes

    else:
        replace_sampler = True
        args.effective_bsz = args.ft_batch_size

    ft_model = SSLLinearEval(encoder.encoder_online, **args.__dict__)

    trainer_ft = pl.Trainer.from_argparse_args(
        args,
        max_epochs=args.ft_epochs,
        logger=neptune_logger,
        callbacks=[FTPrintingCallback(ft_model_dir, args)],
        deterministic=True,
        checkpoint_callback=False,
        fast_dev_run=False,
        sync_batchnorm=True,
        track_grad_norm=-1,
        replace_sampler_ddp=replace_sampler,
        progress_bar_refresh_rate=args.print_freq)

    if trainer_ft.local_rank == 0:

        if not args.offline_log:

            print("Experiment: {}".format(str(trainer_ft.logger.experiment)))

            log_dirs = np.append(
                log_dirs,
                str(trainer_ft.logger.experiment).split('(')[1][:-1])

            save_log_file = os.path.join(os.getcwd(), 'log_files.txt')

            np.savetxt(save_log_file, log_dirs, delimiter=" ", fmt="%s")

    # Fit
    trainer_ft.fit(ft_model, ft_dm)

    if args.save_checkpoint:

        neptune_logger.experiment.log_artifact(
            os.path.join(ft_model_dir,
                         os.listdir(ft_model_dir + '/')[-1]),
            os.path.join('finetune/',
                         os.listdir(ft_model_dir + '/')[-1]))

    neptune_logger.experiment.stop()
Example 19

File: train.py Project: goodok/sgnn
def main(hparams):
    """
    Main training routine specific for this project
    :param hparams:

    """

    # 0 INIT TRACKER
    # https://docs.neptune.ai/integrations/pytorch_lightning.html
    try:
        import neptune
        NEPTUNE_AVAILABLE = True
    except ImportError:  # pragma: no-cover
        NEPTUNE_AVAILABLE = False

    USE_NEPTUNE = False
    if getattr(hparams, 'tracker', None) is not None:
        if getattr(hparams.tracker, 'neptune', None) is not None:
            USE_NEPTUNE = True

    if USE_NEPTUNE and not NEPTUNE_AVAILABLE:
        warnings.warn(
            'You want to use `neptune` logger which is not installed yet,'
            ' install it with `pip install neptune-client`.', UserWarning)
        time.sleep(5)

    tracker = None

    if NEPTUNE_AVAILABLE and USE_NEPTUNE:
        neptune_params = hparams.tracker.neptune
        fn_token = getattr(neptune_params, 'fn_token', None)
        if fn_token is not None:
            p = Path(neptune_params.fn_token).expanduser()
            if p.exists():
                with open(p, 'r') as f:
                    token = f.readline().splitlines()[0]
                    os.environ['NEPTUNE_API_TOKEN'] = token

        hparams_flatten = dict_flatten(hparams, sep='.')
        experiment_name = hparams.tracker.get('experiment_name', None)
        tags = list(hparams.tracker.get('tags', []))
        offline_mode = hparams.tracker.get('offline', False)

        tracker = NeptuneLogger(
            project_name=neptune_params.project_name,
            experiment_name=experiment_name,
            params=hparams_flatten,
            tags=tags,
            offline_mode=offline_mode,
            upload_source_files=["../../../*.py"],  # because hydra changes the current dir
        )

    try:

        # log
        if tracker is not None:
            watermark_s = watermark(packages=[
                'python', 'nvidia', 'cudnn', 'hostname', 'torch',
                'sparseconvnet', 'pytorch-lightning', 'hydra-core', 'numpy',
                'plyfile'
            ])
            log_text_as_artifact(tracker, watermark_s, "versions.txt")
            # arguments_of_script
            sysargs_s = str(sys.argv[1:])
            log_text_as_artifact(tracker, sysargs_s, "arguments_of_script.txt")

            for key in ['overrides.yaml', 'config.yaml']:
                p = Path.cwd() / '.hydra' / key
                if p.exists():
                    tracker.log_artifact(str(p), f'hydra_{key}')

        callbacks = []
        if tracker is not None:
            lr_logger = LearningRateLogger()
            callbacks.append(lr_logger)

        # ------------------------
        # 1 INIT LIGHTNING MODEL
        # ------------------------
        model = LightningTemplateModel(hparams)

        if tracker is not None:
            s = str(model)
            log_text_as_artifact(tracker, s, "model_summary.txt")

        # ------------------------
        # 2 INIT TRAINER
        # ------------------------
        cfg = hparams.PL

        if tracker is None:
            tracker = cfg.logger  # True by default in PL

        kwargs = dict(cfg)
        kwargs.pop('logger')

        trainer = pl.Trainer(
            max_epochs=hparams.train.max_epochs,
            callbacks=callbacks,
            logger=tracker,
            **kwargs,
        )

        # ------------------------
        # 3 START TRAINING
        # ------------------------
        print()
        print("Start training")

        trainer.fit(model)

    except (Exception, KeyboardInterrupt) as ex:
        if tracker is not None:
            print_exc()
            tracker.experiment.stop(str(ex))
        raise
Example 20

                            type=list,
                            nargs='+',
                            default=[3, 3, 3, 3, 3, 3])
        parser.add_argument('--pool_size', default=3)
        parser.add_argument('--fc_neurons', default=128)
        parser.add_argument('--n_classes', default=8)

        return parser


# =========================================NEPTUNE AI===============================================================

CHECKPOINTS_DIR = '/home/rachneet/thesis_results/tl_vsg_intf_16qam_rw/'  # change this
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="tl_vsg_intf_16qam_rw",  # change this  for new runs
)

# ===================================================================================================================


def inference(hparams: argparse.Namespace):
    model = TransferLearningModel.load_from_checkpoint(
        CHECKPOINTS_DIR + 'epoch=14-step=61739.ckpt')

    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = pl.Trainer(logger=neptune_logger,
                         gpus=hparams.gpus,
                         checkpoint_callback=True,
                         callbacks=[model_checkpoint])
Example 21

from pytorch_lightning.loggers.neptune import NeptuneLogger
import os
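# Presumably re-attaches to the already existing experiment 'ASL-400' rather than creating
# a new one; close_after_fit=False keeps it open for further manual logging.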
logger = NeptuneLogger(
    api_key=os.environ["NEPTUNE_API_TOKEN"],
    project_name="jonasfrey96/asl",
    experiment_id='ASL-400',
    close_after_fit=False,
)
print(logger.experiment.id)

logger.experiment
print('Done')

import time
time.sleep(1)
Example 22

def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--base_folders', nargs='+', default=[], required=True)
    parser.add_argument('--datasets', nargs='+', default=[], required=True)
    parser.add_argument('--shuffle', action="store_true", default=False)
    parser.add_argument('--use_tpu', action="store_true", default=False)
    parser.add_argument('--memory_profile', action="store_true", default=False)
    parser.add_argument('--tags', nargs='*', default=[])
    parser = UTWRS.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data path
    # ------------
    file_paths = []
    max_seq_length = 0
    max_summary_length = 0

    if "BBC" in args.datasets:
        i = args.datasets.index("BBC")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]),
                             max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)

    if "OVSD" in args.datasets:
        i = args.datasets.index("OVSD")
        file_paths.append(get_file_paths(args.base_folders[i]))
        max_seq_length = max(get_max_seq_len(args.base_folders[i]),
                             max_seq_length)
        max_summary_length = max(get_max_summary_len(args.base_folders[i]),
                                 max_summary_length)

    if file_paths == []:
        raise UnsupportedOperation("--dataset only support BBC or OVSD.")

    # ------------
    # data args
    # ------------
    # Add <START> and <END> token
    args.enc_seq_len = max_seq_length + 2
    args.dec_seq_len = max_summary_length + 2

    # ------------
    # Split train/test
    # ------------
    print(f"\nTotal number of videos: {sum([len(i) for i in file_paths])}")
    print(f"Max length of videos: {max_seq_length}")
    print(f"Max length of summary: {max_summary_length}\n")

    train_paths = []
    test_paths = []

    for dataset in file_paths:
        np.random.shuffle(dataset)
        train_paths.extend(dataset[:-2])
        test_paths.extend(dataset[-2:])

    # ------------
    # K-fold
    # ------------

    kfold = StratifiedKFold(n_splits=3, shuffle=False)

    # Generate data index for kfold
    X = [0] * len(train_paths)
    Y = []
    for i, dataset in enumerate(file_paths):
        Y += [i] * (len(dataset) - 2)

    train_paths = np.array(train_paths)
    for k, (train, val) in enumerate(
            tqdm(kfold.split(X, Y), total=kfold.get_n_splits())):
        print(f"Training data: f{train_paths[train]}")
        print(f"Validation data: f{train_paths[val]}")
        # ------------
        # data loader
        # ------------
        data_loader = OVSDBBCDataModule(max_seq_length,
                                        max_summary_length,
                                        args.d_model,
                                        train_paths[train],
                                        train_paths[val],
                                        shuffle=args.shuffle,
                                        use_tpu=args.use_tpu)

        # ------------
        # model
        # ------------
        model = UTWRS(args, SRC_PAD_TOKEN, TRG_PAD_TOKEN)

        # ------------
        # neptune logger
        # ------------
        neptune_logger = NeptuneLogger(project_name="guyleaf/UTWRS",
                                       params=vars(args),
                                       experiment_name=f"{k+1}-fold_logger",
                                       tags=args.tags)
        neptune_logger.experiment.log_text("training_data",
                                           ','.join(train_paths[train]))
        neptune_logger.experiment.log_text("validation_data",
                                           ','.join(train_paths[val]))

        # ------------
        # checkpoint
        # ------------
        model_checkpoint = ModelCheckpoint(
            dirpath="checkpoints",
            filename='{epoch:02d}_{test_loss:.2f}',
            save_top_k=3,
            monitor='test_loss',
            mode='min')

        # ------------
        # profiler
        # ------------
        profiler = PyTorchProfiler(
            output_filename=f"profiles/{k}-fold_profiler",
            profile_memory=True,
            sort_by_key="cuda_memory_usage",
            row_limit=50,
            enabled=args.memory_profile)

        # ------------
        # training
        # ------------
        trainer = pl.Trainer.from_argparse_args(
            args,
            logger=neptune_logger,
            profiler=profiler,
            checkpoint_callback=model_checkpoint,
            track_grad_norm=2,
            log_every_n_steps=100)
        trainer.fit(model, data_loader)

        # Log model checkpoint to Neptune
        for ckpt_path in model_checkpoint.best_k_models.keys():
            model_name = 'checkpoints/' + ckpt_path.split('/')[-1]
            neptune_logger.experiment.log_artifact(ckpt_path, model_name)

        # Log score of the best model checkpoint.
        neptune_logger.experiment.set_property(
            'best_model_loss', model_checkpoint.best_model_score.tolist())
        if args.profiler:
            neptune_logger.experiment.log_artifact('profiles')
Example 23

                            TRAIN_DATA=TensorDataset(TR_II, TR_AM, TR_LABEL),
                            TRAIN_CODES=train_codes,
                            DEV_DATA=TensorDataset(DE_II, DE_AM, DE_LABEL),
                            DEV_CODES=dev_codes,
                            TEST_DATA=TensorDataset(TE_II, TE_AM, TE_LABEL),
                            TEST_CODES=test_codes,
                            HIDDEN_UNIT1=model_parameters["HIDDEN_UNIT1"],
                            BATCH_SIZE=model_parameters["BATCH_SIZE"],
                            LR=model_parameters["LEARNING_RATE"],
                            EPS=model_parameters["EPS"],
                            EPOCHS=model_parameters["EPOCHS"],
                            FREEZE_BERT=model_parameters["FREEZE_BERT"])

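    # log the merged model and data hyper-parameter dicts to Neptune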
    neptune_logger = NeptuneLogger(api_key=NEPTUNE_API,
                                   project_name="fatihbeyhan/CASE21-SUBTASK3",
                                   params={
                                       **model_parameters,
                                       **data_parameters
                                   })

    ### initializing trainer
    trainer = Trainer(
        max_epochs=model_parameters["EPOCHS"],
        gpus=1,
        #auto_lr_find=True,
        #auto_scale_batch_size='binsearch',
        #gradient_clip_val= GRADIENT_CLIP,
        #limit_train_batches = 1,
        #limit_val_batches = 2,
        #limit_test_batches = 1,
        logger=neptune_logger,
        #accelerator='ddp',
Example 24

def cli_main():
    
    argv = sys.argv[1:]
    # argv = ['--config',     'configs/NTU_BUTD_CON.yaml',
    #         '--exp_name',   'test',
    #         '--exp_dir',    '../prj_ssl_exps/test']

    parser = argparse.ArgumentParser()
    
    parser.add_argument("-c", "--config", default=None, help="where to load YAML configuration", metavar="FILE")

    parser.add_argument('--exp_name', type=str, default='test', help='experiment name')
    parser.add_argument('--exp_dir', type=str, default='../experiments/', help='experiment output directory')
    parser.add_argument('--neptune_key', type=str, default='', help='neptune user api key')
    parser.add_argument('--neptune_project', type=str, default='', help='neptune project directory')
    parser.add_argument('--path_db', type=str, default='../dbs', help='neptune project directory')

    parser.add_argument('--model', type=str, default='MocoV2', help='self supervised training method')
    parser.add_argument('--dataset', type=str, default='NTU_SSL', help='dataset to use for training')
    
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    
    parser.add_argument('--resume_training', action='store_true', help='resume training from checkpoint training')
    
    
    args = parse_args(parser, argv)
    
    if args.seed is not None:
        pl.seed_everything(args.seed)
    
    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)
    
    # get model and model args
    model_type = vars(modules)[args.model] 
    parser = model_type.add_model_specific_args(parser)
    
    # get dataset and dataset args
    dataset_type = vars(datasets)[args.dataset]
    parser = dataset_type.add_dataset_specific_args(parser)

    args = parse_args(parser, argv)

    os.makedirs(args.exp_dir, exist_ok=True)
    os.makedirs(args.path_db, exist_ok=True)

    # save config
    with open(os.path.join(args.exp_dir, 'config.yaml'), 'w') as cfg_file:
        yaml.dump(args.__dict__, cfg_file)

    if args.neptune_key != '':
        logger = NeptuneLogger(
            api_key=args.neptune_key,
            project_name=args.neptune_project,
            close_after_fit=False,
            experiment_name=args.exp_name,  # Optional,
            params=args.__dict__, # Optional,
            tags=["pl"],  # Optional,
            # upload_stderr=False,
            # upload_stdout=False
        )
    else:
        logger = TensorBoardLogger(args.exp_dir) #, name="my_model"
    
    datamodule = dataset_type(**args.__dict__)
    
    model = model_type(**args.__dict__)

    if args.resume_training:
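        # resume from the checkpoint with the highest epoch number in the experiment dir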
        ckpts = list(filter(lambda x:'epoch=' in x, os.listdir(args.exp_dir)))
        latest_epoch = max([int( x.replace('epoch=','').replace('.ckpt','')) for x in ckpts])
        latest_ckpt = os.path.join(args.exp_dir, 'epoch=' + str(latest_epoch) + '.ckpt') 

        print('resuming from checkpoint', latest_ckpt)

        args.__dict__.update({'resume_from_checkpoint': latest_ckpt})


    #model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=args.exp_dir, save_top_k=3, mode='max', monitor='knn_acc', period=args.ckpt_period) # , filename='{epoch}-{knn_acc}'
    model_checkpoint = pl.callbacks.ModelCheckpoint(save_top_k=3, mode='max', monitor='knn_acc',
                                                    period=args.ckpt_period)  # , filename='{epoch}-{knn_acc}'

    trainer = pl.Trainer.from_argparse_args(args, logger=logger, checkpoint_callback=model_checkpoint, callbacks=[KNNEval(period=args.ckpt_period)])


    # print(len(datamodule.val_dataset()))

    
    trainer.fit(model, datamodule)

    best_ckpt = trainer.checkpoint_callback.best_model_path

    best_model = model_type.load_from_checkpoint(checkpoint_path=best_ckpt)
    
    # pretrain_result = trainer.test(model=best_model)[0]
    pretrain_result = trainer.test(model=best_model, datamodule=datamodule)[0]
    print(pretrain_result)
    lincls_results = lincls(args, best_model)

    
    print('test results')
    for k,v in lincls_results.items():
        print(k, v)

    df = pd.DataFrame()
    output_dict = {
        'exp_name': args.exp_name,
        'exp_dir': args.exp_dir,
        'model': args.model,
        'dataset': args.dataset,
    }

    output_dict.update(pretrain_result)
    output_dict.update(lincls_results)
    
    if args.neptune_key != '':
        for k, v in pretrain_result.items():
            logger.experiment.log_metric(k, v)
            
        for k, v in lincls_results.items():
            logger.experiment.log_metric(k, v)


    df = df.append(output_dict, ignore_index=True)
    df.to_csv(os.path.join(args.path_db, args.exp_name + '_db.csv'))
Example 25

def cli_main():

    # Arguments
    default_config = os.path.join(os.path.split(os.getcwd())[0], 'config.conf')

    print(default_config)

    parser = ArgumentParser(description='Pytorch BYOL',
                            default_config_files=[default_config])
    parser.add_argument('-c',
                        '--my-config',
                        required=False,
                        is_config_file=True,
                        help='config file path')
    parser.add_argument('--finetune',
                        dest='finetune',
                        action='store_true',
                        help='Perform only finetuning (Default: False)')
    parser.set_defaults(finetune=False)
    parser.add_argument(
        '--transfer',
        dest='transfer',
        action='store_true',
        help='Perform transfer learning on linear eval (Default: False)')
    parser.set_defaults(transfer=False)
    parser.add_argument('--offline_log',
                        dest='offline_log',
                        action='store_true',
                        help='Do not log online (Default:  False)')
    parser.set_defaults(offline_log=False)
    parser.add_argument('--pt_checkpoint', type=str, default=None)
    parser.add_argument('--val_every_n', type=int, default=1)
    parser.add_argument('--tag', type=str, default=None)
    parser.add_argument('--resume_ckpt', type=str, default=None)
    parser.add_argument('--seed', type=int, default=222)
    parser.add_argument('--project_name', type=str, default=None)

    # trainer args
    parser = pl.Trainer.add_argparse_args(parser)

    # model args
    parser = BYOL.add_model_specific_args(parser)

    parser = SSLLinearEval.add_model_specific_args(parser)

    args = parser.parse_args()

    seed_everything(args.seed)

    args.status = 'Pretrain'

    run_name = time.strftime("%Y-%m-%d_%H-%M-%S")

    save_dir = os.path.join(os.getcwd(), 'checkpoints')

    pt_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/pretrain'))
    ft_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/finetune'))
    reps_model_dir = os.path.join(save_dir, ("BYOL_" + run_name + '/reps'))

    os.makedirs(pt_model_dir, exist_ok=True)
    os.makedirs(ft_model_dir, exist_ok=True)
    os.makedirs(reps_model_dir, exist_ok=True)

    # Get DataModule
    dm, ft_dm, args = get_dm(args)

    neptune_logger = NeptuneLogger(
        offline_mode=args.offline_log,
        api_key=None,
        project_name=args.project_name,
        experiment_name='Testing',  # Optional,
        params=vars(args),  # Optional,
        tags=["Test", args.tag],  # Optional,
        upload_source_files=['src/*.py'],
        close_after_fit=False)

    pt_model_checkpoint = pl.callbacks.ModelCheckpoint(filepath=pt_model_dir,
                                                       save_top_k=1,
                                                       monitor='loss')

    if args.accelerator == 'ddp' or args.accelerator == 'ddp2':
        replace_sampler = True  # False
        if args.accelerator == 'ddp':
            args.effective_bsz = args.batch_size * args.num_nodes * args.gpus

        elif args.accelerator == 'ddp2':
            args.effective_bsz = args.batch_size * args.num_nodes
    else:
        replace_sampler = True
        args.effective_bsz = args.batch_size

    # Define trainer
    trainer = pl.Trainer.from_argparse_args(
        args,
        max_epochs=args.max_epochs,
        logger=neptune_logger,
        callbacks=[
            PTPrintingCallback(pt_model_dir, args),
            CheckpointSave(pt_model_dir)
        ],
        deterministic=True,
        fast_dev_run=False,
        sync_batchnorm=True,
        checkpoint_callback=False,
        replace_sampler_ddp=replace_sampler,
        resume_from_checkpoint=args.resume_ckpt,
        progress_bar_refresh_rate=args.print_freq,
        check_val_every_n_epoch=args.val_every_n)

    # Define model
    model = BYOL(**args.__dict__)

    # Fit
    trainer.fit(model, dm)

    # time.sleep(15)

    if trainer.local_rank == 0:
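        # only the rank-0 process records the checkpoint path, optionally uploads it,
        # and writes log_files.txt for the later fine-tuning stage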

        print("os.listdir(pt_model_dir) :{}".format(os.listdir(pt_model_dir)))

        checkpoint_path = os.path.join(pt_model_dir,
                                       os.listdir(pt_model_dir)[-1])

        if args.save_checkpoint:

            neptune_logger.experiment.log_artifact(
                os.path.join(pt_model_dir,
                             os.listdir(pt_model_dir)[-1]),
                os.path.join('pretrain/',
                             os.listdir(pt_model_dir)[-1]))

        log_files = [pt_model_dir, ft_model_dir, checkpoint_path]

        save_log_file = os.path.join(os.getcwd(), 'log_files.txt')

        np.savetxt(save_log_file, log_files, delimiter=" ", fmt="%s")

    neptune_logger.experiment.stop()
Example 26

def main(parser):
    parser.add_argument('-m', '--model', type=str, default='PredNet')
    parser.add_argument('-d',
                        '--dataset',
                        type=str,
                        default='SchapiroResnetEmbeddingDataset')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--ipy', action='store_true')
    parser.add_argument('--no_graphs', action='store_true')
    parser.add_argument('--no_test', action='store_true')
    parser.add_argument('--user', type=str, default='aprashedahmed')
    parser.add_argument('-p', '--project', type=str, default='sandbox')
    parser.add_argument('-t', '--tags', nargs='+')
    parser.add_argument('--no_checkpoints', action='store_true')
    parser.add_argument('--offline_mode', action='store_true')
    parser.add_argument('--save_weights_online', action='store_true')

    parser.add_argument('--test_run', action='store_true')
    parser.add_argument('--test_checkpoints', action='store_true')
    parser.add_argument('--test_epochs', type=int, default=2)
    parser.add_argument('--test_n_paths', type=int, default=2)
    parser.add_argument('--test_online', action='store_true')
    parser.add_argument('--test_project', type=str, default='')
    parser.add_argument('-v', '--verbose', action='store_true')

    parser.add_argument('--n_workers', type=int, default=1)
    parser.add_argument('-e', '--epochs', type=int, default=50)
    parser.add_argument('--gpus', type=float, default=1)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('-s', '--seed', type=str, default='random')
    parser.add_argument('-b', '--batch_size', type=int, default=256 + 128)
    parser.add_argument('--n_val', type=int, default=1)
    parser.add_argument('--mapping', type=str, default='random')

    parser.add_argument('--dir_checkpoints',
                        type=str,
                        default=str(index.DIR_CHECKPOINTS))
    parser.add_argument('--checkpoint_period', type=float, default=1.0)
    parser.add_argument('--val_check_interval', type=float, default=1.0)
    parser.add_argument('--save_top_k', type=int, default=1)
    parser.add_argument('--early_stop_mode', type=str, default='min')
    parser.add_argument('--early_stop_patience', type=int, default=10)
    parser.add_argument('--early_stop_min_delta', type=float, default=0.001)

    parser.add_argument('--name', type=str, default='')
    parser.add_argument('--exp_prefix', type=str, default='')
    parser.add_argument('--exp_suffix', type=str, default='')

    # Get Model and Dataset specific args
    temp_args, _ = parser.parse_known_args()

    # Make sure this is correct
    if hasattr(datasets, temp_args.dataset):
        Dataset = getattr(datasets, temp_args.dataset)
        parser = Dataset.add_dataset_specific_args(parser)
    else:
        raise Exception(
            f'Invalid dataset "{temp_args.dataset}" passed. Check it is '
            f'importable: "from prevseg.datasets import {temp_args.dataset}"')

    # Get temp args now with dataset args added
    temp_args, _ = parser.parse_known_args()

    # Check this is correct as well
    if hasattr(models, temp_args.model):
        Model = getattr(models, temp_args.model)
        parser = Model.add_model_specific_args(parser)
    else:
        raise Exception(
            f'Invalid model "{temp_args.model}" passed. Check it is importable:'
            f' "from prevseg.models import {temp_args.model}"')

    # Parse the full argument set now that the model- and dataset-specific args
    # have been registered
    hparams = parser.parse_args()

    # If we are test-running, do a few things differently (scale down dataset,
    # send to sandbox project, etc.)
    if hparams.test_run:
        hparams.epochs = hparams.test_epochs
        hparams.n_paths = hparams.test_n_paths
        hparams.name = '_'.join(filter(None, ['test_run', hparams.exp_prefix]))
        hparams.project = hparams.test_project or 'sandbox'
        hparams.verbose = True
        hparams.ipdb = True
        hparams.no_checkpoints = not hparams.test_checkpoints
        hparams.offline_mode = not hparams.test_online

    # Seed is a string to allow for None/random as an input. Make it passable
    # to pl.seed_everything
    hparams.seed = None if 'None' in hparams.seed or hparams.seed == 'random' \
        else int(hparams.seed)
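    # For example (illustrative values): '--seed random' or '--seed None' keeps
    # the seed as None, so pl.seed_everything below draws a fresh random seed,
    # while '--seed 1234' gives a reproducible run with seed 1234.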

    # Get the hostname for book keeping
    hparams.hostname = socket.gethostname()

    # Set the seed
    hparams.seed = pl.seed_everything(hparams.seed)

    # Turn the string entry for mapping into a dict (that is also a str)
    if hparams.mapping == 'default':
        hparams.mapping = const.DEFAULT_MAPPING
    elif hparams.mapping == 'random':
        hparams.mapping = str(
            Dataset.random_mapping(n_pentagons=hparams.n_pentagons))
    else:
        raise ValueError(f'Invalid entry for mapping: {hparams.mapping}')

    # Set the validation path
    hparams.val_path = str(const.DEFAULT_PATH)

    # Create experiment name
    hparams.name = name_from_hparams(hparams)
    hparams.exp_name = name_from_hparams(hparams, short=True)
    if hparams.verbose:
        print(f'Beginning experiment: "{hparams.name}"')

    # Neptune Logger
    logger = NeptuneLogger(
        project_name=f"{hparams.user}/{hparams.project}",
        experiment_name=hparams.exp_name,
        params=vars(hparams),
        tags=hparams.tags,
        offline_mode=hparams.offline_mode,
        upload_source_files=[
            str(Path(__file__).resolve()),
            inspect.getfile(Model),
            inspect.getfile(Dataset)
        ],
        close_after_fit=False,
    )

    if not hparams.load_model:
        # Checkpoint Call back
        if hparams.no_checkpoints:
            checkpoint = False
            if hparams.verbose:
                print('\nNot saving any checkpoints.\n', flush=True)
        else:
            dir_checkpoints_experiment = (Path(hparams.dir_checkpoints) /
                                          hparams.name)
            if not dir_checkpoints_experiment.exists():
                dir_checkpoints_experiment.mkdir(parents=True)

            checkpoint = pl.callbacks.ModelCheckpoint(
                filepath=str(
                    dir_checkpoints_experiment /
                    (f'seed={hparams.seed}' + '_{epoch}_{val_loss:.3f}')),
                verbose=hparams.verbose,
                save_top_k=hparams.save_top_k,
                period=hparams.checkpoint_period,
            )

        # Early stopping callback
        early_stop_callback = pl.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=hparams.early_stop_min_delta,
            patience=hparams.early_stop_patience,
            verbose=hparams.verbose,
            mode=hparams.early_stop_mode,
        )

        # Define the trainer
        trainer = pl.Trainer(
            checkpoint_callback=checkpoint,
            max_epochs=hparams.epochs,
            logger=logger,
            val_check_interval=hparams.val_check_interval,
            gpus=hparams.gpus,
            early_stop_callback=early_stop_callback,
        )

        # Verbose messaging
        if hparams.verbose:
            now = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            print(f'\nCurrent time: {now}', flush=True)
            print('\nRunning with the following hparams:', flush=True)
            pprint(vars(hparams))

        # Define the model
        model = Model(hparams)
        if hparams.verbose:
            print(f'\nModel being used: \n{model}', flush=True)

        # Define the datamodule
        datamodule = datasets.DataModuleConstructor(hparams, Dataset)

        # Train the model
        print('\nBeginning training:', flush=True)
        now = datetime.datetime.now()
        trainer.fit(model, datamodule=datamodule)
        if hparams.verbose:
            elapsed = datetime.datetime.now() - now
            elapsed_fstr = time.strftime('%H:%M:%S',
                                         time.gmtime(elapsed.seconds))
            print(f'\nTraining completed! Time Elapsed: {elapsed_fstr}',
                  flush=True)

        # Record the best checkpoint if we kept track of it
        if not hparams.no_checkpoints:
            logger.log_hyperparams(
                {'best_checkpoint_path': checkpoint.best_model_path})
            # Save the weights online if desired
            if hparams.save_weights_online:
                if hparams.verbose:
                    print('\nSending weights to neptune servers...',
                          flush=True)
                logger.log_artifact(checkpoint.best_model_path)
                if hparams.verbose:
                    print('Finished.', flush=True)

    else:
        raise NotImplementedError
        # # Get all the experiments with the name hparams.name*
        # experiments = list(index.DIR_CHECKPOINTS.glob(
        #     f'{hparams.name}_{hparams.exp_name}*'))

        # # import pdb; pdb.set_trace()
        # if len(experiments) > 1:
        #     # Get the newest exp by v number
        #     experiment_newest = sorted(
        #         experiments,
        #         key=lambda path: int(path.stem.split('_')[-1][1:]))[-1]
        #     # Get the model with the best (lowest) val_loss
        # else:
        #     experiment_newest = experiments[0]
        # experiment_newest_best_val = sorted(
        #     experiment_newest.iterdir(),
        #     key=lambda path: float(
        #         path.stem.split('val_loss=')[-1].split('_')[0]))[0]

        # model = Model.load_from_checkpoint(str(experiment_newest_best_val))
        # model.logger = logger
        # ## LOOK AT THIS LATER
        # model.prepare_data(val_path=const.DEFAULT_PATH)

        # # Define the trainer
        # trainer = pl.Trainer(
        #     logger=model.logger,
        #     gpus=hparams.gpus,
        #     max_epochs=1,
        # )

    if not hparams.no_test:
        # Ensure we are in cuda for testing if specified
        if 'cuda' in hparams.device and torch.cuda.is_available():
            model.cuda()

        # Create the test data
        test_data = np.array(
            [datamodule.ds.array_data[n] for n in const.DEFAULT_PATH]).reshape(
                (1, len(const.DEFAULT_PATH), 2048))
        torch_data = torch.Tensor(test_data)

        # Get the model outputs
        outs = model.forward(torch_data, output_mode='eval')
        outs.update({'errors': model.forward(torch_data, output_mode='error')})

        # Visualize the test data
        figs = model.visualize(outs, borders=const.DEFAULT_BORDERS)
        if not hparams.no_graphs:
            for name, fig in figs.items():
                # Doing logger.log_image(...) doesn't work for some reason
                model.logger.log_image(name, fig)

    # Close the neptune logger
    logger.experiment.stop()
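
# name_from_hparams is not shown in this snippet. The function below is only a
# plausible minimal sketch (the choice and ordering of fields are assumptions):
def name_from_hparams(hparams, short=False):
    parts = [hparams.exp_prefix, hparams.model, hparams.dataset,
             'seed={}'.format(hparams.seed), hparams.exp_suffix]
    if short:
        # Shorter form used as the Neptune experiment name
        parts = parts[:3]
    return '_'.join(str(p) for p in parts if p)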
Esempio n. 27
0
weight_decay = parameters["weight_decay"]
pool_ratio = parameters["pool_ratio"]
nhid = parameters["nhid"]
filter_size = parameters["filter_size"]  # used below; assumed to be in parameters
epochs = parameters["epochs"]
LearningRateMonitor_Params = {'logging_interval': 'epoch'}

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: {}'.format(device))

model = LightningPAN(9, 1, nhid=nhid, ratio=pool_ratio, filter_size=filter_size)

lr_logger = LearningRateMonitor(**LearningRateMonitor_Params)
neptune_logger = NeptuneLogger(
    api_key="ANONYMOUS",  # public anonymous token; replace with your own key
    project_name='hvergnes/PAN',
    close_after_fit=False,
    params=parameters,  # your hyperparameters, immutable
    tags=['PAN', 'best_model'],  # tags
    upload_source_files=["parameters.json", "lightning_model.py"],
)

trainer = Trainer(
    max_epochs=epochs,
    logger=neptune_logger,
    callbacks=[lr_logger],
    # fast_dev_run=True,
)

trainer.fit(model)
trainer.test(model)

test_loader = model.test_dataloader()
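
# Because close_after_fit=False, the Neptune experiment is still open at this
# point, so extra results can be logged after trainer.test. The lines below are
# an assumed continuation, not part of the original snippet.
neptune_logger.experiment.log_metric('n_test_batches', len(test_loader))
neptune_logger.experiment.stop()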
Esempio n. 28
0
if __name__ == '__main__':
    try:
        import multiprocessing as mp

        __spec__ = None
        mp.set_start_method('spawn', force=True)
    except:
        pass

    matplotlib.use('Agg')

    all_tars = []

    neptune_logger = NeptuneLogger(
        api_key="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJiMDM3MjFkYy1jNTE3LTQ4NTAtOTFlNC00ZGY1NGM3Y2M4YmEifQ====",
        project_name="erelon39/Line-colorize")

    if torch.cuda.is_available():
        decods = my_decoders(128)
        model = siggraph17_L(128, pretrained_path="model_e0_batch_19000_gn.pt")
        for root, dirs, files in os.walk("/home/erelon39/sftp/erelon/df66f8bf-85ef-4dec-aa8f-464dd02ad15c"):
            for file in files:
                if file.endswith(
                        ".tar") and "out" not in root and "out" not in file and "trash" not in root.lower() and "trash" not in file.lower():
                    all_tars.append(os.path.join(root, file))

        dataset = wds.WebDataset(all_tars, length=float("inf")) \
            .decode(decods.my_decoder_GT).decode(decods.my_decoder_BW).to_tuple("gt.jpg", "train.jpg", "__key__",
                                                                                handler=dummy_func).batched(16)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=4)

class LitModel(pl.LightningModule):
    # ... (earlier part of the model definition does not appear in this excerpt)

    def training_step(self, batch, batch_idx):
        ...  # loss computation omitted in the excerpt
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=PARAMS['learning_rate'])

# DataLoader
train_loader = DataLoader(MNIST(os.getcwd(), download=True, transform=transforms.ToTensor()),
                          batch_size=PARAMS['batch_size'])

# Step 4: Create NeptuneLogger

from pytorch_lightning.loggers.neptune import NeptuneLogger

neptune_logger = NeptuneLogger(
    api_key="ANONYMOUS",
    project_name="shared/pytorch-lightning-integration",
    params=PARAMS)

# Step 5: Pass NeptuneLogger to the Trainer

trainer = pl.Trainer(max_epochs=PARAMS['max_epochs'],
                     logger=neptune_logger)

# Step 6: Run experiment

model = LitModel()

trainer.fit(model, train_loader)

# Step 7: Stop Neptune logger at the end

neptune_logger.experiment.stop()
                              sampler=test_sampler)

    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    trainer = Trainer(gpus=hparams.gpus, checkpoint_callback=True)
    trainer.test(model, test_dataloaders=test_dataset)
    # Save checkpoints folder
    neptune_logger.experiment.log_artifact(CHECKPOINTS_DIR)
    # You can stop the experiment
    neptune_logger.experiment.stop()


# -------------------------------------------------------------------------------------------------------------------
CHECKPOINTS_DIR = '/home/rachneet/thesis_results/mixed_impairments_cnn/'
neptune_logger = NeptuneLogger(
    api_key=os.environ.get("NEPTUNE_API_KEY"),
    project_name="rachneet/sandbox",
    experiment_name="mixed_impairments_cnn",  # change this for new runs
)
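
# The API token is read from the environment above; export it before running,
# for example (shell):
#   export NEPTUNE_API_KEY="<your token>"
# or set it from Python before the logger is created (illustrative only):
#   os.environ["NEPTUNE_API_KEY"] = "<your token>"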

# ---------------------------------------MAIN FUNCTION TRAINER-------------------------------------------------------


def main(hparams):

    model = LightningCNN(hparams)
    # exp = Experiment(save_dir=os.getcwd())
    if not os.path.exists(CHECKPOINTS_DIR):
        os.makedirs(CHECKPOINTS_DIR)
    model_checkpoint = pl.callbacks.ModelCheckpoint(CHECKPOINTS_DIR)
    early_stop_callback = pl.callbacks.EarlyStopping(monitor='val_loss',
                                                     min_delta=0.00,