Code Example #1
import torch
from optuna.integration import PyTorchLightningPruningCallback
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger

# get_trial_params, get_trainer_params and LitLSTM are project-specific helpers.


def objective(trial, args):
    params = get_trial_params(trial)
    params['hidden_size'] = 2**params['hidden_size']
    params['acc_grads'] = 2**params['acc_grads']

    early_stopper = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, mode='min')
    callbacks = [early_stopper, PyTorchLightningPruningCallback(
        trial, monitor="val_loss")]

    if args.model_type == 'attnlstm':
        params['attn_width'] = trial.suggest_int("attn_width", 3, 64)

    # Evaluate across multiple train/validation splits and average the val loss.
    if 'split' in args.val_mode:
        dataset_hour = args.data.split('_')[-1]
        logger = MLFlowLogger(experiment_name=f'Optuna_{dataset_hour}h_{args.val_mode[-1]}_split')
        print(f'Optuna_{dataset_hour}_{args.val_mode[-1]}_split')
        val_losses = []
        for _split_id in range(int(args.val_mode[-1])):
            print(f"Split {_split_id} Trial {trial.number}")
            args.__dict__["split_id"] = 0
            for key in params:
                args.__dict__[str(key)] = params.get(key)
            model = LitLSTM(args)
            trainer = Trainer(
                logger=logger,
                callbacks=callbacks,
                **get_trainer_params(args),
            )
            logger.log_hyperparams(model.args)
            args.__dict__["val_mode"] = args.val_mode
            args.__dict__["split_id"] = _split_id
            model._get_data(args, data_mode='init')
            trainer.fit(model)
            trainer.test(model, test_dataloaders=model.test_dataloader())
            # logger.finalize()
            val_losses.append(model.metrics['val_loss'])

        # log mean val loss for later retrieval of best model
        mean_val_loss = torch.stack(val_losses).mean()
        logger.log_metrics({"mean_val_loss": mean_val_loss}, step=0)
        logger.finalize()
        return mean_val_loss

    # Single run on the full training set.
    elif args.val_mode == 'full':
        logger = MLFlowLogger(experiment_name='Optuna_full')
        for key in params:
            args.__dict__[str(key)] = params.get(key)
        model = LitLSTM(args)
        trainer = Trainer(
            logger=logger,
            callbacks=callbacks,
            **get_trainer_params(args),
        )
        logger.log_hyperparams(model.args)
        trainer.fit(model)
        trainer.test(model, test_dataloaders=model.test_dataloader())
        model.save_preds_and_targets(to_disk=True)
        logger.finalize()
        return model.metrics['val_loss']
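
The objective above is driven by an Optuna study rather than called directly. A minimal driver sketch, assuming Optuna is installed and that args comes from the surrounding CLI; the function name run_study and the n_trials value are illustrative, not part of the original code:

from functools import partial

import optuna


def run_study(args, n_trials=50):
    # Minimize the (mean) validation loss returned by objective();
    # the pruner cooperates with PyTorchLightningPruningCallback above.
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(partial(objective, args=args), n_trials=n_trials)
    return study.best_trial.params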
Code Example #2
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger

# get_trainer_params and LitLSTM are project-specific helpers.


def train(args):
    seed_everything(args.seed)
    model = LitLSTM(args)

    logger = MLFlowLogger(experiment_name='Default')

    early_stop_callback = EarlyStopping(
        monitor='val_loss', min_delta=0.005, patience=3, verbose=args.verbose, mode='min') if args.early else None

    trainer = Trainer(
        log_gpu_memory='all' if args.verbose else None,
        track_grad_norm=2 if args.verbose else -1,
        logger=logger,
        weights_summary='full',
        callbacks=[early_stop_callback] if early_stop_callback is not None else [],
        accumulate_grad_batches=args.acc_grads,
        profiler=args.verbose,
        **get_trainer_params(args),
    )

    logger.log_hyperparams(model.args)
    trainer.fit(model)
    trainer.test(model, test_dataloaders=model.test_dataloader())
    model.save_preds_and_targets(to_disk=True)
    logger.finalize()

    return logger.run_id
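
train() returns the MLflow run id of the finished run, so a caller can look the run up afterwards. A small usage sketch, assuming argparse-style args; the Namespace fields shown are illustrative, and LitLSTM/get_trainer_params will read additional ones:

from argparse import Namespace

from mlflow.tracking import MlflowClient

# Hypothetical arguments; the real script presumably builds these with argparse.
args = Namespace(seed=42, early=True, verbose=False, acc_grads=1)
run_id = train(args)

# Fetch the metrics logged for this run, e.g. the final val_loss.
client = MlflowClient()
print(client.get_run(run_id).data.metrics)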
Code Example #3
import logging

import mlflow
import torch
from hydra.utils import instantiate
from omegaconf import DictConfig
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger
from pytorch_lightning.loggers import MLFlowLogger

# ROOT_PATH, MLFLOW_URI, set_seed, calculate_hash and CustomModelCheckpoint
# are project-specific.
logger = logging.getLogger(__name__)


def main(args: DictConfig):
    # Distributed training
    torch.multiprocessing.set_sharing_strategy('file_system')
    if str(args.exp.gpus) == '-1':
        args.exp.gpus = torch.cuda.device_count()

    # Secondary data args
    args.data.setting = 'in-topic' if args.data.test_id is None else 'cross-topic'
    dataset_name = args.data.path.split('/')[1]
    args.data.path = f'{ROOT_PATH}/{args.data.path}'

    # MlFlow Logging
    if args.exp.logging:
        experiment_name = f'{dataset_name}/{args.setting}-{args.data.setting}/{args.exp.task_name}'
        mlf_logger = MLFlowLogger(experiment_name=experiment_name,
                                  tracking_uri=MLFLOW_URI)
        experiment = mlf_logger._mlflow_client.get_experiment_by_name(
            experiment_name)
        if experiment is not None:
            experiment_id = experiment.experiment_id

            if args.exp.check_exisisting_hash:
                args.hash = calculate_hash(args)
                existing_runs = mlf_logger._mlflow_client.search_runs(
                    filter_string=f"params.hash = '{args.hash}'",
                    run_view_type=mlflow.tracking.client.ViewType.ACTIVE_ONLY,
                    experiment_ids=[experiment_id])
                if len(existing_runs) > 0:
                    logger.info('Skipping existing run.')
                    return
                else:
                    logger.info('No runs found - performing one.')

    #     cpnt_path = f'{ROOT_PATH}/mlruns/{experiment_id}/{run_id}/artifacts'
    # else:
    #     cpnt_path = None

    # Load pretrained model and tokenizer
    set_seed(args)
    model = instantiate(args.lightning_module, args=args)
    logger.info(f'Run arguments: \n{args.pretty()}')

    # Early stopping & Checkpointing
    early_stop_callback = EarlyStopping(
        min_delta=0.00,
        patience=args.exp.early_stopping_patience,
        verbose=False,
        mode='min')
    checkpoint_callback = CustomModelCheckpoint(
        model=model,
        verbose=True,
        mode='min',
        save_top_k=1,
        period=0 if args.exp.val_check_interval < 1.0 else 1)
    lr_logging_callback = LearningRateLogger(logging_interval='epoch')

    # Training
    trainer = Trainer(
        gpus=eval(str(args.exp.gpus)),
        logger=mlf_logger if args.exp.logging else None,
        max_epochs=args.exp.max_epochs,
        gradient_clip_val=args.optimizer.max_grad_norm,
        early_stop_callback=early_stop_callback,
        val_check_interval=args.exp.val_check_interval,
        checkpoint_callback=checkpoint_callback
        if args.exp.checkpoint else None,
        accumulate_grad_batches=args.exp.gradient_accumulation_steps,
        auto_lr_find=args.optimizer.auto_lr_find,
        precision=args.exp.precision,
        distributed_backend='dp',
        callbacks=[lr_logging_callback])
    trainer.fit(model)
    trainer.test(model)

    # Cleaning cache
    torch.cuda.empty_cache()

    # Ending the run
    if args.exp.logging:
        mlf_logger.finalize()
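
main() expects a Hydra-composed DictConfig, so the script is normally wired up through a hydra.main entry point. A minimal sketch, assuming the project ships a config directory; the path and config name are placeholders, and the decorator arguments differ between Hydra releases (older versions took a single config_path pointing at a YAML file):

import hydra
from omegaconf import DictConfig


@hydra.main(config_path='conf', config_name='config')  # placeholder paths
def entry(args: DictConfig):
    main(args)


if __name__ == '__main__':
    entry()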