def train(dataset_name: str,
          model_name: str,
          expt_dir: str,
          data_folder: str,
          num_workers: int = 0,
          is_test: bool = False,
          resume_from_checkpoint: str = None):
    seed_everything(SEED)
    dataset_main_folder = data_folder
    vocab = Vocabulary.load(join(dataset_main_folder, "vocabulary.pkl"))

    if model_name == "code2seq":
        config_function = get_code2seq_test_config if is_test else get_code2seq_default_config
        config = config_function(dataset_main_folder)
        model = Code2Seq(config, vocab, num_workers)
        # cast model weights to half precision (fp16) to reduce memory usage
        model.half()
    #elif model_name == "code2class":
    #	config_function = get_code2class_test_config if is_test else get_code2class_default_config
    #	config = config_function(dataset_main_folder)
    #	model = Code2Class(config, vocab, num_workers)
    else:
        raise ValueError(f"Model {model_name} is not supported")

    # define logger
    wandb_logger = WandbLogger(project=f"{model_name}-{dataset_name}",
                               log_model=True,
                               offline=True)
    wandb_logger.watch(model)
    # define model checkpoint callback
    model_checkpoint_callback = ModelCheckpoint(
        filepath=join(expt_dir, "{epoch:02d}-{val_loss:.4f}"),
        period=config.hyperparams.save_every_epoch,
        save_top_k=3,
    )
    # define early stopping callback
    early_stopping_callback = EarlyStopping(
        patience=config.hyperparams.patience, verbose=True, mode="min")
    # use gpu if it exists
    gpu = 1 if torch.cuda.is_available() else None
    # define learning rate logger
    lr_logger = LearningRateLogger()
    trainer = Trainer(
        max_epochs=20,
        gradient_clip_val=config.hyperparams.clip_norm,
        deterministic=True,
        check_val_every_n_epoch=config.hyperparams.val_every_epoch,
        row_log_interval=config.hyperparams.log_every_epoch,
        logger=wandb_logger,
        checkpoint_callback=model_checkpoint_callback,
        early_stop_callback=early_stopping_callback,
        resume_from_checkpoint=resume_from_checkpoint,
        gpus=gpu,
        callbacks=[lr_logger],
        reload_dataloaders_every_epoch=True,
    )
    trainer.fit(model)
    trainer.save_checkpoint(join(expt_dir, "Latest.ckpt"))

    trainer.test()
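
A minimal sketch of how train() might be invoked from the command line; the flag names and the argparse wiring below are assumptions for illustration, not part of the original script.

if __name__ == "__main__":
    from argparse import ArgumentParser

    # Hypothetical CLI wrapper around train(); the flag names here are illustrative only.
    parser = ArgumentParser()
    parser.add_argument("dataset_name", type=str)
    parser.add_argument("model_name", type=str)
    parser.add_argument("expt_dir", type=str)
    parser.add_argument("data_folder", type=str)
    parser.add_argument("--num-workers", type=int, default=0)
    parser.add_argument("--test", action="store_true", help="use the smaller test config")
    parser.add_argument("--resume", type=str, default=None, help="checkpoint to resume from")
    args = parser.parse_args()

    train(args.dataset_name, args.model_name, args.expt_dir, args.data_folder,
          num_workers=args.num_workers, is_test=args.test,
          resume_from_checkpoint=args.resume)
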
def preprocess(problem: str, data: str, is_vocab_collected: bool, n_jobs: int):
    # Collect vocabulary from train holdout if needed
    if problem not in _config_switcher:
        raise ValueError(f"Unknown problem ({problem}) passed")
    config_function = _config_switcher[problem]
    config = config_function(data)

    vocab_path = path.join(DATA_FOLDER, config.dataset_name, "vocabulary.pkl")
    if path.exists(vocab_path):
        vocab = Vocabulary.load(vocab_path)
    else:
        vocab = collect_vocabulary(config) if is_vocab_collected else convert_vocabulary(config)
        vocab.dump(vocab_path)
    for holdout in ["train", "val", "test"]:
        convert_holdout(holdout, vocab, config, n_jobs)
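
The _config_switcher mapping referenced above is not shown in this snippet; the sketch below is a hypothetical shape for it, reusing the config factories named in train(). The keys are placeholders rather than values from the source.

# Hypothetical lookup assumed by preprocess(); keys and factories are illustrative.
_config_switcher = {
    "code2seq": get_code2seq_default_config,
    "code2seq-test": get_code2seq_test_config,
}
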
def preprocess(problem: str, data: str, is_vocab_collected: bool, n_jobs: int,
               data_folder: str, just_test: bool, test_name: str):
    # Collect vocabulary from train holdout if needed
    if problem not in _config_switcher:
        raise ValueError(f"Unknown problem ({problem}) passed")
    config_function = _config_switcher[problem]
    config = config_function(data)

    vocab_path = path.join(data_folder, "vocabulary.pkl")
    if path.exists(vocab_path):
        vocab = Vocabulary.load(vocab_path)
    else:
        if is_vocab_collected:
            vocab = collect_vocabulary(config, data_folder)
        else:
            vocab = convert_vocabulary(config, data_folder)
        vocab.dump(vocab_path)

    split = ["train", "val", "test"]
    if just_test:
        split = ["test"]
    for holdout in split:
        convert_holdout(holdout, vocab, config, n_jobs, data_folder, test_name)
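
A hedged usage sketch for this extended preprocess() variant; the problem name, paths, and job count are illustrative placeholders, not values taken from the original project.

if __name__ == "__main__":
    # Illustrative invocation only; every argument value here is an assumption.
    preprocess(
        problem="code2seq",
        data="java-small",
        is_vocab_collected=True,
        n_jobs=4,
        data_folder="data/java-small",
        just_test=False,
        test_name="test",
    )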