Example #1
def train(model_path, _log, _run, cutoff=0.1, idf_path=None):
    """Train a naive Bayes summarizer."""
    train_docs = list(read_train_jsonl())
    idf_table = None if idf_path is None else read_idf()
    model = NaiveBayesSummarizer.train(train_docs,
                                       cutoff=cutoff,
                                       idf_table=idf_table)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
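Here `_log` and `_run` are arguments that Sacred injects into captured functions, and `SAVE_FILES` is a module-level flag from the surrounding project. A minimal sketch of how such a command might be wired into an experiment; the experiment name and config defaults below are illustrative assumptions, not the project's actual setup:

from sacred import Experiment

ex = Experiment('summarization')

@ex.config
def config():
    model_path = 'model.nb'   # illustrative default
    cutoff = 0.1              # feature-count cutoff for the summarizer
    idf_path = None           # optional path to a precomputed IDF table

@ex.automain
def train(model_path, _log, _run, cutoff=0.1, idf_path=None):
    ...  # body as in Example #1

Sacred fills `model_path`, `cutoff`, and `idf_path` from the config (overridable on the command line, e.g. `with cutoff=0.2`) and passes `_log` and `_run` automatically.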
Example #2
def train(model_path, _log, _run, gamma_word=0.1, gamma_init=0.1, gamma_trans=0.1,
          tf_path=None):
    """Train an HMM summarizer."""
    train_docs = list(read_train_jsonl())
    tf_table = None if tf_path is None else read_tf()
    model = HMMSummarizer.train(
        train_docs, gamma_word=gamma_word, gamma_init=gamma_init, gamma_trans=gamma_trans,
        tf_table=tf_table)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
Example #3
def train(
    seed,
    embedding_dim,
    lstm_hidden_size,
    attention_size,
    embedding_path,
    batch_size,
    max_doc_len,
    max_sen_len,
    grad_clip_val,
    learning_rate,
    resume_path,
):
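    """Train the HSSAS summarizer, then evaluate its best checkpoint."""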
    pl.utilities.seed.seed_everything(seed)
    dm = IndosumDataModule(
        read_train_jsonl(),
        read_dev_jsonl(),
        read_test_jsonl(),
        embedding_path,
        max_doc_len,
        max_sen_len,
        batch_size,
    )
    hssas = HSSAS(
        dm.vocab,
        embedding_dim,
        lstm_hidden_size,
        attention_size,
        max_doc_len,
        list(read_dev_jsonl()),
        learning_rate,
    )

    checkpoint_callback = ModelCheckpoint(monitor="val_loss")
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
        checkpoint_callback=checkpoint_callback,
        gradient_clip_val=grad_clip_val,
        resume_from_checkpoint=resume_path,
        max_epochs=5000,
    )
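    # anomaly detection surfaces NaN/inf gradients early, at some speed cost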
    torch.autograd.set_detect_anomaly(True)
    trainer.fit(hssas, dm)
    evaluate(model_path=checkpoint_callback.best_model_path, data_module=dm)
Example #4
def evaluate(
    seed,
    model_path,
    delete_temps,
    embedding_path,
    batch_size,
    max_doc_len,
    max_sen_len,
    _log,
    _run,
    data_module=None,
):
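    """Evaluate a trained HSSAS checkpoint with ROUGE and log the scores."""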
    pl.utilities.seed.seed_everything(seed)
    hssas = HSSAS.load_from_checkpoint(model_path)

    docs = read_test_jsonl()
    # docs = (doc for doc in docs if any([True for sent in doc.sentences if sent.label and len(sent.words) >= 70]))
    if data_module is None:
        data_module = IndosumDataModule(
            read_train_jsonl(),
            read_dev_jsonl(),
            read_test_jsonl(),
            embedding_path,
            max_doc_len,
            max_sen_len,
            batch_size,
        )

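    # decode one summary per test document, batch by batch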
    summaries = (
        summary
        for x, y, doc_lens, batch_sent_lens in data_module.test_dataloader()
        for summary in hssas(x, doc_lens, batch_sent_lens)
    )

    score = eval_summaries(
        summaries,
        docs,
        logger=_log,
        delete_temps=delete_temps,
    )
    for name, value in score.items():
        _run.log_scalar(name, value)
    return score["ROUGE-1-F"]
Example #5
def train(model_path,
          _log,
          _run,
          stopwords_path=None,
          train_algo='iis',
          cutoff=4,
          sigma=0.,
          trim_length=10):
    """Train a maximum entropy summarizer."""
    train_docs = list(read_train_jsonl())
    stopwords = None if stopwords_path is None else read_stopwords()
    model = MaxentSummarizer.train(train_docs,
                                   stopwords=stopwords,
                                   algorithm=train_algo,
                                   cutoff=cutoff,
                                   sigma=sigma,
                                   trim_length=trim_length)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
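Each classical `train` command above serializes its model to text via `dump`; the matching loader is not shown in these examples. A minimal sketch, assuming a `load` helper that inverts the project's `dump` (the name `load` is an assumption):

def load_model(model_path, _log):
    """Load a summarizer saved by one of the train commands above."""
    _log.info('Loading model from %s', model_path)
    with open(model_path) as f:
        return load(f.read())  # `load` assumed to invert `dump`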
Example #6
def test(
    seed,
    model_path,
    embedding_path,
    batch_size,
    embedding_dim,
    lstm_hidden_size,
    attention_size,
    delete_temps,
    max_doc_len,
    max_sen_len,
    grad_clip_val,
    learning_rate,
    _log,
):
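    """Run a trained HSSAS checkpoint over the test set and report ROUGE."""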
    pl.utilities.seed.seed_everything(seed)
    hssas = HSSAS.load_from_checkpoint(model_path)
    dm = IndosumDataModule(
        read_train_jsonl(), read_dev_jsonl(), read_test_jsonl(), embedding_path, max_doc_len, max_sen_len, batch_size
    )
 
    summaries = (
        summary
        for x, _, doc_lens, batch_sent_lens in dm.test_dataloader()
        for summary in hssas(x, doc_lens, batch_sent_lens)
    )

    eval_summaries(
        summaries,
        list(read_test_jsonl())[1509:1511],
        logger=_log,
        delete_temps=delete_temps,
    )