def train(model_path, _log, _run, cutoff=0.1, idf_path=None):
    """Train a naive Bayes summarizer."""
    train_docs = list(read_train_jsonl())
    idf_table = None if idf_path is None else read_idf()
    model = NaiveBayesSummarizer.train(train_docs, cutoff=cutoff, idf_table=idf_table)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
def train(model_path, _log, _run, gamma_word=0.1, gamma_init=0.1, gamma_trans=0.1,
          tf_path=None):
    """Train an HMM summarizer."""
    train_docs = list(read_train_jsonl())
    tf_table = None if tf_path is None else read_tf()
    model = HMMSummarizer.train(
        train_docs, gamma_word=gamma_word, gamma_init=gamma_init,
        gamma_trans=gamma_trans, tf_table=tf_table)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
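# Usage sketch (illustrative addition, not part of the original scripts): both
# trainers above serialize the model with dump() into a plain text file, so
# reading one back is the mirror image. This assumes a `load` function that
# inverts the `dump` used above and a `summarize(doc)` method on the
# summarizer; both names are assumptions for this sketch.
def _load_model_sketch(model_path):
    with open(model_path) as f:
        return load(f.read())
# e.g.: model = _load_model_sketch(model_path); sent_ids = model.summarize(doc)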
def train(
    seed,
    embedding_dim,
    lstm_hidden_size,
    attention_size,
    embedding_path,
    batch_size,
    max_doc_len,
    max_sen_len,
    grad_clip_val,
    learning_rate,
    resume_path,
):
    pl.utilities.seed.seed_everything(seed)
    dm = IndosumDataModule(
        read_train_jsonl(),
        read_dev_jsonl(),
        read_test_jsonl(),
        embedding_path,
        max_doc_len,
        max_sen_len,
        batch_size,
    )
    hssas = HSSAS(
        dm.vocab,
        embedding_dim,
        lstm_hidden_size,
        attention_size,
        max_doc_len,
        list(read_dev_jsonl()),
        learning_rate,
    )
    checkpoint_callback = ModelCheckpoint(monitor="val_loss")
    trainer = pl.Trainer(
        gpus=1,
        callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
        checkpoint_callback=checkpoint_callback,
        gradient_clip_val=grad_clip_val,
        resume_from_checkpoint=resume_path,
        max_epochs=5000,
    )
    # Anomaly detection helps localize NaN/Inf gradients but slows training;
    # disable it once training is stable.
    torch.autograd.set_detect_anomaly(True)
    trainer.fit(hssas, dm)
    # Evaluate with the best (lowest val_loss) checkpoint found during training.
    evaluate(model_path=checkpoint_callback.best_model_path, data_module=dm)
def evaluate(
    seed,
    model_path,
    delete_temps,
    embedding_path,
    batch_size,
    max_doc_len,
    max_sen_len,
    _log,
    _run,
    data_module=None,
):
    pl.utilities.seed.seed_everything(seed)
    hssas = HSSAS.load_from_checkpoint(model_path)
    docs = read_test_jsonl()
    if data_module is None:
        data_module = IndosumDataModule(
            read_train_jsonl(),
            read_dev_jsonl(),
            read_test_jsonl(),
            embedding_path,
            max_doc_len,
            max_sen_len,
            batch_size,
        )
    summaries = (
        summary
        for x, _, doc_lens, batch_sent_lens in data_module.test_dataloader()
        for summary in hssas(x, doc_lens, batch_sent_lens)
    )
    # The original filter on `docs` iterated over the empty slice
    # `d.sentences[:0]`, so it never excluded anything; pass the documents
    # through unchanged.
    score = eval_summaries(summaries, docs, logger=_log, delete_temps=delete_temps)
    for name, value in score.items():
        _run.log_scalar(name, value)
    return score["ROUGE-1-F"]
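# Why the filter was dropped above (illustrative check, safe to delete): the
# original predicate built its list from `d.sentences[:0]`, which is always an
# empty slice, so `1 not in [...]` was vacuously True for every document.
def _empty_slice_demo():
    sentences = ['s0', 's1', 's2']
    assert sentences[:0] == []                   # a [:0] slice is always empty
    assert 1 not in [1 for _ in sentences[:0]]   # so the filter passed every doc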
def train(model_path, _log, _run, stopwords_path=None, train_algo='iis', cutoff=4,
          sigma=0., trim_length=10):
    """Train a maximum entropy summarizer."""
    train_docs = list(read_train_jsonl())
    stopwords = None if stopwords_path is None else read_stopwords()
    model = MaxentSummarizer.train(
        train_docs, stopwords=stopwords, algorithm=train_algo, cutoff=cutoff,
        sigma=sigma, trim_length=trim_length)
    _log.info('Saving model to %s', model_path)
    with open(model_path, 'w') as f:
        print(dump(model), file=f)
    if SAVE_FILES:
        _run.add_artifact(model_path)
def test(
    seed,
    model_path,
    embedding_path,
    batch_size,
    embedding_dim,
    lstm_hidden_size,
    attention_size,
    delete_temps,
    max_doc_len,
    max_sen_len,
    grad_clip_val,
    learning_rate,
    _log,
):
    pl.utilities.seed.seed_everything(seed)
    hssas = HSSAS.load_from_checkpoint(model_path)
    dm = IndosumDataModule(
        read_train_jsonl(),
        read_dev_jsonl(),
        read_test_jsonl(),
        embedding_path,
        max_doc_len,
        max_sen_len,
        4,  # note: the batch_size argument is ignored here; the loader is fixed at 4
    )
    summaries = (
        summary
        for x, _, doc_lens, batch_sent_lens in dm.test_dataloader()
        for summary in hssas(x, doc_lens, batch_sent_lens)
    )
    # Evaluate the generator directly: the original `for s in summaries:
    # return ...` consumed the first summary before evaluation, misaligning the
    # summaries with the reference documents.
    return eval_summaries(
        summaries,
        list(read_test_jsonl())[1509:1511],  # hardcoded slice: evaluates only two test documents
        logger=_log,
        delete_temps=delete_temps,
    )
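# Why the `for s in summaries: return ...` pattern was removed above
# (illustrative, safe to delete): iterating a generator consumes it, so the
# eval call inside the loop received a generator that had already lost its
# first summary.
def _generator_consumption_demo():
    gen = (i for i in range(3))
    first = next(gen)             # advancing the generator drops an item...
    assert first == 0
    assert list(gen) == [1, 2]    # ...so downstream consumers never see it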