def main(bert_model, conll_path, simple_connectives, sense_lvl):
    """Train an ``ImplicitArgumentExtractor`` and evaluate it on TEST and BLIND.

    Loads the dev, test, blind-test and train splits found under
    ``conll_path``, fits the extractor on train (with dev passed as the second
    argument to ``fit``), saves it to ``models/nn`` and prints evaluation
    results for the thresholds 0.7 and 0.95.

    Args:
        bert_model: Forwarded to the loader as ``bert_model``; also part of the
            cache file name of every split.
        conll_path: Base path under which ``en.dev``, ``en.test``,
            ``en.blind-test`` and ``en.train`` are looked up.
        simple_connectives: Forwarded unchanged to ``load_bert_conll_dataset``.
        sense_lvl: Forwarded to ``load_bert_conll_dataset`` as ``sense_level``.
    """
    logger = init_logger()

    def load_split(split):
        # Every split uses identical loader options; only the split name (and
        # therefore the data path and the cache file name) differs.
        return load_bert_conll_dataset(
            os.path.join(conll_path, split),
            simple_connectives=simple_connectives,
            cache_dir=os.path.join(conll_path,
                                   f'{split}.{bert_model}.joblib'),
            bert_model=bert_model,
            sense_level=sense_lvl)

    logger.info('Load data')
    docs_val = load_split('en.dev')
    docs_test = load_split('en.test')
    docs_blind = load_split('en.blind-test')
    docs_train = load_split('en.train')

    logger.info('Init model')
    clf = ImplicitArgumentExtractor(
        window_length=100,
        input_dim=docs_val[0].get_embedding_dim(),
        hidden_dim=128,
        rnn_dim=256)
    clf.batch_size = 256

    logger.info('Train model')
    clf.fit(docs_train, docs_val)
    clf.save('models/nn')

    # NOTE(review): the label says TEST, but the split scored here is the dev
    # split (docs_val). The log text is kept unchanged.
    logger.info('Evaluation on TEST')
    clf.score(docs_val)

    gold_types = ('Implicit', 'EntRel')
    for title, docs_eval in [('TEST', docs_test), ('BLIND', docs_blind)]:
        logger.info(f'Evaluate parser {title}')
        preds = [d.with_relations(clf.parse(d)) for d in docs_eval]
        # The gold side does not depend on the threshold, so it is built once
        # per evaluation set instead of once per threshold.
        gold = [
            d.with_relations([r for r in d.relations if r.type in gold_types])
            for d in docs_eval
        ]
        for threshold in [0.7, 0.95]:
            res = evaluate_docs(gold, preds, threshold=threshold)
            print_results(res, title=f'{title}-{threshold}')
def main(conll_path):
    """Fit a ``ConnectiveSenseClassifier`` and report its scores.

    Loads the ``en.dev`` and ``en.train`` splits found under ``conll_path``
    (using the ``bert-base-cased`` cache files), fits the classifier, scores
    it on both splits, prints the parse of the first dev document and finally
    prints the evaluation of the predictions for all dev documents.

    Args:
        conll_path: Base path under which the splits and their cache files
            are looked up.
    """
    logger = init_logger()

    dev_path = os.path.join(conll_path, 'en.dev')
    dev_cache = os.path.join(conll_path, 'en.dev.bert-base-cased.joblib')
    docs_val = load_bert_conll_dataset(dev_path, cache_dir=dev_cache)

    train_path = os.path.join(conll_path, 'en.train')
    train_cache = os.path.join(conll_path, 'en.train.bert-base-cased.joblib')
    docs_train = load_bert_conll_dataset(train_path, cache_dir=train_cache)

    # The input dimension is read from the first dev document.
    embedding_dim = docs_val[0].get_embedding_dim()
    clf = ConnectiveSenseClassifier(input_dim=embedding_dim, used_context=2)

    logger.info('Train model')
    clf.fit(docs_train, docs_val)

    logger.info('Evaluation on TRAIN')
    clf.score(docs_train)
    # The split scored under the 'TEST' label is the dev split (docs_val).
    logger.info('Evaluation on TEST')
    clf.score(docs_val)

    # Show the parser output for a single document, starting from an empty
    # relation list.
    first_doc_parse = clf.parse(docs_val[0], [])
    print(first_doc_parse)

    preds = []
    for doc in docs_val:
        preds.append(doc.with_relations(clf.parse(doc)))
    results = evaluate_docs(docs_val, preds)
    print_results(results)
def main(conll_path, pred_path, threshold, simple_connectives, sense_level):
    """Evaluate predicted documents against gold annotations.

    Prints three result tables: explicit relations only (``EXPLICIT``),
    all other relations (``NON-EXPLICIT``) and every relation (``ALL``).
    Only documents whose ``doc_id`` occurs in both the gold data and the
    predictions are evaluated. If none remain, a warning is logged through
    ``logger`` (not defined inside this function) and nothing is printed.

    Args:
        conll_path: Passed to ``load_parsed_conll_dataset`` to load gold docs.
        pred_path: Passed to ``get_fh``; the result is read by
            ``load_documents`` to obtain the predicted docs.
        threshold: Forwarded to ``evaluate_docs`` as ``threshold``.
        simple_connectives: Forwarded to ``load_parsed_conll_dataset``.
        sense_level: Forwarded to ``load_parsed_conll_dataset``.
    """
    gold_docs = load_parsed_conll_dataset(
        conll_path,
        simple_connectives=simple_connectives,
        sense_level=sense_level)
    pred_docs = load_documents(get_fh(pred_path))

    # Keep only documents present on both sides and put the predictions into
    # gold order, so both lists have the same length and line up index by
    # index. Previously only the gold side was filtered.
    # NOTE(review): assumes evaluate_docs pairs gold and predicted documents
    # by position and that doc_id is unique per document -- TODO confirm.
    pred_by_id = {doc.doc_id: doc for doc in pred_docs}
    gold_docs = [doc for doc in gold_docs if doc.doc_id in pred_by_id]
    pred_docs = [pred_by_id[doc.doc_id] for doc in gold_docs]
    if not gold_docs:
        logger.warning('No documents found')
        return

    def keep_relations(docs, keep):
        # Copy every document with only the relations accepted by `keep`.
        return [
            doc.with_relations([r for r in doc.relations if keep(r)])
            for doc in docs
        ]

    subsets = (
        ('EXPLICIT', lambda r: r.is_explicit()),
        ('NON-EXPLICIT', lambda r: not r.is_explicit()),
    )
    for title, keep in subsets:
        print_results(
            evaluate_docs(keep_relations(gold_docs, keep),
                          keep_relations(pred_docs, keep),
                          threshold=threshold),
            title=title)
    print_results(evaluate_docs(gold_docs, pred_docs, threshold=threshold),
                  title='ALL')
def main(bert_model, conll_path, save_path, simple_connectives, sense_lvl=2):
    """Train the full parser pipeline and evaluate it on TEST and BLIND.

    Loads the train, dev, test and blind-test splits found under
    ``conll_path``, fits a ``ParserPipeline`` made of a connective sense
    classifier, a connective argument extractor and a non-explicit relation
    classifier, saves it to ``save_path`` and loads it back from there.
    Results are then printed for explicit and non-explicit relations at the
    thresholds 0.7 and 0.9.

    Args:
        bert_model: Forwarded to the loader as ``bert_model``; also part of the
            cache file name of every split.
        conll_path: Base path under which ``en.train``, ``en.dev``,
            ``en.test`` and ``en.blind-test`` are looked up.
        save_path: Where the pipeline is saved to and loaded from; also passed
            to ``ConnectiveArgumentExtractor`` as ``ckpt_path``.
        simple_connectives: Forwarded unchanged to ``load_bert_conll_dataset``.
        sense_lvl: Forwarded to ``load_bert_conll_dataset`` as ``sense_level``.
    """
    logger = init_logger()

    def load_split(split):
        # Every split uses identical loader options; only the split name (and
        # therefore the data path and the cache file name) differs.
        return load_bert_conll_dataset(
            os.path.join(conll_path, split),
            simple_connectives=simple_connectives,
            cache_dir=os.path.join(conll_path,
                                   f'{split}.{bert_model}.joblib'),
            bert_model=bert_model,
            sense_level=sense_lvl)

    logger.info('Load train data')
    docs_train = load_split('en.train')
    logger.info('Load dev data')
    docs_val = load_split('en.dev')
    docs_test = load_split('en.test')
    docs_blind = load_split('en.blind-test')

    logger.info('Init model')
    # All three components take the same input dimension, read once from the
    # first dev document.
    embedding_dim = docs_val[0].get_embedding_dim()
    parser = ParserPipeline([
        ConnectiveSenseClassifier(input_dim=embedding_dim, used_context=1),
        ConnectiveArgumentExtractor(window_length=100,
                                    input_dim=embedding_dim,
                                    hidden_dim=256,
                                    rnn_dim=512,
                                    ckpt_path=save_path),
        NonExplicitRelationClassifier(input_dim=embedding_dim, arg_length=50),
    ])

    logger.info('Train model')
    parser.fit(docs_train, docs_val)
    parser.save(save_path)
    # The pipeline is loaded back from save_path before it is evaluated.
    logger.info('LOAD model')
    parser.load(save_path)

    for title, docs_eval in [('TEST', docs_test), ('BLIND', docs_blind)]:
        logger.info(f'Evaluate parser {title}')
        test_preds = parser.parse(docs_eval)

        # None of these filtered views depend on the threshold, so they are
        # built once per evaluation set instead of once per threshold.
        gold_explicit = [
            d.with_relations(d.get_explicit_relations()) for d in docs_eval
        ]
        pred_explicit = [
            d.with_relations(d.get_explicit_relations()) for d in test_preds
        ]
        gold_non_explicit = [
            d.with_relations([r for r in d.relations if not r.is_explicit()])
            for d in docs_eval
        ]
        pred_non_explicit = [
            d.with_relations([r for r in d.relations if not r.is_explicit()])
            for d in test_preds
        ]

        for threshold in [0.7, 0.9]:
            print_results(
                evaluate_docs(gold_explicit, pred_explicit,
                              threshold=threshold),
                title=f'{title}-EXPLICIT-{threshold}')
            print_results(
                evaluate_docs(gold_non_explicit, pred_non_explicit,
                              threshold=threshold),
                title=f'{title}-NON-EXPLICIT-{threshold}')