Ejemplo n.º 1
0
def main(bert_model, conll_path, simple_connectives, sense_lvl):
    """Train an ImplicitArgumentExtractor and evaluate it on TEST and BLIND.

    The dev, test, blind-test and train splits are loaded from ``conll_path``
    (in that order), the extractor is fitted with the train split and the dev
    split, saved to ``models/nn``, scored on the dev split and finally
    evaluated on the test and blind-test splits at thresholds 0.7 and 0.95.

    Args:
        bert_model: Model identifier; forwarded to the loader and embedded in
            the per-split ``.joblib`` file name passed as ``cache_dir``.
        conll_path: Directory containing the ``en.*`` splits.
        simple_connectives: Forwarded unchanged to the dataset loader.
        sense_lvl: Forwarded to the dataset loader as ``sense_level``.
    """
    logger = init_logger()

    def load_split(split):
        # All splits are loaded with identical options; only the split name
        # (and therefore the data path and the cache file name) differs.
        return load_bert_conll_dataset(
            os.path.join(conll_path, split),
            simple_connectives=simple_connectives,
            cache_dir=os.path.join(conll_path,
                                   f'{split}.{bert_model}.joblib'),
            bert_model=bert_model,
            sense_level=sense_lvl)

    logger.info('Load data')
    docs_val = load_split('en.dev')
    docs_test = load_split('en.test')
    docs_blind = load_split('en.blind-test')
    docs_train = load_split('en.train')

    logger.info('Init model')
    clf = ImplicitArgumentExtractor(window_length=100,
                                    input_dim=docs_val[0].get_embedding_dim(),
                                    hidden_dim=128,
                                    rnn_dim=256)
    clf.batch_size = 256

    logger.info('Train model')
    clf.fit(docs_train, docs_val)
    clf.save('models/nn')

    # NOTE(review): the message says TEST, but the documents scored here are
    # the dev split (docs_val) -- confirm which of the two is intended.
    logger.info('Evaluation on TEST')
    clf.score(docs_val)

    gold_types = ('Implicit', 'EntRel')
    for title, docs_eval in [('TEST', docs_test), ('BLIND', docs_blind)]:
        logger.info(f'Evaluate parser {title}')
        preds = [d.with_relations(clf.parse(d)) for d in docs_eval]
        # The gold side keeps only Implicit/EntRel relations. It does not
        # depend on the threshold, so it is built once per split instead of
        # once per threshold.
        gold = [
            d.with_relations([r for r in d.relations if r.type in gold_types])
            for d in docs_eval
        ]
        for threshold in [0.7, 0.95]:
            res = evaluate_docs(gold, preds, threshold=threshold)
            print_results(res, title=f'{title}-{threshold}')
Ejemplo n.º 2
0
def main(conll_path):
    """Train a ConnectiveSenseClassifier and report its results.

    Loads the ``en.dev`` and ``en.train`` splits from ``conll_path`` (using
    the ``bert-base-cased`` ``.joblib`` files as ``cache_dir``), fits the
    classifier, scores it on the train and dev splits, prints the parse of
    the first dev document and prints the evaluation of all dev predictions.

    Args:
        conll_path: Directory containing the ``en.dev`` and ``en.train``
            splits.
    """
    logger = init_logger()

    dev_data = os.path.join(conll_path, 'en.dev')
    dev_cache = os.path.join(conll_path, 'en.dev.bert-base-cased.joblib')
    docs_val = load_bert_conll_dataset(dev_data, cache_dir=dev_cache)

    train_data = os.path.join(conll_path, 'en.train')
    train_cache = os.path.join(conll_path, 'en.train.bert-base-cased.joblib')
    docs_train = load_bert_conll_dataset(train_data, cache_dir=train_cache)

    embedding_dim = docs_val[0].get_embedding_dim()
    clf = ConnectiveSenseClassifier(input_dim=embedding_dim, used_context=2)

    logger.info('Train model')
    clf.fit(docs_train, docs_val)

    logger.info('Evaluation on TRAIN')
    clf.score(docs_train)
    logger.info('Evaluation on TEST')
    clf.score(docs_val)

    # Show the parser output for a single document before the full run.
    first_doc_parse = clf.parse(docs_val[0], [])
    print(first_doc_parse)

    preds = []
    for doc in docs_val:
        preds.append(doc.with_relations(clf.parse(doc)))
    results = evaluate_docs(docs_val, preds)
    print_results(results)
Ejemplo n.º 3
0
def main(conll_path, pred_path, threshold, simple_connectives, sense_level):
    """Evaluate predicted documents against gold CoNLL documents.

    Gold documents are restricted to the doc ids that occur in the
    predictions. If either side ends up empty, a warning is logged and the
    function returns. Otherwise results are printed three times: for explicit
    relations only, for non-explicit relations only, and for all relations.

    Args:
        conll_path: Passed to ``load_parsed_conll_dataset`` to load gold docs.
        pred_path: Passed to ``get_fh``; the result is read by
            ``load_documents``.
        threshold: Forwarded to every ``evaluate_docs`` call.
        simple_connectives: Forwarded to the gold dataset loader.
        sense_level: Forwarded to the gold dataset loader.
    """
    all_gold = load_parsed_conll_dataset(conll_path,
                                         simple_connectives=simple_connectives,
                                         sense_level=sense_level)
    pred_docs = load_documents(get_fh(pred_path))
    predicted_ids = {pred.doc_id for pred in pred_docs}
    gold_docs = [gold for gold in all_gold if gold.doc_id in predicted_ids]
    if not (pred_docs and gold_docs):
        logger.warning('No documents found')
        return

    def select(docs, keep):
        # Copy each document with only the relations accepted by `keep`.
        return [
            doc.with_relations([rel for rel in doc.relations if keep(rel)])
            for doc in docs
        ]

    subsets = (
        ('EXPLICIT', lambda rel: rel.is_explicit()),
        ('NON-EXPLICIT', lambda rel: not rel.is_explicit()),
    )
    for title, keep in subsets:
        scores = evaluate_docs(select(gold_docs, keep),
                               select(pred_docs, keep),
                               threshold=threshold)
        print_results(scores, title=title)

    overall = evaluate_docs(gold_docs, pred_docs, threshold=threshold)
    print_results(overall, title='ALL')
Ejemplo n.º 4
0
def main(bert_model, conll_path, save_path, simple_connectives, sense_lvl=2):
    """Train a three-stage ParserPipeline and evaluate it on TEST and BLIND.

    The train, dev, test and blind-test splits are loaded from ``conll_path``
    (in that order). A pipeline of ConnectiveSenseClassifier,
    ConnectiveArgumentExtractor and NonExplicitRelationClassifier is fitted
    with the train and dev splits, saved to ``save_path`` and loaded back
    from it. The test and blind-test splits are then parsed and evaluated
    separately for explicit and non-explicit relations at thresholds 0.7
    and 0.9.

    Args:
        bert_model: Model identifier; forwarded to the loader and embedded in
            the per-split ``.joblib`` file name passed as ``cache_dir``.
        conll_path: Directory containing the ``en.*`` splits.
        save_path: Where the pipeline is saved to and loaded from; also
            passed to the argument extractor as ``ckpt_path``.
        simple_connectives: Forwarded unchanged to the dataset loader.
        sense_lvl: Forwarded to the dataset loader as ``sense_level``.
    """
    logger = init_logger()

    def load_split(split):
        # All splits are loaded with identical options; only the split name
        # (and therefore the data path and the cache file name) differs.
        return load_bert_conll_dataset(
            os.path.join(conll_path, split),
            simple_connectives=simple_connectives,
            cache_dir=os.path.join(conll_path,
                                   f'{split}.{bert_model}.joblib'),
            bert_model=bert_model,
            sense_level=sense_lvl)

    def only_explicit(docs):
        # Copy each document, keeping just its explicit relations.
        return [d.with_relations(d.get_explicit_relations()) for d in docs]

    def only_non_explicit(docs):
        # Copy each document, keeping just its non-explicit relations.
        return [
            d.with_relations([r for r in d.relations if not r.is_explicit()])
            for d in docs
        ]

    logger.info('Load train data')
    docs_train = load_split('en.train')
    logger.info('Load dev data')
    docs_val = load_split('en.dev')
    docs_test = load_split('en.test')
    docs_blind = load_split('en.blind-test')

    logger.info('Init model')
    # Every component takes the same input dimension; look it up once.
    embedding_dim = docs_val[0].get_embedding_dim()
    parser = ParserPipeline([
        ConnectiveSenseClassifier(input_dim=embedding_dim, used_context=1),
        ConnectiveArgumentExtractor(window_length=100,
                                    input_dim=embedding_dim,
                                    hidden_dim=256,
                                    rnn_dim=512,
                                    ckpt_path=save_path),
        NonExplicitRelationClassifier(input_dim=embedding_dim, arg_length=50),
    ])

    logger.info('Train model')
    parser.fit(docs_train, docs_val)
    parser.save(save_path)
    logger.info('LOAD model')
    parser.load(save_path)

    for title, docs_eval in [('TEST', docs_test), ('BLIND', docs_blind)]:
        logger.info(f'Evaluate parser {title}')
        test_preds = parser.parse(docs_eval)
        # None of the filtered views depends on the threshold, so they are
        # built once per split instead of once per threshold.
        gold_explicit = only_explicit(docs_eval)
        pred_explicit = only_explicit(test_preds)
        gold_non_explicit = only_non_explicit(docs_eval)
        pred_non_explicit = only_non_explicit(test_preds)
        for threshold in [0.7, 0.9]:
            print_results(evaluate_docs(gold_explicit,
                                        pred_explicit,
                                        threshold=threshold),
                          title=f'{title}-EXPLICIT-{threshold}')
            print_results(evaluate_docs(gold_non_explicit,
                                        pred_non_explicit,
                                        threshold=threshold),
                          title=f'{title}-NON-EXPLICIT-{threshold}')