Example No. 1
def test_readonly():
    Document.add_property('some_property', 123)
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC)
    assert doc.some_property == 123
    with pytest.raises(ValueError):
        doc.some_property = 456
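For context, a minimal usage sketch of the same dynamic-property mechanism outside a test. It assumes stanza's Document.add_property(name, default=None, getter=None, setter=None) classmethod and that a property registered without a setter is read-only, which is what the test above checks; the property name n_sentences is invented here.

import stanza
from stanza.models.common.doc import Document

# Hypothetical property: expose the sentence count as doc.n_sentences.
# No setter is registered, so assigning to it should raise ValueError,
# mirroring the read-only behaviour tested above.
Document.add_property('n_sentences', getter=lambda self: len(self.sentences))

nlp = stanza.Pipeline(lang='en', processors='tokenize')
doc = nlp("This is one sentence. This is another.")
print(doc.n_sentences)   # expected: 2
# doc.n_sentences = 5    # would raise ValueError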
Example No. 2
 def __init__(self, data, word2idx, tolower=True):
     super(GraphData, self).__init__()
     g_p = utils.doc2graph(Document(data[config.pf]))
     g_h = utils.doc2graph(Document(data[config.hf]))
     self.edge_index_p = g_p.edge_index
     self.edge_index_h = g_h.edge_index
     #print(g_p.node_attr)
     # note: special tokens such as [ROOT] and [UNK] must not be lowercased
     if tolower:
         self.x_p = torch.tensor([
             word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
             for w in g_p.node_attr
         ],
                                 dtype=torch.long)
         self.x_h = torch.tensor([
             word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
             for w in g_h.node_attr
         ],
                                 dtype=torch.long)
     else:
         print("not to lower")
         self.x_p = torch.tensor([word2idx[w] for w in g_p.node_attr],
                                 dtype=torch.long)
         self.x_h = torch.tensor([word2idx[w] for w in g_h.node_attr],
                                 dtype=torch.long)
     label_onehot = torch.zeros([1, config.NUM_CLASSES])
     label_onehot[0][data[config.lf]] = 1
     #label_onehot = label_onehot.squeeze()
     #print(label_onehot.size())
     self.label = label_onehot.to(dtype=torch.float)
     self.pid = data[config.idf]
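The word2idx lookups above raise KeyError for out-of-vocabulary node texts. Below is a hedged sketch of a more defensive variant, under the assumption that the vocabulary contains an "[UNK]" entry; nodes_to_ids is a hypothetical helper name, not part of the original class.

import torch

def nodes_to_ids(node_attr, word2idx, unk_token="[UNK]"):
    # Lowercase everything except bracketed special tokens such as [ROOT]/[UNK],
    # and fall back to the unknown token for words missing from the vocabulary.
    ids = []
    for w in node_attr:
        key = w if (w.startswith("[") and w.endswith("]")) else w.lower()
        ids.append(word2idx.get(key, word2idx[unk_token]))
    return torch.tensor(ids, dtype=torch.long)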
Example No. 3
def check_mwt(filename):
    """
    Checks whether or not there are MWTs in the given conll file
    """
    doc = Document(CoNLL.conll2dict(filename))
    data = doc.get_mwt_expansions(False)
    return len(data) > 0
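A short usage sketch; 'sample.conllu' is a placeholder path for any CoNLL-U file on disk.

# Decide whether an MWT expander is needed for this treebank.
if check_mwt('sample.conllu'):
    print("file contains multi-word tokens; train/run the MWT expander")
else:
    print("no multi-word tokens found")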
Example No. 4
 def __init__(self, data, word2idx, tolower=True):
     super(GraphData, self).__init__()
     g_p = utils.doc2graph(Document(data[config.pf]))
     g_h = utils.doc2graph(Document(data[config.hf]))
     self.edge_index_p = g_p.edge_index
     self.edge_index_h = g_h.edge_index
     #print(g_p.node_attr)
     # note: special tokens such as [ROOT] and [UNK] must not be lowercased
     if tolower:
         self.x_p = torch.tensor([
             word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
             for w in g_p.node_attr
         ],
                                 dtype=torch.long)
         self.x_h = torch.tensor([
             word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
             for w in g_h.node_attr
         ],
                                 dtype=torch.long)
     else:
         print("not to lower")
         self.x_p = torch.tensor([word2idx[w] for w in g_p.node_attr],
                                 dtype=torch.long)
         self.x_h = torch.tensor([word2idx[w] for w in g_h.node_attr],
                                 dtype=torch.long)
     self.label = data[config.lf]
     self.pid = data[config.idf]
Example No. 5
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       vocab=vocab,
                       evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(
            batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(
                    batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("MWT expansion score: {} {:.2f}".format(
            args['shorthand'], score * 100))
Example No. 6
def test_dict_to_doc_and_doc_to_dict():
    doc = Document(DICT)
    dicts = doc.to_dict()
    dicts_tupleid = []
    for sentence in dicts:
        items = []
        for item in sentence:
            item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
            items.append(item)
        dicts_tupleid.append(items)
    assert dicts_tupleid == DICT
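The DICT fixture is defined elsewhere in the test module. A hypothetical stand-in with the same overall shape is shown below: Document accepts a list of sentences, each a list of per-word dicts, and to_dict() returns the same structure (the exact fields and values here are invented).

DICT_EXAMPLE = [[
    {'id': (1,), 'text': 'Unban', 'lemma': 'unban', 'upos': 'VERB',  'head': 0, 'deprel': 'root'},
    {'id': (2,), 'text': 'Mox',   'lemma': 'Mox',   'upos': 'PROPN', 'head': 3, 'deprel': 'compound'},
    {'id': (3,), 'text': 'Opal',  'lemma': 'Opal',  'upos': 'PROPN', 'head': 1, 'deprel': 'obj'},
]]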
Example No. 7
def _create_stanza_document(sentence_dicts: List[List[Dict[str, str]]],
                            document_text: str) -> Document:
    stanza_document = Document(sentence_dicts, text=document_text)
    contains_entities = False
    for sentence_index, sentence_dict in enumerate(sentence_dicts):
        first_token = sentence_dict[0]
        sentence_sentiment = first_token["sentence_sentiment"]
        if sentence_sentiment is not None:
            stanza_document.sentences[
                sentence_index].sentiment = sentence_sentiment
        if "ner" in first_token:
            contains_entities = True
    if contains_entities:
        stanza_document.build_ents()
    return stanza_document
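A usage sketch with invented input: each token dict carries the extra "sentence_sentiment" key that the helper reads from the first token of every sentence (only the first token's value is used); the remaining keys are ordinary Document fields. No "ner" key is included, so build_ents() is not triggered.

sentence_dicts = [[
    {'id': 1, 'text': 'Good', 'sentence_sentiment': 2},
    {'id': 2, 'text': 'movie', 'sentence_sentiment': 2},
]]
doc = _create_stanza_document(sentence_dicts, "Good movie")
print(doc.sentences[0].sentiment)   # 2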
Example No. 8
    def process(self, doc):
        """
        Run language detection on a string, a Document, or a list of either, then route each input to the appropriate language-specific pipeline.
        """

        # only return a list if given a list
        singleton_input = not isinstance(doc, list)
        if singleton_input:
            docs = [doc]
        else:
            docs = doc

        if docs and isinstance(docs[0], str):
            docs = [Document([], text=text) for text in docs]

        # run language identification
        docs_w_langid = self.lang_id_pipeline.process(docs)

        # create language specific batches, store global idx with each doc
        lang_batches = {}
        for doc in docs_w_langid:
            if doc.lang not in lang_batches:
                lang_batches[doc.lang] = []
            lang_batches[doc.lang].append(doc)

        # run through each language, submit a batch to the language specific pipeline
        for lang in lang_batches.keys():
            self._update_pipeline_cache(lang)
            self.pipeline_cache[lang](lang_batches[lang])

        # only return a list if given a list
        if singleton_input:
            return docs_w_langid[0]
        else:
            return docs_w_langid
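This process() method appears to come from stanza's MultilingualPipeline, which detects the language of each input and routes it to a cached language-specific pipeline. A small usage sketch, assuming a recent stanza release and that the required models have been downloaded:

from stanza.pipeline.multilingual import MultilingualPipeline

nlp = MultilingualPipeline()
docs = nlp(["Hello world.", "Bonjour le monde."])
for doc in docs:
    # each returned Document has .lang set by the language identifier
    print(doc.lang, doc.sentences[0].words[0].text)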
Example No. 9
    def _process_list(self, docs):
        """
        Identify language of list of strings or Documents
        """

        if len(docs) == 0:
            # TO DO: what standard do we want for bad input, such as empty list?
            # TO DO: more handling of bad input
            return

        if isinstance(docs[0], str):
            docs = [Document([], text) for text in docs]

        docs_by_length = {}
        for doc in docs:
            text = LangIDProcessor.clean_text(
                doc.text) if self._clean_text else doc.text
            doc_length = len(text)
            if doc_length not in docs_by_length:
                docs_by_length[doc_length] = []
            docs_by_length[doc_length].append((doc, text))

        for doc_length in docs_by_length:
            inputs = [doc[1] for doc in docs_by_length[doc_length]]
            predictions = self._id_langs(self._text_to_tensor(inputs))
            for doc, lang in zip(docs_by_length[doc_length], predictions):
                doc[0].lang = lang

        return docs
Example No. 10
def test_xpos_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(
        doc, "{tag:NNP}=source <=zzz {word:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')
    response = semgrex.process_doc(
        doc, "{pos:NNP}=source <=zzz {word:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')
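These semgrex tests rely on stanza.server.semgrex, which in turn needs a CoreNLP installation visible to stanza (e.g. via the CORENLP_HOME environment variable). A hedged end-to-end sketch on a freshly parsed sentence, using the same pattern style as the tests:

import stanza
from stanza.server import semgrex

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse')
doc = nlp("Unban Mox Opal!")
# the response contains the matched nodes and the named relations/patterns
response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target")
print(response)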
Example No. 11
def doc2graph(doc, word2idx=None):
    """
    2020/8/4 18:30
    input Stanza Document : doc
    output PytorchGeoData : G
    G = {
     x: id tensor
     edge_idx : edges size = (2, l-1)
     edge_attr: (u, v, edge_type in str)
     node_attr: text
    }
    """
    if isinstance(doc, list):  # convert to Document first if input is in dict form ([[dict]])
        doc = Document(doc)
    # add root token for each sentences
    e = [[], []]
    edge_info = []
    node_info = []
    prev_token_sum = 0
    prev_root_id = 0
    cur_root_id = 0
    # get original dependency
    for idx, sent in enumerate(doc.sentences):
        #sent.print_dependencies()
        # node info by index(add root at the beginning of every sentence)
        cur_root_id = len(node_info)
        node_info.append("[ROOT]")
        for token in sent.tokens:
            node_info.append(token.to_dict()[0]['text'])
        # edge info by index of u in edge (u,v)
        for dep in sent.dependencies:
            id1 = prev_token_sum + int(dep[0].to_dict()["id"])
            id2 = prev_token_sum + int(dep[2].to_dict()["id"])
            e[0].append(id1)
            e[1].append(id2)
            edge_info.append((id1, id2, dep[1]))
        prev_token_sum += len(sent.tokens) + 1
        # add links between sentence roots
        if (cur_root_id != 0):
            id1 = prev_root_id
            id2 = cur_root_id
            e[0].append(id1)
            e[1].append(id2)
            edge_info.append((id1, id2, "bridge"))
        prev_root_id = cur_root_id
    # id to embeddings
    # x = torch.tensor([ for token in node_attr])
    # done building edges and nodes
    if word2idx is None:
        # print("x is not id now, node info is in node_attr as text")
        x = torch.tensor(list(range(doc.num_tokens + len(doc.sentences))))
    else:
        x = torch.tensor([word2idx[token] for token in node_info])
    e = torch.tensor(e)
    G = Data(x=x, edge_index=e, edge_attr=edge_info, node_attr=node_info)
    return G
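A usage sketch for doc2graph, assuming the torch / torch_geometric imports used above and a downloaded English model; it accepts either a parsed Document or its to_dict() form:

import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse')
doc = nlp("I like cats. They purr.")
G = doc2graph(doc)   # also accepts doc.to_dict()
# one [ROOT] node per sentence plus one node per token; each sentence root
# after the first is linked to the previous root by a "bridge" edge
print(G.num_nodes, G.edge_index.shape, len(G.node_attr))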
Example No. 12
def test_dict_to_doc_and_doc_to_dict():
    """
    Test the conversion from a raw dict to a Document and back.
    This code path first turns the misc start_char|end_char values into separate start_char and end_char fields on the Document.
    Converting that Document back to a dict keeps those as separate fields.
    Finally, converting that dict to a list of conll entries should fold them back into the misc column.
    """
    doc = Document(DICT)
    dicts = doc.to_dict()
    dicts_tupleid = []
    for sentence in dicts:
        items = []
        for item in sentence:
            item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
            items.append(item)
        dicts_tupleid.append(items)
    conll = CoNLL.convert_dict(dicts_tupleid)
    assert conll == CONLL
Example No. 13
 def __init__(self, data):
     super(GraphData, self).__init__()
     # graph (edge) info, not affected by batching
     g_p = utils.doc2graph(Document(data[config.pf]))
     g_h = utils.doc2graph(Document(data[config.hf]))
     self.edge_index_p = g_p.edge_index
     self.edge_index_h = g_h.edge_index
     self.edge_attr_p = g_p.edge_attr
     self.edge_attr_h = g_h.edge_attr
     # node info, affected by batching
     self.node_attr_p = g_p.node_attr
     self.node_attr_h = g_h.node_attr
     # one-hot label (1 x num_classes), already in batch-first form
     label_onehot = torch.zeros([1, config.NUM_CLASSES])
     label_onehot[0][data[config.lf]] = 1
     self.label = label_onehot.to(dtype=torch.float)
     # problem id, already in batch-first form
     self.pid = data[config.idf]
Example No. 14
    def bulk_process(self, docs):
        """
        Most processors operate at the sentence level and treat each sentence independently, so they benefit greatly from
        combining sentences from multiple documents into larger batches. This transparent implementation lets such processors
        batch-process a list of Documents as if they all came from a single Document.
        """

        if hasattr(self, '_variant'):
            return self._variant.bulk_process(docs)

        combined_sents = [sent for doc in docs for sent in doc.sentences]
        combined_doc = Document([])
        combined_doc.sentences = combined_sents
        combined_doc.num_tokens = sum(doc.num_tokens for doc in docs)
        combined_doc.num_words = sum(doc.num_words for doc in docs)

        self.process(combined_doc) # annotations are attached to sentence objects

        return docs
Example No. 15
def test_bulk(pipeline):
    NUM_DOCS = 10
    raw_text = [FR_MWT_SENTENCE] * NUM_DOCS
    raw_doc = [Document([], text=doccontent) for doccontent in raw_text]

    result = pipeline(raw_doc)

    assert len(result) == NUM_DOCS
    for doc in result:
        compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
        assert len(doc.sentences) == 1
        assert doc.num_words == 26
        assert doc.num_tokens == 24
Example No. 16
def lemmatize(lemmatizer, conllu, morphs):
    def clean_final(text):
        finals = {"פ":"ף","כ":"ך","מ":"ם","נ":"ן","צ":"ץ"}
        if text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        if word == lemma:
            if word + "\t" + pos in lex:
                if pos == "VERB" and "Fut" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Pres" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Part" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos in ["NOUN", "ADJ"] and "Plur" in morph:
                    lemma = lex[word + "\t" + pos]
            else:
                if "Plur" in morph and pos in ["NOUN", "ADJ"] and (
                        word.endswith("ים") or word.endswith("ות")):
                    lemma = lemma[:-2]
                    if word.endswith("ות"):
                        lemma += "ה"
                    lemma = clean_final(lemma)
        return lemma

    uposed = [[l.split("\t") for l in s.split("\n")] for s in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"][0])
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)
    output = []
    counter = 0
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos, '_', str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        output.append("")
    lemmatized = "\n".join(output)
    lemmatized = get_col(lemmatized,2)

    return lemmatized
Example No. 17
 def StanzaAPI(self,conllu):
   d=[]
   e=[]
   for s in conllu.split("\n"):
     if s=="" or s.startswith("#"):
       if e!=[]:
         d.append(list(e))
         e=[]
     else:
       t=s.split("\t")
       e.append({"id":int(t[0]),"text":t[1],"lemma":t[2],"upos":t[3],"xpos":t[4],"misc":t[9]})
   from stanza.models.common.doc import Document
   from stanza.utils.conll import CoNLL
   return CoNLL.conll_as_string(CoNLL.convert_dict(self.model(Document(d)).to_dict()))
Example No. 18
def prep_conllu(tb, file_path, overwrite):
    out_file = out_dir.joinpath(file_path.name)
    if out_file.exists() and not overwrite:
        print(f"{out_file.name} exists; skipping")
        return None
    lang, tb, tb_kwargs = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, out_file)
        return None
    doc = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang,
                          processors='tokenize,mwt,pos',
                          tokenize_pretokenized=True)
    doc = nlp.processors['pos'].process(doc)
    return doc
Example No. 19
    def _encode_parse(sen, field_names) -> Document:
        """
        Converts from xtsv sentence to Stanza Document
        :param sen: An xtsv sentence
        :param field_names: Field names
        :return: Stanza Document containing one sentence
        """

        stanza_sentence = [{
            'id': i,
            'text': line[field_names['form']],
            'lemma': line[field_names['lemma']],
            'upos': line[field_names['upostag']],
            'feats': line[field_names['feats']],
        } for i, line in enumerate(sen, start=1)]

        return Document([stanza_sentence])
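A sketch with invented xtsv-style input: each sentence is a list of token rows, and field_names maps column names to column indices. It assumes the helper is reachable as a static method (it takes no self).

field_names = {'form': 0, 'lemma': 1, 'upostag': 2, 'feats': 3}
sen = [
    ['Dogs', 'dog',  'NOUN', 'Number=Plur'],
    ['bark', 'bark', 'VERB', 'Mood=Ind|Tense=Pres'],
]
doc = _encode_parse(sen, field_names)
print([w.upos for w in doc.sentences[0].words])   # ['NOUN', 'VERB']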
Example No. 20
def retag_trees(trees, pipeline, xpos=True):
    """
    Retag all of the trees using the given processor

    Returns a list of new trees
    """
    sentences = []
    for tree in trees:
        tokens = [{TEXT: pt.children[0].label} for pt in tree.preterminals()]
        sentences.append(tokens)

    doc = Document(sentences)
    doc = pipeline(doc)
    if xpos:
        tag_lists = [[x.xpos for x in sentence.words] for sentence in doc.sentences]
    else:
        tag_lists = [[x.upos for x in sentence.words] for sentence in doc.sentences]

    new_trees = [replace_tags(tree, tags) for tree, tags in zip(trees, tag_lists)]
    return new_trees
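retag_trees expects a pipeline built for pre-tokenized input; below is a hedged sketch of such a pipeline together with a Document constructed the same way the function does, assuming TEXT is the string 'text'.

import stanza
from stanza.models.common.doc import Document

pipeline = stanza.Pipeline(lang='en', processors='tokenize,pos',
                           tokenize_pretokenized=True)
doc = Document([[{'text': 'Unban'}, {'text': 'Mox'}, {'text': 'Opal'}]])
doc = pipeline(doc)
print([word.xpos for word in doc.sentences[0].words])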
Example No. 21
def doc(sentences_dict):
    doc = Document(sentences_dict)
    return doc
Example No. 22
def test_two_semgrex():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target",
                                   "{}=source >obj=zzz {}=target")
    check_response(response, semgrex_len=2)
Example No. 23
def test_two_sentences():
    doc = Document(TEST_TWO_SENTENCES)
    response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target")
    check_response(response, response_len=2)
Example No. 24
def test_lemma_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(
        doc, "{lemma:Mox}=source <=zzz {lemma:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')
Example No. 25
                out_file = out_dir.joinpath(file_path.name)
                write_doc_to_file(doc, out_file)
            else:
                out_train_file = out_dir.joinpath(file_path.name)
                dev_name = file_path.name.split("-")[0] + "-ud-dev.conllu"
                out_dev_file = out_dir.joinpath(dev_name)
                out_dev_file.touch()

                sents = doc.to_dict()

                permutations = np.random.permutation(np.arange(len(sents))).tolist()
                divide = len(sents) * 7 // 8
                train_sents = [sents[idx] for idx in permutations[:divide]]
                dev_sents = [sents[idx] for idx in permutations[divide:]]

                train_doc = Document(train_sents)
                write_doc_to_file(train_doc, out_train_file)

                dev_doc = Document(dev_sents)
                write_doc_to_file(dev_doc, out_dev_file)

elif args.split == "test":
    kwargs = {}
    if args.case == "preprocess":
        kwargs['processors'] = 'tokenize,mwt,pos,lemma'
    if not args.lang_file:
        txt_files = list(ud_root.glob("*/*_*.txt"))
        assert len(txt_files) == 82, f"number of txt_files = {len(txt_files)}; should be 82"
    else:
        with open(args.lang_file) as fp:
            tbs = fp.read().splitlines()
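A self-contained sketch of the 7/8-train / 1/8-dev sentence split used above, run on a toy list standing in for doc.to_dict():

import numpy as np

sents = [f"sent{i}" for i in range(16)]   # stand-in for doc.to_dict()
permutations = np.random.permutation(np.arange(len(sents))).tolist()
divide = len(sents) * 7 // 8
train_sents = [sents[idx] for idx in permutations[:divide]]   # 14 sentences
dev_sents = [sents[idx] for idx in permutations[divide:]]     # 2 sentences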
Example No. 26
def test_ner_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(
        doc, "{cpos:PROPN}=source <=zzz {ner:GEM}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')
Example No. 27
def train(args):
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(
        dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get(
                    'ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(
                    dev_batch.doc.get_mwt_expansions(evaluation=True),
                    dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            logger.info(
                "epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[
                    -1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(
            best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(
                dev_batch.doc.get_mwt_expansions(evaluation=True),
                best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            best_f = max(best_f, dev_score)
Example No. 28
 def process(self, text):
     sentence = [{
         'id': (i + 1, ),
         'text': 'LOL'
     } for i, tok in enumerate(text.split())]
     return Document([sentence], text)
Example No. 29
    def __init__(self, gold, pred, verbose=False, group=False):
        """
        Align gold and predicted tokens and their tags, and build dictionaries of falsely predicted tags.
        :param gold: the gold conllu file
        :param pred: the predicted conllu file
        :param verbose: if true, print information about token counts
        :param group: if true, group falsely predicted ufeats labels into a dictionary that maps each gold label to the
        labels it was wrongly assigned and the number of times each wrong label was predicted
        """

        gold = C.load_conll(open(gold,
                                 'r', encoding='utf8'))
        gold_dic = C.convert_conll(gold)  # returns a dictionary with all the column names
        gold_doc = Document(gold_dic)

        pred = C.load_conll(open(pred, 'r', encoding='utf8'))
        pred_dic = C.convert_conll(pred)  # returns a dictionary with all the column names
        pred_doc = Document(pred_dic)

        # get the tokens
        self.gold_tokens = [j['text'] for i in gold_dic for j in i]
        self.pred_tokens = [j['text'] for i in pred_dic for j in i]

        # get upos tags
        gold_tags = [j['upos'] for i in gold_dic for j in i]
        pred_tags = [j['upos'] for i in pred_dic for j in i]

        # get xpos tags
        gold_xpos = [j['xpos'] for i in gold_dic for j in i]
        pred_xpos = [j['xpos'] for i in pred_dic for j in i]

        # get ufeats tag
        gold_feats = list()
        pred_feats = list()
        for i in gold_dic:
            for j in i:
                if 'feats' in j:
                    gold_feats.append(j['feats'])
                else:
                    gold_feats.append('_')
        for i in pred_dic:
            for j in i:
                if 'feats' in j:
                    pred_feats.append(j['feats'])
                else:
                    pred_feats.append('_')

        if verbose:
            print('Number of gold tokens:', len(self.gold_tokens), ', number of predicted tokens:', len(self.pred_tokens))

        # align gold and predicted tokens
        cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)

        # align tokens and their POS tags separately
        self.aligned = list()  # tokens
        self.aligned_pos = list()  # upos
        self.aligned_feats = list()
        self.aligned_xpos = list()
        for i in range(len(b2a)):
            t = (self.gold_tokens[b2a[i]], self.pred_tokens[i])
            self.aligned.append(t)
            p = (gold_tags[b2a[i]], pred_tags[i])
            self.aligned_pos.append(p)
            f = (gold_feats[b2a[i]], pred_feats[i])
            self.aligned_feats.append(f)
            x = (gold_xpos[b2a[i]], pred_xpos[i])
            self.aligned_xpos.append(x)

        # align predicted tags to golden tags, not vice versa as before
        gold_aligned = list()
        for i in range(len(a2b)):
            t = (self.gold_tokens[i], self.pred_tokens[a2b[i]])
            gold_aligned.append(t)

        overall = list()
        for (a, b) in self.aligned:
            if a == b:
                overall.append((a, b))
        if verbose:
            print('Aligned tokens. GOLD:', len(gold_aligned), 'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

        self.conf_tags = {} # falsely predicted upos tags
        self.conf_tags_all = {}  # all upos tags
        self.incorrect_upos = 0  # number of incorrectly predicted upos tags
        # count how many times each (gold, predicted) tag pair co-occurred in the two files
        i = 0
        for (a, b) in self.aligned_pos:
            if a != b:
                self.incorrect_upos += 1
                if (a, b) not in self.conf_tags:
                    self.conf_tags[(a, b)] = 1
                else:
                    self.conf_tags[(a, b)] += 1
            if (a, b) not in self.conf_tags_all:
                self.conf_tags_all[(a, b)] = 1
            else:
                self.conf_tags_all[(a, b)] += 1
            i += 1

        self.conf_feats = {}
        self.conf_feats_all = {}
        self.incorrect_feats = 0
        i = 0
        for (a, b) in self.aligned_feats:
            a = "|".join(sorted(feat for feat in a.split("|")
                                if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            b = "|".join(sorted(feat for feat in b.split("|")
                                if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            if a != b:
                self.incorrect_feats += 1
                # create a dictionary for each falsely predicted ufeats labels and group all its false predictions
                if group:
                    if a not in self.conf_feats:
                        self.conf_feats[a] = dict()
                        self.conf_feats[a][b] = 1
                    else:
                        if b not in self.conf_feats[a]:
                            self.conf_feats[a][b] = 1
                        else:
                            self.conf_feats[a][b] += 1
                else:
                    if (a, b) not in self.conf_feats:
                        self.conf_feats[(a, b)] = 1
                    else:
                        self.conf_feats[(a, b)] += 1
            if (a, b) not in self.conf_feats_all:
                self.conf_feats_all[(a, b)] = 1
            else:
                self.conf_feats_all[(a, b)] += 1
            i += 1

        self.conf_xpos = {}
        self.incorrect_xpos = 0
        i = 0
        for (a, b) in self.aligned_xpos:
            if a != b:
                self.incorrect_xpos += 1
                if (a, b) not in self.conf_xpos:
                    self.conf_xpos[(a, b)] = 1
                else:
                    self.conf_xpos[(a, b)] += 1
            i += 1
Example No. 30
    def predict_with_pos(self, doc_dict):
        # fix potential memory leak
        torch.cuda.empty_cache()

        conllu_data = doc_dict["dep"]
        xml_data = doc_dict["xml"]

        conllu_data = re.sub(
            r'\n[0-9]+-[^\n]+\n', '\n', conllu_data
        )  # Remove any super tokens in input, we'll add them at the end

        # First parse - just get best deprel and heads
        diaparsed = diaparse(self.nlp1, conllu_data)
        doc = CoNLL.load_conll(io.StringIO(diaparsed))

        # overwrite xpos with our ensemble xpos
        doc_with_our_xpos = CoNLL.conll2dict(input_str=conllu_data)
        replace_xpos(doc, doc_with_our_xpos)

        doc = [["\t".join(l) for l in sent] for sent in doc]
        doc = "\n\n".join(["\n".join(sent) for sent in doc])

        # Second parse - postprocess based on:
        # 1. auxiliary parser predictions trained on EWT for PP attachment disambiguation
        ewt_parse = diaparse(self.aux_parser, conllu_data)
        doc = add_second_deps(doc, ewt_parse)
        # 2. sequence tagger deprel predictions using high quality POS tags and embeddings
        doc = add_sequence_tagger_preds(self.sequence_tagger, doc)
        # 3. postprocessing rules to adjudicate these predictions in a harmonized way
        doc = depedit1.run_depedit(doc)

        # Add upos
        uposed = depedit2.run_depedit(doc)
        uposed = [[l.split("\t") for l in s.split("\n")]
                  for s in uposed.strip().split("\n\n")]
        dicts = CoNLL.convert_conll(uposed)

        # Now add lemmas using Stanza based on pretagged predicted upos (converted from our predicted xpos)
        for sent in dicts:
            for tok in sent:
                tok["id"] = int(tok["id"])
        doc = Document(dicts)
        lemmatized = self.nlp2(doc)
        output = []
        for sent in lemmatized.sentences:
            for tok in sent.tokens:
                word = tok.words[0]
                row = [
                    str(word.id), word.text, word.lemma, word.upos, word.xpos,
                    '_',
                    str(word.head), word.deprel, "_", "_"
                ]
                output.append("\t".join(row))
            output.append("")
        lemmatized = "\n".join(output)

        # Postprocess implausible lemmas (VBG ending in -ed, VBN ending in -ing, etc., incorrect -e restoration...)
        lemmatized = postprocess_lemmas(lemmatized)

        # Fix punctuation
        lemmatized = fix_punct(lemmatized)

        if "<text id=" in xml_data:
            docname = re.search(r'<text id="([^"]+)"', xml_data).group(1)
            morphed_and_enhanced = depedit3.run_depedit(lemmatized,
                                                        sent_id=True,
                                                        sent_text=True,
                                                        docname=docname,
                                                        filename=docname)
        else:
            morphed_and_enhanced = depedit3.run_depedit(lemmatized,
                                                        sent_text=True)

        if xml_data != "":
            xmled = conllu2xml(morphed_and_enhanced, xml_data)
        else:
            xmled = ""

        return {"dep": morphed_and_enhanced, "xml": xmled}