コード例 #1
0
ファイル: midas_old.py プロジェクト: mikekestemont/Midas
def _lowercase_gold_tokens(test_items):
    """Return the lowercased surface token of every gold item, as a list.

    The bare string "<utt>" is passed through unchanged; for every other
    item the first element (``item[0]``) is taken and lowercased.
    """
    return [item if item == "<utt>" else item[0].lower()
            for item in test_items]


def tag(test_items=None,
        WS="",
        tokenize=False,
        tokenizer_context=0,
        left_context=1,
        right_context=1,
        gazetteer=None,
        min_tok_freq=25,
        min_lem_freq=25,
        mode=""):
    """
    Tags a list of (potentially annotated) test tokens.

    Parameters:
        test_items: the items to tag. In "tag" mode they are handed to the
            tokenizer or tagger as-is; in "test"/"crossval" mode each item
            is either the string "<utt>" or a sequence whose first element
            is the token string. Defaults to an empty list.
        WS: forwarded to both ``Tokenizer`` and ``MaxentTagger``
            (presumably the workspace location -- TODO confirm).
        tokenize: if true, a ``Tokenizer`` is applied ("tag" mode) or
            evaluated ("test"/"crossval" mode) before tagging; otherwise
            the input is assumed to be tokenized already.
        tokenizer_context: forwarded to ``Tokenizer`` as ``context``.
        left_context, right_context, min_tok_freq, min_lem_freq:
            forwarded unchanged to ``MaxentTagger``.
        gazetteer: forwarded to the tokenizer and to the tagger.
        mode: "tag", "test" or "crossval". Any other value leaves the token
            list empty, so the tagger is run on no tokens.

    Returns:
        In "test"/"crossval" mode, the result of
        ``MaxentTagger.evaluate_tags_and_lemmas`` (extended with the
        tokenizer accuracy and F1 when ``tokenize`` is true). In every other
        mode, the output of ``MaxentTagger.tag``.
    """
    # Avoid a shared mutable default argument.
    if test_items is None:
        test_items = []
    evaluating = mode in ("test", "crossval")
    tokenized_tokens = []
    token_acc, token_f1 = None, None
    if tokenize:
        # load and apply a tokenizer:
        tokenizer = Tokenizer(context=tokenizer_context, WS=WS)
        if mode == "tag":
            tokenized_tokens = tokenizer.tokenize(test_items=test_items,
                                                  gazetteer=gazetteer)
        elif evaluating:
            items = _lowercase_gold_tokens(test_items)
            token_acc, token_f1 = tokenizer.eval_tokenizer(test_items=items,
                                                           gazetteer=gazetteer)
            # return the original tokens since we only tokenize for
            # evaluation purposes:
            tokenized_tokens = items
    elif mode == "tag":
        # assume the input has been properly tokenized already:
        tokenized_tokens = test_items
    elif evaluating:
        # "crossval" is evaluated below exactly like "test", so it needs the
        # same gold tokens here; "<utt>" markers are kept intact, matching
        # the tokenizer branch above.
        tokenized_tokens = tuple(_lowercase_gold_tokens(test_items))
    sequential_tagger = MaxentTagger(WS=WS,
                                     left_context=left_context,
                                     right_context=right_context,
                                     min_tok_freq=min_tok_freq,
                                     min_lem_freq=min_lem_freq)
    sequential_tagger.load_models()
    tagged_items = sequential_tagger.tag(tokenized=tokenized_tokens,
                                         gazetteer=gazetteer)
    if not evaluating:
        return tagged_items
    results = sequential_tagger.evaluate_tags_and_lemmas(
        gold_items=test_items,
        silver_items=tagged_items)
    if tokenize:
        # append the tokenizer scores after the tagging/lemmatization scores
        results.extend((token_acc, token_f1))
    return results