コード例 #1
0
        features.update(base_features)
        features.update(prefix_suffix_features)

        # return list(features.values())
        return features
 
 
if __name__ == "__main__":
    # Script entry point: evaluate a CRF chunker and a regexp-grammar chunker
    # on the CoNLL-formatted corpus whose path is the first CLI argument.
    file_path = sys.argv[1]

    # Read the corpus exactly once inside a context manager so the file handle
    # is closed deterministically. Sentences are separated by blank lines; the
    # raw sentence strings are reused by both evaluations below.
    with open(file_path) as corpus_file:
        raw_sents = corpus_file.read().strip().split("\n\n")

    # Parse each sentence into a chunk tree, then flatten it to CoNLL tag
    # tuples for the CRF chunker.
    chunked_sents = [tree2conlltags(chunk.conllstr2tree(s)) for s in raw_sents]

    random.shuffle(chunked_sents)

    # NOTE(review): the training slice is disabled, so the chunker receives an
    # empty training set — presumably it falls back to the existing model in
    # `model_file`; confirm against CRFChunkParser2.
    train_sents = []  # chunked_sents[:int(len(chunked_sents) * 0.7)]
    # Held-out tail of the shuffled data. NOTE(review): the `+ 1` starts the
    # test set one index past the 70% mark — confirm this gap is intended.
    test_sents = chunked_sents[int(len(chunked_sents) * 0.7 + 1):]

    ### CRF Chunker
    chunker = CRFChunkParser2(chunked_sents=train_sents, model_file="russian_chunker.crf")
    # The evaluation is fed trees, so convert the tag tuples back first.
    print(chunker.evaluate([conlltags2tree(s) for s in test_sents]))

    ### Grammar chunker
    # Re-parse the same raw sentences, keeping only NP chunks, and evaluate the
    # grammar chunker on the full (unshuffled) corpus.
    chunked_sents = [chunk.conllstr2tree(s, chunk_types=('NP',)) for s in raw_sents]
    from nltk import RegexpParser
    grammar = r"""
        NP:
            {<S.*|A.*>*<S.*>}  # Nouns and Adjectives, terminated with Nouns
        """
    chunker = RegexpParser(grammar)
    print(chunker.evaluate(chunked_sents))