features.update(base_features) features.update(prefix_suffix_features) # return list(features.values()) return features if __name__ == "__main__": file_path = sys.argv[1] chunked_sents = [tree2conlltags(chunk.conllstr2tree(s)) for s in open(file_path).read().strip().split("\n\n")] random.shuffle(chunked_sents) train_sents = []#chunked_sents[:int(len(chunked_sents) * 0.7)] test_sents = chunked_sents[int(len(chunked_sents) * 0.7 + 1):] ### CRF Chunker chunker = CRFChunkParser2(chunked_sents=train_sents, model_file="russian_chunker.crf") print(chunker.evaluate([conlltags2tree(s) for s in test_sents])) ### Grammar chunker chunked_sents = [chunk.conllstr2tree(s, chunk_types=('NP',)) for s in open(file_path).read().strip().split("\n\n")] from nltk import RegexpParser grammar = r""" NP: {<S.*|A.*>*<S.*>} # Nouns and Adjectives, terminated with Nouns """ chunker = RegexpParser(grammar) print(chunker.evaluate(chunked_sents))