Beispiel #1
0
    train_sentences = conll.read_sentences(train_file)
    formatted_corpus = [
        conll.split_rows(sentence, column_names)
        for sentence in train_sentences
    ]
    # print(formatted_corpus[0])

    counts = Counts(formatted_corpus, column_names, POS_key)
    counts.count_all()
    # counts.print_stats()
    counts.print_debug()
    # print(counts.sentences[0])
    # exit()
    if corpus == 'CoNLL2009':
        cm = ConfusionMatrix(formatted_corpus, POS_key)
        cm.compute_matrix()
        cm.print()
        print("Accuracy: ", cm.compute_accuracy())

    dev_sentences = conll.read_sentences(dev_file)
    formatted_dev_corpus = [
        conll.split_rows(sentence, column_names) for sentence in dev_sentences
    ]

    POS_distr_unk = counts.unk_word_POS_distr(formatted_dev_corpus)
    print('POS distribution of unknown words:', POS_distr_unk)

    hmm_prob = HMMProb(counts)
    if corpus == 'CoNLL2009':
        print("prob: ", hmm_prob.compute_tri("tabl", ("<s>", "DT"), "NN"))
    elif corpus == 'CoNLL-U':