Example #1
0
File: main.py Project: kail/csep517
def main():
    """Train the HMM taggers and print tagging accuracy for one data set.

    Steps:
      1. Count the words of the training sentences, skipping the first and
         last token of each sentence (the START/STOP markers).
      2. Finalize the counter, rewrite each training sentence with
         ``counter.classify_sentence`` and feed it to both HMMs.
      3. Finalize the trigram model and decode every sentence of the selected
         parser with a ``TrigramLattice``, printing the running accuracy
         after each sentence and a summary at the end.

    Accuracy is reported as 0.0 while no words have been scored, so an empty
    data set (or a leading sentence with only START/STOP) no longer raises
    ``ZeroDivisionError``.
    """
    logging.info("Starting...")

    def _accuracy(correct, total):
        """Return correct / total, or 0.0 when nothing has been scored."""
        return correct / total if total else 0.0

    # NOTE(review): these are absolute, machine-specific paths; adjust them
    # (or make them configurable) before running elsewhere.
    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if not parsed_sentence:
            continue
        # Skip the first and last entries (START and STOP markers).
        for token in parsed_sentence[1:-1]:
            counter.add_word(token[0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()

    for parsed_sentence in training_parser.get_tokenized_sentences():
        if not parsed_sentence:
            continue
        # Convert the low frequency words to classes
        counter.classify_sentence(parsed_sentence)

        bigram.add_sentence(parsed_sentence)
        trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions.
    # Only the trigram model is finalized and evaluated; the bigram model is
    # trained above but its decoding path is currently disabled.
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over data and try to predict
    num_correct_bigram = 0  # stays 0 while bigram decoding is disabled
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if not parsed_sentence:
            print('ERROR! Couldnt parse sentence')
            continue

        # Keep an untouched copy: classify_sentence is applied to
        # parsed_sentence below and the scorer receives both versions.
        original_sentence = copy.deepcopy(parsed_sentence)

        # Convert the low frequency words to classes
        counter.classify_sentence(parsed_sentence)

        # Trigram lattice; get_pos() yields the predicted tag sequence
        tri_lattice = TrigramLattice(trigram, parsed_sentence)
        pos_list_trigram = tri_lattice.get_pos()

        # Determine how many were correct
        num_correct_trigram += get_num_correct(parsed_sentence,
                                               pos_list_trigram,
                                               tri_lattice,
                                               original_sentence, counter)

        # Remove the START and STOP chars
        total_words += (len(parsed_sentence) - 2)

        print("Accuracy: %s" % _accuracy(num_correct_trigram, total_words))

    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words,
           _accuracy(num_correct_bigram, total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words,
           _accuracy(num_correct_trigram, total_words)))
Example #2
0
import re
from word_counter import WordCounter

# Pattern for splitting words: any run of digits, non-word characters,
# whitespace or underscores is treated as a separator.  Written as a raw
# string so the backslashes reach the regex engine literally instead of
# relying on Python passing unknown string escapes through (which triggers
# a warning on newer interpreters).
PATTERN = r"[\d\W\s_]+"

wc = WordCounter()

with open('README.md', 'r', encoding='utf-8') as readme:
    for line in readme:
        for word in re.split(PATTERN, line.strip()):
            # re.split yields empty strings when the line starts or ends
            # with a separator; skip those.
            if word:
                wc.add_word(word)

# display the word counts
for word in sorted(wc):
    print(f"{wc[word]:3}  {word}")