def main():
    """Train the HMM taggers and print tagging accuracy on the chosen data set.

    Steps, in order:
      1. Count the words of the training sentences and finalize the counter.
      2. Feed every training sentence (after ``counter.classify_sentence``)
         to both the bigram and the trigram model, then finalize the
         trigram model.
      3. Decode every sentence of the selected parser with the trigram
         lattice and accumulate the number of correctly tagged words.

    A running accuracy is printed after each evaluated sentence and a
    summary for both models is printed at the end.  Only the trigram model
    is evaluated, so the bigram figures in the summary are always zero.
    """

    def accuracy(num_correct, num_words):
        # Guard the division: a data set with no parseable sentences, or a
        # first sentence made of only the two boundary tokens, leaves the
        # word total at zero and used to raise ZeroDivisionError.
        return num_correct / num_words if num_words else 0.0

    logging.info("Starting...")

    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    # Not used by default; kept so it can be selected as `parser` below.
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Skip the first and last entries (the START/STOP markers);
            # element 0 of each remaining entry is handed to the counter.
            # NOTE(review): assumes sentences are list-like and sliceable.
            for token in parsed_sentence[1:-1]:
                counter.add_word(token[0])

    # Finalize counter and separate high frequency from low frequency
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()

    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point, finalize the distributions.
    # The bigram model is currently disabled: it is trained above but never
    # finalized or evaluated.
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over data and try to predict
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            # Keep an untouched copy: classify_sentence is applied to
            # parsed_sentence below, and the scorer receives both versions.
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate best POS using viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(
                parsed_sentence, pos_list_trigram, tri_lattice,
                original_sentence, counter)

            # Remove the START and STOP chars
            total_words += (len(parsed_sentence) - 2)

            # Running accuracy over everything evaluated so far.
            print("Accuracy: %s" % accuracy(num_correct_trigram, total_words))
        else:
            print('ERROR! Couldnt parse sentence')

    print("Bigram HMM Accuracy: %s/%s - %s" % (
        num_correct_bigram, total_words,
        accuracy(num_correct_bigram, total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" % (
        num_correct_trigram, total_words,
        accuracy(num_correct_trigram, total_words)))
import re

from word_counter import WordCounter

# Pattern for splitting words: one or more digits, non-word characters,
# whitespace characters or underscores act as a separator.  Written as a raw
# string so the backslashes reach the regex engine without relying on
# Python's handling of invalid string escapes (a SyntaxWarning on 3.12+).
PATTERN = r"[\d\W\s_]+"

# Count every non-empty token found in README.md.
wc = WordCounter()
with open('README.md', 'r') as file:
    for line in file:
        for word in re.split(PATTERN, line.strip()):
            # re.split yields empty strings when a line starts or ends with
            # a separator; those are not words.
            if word:
                wc.add_word(word)

# display the word counts
for word in sorted(wc):
    print(f"{wc[word]:3} {word}")