import copy
import logging

# Project-local classes (InputParser, WordCounter, BigramHMM, TrigramHMM,
# Lattice, TrigramLattice) and the get_num_correct helper are assumed to be
# imported from their respective modules.


def main():
    logging.info("Starting...")
    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.train.json')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.dev.json')
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/hmm/data/twt.test.json')

    # First, count the words!
    counter = WordCounter()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            for i in range(1, len(parsed_sentence) - 1):
                counter.add_word(parsed_sentence[i][0])

    # Finalize the counter and separate high-frequency from low-frequency words
    counter.finalize()

    # Initialize the models
    bigram = BigramHMM()
    trigram = TrigramHMM()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            # Convert the low-frequency words to classes
            counter.classify_sentence(parsed_sentence)
            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    # Models have been initialized at this point; finalize the distributions
    #bigram.finalize()
    trigram.finalize()

    # PICK THE PARSER HERE
    parser = dev_parser

    # Iterate over the data and try to predict POS tags
    num_correct_bigram = 0
    num_correct_trigram = 0
    total_words = 0
    for parsed_sentence in parser.get_tokenized_sentences():
        if parsed_sentence:
            original_sentence = copy.deepcopy(parsed_sentence)

            # Convert the low-frequency words to classes
            counter.classify_sentence(parsed_sentence)

            # Bigram lattice
            #lattice = Lattice(bigram, parsed_sentence)

            # Trigram lattice
            tri_lattice = TrigramLattice(trigram, parsed_sentence)

            # Calculate the best POS sequence using Viterbi
            #pos_list_bigram = lattice.get_pos()
            pos_list_trigram = tri_lattice.get_pos()

            # Determine how many tags were correct
            #num_correct_bigram += get_num_correct(parsed_sentence, pos_list_bigram, lattice)
            num_correct_trigram += get_num_correct(parsed_sentence, pos_list_trigram,
                                                   tri_lattice, original_sentence, counter)

            # Exclude the START and STOP tokens from the word count
            total_words += (len(parsed_sentence) - 2)
            print("Accuracy: %s" % (num_correct_trigram / total_words))
        else:
            print("ERROR! Couldn't parse sentence")

    # Note: the bigram path is commented out above, so num_correct_bigram stays 0
    print("Bigram HMM Accuracy: %s/%s - %s" %
          (num_correct_bigram, total_words, (num_correct_bigram / total_words)))
    print("Trigram HMM Accuracy: %s/%s - %s" %
          (num_correct_trigram, total_words, (num_correct_trigram / total_words)))
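# The accuracy loop above relies on a get_num_correct helper that is defined
# elsewhere in the repo. The sketch below is a rough, hypothetical illustration
# of the core comparison such a helper might perform, assuming each token in a
# parsed sentence is a (word, tag) pair bracketed by START and STOP sentinels
# and that the predicted tag list is index-aligned with the sentence. The real
# helper also receives the lattice, the original sentence, and the counter,
# presumably to resolve UNK-classified words, which this sketch ignores.
def get_num_correct_sketch(parsed_sentence, predicted_tags):
    """Hypothetical: count positions where the predicted POS tag matches
    the gold tag, skipping the START and STOP sentinels."""
    correct = 0
    for i in range(1, len(parsed_sentence) - 1):
        gold_tag = parsed_sentence[i][1]
        if predicted_tags[i] == gold_tag:
            correct += 1
    return correct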
import logging
import math

# Project-local classes (InputParser, UnigramModel, BigramModel, TrigramModel,
# LinearInterpolator) are assumed to be imported from their respective modules.


def main():
    logging.info("Starting...")
    training_parser = InputParser(
        '/Users/skobovm/repos/csep517/language_models/data/prob1_brown_full/brown.train.txt')
    dev_parser = InputParser(
        '/Users/skobovm/repos/csep517/language_models/data/prob1_brown_full/brown.dev.txt')
    test_parser = InputParser(
        '/Users/skobovm/repos/csep517/language_models/data/prob1_brown_full/brown.test.txt')

    # Train the unigram model first; the bigram and trigram models build on it
    unigram = UnigramModel()
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            unigram.add_sentence(parsed_sentence)

    # Normalize the model
    unigram.normalize_model()
    unigram.calculate_probabilities()

    bigram = BigramModel(unigram)
    trigram = TrigramModel(unigram)
    for parsed_sentence in training_parser.get_tokenized_sentences():
        if parsed_sentence:
            bigram.add_sentence(parsed_sentence)
            trigram.add_sentence(parsed_sentence)

    bigram.calculate_probabilities()
    trigram.calculate_probabilities()

    # Set up the appropriate input parser
    parser = test_parser

    k_vals = [.0026]  # [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    for k in k_vals:
        # Get probabilities for each sentence under add-k smoothing
        unigram_probabilities = []
        bigram_probabilities = []
        trigram_probabilities = []
        total_corpus = 0
        num_bigrams_dropped = 0
        num_trigrams_dropped = 0
        total_sentences = 0
        for parsed_sentence in parser.get_tokenized_sentences():
            if parsed_sentence:
                total_sentences += 1

                # Subtract 1 to account for START
                total_corpus += len(parsed_sentence) - 1

                unigram_probability = unigram.get_probability(parsed_sentence)
                bigram_probability = bigram.get_probability(parsed_sentence, k_num=k)
                trigram_probability = trigram.get_probability(parsed_sentence, k_num=k)

                if unigram_probability == -float('inf'):
                    # This should NOT happen with UNKs...
                    print('dropping 0 probability')
                else:
                    unigram_probabilities.append(unigram_probability)

                if bigram_probability == -float('inf'):
                    num_bigrams_dropped += 1
                else:
                    bigram_probabilities.append(bigram_probability)

                if trigram_probability == -float('inf'):
                    num_trigrams_dropped += 1
                else:
                    trigram_probabilities.append(trigram_probability)

        # Calculate perplexities
        word_count = total_corpus
        print('K: %s' % k)

        # unigram
        unigram_prob_sum = sum(unigram_probabilities)
        unigram_l = unigram_prob_sum / word_count
        unigram_perplexity = math.pow(2, -unigram_l)
        print('unigram perplexity: %s' % unigram_perplexity)

        # bigram
        bigram_prob_sum = sum(bigram_probabilities)
        bigram_l = bigram_prob_sum / word_count
        bigram_perplexity = math.pow(2, -bigram_l)
        print('bigram perplexity: %s' % bigram_perplexity)

        # trigram
        trigram_prob_sum = sum(trigram_probabilities)
        trigram_l = trigram_prob_sum / word_count
        trigram_perplexity = math.pow(2, -trigram_l)
        print('trigram perplexity: %s' % trigram_perplexity)

    # lambdas = [
    #     (1/3, 1/3, 1/3),  # Even
    #     (.7, .15, .15),   # Trigram-heavy
    #     (.15, .7, .15),   # Bigram-heavy
    #     (.15, .15, .7),   # Unigram-heavy
    #     (.6, .3, .1)      # tri > bi > uni
    # ]
    lambdas = [
        (.1, .55, .35),
        (.05, .6, .35),
        (.1, .6, .3)
    ]
    lirp = LinearInterpolator(unigram, bigram, trigram)
    for lambda_set in lambdas:
        probabilities = []
        total_corpus = 0
        for parsed_sentence in parser.get_tokenized_sentences():
            if parsed_sentence:
                # Subtract 1 to account for START
                total_corpus += len(parsed_sentence) - 1
                probabilities.append(
                    lirp.get_probability(parsed_sentence,
                                         l1=lambda_set[0],
                                         l2=lambda_set[1],
                                         l3=lambda_set[2]))

        print('l1: %s, l2: %s, l3: %s' % lambda_set)

        # Interpolated model perplexity (same calculation as above)
        interp_prob_sum = sum(probabilities)
        interp_l = interp_prob_sum / total_corpus
        interp_perplexity = math.pow(2, -interp_l)
        print('perplexity: %s' % interp_perplexity)
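# Every perplexity computation above follows the same pattern, so it may help
# to see it isolated. The sketch below restates that calculation as a
# standalone function, assuming (as the sums above imply) that get_probability
# returns a base-2 log probability per sentence and that word_count counts
# tokens excluding START: PP = 2 ** -((1/M) * sum_i log2 p(s_i)).
import math


def perplexity(log2_probabilities, word_count):
    """Perplexity from per-sentence base-2 log probabilities over
    word_count total tokens."""
    avg_log_likelihood = sum(log2_probabilities) / word_count
    return math.pow(2, -avg_log_likelihood)


# Example: two sentences with log2 probabilities -12.0 and -18.0 over
# 10 tokens give an average of -3.0 bits/token, i.e. perplexity 2**3 = 8.
print(perplexity([-12.0, -18.0], 10))  # 8.0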