from os.path import exists

def gen_counts(input_path, output_path):
    # Skip regeneration if the counts file already exists.
    if exists(output_path):
        return
    print('Generating counts from: "%s"' % input_path)
    counter = Hmm(3)
    counter.train(open(input_path, 'r'))
    counter.write_counts(open(output_path, 'w'))
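# A minimal usage sketch, assuming the ner_train.dat training file used by
# the other snippets; "ner.counts" is a hypothetical output name.
gen_counts('ner_train.dat', 'ner.counts')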
import math

def save_transition_probs(input_file):
    """
    Computes and stores trigrams and their respective transition
    probabilities from an input file containing the trigrams.
    """
    # read counts file
    counter = Hmm(3)
    counter.read_counts(open('ner_rare.counts'))
    out_lines_list = []
    l = input_file.readline()
    while l:
        line = l.strip()
        if line:  # nonempty line
            trigram = tuple(line.split())
            # get transition probability of trigram
            prob = compute_transition_prob(
                counter.ngram_counts[1][(trigram[0], trigram[1])],
                counter.ngram_counts[2][trigram])
            # get log probability
            log_prob = math.log(prob)
            out_lines_list.append(line + " " + str(log_prob))
        l = input_file.readline()
    out_lines = "\n".join(out_lines_list)
    # write trigrams and their log probs to file
    with open('5_1.txt', 'w') as out_file:
        out_file.write(out_lines)
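# compute_transition_prob is not defined in this snippet. Judging from the
# call site above, it most likely computes the maximum-likelihood trigram
# transition estimate; a sketch under that assumption:
def compute_transition_prob(bigram_count, trigram_count):
    # MLE transition estimate: q(s | u, v) = count(u, v, s) / count(u, v).
    return trigram_count / bigram_count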
def __init__(self, infile="ner_train.dat"): self.counter = Hmm(3) with open(infile) as f: self.counter.train(f) self.unigrams = {k[0]:v for k,v in self.counter.ngram_counts[0].iteritems()} #since the key is a one-word tuple self.bigrams = self.counter.ngram_counts[1] self.trigrams = self.counter.ngram_counts[2] self.words = [x[0] for x in self.counter.emission_counts.keys()]
import math

def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
    """
    Implements a baseline tagger that uses only the emission probabilities
    to assign tags, and writes the result to a file.
    """
    # get frequently occurring words
    word_count_dict = get_word_counts(open('ner_train.dat'))
    freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]
    # compute emission probs
    counter = Hmm(3)
    counter.read_counts(counts_file)
    emission_probs = compute_emission_probs(counter.emission_counts,
                                            counter.ngram_counts[0])
    out_lines_list = []
    l = dev_file.readline()
    while l:
        word = l.strip()
        if word:  # nonempty line
            # for rare or unseen words, use the emission probabilities of
            # rare_symbol to assign a tag and its probability
            if word not in freq_words:
                tag = sorted(emission_probs[rare_symbol],
                             key=emission_probs[rare_symbol].get,
                             reverse=True)[0]
                prob = emission_probs[rare_symbol][tag]
            # for frequently occurring words, use the emission
            # probabilities of the word itself
            else:
                tag = sorted(emission_probs[word],
                             key=emission_probs[word].get,
                             reverse=True)[0]
                prob = emission_probs[word][tag]
            log_prob = math.log(prob, 2)
            l = word + " " + tag + " " + str(log_prob)
        else:
            l = ""
        out_lines_list.append(l)
        l = dev_file.readline()
    out_lines = "\n".join(out_lines_list) + "\n"
    # write words, corresponding tags and log probs to file
    with open('4_2.txt', 'w') as out_file:
        out_file.write(out_lines)
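# get_word_counts and compute_emission_probs are used above but not shown.
# Sketches of plausible implementations, assuming one "word tag" pair per
# line in ner_train.dat and the counts structure from count_freqs:
from collections import defaultdict

def get_word_counts(corpus_file):
    # Count how often each word appears in the training corpus.
    counts = defaultdict(int)
    for line in corpus_file:
        fields = line.strip().split()
        if fields:
            counts[fields[0]] += 1
    return counts

def compute_emission_probs(emission_counts, tag_counts):
    # e(word | tag) = count(tag -> word) / count(tag), keyed as probs[word][tag].
    probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        probs[word][tag] = count / tag_counts[(tag,)]
    return probs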
if __name__ == "__main__": if len(sys.argv ) != 3: # Expect exactly one argument: the training data file usage() sys.exit(2) try: counts_file = file(sys.argv[1], "r") test_file = file(sys.argv[2], "r") except IOError: sys.stderr.write("ERROR: Cannot read inputfile %s.\n" % arg) sys.exit(1) # Initialize a trigram counter counter = Hmm(3) # Read in counts counter.read_counts(counts_file) # Iterate over all test sentences test_sent_iterator = sent_iterator(word_iterator(test_file)) for sentence in test_sent_iterator: # Viterbi Algorithm n = len(sentence) pad_sent = (2) * ["*"] pad_sent.extend(sentence) pad_sent.append("STOP") # Initialize
from collections import defaultdict
from count_freqs import Hmm
import math
import sys

def emission_probability(word, tag, emission_counts, ngram_counts):
    # e(word | tag) = count(tag -> word) / count(tag)
    return emission_counts[(word, tag)] / ngram_counts[0][(tag,)]

if __name__ == "__main__":
    counts_file = open(sys.argv[1])
    sentences_file = open(sys.argv[2])

    hmm = Hmm()
    hmm.read_counts(counts_file)

    emission_counts = hmm.emission_counts
    ngram_counts = hmm.ngram_counts
    entity_tags = hmm.all_states

    # total frequency of each word across all tags
    trained_words = defaultdict(int)
    infrequent_words = defaultdict(int)
    for word, tag in emission_counts:
        trained_words[word] += hmm.emission_counts[(word, tag)]
    # words seen fewer than 5 times are marked as infrequent
    for word in trained_words:
        if trained_words[word] < 5:
            infrequent_words[word] = 1
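# A plausible follow-up step (not in the original): rewrite the training
# corpus, replacing each infrequent word with the _RARE_ symbol used by the
# other snippets, so counts can be regenerated over the modified corpus;
# "ner_train_rare.dat" is an assumed output filename.
with open("ner_train.dat") as train_file, open("ner_train_rare.dat", "w") as out:
    for line in train_file:
        fields = line.strip().split()
        if fields and fields[0] in infrequent_words:
            out.write("_RARE_ " + " ".join(fields[1:]) + "\n")
        else:
            out.write(line)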
def __init__(self, counts_file="gene.counts"): self.hmm = Hmm() self.hmm.read_counts(counts_file)