Example 1
from os.path import exists
from count_freqs import Hmm

def gen_counts(input_path, output_path):
    # Skip the work if the counts file already exists.
    if exists(output_path):
        return

    print('Generating counts from: "%s"' % input_path)
    counter = Hmm(3)
    with open(input_path, 'r') as infile:
        counter.train(infile)
    with open(output_path, 'w') as outfile:
        counter.write_counts(outfile)
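A minimal usage sketch (the file names below are illustrative, not from the original):

# Illustrative usage; any training file in the expected format would do.
gen_counts('ner_train.dat', 'ner.counts')

# The generated counts can then be read back, as the later examples do.
counter = Hmm(3)
with open('ner.counts') as f:
    counter.read_counts(f)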
Example 2
import math

from count_freqs import Hmm


def save_transition_probs(input_file):
    """
    Computes and stores trigrams and their respective transition probabilities
    from an input file containing the trigrams.
    """
    # Read the counts file generated from the rare-word-replaced training data.
    counter = Hmm(3)
    with open('ner_rare.counts') as counts_file:
        counter.read_counts(counts_file)

    out_lines_list = []
    for l in input_file:
        line = l.strip()
        if line:  # Nonempty line
            trigram = tuple(line.split())
            # Transition probability of the trigram: count(u, v, s) / count(u, v).
            prob = compute_transition_prob(
                counter.ngram_counts[1][(trigram[0], trigram[1])],
                counter.ngram_counts[2][trigram])
            # Append the log probability to the trigram line.
            line = line + " " + str(math.log(prob))
        out_lines_list.append(line)
    out_lines = "\n".join(out_lines_list)

    # Write trigrams and their log probabilities to file.
    with open('5_1.txt', 'w') as out_file:
        out_file.write(out_lines)
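compute_transition_prob is not defined in this snippet. Given how it is called above, it is presumably the maximum-likelihood estimate q(s | u, v) = count(u, v, s) / count(u, v); a minimal sketch under that assumption:

def compute_transition_prob(bigram_count, trigram_count):
    # Maximum-likelihood estimate: q(s | u, v) = count(u, v, s) / count(u, v).
    return trigram_count / bigram_count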
Example 3
	def __init__(self, infile="ner_train.dat"):
		self.counter = Hmm(3)
		with open(infile) as f:
			self.counter.train(f)
		# ngram_counts[0] is keyed by one-tag tuples, so unwrap each key.
		self.unigrams = {k[0]: v for k, v in self.counter.ngram_counts[0].items()}
		self.bigrams = self.counter.ngram_counts[1]
		self.trigrams = self.counter.ngram_counts[2]
		self.words = [x[0] for x in self.counter.emission_counts.keys()]
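A brief usage sketch, assuming the __init__ above belongs to a counts-wrapper class (Counts is a hypothetical name for it):

counts = Counts('ner_train.dat')          # Counts is a placeholder class name
print(counts.unigrams.get('O', 0))        # how often the tag O occurs in training
print(counts.bigrams.get(('O', 'O'), 0))  # how often the tag bigram (O, O) occurs
print('the' in counts.words)              # whether a word was seen in training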
Example 4
import math

from count_freqs import Hmm


def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
	"""
	Implements a baseline tagger that uses only the emission probabilities to
	assign tags, and stores the result in a file.
	"""
	# Collect frequently occurring words (seen at least 5 times in training).
	with open('ner_train.dat') as train_file:
		word_count_dict = get_word_counts(train_file)
	freq_words = {word for word in word_count_dict if word_count_dict[word] >= 5}

	# Compute emission probabilities from the counts file.
	counter = Hmm(3)
	counter.read_counts(counts_file)
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	out_lines_list = []
	l = dev_file.readline()
	while l:
		word = l.strip()
		if word:  # Nonempty line
			# For rare or unseen words, use the emission probabilities of rare_symbol.
			if word not in freq_words:
				tag = sorted(emission_probs[rare_symbol], key=emission_probs[rare_symbol].get, reverse=True)[0]
				prob = emission_probs[rare_symbol][tag]
			# For frequently occurring words, use the emission probabilities of the word itself.
			else:
				tag = sorted(emission_probs[word], key=emission_probs[word].get, reverse=True)[0]
				prob = emission_probs[word][tag]
			log_prob = math.log(prob, 2)
			l = word + " " + tag + " " + str(log_prob)
		else:
			l = ""
		out_lines_list.append(l)
		l = dev_file.readline()
	out_lines = "\n".join(out_lines_list) + "\n"

	# Write words, their assigned tags, and log probabilities to file.
	with open('4_2.txt', 'w') as out_file:
		out_file.write(out_lines)
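get_word_counts and compute_emission_probs are external helpers. Judging from the indexing emission_probs[word][tag] above, compute_emission_probs plausibly builds maximum-likelihood estimates e(x | y) = count(y -> x) / count(y); a sketch under that assumption:

from collections import defaultdict

def compute_emission_probs(emission_counts, unigram_counts):
    # e(word | tag) = count(tag -> word) / count(tag), indexed as probs[word][tag].
    probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        probs[word][tag] = count / unigram_counts[(tag,)]
    return probs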
Example 5
if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and the test data file
        usage()
        sys.exit(2)

    try:
        counts_file = open(sys.argv[1], "r")
        test_file = open(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s or %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)

    # Read in counts
    counter.read_counts(counts_file)

    # Iterate over all test sentences
    test_sent_iterator = sent_iterator(word_iterator(test_file))
    for sentence in test_sent_iterator:
        # Viterbi Algorithm
        n = len(sentence)

        # Pad the sentence with two leading "*" symbols.
        pad_sent = 2 * ["*"]
        pad_sent.extend(sentence)
        pad_sent.append("STOP")

        # Initialize
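The snippet breaks off at the initialization step. For reference, a sketch of the standard trigram-HMM Viterbi initialization and recurrence over the sentence and n defined above; q and e (transition and emission probabilities) and all_tags (the tag set) are assumed helpers that do not appear in the original:

# pi[(k, u, v)]: probability of the best tag sequence ending in tags (u, v) at position k.
pi = {(0, "*", "*"): 1.0}
bp = {}

def tags_at(k):
    # Positions at or before 0 can only hold the padding symbol "*".
    return ["*"] if k <= 0 else all_tags

for k in range(1, n + 1):
    word = sentence[k - 1]
    for u in tags_at(k - 1):
        for v in tags_at(k):
            best_w, best_score = None, float("-inf")
            for w in tags_at(k - 2):
                # q(v, w, u): probability of tag v given the preceding tags (w, u).
                score = pi.get((k - 1, w, u), 0.0) * q(v, w, u) * e(word, v)
                if score > best_score:
                    best_w, best_score = w, score
            pi[(k, u, v)] = best_score
            bp[(k, u, v)] = best_w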
Example 6
from collections import defaultdict
from count_freqs import Hmm
import math
import sys


def emission_probability(word, tag, emission_counts, ngram_counts):
    # Maximum-likelihood estimate: e(word | tag) = count(tag -> word) / count(tag).
    return emission_counts[(word, tag)] / ngram_counts[0][(tag, )]


if __name__ == "__main__":
    counts_file = open(sys.argv[1])
    sentences_file = open(sys.argv[2])

    hmm = Hmm()
    hmm.read_counts(counts_file)

    emission_counts = hmm.emission_counts
    ngram_counts = hmm.ngram_counts

    entity_tags = hmm.all_states
    trained_words = defaultdict(int)
    infrequent_words = defaultdict(int)

    # Total count of each word across all tags it was emitted with.
    for word, tag in emission_counts:
        trained_words[word] += hmm.emission_counts[(word, tag)]

    # Words seen fewer than 5 times in training are marked as infrequent.
    for word in trained_words:
        if trained_words[word] < 5:
            infrequent_words[word] = 1
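The example ends after collecting infrequent_words. A standalone sketch of how these tables could be used, following the _RARE_ convention of the other examples (not the original code):

def best_tag(word):
    # Rare and unseen words are scored through the _RARE_ pseudo-word.
    lookup = word if word in trained_words and word not in infrequent_words else "_RARE_"
    return max(entity_tags,
               key=lambda tag: emission_probability(lookup, tag, emission_counts, ngram_counts))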
Example 7
    def __init__(self, counts_file="gene.counts"):
        self.hmm = Hmm()
        # read_counts expects an open file object, not a path string.
        with open(counts_file) as f:
            self.hmm.read_counts(f)
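A usage sketch, assuming the __init__ above sits inside a tagger class (GeneTagger is a hypothetical name):

tagger = GeneTagger()         # GeneTagger is a placeholder for the enclosing class
print(tagger.hmm.all_states)  # tag set recovered from gene.counts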