Example #1
class Tagger:

    def __init__(self, common_file, counts_file):
        self.common_words = get_common_words(common_file)
        
        self.hmm = Hmm(3)
        self.hmm.read_counts(counts_file)
Example #2
def replace_rare(raw_data_file, raw_count_file, output_file, rare_counts=5):
    # read in the raw counts from hmm
    fp = open(raw_count_file, 'r')
    hmm = Hmm(3)
    hmm.read_counts(fp)
    fp.close()

    # accumulate the word counts from emission_counts
    word_count = defaultdict(int)
    for word_tag in hmm.emission_counts:
        word_count[word_tag[0]] += hmm.emission_counts[word_tag]
    rare_words = set([word for word in word_count if word_count[word] < rare_counts])
    #print rare_words

    # replace rare words with _RARE_
    input = open(raw_data_file, 'r')
    output = open(output_file, 'w')
    for line in input:
        line = line.strip()
        if line:
            word, tag = line.split(" ")
            if word in rare_words:
                word_class = get_word_class(word)
                output.write(" ".join([word_class, tag]))
                #output.write(" ".join(['_RARE_', tag]))
            else:
                output.write(line)
        output.write("\n")
    input.close()
    output.close()
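
Example #2 calls a get_word_class helper that the snippet does not define. A minimal sketch of what it might look like, assuming the same four pseudo-word categories that Example #10 later writes out (_UPPER_, _DIGIT_, _NOTALPHA_, _RARE_):

def get_word_class(word):
    # hypothetical helper: map an infrequent word to an informative pseudo-word
    if word.isupper():
        return "_UPPER_"      # all-capital tokens, e.g. acronyms
    elif word.isdigit():
        return "_DIGIT_"      # purely numeric tokens
    elif not word.isalpha():
        return "_NOTALPHA_"   # tokens containing punctuation or digits
    else:
        return "_RARE_"       # any other infrequent token
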
Example #3
def save_transition_probs(input_file):
    """
	Computes and stores trigrams and their respective transition probabilities from an input file containing the trigrams
	"""

    # read counts file
    counter = Hmm(3)
    counter.read_counts(file('ner_rare.counts'))

    out_lines_list = []
    l = input_file.readline()
    while l:
        line = l.strip()
        if line:  # Nonempty line
            trigram = tuple(line.split())
            # get transition probability of trigram
            prob = compute_transition_prob(
                counter.ngram_counts[1][(trigram[0], trigram[1])],
                counter.ngram_counts[2][trigram])
            # get log probability
            log_prob = math.log(prob)
            l = line + " " + str(log_prob)

        out_lines_list.append(l)
        l = input_file.readline()
    out_lines = "\n".join(out_lines_list)

    # write trigrams and their log probs to file
    with open('5_1.txt', 'w') as out_file:
        out_file.write(out_lines)
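
Example #3 assumes a compute_transition_prob helper. A minimal sketch of the usual maximum-likelihood estimate, assuming the arguments arrive in the order used at the call site (bigram count first, trigram count second):

def compute_transition_prob(bigram_count, trigram_count):
    # hypothetical helper: q(w | u, v) = Count(u, v, w) / Count(u, v)
    return trigram_count / float(bigram_count)
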
Example #4
class GeneTagger(object):
    """Wraps count_freqs.Hmm; a wrapper class also named Hmm would shadow
    the import and recurse infinitely in __init__."""

    def __init__(self, counts_file="gene.counts"):
        self.hmm = Hmm(3)
        # read_counts expects a file object, not a path
        with open(counts_file) as fp:
            self.hmm.read_counts(fp)

    def emission(self, x, y):
        pass
Example #5
def problem4(count_file, dev_file):
    """Implement a simple named entity tagger and output predictions."""

    try:
        infile = file(count_file, "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % count_file)
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read counts
    counter.read_counts(infile)
    # Write the predictions
    counter.write_predicts(dev_file, sys.stdout)
Example #6
def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
	"""
	Implements a baseline tagger that uses only the emission probabilities to assign tags and stores in a file.
	"""

	# get frequently occurring words
	word_count_dict = get_word_counts(file('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# compute emission probs
	counter = Hmm(3)
	counter.read_counts(counts_file)
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	out_lines_list = []
	l = dev_file.readline()
	while l:
		word = l.strip()
		if word:  # Nonempty line
			# use emission probabilities of rare_symbol to assign tag and its probability for rare or unseen words.
			if word not in freq_words:
				tag = sorted(emission_probs[rare_symbol], key=emission_probs[rare_symbol].get, reverse=True)[0]
				prob = emission_probs[rare_symbol][tag]

			# use emission probabilities of the word itself for frequently occurring words.
			else:
				tag = sorted(emission_probs[word], key=emission_probs[word].get, reverse=True)[0]
				prob = emission_probs[word][tag]
			log_prob = math.log(prob, 2)
			l = word + " " + tag + " " + str(log_prob)
		else:
			l = ""
		out_lines_list.append(l)
		l = dev_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_lines = out_lines + "\n"

	# write words, corresponding tags and log probs to file
	with open('4_2.txt','w') as out_file:
		out_file.write(out_lines)
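
Example #6 assumes a compute_emission_probs helper. A minimal sketch, assuming counter.ngram_counts[0] holds unigram tag counts keyed by 1-tuples (as Example #18 suggests with its ngram_counts[0][(tag,)] lookup) and that the result is a nested {word: {tag: prob}} mapping to match the lookups above:

from collections import defaultdict

def compute_emission_probs(emission_counts, unigram_counts):
    # hypothetical helper: e(x | y) = Count(y -> x) / Count(y)
    emission_probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        emission_probs[word][tag] = count / float(unigram_counts[(tag,)])
    return emission_probs
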
Example #7
import sys
from collections import defaultdict
from count_freqs import Hmm

def p2_1emission(word, tag, hmm, countTag):

    #print "p2_1 " + word + " " + tag + " %i" %hmm.emission_counts[(word,tag)]
    if (word,tag) in hmm.emission_counts:
        return hmm.emission_counts[(word,tag)]/countTag[tag]
    else:
        return 0

if __name__ == "__main__":
    input = file(sys.argv[1],"r")
    model = Hmm(3)
    #print len(model.emission_counts)
    model.read_counts(input)
    #print len(model.emission_counts)
    #if ("BACKGROUND","O") in model.emission_counts:
        #print "yes"
    #print model.all_states 
    testFile = file(sys.argv[2],"r")

    tagsNum = len(model.all_states)
    countTag = dict.fromkeys(model.all_states,0)
    #print countTag

    for (word,tag) in model.emission_counts:
        countTag[tag] += model.emission_counts[(word,tag)]
    #print countTag
    for line in testFile:
        word = line.strip()
Example #8
#! /usr/bin/python
#
__author__="Xiaochen Wei <*****@*****.**>"
__date__ ="$Sep 20, 2014"

from dataClean import *
from count_freqs import Hmm
import math

# the file of train data
trainingDataFilePath = "ner.counts"
hmm = Hmm(3)
inputFile = open(trainingDataFilePath, "r")
hmm.read_counts(inputFile)

class SimpleNamedEntityTagger:


	'''
	Get the emission parameter.

	INPUT: the target word and the tag (targetType) of the target word
	RETURN: the emission parameter of the target for the given targetType
	'''
	def GetEmissionParameters(self, target, targetType):

		sumCount = 0
		count = 0
		
		if target not in [key[0] for key in hmm.emission_counts.keys()]:
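
The snippet above is cut off mid-method. Based on its docstring, a minimal sketch of the quantity it describes, the emission parameter e(target | targetType) = Count(targetType -> target) / Count(targetType), computed directly from the module-level hmm counts:

def get_emission_parameter(target, target_type):
    # hypothetical standalone version of GetEmissionParameters
    count = hmm.emission_counts.get((target, target_type), 0)
    # total count of the tag, summed over every word it emits
    sum_count = sum(c for (word, tag), c in hmm.emission_counts.items()
                    if tag == target_type)
    return count / float(sum_count) if sum_count else 0.0
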
Example #9
import sys
from collections import defaultdict
import math
from count_freqs import Hmm

"""
Implement the Viterbi algorithm to compute
argmax (y1...yn) p(x1...xn, y1...yn)
Your tagger should have the same basic functionality as the baseline tagger.
Instead of emission probabilities the third column should contain the log-probability
of the tagged sequence up to this word.
"""

if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts file and dev file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    counter = Hmm(3)
    # Read counts
    counter.read_counts(counts_file)

    counter.viterbi_read(sys.argv[2])
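
Example #9 delegates decoding to a viterbi_read method that is not shown here. For reference, a minimal sketch of the trigram Viterbi recurrence the docstring describes; transition(w, u, v) (for q(v | w, u)) and emission(word, tag) (for e(word | tag)) are hypothetical helpers assumed to return probabilities:

import math

def viterbi(sentence, tags, transition, emission):
    # pi[(k, u, v)]: best log-probability of any tag sequence whose last two
    # tags are u (position k-1) and v (position k); bp stores the argmax tag
    n = len(sentence)
    pi = {(0, "*", "*"): 0.0}
    bp = {}

    def K(k):
        return ["*"] if k <= 0 else tags

    for k in range(1, n + 1):
        for u in K(k - 1):
            for v in K(k):
                best, best_w = None, None
                for w in K(k - 2):
                    if (k - 1, w, u) not in pi:
                        continue
                    p = transition(w, u, v) * emission(sentence[k - 1], v)
                    if p > 0:
                        score = pi[(k - 1, w, u)] + math.log(p)
                        if best is None or score > best:
                            best, best_w = score, w
                if best is not None:
                    pi[(k, u, v)] = best
                    bp[(k, u, v)] = best_w

    # close the sequence with the STOP transition and pick the best last pair
    best, last = None, (None, None)
    for u in K(n - 1):
        for v in K(n):
            if (n, u, v) in pi and transition(u, v, "STOP") > 0:
                score = pi[(n, u, v)] + math.log(transition(u, v, "STOP"))
                if best is None or score > best:
                    best, last = score, (u, v)

    # backtrack: y_k = bp[(k + 2, y_{k+1}, y_{k+2})]
    y = [None] * n
    if n >= 1:
        y[n - 1] = last[1]
    if n >= 2:
        y[n - 2] = last[0]
    for k in range(n - 2, 0, -1):
        y[k - 1] = bp[(k + 2, y[k], y[k + 1])]
    return y
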
Example #10
            if word in infreq_words:
                if word.isupper():
                    f2.write("_UPPER_" + " " + parts[1] + "\n")
                elif word.isdigit():
                    f2.write("_DIGIT_" + " " + parts[1] + "\n")
                elif not word.isalpha():
                    f2.write("_NOTALPHA_" + " " + parts[1] + "\n")
                else:
                    f2.write("_RARE_" + " " + parts[1] + "\n")
            else:
                f2.write(line)
        f2.close()


def usage():
    print """
    python add_class.py [count_file] [training_data] 
    """

if __name__ == "__main__":

    if len(sys.argv)!=3: # Expects two arguments: original count file and training data file
        usage()
        sys.exit(2)

    counter = Hmm(3)
    # finds count information for words in file
    (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1])
    #produces new file with _RARE_
    replace_class(sys.argv[2], infreq_word_set)      
Example #11
			word = " ".join(fields[:-1])

			# replace word with its category if frequency < count_thresh
			if word_count_dict[word] < count_thresh:
				line = " ".join([get_category(word), fields[-1]])
		out_lines_list.append(line)
		l = in_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_file.write(out_lines)

if __name__ == "__main__":
	# replace infrequent words with categories and write to file
	replace_infrequent_words_with_categories(file('ner_train.dat'), file('ner_train_cats.dat', 'w'))

	# generate counts file
	os.system('python count_freqs.py ner_train_cats.dat > ner_cats.counts')

	# get frequent words
	word_count_dict = get_word_counts(file('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# get transition and emission probabilities
	counter = Hmm(3)
	counter.read_counts(file('ner_cats.counts'))
	transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2])
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	# store tagged data with the log probs to file
	tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words)

	os.system('python eval_ne_tagger.py ner_dev.key 6.txt')
Example #12
    with open(file, "r") as f:
        f2 = open("ner_train_rare.dat", "w")
        for line in f:
            parts = line.strip().split(" ")
            word = parts[0]
            if word in infreq_words:
                f2.write("_RARE_" + " " + parts[1] + "\n")
            else:
                f2.write(line)
        f2.close()


def usage():
    print """
    python add_rare.py [count_file] [training_data] 
    """

if __name__ == "__main__":

    if len(sys.argv)!=3: # Expects two arguments: original count file and training data file
        usage()
        sys.exit(2)

    counter = Hmm(3)
    # finds count information for words in file
    (em_count, ngram_count, infreq_word_set, all_tags, all_words) = counter.read_counts(sys.argv[1])

    #produces new file with _RARE_
    replace_rare(sys.argv[2], infreq_word_set)      

Example #13
        sys.stdout.write(line)


if __name__ == "__main__":

    if len(sys.argv) != 3:  # Expect exactly two arguments: the counts and corresponding training data file
        usage()
        sys.exit(2)

    try:
        input = file(sys.argv[1], "r")
        output = sys.argv[2]
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    # Read in counts
    counter.read_counts(input)

    # Filter words with count < 5
    low_words = dict(
        (k, v) for k, v in counter.word_counts.iteritems() if v < 5)
    high_words = dict(
        (k, v) for k, v in counter.word_counts.iteritems() if v >= 5)

    # Replace each instance of word in low_words with _RARE_ in training set
    replace_all(output, low_words, '_RARE_')
Example #14
            word = line[:-1]
            if len(word) != 0:
                if word in keys:
                    tag = viterbi(d[0], d[1], word)
                else:
                    tag = viterbi(d[0], d[1], classify(word))
                output.write("%s %s\n" % (word, tag))
                d.append(tag)
            else:
                output.write("\n")
                d = deque(["*", "*"], maxlen=2)


if __name__ == "__main__":
    counter = Hmm(3)
    counter.read_counts(file("outputs/p3_count.txt", "r"))
    bigram_counts = counter.ngram_counts[1]
    trigram_counts = counter.ngram_counts[2]

    keys = set()
    for k in counter.emission_counts.keys():
        keys.add(k[0])

    # FOR THE DEVELOPMENT FILE
    write_tags("data/gene.dev", keys, file("outputs/gene_dev.p3.out", "w"))
    """
	TO EVALUATE, RUN:
		>>> python eval_gene_tagger.py data/gene.key outputs/gene_dev.p3.out
	AND THE OUTPUT WILL BE:
		Found 404 GENEs. Expected 642 GENEs; Correct: 214.
Example #15
#!/usr/bin/python

import sys
from count_freqs import Hmm

countInput = file(sys.argv[1],"r")
hmm = Hmm(3)
hmm.read_counts(countInput)

for tag in hmm.all_states:
    hmm.emission_counts[("_RARE_",tag)]=0
    hmm.emission_counts[("_Numeric_",tag)]=0
    hmm.emission_counts[("_AllCapitals_",tag)]=0
    hmm.emission_counts[("_LastCapital_",tag)]=0

for key,value in hmm.emission_counts.items():
    #print value
    if key[0] == "_RARE_":
        continue
    if value < 5:
        if key[0].isdigit():
            hmm.emission_counts[("_Numeric_",key[1])] += value
            #print "%s delete %i to Numeric %i" %(key,value,hmm.emission_counts[("_Numeric_",key[1])])
        elif key[0].isalpha() and key[0].isupper():
            hmm.emission_counts[("_AllCapitals_",key[1])] += value
            #print "%s delete %i to Captital %i" %(key,value,hmm.emission_counts[("_AllCapitals_",key[1])])
        elif key[0].isalpha() and key[0][-1].isupper():
        #elif key[0][-1].isupper():
            hmm.emission_counts[("_LastCapital_",key[1])] += value
            #print "%s delete %i to LastCaptital %i" %(key,value,hmm.emission_counts[("_LastCapital_",key[1])])
        else:
Example #16
    def __read_counts(self, count_file):
        fp = open(count_file, 'r')
        hmm = Hmm(3)
        hmm.read_counts(fp)
        fp.close()
        return hmm
Example #17
    out_lines = "\n".join(out_lines_list)
    out_lines = out_lines + "\n"

    # write to file
    with open('5_2.txt', 'w') as out_file:
        out_file.write(out_lines)


if __name__ == "__main__":
    os.system('python 4_1.py')
    os.system('python count_freqs.py ner_train_rare.dat > ner_rare.counts')

    # get frequent words
    word_count_dict = get_word_counts(file('ner_train.dat'))
    freq_words = [
        word for word in word_count_dict if word_count_dict[word] >= 5
    ]

    # get transition and emission probs
    counter = Hmm(3)
    counter.read_counts(file('ner_rare.counts'))
    transition_probs = compute_transition_probs(counter.ngram_counts[1],
                                                counter.ngram_counts[2])
    emission_probs = compute_emission_probs(counter.emission_counts,
                                            counter.ngram_counts[0])

    # store tagged data with the log probs to file
    tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words)

    os.system('python eval_ne_tagger.py ner_dev.key 5_2.txt')
Example #18
from collections import defaultdict
from count_freqs import Hmm
import math
import sys


def emission_probability(word, tag, emission_counts, ngram_counts):
    return emission_counts[(word, tag)] / ngram_counts[0][(tag, )]


if __name__ == "__main__":
    counts_file = open(sys.argv[1])
    sentences_file = open(sys.argv[2])

    hmm = Hmm()
    hmm.read_counts(counts_file)

    emission_counts = hmm.emission_counts
    ngram_counts = hmm.ngram_counts

    entity_tags = hmm.all_states
    trained_words = defaultdict(int)
    infrequent_words = defaultdict(int)

    for word, tag in emission_counts:
        trained_words[word] += hmm.emission_counts[(word, tag)]

    for word in trained_words:
        if trained_words[word] < 5:
            infrequent_words[word] = 1
Example #19
from collections import defaultdict
from count_freqs import Hmm
import math
import sys

def emission_probability(word, tag, emission_counts, ngram_counts):
  return emission_counts[(word, tag)] / ngram_counts[0][(tag,)]

if __name__ == "__main__":
  counts_file = open(sys.argv[1])
  sentences_file = open(sys.argv[2])
  
  hmm = Hmm()
  hmm.read_counts(counts_file)
  
  emission_counts = hmm.emission_counts
  ngram_counts = hmm.ngram_counts
  
  entity_tags = hmm.all_states
  trained_words = defaultdict(int)
  infrequent_words = defaultdict(int)
  
  for word, tag in emission_counts:
    trained_words[word] += hmm.emission_counts[(word, tag)]
  
  for word in trained_words:
    if trained_words[word] < 5:
      infrequent_words[word] = 1
  
  for word in infrequent_words:
Example #20
           ) != 3:  # Expect exactly two arguments: the counts file and the test data file
        usage()
        sys.exit(2)

    try:
        counts_file = file(sys.argv[1], "r")
        test_file = file(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input files %s and %s.\n" % (sys.argv[1], sys.argv[2]))
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)

    # Read in counts
    counter.read_counts(counts_file)

    # Iterate over all test sentences
    test_sent_iterator = sent_iterator(word_iterator(test_file))
    for sentence in test_sent_iterator:
        # Viterbi Algorithm
        n = len(sentence)

        pad_sent = (2) * ["*"]
        pad_sent.extend(sentence)
        pad_sent.append("STOP")

        # Initialize
        # K[0], K[-1] = "*", K[1...n] = all_states
        K = ["*"] + (n) * [counter.all_states] + ["*"]
Example #21
if __name__ == "__main__":
    if len(sys.argv) < 4:  # Expects at least 3 arguments
        usage()
        sys.exit(2)
    try:
        input = file(sys.argv[1], "r")
    except IOError:
        sys.stderr.write("ERROR: Cannot read input file %s.\n" % sys.argv[1])
        sys.exit(1)

    # Initialize a trigram counter
    counter = Hmm(3)
    if (len(sys.argv) == 4):
        #to obtain original counts
        (em_count1, ngram_count1, infreq_word1, all_tags1,
         all_words1) = counter.read_counts(sys.argv[3])
        #to process new data
        (em_count, ngram_count, infreq_word, all_tags,
         all_words) = counter.read_counts(sys.argv[1])
        #to obtain emission prob
        emission_probabilities = emission_parameters(sys.argv[2], em_count,
                                                     ngram_count[0], all_tags,
                                                     all_words1, infreq_word1)
    else:
        #to process new data
        (em_count, ngram_count, infreq_word, all_tags,
         all_words) = counter.read_counts(sys.argv[1])
        #to obtain trigram prob from samplefile
        trigram(ngram_count, sys.argv[4])