from os.path import exists

from count_freqs import Hmm  # trigram counter, assumed to come from the course-provided count_freqs.py


def gen_counts(input_path, output_path):
    """Train a trigram HMM on input_path and write its counts to output_path,
    skipping the work if the counts file already exists."""
    if exists(output_path):
        return
    print 'Generating counts from: "%s"' % input_path
    counter = Hmm(3)
    counter.train(open(input_path, 'r'))
    counter.write_counts(open(output_path, 'w'))
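# Minimal usage sketch; the file names are illustrative, borrowed from the
# gene-tagging data used elsewhere in this code:
if __name__ == '__main__':
    gen_counts('gene.train', 'gene.counts')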
###################################################
###################################################
print("\n2. Generate word count file.\n")

freqs_input = open('gene.replace.train', "r")
freqs_output = open('gene.counts', "w")
# Initialize a trigram counter
counter = Hmm(3)
# Collect counts
counter.train(freqs_input)
# Write the counts
counter.write_counts(freqs_output)
freqs_output.flush()

###################################################
###################################################
print("\n3. Tag dev corpus with Viterbi tagger.\n")

v_counts_file = open('gene.counts', "r")
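# For reference, gene.counts as written by Hmm.write_counts holds lines of the
# form "count WORDTAG tag word" followed by "count n-GRAM tag ..." lines; the
# specific numbers below are invented for illustration:
#
#   13 WORDTAG I-GENE Mus
#   345128 1-GRAM O
#   12 3-GRAM I-GENE I-GENE O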
import itertools
from collections import defaultdict
from math import log as ln

from count_freqs import Hmm  # trigram counter from the course-provided count_freqs.py


class Tagger(object):

    def __init__(self, infile="ner_train.dat"):
        self.counter = Hmm(3)
        with open(infile) as f:
            self.counter.train(f)
        # The key is a one-word tuple, so unwrap it.
        self.unigrams = {k[0]: v for k, v in self.counter.ngram_counts[0].iteritems()}
        self.bigrams = self.counter.ngram_counts[1]
        self.trigrams = self.counter.ngram_counts[2]
        self.words = set(x[0] for x in self.counter.emission_counts.keys())

    def compute_emission(self, word, tag):
        """Conditional probability that the word maps to tag, given the number
        of times the tag occurs."""
        em = self.counter.emission_counts
        if tag == '*':
            return 0
        if (word, tag) in em:
            return em[(word, tag)] / float(self.unigrams[tag])
        elif word in self.words:
            return 0
        else:
            return em[('_RARE_', tag)] / float(self.unigrams[tag])

    def compute_trigram(self, yi, y1, y2):
        """Returns the trigram count over the bigram count, defaulting the
        dict gets so that there aren't division-by-zero errors."""
        return self.trigrams.get((y2, y1, yi), 0) / float(self.bigrams.get((y2, y1), 1))

    def replace_rare(self, infile, threshold=5):
        """Basic file replacement: writes a copy of infile to a new file whose
        '.dat' suffix is replaced by '-rare.dat'. Words seen fewer than
        `threshold` times in total are rewritten as _RARE_."""
        wordcounts = defaultdict(int)
        for tup in self.counter.emission_counts.iteritems():
            wordcounts[tup[0][0]] += tup[1]  # aggregate counts of words, over all tags
        common_words = set(k for k, v in wordcounts.iteritems() if v >= threshold)
        replaced = 0
        f = open(infile)
        f2 = open(infile.replace('.dat', '-rare.dat'), 'w')
        for line in f:
            if len(line.split(' ')) == 2:
                # Closed set: every word is either common or rare, so one
                # membership test decides it.
                if line.split(' ')[0] not in common_words:
                    f2.write(line.replace(line.split(' ')[0], '_RARE_', 1))
                    replaced += 1
                else:
                    f2.write(line)
            else:
                f2.write(line)  # maintain sentence-boundary lines
        f.close()
        f2.close()

    def tag_probabilities(self, word):
        """Returns a dictionary of relative probabilities for emission counts."""
        counts = {tag: self.compute_emission(word, tag) for tag in self.unigrams}
        total = sum(counts.values())
        prob = lambda v: v / total if total != 0 else 0
        return {k: prob(v) for k, v in counts.iteritems()}

    def viterbi(self, infile, outfile):
        """Wrapper for the dynamic-programming tagger; writes to outfile."""

        def write_to_pred_file(f, sentence):
            # tag_sequence yields (tag, log-probability) tuples
            tag_seq = [" ".join(x) for x in self.tag_sequence(sentence)]
            for word, tag in itertools.izip(sentence, tag_seq):
                f.write('%s %s\n' % (word, tag))  # word, tag, log probability
            f.write('\n')

        with open(infile) as f, open(outfile, "w") as f2:
            sentence = []
            for line in f:
                if line == '\n':
                    write_to_pred_file(f2, sentence)
                    sentence = []
                    continue
                else:
                    sentence.append(line.strip())
            # Write the last sentence, for files without a trailing newline;
            # for an empty sentence, tag_sequence returns [] and only a blank
            # line is written.
            write_to_pred_file(f2, sentence)

    def tag_sequence(self, sentence):
        """Viterbi decoding: returns the most likely (tag, log-probability)
        sequence for the sentence."""
        if len(sentence) == 0:
            return []
        possible_tags = self.unigrams.keys()
        possible_tags.append('*')
        bp = {i: {} for i in range(len(sentence) + 1)}
        # Initialization: pi(0,'*','*') = 1, pi(0,u,v) = 0 otherwise. Only the
        # probability slot [1] of bp[0] entries is ever read, so the
        # placeholder tags don't matter.
        bp[0] = {t: ('O', 0) for t in itertools.product(possible_tags, repeat=2)}
        bp[0][('*', '*')] = (1.0, 1.0)
        # At index 1, u can only be '*'.
        for v in possible_tags:
            # A real probability (logs are only taken at the end) is never
            # negative, so the sentinel is overwritten whenever any sequence
            # has nonzero probability.
            tag_max = ('sentinel', -1)
            tags = {}
            for w in possible_tags:
                tags[w] = (bp[0][(w, '*')][1]
                           * self.compute_trigram(v, w, '*')
                           * self.compute_emission(sentence[0], v))
                if tags[w] > tag_max[1] and tags[w] != 0:
                    tag_max = (w, tags[w])
            # Default tag is no tag, i.e. 'O' -- no sequence with this (u,v)
            # has nonzero probability.
            bp[1][('*', v)] = tag_max if tag_max != ('sentinel', -1) else ('O', 0)
        for i, word in enumerate(sentence[1:], start=2):  # positions 2...n
            for v, u in itertools.product(possible_tags, repeat=2):  # same as nested: for u in K, for v in K
                tag_max = ('sentinel', -1)
                tags = {}
                for w in possible_tags:
                    if (w, u) in bp[i - 1]:
                        tags[w] = (bp[i - 1][(w, u)][1]
                                   * self.compute_trigram(v, u, w)
                                   * self.compute_emission(word, v))
                        if tags[w] > tag_max[1] and tags[w] != 0:
                            tag_max = (w, tags[w])
                bp[i][(u, v)] = tag_max if tag_max != ('sentinel', -1) else ('O', 0)
        n = len(sentence)
        # Max probability over sequences ending in STOP.
        last = {(u, v): bp[n][(u, v)][1] * self.compute_trigram('STOP', v, u)
                for u, v in bp[n].keys()}
        yn1, yn = max(last, key=last.get)
        conf = last[(yn1, yn)]
        seq = [(yn, str(ln(conf))), (yn1, str(ln(conf)))]  # built back-to-front: yn ... y1
        for i in xrange(len(sentence) - 2, 0, -1):
            u, v = tuple(x[0] for x in reversed(seq[-2:]))  # previous two are y_{i+1}, y_{i+2}
            prev = bp[i + 2][(u, v)]
            seq.append((prev[0], str(ln(prev[1]))))
        # Truncating to n entries guards one-word sentences, where seq would
        # otherwise include the leading '*' tag.
        return reversed(seq[:len(sentence)])  # reversed yn...y1 is y1...yn
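# Usage sketch -- the file names are illustrative; the retraining step matters
# because compute_emission looks up ('_RARE_', tag) counts, which only exist
# after training on the rare-replaced file:
if __name__ == '__main__':
    tagger = Tagger("ner_train.dat")
    tagger.replace_rare("ner_train.dat")    # writes ner_train-rare.dat
    tagger = Tagger("ner_train-rare.dat")   # retrain with _RARE_ tokens in place
    tagger.viterbi("ner_dev.dat", "ner_dev.pred")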
def train_ngram_and_emission_freq_from_corpus_file(self, corpus_file):
    """Delegate counting to an Hmm and cache its count tables on this object."""
    counter = Hmm(3)
    counter.train(corpus_file)
    self.emission_counts = counter.emission_counts
    self.ngram_counts = counter.ngram_counts
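# Usage sketch (the class and corpus names here are hypothetical):
#
#   tagger = GeneTagger()
#   tagger.train_ngram_and_emission_freq_from_corpus_file(open('gene.train'))
#   print tagger.ngram_counts[2]  # trigram counts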
else: return "I-GENE" def get_rare_words(d): temp_d = d.copy() O_words, GENE_words = set(), set() for key, value in d.iteritems(): if value < 5 and get_max_value(temp_d, key[0]) < 5: if get_most_tag(temp_d, key[0]) == "O": O_words.add(key[0]) else: GENE_words.add(key[0]) return (O_words, GENE_words) if __name__ == "__main__": counter = Hmm(3) counter.train(file("data/gene.train","r"), RARE=False) # print counter.emission_counts rare_words = get_rare_words(counter.emission_counts) # print len(rare_words[0]) #O_words = 19034 # print len(rare_words[1]) #GENE_words = 6231 with open("data/rare_words.pickle", "wb") as f: pickle.dump(rare_words, f)