# Baseline tagger: label every token of test_file with the tag whose
# emission probability for that token is highest.
for line in test_file:
    token = line.strip()

    # An empty line is echoed as an empty line in the output.
    if not token:
        sys.stdout.write("\n")
        continue

    # Tokens missing from the training vocabulary, or counted fewer than
    # 5 times, are scored through the _RARE_ pseudo-word instead.
    is_frequent = (token in counter.all_words
                   and counter.word_counts[token] >= 5)
    lookup = token if is_frequent else "_RARE_"

    # Log emission score per tag; a zero probability becomes -inf so that
    # math.log is never called on 0.
    scores = {}
    for state in counter.all_states:
        emission = counter.calc_emissions(lookup, state)
        scores[state] = (math.log(emission) if emission != 0.0
                         else float("-inf"))

    # Argmax over the tags (the first tag in iteration order wins ties).
    best_tag = max(scores, key=scores.get)

    # Output format: original token, predicted tag, log probability
    sys.stdout.write("%s %s %s\n" % (token, best_tag, scores[best_tag]))
word = "_CF_" elif all(c in string.punctuation or c.isdigit() for c in word): word = "_NP_" else: word = "_RARE_" # Iterate over u and v for u in K[k - 1]: for v in K[k]: # Find max over w in K[k-2] w_candidates = defaultdict(float) for w in K[k - 2]: w_candidates[w] = pi[k - 1][(w, u)] * counter.calc_mle( [w, u, v]) * counter.calc_emissions(word, v) final_w = max(w_candidates.iteritems(), key=operator.itemgetter(1)) # Assign pi value pi[k][(u, v)] = final_w[1] # Get the (tag, probability) of v in max(pi[k](u,v)) final_k_idx = max(pi[k].iteritems(), key=operator.itemgetter(1)) prob = final_k_idx[1] # Log probability log_prob = math.log(prob) # Ouput format: word, tag, log probability sys.stdout.write("%s %s %s\n" %
# One Viterbi update for position k. The enclosing loop over k and the
# pad_sent / K / pi / counter objects are defined outside this fragment.
# NOTE(review): presumably pad_sent carries leading padding so that index
# k+1 is the word for position k -- confirm against the code that builds it.
word = pad_sent[k + 1]
original_word = word

# Words absent from the training set, or counted fewer than 5 times, are
# scored through the _RARE_ pseudo-word.
if word not in counter.all_words or counter.word_counts[word] < 5:
    word = "_RARE_"

# Iterate over u and v
for u in K[k - 1]:
    for v in K[k]:
        # The emission term does not depend on w, so compute it once
        # instead of once per w.
        emission = counter.calc_emissions(word, v)

        # pi[k][(u, v)] is the max over w in K[k-2] of
        #   pi[k-1][(w, u)] * calc_mle([w, u, v]) * calc_emissions(word, v)
        pi[k][(u, v)] = max(
            pi[k - 1][(w, u)] * counter.calc_mle([w, u, v]) * emission
            for w in K[k - 2])

# Get the ((u, v), probability) entry with the largest pi[k] value
final_k_idx = max(pi[k].iteritems(), key=operator.itemgetter(1))
prob = final_k_idx[1]

# Log probability. math.log(0) raises ValueError, so a zero probability is
# reported as -inf rather than aborting the whole run.
if prob == 0.0:
    log_prob = float("-inf")
else:
    log_prob = math.log(prob)

# Output format: word, tag, log probability
sys.stdout.write("%s %s %s\n" % (original_word, final_k_idx[0][1], log_prob))