def interpolate_ngram_collection(ngram_collection, corpus, start_token, end_token, inf_constant):
    """Score every sentence in ``corpus`` against a collection of n-gram tables.

    NOTE(review): this module contains a second definition with the same
    name further down; at import time the later one replaces this one.
    Consider removing one of the two.

    Args:
        ngram_collection: Indexable sequence of n-gram lookup tables. Its
            length is used as the n-gram order ``n``; table ``i`` is indexed
            with ``powerset[i]``. The stored values are exponentiated with
            base 2, i.e. they are treated as base-2 log probabilities.
        corpus: Iterable of sentences, each passed to ``__build_word_list``.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.
        inf_constant: Value (converted with ``float``) that becomes the whole
            sentence score as soon as ``is_unkown_ngram`` reports the first
            powerset element as unknown.

    Returns:
        A list with one score per sentence, in corpus order.
    """
    # The n-gram order is the number of tables supplied.
    n = len(ngram_collection)
    lambds = lambdas_from_ngrams(ngram_collection)

    results = []
    for sentence in corpus:
        # Create n-tuples from the sentence.
        word_list = __build_word_list(sentence, start_token, end_token, n)
        n_tuple = gramer.ngram_from_word_list(word_list, n)

        total = 0
        for sentence_tuple in n_tuple:
            powerset = powerset_from_collection(sentence_tuple)
            try:
                # Account for foreign words: if the first powerset element is
                # unknown the whole sentence gets the sentinel score.
                if is_unkown_ngram(powerset[0], ngram_collection[0]):
                    total = float(inf_constant)
                    break

                # Log probabilities for this tuple across all tables.
                log_probs = [
                    item[powerset[i]] for i, item in enumerate(ngram_collection)
                ]
                probs_sum = [2 ** log_prob for log_prob in log_probs]

                # NOTE(review): only lambds[0] is applied, to the unweighted
                # sum of the per-table probabilities. That matches linear
                # interpolation only when all weights are equal - confirm
                # against lambdas_from_ngrams.
                trigram_interpolated_log_prob = (
                    math.log(lambds[0], 2) + math.log(np.sum(probs_sum), 2)
                )
            except (KeyError, IndexError, ValueError, OverflowError):
                # Best-effort: skip tuples whose entries are missing from a
                # table or whose probability cannot be logged (e.g. log of 0).
                # Anything else (NameError, TypeError, KeyboardInterrupt, ...)
                # is a real problem and is allowed to propagate.
                continue

            # Add to the overall sentence score.
            total += trigram_interpolated_log_prob

        results.append(total)
    return results
def score(ngram_p, n, corpus, start_token, end_token):
    """Return one accumulated n-gram score per sentence of ``corpus``.

    Each sentence is turned into a word list by ``__build_word_list`` and then
    into n-grams of order ``n`` by ``gramer.ngram_from_word_list``. The score
    of a sentence is the running total of ``ngram_p[gram]`` over its n-grams,
    starting from 0.

    Args:
        ngram_p: Mapping indexed by n-gram; a missing n-gram propagates
            whatever lookup error the mapping raises (``KeyError`` for a
            plain dict).
        n: N-gram order passed to both helpers.
        corpus: Iterable of sentences.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.

    Returns:
        A list of totals, in corpus order.
    """

    def sentence_score(sentence):
        # Build the word list, then walk its n-grams accumulating the score.
        tokens = __build_word_list(sentence, start_token, end_token, n)
        running = 0
        for gram in gramer.ngram_from_word_list(tokens, n):
            running += ngram_p[gram]
        return running

    return [sentence_score(sentence) for sentence in corpus]
def interpolate_ngram_collection(ngram_collection, corpus, start_token, end_token, inf_constant):
    """Score every sentence in ``corpus`` against a collection of n-gram tables.

    NOTE(review): an earlier definition with the same name exists in this
    module; this later one is the one that is bound at import time. Consider
    removing the duplicate.

    Args:
        ngram_collection: Indexable sequence of n-gram lookup tables. Its
            length is used as the n-gram order ``n``; table ``i`` is indexed
            with ``powerset[i]``. The stored values are exponentiated with
            base 2, i.e. they are treated as base-2 log probabilities.
        corpus: Iterable of sentences, each passed to ``__build_word_list``.
        start_token: Forwarded to ``__build_word_list``.
        end_token: Forwarded to ``__build_word_list``.
        inf_constant: Value (converted with ``float``) that becomes the whole
            sentence score as soon as ``is_unkown_ngram`` reports the first
            powerset element as unknown.

    Returns:
        A list with one score per sentence, in corpus order.
    """
    # The n-gram order is the number of tables supplied.
    n = len(ngram_collection)
    lambds = lambdas_from_ngrams(ngram_collection)

    results = []
    for sentence in corpus:
        # Create n-tuples from the sentence.
        word_list = __build_word_list(sentence, start_token, end_token, n)
        n_tuple = gramer.ngram_from_word_list(word_list, n)

        total = 0
        for sentence_tuple in n_tuple:
            powerset = powerset_from_collection(sentence_tuple)
            try:
                # Account for foreign words: if the first powerset element is
                # unknown the whole sentence gets the sentinel score.
                if is_unkown_ngram(powerset[0], ngram_collection[0]):
                    total = float(inf_constant)
                    break

                # Log probabilities for this tuple across all tables.
                log_probs = [
                    item[powerset[i]] for i, item in enumerate(ngram_collection)
                ]
                probs_sum = [2 ** log_prob for log_prob in log_probs]

                # NOTE(review): only lambds[0] is applied, to the unweighted
                # sum of the per-table probabilities. That matches linear
                # interpolation only when all weights are equal - confirm
                # against lambdas_from_ngrams.
                trigram_interpolated_log_prob = (
                    math.log(lambds[0], 2) + math.log(np.sum(probs_sum), 2)
                )
            except (KeyError, IndexError, ValueError, OverflowError):
                # Best-effort: skip tuples whose entries are missing from a
                # table or whose probability cannot be logged (e.g. log of 0).
                # Anything else (NameError, TypeError, KeyboardInterrupt, ...)
                # is a real problem and is allowed to propagate.
                continue

            # Add to the overall sentence score.
            total += trigram_interpolated_log_prob

        results.append(total)
    return results