def calculate_total_similarity(lsi, word2id=None, context=None, blanks=None):
    """Average cosine similarity between each blank candidate and the rest.

    For every candidate word in every blank, compare its LSI vector against
    (a) the candidates of every *other* blank and (b) the context words,
    and return the mean similarity over all comparable pairs.

    Args:
        lsi: trained LSI model; word vectors are rows of ``lsi.projection.u``.
        word2id: mapping from word to its row index in ``lsi.projection.u``.
        context: iterable of context words.
        blanks: list of blanks, each a list of candidate words.

    Returns:
        Mean cosine similarity over all scored pairs, or 0.0 when no pair
        has both words in ``word2id``.
    """
    # Use None sentinels: mutable default arguments are shared across calls.
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    score = 0.0
    cnt = 0
    for i, blank in enumerate(blanks):
        for em in blank:
            # `in` replaces the Python-2-only dict.has_key().
            if em not in word2id:
                continue
            em_vec = lsi.projection.u[int(word2id[em])]
            # Compare against candidates of every other blank.
            for j, other in enumerate(blanks):
                if j == i:
                    continue
                for em2 in other:
                    if em2 in word2id:
                        cnt += 1
                        word_vec = lsi.projection.u[int(word2id[em2])]
                        score += mathutils.cossim(em_vec, word_vec)
            # Compare against the surrounding context words.
            for word in context:
                if word in word2id:
                    cnt += 1
                    word_vec = lsi.projection.u[int(word2id[word])]
                    score += mathutils.cossim(em_vec, word_vec)
    return score / cnt if cnt else score
def calculate_total_similarity_with_rake(lsi, text, word2id=None, blanks=None):
    """Average weighted similarity between blank candidates and RAKE keywords.

    Runs the module-level ``rake_obj`` extractor over *text*, expands each
    keyword phrase into its tokens (each token inheriting the phrase's RAKE
    relevance), then averages ``cossim(candidate, keyword) * relevance``
    over every scorable (candidate, keyword) pair.

    Args:
        lsi: trained LSI model; word vectors are rows of ``lsi.projection.u``.
        text: raw text handed to the RAKE keyword extractor.
        word2id: mapping from word to its row index in ``lsi.projection.u``.
        blanks: list of blanks, each a list of candidate words.

    Returns:
        Mean weighted cosine similarity, or 0.0 when no pair is scorable.
    """
    # Use None sentinels: mutable default arguments are shared across calls.
    word2id = {} if word2id is None else word2id
    blanks = [] if blanks is None else blanks

    # rake_obj.run yields (phrase, relevance) pairs; tokenize each phrase so
    # every token maps to its phrase's relevance score.
    keyword_dict = {}
    for phrase, relevance in rake_obj.run(text):
        for token in nltk.word_tokenize(phrase):
            keyword_dict[token] = relevance

    score = 0.0
    cnt = 0
    for blank in blanks:
        for em in blank:
            # `in` replaces the Python-2-only dict.has_key().
            if em not in word2id:
                continue
            em_vec = lsi.projection.u[int(word2id[em])]
            for key, relevance in keyword_dict.items():
                if key in word2id:
                    cnt += 1
                    key_vec = lsi.projection.u[int(word2id[key])]
                    score += mathutils.cossim(em_vec, key_vec) * relevance
    return score / cnt if cnt else score
def calculate_total_similarity_by_k_max(lsi, word2id=None, context=None, blanks=None, k=2):
    """Average of the k largest pairwise similarities for the blanks.

    Collects the cosine similarity of every blank candidate against the
    candidates of the other blanks and the context words, then averages
    only the ``k`` largest similarities (fewer if fewer pairs exist).
    Words missing from ``word2id`` are recorded in the module-level
    ``unseen_word`` set.

    Args:
        lsi: trained LSI model; word vectors are rows of ``lsi.projection.u``.
        word2id: mapping from word to its row index in ``lsi.projection.u``.
        context: iterable of context words.
        blanks: list of blanks, each a list of candidate words.
        k: number of top similarities to average.

    Returns:
        Mean of the top-k similarities, or 0.0 when none are scorable.
    """
    # Use None sentinels: mutable default arguments are shared across calls.
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    # Plain list is enough: heapq.nlargest accepts any iterable, so there is
    # no need to heappush each score.
    sims = []
    for i, blank in enumerate(blanks):
        for em in blank:
            # `in` replaces the Python-2-only dict.has_key().
            if em not in word2id:
                unseen_word.add(em)
                continue
            em_vec = lsi.projection.u[int(word2id[em])]
            # Candidates of every other blank.
            for j, other in enumerate(blanks):
                if j == i:
                    continue
                for em2 in other:
                    if em2 in word2id:
                        word_vec = lsi.projection.u[int(word2id[em2])]
                        sims.append(mathutils.cossim(em_vec, word_vec))
                    else:
                        unseen_word.add(em2)
            # Context words.
            for word in context:
                if word in word2id:
                    word_vec = lsi.projection.u[int(word2id[word])]
                    sims.append(mathutils.cossim(em_vec, word_vec))
                else:
                    unseen_word.add(word)

    # nlargest returns at most k items, so the original truncation loop and
    # the len(kmax) > k branch were unreachable; averaging is sufficient.
    kmax = heapq.nlargest(k, sims)
    return sum(kmax) / len(kmax) if kmax else 0.0
def calculate_total_similarity_with_combination(lsi, word2id=None, context=None, blanks=None, alpha=0.5):
    """Score blanks by combining each blank's candidate vectors first.

    Each blank's candidate word vectors are merged into one normalized
    vector (via ``mathutils.combine_and_normalize``), that vector's mean
    similarity to the context words is computed per blank, and when exactly
    two blanks score, the results are blended as
    ``alpha * first + (1 - alpha) * second``.

    NOTE(review): unlike the sibling scorers, this function looks words up
    in the module-level ``my_dic``/``model`` (word-embedding style) and
    never uses its ``lsi``/``word2id`` parameters — they are kept only for
    interface compatibility; confirm against callers before removing.

    Args:
        lsi: unused (see note above).
        word2id: unused (see note above).
        context: iterable of context words.
        blanks: list of blanks, each a list of candidate words.
        alpha: blend weight for the first blank when two blanks score.

    Returns:
        Blended (two blanks), single (one blank), or 0.0 score.
    """
    # Use None sentinels: mutable default arguments are shared across calls.
    word2id = {} if word2id is None else word2id
    context = [] if context is None else context
    blanks = [] if blanks is None else blanks

    # Build one combined, normalized vector per blank; record unknown words.
    blank_vecs = []
    for blank in blanks:
        vectors = []
        for em in blank:
            # `in` replaces the Python-2-only dict.has_key().
            if em.lower() in my_dic:
                vectors.append(model[em.lower()])
            else:
                unseen_word.add(em)
        if vectors:
            blank_vecs.append(mathutils.combine_and_normalize(vectors, len(vectors[0])))
    if not blank_vecs:
        return 0.0

    # Mean similarity of each combined blank vector to the context words.
    score_per_blank = []
    for vec in blank_vecs:
        cnt = 0
        subtotal = 0.0
        for word in context:
            if word.lower() in my_dic:
                cnt += 1
                subtotal += mathutils.cossim(vec, model[word.lower()])
            else:
                unseen_word.add(word)
        if cnt:
            score_per_blank.append(subtotal / cnt)

    if len(score_per_blank) == 2:
        return alpha * score_per_blank[0] + (1 - alpha) * score_per_blank[1]
    if len(score_per_blank) == 1:
        return score_per_blank[0]
    return 0.0