def SIFRank(text, SIF, en_model, method="average", N=15, sent_emb_method="bert",
            if_DS=True, if_EA=True, ratio=0.6):
    """Extract the top-N keyphrases of *text*, de-duplicated by surface overlap.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: sentence embedding backend, e.g. 'bert', 'elmo', 'glove'
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param ratio: overlap ratio used by eliminate_duplicates()
    :return: list of (phrase, score) pairs, best first, at most N entries
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    # Cosine similarity between the whole document and each candidate phrase.
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: the caller-supplied `method` was previously ignored (hard-coded
    # 'average'); forward it so the documented parameter takes effect.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    dist_deduplicated = eliminate_duplicates(dist_sorted, ratio)
    return dist_deduplicated[0:N]
def SIFRank(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0]):
    """Extract the top-N keyphrases of *text* by SIF cosine similarity.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :return: list of (phrase, score) pairs, best first, at most N entries
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj)
    # Document-vs-candidate cosine similarities, one per candidate.
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                                 elmo_layers_weight=elmo_layers_weight)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    return dist_sorted[0:N]
def SIFRank(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True,
            kwdict=None, kw_info=None, cut_dict=False, seg_only=False,
            logger=logging.getLogger()):
    """Extract top-N keyphrases, re-weighted by in-document frequency.

    The top-N candidates by SIF score are re-ranked by a normalised frequency
    weight, but each phrase keeps its original SIF score in the result.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param kw_info: optional per-phrase info; kw_info[phrase][0] is read as an idf value
    :param cut_dict: forwarded to InputTextObj
    :param seg_only: forwarded to InputTextObj
    :param logger: logger forwarded to InputTextObj
    :return: list of (phrase, original SIF score) pairs in frequency-reweighted order
    """
    text_obj = input_representation.InputTextObj(
        en_model, text, kw_dict=kwdict, cut_dict=cut_dict,
        seg_only=seg_only, logger=logger)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                                 elmo_layers_weight=elmo_layers_weight)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    # In-document frequency = number of occurrences collected in dist_all.
    phrase_freq = {phrase: len(dists) for phrase, dists in dist_all.items()}
    max_freq = max(phrase_freq.values(), default=1)
    min_freq = min(phrase_freq.values(), default=1)
    top_dist = dist_sorted[0:N]
    # Remember each phrase's original SIF score (plus diagnostics) before reweighting.
    top_kw_map = {kw: [score] for kw, score in top_dist}
    for idx, (np, dist) in enumerate(top_dist):
        # Normalised frequency weight in (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1) / float(max_freq - min_freq + 1)
        # Bug fix: idf_weight was previously only bound inside the
        # `0.0 < idf < 4.0` branch, raising UnboundLocalError for any other
        # idf value; default it to 1.0 so the diagnostics never crash.
        idf_weight = 1.0
        if kw_info is not None and np in kw_info:
            idf = kw_info[np][0]
            if 0.0 < idf < 4.0:
                idf_weight = 0.5
            top_kw_map[np].extend([freq_weight, idf_weight])
        top_dist[idx] = (np, dist * freq_weight)
    top_sorted = sorted(top_dist, key=lambda x: x[1], reverse=True)
    # Return the original (un-reweighted) score, in reweighted order.
    return [(kw, top_kw_map[kw][0]) for kw, _score in top_sorted]
def SIFRank(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True,
            kwdict=None, kw_info=None, cut_dict=False, seg_only=False):
    """Extract the top-N keyphrases of *text* by SIF cosine similarity.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param kw_info: unused here; kept for signature compatibility with siblings
    :param cut_dict: forwarded to InputTextObj
    :param seg_only: forwarded to InputTextObj
    :return: list of (phrase, score) pairs, best first, at most N entries
    """
    text_obj = input_representation.InputTextObj(
        en_model, text, kw_dict=kwdict, cut_dict=cut_dict, seg_only=seg_only)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                                 elmo_layers_weight=elmo_layers_weight)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    # Removed: a phrase-frequency table (phrase_freq / max_freq / min_freq) was
    # computed here but never used by the return value.
    return dist_sorted[0:N]
def SIFRank_plus(
    text, SIF, ja_model, method="average", N=5, sent_emb_method="elmo",
    elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True, position_bias=3.4,
):
    """Rank keyphrase candidates by SIF similarity with a position bias.

    Candidates appearing earlier in the document get a boost via
    get_position_score(); scores are rescaled by the mean position score.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param ja_model: Japanese tokenizer/tagger for building the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: per-layer weights for ELMo embeddings
    :param if_DS: whether to apply document segmentation (DS)
    :param if_EA: whether to apply embeddings alignment (EA)
    :param position_bias: decay parameter for the positional weighting
    :return: (keywords, relevance) — two parallel lists of length <= N
    """
    doc = input_representation.InputTextObj(ja_model, text)
    doc_vec, candidate_vecs = SIF.get_tokenized_sent_embeddings(
        doc, if_DS=if_DS, if_EA=if_EA)
    position_score = get_position_score(doc.keyphrase_candidate, position_bias)
    # "+ 1" in the denominator guards against an empty candidate set
    # (the original's "Little change here").
    average_score = sum(position_score.values()) / float(len(position_score) + 1)
    # One document-vs-candidate cosine similarity per candidate.
    similarities = [
        get_dist_cosine(doc_vec, vec, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for vec in candidate_vecs
    ]
    dist_all = get_all_dist(candidate_vecs, doc, similarities)
    dist_final = get_final_dist(dist_all, method=method)
    # Rescale by position; "+ 1" again avoids division by zero.
    for phrase in dist_final:
        if phrase in position_score:
            dist_final[phrase] = (dist_final[phrase] * position_score[phrase]
                                  / (average_score + 1))
    ranked = sorted(dist_final.items(), key=lambda kv: kv[1], reverse=True)[:N]
    keywords = [phrase for phrase, _ in ranked]
    relevance = [score for _, score in ranked]
    return keywords, relevance
def SIFRank(
    text, SIF, ja_model, method="average", N=5, sent_emb_method="elmo",
    elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True,
):
    """Score keyphrase candidates of *text* by SIF cosine similarity.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param ja_model: Japanese tokenizer/tagger for building the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: per-layer weights for ELMo embeddings
    :param if_DS: whether to apply document segmentation (DS)
    :param if_EA: whether to apply embeddings alignment (EA)
    :return: (keywords, relevance) — two parallel lists of length <= N
    """
    doc = input_representation.InputTextObj(ja_model, text)
    doc_vec, candidate_vecs = SIF.get_tokenized_sent_embeddings(
        doc, if_DS=if_DS, if_EA=if_EA)
    # One document-vs-candidate cosine similarity per candidate.
    similarities = [
        get_dist_cosine(doc_vec, vec, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for vec in candidate_vecs
    ]
    dist_all = get_all_dist(candidate_vecs, doc, similarities)
    dist_final = get_final_dist(dist_all, method=method)
    ranked = sorted(dist_final.items(), key=lambda kv: kv[1], reverse=True)[:N]
    keywords = [phrase for phrase, _ in ranked]
    relevance = [score for _, score in ranked]
    return keywords, relevance
def SIFRank_plus(text, SIF, en_model, method="average", N=15, sent_emb_method="bert",
                 if_DS=True, if_EA=True, position_bias=3.4, ratio=0.6):
    """Top-N de-duplicated keyphrases with positional re-weighting.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: sentence embedding backend, e.g. 'bert', 'elmo', 'glove'
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param position_bias: decay parameter for the positional weighting
    :param ratio: overlap ratio used by eliminate_duplicates()
    :return: list of (phrase, score) pairs, best first, at most N entries
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    position_score = get_position_score(text_obj.keyphrase_candidate, position_bias)
    # Bug fix: an empty candidate set previously raised ZeroDivisionError here
    # (the sibling variant of this function already guards this case).
    if position_score:
        average_score = sum(position_score.values()) / float(len(position_score))
    else:
        average_score = 0.0  # never used as a divisor: the rescale loop is empty
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    # Rescale each phrase by its (relative) position score.
    for np, dist in dist_final.items():
        if np in position_score:
            dist_final[np] = dist * position_score[np] / average_score
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    dist_deduplicated = eliminate_duplicates(dist_sorted, ratio)
    return dist_deduplicated[0:N]
def SIFRank_plus(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
                 elmo_layers_weight=[1.0, 0.0, 0.0], if_DS=True, if_EA=True,
                 position_bias=3.4, kwdict=None, kw_info=None, cut_dict=False,
                 seg_only=False, use_pos=False, logger=logging.getLogger(), check=False):
    """Top-N keyphrases re-weighted by position, in-document frequency and idf.

    Pipeline: SIF cosine score -> positional rescale -> take top N -> multiply
    by a normalised frequency weight and an idf-derived weight -> re-sort.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: declared aggregation method (note: get_final_dist below is
        called with the hard-coded 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param position_bias: decay parameter for the positional weighting
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param kw_info: optional per-phrase info; kw_info[phrase][0] is read as idf
    :param logger: logger used for the [check_kw_score] trace line
    :param check: when True, also return embeddings and the text object
    :return: list of (phrase, (sif_score, final_score, freq_weight, idf_weight));
        with check=True, additionally sent/candidate embeddings and text_obj
    """
    text_obj = input_representation.InputTextObj(
        en_model, text, kw_dict=kwdict, cut_dict=cut_dict,
        seg_only=seg_only, use_pos=use_pos, logger=logger)
    st = time.time()
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    #logging.debug("sent_embeddings:%s\ncandidate_embeddings_list:%s" %(sent_embeddings, candidate_embeddings_list))
    ed = time.time()
    # Embedding wall-clock time in ms; only consumed by the commented log line.
    cost = int((ed - st) * 1000)
    #logging.debug("[emb_cost] %dms" %(cost))
    position_score = get_position_score(text_obj.keyphrase_candidate, position_bias)
    # Guard: empty candidate set would otherwise divide by zero below.
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / (float)(len(position_score))
    dist_list = []
    for i, emb in enumerate(candidate_embeddings_list):
        dist = get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                               elmo_layers_weight=elmo_layers_weight)
        #logging.debug("%s, cosine:%f" % (text_obj.keyphrase_candidate[i], dist))
        dist_list.append(dist)
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # In-document frequency per phrase = number of occurrences in dist_all.
    # NOTE(review): this loop rebinds `dist_list` (shadowing the list above);
    # harmless here since dist_list is not used afterwards.
    phrase_freq = dict()
    max_freq, min_freq = 1, 1
    for phrase, dist_list in dist_all.items():
        freq = len(dist_list)
        phrase_freq[phrase] = freq
        if freq < min_freq:
            min_freq = freq
        if freq > max_freq:
            max_freq = freq
    dist_final = get_final_dist(dist_all, method='average')
    # Positional rescale: boost phrases that appear early in the document.
    for np, dist in dist_final.items():
        if np in position_score:
            dist_final[np] = dist * position_score[np] / average_score
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    check_kw_score = " ".join(["%s:%s" % (k, s) for k, s in dist_sorted])
    logger.info("[check_kw_score]%s" % (check_kw_score))
    top_dist = dist_sorted[0:N]
    # Keep each phrase's pre-reweighting score (index 0); freq/idf weights are
    # appended below at indices 1 and 2.
    top_kw_map = dict()
    for kw, score in top_dist:
        top_kw_map[kw] = [score]
    for idx, (np, dist) in enumerate(top_dist):
        # Normalised frequency weight in (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1) / float(max_freq - min_freq + 1)
        #if phrase_freq[np] < 2:
        #    freq_weight = freq_weight*0.5
        idf_weight = 1.0
        if kw_info is not None:
            if np in kw_info:
                # NOTE(review): the magic numbers here (13.0 idf cap, 0.35 floor,
                # 0.01 penalty) look empirically tuned — confirm before changing.
                idf = kw_info[np][0]
                if idf >= 13.0:
                    idf = 13.0
                if idf > 0.0:
                    idf_weight = idf / 13.0
                #if idf < 6.0:
                #    idf_weight *= 0.5
                if idf_weight < 0.35:
                    idf_weight = 0.01
        top_kw_map[np].extend([freq_weight, idf_weight])
        # print("%s %f %f, freq:%d, max:%d, min:%d" % (np, dist, freq_weight, phrase_freq[np], max_freq, min_freq))
        # logging.debug("%s %f %f, freq:%d, max:%d, min:%d, idf:%f" % (np, dist, freq_weight, phrase_freq[np], max_freq, min_freq, idf_weight))
        final_score = dist * freq_weight * idf_weight
        # Drop low-confidence singletons and single-character phrases.
        if final_score < 0.075 and phrase_freq[np] == 1:
            final_score = 0.0
        if len(np) < 2:
            final_score = 0.0
        top_dist[idx] = (np, final_score)
    top_sorted = sorted(top_dist, key=lambda x: x[1], reverse=True)
    final_kw_list = []
    for kw, score in top_sorted:
        final_kw_list.append((kw, (top_kw_map[kw][0], score, top_kw_map[kw][1], top_kw_map[kw][2])))
    if check == True:
        return final_kw_list, sent_embeddings, candidate_embeddings_list, text_obj
    else:
        return final_kw_list
def SIFRank_plus(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
                 elmo_layers_weight=[1.0, 0.0, 0.0], if_DS=True, if_EA=True,
                 position_bias=3.4, kwdict=None, kw_info=None, cut_dict=False,
                 seg_only=False, use_pos=False):
    """Experimental ranking: combine min-max-normalised SIF scores with TextRank.

    Words present both in the SIF score table and in the (kwdict-filtered)
    TextRank table are scored by the product of their two normalised scores.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist
    :param N: unused here; kept for signature compatibility with siblings
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param position_bias: decay parameter for the positional weighting (the
        positional rescale itself is currently disabled)
    :param kwdict: dictionary of allowed keywords; TextRank words outside it
        are discarded
    :return: list of (word, combined, norm_sif, norm_textrank) tuples, best first
    """
    text_obj = input_representation.InputTextObj(
        en_model, text, kw_dict=kwdict, cut_dict=cut_dict,
        seg_only=seg_only, use_pos=use_pos)
    st = time.time()
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    ed = time.time()
    cost = int((ed - st) * 1000)  # embedding wall-clock time in ms (diagnostic)
    position_score = get_position_score(text_obj.keyphrase_candidate, position_bias)
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / float(len(position_score))
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                                 elmo_layers_weight=elmo_layers_weight)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    # The positional rescale is intentionally disabled in this variant:
    # for np, dist in dist_final.items():
    #     if np in position_score:
    #         dist_final[np] = dist*position_score[np]/average_score
    # NOTE(review): `logger` is not defined in this function; it presumably
    # refers to a module-level logger — confirm one exists at import time.
    logger.info("[check sif] %s" % (dist_final))
    word_textrank = textrank.get_textrank(text_obj.sentence_words, window=10)
    # Bug fix: `k not in kwdict` raised TypeError when kwdict (default None)
    # was not supplied; a missing dictionary now keeps no TextRank words.
    new_textrank = {}
    if kwdict is not None:
        for k, v in word_textrank.items():
            if k in kwdict:
                new_textrank[k] = v
    # Min-max range of the SIF scores.
    min_sr, max_sr = 1.0, 0.0
    for w, s in dist_final.items():
        min_sr = min(min_sr, s)
        max_sr = max(max_sr, s)
    # Bug fix: a zero range (all scores equal) previously caused
    # ZeroDivisionError during normalisation; fall back to 1.0.
    denominate_sr = (max_sr - min_sr) or 1.0
    # Min-max range of the TextRank scores.
    min_tr, max_tr = 1.0, 0.0
    for w, s in new_textrank.items():
        min_tr = min(min_tr, s)
        max_tr = max(max_tr, s)
    denominate_tr = (max_tr - min_tr) or 1.0
    final_items = []
    if len(word_textrank) > 0:
        for w, s in dist_final.items():
            if w not in new_textrank:
                continue
            norm_sr = (s - min_sr) / denominate_sr
            norm_tr = (new_textrank[w] - min_tr) / denominate_tr
            final_items.append((w, norm_sr * norm_tr, norm_sr, norm_tr))
    final_items = sorted(final_items, key=lambda x: x[1], reverse=True)
    logger.info("[check method]dist_final:%s\nword_textrank:%s\nfinal_items:%s"
                % (dist_final, word_textrank, final_items))
    return final_items
def SIFRank_plus(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
                 elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True,
                 position_bias=3.4, kwdict=None, kw_info=None, cut_dict=False,
                 seg_only=False, use_pos=False, user_dict=None):
    """Debug/inspection variant: rank ALL candidates and return raw artifacts.

    Unlike the other variants, this returns every scored candidate (no top-N
    cut, no positional or TextRank re-weighting — both are commented out) plus
    a `check_data` dict with the intermediate embeddings for offline analysis.

    :param text: raw input document
    :param SIF: embedding model; this variant expects a 4-tuple return from
        get_tokenized_sent_embeddings (including weight_list and elmo_embeddings)
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: declared aggregation method (get_final_dist below is called
        with the hard-coded 'average')
    :param N: unused here; only the commented-out top-N cut referenced it
    :param sent_emb_method: declared embedding backend — see NOTE in the loop
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param user_dict: optional user dictionary forwarded to InputTextObj
    :return: (all (phrase, score) pairs sorted best-first, check_data dict)
    """
    text_obj = input_representation.InputTextObj(
        en_model, text, kw_dict=kwdict, cut_dict=cut_dict,
        seg_only=seg_only, use_pos=use_pos, user_dict=user_dict)
    st = time.time()
    sent_embeddings, candidate_embeddings_list, weight_list, elmo_embeddings = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    #logging.debug("sent_embeddings:%s\ncandidate_embeddings_list:%s" %(sent_embeddings, candidate_embeddings_list))
    ed = time.time()
    # Embedding wall-clock time in ms; only consumed by the commented log line.
    cost = int((ed - st) * 1000)
    #logging.debug("[emb_cost] %dms" %(cost))
    # Position scores are computed but the rescale below is commented out.
    position_score = get_position_score(text_obj.keyphrase_candidate, position_bias)
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / (float)(len(position_score))
    dist_list = []
    for i, emb in enumerate(candidate_embeddings_list):
        # NOTE(review): the embedding method is hard-coded to "glove" here, so
        # the `sent_emb_method` parameter is ignored — confirm whether this
        # override is intentional for this debug variant.
        dist = get_dist_cosine(sent_embeddings, emb, "glove",
                               elmo_layers_weight=elmo_layers_weight)
        #logging.debug("%s, cosine:%f" % (text_obj.keyphrase_candidate[i], dist))
        dist_list.append(dist)
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Disabled TextRank experiment, kept for reference:
    #textranks = get_textrank(candidate_embeddings_list, elmo_layers_weight)
    #textrank_all = {}
    #if textranks is not None:
    #    for idx, score in textranks:
    #        textrank_all[text_obj.keyphrase_candidate[idx][0]] = score
    dist_final = get_final_dist(dist_all, method='average')
    # Disabled position/TextRank re-weighting, kept for reference:
    #for item,dist in dist_final.items():
    #    if item in position_score:
    #        dist_final[item] = dist*position_score[item]/average_score
    #    if item in textrank_all:
    #        dist_final[item] = dist_final[item] * textrank_all[item]
    #    else:
    #        print("[miss error] %s" % (item))
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    #top_dist = dist_sorted[0:N]
    # Deliberately return the full ranking (no top-N cut) in this variant.
    top_dist = dist_sorted
    # Raw intermediates for offline inspection.
    check_data = {
        "keyphrase_candidate": text_obj.keyphrase_candidate,
        "sent_embedding": sent_embeddings,
        "candidate_embeddings_list": candidate_embeddings_list,
        "weight_list": weight_list,
        "elmo_embeddings": elmo_embeddings,
        "text_obj": text_obj
    }
    return top_dist, check_data
def SIFRank(text, SIF, en_model, method="average", N=15, sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True,
            kwdict=None, cut_dict=False):
    """Top-N keyphrases, re-ranked by a normalised in-document frequency weight.

    :param text: raw input document
    :param SIF: embedding model exposing get_tokenized_sent_embeddings()
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method forwarded to get_final_dist (e.g. 'average')
    :param N: the top-N number of keyphrases to return
    :param sent_emb_method: 'elmo' or 'glove'
    :param elmo_layers_weight: the weights of the different ELMo layers
    :param if_DS: if take document segmentation (DS)
    :param if_EA: if take embeddings alignment (EA)
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param cut_dict: forwarded to InputTextObj
    :return: list of (phrase, frequency-reweighted score) pairs, best first
    """
    text_obj = input_representation.InputTextObj(en_model, text, kwdict,
                                                 cut_dict=cut_dict)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    dist_list = [get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                                 elmo_layers_weight=elmo_layers_weight)
                 for emb in candidate_embeddings_list]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied `method` instead of hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    # In-document frequency = number of occurrences collected in dist_all.
    phrase_freq = {phrase: len(dists) for phrase, dists in dist_all.items()}
    max_freq = max(phrase_freq.values(), default=1)
    min_freq = min(phrase_freq.values(), default=1)
    top_dist = dist_sorted[0:N]
    for idx, (np, dist) in enumerate(top_dist):
        # Normalised frequency weight in (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1) / float(max_freq - min_freq + 1)
        # Removed: a stray debug print of every candidate's score — library
        # code should not write diagnostics to stdout.
        top_dist[idx] = (np, dist * freq_weight)
    return sorted(top_dist, key=lambda x: x[1], reverse=True)
length += 1 tmp_dict = tmp_dict[tmp_token] if "is_leaf" in tmp_dict: is_match = True match_length = length else: break if is_match: candidate_kw = "" for k in range(0, match_length): candidate_kw += tokens_tagged[i + k][0] start_end = (i, i + match_length) keyphrase_candidate.append((candidate_kw, start_end)) i += match_length else: i += 1 else: i += 1 return keyphrase_candidate if __name__ == '__main__': #This is an example. zh_model = thulac.thulac(model_path=r'../auxiliary_data/thulac.models/',user_dict=r'../auxiliary_data/user_dict.txt') sent = "以BERT为代表的自然语言预训练模型(Pre-trained Language Model)的出现使自然语言的各个任务领域的效果都得到大幅地提升。" ito = input_representation.InputTextObj(text=sent,zh_model=zh_model) keyphrase_candidate = ito.keyphrase_candidate for kc in keyphrase_candidate: print(kc)