Ejemplo n.º 1
0
def SIFRank(text,
            SIF,
            en_model,
            method="average",
            N=15,
            sent_emb_method="bert",
            if_DS=True,
            if_EA=True,
            ratio=0.6):
    """Rank keyphrases of *text* by cosine similarity between the sentence
    embedding and each candidate embedding, then deduplicate near-identical
    phrases.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: embedding backend, e.g. 'bert', 'elmo', 'glove'
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param ratio: similarity ratio used by eliminate_duplicates
    :return: list of (keyphrase, score) pairs, highest score first
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    dist_deduplicated = eliminate_duplicates(dist_sorted, ratio)
    return dist_deduplicated[0:N]
Ejemplo n.º 2
0
def SIFRank(text,
            SIF,
            en_model,
            method="average",
            N=15,
            sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0]):
    """Rank keyphrases of *text* and return the top-N (phrase, score) pairs.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :return: list of (keyphrase, score) pairs, highest score first
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj)
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    return dist_sorted[0:N]
Ejemplo n.º 3
0
def SIFRank(text, SIF, en_model, method="average", N=15,
            sent_emb_method="elmo", elmo_layers_weight=[0.0, 1.0, 0.0], if_DS=True, if_EA=True, kwdict=None, kw_info=None, cut_dict=False, seg_only=False, logger=logging.getLogger()):
    """Rank keyphrases, re-weighting the top-N by in-document frequency.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param kw_info: optional {phrase: (idf, ...)} stats used for idf weighting
    :param cut_dict: forwarded to InputTextObj
    :param seg_only: forwarded to InputTextObj
    :param logger: logger forwarded to InputTextObj
    :return: list of (keyphrase, original_score) pairs, re-ranked by the
             frequency-weighted score
    """
    text_obj = input_representation.InputTextObj(en_model, text, kw_dict=kwdict, cut_dict=cut_dict, seg_only=seg_only, logger=logger)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)

    # Frequency of each phrase = number of occurrences collected in dist_all.
    # (Loop variable renamed so it no longer shadows dist_list above.)
    phrase_freq = dict()
    max_freq, min_freq = 1, 1
    for phrase, dists in dist_all.items():
        freq = len(dists)
        phrase_freq[phrase] = freq
        if freq < min_freq:
            min_freq = freq
        if freq > max_freq:
            max_freq = freq

    top_dist = dist_sorted[0:N]
    top_kw_map = dict()
    for kw, score in top_dist:
        top_kw_map[kw] = [score]

    for idx, (np, dist) in enumerate(top_dist):
        # Min-max style normalization of the phrase frequency into (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1) / float(max_freq - min_freq + 1)
        # Bug fix: idf_weight was referenced without being initialized when
        # kw_info is None, the phrase is absent from kw_info, or the idf is
        # out of range — raising NameError. Default to the neutral weight.
        idf_weight = 1.0
        if kw_info is not None:
            if np in kw_info:
                idf = kw_info[np][0]
                if idf > 0.0 and idf < 4.0:
                    idf_weight = 0.5
        top_kw_map[np].extend([freq_weight, idf_weight])

        top_dist[idx] = (np, dist * freq_weight)
    top_sorted = sorted(top_dist, key=lambda x: x[1], reverse=True)
    # Report the original (pre-weighting) score for each re-ranked phrase.
    final_kw_list = []
    for kw, score in top_sorted:
        final_kw_list.append((kw, top_kw_map[kw][0]))
    return final_kw_list
Ejemplo n.º 4
0
def SIFRank(text,
            SIF,
            en_model,
            method="average",
            N=15,
            sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0],
            if_DS=True,
            if_EA=True,
            kwdict=None,
            kw_info=None,
            cut_dict=False,
            seg_only=False):
    """Rank keyphrases of *text* and return the top-N (phrase, score) pairs.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param kwdict: optional keyword dictionary forwarded to InputTextObj
    :param kw_info: kept for interface compatibility; not used here
    :param cut_dict: forwarded to InputTextObj
    :param seg_only: forwarded to InputTextObj
    :return: list of (keyphrase, score) pairs, highest score first
    """
    text_obj = input_representation.InputTextObj(en_model,
                                                 text,
                                                 kw_dict=kwdict,
                                                 cut_dict=cut_dict,
                                                 seg_only=seg_only)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'. Also removed a dead phrase-frequency loop whose
    # results were computed but never used.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    return dist_sorted[0:N]
Ejemplo n.º 5
0
def SIFRank_plus(
    text,
    SIF,
    ja_model,
    method="average",
    N=5,
    sent_emb_method="elmo",
    elmo_layers_weight=[0.0, 1.0, 0.0],
    if_DS=True,
    if_EA=True,
    position_bias=3.4,
):
    """Extract the top-N keyphrases from *text*, biased toward phrases that
    appear early in the document.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param ja_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param position_bias: bias constant forwarded to get_position_score
    :return: (keywords, relevance) — two parallel lists of length <= N
    """
    text_obj = input_representation.InputTextObj(ja_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    position_score = get_position_score(text_obj.keyphrase_candidate,
                                        position_bias)
    # Mean position score; the +1 in the denominator guards the empty case.
    average_score = sum(position_score.values()) / float(
        len(position_score) + 1)  # Little change here
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    dist_final = get_final_dist(dist_all, method=method)
    # Re-weight phrases that received a position score.
    for phrase in dist_final:
        if phrase in position_score:
            dist_final[phrase] = (dist_final[phrase] * position_score[phrase]
                                  / (average_score + 1))  # Little change here
    ranked = sorted(dist_final.items(), key=lambda item: item[1], reverse=True)
    keywords = [phrase for phrase, _ in ranked]
    relevance = [score for _, score in ranked]

    return keywords[:N], relevance[:N]
Ejemplo n.º 6
0
def SIFRank(
    text,
    SIF,
    ja_model,
    method="average",
    N=5,
    sent_emb_method="elmo",
    elmo_layers_weight=[0.0, 1.0, 0.0],
    if_DS=True,
    if_EA=True,
):
    """Extract the top-N keyphrases from *text*.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param ja_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :return: (keywords, relevance) — two parallel lists of length <= N
    """
    text_obj = input_representation.InputTextObj(ja_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    dist_final = get_final_dist(dist_all, method=method)
    ranked = sorted(dist_final.items(), key=lambda item: item[1], reverse=True)
    keywords = [phrase for phrase, _ in ranked]
    relevance = [score for _, score in ranked]

    return keywords[:N], relevance[:N]
Ejemplo n.º 7
0
def SIFRank_plus(text,
                 SIF,
                 en_model,
                 method="average",
                 N=15,
                 sent_emb_method="bert",
                 if_DS=True,
                 if_EA=True,
                 position_bias=3.4,
                 ratio=0.6):
    """Rank keyphrases with a positional bias, then deduplicate.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: embedding backend, e.g. 'bert', 'elmo', 'glove'
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param position_bias: bias constant forwarded to get_position_score
    :param ratio: similarity ratio used by eliminate_duplicates
    :return: list of (keyphrase, score) pairs, highest score first
    """
    text_obj = input_representation.InputTextObj(en_model, text)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    position_score = get_position_score(text_obj.keyphrase_candidate,
                                        position_bias)
    # Robustness fix: avoid ZeroDivisionError when no candidate received a
    # position score (e.g. empty text). average_score is only used inside
    # the re-weighting loop below, which never runs in that case.
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / float(
            len(position_score))
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    for np, dist in dist_final.items():
        if np in position_score:
            dist_final[np] = dist * position_score[
                np] / average_score  #Little change here
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    dist_deduplicated = eliminate_duplicates(dist_sorted, ratio)
    return dist_deduplicated[0:N]
Ejemplo n.º 8
0
def SIFRank_plus(text, SIF, en_model, method="average", N=15,
            sent_emb_method="elmo", elmo_layers_weight=[1.0, 0.0, 0.0], if_DS=True, if_EA=True, position_bias = 3.4,
            kwdict=None, kw_info=None, cut_dict=False, seg_only=False, use_pos=False, logger=logging.getLogger(), check=False):
    """Rank keyphrases with position, frequency and idf re-weighting.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: NOTE(review): currently ignored — get_final_dist is called
        with the hard-coded 'average'; confirm whether this is intentional
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param position_bias: bias constant forwarded to get_position_score
    :param kwdict: keyword dictionary forwarded to InputTextObj
    :param kw_info: optional {phrase: (idf, ...)} stats used for idf weighting
    :param cut_dict: forwarded to InputTextObj
    :param seg_only: forwarded to InputTextObj
    :param use_pos: forwarded to InputTextObj
    :param logger: logger for diagnostics; note the default is evaluated once
        at import time, so all callers share that same instance
    :param check: when True, also return embeddings and the text object
    :return: list of (kw, (orig_score, final_score, freq_weight, idf_weight))
        pairs; with check=True, additionally sent_embeddings,
        candidate_embeddings_list and text_obj
    """
    text_obj = input_representation.InputTextObj(en_model, text, kw_dict=kwdict, cut_dict=cut_dict, seg_only=seg_only, use_pos=use_pos, logger=logger)
    st = time.time()
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(text_obj,if_DS=if_DS,if_EA=if_EA)
    #logging.debug("sent_embeddings:%s\ncandidate_embeddings_list:%s" %(sent_embeddings, candidate_embeddings_list))
    ed = time.time()
    cost = int((ed - st)*1000)  # embedding time in ms (only read by the commented-out log line)
    #logging.debug("[emb_cost] %dms" %(cost))
    position_score = get_position_score(text_obj.keyphrase_candidate, position_bias)
    # Guard against an empty candidate set to avoid ZeroDivisionError.
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values())/(float)(len(position_score))

    # One cosine similarity per candidate embedding.
    dist_list = []
    for i, emb in enumerate(candidate_embeddings_list):
        dist = get_dist_cosine(sent_embeddings, emb, sent_emb_method, elmo_layers_weight=elmo_layers_weight)
        #logging.debug("%s, cosine:%f" % (text_obj.keyphrase_candidate[i], dist))
        dist_list.append(dist)
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Frequency of each phrase = number of occurrences collected in dist_all.
    # (NOTE(review): this loop reuses the name dist_list, shadowing the list
    # built above — harmless here since dist_list is not read afterwards.)
    phrase_freq = dict()
    max_freq,min_freq = 1,1
    for phrase, dist_list in dist_all.items():
        freq = len(dist_list)
        phrase_freq[phrase] = freq
        if freq < min_freq:
            min_freq = freq
        if freq > max_freq:
            max_freq = freq

    dist_final = get_final_dist(dist_all, method='average')

    # Position re-weighting: scale each scored phrase by its position score
    # relative to the average.
    for np,dist in dist_final.items():
        if np in position_score:
            dist_final[np] = dist*position_score[np]/average_score

    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    check_kw_score = " ".join(["%s:%s" % (k,s) for k,s in dist_sorted])
    logger.info("[check_kw_score]%s" %(check_kw_score))
    top_dist = dist_sorted[0:N]
    # Remember each phrase's pre-weighting score; the freq/idf weights are
    # appended per phrase in the loop below.
    top_kw_map = dict()
    for kw,score in top_dist:
        top_kw_map[kw] = [score]

    for idx, (np,dist) in enumerate(top_dist):
        # Min-max style normalization of the phrase frequency into (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1)/float(max_freq - min_freq + 1)
        #if phrase_freq[np] < 2:
        #    freq_weight = freq_weight*0.5
        idf_weight = 1.0
        if kw_info is not None:
            if np in kw_info:
                idf = kw_info[np][0]
                if idf >= 13.0:
                    idf = 13.0
                if idf > 0.0:
                    # Scale idf into (0, 1]; capped at 13.0 above.
                    idf_weight = idf / 13.0
                    #if idf < 6.0:
                    #    idf_weight *= 0.5
        # Very common words (low idf) are effectively discarded.
        if idf_weight < 0.35:
            idf_weight = 0.01
        top_kw_map[np].extend([freq_weight, idf_weight])
        # print("%s %f %f, freq:%d, max:%d, min:%d" % (np, dist, freq_weight, phrase_freq[np], max_freq, min_freq))
        # logging.debug("%s %f %f, freq:%d, max:%d, min:%d, idf:%f" % (np, dist, freq_weight, phrase_freq[np], max_freq, min_freq, idf_weight))
        final_score = dist*freq_weight*idf_weight
        # Drop weak one-off phrases and single-character phrases entirely.
        if final_score < 0.075 and phrase_freq[np] == 1:
            final_score = 0.0

        if len(np) < 2:
            final_score = 0.0
        top_dist[idx] = (np, final_score)

    top_sorted = sorted(top_dist, key=lambda x:x[1], reverse=True)
    # Report, per phrase, the original score plus the weights that were used.
    final_kw_list = []
    for kw,score in top_sorted:
        final_kw_list.append((kw, (top_kw_map[kw][0], score, top_kw_map[kw][1], top_kw_map[kw][2])))
    if check == True:
        return final_kw_list,sent_embeddings, candidate_embeddings_list, text_obj
    else:
        return final_kw_list
Ejemplo n.º 9
0
def SIFRank_plus(text,
                 SIF,
                 en_model,
                 method="average",
                 N=15,
                 sent_emb_method="elmo",
                 elmo_layers_weight=[1.0, 0.0, 0.0],
                 if_DS=True,
                 if_EA=True,
                 position_bias=3.4,
                 kwdict=None,
                 kw_info=None,
                 cut_dict=False,
                 seg_only=False,
                 use_pos=False):
    """Experimental ranking combining SIF similarity with word TextRank.

    Candidate words are scored by SIF cosine similarity, restricted to words
    present in *kwdict*, and the min-max-normalized SIF and TextRank scores
    are multiplied to give the final score.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: unused here; get_final_dist is called with 'average'
    :param N: unused here; every combined item is returned
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param kwdict: keyword dictionary; NOTE(review): the membership test
        'k not in kwdict' raises TypeError when kwdict is None (the default)
        — confirm callers always pass a dict
    :param kw_info: unused in this variant
    :return: list of (word, combined_score, norm_sif, norm_textrank) tuples,
        sorted by combined score descending
    """
    text_obj = input_representation.InputTextObj(en_model,
                                                 text,
                                                 kw_dict=kwdict,
                                                 cut_dict=cut_dict,
                                                 seg_only=seg_only,
                                                 use_pos=use_pos)
    st = time.time()
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    #logging.debug("sent_embeddings:%s\ncandidate_embeddings_list:%s" %(sent_embeddings, candidate_embeddings_list))
    ed = time.time()
    cost = int((ed - st) * 1000)  # embedding time in ms (only read by the commented-out log line)
    #logging.debug("[emb_cost] %dms" %(cost))
    position_score = get_position_score(text_obj.keyphrase_candidate,
                                        position_bias)
    # NOTE(review): average_score is computed but the position re-weighting
    # below is commented out, so it is currently unused.
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / (float)(
            len(position_score))

    # One cosine similarity per candidate embedding.
    dist_list = []
    for i, emb in enumerate(candidate_embeddings_list):
        dist = get_dist_cosine(sent_embeddings,
                               emb,
                               sent_emb_method,
                               elmo_layers_weight=elmo_layers_weight)
        #logging.debug("%s, cosine:%f" % (text_obj.keyphrase_candidate[i], dist))
        dist_list.append(dist)
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    dist_final = get_final_dist(dist_all, method='average')

    # for np,dist in dist_final.items():
    #     if np in position_score:
    #         dist_final[np] = dist*position_score[np]/average_score

    # NOTE(review): 'logger' is not a parameter of this function — it must be
    # a module-level name; confirm it is defined at file scope.
    logger.info("[check sif] %s" % (dist_final))

    # NOTE(review): min_s/max_s are never read after this loop (min_sr/max_sr
    # below recompute the same range) — dead code.
    min_s = 1.0
    max_s = 0.0
    for w, s in dist_final.items():
        if s < min_s:
            min_s = s
        if s > max_s:
            max_s = s

    # dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)
    word_textrank = textrank.get_textrank(text_obj.sentence_words, window=10)
    # Keep only TextRank words that appear in the keyword dictionary.
    new_textrank = {}
    for k, v in word_textrank.items():
        if k not in kwdict:
            continue
        new_textrank[k] = v

    # Range of the SIF scores, for min-max normalization.
    min_sr = 1.0
    max_sr = 0.0
    denominate_sr = 1.0
    for w, s in dist_final.items():
        if s < min_sr:
            min_sr = s
        if s > max_sr:
            max_sr = s
    denominate_sr = max_sr - min_sr

    # Range of the filtered TextRank scores.
    min_tr = 1.0
    max_tr = 0.0
    for w, s in new_textrank.items():
        if s < min_tr:
            min_tr = s
        if s > max_tr:
            max_tr = s
    denominate_tr = max_tr - min_tr

    # Combine normalized SIF and TextRank scores multiplicatively.
    # NOTE(review): denominate_sr/denominate_tr are zero when all scores in
    # the respective set are equal, causing ZeroDivisionError — confirm the
    # inputs make this impossible or add a guard.
    final_items = []
    if len(word_textrank) > 0:
        for w, s in dist_final.items():
            if w not in new_textrank:
                continue
            norm_sr = (s - min_sr) / denominate_sr
            norm_tr = (new_textrank[w] - min_tr) / denominate_tr
            final_items.append((w, norm_sr * norm_tr, norm_sr, norm_tr))
    final_items = sorted(final_items, key=lambda x: x[1], reverse=True)
    logger.info(
        "[check method]dist_final:%s\nword_textrank:%s\nfinal_items:%s" %
        (dist_final, word_textrank, final_items))

    return final_items
Ejemplo n.º 10
0
def SIFRank_plus(text,
                 SIF,
                 en_model,
                 method="average",
                 N=15,
                 sent_emb_method="elmo",
                 elmo_layers_weight=[0.0, 1.0, 0.0],
                 if_DS=True,
                 if_EA=True,
                 position_bias=3.4,
                 kwdict=None,
                 kw_info=None,
                 cut_dict=False,
                 seg_only=False,
                 use_pos=False,
                 user_dict=None):
    """Debug/inspection variant: rank all candidates and return raw data.

    :param text: raw input text
    :param SIF: sentence-embedding model; here it must return four values —
        (sent_embeddings, candidate_embeddings_list, weight_list,
        elmo_embeddings)
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: unused; get_final_dist is called with 'average'
    :param N: unused; the full sorted candidate list is returned
    :param sent_emb_method: NOTE(review): ignored — the cosine call below
        hard-codes "glove"; confirm this override is intentional
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param kw_info: unused in this variant
    :param user_dict: extra user dictionary forwarded to InputTextObj
    :return: (sorted (phrase, score) list, check_data dict of intermediates)
    """
    text_obj = input_representation.InputTextObj(en_model,
                                                 text,
                                                 kw_dict=kwdict,
                                                 cut_dict=cut_dict,
                                                 seg_only=seg_only,
                                                 use_pos=use_pos,
                                                 user_dict=user_dict)
    st = time.time()
    sent_embeddings, candidate_embeddings_list, weight_list, elmo_embeddings = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    #logging.debug("sent_embeddings:%s\ncandidate_embeddings_list:%s" %(sent_embeddings, candidate_embeddings_list))
    ed = time.time()
    cost = int((ed - st) * 1000)  # embedding time in ms (only read by the commented-out log line)
    #logging.debug("[emb_cost] %dms" %(cost))
    # NOTE(review): position_score/average_score are computed but the
    # position re-weighting below is commented out — currently unused.
    position_score = get_position_score(text_obj.keyphrase_candidate,
                                        position_bias)
    if len(position_score) == 0:
        average_score = 0
    else:
        average_score = sum(position_score.values()) / (float)(
            len(position_score))

    # One cosine similarity per candidate embedding.
    dist_list = []
    for i, emb in enumerate(candidate_embeddings_list):
        dist = get_dist_cosine(sent_embeddings,
                               emb,
                               "glove",
                               elmo_layers_weight=elmo_layers_weight)
        #logging.debug("%s, cosine:%f" % (text_obj.keyphrase_candidate[i], dist))
        dist_list.append(dist)
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    #textranks = get_textrank(candidate_embeddings_list, elmo_layers_weight)

    #textrank_all = {}
    #if textranks is not None:
    #    for idx, score in textranks:
    #        textrank_all[text_obj.keyphrase_candidate[idx][0]] = score

    dist_final = get_final_dist(dist_all, method='average')

    #for item,dist in dist_final.items():
    #    if item in position_score:
    #        dist_final[item] = dist*position_score[item]/average_score
    #
    #    if item in textrank_all:
    #        dist_final[item] = dist_final[item] * textrank_all[item]
    #    else:
    #        print("[miss error] %s" % (item))

    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)

    #top_dist = dist_sorted[0:N]
    top_dist = dist_sorted  # full list; N is intentionally not applied here
    # Bundle every intermediate for offline inspection by the caller.
    check_data = {
        "keyphrase_candidate": text_obj.keyphrase_candidate,
        "sent_embedding": sent_embeddings,
        "candidate_embeddings_list": candidate_embeddings_list,
        "weight_list": weight_list,
        "elmo_embeddings": elmo_embeddings,
        "text_obj": text_obj
    }
    return top_dist, check_data
Ejemplo n.º 11
0
def SIFRank(text,
            SIF,
            en_model,
            method="average",
            N=15,
            sent_emb_method="elmo",
            elmo_layers_weight=[0.0, 1.0, 0.0],
            if_DS=True,
            if_EA=True,
            kwdict=None,
            cut_dict=False):
    """Rank keyphrases and re-weight the top-N by in-document frequency.

    :param text: raw input text
    :param SIF: sentence-embedding model exposing get_tokenized_sent_embeddings
    :param en_model: tokenizer/tagger used to build the InputTextObj
    :param method: aggregation method passed to get_final_dist
    :param N: the top-N number of keyphrases
    :param sent_emb_method: 'elmo', 'glove'
    :param elmo_layers_weight: the weights of different layers of ELMo
    :param if_DS: if take document segmentation(DS)
    :param if_EA: if take embeddings alignment(EA)
    :param kwdict: keyword dictionary forwarded to InputTextObj
    :param cut_dict: forwarded to InputTextObj
    :return: list of (keyphrase, freq_weighted_score), highest score first
    """
    text_obj = input_representation.InputTextObj(en_model,
                                                 text,
                                                 kwdict,
                                                 cut_dict=cut_dict)
    sent_embeddings, candidate_embeddings_list = SIF.get_tokenized_sent_embeddings(
        text_obj, if_DS=if_DS, if_EA=if_EA)
    # One cosine similarity per candidate embedding.
    dist_list = [
        get_dist_cosine(sent_embeddings, emb, sent_emb_method,
                        elmo_layers_weight=elmo_layers_weight)
        for emb in candidate_embeddings_list
    ]
    dist_all = get_all_dist(candidate_embeddings_list, text_obj, dist_list)
    # Bug fix: honor the caller-supplied aggregation method instead of
    # hard-coding 'average'.
    dist_final = get_final_dist(dist_all, method=method)
    dist_sorted = sorted(dist_final.items(), key=lambda x: x[1], reverse=True)

    # Frequency of each phrase = number of occurrences collected in dist_all.
    phrase_freq = dict()
    max_freq, min_freq = 1, 1
    for phrase, dists in dist_all.items():
        freq = len(dists)
        phrase_freq[phrase] = freq
        if freq < min_freq:
            min_freq = freq
        if freq > max_freq:
            max_freq = freq

    top_dist = dist_sorted[0:N]
    for idx, (np, dist) in enumerate(top_dist):
        # Min-max style normalization of the phrase frequency into (0, 1].
        freq_weight = (phrase_freq[np] - min_freq + 1) / float(max_freq -
                                                               min_freq + 1)
        print("%s %f %f, freq:%d, max:%d, min:%d" %
              (np, dist, freq_weight, phrase_freq[np], max_freq, min_freq))
        top_dist[idx] = (np, dist * freq_weight)
    top_sorted = sorted(top_dist, key=lambda x: x[1], reverse=True)
    return top_sorted
Ejemplo n.º 12
0
                    length += 1        
                    tmp_dict = tmp_dict[tmp_token]
                    if "is_leaf" in tmp_dict:
                        is_match = True
                        match_length = length
                else:
                    break
            
            if is_match:
                candidate_kw = ""
                for k in range(0, match_length):
                    candidate_kw += tokens_tagged[i + k][0]
                start_end = (i, i + match_length)
                keyphrase_candidate.append((candidate_kw, start_end))
                i += match_length
            else:
                i += 1
        else:
            i += 1
    return keyphrase_candidate


if __name__ == '__main__':
    # Demo: build an InputTextObj for one Chinese sentence and print the
    # keyphrase candidates it extracts.
    zh_model = thulac.thulac(
        model_path=r'../auxiliary_data/thulac.models/',
        user_dict=r'../auxiliary_data/user_dict.txt')
    sent = "以BERT为代表的自然语言预训练模型(Pre-trained Language Model)的出现使自然语言的各个任务领域的效果都得到大幅地提升。"
    text_obj = input_representation.InputTextObj(text=sent, zh_model=zh_model)
    for candidate in text_obj.keyphrase_candidate:
        print(candidate)