def run(self):
        self.ELMO = word_emb_elmo.WordEmbeddings(self.model_file, cuda_device=self.gpu_id)
        self.SIF = sent_emb_sif.SentEmbeddings(self.ELMO, lamda=1.0)
        # Load the THULAC segmenter; attach the user dictionary only when cut_dict is off.
        if self.cut_dict:
            self.zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', seg_only=self.seg_only)
        else:
            self.zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', user_dict=self.user_dict_file, seg_only=self.seg_only)
        while self.stop_sign.value == 0:
            if not self.recv_queue.empty():
                try:
                    data = self.recv_queue.get(True, 1)
                except Exception:
                    continue
                docid, text = data
                if len(text) > 4000:
                    text = text[:4000]  # truncate overly long documents
                # [title, content] = text.split('\t')
                self.logger.info("worker_process[%d] %s, len:%d" %(self.worker_id, docid, len(text)))
                keywords = extract_keyword(text, self.SIF, self.zh_model, self.elmo_layers_weight, plus=self.plus,
                                            topk=20, kwdict=self.user_dict, kw_info=self.kw_info, cut_dict=self.cut_dict, seg_only=self.seg_only)

                self.logger.info("worker_succ[%d] %s" %(self.worker_id, docid))
                self.logger.info("worker_succ[%d] %s %s" %(self.worker_id, docid, keywords))
                #self.push_queue.put([docid, title_kw, content_kw])
                self.push_queue.put([docid, keywords])

        self.logger.info("stop worker[%d]" %(self.worker_id))
Example #2
    def __init__(self):
        # path = os.path.dirname(os.path.realpath('__file__'))
        self.vncorenlp = VnCoreNLP(
            "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
            annotators="wseg, pos",
            max_heap_size='-Xmx500m')
        self.phoBERT = word_emb_phoBert.WordEmbeddings()
        self.SIF = sent_emb_sif.SentEmbeddings(self.phoBERT,
                                               lamda=1.0,
                                               embeddings_type='bert')
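A hedged sketch of an extraction method such a class might expose (the method name, the positional use of self.vncorenlp, and N are assumptions based on the other examples):

    def extract(self, text, n=15):
        # Run the Vietnamese SIFRank+ pipeline; VnCoreNLP handles word segmentation and POS tagging.
        return SIFRank_plus(text, self.SIF, self.vncorenlp, N=n)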
def extract_article_kw(article_file, output_file):
    docids, texts = load_articles(article_file)
    #docids, texts = load_tencent_articles(article_file)
    wfp = open(output_file, "w", encoding="utf-8")
    user_dict_file = r'./auxiliary_data/keyword_vocab_final'
    kw_info_file = r'/search/odin/liruihong/keyword-project/config_data/ret_item_info'
    model_file = r'./auxiliary_data/zhs.model/'
    gpu_id = 0

    kwdict = load_user_dict(user_dict_file)
    kw_info = load_kw_info(kw_info_file, encoding="gbk")

    user_dict = load_user_dict(
        "/search/odin/liruihong/keyword-project/keywords_embrank/config_data/articles_7d_vocab.txt"
    )

    seg_only = True
    # ELMO = word_emb_elmo.WordEmbeddings(model_file, cuda_device=gpu_id)
    # SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)
    w2v = word2vec_emb.Word2VecEmbeddings(
        "/search/odin/liruihong/keyword-project/keywords_embrank/config_data/articles_7d_word2vec.txt"
    )
    SIF = sent_emb_sif.SentEmbeddings(w2v, lamda=1.0, embeddings_type="glove")
    # extract_keyword below needs zh_model, so the segmenter must be loaded here
    zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/', seg_only=seg_only)
    elmo_layers_weight = [1.0, 0.0, 0.0]
    plus = True

    checkfp = open(
        "/search/odin/liruihong/keyword-project/SIFRank_zh/check_w2vsif_model.bin",
        "wb")
    check_data = {}
    for idx, text in enumerate(texts):
        #if idx >= 200:
        #    break
        docid = docids[idx]
        logging.info("%s" % (docid))
        st = time.time()
        keywords = extract_keyword(text,
                                   SIF,
                                   zh_model,
                                   elmo_layers_weight,
                                   plus=plus,
                                   topk=20,
                                   kwdict=kwdict,
                                   kw_info=kw_info,
                                   cut_dict=False,
                                   seg_only=seg_only,
                                   user_dict=user_dict)

        if isinstance(keywords, tuple):
            check_item = keywords[1]
            check_data[docid] = check_item
            keywords = keywords[0]
        ed = time.time()
        cost = int((ed - st) * 1000)
        print("%s time_cost:%d ms" % (docid, cost))
        #keywords = [(kw,score) for kw,score in keywords if kw in user_dict]
        writer_keywords = " ".join(
            ["%s:%f" % (kw, score) for kw, score in keywords])
        wfp.write("%s\t%s\n" % (docid, writer_keywords))

    pickle.dump(check_data, checkfp)
    checkfp.close()
    wfp.close()
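A hypothetical invocation (file names assumed); the output file gets one "docid<TAB>kw:score kw:score ..." line per article:

extract_article_kw("articles.txt", "article_keywords.txt")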
Example #4
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Sponge_sy"
# Date: 2020/2/21

from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank, SIFRank_plus
import thulac
import jieba.analyse

#download from https://github.com/HIT-SCIR/ELMoForManyLangs
model_file = r'../auxiliary_data/zhs.model/'

ELMO = word_emb_elmo.WordEmbeddings(model_file)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)
#download from http://thulac.thunlp.org/
zh_model = thulac.thulac(model_path=r'../auxiliary_data/thulac.models/', user_dict=r'../auxiliary_data/user_dict.txt')
elmo_layers_weight = [0.0, 1.0, 0.0]

text = "计算机科学与技术(Computer Science and Technology)是国家一级学科,下设信息安全、软件工程、计算机软件与理论、计算机系统结构、计算机应用技术、计算机技术等专业。 [1]主修大数据技术导论、数据采集与处理实践(Python)、Web前/后端开发、统计与数据分析、机器学习、高级数据库系统、数据可视化、云计算技术、人工智能、自然语言处理、媒体大数据案例分析、网络空间安全、计算机网络、数据结构、软件工程、操作系统等课程,以及大数据方向系列实验,并完成程序设计、数据分析、机器学习、数据可视化、大数据综合应用实践、专业实训和毕业设计等多种实践环节。"
keyphrases = SIFRank(text, SIF, zh_model, N=15,elmo_layers_weight=elmo_layers_weight)
keyphrases_ = SIFRank_plus(text, SIF, zh_model, N=15, elmo_layers_weight=elmo_layers_weight)
print("------------------------------------------")
print("原文:"+text)
print("------------------------------------------")
print("SIFRank_zh结果:")
print(keyphrases)
print("SIFRank+_zh结果:")
print(keyphrases_)
print("------------------------------------------")
print("jieba分词TFIDF算法结果:")
Example #5
elif (database == "Duc2001"):
    data, labels = fileIO.get_duc2001_data()
    lamda = 1.0
    elmo_layers_weight = [1.0, 0.0, 0.0]
else:
    data, labels = fileIO.get_semeval2017_data()
    lamda = 0.6
    elmo_layers_weight = [1.0, 0.0, 0.0]

#download from https://allennlp.org/elmo
options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

porter = nltk.PorterStemmer()  # requires nltk (pip install nltk)
ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database)
en_model = StanfordCoreNLP(
    r'E:\Python_Files\stanford-corenlp-full-2018-02-27',
    quiet=True)  #download from https://stanfordnlp.github.io/CoreNLP/

try:
    for key, doc in data.items():

        gold_labels = labels[key]
        labels_stemmed = []

        for label in gold_labels:
            tokens = label.split()
            labels_stemmed.append(' '.join(porter.stem(t) for t in tokens))

        print(key)
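A hedged sketch of how such an evaluation loop typically continues (the SIFRank call mirrors the other examples; the stemmed-overlap scoring is an assumption, not the source):

        keyphrases = SIFRank(doc, SIF, en_model, N=15, elmo_layers_weight=elmo_layers_weight)
        # tolerate either plain phrases or (phrase, score) tuples
        phrases = [kp[0] if isinstance(kp, tuple) else kp for kp in keyphrases]
        predicted_stemmed = [' '.join(porter.stem(t) for t in p.split()) for p in phrases]
        # count a prediction as correct when its stemmed form matches a stemmed gold label
        hits = sum(1 for p in predicted_stemmed if p in labels_stemmed)
        print("%s hits@15: %d / %d" % (key, hits, len(labels_stemmed)))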
Example #6
import sys
from flask import Flask, jsonify, request, render_template
from werkzeug.utils import secure_filename
import json
import os

from vncorenlp import VnCoreNLP
from embeddings import sent_emb_sif, word_emb_phoBert
from model.method import SIFRank, SIFRank_plus

app = Flask(__name__, static_url_path="", static_folder='./static/')
vncorenlp = VnCoreNLP("auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
                      annotators="wseg, pos",
                      max_heap_size='-Xmx500m')
phoBERT = word_emb_phoBert.WordEmbeddings()
SIF = sent_emb_sif.SentEmbeddings(phoBERT, lamda=1.0, embeddings_type='bert')


# Handle the query sent from the client
@app.route('/extract', methods=['GET'])
def extract():
    return render_template('extract.html')


@app.route('/result', methods=['POST'])
def result():
    text = request.form.get('query')
    highlighted, phrases = extract_keyphrase(text)
    return render_template('result.html',
                           highlighted=highlighted,
                           phrases=phrases)
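extract_keyphrase is not defined in this fragment; a hedged sketch of what it might do given the VnCoreNLP + phoBERT setup above (the highlighting logic and N are assumptions):

def extract_keyphrase(text, n=15):
    # run the Vietnamese SIFRank+ pipeline over the raw query text
    phrases = SIFRank_plus(text, SIF, vncorenlp, N=n)
    # naive highlighting: wrap each extracted phrase where it occurs verbatim
    highlighted = text
    for phrase in phrases:
        p = phrase[0] if isinstance(phrase, tuple) else phrase  # tolerate (phrase, score) tuples
        highlighted = highlighted.replace(p, "<mark>%s</mark>" % p)
    return highlighted, phrases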
Example #7
# Assumed scaffolding: this fragment begins inside a request handler, so the
# imports, Flask app, and route below are reconstructions, not the source.
import nltk
import stanza
from flask import Flask, jsonify, request
from waitress import serve
from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank_plus

app = Flask(__name__)


@app.route('/keywords', methods=['POST'])  # route name is an assumption
def keywords():
    data = request.json
    query = data.get("text", "")
    top_n = int(data.get("n", 15))

    keywords = SIFRank_plus(query,
                            SIF,
                            en_model,
                            N=top_n,
                            elmo_layers_weight=elmo_layers_weight)

    return jsonify(keywords)


if __name__ == '__main__':
    data_dir = "data/"
    options_file = f"{data_dir}elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = f"{data_dir}elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
    cuda_device = 0

    porter = nltk.PorterStemmer()
    ELMO = word_emb_elmo.WordEmbeddings(options_file=options_file,
                                        weight_file=weight_file,
                                        cuda_device=cuda_device)
    SIF = sent_emb_sif.SentEmbeddings(word_embeddor=ELMO,
                                      data_dir=data_dir,
                                      lamda=1.0)
    en_model = stanza.Pipeline(lang='en', processors={}, use_gpu=True)
    elmo_layers_weight = [0.0, 1.0, 0.0]

    serve(app, host="0.0.0.0", port=5000)