def run(self):
    """Worker main loop: pull documents from ``recv_queue`` and push keywords.

    Builds the ELMo word embedder, the SIF sentence embedder and the thulac
    segmenter on this worker, then repeatedly:

    1. waits up to one second for a ``[docid, text, ...]`` item,
    2. truncates the text to at most 4000 characters,
    3. runs ``extract_keyword`` on it,
    4. puts ``[docid, keywords]`` on ``push_queue``.

    The loop ends once ``self.stop_sign.value`` is no longer ``0``.
    """
    # Local import: only the ``Empty`` exception type is needed here.
    import queue

    max_text_chars = 4000

    self.ELMO = word_emb_elmo.WordEmbeddings(self.model_file, cuda_device=self.gpu_id)
    self.SIF = sent_emb_sif.SentEmbeddings(self.ELMO, lamda=1.0)

    segmenter_options = {
        "model_path": r'./auxiliary_data/thulac.models/',
        "seg_only": self.seg_only,
    }
    # The user dictionary is handed to thulac only when ``cut_dict`` is not
    # set; the comparison is kept as strict as the original ``== True`` test.
    if self.cut_dict != True:  # noqa: E712
        segmenter_options["user_dict"] = self.user_dict_file
    self.zh_model = thulac.thulac(**segmenter_options)

    while self.stop_sign.value == 0:
        # Block with a timeout instead of polling ``empty()``: polling spins
        # the CPU while the queue is idle, and ``empty()`` followed by
        # ``get()`` is racy. The timeout keeps the stop flag checked at
        # least once per second.
        try:
            data = self.recv_queue.get(True, 1)
        except queue.Empty:
            continue
        except Exception:
            # Keep the worker alive as before, but leave a trace of the
            # failure instead of swallowing it silently.
            self.logger.exception(
                "worker_fail[%d] could not read from recv_queue" % (self.worker_id))
            continue

        docid = data[0]
        text = data[1][:max_text_chars]

        self.logger.info("worker_process[%d] %s, len:%d" % (self.worker_id, docid, len(text)))
        keywords = extract_keyword(
            text,
            self.SIF,
            self.zh_model,
            self.elmo_layers_weight,
            plus=self.plus,
            topk=20,
            kwdict=self.user_dict,
            kw_info=self.kw_info,
            cut_dict=self.cut_dict,
            seg_only=self.seg_only,
        )
        self.logger.info("worker_succ[%d] %s" % (self.worker_id, docid))
        self.logger.info("worker_succ[%d] %s %s" % (self.worker_id, docid, keywords))
        self.push_queue.put([docid, keywords])

    self.logger.info("stop worker[%d]" % (self.worker_id))
def __init__(self):
    """Create the segmenter, word embedder and sentence embedder.

    Sets three attributes:

    * ``vncorenlp`` - a ``VnCoreNLP`` instance using the ``wseg, pos``
      annotators and a ``-Xmx500m`` heap setting.
    * ``phoBERT`` - word embeddings from ``word_emb_phoBert``.
    * ``SIF`` - sentence embeddings built on top of ``phoBERT``.
    """
    # Jar location is relative to the current working directory.
    jar_path = "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar"
    self.vncorenlp = VnCoreNLP(
        jar_path,
        annotators="wseg, pos",
        max_heap_size='-Xmx500m',
    )

    self.phoBERT = word_emb_phoBert.WordEmbeddings()
    self.SIF = sent_emb_sif.SentEmbeddings(
        self.phoBERT,
        lamda=1.0,
        embeddings_type='bert',
    )
def extract_article_kw(article_file, output_file):
    """Extract keywords for every article and write them to ``output_file``.

    Articles are loaded with ``load_articles``; keywords are ranked with a
    word2vec-backed SIF sentence embedder. For each article one line is
    written in the form ``docid<TAB>kw:score kw:score ...``. When
    ``extract_keyword`` returns a tuple, its second element is collected per
    docid and the collection is pickled to the check file once all articles
    have been processed.

    Args:
        article_file: Path passed to ``load_articles``.
        output_file: Path of the UTF-8 text file to write results to.
    """
    docids, texts = load_articles(article_file)

    user_dict_file = r'./auxiliary_data/keyword_vocab_final'
    kw_info_file = r'/search/odin/liruihong/keyword-project/config_data/ret_item_info'
    check_file = "/search/odin/liruihong/keyword-project/SIFRank_zh/check_w2vsif_model.bin"

    kwdict = load_user_dict(user_dict_file)
    kw_info = load_kw_info(kw_info_file, encoding="gbk")
    user_dict = load_user_dict(
        "/search/odin/liruihong/keyword-project/keywords_embrank/config_data/articles_7d_vocab.txt"
    )
    seg_only = True

    w2v = word2vec_emb.Word2VecEmbeddings(
        "/search/odin/liruihong/keyword-project/keywords_embrank/config_data/articles_7d_word2vec.txt"
    )
    SIF = sent_emb_sif.SentEmbeddings(w2v, lamda=1.0, embeddings_type="glove")
    elmo_layers_weight = [1.0, 0.0, 0.0]
    plus = True

    check_data = {}
    # ``with`` guarantees the output file is closed even if extraction fails
    # part-way through the article list.
    with open(output_file, "w", encoding="utf-8") as wfp:
        for docid, text in zip(docids, texts):
            logging.info("%s", docid)
            started = time.perf_counter()
            # NOTE(review): ``zh_model`` is not assigned inside this function
            # (its local thulac construction was commented out). It has to
            # exist at module level, otherwise this call raises NameError -
            # confirm.
            keywords = extract_keyword(
                text,
                SIF,
                zh_model,
                elmo_layers_weight,
                plus=plus,
                topk=20,
                kwdict=kwdict,
                kw_info=kw_info,
                cut_dict=False,
                seg_only=seg_only,
                user_dict=user_dict,
            )
            # A tuple result carries (keywords, check_item).
            if isinstance(keywords, tuple):
                check_data[docid] = keywords[1]
                keywords = keywords[0]
            cost = int((time.perf_counter() - started) * 1000)
            print("%s time_cost:%d ms" % (docid, cost))

            writer_keywords = " ".join("%s:%f" % (kw, score) for kw, score in keywords)
            wfp.write("%s\t%s\n" % (docid, writer_keywords))

    with open(check_file, "wb") as checkfp:
        pickle.dump(check_data, checkfp)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Sponge_sy"
# Date: 2020/2/21
"""Demo: run SIFRank and SIFRank+ on one Chinese paragraph and print the results."""

from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank, SIFRank_plus
import thulac
import jieba.analyse

# ELMo model: download from https://github.com/HIT-SCIR/ELMoForManyLangs
model_file = r'../auxiliary_data/zhs.model/'
ELMO = word_emb_elmo.WordEmbeddings(model_file)
SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)

# thulac segmenter models: download from http://thulac.thunlp.org/
zh_model = thulac.thulac(
    model_path=r'../auxiliary_data/thulac.models/',
    user_dict=r'../auxiliary_data/user_dict.txt',
)

elmo_layers_weight = [0.0, 1.0, 0.0]

# Sample input paragraph (split over two literals only for line length).
text = (
    "计算机科学与技术(Computer Science and Technology)是国家一级学科,下设信息安全、软件工程、计算机软件与理论、计算机系统结构、计算机应用技术、计算机技术等专业。"
    " [1]主修大数据技术导论、数据采集与处理实践(Python)、Web前/后端开发、统计与数据分析、机器学习、高级数据库系统、数据可视化、云计算技术、人工智能、自然语言处理、媒体大数据案例分析、网络空间安全、计算机网络、数据结构、软件工程、操作系统等课程,以及大数据方向系列实验,并完成程序设计、数据分析、机器学习、数据可视化、大数据综合应用实践、专业实训和毕业设计等多种实践环节。"
)

# Both rankers are called with the same settings.
ranking_options = {"N": 15, "elmo_layers_weight": elmo_layers_weight}
keyphrases = SIFRank(text, SIF, zh_model, **ranking_options)
keyphrases_ = SIFRank_plus(text, SIF, zh_model, **ranking_options)

separator = "------------------------------------------"

print(separator)
print("原文:" + text)
print(separator)
print("SIFRank_zh结果:")
print(keyphrases)
print("SIFRank+_zh结果:")
print(keyphrases_)
print(separator)
print("jieba分词TFIDF算法结果:")
elif (database == "Duc2001"): data, labels = fileIO.get_duc2001_data() lamda = 1.0 elmo_layers_weight = [1.0, 0.0, 0.0] else: data, labels = fileIO.get_semeval2017_data() lamda = 0.6 elmo_layers_weight = [1.0, 0.0, 0.0] #download from https://allennlp.org/elmo options_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_options.json" weight_file = "../auxiliary_data/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" porter = nltk.PorterStemmer() #please download nltk ELMO = word_emb_elmo.WordEmbeddings(options_file, weight_file, cuda_device=0) SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=lamda, database=database) en_model = StanfordCoreNLP( r'E:\Python_Files\stanford-corenlp-full-2018-02-27', quiet=True) #download from https://stanfordnlp.github.io/CoreNLP/ try: for key, data in data.items(): lables = labels[key] lables_stemed = [] for lable in lables: tokens = lable.split() lables_stemed.append(' '.join(porter.stem(t) for t in tokens)) print(key)
import sys
from flask import Flask, jsonify, request, render_template
from werkzeug.utils import secure_filename
import json
import os
from vncorenlp import VnCoreNLP
from embeddings import sent_emb_sif, word_emb_phoBert
from model.method import SIFRank, SIFRank_plus

app = Flask(__name__, static_url_path="", static_folder='./static/')

# NLP resources are created once, when this module is imported.
vncorenlp = VnCoreNLP(
    "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
    annotators="wseg, pos",
    max_heap_size='-Xmx500m',
)
phoBERT = word_emb_phoBert.WordEmbeddings()
SIF = sent_emb_sif.SentEmbeddings(phoBERT, lamda=1.0, embeddings_type='bert')


# Routing for the queries sent from the client.
@app.route('/extract', methods=['GET'])
def extract():
    """Serve the ``extract.html`` page."""
    return render_template('extract.html')


@app.route('/result', methods=['POST'])
def result():
    """Run keyphrase extraction on the submitted ``query`` form field.

    The field value is passed to ``extract_keyphrase`` and the two values it
    returns are rendered into ``result.html``.
    """
    query_text = request.form.get('query')
    highlighted, phrases = extract_keyphrase(query_text)
    return render_template(
        'result.html',
        highlighted=highlighted,
        phrases=phrases,
    )
data = request.json query = data.get("text", "") top_n = int(data.get("n", 15)) keywords = SIFRank_plus(query, SIF, en_model, N=top_n, elmo_layers_weight=elmo_layers_weight) return jsonify(keywords) if __name__ == '__main__': data_dir = "data/" options_file = f"{data_dir}elmo_2x4096_512_2048cnn_2xhighway_options.json" weight_file = f"{data_dir}elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" cuda_device = 0 porter = nltk.PorterStemmer() ELMO = word_emb_elmo.WordEmbeddings(options_file=options_file, weight_file=weight_file, cuda_device=cuda_device) SIF = sent_emb_sif.SentEmbeddings(word_embeddor=ELMO, data_dir=data_dir, lamda=1.0) en_model = stanza.Pipeline(lang='en', processors={}, use_gpu=True) elmo_layers_weight = [0.0, 1.0, 0.0] serve(app, host="0.0.0.0", port=5000)