def genfast(rows, outfile):
    """Write rows out as fastText training lines: __label__<catalog> <processed text>."""
    fou = open(outfile, 'w', encoding='UTF-8')
    # for file in f:
    #     print(file)
    #     domboj = xmldom.parse(file)
    #     rows = domboj.getElementsByTagName("row")
    TextProcess()
    for row in rows:
        title = row[3]
        content = row[4]
        catalog = row[9]
        if catalog is None:
            catalog = 'other'
        elif catalog.find('扶贫') > -1:    # poverty-alleviation news
            catalog = 'fupin'
        elif catalog.find('环保') > -1:    # environmental-protection news
            catalog = 'huanbao'
        # catalog = 'other' if row[9] is None else 'huanbao'
        # print(title, catalog)
        line = title.strip() + " " + content.strip()
        outline = TextProcess.doAll(line)
        outline = "\t__label__" + catalog + " " + outline + "\t\n"
        fou.write(outline)
    fou.close()
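# A minimal usage sketch, assuming getTrainData (the query helper used elsewhere in this
# module) returns rows with the column layout genfast expects (3=title, 4=content, 9=catalog).
# Illustrative only, not a fixed entry point of the project.
def demo_genfast():
    rows = getTrainData("SELECT *" + " FROM news_" + " WHERE detail like '%s'", '%习近平%')
    genfast(rows, './resources/news_fasttext_all.txt')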
def NB():
    """Compare MultinomialNB and an RBF-kernel SVM on bag-of-words features.
    Rows mentioning 习近平 are used for training, all remaining rows for testing."""
    data = getTrainData("SELECT *" + " FROM news_" + " WHERE detail like '%s'", '%习近平%')
    TextProcess()
    X_train = [TextProcess.doAll(r[3] + r[4]) for r in data]
    y_train = [1 if r[9] == '环保' else 0 for r in data]
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    # A TfidfVectorizer variant of this pipeline is sketched in demo_nb_tfidf below.
    X_count_train = vectorizer.fit_transform(X_train)
    mnb_count = MultinomialNB()
    svm_ = SVC(kernel='rbf')
    mnb_count.fit(X_count_train, y_train)
    svm_.fit(X_count_train, y_train)
    data1 = getTrainData("SELECT *" + " FROM news_" + " WHERE detail not like '%s'", '%习近平%')
    X_test = [TextProcess.doAll(r[3] + r[4]) for r in data1]
    X_count_test = vectorizer.transform(X_test)
    y_test = [1 if r[9] == '环保' else 0 for r in data1]
    y_predict = mnb_count.predict(X_count_test)
    y_predict1 = svm_.predict(X_count_test)
    print(classification_report(y_test, y_predict))
    print(classification_report(y_test, y_predict1))
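# The original code hinted at a TfidfVectorizer alternative; a minimal sketch of that swap
# (assumption: same tokenisation pattern, everything else in NB() unchanged). Returns the
# fitted vectorizer and classifier so the caller can transform and score the test rows.
def demo_nb_tfidf(X_train, y_train):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    X_tfidf_train = vectorizer.fit_transform(X_train)
    mnb = MultinomialNB()
    mnb.fit(X_tfidf_train, y_train)
    return vectorizer, mnb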
def ft():
    saveDataFile = r'./resources/news_fasttext_all.txt'
    testFile = r'./resources/news_fasttext_环保扶贫1.txt'
    classifier = fasttext.supervised(saveDataFile, output='./mod/xjpnews_classifier_model3',
                                     dim=200, min_count=1, ws=10, epoch=150, neg=5,
                                     word_ngrams=2, bucket=1)
    # classifier = fasttext.load_model("./mod/xjpnews_classifier_model3.bin",
    #                                  encoding='utf-8', label_prefix='__lable__')
    result = classifier.test(testFile)
    print("P@1:", result.precision)                  # precision
    print("R@1:", result.recall)                     # recall
    print("Number of examples:", result.nexamples)   # number of test examples
    texts='不谋全局者,不足谋一域。”2016年7月至今,以绿色税收等措施为发力点,政策力度持续增强。十九大报告亦指出,“必须树立和践行绿水青山就是金山银山的理念”“实行最严格的生态制度”。从经济学视角看环境治理,我们认为,“严监管”不仅有利于生态文明,更在三个层面牵动改革全局,促进中国经济转型升级。'
    texts1='水是指经济环境、制度环境;鱼是企业。他问如果“水”不好、中国的经济很差、中国不适合办企业,那么115家世界500强怎么来的?如果说“水”很好,那么为什么那么多“鱼”非正常死掉?今天很多的企业家在改革开放近40年里在这个国家赚了很多的钱,但他们移民了。2016年,美国的投资移民签了800个人,很多是咱们中国人。他们为什么要移民?这个焦虑是从何而来?这个问题在很多人的心目中仍是一个问号。'
    TextProcess()
    texts = [TextProcess.doAll(texts)]
    texts1 = [TextProcess.doAll(texts1)]
    lables = classifier.predict_proba(texts, k=2)
    print(lables, texts)
    lables1 = classifier.predict_proba(texts1, k=2)
    print(lables1, texts1)
    # Earlier batch-prediction paths, kept commented out:
    # import xml.dom.minidom as xmldom
    # f = glob.glob('./resources/*.xml')
    # fou = open('./resources/xijinping_fasttext_predict.txt', 'w', encoding='UTF-8')
    # for file in f:
    #     print(file)
    #     domboj = xmldom.parse(file)
    #     rows = domboj.getElementsByTagName("row")
    #     for row in rows:
    #         title = row.getElementsByTagName("IR_URLTITLE")[0].firstChild.data
    #         content = row.getElementsByTagName("IR_CONTENT")[0].firstChild.data
    #         # catalog = row.getElementsByTagName("IR_CATALOG")[0].firstChild.data  # title
    #         line = title.strip() + " " + content.strip()
    #         newsline = TextProcess.doAll(line)
    #         lables = classifier.predict([newsline])
    #         outline = "\t" + lables[0][0] + " " + newsline + "\t\n"
    #         fou.write(outline)
    # fou.close()
    #
    # fou = open('./resources/news_fasttext_predict2.txt', 'w', encoding='UTF-8')
    # rows = query_all('%生态%')
    # for row in rows:
    #     title = row[3]
    #     content = row[4]
    #     line = title.strip() + " " + content.strip()
    #     newsline = TextProcess.doAll(line)
    #     lables = classifier.predict_proba([newsline], k=2)
    #     outline = "\t__label__"
    #     fou.write(outline)
    #     if len(lables) == 1:
    #         fou.write(lables[0][0][0] + str(lables[0][0][1]))
    #     if len(lables[0]) == 2:
    #         fou.write(lables[0][1][0] + str(lables[0][1][1]))
    #     fou.write(' ' + line + "\t\n")
    # fou.close()
    return
def gensim():
    data = query_country_name('环保')
    # Word-frequency statistics.
    TextProcess()
    # from collections import defaultdict
    # frequency = defaultdict(int)
    # for r in data:
    #     text = (TextProcess.doAll(r[3] + r[4])).split()
    #     for token in text:
    #         frequency[token] += 1
    # print(frequency)
    corpus = [(TextProcess.doAll(r[3] + r[4])).split() for r in data]
    dictionary = corpora.Dictionary(corpus)
    print(dictionary.dfs)
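# A follow-on sketch, assuming the next step is a bag-of-words corpus for gensim models
# (e.g. TF-IDF or LDA); filter_extremes / doc2bow / TfidfModel are standard gensim calls.
# The corpus and dictionary arguments are the locals built inside gensim() above.
def demo_bow_corpus(corpus, dictionary):
    dictionary.filter_extremes(no_below=5, no_above=0.5)   # drop very rare / very common tokens
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus]
    tfidf_model = models.TfidfModel(bow_corpus)            # gensim TF-IDF over the BoW corpus
    return [tfidf_model[bow] for bow in bow_corpus]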
def prepare(self):
    f = glob.glob('./resources/book/*.txt')
    sents = []
    sents_ = []
    for file in f:
        with open(file, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()
        bookname = lines[0].strip()
        for row in lines:
            row = TextProcess.remove_noisy(row.strip())
            if row is None or row == '' or len(row) < 10 or row[-1] not in zhon.hanzi.punctuation:
                continue
            rowsents = self.splitter.split(row)
            for s in rowsents:
                sents.append([s, bookname])
                sents_.append(s)
        logging.info(bookname + str(len(sents)) + ' sentences')
    fou = open('./resources/book.bin', 'wb')
    pickle.dump(sents, fou)
    fou.close()
    annoyIndex = AnnoyIndex(768)
    encodes = self.bc.encode(sents_)
    for i, sent in enumerate(sents):
        encode = encodes[i]  # self.bc.encode([sent[1]])[0]
        annoyIndex.add_item(i, encode)
    annoyIndex.build(10)
    annoyIndex.save('./mod/book.mod')
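# A loading sketch for the artefacts written by prepare(), assuming it lives on the same
# class and that find_golden_org() below reads them from self.sents / self.annoyIndex
# (dimension 768 matches the BERT sentence encoder used above). The method name is hypothetical.
def load_book_index(self):
    with open('./resources/book.bin', 'rb') as fin:
        self.sents = pickle.load(fin)
    self.annoyIndex = AnnoyIndex(768)
    self.annoyIndex.load('./mod/book.mod')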
def getSim(doc):
    model = models.doc2vec.Doc2Vec.load('./mod/docvec1.model')
    infer = model.infer_vector(TextProcess.doAll(doc).split(' '))
    sims = model.docvecs.most_similar([infer], topn=5)
    for id, sim in sims:
        print(id, sim)
        # Look up which article this id belongs to.
        row = query_(id)
        print(row)
def find_golden_org(self, content, threadhold=0.94):
    sents = list(self.splitter.split(content))
    sents = [
        TextProcess.delNum(TextProcess.remove_noisy(sent)).strip()
        for sent in sents
    ]
    sents_encode = self.bc.encode(sents)
    result = []
    for i, sencode in enumerate(sents_encode):
        sentindex, dis = self.annoyIndex.get_nns_by_vector(
            sencode, 1, include_distances=True)
        print(sents[i] + str(np.cos(dis[0])))
        if np.cos(dis[0]) > threadhold:
            result.append({
                'org': self.sents[sentindex[0]][1].strip(),
                'subcontent': sents[i],
                'score': np.cos(dis[0]),
                'video': self.sents[sentindex[0]] if self.sents[sentindex[0]] else {}
            })
    return result
def tfidf():
    data = query_country_name('环保')
    TextProcess()
    # step 1: bag-of-words counts, then TF-IDF weights
    vectorizer = CountVectorizer(min_df=1, max_df=1.0, token_pattern='\\b\\w+\\b')
    corpus = [TextProcess.doAll(r[3] + r[4]) for r in data]
    transformer = TfidfTransformer()
    corpus_train = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(corpus_train)
    words = vectorizer.get_feature_names()
    words = np.array(words)
    weight = tfidf.toarray()
    word_index = np.argsort(-weight)   # per-document term indices, highest weight first
    words_ = words[word_index]
    print(words_[:3, :3])              # top-3 terms of the first 3 documents
    for word in words:
        print(word)
def similarity(text1, text2):
    List_Text = TextProcess(text1, text2)
    if List_Text == 0:
        print("Error")
    load_mod = Doc2Vec.load(r"/home/prouse/test.model")
    load_mod.random.seed(0)
    a_vec = load_mod.infer_vector(List_Text[0], alpha=0.001, epochs=50)
    load_mod.random.seed(0)
    b_vec = load_mod.infer_vector(List_Text[1], alpha=0.001, epochs=50)
    print(similar(a_vec, b_vec))
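# `similar` is called above but not defined in this snippet; a minimal sketch of what it is
# assumed to compute (cosine similarity between the two inferred vectors). The name
# cosine_similarity_sketch is hypothetical.
def cosine_similarity_sketch(a_vec, b_vec):
    import numpy as np
    a = np.asarray(a_vec, dtype=float)
    b = np.asarray(b_vec, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))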
def genTxt():
    rows = query('%习近平%')
    textProcess = TextProcess()
    allcent = list()
    for row in rows:
        title = row[3]
        content = row[4]
        # Split into sentences on punctuation here, dropping the punctuation itself.
        temp = textProcess.cut_sent(title, ',') + textProcess.cut_sent(content, ',')
        print(temp)
        allcent.extend([item for item in temp if len(item) > 9])
        allcent.extend(['\n'])
    fou = open('./resources/bert_pretrain.txt', 'w', encoding='UTF-8')
    for row in allcent:
        words = textProcess.cut(row)
        if row != '\n':
            fou.write(' '.join(list(words)) + '\n')
        else:
            fou.write('\n')
    fou.close()
    return
def train(ids, docs):
    x_train = []
    for i, doc in enumerate(docs):
        x_train.append(
            models.doc2vec.TaggedDocument(TextProcess.doAll(doc).split(' '), tags=[ids[i]]))
    model = models.doc2vec.Doc2Vec(vector_size=200, window=10, min_count=5, workers=4, negative=4)
    model.build_vocab(x_train)
    # Empirically this is a matter of volume: epochs needs to be roughly on the order of
    # vector_size before the results become visible; the instability reported online
    # appears to come from training with too few epochs.
    model.train(x_train, total_examples=model.corpus_count, epochs=200)
    model.save('./mod/docvec1.model')
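# A usage sketch tying train() and getSim() together, assuming the news_ column layout used
# elsewhere in this module (3=title, 4=content), that column 0 is the primary key, and that
# query() is the same helper genTxt() uses. Illustrative only.
def demo_doc2vec():
    rows = query('%习近平%')
    ids = [r[0] for r in rows]
    docs = [r[3] + r[4] for r in rows]
    train(ids, docs)
    getSim(docs[0])   # print the five stored documents most similar to the first one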
def create_temp(cat_keyword):
    fou = open('./resources/news_fasttext_all.txt', 'w', encoding='UTF-8')
    con = connect_wxremit_db()
    # cur = con.cursor()
    # cur.execute('TRUNCATE table news_temp')
    sql_str = "SELECT * FROM news_ WHERE detail like '%s' " % '%习近平%'
    cur = con.cursor()
    cur.execute(sql_str)
    rows = cur.fetchall()
    for row in rows:
        flag = False
        for topic in cat_keyword:
            catalog = topic
            title = row[3]
            content = row[4]
            line = title.strip() + " " + content.strip()
            outline = TextProcess.doAll(line)
            for keyword in cat_keyword[topic]:
                if outline.find(keyword) > -1:
                    flag = True
                    break
            if flag:
                outline = "\t__label__" + catalog + " " + outline + "\t\n"
                fou.write(outline)
                break
        if not flag:
            catalog = '其它'
            outline = "\t__label__" + catalog + " " + outline + "\t\n"
            fou.write(outline)
    cur.close()
    fou.close()
    con.close()
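# A usage sketch, assuming cat_keyword maps fastText label names to keyword lists; the
# labels and keywords below are illustrative placeholders only (they mirror the 环保/扶贫
# categories used elsewhere in this module). create_temp() writes the training file that
# ft() then reads.
def demo_create_and_train():
    cat_keyword = {'环保': ['环保', '生态', '污染'], '扶贫': ['扶贫', '脱贫']}
    create_temp(cat_keyword)
    ft()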
class PersonDisambiguation():
    def __init__(self):
        self.tp = TextProcess()
        # Data manager: loads the political-figure data from the database.
        self.dataManager = DataManager()
        # self.political_person_dict = list()
        # Stored as an Aho-Corasick automaton instead, which makes multi-pattern matching easy.
        self.aho_policical_person = ahocorasick.Automaton()
        try:
            # load_file = open('./mod/political_person_dict.bin', 'rb')
            # self.political_person_dict = pickle.load(load_file)
            # logging.info('political_person_dict count %d' % (len(self.political_person_dict)))
            file = open('./mod/aho_policical_person.aho', 'rb')
            self.aho_policical_person = pickle.load(file)
            logging.info('aho_policical_person count %d' % (len(self.aho_policical_person)))
        except:
            pass
        self.detector = MultiSenDetect()
        # Load the place-name index, used to decide whether a token tagged 'ns' is really a place.
        load_file = open('./mod/place_dict.bin', 'rb')
        self.place_dict = pickle.load(load_file)
        logging.info('place_dict count %d' % (len(self.place_dict)))
        return

    '''
    Identify political figures, save their basic data, and build the dictionary mapping each
    political figure to the corresponding Baidu Baike entry.
    '''
    def checkPersonBaike(self):
        rows = self.dataManager.query_sql("select * from psm_cityfather")
        persons = []
        for row in rows:
            person = dict()
            person['id'] = row[0]
            person['nationlity'] = row[1]
            person['region'] = row[2]
            person['cname'] = row[3]
            person['duty'] = row[7]
            persons.append(person)
        logging.info('persons count: %d' % len(persons))
        # Use the word-sense disambiguation tool.
        detector = MultiSenDetect()
        count = 0
        persons_temp = self.political_person_dict
        bar = tqdm(persons)
        for person in bar:
            bar.set_description_str(person['cname'])
            # self.political_person_dict = list()
            for p in self.political_person_dict:
                if p['cname'] == person['cname'] and p['duty'] == person['duty']:
                    person['baikename'] = p['baikename']
                    person['baikeurl'] = p['baikeurl']
                    person['baikeconcept'] = p['baikeconcept']
                    person.update()
                    break
            if person.get('baikeconcept'):
                count = count + 1
                persons_temp.append(person)
                continue
            else:
                sent_embedding_res, wds_embedding_res = detector.detect_main(person['duty'], person['cname'])
                # print(sent_embedding_res)
                # print(wds_embedding_res)
                person['baikename'] = wds_embedding_res[0][0]
                person['baikeurl'] = detector.getConcept(person['baikename'])['link']
                person['baikeconcept'] = detector.getConcept(person['baikename'])
                person.update()
                # pprint.pprint(person)
                count = count + 1
                persons_temp.append(person)
                if count % 5 == 0:
                    fou = open('./mod/political_person_dict.bin', 'wb')
                    pickle.dump(persons_temp, fou)
                    fou.close()
                    detector.save_cache()
        detector.save_cache()
        fou = open('./mod/political_person_dict.bin', 'wb')
        pickle.dump(persons, fou)
        fou.close()

    # Server version: completes the object of a VOB relation; a postposed object in the
    # following clause still cannot be recognised.
    def complete_VOB_server(self, arcs, word_index):
        word = arcs[word_index][1]
        prefix = ''
        postfix = ''
        for arc in arcs:
            if arc[5] == word_index and arc[2] < word_index:
                prefix += self.complete_VOB_server(arcs, arc[2])
            if arc[5] == word_index and arc[2] > word_index:
                postfix += self.complete_VOB_server(arcs, arc[2])
        return prefix + word + postfix

    def findPerson(self, content):
        # 1. Split into sentences first.
        sents = self.tp.cut_sentences(content)
        nrs = dict()
        geos = set()
        for sent in sents:
            # nr = set(self.tp.posseg(sent, POS=['nr']))
            # nrs = nrs.union(nr)
            # return nrs
            arcs = self.parseContent(sent)
            for arc in arcs:
                # Probably a person name.
                if arc[3] == 'nh':
                    # Collect the attributive (ATT) keywords that modify the name.
                    # nrs.add(arc[1])
                    prefix = ''
                    for arc_ in arcs:
                        if arc_[5] == arc[2] and arc_[2] < arc[2]:
                            prefix += self.complete_VOB_server(arcs, arc_[2])
                    # if prefix == '':
                    #     nrs[arc[1]] = [prefix]
                    #     continue
                    pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
                    prefix_list = re.split(pattern, prefix)
                    for prefix_ in prefix_list:
                        if nrs.get(arc[1]):
                            if prefix_ not in nrs.get(arc[1]) and prefix_ != '':
                                nrs[arc[1]].append(prefix_)
                        else:
                            nrs[arc[1]] = [prefix_]
                if arc[3] == 'ns':
                    if (self.place_dict.get(arc[1])):
                        geos.add(arc[1])
        return nrs, geos

    '''Build arcs and child_dict_list via the LTP server.'''
    '''This part can be replaced by other LTP tooling.'''
    def parser_main_ltpserver(self, sentence):
        url = 'http://192.168.1.101:8020/ltp'
        wb_data = requests.post(url, data={'s': sentence, 't': 'dp'}, json=True, allow_redirects=True)
        wb_data.encoding = 'utf-8'
        arcs_list = []
        child_dict_list = []
        try:
            content = wb_data.json()
            for c in content[0][0]:
                p = c.get('parent')
                pc = content[0][0][p]
                pname = pc.get('cont')
                ppos = pc.get('pos')
                arcs_list.append(
                    [c.get('relate'), c.get('cont'), c.get('id'), c.get('pos'), pname, c.get('parent'), ppos])
            for index in range(len(content[0][0])):
                child_dict = dict()
                for arc_index in range(len(arcs_list)):
                    # if arcs[arc_index].relation == 'HED':
                    #     print('hed')
                    if arcs_list[arc_index][5] == index:
                        # arc indices start at 1 ----> HED has been dropped
                        if arcs_list[arc_index][0] in child_dict:
                            child_dict[arcs_list[arc_index][0]].append(arc_index)
                        else:
                            child_dict[arcs_list[arc_index][0]] = []
                            child_dict[arcs_list[arc_index][0]].append(arc_index)
                child_dict_list.append(child_dict)
        except:
            pass
        return arcs_list, child_dict_list

    def parseContent(self, sent):
        arcs, child_dict_list = self.parser_main_ltpserver(sent)
        return arcs

    def test1(self):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        political_person_dict = pickle.load(load_file)
        # pprint.pprint(political_person_dict)
        for i, person in enumerate(political_person_dict):
            if person['cname'] == '哈勒特马·巴特图勒嘎':
                pprint.pprint(person)
                pprint.pprint(i)
                break

    '''
    Update individual entries in political_person_dict instead of regenerating everything.
    '''
    def update_political_person_dict(self, cname, duty):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        political_person_dict = pickle.load(load_file)
        for i, person in enumerate(political_person_dict):
            if person['cname'] == cname and person['duty'] == duty:
                sent_embedding_res, wds_embedding_res = self.detector.detect_main(
                    person['duty'], person['cname'], [person['duty']])
                # print(sent_embedding_res)
                # print(wds_embedding_res)
                person['baikename'] = wds_embedding_res[0][0]
                person['baikeurl'] = self.detector.getConcept(person['baikename'])['link']
                person['baikeconcept'] = self.detector.getConcept(person['baikename'])
                person.update()
                pprint.pprint(person)
        fou = open('./mod/political_person_dict.bin', 'wb')
        pickle.dump(political_person_dict, fou)
        fou.close()

    '''
    Query the 百分点 service for synonyms, used for name alignment.
    '''
    def get_sim(self, something):
        url = 'http://10.122.141.12:9006/similar'
        r = requests.post(url, json={"ck": "synonym", "synonym_word": something,
                                     "synonym_selectedMode": "auto",
                                     "homoionym_word": "", "homoionym_selectedMode": "auto",
                                     "homoionym_num": ""})
        json = r.json()
        result = json['detail']['res']['synonym']
        return result

    '''
    Build the pattern-matching index (a plain dict would also work).
    Person names are not actually extracted by pattern matching here; LTP POS tagging is
    used instead, which gives better accuracy.
    '''
    def genAhocorasick(self):
        load_file = open('./mod/political_person_dict.bin', 'rb')
        self.political_person_dict = pickle.load(load_file)
        self.aho_policical_person = ahocorasick.Automaton()
        for i, person in enumerate(self.political_person_dict):
            word = person.get('cname')
            # Foreign names need alias alignment: 唐纳德·特朗普 ===> 特朗普 / 川普, 习近平 ---> 习主席;
            # most Chinese names do not need alignment.
            aliasPerson = self.get_sim(word)
            baidualias = person.get('baikeconcept').get('别名')
            if word.find('·') > -1:
                aliasPerson.append(word[word.index('·') + 1:])
                aliasPerson.append(word[word.rindex('·') + 1:])
                # Drop the middle name.
                aliasPerson.append(word[word.index('·') + 1:] + word[word.rindex('·'):])
            baidualias_list = []
            if baidualias:
                pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
                baidualias_list = re.split(pattern, baidualias)
            person_all = set([word]).union(set(aliasPerson)).union(set(baidualias_list))
            for word_ in person_all:
                persons = []
                if self.aho_policical_person.exists(word_):
                    persons = self.aho_policical_person.get(word_)
                persons.append(person)
                self.aho_policical_person.add_word(word_, persons)
        self.aho_policical_person.make_automaton()
        # s = self.aho_policical_person.get('习近平')
        # pprint.pprint(s)
        out = open('./mod/aho_policical_person.aho', 'wb')
        out.write(pickle.dumps(self.aho_policical_person))
        out.close()

    def testAho(self):
        sent = '本院受理的原告易纲诉被告吴勇、王国珍机动车交通事故责任纠纷一案,现已审理终结。判决如下:一、自本判决生效之日起三日内,王国珍赔偿杨旭维修费11703元;二、驳回杨旭的其他诉讼请求。因你下落不明,现依法向你公告送达本案的民事判决书。自本公告发出之日起,经过60日即视为送达。如不服本判决,可在判决书送达之日起十五日内,向本院递交上诉状,并按对方当事人的人数提出副本,上诉于广州市中级人民法院。特此公告。'
        file = open('./mod/aho_policical_person.aho', 'rb')
        aho_policical_person = pickle.load(file)
        for word in aho_policical_person.iter('刘惠'):
            pprint.pprint(word)

    '''
    Recognise political figures in a text.
    repeat:     whether every occurrence of a name is resolved, even if it appears more than once
    att_weight: whether to weight the disambiguation with the person's title/attribute keywords
    geo_weight: whether to weight the disambiguation with geographic locations
    '''
    def recongnizePoliticalPerson(self, sent, repeat=False, att_weight=True, geo_weight=True):
        pperon_candidates = []
        pperson_sure = []
        npperson_sure = []
        # One sentence may contain several political names, possibly namesakes. In a pure
        # pattern-matching mode, names sharing characters would also be extracted: a
        # two-character politician name contained in a three-character ordinary name would be
        # misjudged, so the most reliable approach is still segmentation plus dependency
        # parsing (ATT analysis).
        # Lexical analysis must run first. The LTP server is used here because jieba is not
        # accurate enough, so ltp_server has to be running.
        nrs, geos = self.findPerson(sent)
        # for word in self.aho_policical_person.iter(sent):
        for nr in nrs:
            if not self.aho_policical_person.exists(nr):
                # Only handle political names (and ordinary names identical to them); skip other names.
                continue
            ppersons = self.aho_policical_person.get(nr)  # includes namesake politicians, but not non-politicians
            # If the same name appears several times in one sentence, resolve it only once for efficiency.
            flag = True
            if not repeat:
                for pperon_candidate in pperon_candidates:
                    if pperon_candidate.get('cname') == ppersons[0].get('cname'):
                        flag = False
            if not flag:
                continue
            pperon_candidates = pperon_candidates + ppersons
            # Feed the attributive (ATT) keywords into the disambiguation as extra weight.
            att = []
            if att_weight:
                att = nrs.get(nr)
            # Geographic-location weighting.
            geo = []
            if geo_weight:
                # geo = self.geoKG.parseDoc_global(sent)
                geo = geos
            # sent_embedding_res is unused for now and only kept for the original interface.
            # A category check could be added here: judge from the title whether the person is
            # an official and filter on that.
            # Compared with the whole sentence, the ATT keywords are closer to the person
            # itself; the other keywords are background.
            # Ideally a knowledge-graph-level check would run before this combined disambiguation.
            sent_embedding_res, wds_embedding_res = self.detector.detect_main(
                sent, ppersons[0].get('cname'), att, geo)
            concept = self.detector.getConcept(wds_embedding_res[0][0])  # fetch the metadata back
            for pperson in ppersons:
                # '政治人物' is the tag Baidu assigns to such people; for extra accuracy,
                # check whether the record matches (here: by birth date).
                if concept.get('出生日期') == pperson.get('baikeconcept').get('出生日期'):
                    # and '政治人物' in concept.get('tags'):
                    # logging.info(pperson)
                    # pprint.pprint(pperson)
                    pperson_sure.append(pperson)
                    break
            if pperson not in pperson_sure:
                concept['是否政要'] = '否'
                # pprint.pprint(concept)
                npperson_sure.append(concept)
        # Persist the Baidu lookup cache.
        self.detector.save_cache()
        pprint.pprint(pperson_sure)
        pprint.pprint(npperson_sure)
        return pperson_sure, npperson_sure

    def recongnizePerson(self, sent, repeat=False, att_weight=True, geo_weight=True):
        pperon_candidates = []
        # Same caveats as in recongnizePoliticalPerson: one sentence may contain several
        # (possibly namesake) person names, so segmentation plus dependency parsing is used.
        # Lexical analysis must run first; the LTP server is used because jieba is not
        # accurate enough, so ltp_server has to be running.
        nrs, geos = self.findPerson(sent)
        # for word in self.aho_policical_person.iter(sent):
        for nr in set(nrs):
            att = []
            if att_weight:
                att = nrs.get(nr)
            # Geographic-location weighting.
            geo = []
            if geo_weight:
                # geo = self.geoKG.parseDoc_global(sent)
                geo = geos
            # sent_embedding_res is unused for now and only kept for the original interface.
            sent_embedding_res, wds_embedding_res = self.detector.detect_main(sent, nr, att, geo)
            concept = self.detector.getConcept(wds_embedding_res[0][0])  # fetch the metadata back
            pperon_candidates.append(concept)
        pprint.pprint(pperon_candidates)
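# A minimal usage sketch for PersonDisambiguation, assuming the LTP server and the
# ./mod/*.bin and ./mod/*.aho artefacts referenced above are already in place.
def demo_person_disambiguation():
    pd = PersonDisambiguation()
    sent = '本院受理的原告易纲诉被告吴勇、王国珍机动车交通事故责任纠纷一案,现已审理终结。'
    pperson_sure, npperson_sure = pd.recongnizePoliticalPerson(sent)
    pprint.pprint(pperson_sure)
    pprint.pprint(npperson_sure)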
class Chatbot:
    def __init__(self):
        self.parameters = None
        self.textProcess = None
        self.model = None

    def main(self):
        print('Welcome to Chatbot')
        stream = open("parameters.txt", 'r')
        self.parameters = yaml.safe_load(stream)
        self.textProcess = TextProcess(self.parameters)
        if not os.path.isfile('chatbotModel.pkl'):
            input_context = Input(shape=(self.parameters['maxLength'], ),
                                  dtype="int32", name="input_context")
            embeddingLayer = Embedding(
                self.parameters['vocabularySize'],
                output_dim=self.parameters['embeddingOutputDim'],
                weights=[self.textProcess.embedding_matrix],
                input_length=self.parameters['maxLength'],
                trainable=True)
            embedding_context = embeddingLayer(input_context)
            layer = GlobalMaxPool1D()(embedding_context)
            layer = Dense(int(self.parameters['trainData'] / 2), activation='relu')(layer)
            outputs = Dense(self.parameters['trainData'], activation='softmax')(layer)
            self.model = Model(input=[input_context], output=[outputs])
            adam = Adam(lr=self.parameters['learningRate'])
            self.model.compile(loss="categorical_crossentropy", optimizer=adam)
            self.model.summary()
            self.model.fit(
                self.textProcess.inputSequences[:self.parameters['trainData']],
                self.textProcess.getDecoderOutputData(
                    self.textProcess.targetSequences[:self.parameters['trainData']]),
                batch_size=self.parameters['batchSize'],
                epochs=self.parameters['numEpochs'])
            self.saveDataset()
        else:
            self.loadDataset()
        self.start_chatbot()

    def start_chatbot(self):
        while True:
            question = input('You: ')
            if question == '' or question == 'exit':
                break
            answer = self.decode_sequence(question)
            print('Bot: ' + format(answer))
            print()

    def decode_sequence(self, input_seq):
        target_seq = self.textProcess.getSentenceTokens(input_seq)
        states_value = self.model.predict(target_seq)
        sampled_token_index = np.argmax(states_value[0, :])
        decoded_sentence = self.textProcess.targetData[sampled_token_index]
        return decoded_sentence

    def saveDataset(self):
        with open('chatbotModel.pkl', 'wb') as handle:
            data = {'model': self.model}
            pickle.dump(data, handle, -1)

    def loadDataset(self):
        with open('chatbotModel.pkl', 'rb') as handle:
            data = pickle.load(handle)
            self.model = data['model']
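# A minimal entry-point sketch, assuming this file is run directly to train (or reload)
# the model and start the chat loop.
if __name__ == '__main__':
    bot = Chatbot()
    bot.main()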