def analyze(self, string):
    '''Segment ``string`` and tag the resulting words for POS and NER.

    Returns a list of three strings, in this order:
      1. the words from ``segmenter.seg`` joined by single spaces,
      2. the ``self.tag_pos`` result flattened by ``_concat_tuples``,
      3. the ``self.tag_ner`` result flattened by ``_concat_tuples``.
    '''
    tokens = segmenter.seg(string)
    # The same token list feeds both taggers; the list literal is evaluated
    # left to right, so segmentation output comes first, then POS, then NER.
    return [
        " ".join(tokens),
        _concat_tuples(self.tag_pos(tokens)),
        _concat_tuples(self.tag_ner(tokens)),
    ]
def _concat_tuples(tagging):
    '''Flatten (word, tag) pairs into one space-separated "word/tag" string.

    tagging: iterable of 2-item pairs; each pair's items are concatenated
        with "/" between them, so both must support ``+`` with a string.
    Returns the joined string ("" when ``tagging`` is empty).
    '''
    TOKEN_BLANK = " "
    return TOKEN_BLANK.join(x + "/" + y for (x, y) in tagging)


# Read the input file: one document per line, line-break characters removed.
# The ``with`` block guarantees the handle is closed (the original never
# closed it).
docs = []
with codecs.open(os.path.join(BASE_DIR, 'docs_test.txt'), 'r', encoding='utf-8') as file:
    for line in file:
        line = line.replace("\n", "").replace("\r", "")
        docs.append(line)

# Test each individual module on the first document and write the results.
# The original ended with ``fileOut.close`` (no parentheses), which only
# referenced the method and never closed or flushed the file; the ``with``
# block closes it reliably, even if a module raises.
with codecs.open(os.path.join(BASE_DIR, 'modules_test_results.txt'), 'w', encoding='utf-8') as fileOut:
    words = segmenter.seg(docs[0])
    pos_tagging = _concat_tuples(tagger_pos.predict(words))
    ner_tagging = _concat_tuples(tagger_ner.predict(words))
    # ``write`` is the right call for a single string; ``writelines`` expects
    # an iterable of strings.
    fileOut.write(" ".join(words) + "\n")
    fileOut.write(pos_tagging + "\n")
    fileOut.write(ner_tagging + "\n")

# Echo the same three results to stdout, UTF-8 encoded.
print (" ".join(words).encode('utf-8'))
print (pos_tagging.encode('utf-8'))
print (ner_tagging.encode('utf-8'))
#coding:utf-8
from __future__ import unicode_literals # compatible with python3 unicode

from nlp_dl import segmenter
from nlp_dl import pos_tagger

# Load the POS tagging model with lang='zh'.
tagger = pos_tagger.load_model(lang='zh')

#Segmentation
text = "我爱吃北京烤鸭" # unicode coding, py2 and py3 compatible
words = segmenter.seg(text)
print(" ".join(words).encode('utf-8'))

#POS Tagging
tagging = tagger.predict(words)
for (w, t) in tagging:
    # Named ``word_tag`` rather than ``str`` so the builtin ``str`` is not
    # shadowed for the rest of the module.
    word_tag = w + "/" + t
    print(word_tag.encode('utf-8'))

#Results
#我/r
#爱/v
#吃/v
#北京/ns
#烤鸭/n
def segment(self, string):
    '''Segment ``string`` and return the words given back by ``segmenter.seg``.'''
    return segmenter.seg(string)
#coding=utf-8
from __future__ import unicode_literals

from nlp_dl import segmenter

# Sample sentence to run through the segmenter.
text = "我刚刚在浙江卫视看了电视剧老九门,觉得陈伟霆很帅"
segList = segmenter.seg(text)
text_seg = " ".join(segList)

# Print the raw sentence first, then its space-separated segmentation,
# both UTF-8 encoded.
for output in (text, text_seg):
    print(output.encode('utf-8'))