def __call__(self, input):
    """Segment one text or a sequence of texts and strip the ``/tag`` suffixes.

    Args:
        input: a single text, or a list/tuple of texts.  Each text must
            support ``.encode("utf-8", "ignore")``.

    Returns:
        A list with one entry per input text: the segmented tokens joined by
        single spaces, with everything from the last ``/`` of each token
        removed.  A text whose segmentation fails yields ``""`` (best effort,
        no exception is propagated).
    """

    def clear_tag(strin):
        """Drop the trailing ``/tag`` part from every whitespace-separated token."""
        rs = []
        for token in strin.split():
            ind = token.rfind("/")
            # ind <= 0: no slash at all, or the token starts with its only
            # slash -- keep the token untouched in both cases.
            rs.append(token[:ind] if ind > 0 else token)
        return " ".join(rs)

    if not isinstance(input, (list, tuple)):
        input = [input]
    rs = []
    for inputu in input:
        try:
            _tmp = nlpir.ParagraphProcess(inputu.encode("utf-8", "ignore"), 1)
        except Exception:
            # Best effort: fall back to an empty *byte* string so the decode
            # below works on Python 3 as well as Python 2 (a text "" has no
            # .decode on Python 3).  ``Exception`` rather than a bare except so
            # KeyboardInterrupt / SystemExit are not swallowed.
            _tmp = b""
        rs.append(clear_tag(_tmp.decode("utf-8", "ignore")))
    return rs
def split_words(s):
    """Clean *s*, segment it with NLPIR and return the filtered words.

    The text goes through the ``SplitWords`` clean-up helpers, is segmented
    by ``nlpir.ParagraphProcess`` (second argument ``True``), and the
    resulting tokens are filtered by the stop-word, blank and
    part-of-speech helpers before being returned.
    """
    # NOTE(review): the double-underscore helper names are only name-mangled
    # when this function sits inside the SplitWords class body -- presumably it
    # is a static method there; confirm against the enclosing class.

    # Text clean-up, in order: traditional -> simplified Chinese, then remove
    # tags, punctuation marks and digits.
    simplified = SplitWords.__convert(s)
    without_tags = SplitWords.__del_non_tag(simplified)
    without_punctuation = SplitWords.__del_punctuation(without_tags)
    cleaned = SplitWords.__del_digit(without_punctuation)

    # Word segmentation.
    segmented = nlpir.ParagraphProcess(cleaned, True)
    # Trim the surrounding whitespace and split on single spaces.
    tokens = segmented.strip().split(" ")

    # Remove Chinese stop words.  __del_stop works whether or not the tokens
    # carry a part-of-speech suffix; a plain ``word not in stoplist`` filter
    # would only work on tokens without one.
    tokens = SplitWords.__del_stop(tokens, SplitWords.__read_chinese_stoplist())
    # Remove English stop words.
    tokens = SplitWords.__del_stop(tokens, SplitWords.__read_english_stoplist())
    # Remove superfluous blanks.
    tokens = SplitWords.__del_blank(tokens)
    # Drop words with unwanted parts of speech and strip the part-of-speech
    # marker from the words that remain.
    return SplitWords.__del_non_pos(tokens)
def pos_tag(data):
    """Run every line of *data* through ``nlpir.ParagraphProcess``.

    NLPIR is (re-)initialised on each call with the UTF-8 encoded package
    directory and ``nlpir.UTF8_CODE``.  Each line is encoded to UTF-8,
    processed with the second argument set to ``True`` and decoded back.

    Args:
        data: iterable of text lines.

    Returns:
        A list with the decoded result for each line, in input order.
    """
    nlpir.Init(nlpir.PACKAGE_DIR.encode('UTF-8'), nlpir.UTF8_CODE, None)
    return [
        nlpir.ParagraphProcess(text.encode('utf-8'), True).decode('utf-8')
        for text in data
    ]
def segline(strin):
    """Segment one line of text with NLPIR (second argument ``0``).

    Args:
        strin: the text to segment; must support
            ``.encode("utf-8", "ignore")``.

    Returns:
        The segmenter output decoded from UTF-8 (undecodable bytes are
        ignored), or ``""`` when segmentation fails for any reason.
    """
    try:
        rs = nlpir.ParagraphProcess(strin.encode("utf-8", "ignore"), 0)
    except Exception:
        # Best effort: use an empty *byte* string so the decode below also
        # works on Python 3, where a text "" has no .decode method.
        # ``Exception`` instead of a bare except keeps Ctrl-C / SystemExit
        # working.
        rs = b""
    return rs.decode("utf-8", "ignore")
#!/usr/bin/python
#-*- coding:utf-8 -*-
import sys
import os
import re
############################
#File Name: segment.py
#Author: weitao
#Mail: [email protected]
#Created Time: 2017-05-20 12:38:33
#Description: read "label,content" lines from stdin and print
#             "label,<segmented content>" for each of them.
############################
from pynlpir import nlpir

nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE, None)
nlpir.SetPOSmap(nlpir.ICT_POS_MAP_FIRST)

for line in sys.stdin:
    record = line.strip()
    # Split once at the first comma: label before it, content after it.
    label, sep, content = record.partition(",")
    if not sep:
        # Blank or malformed line: the previous split(",", 1)[1] raised
        # IndexError here and aborted the whole run.
        if record:
            sys.stderr.write("skipping line without ',': " + record + "\n")
        continue
    # Segment only the content.  The whole line used to be passed in, which
    # ran the label through the segmenter too and duplicated it in the output.
    items = nlpir.ParagraphProcess(content, 1)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(label + "," + items)
def ParagraphProcess(self, strs):
    """Segment *strs* by delegating to ``nlpir.ParagraphProcess``.

    The second argument passed to NLPIR is ``False``.

    Args:
        strs: the text to segment, passed through unchanged.

    Returns:
        Whatever ``nlpir.ParagraphProcess`` returns.  The result used to be
        discarded, so the wrapper always returned ``None``.
    """
    return nlpir.ParagraphProcess(strs, False)