import os
import codecs


class segment:
    def __init__(self):
        LTP_DATA_DIR = 'resources/ltp_data_v3.4.0/'
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        from pyltp import Segmentor
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(cws_model_path, '/path/to/your/lexicon')

    def seg(self, text):
        words = self.segmentor.segment(text)
        return words

    def destroy(self):
        self.segmentor.release()

    def segFile(self, infile, outfile):
        data = codecs.open(infile, 'r')
        out = codecs.open(outfile, 'w')  # , 'utf-8'
        for line in data:
            fields = line.strip().split('\t')
            out.write(fields[0] + '\t' + '\t'.join(
                [' '.join(self.seg(fields[i])) for i in range(1, len(fields))]) + '\n')
        data.close()
        out.close()
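# Illustrative usage sketch (not part of the original snippet): it assumes the `segment`
# class above is importable and that the model and lexicon paths exist on disk; the sample
# sentence is reused from another snippet in this collection.
if __name__ == '__main__':
    tokenizer = segment()
    print(' '.join(tokenizer.seg('中国进出口银行与中国银行加强合作')))
    tokenizer.destroy()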
def feature_about():
    # Fetch the feature list
    feature_dict = NewsUtil.get_feature()
    # For each news item, collect the feature word and the 5 words that follow it
    logger.info("In Prepare Raw News...")
    raw_news_data = CommonUtil.read_excel(RAW_NEWS_DEMO_PATH)
    raw_news_table = raw_news_data.sheet_by_index(0)
    raw_news_rows = raw_news_table.nrows
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load_with_lexicon(cws_model_path, CFETSFX_LEXICON_PATH)  # load the model; the second argument is the external lexicon path
    feature_about_list = list()
    for rowN in range(0, raw_news_rows):
        news_content = raw_news_table.cell_value(rowN, 2)
        sentences = SentenceSplitter.split(news_content)
        for sentence in sentences:
            print(sentence)
            # Word segmentation
            words = segmentor.segment(sentence)
            print(list(words))
            for word_index in range(0, len(words)):
                word = words[word_index]
                for feature_word in feature_dict.values():
                    if feature_word in word:
                        about_list = list()
                        count = 0
                        while word_index < len(words) and count < 6:
                            about_list.append(words[word_index])
                            count += 1
                            word_index += 1
                        feature_about_list.append(about_list)
                        print(about_list)
                        break
    segmentor.release()
    CommonUtil.write_csv(FEATURE_ABOUT_PATH, feature_about_list)
def pyltp_cutting(sentence):
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    result = segmentor.segment(sentence)  # segment the sentence
    # print('\t'.join(words))
    segmentor.release()  # release the model
    return result
def genData():
    path = "/home/liberty/Sentiment/sentiment-data/pnn_annotated.txt"
    MODELDIR = "/home/liberty/ltp_data"
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    posList = []
    senList = []
    with open(path, "r") as file:
        with open("/home/liberty/Sentiment/sentiment-data/After.txt", "w") as out:
            with open("/home/liberty/Sentiment/sentiment-data/Pos.txt", "w") as posOut:
                cnt = 0
                for line in file.readlines():
                    random.seed(cnt * 10)
                    pos, sentence = line.split("\t")
                    words = list(segmentor.segment(sentence))
                    if cnt < 2500:
                        length = len(words)
                        unks = int(length * 0.1)
                        for i in range(unks):
                            idx = random.randint(0, length - 1)
                            words[idx] = "UNK"
                    senList.append(words)
                    posList.append(eval(pos))
                    out.write(" ".join(words) + "\n")
                    posOut.write(pos + "\n")
                    cnt += 1
    segmentor.release()
    return posList, senList
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model with a custom lexicon
        self.postagger = Postagger()  # part-of-speech tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release the loaded models."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
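# Hypothetical usage sketch, assuming the Config paths above point at a valid ltp_data
# directory; only the segmenter is exercised here, and the sample sentence is illustrative.
ltp = LTP()
print('\t'.join(ltp.segmentor.segment('中国进出口银行与中国银行加强合作')))
del ltp  # __del__ releases the loaded models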
def pyltp_cut(sentence):
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(sentence)  # segment the sentence
    segmentor.release()  # release the model
    return words
def cut_process(Questioning_path, new_word_path='./data/new_word.txt'):
    cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model')
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    with open(Questioning_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
    with open(config.stopword_path, 'r', encoding='utf8') as f:
        stopword_list = [w.strip() for w in f.readlines()]  # strip newlines so membership tests match segmented words
    new_word = []
    for line in lines:
        words = segmentor.segment(line.replace(' ', ''))
        words_list_temp = list(words)
        words_list = []
        for w in words_list_temp:
            if w not in stopword_list:
                words_list.append(w)
        for i in range(len(words_list) - 1):
            if len(words_list[i]) == 1 and len(words_list[i + 1]) == 1:
                w = words_list[i] + words_list[i + 1]
                if w not in new_word:
                    new_word.append(w)
        Logger.log_DEBUG.debug('分词结果:' + str(words_list))
    segmentor.release()
    Logger.log_DEBUG.debug('新词:' + str(new_word))
    fw = open(new_word_path, 'w', encoding='utf8')
    for w in new_word:
        fw.write(w + '\n')
    fw.close()
    return new_word
def words_split():
    """Segment each sentence and collect the verbs as relation words."""
    segmentor = Segmentor()
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor.load_with_lexicon(cws_model_path, '../data/all_word_dict.txt')
    for sentence in sentences:
        words = segmentor.segment(sentence)
        postags = postaggers(words)
        index = 0
        for word, postag in zip(words, postags):
            if postag == 'v':
                relation_words.append(word)
                # print(word)
        all_words.append(words)
    relation_words_file = open('relation_words.txt', 'w+', encoding='utf8')
    for word in relation_words:
        relation_words_file.write(word + '\n')
    # Write all words scanned so far to file
    all_words_file = open('all_words.txt', 'w+', encoding='utf8')
    for words in all_words:
        temp_words = '\t'.join(words)
        all_words_file.write(temp_words + '\n')
    segmentor.release()
def locationNER(text):
    results = []
    # Word segmentation first
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(text)  # segment the text
    # print('\t'.join(words))
    segmentor.release()
    # Then part-of-speech tagging
    postagger = Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging
    postagger.release()  # release the model
    # Finally, named-entity recognition for geographic entities
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(words, postags)  # named-entity recognition
    for i in range(0, len(netags)):
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    recognizer.release()
    return results
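# Illustrative call (not from the source): prints the place/organization fragments the
# function extracts, assuming the module-level model paths used above are valid.
print(locationNER('中国进出口银行与中国银行加强合作'))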
def preprocess_data(train_mode=True, fine=True, remove_stopwords=False):
    print("Initializing Segmentor!")
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    if remove_stopwords:
        get_stop_words()
        print(len(stop_words))
    text, y = [], []
    if train_mode:
        path = TRAIN_DATA_PATH
    else:
        path = TEST_DATA_PATH
    for line in open(path, 'r', encoding='utf-8'):
        tmp = line.split('\t')
        assert len(tmp) == 2, "Something wrong with the data!"
        if fine:
            tag, question = tmp[0], tmp[1]
        else:
            tag, question = tmp[0].split('_')[0], tmp[1]
        if remove_stopwords:
            pred_words = remove_stop_words(list(segmentor.segment(question)))
        else:
            pred_words = list(segmentor.segment(question))
        seg_text = ''
        for word in pred_words:
            seg_text += word + ' '
        text.append(seg_text)
        y.append(tag)
    segmentor.release()
    return text, y
def segmentor(sentence):
    segmentor = Segmentor()
    segmentor.load('/home/pengbin/下载/ltp_data_v3.4.0/cws.model')
    words = segmentor.segment(sentence)
    words_list = list(words)
    segmentor.release()
    return words_list
def cut(string):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment(string)
    # print('\t'.join(words))
    segmentor.release()
    return words
class LtpLanguageAnalysis(object):
    def __init__(self, model_dir="/home/xxx/ltp-3.4.0/ltp_data/"):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))

    def analyze(self, text):
        # Word segmentation
        words = self.segmentor.segment(text)
        print '\t'.join(words)
        # Part-of-speech tagging
        postags = self.postagger.postag(words)
        print '\t'.join(postags)
        # Dependency parsing
        arcs = self.parser.parse(words, postags)
        print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

    def release_model(self):
        # Release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
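# Hedged usage sketch (added for illustration): assumes the default ltp_data directory
# shown above exists; analyze() itself prints the segmentation, POS tags and arcs.
ltp = LtpLanguageAnalysis()
ltp.analyze("中国进出口银行与中国银行加强合作")
ltp.release_model()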
def cut_words():
    # Word segmentation + removal of blank lines
    # POS tag set: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model', 'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the tagger
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # word segmentation
            pos_tags = postagger.postag(words)  # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
        else:
            continue
    f.close()
    segmentor.release()
    postagger.release()
def demo_three():
    string = '这个把手该换了,我不喜欢日本和服,别把手放在我的肩膀上,工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
    segmentor = Segmentor()
    segmentor.load(sg_model_path)
    ret = segmentor.segment(string)
    print('/'.join(ret))
    segmentor.release()
def extract_views(all_sents):
    segmentor = Segmentor()
    segmentor.load(r'/home/student/project-01/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load(r'/home/student/project-01/ltp_data/pos.model')
    parser = Parser()
    parser.load(r'/home/student/project-01/ltp_data/parser.model')
    views_in_sents = []
    for i, sents in enumerate(all_sents):
        views_tmp = []
        for sent in sents:
            sent = sent.replace('\\n', '\n').strip()
            if len(sent) == 0:
                continue
            # words = list(jieba.cut(sent))
            words = list(segmentor.segment(sent))
            contains = contain_candidates(words)
            if len(contains) == 0:
                continue
            tags = list(postagger.postag(words))
            arcs = list(parser.parse(words, tags))
            sbv, head = get_sbv_head(arcs, words, tags)
            if sbv[0] is None or head[0] is None or head[0] not in contains:
                continue
            subj = sbv[0]
            view = clean_view(words[head[1] + 1:])
            views_tmp.append((subj, view, i))
        if len(views_tmp) > 0:
            views_in_sents.append({'sents': sents, 'views': views_tmp})
    segmentor.release()
    postagger.release()
    parser.release()
    return views_in_sents
def get_words_list(string):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words_list = list(segmentor.segment(string))
    segmentor.release()
    return words_list
class pyltp_model():
    def __init__(self, LTP_DATA_DIR='/Users/didi/Desktop/ltp_data_v3.4.0'):
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
        self.segmentor = Segmentor()  # initialize the segmenter
        self.postagger = Postagger()  # initialize the tagger
        self.recognizer = NamedEntityRecognizer()  # initialize the recognizer
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger.load(pos_model_path)  # load the model
        self.recognizer.load(ner_model_path)  # load the model

    def token(self, sentence):
        words = self.segmentor.segment(sentence)  # word segmentation
        words = list(words)
        postags = self.postagger.postag(words)  # POS tagging
        postags = list(postags)
        netags = self.recognizer.recognize(words, postags)  # named-entity recognition
        netags = list(netags)
        result = []
        for i, j in zip(words, netags):
            if j in ['S-Nh', 'S-Ni', 'S-Ns']:
                result.append(j)
                continue
            result.append(i)
        return result

    def close(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()  # release the models
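# Illustrative usage (not in the original): token() keeps ordinary words but replaces
# person/organization/place entities with their NER tags; the default model directory
# assumed above must exist on disk.
model = pyltp_model()
print(model.token('中国进出口银行与中国银行加强合作'))
model.close()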
def cut_word(string):
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    words = segmentor.segment(string)
    segmentor.release()
    return ' '.join(words)
def ltp_seg(s):
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = segmentor.segment(s)  # word segmentation
    s = " ".join(words)
    segmentor.release()  # release the model
    return s
class LTP_word():
    """Wrapper around the LTP pipeline.
    deal() processes a text and returns five values: the word list, POS tags, dependency
    arcs, semantic roles and named entities; release() frees the loaded models."""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # word segmentation
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # part-of-speech tagging
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # named-entity recognition
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labelling
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):
        # Run the full pipeline and return everything at once
        words = self.segmentor.segment(text)  # word segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling
        return words, postags, arcs, roles, netags

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
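# Hypothetical usage sketch: model_path is assumed to be a directory containing cws.model,
# pos.model, ner.model, parser.model, the srl model and dictionary_kfc.txt, as loaded above.
ltp = LTP_word(model_path)
words, postags, arcs, roles, netags = ltp.deal('中国进出口银行与中国银行加强合作')
print('\t'.join(words))
ltp.release()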
def namedEntityRecognize(sentence):
    '''
    Named-entity recognition with pyltp.
    Returns: 1) a list of (word, NE tag) tuples and 2) the list of NE tags.
    '''
    namedEntityTagTupleList = []
    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt'))
    words = segmentor.segment(sentence)
    segmentor.release()
    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = postagger.postag(words)
    postagger.release()
    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = recognizer.recognize(words, postags)
    recognizer.release()
    # Pack the results into (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
def cut_words(self):
    print "plot:", self.plot
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load('ltp_data/cws.model')  # load the model
    self.words = segmentor.segment(self.plot)
    print '\t'.join(self.words)
    segmentor.release()  # release the model
class LtpTree(DepTree):
    def __init__(self, dict_path=None):
        super(DepTree, self).__init__()
        print("正在加载LTP模型... ...")
        self.segmentor = Segmentor()
        if dict_path is None:
            self.segmentor.load(os.path.join(MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), dict_path)
        self.postagger = Postagger()
        self.postagger.load(os.path.join(MODELDIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(MODELDIR, "parser.model"))
        print("加载模型完毕。")

    def parse(self, sentence):
        self.words = self.segmentor.segment(sentence)
        self.postags = self.postagger.postag(self.words)
        self.arcs = self.parser.parse(self.words, self.postags)
        for i in range(len(self.words)):
            if self.arcs[i].head == 0:
                self.arcs[i].relation = "ROOT"

    def release_model(self):
        # Release the models
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
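# Usage sketch (illustrative only): MODELDIR is the module-level model directory the class
# above already relies on, and DepTree is assumed to need no constructor arguments.
tree = LtpTree()
tree.parse('中国进出口银行与中国银行加强合作')
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in tree.arcs))
tree.release_model()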
def sent_split(sentence):
    segmentor = Segmentor()
    segmentor.load(seg_model_path)
    words = segmentor.segment(sentence)
    segmentor.release()
    print('\t'.join(words))
    return words
def ltp_word(self):
    """Segment the sentence and run POS tagging, dependency parsing and NER on it."""
    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    words = segmentor.segment(self.content)
    # print("*************分词*****************")
    # print("\t".join(words))

    # Part-of-speech tagging
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # print("*************词性标注*************")
    # print(type(postags))
    # print("\t".join(postags))

    # Dependency parsing
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    arcs = parser.parse(words, postags)
    # print("*************依存句法分析*************")
    # print(type(arcs))
    # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

    # Split the heads and relations out of the parsing result
    arcs_head = []
    arcs_relation = []
    for arc in arcs:
        arcs_head.append(arc.head)
        arcs_relation.append(arc.relation)

    # Named-entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    # print("*************命名实体识别*************")
    # print("\t".join(netags))

    """
    # Semantic role labelling
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "pisrl.model"))
    roles = labeller.label(words, postags, arcs)
    print("*************语义角色标注*************")
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    """

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()

    # Call list_conversion to turn the results into lists
    words_result = list_conversion(words, postags, netags, arcs_head, arcs_relation)
    return words_result
def seg(input_file, output_file):
    segmentor = Segmentor()  # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    data = json.load(open(input_file, "r", encoding="utf-8"))
    count = 0
    for d in data:
        count += 1
        print(count)
        d['article_content'] = list(segmentor.segment(d['article_content']))
        d['article_title'] = list(segmentor.segment(d['article_title']))
        questions = d["questions"]
        for q in questions:
            q['answer'] = list(segmentor.segment(q['answer']))  # not really used; answer_span is used for the computation now
            q['question'] = list(segmentor.segment(q['question']))
            # Generate the word-level answer_span from the character-level one
            if len(q['answer']) == 0 or len(q['question']) == 0:
                continue
            answer_span_char_level = q['answer_span']
            answer_span_word_level = answer_span_char2word(d['article_content'], answer_span_char_level)
            q['answer_span'] = answer_span_word_level
    segmentor.release()  # release the model
    json.dump(data, open(output_file, "w", encoding="utf-8"), ensure_ascii=False)
def get_word_list(self, sentence, model):
    # Return the segmented word list
    segmentor = Segmentor()
    segmentor.load(model)
    word_list = list(segmentor.segment(sentence))
    segmentor.release()
    return word_list
class Parse_Util(object):
    def __init__(self, lexicon_path='./data/lexicon'):
        # Word segmentation
        self.segmentor = Segmentor()
        # self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)
        self.segmentor.load(cws_model_path)
        # Part-of-speech tagging
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        # Dependency parsing
        self.parser = Parser()
        self.parser.load(par_model_path)
        # Named-entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        # jieba word segmentation
        # jieba.load_userdict(lexicon_path)

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    # Parse one sentence
    def parse_sentence(self, sentence):
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        # child_dict_list = ParseUtil.build_parse_child_dict(words, arcs)
        return words, postags, netags, arcs
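# Minimal usage sketch, assuming cws_model_path, pos_model_path, par_model_path and
# ner_model_path are defined at module level as the constructor above expects.
pu = Parse_Util()
words, postags, netags, arcs = pu.parse_sentence('中国进出口银行与中国银行加强合作')
print('\t'.join(words))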
def cut_sentence(file_in):
    cutwords_list = []  # reset the result list
    file_original_txt = open(file_in, 'r', encoding='utf-8')
    stopwords = [line.rstrip() for line in open('stopwords', encoding='utf-8')]  # rstrip() removes trailing whitespace from each stopword
    segmentor = Segmentor()
    segmentor.load('cws.model')  # load the model
    sentences = file_original_txt.readlines()
    for sente in sentences:
        temp = ''  # holds the segmented form of this sentence
        sente = str(sente).encode('utf-8').decode('utf-8-sig')  # re-encode/decode so the label is not read as the illegal character '\ufeff'
        label = str(sente[0:1])  # take the label character
        temp += label + '\t'
        sente = sente[2:]  # drop the label
        words = segmentor.segment(sente)  # segment; the result type is pyltp.VectorOfString
        word_list = list(words)  # collect into a list
        for word in word_list[1:]:
            if word not in stopwords:
                temp += word + ' '
        cutwords_list.append(temp)
    segmentor.release()  # release the model
    file_original_txt.close()
    return cutwords_list
def process(index):
    ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
    sys.path.append(os.path.join(ROOTDIR, "lib"))

    # Set your own model path
    MODELDIR = os.path.join(ROOTDIR, "ltp_data")
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))

    finname = "o_" + str(index) + ".txt"
    foutname = "p_" + str(index) + ".txt"
    print finname
    count = 0
    fin = codecs.open(finname, encoding='utf-8')
    with codecs.open(foutname, 'w', encoding="utf-8") as fout:
        while 1:
            line = fin.readline()
            if not line:
                break
            tmp = line.split(" ^ {")[1]  # Get JSON
            tmp = "{" + tmp
            data = json.loads(tmp)
            content = data['content']
            # error_correction(content)
            content = content.strip()
            segmentation = ""
            for line in content.split("\n"):
                line = line.encode("utf-8")
                words = segmentor.segment(line)
                segmentation += "/".join(words)
                segmentation += "/"
            # Return type of the function is str, not unicode. Thus need to change into unicode.
            segmentation = unicode(segmentation, "utf-8")
            pinyin = add_pinyin(segmentation)
            obj = {}
            obj['flavor'] = data['flavor']
            obj['environment'] = data['environment']
            obj['service'] = data['service']
            obj['content'] = data['content']
            obj['segmentation'] = segmentation
            obj['pinyin'] = pinyin
            tmpstr = json.dumps(obj, ensure_ascii=False)
            fout.write(tmpstr)
            fout.write('\n')
            count += 1
            print count
    segmentor.release()
def __init__(self):
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model path; the model file is `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS-tagging model path; the model file is `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # NER model path; the model file is `ner.model`
    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = segmentor.segment(data)
    # print("|".join(words))
    segmentor.release()
    postagger = Postagger()  # initialize the tagger
    postagger.load(self.pos_model_path)  # load the model
    self.postags = postagger.postag(self.words)  # POS tagging
    # print('\t'.join(postags))
    postagger.release()  # release the model
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(self.ner_model_path)  # load the model
    self.netags = recognizer.recognize(self.words, self.postags)  # named-entity recognition
    # print('\t'.join(netags))
    recognizer.release()  # release the model
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
def ws_data(self):
    f = open("pnn_annotated.txt", 'r')
    total_line = 0
    orgin_attr = [0, 0, 0]
    judge_attr = [0, 0, 0]
    right = [0, 0, 0]
    segmentor = Segmentor()
    segmentor.load("cws.model")
    for line in f:
        total_line += 1
        # print 'line has been read'
        value_num = [0, 0]
        result = line.split('\t')
        ws_lst = segmentor.segment(result[1])
        # print 'this line is %s' % (line)
        classify = ''
        for i in ws_lst:
            try:
                value = self.setiment_words[i]
            except:
                pass
            else:
                if value == 1:
                    print 'positive word:%s' % i
                    value_num[0] += 1
                elif value == -1:
                    print 'negative word:%s' % i
                    value_num[1] += 1
        if value_num[0] == 0 and value_num[1] == 0:
            classify = 'neutral'
            judge_attr[0] += 1
        elif value_num[0] == value_num[1] != 0:
            classify = 'neutral'
            judge_attr[0] += 1
        elif value_num[0] > value_num[1]:
            classify = 'positive'
            judge_attr[1] += 1
        else:
            classify = 'negative'
            judge_attr[2] += 1
        print value_num
        print 'classfiy result:%s' % classify
        # count the original emotion labels
        if result[0] == '0':
            orgin_attr[0] += 1
        elif result[0] == '1':
            orgin_attr[1] += 1
        else:
            orgin_attr[2] += 1
        if (int(result[0]) == 0 and value_num[0] == 0 and value_num[1] == 0):
            # print 'neutral'
            right[0] += 1
        elif (int(result[0]) == 0 and value_num[0] == value_num[1] != 0):
            # print 'neutral'
            right[0] += 1
        elif (int(result[0]) > 0 and value_num[0] >= value_num[1] and value_num[0] != 0):
            # print 'positive'
            right[1] += 1
        elif (int(result[0]) < 0 and value_num[0] < value_num[1] and value_num[1] != 0):
            # print 'negative'
            right[2] += 1
        # print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
    print 'orgin\'s neutral, positive, negative'
    print orgin_attr
    print 'judge_attr neutral, positive, negative'
    print judge_attr
    print 'neutral, positive, negative'
    print right
    print (right[0] + right[1] + right[2])
    print 'total_line %f\n' % total_line
    print 'Accuracy so far: %f\n' % ((right[0] + right[1] + right[2]) / float(total_line))
    segmentor.release()