def tokenizeString(self, text, encoding='utf8', seg_type='WPCOMP'):
    """
    # @Synopsis  tokenize a given text string, return each token with its
    #            part-of-speech tag (pos)
    #
    # @Args text      string to be tokenized
    # @Args encoding  supports utf8, gbk and unicode
    # @Args seg_type  basic or complex mode
    #
    # @Returns dict{'errno': error number, 'data': [(token, pos)]}
    """
    ret = {
        'errno': 0,
        'data': [],
    }
    if len(text) == 0:
        return ret
    try:
        # normalize the input to GBK, which is what the segmenter expects
        if encoding == 'utf8':
            text = text.decode('utf8', 'ignore').encode('gbk')
        elif encoding == 'unicode':
            text = text.encode('gbk')
        data = []
        wordseg.scw_segment_words(self.scw_worddict, self.scw_out, text, len(text), 1)
        token_cnt = wordseg.scw_get_token_1(self.scw_out, self.SEG_TYPE_DICT[seg_type],
                                            self.tokens, self.MAX_TERM_CNT)
        tokens = wordseg.tokens_to_list(self.tokens, token_cnt)
        # tag each token with its part of speech
        token_cnt = postag.tag_postag(self.scw_tagdict, self.tokens, token_cnt)
        postag_ret = postag.print_tags(self.tokens, token_cnt)
        for token, pos in postag_ret:
            token = token.decode('gbk', 'ignore')
            data.append([token, pos])
        ret['data'] = data
        return ret
    except Exception as e:
        print e.message
        if encoding == 'unicode':
            print text.encode('utf8')
        else:
            print text.decode(encoding, 'ignore').encode('utf8')
        ret['errno'] = 1
        return ret
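# Example call site for tokenizeString (a usage sketch, not from the original source).
# It assumes `seg` is an instance of the wrapper class above whose scw_worddict,
# scw_out, scw_tagdict, tokens and MAX_TERM_CNT members are already initialized.
result = seg.tokenizeString('百度是一家高科技公司', encoding='utf8', seg_type='WPCOMP')
if result['errno'] == 0:
    for token, pos in result['data']:
        # token comes back as a unicode string, pos is its part-of-speech tag
        print token.encode('utf8'), pos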
def seg_word(self, word, mode="BASIC"):
    # segment the input; a negative return value from the C binding means failure
    ret = wordseg.scw_segment_words(WordSeg.m_dict_handle, self.m_result_handle,
                                    word, len(word), 1)
    if ret < 0:
        return None
    token_count = wordseg.scw_get_token_1(self.m_result_handle, self.m_mode[mode],
                                          self.m_token_handle, self.m_max_term_count)
    l = wordseg.tokens_to_list(self.m_token_handle, token_count)
    ts = []
    for token in l:
        # token[7] holds the token text, as in the other snippets in this section
        ts.append(token[7])
    return ts
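# Hypothetical call site for seg_word; `ws` stands in for an instance of the WordSeg
# class with its dictionary, result and token handles initialized. The segmenter works
# on GBK bytes, so the UTF-8 literal is converted first (as tokenizeString does above).
words = ws.seg_word(u'百度是一家高科技公司'.encode('gbk'), mode="BASIC")
if words is not None:
    # token strings come back GBK-encoded; decode them before printing
    print ' '.join(w.decode('gbk', 'ignore') for w in words).encode('utf8')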
def tokenize_string(self, text, coding="utf8", segType=SEG_DEFAULT):
    ret = {"error": 0, "reason": "", "ret": [], "text": text}
    try:
        if coding == "utf8":
            text = text.decode("utf8").encode("gbk")
        elif coding == 'unicode':
            text = text.encode('gbk')
        segRes = []
        # word segmentation
        if len(text) == 0 or not isinstance(text, str):
            return ret
        wordseg.scw_segment_words(self.hWordDict, self.hScwOut, text, len(text), 1)
        # note: a badly encoded Chinese string makes the call above raise an exception
        # if 0 > wordseg.scw_segment_words(Tokenize.hWordDict, Tokenize.hScwOut, text, 1):
        #     ret["error"] = 1
        #     ret["reason"] = "scw_segment_words failed"
        #     return ret
    except Exception as e:
        ret["error"] = 1
        ret["reason"] = "scw_segment_words failed"
    return ret
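# tokenize_string above stops right after scw_segment_words, so ret["ret"] is never
# filled. Based on the other snippets in this section, the missing extraction step
# would plausibly look like this standalone helper (a sketch, not the original code;
# the arguments correspond to the hScwOut and token-handle attributes used above).
def extract_tokens(hScwOut, hTokens, segType, maxTermCount):
    tokenCnt = wordseg.scw_get_token_1(hScwOut, segType, hTokens, maxTermCount)
    # token[7] is the GBK-encoded token text, as in the other snippets
    return [t[7].decode('gbk', 'ignore')
            for t in wordseg.tokens_to_list(hTokens, tokenCnt)]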
def get_words(self, content):
    '''
    @brief  get the word segmentation result
    @param  content  the text to segment
    @return the segmented words, returned as a list
    '''
    LANGTYPE_SIMP_CHINESE = 1  # language type, 1 = Simplified Chinese; see ul_ccode.h
    succ = 1
    if succ == wordseg.scw_segment_words(self.dict_handle, self.result_handle,
                                         content, LANGTYPE_SIMP_CHINESE):
        token_count = wordseg.scw_get_token_1(self.result_handle, wordseg.SCW_BASIC,
                                              self.token_handle, self.max_term_count)
        token_list = wordseg.tokens_to_list(self.token_handle, token_count)
        word_list = [token[7] for token in token_list]
        return word_list
    else:
        log.warning("[Segment Word Fail!] func=scw_segment_words, content=%s", content)
        return []
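# get_words above references dict_handle, result_handle, token_handle and
# max_term_count without showing how they are created. A plausible constructor,
# assembled from the setup calls in the last snippet of this section (a sketch;
# the class name and dictionary path are assumptions):
class Segmenter(object):
    def __init__(self, dict_path="./dict/wordseg_dict/", max_term_count=1024):
        self.max_term_count = max_term_count
        self.dict_handle = wordseg.scw_load_worddict(dict_path)
        self.result_handle = wordseg.scw_create_out(max_term_count)
        self.token_handle = wordseg.create_tokens(max_term_count)
        self.token_handle = wordseg.init_tokens(self.token_handle, max_term_count)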
MAX_TERM_COUNT = 1024
dict_handle = wordseg.scw_load_worddict("./dict/wordseg_dict/")
result_handle = wordseg.scw_create_out(MAX_TERM_COUNT)
token_handle = wordseg.create_tokens(MAX_TERM_COUNT)
token_handle = wordseg.init_tokens(token_handle, MAX_TERM_COUNT)

for query, title, label in pos:
    for char in strip_chars:
        query = query.strip(char)
        title = title.strip(char)
    query_title = []
    for line in [query, title]:
        wordseg.scw_segment_words(dict_handle, result_handle, line, 1)
        token_count = wordseg.scw_get_token_1(result_handle, wordseg.SCW_WPCOMP,
                                              token_handle, MAX_TERM_COUNT)
        query_title.append([
            token[7] for token in wordseg.tokens_to_list(token_handle, token_count)
        ])
    query = " ".join(query_title[0])
    title = " ".join(query_title[1])
    final.append([query, title, label])

wordseg.destroy_tokens(token_handle)
wordseg.scw_destroy_out(result_handle)
wordseg.scw_destroy_worddict(dict_handle)
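# A consolidated sketch of the load / segment / tear-down lifecycle the script above
# goes through, with the destroy calls moved into a finally block so the C-side
# handles are always released. Call signatures follow the five-argument form of
# scw_segment_words used earlier in this section (the script above passes four
# arguments); the dictionary path and SCW_WPCOMP granularity are the values used above.
import wordseg

def segment_all(lines, dict_path="./dict/wordseg_dict/", max_term_count=1024):
    dict_handle = wordseg.scw_load_worddict(dict_path)
    result_handle = wordseg.scw_create_out(max_term_count)
    token_handle = wordseg.init_tokens(wordseg.create_tokens(max_term_count),
                                       max_term_count)
    results = []
    try:
        for line in lines:
            # each line is expected to be a GBK-encoded byte string
            wordseg.scw_segment_words(dict_handle, result_handle, line, len(line), 1)
            token_count = wordseg.scw_get_token_1(result_handle, wordseg.SCW_WPCOMP,
                                                  token_handle, max_term_count)
            tokens = wordseg.tokens_to_list(token_handle, token_count)
            results.append([token[7] for token in tokens])  # token[7] = token text
    finally:
        wordseg.destroy_tokens(token_handle)
        wordseg.scw_destroy_out(result_handle)
        wordseg.scw_destroy_worddict(dict_handle)
    return results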