Example #1
    def seg_word(self, word, mode = "BASIC"):
        ret = wordseg.scw_segment_words(WordSeg.m_dict_handle, self.m_result_handle, word, len(word), 1)
        if ret < 0:
            return None
        token_count = wordseg.scw_get_token_1(self.m_result_handle, self.m_mode[mode], self.m_token_handle, self.m_max_term_count)
        l = wordseg.tokens_to_list(self.m_token_handle, token_count)
        ts = []
        for token in l:
            ts.append(token[7])

        return ts
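This snippet assumes the dictionary, result and token handles were created elsewhere in the class; token[7] is the term text, as in the get_words example further down. Below is a minimal setup sketch, modeled on the initialization calls shown in Example #4; the dictionary path and the m_mode mapping are assumptions, not taken from this example:

    def __init__(self, dict_dir="./dict"):
        # assumed setup for the handles used by seg_word above;
        # the dictionary directory is a placeholder path
        self.m_max_term_count = 2048
        # assumed mapping from mode names to wordseg constants
        self.m_mode = {"BASIC": wordseg.SCW_BASIC, "WPCOMP": wordseg.SCW_WPCOMP}
        WordSeg.m_dict_handle = wordseg.scw_load_worddict(dict_dir + "/worddict")
        self.m_result_handle = wordseg.scw_create_out(self.m_max_term_count * 10)
        self.m_token_handle = wordseg.create_tokens(self.m_max_term_count)
        self.m_token_handle = wordseg.init_tokens(self.m_token_handle,
                                                  self.m_max_term_count)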
Example #2
    def tokenizeString(self, text, encoding='utf8', seg_type='WPCOMP'):
        """
        # @Synopsis  tokenize a given text string, return each token together
        # with its part-of-speech tag (pos)
        #
        # @Args text string to be tokenized
        # @Args encoding supported encodings: utf8, gbk and unicode
        # @Args seg_type basic or complex mode
        #
        # @Returns   dict{'errno': error number, 'data': [(token, pos)]}
        """
        ret = {
            'errno': 0,
            'data': [],
        }

        if len(text) == 0:
            return ret
        try:
            if encoding == 'utf8':
                text = text.decode('utf8', errors='ignore').encode('gbk')
            elif encoding == 'unicode':
                text = text.encode('gbk')
            data = []
            wordseg.scw_segment_words(self.scw_worddict, self.scw_out, text,
                                      len(text), 1)
            token_cnt = wordseg.scw_get_token_1(self.scw_out,
                                                self.SEG_TYPE_DICT[seg_type],
                                                self.tokens, self.MAX_TERM_CNT)
            tokens = wordseg.tokens_to_list(self.tokens, token_cnt)

            token_cnt = postag.tag_postag(self.scw_tagdict, self.tokens,
                                          token_cnt)
            postag_ret = postag.print_tags(self.tokens, token_cnt)

            for token, pos in postag_ret:
                token = token.decode('gbk', 'ignore')
                data.append([token, pos])
            ret['data'] = data
            return ret

        except Exception as e:
            print e.message
            if encoding == 'unicode':
                print text.encode('utf8')
            else:
                print text.decode(encoding).encode('utf8')
            ret['errno'] = 1
            return ret
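Given the docstring above, a hedged usage sketch; the enclosing class and its constructor are not part of this example, so the seg instance below is a placeholder:

# hypothetical caller; seg stands for an instance of the class that defines
# tokenizeString (its constructor is not shown in this example)
text = u"百度是一家高科技公司".encode("utf8")
result = seg.tokenizeString(text, encoding='utf8', seg_type='WPCOMP')
if result['errno'] == 0:
    for token, pos in result['data']:
        # token is a unicode string, pos its part-of-speech tag
        print token.encode('utf8'), pos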
Example #3
    def get_words(self, content):
        '''
        @brief get the word segmentation result
        @param content text to be segmented
        @return the segmentation result, returned as a list
        '''

        ANGTYPE_SIMP_CHINESE = 1  # language type: 1 = Simplified Chinese; see ul_ccode.h for details
        succ = 1
        if (succ == wordseg.scw_segment_words(self.dict_handle, self.result_handle, content,
                                              ANGTYPE_SIMP_CHINESE)):
            token_count = wordseg.scw_get_token_1(self.result_handle, wordseg.SCW_BASIC,
                                                  self.token_handle, self.max_term_count)
            token_list = wordseg.tokens_to_list(self.token_handle, token_count)
            word_list = [token[7] for token in token_list]
            return word_list
        else:
            log.warning("[Segment Word Fail! func=scw_segment_words,content=%s", content)
            return []
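A short hedged usage sketch of get_words; the instance construction is not part of this example, and content is assumed to be GBK-encoded bytes, matching the other examples in this listing:

# hypothetical caller; segger stands for an instance of the class that
# defines get_words
content = u"今天天气不错".encode("gbk")
words = segger.get_words(content)
if words:
    print " ".join(words)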
Example #4
class Tokenizer():
    def __init__(self):
        print >> sys.stderr, "WordSegUtil constructed"
        self.maxTermCount = 2048
        dict_ab_url = (os.path.dirname(os.path.abspath(__file__))) + "/dict"
        #print dict_ab_url
        # load dictionaries
        #print os.path.join(dict_ab_url, "worddict")
        self.hWordDict = wordseg.scw_load_worddict(
            os.path.join(dict_ab_url, "worddict"))
        self.hTagDict = postag.tag_create(os.path.join(dict_ab_url, "tagdict"))
        # hNerDict  = wordner.ner_dict_load(os.path.join(dict_ab_url, "nerdict"))
        self.hRankDict = wordrank.wdr_create(
            os.path.join(dict_ab_url, "rankdict"))

        self.hScwOut = wordseg.scw_create_out(self.maxTermCount * 10)
        # hNerOut = wordner.ner_out_create(hNerDict, self.maxTermCount)
        self.hRanks = wordrank.create_ranks(self.maxTermCount)

        # token
        self.hTokens = wordseg.create_tokens(self.maxTermCount)
        self.hTokens = wordseg.init_tokens(self.hTokens, self.maxTermCount)

        # named-entity (proper-noun) tag whitelist
        self.nerWhiteTags = set([
            "PER",  # person
            #"LOC",          # place
            #"ORG",          # organization
            #"SFT",          # software
            "GME",  # game
            "SNG",  # song
            #"NVL",          # novel
            "VDO",  # video
            "BRD",  # brand
            "CTN",  # animation / comics
            "VDO_MVE",  # movie
            "VDO_TV",  # TV series
            "VDO_TVSHOW"  # TV show
        ])

    def __del__(self):
        wordrank.destroy_ranks(self.hRanks)
        wordseg.destroy_tokens(self.hTokens)

        # wordner.ner_out_destroy(Tokenize.hNerOut)
        wordseg.scw_destroy_out(self.hScwOut)

        wordrank.wdr_destroy(self.hRankDict)
        # wordner.ner_dict_destroy(Tokenize.hNerDict)
        postag.tag_destroy(self.hTagDict)
        wordseg.scw_destroy_worddict(self.hWordDict)

        print >> sys.stderr, "Tokenize destroied"

    def tokenize_string(self, text, coding="utf8", segType=SEG_DEFAULT):
        ret = {"error": 0, "reason": "", "ret": [], "text": text}
        try:
            if coding == "utf8":
                text = text.decode("utf8").encode("gbk")
            elif coding == 'unicode':
                text = text.encode('gbk')
            segRes = []
            # word segmentation
            if len(text) == 0 or not isinstance(text, str):
                return ret
            wordseg.scw_segment_words(self.hWordDict, self.hScwOut, text,
                                      len(text), 1)
            # note: a badly encoded Chinese string will raise an exception here
            # if 0 > wordseg.scw_segment_words(Tokenize.hWordDict, Tokenize.hScwOut, text, 1):
            #     ret["error"]=1
            #     ret["reason"]="scw_segment_words failed"
            #     return ret
        except Exception as e:
            ret["error"] = 1
            ret["reason"] = "scw_segment_words failed"
            return ret

        tokensLen = wordseg.scw_get_token_1(self.hScwOut, segType,
                                            self.hTokens, self.maxTermCount)
        tokensList = wordseg.tokens_to_list(self.hTokens, tokensLen)

        # named-entity recognition
        # if 0 > wordner.ner_tag(Tokenize.hNerDict, Tokenize.hTokens, tokensLen, Tokenize.hNerOut, langid):
        #     print >> sys.stderr, "WARNING: ner_tag failed"
        #     return segRes, nerRes
        #
        # gran = 2
        # nerRes = wordner.get_tag_list(Tokenize.hNerOut, Tokenize.hTokens, tokensLen, gran)
        # nerRes = [ (term, wordner.get_type_name(Tokenize.hNerDict, langid, nerTag)) for term, nerTag in nerRes ]
        # nerRes = [ (term, nerTag) for term, nerTag in nerRes if nerTag in Tokenize.nerWhiteTags ]

        #tokensLen = wordrank.get_nertokens(Tokenize.hScwOut, Tokenize.hNerOut, Tokenize.hTokens, Tokenize.maxTermCount)
        #tokensList = wordseg.tokens_to_list(Tokenize.hTokens, tokensLen)

        # part-of-speech tagging
        tokensLen = postag.tag_postag(self.hTagDict, self.hTokens, tokensLen)
        postagRes = postag.print_tags(self.hTokens, tokensLen)

        position = 0
        for token, pos in postagRes:
            token = token.decode('gbk', 'ignore')
            segRes.append([token, pos, position])
            position += len(token)
        ret["ret"] = segRes
        #return segRes
        return ret
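A hedged usage sketch for the Tokenizer class above; SEG_DEFAULT is a module-level constant that is not defined in this snippet, so an explicit wordseg mode is passed instead:

# hypothetical caller for Tokenizer.tokenize_string
tk = Tokenizer()
text = u"我爱北京天安门".encode("utf8")
res = tk.tokenize_string(text, coding="utf8", segType=wordseg.SCW_WPCOMP)
if res["error"] == 0:
    for token, pos, position in res["ret"]:
        print token.encode("utf8"), pos, position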
Example #5
for query, title, label in pos:

    for char in strip_chars:
        query = query.strip(char)
        title = title.strip(char)

    query_title = []
    for line in [query, title]:
        wordseg.scw_segment_words(dict_handle, result_handle, line, 1)
        token_count = wordseg.scw_get_token_1(result_handle,
                                              wordseg.SCW_WPCOMP, token_handle,
                                              MAX_TERM_COUNT)
        query_title.append([
            token[7]
            for token in wordseg.tokens_to_list(token_handle, token_count)
        ])

    query = " ".join(query_title[0])
    title = " ".join(query_title[1])
    final.append([query, title, label])

wordseg.destroy_tokens(token_handle)
wordseg.scw_destroy_out(result_handle)
wordseg.scw_destroy_worddict(dict_handle)

for query, title, label in final:
    _, ti, _ = random.choice(pos)
    if random.randint(0, 1) == 1:
        print "\t".join(map(str, [query, ti, title, 0]))
    else: