def __init__(self, max_term_count=None, dict_path=None):
    """Initialize the segmenter, loading the class-wide word dictionary once.

    Args:
        max_term_count: maximum number of tokens per segmentation; falls
            back to the module-level default_max_term_count when falsy.
        dict_path: path to the segmentation dictionary; falls back to the
            module-level default_dict_path when falsy.
    """
    if max_term_count:
        self.m_max_term_count = max_term_count
    else:
        self.m_max_term_count = default_max_term_count
    # The dictionary handle is shared across instances; load it only once.
    if not WordSeg.m_dict_loaded:
        if dict_path:
            WordSeg.m_dict_path = dict_path
        else:
            WordSeg.m_dict_path = default_dict_path
        # Log the dictionary path to stderr (consistent with the other
        # loaders in this file) so stdout stays clean for program output.
        print >> sys.stderr, WordSeg.m_dict_path
        WordSeg.m_dict_handle = wordseg.scw_load_worddict(
            WordSeg.m_dict_path)
        if WordSeg.m_dict_handle:
            WordSeg.m_dict_loaded = True
    # Per-instance working buffers: scw output is sized 10x to hold all
    # segmentation granularities.
    self.m_result_handle = wordseg.scw_create_out(self.m_max_term_count * 10)
    self.m_token_handle = wordseg.create_tokens(self.m_max_term_count)
    self.m_token_handle = wordseg.init_tokens(self.m_token_handle,
                                              self.m_max_term_count)
    # Expose segmentation mode flags by name (mutates the existing mapping).
    self.m_mode['WPCOMP'] = wordseg.SCW_WPCOMP
    self.m_mode['BASIC'] = wordseg.SCW_BASIC
    self.m_mode['SUBPH'] = wordseg.SCW_SUBPH
    self.m_mode['NEWWORD'] = wordseg.SCW_NEWWORD
    self.m_mode['HUMAN'] = wordseg.SCW_HUMANNAME
    self.m_mode['BOOK'] = wordseg.SCW_BOOKNAME
    self.m_mode['DISAMB'] = wordseg.SCW_DISAMB
def __init__(self, dict_path):
    """Load segmentation and POS-tagging resources rooted at dict_path."""
    print >> sys.stderr, 'WordSegUtil constructed'
    self.MAX_TERM_CNT = 2048
    # Resolve dictionary locations up front.
    seg_dict_dir = os.path.join(dict_path, 'wordseg/chinese_gbk')
    tag_dict_dir = os.path.join(dict_path, 'postag')
    self.scw_worddict = wordseg.scw_load_worddict(seg_dict_dir)
    self.scw_tagdict = postag.tag_create(tag_dict_dir)
    # Output buffer sized 10x the token limit, plus the token workspace.
    self.scw_out = wordseg.scw_create_out(self.MAX_TERM_CNT * 10)
    self.tokens = wordseg.init_tokens(
        wordseg.create_tokens(self.MAX_TERM_CNT), self.MAX_TERM_CNT)
def __init__(self, segdict_conf_path, segdict_path):
    '''
    @brief Constructor: load the segmentation config and dictionary.
    @param segdict_conf_path path to the wordseg configuration file
    @param segdict_path path to the segmentation dictionary
    '''
    self.max_term_count = 512
    try:
        self.conf_handle = wordseg.scw_load_conf(segdict_conf_path)
        self.dict_handle = wordseg.scw_load_worddict(segdict_path)
        # Result buffer sized 10x to hold all segmentation granularities.
        self.result_handle = wordseg.scw_create_out(self.max_term_count * 10)
        self.token_handle = wordseg.create_tokens(self.max_term_count)
        self.token_handle = wordseg.init_tokens(self.token_handle,
                                                self.max_term_count)
    except Exception as e:
        # Best-effort: log and continue; callers must check the handles.
        log.warning("SegDict Load Error! error=%s", e)
def __init__(self):
    """Load the wordseg/postag/wordrank dictionaries bundled in ./dict
    next to this source file, and build the per-instance work buffers.
    """
    print >> sys.stderr, "WordSegUtil constructed"
    self.maxTermCount = 2048
    dict_ab_url = (os.path.dirname(os.path.abspath(__file__))) + "/dict"
    # Load dictionaries.
    self.hWordDict = wordseg.scw_load_worddict(
        os.path.join(dict_ab_url, "worddict"))
    self.hTagDict = postag.tag_create(os.path.join(dict_ab_url, "tagdict"))
    # NER dictionary intentionally disabled:
    # hNerDict = wordner.ner_dict_load(os.path.join(dict_ab_url, "nerdict"))
    self.hRankDict = wordrank.wdr_create(
        os.path.join(dict_ab_url, "rankdict"))
    # Output buffer sized 10x the token limit.
    self.hScwOut = wordseg.scw_create_out(self.maxTermCount * 10)
    # hNerOut = wordner.ner_out_create(hNerDict, self.maxTermCount)
    self.hRanks = wordrank.create_ranks(self.maxTermCount)
    # Token workspace.
    self.hTokens = wordseg.create_tokens(self.maxTermCount)
    self.hTokens = wordseg.init_tokens(self.hTokens, self.maxTermCount)
    # Named-entity whitelist: only these tags survive NER filtering.
    # (Set literal instead of set([...]) — same contents, idiomatic form.)
    self.nerWhiteTags = {
        "PER",          # person name
        # "LOC",        # location (disabled)
        # "ORG",        # organization (disabled)
        # "SFT",        # software (disabled)
        "GME",          # game
        "SNG",          # song
        # "NVL",        # novel (disabled)
        "VDO",          # video
        "BRD",          # brand
        "CTN",          # cartoon / anime
        "VDO_MVE",      # movie
        "VDO_TV",       # TV series
        "VDO_TVSHOW",   # TV show
    }
def __init__(self, max_term_count=None, dict_path=None):
    """Initialize the segmenter, loading the class-wide word dictionary once.

    Args:
        max_term_count: maximum number of tokens per segmentation; falls
            back to the module-level default_max_term_count when falsy.
        dict_path: path to the segmentation dictionary; falls back to the
            module-level default_dict_path when falsy.
    """
    if max_term_count:
        self.m_max_term_count = max_term_count
    else:
        self.m_max_term_count = default_max_term_count
    # The dictionary handle is shared across instances; load it only once.
    if not WordSeg.m_dict_loaded:
        if dict_path:
            WordSeg.m_dict_path = dict_path
        else:
            WordSeg.m_dict_path = default_dict_path
        # Log the dictionary path to stderr (consistent with the other
        # loaders in this file) so stdout stays clean for program output.
        print >> sys.stderr, WordSeg.m_dict_path
        WordSeg.m_dict_handle = wordseg.scw_load_worddict(WordSeg.m_dict_path)
        if WordSeg.m_dict_handle:
            WordSeg.m_dict_loaded = True
    # Per-instance working buffers: scw output is sized 10x to hold all
    # segmentation granularities.
    self.m_result_handle = wordseg.scw_create_out(self.m_max_term_count * 10)
    self.m_token_handle = wordseg.create_tokens(self.m_max_term_count)
    self.m_token_handle = wordseg.init_tokens(self.m_token_handle,
                                              self.m_max_term_count)
    # Expose segmentation mode flags by name (mutates the existing mapping).
    self.m_mode['WPCOMP'] = wordseg.SCW_WPCOMP
    self.m_mode['BASIC'] = wordseg.SCW_BASIC
    self.m_mode['SUBPH'] = wordseg.SCW_SUBPH
    self.m_mode['NEWWORD'] = wordseg.SCW_NEWWORD
    self.m_mode['HUMAN'] = wordseg.SCW_HUMANNAME
    self.m_mode['BOOK'] = wordseg.SCW_BOOKNAME
    self.m_mode['DISAMB'] = wordseg.SCW_DISAMB
if query == "" or title == "": continue query = urllib.unquote(query) title = urllib.unquote(title) label = int(line[7]) pos.append([query, title, label]) strip_chars = [".", ",", "-", "_", ":"] final = [] MAX_TERM_COUNT = 1024 dict_handle = wordseg.scw_load_worddict("./dict/wordseg_dict/") result_handle = wordseg.scw_create_out(MAX_TERM_COUNT) token_handle = wordseg.create_tokens(MAX_TERM_COUNT) token_handle = wordseg.init_tokens(token_handle, MAX_TERM_COUNT) for query, title, label in pos: for char in strip_chars: query = query.strip(char) title = title.strip(char) query_title = [] for line in [query, title]: wordseg.scw_segment_words(dict_handle, result_handle, line, 1) token_count = wordseg.scw_get_token_1(result_handle, wordseg.SCW_WPCOMP, token_handle, MAX_TERM_COUNT) query_title.append([ token[7]