def load_worddict():
    '''Load the word dictionary and initialise the cmmseg dictionary.

    When the path built from FLAGS.worddict_directory exists, the pickled
    dictionary is read from it.  If FLAGS.use_client_cmmseg is set, a cmmseg
    dictionary directory next to it is generated (only when its uni.lib file
    is missing) and passed to init_cmmseg_dict; otherwise init_cmmseg_dict is
    called with '/usr/local/etc'.

    When the path does not exist, an empty dictionary is used, a warning is
    logged, and init_cmmseg_dict('/usr/local/etc') is called only if
    FLAGS.use_client_cmmseg is set.

    Returns:
        The result of lazy_dict() applied to the dictionary's items.
    '''
    # NOTE(review): the block nesting was reconstructed from a
    # whitespace-collapsed source -- confirm against the original layout.
    wd_file = '%s/' % (FLAGS.worddict_directory)
    if os.path.exists(wd_file):
        # NOTE: unpickling can execute arbitrary code; wd_file must come
        # from a trusted location.
        # Use a context manager so the file handle is closed even when
        # unpickling raises (the handle used to be leaked).
        with open(wd_file) as wd_fp:
            dictionary = cPickle.load(wd_fp)
        #cmmseg lib
        cmmseg_dir = "%s_cmmseg" % wd_file
        cmmseg_lib = "%s/uni.lib" % (cmmseg_dir)
        if FLAGS.use_client_cmmseg:
            # Generate the client-side cmmseg dictionary only once.
            if not os.path.isfile(cmmseg_lib):
                gen_cmmseg_dict(dictionary, cmmseg_dir)
            #init cmmseg dict
            init_cmmseg_dict('%s' % cmmseg_dir)
        else:
            init_cmmseg_dict('/usr/local/etc')
    else:
        if FLAGS.use_client_cmmseg:
            init_cmmseg_dict('/usr/local/etc')
        dictionary = {}
        logger.warning("keyword_convert not have worddict")
    return lazy_dict(dictionary.items())
def load_worddict():
    """Load the word dictionary and initialise the cmmseg dictionary.

    When the path built from FLAGS.worddict_directory exists, the pickled
    dictionary is read from it.  If FLAGS.use_client_cmmseg is set, a cmmseg
    dictionary directory next to it is generated (only when its uni.lib file
    is missing) and passed to init_cmmseg_dict; otherwise init_cmmseg_dict is
    called with "/usr/local/etc".

    When the path does not exist, an empty dictionary is used, a warning is
    logged, and init_cmmseg_dict("/usr/local/etc") is called only if
    FLAGS.use_client_cmmseg is set.

    Returns:
        The result of lazy_dict() applied to the dictionary's items.
    """
    # NOTE(review): the block nesting was reconstructed from a
    # whitespace-collapsed source -- confirm against the original layout.
    wd_file = "%s/" % (FLAGS.worddict_directory)
    if os.path.exists(wd_file):
        # NOTE: unpickling can execute arbitrary code; wd_file must come
        # from a trusted location.
        # Read through a context manager so the handle is always closed,
        # including when cPickle.load raises (it was previously leaked).
        with open(wd_file) as pickle_file:
            dictionary = cPickle.load(pickle_file)
        # cmmseg lib
        cmmseg_dir = "%s_cmmseg" % wd_file
        cmmseg_lib = "%s/uni.lib" % (cmmseg_dir)
        if FLAGS.use_client_cmmseg:
            # Build the client-side cmmseg dictionary only if it is missing.
            if not os.path.isfile(cmmseg_lib):
                gen_cmmseg_dict(dictionary, cmmseg_dir)
            # init cmmseg dict
            init_cmmseg_dict("%s" % cmmseg_dir)
        else:
            init_cmmseg_dict("/usr/local/etc")
    else:
        if FLAGS.use_client_cmmseg:
            init_cmmseg_dict("/usr/local/etc")
        dictionary = {}
        logger.warning("keyword_convert not have worddict")
    return lazy_dict(dictionary.items())
freq = int(freq) yield wd,freq except: pass def extend_sogoudict(ext_dicts): for wd, freq in enum_sogoudict(): yield wd, freq for wd, freq in ext_dicts: yield wd, freq def lines2dict(lines): for l in lines: yield l, 2 sogoudict = lazy_dict(enum_sogoudict()) def getwd(txt,idx=0,maxwdlen=1e10,dictionary=sogoudict): ''' >>> [(wd,f) for wd,f in getwd('我爱北京天安门')] [('\\xce', 0)] >>> [(wd,f) for wd,f in getwd('吃饭')] [('\\xb3', 0)] ''' pref2wd2f,ch2f = dictionary.load() if idx >= len(txt): return yield txt[idx],ch2f.get(txt[idx],0) for wd,f in pref2wd2f.get(txt[idx:idx+2],{}).items(): if txt[idx:idx+len(wd)] != wd: continue