Example #1
0
def load_worddict():
    """Load the pickled word dictionary and initialise the cmmseg dictionary.

    The dictionary path is ``'<FLAGS.worddict_directory>/'``.

    * If that path exists, it is unpickled. With ``FLAGS.use_client_cmmseg``
      set, the cmmseg dictionary is generated under ``'<path>_cmmseg'`` when
      its ``uni.lib`` file is missing and cmmseg is initialised from that
      directory; otherwise cmmseg is initialised from ``/usr/local/etc``.
    * If the path does not exist, an empty dictionary is used, a warning is
      logged, and cmmseg is initialised from ``/usr/local/etc`` only when
      ``FLAGS.use_client_cmmseg`` is set.

    Returns:
        A ``lazy_dict`` built from the ``items()`` of the loaded (or empty)
        dictionary.
    """
    wd_file = '%s/' % (FLAGS.worddict_directory)

    if os.path.exists(wd_file):
        # Close the handle deterministically instead of leaking it until
        # garbage collection. The open mode is left at its default so that
        # existing pickles are read exactly as before.
        # NOTE: unpickling can execute arbitrary code; this path must only
        # ever point at a trusted file.
        with open(wd_file) as wd_handle:
            dictionary = cPickle.load(wd_handle)

        # cmmseg lib
        cmmseg_dir = "%s_cmmseg" % wd_file
        cmmseg_lib = "%s/uni.lib" % (cmmseg_dir)

        if FLAGS.use_client_cmmseg:
            # Build the client-side cmmseg dictionary only when it has not
            # been generated yet.
            if not os.path.isfile(cmmseg_lib):
                gen_cmmseg_dict(dictionary, cmmseg_dir)
            # init cmmseg dict
            init_cmmseg_dict(cmmseg_dir)
        else:
            init_cmmseg_dict('/usr/local/etc')

    else:
        # NOTE(review): here cmmseg is initialised only when
        # use_client_cmmseg is set, whereas the branch above falls back to
        # '/usr/local/etc' when it is NOT set - confirm this is intended.
        if FLAGS.use_client_cmmseg:
            init_cmmseg_dict('/usr/local/etc')
        dictionary = {}
        logger.warning("keyword_convert not have worddict")
    return lazy_dict(dictionary.items())
Example #2
0
def load_worddict():
    """Load the pickled word dictionary and initialise the cmmseg dictionary.

    The dictionary path is ``"<FLAGS.worddict_directory>/"``.

    * If that path exists, it is unpickled. With ``FLAGS.use_client_cmmseg``
      set, the cmmseg dictionary is generated under ``"<path>_cmmseg"`` when
      its ``uni.lib`` file is missing and cmmseg is initialised from that
      directory; otherwise cmmseg is initialised from ``/usr/local/etc``.
    * If the path does not exist, an empty dictionary is used, a warning is
      logged, and cmmseg is initialised from ``/usr/local/etc`` only when
      ``FLAGS.use_client_cmmseg`` is set.

    Returns:
        A ``lazy_dict`` built from the ``items()`` of the loaded (or empty)
        dictionary.
    """
    wd_file = "%s/" % (FLAGS.worddict_directory)

    if os.path.exists(wd_file):
        # Close the handle deterministically instead of leaking it until
        # garbage collection. The open mode is left at its default so that
        # existing pickles are read exactly as before.
        # NOTE: unpickling can execute arbitrary code; this path must only
        # ever point at a trusted file.
        with open(wd_file) as wd_handle:
            dictionary = cPickle.load(wd_handle)

        # cmmseg lib
        cmmseg_dir = "%s_cmmseg" % wd_file
        cmmseg_lib = "%s/uni.lib" % (cmmseg_dir)

        if FLAGS.use_client_cmmseg:
            # Build the client-side cmmseg dictionary only when it has not
            # been generated yet.
            if not os.path.isfile(cmmseg_lib):
                gen_cmmseg_dict(dictionary, cmmseg_dir)
            # init cmmseg dict
            init_cmmseg_dict(cmmseg_dir)
        else:
            init_cmmseg_dict("/usr/local/etc")

    else:
        # NOTE(review): here cmmseg is initialised only when
        # use_client_cmmseg is set, whereas the branch above falls back to
        # "/usr/local/etc" when it is NOT set - confirm this is intended.
        if FLAGS.use_client_cmmseg:
            init_cmmseg_dict("/usr/local/etc")
        dictionary = {}
        logger.warning("keyword_convert not have worddict")
    return lazy_dict(dictionary.items())
Example #3
0
            freq = int(freq)
            yield wd,freq
        except:
            pass

def extend_sogoudict(ext_dicts):
    """Yield the pairs from ``enum_sogoudict()`` followed by ``ext_dicts``.

    Args:
        ext_dicts: iterable of two-item entries, unpacked as ``(wd, freq)``.

    Yields:
        ``(wd, freq)`` tuples: first every pair produced by
        ``enum_sogoudict()``, then every pair from ``ext_dicts``.
    """
    # Walk the built-in source first, then the caller-supplied extension.
    for pair_source in (enum_sogoudict(), ext_dicts):
        for wd, freq in pair_source:
            yield wd, freq

def lines2dict(lines):
    """Lazily pair every item of ``lines`` with the constant ``2``.

    Args:
        lines: any iterable; its items are passed through unchanged.

    Yields:
        ``(item, 2)`` tuples, one per item, in iteration order.
    """
    # Constant attached to every entry; presumably a default frequency for
    # the (wd, freq) pairs used elsewhere in this file - confirm with callers.
    paired_value = 2
    for entry in lines:
        yield entry, paired_value

# Module-level lazy_dict wrapping the pairs produced by enum_sogoudict();
# getwd() uses it as its default ``dictionary`` argument.
sogoudict = lazy_dict(enum_sogoudict())

def getwd(txt,idx=0,maxwdlen=1e10,dictionary=sogoudict):
    '''
    >>> [(wd,f) for wd,f in getwd('我爱北京天安门')]
    [('\\xce', 0)]
    >>> [(wd,f) for wd,f in getwd('吃饭')]
    [('\\xb3', 0)]
    '''
    pref2wd2f,ch2f = dictionary.load()
    if idx >= len(txt):
        return
    yield txt[idx],ch2f.get(txt[idx],0)
    for wd,f in pref2wd2f.get(txt[idx:idx+2],{}).items():
        if txt[idx:idx+len(wd)] != wd:
            continue