Esempio n. 1
0
def initialize(*args):
    """Load the prefix dictionary and frequency table into module globals.

    The first positional argument, when given, is the dictionary file name;
    it is joined onto this module's directory (``os.path.join`` keeps an
    absolute path unchanged).  Without arguments the module-level
    ``DICTIONARY`` is used.

    Results are stored in the globals ``pfdict``, ``FREQ``, ``total`` and
    ``min_freq``.  A marshal cache in the system temp directory is used when
    it is newer than the dictionary file; otherwise the model is rebuilt via
    ``gen_pfdict`` and the cache is rewritten on a best-effort basis (a
    failure to write the cache is logged, not raised).

    The whole routine runs under ``DICT_LOCK`` and returns immediately when
    ``initialized`` is already set.
    """
    global pfdict, FREQ, total, min_freq, initialized
    dictionary = args[0] if args else DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        if pfdict:
            # Release the previous prefix set before a new one is built.
            # Rebinding (instead of ``del`` + assign) never leaves the global
            # undefined.
            pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(),
                                      "jieba.user.%s.cache" % hash(abs_path))

        load_from_cache_fail = True
        # Only trust a cache that is newer than the dictionary it was built from.
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                # ``with`` guarantees the handle is closed even if marshal fails.
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Any unreadable/corrupt cache simply triggers a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # Normalize raw counts to log(count / total).
            FREQ = dict((k, log(float(v) / total))
                        for k, v in FREQ.iteritems())
            min_freq = min(FREQ.itervalues())
            logger.debug("Dumping model to file cache %s", cache_file)
            # Write to a uniquely named sibling file first, then move it into
            # place, so readers never see a half-written cache.
            tmp_cache_path = cache_file + "." + str(random.random())
            try:
                with open(tmp_cache_path, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(tmp_cache_path, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # Best effort: do not leave a partial temp file behind.
                try:
                    os.remove(tmp_cache_path)
                except OSError:
                    pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 2
0
def initialize(dictionary=None):
    """Load the prefix dictionary and frequency table into module globals.

    Args:
        dictionary: Dictionary file name, joined onto this module's
            directory (``os.path.join`` keeps an absolute path unchanged).
            A falsy value selects the module-level ``DICTIONARY``.

    Results are stored in the globals ``pfdict``, ``FREQ``, ``total`` and
    ``min_freq``.  A marshal cache in the system temp directory is used when
    it is newer than the dictionary file; otherwise the model is rebuilt via
    ``gen_pfdict`` and the cache is rewritten on a best-effort basis (a
    failure to write the cache is logged, not raised).

    The whole routine runs under ``DICT_LOCK`` and returns immediately when
    ``initialized`` is already set.
    """
    global pfdict, FREQ, total, min_freq, initialized
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        if pfdict:
            # Release the previous prefix set before a new one is built.
            pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary: cache name is keyed by the path's MD5
            cache_file = os.path.join(
                tempfile.gettempdir(), "jieba.u%s.cache" %
                md5(abs_path.encode('utf-8', 'replace')).hexdigest())

        load_from_cache_fail = True
        # Only trust a cache that is newer than the dictionary it was built from.
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Any unreadable/corrupt cache simply triggers a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # Normalize raw counts to log(count / total).
            FREQ = dict((k, log(float(v) / total))
                        for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None  # set once mkstemp succeeds, so cleanup knows what to remove
            try:
                # Write to a temp file first, then move it into place, so
                # readers never see a half-written cache.
                fd, fpath = tempfile.mkstemp()
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp never deletes its file; remove it ourselves so a
                # failed dump does not litter the temp directory.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 3
0
def initialize(*args):
    """Populate ``pfdict``, ``FREQ``, ``total`` and ``min_freq`` (module globals).

    ``args[0]``, if present, names the dictionary file; otherwise the
    module-level ``DICTIONARY`` is used.  The name is joined onto this
    module's directory (``os.path.join`` keeps an absolute path unchanged).

    The model is read from a marshal cache in the system temp directory when
    that cache is newer than the dictionary file.  Otherwise it is rebuilt
    with ``gen_pfdict`` and the cache is rewritten; a failure to write the
    cache is logged and does not abort initialization.

    Runs entirely under ``DICT_LOCK`` and is a no-op when ``initialized`` is
    already set.
    """
    global pfdict, FREQ, total, min_freq, initialized
    dictionary = args[0] if args else DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        if pfdict:
            # Drop the old prefix set before rebuilding.  Plain rebinding
            # (rather than ``del`` + assign) never leaves the global undefined.
            pfdict = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.user.%s.cache" % hash(abs_path))

        load_from_cache_fail = True
        # A cache older than the dictionary file is considered stale.
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                # Context manager closes the handle even when marshal raises.
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Corrupt or incompatible cache: fall through to a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # Normalize raw counts to log(count / total).
            FREQ = dict((k, log(float(v) / total)) for k, v in FREQ.iteritems())
            min_freq = min(FREQ.itervalues())
            logger.debug("Dumping model to file cache %s", cache_file)
            # Dump to a uniquely named sibling file, then move it over the
            # real cache so a concurrent reader never sees partial data.
            tmp_cache_path = cache_file + "." + str(random.random())
            try:
                with open(tmp_cache_path, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(tmp_cache_path, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # Best effort: remove the partial temp file, if it was created.
                try:
                    os.remove(tmp_cache_path)
                except OSError:
                    pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 4
0
def initialize(dictionary=None):
    """Populate ``pfdict``, ``FREQ``, ``total`` and ``min_freq`` (module globals).

    Args:
        dictionary: Dictionary file name, joined onto this module's
            directory (``os.path.join`` keeps an absolute path unchanged).
            A falsy value selects the module-level ``DICTIONARY``.

    The model is read from a marshal cache in the system temp directory when
    that cache is newer than the dictionary file.  Otherwise it is rebuilt
    with ``gen_pfdict`` and the cache is rewritten; a failure to write the
    cache is logged and does not abort initialization.

    Runs entirely under ``DICT_LOCK`` and is a no-op when ``initialized`` is
    already set.
    """
    global pfdict, FREQ, total, min_freq, initialized
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        if pfdict:
            # Drop the old prefix set before rebuilding.
            pfdict = None
        _curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        if abs_path == os.path.join(_curpath, "dict.txt"):  # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:  # custom dictionary: cache name is keyed by the path's MD5
            path_digest = md5(abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.u%s.cache" % path_digest)

        load_from_cache_fail = True
        # A cache older than the dictionary file is considered stale.
        if os.path.exists(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Corrupt or incompatible cache: fall through to a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # Normalize raw counts to log(count / total).
            FREQ = dict((k, log(float(v) / total)) for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None  # stays None until mkstemp has actually created a file
            try:
                # Dump to a temp file, then move it over the real cache so a
                # concurrent reader never sees partial data.
                fd, fpath = tempfile.mkstemp()
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp leaves deletion to the caller; clean up on failure.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 5
0
def initialize(dictionary=None):
    """Load the frequency table into the module globals ``FREQ`` and ``total``.

    Args:
        dictionary: Dictionary file name, joined onto the module-level
            ``_curpath`` (``os.path.join`` keeps an absolute path unchanged).
            A falsy value selects the module-level ``DICTIONARY``.

    A marshal cache is kept in ``tmp_dir`` when that global is set, otherwise
    in the system temp directory.  The cache is used when it is newer than
    the dictionary file; otherwise the data is rebuilt via ``gen_pfdict`` and
    the cache is rewritten on a best-effort basis (a failure to write the
    cache is logged, not raised).

    The whole routine runs under ``DICT_LOCK`` and returns immediately when
    ``initialized`` is already set.
    """
    global FREQ, total, initialized
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        # default dictionary
        if abs_path == os.path.join(_curpath, "dict.txt"):
            cache_file = os.path.join(cache_dir, "jieba.cache")
        else:  # custom dictionary: cache name is keyed by the path's MD5
            cache_file = os.path.join(
                cache_dir,
                "jieba.u%s.cache" %
                md5(abs_path.encode('utf-8', 'replace')).hexdigest())

        load_from_cache_fail = True
        # Only trust a cache that is newer than the dictionary it was built from.
        if os.path.isfile(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                with open(cache_file, 'rb') as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Any unreadable/corrupt cache simply triggers a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            FREQ, total = gen_pfdict(abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None  # set once mkstemp succeeds, so cleanup knows what to remove
            try:
                # Create the temp file next to the final cache file:
                # os.rename cannot move a file across filesystems, which the
                # default temp dir and a custom ``tmp_dir`` may well be.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp never deletes its file; remove it ourselves so a
                # failed dump does not litter the cache directory.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 6
0
def initialize(dictionary=None):
    """Populate the module globals ``FREQ`` and ``total`` from a dictionary file.

    Args:
        dictionary: Dictionary file name, joined onto the module-level
            ``_curpath`` (``os.path.join`` keeps an absolute path unchanged).
            A falsy value selects the module-level ``DICTIONARY``.

    The data is read from a marshal cache when that cache is newer than the
    dictionary file.  The cache lives in ``tmp_dir`` if that global is set,
    else in the system temp directory.  On a cache miss the data is rebuilt
    with ``gen_pfdict`` and the cache is rewritten; a failure to write the
    cache is logged and does not abort initialization.

    Runs entirely under ``DICT_LOCK`` and is a no-op when ``initialized`` is
    already set.
    """
    global FREQ, total, initialized
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return

        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()
        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        # default dictionary
        if abs_path == os.path.join(_curpath, "dict.txt"):
            cache_file = os.path.join(cache_dir, "jieba.cache")
        else:  # custom dictionary: cache name is keyed by the path's MD5
            cache_file = os.path.join(
                cache_dir,
                "jieba.u%s.cache" % md5(abs_path.encode("utf-8", "replace")).hexdigest(),
            )

        load_from_cache_fail = True
        # A cache older than the dictionary file is considered stale.
        if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                with open(cache_file, "rb") as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Corrupt or incompatible cache: fall through to a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            FREQ, total = gen_pfdict(abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None  # stays None until mkstemp has actually created a file
            try:
                # The temp file must share a filesystem with the final cache
                # file, otherwise os.rename fails; create it in cache_dir.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, "wb") as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == "nt":
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp leaves deletion to the caller; clean up on failure.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
Esempio n. 7
0
def initialize(dictionary=None):
    """Load dictionary data into the module globals ``FREQ`` and ``total``.

    Args:
        dictionary: Dictionary file name, joined onto the module-level
            ``_curpath`` (``os.path.join`` keeps an absolute path unchanged).
            A falsy value selects the module-level ``DICTIONARY``.

    The data is read from a marshal cache when that cache is newer than the
    dictionary file.  The cache lives in ``tmp_dir`` if that global is set,
    else in the system temp directory.  When the cache is missing, stale or
    unreadable the data is rebuilt with ``gen_dict_data`` and the cache is
    rewritten; a failure to write the cache is logged and does not abort
    initialization.

    Runs entirely under ``DICT_LOCK`` and is a no-op when ``initialized`` is
    already set.
    """
    global FREQ, total, initialized
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        dict_abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", dict_abs_path)
        start = time.time()

        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        # default dictionary, else is custom dictionary
        if dict_abs_path == os.path.join(_curpath, "dict.txt"):
            cache_file = os.path.join(cache_dir, "jieba.cache")
        else:
            # Custom dictionaries get a cache name keyed by the path's MD5.
            cache_file = os.path.join(
                cache_dir,
                "jieba.u%s.cache" % md5(dict_abs_path.encode('utf-8', 'replace')).hexdigest())

        load_from_cache_fail = True
        # A cache older than the dictionary file is considered stale.
        if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(dict_abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            try:
                with open(cache_file, 'rb') as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except (IOError, OSError, ValueError, TypeError, EOFError):
                # IOError/OSError cover a cache file that exists but cannot
                # be opened (e.g. permissions); the others cover corrupt or
                # incompatible content.  Either way, fall back to a rebuild.
                logger.debug("open cache file is failure.")

        if load_from_cache_fail:
            FREQ, total = gen_dict_data(dict_abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None  # stays None until mkstemp has actually created a file
            try:
                # The temp file must share a filesystem with the final cache
                # file, otherwise os.rename fails; create it in cache_dir.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception as e:
                logger.exception("Dump cache file failed: {}".format(e))
                # mkstemp leaves deletion to the caller; clean up on failure.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass
        initialized = True

        logger.debug("Loading model cost %s seconds.", time.time() - start)
        logger.debug("Prefix dict has been built successfully.")