def initialize(*args):
    """Load the dictionary model into the module-level globals.

    Fills ``pfdict``, ``FREQ``, ``total`` and ``min_freq`` from a marshal
    cache in the system temp directory when that cache exists and is newer
    than the dictionary file. Otherwise the model is rebuilt with
    ``gen_pfdict`` and the cache is rewritten (best effort: a failed dump is
    logged, never raised). Runs under ``DICT_LOCK`` and returns immediately
    once ``initialized`` is true.

    Args:
        *args: Optional. ``args[0]`` is the dictionary path; it is joined
            onto this module's directory, so an absolute path is used
            unchanged. Without arguments ``DICTIONARY`` is used.
    """
    global pfdict, FREQ, total, min_freq, initialized
    dictionary = args[0] if args else DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        # Release any previously loaded prefix set before rebuilding.
        pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:
            # custom dictionary: name the cache after a digest of the path.
            # The built-in hash() is not stable between interpreter runs when
            # hash randomization is active, so it would defeat the cache.
            from hashlib import md5
            path_bytes = abs_path
            if not isinstance(path_bytes, bytes):
                path_bytes = path_bytes.encode('utf-8', 'replace')
            cache_file = os.path.join(
                tempfile.gettempdir(),
                "jieba.user.%s.cache" % md5(path_bytes).hexdigest())

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the cache lives in a shared temp directory and the
            # marshal docs warn against loading untrusted data.
            try:
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Any unreadable or malformed cache simply forces a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # normalize: replace each raw value v with log(v / total)
            FREQ = dict(
                (k, log(float(v) / total)) for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            # Write to a sibling temp file first, then move it into place so
            # readers never observe a half-written cache.
            tmp_cache = cache_file + "." + str(random.random())
            try:
                with open(tmp_cache, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(tmp_cache, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # Best-effort cleanup of the partial temp file; it may not
                # exist if open() itself failed.
                try:
                    os.remove(tmp_cache)
                except OSError:
                    pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(dictionary=None):
    """Load the dictionary model into the module-level globals.

    Fills ``pfdict``, ``FREQ``, ``total`` and ``min_freq`` from a marshal
    cache in the system temp directory when that cache exists and is newer
    than the dictionary file. Otherwise the model is rebuilt with
    ``gen_pfdict`` and the cache is rewritten (best effort: a failed dump is
    logged, never raised). Runs under ``DICT_LOCK`` and returns immediately
    once ``initialized`` is true.

    Args:
        dictionary: Dictionary path, joined onto this module's directory (an
            absolute path is used unchanged). A falsy value selects
            ``DICTIONARY``.
    """
    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        # Release any previously loaded prefix set before rebuilding.
        pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        cache_dir = tempfile.gettempdir()
        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_file = os.path.join(cache_dir, "jieba.cache")
        else:
            # custom dictionary: cache name is derived from the path digest
            cache_file = os.path.join(
                cache_dir,
                "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest())

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the cache lives in a shared temp directory and the
            # marshal docs warn against loading untrusted data.
            try:
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                # Any unreadable or malformed cache simply forces a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # normalize: replace each raw value v with log(v / total)
            FREQ = dict(
                (k, log(float(v) / total)) for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None
            try:
                # Create the temp file next to the cache so the final rename
                # never has to cross a filesystem boundary.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp never deletes its file; remove the leftover so
                # repeated failures do not pile up temp files.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(*args):
    """Build or load the prefix dictionary and publish it as module globals.

    When a marshal cache in the system temp directory exists and is newer
    than the dictionary file, ``pfdict``, ``FREQ``, ``total`` and
    ``min_freq`` are read from it. Otherwise they are produced via
    ``gen_pfdict`` and the cache is rewritten; a failure to write the cache
    is logged and otherwise ignored. The work is serialized by ``DICT_LOCK``
    and skipped entirely when ``initialized`` is already true.

    Args:
        *args: Optional. ``args[0]`` is the dictionary path, joined onto this
            module's directory (an absolute path is used unchanged). With no
            arguments ``DICTIONARY`` is used.
    """
    global pfdict, FREQ, total, min_freq, initialized
    dictionary = args[0] if args else DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        # Release any previously loaded prefix set before rebuilding.
        pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_file = os.path.join(tempfile.gettempdir(), "jieba.cache")
        else:
            # custom dictionary: hash() output changes between processes when
            # hash randomization is on, so use a stable digest of the path.
            from hashlib import md5
            path_bytes = abs_path
            if not isinstance(path_bytes, bytes):
                path_bytes = path_bytes.encode('utf-8', 'replace')
            cache_file = os.path.join(
                tempfile.gettempdir(),
                "jieba.user.%s.cache" % md5(path_bytes).hexdigest())

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the cache lives in a shared temp directory and the
            # marshal docs warn against loading untrusted data.
            try:
                # Context manager closes the handle even if load() raises.
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # normalize: replace each raw value v with log(v / total)
            FREQ = dict(
                (k, log(float(v) / total)) for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            # Dump to a sibling temp file and move it over the cache so a
            # concurrent reader never sees a partial file.
            tmp_cache = cache_file + "." + str(random.random())
            try:
                with open(tmp_cache, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(tmp_cache, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # Best-effort removal of the partial temp file; ignore the
                # error when it was never created.
                try:
                    os.remove(tmp_cache)
                except OSError:
                    pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(dictionary=None):
    """Build or load the prefix dictionary and publish it as module globals.

    When a marshal cache in the system temp directory exists and is newer
    than the dictionary file, ``pfdict``, ``FREQ``, ``total`` and
    ``min_freq`` are read from it. Otherwise they are produced via
    ``gen_pfdict`` and the cache is rewritten; a failure to write the cache
    is logged and otherwise ignored. The work is serialized by ``DICT_LOCK``
    and skipped entirely when ``initialized`` is already true.

    Args:
        dictionary: Dictionary path, joined onto this module's directory (an
            absolute path is used unchanged). A falsy value selects
            ``DICTIONARY``.
    """
    global pfdict, FREQ, total, min_freq, initialized, DICTIONARY, DICT_LOCK
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        # Release any previously loaded prefix set before rebuilding.
        pfdict = None
        _curpath = os.path.normpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        cache_dir = tempfile.gettempdir()
        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_name = "jieba.cache"
        else:
            # custom dictionary: one cache per distinct dictionary path
            cache_name = "jieba.u%s.cache" % md5(
                abs_path.encode('utf-8', 'replace')).hexdigest()
        cache_file = os.path.join(cache_dir, cache_name)

        load_from_cache_fail = True
        if os.path.exists(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the cache lives in a shared temp directory and the
            # marshal docs warn against loading untrusted data.
            try:
                with open(cache_file, 'rb') as cf:
                    pfdict, FREQ, total, min_freq = marshal.load(cf)
                # prevent conflict with old version
                load_from_cache_fail = not isinstance(pfdict, set)
            except Exception:
                load_from_cache_fail = True

        if load_from_cache_fail:
            pfdict, FREQ, total = gen_pfdict(abs_path)
            # normalize: replace each raw value v with log(v / total)
            FREQ = dict(
                (k, log(float(v) / total)) for k, v in FREQ.items())
            min_freq = min(FREQ.values())
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None
            try:
                # Keep the temp file in the cache directory so the rename
                # below stays on one filesystem.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((pfdict, FREQ, total, min_freq),
                                 temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # The caller owns files made by mkstemp; delete the leftover
                # instead of leaking one per failed attempt.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(dictionary=None):
    """Load the dictionary model into the module-level ``FREQ`` and ``total``.

    The model is read from a marshal cache when that cache exists and is
    newer than the dictionary file; otherwise it is rebuilt with
    ``gen_pfdict`` and the cache is rewritten (best effort: a failed dump is
    logged, never raised). The cache is kept in ``tmp_dir`` when that is set,
    else in the system temp directory. Runs under ``DICT_LOCK`` and returns
    immediately once ``initialized`` is true.

    Args:
        dictionary: Dictionary path, joined onto ``_curpath`` (an absolute
            path is used unchanged). A falsy value selects ``DICTIONARY``.
    """
    global FREQ, total, initialized, DICTIONARY, DICT_LOCK, tmp_dir
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_name = "jieba.cache"
        else:
            # custom dictionary: one cache per distinct dictionary path
            cache_name = "jieba.u%s.cache" % md5(
                abs_path.encode('utf-8', 'replace')).hexdigest()
        cache_file = os.path.join(cache_dir, cache_name)

        load_from_cache_fail = True
        if os.path.isfile(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the marshal docs warn against loading untrusted
            # data; confirm the cache directory is trusted.
            try:
                with open(cache_file, 'rb') as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Any unreadable or malformed cache simply forces a rebuild.
                load_from_cache_fail = True

        if load_from_cache_fail:
            FREQ, total = gen_pfdict(abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None
            try:
                # The temp file must live beside the cache: os.rename cannot
                # move a file across filesystems, which is what happened when
                # tmp_dir pointed somewhere other than the default temp dir.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # mkstemp never deletes its file; remove the leftover so
                # repeated failures do not pile up temp files.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(dictionary=None):
    """Populate the module-level ``FREQ`` and ``total`` from the dictionary.

    A marshal cache is used when it exists and is newer than the dictionary
    file; otherwise ``gen_pfdict`` rebuilds the model and the cache is
    rewritten. A failure to write the cache is logged and otherwise ignored.
    The cache directory is ``tmp_dir`` when set, else the system temp
    directory. The work is serialized by ``DICT_LOCK`` and skipped entirely
    when ``initialized`` is already true.

    Args:
        dictionary: Dictionary path, joined onto ``_curpath`` (an absolute
            path is used unchanged). A falsy value selects ``DICTIONARY``.
    """
    global FREQ, total, initialized, DICTIONARY, DICT_LOCK, tmp_dir
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", abs_path)
        t1 = time.time()

        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        if abs_path == os.path.join(_curpath, "dict.txt"):
            # default dictionary
            cache_file = os.path.join(cache_dir, "jieba.cache")
        else:
            # custom dictionary: cache name is derived from the path digest
            cache_file = os.path.join(
                cache_dir,
                "jieba.u%s.cache"
                % md5(abs_path.encode("utf-8", "replace")).hexdigest(),
            )

        load_from_cache_fail = True
        if os.path.isfile(cache_file) and os.path.getmtime(
            cache_file
        ) > os.path.getmtime(abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the marshal docs warn against loading untrusted
            # data; confirm the cache directory is trusted.
            try:
                with open(cache_file, "rb") as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                load_from_cache_fail = True

        if load_from_cache_fail:
            FREQ, total = gen_pfdict(abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None
            try:
                # Create the temp file inside the cache directory; a plain
                # mkstemp() uses the default temp dir, and os.rename fails
                # when that is on a different filesystem than tmp_dir.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, "wb") as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == "nt":
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception:
                logger.exception("Dump cache file failed.")
                # The caller owns files made by mkstemp; delete the leftover
                # instead of leaking one per failed attempt.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - t1)
        logger.debug("Prefix dict has been built succesfully.")
def initialize(dictionary=None):
    """Load the dictionary model into the module-level ``FREQ`` and ``total``.

    The model is read from a marshal cache when that cache exists and is
    newer than the dictionary file; otherwise it is rebuilt with
    ``gen_dict_data`` and the cache is rewritten (best effort: a failed dump
    is logged, never raised). The cache is kept in ``tmp_dir`` when that is
    set, else in the system temp directory. Runs under ``DICT_LOCK`` and
    returns immediately once ``initialized`` is true.

    Args:
        dictionary: Dictionary path, joined onto ``_curpath`` (an absolute
            path is used unchanged). A falsy value selects ``DICTIONARY``.
    """
    global FREQ, total, initialized, DICTIONARY, DICT_LOCK, tmp_dir
    if not dictionary:
        dictionary = DICTIONARY
    with DICT_LOCK:
        if initialized:
            return
        dict_abs_path = os.path.join(_curpath, dictionary)
        logger.debug("Building prefix dict from %s ...", dict_abs_path)
        start = time.time()

        cache_dir = tmp_dir if tmp_dir else tempfile.gettempdir()
        # default dictionary, else is custom dictionary
        if dict_abs_path == os.path.join(_curpath, "dict.txt"):
            cache_name = "jieba.cache"
        else:
            cache_name = "jieba.u%s.cache" % md5(
                dict_abs_path.encode('utf-8', 'replace')).hexdigest()
        cache_file = os.path.join(cache_dir, cache_name)

        load_from_cache_fail = True
        if os.path.isfile(cache_file) and os.path.getmtime(
                cache_file) > os.path.getmtime(dict_abs_path):
            logger.debug("Loading model from cache %s", cache_file)
            # NOTE(review): the marshal docs warn against loading untrusted
            # data; confirm the cache directory is trusted.
            try:
                with open(cache_file, 'rb') as cf:
                    FREQ, total = marshal.load(cf)
                load_from_cache_fail = False
            except (IOError, OSError, ValueError, TypeError, EOFError):
                # IOError/OSError cover a cache that exists but cannot be
                # opened or read (e.g. permissions, removed after the isfile
                # check); fall through to a rebuild instead of raising.
                logger.debug("open cache file is failure.")

        if load_from_cache_fail:
            FREQ, total = gen_dict_data(dict_abs_path)
            logger.debug("Dumping model to file cache %s", cache_file)
            fpath = None
            try:
                # The temp file must live beside the cache: os.rename cannot
                # move a file across filesystems, which is what happened when
                # tmp_dir pointed somewhere other than the default temp dir.
                fd, fpath = tempfile.mkstemp(dir=cache_dir)
                with os.fdopen(fd, 'wb') as temp_cache_file:
                    marshal.dump((FREQ, total), temp_cache_file)
                if os.name == 'nt':
                    from shutil import move as replace_file
                else:
                    replace_file = os.rename
                replace_file(fpath, cache_file)
            except Exception as e:
                logger.exception("Dump cache file failed: %s", e)
                # mkstemp never deletes its file; remove the leftover so
                # repeated failures do not pile up temp files.
                if fpath is not None:
                    try:
                        os.remove(fpath)
                    except OSError:
                        pass

        initialized = True
        logger.debug("Loading model cost %s seconds.", time.time() - start)
        logger.debug("Prefix dict has been built successfully.")