Beispiel #1
0
    def initialize(self, dictionary=None):
        """Load the prefix dictionary, using the on-disk marshal cache if valid.

        When ``dictionary`` is given it is resolved with ``_get_abs_path`` and
        stored in ``self.dictionary``; otherwise the current
        ``self.dictionary`` is used. If that dictionary is already loaded the
        call returns immediately. Otherwise ``self.FREQ`` and ``self.total``
        are read from the cache file or, when the cache is missing, not newer
        than a custom dictionary, or unreadable, rebuilt with
        ``self.gen_pfdict(self.get_dict_file())`` and written back to the
        cache. A failed cache write is logged and otherwise ignored.

        Args:
            dictionary: Optional path of the dictionary to load. A falsy
                value keeps ``self.dictionary``.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            self.dictionary = abs_path
            self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            # If a write lock is registered for this dictionary, acquire and
            # release it once so we only continue after its holder is done.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." %
                                 (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(self.tmp_dir or tempfile.gettempdir(),
                                      cache_file)
            # self.cache_file may be absolute, in which case os.path.join
            # returns it unchanged; take the directory from the final path so
            # the temp file is created next to the cache file.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # The cache is trusted for the default dictionary, and for a
            # custom one only when it is newer than the dictionary file.
            if os.path.isfile(cache_file) and (
                    abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug("Loading model from cache %s" %
                                     cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable/corrupt cache just triggers a rebuild.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                # Register a per-dictionary write lock; callers that find it
                # in DICT_WRITING wait on it (see the top of this block).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        self.FREQ, self.total = self.gen_pfdict(
                            self.get_dict_file())
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file)
                        tmp_path = None
                        try:
                            # prevent moving across different filesystems
                            fd, tmp_path = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, 'wb') as temp_cache_file:
                                marshal.dump((self.FREQ, self.total),
                                             temp_cache_file)
                            _replace_file(tmp_path, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # mkstemp never deletes its file; don't leave a
                            # half-written one behind when the dump fails.
                            if tmp_path is not None:
                                try:
                                    os.remove(tmp_path)
                                except OSError:
                                    pass
                finally:
                    # Unregister the write lock even if the rebuild raised.
                    try:
                        del DICT_WRITING[abs_path]
                    except KeyError:
                        pass

            self.initialized = True
            default_logger.debug("Loading model cost %.3f seconds." %
                                 (time.time() - t1))
            default_logger.debug("Prefix dict has been built succesfully.")
Beispiel #2
0
    def initialize(self, dictionary=None):
        """Load the prefix dictionary, using the on-disk marshal cache if valid.

        When ``dictionary`` is given it is resolved with ``_get_abs_path`` and
        stored in ``self.dictionary``; otherwise the current
        ``self.dictionary`` is used. If that dictionary is already loaded the
        call returns immediately. Otherwise ``self.FREQ`` and ``self.total``
        are read from the cache file or, when the cache is missing, not newer
        than the dictionary file, or unreadable, rebuilt with
        ``self.gen_pfdict(abs_path)`` and written back to the cache. A failed
        cache write is logged and otherwise ignored.

        Args:
            dictionary: Optional path of the dictionary to load. A falsy
                value keeps ``self.dictionary``.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            self.dictionary = abs_path
            self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            # If a write lock is registered for this dictionary, acquire and
            # release it once so we only continue after its holder is done.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % abs_path)
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # self.cache_file may be absolute, in which case os.path.join
            # returns it unchanged; take the directory from the final path so
            # the temp file is created next to the cache file.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # Only trust a cache that is newer than the dictionary file.
            if (os.path.isfile(cache_file) and
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable/corrupt cache just triggers a rebuild.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                # Register a per-dictionary write lock; callers that find it
                # in DICT_WRITING wait on it (see the top of this block).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        self.FREQ, self.total = self.gen_pfdict(abs_path)
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file)
                        tmp_path = None
                        try:
                            # prevent moving across different filesystems
                            fd, tmp_path = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, 'wb') as temp_cache_file:
                                marshal.dump(
                                    (self.FREQ, self.total), temp_cache_file)
                            _replace_file(tmp_path, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # mkstemp never deletes its file; don't leave a
                            # half-written one behind when the dump fails.
                            if tmp_path is not None:
                                try:
                                    os.remove(tmp_path)
                                except OSError:
                                    pass
                finally:
                    # Unregister the write lock even if the rebuild raised.
                    try:
                        del DICT_WRITING[abs_path]
                    except KeyError:
                        pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built succesfully.")
Beispiel #3
0
    def initialize(self, DICTIONARY):
        """Load the word/frequency tables for ``DICTIONARY``, using a cache.

        ``DICTIONARY`` is joined onto the current working directory and stored
        in ``self.dictionary``. Unless the instance is already initialized,
        ``self.words`` and ``self.freqs`` are read from the cache file or,
        when that is missing, stale or unreadable, rebuilt with
        ``self.gen_pfdict(self.get_dict_file())`` and written back to the
        cache. A failed cache write is logged and otherwise ignored.

        The cache file is ``self.cache_file`` when that is set, otherwise
        ``dict.cache`` resolved against the process working directory.

        Args:
            DICTIONARY: Dictionary file path; a relative path is taken
                relative to ``os.getcwd()``.
        """
        file_name = DICTIONARY
        abs_path = os.path.join(os.getcwd(), file_name)
        self.dictionary = abs_path

        with self.lock:
            # If a write lock is registered for this dictionary, acquire and
            # release it once so we only continue after its holder is done.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass

            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." %
                                 (abs_path or 'the default dictionary'))
            t1 = time.time()
            # Honor an explicitly configured cache file; the fixed default
            # name is only the fallback.
            cache_file = self.cache_file or "dict.cache"

            # Directory part of the cache path ('' for a bare file name, which
            # makes mkstemp create its file relative to the working
            # directory), so the temp file sits next to the cache file.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # The cache is trusted for the default dictionary, and otherwise
            # only when it is newer than the dictionary file.
            if os.path.isfile(cache_file) and (
                    abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug("Loading model from cache %s" %
                                     cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        # Deserialize the (words, freqs) pair from the cache.
                        self.words, self.freqs = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable/corrupt cache just triggers a rebuild.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                # Register a per-dictionary write lock; callers that find it
                # in DICT_WRITING wait on it (see the top of this block).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        self.words, self.freqs = self.gen_pfdict(
                            self.get_dict_file())
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file)
                        tmp_path = None
                        try:
                            # prevent moving across different filesystems
                            # mkstemp returns (open file descriptor, path).
                            fd, tmp_path = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, 'wb') as temp_cache_file:
                                marshal.dump((self.words, self.freqs),
                                             temp_cache_file)
                            _replace_file(tmp_path, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # mkstemp never deletes its file; don't leave a
                            # half-written one behind when the dump fails.
                            if tmp_path is not None:
                                try:
                                    os.remove(tmp_path)
                                except OSError:
                                    pass
                finally:
                    # Unregister the write lock even if the rebuild raised.
                    try:
                        del DICT_WRITING[abs_path]
                    except KeyError:
                        pass

            self.initialized = True
            default_logger.debug("Loading model cost %.3f seconds." %
                                 (time.time() - t1))
            default_logger.debug("Prefix dict has been built succesfully.")
Beispiel #4
0
    def _initialize(self, dictionary=None):
        """Load the simple dictionary, using the on-disk marshal cache if valid.

        When ``dictionary`` is given it is resolved with ``_get_abs_path`` and
        stored in ``self.dictionary``; otherwise the current
        ``self.dictionary`` is used. If that dictionary is already loaded the
        call returns immediately. Otherwise ``self.yes``, ``self.no``,
        ``self.others`` and ``self.filtered`` are read from the cache file or,
        when the cache is missing, not newer than a custom dictionary, or
        unreadable, rebuilt with ``self._load_simple_dict(self.dictionary)``
        and written back to the cache. A failed cache write is logged and
        otherwise ignored. The four attributes are finally converted to sets.

        Args:
            dictionary: Optional path of the dictionary to load. A falsy
                value keeps ``self.dictionary``.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            self.dictionary = abs_path
            self.initialized = False
        else:
            abs_path = self.dictionary

        # The whole load/build runs under self.lock so that concurrent callers
        # cannot both pass the ``initialized`` check and build twice.
        # NOTE(review): assumes self._load_simple_dict does not itself acquire
        # a non-reentrant self.lock -- confirm.
        with self.lock:
            # If a write lock is registered for this dictionary, acquire and
            # release it once so we only continue after its holder is done.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug(
                "Building from %s ..." % (abs_path or "the default dictionary")
            )
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "yn.cache"
            # custom dictionary
            else:
                cache_file = (
                    "yn.u%s.cache"
                    % md5(abs_path.encode("utf-8", "replace")).hexdigest()
                )
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file
            )
            # self.cache_file may be absolute, in which case os.path.join
            # returns it unchanged; take the directory from the final path so
            # the temp file is created next to the cache file.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # The cache is trusted for the default dictionary, and for a
            # custom one only when it is newer than the dictionary file.
            if os.path.isfile(cache_file) and (
                abs_path == DEFAULT_DICT
                or os.path.getmtime(cache_file) > os.path.getmtime(abs_path)
            ):
                default_logger.debug("Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, "rb") as cf:
                        (self.yes, self.no,
                         self.others, self.filtered) = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable/corrupt cache just triggers a rebuild.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                # Register a per-dictionary write lock; callers that find it
                # in DICT_WRITING wait on it (see the top of this block).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        (self.yes, self.no,
                         self.others, self.filtered) = self._load_simple_dict(
                            self.dictionary
                        )
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file
                        )
                        tmp_path = None
                        try:
                            # prevent moving across different filesystems
                            fd, tmp_path = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, "wb") as temp_cache_file:
                                marshal.dump(
                                    (self.yes, self.no,
                                     self.others, self.filtered),
                                    temp_cache_file,
                                )
                            _replace_file(tmp_path, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # mkstemp never deletes its file; don't leave a
                            # half-written one behind when the dump fails.
                            if tmp_path is not None:
                                try:
                                    os.remove(tmp_path)
                                except OSError:
                                    pass
                finally:
                    # Unregister the write lock even if the rebuild raised.
                    try:
                        del DICT_WRITING[abs_path]
                    except KeyError:
                        pass

            # Normalize to sets regardless of whether the values came from
            # the cache or from a fresh build.
            self.yes = set(self.yes)
            self.no = set(self.no)
            self.filtered = set(self.filtered)
            self.others = set(self.others)
            self.initialized = True
            default_logger.debug(
                "Loading dict cost %.3f seconds." % (time.time() - t1)
            )
            default_logger.debug("simple dict has been built successfully.")
Beispiel #5
0
    def initialize(self, dictionary=None):
        """Load the prefix dictionary, using the on-disk marshal cache if valid.

        ``abs_path`` is the absolute path of the dictionary. When
        ``dictionary`` is given it is resolved with ``_get_abs_path`` and
        stored in ``self.dictionary``; otherwise the current
        ``self.dictionary`` is used. If that dictionary is already loaded the
        call returns immediately. Otherwise ``self.FREQ`` and ``self.total``
        are read from the cache file or, when the cache is missing, not newer
        than a custom dictionary, or unreadable, rebuilt with
        ``self.gen_pfdict(self.get_dict_file())`` and written back to the
        cache. A failed cache write is logged and otherwise ignored.

        Args:
            dictionary: Optional path of the dictionary to load. A falsy
                value keeps ``self.dictionary``.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                # This dictionary is already loaded; nothing to do.
                return
            self.dictionary = abs_path
            self.initialized = False
        else:
            abs_path = self.dictionary

        # Loading must run to completion without interleaving, hence the lock.
        with self.lock:
            # If a write lock is registered for this dictionary, acquire and
            # release it once so we only continue after its holder is done.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Already loaded (possibly while we were waiting): return.
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." %
                                 (abs_path or 'the default dictionary'))
            t1 = time.time()
            # Pick the cache file name.
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            # Place the cache under self.tmp_dir, falling back to the
            # directory tempfile.gettempdir() reports for temporary files.
            cache_file = os.path.join(self.tmp_dir or tempfile.gettempdir(),
                                      cache_file)
            # self.cache_file may be absolute, in which case os.path.join
            # returns it unchanged; take the directory from the final path so
            # the temp file is created next to the cache file.
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # Use the cache only if it exists as a regular file and either the
            # default dictionary is in use or the cache was modified after the
            # custom dictionary (os.path.getmtime: last modification time).
            if os.path.isfile(cache_file) and (
                    abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug("Loading model from cache %s" %
                                     cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        # marshal.load reads back what marshal.dump stored.
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Any unreadable/corrupt cache just triggers a rebuild.
                    load_from_cache_fail = True

            # Cache unusable: re-read the dictionary file to obtain
            # self.FREQ and self.total, then regenerate the cache file.
            if load_from_cache_fail:
                # Register a per-dictionary write lock; callers that find it
                # in DICT_WRITING wait on it (see the top of this block).
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                try:
                    with wlock:
                        self.FREQ, self.total = self.gen_pfdict(
                            self.get_dict_file())
                        default_logger.debug(
                            "Dumping model to file cache %s" % cache_file)
                        tmp_path = None
                        try:
                            # prevent moving across different filesystems
                            # mkstemp returns (open file descriptor, path);
                            # os.fdopen wraps the descriptor in a file object.
                            fd, tmp_path = tempfile.mkstemp(dir=tmpdir)
                            with os.fdopen(fd, 'wb') as temp_cache_file:
                                marshal.dump((self.FREQ, self.total),
                                             temp_cache_file)
                            # Move the finished temp file onto cache_file.
                            _replace_file(tmp_path, cache_file)
                        except Exception:
                            default_logger.exception("Dump cache file failed.")
                            # mkstemp never deletes its file; don't leave a
                            # half-written one behind when the dump fails.
                            if tmp_path is not None:
                                try:
                                    os.remove(tmp_path)
                                except OSError:
                                    pass
                finally:
                    # Unregister the write lock even if the rebuild raised.
                    try:
                        del DICT_WRITING[abs_path]
                    except KeyError:
                        pass

            # self.initialized records that self.FREQ and self.total now hold
            # loaded values.
            self.initialized = True
            default_logger.debug("Loading model cost %.3f seconds." %
                                 (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")