Example #1
    def __init__(self):
        # Heavy resources are cached on the class the first time an instance is
        # created, so every hot_word object shares the same handles.
        if not hot_word.idf_hd:
            # IDF table plus stopword list used for TF-IDF weighting.
            hot_word.idf_hd = idf()
            hot_word.idf_hd.load(tf_idf_config.idf_dumps_path,
                                 tf_idf_config.stopwords_path)
        if not hot_word.seg_hd:
            # cppjieba word segmenter (dictionary + HMM model).
            hot_word.seg_hd = cppjieba(tf_idf_config.dict_path,
                                       tf_idf_config.hmm_path)

        if not hot_word.short_url_hd:
            # Matcher loaded from the list of known short-URL domains.
            hot_word.short_url_hd = fast_search.load(tf_idf_config.short_url_path)
        if not hot_word.url_re:
            # Rough URL pattern: optional "http://", a dotted host, then a path.
            hot_word.url_re = re.compile(
                r"(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+")
        self.hot_word_dic = {}
        # Keyword selection strategy: a fixed number of words ("num") or a
        # percentage of the document ("percent").
        self.get_file_word_flag = "num"
        self.word_list_n = 5
        self.get_file_word_cbk = {}
        self.get_file_word_cbk["num"] = self.get_file_word_list_by_num
        self.get_file_word_cbk["percent"] = self.get_file_word_list_by_persent
Example #2
    def get_rubbish_set(self, stopword_path="stopwords.txt"):
        rubbish_set = set()
        hd = 0
        try:
            hd = fast_search.load(stopword_path)
            with open(stopword_path, "r") as fd:
                for l in fd:
                    rubbish_set.add(l.strip())
        except Exception:
            # If the stopword file cannot be read, ignore it and continue.
            hd = 0
        # Always filter whitespace and a few common markup/noise tokens as well.
        rubbish_set.add(" ")
        rubbish_set.add("\u3000")  # full-width space
        rubbish_set.add("\t")
        rubbish_set.add("\r")
        rubbish_set.add("\n")
        rubbish_set.add("\r\n")
        rubbish_set.add("DC")
        rubbish_set.add("DS")
        rubbish_set.add("gt")
        return rubbish_set, hd
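
The set returned here is typically used to drop stopwords, whitespace and markup noise from segmenter output before counting words. A small, self-contained usage sketch follows; the token list is invented for illustration, and the set is built inline so the snippet runs on its own (in the project it would come from get_rubbish_set).

# In the project this set comes from get_rubbish_set(); here it is built inline.
rubbish_set = {" ", "\t", "\r", "\n", "\r\n", "DC", "DS", "gt"}

tokens = ["iPhone", " ", "\t", "gt", "发布会"]        # example segmenter output
kept = [t for t in tokens if t not in rubbish_set]
print(kept)                                           # ['iPhone', '发布会']
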
Example #3
    def __init__(self, db_name):
        # Load the IDF table that was previously dumped to disk.
        self.idf_hd = idf()
        with open("idf_dumps.txt", "r") as fd:
            s = fd.read()

        self.idf_hd.loads(s)
        self.hot_word_dic = {}
        # Matcher for known short-URL domains, plus the pattern used to strip URLs.
        self.short_url_hd = fast_search.load("short_url.txt")
        self.dbhd = leveldb.LevelDB(db_name)
        self.url_re = re.compile(
            r'(http:\/\/)*[\w\d]+\.[\w\d\.]+\/[\w\d_!@#$%^&\*-_=\+]+')
        # A WriteBatch is used as an internal cache: add_doc does not write to
        # the db file immediately; data is flushed only when results are
        # requested or the threshold (batch_limit) is reached.
        self.batch = leveldb.WriteBatch()
        self.batch_counter = 0
        self.batch_limit = 100000
        self.fid = 0
        # Keyword selection strategy: a fixed number of words ("num") or a
        # percentage of the document ("percent").
        #self.get_file_word_flag = "percent"
        self.get_file_word_flag = "num"
        self.word_list_n = 5
        self.get_file_word_cbk = {}
        self.get_file_word_cbk["num"] = self.get_file_word_list_by_num
        self.get_file_word_cbk["percent"] = self.get_file_word_list_by_persent