Example #1
0
 def init(cls, folder):
     """Initialise the segmenter and load the optional user dictionaries.

     ``PySeg.init(folder)`` is called first. Two sub-directories of
     *folder* are then processed independently; each one is skipped when
     it does not exist:

     * ``custom_word``: every line of every file is transcoded from UTF-8
       to GBK and registered with ``PySeg.addUserWord``.
     * ``tong_yi_word``: every line that splits into exactly two
       whitespace-separated tokens is treated as a pair. The first pair
       seen for a given first token is stored in ``cls.tongyiPair``
       (first token -> second token) and both tokens are registered as
       user words. Other lines are ignored.

     Each file is closed as soon as it has been read, including when
     decoding or registration raises.
     """
     PySeg.init(folder)

     customWordPath = os.path.join(folder, "custom_word")
     if os.path.exists(customWordPath):
         for fileName in os.listdir(customWordPath):
             path = os.path.join(customWordPath, fileName)
             with open(path) as wordFile:
                 for line in wordFile:
                     # NOTE(review): the line terminator is not stripped
                     # before registration -- confirm PySeg.addUserWord
                     # tolerates it.
                     PySeg.addUserWord(line.decode("utf8").encode("gbk"))

     tongyiWordPath = os.path.join(folder, "tong_yi_word")
     if os.path.exists(tongyiWordPath):
         for fileName in os.listdir(tongyiWordPath):
             path = os.path.join(tongyiWordPath, fileName)
             with open(path) as wordFile:
                 for line in wordFile:
                     seps = line.split()
                     if len(seps) != 2:
                         continue
                     # Only the first mapping for a given key is kept.
                     if seps[0] not in cls.tongyiPair:
                         cls.tongyiPair[seps[0]] = seps[1]
                         PySeg.addUserWord(
                             seps[0].decode('utf8').encode("gbk"))
                         PySeg.addUserWord(
                             seps[1].decode('utf8').encode("gbk"))
Example #2
0
 def init(cls, folder):
     """Initialise the segmenter and load the optional user dictionaries.

     ``PySeg.init(folder)`` is called first. Two sub-directories of
     *folder* are then processed independently; each one is skipped when
     it does not exist:

     * ``custom_word``: every line of every file is transcoded from UTF-8
       to GBK and registered with ``PySeg.addUserWord``.
     * ``tong_yi_word``: every line that splits into exactly two
       whitespace-separated tokens is treated as a pair. The first pair
       seen for a given first token is stored in ``cls.tongyiPair``
       (first token -> second token) and both tokens are registered as
       user words. Other lines are ignored.

     Each file is closed as soon as it has been read, including when
     decoding or registration raises.
     """
     PySeg.init(folder)

     customWordPath = os.path.join(folder, "custom_word")
     if os.path.exists(customWordPath):
         for fileName in os.listdir(customWordPath):
             path = os.path.join(customWordPath, fileName)
             with open(path) as wordFile:
                 for line in wordFile:
                     # NOTE(review): the line terminator is not stripped
                     # before registration -- confirm PySeg.addUserWord
                     # tolerates it.
                     PySeg.addUserWord(line.decode("utf8").encode("gbk"))

     tongyiWordPath = os.path.join(folder, "tong_yi_word")
     if os.path.exists(tongyiWordPath):
         for fileName in os.listdir(tongyiWordPath):
             path = os.path.join(tongyiWordPath, fileName)
             with open(path) as wordFile:
                 for line in wordFile:
                     seps = line.split()
                     if len(seps) != 2:
                         continue
                     # Only the first mapping for a given key is kept.
                     if seps[0] not in cls.tongyiPair:
                         cls.tongyiPair[seps[0]] = seps[1]
                         PySeg.addUserWord(
                             seps[0].decode('utf8').encode("gbk"))
                         PySeg.addUserWord(
                             seps[1].decode('utf8').encode("gbk"))
Example #3
0
 def CreateDB(self):
     """Open the LevelDB store at ``self.path`` and set up its helpers.

     Assigns the opened database to ``self.db``, the ``"build_index"``
     logger to ``self.logger``, and then calls ``PySeg.init`` with
     ``self.path + "/../"``.
     """
     mib = 1 << 20  # cache and buffer sizes below are multiples of this
     self.db = leveldb.LevelDB(
         self.path,
         create_if_missing=True,
         error_if_exists=False,
         paranoid_checks=False,
         block_cache_size=100 * mib,
         write_buffer_size=2 * mib,
         block_size=4096,
         max_open_files=1000,
         block_restart_interval=16)
     self.logger = logging.getLogger("build_index")
     PySeg.init(self.path + "/../")
Example #4
0
 def SegForLtp(cls, content):
     """Segment *content* and return the result as a tuple of pairs.

     Each item produced by ``PySeg.seg(content)`` yields one pair: its
     first element transcoded from UTF-8 to GB2312, and its second
     element passed through ``cls.ConvertPosICT2Ltp``.
     """
     return tuple(
         (token[0].decode('utf8').encode('gb2312'),
          cls.ConvertPosICT2Ltp(token[1]))
         for token in PySeg.seg(content))
Example #5
0
 def seg(cls, content):
     """Segment *content*, substituting words listed in ``cls.tongyiPair``.

     Returns a list with one entry per item from ``PySeg.seg(content)``.
     When an item's first element is a key of ``cls.tongyiPair`` the entry
     is ``(mapped value, item[1])``; otherwise the item is kept as is.
     """
     return [
         (cls.tongyiPair[piece[0]], piece[1])
         if piece[0] in cls.tongyiPair else piece
         for piece in PySeg.seg(content)
     ]
Example #6
0
 def seg(cls, content):
     """Segment *content*, substituting words listed in ``cls.tongyiPair``.

     Returns a list with one entry per item from ``PySeg.seg(content)``.
     When an item's first element is a key of ``cls.tongyiPair`` the entry
     is ``(mapped value, item[1])``; otherwise the item is kept as is.
     """
     def substitute(item):
         # Swap the word for its mapped form when one is registered.
         word = item[0]
         if word not in cls.tongyiPair:
             return item
         return (cls.tongyiPair[word], item[1])

     return [substitute(item) for item in PySeg.seg(content)]
Example #7
0
    def BuildIndexForOne(self, gameId, name, description, categorys, tags):
        """Write the index entries for one game into ``self.db``.

        Terms are gathered from four sources, each keyed by its address
        constant:

        * ``NameAddr`` / ``DescAddr``: words from ``PySeg.seg`` applied to
          the UTF-8 encoded *name* / *description*, keeping only words
          whose tag starts with ``'n'``, ``'v'`` or ``'a'``.
        * ``CategoryAddr`` / ``TagAddr``: every entry of *categorys* /
          *tags*, UTF-8 encoded.

        For each distinct term, the addresses it appeared under are
        collected without duplicates. A ``DBItem(term, gameId, addrs)`` is
        then encoded and stored with ``self.db.Put``.
        """
        self.logger.debug(
            "build index for one %d %s %s %s %s" %
            (gameId, name, description, str(categorys), str(tags)))

        def keptWords(text):
            # Segment the UTF-8 encoded text and keep words by tag prefix.
            return [piece[0] for piece in PySeg.seg(text.encode('utf8'))
                    if len(piece[1]) > 0
                    and piece[1][0] in ('n', 'v', 'a')]

        # Keys are inserted one by one, in this order, so iterating the
        # dict below visits them exactly as sequential assignment would.
        terms = {}
        terms[NameAddr] = keptWords(name)
        terms[DescAddr] = keptWords(description)
        terms[CategoryAddr] = [c.encode("utf8") for c in categorys]
        terms[TagAddr] = [tag.encode('utf8') for tag in tags]

        # Invert the mapping: term -> list of distinct addresses.
        term2Addrs = {}
        for addr, words in terms.items():
            for word in words:
                self.logger.debug("%d %s" % (addr, word.decode('utf8')))
                seen = term2Addrs.setdefault(word, [])
                if addr not in seen:
                    seen.append(addr)

        for word, addrList in term2Addrs.items():
            self.logger.debug("term %s addrs %s" %
                              (word.decode('utf8'), str(addrList)))
            key, value = DBItem(word, gameId, addrList).Encode()
            self.db.Put(key, value)
Example #8
0
 def SegForLtp(cls, content):
     """Segment *content* and return the result as a tuple of pairs.

     Each item produced by ``PySeg.seg(content)`` yields one pair: its
     first element transcoded from UTF-8 to GB2312, and its second
     element passed through ``cls.ConvertPosICT2Ltp``.
     """
     pairs = [
         (item[0].decode('utf8').encode('gb2312'),
          cls.ConvertPosICT2Ltp(item[1]))
         for item in PySeg.seg(content)
     ]
     return tuple(pairs)