Code example #1
File: dictionary.py  Project: toomoresuch/pysonengine
    def __init__(self, dataDir, bigendian=False):
        self.categorys = CharCategory.readCategorys(dataDir, bigendian)
        fmis = FileMappedInputStream(dataDir + "/code2category", bigendian)
        try:
            # The file holds two int arrays of equal length, so each has
            # size / 4 (bytes per int) / 2 (arrays) elements; integer
            # division keeps the count an int on Python 3.
            self.char2id = fmis.getIntArray(fmis.size() // 4 // 2)
            self.eqlMasks = fmis.getIntArray(fmis.size() // 4 // 2)
        finally:
            fmis.close()
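For reference, the layout this constructor implies (two back-to-back int arrays of equal length in one file) can be read with only the standard library. This is a minimal sketch, not part of pysonengine; the function name read_code2category is illustrative, and the byte-order handling assumes FileMappedInputStream reads plain 4-byte ints in the given endianness.

import struct


def read_code2category(path, bigendian=False):
    """Sketch: read two equal-length int arrays from one binary file.

    Assumes the layout implied above: the file holds 2 * n four-byte
    ints, the first n forming char2id and the last n forming eqlMasks.
    """
    with open(path, "rb") as f:
        raw = f.read()
    n = len(raw) // 4 // 2                       # same count as fmis.size() // 4 // 2
    fmt = (">" if bigendian else "<") + str(2 * n) + "i"
    values = struct.unpack(fmt, raw[:2 * n * 4])
    return list(values[:n]), list(values[n:])    # (char2id, eqlMasks)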
Code example #2
File: dictionary.py  Project: toomoresuch/pysonengine
    def __init__(self, dataDir, bigendian=False, splitted=False):
        self.trie = Searcher(dataDir + "/word2id", bigendian)
        if splitted:
            paths = sorted(glob.glob(dataDir + "/word.dat.*"))
            self.data = util.getCharArrayMulti(paths, bigendian)
        else:
            self.data = util.getCharArray(dataDir + "/word.dat", bigendian)
        self.indices = util.getIntArray(dataDir + "/word.ary.idx", bigendian)

        fmis = FileMappedInputStream(dataDir + "/word.inf", bigendian)
        try:
            # word.inf packs four parallel arrays, one entry per word:
            # an int (4 bytes) plus three shorts (2 bytes each); integer
            # division keeps the count an int on Python 3.
            wordCount = fmis.size() // (4 + 2 + 2 + 2)
            self.dataOffsets = fmis.getIntArray(wordCount)
            """ dataOffsets[word ID] = start offset of the word's feature data """
            self.leftIds = fmis.getShortArray(wordCount)
            """ leftIds[word ID] = left context ID of the word """
            self.rightIds = fmis.getShortArray(wordCount)
            """ rightIds[word ID] = right context ID of the word """
            self.costs = fmis.getShortArray(wordCount)
            """ costs[word ID] = cost of the word """
        finally:
            fmis.close()
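The word.inf arithmetic follows the same pattern: each word contributes one 4-byte int and three 2-byte shorts, stored as four back-to-back parallel arrays rather than interleaved records. Below is a standalone sketch of that read under the same layout assumption; read_word_inf and its return tuple are illustrative, not pysonengine API.

import struct


def read_word_inf(path, bigendian=False):
    """Sketch: parse word.inf as four parallel per-word arrays.

    Assumes the layout implied above: wordCount ints (dataOffsets)
    followed by three runs of wordCount shorts (leftIds, rightIds,
    costs).
    """
    with open(path, "rb") as f:
        raw = f.read()
    word_count = len(raw) // (4 + 2 + 2 + 2)     # int + 3 shorts per word
    prefix = ">" if bigendian else "<"
    offset = 0

    def take(code, size):
        # Read word_count values of the given struct code, then advance.
        nonlocal offset
        values = struct.unpack_from(prefix + str(word_count) + code, raw, offset)
        offset += word_count * size
        return list(values)

    data_offsets = take("i", 4)   # start of each word's feature data
    left_ids = take("h", 2)       # left context IDs
    right_ids = take("h", 2)      # right context IDs
    costs = take("h", 2)          # word costs
    return data_offsets, left_ids, right_ids, costs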