Example #1
0
class PyPortSentenceParser(object):
    """Sentence parser that combines a Viterbi search with a Writer."""

    def __init__(self, dataLoader):
        """Build the Viterbi from *dataLoader* and create the output Writer."""
        self.mecab = None
        self.viterbi = Viterbi(dataLoader)
        self.writer = Writer()

    def tokenize(self, expr):
        """Return the writer's word info for the best Viterbi path of *expr*."""
        bestPath = self.viterbi.getBestPath(expr)
        tokenizer = self.viterbi.getTokenizer()
        return self.writer.getWordInfo(tokenizer, bestPath)
Example #2
0
 def setUp(self):
     """Store the sample sentence and a Viterbi built from getDataLoader()."""
     loader = getDataLoader()
     self.defaultText = '船が検疫所に着いたのは、朝の四時頃にちがいない。'
     self.viterbi = Viterbi(loader)
Example #3
0
class ViterbiTest(unittest.TestCase):
    """Tests for Viterbi: node connection, best-path text and MeCab parity."""

    def setUp(self):
        """Store the sample sentence and a Viterbi built from getDataLoader()."""
        self.defaultText = '船が検疫所に着いたのは、朝の四時頃にちがいない。'
        self.viterbi = Viterbi(getDataLoader())

    def testConnectNodeMutualCost(self):
        """connect() must set endNode.leftNode to bestNode among three candidates."""
        self.viterbi.connector = MockConnector()
        bestNode = Node(Token('b', 12, 10, 0, 0, 5, 0), 0)
        beginNodes = [Node(Token('a', 10, 15, 0, 0, 5, 0), 0),
                      bestNode,
                      Node(Token('c', 11, 11, 0, 0, 5, 0), 0)]
        endNode = Node(Token('7', 10, 10, 0, 0, 5, 0), 0)
        self.viterbi.connect(beginNodes, endNode)
        # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
        self.assertEqual(endNode.leftNode, bestNode)

    def testAnalyze(self):
        """The best path of the sample sentence yields the expected surface forms."""
        nodes = self.viterbi.getBestPath(self.defaultText)
        writer = Writer()
        res = writer.getNodeText(self.viterbi.getTokenizer(), nodes)
        self.assertEqual(['<BOS>', '船', 'が', '検疫', '所', 'に',
                          '着い', 'た', 'の', 'は', '、', '朝', 'の',
                          '四', '時', '頃', 'に', 'ちがい',
                          'ない', '。', '<EOS>'], res)

    def testCompareMecabWithOneSentence(self):
        """The sample sentence is analysed identically by the port and MeCab."""
        self.compareOneSentence(self.defaultText)

    def testNoneToken(self):
        expr = '船客の大部分はまだ眠っていた。'
        self.compareOneSentence(expr)

    def testUnknownNode(self):
        expr = 'すべてに滲《し》み込み'
        self.compareOneSentence(expr)

    def testWordWithUnknownNode(self):
        expr = 'にもかかわらず、デッキに'
        self.compareOneSentence(expr)

    def testSymbol(self):
        expr = '」'
        self.compareOneSentence(expr)

    def compareOneSentence(self, expr):
        """Assert that the port and MecabOutputGetter agree on *expr*.

        The lengths are compared first, then every entry pairwise, so a
        failure points at the first differing entry.
        """
        nodes = self.viterbi.getBestPath(expr)
        writer = Writer()
        pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
        runner = MecabOutputGetter()
        mecabResult = runner.run(expr)
        self.assertEqual(len(mecabResult), len(pyResult))
        for expected, actual in zip(mecabResult, pyResult):
            self.assertEqual(expected, actual)

    @staticmethod
    def out(text, mecabOutput, pyOutput):
        """Return *text* followed by both outputs, as one debugging string.

        The result is text + ' | ' + str(pyOutput) + str(mecabOutput).
        Previously this read undefined names and discarded the string.
        """
        return text + ' | ' + str(pyOutput) + str(mecabOutput)

    def testError(self):
        self.compareOneSentence('づめに働い')

    def testDash(self):
        self.compareOneSentence('-----')
Example #4
0
 def setUp(self):
     """Create the Viterbi under test and clear the maxDiff limit."""
     loader = getDataLoader()
     self.viterbi = Viterbi(loader)
     self.maxDiff = None
Example #5
0
class MecabCompareTest(unittest.TestCase):
    """Compares the port's output with MeCab's, line by line, over text files.

    Mismatches are passed to resAccumulator.print() instead of failing the
    test, so one run covers every line of the input.
    """

    def setUp(self):
        """Create the Viterbi under test."""
        self.viterbi = Viterbi(getDataLoader())
        # None removes unittest's limit on the length of assertion diffs.
        self.maxDiff = None

    @unittest.skip("temp skipping")
    def testEntireFile(self):
        """Compare both UTF-8 sample texts, reporting into 'maigrat_rep.txt'."""
        resAccumulator = ResultContainer('maigrat_rep.txt')
        self.compareOnFile(r'../testdata/other/MaigraitInNewYork_ch1.txt', 'utf-8', resAccumulator)
        self.compareOnFile(r'../testdata/other/MaigraitInNewYork.txt', 'utf-8', resAccumulator)

    def testDirContents(self):
        """Compare the first three .txt files of a local 'shift-jis' directory.

        The directory is machine-specific, so the test is skipped (rather than
        erroring in os.listdir) when it does not exist.
        """
        baseDir = r'c:\project\temp_for_mecab\1'
        if not os.path.isdir(baseDir):
            self.skipTest('directory not found: ' + baseDir)
        resAccumulator = ResultContainer('dir_report.txt')
        # os.listdir returns names in arbitrary order; sort so that the same
        # three files are picked on every run.
        txtFiles = sorted(name for name in os.listdir(baseDir)
                          if name.endswith('.txt'))
        for fileName in txtFiles[:3]:
            self.compareOnFile(os.path.join(baseDir, fileName), 'shift-jis', resAccumulator)

    def readFile(self, fileName, encoding, resAccumulator):
        """Yield the lines of *fileName* decoded with *encoding*.

        The file name is printed to *resAccumulator* first. A line that cannot
        be decoded is reported there with its 1-based line number and the
        offset of the offending byte, and is then skipped.
        """
        with io.open(fileName, 'rb') as inFile:
            contents = inFile.read()
        # NOTE(review): only CRLF is treated as a line break, so an LF-only
        # file comes through as a single line -- confirm this is intended.
        rawLines = contents.split(b'\r\n')
        resAccumulator.print(fileName)
        # enumerate keeps the reported number right even after earlier
        # decoding failures.
        for lineNum, line in enumerate(rawLines, 1):
            try:
                text = text_type(line, encoding)
            except UnicodeDecodeError as u:
                resAccumulator.print('line {0}, pos {1}: encoding error'.format(lineNum, u.start))
                continue
            yield text

    def testFixEncoding(self):
        helper = Helper()
        z = helper.fixEncodingError('11~22')
        self.assertEqual(z, '11~22')

    def compareOnFile(self, fileName, encoding, resAccumulator):
        """Analyse every line of *fileName* with the port and with MeCab.

        Differences and MeCab IOErrors are printed to *resAccumulator*; they
        never fail the test. A progress message is printed every 500 lines and
        once more at the end.

        Line numbers count the lines yielded by readFile(), so lines that
        could not be decoded are not counted.
        """
        helper = Helper()
        writer = Writer()
        runner = MecabOutputGetter()
        processed = 0
        decodedLines = self.readFile(fileName, encoding, resAccumulator)
        for lineNum, line in enumerate(decodedLines, 1):
            processed = lineNum
            text = line.strip()
            if encoding == 'utf-8':
                text = helper.fixEncodingError(text)
            nodes = self.viterbi.getBestPath(text)
            pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
            try:
                mecabResult = runner.run(text)
            except IOError as e:
                resAccumulator.print(text_type(e))
            else:
                try:
                    self.assertEqual(len(mecabResult), len(pyResult),
                        text + '\npyPort:\n' + helper.outputNodes(pyResult) +
                        '\nmecab:\n' + helper.outputNodes(mecabResult))
                    for expected, actual in zip(mecabResult, pyResult):
                        self.assertEqual(expected, actual,
                                         "at line " + str(lineNum) + ": '" + line + "'")
                except AssertionError as e:
                    # Record the mismatch and carry on with the next line.
                    resAccumulator.print(text_type(e))
            if lineNum % 500 == 0:
                resAccumulator.print(text_type(lineNum) + ' lines have been processed')
        resAccumulator.print(text_type(processed) + ' lines have been processed')
Example #6
0
 def __init__(self, dataLoader):
     """Build the Viterbi from *dataLoader* and create the output Writer."""
     self.mecab = None
     searcher = Viterbi(dataLoader)
     self.viterbi = searcher
     formatter = Writer()
     self.writer = formatter