Exemple #1
0
    def compareOnFile(self, fileName, encoding, resAccumulator):
        """Compare this port's output with the reference output, line by line.

        Every line produced by ``self.readFile`` is stripped, analysed with
        ``self.viterbi`` and rendered with ``Writer.getMecabOutput``; the
        result is compared with what ``MecabOutputGetter.run`` returns for
        the same text.  Mismatches do not abort the run: the assertion
        message is handed to ``resAccumulator.print`` and processing goes on
        with the next line.

        Args:
            fileName: passed through to ``self.readFile``.
            encoding: passed through to ``self.readFile``; when it equals
                ``'utf-8'`` each line also goes through
                ``Helper.fixEncodingError``.
            resAccumulator: object whose ``print`` method receives error
                messages and progress reports.
        """
        helper = Helper()
        writer = Writer()
        runner = MecabOutputGetter()
        processed = 0
        # enumerate() ties the counter to the position in the input, so a
        # line skipped because the reference run failed still advances it and
        # later "at line N" messages keep pointing at the right line.
        for lineNum, line in enumerate(
                self.readFile(fileName, encoding, resAccumulator), 1):
            processed = lineNum
            text = line.strip()
            if encoding == 'utf-8':
                text = helper.fixEncodingError(text)
            nodes = self.viterbi.getBestPath(text)

            pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
            try:
                mecabResult = runner.run(text)
            except IOError as e:
                # No reference output for this line: report and move on.
                resAccumulator.print(text_type(e))
            else:
                try:
                    self.assertEqual(len(mecabResult), len(pyResult),
                        text + '\npyPort:\n' + helper.outputNodes(pyResult) +
                         '\nmecab:\n' + helper.outputNodes(mecabResult))
                    # Lengths are equal here, so zip() visits every pair.
                    for mecabNode, pyNode in zip(mecabResult, pyResult):
                        self.assertEqual(mecabNode, pyNode, "at line " + str(lineNum) + ": '" + line + "'")
                except AssertionError as e:
                    resAccumulator.print(text_type(e))
            if lineNum % 500 == 0:
                resAccumulator.print(text_type(lineNum) + ' lines have been processed')
        # Report the number of lines actually read (0 for an empty input).
        resAccumulator.print(text_type(processed) + ' lines have been processed')
Exemple #2
0
 def testAnalyze(self):
     """Check the node texts produced for ``self.defaultText``.

     The best path found by ``self.viterbi`` is rendered with
     ``Writer.getNodeText`` and must equal the expected list, which runs
     from '<BOS>' to '<EOS>'.
     """
     expectedTexts = [
         '<BOS>', '船', 'が', '検疫', '所', 'に',
         '着い', 'た', 'の', 'は', '、', '朝', 'の',
         '四', '時', '頃', 'に', 'ちがい',
         'ない', '。', '<EOS>',
     ]
     bestPath = self.viterbi.getBestPath(self.defaultText)
     nodeWriter = Writer()
     tokenizer = self.viterbi.getTokenizer()
     actualTexts = nodeWriter.getNodeText(tokenizer, bestPath)
     self.assertEqual(expectedTexts, actualTexts)
Exemple #3
0
 def compareOneSentence(self, expr):
     """Assert that the port and the reference runner agree on ``expr``.

     The best path for ``expr`` is rendered with ``Writer.getMecabOutput``
     and compared, first by length and then element by element, with the
     value returned by ``MecabOutputGetter.run`` for the same expression.
     """
     bestPath = self.viterbi.getBestPath(expr)
     portOutput = Writer().getMecabOutput(self.viterbi.getTokenizer(), bestPath)
     referenceOutput = MecabOutputGetter().run(expr)
     self.assertEqual(len(referenceOutput), len(portOutput))
     # Walk the reference output and look up the port's entry at each index.
     for index, referenceEntry in enumerate(referenceOutput):
         self.assertEqual(referenceEntry, portOutput[index])
Exemple #4
0
class PyPortSentenceParser(object):
    """Sentence parser built from a ``Viterbi`` instance and a ``Writer``."""

    def __init__(self, dataLoader):
        """Create the parser.

        Args:
            dataLoader: handed unchanged to the ``Viterbi`` constructor.
        """
        # Initialised to None; nothing in this class assigns it afterwards.
        self.mecab = None
        self.viterbi = Viterbi(dataLoader)
        self.writer = Writer()

    def tokenize(self, expr):
        """Return the writer's word info for the best path through ``expr``.

        Args:
            expr: the expression passed to ``Viterbi.getBestPath``.

        Returns:
            Whatever ``Writer.getWordInfo`` returns for the tokenizer and
            the best path.
        """
        bestPath = self.viterbi.getBestPath(expr)
        describe = self.writer.getWordInfo
        tokenizer = self.viterbi.getTokenizer()
        return describe(tokenizer, bestPath)
Exemple #5
0
 def __init__(self, dataLoader):
     """Set up the analysis objects used by this instance.

     Args:
         dataLoader: handed unchanged to the ``Viterbi`` constructor.
     """
     # Initialised to None here; this constructor never fills it in.
     self.mecab = None
     bestPathFinder = Viterbi(dataLoader)
     self.viterbi = bestPathFinder
     outputWriter = Writer()
     self.writer = outputWriter