class PyPortSentenceParser(object):
    """Sentence parser that combines a Viterbi path search with a Writer."""

    def __init__(self, dataLoader):
        """Create the collaborators; dataLoader is handed straight to Viterbi."""
        self.mecab = None
        self.viterbi = Viterbi(dataLoader)
        self.writer = Writer()

    def tokenize(self, expr):
        """Return Writer.getWordInfo() for the best Viterbi path through expr."""
        bestPath = self.viterbi.getBestPath(expr)
        tokenizer = self.viterbi.getTokenizer()
        return self.writer.getWordInfo(tokenizer, bestPath)
def setUp(self):
    """Store the sample sentence and build a Viterbi from getDataLoader()."""
    self.defaultText = '船が検疫所に着いたのは、朝の四時頃にちがいない。'
    loader = getDataLoader()
    self.viterbi = Viterbi(loader)
class ViterbiTest(unittest.TestCase):
    """Tests for Viterbi: node connection, best-path text, and parity with MeCab."""

    # Test methods are documented with comments rather than docstrings so that
    # unittest keeps reporting them by method name.

    def setUp(self):
        # Sample sentence shared by several tests, plus a fresh Viterbi per test.
        self.defaultText = '船が検疫所に着いたのは、朝の四時頃にちがいない。'
        self.viterbi = Viterbi(getDataLoader())

    # connect() must set endNode.leftNode to bestNode out of the three
    # candidates; the costs come from MockConnector (not shown in this file
    # section).
    def testConnectNodeMutualCost(self):
        self.viterbi.connector = MockConnector()
        bestNode = Node(Token('b', 12, 10, 0, 0, 5, 0), 0)
        beginNodes = [
            Node(Token('a', 10, 15, 0, 0, 5, 0), 0),
            bestNode,
            Node(Token('c', 11, 11, 0, 0, 5, 0), 0),
        ]
        endNode = Node(Token('7', 10, 10, 0, 0, 5, 0), 0)
        self.viterbi.connect(beginNodes, endNode)
        # assertEqual, not assertEquals: the alias was removed in Python 3.12.
        self.assertEqual(endNode.leftNode, bestNode)

    # The best path for the sample sentence must produce exactly this
    # sequence of node texts, including the <BOS>/<EOS> markers.
    def testAnalyze(self):
        nodes = self.viterbi.getBestPath(self.defaultText)
        writer = Writer()
        res = writer.getNodeText(self.viterbi.getTokenizer(), nodes)
        self.assertEqual(['<BOS>', '船', 'が', '検疫', '所', 'に', '着い', 'た', 'の', 'は',
                          '、', '朝', 'の', '四', '時', '頃', 'に', 'ちがい', 'ない', '。',
                          '<EOS>'], res)

    def testCompareMecabWithOneSentence(self):
        self.compareOneSentence(self.defaultText)

    def testNoneToken(self):
        expr = '船客の大部分はまだ眠っていた。'
        self.compareOneSentence(expr)

    def testUnknownNode(self):
        expr = 'すべてに滲《し》み込み'
        self.compareOneSentence(expr)

    def testWordWithUnknownNode(self):
        expr = 'にもかかわらず、デッキに'
        self.compareOneSentence(expr)

    def testSymbol(self):
        expr = '」'
        self.compareOneSentence(expr)

    def compareOneSentence(self, expr):
        """Assert that the Python port and MeCab give the same output for expr.

        The text is run through self.viterbi + Writer.getMecabOutput and
        through MecabOutputGetter.run; the two results are compared first by
        length and then item by item.
        """
        nodes = self.viterbi.getBestPath(expr)
        writer = Writer()
        pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
        runner = MecabOutputGetter()
        mecabResult = runner.run(expr)
        # Lengths are checked first, so zip() below cannot hide a missing item.
        self.assertEqual(len(mecabResult), len(pyResult))
        for mecabItem, pyItem in zip(mecabResult, pyResult):
            self.assertEqual(mecabItem, pyItem)

    @staticmethod
    def out(text, mecabOutput, pyOutput):
        """Return text, ' | ', then str(pyOutput) followed by str(mecabOutput).

        The previous body read names that are not parameters of this function
        and threw the result away; it now uses its own arguments and returns
        the formatted string.
        """
        return text + ' | ' + str(pyOutput) + str(mecabOutput)

    def testError(self):
        self.compareOneSentence('づめに働い')

    def testDash(self):
        self.compareOneSentence('-----')
def setUp(self):
    """Build a Viterbi from getDataLoader() and set maxDiff to None."""
    loader = getDataLoader()
    self.viterbi = Viterbi(loader)
    self.maxDiff = None
class MecabCompareTest(unittest.TestCase):
    """Compares the Python port's output with MeCab's, line by line, over text files.

    Mismatches are written to the supplied result accumulator instead of
    failing the test, so a whole file is always processed.
    """

    # Test methods are documented with comments rather than docstrings so that
    # unittest keeps reporting them by method name.

    def setUp(self):
        self.viterbi = Viterbi(getDataLoader())
        # None removes unittest's limit on the length of diffs in messages.
        self.maxDiff = None

    @unittest.skip("temp skipping")
    def testEntireFile(self):
        resAccumulator = ResultContainer('maigrat_rep.txt')
        self.compareOnFile(r'../testdata/other/MaigraitInNewYork_ch1.txt', 'utf-8', resAccumulator)
        self.compareOnFile(r'../testdata/other/MaigraitInNewYork.txt', 'utf-8', resAccumulator)

    # Runs the comparison on up to three .txt files from a local directory.
    def testDirContents(self):
        baseDir = r'c:\project\temp_for_mecab\1'
        # The directory is machine-specific; skip instead of erroring when it
        # is not there.
        if not os.path.isdir(baseDir):
            self.skipTest('test data directory not found: ' + baseDir)
        resAccumulator = ResultContainer('dir_report.txt')
        # os.listdir() returns entries in arbitrary order; sort so the same
        # three files are picked on every run.
        txtFiles = sorted(name for name in os.listdir(baseDir) if name.endswith('.txt'))
        for fileName in txtFiles[:3]:
            self.compareOnFile(os.path.join(baseDir, fileName), 'shift-jis', resAccumulator)

    def readFile(self, fileName, encoding, resAccumulator):
        """Yield the lines of fileName decoded with encoding.

        The file is read as bytes and split on CRLF. The file name is written
        to resAccumulator before the first line is produced. A line that
        cannot be decoded is reported to resAccumulator with its 1-based
        position in the file and the offset of the offending byte, and is
        skipped.
        """
        with io.open(fileName, 'rb') as inFile:
            contents = inFile.read()
        # Everything is in memory now, so the file is already closed while the
        # generator is suspended between lines.
        resAccumulator.print(fileName)
        for lineNum, line in enumerate(contents.split(b'\r\n'), 1):
            try:
                text = text_type(line, encoding)
            except UnicodeDecodeError as u:
                resAccumulator.print('line {0}, pos {1}: encoding error'.format(lineNum, u.start))
                continue
            yield text

    def testFixEncoding(self):
        helper = Helper()
        z = helper.fixEncodingError('11~22')
        self.assertEqual(z, '11~22')

    def compareOnFile(self, fileName, encoding, resAccumulator):
        """Compare the Python port against MeCab for every line of fileName.

        Each line from readFile() is stripped (and passed through
        Helper.fixEncodingError when encoding is 'utf-8'), analysed by both
        implementations, and compared by length and then item by item. IOError
        from the MeCab runner and any assertion failure are written to
        resAccumulator rather than raised. A progress message is written every
        500 lines and once more at the end.

        Line numbers in messages count the lines yielded by readFile(), so
        lines dropped there for encoding errors are not counted.
        """
        helper = Helper()
        writer = Writer()
        runner = MecabOutputGetter()
        processed = 0
        lines = self.readFile(fileName, encoding, resAccumulator)
        for lineNum, line in enumerate(lines, 1):
            processed = lineNum
            text = line.strip()
            if encoding == 'utf-8':
                text = helper.fixEncodingError(text)
            nodes = self.viterbi.getBestPath(text)
            pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
            try:
                mecabResult = runner.run(text)
            except IOError as e:
                resAccumulator.print(text_type(e))
            else:
                try:
                    self.assertEqual(len(mecabResult), len(pyResult),
                                     text + '\npyPort:\n' + helper.outputNodes(pyResult) +
                                     '\nmecab:\n' + helper.outputNodes(mecabResult))
                    # Lengths are equal at this point, so zip() covers every item.
                    for mecabItem, pyItem in zip(mecabResult, pyResult):
                        self.assertEqual(mecabItem, pyItem,
                                         "at line " + str(lineNum) + ": '" + line + "'")
                except AssertionError as e:
                    resAccumulator.print(text_type(e))
            # Reached for every line, including those where MeCab failed, so
            # the counter never drifts.
            if lineNum % 500 == 0:
                resAccumulator.print(text_type(lineNum) + ' lines have been processed')
        resAccumulator.print(text_type(processed) + ' lines have been processed')
def __init__(self, dataLoader):
    """Reset mecab to None and create the Viterbi (from dataLoader) and Writer."""
    self.mecab = None
    viterbi = Viterbi(dataLoader)
    self.viterbi = viterbi
    self.writer = Writer()