def compareOnFile(self, fileName, encoding, resAccumulator):
    """Compare this port's analysis with MecabOutputGetter's, line by line.

    Every line yielded by ``self.readFile`` is stripped, analysed with
    ``self.viterbi`` and rendered with ``Writer.getMecabOutput``; the result
    is compared with what ``MecabOutputGetter.run`` returns for the same
    text.  Failures are reported through ``resAccumulator.print`` rather
    than raised, so a single bad line does not abort the run.

    Args:
        fileName: passed through to ``self.readFile``.
        encoding: passed through to ``self.readFile``; when it equals
            ``'utf-8'`` each line additionally goes through
            ``Helper.fixEncodingError`` before analysis.
        resAccumulator: object whose ``print`` method receives error
            reports and progress messages.
    """
    helper = Helper()
    writer = Writer()
    runner = MecabOutputGetter()
    processed = 0
    lines = self.readFile(fileName, encoding, resAccumulator)
    # enumerate() keeps the reported line number tied to the position in the
    # file even when a line is abandoned early because the runner failed.
    for lineNum, line in enumerate(lines, 1):
        processed = lineNum
        text = line.strip()
        if encoding == 'utf-8':
            text = helper.fixEncodingError(text)
        nodes = self.viterbi.getBestPath(text)
        pyResult = writer.getMecabOutput(self.viterbi.getTokenizer(), nodes)
        try:
            mecabResult = runner.run(text)
        except IOError as e:
            # No reference output for this line: report it and move on.
            resAccumulator.print(text_type(e))
        else:
            try:
                self.assertEqual(
                    len(mecabResult), len(pyResult),
                    text + '\npyPort:\n' + helper.outputNodes(pyResult) +
                    '\nmecab:\n' + helper.outputNodes(mecabResult))
                # Lengths are equal here, so zip() visits every pair.
                for expected, actual in zip(mecabResult, pyResult):
                    self.assertEqual(
                        expected, actual,
                        "at line " + str(lineNum) + ": '" + line + "'")
            except AssertionError as e:
                # Collect the mismatch instead of failing the whole run.
                resAccumulator.print(text_type(e))
        if lineNum % 500 == 0:
            resAccumulator.print(
                text_type(lineNum) + ' lines have been processed')
    resAccumulator.print(text_type(processed) + ' lines have been processed')
def testAnalyze(self):
    """Check the node texts produced for ``self.defaultText``.

    The best path found by ``self.viterbi`` is rendered with
    ``Writer.getNodeText`` and must equal the fixed token list below.
    """
    expectedTexts = [
        '<BOS>', '船', 'が', '検疫', '所', 'に', '着い', 'た', 'の', 'は',
        '、', '朝', 'の', '四', '時', '頃', 'に', 'ちがい', 'ない', '。',
        '<EOS>',
    ]
    bestPath = self.viterbi.getBestPath(self.defaultText)
    nodeWriter = Writer()
    tokenizer = self.viterbi.getTokenizer()
    actualTexts = nodeWriter.getNodeText(tokenizer, bestPath)
    self.assertEqual(expectedTexts, actualTexts)
def compareOneSentence(self, expr):
    """Assert that this port and MecabOutputGetter agree on ``expr``.

    ``expr`` is analysed with ``self.viterbi`` and rendered with
    ``Writer.getMecabOutput``; the result must have the same length as,
    and be element-wise equal to, what ``MecabOutputGetter.run`` returns.
    """
    bestPath = self.viterbi.getBestPath(expr)
    portWriter = Writer()
    portOutput = portWriter.getMecabOutput(
        self.viterbi.getTokenizer(), bestPath)
    referenceRunner = MecabOutputGetter()
    referenceOutput = referenceRunner.run(expr)

    self.assertEqual(len(referenceOutput), len(portOutput))
    # Lengths match at this point, so pairing with zip() covers every entry.
    for expected, actual in zip(referenceOutput, portOutput):
        self.assertEqual(expected, actual)
class PyPortSentenceParser(object):
    """Sentence parser built on a ``Viterbi`` instance and a ``Writer``."""

    def __init__(self, dataLoader):
        """Create the Viterbi search and the writer used by ``tokenize``.

        Args:
            dataLoader: handed unchanged to the ``Viterbi`` constructor.
        """
        # Initialised to None and not used anywhere else in this class.
        self.mecab = None
        self.viterbi = Viterbi(dataLoader)
        self.writer = Writer()

    def tokenize(self, expr):
        """Return ``Writer.getWordInfo`` for the best path through ``expr``.

        Args:
            expr: text handed to ``Viterbi.getBestPath``.
        """
        # The best path is computed before the tokenizer is fetched.
        bestPath = self.viterbi.getBestPath(expr)
        tokenizer = self.viterbi.getTokenizer()
        return self.writer.getWordInfo(tokenizer, bestPath)
def __init__(self, dataLoader):
    """Set up the parser's collaborators.

    Args:
        dataLoader: handed unchanged to the ``Viterbi`` constructor.
    """
    # Starts out as None; nothing in this constructor assigns it again.
    self.mecab = None
    # The Viterbi search is built first, then the writer.
    self.viterbi = Viterbi(dataLoader)
    self.writer = Writer()