Example #1
    def analyze(self, text):
        """
        Runs a line of text through MeCab, and returns the results as a
        list of MeCabRecord namedtuples ("records"), one per word,
        containing MeCab's analysis of that word.
        """
        try:
            self.process  # make sure things are loaded
            text = render_safe(text).replace('\n', ' ').lower()
            results = []
            for chunk in string_pieces(text):
                self.send_input((chunk + '\n').encode('utf-8'))
                while True:
                    out_line = self.receive_output_line().decode('utf-8')
                    if out_line == 'EOS\n':
                        break

                    word, info = out_line.strip('\n').split('\t')
                    record_parts = [word] + info.split(',')

                    # Pad the record out to 10 parts if MeCab returned fewer
                    record_parts += [None] * (10 - len(record_parts))
                    record = MeCabRecord(*record_parts)

                    # special case for the nai -> n contraction ('ん' standing in for 'ない')
                    if (record.surface == 'ん' and
                        record.conjugation == '不変化型'):
                        # rebuild the record so that record.root is 'nai'
                        record_parts[MeCabRecord._fields.index('root')] = 'ない'
                        record = MeCabRecord(*record_parts)

                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)
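The method above relies on a MeCabRecord namedtuple with ten fields, including surface, conjugation, and root; padding the record to 10 parts matches MeCab's default IPADIC output (the surface form plus nine comma-separated features). The definition below is only a sketch of a compatible record type; apart from the three field names used in the code, the names and their order are assumptions and may differ from the original project.

from collections import namedtuple

# Sketch of a record type compatible with analyze() above. Field names other
# than 'surface', 'conjugation', and 'root' are assumptions based on MeCab's
# default IPADIC columns, not the project's actual definition.
MeCabRecord = namedtuple('MeCabRecord', [
    'surface',        # the word as it appeared in the text
    'pos',            # part of speech
    'subclass1',      # part-of-speech subdivisions
    'subclass2',
    'subclass3',
    'conjugation',    # conjugation type, e.g. '不変化型'
    'form',           # conjugated form
    'root',           # dictionary (base) form, e.g. 'ない'
    'reading',        # reading in kana
    'pronunciation',  # pronunciation
])

With this definition, MeCabRecord._fields.index('root') in the code above points at the dictionary-form column, which is the one rewritten to 'ない' in the special case.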
Example #2
def test_string_pieces():
    # Break as close to whitespace as possible
    text = "12 12 12345 123456 1234567-12345678"
    eq_(list(string_pieces(text, 6)),
        ["12 12 ", "12345 ", "123456", " ", "123456", "7-", "123456", "78"])
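string_pieces itself is not shown in these examples, but the expected output (note the ' ' and '7-' pieces) implies that each piece is at most max_length characters long and ends at a word boundary whenever one falls inside that window. A minimal sketch consistent with this test follows; the string_pieces(text, max_length) signature and the default length are assumptions, not the project's actual implementation.

import re

def string_pieces(text, max_length=1024):
    """
    Break `text` into pieces of at most `max_length` characters, preferring
    to end each piece at a word boundary when one is available.
    """
    # The greedy quantifier backtracks from max_length down to 1, so the
    # first alternative takes the longest piece (up to max_length) that ends
    # at a word boundary; if no boundary falls inside the window, the second
    # alternative takes max_length characters anyway.
    pattern = re.compile(r'.{1,%d}\b|.{1,%d}' % (max_length, max_length))
    return pattern.findall(text)

With max_length=6 this returns exactly the list expected in the test above; whether it matches the original implementation in other corner cases (newlines, long runs of punctuation) is not guaranteed.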