Ejemplo n.º 1
0
    def analyze(self, text):
        """
        Feed *text* to the external analyzer process, one non-blank line
        at a time, and collect its per-word output.

        Returns a list of "records": one list of space-separated fields
        for every output line the process emits.  If the helper process
        dies mid-analysis, it is restarted and the analysis is retried.
        """
        try:
            text = render_safe(text).strip()
            if not text:
                return []
            records = []
            for line in text.split('\n'):
                if not line.strip():
                    continue
                self.send_input((line + '\n').encode('utf-8'))
                while True:
                    raw = self.receive_output_line().decode('utf-8')
                    # A lone newline marks the end of this line's output.
                    if raw == '\n':
                        break
                    records.append(raw.strip('\n').split(' '))
            return records
        except ProcessError:
            # The external process died; restart it and retry from scratch.
            self.restart_process()
            return self.analyze(text)
Ejemplo n.º 2
0
    def analyze(self, text):
        """
        Run *text* through the external process and return its analysis
        as a list of records (one list of space-separated fields per
        output line), covering every non-blank input line.
        """
        try:
            text = render_safe(text).strip()
            if not text:
                # Nothing to analyze.
                return []
            results = []
            nonblank = [ln for ln in text.split('\n') if ln.strip()]
            for ln in nonblank:
                self.send_input((ln + '\n').encode('utf-8'))
                out = self.receive_output_line().decode('utf-8')
                # The process terminates each response with a bare newline.
                while out != '\n':
                    results.append(out.strip('\n').split(' '))
                    out = self.receive_output_line().decode('utf-8')
            return results
        except ProcessError:
            # Recover from a dead helper process by restarting and retrying.
            self.restart_process()
            return self.analyze(text)
Ejemplo n.º 3
0
    def analyze(self, text):
        """
        Run *text* through MeCab and return the analysis of each word.

        The text is sanitized, newlines are flattened to spaces, and it
        is lowercased, then fed to the MeCab process in chunks of at most
        1024 characters.  Every MeCab output line becomes one MeCabRecord;
        records from all chunks are returned in order.

        Raises nothing to the caller on process death: a ProcessError
        triggers a restart of MeCab and a full retry of the analysis.
        """
        try:
            self.process  # touch the lazy attribute so the process is started
            text = render_safe(text).replace('\n', ' ').lower()
            # Ceiling division, with a minimum of one chunk so empty input
            # still makes one (empty) query.  The previous formula,
            # (len(text) + 1024) // 1024, sent a spurious empty chunk
            # whenever len(text) was a positive exact multiple of 1024.
            n_chunks = max(1, (len(text) + 1023) // 1024)
            results = []
            for chunk in range(n_chunks):
                chunk_text = text[chunk * 1024:(chunk + 1) * 1024]
                self.send_input((chunk_text + '\n').encode('utf-8'))
                while True:
                    out_line = self.receive_output_line().decode('utf-8')
                    # MeCab marks the end of a sentence with a bare EOS line.
                    if out_line == 'EOS\n':
                        break

                    # Each line is "<surface>\t<comma-separated info>".
                    word, info = out_line.strip('\n').split('\t')
                    record_parts = [word] + info.split(',')

                    # Pad the record out to the full 10 MeCabRecord fields.
                    record_parts += [None] * (10 - len(record_parts))
                    record = MeCabRecord(*record_parts)

                    # Special case: ん with the invariant conjugation
                    # 不変化型 is a contraction of ない; rebuild the record
                    # so that record.root is the full form.
                    if (record.surface == 'ん'
                            and record.conjugation == '不変化型'):
                        record_parts[MeCabRecord._fields.index('root')] = 'ない'
                        record = MeCabRecord(*record_parts)

                    results.append(record)
            return results
        except ProcessError:
            # MeCab died mid-stream; restart it and retry the whole text.
            self.restart_process()
            return self.analyze(text)
Ejemplo n.º 4
0
    def analyze(self, text):
        """
        Analyze *text* with MeCab, returning one MeCabRecord per parsed
        word, across all 1024-character chunks of the flattened,
        lowercased input.  A dead MeCab process is restarted and the
        analysis retried.
        """
        try:
            self.process  # force lazy startup of the MeCab process
            text = render_safe(text).replace('\n', ' ').lower()
            n_chunks = (len(text) + 1024) // 1024
            results = []
            for i in range(n_chunks):
                piece = text[1024 * i:1024 * (i + 1)]
                self.send_input((piece + '\n').encode('utf-8'))
                while True:
                    line = self.receive_output_line().decode('utf-8')
                    # MeCab ends each sentence's output with an EOS line.
                    if line == 'EOS\n':
                        break
                    word, info = line.strip('\n').split('\t')
                    parts = [word] + info.split(',')
                    # MeCabRecord has 10 fields; pad short rows with None.
                    parts.extend([None] * (10 - len(parts)))
                    record = MeCabRecord(*parts)
                    # ん with the invariant conjugation 不変化型 is a
                    # contraction of ない; rewrite the root so downstream
                    # code sees the full form.
                    if record.surface == 'ん' and record.conjugation == '不変化型':
                        parts[MeCabRecord._fields.index('root')] = 'ない'
                        record = MeCabRecord(*parts)
                    results.append(record)
            return results
        except ProcessError:
            self.restart_process()
            return self.analyze(text)