Beispiel #1
0
    def __init__(self,
                 command='juman',
                 server=None,
                 port=32000,
                 timeout=30,
                 rcfile=None,
                 option='-e2 -B',
                 pattern='EOS',
                 is_use_pyknp=False,
                 **args):
        """Set up the backend object used to call the Juman tokenizer.

        The backend is stored in ``self.juman``:

        - ``pyknp.Juman`` when ``server`` is given or pyknp is requested
          (pyknp is always used on Windows);
        - ``JumanppHnadler`` (local-process handler) otherwise.

        * Parameters
        - command: name of the juman executable.
        - server: server address handed to pyknp.Juman; None means local mode.
        - port: server port handed to pyknp.Juman.
        - timeout: timeout handed to the backend.
        - rcfile: optional path to a juman rc file; must exist when given.
        - option: command line options for juman.
        - pattern: end-of-output marker.
        - is_use_pyknp: if True, use pyknp as the backend.
        - args: extra keyword arguments forwarded to pyknp.Juman only.

        * Raises
        - FileExistsError: ``rcfile`` is given but does not exist.
        """
        # type: (text_type,Union[str,None],int,int,text_type,Union[bytes,str],Union[bytes,str],bool)->None

        self.timeout = timeout
        self.pattern = pattern
        self.option = option
        self.command = command
        # Keep the flag on the instance so that the Windows override below
        # really influences the backend selection.
        self.is_use_pyknp = is_use_pyknp

        if rcfile is not None and not os.path.exists(rcfile):
            # NOTE(review): FileNotFoundError would describe this situation
            # better; the type is kept so existing callers keep working.
            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))

        if server is not None:
            # Server mode keeps bytes copies of option/pattern on the instance.
            # NOTE(review): pyknp.Juman below still receives the un-encoded
            # `option`/`pattern` arguments -- confirm this is intended.
            self.option = self.option.encode('utf-8')
            self.pattern = self.pattern.encode('utf-8')

        if os.name == 'nt':
            # Windows is always routed to the pyknp backend.
            if not self.is_use_pyknp:
                logger.warning(msg='It forces is_use_pyknp = True on Windows.')
            self.is_use_pyknp = True

        if self.is_use_pyknp or server is not None:
            self.juman = pyknp.Juman(command=command,
                                     server=server,
                                     port=port,
                                     timeout=self.timeout,
                                     rcfile=rcfile,
                                     option=option,
                                     pattern=pattern,
                                     **args)
            # Replace pyknp's juman_lines() with the patched implementation.
            self.juman.juman_lines = self.__monkey_patch_juman_lines
        else:
            self.juman = JumanppHnadler(jumanpp_command=command,
                                        option=self.option,
                                        pattern=self.pattern,
                                        timeout_second=self.timeout)
    def __init__(self,
                 command='jumanpp',
                 timeout=30,
                 pattern=r'EOS',
                 server=None,
                 port=12000,
                 is_use_pyknp=False,
                 **args):
        """Choose and create the jumanpp backend, stored in ``self.jumanpp_obj``.

        * Backends
        - jumanpp-server: ``JumanppClient`` talking to ``server``:``port``.
          Used whenever ``server`` is given.
        - jumanpp-pyknp: ``Jumanpp`` object, used for local mode when
          ``is_use_pyknp`` is True (always the case on Windows).
        - jumanpp-pexpect: ``JumanppHnadler``, used for local mode otherwise.
          According to the original notes this keeps the jumanpp process
          running and is faster than the pyknp backend.

        * Parameters
        - command: name of the jumanpp executable.
        - timeout: time to wait for the jumanpp process.
        - pattern: end-of-output marker.
        - server: hostname where jumanpp is running, or None for local mode.
        - port: port number where jumanpp is running.
        - is_use_pyknp: True selects pyknp, False selects pexpect.
        - args: extra keyword arguments forwarded to the pyknp backend only.
        """
        # type: (str,int,str,str,bool)->None
        self.eos_pattern = pattern
        self.is_use_pyknp = is_use_pyknp

        is_server_mode = server is not None
        if is_server_mode:
            # The local pattern is turned into bytes in server mode.
            pattern = pattern.encode('utf-8')

        if os.name == 'nt':
            # pexpect is not usable on Windows, so pyknp is enforced there.
            if not self.is_use_pyknp:
                logger.warning(msg="You're not able to use pexpect in Windows. It forced to set is_use_pyknp = True")
            self.is_use_pyknp = True

        if is_server_mode:
            # jumanpp-server #
            backend = JumanppClient(hostname=server, port=port, timeout=timeout)
        elif self.is_use_pyknp:
            # jumanpp-pyknp #
            backend = Jumanpp(
                command=command,
                timeout=timeout,
                pattern=pattern,
                **args)
        else:
            # jumanpp-pexpect #
            backend = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern)
        self.jumanpp_obj = backend
class JumanWrapper(WrapperBase):
    """Wrapper that tokenizes text with Juman through one of several backends.

    The backend object lives in ``self.juman`` and is either a
    ``pyknp.Juman`` instance (server mode or pyknp mode) or a
    ``JumanppHnadler`` instance (local process driven through pexpect).
    """

    def __init__(self,
                 command='juman',
                 server=None,
                 port=32000,
                 timeout=30,
                 rcfile=None,
                 option='-e2 -B',
                 pattern='EOS',
                 is_use_pyknp=False,
                 **args):
        # type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None
        """Select and create the Juman backend.

        * Parameters
        - command: name of the juman executable.
        - server: server address handed to pyknp.Juman; None means local mode.
        - port: server port handed to pyknp.Juman.
        - timeout: timeout handed to the backend.
        - rcfile: optional path to a juman rc file; must exist when given.
        - option: command line options for juman.
        - pattern: end-of-output marker.
        - is_use_pyknp: if True, use pyknp for local mode (forced on Windows).
        - args: extra keyword arguments forwarded to pyknp.Juman only.

        * Raises
        - FileExistsError: ``rcfile`` is given but does not exist.
        """
        self.timeout = timeout
        self.pattern = pattern
        self.option = option
        self.command = command
        # Keep the flag on the instance so that the Windows override below
        # really influences the backend selection.
        self.is_use_pyknp = is_use_pyknp

        if rcfile is not None and not os.path.exists(rcfile):
            # NOTE(review): FileNotFoundError would describe this situation
            # better; the type is kept so existing callers keep working.
            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))

        if server is not None:
            # Server mode keeps bytes copies of option/pattern on the instance.
            # NOTE(review): pyknp.Juman below still receives the un-encoded
            # `option`/`pattern` arguments -- confirm this is intended.
            self.option = self.option.encode('utf-8')  # type: Union[str,bytes]
            self.pattern = self.pattern.encode(
                'utf-8')  # type: Union[str,bytes]

        if os.name == 'nt':
            # Windows is always routed to the pyknp backend.
            if not self.is_use_pyknp:
                logger.warning(msg='It forces is_use_pyknp = True on Windows.')
            self.is_use_pyknp = True

        if server is not None or self.is_use_pyknp:
            # Server mode, or a local process managed by pyknp.
            self.juman = pyknp.Juman(command=command,
                                     server=server,
                                     port=port,
                                     timeout=self.timeout,
                                     rcfile=rcfile,
                                     option=option,
                                     pattern=pattern,
                                     jumanpp=False,
                                     **args)
            if server is not None and six.PY3:
                # Only server mode on Python 3 gets the patched juman_lines().
                self.juman.juman_lines = self.__monkey_patch_juman_lines
        else:
            # use unix process with pexpect(RECOMMENDED) #
            self.juman = JumanppHnadler(jumanpp_command=command,
                                        option=self.option,
                                        pattern=self.pattern,
                                        timeout_second=self.timeout)

    def __del__(self):
        """Stop the local process if the pexpect backend was created."""
        # `juman` may be missing when __init__ raised before creating it.
        if isinstance(getattr(self, "juman", None), JumanppHnadler):
            self.juman.stop_process()

    def __monkey_patch_juman_lines(self, input_str):
        # type: (text_type)->text_type
        """Replacement for pyknp's juman_lines().

        The original notes say the pyknp method raises TypeError on Python 3.
        A socket or subprocess is created on first use and then queried with
        ``input_str``.
        """
        assert isinstance(self.juman, pyknp.Juman)
        if not self.juman.socket and not self.juman.subprocess:
            # First call: open the connection lazily.
            if self.juman.server is not None:
                self.juman.socket = MonkeyPatchSocket(self.juman.server,
                                                      self.juman.port,
                                                      b"RUN -e2\n")
            else:
                command = "%s %s" % (self.juman.command, self.juman.option)
                if self.juman.rcfile:
                    command += " -r %s" % self.juman.rcfile
                self.juman.subprocess = pyknp.Subprocess(command)
        if self.juman.socket:
            return self.juman.socket.query(input_str,
                                           pattern=self.juman.pattern)
        return self.juman.subprocess.query(input_str,
                                           pattern=self.juman.pattern)

    def __extract_morphological_information(self, mrph_object, is_feature,
                                            is_surface):
        """Build a TokenizedResult from one pyknp.Morpheme.

        The POS tuple is ``(hinsi, bunrui)``; conjugation, ``imis`` and
        ``repname`` attributes are carried along in ``misc_info``.
        """
        assert isinstance(mrph_object, pyknp.Morpheme)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        misc_info = {
            'katuyou1': mrph_object.katuyou1,
            'katuyou2': mrph_object.katuyou2,
            'imis': mrph_object.imis,
            'repname': mrph_object.repname
        }

        return TokenizedResult(node_obj=None,
                               tuple_pos=(mrph_object.hinsi,
                                          mrph_object.bunrui),
                               word_stem=mrph_object.genkei,
                               word_surface=mrph_object.midasi,
                               is_feature=is_feature,
                               is_surface=is_surface,
                               misc_info=misc_info)

    def call_juman_interface(self, input_str):
        # type: (text_type)->MList
        """Run the backend on ``input_str`` and return the analysis result.

        With the pexpect backend a UnicodeDecodeError is treated as a dead
        process: the process is restarted and the query is retried once.

        * Raises
        - Exception: ``self.juman`` is neither of the known backend types.
        """
        if isinstance(self.juman, pyknp.Juman):
            return self.juman.analysis(input_str)
        elif isinstance(self.juman, JumanppHnadler):
            try:
                result_analysis = self.juman.query(input_str)
            except UnicodeDecodeError:
                logger.warning(
                    msg=
                    "Process is down by some reason. It restarts process automatically."
                )
                self.juman.restart_process()
                result_analysis = self.juman.query(input_string=input_str)
            return MList(result_analysis)
        else:
            raise Exception('Not defined.')

    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], text_type])->Union[List[text_type], TokenizedSenetence]
        """Tokenize ``sentence`` with Juman.

        * Parameters
        - sentence: text to tokenize.
        - normalize: if True, ``func_normalizer`` is applied before tokenizing;
          if False, the sentence is passed to Juman unchanged.
        - is_feature, is_surface: flags passed on to each TokenizedResult.
        - return_list: if True, return the result of
          ``TokenizedSenetence.convert_list_object()``; if False (default),
          return the TokenizedSenetence object itself.
        - func_normalizer: callable used for normalization.
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        # Honour the `normalize` flag instead of normalizing unconditionally.
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        result = self.call_juman_interface(normalized_sentence)

        token_objects = [
            self.__extract_morphological_information(mrph_object=morph_object,
                                                     is_surface=is_surface,
                                                     is_feature=is_feature)
            for morph_object in result
        ]

        tokenized_objects = TokenizedSenetence(
            sentence=sentence, tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type])->FilteredObject
        """Delegate to ``parsed_sentence.filter(pos_condition, stopwords)``.

        ``pos_condition`` and ``stopwords`` must each be a list or None.
        """
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        return parsed_sentence.filter(pos_condition, stopwords)
    def __init__(self,
                 command='jumanpp',
                 timeout=30,
                 pattern=r'EOS',
                 server=None,
                 port=12000,
                 is_use_pyknp=False,
                 **args):
        # type: (text_type,int,text_type,text_type,bool)
        """Choose and create the jumanpp backend, stored in ``self.jumanpp_obj``.

        * Backends
        - jumanpp-server: ``JumanppClient`` talking to ``server``:``port``.
          Used whenever ``server`` is given.
        - jumanpp-pyknp: ``Juman(..., jumanpp=True)``, used for local mode
          when ``is_use_pyknp`` is True (always the case on Windows).
        - jumanpp-pexpect: ``JumanppHnadler``, used for local mode otherwise.
          According to the original notes this keeps the jumanpp process
          running and is faster than the pyknp backend.

        * Parameters
        - command: name of the jumanpp executable.
        - timeout: time to wait for the jumanpp process.
        - pattern: end-of-output marker.
        - server: hostname where jumanpp is running, or None for local mode.
        - port: port number where jumanpp is running.
        - is_use_pyknp: True selects pyknp, False selects pexpect.
        - args: extra keyword arguments forwarded to the pyknp backend only.
        """
        self.eos_pattern = pattern
        self.is_use_pyknp = is_use_pyknp

        # Warm-up sentence sent to a freshly started pexpect process.
        warmup_text = 'これはダミーテキストです'
        if six.PY2:
            self.dummy_text = warmup_text.decode('utf-8')
        elif six.PY3:
            self.dummy_text = warmup_text

        is_server_mode = server is not None
        if is_server_mode:
            # The local pattern is turned into bytes in server mode.
            pattern = pattern.encode('utf-8')

        if os.name == 'nt':
            # pexpect is not usable on Windows, so pyknp is enforced there.
            if not self.is_use_pyknp:
                logger.warning(
                    msg=
                    "You're not able to use pexpect in Windows. It forced to set is_use_pyknp = True"
                )
            self.is_use_pyknp = True

        if is_server_mode:
            # jumanpp-server #
            self.jumanpp_obj = JumanppClient(hostname=server,
                                             port=port,
                                             timeout=timeout)
        elif self.is_use_pyknp:
            # jumanpp-pyknp #
            logger.debug('jumanpp wrapper is initialized with pyknp package')
            self.jumanpp_obj = Juman(command=command,
                                     timeout=timeout,
                                     pattern=pattern,
                                     jumanpp=True,
                                     **args)
        else:
            # jumanpp-pexpect #
            logger.debug(
                'jumanpp wrapper is initialized with pexpect unix handler')
            self.jumanpp_obj = JumanppHnadler(
                jumanpp_command=command,
                timeout_second=timeout,
                pattern=pattern)  # type: JumanppHnadler
            # Send a dummy sentence right away to avoid an exception just
            # after command initialization; the answer itself is not needed.
            self.jumanpp_obj.query(self.dummy_text)
Beispiel #5
0
class JumanWrapper(WrapperBase):
    """Wrapper that tokenizes text with Juman through one of several backends.

    The backend object lives in ``self.juman`` and is either a
    ``pyknp.Juman`` instance (server mode or pyknp mode) or a
    ``JumanppHnadler`` instance (local-process handler).
    """

    def __init__(self,
                 command='juman',
                 server=None,
                 port=32000,
                 timeout=30,
                 rcfile=None,
                 option='-e2 -B',
                 pattern='EOS',
                 is_use_pyknp=False,
                 **args):
        """Select and create the Juman backend.

        * Parameters
        - command: name of the juman executable.
        - server: server address handed to pyknp.Juman; None means local mode.
        - port: server port handed to pyknp.Juman.
        - timeout: timeout handed to the backend.
        - rcfile: optional path to a juman rc file; must exist when given.
        - option: command line options for juman.
        - pattern: end-of-output marker.
        - is_use_pyknp: if True, use pyknp as the backend (forced on Windows).
        - args: extra keyword arguments forwarded to pyknp.Juman only.

        * Raises
        - FileExistsError: ``rcfile`` is given but does not exist.
        """
        # type: (text_type,Union[str,None],int,int,text_type,Union[bytes,str],Union[bytes,str],bool)->None

        self.timeout = timeout
        self.pattern = pattern
        self.option = option
        self.command = command
        # Keep the flag on the instance so that the Windows override below
        # really influences the backend selection.
        self.is_use_pyknp = is_use_pyknp

        if rcfile is not None and not os.path.exists(rcfile):
            # NOTE(review): FileNotFoundError would describe this situation
            # better; the type is kept so existing callers keep working.
            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))

        if server is not None:
            # Server mode keeps bytes copies of option/pattern on the instance.
            # NOTE(review): pyknp.Juman below still receives the un-encoded
            # `option`/`pattern` arguments -- confirm this is intended.
            self.option = self.option.encode('utf-8')
            self.pattern = self.pattern.encode('utf-8')

        if os.name == 'nt':
            # Windows is always routed to the pyknp backend.
            if not self.is_use_pyknp:
                logger.warning(msg='It forces is_use_pyknp = True on Windows.')
            self.is_use_pyknp = True

        if self.is_use_pyknp or server is not None:
            self.juman = pyknp.Juman(command=command,
                                     server=server,
                                     port=port,
                                     timeout=self.timeout,
                                     rcfile=rcfile,
                                     option=option,
                                     pattern=pattern,
                                     **args)
            # Replace pyknp's juman_lines() with the patched implementation.
            self.juman.juman_lines = self.__monkey_patch_juman_lines
        else:
            self.juman = JumanppHnadler(jumanpp_command=command,
                                        option=self.option,
                                        pattern=self.pattern,
                                        timeout_second=self.timeout)

    def __del__(self):
        """Stop the local process if the JumanppHnadler backend was created."""
        # `juman` may be missing when __init__ raised before creating it.
        if isinstance(getattr(self, "juman", None), JumanppHnadler):
            self.juman.stop_process()

    def __monkey_patch_juman_lines(self, input_str: str):
        """Replacement for pyknp's juman_lines().

        The original notes say the pyknp method raises TypeError on Python 3.
        A socket or subprocess is created on first use and then queried with
        ``input_str``.
        """
        assert isinstance(self.juman, pyknp.Juman)
        if not self.juman.socket and not self.juman.subprocess:
            # First call: open the connection lazily.
            if self.juman.server is not None:
                self.juman.socket = MonkeyPatchSocket(self.juman.server,
                                                      self.juman.port,
                                                      b"RUN -e2\n")
            else:
                command = "%s %s" % (self.juman.command, self.juman.option)
                if self.juman.rcfile:
                    command += " -r %s" % self.juman.rcfile
                self.juman.subprocess = pyknp.Subprocess(command)
        if self.juman.socket:
            return self.juman.socket.query(input_str,
                                           pattern=self.juman.pattern)
        return self.juman.subprocess.query(input_str,
                                           pattern=self.juman.pattern)

    def call_juman_interface(self, input_str):
        """Run the backend on ``input_str`` and return the analysis result.

        With the JumanppHnadler backend a UnicodeDecodeError is treated as a
        dead process: the process is restarted and the query is retried once.

        * Output
        - pyknp.MList

        * Raises
        - Exception: ``self.juman`` is neither of the known backend types.
        """
        # type: (str)->MList
        if isinstance(self.juman, pyknp.Juman):
            return self.juman.analysis(input_str)
        elif isinstance(self.juman, JumanppHnadler):
            try:
                result_analysis = self.juman.query(input_str)
            except UnicodeDecodeError:
                logger.warning(
                    msg=
                    "Process is down by some reason. It restarts process automatically."
                )
                self.juman.restart_process()
                result_analysis = self.juman.query(input_string=input_str)
            return MList(spec=result_analysis)
        else:
            raise Exception('Not defined.')

    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (str, bool, bool, bool, bool, Callable[[str], str]) -> Union[TokenizedSenetence, List[str]]
        """Tokenize ``sentence`` with Juman.

        :param sentence: text to tokenize.
        :param normalize: if True, apply ``func_normalizer`` before tokenizing.
        :param is_feature: flag passed on to the morpheme extraction helper.
        :param is_surface: flag passed on to the morpheme extraction helper.
        :param return_list: if True, return the result of
            ``TokenizedSenetence.convert_list_object()``; otherwise return the
            TokenizedSenetence object itself.
        :param func_normalizer: callable used for normalization.
        :return: TokenizedSenetence, or its list form when ``return_list``.
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, str)
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence

        result = self.call_juman_interface(normalized_sentence)
        token_objects = [
            juman_utils.extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature) for morph_object in result
        ]

        tokenized_objects = TokenizedSenetence(
            sentence=sentence, tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self,
               parsed_sentence: TokenizedSenetence,
               pos_condition: List[Tuple[str, ...]] = None,
               stopwords: List[str] = None) -> FilteredObject:
        """Delegate to ``parsed_sentence.filter(pos_condition, stopwords)``.

        ``pos_condition`` and ``stopwords`` must each be a list or None.
        """
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        return parsed_sentence.filter(pos_condition, stopwords)