Example #1
0
    def lexical_analysis(self,
                         texts=None,
                         data=None,
                         use_gpu=False,
                         batch_size=1,
                         user_dict=None,
                         return_tag=True):
        """
        Get the word segmentation results with the texts as input.

        Args:
             texts(list): the input texts to be segmented; use either texts or data
             data(dict): key must be 'text', value is the list of texts to be
                 segmented; use either data or texts
             use_gpu(bool): whether to use gpu to predict or not; silently
                 forced to False when CUDA_VISIBLE_DEVICES is not set to a
                 valid device id
             batch_size(int): the program deals once with one batch
             user_dict(None): deprecated; please set the dictionary via the
                 function set_user_dict() instead
             return_tag(bool): whether to keep the "tag" entry in each result

        Returns:
             results(list): the word segmentation results

        Raises:
             TypeError: if neither texts nor data carries a valid input list
        """
        # Normalize None to the empty containers the validation below expects;
        # using None defaults avoids the shared-mutable-default pitfall.
        if texts is None:
            texts = []
        if data is None:
            data = {}

        if user_dict:
            logger.warning(
                "If you wanna use customized dictionary, please use the function set_user_dict() to set the dictionary. The parameter user_dict has been dropped!"
            )

        # Fall back to CPU unless CUDA_VISIBLE_DEVICES names a parseable
        # device id (missing var -> KeyError, empty -> IndexError,
        # non-numeric -> ValueError).
        try:
            _places = os.environ["CUDA_VISIBLE_DEVICES"]
            int(_places[0])
        except (KeyError, ValueError, IndexError):
            use_gpu = False

        # Exactly one of texts / data['text'] must supply a non-empty list.
        if texts != [] and isinstance(texts, list) and data == {}:
            predicted_data = texts
        elif texts == [] and isinstance(data, dict) and isinstance(
                data.get('text', None), list) and data['text']:
            predicted_data = data["text"]
        else:
            raise TypeError(
                "The input data is inconsistent with expectations.")

        predicted_data = self.to_unicode(predicted_data)

        # Drop the empty strings like "" in predicted_data, remembering their
        # positions so placeholder results can be re-inserted afterwards.
        empty_str_indexes = self._get_index(predicted_data)
        predicted_data = [text for text in predicted_data if text != ""]

        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                # Last batch takes whatever remains (may be < batch_size).
                batch_data = predicted_data[start_idx:]

            start_idx = start_idx + batch_size
            tensor_words = self.texts2tensor(batch_data)

            if use_gpu:
                batch_out = self.gpu_predictor.run([tensor_words])
            else:
                batch_out = self.cpu_predictor.run([tensor_words])
            batch_result = parse_result(batch_data,
                                        batch_out[0],
                                        self.id2label_dict,
                                        interventer=self.interventer)
            results += batch_result

        # Re-insert placeholder entries for the dropped empty strings so the
        # output aligns index-for-index with the input.
        for index in empty_str_indexes:
            results.insert(index, {"word": [""], "tag": [""]})

        if not return_tag:
            # pop mutates each dict in place; the return value is discarded.
            for result in results:
                result.pop("tag")

        return results
Example #2
0
    def lexical_analysis(self,
                         texts=None,
                         data=None,
                         use_gpu=False,
                         batch_size=1,
                         return_tag=True,
                         use_device=None):
        """
        Get the word segmentation results with the texts as input.

        Args:
             texts(list): the input texts to be segmented; use either texts or data
             data(dict): key must be 'text', value is the list of texts to be
                 segmented; use either data or texts
             use_gpu(bool): whether to use gpu to predict or not
             batch_size(int): the program deals once with one batch
             return_tag(bool): whether to keep the "tag" entry in each result
             use_device (str): use cpu, gpu, xpu or npu; overrides the use_gpu flag

        Returns:
             results(list): the word segmentation results

        Raises:
             Exception: if use_device names an unsupported device
             TypeError: if neither texts nor data carries a valid input list
        """
        # Normalize None to the empty containers the validation below expects;
        # using None defaults avoids the shared-mutable-default pitfall.
        if texts is None:
            texts = []
        if data is None:
            data = {}

        # Pick the real predictor: an explicit use_device wins over use_gpu.
        if use_device is not None:
            if use_device == "cpu":
                predictor = self.cpu_predictor
            elif use_device == "xpu":
                predictor = self.xpu_predictor
            elif use_device == "npu":
                predictor = self.npu_predictor
            elif use_device == "gpu":
                predictor = self.gpu_predictor
            else:
                raise Exception("Unsupported device: " + use_device)
        else:
            # use_device is not set, therefore follow use_gpu
            if use_gpu:
                predictor = self.gpu_predictor
            else:
                predictor = self.cpu_predictor

        # Exactly one of texts / data['text'] must supply a non-empty list.
        if texts != [] and isinstance(texts, list) and data == {}:
            predicted_data = texts
        elif texts == [] and isinstance(data, dict) and isinstance(
                data.get('text', None), list) and data['text']:
            predicted_data = data["text"]
        else:
            raise TypeError(
                "The input data is inconsistent with expectations.")

        predicted_data = self.to_unicode(predicted_data)

        # Drop the empty strings like "" in predicted_data, remembering their
        # positions so placeholder results can be re-inserted afterwards.
        empty_str_indexes = self._get_index(predicted_data)
        predicted_data = [text for text in predicted_data if text != ""]

        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                # Last batch takes whatever remains (may be < batch_size).
                batch_data = predicted_data[start_idx:]

            start_idx = start_idx + batch_size
            batch_out = self._internal_predict(predictor, batch_data)
            batch_result = parse_result(batch_data,
                                        batch_out,
                                        self.id2label_dict,
                                        interventer=self.custom)
            results += batch_result

        # Re-insert placeholder entries for the dropped empty strings so the
        # output aligns index-for-index with the input.
        for index in empty_str_indexes:
            results.insert(index, {"word": [""], "tag": [""]})

        if not return_tag:
            # pop mutates each dict in place; the return value is discarded.
            for result in results:
                result.pop("tag")

        return results
Example #3
0
    def cut(self, text, use_gpu=False, batch_size=1, return_tag=True):
        """
        The main function that segments an entire text that contains
        Chinese characters into separated words.

        Args:
            text(:obj:`str` or :obj:`List[str]`): The chinese texts to be segmented. This can be a string, a list of strings.
            use_gpu(bool): whether use gpu to predict or not
            batch_size(int): the program deals once with one batch
            return_tag(bool): whether to keep the "tag" entry in each result

        Returns:
            results(dict or list): The word segmentation result of the input text, whose key is 'word', if text is a list.
                If text is a str, the word segmentation result (list) is obtained.

        Raises:
            RuntimeError: if use_gpu is True but CUDA_VISIBLE_DEVICES is not
                set to a valid device id
            TypeError: if text is neither a non-empty list nor a str
        """
        if use_gpu:
            # Missing var -> KeyError, empty value -> IndexError, non-numeric
            # value -> ValueError; all three mean GPU is unusable here.
            try:
                _places = os.environ["CUDA_VISIBLE_DEVICES"]
                int(_places[0])
            except (KeyError, ValueError, IndexError):
                raise RuntimeError(
                    "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
                ) from None

        if isinstance(text, list) and len(text) != 0:

            predicted_data = self.to_unicode(text)

            # Drop the empty strings like "" in predicted_data, remembering
            # their positions so placeholders can be re-inserted afterwards.
            empty_str_indexes = self._get_index(predicted_data)
            predicted_data = [item for item in predicted_data if item != ""]

            start_idx = 0
            iteration = int(math.ceil(len(predicted_data) / batch_size))
            results = []
            for i in range(iteration):
                if i < (iteration - 1):
                    batch_data = predicted_data[start_idx:(start_idx +
                                                           batch_size)]
                else:
                    # Last batch takes whatever remains (may be < batch_size).
                    batch_data = predicted_data[start_idx:]

                start_idx = start_idx + batch_size
                tensor_words = self.texts2tensor(batch_data)

                if use_gpu:
                    batch_out = self.gpu_predictor.run([tensor_words])
                else:
                    batch_out = self.cpu_predictor.run([tensor_words])
                batch_result = parse_result(batch_data,
                                            batch_out[0],
                                            self.id2label_dict,
                                            interventer=self.custom)
                results += batch_result

            # Re-insert placeholder entries for the dropped empty strings so
            # the output aligns index-for-index with the input.
            for index in empty_str_indexes:
                results.insert(index, {"word": [""], "tag": [""]})

            if not return_tag:
                # pop mutates each dict in place; the return value is discarded.
                for result in results:
                    result.pop("tag")

            return results
        elif isinstance(text, str) and text != "":
            # Single string: run it as a one-element batch and return only
            # the word list.
            tensor_words = self.texts2tensor([text])

            if use_gpu:
                batch_out = self.gpu_predictor.run([tensor_words])
            else:
                batch_out = self.cpu_predictor.run([tensor_words])
            batch_result = parse_result([text],
                                        batch_out[0],
                                        self.id2label_dict,
                                        interventer=self.custom)

            return batch_result[0]['word']
        elif text == "":
            # Empty string segments to itself.
            return text
        else:
            raise TypeError(
                "The input data is inconsistent with expectations.")
Example #4
0
    def cut(self,
            text,
            use_gpu=False,
            batch_size=1,
            return_tag=True,
            use_device=None):
        """
        The main function that segments an entire text that contains
        Chinese characters into separated words.

        Args:
            text(:obj:`str` or :obj:`List[str]`): The chinese texts to be segmented. This can be a string, a list of strings.
            use_gpu(bool): whether use gpu to predict or not
            batch_size(int): the program deals once with one batch
            return_tag(bool): whether to keep the "tag" entry in each result
            use_device (str): use cpu, gpu, xpu or npu; overrides the use_gpu flag

        Returns:
            results(dict or list): The word segmentation result of the input text, whose key is 'word', if text is a list.
                If text is a str, the word segmentation result (list) is obtained.

        Raises:
            Exception: if use_device names an unsupported device
            TypeError: if text is neither a non-empty list nor a str
        """
        # Pick the real predictor: an explicit use_device wins over use_gpu.
        if use_device is not None:
            if use_device == "cpu":
                predictor = self.cpu_predictor
            elif use_device == "xpu":
                predictor = self.xpu_predictor
            elif use_device == "npu":
                predictor = self.npu_predictor
            elif use_device == "gpu":
                predictor = self.gpu_predictor
            else:
                raise Exception("Unsupported device: " + use_device)
        else:
            # use_device is not set, therefore follow use_gpu
            if use_gpu:
                predictor = self.gpu_predictor
            else:
                predictor = self.cpu_predictor

        if isinstance(text, list) and len(text) != 0:

            predicted_data = self.to_unicode(text)

            # Drop the empty strings like "" in predicted_data, remembering
            # their positions so placeholders can be re-inserted afterwards.
            empty_str_indexes = self._get_index(predicted_data)
            predicted_data = [item for item in predicted_data if item != ""]

            start_idx = 0
            iteration = int(math.ceil(len(predicted_data) / batch_size))
            results = []
            for i in range(iteration):
                if i < (iteration - 1):
                    batch_data = predicted_data[start_idx:(start_idx +
                                                           batch_size)]
                else:
                    # Last batch takes whatever remains (may be < batch_size).
                    batch_data = predicted_data[start_idx:]

                start_idx = start_idx + batch_size
                batch_out = self._internal_predict(predictor, batch_data)
                batch_result = parse_result(batch_data,
                                            batch_out,
                                            self.id2label_dict,
                                            interventer=self.custom)
                results += batch_result

            # Re-insert placeholder entries for the dropped empty strings so
            # the output aligns index-for-index with the input.
            for index in empty_str_indexes:
                results.insert(index, {"word": [""], "tag": [""]})

            if not return_tag:
                # pop mutates each dict in place; the return value is discarded.
                for result in results:
                    result.pop("tag")

            return results
        elif isinstance(text, str) and text != "":
            # Single string: run it as a one-element batch and return only
            # the word list.
            batch_out = self._internal_predict(predictor, [text])
            batch_result = parse_result([text],
                                        batch_out,
                                        self.id2label_dict,
                                        interventer=self.custom)

            return batch_result[0]['word']
        elif text == "":
            # Empty string segments to itself.
            return text
        else:
            raise TypeError(
                "The input data is inconsistent with expectations.")