Example #1
0
    def __init__(
        self,
        model_dir: str,
        dest: Optional[str] = None,
        guards: Optional[List[Guard]] = None,
        debug: bool = False,
        threshold: float = 0.1,
        use_cuda: bool = False,
        score_round_off: int = 5,
        purpose: str = const.TRAIN,
        fallback_label: str = const.ERROR_LABEL,
        use_state: bool = False,
        data_column: str = const.DATA,
        label_column: str = const.LABELS,
        state_column: str = const.STATE,
        args_map: Optional[Dict[str, Any]] = None,
        skip_labels: Optional[List[str]] = None,
        kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Configure the plugin and load a previously saved model when one exists.

        The classifier class is resolved dynamically from the module named by
        ``const.XLMR_MODULE`` so that the dependency is only required when this
        plugin is actually instantiated.

        :param model_dir: Directory that holds the model artifacts; the label-encoder
            file is looked up as ``const.LABELENCODER_FILE`` inside it.
        :type model_dir: str
        :param dest: Forwarded to the parent initializer.
        :type dest: Optional[str]
        :param guards: Forwarded to the parent initializer.
        :type guards: Optional[List[Guard]]
        :param debug: Forwarded to the parent initializer.
        :type debug: bool
        :param threshold: Stored as ``self.threshold``.
        :type threshold: float
        :param use_cuda: Stored as ``self.use_cuda``.
        :type use_cuda: bool
        :param score_round_off: Stored as ``self.round``.
        :type score_round_off: int
        :param purpose: Stored as ``self.purpose``.
        :type purpose: str
        :param fallback_label: Stored as ``self.fallback_label``.
        :type fallback_label: str
        :param use_state: Stored as ``self.use_state``.
        :type use_state: bool
        :param data_column: Stored as ``self.data_column``.
        :type data_column: str
        :param label_column: Stored as ``self.label_column``.
        :type label_column: str
        :param state_column: Stored as ``self.state_column``.
        :type state_column: str
        :param args_map: Optional configuration mapping. When non-empty it must contain
            the keys ``const.TRAIN``, ``const.TEST`` and ``const.PRODUCTION``.
        :type args_map: Optional[Dict[str, Any]]
        :param skip_labels: Labels stored (as a set) in ``self.skip_labels``.
        :type skip_labels: Optional[List[str]]
        :param kwargs: Extra options stored in ``self.kwargs``; defaults to an empty dict.
        :type kwargs: Optional[Dict[str, Any]]
        :raises ModuleNotFoundError: If the module named by ``const.XLMR_MODULE`` cannot be imported.
        :raises ValueError: If ``args_map`` is non-empty but lacks one of the required keys.
        """
        try:
            classifier = getattr(importlib.import_module(const.XLMR_MODULE),
                                 const.XLMR_MULTI_CLASS_MODEL)
        except ModuleNotFoundError as error:
            raise ModuleNotFoundError(
                "Plugin requires simpletransformers -- https://simpletransformers.ai/docs/installation/"
            ) from error

        super().__init__(dest=dest, guards=guards, debug=debug)
        self.labelencoder = preprocessing.LabelEncoder()
        self.classifier = classifier
        self.model: Any = None
        self.model_dir = model_dir
        self.fallback_label = fallback_label
        self.data_column = data_column
        self.label_column = label_column
        self.state_column = state_column
        self.use_cuda = use_cuda
        self.use_state = use_state
        self.labelencoder_file_path = os.path.join(self.model_dir,
                                                   const.LABELENCODER_FILE)
        self.threshold = threshold
        self.skip_labels = set(skip_labels or set())
        self.purpose = purpose
        self.round = score_round_off

        if args_map:
            required_keys = (const.TRAIN, const.TEST, const.PRODUCTION)
            if any(key not in args_map for key in required_keys):
                # Both fragments must be f-strings, otherwise the required key
                # names are printed as literal "{const.TRAIN}" placeholders.
                raise ValueError(
                    f"Attempting to set invalid {args_map=}. "
                    f"It is missing some of {const.TRAIN}, {const.TEST}, {const.PRODUCTION} in configs."
                )
        self.args_map = args_map
        self.kwargs = kwargs or {}

        # Only attempt to restore a model when a label-encoder file is present;
        # an EOFError while loading is logged instead of aborting construction.
        try:
            if os.path.exists(self.labelencoder_file_path):
                self.init_model()
        except EOFError:
            logger.error(
                f"Plugin {self.__class__.__name__} Failed to load labelencoder from {self.labelencoder_file_path}. "
                "Ignore this message if you are training but if you are using this in production or testing, then this is serious!"
            )
Example #2
0
    def inference(self, texts: Optional[List[str]]) -> List[Intent]:
        """
        Rank the known intents for the given texts.

        Only the first row returned by ``predict_proba`` is used, so the
        ranking reflects the first entry of ``texts``.

        :param texts: A list of strings, derived from ASR transcripts.
        :type texts: List[str]
        :raises AttributeError: When the loaded model is neither a sklearn
            ``Pipeline`` nor a ``GridSearchCV`` instance.
        :return: Intents sorted by descending score, or a single fallback intent
            when there is no model, no input, or the model is not fitted.
        :rtype: List[Intent]
        """
        logger.debug(f"Classifier input:\n{texts}")
        default_intent = Intent(name=self.fallback_label,
                                score=1.0).add_parser(self)

        if self.model_pipeline is None:
            logger.error(
                f"No model found for plugin {self.__class__.__name__}!")
            return [default_intent]

        if not texts:
            return [default_intent]

        if not isinstance(self.model_pipeline, (Pipeline, GridSearchCV)):
            raise AttributeError(
                f"Seems like you forgot to save the {self.__class__.__name__} plugin."
            )

        try:
            first_row_scores = self.model_pipeline.predict_proba(texts)[0]
            scored_labels = zip(first_row_scores, self.model_pipeline.classes_)
            # Highest probability first; ties keep the order of `classes_`.
            ranked = sorted(scored_labels,
                            key=lambda pair: pair[0],
                            reverse=True)
        except NotFittedError:
            logger.error(f"{self.__class__.__name__} model not trained yet!")
            return [default_intent]

        intents = []
        for probability, label in ranked:
            intent = Intent(name=label, score=round(probability, self.round))
            intents.append(intent.add_parser(self))
        return intents
Example #3
0
    def validate(self, training_data: pd.DataFrame) -> bool:
        """
        Check that the training dataframe is usable.

        The dataframe must be non-empty and contain both the data column and
        the label column configured on this instance.

        :param training_data: A pandas dataframe containing at least list of strings and corresponding labels.
        :type training_data: pd.DataFrame
        :return: True if the dataframe is valid, False otherwise.
        :rtype: bool
        """
        if training_data.empty:
            logger.error("Training dataframe is empty.")
            return False

        required = (self.data_column, self.label_column)
        absent = [name for name in required if name not in training_data.columns]
        if absent:
            # Only the first missing column is reported.
            logger.warning(f"Column {absent[0]} not found in training data")
            return False
        return True
Example #4
0
    def __init__(
        self,
        model_dir: str,
        dest: Optional[str] = None,
        guards: Optional[List[Guard]] = None,
        debug: bool = False,
        threshold: float = 0.1,
        score_round_off: int = 5,
        purpose: str = const.TRAIN,
        fallback_label: str = const.ERROR_LABEL,
        data_column: str = const.DATA,
        label_column: str = const.LABELS,
        args_map: Optional[Dict[str, Any]] = None,
        skip_labels: Optional[List[str]] = None,
        kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Configure the plugin and load a previously saved model when one exists.

        :param model_dir: Directory that holds the model artifacts; the model
            file is looked up as ``const.MLPMODEL_FILE`` inside it.
        :type model_dir: str
        :param dest: Forwarded to the parent initializer.
        :type dest: Optional[str]
        :param guards: Forwarded to the parent initializer.
        :type guards: Optional[List[Guard]]
        :param debug: Forwarded to the parent initializer.
        :type debug: bool
        :param threshold: Stored as ``self.threshold``.
        :type threshold: float
        :param score_round_off: Stored as ``self.round``.
        :type score_round_off: int
        :param purpose: Stored as ``self.purpose``.
        :type purpose: str
        :param fallback_label: Stored as ``self.fallback_label``.
        :type fallback_label: str
        :param data_column: Stored as ``self.data_column``.
        :type data_column: str
        :param label_column: Stored as ``self.label_column``.
        :type label_column: str
        :param args_map: Optional configuration mapping. When non-empty it must contain
            the keys ``const.TRAIN``, ``const.TEST`` and ``const.PRODUCTION``.
        :type args_map: Optional[Dict[str, Any]]
        :param skip_labels: Labels stored (as a set) in ``self.skip_labels``.
        :type skip_labels: Optional[List[str]]
        :param kwargs: Extra options stored in ``self.kwargs``; defaults to an empty dict.
        :type kwargs: Optional[Dict[str, Any]]
        :raises ValueError: If ``args_map`` is non-empty but lacks one of the required keys.
        """
        super().__init__(dest=dest, guards=guards, debug=debug)
        self.model_pipeline: Any = None
        self.fallback_label = fallback_label
        self.data_column = data_column
        self.label_column = label_column
        self.mlp_model_path = os.path.join(model_dir, const.MLPMODEL_FILE)
        self.threshold = threshold
        self.skip_labels = set(skip_labels or set())
        self.purpose = purpose
        self.round = score_round_off

        if args_map:
            required_keys = (const.TRAIN, const.TEST, const.PRODUCTION)
            if any(key not in args_map for key in required_keys):
                # Both fragments must be f-strings, otherwise the required key
                # names are printed as literal "{const.TRAIN}" placeholders.
                raise ValueError(
                    f"Attempting to set invalid {args_map=}. "
                    f"It is missing some of {const.TRAIN}, {const.TEST}, {const.PRODUCTION} in configs."
                )
        self.args_map = args_map
        self.kwargs = kwargs or {}

        # Only attempt to restore a model when the model file is present;
        # an EOFError while loading is logged instead of aborting construction.
        try:
            if os.path.exists(self.mlp_model_path):
                self.init_model()
        except EOFError:
            logger.error(
                f"Plugin {self.__class__.__name__} Failed to load MLPClassifier Model from {self.mlp_model_path}. "
                "Ignore this message if you are training but if you are using this in production or testing, then this is serious!"
            )
Example #5
0
    def _get_entities(
        self,
        text: str,
        locale: str = "en_IN",
        reference_time: Optional[int] = None,
        use_latent: Union[Callable[..., bool], bool] = False,
        sort_idx: int = 0,
    ) -> Dict[str, Any]:
        """
        Query the duckling-server for entities found in ``text``.

        The request is POSTed to ``self.url``. The result is wrapped in a dict
        that carries ``sort_idx`` under ``const.IDX`` and the decoded JSON
        response under ``const.VALUE``. Timeouts and connection errors are
        logged and produce an empty entity list instead of an exception.

        :param text: The sentence or document in which entities must be looked up.
        :type text: str
        :param locale: Locale forwarded to the request-body builder, defaults to "en_IN".
        :type locale: str, optional
        :param reference_time: Cases where relative units of time are mentioned,
                                like "today", "now", etc. We need to know the current time
                                to parse the values into usable dates/times, defaults to None
        :type reference_time: Optional[int], optional
        :param use_latent: Forwarded to the request-body builder, defaults to False.
        :type use_latent: Union[Callable[..., bool], bool], optional
        :param sort_idx: Index echoed back under ``const.IDX`` in the result, defaults to 0.
        :type sort_idx: int, optional
        :raises ValueError: The server answered with a status code other than 200.
        :return: A dict with the sort index and the entities returned by the server.
        :rtype: Dict[str, Any]
        """
        request_body = self.__create_req_body(text,
                                              reference_time=reference_time,
                                              locale=locale,
                                              use_latent=use_latent)

        try:
            reply = self.session.post(self.url,
                                      data=request_body,
                                      headers=self.headers,
                                      timeout=self.timeout)

            if reply.status_code == 200:
                # Successful call: the payload is a (possibly empty) list of dicts.
                entities = reply.json()
                return {const.IDX: sort_idx, const.VALUE: entities}
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError) as request_error:
            # Timeout is tested first so an error that is both a timeout and a
            # connection error is reported as a timeout.
            if isinstance(request_error, requests.exceptions.Timeout):
                logger.error(f"Duckling timed out: {request_error}")
            else:
                logger.error(
                    f"Duckling server is turned off?: {request_error}")
            logger.error(pformat(request_body))
            return {const.IDX: sort_idx, const.VALUE: []}

        # Reaching this point means the server replied with a non-200 status.
        failure_message = (
            f"Duckling API call failed | [{reply.status_code}]: {reply.text}")
        raise ValueError(failure_message)
Example #6
0
    def validate(self, training_data: pd.DataFrame) -> bool:
        """
        Check that the training dataframe is usable.

        The dataframe must be non-empty and contain the configured data and
        label columns, plus the state column when ``use_state`` is enabled.

        :param training_data: A pandas dataframe containing at least list of strings and corresponding labels.
            Should also contain a state key if use_state = True while initializing object.
        :type training_data: pd.DataFrame
        :return: True if the dataframe is valid, False otherwise.
        :rtype: bool
        """
        if training_data.empty:
            logger.error("Training dataframe is empty.")
            return False

        required = (self.data_column, self.label_column)
        if self.use_state:
            required += (self.state_column,)

        present = training_data.columns
        for name in required:
            if name in present:
                continue
            # Stop at the first missing column.
            logger.warning(f"Column {name} not found in training data")
            return False
        return True
Example #7
0
    def inference(self, texts: Optional[List[str]]) -> List[Intent]:
        """
        Predict the intent of a list of texts.

        The logits returned by the model are converted to softmax scores and
        the scores of the first text are used to rank the intents from most
        to least confident.

        :param texts: A list of strings, derived from ASR transcripts.
        :type texts: List[str]
        :raises AttributeError: In case the labelencoder is not available.
        :return: A list of intents ordered by descending confidence, or a single
            fallback intent when there is no input, no model, a text without
            ``<st>`` tags while ``use_state`` is set, or no prediction.
        :rtype: List[Intent]
        """
        logger.debug(f"Classifier input:\n{texts}")
        fallback_output = Intent(name=self.fallback_label,
                                 score=1.0).add_parser(self)
        if not texts:
            logger.error(f"texts passed to model {texts}!")
            return [fallback_output]

        if self.model is None:
            logger.error(
                f"No model found for plugin {self.__class__.__name__}!")
            return [fallback_output]
        if self.use_state:
            for text in texts:
                if "<st>" not in text:
                    # Implicit concatenation keeps source indentation out of
                    # the logged message (a backslash continuation does not).
                    logger.error(
                        f"use_state = True but {text} doesn't contain <st> </st> tags, "
                        "which means that training data is different from this data. "
                        "This can lead to anomalous results"
                    )
                    return [fallback_output]
        if not self.valid_labelencoder:
            raise AttributeError("Seems like you forgot to "
                                 f"save the {self.__class__.__name__} plugin.")

        predictions, logits = self.model.predict(texts)
        # `predictions` may be a numpy array: plain truthiness raises for more
        # than one element and is False for a lone class index 0, so test for
        # emptiness explicitly.
        if predictions is None or np.size(predictions) == 0:
            return [fallback_output]

        # Softmax over each row of logits.
        confidence_scores = [
            np.exp(logit) / sum(np.exp(logit)) for logit in logits
        ]
        # Class indices of the first text, most confident first.
        intents_confidence_order = np.argsort(confidence_scores)[0][::-1]
        predicted_intents = self.labelencoder.inverse_transform(
            intents_confidence_order)
        ordered_confidence_scores = [
            confidence_scores[0][idx] for idx in intents_confidence_order
        ]

        return [
            Intent(name=intent,
                   score=round(score,
                               self.round)).add_parser(self.__class__.__name__)
            for intent, score in zip(predicted_intents,
                                     ordered_confidence_scores)
        ]