def __init__(
    self,
    model_dir: str,
    dest: Optional[str] = None,
    guards: Optional[List[Guard]] = None,
    debug: bool = False,
    threshold: float = 0.1,
    use_cuda: bool = False,
    score_round_off: int = 5,
    purpose: str = const.TRAIN,
    fallback_label: str = const.ERROR_LABEL,
    use_state: bool = False,
    data_column: str = const.DATA,
    label_column: str = const.LABELS,
    state_column: str = const.STATE,
    args_map: Optional[Dict[str, Any]] = None,
    skip_labels: Optional[List[str]] = None,
    kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Configure the plugin and load the model if a saved label encoder exists.

    The classifier class is resolved dynamically at construction time by
    importing ``const.XLMR_MODULE`` and reading ``const.XLMR_MULTI_CLASS_MODEL``
    from it.

    :param model_dir: Directory of the model artifacts. The label encoder is
        expected at ``model_dir`` joined with ``const.LABELENCODER_FILE``.
    :type model_dir: str
    :param dest: Forwarded unchanged to the parent initialiser.
    :type dest: Optional[str]
    :param guards: Forwarded unchanged to the parent initialiser.
    :type guards: Optional[List[Guard]]
    :param debug: Forwarded unchanged to the parent initialiser.
    :type debug: bool
    :param threshold: Stored as ``self.threshold``; not consumed in this method.
    :type threshold: float
    :param use_cuda: Stored as ``self.use_cuda``; not consumed in this method.
    :type use_cuda: bool
    :param score_round_off: Number of digits scores are rounded to, stored as
        ``self.round``.
    :type score_round_off: int
    :param purpose: Stored as ``self.purpose``; presumably one of
        ``const.TRAIN``, ``const.TEST``, ``const.PRODUCTION`` (not enforced here).
    :type purpose: str
    :param fallback_label: Intent name used when no prediction can be made.
    :type fallback_label: str
    :param use_state: Whether the state column / state tags are required.
    :type use_state: bool
    :param data_column: Name of the text column in training data.
    :type data_column: str
    :param label_column: Name of the label column in training data.
    :type label_column: str
    :param state_column: Name of the state column in training data.
    :type state_column: str
    :param args_map: Optional per-purpose arguments. When non-empty it must
        contain the keys ``const.TRAIN``, ``const.TEST`` and
        ``const.PRODUCTION``.
    :type args_map: Optional[Dict[str, Any]]
    :param skip_labels: Labels stored as a set in ``self.skip_labels``.
    :type skip_labels: Optional[List[str]]
    :param kwargs: Extra options stored as ``self.kwargs`` (``{}`` when omitted).
    :type kwargs: Optional[Dict[str, Any]]
    :raises ModuleNotFoundError: If the classifier module cannot be imported.
    :raises ValueError: If a non-empty ``args_map`` lacks one of the required keys.
    """
    try:
        classifier_cls = getattr(
            importlib.import_module(const.XLMR_MODULE),
            const.XLMR_MULTI_CLASS_MODEL,
        )
    except ModuleNotFoundError as error:
        raise ModuleNotFoundError(
            "Plugin requires simpletransformers -- https://simpletransformers.ai/docs/installation/"
        ) from error

    super().__init__(dest=dest, guards=guards, debug=debug)
    self.labelencoder = preprocessing.LabelEncoder()
    self.classifier = classifier_cls
    self.model: Any = None
    self.model_dir = model_dir
    self.fallback_label = fallback_label
    self.data_column = data_column
    self.label_column = label_column
    self.state_column = state_column
    self.use_cuda = use_cuda
    self.use_state = use_state
    self.labelencoder_file_path = os.path.join(self.model_dir, const.LABELENCODER_FILE)
    self.threshold = threshold
    self.skip_labels = set(skip_labels or set())
    self.purpose = purpose
    self.round = score_round_off

    if args_map and (
        const.TRAIN not in args_map
        or const.TEST not in args_map
        or const.PRODUCTION not in args_map
    ):
        # Both fragments must be f-strings: adjacent literals are formatted
        # independently, so a plain second fragment would print the braces verbatim.
        raise ValueError(
            f"Attempting to set invalid {args_map=}. "
            f"It is missing some of {const.TRAIN}, {const.TEST}, {const.PRODUCTION} in configs."
        )
    self.args_map = args_map
    self.kwargs = kwargs or {}

    try:
        # Only attempt to restore a model when a label encoder was saved earlier.
        if os.path.exists(self.labelencoder_file_path):
            self.init_model()
    except EOFError:
        logger.error(
            f"Plugin {self.__class__.__name__} Failed to load labelencoder from {self.labelencoder_file_path}. "
            "Ignore this message if you are training but if you are using this in production or testing, then this is serious!"
        )
def inference(self, texts: Optional[List[str]]) -> List[Intent]:
    """
    Rank every known intent for the given texts using the stored pipeline.

    Only the probabilities of the first row returned by ``predict_proba``
    are used; the result is ordered from most to least probable.

    :param texts: A list of strings, derived from ASR transcripts.
    :type texts: Optional[List[str]]
    :raises AttributeError: If the stored model is neither a sklearn
        ``Pipeline`` nor a ``GridSearchCV``.
    :return: Intents sorted by descending score, or a single fallback intent
        when there is no model, no input, or the model is not fitted.
    :rtype: List[Intent]
    """
    logger.debug(f"Classifier input:\n{texts}")
    fallback_output = Intent(name=self.fallback_label, score=1.0).add_parser(self)
    pipeline = self.model_pipeline

    if pipeline is None:
        logger.error(f"No model found for plugin {self.__class__.__name__}!")
        return [fallback_output]

    if not texts:
        return [fallback_output]

    if not isinstance(pipeline, (Pipeline, GridSearchCV)):
        raise AttributeError(
            "Seems like you forgot to "
            f"save the {self.__class__.__name__} plugin."
        )

    try:
        first_row_probabilities = pipeline.predict_proba(texts)[0]
        class_names = pipeline.classes_
    except NotFittedError:
        logger.error(f"{self.__class__.__name__} model not trained yet!")
        return [fallback_output]

    # Highest probability first; ties keep the pipeline's class order.
    ranked = sorted(
        zip(first_row_probabilities, class_names),
        key=lambda pair: pair[0],
        reverse=True,
    )

    intents = []
    for score, intent in ranked:
        intents.append(
            Intent(name=intent, score=round(score, self.round)).add_parser(self)
        )
    return intents
def validate(self, training_data: pd.DataFrame) -> bool:
    """
    Check that the training dataframe can be used by this plugin.

    The frame must be non-empty and contain both the configured data column
    and the configured label column.

    :param training_data: Dataframe holding the texts and their labels.
    :type training_data: pd.DataFrame
    :return: True when the frame is usable, False otherwise.
    :rtype: bool
    """
    if training_data.empty:
        logger.error("Training dataframe is empty.")
        return False

    available = training_data.columns
    required = (self.data_column, self.label_column)
    absent = [name for name in required if name not in available]
    if absent:
        # Only the first missing column is reported.
        column = absent[0]
        logger.warning(f"Column {column} not found in training data")
        return False
    return True
def __init__(
    self,
    model_dir: str,
    dest: Optional[str] = None,
    guards: Optional[List[Guard]] = None,
    debug: bool = False,
    threshold: float = 0.1,
    score_round_off: int = 5,
    purpose: str = const.TRAIN,
    fallback_label: str = const.ERROR_LABEL,
    data_column: str = const.DATA,
    label_column: str = const.LABELS,
    args_map: Optional[Dict[str, Any]] = None,
    skip_labels: Optional[List[str]] = None,
    kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Configure the plugin and load the model if a saved model file exists.

    :param model_dir: Directory of the model artifacts. The model file is
        expected at ``model_dir`` joined with ``const.MLPMODEL_FILE``.
    :type model_dir: str
    :param dest: Forwarded unchanged to the parent initialiser.
    :type dest: Optional[str]
    :param guards: Forwarded unchanged to the parent initialiser.
    :type guards: Optional[List[Guard]]
    :param debug: Forwarded unchanged to the parent initialiser.
    :type debug: bool
    :param threshold: Stored as ``self.threshold``; not consumed in this method.
    :type threshold: float
    :param score_round_off: Number of digits scores are rounded to, stored as
        ``self.round``.
    :type score_round_off: int
    :param purpose: Stored as ``self.purpose``; presumably one of
        ``const.TRAIN``, ``const.TEST``, ``const.PRODUCTION`` (not enforced here).
    :type purpose: str
    :param fallback_label: Intent name used when no prediction can be made.
    :type fallback_label: str
    :param data_column: Name of the text column in training data.
    :type data_column: str
    :param label_column: Name of the label column in training data.
    :type label_column: str
    :param args_map: Optional per-purpose arguments. When non-empty it must
        contain the keys ``const.TRAIN``, ``const.TEST`` and
        ``const.PRODUCTION``.
    :type args_map: Optional[Dict[str, Any]]
    :param skip_labels: Labels stored as a set in ``self.skip_labels``.
    :type skip_labels: Optional[List[str]]
    :param kwargs: Extra options stored as ``self.kwargs`` (``{}`` when omitted).
    :type kwargs: Optional[Dict[str, Any]]
    :raises ValueError: If a non-empty ``args_map`` lacks one of the required keys.
    """
    super().__init__(dest=dest, guards=guards, debug=debug)
    self.model_pipeline: Any = None
    self.fallback_label = fallback_label
    self.data_column = data_column
    self.label_column = label_column
    self.mlp_model_path = os.path.join(model_dir, const.MLPMODEL_FILE)
    self.threshold = threshold
    self.skip_labels = set(skip_labels or set())
    self.purpose = purpose
    self.round = score_round_off

    if args_map and (
        const.TRAIN not in args_map
        or const.TEST not in args_map
        or const.PRODUCTION not in args_map
    ):
        # Both fragments must be f-strings: adjacent literals are formatted
        # independently, so a plain second fragment would print the braces verbatim.
        raise ValueError(
            f"Attempting to set invalid {args_map=}. "
            f"It is missing some of {const.TRAIN}, {const.TEST}, {const.PRODUCTION} in configs."
        )
    self.args_map = args_map
    self.kwargs = kwargs or {}

    try:
        # Only attempt to restore a model when one was saved earlier.
        if os.path.exists(self.mlp_model_path):
            self.init_model()
    except EOFError:
        logger.error(
            f"Plugin {self.__class__.__name__} Failed to load MLPClassifier Model from {self.mlp_model_path}. "
            "Ignore this message if you are training but if you are using this in production or testing, then this is serious!"
        )
def _get_entities(
    self,
    text: str,
    locale: str = "en_IN",
    reference_time: Optional[int] = None,
    use_latent: Union[Callable[..., bool], bool] = False,
    sort_idx: int = 0,
) -> Dict[str, Any]:
    """
    Ask the duckling server for the entities found in ``text``.

    The request is posted to ``self.url`` using ``self.session``; the server is
    assumed to be running there. Timeouts and connection errors are logged and
    yield an empty entity list instead of an exception.

    :param text: The sentence or document in which entities must be looked up.
    :type text: str
    :param locale: Locale passed on to the request body, defaults to "en_IN".
    :type locale: str
    :param reference_time: Current time used to resolve relative expressions
        such as "today" or "now", defaults to None
    :type reference_time: Optional[int], optional
    :param use_latent: Passed on to the request body unchanged.
    :type use_latent: Union[Callable[..., bool], bool]
    :param sort_idx: Echoed back under ``const.IDX`` in the result.
    :type sort_idx: int
    :raises ValueError: If the server answers with a status other than 200.
    :return: A dict with ``const.IDX`` set to ``sort_idx`` and ``const.VALUE``
        set to the decoded JSON response (or ``[]`` on timeout / connection error).
    :rtype: Dict[str, Any]
    """
    payload = self.__create_req_body(
        text, reference_time=reference_time, locale=locale, use_latent=use_latent
    )

    try:
        response = self.session.post(
            self.url, data=payload, headers=self.headers, timeout=self.timeout
        )
        if response.status_code == 200:
            # Successful call: the JSON body is a list of entity dicts, possibly empty.
            return {const.IDX: sort_idx, const.VALUE: response.json()}
    except requests.exceptions.Timeout as timeout_exception:
        failure_message = f"Duckling timed out: {timeout_exception}"
    except requests.exceptions.ConnectionError as connection_error:
        failure_message = f"Duckling server is turned off?: {connection_error}"
    else:
        # The server responded, but not with 200: surface it to the caller.
        raise ValueError(
            f"Duckling API call failed | [{response.status_code}]: {response.text}"
        )

    # Shared path for the two network failures handled above.
    logger.error(failure_message)
    logger.error(pformat(payload))
    return {const.IDX: sort_idx, const.VALUE: []}
def validate(self, training_data: pd.DataFrame) -> bool:
    """
    Check that the training dataframe can be used by this plugin.

    The frame must be non-empty and contain the configured data and label
    columns, plus the configured state column when ``use_state`` is set.

    :param training_data: Dataframe holding the texts, their labels and,
        if ``use_state`` is True, the state of each row.
    :type training_data: pd.DataFrame
    :return: True when the frame is usable, False otherwise.
    :rtype: bool
    """
    if training_data.empty:
        logger.error("Training dataframe is empty.")
        return False

    required = (
        [self.data_column, self.label_column, self.state_column]
        if self.use_state
        else [self.data_column, self.label_column]
    )
    available = training_data.columns
    absent = [name for name in required if name not in available]
    if absent:
        # Only the first missing column is reported.
        column = absent[0]
        logger.warning(f"Column {column} not found in training data")
        return False
    return True
def inference(self, texts: Optional[List[str]]) -> List[Intent]:
    """
    Rank every known intent for the given texts using the stored model.

    Only the logits of the first row returned by ``self.model.predict`` are
    used; they are turned into softmax scores and the intents are returned
    from most to least confident.

    :param texts: A list of strings, derived from ASR transcripts.
    :type texts: Optional[List[str]]
    :raises AttributeError: In case the labelencoder is not available.
    :return: Intents sorted by descending score, or a single fallback intent
        when there is no input, no model, a text lacks the ``<st>`` tag while
        ``use_state`` is set, or the model returns no predictions.
    :rtype: List[Intent]
    """
    logger.debug(f"Classifier input:\n{texts}")
    fallback_output = Intent(name=self.fallback_label, score=1.0).add_parser(self)

    if not texts:
        logger.error(f"texts passed to model {texts}!")
        return [fallback_output]

    if self.model is None:
        logger.error(f"No model found for plugin {self.__class__.__name__}!")
        return [fallback_output]

    if self.use_state:
        for text in texts:
            if "<st>" not in text:
                # Adjacent literals instead of a backslash continuation, which
                # would embed the continuation characters in the message.
                logger.error(
                    f"use_state = True but {text} doesn't contain <st> </st> tags, "
                    "which means that training data is different from this data. "
                    "This can lead to anomalous results"
                )
                return [fallback_output]

    if not self.valid_labelencoder:
        raise AttributeError(
            "Seems like you forgot to "
            f"save the {self.__class__.__name__} plugin."
        )

    predictions, logits = self.model.predict(texts)
    # `not predictions` is unsafe for NumPy arrays: a single predicted class
    # index of 0 is falsy and longer arrays raise "truth value is ambiguous".
    if predictions is None or np.size(predictions) == 0:
        return [fallback_output]

    # Softmax over the first row only; the other rows were never used.
    exponentials = np.exp(logits[0])
    confidence_scores = exponentials / sum(exponentials)

    intents_confidence_order = np.argsort(confidence_scores)[::-1]
    predicted_intents = self.labelencoder.inverse_transform(intents_confidence_order)
    ordered_confidence_scores = confidence_scores[intents_confidence_order]

    # add_parser receives the plugin itself, matching the fallback intent above.
    return [
        Intent(name=intent, score=round(score, self.round)).add_parser(self)
        for intent, score in zip(predicted_intents, ordered_confidence_scores)
    ]