def get_operator(self, filter_type: Any) -> Any:
    """
    Resolve a filter name to a comparison function from the stdlib :code:`operator` module.
    """
    try:
        return getattr(operator, filter_type)
    except (AttributeError, TypeError) as exception:
        logger.debug(traceback.format_exc())
        raise ValueError(
            f"Expected datetime_filters to be one of {self.FUTURE}, {self.PAST} "
            "or a valid comparison operator here: https://docs.python.org/3/library/operator.html"
        ) from exception
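# A minimal sketch of the lookup above, assuming the stdlib `operator` module:
# filter names map directly to comparison functions, and unknown names raise
# AttributeError, which get_operator wraps in a ValueError.
#
# >>> import operator
# >>> getattr(operator, "ge")(5, 3)   # resolves to operator.ge
# True
# >>> getattr(operator, "lt")(5, 3)
# False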
def inference(self, texts: Optional[List[str]]) -> List[Intent]:
    """
    Predict the intent of a list of texts.

    :param texts: A list of strings, derived from ASR transcripts.
    :type texts: List[str]
    :raises AttributeError: In case the labelencoder is not available.
    :return: A list of intents corresponding to texts.
    :rtype: List[Intent]
    """
    logger.debug(f"Classifier input:\n{texts}")
    fallback_output = Intent(name=self.fallback_label, score=1.0).add_parser(self)

    if not texts:
        logger.error(f"texts passed to model {texts}!")
        return [fallback_output]

    if self.model is None:
        logger.error(f"No model found for plugin {self.__class__.__name__}!")
        return [fallback_output]

    if self.use_state:
        for text in texts:
            if "<st>" not in text:
                logger.error(
                    f"use_state = True but {text} doesn't contain <st> </st> tags, "
                    "which means that this data differs from the training data. "
                    "This can lead to anomalous results."
                )
                return [fallback_output]

    if not self.valid_labelencoder:
        raise AttributeError(
            f"Seems like you forgot to save the {self.__class__.__name__} plugin."
        )

    predictions, logits = self.model.predict(texts)
    if not predictions:
        return [fallback_output]

    # Softmax over the logits yields per-class confidence scores.
    confidence_scores = [np.exp(logit) / sum(np.exp(logit)) for logit in logits]
    # Order class indices by descending confidence for the first text.
    intents_confidence_order = np.argsort(confidence_scores)[0][::-1]
    predicted_intents = self.labelencoder.inverse_transform(intents_confidence_order)
    ordered_confidence_scores = [
        confidence_scores[0][idx] for idx in intents_confidence_order
    ]

    return [
        Intent(name=intent, score=round(score, self.round)).add_parser(self.__class__.__name__)
        for intent, score in zip(predicted_intents, ordered_confidence_scores)
    ]
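# A minimal sketch of the confidence computation above, assuming numpy:
# softmax turns logits into probabilities, and argsort(...)[::-1] orders
# class indices from most to least confident.
#
# >>> import numpy as np
# >>> logits = np.array([1.0, 3.0, 2.0])
# >>> scores = np.exp(logits) / sum(np.exp(logits))
# >>> scores.round(2)
# array([0.09, 0.67, 0.24])
# >>> np.argsort(scores)[::-1]        # most confident class first
# array([1, 2, 0])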
def inference(self, texts: Optional[List[str]]) -> List[Intent]:
    """
    Predict the intent of a list of texts.

    :param texts: A list of strings, derived from ASR transcripts.
    :type texts: List[str]
    :raises AttributeError: In case the model isn't an sklearn Pipeline or GridSearchCV.
    :return: A list of intents corresponding to texts.
    :rtype: List[Intent]
    """
    logger.debug(f"Classifier input:\n{texts}")
    fallback_output = Intent(name=self.fallback_label, score=1.0).add_parser(self)

    if self.model_pipeline is None:
        logger.error(f"No model found for plugin {self.__class__.__name__}!")
        return [fallback_output]

    if not texts:
        return [fallback_output]

    if not isinstance(self.model_pipeline, (Pipeline, GridSearchCV)):
        raise AttributeError(
            f"Seems like you forgot to save the {self.__class__.__name__} plugin."
        )

    try:
        # Pair each class with its probability and sort by descending confidence.
        probs_and_classes = sorted(
            zip(
                self.model_pipeline.predict_proba(texts)[0],
                self.model_pipeline.classes_,
            ),
            key=operator.itemgetter(0),
            reverse=True,
        )
    except NotFittedError:
        logger.error(f"{self.__class__.__name__} model not trained yet!")
        return [fallback_output]

    return [
        Intent(name=intent, score=round(score, self.round)).add_parser(self)
        for score, intent in probs_and_classes
    ]
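# A minimal sketch of the probability/class pairing above, assuming an
# sklearn Pipeline (the pipeline and training data here are hypothetical):
# `predict_proba(texts)[0]` scores only the first text, and `classes_`
# aligns index-for-index with those probabilities.
#
# >>> from sklearn.feature_extraction.text import TfidfVectorizer
# >>> from sklearn.linear_model import LogisticRegression
# >>> from sklearn.pipeline import Pipeline
# >>> clf = Pipeline([("tfidf", TfidfVectorizer()), ("lr", LogisticRegression())])
# >>> clf = clf.fit(["book a cab", "cancel my ride"], ["_book_", "_cancel_"])
# >>> sorted(zip(clf.predict_proba(["book a cab"])[0], clf.classes_), reverse=True)[0][1]
# '_book_'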
def transform(self, training_data: pd.DataFrame) -> pd.DataFrame:
    """
    Transform training data.

    :param training_data: Training data.
    :type training_data: pd.DataFrame
    :return: Transformed training data.
    :rtype: pd.DataFrame
    """
    if not self.use_transform:
        return training_data

    logger.debug(f"Transforming dataset via {self.__class__.__name__}")
    logger.disable("dialogy")
    training_data = training_data.copy()
    if self.output_column not in training_data.columns:
        training_data[self.output_column] = None

    for i, row in tqdm(training_data.iterrows(), total=len(training_data)):
        reference_time = row[self.reference_time_column]
        if isinstance(reference_time, str):
            # Normalize ISO-format dates to unix timestamps in milliseconds.
            reference_time = int(
                datetime.fromisoformat(reference_time).timestamp() * 1000
            )
        elif pd.isna(reference_time):
            continue
        elif not isinstance(reference_time, int):
            raise TypeError(
                f"{reference_time=} should be an isoformat date or a unix timestamp integer."
            )

        transcripts = self.make_transform_values(row[self.input_column])
        entities = self.extract(
            transcripts,
            # Detect the language from the row's input text, not the column name.
            lang_detect_from_text(row[self.input_column]),
            reference_time=reference_time,
            use_latent=self.activate_latent_entities,
        )
        if row[self.output_column] is None or pd.isnull(row[self.output_column]):
            training_data.at[i, self.output_column] = entities
        else:
            training_data.at[i, self.output_column] = (
                row[self.output_column] + entities
            )
    logger.enable("dialogy")
    return training_data
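# A minimal sketch of the reference-time normalization above: ISO-format
# strings become unix timestamps in milliseconds, which is what Duckling's
# `reftime` field expects.
#
# >>> from datetime import datetime
# >>> int(datetime.fromisoformat("2022-01-01T00:00:00+00:00").timestamp() * 1000)
# 1640995200000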
def train(self, training_data: pd.DataFrame) -> None:
    """
    Train an intent-classifier on the provided training data.

    The training is skipped if the data-format is not valid.

    :param training_data: A pandas dataframe containing at least a list of
        strings and corresponding labels.
    :type training_data: pd.DataFrame
    """
    if not self.validate(training_data):
        logger.warning(
            f"Training dataframe is invalid for the {self.__class__.__name__} plugin."
        )
        return

    if self.valid_mlpmodel:
        logger.warning(f"Model already exists at {self.mlp_model_path}")
        return

    # Drop rows whose labels should be skipped during training.
    skip_labels_filter = training_data[self.label_column].isin(self.skip_labels)
    training_data = training_data[~skip_labels_filter].copy()

    self.labels_num = training_data[self.label_column].nunique()
    sample_size = min(5, len(training_data))
    training_data.rename(
        columns={self.data_column: const.TEXT, self.label_column: const.LABELS},
        inplace=True,
    )

    args = self.init_model()
    logger.debug(
        f"Displaying a few samples (this goes into the model):\n"
        f"{training_data.sample(sample_size)}\nLabels: {self.labels_num}."
    )

    self.model_pipeline.fit(training_data[const.TEXT], training_data[const.LABELS])
    if args and args[const.USE_GRIDSEARCH]["use"]:
        logger.debug(
            "Best gridsearch params found:\n"
            + "\n".join(str(items) for items in self.model_pipeline.best_params_.items())
        )
    self.save()
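# A minimal sketch of the label filtering above, assuming pandas: rows whose
# label is in `skip_labels` are dropped before training.
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"label": ["_oos_", "_book_", "_cancel_"]})
# >>> df[~df["label"].isin({"_oos_"})]["label"].tolist()
# ['_book_', '_cancel_']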
def __create_req_body(
    self,
    text: str,
    reference_time: Optional[int] = None,
    locale: str = "en_IN",
    use_latent: Union[Callable[..., bool], bool] = False,
) -> Dict[str, Any]:
    """
    Create a request body for entity parsing.

    Example: "3 people tomorrow"

    Make your own reference time from the current timestamp using:
    :code:`int(datetime.now().timestamp() * 1000)`.
    These are the milliseconds since the
    `Unix epoch <https://en.wikipedia.org/wiki/Unix_time>`_.

    :param text: A sentence or document.
    :type text: str
    :param reference_time: Timestamp context, relevant for time-related entities.
        Resolves relative times like "yesterday", "next month", etc.
    :type reference_time: Optional[int]
    :return: Request object for the Duckling API.
    :rtype: Dict[str, Any]
    """
    dimensions = self.dimensions
    self.activate_latent_entities = use_latent or self.activate_latent_entities
    # The flag may be a plain bool or a callable that produces one.
    activate_latent_entities = (
        self.activate_latent_entities
        if isinstance(self.activate_latent_entities, bool)
        else self.activate_latent_entities()
    )

    payload = {
        "text": text,
        "locale": locale or self.locale,
        "tz": self.__set_timezone(),
        "dims": json.dumps(dimensions),
        "reftime": reference_time,
        "latent": activate_latent_entities,
    }
    logger.debug("Duckling API payload:")
    logger.debug(pformat(payload))
    return payload
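# A hypothetical sketch of the payload produced above for "3 people tomorrow",
# assuming dimensions ["people", "time"] and an IST timezone; the field names
# (text, locale, tz, dims, reftime, latent) follow Duckling's HTTP API.
#
# {
#     "text": "3 people tomorrow",
#     "locale": "en_IN",
#     "tz": "Asia/Kolkata",
#     "dims": '["people", "time"]',
#     "reftime": 1640995200000,
#     "latent": False,
# }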
def train(self, training_data: pd.DataFrame) -> None:
    """
    Train an intent-classifier on the provided training data.

    The training is skipped if the data-format is not valid.

    :param training_data: A pandas dataframe containing at least a list of
        strings and corresponding labels.
    :type training_data: pd.DataFrame
    """
    if not self.validate(training_data):
        logger.warning(
            f"Training dataframe is invalid for the {self.__class__.__name__} plugin."
        )
        return

    skip_labels_filter = training_data[self.label_column].isin(self.skip_labels)
    training_data = training_data[~skip_labels_filter].copy()

    encoder = self.labelencoder.fit(training_data[self.label_column])
    sample_size = min(5, len(training_data))
    training_data.rename(
        columns={self.data_column: const.TEXT, self.label_column: const.LABELS},
        inplace=True,
    )
    training_data.loc[:, const.LABELS] = encoder.transform(training_data[const.LABELS])

    # Add state as an additional field to text.
    if self.use_state:
        training_data[const.TEXT] += "<st> " + training_data["state"] + " </st>"

    training_data = training_data[[const.TEXT, const.LABELS]]
    self.init_model(len(encoder.classes_))
    logger.debug(
        f"Displaying a few samples (this goes into the model):\n"
        f"{training_data.sample(sample_size)}\nLabels: {len(encoder.classes_)}."
    )
    self.model.train_model(training_data)
    self.save()
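# A minimal sketch of the label encoding above, assuming sklearn's
# LabelEncoder: string labels become contiguous integer ids, and
# `classes_` retains the inverse mapping used later at inference.
#
# >>> from sklearn.preprocessing import LabelEncoder
# >>> encoder = LabelEncoder().fit(["_book_", "_cancel_", "_book_"])
# >>> encoder.transform(["_book_", "_cancel_"]).tolist()
# [0, 1]
# >>> list(encoder.classes_)
# ['_book_', '_cancel_']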
def _reshape(
    self,
    entities_json: List[Dict[str, Any]],
    alternative_index: int = 0,
) -> List[BaseEntity]:
    """
    Create a list of :ref:`BaseEntity <base_entity>` objects from a list of entity dicts.

    :param entities_json: List of entities derived from Duckling's API.
    :type entities_json: List[Dict[str, Any]]
    :raises NotImplementedError: Raised when dimensions not supported by the project are used.
    :raises KeyError: Expected keys in entity dict don't match the Entity class.
    :return: A list of objects subclassed from :ref:`BaseEntity <base_entity>`.
    :rtype: List[BaseEntity]
    """
    entity_object_list: List[BaseEntity] = []
    duckling_entity: BaseEntity
    try:
        for entity in entities_json:
            if entity[EntityKeys.DIM] == EntityKeys.CREDIT_CARD_NUMBER:
                cls = self.dimension_entity_map[entity[EntityKeys.DIM]][EntityKeys.VALUE]
                duckling_entity = cls.from_dict(entity)
                duckling_entity.add_parser(self)
                duckling_entity.alternative_index = alternative_index
                # Collect the entity object in a list.
                entity_object_list.append(duckling_entity)
            elif entity[EntityKeys.VALUE][EntityKeys.TYPE] in [
                EntityKeys.VALUE,
                EntityKeys.DURATION,
                EntityKeys.INTERVAL,
            ]:
                # Dict forms of duckling entities can be auto-converted to dialogy
                # entity classes only if they are known in advance. Only the types
                # in the condition above are currently supported.
                if entity[EntityKeys.VALUE][EntityKeys.TYPE] == EntityKeys.INTERVAL:
                    # Duckling entities with interval type have a different structure
                    # for value(s): they express bounds in a "from", "to" format.
                    cls = self.dimension_entity_map[entity[EntityKeys.DIM]][EntityKeys.INTERVAL]
                else:
                    cls = self.dimension_entity_map[entity[EntityKeys.DIM]][EntityKeys.VALUE]
                # The most appropriate class is picked for making an object from the dict.
                duckling_entity = cls.from_dict(entity)
                duckling_entity.add_parser(self)
                # Depending on the type of entity, the value is searched and filled.
                duckling_entity.set_value()
                duckling_entity.alternative_index = alternative_index
                # Collect the entity object in a list.
                entity_object_list.append(duckling_entity)
            else:
                # Raised only if an unsupported `dimension` is used.
                raise NotImplementedError(
                    f"Entities with value.type {entity['value']['type']} are"
                    " not implemented. Report this"
                    " issue here: https://github.com/Vernacular-ai/dialogy/issues"
                )
    except KeyError as key_error:
        # Being wary of structural changes in the API or entity dicts.
        # Under normal circumstances this error shouldn't be raised.
        logger.debug(traceback.format_exc())
        raise KeyError(f"Missing key {key_error} in entity {entity}.") from key_error
    return entity_object_list
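# A hypothetical, abbreviated sketch of the two entity-dict shapes handled
# above, following Duckling's API output. A plain value type carries a single
# "value", while an interval type nests "from"/"to" bounds, which is why a
# separate entity class is selected for intervals.
#
# {"dim": "time", "value": {"type": "value", "value": "2022-01-02T00:00:00+05:30"}}
# {"dim": "time", "value": {"type": "interval",
#                           "from": {"value": "2022-01-01T18:00:00+05:30"},
#                           "to": {"value": "2022-01-01T22:00:00+05:30"}}}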