Example #1
def get_operator(self, filter_type: Any) -> Any:
    try:
        return getattr(operator, filter_type)
    except (AttributeError, TypeError) as exception:
        logger.debug(traceback.format_exc())
        raise ValueError(
            f"Expected datetime_filters to be one of {self.FUTURE}, {self.PAST} "
            "or a valid comparison operator here: https://docs.python.org/3/library/operator.html"
        ) from exception
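For orientation, a minimal sketch of what this helper resolves; it is standalone and involves only the standard-library operator module:

import operator

# "ge" resolves to operator.ge, i.e. a >= b.
compare = getattr(operator, "ge")
assert compare(5, 3) is True

# An unknown name raises AttributeError, which get_operator
# re-raises as the more descriptive ValueError above.
try:
    getattr(operator, "not_an_operator")
except AttributeError:
    print("no such operator")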
Example #2
    def inference(self, texts: Optional[List[str]]) -> List[Intent]:
        """
        Predict the intent of a list of texts.

        :param texts: A list of strings, derived from ASR transcripts.
        :type texts: Optional[List[str]]
        :raises AttributeError: In case the labelencoder is not available.
        :return: A list of intents corresponding to texts.
        :rtype: List[Intent]
        """
        logger.debug(f"Classifier input:\n{texts}")
        fallback_output = Intent(name=self.fallback_label,
                                 score=1.0).add_parser(self)
        if not texts:
            logger.error(f"texts passed to model {texts}!")
            return [fallback_output]

        if self.model is None:
            logger.error(
                f"No model found for plugin {self.__class__.__name__}!")
            return [fallback_output]
        if self.use_state:
            for text in texts:
                if "<st>" not in text:
                    logger.error(
                        f"use_state = True but {text} doesn't contain <st> </st> tags, "
                        "which means this data differs from the training data. "
                        "This can lead to anomalous results."
                    )
                    return [fallback_output]
        if not self.valid_labelencoder:
            raise AttributeError("Seems like you forgot to "
                                 f"save the {self.__class__.__name__} plugin.")

        predictions, logits = self.model.predict(texts)
        if not predictions:
            return [fallback_output]

        confidence_scores = [
            np.exp(logit) / sum(np.exp(logit)) for logit in logits
        ]
        intents_confidence_order = np.argsort(confidence_scores)[0][::-1]
        predicted_intents = self.labelencoder.inverse_transform(
            intents_confidence_order)
        ordered_confidence_scores = [
            confidence_scores[0][idx] for idx in intents_confidence_order
        ]

        return [
            Intent(name=intent,
                   score=round(score,
                               self.round)).add_parser(self.__class__.__name__)
            for intent, score in zip(predicted_intents,
                                     ordered_confidence_scores)
        ]
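A minimal sketch of the confidence computation above, assuming a single input text and made-up logit values (numpy only):

import numpy as np

logits = np.array([[2.0, 0.5, 1.0]])  # hypothetical model output, one row per text

# Softmax, matching the list comprehension above.
probs = np.exp(logits[0]) / np.sum(np.exp(logits[0]))

# Label indices ordered from most to least confident,
# as in intents_confidence_order.
order = np.argsort(probs)[::-1]
print(order)         # [0 2 1]
print(probs[order])  # confidence scores in descending order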
Example #3
    def inference(self, texts: Optional[List[str]]) -> List[Intent]:
        """
        Predict the intent of a list of texts.

        :param texts: A list of strings, derived from ASR transcripts.
        :type texts: Optional[List[str]]
        :raises AttributeError: In case the model isn't an sklearn Pipeline or GridSearchCV.
        :return: A list of intents corresponding to texts.
        :rtype: List[Intent]
        """
        logger.debug(f"Classifier input:\n{texts}")
        fallback_output = Intent(name=self.fallback_label,
                                 score=1.0).add_parser(self)

        if self.model_pipeline is None:
            logger.error(
                f"No model found for plugin {self.__class__.__name__}!")
            return [fallback_output]

        if not texts:
            return [fallback_output]

        if not isinstance(self.model_pipeline, (Pipeline, GridSearchCV)):
            raise AttributeError("Seems like you forgot to "
                                 f"save the {self.__class__.__name__} plugin.")

        try:
            probs_and_classes = sorted(
                zip(
                    self.model_pipeline.predict_proba(texts)[0],
                    self.model_pipeline.classes_,
                ),
                key=operator.itemgetter(0),
                reverse=True,
            )
        except NotFittedError:
            logger.error(f"{self.__class__.__name__} model not trained yet!")
            return [fallback_output]

        return [
            Intent(name=intent, score=round(score,
                                            self.round)).add_parser(self)
            for score, intent in probs_and_classes
        ]
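For context, a self-contained sketch of the predict_proba sorting above; the toy pipeline and training strings are made up, but any fitted sklearn Pipeline exposing predict_proba and classes_ behaves the same way:

import operator

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())])
pipeline.fit(["hello there", "bye for now"], ["greeting", "farewell"])

# Pair each class probability with its label, most likely intent first.
probs_and_classes = sorted(
    zip(pipeline.predict_proba(["hello again"])[0], pipeline.classes_),
    key=operator.itemgetter(0),
    reverse=True,
)
print(probs_and_classes)  # e.g. [(0.5..., 'greeting'), (0.4..., 'farewell')]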
Example #4
    def transform(self, training_data: pd.DataFrame) -> pd.DataFrame:
        """
        Transform training data.

        :param training_data: Training data.
        :type training_data: pd.DataFrame
        :return: Transformed training data.
        :rtype: pd.DataFrame
        """
        if not self.use_transform:
            return training_data

        logger.debug(f"Transforming dataset via {self.__class__.__name__}")
        logger.disable("dialogy")
        training_data = training_data.copy()
        if self.output_column not in training_data.columns:
            training_data[self.output_column] = None

        for i, row in tqdm(training_data.iterrows(), total=len(training_data)):
            reference_time = row[self.reference_time_column]
            if isinstance(reference_time, str):
                reference_time = int(
                    datetime.fromisoformat(reference_time).timestamp() * 1000)
            elif pd.isna(reference_time):
                continue
            elif not isinstance(reference_time, int):
                raise TypeError(
                    f"{reference_time=} should be isoformat date or unix timestamp integer."
                )
            transcripts = self.make_transform_values(row[self.input_column])
            entities = self.extract(
                transcripts,
                # Detect the language from the row's transcripts,
                # not from the column name stored in self.input_column.
                lang_detect_from_text(" ".join(transcripts)),
                reference_time=reference_time,
                use_latent=self.activate_latent_entities,
            )
            if row[self.output_column] is None or pd.isnull(
                    row[self.output_column]):
                training_data.at[i, self.output_column] = entities
            else:
                training_data.at[i, self.output_column] = (
                    row[self.output_column] + entities)
        logger.enable("dialogy")
        return training_data
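The reference-time handling above accepts either an ISO-format string or milliseconds since the Unix epoch; a small sketch of the conversion, with a made-up date:

from datetime import datetime

reference_time = "2022-02-07T19:39:39+05:30"  # hypothetical dataframe value

# Same conversion as in transform: ISO string -> Unix milliseconds.
milliseconds = int(datetime.fromisoformat(reference_time).timestamp() * 1000)
print(milliseconds)  # 1644242979000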
Example #5
    def train(self, training_data: pd.DataFrame) -> None:
        """
        Train an intent-classifier on the provided training data.

        The training is skipped if the data format is not valid.

        :param training_data: A pandas dataframe containing at least a list of strings and their corresponding labels.
        :type training_data: pd.DataFrame
        """
        if not self.validate(training_data):
            logger.warning(
                f"Training dataframe is invalid, for {self.__class__.__name__} plugin."
            )
            return

        if self.valid_mlpmodel:
            logger.warning(f"Model already exists on {self.mlp_model_path}")
            return

        skip_labels_filter = training_data[self.label_column].isin(
            self.skip_labels)
        training_data = training_data[~skip_labels_filter].copy()

        self.labels_num = training_data[self.label_column].nunique()
        sample_size = min(5, len(training_data))
        training_data.rename(
            columns={
                self.data_column: const.TEXT,
                self.label_column: const.LABELS
            },
            inplace=True,
        )
        args = self.init_model()
        logger.debug(
            f"Displaying a few samples (this goes into the model):\n{training_data.sample(sample_size)}\nLabels: {self.labels_num}."
        )
        self.model_pipeline.fit(training_data[const.TEXT],
                                training_data[const.LABELS])
        USE = "use"
        if args and args[const.USE_GRIDSEARCH][USE]:
            logger.debug("Best gridsearch params found:\n" + str("\n".join(
                str(items)
                for items in self.model_pipeline.best_params_.items())))
        self.save()
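For context, a small sketch of the skip-label filtering step above, with a made-up dataframe and skip list:

import pandas as pd

training_data = pd.DataFrame(
    {"data": ["hello", "uh huh", "goodbye"],
     "labels": ["greeting", "noise", "farewell"]}
)
skip_labels = ["noise"]

# Same pattern as in train: keep only rows whose label is not skipped.
skip_labels_filter = training_data["labels"].isin(skip_labels)
training_data = training_data[~skip_labels_filter].copy()
print(training_data)  # the "noise" row is dropped before fitting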
Example #6
    def __create_req_body(
        self,
        text: str,
        reference_time: Optional[int] = None,
        locale: str = "en_IN",
        use_latent: Union[Callable[..., bool], bool] = False,
    ) -> Dict[str, Any]:
        """
        Create the request body for entity parsing.

        example: "3 people tomorrow"

        Make your own reference time from the current timestamp with :code:`int(datetime.now().timestamp() * 1000)`.
        These are the milliseconds since the `Unix epoch <https://en.wikipedia.org/wiki/Unix_time>`_.

        :param text: A sentence or document.
        :type text: str
        :param reference_time: Impart context of timestamp,
            relevant for time-related entities. Resolves relative time like "yesterday", "next month", etc.
        :type reference_time: Optional[int]
        :param locale: Locale string for the Duckling API, defaults to "en_IN".
        :type locale: str
        :param use_latent: Whether latent entities should be extracted, defaults to False.
        :type use_latent: Union[Callable[..., bool], bool]
        :return: request object for Duckling API.
        :rtype: Dict[str, Any]
        """
        dimensions = self.dimensions
        self.activate_latent_entities = use_latent or self.activate_latent_entities
        activate_latent_entities = (self.activate_latent_entities
                                    if isinstance(
                                        self.activate_latent_entities, bool)
                                    else self.activate_latent_entities())

        payload = {
            "text": text,
            "locale": locale or self.locale,
            "tz": self.__set_timezone(),
            "dims": json.dumps(dimensions),
            "reftime": reference_time,
            "latent": activate_latent_entities,
        }
        logger.debug("Duckling API payload:")
        logger.debug(pformat(payload))

        return payload
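For illustration, roughly what the returned payload could look like for the docstring's "3 people tomorrow" example. The timezone and dimensions here are assumed values; the field names come from the code above:

import json
from datetime import datetime

reference_time = int(datetime.now().timestamp() * 1000)  # ms since the Unix epoch

payload = {
    "text": "3 people tomorrow",
    "locale": "en_IN",
    "tz": "Asia/Kolkata",                     # assumed __set_timezone() result
    "dims": json.dumps(["people", "time"]),   # assumed configured dimensions
    "reftime": reference_time,
    "latent": False,
}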
Example #7
    def train(self, training_data: pd.DataFrame) -> None:
        """
        Train an intent-classifier on the provided training data.

        The training is skipped if the data format is not valid.

        :param training_data: A pandas dataframe containing at least a list of strings and their corresponding labels.
        :type training_data: pd.DataFrame
        """
        if not self.validate(training_data):
            logger.warning(
                f"Training dataframe is invalid, for {self.__class__.__name__} plugin."
            )
            return

        skip_labels_filter = training_data[self.label_column].isin(
            self.skip_labels)
        training_data = training_data[~skip_labels_filter].copy()

        encoder = self.labelencoder.fit(training_data[self.label_column])
        sample_size = min(5, len(training_data))
        training_data.rename(
            columns={
                self.data_column: const.TEXT,
                self.label_column: const.LABELS
            },
            inplace=True,
        )
        training_data.loc[:, const.LABELS] = encoder.transform(
            training_data[const.LABELS])
        # Add state as an additional field to text
        if self.use_state:
            training_data[
                const.TEXT] += "<st> " + training_data["state"] + " </st>"
        training_data = training_data[[const.TEXT, const.LABELS]]
        self.init_model(len(encoder.classes_))
        logger.debug(
            f"Displaying a few samples (this goes into the model):\n{training_data.sample(sample_size)}\nLabels: {len(encoder.classes_)}."
        )
        self.model.train_model(training_data)
        self.save()
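A small sketch of the <st> state tagging above, with hypothetical column names and values (const.TEXT is assumed to be "text"):

import pandas as pd

training_data = pd.DataFrame(
    {"text": ["i want to pay my bill"], "state": ["BILLING"]}
)

# Same concatenation as in train when use_state is True.
training_data["text"] += "<st> " + training_data["state"] + " </st>"
print(training_data["text"][0])  # i want to pay my bill<st> BILLING </st>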
Example #8
    def _reshape(self,
                 entities_json: List[Dict[str, Any]],
                 alternative_index: int = 0) -> List[BaseEntity]:
        """
        Create a list of :ref:`BaseEntity <base_entity>` objects from a list of entity dicts.

        :param entities_json: List of entities derived from Duckling's API.
        :type entities_json: List[Dict[str, Any]]
        :raises NotImplementedError: Raised when dimensions not supported by the project are used.
        :raises KeyError: Expected keys in entity dict don't match the Entity class.
        :return: A list of objects subclassed from :ref:`BaseEntity <base_entity>`
        :rtype: List[BaseEntity]
        """
        entity_object_list: List[BaseEntity] = []
        duckling_entity: BaseEntity
        try:
            # For each entity dict:
            for entity in entities_json:
                if entity[EntityKeys.DIM] == EntityKeys.CREDIT_CARD_NUMBER:
                    cls = self.dimension_entity_map[entity[EntityKeys.DIM]][
                        EntityKeys.VALUE]
                    duckling_entity = cls.from_dict(entity)
                    duckling_entity.add_parser(self)
                    duckling_entity.alternative_index = alternative_index
                    # Collect the entity object in a list.
                    entity_object_list.append(duckling_entity)

                elif entity[EntityKeys.VALUE][EntityKeys.TYPE] in [
                        EntityKeys.VALUE,
                        EntityKeys.DURATION,
                        EntityKeys.INTERVAL,
                ]:
                    # We can auto convert dict forms of duckling entities to dialogy entity classes only if they are
                    # known in advance. We currently support only the types in the condition above.
                    if entity[EntityKeys.VALUE][
                            EntityKeys.TYPE] == EntityKeys.INTERVAL:
                        # Duckling entities with interval type have a different structure for value(s):
                        # they express values in a "from"/"to" format.
                        cls = self.dimension_entity_map[entity[
                            EntityKeys.DIM]][EntityKeys.INTERVAL]
                    else:
                        cls = self.dimension_entity_map[entity[
                            EntityKeys.DIM]][EntityKeys.VALUE]
                        # The most appropriate class is picked for making an object from the dict.
                    duckling_entity = cls.from_dict(entity)
                    duckling_entity.add_parser(self)
                    # Depending on the type of entity, the value is searched and filled.
                    duckling_entity.set_value()
                    duckling_entity.alternative_index = alternative_index
                    # Collect the entity object in a list.
                    entity_object_list.append(duckling_entity)
                else:
                    # Raised only if an unsupported `dimension` is used.
                    raise NotImplementedError(
                        f"Entities with value.type {entity['value']['type']} are"
                        " not implemented. Report this"
                        " issue here: https://github.com/Vernacular-ai/dialogy/issues"
                    )
        except KeyError as key_error:
            # Being wary of structural changes in the API or entity dicts.
            # Under normal circumstances this error shouldn't be raised.
            logger.debug(traceback.format_exc())
            raise KeyError(
                f"Missing key {key_error} in entity {entity}.") from key_error
        return entity_object_list
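For reference, a hedged sketch of the entity dicts _reshape consumes; the shape mirrors Duckling's documented JSON output, with made-up values:

# A "value"-type entity, e.g. for the number in "3 people":
number_entity = {
    "body": "3",
    "start": 0,
    "end": 1,
    "dim": "number",
    "latent": False,
    "value": {"type": "value", "value": 3},
}

# An "interval"-type entity expresses its value as a from/to pair:
interval_entity = {
    "body": "from 2 to 4 pm",
    "start": 0,
    "end": 14,
    "dim": "time",
    "latent": False,
    "value": {
        "type": "interval",
        "from": {"value": "2021-01-01T14:00:00.000+05:30", "grain": "hour"},
        "to": {"value": "2021-01-01T16:00:00.000+05:30", "grain": "hour"},
    },
}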