Beispiel #1
0
class DucklingExtractor(Component):
    """Adds entity normalization by analyzing found entities and transforming them into regular formats.

    Entities found by earlier pipeline steps are matched against duckling's
    parse results by their exact character span. Matching entities get their
    ``value`` replaced by duckling's value; in ``"append"`` mode, duckling
    matches without a counterpart are added as new entities.
    """

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    def __init__(self, duckling_processing_mode, duckling=None):
        # type: (Text, Optional[DucklingWrapper]) -> None
        """Store the processing mode and an optional, already created wrapper.

        :param duckling_processing_mode: mode name; ``create`` validates it
            against ``DUCKLING_PROCESSING_MODES``, the constructor does not.
        :param duckling: existing wrapper to reuse; when ``None`` the wrapper
            is created in ``pipeline_init``.
        """
        self.duckling_processing_mode = duckling_processing_mode
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        """Return the names of the python packages this component needs."""
        return ["duckling"]

    @classmethod
    def create(cls, duckling_processing_mode):
        # type: (Text) -> DucklingExtractor
        """Validate the processing mode and build a new extractor.

        :raises ValueError: if the mode is not in ``DUCKLING_PROCESSING_MODES``.
        """
        if duckling_processing_mode not in DUCKLING_PROCESSING_MODES:
            raise ValueError(
                "Invalid duckling processing mode. Got '{}'. Allowed: {}".
                format(duckling_processing_mode,
                       ", ".join(DUCKLING_PROCESSING_MODES)))

        # Instantiate through ``cls`` so subclasses get an instance of their
        # own type back from ``create`` / ``load``.
        return cls(duckling_processing_mode)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        """Return a key built from the component name and the model language."""
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text) -> None
        """Create the duckling wrapper for ``language`` unless one is already set.

        :raises Exception: wrapping the ``ValueError`` raised by the wrapper
            constructor.
        """
        # Imported lazily so the module can be loaded without duckling installed.
        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                self.duckling = DucklingWrapper(
                    language=language
                )  # languages in duckling are eg "de$core"
            except ValueError as e:
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]]) -> Dict[Text, Any]
        """Normalize ``entities`` in place using duckling's parse of ``text``.

        For every duckling match, the first entity with the same ``start`` and
        ``end`` gets its ``value`` overwritten and a ``duckling`` key holding
        the matched dimension. Unmatched duckling results are appended as new
        entities only when the processing mode is ``"append"``.

        :returns: ``{"entities": entities}`` with the same (mutated) list.
        """
        if self.duckling is not None:
            for duckling_match in self.duckling.parse(text):
                for entity in entities:
                    same_span = (entity["start"] == duckling_match["start"]
                                 and entity["end"] == duckling_match["end"])
                    if same_span:
                        entity["value"] = duckling_match["value"]["value"]
                        entity["duckling"] = duckling_match["dim"]
                        break
                else:
                    # No existing entity covers exactly this span.
                    if self.duckling_processing_mode == "append":
                        # Duckling will retrieve multiple entities, even if they overlap..
                        # hence the append mode might add some noise to the found entities.
                        # Entities appended here take part in the span lookup
                        # for the remaining duckling matches.
                        entities.append({
                            "entity": duckling_match["dim"],
                            "duckling": duckling_match["dim"],
                            "value": duckling_match["value"]["value"],
                            "start": duckling_match["start"],
                            "end": duckling_match["end"],
                        })

        return {"entities": entities}

    @classmethod
    def load(cls, duckling_processing_mode):
        # type: (Text) -> DucklingExtractor
        """Recreate the extractor from its processing mode via ``create``."""
        return cls.create(duckling_processing_mode)
Beispiel #2
0
class DucklingExtractor(EntityExtractor):
    """Adds entity normalization by analyzing found entities and transforming them into regular formats.

    Every duckling match whose dimension is among the configured
    ``dimensions`` is added to the message's entities.
    """

    name = "ner_duckling"

    context_provides = {
        "process": ["entities"],
    }

    output_provides = ["entities"]

    @staticmethod
    def available_dimensions():
        # type: () -> List[Text]
        """Return the values of all non-dunder members of ``duckling.dim.Dim``."""
        # Imported lazily so the module can be loaded without duckling installed.
        from duckling.dim import Dim
        return [
            m[1] for m in getmembers(Dim)
            if not m[0].startswith("__") and not m[0].endswith("__")
        ]

    def __init__(self, dimensions=None, duckling=None):
        # type: (Optional[List[Text]], Optional[DucklingWrapper]) -> None
        """Store the dimensions to extract and an optional existing wrapper.

        :param dimensions: dimensions to keep; a falsy value (``None`` or an
            empty list) selects every available dimension.
        :param duckling: existing wrapper to reuse; when ``None`` the wrapper
            is created in ``pipeline_init``.
        """
        self.dimensions = dimensions if dimensions else self.available_dimensions(
        )
        self.duckling = duckling

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        """Return the names of the python packages this component needs."""
        return ["duckling"]

    @classmethod
    def create(cls, duckling_dimensions):
        # type: (Optional[List[Text]]) -> DucklingExtractor
        """Validate the requested dimensions and build a new extractor.

        :param duckling_dimensions: dimensions to extract, or ``None`` for all.
        :raises ValueError: if any requested dimension is not available.
        """
        # Look the dimensions up once; every call imports duckling and
        # reflects over ``Dim``.
        available = cls.available_dimensions()
        if duckling_dimensions is None:
            duckling_dimensions = available
        unknown_dimensions = [
            dim for dim in duckling_dimensions if dim not in available
        ]
        if unknown_dimensions:
            raise ValueError(
                "Invalid duckling dimension. Got '{}'. Allowed: {}".format(
                    ", ".join(unknown_dimensions),
                    ", ".join(available)))

        # Instantiate through ``cls`` so subclasses get an instance of their
        # own type back from ``create`` / ``load``.
        return cls(duckling_dimensions)

    @classmethod
    def cache_key(cls, model_metadata):
        # type: (Metadata) -> Text
        """Return a key built from the component name and the model language."""
        return cls.name + "-" + model_metadata.language

    def pipeline_init(self, language):
        # type: (Text) -> None
        """Create the duckling wrapper for ``language`` unless one is already set.

        :raises Exception: wrapping the ``ValueError`` raised by the wrapper
            constructor.
        """
        from duckling import DucklingWrapper

        if self.duckling is None:
            try:
                self.duckling = DucklingWrapper(
                    language=language
                )  # languages in duckling are eg "de$core"
            except ValueError as e:  # pragma: no cover
                raise Exception("Duckling error. {}".format(e))

    def process(self, text, entities):
        # type: (Text, List[Dict[Text, Any]]) -> Dict[Text, Any]
        """Append duckling matches for the configured dimensions to ``entities``.

        ``entities`` is extended in place; without a wrapper nothing is added.

        :returns: ``{"entities": entities}`` with the same (mutated) list.
        """
        extracted = []
        if self.duckling is not None:
            extracted = [
                {
                    "start": match["start"],
                    "end": match["end"],
                    "text": match["text"],
                    "value": match["value"],
                    "entity": match["dim"]
                }
                for match in self.duckling.parse(text)
                if match["dim"] in self.dimensions
            ]

        extracted = self.add_extractor_name(extracted)
        entities.extend(extracted)
        return {"entities": entities}

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        """Write the configured dimensions to ``<name>.json`` inside ``model_dir``.

        :returns: metadata mapping ``"ner_duckling_persisted"`` to the file name.
        """
        file_name = self.name + ".json"
        full_name = os.path.join(model_dir, file_name)
        # Write with the same encoding that ``load`` reads with instead of the
        # platform default.
        with io.open(full_name, 'w', encoding='utf-8') as f:
            f.write(str(json.dumps({"dimensions": self.dimensions})))
        return {"ner_duckling_persisted": file_name}

    @classmethod
    def load(cls, model_dir, ner_duckling_persisted):
        # type: (Text, Text) -> Optional[DucklingExtractor]
        """Recreate the extractor from the file written by ``persist``.

        :param model_dir: directory containing the persisted file.
        :param ner_duckling_persisted: file name, relative to ``model_dir``.
        :returns: the extractor, or ``None`` when the file does not exist.
        """
        persisted = os.path.join(model_dir, ner_duckling_persisted)
        if os.path.isfile(persisted):
            with io.open(persisted, encoding='utf-8') as f:
                persisted_data = json.loads(f.read())
                return cls.create(persisted_data["dimensions"])
            # NOTE(review): the statements that create `ent`, `sentence`, `txt`,
            # `df`, `middle`, `end` and `train_data` are not visible in this
            # excerpt; the comments below describe only what these lines do.
            # Finish the current entity: label it 'product', attach it to the
            # sentence and add its text (plus a trailing space) to the sentence.
            ent['value'] = txt
            ent['entity'] = 'product'
            sentence['entities'].append(ent)
            sentence['text'] += txt + " "

            # Keep adding entities while a fresh random draw exceeds 0.5.
            while random.random() > .5:
                # Connector text picked at random from `middle`.
                m = random.choice(middle)
                sentence['text'] += m
                # First column of one randomly sampled row
                # (presumably `df` is a pandas DataFrame -- TODO confirm).
                txt = df.sample().iloc[0, 0]
                ent = dict()
                # Span of `txt` within the sentence: it starts at the current
                # end of the text and covers len(txt) characters.
                ent['start'] = len(sentence['text'])
                ent['end'] = len(sentence['text'] + txt)
                ent['value'] = txt
                ent['entity'] = 'product'
                sentence['entities'].append(ent)
                sentence['text'] += txt + " "

            # Close the sentence with a random ending and store it as an example.
            sentence['text'] += random.choice(end)
            train_data['rasa_nlu_data']["common_examples"].append(sentence)

    # Dump the collected training data as JSON into result.json.
    with open('result.json', 'w+') as fp:
        json.dump(train_data, fp)

    # NOTE(review): `container` is not used in the visible lines.
    container = IntentContainer('intent_cache')

    # Exercise the duckling wrapper on a few sample sentences.
    d = DucklingWrapper()
    # The result of this parse is discarded.
    d.parse('Bring me 250 ml sugar')
    # NOTE(review): bare attribute access without a call -- looks like an
    # unfinished statement; confirm whether it should be removed.
    d.parse_
    print(d.parse_time(u'Let\'s meet at 11:45am'))
    print(d.parse_number(u'Bring me one conserve of ravioli'))
    print(d.parse_quantity(u'Bring me 100 g of sugar'))