Example #1
from typing import Iterable, Text

from rasa.nlu.training_data import TrainingData


def training_data_from_paths(paths: Iterable[Text],
                             language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    # Load each NLU file, merge everything into a single TrainingData
    # object, and resolve the response phrases for retrieval intents.
    training_datas = [
        loading.load_data(nlu_file, language) for nlu_file in paths
    ]
    merged_training_data = TrainingData().merge(*training_datas)
    merged_training_data.fill_response_phrases()
    return merged_training_data
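
A minimal usage sketch for the helper above; the NLU file paths and the printed count are illustrative assumptions, not part of the original snippet:

nlu_paths = ["data/nlu.md", "data/faq.md"]  # hypothetical example files
merged = training_data_from_paths(nlu_paths, language="en")
print(len(merged.training_examples))  # total examples across the merged files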
Example #2
    async def get_nlu_data(self,
                           language: Optional[Text] = "en") -> TrainingData:
        # Use the fake_data_count configured for rasam.PlaceholderImporter,
        # falling back to the class default.
        fake_data_count = self.DEFAULT_FAKE_DATA_COUNT

        for importer in self.config["importers"]:
            if importer.get("name") == "rasam.PlaceholderImporter":
                fake_data_count = importer.get("fake_data_count",
                                               self.DEFAULT_FAKE_DATA_COUNT)

        # Seed Faker so the placeholder replacement is deterministic.
        faker_ = faker.Faker()
        faker_.seed_instance(fake_data_count)

        training_data = [
            loading.load_data(nlu_file, language)
            for nlu_file in self._nlu_files
        ]

        new_training_data = []

        # Expand examples whose text contains placeholders into multiple
        # messages filled with fake values; keep other examples unchanged.
        for data in training_data:
            training_examples = []
            example: Message
            for example in data.training_examples:
                if example.get("intent"):
                    matches = [
                        i async for i in self.find_placeholders(example.text)
                    ]
                    if matches:
                        async for new_message in self.replace_placeholders(
                                example, faker_, matches, fake_data_count):
                            training_examples.append(new_message)
                    else:
                        training_examples.append(example)
                else:
                    training_examples.append(example)
            new_training_data.append(
                TrainingData(training_examples, data.entity_synonyms,
                             data.regex_features, data.lookup_tables,
                             data.nlg_stories))

        merged_training_data = TrainingData().merge(*new_training_data)
        merged_training_data.fill_response_phrases()
        return merged_training_data
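
The loop at the top of this method looks up its settings in self.config["importers"]. A minimal sketch of the expected shape, inferred from that lookup (the fake_data_count value is an arbitrary illustration; consult the rasam documentation for the authoritative keys):

config = {
    "importers": [
        {"name": "rasam.PlaceholderImporter", "fake_data_count": 50},
    ],
}

Note that the same number also seeds Faker via seed_instance, so a given configuration always yields the same generated examples.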
Example #3
import os
from typing import Optional, Text

import rasa.utils.io as io_utils


def load_data(resource_name: Text, language: Optional[Text] = "en") -> "TrainingData":
    """Load training data from disk.

    Merges the data if `resource_name` is a directory with multiple files."""
    from rasa.nlu.training_data import TrainingData

    if not os.path.exists(resource_name):
        raise ValueError(f"File '{resource_name}' does not exist.")

    # `_load` is a helper defined in the same module; it parses one file and
    # returns a TrainingData object (or None if the format is unsupported).
    files = io_utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    if training_data.nlg_stories:
        training_data.fill_response_phrases()

    return training_data
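
load_data accepts either a single file or a directory; in the directory case every file returned by io_utils.list_files is loaded and merged. A minimal sketch, assuming a hypothetical directory of NLU files:

td = load_data("data/nlu/", language="en")
print(td.intents)  # the set of intents across all merged files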
Example #4
    def import_data(self, faq_path) -> None:
        config_path = os.path.join("bots/es-faq", "config.yml")

        importer = FAQImporter(config_path)
        importer.faq_paths = [faq_path]
        importer.synonym_paths = []
        importer.augment_with_synonyms = False
        importer.timezone = "Asia/Taipei"
        training_examples, nlg_stories, entity_synonyms = importer._load_files()
        training_data = TrainingData(training_examples=training_examples,
                                     entity_synonyms=entity_synonyms,
                                     regex_features=[],
                                     lookup_tables=[],
                                     nlg_stories=nlg_stories)
        training_data.fill_response_phrases()

        # training_data = load_data("data/train_zh_cn.json")
        # td_responses = load_data("data/responses_zh_cn.md")
        # training_data = training_data.merge(td_responses)
        # training_data.fill_response_phrases()

        # train/test split: keep only the FAQ examples, group them by
        # response key, then split each group 80/20 below.
        training_examples = [
            ex for ex in training_data.training_examples
            if ex.get('intent') == 'faq'
        ]
        examples_per_response_key = defaultdict(list)
        for ex in training_examples:
            examples_per_response_key[ex.get('response_key')].append(ex)

        for intent, examples in examples_per_response_key.items():
            random.Random(1).shuffle(examples)  # fixed seed for a reproducible split
            # max(1, ...) keeps at least one training example per response key.
            split_offset = max(1, int(0.8 * len(examples)))
            self.train_exs.extend(examples[:split_offset])
            self.test_exs.extend(examples[split_offset:])
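
The split logic above can be isolated into a standalone sketch; plain strings stand in for the Message objects (an assumption made for brevity):

from collections import defaultdict
import random

examples_per_key = defaultdict(list)
for key, text in [("faq/hours", "when do you open"),
                  ("faq/hours", "what are your opening hours"),
                  ("faq/price", "how much does it cost")]:
    examples_per_key[key].append(text)

train_exs, test_exs = [], []
for key, examples in examples_per_key.items():
    random.Random(1).shuffle(examples)               # fixed seed => reproducible split
    split_offset = max(1, int(0.8 * len(examples)))  # at least one training example per key
    train_exs.extend(examples[:split_offset])
    test_exs.extend(examples[split_offset:])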