def generate_dataset(language, *files):
    """Create a Snips NLU dataset from text friendly files.

    Args:
        language: ISO language code of the dataset (e.g. ``"en"``).
        *files: Paths of the dataset files. If any of them ends with
            ``.yml`` or ``.yaml``, all files are parsed as YAML;
            otherwise they are parsed as the plain-text format.

    Side effects:
        Prints the resulting dataset as indented, key-sorted JSON to
        stdout.
    """
    # str.endswith accepts a tuple of suffixes: one call per file
    # instead of two chained checks.
    if any(f.endswith((".yml", ".yaml")) for f in files):
        dataset = Dataset.from_yaml_files(language, list(files))
    else:
        dataset = Dataset.from_files(language, list(files))
    print(json.dumps(dataset.json, indent=2, sort_keys=True))
def test_should_generate_dataset_from_files(self):
    """Dataset.from_files should build a valid dataset from intent and
    entity text files, matching the expected dataset dict."""
    # Given: two intent files and one entity file, with their contents
    # served through a patched pathlib open.
    intent_file_1 = "intent_whoIsGame.txt"
    intent_file_2 = "intent_getWeather.txt"
    entity_file_1 = "entity_location.txt"

    who_is_game_txt = """
who is the [role:role](president) of [country:country](France)
who is the [role:role](CEO) of [company:company](Google) please
"""

    get_weather_txt = """
what is the weather in [weatherLocation:location](Paris)?
is it raining in [weatherLocation] [weatherDate:snips/datetime]
"""

    location_txt = """
new york,big apple
london
"""

    # Map each fake path to the content it should expose when opened.
    file_contents = {
        intent_file_1: who_is_game_txt,
        intent_file_2: get_weather_txt,
        entity_file_1: location_txt,
    }

    # pylint:disable=unused-argument
    def mock_open(self_, *args, **kwargs):
        content = file_contents.get(str(self_))
        return None if content is None else io.StringIO(content)
    # pylint:enable=unused-argument

    dataset_files = [intent_file_1, intent_file_2, entity_file_1]

    # When
    with patch("pathlib.io") as mock_io:
        mock_io.open.side_effect = mock_open
        dataset = Dataset.from_files("en", dataset_files)
        dataset_dict = dataset.json

    # When / Then
    validate_and_format_dataset(dataset_dict)
    self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict)