def __init__(self, resource_name, backend, language_name):
    """Locate the training files, pick a tokenizer and load the examples.

    :param resource_name: file or folder handed to ``resolve_data_files``
        to discover the training data files.
    :param backend: backend name used to select the tokenizer:
        ``'mitie'`` / ``'mitie_sklearn'`` -> ``MITIETokenizer``,
        ``'spacy_sklearn'`` -> ``SpacyTokenizer``, anything else falls
        back to ``WhitespaceTokenizer`` and emits a warning.
    :param language_name: stored on the instance and passed to
        ``SpacyTokenizer`` when that tokenizer is selected.
    :raises ValueError: if the files cannot be resolved, or if the guessed
        file format is not one of ``luis``, ``wit``, ``api``, ``rasa_nlu``.
    """
    self.intent_examples = []
    self.entity_examples = []
    self.resource_name = resource_name
    # Go through the class's own wrapper rather than calling util directly,
    # so an invalid path is reported with the "Invalid training data file /
    # folder specified." prefix.
    self.files = self.resolve_data_files(resource_name)
    self.fformat = self.guess_format(self.files)
    self.language_name = language_name

    # Tokenizer modules are imported inside the branch that needs them, so
    # only the module for the selected backend is imported.
    if backend in ['mitie', 'mitie_sklearn']:
        from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
        self.tokenizer = MITIETokenizer()
    elif backend in ['spacy_sklearn']:
        from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
        self.tokenizer = SpacyTokenizer(language_name)
    else:
        from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
        self.tokenizer = WhitespaceTokenizer()
        warnings.warn(
            "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace"
        )

    # Only the 'api' loader receives the full file list; every other
    # format is read from the first discovered file.
    if self.fformat == 'luis':
        self.load_luis_data(self.files[0])
    elif self.fformat == 'wit':
        self.load_wit_data(self.files[0])
    elif self.fformat == 'api':
        self.load_api_data(self.files)
    elif self.fformat == 'rasa_nlu':
        self.load_data(self.files[0])
    else:
        raise ValueError("unknown training file format : {0}".format(
            self.fformat))
def resolve_data_files(self, resource_name):
    """Return the training data files found under ``resource_name``.

    Thin wrapper around ``util.recursively_find_files`` that re-raises its
    ``ValueError`` with a message prefix pointing at the training data
    path.

    :param resource_name: file or folder to search, passed unchanged to
        ``util.recursively_find_files``.
    :return: whatever ``util.recursively_find_files`` returns.
    :raises ValueError: if ``util.recursively_find_files`` raises
        ``ValueError``; the original message is appended to the prefix.
    """
    try:
        return util.recursively_find_files(resource_name)
    except ValueError as e:
        # ``except X as e`` and ``str(e)`` work on both Python 2.6+ and 3;
        # the previous ``except X, e`` / ``e.message`` spelling is
        # Python-2-only.
        raise ValueError(
            "Invalid training data file / folder specified. " + str(e))