Example #1
0
    def __init__(self, resource_name, backend, language_name):
        """Collect training files under *resource_name* and load them.

        Picks a tokenizer based on *backend* and a loader based on the
        guessed training-data file format.
        """
        self.intent_examples = []
        self.entity_examples = []
        self.resource_name = resource_name
        self.files = util.recursively_find_files(resource_name)
        self.fformat = self.guess_format(self.files)
        self.tokenizer = None
        self.language_name = language_name

        # Both mitie backends share one tokenizer; spacy needs the language;
        # anything unrecognised falls back to whitespace splitting.
        if backend in ['mitie', 'mitie_sklearn']:
            from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
            self.tokenizer = MITIETokenizer()
        elif backend in ['spacy_sklearn']:
            from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
            self.tokenizer = SpacyTokenizer(language_name)
        else:
            from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
            self.tokenizer = WhitespaceTokenizer()
            warnings.warn(
                "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace"
            )

        # Dispatch table from detected format to the matching loader call.
        # 'api' format spans multiple files; the others read a single file.
        loaders = {
            'luis': lambda: self.load_luis_data(self.files[0]),
            'wit': lambda: self.load_wit_data(self.files[0]),
            'api': lambda: self.load_api_data(self.files),
            'rasa_nlu': lambda: self.load_data(self.files[0]),
        }
        if self.fformat not in loaders:
            raise ValueError("unknown training file format : {0}".format(
                self.fformat))
        loaders[self.fformat]()
 def __init__(self, intent_classifier_file=None, entity_extractor_file=None, feature_extractor_file=None, **kwargs):
     """Load persisted MITIE components.

     All file paths are optional keyword arguments. Previously
     ``intent_classifier_file`` defaulted to None but was opened
     unconditionally (TypeError), and ``self.extractor`` was never set
     when no extractor file was given; both are now guarded, matching
     the other interpreter constructors in this module.
     """
     self.extractor = None
     self.classifier = None
     if entity_extractor_file:
         self.extractor = named_entity_extractor(entity_extractor_file)  # ,metadata["feature_extractor"])
     if intent_classifier_file:
         # The classifier was serialized with cloudpickle; load it back
         # the same way.
         with open(intent_classifier_file, 'rb') as f:
             self.classifier = cloudpickle.load(f)
     self.featurizer = MITIEFeaturizer(feature_extractor_file)
     self.tokenizer = MITIETokenizer()
Example #3
0
 def __init__(self,
              intent_classifier=None,
              entity_extractor=None,
              entity_synonyms=None):
     """Wrap already-built components; only the tokenizer is created here."""
     self.classifier = intent_classifier
     self.extractor = entity_extractor
     self.ent_synonyms = entity_synonyms
     self.tokenizer = MITIETokenizer()
Example #4
0
def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tk = MITIETokenizer()

    # These comparisons were bare expressions, so a tokenizer regression
    # could never make the test fail; assert the expected tokens explicitly.
    assert tk.tokenize(u"Hi. My name is rasa") == [
        u'Hi', u'My', u'name', u'is', u'rasa'
    ]
    assert tk.tokenize(u"ὦ ἄνδρες ᾿Αθηναῖοι.") == [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']
Example #5
0
 def __init__(self,
              intent_classifier=None,
              entity_extractor=None,
              feature_extractor=None,
              **kwargs):
     """Build MITIE entity extractor and intent classifier from model files.

     Both components are constructed against the same shared feature
     extractor file.
     """
     self.tokenizer = MITIETokenizer()
     self.classifier = text_categorizer(intent_classifier, feature_extractor)
     self.extractor = named_entity_extractor(entity_extractor, feature_extractor)
Example #6
0
def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tk = MITIETokenizer()

    # These comparisons were bare expressions, so a tokenizer regression
    # could never make the test fail; assert the expected values explicitly.
    assert tk.tokenize(u"Hi. My name is rasa") == [
        u'Hi', u'My', u'name', u'is', u'rasa'
    ]
    assert tk.tokenize(u"ὦ ἄνδρες ᾿Αθηναῖοι.") == [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']
    assert tk.tokenize_with_offsets(u"Forecast for lunch") == ([
        u'Forecast', u'for', u'lunch'
    ], [0, 9, 13])
Example #7
0
 def find_entity(cls, ent, text):
     """Map a character-offset entity onto token indices.

     Returns a (start, end) pair of token positions; raises ValueError
     when the entity start does not align with a token boundary.
     """
     tokenizer = MITIETokenizer()
     tokens, offsets = tokenizer.tokenize_with_offsets(text)
     ent_start = ent["start"]
     try:
         start = offsets.index(ent_start)
     except ValueError:
         message = u"invalid entity {0} in example {1}:".format(ent, text) + \
             u" entities must span whole tokens"
         raise ValueError(message)
     # End index = start plus however many tokens the entity text spans.
     entity_text = text[ent_start:ent["end"]]
     end = start + len(tokenize(entity_text))
     return start, end
Example #8
0
 def init_tokenizer(self, backend, nlp):
     """Attach the tokenizer matching *backend*; unknown backends fall
     back to whitespace splitting with a warning."""
     if backend in [mitie.MITIE_BACKEND_NAME, mitie.MITIE_SKLEARN_BACKEND_NAME]:
         from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
         self.tokenizer = MITIETokenizer()
         return
     if backend in [spacy.SPACY_BACKEND_NAME]:
         from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
         self.tokenizer = SpacyTokenizer(nlp)
         return
     from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
     self.tokenizer = WhitespaceTokenizer()
     warnings.warn(
         "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")
Example #9
0
def test_mitie_featurizer():
    from rasa_nlu.featurizers.mitie_featurizer import MITIEFeaturizer

    # Prefer the extractor path from the environment; otherwise fall back
    # to the checked-in feature extractor file.
    filename = os.environ.get('MITIE_FILE')
    if not filename or not os.path.isfile(filename):
        filename = "data/total_word_feature_extractor.dat"

    featurizer = MITIEFeaturizer(filename)
    tokens = MITIETokenizer().tokenize("Hey how are you today")
    vecs = featurizer.features_for_tokens(tokens)
    expected = np.array([0., -4.4551446, 0.26073121, -1.46632245, -1.84205751])
    assert np.allclose(vecs[:5], expected, atol=1e-5)
Example #10
0
def test_mitie():
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tk = MITIETokenizer()

    # Plain tokenization, including non-ASCII Greek text.
    token_cases = [
        (u"Hi. My name is rasa", [u'Hi', u'My', u'name', u'is', u'rasa']),
        (u"ὦ ἄνδρες ᾿Αθηναῖοι", [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']),
    ]
    for text, expected in token_cases:
        assert tk.tokenize(text) == expected

    # Tokenization with character offsets, including accented input.
    offset_cases = [
        (u"Forecast for lunch",
         ([u'Forecast', u'for', u'lunch'], [0, 9, 13])),
        (u"hey ńöñàśçií how're you?",
         ([u'hey', u'ńöñàśçií', u'how', u'\'re', 'you', '?'],
          [0, 4, 13, 16, 20, 23])),
    ]
    for text, expected in offset_cases:
        assert tk.tokenize_with_offsets(text) == expected
Example #11
0
 def __init__(self,
              intent_classifier=None,
              entity_extractor=None,
              feature_extractor=None,
              entity_synonyms=None,
              **kwargs):
     """Construct MITIE components from their model files.

     Every component is optional; the corresponding attribute stays
     ``None`` when its file is not given.
     """
     self.extractor = None
     self.classifier = None
     if entity_extractor:
         self.extractor = named_entity_extractor(entity_extractor, feature_extractor)
     if intent_classifier:
         self.classifier = text_categorizer(intent_classifier, feature_extractor)
     self.tokenizer = MITIETokenizer()
     self.ent_synonyms = None
     if entity_synonyms:
         # Bug fix: the loaded synonyms were previously discarded, leaving
         # ent_synonyms permanently None. Keep the return value, as the
         # sibling constructor in this module does.
         self.ent_synonyms = Interpreter.load_synonyms(entity_synonyms)
 def __init__(self,
              intent_classifier=None,
              entity_extractor=None,
              feature_extractor=None,
              entity_synonyms=None,
              **kwargs):
     """Load persisted interpreter parts; absent artefacts stay ``None``."""
     self.extractor = (named_entity_extractor(entity_extractor, feature_extractor)
                       if entity_extractor else None)
     if intent_classifier:
         # The classifier was serialized with cloudpickle; load it back
         # the same way.
         with open(intent_classifier, 'rb') as f:
             self.classifier = cloudpickle.load(f)
     else:
         self.classifier = None
     self.featurizer = MITIEFeaturizer(feature_extractor)
     self.tokenizer = MITIETokenizer()
     self.ent_synonyms = (Interpreter.load_synonyms(entity_synonyms)
                          if entity_synonyms else None)
Example #13
0
 def __init__(self, metadata):
     """Rebuild extractor, classifier, and tokenizer from the model file
     paths recorded in *metadata*."""
     extractor_path = metadata["entity_extractor"]
     classifier_path = metadata["intent_classifier"]
     self.extractor = named_entity_extractor(extractor_path)
     self.classifier = text_categorizer(classifier_path)
     self.tokenizer = MITIETokenizer()