コード例 #1
0
 def get_default_config() -> Dict[Text, Any]:
     """Returns the component's default config."""
     # Component-specific defaults layered on top of the base featurizer config.
     overrides = {
         "analyzer": "word",
         "min_ngram": 1,
         "max_ngram": 1,
     }
     return {**SparseFeaturizer.get_default_config(), **overrides}
コード例 #2
0
 def get_default_config() -> Dict[Text, Any]:
     """Returns the component's default config."""
     # Feature sets for the previous / current / next token windows.
     default_features = [
         ["low", "title", "upper"],
         ["BOS", "EOS", "low", "upper", "title", "digit"],
         ["low", "title", "upper"],
     ]
     return {**SparseFeaturizer.get_default_config(), FEATURES: default_features}
コード例 #3
0
ファイル: regex_featurizer.py プロジェクト: spawn08/rasa
 def get_default_config() -> Dict[Text, Any]:
     """Returns the component's default config."""
     # Start from a fresh copy of the base featurizer defaults so the
     # parent's dict is never mutated.
     config = dict(SparseFeaturizer.get_default_config())
     # text will be processed with case sensitive as default
     config["case_sensitive"] = True
     # use lookup tables to generate features
     config["use_lookup_tables"] = True
     # use regexes to generate features
     config["use_regexes"] = True
     # use match word boundaries for lookup table
     config["use_word_boundaries"] = True
     return config
コード例 #4
0
 def get_default_config() -> Dict[Text, Any]:
     """Returns the component's default config."""
     return {
         **SparseFeaturizer.get_default_config(),
         # whether to use a shared vocab
         "use_shared_vocab": False,
         # the parameters are taken from sklearn's CountVectorizer
         # whether to use word or character n-grams
         # 'char_wb' creates character n-grams inside word boundaries
         # n-grams at the edges of words are padded with space.
         "analyzer": "word",  # use 'char' or 'char_wb' for character
         # remove accents during the preprocessing step
         "strip_accents": None,  # {'ascii', 'unicode', None}
         # list of stop words
         "stop_words": None,  # string {'english'}, list, or None (default)
         # min document frequency of a word to add to vocabulary
         # float - the parameter represents a proportion of documents
         # integer - absolute counts
         "min_df": 1,  # float in range [0.0, 1.0] or int
         # max document frequency of a word to add to vocabulary
         # float - the parameter represents a proportion of documents
         # integer - absolute counts
         "max_df": 1.0,  # float in range [0.0, 1.0] or int
         # set range of ngrams to be extracted
         "min_ngram": 1,  # int
         "max_ngram": 1,  # int
         # limit vocabulary size
         "max_features": None,  # int or None
         # if convert all characters to lowercase
         "lowercase": True,  # bool
         # handling Out-Of-Vocabulary (OOV) words
         # will be converted to lowercase if lowercase is True
         "OOV_token": None,  # string or None
         "OOV_words": [],  # string or list of strings
         # indicates whether the featurizer should use the lemma of a word for
         # counting (if available) or not
         "use_lemma": True,
     }