def get_default_config() -> Dict[Text, Any]:
    """Returns the component's default config."""
    # Start from the shared sparse-featurizer defaults, then layer the
    # analyzer / n-gram settings for this component on top.
    config: Dict[Text, Any] = dict(SparseFeaturizer.get_default_config())
    config["analyzer"] = "word"
    config["min_ngram"] = 1
    config["max_ngram"] = 1
    return config
def get_default_config() -> Dict[Text, Any]:
    """Returns the component's default config."""
    defaults: Dict[Text, Any] = dict(SparseFeaturizer.get_default_config())
    # Three feature lists under the FEATURES key; presumably one per
    # position in a [before, current, after] token window — confirm
    # against the featurizer that consumes this config.
    defaults[FEATURES] = [
        ["low", "title", "upper"],
        ["BOS", "EOS", "low", "upper", "title", "digit"],
        ["low", "title", "upper"],
    ]
    return defaults
def get_default_config() -> Dict[Text, Any]:
    """Returns the component's default config."""
    overrides: Dict[Text, Any] = {
        # text will be processed with case sensitive as default
        "case_sensitive": True,
        # use lookup tables to generate features
        "use_lookup_tables": True,
        # use regexes to generate features
        "use_regexes": True,
        # use match word boundaries for lookup table
        "use_word_boundaries": True,
    }
    # Component-specific flags win over the shared sparse-featurizer defaults.
    return {**SparseFeaturizer.get_default_config(), **overrides}
def get_default_config() -> Dict[Text, Any]:
    """Returns the component's default config."""
    # Most of these parameters mirror sklearn's CountVectorizer settings.
    overrides: Dict[Text, Any] = {
        # whether to use a shared vocab
        "use_shared_vocab": False,
        # whether to use word or character n-grams;
        # 'char_wb' creates character n-grams inside word boundaries —
        # n-grams at the edges of words are padded with space.
        "analyzer": "word",  # use 'char' or 'char_wb' for character
        # remove accents during the preprocessing step
        "strip_accents": None,  # {'ascii', 'unicode', None}
        # list of stop words
        "stop_words": None,  # string {'english'}, list, or None (default)
        # min document frequency of a word to add to vocabulary:
        # float — a proportion of documents; integer — absolute counts
        "min_df": 1,  # float in range [0.0, 1.0] or int
        # max document frequency of a word to add to vocabulary:
        # float — a proportion of documents; integer — absolute counts
        "max_df": 1.0,  # float in range [0.0, 1.0] or int
        # set range of ngrams to be extracted
        "min_ngram": 1,  # int
        "max_ngram": 1,  # int
        # limit vocabulary size
        "max_features": None,  # int or None
        # if convert all characters to lowercase
        "lowercase": True,  # bool
        # handling Out-Of-Vocabulary (OOV) words;
        # will be converted to lowercase if lowercase is True
        "OOV_token": None,  # string or None
        "OOV_words": [],  # string or list of strings
        # indicates whether the featurizer should use the lemma of a word
        # for counting (if available) or not
        "use_lemma": True,
    }
    # Component-specific values win over the shared sparse-featurizer defaults.
    return {**SparseFeaturizer.get_default_config(), **overrides}