def __init__(self, features=None, excluded_features=None, num_bits=18,
             output_column_name='hashed_features'):
    """
    Validate the constructor arguments and bind a ``_FeatureHasher`` proxy.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. When the
        normalized value is truthy it is what gets sent as the
        ``'features'`` option, with ``'exclude'`` set to True.
    num_bits : int
        Type-checked against ``int`` and forwarded as ``'num_bits'``.
    output_column_name : str
        Type-checked against ``str`` and forwarded as
        ``'output_column_name'``.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(
        features, excluded_features)

    # Argument type validation.
    _raise_error_if_not_of_type(num_bits, [int])
    _raise_error_if_not_of_type(output_column_name, [str])

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        'num_bits': num_bits,
        'output_column_name': output_column_name,
        'exclude': use_exclusion,
        'features': excluded if use_exclusion else included,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._FeatureHasher()
    transformer_proxy.init_transformer(options)
    super(FeatureHasher, self).__init__(transformer_proxy, self.__class__)
def __init__(self, features=None, excluded_features=None, max_categories=None,
             output_column_name='encoded_features'):
    """
    Validate the constructor arguments and bind a ``_OneHotEncoder`` proxy.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``'features'`` with ``'exclude'`` True.
    max_categories : int or None
        Type-checked against ``int`` / ``_NoneType`` and forwarded as
        ``'max_categories'``.
    output_column_name : str
        Type-checked against ``str`` and forwarded as
        ``'output_column_name'``.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(
        features, excluded_features)

    # Argument type validation.
    _raise_error_if_not_of_type(max_categories, [int, _NoneType])
    _raise_error_if_not_of_type(output_column_name, [str])

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        'max_categories': max_categories,
        'output_column_name': output_column_name,
        'exclude': use_exclusion,
        'features': excluded if use_exclusion else included,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._OneHotEncoder()
    transformer_proxy.init_transformer(options)
    super(OneHotEncoder, self).__init__(transformer_proxy, self.__class__)
def __init__(self, features=None, excluded_features=None,
             strategy='logarithmic', num_bins=10, output_column_prefix=None):
    """
    Validate the constructor arguments and bind a ``_FeatureBinner`` proxy.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``'features'`` with ``'exclude'`` True.
    strategy : str
        Type-checked against ``str`` and forwarded as ``'strategy'``.
    num_bins : int
        Type-checked against ``int`` and forwarded as ``'num_bins'``.
    output_column_prefix
        Forwarded as ``'output_column_prefix'``; not type-checked here.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(
        features, excluded_features)

    # Argument type validation (num_bins first, then strategy).
    _raise_error_if_not_of_type(num_bins, [int])
    _raise_error_if_not_of_type(strategy, [str])

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        'strategy': strategy,
        'num_bins': num_bins,
        'output_column_prefix': output_column_prefix,
        'exclude': use_exclusion,
        'features': excluded if use_exclusion else included,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._FeatureBinner()
    transformer_proxy.init_transformer(options)
    super(FeatureBinner, self).__init__(transformer_proxy, self.__class__)
def __init__(self, features=None, excluded_features=None, threshold=1,
             output_category_name=None, output_column_prefix=None):
    """
    Validate the constructor arguments and bind a ``_CountThresholder`` proxy.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``'features'`` with ``'exclude'`` True.
    threshold : int or None
        Type-checked against ``int`` / ``_NoneType`` and forwarded as
        ``'threshold'``.
    output_category_name
        Forwarded as ``'output_category_name'``; not type-checked here.
    output_column_prefix
        Forwarded as ``'output_column_prefix'``; not type-checked here.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(
        features, excluded_features)

    # Only the threshold is type-checked in this constructor.
    _raise_error_if_not_of_type(threshold, [int, _NoneType])

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        'threshold': threshold,
        'output_category_name': output_category_name,
        'output_column_prefix': output_column_prefix,
        'exclude': use_exclusion,
        'features': excluded if use_exclusion else included,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._CountThresholder()
    transformer_proxy.init_transformer(options)
    super(CountThresholder, self).__init__(transformer_proxy, self.__class__)
def __init__(
    self,
    features=None,
    excluded_features=None,
    min_document_frequency=0.0,
    max_document_frequency=1.0,
    output_column_prefix=None,
):
    """
    Validate the constructor arguments and bind a ``_TFIDF`` proxy.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``"features"`` with ``"exclude"`` True.
    min_document_frequency : float or int
        Type-checked and forwarded as ``"min_document_frequency"``.
    max_document_frequency : float or int
        Type-checked and forwarded as ``"max_document_frequency"``.
    output_column_prefix : str or None
        Type-checked and forwarded as ``"output_column_prefix"``.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(features, excluded_features)

    # Argument type validation, in signature order.
    numeric_types = [float, int]
    _raise_error_if_not_of_type(min_document_frequency, numeric_types)
    _raise_error_if_not_of_type(max_document_frequency, [float, int])
    _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        "min_document_frequency": min_document_frequency,
        "max_document_frequency": max_document_frequency,
        "output_column_prefix": output_column_prefix,
        "exclude": use_exclusion,
        "features": excluded if use_exclusion else included,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._TFIDF()
    transformer_proxy.init_transformer(options)
    super(TFIDF, self).__init__(transformer_proxy, self.__class__)
def __init__(
    self,
    features=None,
    excluded_features=None,
    to_lower=False,
    delimiters=["\r", "\v", "\n", "\f", "\t", " "],
    output_column_prefix=None,
):
    """
    Validate the constructor arguments and bind a ``_Tokenizer`` proxy.

    Parameters
    ----------
    features : list, str or None
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features : list, str or None
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``"features"`` with ``"exclude"`` True.
    to_lower : bool
        Forwarded as ``"to_lower"``.
    delimiters : list or None
        Each element must be a single-character string. ``None`` is
        accepted and forwarded as-is. The default list object is shared
        between calls, so a copy is what gets forwarded.
    output_column_prefix : str or None
        Forwarded as ``"output_column_prefix"``.

    Raises
    ------
    ValueError
        If any delimiter is not exactly one character long.
    """
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(features, [list, str, _NoneType])
    _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
    _raise_error_if_not_of_type(to_lower, [bool])
    _raise_error_if_not_of_type(delimiters, [list, _NoneType])
    _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

    if delimiters is not None:
        for delim in delimiters:
            _raise_error_if_not_of_type(delim, str, "delimiters")
            if len(delim) != 1:
                raise ValueError("Delimiters must be single-character strings")
        # Forward a copy: the default list lives on the function object and
        # would otherwise be shared with every transformer built from it.
        delimiters = list(delimiters)

    # Set up options. A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(_exclude)
    opts = {
        "features": _exclude if use_exclusion else _features,
        "to_lower": to_lower,
        "delimiters": delimiters,
        "output_column_prefix": output_column_prefix,
        "exclude": use_exclusion,
    }

    # Initialize object
    proxy = _gl.extensions._Tokenizer()
    proxy.init_transformer(opts)
    super(Tokenizer, self).__init__(proxy, self.__class__)
def __init__(self, features=None, excluded_features=None):
    """
    Bind a ``_Tokenizer`` proxy configured only with a feature selection.

    Parameters
    ----------
    features
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``'features'`` with ``'exclude'`` True.
    """
    # Normalize (and copy) the include / exclude selections.
    included, excluded = _internal_utils.process_features(
        features, excluded_features)

    # A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(excluded)
    options = {
        'features': excluded if use_exclusion else included,
        'exclude': use_exclusion,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._Tokenizer()
    transformer_proxy.init_transformer(options)
    super(Tokenizer, self).__init__(transformer_proxy, self.__class__)
def __init__(self, reference_features=None, feature="feature", verbose=False):
    """
    Validate the constructor arguments and bind a ``_CategoricalImputer`` proxy.

    Parameters
    ----------
    reference_features
        Normalized (and copied) by ``_internal_utils.process_features``
        with no exclusion list; the normalized value is forwarded as
        ``'reference_features'``.
    feature : str
        Type-checked against ``str`` and forwarded as ``'feature'``.
    verbose
        Forwarded as ``'verbose'``; not type-checked here.
    """
    # Only the first element of the helper's result is needed here.
    normalized_refs, _ = _internal_utils.process_features(
        reference_features, None)

    # Argument type validation.
    _raise_error_if_not_of_type(feature, [str])

    options = {
        'reference_features': normalized_refs,
        'feature': feature,
        'verbose': verbose,
    }

    # Create the extension-side transformer and hand it to the base class.
    transformer_proxy = _gl.extensions._CategoricalImputer()
    transformer_proxy.init_transformer(options)
    super(CategoricalImputer, self).__init__(transformer_proxy, self.__class__)
def __init__(self, features=None, excluded_features=None,
             n=2, method="word", to_lower=True, ignore_punct=True,
             ignore_space=True,
             delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                         "!", "#", "$", "%", "&", "'", "(", ")",
                         "*", "+", ",", "-", ".", "/", ":", ";",
                         "<", "=", ">", "?", "@", "[", "\\", "]",
                         "^", "_", "`", "{", "|", "}", "~"],
             output_column_prefix=None):
    """
    Validate the constructor arguments and bind a ``_NGramCounter`` proxy.

    Parameters
    ----------
    features : list, str or None
        Feature selection; normalized (and copied) by
        ``_internal_utils.process_features``.
    excluded_features : list, str or None
        Exclusion selection; normalized by the same helper. A truthy
        normalized value is sent as ``'features'`` with ``'exclude'`` True.
    n : int
        Must be at least 1. Forwarded as ``'n'``.
    method : str
        Either ``"word"`` or ``"character"``. Forwarded as ``'ngram_type'``.
    to_lower, ignore_punct, ignore_space : bool
        Forwarded under the same option names.
    delimiters : list or None
        Each element must be a single-character string. ``None`` is
        accepted and forwarded as-is. The default list object is shared
        between calls, so a copy is what gets forwarded.
    output_column_prefix : str or None
        Forwarded as ``'output_column_prefix'``.

    Raises
    ------
    ValueError
        If a delimiter is not exactly one character long, if ``n < 1``, or
        if ``method`` is neither ``"word"`` nor ``"character"``.

    Warns
    -----
    A warning is issued through ``warnings.warn`` when ``n > 5`` and
    ``method == 'word'``.
    """
    # Process and make a copy of the features, exclude.
    _features, _exclude = _internal_utils.process_features(features, excluded_features)

    # Type checking
    _raise_error_if_not_of_type(features, [list, str, _NoneType])
    _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
    _raise_error_if_not_of_type(n, [int])
    _raise_error_if_not_of_type(method, [str])
    _raise_error_if_not_of_type(to_lower, [bool])
    _raise_error_if_not_of_type(ignore_punct, [bool])
    _raise_error_if_not_of_type(ignore_space, [bool])
    _raise_error_if_not_of_type(delimiters, [list, _NoneType])
    _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

    if delimiters is not None:
        for delim in delimiters:
            _raise_error_if_not_of_type(delim, str, "delimiters")
            if len(delim) != 1:
                raise ValueError("Delimiters must be single-character strings")
        # Forward a copy: the default list lives on the function object and
        # would otherwise be shared with every transformer built from it.
        delimiters = list(delimiters)

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")

    # The size warning is emitted before `method` is validated, matching the
    # established order of checks.
    if n > 5 and method == 'word':
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")

    if method not in ("word", "character"):
        # The line break inside the message is written as an explicit escape;
        # a plain quoted literal cannot span physical lines.
        raise ValueError("Invalid 'method' input value. \nPlease input " +
                         "either 'word' or 'character' ")

    # Set up options. A truthy exclusion list wins over the inclusion list.
    use_exclusion = bool(_exclude)
    opts = {
        'n': n,
        'features': _exclude if use_exclusion else _features,
        'ngram_type': method,
        'to_lower': to_lower,
        'ignore_punct': ignore_punct,
        'ignore_space': ignore_space,
        'delimiters': delimiters,
        'output_column_prefix': output_column_prefix,
        'exclude': use_exclusion,
    }

    # Initialize object
    proxy = _gl.extensions._NGramCounter()
    proxy.init_transformer(opts)
    super(NGramCounter, self).__init__(proxy, self.__class__)