def __init__(self, features=None, excluded_features=None, num_bits=18,
                                        output_column_name='hashed_features'):
        """
        Build the option set and bind this object to a new ``_FeatureHasher``
        proxy.

        Parameters
        ----------
        features, excluded_features
            Column selections; both are handed to
            ``_internal_utils.process_features`` and the result that is in
            effect is stored under the ``'features'`` option.
        num_bits : int
            Stored under the ``'num_bits'`` option.
        output_column_name : str
            Stored under the ``'output_column_name'`` option.
        """
        # Resolve the include / exclude selections before anything else.
        included_cols, excluded_cols = _internal_utils.process_features(
            features, excluded_features)

        # Argument validation.
        _raise_error_if_not_of_type(num_bits, [int])
        _raise_error_if_not_of_type(output_column_name, [str])

        # A truthy exclusion result takes precedence over the inclusion one.
        use_exclusion = bool(excluded_cols)
        options = {
            'num_bits': num_bits,
            'output_column_name': output_column_name,
            'exclude': use_exclusion,
            'features': excluded_cols if use_exclusion else included_cols,
        }

        # Create the backend proxy, configure it, and pass it to the base
        # class initialiser.
        backend = _gl.extensions._FeatureHasher()
        backend.init_transformer(options)
        super(FeatureHasher, self).__init__(backend, self.__class__)
    def __init__(self, features=None, excluded_features=None,
            max_categories=None, output_column_name = 'encoded_features'):
        """
        Build the option set and bind this object to a new ``_OneHotEncoder``
        proxy.

        Parameters
        ----------
        features, excluded_features
            Column selections, resolved through
            ``_internal_utils.process_features``.
        max_categories : int | None
            Stored under the ``'max_categories'`` option.
        output_column_name : str
            Stored under the ``'output_column_name'`` option.
        """
        keep, drop = _internal_utils.process_features(
            features, excluded_features)

        # Argument validation.
        _raise_error_if_not_of_type(max_categories, [int, _NoneType])
        _raise_error_if_not_of_type(output_column_name, [str])

        settings = dict(
            max_categories=max_categories,
            output_column_name=output_column_name,
        )
        # Record which selection mode is active together with its columns.
        if drop:
            settings.update(exclude=True, features=drop)
        else:
            settings.update(exclude=False, features=keep)

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._OneHotEncoder()
        backend.init_transformer(settings)
        super(OneHotEncoder, self).__init__(backend, self.__class__)
    def __init__(self, features=None, excluded_features=None,
                 strategy='logarithmic', num_bins=10,
                 output_column_prefix=None):
        """
        Build the option set and bind this object to a new ``_FeatureBinner``
        proxy.

        Parameters
        ----------
        features, excluded_features
            Column selections, resolved through
            ``_internal_utils.process_features``.
        strategy : str
            Stored under the ``'strategy'`` option.
        num_bins : int
            Stored under the ``'num_bins'`` option.
        output_column_prefix
            Stored unchecked under the ``'output_column_prefix'`` option.
        """
        chosen, omitted = _internal_utils.process_features(features, excluded_features)

        # Argument validation (same order as the checks have always run).
        _raise_error_if_not_of_type(num_bins, [int])
        _raise_error_if_not_of_type(strategy, [str])

        # Decide the selection mode and the column list that goes with it.
        if omitted:
            exclude_mode, columns = True, omitted
        else:
            exclude_mode, columns = False, chosen

        options = {
            'strategy': strategy,
            'num_bins': num_bins,
            'output_column_prefix': output_column_prefix,
            'exclude': exclude_mode,
            'features': columns,
        }

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._FeatureBinner()
        backend.init_transformer(options)
        super(FeatureBinner, self).__init__(backend, self.__class__)
    def __init__(self, features=None, excluded_features=None, threshold=1,
                 output_category_name=None, output_column_prefix=None):
        """
        Build the option set and bind this object to a new
        ``_CountThresholder`` proxy.

        Parameters
        ----------
        features, excluded_features
            Column selections, resolved through
            ``_internal_utils.process_features``.
        threshold : int | None
            Stored under the ``'threshold'`` option.
        output_category_name, output_column_prefix
            Stored unchecked under the options of the same names.
        """
        selected, rejected = _internal_utils.process_features(
            features, excluded_features)

        # Only ``threshold`` is type-checked here.
        _raise_error_if_not_of_type(threshold, [int, _NoneType])

        exclusion_active = True if rejected else False
        config = {
            'threshold': threshold,
            'output_category_name': output_category_name,
            'output_column_prefix': output_column_prefix,
        }
        config['exclude'] = exclusion_active
        config['features'] = rejected if exclusion_active else selected

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._CountThresholder()
        backend.init_transformer(config)
        super(CountThresholder, self).__init__(backend, self.__class__)
# Example #5
# 0
    def __init__(
        self,
        features=None,
        excluded_features=None,
        min_document_frequency=0.0,
        max_document_frequency=1.0,
        output_column_prefix=None,
    ):
        """
        Build the option set and bind this object to a new ``_TFIDF`` proxy.

        Parameters
        ----------
        features, excluded_features
            Column selections, resolved through
            ``_internal_utils.process_features``.
        min_document_frequency, max_document_frequency : float | int
            Stored under the options of the same names.
        output_column_prefix : str | None
            Stored under the ``"output_column_prefix"`` option.
        """
        wanted, unwanted = _internal_utils.process_features(features, excluded_features)

        # Argument validation: both frequency bounds share one accepted-type
        # list, so they are checked in a loop (min first, then max).
        for bound in (min_document_frequency, max_document_frequency):
            _raise_error_if_not_of_type(bound, [float, int])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        options = {
            "min_document_frequency": min_document_frequency,
            "max_document_frequency": max_document_frequency,
            "output_column_prefix": output_column_prefix,
        }
        # A truthy exclusion result switches the transformer to exclude mode.
        if unwanted:
            options.update(exclude=True, features=unwanted)
        else:
            options.update(exclude=False, features=wanted)

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._TFIDF()
        backend.init_transformer(options)
        super(TFIDF, self).__init__(backend, self.__class__)
# Example #6
# 0
    def __init__(
        self,
        features=None,
        excluded_features=None,
        to_lower=False,
        delimiters=["\r", "\v", "\n", "\f", "\t", " "],
        output_column_prefix=None,
    ):
        """
        Validate the arguments and bind this object to a new ``_Tokenizer``
        proxy.

        Parameters
        ----------
        features, excluded_features : list | str | None
            Column selections, resolved through
            ``_internal_utils.process_features``.
        to_lower : bool
            Stored under the ``"to_lower"`` option.
        delimiters : list | None
            Single-character strings, or None.  A list is copied before it is
            stored, so the options never alias the shared default list or the
            caller's own list.
        output_column_prefix : str | None
            Stored under the ``"output_column_prefix"`` option.

        Raises
        ------
        ValueError
            If any delimiter is not exactly one character long.
        """
        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if len(delim) != 1:
                    raise ValueError("Delimiters must be single-character strings")
            # The default value is a single list object shared by every call;
            # store a private copy so nothing downstream can mutate it.
            delimiters = list(delimiters)

        # Set up options.  A truthy exclusion result takes precedence over
        # the inclusion result.
        exclude_mode = bool(_exclude)
        opts = {
            "features": _exclude if exclude_mode else _features,
            "to_lower": to_lower,
            "delimiters": delimiters,
            "output_column_prefix": output_column_prefix,
            "exclude": exclude_mode,
        }

        # Initialize object
        proxy = _gl.extensions._Tokenizer()
        proxy.init_transformer(opts)
        super(Tokenizer, self).__init__(proxy, self.__class__)
# Example #7
# 0
    def __init__(self, features=None, excluded_features=None):
        """
        Bind this object to a new ``_Tokenizer`` proxy configured with the
        resolved column selection.

        Parameters
        ----------
        features, excluded_features
            Column selections, resolved through
            ``_internal_utils.process_features``.
        """
        included, excluded = _internal_utils.process_features(features, excluded_features)

        # The only options are the active column list and the flag saying
        # whether that list is an exclusion list.
        exclusion_active = bool(excluded)
        options = {
            'features': excluded if exclusion_active else included,
            'exclude': exclusion_active,
        }

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._Tokenizer()
        backend.init_transformer(options)
        super(Tokenizer, self).__init__(backend, self.__class__)
# Example #8
# 0
    def __init__(self, reference_features=None, feature="feature", verbose=False):
        """
        Build the option set and bind this object to a new
        ``_CategoricalImputer`` proxy.

        Parameters
        ----------
        reference_features
            Resolved through ``_internal_utils.process_features`` (with no
            exclusion list); the resolved value is stored under the
            ``'reference_features'`` option.
        feature : str
            Stored under the ``'feature'`` option.
        verbose
            Stored unchecked under the ``'verbose'`` option.
        """
        # The helper returns a pair; the second element is not used here.
        resolved_reference, _unused = _internal_utils.process_features(reference_features, None)

        # Argument validation.
        _raise_error_if_not_of_type(feature, [str])

        options = dict(
            reference_features=resolved_reference,
            feature=feature,
            verbose=verbose,
        )

        # Create, configure and register the backend proxy.
        backend = _gl.extensions._CategoricalImputer()
        backend.init_transformer(options)
        super(CategoricalImputer, self).__init__(backend, self.__class__)
    def __init__(self, features=None, excluded_features=None,
        n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True,
        delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                    "!", "#", "$", "%", "&", "'", "(", ")",
                    "*", "+", ",", "-", ".", "/", ":", ";",
                    "<", "=", ">", "?", "@", "[", "\\", "]",
                    "^", "_", "`", "{", "|", "}", "~"],
        output_column_prefix=None):
        """
        Validate the arguments and bind this object to a new
        ``_NGramCounter`` proxy.

        Parameters
        ----------
        features, excluded_features : list | str | None
            Column selections, resolved through
            ``_internal_utils.process_features``.
        n : int
            N-gram size; must be at least 1.  Stored under the ``'n'`` option.
        method : str
            Either ``"word"`` or ``"character"``; stored under the
            ``'ngram_type'`` option.
        to_lower, ignore_punct, ignore_space : bool
            Stored under the options of the same names.
        delimiters : list | None
            Single-character strings, or None.  A list is copied before it is
            stored, so the options never alias the shared default list or the
            caller's own list.
        output_column_prefix : str | None
            Stored under the ``'output_column_prefix'`` option.

        Raises
        ------
        ValueError
            If a delimiter is not exactly one character long, if ``n`` is
            below 1, or if ``method`` is neither ``"word"`` nor
            ``"character"``.
        """
        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(n, [int])
        _raise_error_if_not_of_type(method, [str])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(ignore_punct, [bool])
        _raise_error_if_not_of_type(ignore_space, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if len(delim) != 1:
                    raise ValueError("Delimiters must be single-character strings")
            # The default value is a single list object shared by every call;
            # store a private copy so nothing downstream can mutate it.
            delimiters = list(delimiters)

        if n < 1:
            raise ValueError("Input 'n' must be greater than 0")

        # Large word n-grams are allowed but flagged as unusual.
        if n > 5 and method == 'word':
            warnings.warn("It is unusual for n-grams to be of size larger than 5.")

        if method not in ("word", "character"):
            raise ValueError("Invalid 'method' input  value. Please input " +
                             "either 'word' or 'character' ")

        # Set up options.  A truthy exclusion result takes precedence over
        # the inclusion result.
        exclude_mode = bool(_exclude)
        opts = {
          'n': n,
          'features': _exclude if exclude_mode else _features,
          'ngram_type': method,
          'to_lower': to_lower,
          'ignore_punct': ignore_punct,
          'ignore_space': ignore_space,
          'delimiters': delimiters,
          'output_column_prefix': output_column_prefix,
          'exclude': exclude_mode,
        }

        # Initialize object
        proxy = _gl.extensions._NGramCounter()
        proxy.init_transformer(opts)
        super(NGramCounter, self).__init__(proxy, self.__class__)