Example #1
0
    def transform(self, X):
        """Transform X, a DataFrame, by stripping
        out the object columns, dummifying them, and
        re-appending them to the end.

        Parameters
        ----------
        X : pandas dataframe
            The frame to transform. It is indexed with the column
            names stored in ``obj_cols_``, so those columns must
            be present.

        Returns
        -------
        x : pandas dataframe or np.ndarray
            A DataFrame when ``as_df`` is truthy, otherwise a numpy
            array. The non-object columns come first, followed by
            the one-hot encoded columns.
        """
        check_is_fitted(self, 'obj_cols_')
        # check on state of X, don't care about cols or warning
        X, _ = validate_is_pd(X, None)

        # if there is no encoder to speak of, just bail early
        if self.one_hot_ is None:
            # ``.values`` is the supported replacement for ``as_matrix()``,
            # which was deprecated and then removed from pandas
            return X if self.as_df else X.values

        # Retain just the numers
        numers = X[[nm for nm in X.columns.values if nm not in self.obj_cols_]]
        objs = X[self.obj_cols_]

        # If we need to fill in the NAs, take care of it
        if self.fill is not None:
            objs = objs.fillna(self.fill)

        # Do label encoding using the safe label encoders; each encoder
        # is paired positionally with its column name. The transpose puts
        # samples on the rows and encoded features on the columns.
        trans = np.array([
            encoder.transform(objs[col])
            for col, encoder in zip(self.obj_cols_, self.lab_encoders_)
        ]).transpose()

        # Finally, get the one-hot encoding as a plain ndarray
        # (``toarray`` avoids the np.matrix that ``todense`` produces)
        oh = self.one_hot_.transform(trans).toarray()
        x = np.hstack((numers, oh))

        if not self.as_df:
            return x
        return pd.DataFrame.from_records(data=x, columns=self.trans_nms_)
Example #2
0
    def transform(self, X):
        """Transform X, a DataFrame, by stripping
        out the object columns, dummifying them, and
        re-appending them to the end.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to transform. It is indexed with the
            column names stored in ``obj_cols_``.

        Returns
        -------

        x : Pandas ``DataFrame`` or np.ndarray
            The encoded dataframe (when ``as_df`` is truthy) or array.
            Non-object columns are placed first, followed by the
            one-hot encoded columns.
        """
        check_is_fitted(self, 'obj_cols_')
        # check on state of X, don't care about cols or warning
        X, _ = validate_is_pd(X, None)

        # if there is no encoder to speak of, just bail early
        if self.one_hot_ is None:
            # ``DataFrame.as_matrix`` no longer exists in current pandas;
            # ``.values`` returns the same underlying array
            return X if self.as_df else X.values

        # Split the frame into the untouched columns and the object columns
        keep_nms = [nm for nm in X.columns.values if nm not in self.obj_cols_]
        numers = X[keep_nms]
        objs = X[self.obj_cols_]

        # If we need to fill in the NAs, take care of it
        if self.fill is not None:
            objs = objs.fillna(self.fill)

        # Do label encoding using the safe label encoders. One encoded
        # vector is produced per object column; transposing yields one
        # row per sample.
        encoded = [enc.transform(objs[col])
                   for col, enc in zip(self.obj_cols_, self.lab_encoders_)]
        trans = np.array(encoded).transpose()

        # Finally, get the one-hot encoding. ``toarray`` gives an ndarray
        # directly, so no np.matrix -> ndarray conversion is needed.
        oh = self.one_hot_.transform(trans).toarray()
        x = np.hstack((numers, oh))

        if self.as_df:
            return pd.DataFrame.from_records(data=x, columns=self.trans_nms_)
        return x
Example #3
0
    def fit(self, X, y=None):
        """Fit the estimator.

        Fits one label encoder per object-dtype column and a one-hot
        encoder over the label-encoded result.

        Parameters
        ----------
        X : pandas dataframe
            The frame to fit. Only the object-dtype columns are encoded.
        y : passthrough for Pipeline

        Returns
        -------
        self
            With ``obj_cols_``, ``lab_encoders_``, ``one_hot_`` (None when
            there are no object columns) and ``trans_nms_`` set.
        """
        # check on state of X, don't care about cols or the warning
        X, _ = validate_is_pd(X, None)

        # Extract the object columns
        obj_cols_ = X.select_dtypes(include=['object']).columns.values

        # If we need to fill in the NAs, do it on a separate frame rather
        # than assigning back into X, so the validated frame is never
        # modified here (``transform`` fills a separate frame as well).
        objs = X
        if self.fill is not None:
            objs = X[obj_cols_].fillna(self.fill)

        # Set an array of uninitialized label encoders
        # Then use fit_transform for efficiency purposes
        # We can also set the dummy-level feature names in the same pass
        lab_encoders_ = []
        trans_array = []
        tnms = []

        unseen = get_unseen()
        for nm in obj_cols_:
            encoder = SafeLabelEncoder()
            lab_encoders_.append(encoder)

            # This fits the encoder and gets the transformation. We then
            # append a single unseen value to the end as a safety for the
            # transform method. After the transpose, this is tantamount to
            # appending a row of unseen values, so the one-hot encoder
            # learns the unseen level for every feature. This adds one
            # extra output column per object column.
            encoded_array = np.append(encoder.fit_transform(objs[nm]), unseen)
            trans_array.append(encoded_array)

            # Dummy-level names: one per fitted class, then the NA column.
            # ``(nm,)`` keeps the formatting valid for tuple column names.
            sequential_nms = ['%s.%s' % (nm, str(cls)) for cls in encoder.classes_]
            sequential_nms.append('%s.NA' % (nm,))
            tnms.append(sequential_nms)

        # Get the transpose
        trans = np.array(trans_array).transpose()

        # flatten the name array, append numeric names prior
        obj_col_set = set(obj_cols_)
        num_nms = [n for n in X.columns.values if n not in obj_col_set]
        trans_nms_ = [item for sublist in tnms for item in sublist]
        self.trans_nms_ = num_nms + trans_nms_

        # we might get an empty set of object cols
        shape_tup = trans.shape
        is_empty = len(shape_tup) < 2 or shape_tup[1] == 0  # zero cols

        # Now we can do the actual one hot encoding, set internal state
        self.one_hot_ = None if is_empty else OneHotEncoder().fit(trans)
        self.obj_cols_ = obj_cols_
        self.lab_encoders_ = lab_encoders_

        return self
Example #4
0
    def fit(self, X, y=None):
        """Fit the encoder.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the object columns of the dataframe.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
            The fitted encoder. Sets ``obj_cols_``, ``lab_encoders_``,
            ``trans_nms_`` and ``one_hot_`` (``None`` when the frame
            has no object columns).
        """
        # check on state of X, don't care about cols or the warning
        X, _ = validate_is_pd(X, None)

        # Extract the object columns
        obj_cols_ = X.select_dtypes(include=['object']).columns.values

        # If we need to fill in the NAs, take care of it. The filled
        # values live in their own frame instead of being written back
        # into X, which matches how ``transform`` applies ``fill``.
        source = X
        if self.fill is not None:
            source = X[obj_cols_].fillna(self.fill)

        # Set an array of uninitialized label encoders
        # Then use fit_transform for efficiency purposes
        # We can also set the dummy-level feature names in the same pass
        lab_encoders_ = []
        trans_array = []
        tnms = []

        unseen = _get_unseen()
        for nm in obj_cols_:
            encoder = SafeLabelEncoder()
            lab_encoders_.append(encoder)

            # Fit the encoder and get the transformation, then append a
            # single unseen value as a safety for the transform method.
            # After the transpose this amounts to one extra row of unseen
            # values, so the one-hot encoder also learns the unseen level
            # of every feature (one extra output column per feature).
            labels = encoder.fit_transform(source[nm])
            trans_array.append(np.append(labels, unseen))

            # Names for each fitted class, plus the trailing NA column.
            # Formatting with ``(nm,)`` also works for tuple column names.
            level_nms = ['%s.%s' % (nm, str(level)) for level in encoder.classes_]
            level_nms.append('%s.NA' % (nm,))
            tnms.append(level_nms)

        # Get the transpose
        trans = np.array(trans_array).transpose()

        # flatten the name array, append numeric names prior
        encoded_cols = set(obj_cols_)
        num_nms = [n for n in X.columns.values if n not in encoded_cols]
        trans_nms_ = [item for sublist in tnms for item in sublist]
        self.trans_nms_ = num_nms + trans_nms_

        # we might get an empty set of object cols
        shape_tup = trans.shape
        is_empty = len(shape_tup) < 2 or shape_tup[1] == 0  # zero cols

        # Now we can do the actual one hot encoding, set internal state
        self.one_hot_ = None if is_empty else OneHotEncoder().fit(trans)
        self.obj_cols_ = obj_cols_
        self.lab_encoders_ = lab_encoders_

        return self