Ejemplo n.º 1
0
    def perform1HotEncoding(self):
        """One-hot encode the loaded feature matrices in place.

        Features whose type (compared case-insensitively) is ``'categorical'``
        are encoded; ``'binary'`` features are encoded as well when
        ``self.info['has_missing']`` is truthy. The encoder is fitted on
        ``self.data['X_train']`` and then applied to ``'X_valid'`` and
        ``'X_test'`` when those keys are present. On success the fitted
        encoder is stored in ``self.encoder`` and ``self.info['is_sparse']``
        is rewritten as ``1`` or ``0``. Nothing is encoded when no feature
        is selected by the mask.

        Raises:
            ValueError: if no data is loaded, or if an encoder has already
                been stored by a previous call.
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # The fitted encoder is stored under ``self.encoder`` at the end of
        # this method, so that is the attribute that marks encoded data.
        # The legacy ``encoder_`` name is still honoured for compatibility.
        if (hasattr(self, 'encoder_')
                or getattr(self, 'encoder', None) is not None):
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        sparse = bool(self.info['is_sparse'] == 1)
        has_missing = bool(self.info['has_missing'])

        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [
            feat_type.lower() in to_encode for feat_type in self.feat_type
        ]

        categorical = [
            feat_type.lower() == 'categorical' for feat_type in self.feat_type
        ]

        # predict_RAM_usage is defined elsewhere; the two divisions by 1024
        # presumably convert bytes to MiB -- TODO confirm.
        predicted_RAM_usage = float(
            predict_RAM_usage(self.data['X_train'], categorical)) / 1024 / 1024

        # Force a sparse result when the estimate exceeds the threshold.
        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)
            # Fit on the training split only; reuse the fitted encoder for
            # the remaining splits.
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            if 'X_valid' in self.data:
                self.data['X_valid'] = encoder.transform(self.data['X_valid'])
            if 'X_test' in self.data:
                self.data['X_test'] = encoder.transform(self.data['X_test'])

            # A dense result was requested but the encoder returned a sparse
            # matrix: densify every split that is present.
            if not sparse and scipy.sparse.issparse(self.data['X_train']):
                self.data['X_train'] = self.data['X_train'].todense()
                if 'X_valid' in self.data:
                    self.data['X_valid'] = self.data['X_valid'].todense()
                if 'X_test' in self.data:
                    self.data['X_test'] = self.data['X_test'].todense()

            self.encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
Ejemplo n.º 2
0
    def perform1HotEncoding(self):
        """Apply one-hot encoding to the data splits held by this object.

        The set of encoded columns is every feature typed ``'categorical'``
        (case-insensitive), plus every ``'binary'`` feature when
        ``self.info['has_missing']`` is truthy. ``X_train`` fits the encoder;
        ``X_valid`` and ``X_test`` are transformed with it when present in
        ``self.data``. After encoding, the encoder is assigned to
        ``self.encoder`` and ``self.info['is_sparse']`` is set to ``1`` or
        ``0``. If no column is selected, the data is left untouched.

        Raises:
            ValueError: when ``self.data`` does not exist, or when the data
                has already been encoded by an earlier call.
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # This method records its result in ``self.encoder``; checking only
        # ``encoder_`` (never assigned here) would let a second call through.
        previously_encoded = (
            hasattr(self, 'encoder_')
            or getattr(self, 'encoder', None) is not None
        )
        if previously_encoded:
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        sparse = bool(self.info['is_sparse'] == 1)
        has_missing = bool(self.info['has_missing'])

        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [feat_type.lower() in to_encode
                         for feat_type in self.feat_type]

        categorical = [feat_type.lower() == 'categorical'
                       for feat_type in self.feat_type]

        # predict_RAM_usage comes from outside this block; dividing by 1024
        # twice looks like a bytes -> MiB conversion -- TODO confirm.
        predicted_RAM_usage = float(predict_RAM_usage(
            self.data['X_train'], categorical)) / 1024 / 1024

        # Above the threshold, switch to sparse output regardless of the
        # flag read from self.info.
        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            # Optional splits reuse the encoder fitted on X_train.
            for split in ('X_valid', 'X_test'):
                if split in self.data:
                    self.data[split] = encoder.transform(self.data[split])

            # Dense output was requested but a sparse matrix came back.
            if not sparse and scipy.sparse.issparse(self.data['X_train']):
                for split in ('X_train', 'X_valid', 'X_test'):
                    if split in self.data:
                        self.data[split] = self.data[split].todense()

            self.encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
Ejemplo n.º 3
0
    def perform_hot_encoding(self):
        """One-hot encode the loaded data splits in place.

        Features typed ``'categorical'`` (case-insensitive) are encoded, and
        ``'binary'`` features too when ``self.info['has_missing']`` is truthy.
        The encoder is fitted on ``self.data['X_train']`` and the fitted
        encoder is then applied to ``'X_valid'`` and ``'X_test'`` when those
        keys exist. On success the encoder is stored in ``self._encoder`` and
        ``self.info['is_sparse']`` is set to ``1`` or ``0``. Nothing is
        changed when no feature is selected for encoding.

        Raises:
            ValueError: if ``self._data`` is missing or ``None``, or if
                ``self._encoder`` already holds an encoder.
        """
        # Data counts as "not loaded" when the attribute is absent OR None.
        if getattr(self, '_data', None) is None:
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        if getattr(self, '_encoder', None) is not None:
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')

        sparse = bool(self.info['is_sparse'] == 1)
        has_missing = bool(self.info['has_missing'])

        to_encode = ['categorical']
        if has_missing:
            to_encode += ['binary']
        encoding_mask = [feat_type.lower() in to_encode
                         for feat_type in self._feat_type]

        categorical = [feat_type.lower() == 'categorical'
                       for feat_type in self._feat_type]

        # predict_RAM_usage is defined elsewhere; dividing by 1024**2
        # presumably converts bytes to MiB -- TODO confirm.
        predicted_RAM_usage = float(
            predict_RAM_usage(self.data['X_train'], categorical)) / pow(1024, 2)

        # Force sparse output when the estimate exceeds the threshold.
        if predicted_RAM_usage > 1000:
            sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=sparse)

            # Fit on the training split only. Refitting on the validation or
            # test split would give each split its own category mapping.
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            for split in ('X_valid', 'X_test'):
                if split in self.data:
                    self.data[split] = encoder.transform(self.data[split])

            # Dense output was requested but the encoder produced a sparse
            # matrix: densify every split that is present.
            if not sparse and scipy.sparse.issparse(self.data['X_train']):
                for split in ('X_train', 'X_valid', 'X_test'):
                    if split in self.data:
                        self.data[split] = self.data[split].todense()

            self._encoder = encoder
            self.info['is_sparse'] = 1 if sparse else 0
Ejemplo n.º 4
0
def perform_one_hot_encoding(sparse, categorical, data):
    """One-hot encode a sequence of data sets with a shared encoder.

    Args:
        sparse: flag forwarded to the encoder's ``sparse`` keyword. It is
            overridden to ``True`` when the scaled result of
            ``predict_RAM_usage`` exceeds 1000.
        categorical: per-feature mask forwarded to the encoder as
            ``categorical_features``.
        data: indexable sequence of data sets. ``data[0]`` fits the encoder;
            the remaining entries are transformed with it.

    Returns:
        A ``(encoded, sparse)`` pair. ``encoded`` is a list of transformed
        data sets, or ``data`` itself when no entry of ``categorical`` is
        truthy. ``sparse`` is the flag that was finally used.
    """
    # predict_RAM_usage is defined elsewhere; the two divisions by 1024
    # presumably turn bytes into MiB -- TODO confirm.
    estimated_usage = float(predict_RAM_usage(data[0], categorical))
    estimated_usage = estimated_usage / 1024 / 1024
    if estimated_usage > 1000:
        sparse = True

    # Nothing to encode: hand the input back untouched.
    if not any(categorical):
        return data, sparse

    one_hot = OneHotEncoder(categorical_features=categorical,
                            dtype=np.float32,
                            sparse=sparse)
    encoded = [one_hot.fit_transform(data[0])]
    encoded.extend(one_hot.transform(part) for part in data[1:])

    # Dense output was requested but the encoder returned sparse matrices.
    if not sparse and scipy.sparse.issparse(encoded[0]):
        encoded = [matrix.todense() for matrix in encoded]

    return encoded, sparse
Ejemplo n.º 5
0
def perform_one_hot_encoding(sparse, categorical, data):
    """Fit a one-hot encoder on ``data[0]`` and apply it to every data set.

    Args:
        sparse: requested value for the encoder's ``sparse`` keyword. It is
            replaced by ``True`` when the scaled ``predict_RAM_usage``
            result is greater than 1000.
        categorical: mask passed to the encoder as ``categorical_features``.
            When none of its entries is truthy, no encoding takes place.
        data: indexable sequence of data sets; the first one is used for
            fitting, the rest are only transformed.

    Returns:
        Tuple ``(rvals, sparse)``: the encoded data sets as a list (or the
        original ``data`` object when nothing was encoded) and the sparse
        flag actually applied.
    """
    # predict_RAM_usage lives outside this block; dividing by 1024 twice
    # looks like a bytes -> MiB conversion -- TODO confirm.
    usage = float(predict_RAM_usage(data[0], categorical)) / 1024 / 1024
    sparse = True if usage > 1000 else sparse

    if any(categorical):
        enc = OneHotEncoder(categorical_features=categorical,
                            dtype=np.float32,
                            sparse=sparse)
        transformed = [enc.fit_transform(data[0])]
        for extra in data[1:]:
            transformed.append(enc.transform(extra))

        # Densify only when dense output was requested yet the encoder
        # still produced a sparse first result.
        densify = (not sparse) and scipy.sparse.issparse(transformed[0])
        result = ([item.todense() for item in transformed]
                  if densify else transformed)
    else:
        result = data

    return result, sparse