Example #1
def variance_threshold(features_train, features_valid):
    """Return the initial dataframes after dropping some features according to variance threshold

    Parameters:
    ----------
    features_train: pd.DataFrame
        features of training set

    features_valid: pd.DataFrame
        features of validation set

    Output:
    ------
    features_train: pd.DataFrame

    features_valid: pd.DataFrame
    """
    from sklearn.feature_selection import VarianceThreshold    

    threshold=0.01
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(features_train)

    ## Instead of using the transform() method, we look at which columns have been dropped,
    ## so that we can drop the same features from both the training and the validation set.
    ## This way we keep the column names, which makes interpretation easier.
    variances = selector.variances_
    dropped_features = features_train.columns.values[variances < threshold] #name of features to drop
    features_train.drop(dropped_features, axis=1, inplace=True)
    features_valid.drop(dropped_features, axis=1, inplace=True)

    return features_train, features_valid
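A minimal usage sketch (hypothetical toy DataFrames; note that the function above also drops the columns in place on the frames passed in):

import pandas as pd

train = pd.DataFrame({'a': [1.0, 1.0, 1.0], 'b': [0.0, 1.0, 2.0]})
valid = pd.DataFrame({'a': [1.0, 1.0, 1.0], 'b': [3.0, 4.0, 5.0]})
train_sel, valid_sel = variance_threshold(train, valid)
print(train_sel.columns.tolist())  # ['b'] -- the zero-variance column 'a' is dropped from both frames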
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X_dense, X_dense_rdd = self.make_dense_rdd()
            X_sparse, X_sparse_rdd = self.make_sparse_rdd()
            Z = DictRDD([X_sparse_rdd, X_dense_rdd], columns=('X', 'Y'))

            local.fit(X_dense)
            dist.fit(X_dense_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            local.fit(X_sparse)
            dist.fit(X_sparse_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            dist.fit(Z)
            assert_array_almost_equal(local.variances_, dist.variances_)
Example #3
    def _variance_threshold(self, input_df, threshold):
        """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        threshold: float
            The variance threshold that removes features that fall under the threshold

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the features that are above the variance threshold

        """

        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        selector = VarianceThreshold(threshold=threshold)
        try:
            selector.fit(training_features)
        except ValueError:
            # No features are above the variance threshold
            return input_df[['guess', 'class', 'group']].copy()

        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
Example #4
def select_features(x, y, methods=('variance', 'correlation', 'l1', 'forest')):
    ''' methods = ('variance', 'correlation', 'l1', 'forest')
        - variance: use a variance threshold to discard features that are mostly constant (0 or 1)
        - correlation: use a univariate test (f_regression) to keep the most predictive features
        - l1: use an l1 penalty so that uninformative features drop out of the sparse solution
        - forest: use a random forest to rank feature importances and keep the important ones
    '''
    from sklearn.feature_selection import (VarianceThreshold as VT, SelectPercentile as SP,
                                           SelectFromModel as SFM, f_regression)
    from sklearn.linear_model import MultiTaskLassoCV
    from sklearn.ensemble import RandomForestRegressor

    features = x.loc[:, 'Feature_1':'Feature_2']
    idx_list = []  # one set of selected column indices per method

    if 'variance' in methods:
        vt = VT(threshold=(0.99*(1-0.99)))
        vt.fit(features)
        idx_list.append(set(vt.get_support(indices=True)))

    if 'correlation' in methods:
        cr = SP(f_regression, percentile=80)
        cr.fit(features, y)
        idx_list.append(set(cr.get_support(indices=True)))

    if 'l1' in methods:
        rgr = MultiTaskLassoCV(cv=5, n_jobs=-1)  # expects a 2-D (multi-target) y
        m = SFM(rgr)
        m.fit(features, y)
        idx_list.append(set(m.get_support(indices=True)))

    if 'forest' in methods:
        # fit on the same feature block so the index sets are comparable
        clf = RandomForestRegressor(n_estimators=300, max_features=0.7, n_jobs=-1)
        m = SFM(clf)
        m.fit(features, y)
        idx_list.append(set(m.get_support(indices=True)))

    # keep only the features selected by every requested method
    x_indices = set(range(features.shape[1]))
    for indices in idx_list:
        x_indices = x_indices & indices
    print('All: %s' % len(x_indices))

    return list(x_indices)
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of the features to keep (non-zero variance)
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (np.float(n_features_deleted) / n_features_originally)))
    return data_frame
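Hypothetical usage on a small frame with one constant column (the numpy and pandas imports elided by the snippet are assumed):

import numpy as np
import pandas as pd

df = pd.DataFrame({'const': np.ones(5), 'varying': np.arange(5.0)})
df = remove_feat_constants(df)
print(list(df.columns))  # ['varying']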
    def variance_threshold(self, dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
        """
         Wrapper for sklearn variance threshold to for pandas dataframe
        :param dframe:
        :param columns:
        :param skip_columns:
        :param thresh:
        :param autoremove:
        :return:
        """
        logging.debug("Finding low-variance features")
        removed_features=[]
        try:
            all_columns = dframe.columns

            # remove the skip columns
            remaining_cols = all_columns.drop(skip_columns)

            # get length of new index.
            max_index = len(remaining_cols) - 1

            skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

            for idx, item in enumerate(skipped_idx):
                if item > max_index:
                    diff = item - max_index
                    skipped_idx[idx] -= diff
                if item == max_index:
                    diff = item - len(skip_columns)
                    skipped_idx[idx] -= diff
                if idx == 0:
                    skipped_idx[idx] = item

            skipped_values = dframe.iloc[:, skipped_idx].values

            X = dframe.loc[:, remaining_cols].values

            vt = VarianceThreshold(threshold=thresh)

            vt.fit(X)

            feature_indices = vt.get_support(indices=True)

            feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols) if idx in feature_indices]

            removed_features = list(np.setdiff1d(remaining_cols, feature_names))

            logging.debug("Found %d low - variance columns " % len(removed_features))

        except Exception as e:
            logging.error(e)
            logging.error("Could not remove low variance features, some thing went wrong")
            print(e)
            pass

        return dframe, removed_features
Example #7
def test_variance_threshold():
        tpot_obj = TPOT()
        non_feature_columns = ['class', 'group', 'guess']
        training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
        selector = VarianceThreshold(threshold=0)
        selector.fit(training_features)
        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

        assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0), training_testing_data[mask_cols])
Example #8
def feature_selection(train_instances):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Crossvalidation started... ')
    selector = VarianceThreshold()
    selector.fit(train_instances)
    logger.info('Number of features used... ' +
                str(Counter(selector.get_support())[True]))
    logger.info('Number of features ignored... ' +
                str(Counter(selector.get_support())[False]))
    return selector
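Because the fitted selector is returned (rather than a transformed array), it can be applied consistently to the training and test matrices; a hedged sketch with a toy array, assuming the snippet's elided imports (logging, collections.Counter, VarianceThreshold):

import numpy as np

train_instances = np.array([[1.0, 0.0, 3.0],
                            [1.0, 0.0, 4.0],
                            [1.0, 0.0, 5.0]])
test_instances = np.array([[1.0, 9.0, 6.0]])

selector = feature_selection(train_instances)         # logs: 1 feature used, 2 ignored
train_reduced = selector.transform(train_instances)   # shape (3, 1): only the varying column survives
test_reduced = selector.transform(test_instances)     # the same column is kept for the test set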
Example #9
 def _variance_threshhold(self, variance):
     '''Remove columns that do not meet the variance threshold'''
     logging.info('Removing data that has variance less than %f.' %(variance))
     vt = VarianceThreshold(variance)
     vt.fit(self.X) # XXX: Because idx should have high variance we pass all of X
     self.X = vt.transform(self.X)
     self.X_submit = vt.transform(self.X_submit)
     
     # Repeat this process for X_submit # XXX: This might not be kosher outside of competition
     vt.fit(self.X_submit)
     self.X = vt.transform(self.X)
     self.X_submit = vt.transform(self.X_submit)
Example #10
def pre_process_datasets(datasets, filter_method=None, threshold=(0, 0), normalize=True, use_cnv=False, use_mut=False):

    exp_train_data = datasets['exp_train_data']
    exp_board_data = datasets['exp_board_data']

    if use_cnv:
        cnv_train_data = datasets['cnv_train_data']
        cnv_board_data = datasets['cnv_board_data']


    if filter_method == 'cv':
        exp_cv = exp_train_data.std(1).values / exp_train_data.mean(1).values
        exp_train_data = exp_train_data.loc[exp_cv > threshold[0], :]
        exp_board_data = exp_board_data.loc[exp_cv > threshold[0], :]
        if use_cnv:
            cnv_train_data = cnv_train_data.apply(exp)
            cnv_cv = cnv_train_data.std(1).values / cnv_train_data.mean(1).values
            cnv_train_data = cnv_train_data.loc[cnv_cv > threshold[1], :]
            cnv_board_data = cnv_board_data.loc[cnv_cv > threshold[1], :]

    if filter_method == 'var':
        selector = VarianceThreshold(threshold[0])
        selector.fit(exp_train_data.values.T)
        exp_train_data = exp_train_data.loc[selector.get_support(), :]
        exp_board_data = exp_board_data.loc[selector.get_support(), :]
        if use_cnv:
            selector = VarianceThreshold(threshold[1])
            selector.fit(cnv_train_data.values.T)
            cnv_train_data = cnv_train_data.loc[selector.get_support(), :]
            cnv_board_data = cnv_board_data.loc[selector.get_support(), :]

    if use_cnv:
        feat_train_data = exp_train_data.append(cnv_train_data)
        feat_board_data = exp_board_data.append(cnv_board_data)
        print 'features after filtering', exp_train_data.shape[0], '+', cnv_train_data.shape[0], '=', feat_train_data.shape[0]
    else:
        feat_train_data = exp_train_data
        feat_board_data = exp_board_data
        print 'features after filtering', exp_train_data.shape[0]

    if use_mut:
        feat_train_data = feat_train_data.append(datasets['mut_train_data'])
        feat_board_data = feat_board_data.append(datasets['mut_board_data'])

    if normalize:
        scaler = StandardScaler().fit(feat_train_data.values.T)
        feat_train_data.values[:,:] = scaler.transform(feat_train_data.values.T).T
        feat_board_data.values[:,:] = scaler.transform(feat_board_data.values.T).T

    datasets['feat_train_data'] = feat_train_data
    datasets['feat_board_data'] = feat_board_data
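For reference, the 'cv' branch above keeps rows whose coefficient of variation (std divided by mean, computed across columns) exceeds the threshold; a minimal illustration with a hypothetical two-row expression table:

import pandas as pd

exp_train_data = pd.DataFrame([[10.0, 10.0, 10.0],   # flat profile, CV = 0
                               [1.0, 5.0, 9.0]])     # variable profile, CV = 0.8 (pandas default ddof=1)
exp_cv = exp_train_data.std(1).values / exp_train_data.mean(1).values
print(exp_train_data.loc[exp_cv > 0.2, :])           # only the second row passes a CV threshold of 0.2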
Example #11
def main():
    from sklearn.feature_selection import VarianceThreshold
    X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
    
    root = Feature('root')
    featureList = np.array([])
    for i in range(len(X[0])):
        feature = Feature('feature_%d' % i)
        root.transform('init', feature)
        featureList = np.append(featureList, feature)

    model = VarianceThreshold()
    model.fit(X)
    doWithSelector(model, featureList)
    root.printTree()
def feature_select(word,instance_dic,feature_dic, thre_hold=0.01, num_feature=100):
    instances_list = instance_dic[word]
    feature_words=feature_dic[word]
    feature_xs = []
    labels = []

    for instance in instances_list:
        label = ' '.join(instance.senseid)
        feature_x_dic = feature_vector(instance,feature_words)
        feature_vals=[]
        for word in feature_words:
            feature_vals.append(feature_x_dic[word])
        feature_xs.append(feature_vals)
        labels.append(label)

    # 1st round feature selection by removing low variance features
    sel_lowvr = VarianceThreshold(threshold=(thre_hold))
    feature_xs_selected = sel_lowvr.fit(feature_xs)
    lowvr_index = feature_xs_selected.get_support(indices=True).tolist()
    feature_xs_selected = feature_xs_selected.transform(feature_xs).tolist()



    # 2nd round feature selection using sklearn's SelectKBest()
    if num_feature < len(feature_xs_selected[0]):
        sel_chi2 = SelectKBest(chi2, k= num_feature).fit(feature_xs_selected, labels)
        chi2_index= sel_chi2.get_support(indices=True).tolist()
        #feature_xs_selected = sel_chi2.transform(feature_xs_selected).tolist()# transform from numpy array back to lis
        return lowvr_index, chi2_index
    else:
        print str(word) + ": chi2 selection not executed due to low # of features"
        return lowvr_index, [i for i in range(len(lowvr_index))]
Example #13
 def removeZeroVariance(data_frame):
     n_features_originally = data_frame.shape[1]
     selector = VarianceThreshold()
     selector.fit(data_frame)
      # Get the indices of the features to keep (non-zero variance)
     feat_ix_keep = selector.get_support(indices=True)
     orig_feat_ix = np.arange(data_frame.columns.size)
     feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
     # Delete zero variance feats from the original pandas data frame
     data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                  axis=1)
     # Print info
     n_features_deleted = feat_ix_delete.size
     print("  - Deleted %s / %s features (~= %.1f %%)" % (
         n_features_deleted, n_features_originally,
         100.0 * (np.float(n_features_deleted) / n_features_originally)))
     return data_frame
Example #14
    def test_variance_k_best_random_tree_k_fold(self):
        # Feature Selection
        samples, responses = open_model("models.obj")
        samples = np.array(samples)
        responses = np.array(responses)

        FeatureSelection = True

        if FeatureSelection:
            selection = VarianceThreshold(threshold=0.00)
            selection.fit(samples)
            idxs = selection.get_support(indices=True)
            samples = samples[:, idxs]

        samples = preprocessing.scale(samples)

        # Stratified cross-validation
        scv = StratifiedKFold(responses, n_folds=10)
        sum = 0
        for i, (train, test) in enumerate(scv):
            print('Case %d' % (i))
            # Modeling
            rdmForest = RandomForest_scikit()

            # Train
            init = time()
            rdmForest.train(samples[train, :], responses[train])

            # Test
            a, confusionPre = rdmForest.test(samples[test, :], responses[test], True)
            print('Time: %0.3fs' % (time() - init))

            for idx, fila in enumerate(confusionPre):
                for jdx, entrada in enumerate(fila):
                    if idx != jdx:
                        sum += entrada

        print("Wrong Cases: "+str(sum))
        print(' Full Case ')
        rdmForest = RandomForest_scikit()
        rdmForest.train(samples, responses)
        rdmForest.test(samples, responses, True)
    def test_same_variances(self):
        local = VarianceThreshold()
        dist = SparkVarianceThreshold()

        shapes = [((10, 5), None),
                  ((1e3, 20), None),
                  ((1e3, 20), 100),
                  ((1e4, 100), None),
                  ((1e4, 100), 600)]

        for shape, block_size in shapes:
            X, X_rdd = self.generate_dataset(shape, block_size)
            local.fit(X)
            dist.fit(X_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)

            X, X_rdd = self.generate_sparse_dataset()
            local.fit(X)
            dist.fit(X_rdd)
            assert_array_almost_equal(local.variances_, dist.variances_)
def main():
    # shape (#rows,18)
    train_users_raw = pd.read_csv('train_users_pruned.csv',delimiter=',',encoding='utf-8')
    test_users_raw = pd.read_csv('test_users.csv',delimiter=',',encoding='utf-8')


    del train_users_raw['id']
    user_id = test_users_raw['id']
    del test_users_raw['id']

    train_users_raw=train_users_raw.drop(train_users_raw.columns[[0]], axis=1)
    test_users_raw=test_users_raw.drop(test_users_raw.columns[[0]], axis=1)

    country_destination = train_users_raw['country_destination']
    del train_users_raw['country_destination']

    del train_users_raw['year_booked']
    del train_users_raw['month_booked']
    del train_users_raw['date_booked']
    del test_users_raw['year_booked']
    del test_users_raw['month_booked']
    del test_users_raw['date_booked']

    selector = VarianceThreshold(threshold=2.0)
    selector.fit(train_users_raw)
    selected_col_ind = selector.get_support(indices=True)
    selected_col_ind =  np.append(selected_col_ind, train_users_raw.shape[1]-1)
    #print selected_col_ind
    #print train_users_raw.columns.values
    # shape (#rows,11)
    train_users_downsized = train_users_raw.iloc[:, selected_col_ind]
    train_users_downsized['country_destination'] = country_destination
    print(train_users_downsized.columns.values)
    test_users_downsized = test_users_raw.iloc[:, selected_col_ind]
    test_users_downsized['id'] = user_id
    print(test_users_downsized.columns.values)

    train_users_downsized.to_csv('training_data_processed.csv', sep=',', encoding='utf-8')
    test_users_downsized.to_csv('testing_data_processed.csv', sep=',', encoding='utf-8')
# load data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
ids_tr = train.pop('id').values
ids_te = test.pop('id').values
magic_tr = train.pop('wheezy-copper-turtle-magic').values
magic_te = test.pop('wheezy-copper-turtle-magic').values
target = train.pop('target').values
train = train.values
test = test.values

# informative columns for each magic value
vt = VarianceThreshold(threshold=1.5)
infomative_cols = []
for i in range(MAX_MAGIC_NO):
    vt.fit(train[magic_tr == i])
    infomative_cols.append(vt.get_support(indices=True))

### Step-1 ###
oof_all = []
pred_all = []
for n in range(1, MAX_COMPONENTS + 1):
    oof_n = np.zeros(len(train))
    pred_n = np.zeros(len(test))
    gmm0 = GaussianMixture(n_components=n,
                           covariance_type='full',
                           random_state=RANDOM_SEED)
    gmm1 = GaussianMixture(n_components=n,
                           covariance_type='full',
                           random_state=RANDOM_SEED)
    for i in range(MAX_MAGIC_NO):
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Load the data
iris = datasets.load_iris()

# Create the features and target
features = iris.data
target = iris.target

# Create a VarianceThreshold object
thresholder = VarianceThreshold(threshold=.5)

# Create the high-variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# Show the high-variance feature matrix
print(features_high_variance[0:3])

# Show the variances
print(thresholder.fit(features).variances_)

# Standardize the feature matrix
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Calculate the variance of each feature
selector = VarianceThreshold()
print(selector.fit(features_std).variances_)
Example #19
def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
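Hypothetical usage (the pandas and VarianceThreshold imports elided by the snippet are assumed):

import pandas as pd

df = pd.DataFrame({'low_var': [0.0, 0.0, 0.0, 0.1],
                   'high_var': [1.0, 3.0, 5.0, 7.0]})
print(variance_threshold_selector(df, threshold=0.5).columns.tolist())  # ['high_var']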
Example #20
########
# Main #
########

if __name__ == '__main__':
    # Load training and test set
    LS = utils.load_from_csv(TRAINING_SET)
    TS = utils.load_from_csv(TEST_SET)

    # Create fingerprint features and output of learning set
    X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT)
    y_LS = LS['ACTIVE'].values

    # Variance threshold (feature selection)
    selector = VarianceThreshold()
    selector.fit(X_LS)
    X_LS = selector.transform(X_LS)

    # Cross validation score
    cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    scores = cross_val_score(MODEL, X_LS, y_LS, cv=cv, scoring='roc_auc')

    # Estimated AUC
    AUC = scores.mean()

    # Train model
    MODEL.fit(X_LS, y_LS)

    # Create fingerprint features of test set
    X_TS = fingerprints.transform(TS['SMILES'].values, FINGERPRINT)
    X_TS = selector.transform(X_TS)
def remove_features_with_low_variance(x_data):
    variance = VarianceThreshold(threshold=1.4)
    print ('before transform', len(x_data[4]), x_data[4])
    variance.fit(x_data)
    transformed_x = variance.transform(x_data)
    print ('after transform', len(transformed_x[4]), transformed_x[4])
def get_removed_feats(df, model):
    return df.columns.values[1:][~model.get_support()]

def update_df(df, removed_descriptors, inplace=True):
    if inplace:
        df.drop(removed_descriptors, axis=1, inplace=True)
        # print(df.shape)
        return df
    else:
        new_df = df.drop(removed_descriptors, axis=1, inplace=False)
        # print(new_df.shape)
        return new_df

# find the names of the columns with zero variance
var_sel = VarianceThreshold()
var_sel.fit(df.iloc[:,1:])
removed_descriptors = get_removed_feats(df, var_sel)

# update the data frame
update_df(df, removed_descriptors)

# correlation filter
def find_correlated(data):
    correlation_matrix = data.iloc[:,1:].corr(method='spearman')
    removed_descs = set()
    all_descs = correlation_matrix.columns.values
    for label in all_descs:
        if label not in removed_descs:
            correlations_abs = correlation_matrix[label].abs()
            mask = (correlations_abs > 0.7).values
            to_remove = set(all_descs[mask])
# ## Variance based thresholding

# In[3]:

df = pd.read_csv('datasets/Pokemon.csv')
poke_gen = pd.get_dummies(df['Generation'])
poke_gen.head()


# In[4]:

from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=.15)
vt.fit(poke_gen)


# In[5]:

pd.DataFrame({'variance': vt.variances_,
              'select_feature': vt.get_support()},
            index=poke_gen.columns).T


# In[6]:

poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head()
poke_gen_subset

Example #24
x_train_um = x_train_1[x_train_1['fl_severidade'] == 1]
x_train_zero_down = resample(x_train_zero,
                             replace=True,
                             n_samples=len(x_train_um),
                             random_state=123)
x_train_1 = pd.concat([x_train_zero_down, x_train_um])
print(x_train_1[x_train_1['fl_severidade'] == 0].count())
y_train_1 = x_train_1['fl_severidade']
x_train_1 = x_train_1.drop('fl_severidade', axis=1)

#################################################################
# REMOVING CONSTANT VARIABLES
##################################################################
vl_limpa_const = 0.1  # variables with 99% of the fields repeated
limpa_const = VarianceThreshold(threshold=vl_limpa_const)
limpa_const.fit(amostra_paci)
vars_const = [
    v for v in amostra_paci.columns
    if v not in amostra_paci.columns[limpa_const.get_support()]
]
qt_var = len([
    v for v in amostra_paci.columns
    if v not in amostra_paci.columns[limpa_const.get_support()]
])

print('There are {} constant variables at threshold {}'.format(
    qt_var, vl_limpa_const))
print('Constant variables at threshold {}'.format(vl_limpa_const))
print(vars_const)
d = {'vars_const': vars_const}
df = pd.DataFrame(data=d)
Example #25
def variance_threshold_selector(data, threshold):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    variances = selector.variances_
    print(variances)
    return variances, data[data.columns[selector.get_support(indices=True)]]
    def fs_attack(self, clf, do_vt=None, do_rfe=None, verbose=None):
        """
        :param clf: classifier
        :param do_vt: do variance thresholding
        :param do_rfe: do recursive feature selection
        :return: [auc, auc_lv, auc_rfe] always 3 values. if no features were removed, the regular auc repeats.

        """

        retarr = []

        train_ = pd.read_csv(self.out_datapath + self.train_fname, index_col=0)
        test_ = pd.read_csv(self.out_datapath + self.test_fname, index_col=0)

        X_train, y_train = train_.iloc[:, 2:-3].values, train_[
            self.attribute].values
        X_test, y_test = test_.iloc[:,
                                    2:-3].values, test_[self.attribute].values

        clf.fit(X_train, y_train)
        pred_ = clf.predict(X_test)
        auc = roc_auc_score(y_test, pred_)

        if auc >= 0.5:
            print(self.vf_fname + ',', auc)
        else:
            print(self.vf_fname + ',', 1 - auc)

        retarr.append(auc)

        if do_vt:

            sel = VarianceThreshold()
            sel.fit(X_train)

            #print (sel.variances_)
            X_train_lv = sel.transform(X_train)
            #print(sel.get_support(indices=True))

            if (X_train.shape[1] > X_train_lv.shape[1]):

                if verbose:
                    print("X_train.shape[1], X_train_lv.shape[1]",
                          X_train.shape[1],
                          X_train_lv.shape[1])  # , X_test_lv.shape)

                X_test_lv = sel.transform(X_test)
                clf.fit(X_train_lv, y_train)
                pred_ = clf.predict(X_test_lv)
                auc_lv = roc_auc_score(y_test, pred_)

                if auc_lv >= 0.5:
                    print(self.vf_fname + '_lv,', auc_lv)
                else:
                    print(self.vf_fname + '_lv,', 1 - auc_lv)

                X_train = X_train_lv
                X_test = X_test_lv

                retarr.append(auc_lv)

            else:
                retarr.append(retarr[-1])

        if do_rfe:

            if not hasattr(clf, 'score'):

                print(
                    "WARNING! The classifier passed should have a 'score' method for RFE! You are probably using BinaryDNN! RFE will be skipped!"
                )
                retarr.append(retarr[-1])

            else:

                if X_train.shape[1] <= 14:  # too few features
                    if verbose:
                        print("too few features, skipping RFE")
                    retarr.append(retarr[-1])

                else:
                    selector = RFECV(clf, step=1, cv=5, n_jobs=-2)
                    selector.fit(X_train, y_train)

                    if (selector.n_features_ < X_train.shape[1]):

                        if verbose:
                            print(selector.n_features_,
                                  " feats selected out of", X_train.shape[1])

                        X_train_fe = selector.transform(X_train)
                        X_test_fe = selector.transform(X_test)

                        clf.fit(X_train_fe, y_train)
                        pred_ = clf.predict(X_test_fe)
                        auc_fe = roc_auc_score(y_test, pred_)

                        if auc_fe >= 0.5:
                            print(self.vf_fname + '_lv_fe,', auc_fe)
                        else:
                            print(self.vf_fname + '_lv_fe,', 1 - auc_fe)

                        retarr.append(auc_fe)

                    else:  # if nothing was removed

                        retarr.append(retarr[-1])

        return retarr
        """
Example #27
def remove_low_variance_features(input_df, thres=0):
    sel = VarianceThreshold(threshold=thres)
    sel.fit(input_df)
    index = np.where(sel.variances_ > thres)[0]
    return input_df.iloc[:, index]
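Hypothetical usage (the numpy, pandas, and VarianceThreshold imports elided by the snippet are assumed):

import pandas as pd

df = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 3]})
print(remove_low_variance_features(df).columns.tolist())  # ['b'] -- the constant column is filtered out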
Example #28
    target_np=np.ravel(target_np_bin)
    

#############################################################################
#
# Feature Selection
#
##########################################

#Low Variance Filter
if lv_filter==1:
    print('--LOW VARIANCE FILTER ON--', '\n')
    
    #LV Threshold
    sel = VarianceThreshold(threshold=0.5)                                      #Removes any feature with variance below 0.5
    fit_mod=sel.fit(data_np)
    fitted=sel.transform(data_np)
    sel_idx=fit_mod.get_support()

    #Get lists of selected and non-selected features (names and indexes)
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data_np[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            temp.append(header[i+feat_start])
            temp_idx.append(i)
        else:                                                                       #Indexes of non-selected features get added to delete array
            temp_del.append(i)

    print('Selected', temp)
Example #29
def preprocess(X, LB, datasets, use_mut, use_CNV, use_exp, exp_threshold, use_methyl, use_cell_info, scale):
    """Preprocesses data"""

    # Remove COMBINATION_ID column
    # Remove CELL_LINE column
    X = X.drop("COMBINATION_ID", 1)
    LB = LB.drop("COMBINATION_ID", 1)

    if use_mut:
        mut = datasets["mut_data"]
        X = (
            X.reset_index().merge(mut, how="left", on="CELL_LINE", sort=False).set_index("index")
        )  # to preserve original order
        LB = LB.reset_index().merge(mut, how="left", on="CELL_LINE", sort=False).set_index("index")

    if use_CNV:
        cnv = datasets["cnv_data"]
        X = X.reset_index().merge(cnv, how="left", on="CELL_LINE", sort=False).set_index("index")
        LB = LB.reset_index().merge(cnv, how="left", on="CELL_LINE", sort=False).set_index("index")

    if use_exp:
        gex = datasets["gex_data"]
        col1 = gex.loc[:, "CELL_LINE"]
        exp_data = gex.iloc[:, 1:]
        # Need to impute missing values (because of the added CCLE data) before being able to filter
        imp = preprocessing.Imputer(strategy="median")
        exp_data = pd.DataFrame(data=imp.fit_transform(exp_data.values), columns=list(exp_data.columns.values))
        # Filter by variance
        filt = VarianceThreshold(exp_threshold)
        filt.fit(exp_data.values)
        gex = pd.concat(
            [col1, exp_data.loc[:, filt.get_support()]], axis=1
        )  # gex dataframe, now without missing values and filtered by variance
        X = X.reset_index().merge(gex, how="left", on="CELL_LINE", sort=False).set_index("index")
        LB = LB.reset_index().merge(gex, how="left", on="CELL_LINE", sort=False).set_index("index")

    if use_methyl:
        X = X.reset_index().merge(datasets["methyl_data"], how="left", on="CELL_LINE", sort=False).set_index("index")
        LB = LB.reset_index().merge(datasets["methyl_data"], how="left", on="CELL_LINE", sort=False).set_index("index")

    if use_cell_info:
        X = X.reset_index().merge(datasets["cell_data"], how="left", on="CELL_LINE", sort=False).set_index("index")
        LB = LB.reset_index().merge(datasets["cell_data"], how="left", on="CELL_LINE", sort=False).set_index("index")

    X = X.sort_index(axis=0)
    LB = LB.sort_index(axis=0)

    # Remove CELL_LINE column
    X = X.drop("CELL_LINE", 1)
    LB = LB.drop("CELL_LINE", 1)
    #     X = X.drop(['COMPOUND_A', 'COMPOUND_B'], 1)
    #     LB = LB.drop(['COMPOUND_A', 'COMPOUND_B'], 1)

    # Encode categorical data
    obj_cols = list(X.select_dtypes(include=["object"]).columns)
    col_names = list(X.columns.values)
    last_col = col_names.index(col_names[-1])
    X = pd.get_dummies(X, columns=obj_cols)
    LB = pd.get_dummies(LB, columns=obj_cols)
    missing_classes_X = list(set(list(LB.columns.values)[last_col:]).difference(set(list(X.columns.values)[last_col:])))
    missing_classes_LB = list(
        set(list(X.columns.values)[last_col:]).difference(set(list(LB.columns.values)[last_col:]))
    )
    X = pd.concat(
        [
            X,
            pd.DataFrame(
                data=np.zeros((len(list(X.index.values)), len(missing_classes_X))),
                index=X.index,
                columns=missing_classes_X,
            ),
        ],
        axis=1,
    )
    LB = pd.concat(
        [
            LB,
            pd.DataFrame(
                data=np.zeros((len(list(LB.index.values)), len(missing_classes_LB))),
                index=LB.index,
                columns=missing_classes_LB,
            ),
        ],
        axis=1,
    )

    #     # Remove features with more than 50% NaN
    #     keep_features = []
    #     nan = X.isnull().sum()/len(X.index)
    #     for col_name in X:
    #         if nan[col_name] <= 0.5:
    #             keep_features.append(col_name)
    #     X = X.loc[:, keep_features]
    #     LB = LB.loc[:, keep_features]
    keep_features = list(X.columns.values)

    # Impute missing values
    imp = preprocessing.Imputer(missing_values="NaN", strategy="median")
    X = imp.fit_transform(X.values)
    LB = imp.fit_transform(LB.values)

    # Remove features with zero variance
    filt = VarianceThreshold()
    filt.fit(X)
    X = X[:, filt.get_support()]
    LB = LB[:, filt.get_support()]
    keep_features = [keep_features[i] for i in xrange(len(keep_features)) if list(filt.get_support())[i]]

    if scale:
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)
        LB = scaler.transform(LB)

    return X, LB, keep_features
# Thresholding Numerical Feature Variance

# removing features with low variance by selecting a subset of features with variances above a given threshold

iris = datasets.load_iris()
features = iris.data
target = iris.target
# Create thresholder
thresholder = VarianceThreshold(threshold=.5)
# Create high variance feature matrix
features_high_variance = thresholder.fit_transform(features)
# View high variance feature matrix
print(features_high_variance[0:3])
# We can see the variance for each feature using variances_:
print(thresholder.fit(features).variances_)

# VT first calculates the variance of each feature, then it drops those whose variance does not meet the threshold.
# If the features have been standardized (mean zero and unit variance), variance thresholding will not work,
# because every feature then has the same unit variance.

# Thresholding Binary Feature Variance

# You have a set of binary categorical features and want to remove those with low variance.
# We select a subset of features with a Bernoulli random variable variance above a given threshold.

# Create feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features = [[0, 1, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0]]
# Run threshold by variance
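The snippet is cut off here; one common way to finish it (a sketch, assuming we want to drop binary features where the majority class covers 75% or more of the samples, i.e. threshold = p(1 - p) with p = 0.75, applied to the features list defined just above):

from sklearn.feature_selection import VarianceThreshold

thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
print(thresholder.fit_transform(features))  # only Feature 2 (the 60/40 split) clears the threshold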
Example #31
print('PERCENTAGE OF THE DATASET EXCLUDED:', (1 - df[col].count() / 80000) * 100, '%')


"""

4º) LIMPEZA DE VARIÁVEIS CONSTANTES 

EXEMPLO: 99% DE VARIÁVEIS PREENCHIDAS COM 0 OU NULL DEVEM SER EXCLUÍDAS UMA VEZ QUE NÃO SÃO SIGNIFICATIVAS NO MODELO
              
"""

from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold = 0.01)
var_thres.fit(df)
var_thres.get_support()

constant_columns = [column for column in df.columns
                    if column not in df.columns[var_thres.get_support()]]


df = df.drop(constant_columns, axis=1)

print('NUMBER OF CONSTANT VARIABLES REMOVED:', len(constant_columns))


"""

5) SELECTING THE BEST VARIABLES FOR THE MODEL
# For feature selection I have a few ideas: 1) run feature selection over the whole matrix of features;
# 2) remove some of the recordings and repeat the selection a few times (manual k-folding), because if the
# same features are removed every time, we know those features really are unhelpful (see the sketch below).
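A hedged sketch of idea 2 (names and parameters here are illustrative, not from the original project): refit the selector on several random subsets of the recordings and keep only the features that every subset flags as low-variance.

import numpy as np
from sklearn.feature_selection import VarianceThreshold

def stable_low_variance(X, threshold=0.5, n_rounds=5, frac=0.8, seed=0):
    rng = np.random.RandomState(seed)
    dropped = None
    for _ in range(n_rounds):
        # fit on a random subset of the rows (recordings)
        idx = rng.choice(len(X), size=int(frac * len(X)), replace=False)
        sel = VarianceThreshold(threshold).fit(X[idx])
        flagged = set(np.where(~sel.get_support())[0])
        dropped = flagged if dropped is None else dropped & flagged
    return sorted(dropped)  # column indices flagged as low-variance in every round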


xtrain_aud = sio.loadmat('xtrain_all_aud.mat')
xtrain_aud = xtrain_aud['xtrain']
ytrain_aud = sio.loadmat('ytrain_all_aud.mat')
ytrain_aud = ytrain_aud['ytrain']

# method 1: variance threshold

Var_selector = VarThresh(.5)
# without any parameters VarianceThreshold defaults to a threshold of 0, i.e. it only removes features
# whose values are all exactly the same; here a threshold of .5 is used
Var_selector.fit(xtrain_aud)
which_feats = Var_selector.get_support()
x_aud_fitted = Var_selector.transform(xtrain_aud)

print x_aud_fitted.shape


xtrunclength = sio.loadmat('xtrunclength.mat')
xtrunclength = xtrunclength['xtrunclength']

xtesting = sio.loadmat('xtesting.mat')
xtesting = xtesting['xtesting']

xtesting = xtesting[~np.isnan(xtesting).any(axis=1),:]
xtesting = xtesting[~np.isinf(xtesting).any(axis=1),:]
Example #34
        y[categorical_ix]) * 1  # use "* 1" to convert it into int
    results_array[numerical_ix] = np.abs(
        x[numerical_ix] - y[numerical_ix]) / norm_range[numerical_ix]
    return np.sum(np.square(results_array))


tidy_data = pd.read_csv('tidy.csv')
X_data = tidy_data.drop(['CHT_No', 'reStroke'], axis=1)
X_data = X_data.drop(['Adm_AF_0otEKG', 'EKG_AF', 'Adm_AntiCO'],
                     axis=1)  # highly related to AF
y_data = tidy_data[['reStroke']]

# remove constant features
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0)
selector.fit(X_data)
X_data = X_data[X_data.columns[selector.get_support(indices=True)]]

categorical_ix = np.array(
    [0, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 41, 42, 43, 44, 46, 47, 48, 49, 51])
categorical_columns = X_data.columns[categorical_ix].values
numerical_columns = np.setdiff1d(X_data.columns, categorical_columns)
numerical_ix = np.array([X_data.columns.get_loc(c) for c in numerical_columns])

X_data[numerical_columns] = StandardScaler().fit_transform(
    X_data[numerical_columns])
norm_range = np.array(
    np.nanmax(X_data.values, axis=0) - np.nanmin(X_data.values, axis=0))

heom_metric = distython.HEOM(X_data, categorical_ix, nan_equivalents=[np.nan])
reducer = umap.UMAP(metric=heom_metric.heom, random_state=369)
Example #35
def model_xgb(features_train, labels_train, features_test):
	
	# Remove constant features
	selector_vt = VarianceThreshold()
	selector_vt.fit(features_train)

	# Get the indices of the features to keep (non-constant features)
	features_kept = selector_vt.get_support(indices=True)
	orig_features = np.arange(features_train.columns.size)
	features_deleted = np.delete(orig_features, features_kept)
	#print ("Indices of deleted features:", features_deleted)
	print ("- Number of constant features removed:", len(features_deleted))

	# Delete zero variance features from train and test sets
	features_train = features_train.drop(labels=features_train.columns[features_deleted], axis=1)
	features_test = features_test.drop(labels=features_test.columns[features_deleted], axis=1)
	#print (features_train.shape, features_test.shape)


	"""
	# Another way of removing constant features. Slightly slower than the above method
	# Count the number of unique values in each feature
	nuniques_train = features_train.apply(lambda x:x.nunique())
	no_variation_train = nuniques_train[nuniques_train==1].index
	features_train = features_train.drop(no_variation_train, axis=1)

	features_test = features_test.drop(no_variation_train, axis=1)
	print (features_train.shape, features_test.shape)
	"""
	
	# Remove identical features
	features_deleted = []
	
	# Find the names of identical features by going through all the combinations of features
	for f1, f2 in itertools.combinations(iterable=features_train.columns, r=2):
		if np.array_equal(features_train[f1], features_train[f2]):
			features_deleted.append(f2)
	features_deleted = np.unique(features_deleted)

	# Delete the identical features
	features_train = features_train.drop(labels=features_deleted, axis=1)
	features_test = features_test.drop(labels=features_deleted, axis=1)
	print ("- Number of idential features removed:", len(features_deleted))

	# Add a column to count the number of zeros per row
	features_train['n0'] = (features_train == 0).sum(axis=1)
	features_test['n0'] = (features_test == 0).sum(axis=1)


	# Feature normalization
	f_train_normalized = normalize(features_train, axis=0)
	f_test_normalized = normalize(features_test, axis=0)

	# Do PCA
	print ("- Do PCA")
	pca = PCA(n_components=2)
	f_train_pca = pca.fit_transform(f_train_normalized)
	features_train['PCA1'] = f_train_pca[:,0]
	features_train['PCA2'] = f_train_pca[:,1]
	
	f_test_pca = pca.transform(f_test_normalized)  # reuse the PCA fitted on the training data
	features_test['PCA1'] = f_test_pca[:,0]
	features_test['PCA2'] = f_test_pca[:,1]

	# Feature selection
	#p = 75, AUC = 0.834348
	p = 70 # AUC = 0.834820
	#p = 65, AUC = 
	print ("- Do feature selection")
	f_train_binarized = Binarizer().fit_transform(scale(features_train))
	selector_chi2 = SelectPercentile(chi2, percentile=p).fit(f_train_binarized, labels_train)
	selected_chi2 = selector_chi2.get_support() # a list of True/False to indicate if a feature is selected or not
	#selected_chi2_features = [f for i, f in enumerate(features_train.columns) if selected_chi2[i]]
	#print (selected_chi2_features)

	select_f_classif = SelectPercentile(f_classif, percentile=p).fit(f_train_binarized, labels_train)
	selected_f_classif = select_f_classif.get_support() # a list of True/False to indicate if a feature is selected or not
	#selected_f_classif_features = [f for i, f in enumerate(features_train.columns) if selected_f_classif[i]]
	#print (selected_f_classif_features)

	selected = selected_chi2 & selected_f_classif
	selected_features = [f for i, f in enumerate(features_train.columns) if selected[i]]
	#print (selected_features)

	features_train = features_train[selected_features]
	features_test = features_test[selected_features]

	
	# xgboost
	print ("- Perform xgboost")
	params = { 
	"objective": "binary:logistic",
	"silent": 1,
	"eval_metric": "auc",
	"eta": 0.03, # tried 0.01
	"subsample": 0.5, # tried 1.0, 0.4
	"colsample_bytree": 0.7, # tried 0.5, 0.9
	"max_depth": 2  # 2-->AUC=0.836347; 5 --> AUC=0.835131; 7 -> AUC=0.834351
	#"min_child_weight": 1, # tried 2 & 5
	#"gamma": 0 # tried 4
	}

	train_xgb = xgb.DMatrix(features_train, labels_train)
	test_xgb  = xgb.DMatrix(features_test)
	clf = xgb.train(params, train_xgb, num_boost_round=500) # tried 400, 500, 600

	# Get the importances of features, returning pairs of features and their importances
	importance = clf.get_fscore() 

	# Sort features by importance, and return the top features only
	# 'key' parameter specifies a function to be called on each list element prior to making comparisons
	# itemgetter(1) returns importances, itemgetter(0) returns features
	sorted_importance = sorted(importance.items(), key=operator.itemgetter(1))[-15:]
	#print (sorted_importance)

	# Put pairs of features and their importances into a DataFrame for plotting
	df_importance = pd.DataFrame(sorted_importance, columns=['feature', 'fscore'])

	# Plot the importance of features, which is useful for data exploration phase
	df_importance.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 6))
	plt.title('XGBoost Feature Importance')
	plt.xlabel('feature importance')
	plt.gcf().savefig('feature_importance_xgb.png')
	#plt.show() # if putting show() before gcf().savefig, the figure won't be saved

	return clf.predict(test_xgb)
def FeatureSelection(x):
    selector = VarianceThreshold()
    selector.fit(x)
    return selector.get_support(True)
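Hedged usage sketch (x is assumed to be a NumPy array; the VarianceThreshold import elided by the snippet is assumed):

import numpy as np

x = np.array([[1.0, 0.0, 2.0],
              [1.0, 0.0, 3.0],
              [1.0, 0.0, 4.0]])
kept = FeatureSelection(x)   # indices of the non-constant columns, here array([2])
x_reduced = x[:, kept]       # shape (3, 1)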
Example #37
    gs.fit(X_all, y_all)

    scores_post = cross_val_score(gs.best_estimator_, X_all, y_all, scoring=scorer, cv=5)

    print '*' * 20
    print clf.__class__.__name__
    print "Params %s" % gs.best_params_
    print "Score %.4f" % gs.best_score_
    return gs.best_estimator_




from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(.5)
result = sel.fit(X_all)
remove = []
for x in xrange(0, len(sel.variances_)):
    if(sel.variances_[x] < 0.0):
        remove.append(X_all.dtypes.index[x])
for a in remove:
    print "Removing %s" % a
    del X_all[a]


good_cols = ['failures',
            'absences',
            #'schoolsup',
            #'goout',
            # 'paid',
            # 'guardian_other',
Example #38
def delete_low_variance(x_train, x_test):
    low_var = VarianceThreshold(threshold=0.1)
    low_var.fit(x_train)
    x_train, x_test = low_var.transform(x_train), low_var.transform(x_test)
    return (x_train, x_test)
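Hypothetical usage with toy arrays (only the second column has variance above 0.1; the VarianceThreshold import elided by the snippet is assumed):

import numpy as np

x_train = np.array([[0.0, 1.0], [0.05, 5.0], [0.0, 9.0]])
x_test = np.array([[0.1, 2.0]])
x_train, x_test = delete_low_variance(x_train, x_test)
print(x_train.shape, x_test.shape)  # (3, 1) (1, 1)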
Example #39
print("Classification report: ")

print(classification_report(Y_test, Y_pred))

accuracy_score = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ", accuracy_score)

# ** Variance Threshold**

# In[45]:

X = adult_df_rev.values[:, :-1]
Y = adult_df_rev.values[:, -1]

# In[46]:

from sklearn.feature_selection import VarianceThreshold

# In[53]:

#scaling required
vt = VarianceThreshold()
fit1 = vt.fit(X, Y)
print(fit1.variances_)

features = fit1.transform(X)
print(features)
print(features.shape[1])
print(list(zip(colname, fit1.get_support())))
# import some data to play with
iris = datasets.load_iris()

# Create features and target
features = iris.data
target = iris.target

# Create thresholder
thresholder = VarianceThreshold(threshold=.5)

# Create high variance feature matrix
features_high_variance = thresholder.fit_transform(features)

# View high variance feature matrix
features_high_variance[0:3]

# View variances
thresholder.fit(features).variances_

# Load library
from sklearn.preprocessing import StandardScaler

# Standardize feature matrix
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

# Calculate variance of each feature
selector = VarianceThreshold()
selector.fit(features_std).variances_
                                                    y_donate,
                                                    train_size=0.70,
                                                    random_state=123)
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y_donate, train_size=0.70, random_state=123)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train_scaled.shape, X_test_scaled.shape, y_train.shape, y_test.shape)

# ### IV-2. Feature Selection Using Variance Threshold

# In[41]:

# Remove all features whose variance is less than 0.05
var_selector = VarianceThreshold(threshold=0.05)
var_selector.fit(X_train_scaled)
indices_selected = var_selector.get_support(indices=True)
colnames_vtselected = [X_train_scaled.columns[i] for i in indices_selected]
print(colnames_vtselected)
len(colnames_vtselected)

# ### IV-3. Further Feature Selection using RFECV

# In[42]:

# Specify the model
estimator = LogisticRegression(
)  # estimator for RFE, select the suitable model

# Select variables using RFECV
rfe_selector = RFECV(estimator, step=1, cv=5, n_jobs=-1, scoring='roc_auc')
Example #42
rbscale = preprocessing.RobustScaler(quantile_range=(25, 75)).fit(ModelSample[FeatureList])
RbIndex = rbscale.transform(ModelSample[FeatureList])
RbIndex = pd.DataFrame(RbIndex,columns=ModelSample[FeatureList].columns,index=ModelSample[FeatureList].index)

RbIndexCorr = RbIndex.corr()
RbIndexCorr.to_csv('RbIndexCorr.csv',encoding='utf8')
RbIndex['dis_index'] = ModelSample['dis_index']

#################Index Selection##################
######1#####
#VarTIndexSelect = ModelSample.drop(['con_index','com_index','dis_index'],axis=1)
#VarTYSelect = ModelSample[['con_index','com_index','dis_index']]

from sklearn.feature_selection import VarianceThreshold
VarT = VarianceThreshold(threshold=(0.8*(1-0.8)))
VarTvari = VarT.fit(RbIndex[FeatureList]).variances_
VarTvari = pd.DataFrame(VarTvari,index=RbIndex[FeatureList].columns)
VarTvari.columns = ['variances']
VarTFeatureList = VarTvari[VarTvari.variances>=0.2].index.tolist()
VarTFeatureList.append('dis_index')
VarTSample = ModelSample[VarTFeatureList]
VarTFeatureList.pop()

#####2#####
#UniIndexSelect = IndexVarT.drop(['con_index','com_index','dis_index'],axis=1)
#UniYSelect = IndexVarT[['con_index','com_index','dis_index']]

from sklearn.feature_selection import f_classif, mutual_info_classif,chi2
mi = mutual_info_classif(VarTSample[VarTFeatureList],VarTSample.dis_index)
mi = pd.Series(mi,index=VarTSample[VarTFeatureList].columns)
ff,fp = f_classif(VarTSample[VarTFeatureList],VarTSample.dis_index)
Example #43
from sklearn.feature_selection import VarianceThreshold

# need to decide on a variance threshold first
vt = VarianceThreshold(threshold=0.005)   # or 0.003

# numeric data only
ansur_male_num = ansur_male.select_dtypes(include='number')

# need to normalize all features by dividing them by their mean
normalized_df = ansur_male_num / ansur_male_num.mean()

# inspect the variances after normalization
normalized_df.var()

_ = vt.fit(normalized_df)  # fit the selector on the normalized data

mask = vt.get_support()
ansur_male_num = ansur_male_num.loc[:, mask]


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=1121218)

# Init, fit, score
forest = RandomForestRegressor(random_state=1121218)

_ = forest.fit(X_train, y_train)

# Training Score
print(f"Training Score: {forest.score(X_train, y_train)}")
print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train))

# 93.3% accuracy on test set vs. 94.9% on training set

# Wow, what just happened!? On the full dataset the model is rubbish but with a single feature we can make good predictions? This is an example of the curse of dimensionality! The model badly overfits when we feed it too many features. It overlooks that neck circumference by itself is pretty different for males and females.

# Features with missing values or little variance

# Low variance features are so similar between different observations that they may contain little information we can use in an analysis
# To remove them we can use
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=1) # set the minimal variance threshold

# Fit the selector to our dataset
sel.fit(ansur_df)

mask = sel.get_support() # This will give us a TRUE or FALSE value on whether each feature's variance is above the threshold or not

# loc method and specify we want to select all rows using a colon for the first argument and subselect the columns by using our mask as the second
reduced_df = ansur_df.loc[:, mask]

# Normalize the variance before using it for feature selection. To do so we divide each column by its mean value before fitting the selector
sel = VarianceThreshold(threshold=0.005)

sel.fit(ansur_df / ansur_df.mean())

# After normalisation the variances in the dataset will be lower.

# When we apply the selector to our dataset the number of features is more than halved, to 45
Example #45
    def demo(self):
        with open('abalone.data', 'r') as open_file:
            abalone = open_file.read()
            abalone = abalone.strip()
            abalone = re.split('[\n ,]', abalone)
            #print abalone

            for index in range(len(abalone)):

                if abalone[index] == 'M':
                    abalone[index] = '0'
                elif abalone[index] == 'F':
                    abalone[index] = '1'
                elif abalone[index] == 'I':
                    abalone[index] = '2'

            abalone = [abalone[i:i + 9] for i in range(0, len(abalone), 9)]
            abalone = np.array(abalone, dtype=float)
            X = np.delete(abalone, [0], axis=1)
            y = abalone.T[0]

            # feature selection
            # VarianceThreshold
            sel = VarianceThreshold(threshold=1)
            sel.fit(X, y)
            scores1 = sel.variances_
            index1 = np.argsort(scores1)
            n = index1[:-4]
            X_new_1 = np.delete(X, [n], axis=1)

            # SelectKBest
            skb = SelectKBest(chi2, k=3)
            skb.fit(X, y)
            scores2 = skb.scores_
            index2 = np.argsort(scores2)
            n = index2[:-4]
            X_new_2 = np.delete(X, [n], axis=1)

            # L1
            lsvc = LinearSVC(C=0.043, penalty="l1", dual=False)
            lsvc.fit(X, y)
            model = SelectFromModel(lsvc, prefit=True)
            X_new_3 = lsvc.transform(X)
            scores3 = lsvc.coef_
            np.abs(scores3)
            index3 = np.argsort(scores3)

            # tree
            clf = ExtraTreesClassifier()
            clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)
            scores4 = clf.feature_importances_
            index4 = np.argsort(scores4)
            n = index4[:-4]
            X_new_4 = np.delete(X, [n], axis=1)

            # pipline
            clf = Pipeline([('feature_selection',
                             SelectFromModel(LinearSVC(penalty="l2"))),
                            ('classification', RandomForestClassifier())])
            clf.fit(X, y)

            X = PolynomialFeatures(
                interaction_only=True).fit_transform(X_new_1).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10,
                             shuffle=False).fit(X_new_1, y)
            clf.predict(X_new_1)
            score1 = clf.score(X_new_1, y)
            X = PolynomialFeatures(
                interaction_only=True).fit_transform(X_new_2).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10,
                             shuffle=False).fit(X_new_2, y)
            clf.predict(X_new_2)
            score2 = clf.score(X_new_2, y)
            X = PolynomialFeatures(
                interaction_only=True).fit_transform(X_new_3).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10,
                             shuffle=False).fit(X_new_3, y)
            clf.predict(X_new_3)
            score3 = clf.score(X_new_3, y)
            X = PolynomialFeatures(
                interaction_only=True).fit_transform(X_new_4).astype(float)
            clf = Perceptron(fit_intercept=False, n_iter=10,
                             shuffle=False).fit(X_new_4, y)
            clf.predict(X_new_4)
            score4 = clf.score(X_new_4, y)
            print score1, score2, score3, score4
            # 0.385683504908 0.385683504908 0.386641129998 0.531242518554
            #0.385683504908 0.385683504908 0.386641129998 0.403878381614
            #0.385683504908 0.385683504908 0.386641129998 0.456787167824
            #0.385683504908 0.385683504908 0.386641129998 0.531481924826
            #0.385683504908 0.385683504908 0.386641129998 0.427100790041

            #fig, ax = plt.subplots()
            '''fig = plt.figure(1)
            ax2 = fig.add_subplot(311)
            ax3 = fig.add_subplot(312)
            ax4=fig.add_subplot(313)

            y1=[]
            y2=[]
            y3=[]

            for i in range(8):
                x_1= np.linspace(i,8,1)
                x_2=np.linspace(i,9,1)
                x_3=np.linspace(i,10,1)


                y1=scores_1[i]
                y2=scores_2[i]
                y3=scores_3[i]'''
            '''ax.cla()
                ax.set_title("festure selection")
                ax.set_xlabel("features")
                ax.set_ylabel("scores")
                ax.set_xlim(0, 10)
                ax.grid()
                ax.plot(y1, 'r^',label='Varience')
                ax.plot(y2, 'k^' ,label='selectbeskt')
                ax.plot(y3, 'bs',label='tree')
                ax.legend(loc='best')
                

                
                ax2.set_title("Varience")
                ax3.set_title("SelectKBest")
                ax4.set_title("ExtraTreesClassifier")

                ax2.set_xlabel("features")
                ax2.set_ylabel("scores")
                ax2.set_xlim(-1,10,1)
                n1=ax2.plot(x_1,y1,'r^')

                n2=ax3.plot(x_2,y2,'k^')

                n3=ax4.plot(x_3,y3,'bs')'''
            '''ax2.legend(loc='best')
                ax3.legend(loc='best')
                ax4.legend(loc='best')



                    #if ax2.legend in fig2:

                plt.pause(1.5)'''
            '''plt.clf()
def compute(train, test):

  #Train data
  train_X              = [];
  train_restaurant_ids = [];
  test_X               = [];
  test_restaurant_ids  = [];
  train_Y              = [];

  #Common feature values in train/test
  train_feature_val    = {};
  test_feature_val     = {};

  build_FeatureVal(train, train_feature_val);
  build_FeatureVal(test, test_feature_val);
 
  buildFeatures(train, train_feature_val, test_feature_val, train_X, train_Y, train_restaurant_ids, "train");
  buildFeatures(test, train_feature_val, test_feature_val, test_X, None, test_restaurant_ids, "test");


  train_Y = np.array(train_Y);

  enc = OneHotEncoder(categorical_features=np.array([3,4,5,32,33,34,35,36,37,38,39,40,41,42]), sparse=False, n_values=100);

  enc.fit(test_X);

  train_X = enc.transform(train_X);
  test_X  = enc.transform(test_X);

  print("No of train features " +  str(len(train_X[0])));
  print("No of test features " +  str(len(test_X[0])));

  #Remove features that take the same value in every training sample (zero variance)
  selector = VarianceThreshold();
  selector.fit(train_X);
  train_X = selector.transform(train_X);
  test_X = selector.transform(test_X);

  print("No of train features " +  str(len(train_X[0])));
  print("No of test features " +  str(len(test_X[0])));

  
  parameters_to_try = generateParams();
  print("No of Paramters to test " + str(len(parameters_to_try)));

  #Construct parameters as a list
  models_to_try     = [ (copy.copy(train_X), copy.copy(train_Y), parameters_to_try[i] ) for i in range(0, len(parameters_to_try)) ];

  #Create a Thread pool.
  pool              = Pool(8);
  results           = pool.map( train_model_wrapper, models_to_try );

  pool.close();
  pool.join();


  best_params       = None;
  best_rmse         = sys.float_info.max;
  for i in range(0, len(results)):
    if results[i][1] < best_rmse:
        best_rmse   = results[i][1];
        best_params = results[i][0];

  print("Best Params : " + str(best_params));
  print("Best RMSE :   " + str(best_rmse));

  #estimator               = SVR(**params)
  #estimator               = RandomForestRegressor(**best_params)
  estimator                = GradientBoostingRegressor(**best_params)


  estimator.fit(train_X, train_Y);

  print("Writing Output");
  predict_and_save(estimator, test_X, test_restaurant_ids);
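The selector in compute() is fitted on the training matrix only and then applied to both matrices, which is what keeps the train and test columns aligned. A minimal, self-contained sketch of that pattern on toy arrays (names and data are illustrative, not the restaurant data used above):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

train = np.array([[1.0, 0.0, 3.2],
                  [2.0, 0.0, 1.1],
                  [3.0, 0.0, 4.7]])
test = np.array([[4.0, 0.0, 2.2],
                 [5.0, 0.0, 0.3]])

selector = VarianceThreshold()        # default threshold=0.0 drops constant columns
selector.fit(train)                   # learn variances on the training data only
train_reduced = selector.transform(train)
test_reduced = selector.transform(test)   # the same columns are dropped from the test data
print(train_reduced.shape, test_reduced.shape)   # (3, 2) (2, 2)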
Exemple #47
0
class VarianceSelector(Transformer):
    type = 9

    def __init__(self, threshold=1e-7):
        super().__init__("variance_selector")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.threshold = threshold

    def operate(self, input_datanode, target_fields=None):
        from sklearn.feature_selection import VarianceThreshold

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
            X_new = X.copy()
        else:
            X_new = X[:, target_fields]

        n_fields = len(feature_types)
        irrevalent_fields = list(range(n_fields))
        for field_id in target_fields:
            irrevalent_fields.remove(field_id)

        is_selected = [True] * len(target_fields)
        if self.model is None:
            self.model = VarianceThreshold(threshold=self.threshold)
            self.model.fit(X_new)

        for idx, var in enumerate(self.model.variances_):
            is_selected[idx] = var > self.threshold

        irrevalent_types = [feature_types[idx] for idx in irrevalent_fields]
        selected_types = [
            feature_types[idx] for idx in target_fields if is_selected[idx]
        ]
        selected_types.extend(irrevalent_types)

        _X = self.model.transform(X_new)

        if len(irrevalent_fields) > 0:
            new_X = np.hstack((_X, X[:, irrevalent_fields]))
            if input_datanode.feature_names is not None:
                feature_names = np.hstack(([
                    input_datanode.feature_names[idx]
                    for idx in irrevalent_fields
                ], [
                    input_datanode.feature_names[idx]
                    for idx in self.model.get_support(True)
                ]))
            else:
                feature_names = None
        else:
            new_X = _X
            if input_datanode.feature_names is not None:
                feature_names = [
                    input_datanode.feature_names[idx]
                    for idx in self.model.get_support(True)
                ]
            else:
                feature_names = None
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y),
                                   new_feature_types,
                                   input_datanode.task_type,
                                   feature_names=feature_names)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        output_datanode.enable_balance = input_datanode.enable_balance
        output_datanode.data_balance = input_datanode.data_balance
        self.target_fields = target_fields.copy()

        return output_datanode

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            space = {}
            return space
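Stripped of the DataNode bookkeeping, the core of VarianceSelector.operate() is: run VarianceThreshold only on the target columns, then hstack the untouched columns back on. A rough, self-contained sketch of that idea on toy data (collect_fields and DataNode belong to the surrounding project and are not used here):

import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = np.array([[0.0, 1.0, 5.0],
              [0.0, 1.1, 7.0],
              [0.0, 0.9, 9.0]])
target_fields = [0, 1]    # columns the selector is allowed to filter
other_fields = [2]        # columns passed through untouched

vt = VarianceThreshold(threshold=1e-7)
X_sel = vt.fit_transform(X[:, target_fields])    # drops the constant column 0
X_out = np.hstack((X_sel, X[:, other_fields]))   # re-attach the untouched column
print(X_out.shape)   # (3, 2)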
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)


# In[80]:


x_train.shape


# ### Removing Constant, Quasi-Constant & Duplicated features

# In[81]:


constant_filter = VarianceThreshold(threshold =0.01)
constant_filter.fit(x_train)
x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)


# In[82]:


x_train_filter.shape


# In[83]:


x_train_T = x_train_filter.T
x_test_T = x_test_filter.T
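The transposes above are the usual set-up for dropping duplicated features with pandas: once each feature is a row, duplicated() can compare whole features. A hedged sketch of how that step typically continues (assuming the x_train_filter / x_test_filter arrays produced above):

import pandas as pd

# With features as rows, duplicated() flags any feature identical to an earlier one.
x_train_T = pd.DataFrame(x_train_filter).T
x_test_T = pd.DataFrame(x_test_filter).T

duplicated_features = x_train_T.duplicated()
features_to_keep = ~duplicated_features

x_train_unique = x_train_T[features_to_keep].T
x_test_unique = x_test_T[features_to_keep].T
print(duplicated_features.sum(), "duplicated features removed")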
Exemple #49
0
).add(flights[AIRPORTS[1]].value_counts(), fill_value=0)

for airport in AIRPORTS:
    flights = join_aggregates(
        flights, airport, airport_size, airport+"_TRAFFIC")

for airport in AIRPORT_NAMES:
    airport_intl = airport+"_INTL"
    flights[airport_intl] = flights[airport].apply(is_international)

for coordinate in COORDINATES:
    flights[coordinate+"_DIF"] = flights[coordinate +
                                         '_ORIGIN'] - flights[coordinate+'_DESTINATION']

flights.drop(UNAVAILABLE + EXTRA + DUP_COLS, axis=1, inplace=True)

STR_VAR = [v for v in flights.columns if is_string_dtype(flights[v])]
flights = categorize_multiple(flights, STR_VAR)
flights.drop(STR_VAR, axis=1, inplace=True)

NULL_VAR = get_col_with_null(flights)
NUM_VAR_COMPLETE = [v for v in flights.columns if is_numeric_dtype(
    flights[v]) and v not in NULL_VAR]
selector = VarianceThreshold()
selector.fit(flights[NUM_VAR_COMPLETE])
support = selector.get_support()
NUM_VAR_REMOVED = [col for col, keep in zip(NUM_VAR_COMPLETE, support) if not keep]
flights.drop(NUM_VAR_REMOVED, axis=1, inplace=True)

flights.to_csv("./data/flights_processed.csv", index=False)
Exemple #50
0
def variance_threshold_selector(X, threshold=0.01):
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    return X[X.columns[selector.get_support(indices=True)]]
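A minimal usage sketch for this helper on a hypothetical toy DataFrame:

import pandas as pd

df = pd.DataFrame({
    "constant": [1, 1, 1, 1],            # variance 0.0 -> dropped
    "rare_flag": [0, 0, 0, 1],           # variance 0.1875 -> kept (> 0.01)
    "measurement": [3.2, 1.5, 4.8, 2.1],
})
reduced = variance_threshold_selector(df, threshold=0.01)
print(list(reduced.columns))   # ['rare_flag', 'measurement']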
Exemple #51
0
def preProcessData(trainFeatureMatrix, testFeatureMatrix):
	totalFeatureNum = 52
	singleValueIndexList = [17, 19, 20, 23]
	categoricalAttriIndexList = [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46]	
	categoricalFeatureValueNumList = [13, 112, 2, 13, 13, 112, 2, 13, 145, 4, 3031, 4, 138, 102, 102, 2090]
	cateNumericIndexList = [1, 6, 15, 16, 18,21,22,24,25,26,27,28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,49,50,51]

	numericAttriIndexList = [1, 6, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 47, 48, 49, 50, 51]
	
	# for i in range(len(trainFeatureSpace[0])):
	# 	if not i in categoricalAttriIndexList:
	# 		#print 'numerical', i, len(list(set(trainFeatureSpace[:,i])))
	# 		print '%s, numerical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))
	# 	else:
	# 		print '%s, categorical, train: %s, test:%s' % (i, len(list(set(trainFeatureMatrix[:,i]))), len(list(set(testFeatureMatrix[:,i]))))
	


	tempResultMatrix =  np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)

	# print len(trainFeatureMatrix), len(trainFeatureMatrix[0])
	# print len(testFeatureMatrix), len(testFeatureMatrix[0])
	# print len(tempResultMatrix), len(tempResultMatrix[0])

	# exit()

	# for i in range(len(trainFeatureMatrix)):
	# 	for j in range(len(trainFeatureMatrix[0])):
	# 		if j in cateNumericIndexList:
	# 			trainFeatureMatrix[i][j] = int(trainFeatureMatrix[i][j])

	# for i in range(len(testFeatureMatrix)):
	# 	for j in range(len(testFeatureMatrix[0])):
	# 		if j in cateNumericIndexList:
	# 			testFeatureMatrix[i][j] = int(testFeatureMatrix[i][j])

	#selectedFeatureList = []
	# for i in range(53):
	# 	if not i in singleValueIndexList:
	# 		selectedFeatureList.append(i)

	# trainFeatureMatrix = trainFeatureMatrix[ : , selectedFeatureList]
	# testFeatureMatrix = testFeatureMatrix[ : , selectedFeatureList]
	from sklearn.preprocessing import OneHotEncoder
	enc = OneHotEncoder(categorical_features=categoricalAttriIndexList + cateNumericIndexList)
	enc.fit(tempResultMatrix)
	trainFeatureMatrix = enc.transform(trainFeatureMatrix).toarray()
	testFeatureMatrix = enc.transform(testFeatureMatrix).toarray()

	print 'old feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])


	#tempResultMatrix =  np.concatenate((trainFeatureMatrix, testFeatureMatrix), axis=0)


	sel = VarianceThreshold()
	sel.fit(trainFeatureMatrix)
	trainFeatureMatrix = sel.transform(trainFeatureMatrix)
	testFeatureMatrix = sel.transform(testFeatureMatrix)
	print 'new feature num is ', len(trainFeatureMatrix[0]), len(testFeatureMatrix[0])
	#exit()
	return trainFeatureMatrix, testFeatureMatrix
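OneHotEncoder's categorical_features argument (used above) has been removed in recent scikit-learn releases; a hedged, self-contained sketch of the equivalent set-up with ColumnTransformer on toy matrices:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Toy matrices: column 0 is categorical, column 1 is numeric.
train = np.array([[0, 1.5], [1, 2.5], [2, 3.5]])
test = np.array([[1, 4.5], [3, 0.5]])

encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(handle_unknown="ignore"), [0])],
    remainder="passthrough",
)
train_enc = encoder.fit_transform(train)
test_enc = encoder.transform(test)       # category 3 is unseen -> encoded as all zeros
print(train_enc.shape, test_enc.shape)   # (3, 4) (2, 4)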
Exemple #52
0
def get_low_variance_columns(dframe=None,
                             columns=[],
                             skip_columns=[],
                             threshold=0.0,
                             autoremove=False):
    """Wrapper for sklearn VarianceThreshold for use on pandas dataframes."""
    print("Finding low-variance features.")
    removed_features = []
    ranking_variance_thresholds = {}
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(col) for col in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=threshold)

        # fit vt to data
        vt.fit(X)

        # threshold ranking
        ranking_variance_thresholds = dict(
            list(zip(remaining_columns, vt.variances_)))

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [
            remaining_columns[idx] for idx, _ in enumerate(remaining_columns)
            if idx in feature_indices
        ]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns, feature_names))
        print(("""Found {0} low-variance columns.
                """.format(len(removed_features))))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance "
                  "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed, columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index,
                              column=skip_columns[idx],
                              value=skipped_values[:, idx])
            print("Succesfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something "
              "went wrong.")
        return dframe, [], {}

    return dframe, removed_features, ranking_variance_thresholds
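A minimal usage sketch for this wrapper on a hypothetical toy frame, skipping a non-numeric id column:

import pandas as pd

toy = pd.DataFrame({
    "id": ["a", "b", "c"],        # non-numeric, excluded via skip_columns
    "flat": [1.0, 1.0, 1.0],      # zero variance -> reported and removed
    "signal": [0.2, 1.7, 0.9],
})
cleaned, removed, ranking = get_low_variance_columns(
    dframe=toy, skip_columns=["id"], threshold=0.0, autoremove=True)
print(removed)                 # ['flat']
print(list(cleaned.columns))   # ['id', 'signal']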
train_data['Y']=train_X_Y['Y']
train_data['X']=train_X_Y['X']

training, validation = train_test_split(train_data, train_size=.60)

features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 
'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION','NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN',
'TARAVAL', 'TENDERLOIN','X','Y']
 
features2 = [x for x in range(0,24)]
features = features + features2


print "Variance Threshold"
sel = VarianceThreshold(threshold=(0.90 * (1 - 0.90)))
selector=sel.fit(training[features])
print selector.get_support(indices=True)

for i in range(0, len(features)):
    if i in selector.get_support(indices=True):
        print(features[i])
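# With one-hot day/district indicators, the Bernoulli-style threshold
# 0.90 * (1 - 0.90) = 0.09 removes indicators that take the same value in more
# than roughly 90% of the training rows.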


print "Select from Model - Logistic"
modelLReg = LogisticRegression()
modelLReg = modelLReg.fit(training[features], training['crime'])
model = SelectFromModel(modelLReg, prefit=True)
print model.get_support(indices=True)

for i in range(0,len(features)):
    if i in model.get_support(indices=True):
#To drop the year column
retail_set = retail_set.drop('YEAR', axis=1)

#To fill the missing values with the mean of their respective columns
retail_set.fillna(retail_set.mean(), inplace=True)

#To convert the categorical values to numeric so that it can be included for the prediction
X = pd.get_dummies(retail_set,
                   columns=['CITY', 'STATE', 'FORMAT', 'REGION', 'SPECIAL'],
                   drop_first=True)

#Eliminate the columns whose variance is below 0.25, i.e. 0.5 * (1 - 0.5)
sel = VarianceThreshold(threshold=(.5 * (1 - .5)))

sel.fit(X)

X = sel.transform(X)
X = StandardScaler().fit_transform(X)

#To reduce the dimensions using principal component analyis
pca = PCA(n_components=9)
pc = pca.fit_transform(X)

#Assigning the required components to features and output labels so that it can be used in future for training and testing
X = pc
y = retail_set['RS_SALES']

#Dividing the set into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
Exemple #55
0
df = pd.read_csv(data_path('train.csv'))
df_test = pd.read_csv(data_path('test.csv'))

target = df['TARGET']
del df['TARGET']
id = df_test['ID']

from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)

df_test.drop(to_remove, axis=1, inplace=True)

variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)

# Apply the selector fitted on the training data to the test data.
df_test = variance_threshold.transform(df_test)
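# Note: fit_transform above returns a NumPy array, so the DataFrame column
# names are lost from this point on. A hedged alternative that keeps both
# frames as DataFrames with named columns (same selector, same threshold):
#
#     variance_threshold = VarianceThreshold(threshold=0.001)
#     variance_threshold.fit(df)
#     kept_columns = df.columns[variance_threshold.get_support()]
#     df, df_test = df[kept_columns], df_test[kept_columns]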

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4, scale_pos_weight=.8)
m2_xgb.fit(df, target, eval_metric='auc')

param_dist = {
    "n_estimators": [80, 100, 110, 130],
    "max_depth": [3, 4, 5],
    "scale_pos_weight": [0.8, 1, 1.2],
    "learning_rate": [0.1, 0.05, 0.02],
}

randomizedSearch = RandomizedSearchCV(m2_xgb, n_iter=20, param_distributions=param_dist, verbose=2)
randomizedSearch.fit(df, target)

best = randomizedSearch.best_estimator_
"""
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

kbestfilter = SelectKBest(chi2,k=500)

train_features = kbestfilter.fit_transform(dataset_small.get_train_features(),
                     dataset_small.get_train_labels())
test_features = kbestfilter.transform(dataset_small.get_test_features())                     


##
threshold = 0.8*(1-0.8)
sel_var = VarianceThreshold(threshold = threshold)
sel_var.fit(np.sign(dataset_small.get_train_features()))
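# np.sign binarizes the counts, so the Bernoulli threshold 0.8 * (1 - 0.8) = 0.16
# drops binarized features that take the same value in at least roughly 80% of
# the training rows.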

train_selected_features = sel_var.transform(dataset_small.get_train_features())
test_selected_features = sel_var.transform(dataset_small.get_test_features())

## train naive bayes
import sklearn.naive_bayes as naive_bayes
bnb = naive_bayes.BernoulliNB()

spam_filter = bnb.fit(np.sign(train_selected_features), 
                      dataset_small.get_train_labels())
spam_pred   = spam_filter.predict(test_selected_features)

## evaluate goodness of prediction
import sklearn.metrics
report = sklearn.metrics.classification_report(dataset_small.get_test_labels(),
Exemple #57
0

    pass

# Import train and test raw data
train_data_raw = pd.read_csv("train.csv")
test_data_raw = pd.read_csv("test.csv")

train_data = optimize_data(train_data_raw)
test_data = optimize_data(test_data_raw)

# Remove the features with low variance
from sklearn.feature_selection import VarianceThreshold
P = .8
sel = VarianceThreshold(threshold=(P * (1 - P)))
sel.fit(train_data)
kept_columns = train_data.columns[sel.variances_ > (P * (1 - P))]
train_data = train_data[kept_columns]
test_data = test_data[kept_columns]

# test_data must contain the same columns as train data due to model fitting and prediction
for column in train_data.columns:
    if not column in test_data.columns:
        test_data[column] = pd.DataFrame().apply(lambda _: '', axis=1)
test_data.fillna(0, inplace=True)
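# A hedged one-step alternative to the loop above: reindex test_data against the
# training columns, which also drops any test-only columns and fills the missing
# dummy columns with 0:
#     test_data = test_data.reindex(columns=train_data.columns, fill_value=0)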

#train_data = train_data_raw[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]]
train_data_results = train_data_raw["Survived"]

# scaling train data
train_data_scaled = scale(train_data)
# scaling test data
test_data_scaled = scale(test_data)
Exemple #58
0
def get_low_variance_columns(dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
    """
    Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
    """
    print("Finding low-variance features.")
    try:
        # get list of all the original df columns
        all_columns = dframe.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = dframe.iloc[:, skipped_idx].values

        # get dataframe values
        X = dframe.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=thresh)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [remaining_columns[idx] for idx, _ in enumerate(remaining_columns) if idx in feature_indices]

        # get the columns to be removed
        removed_features = list(np.setdiff1d(remaining_columns, feature_names))
        print("Found {0} low-variance columns.".format(len(removed_features)))

        # remove the columns
        if autoremove:
            print("Removing low-variance features.")
            # remove the low-variance columns
            X_removed = vt.transform(X)

            print("Reassembling the dataframe (with low-variance " "features removed).")
            # re-assemble the dataframe
            dframe = pd.DataFrame(data=X_removed, columns=feature_names)

            # add back the `skip_columns`
            for idx, index in enumerate(skipped_idx):
                dframe.insert(loc=index, column=skip_columns[idx], value=skipped_values[:, idx])
            print("Succesfully removed low-variance columns.")

        # do not remove columns
        else:
            print("No changes have been made to the dataframe.")

    except Exception as e:
        print(e)
        print("Could not remove low-variance features. Something " "went wrong.")
        pass

    return dframe
# -*-coding:utf-8-*-
# @auth ivan
# @time 20200611
# @goal test 054.Test_Feature_selection

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif
X = [[100, 1, 2, 3], [100, 4, 5, 6], [100, 7, 8, 9], [101, 11, 12, 13]]
selector = VarianceThreshold(1)
selector.fit(X)
print('Variances is %s' % selector.variances_)
print('After transform is \n%s' % selector.transform(X))
print('The support is %s' % selector.get_support(True))
print('The support is %s' % selector.get_support(False))
print('After reverse transform is \n%s' %
      selector.inverse_transform(selector.transform(X)))
# Variances is [ 0.1875 13.6875 13.6875 13.6875]
# After transform is
# [[ 1  2  3]
#  [ 4  5  6]
#  [ 7  8  9]
#  [11 12 13]]
# The support is [1 2 3]
# The support is [False  True  True  True]
# After reverse transform is
# [[ 0  1  2  3]
#  [ 0  4  5  6]
#  [ 0  7  8  9]
#  [ 0 11 12 13]]
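# The variance figures above can be checked directly: VarianceThreshold uses the
# population variance (ddof=0), so for the first column [100, 100, 100, 101] the
# variance is 0.1875, below the threshold of 1, and that column is dropped.
import numpy as np
print(np.var(X, axis=0, ddof=0))  # [ 0.1875 13.6875 13.6875 13.6875]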

X = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [3, 3, 3, 3, 3], [1, 1, 1, 1, 1]]
def remove_variance(features, p):
    # Bernoulli-style threshold: drop features whose variance is at or below p * (1 - p)
    t = p * (1 - p)
    sel = VarianceThreshold(threshold=t)
    return sel.fit_transform(features), sel
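A minimal usage sketch for remove_variance on a hypothetical toy matrix; with p = 0.1 only the constant first column falls below the threshold:

X_demo = [[7, 1, 10],
          [7, 2, 20],
          [7, 3, 30],
          [7, 4, 40]]
X_reduced, selector = remove_variance(X_demo, p=0.1)
print(X_reduced.shape)    # (4, 2): the constant first column was removed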