Example #1
def recursive_feature_selection(info_humans, info_bots, params, scale=False):

    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)

    print "first feature selection by variance test"
    skb = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = skb.fit_transform(X)
    features_1 = features[skb.get_support()]

    print "second feature selection by ch2 test"
    skb = SelectKBest(chi2, k=200)
    # skb = SelectFpr(chi2, alpha=0.005)
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    # skb = PCA(n_components=250)
    # X_new = skb.fit_transform(X_new, y)
    
    print "third feature selection by recursive featue elimination (RFECV)"
    clf = LogisticRegression(penalty=params['penalty'],
                             C=params['C'])
    # clf = SVC(kernel="linear")
    rfecv = RFECV(estimator=clf, step=1,
                  cv=cross_validation.StratifiedKFold(y, 5),
                  scoring='roc_auc', verbose=1)
    rfecv.fit(X_new, y)

    print("Optimal number of features : %d" % rfecv.n_features_)
    
    return skb, rfecv
Example #2
def feature_selection(train_instances):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Crossvalidation started... ')
    selector = VarianceThreshold()
    selector.fit(train_instances)
    logger.info('Number of features used... ' +
                str(Counter(selector.get_support())[True]))
    logger.info('Number of features ignored... ' +
                str(Counter(selector.get_support())[False]))
    return selector
Example #3
def pre_process_datasets(datasets, filter_method=None, threshold=(0, 0), normalize=True, use_cnv=False, use_mut=False):

    exp_train_data = datasets['exp_train_data']
    exp_board_data = datasets['exp_board_data']

    if use_cnv:
        cnv_train_data = datasets['cnv_train_data']
        cnv_board_data = datasets['cnv_board_data']


    if filter_method == 'cv':
        exp_cv = exp_train_data.std(1).values / exp_train_data.mean(1).values
        exp_train_data = exp_train_data.loc[exp_cv > threshold[0], :]
        exp_board_data = exp_board_data.loc[exp_cv > threshold[0], :]
        if use_cnv:
            cnv_train_data = cnv_train_data.apply(exp)
            cnv_cv = cnv_train_data.std(1).values / cnv_train_data.mean(1).values
            cnv_train_data = cnv_train_data.loc[cnv_cv > threshold[1], :]
            cnv_board_data = cnv_board_data.loc[cnv_cv > threshold[1], :]

    if filter_method == 'var':
        selector = VarianceThreshold(threshold[0])
        selector.fit(exp_train_data.values.T)
        exp_train_data = exp_train_data.loc[selector.get_support(), :]
        exp_board_data = exp_board_data.loc[selector.get_support(), :]
        if use_cnv:
            selector = VarianceThreshold(threshold[1])
            selector.fit(cnv_train_data.values.T)
            cnv_train_data = cnv_train_data.loc[selector.get_support(), :]
            cnv_board_data = cnv_board_data.loc[selector.get_support(), :]

    if use_cnv:
        feat_train_data = exp_train_data.append(cnv_train_data)
        feat_board_data = exp_board_data.append(cnv_board_data)
        print('features after filtering', exp_train_data.shape[0], '+', cnv_train_data.shape[0], '=', feat_train_data.shape[0])
    else:
        feat_train_data = exp_train_data
        feat_board_data = exp_board_data
        print('features after filtering', exp_train_data.shape[0])

    if use_mut:
        feat_train_data = feat_train_data.append(datasets['mut_train_data'])
        feat_board_data = feat_board_data.append(datasets['mut_board_data'])

    if normalize:
        scaler = StandardScaler().fit(feat_train_data.values.T)
        feat_train_data.values[:,:] = scaler.transform(feat_train_data.values.T).T
        feat_board_data.values[:,:] = scaler.transform(feat_board_data.values.T).T

    datasets['feat_train_data'] = feat_train_data
    datasets['feat_board_data'] = feat_board_data
Example #4
def main():
    parser = argparse.ArgumentParser(description='Normalize the feature values')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--outlist', required=True, help='File containing feature values')
    required.add_argument('-y', '--execlist', required=True, help='File containing exec list')
    
    args = parser.parse_args()

    #X = np.loadtxt(args.outlist, skiprows=1)
    np.set_printoptions(precision=2)
    X = np.genfromtxt(args.outlist, skip_header=1)
    X=np.nan_to_num(X)
    Y = np.loadtxt(args.execlist, ndmin=2)

    #f = open("trainlist","wb")
    #newResult = X/Y
    #sel = VarianceThreshold(threshold=(.8*(1-.8)))
    sel = VarianceThreshold(threshold=(.8*(1-.8)))
    result1 = sel.fit_transform(X)
    newResult = result1/Y
    #result2 = sel.fit_transform(newResult)

    #feature collection for test programs
    if os.path.isfile('eventlist'):
       features = np.genfromtxt('eventlist',dtype='str')
       featureFromVariance = sel.get_support(indices=True)
       text_file = open("variancefeatures.txt","w")
       for i in featureFromVariance:
           text_file.write(features[i])
           text_file.write("\n")
       text_file.close()

    np.savetxt('normfeaturelist', newResult, fmt='%.2f', delimiter='\t')
def remove_feat_constants(data_frame):
    # Remove feature vectors containing one unique value,
    # because such features do not have predictive value.
    print("")
    print("Deleting zero variance features...")
    # Let's get the zero variance features by fitting VarianceThreshold
    # selector to the data, but let's not transform the data with
    # the selector because it will also transform our Pandas data frame into
    # NumPy array and we would like to keep the Pandas data frame. Therefore,
    # let's delete the zero variance features manually.
    n_features_originally = data_frame.shape[1]
    selector = VarianceThreshold()
    selector.fit(data_frame)
    # Get the indices of zero variance feats
    feat_ix_keep = selector.get_support(indices=True)
    orig_feat_ix = np.arange(data_frame.columns.size)
    feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
    # Delete zero variance feats from the original pandas data frame
    data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                 axis=1)
    # Print info
    n_features_deleted = feat_ix_delete.size
    print("  - Deleted %s / %s features (~= %.1f %%)" % (
        n_features_deleted, n_features_originally,
        100.0 * (float(n_features_deleted) / n_features_originally)))
    return data_frame
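# A minimal usage sketch for remove_feat_constants (the toy DataFrame below is
# illustrative and not from the original source): column 'b' is constant and should
# be dropped, while the result stays a pandas DataFrame with its column labels.
import pandas as pd

demo_df = pd.DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0], 'c': [5.0, 6.0, 7.0]})
print(list(remove_feat_constants(demo_df).columns))  # expected: ['a', 'c']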
Example #6
    def _variance_threshold(self, input_df, threshold):
        """Uses Scikit-learn's VarianceThreshold feature selection to learn the subset of features that pass the threshold

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        threshold: float
            The variance threshold that removes features that fall under the threshold

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the features that are above the variance threshold

        """

        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        selector = VarianceThreshold(threshold=threshold)
        try:
            selector.fit(training_features)
        except ValueError:
            # None features are above the variance threshold
            return input_df[['guess', 'class', 'group']].copy()

        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
Example #7
 def varianceSelection(self, df, threashold=.8):
     if not isinstance(df, pandas.core.frame.DataFrame):
         logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                                      datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
         sys.exit(1)
     sel = VarianceThreshold(threshold=(threashold * (1 - threashold)))
     sel.fit_transform(df)
     return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def test_zero_variance():
    """Test VarianceThreshold with default setting, zero variance."""

    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [0, 1, 2, 3])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
    def variance_threshold(self, dframe=None, columns=None, skip_columns=None, thresh=0.0, autoremove=False):
        """
         Wrapper for sklearn variance threshold to for pandas dataframe
        :param dframe:
        :param columns:
        :param skip_columns:
        :param thresh:
        :param autoremove:
        :return:
        """
        logging.debug("Finding low-variance features")
        removed_features=[]
        try:
            all_columns = dframe.columns

            # remove the skip columns
            remaining_cols = all_columns.drop(skip_columns)

            # get length of new index.
            max_index = len(remaining_cols) - 1

            skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

            for idx, item in enumerate(skipped_idx):
                if item > max_index:
                    diff = item - max_index
                    skipped_idx[idx] -= diff
                if item == max_index:
                    diff = item - len(skip_columns)
                    skipped_idx[idx] -= diff
                if idx == 0:
                    skipped_idx[idx] = item

            skipped_values = dframe.iloc[:, skipped_idx].values

            X = dframe.loc[:, remaining_cols].values

            vt = VarianceThreshold(threshold=thresh)

            vt.fit(X)

            feature_indices = vt.get_support(indices=True)

            feature_names = [remaining_cols[idx] for idx, _ in enumerate(remaining_cols) if idx in feature_indices]

            removed_features = list(np.setdiff1d(remaining_cols, feature_names))

            logging.debug("Found %d low - variance columns " % len(removed_features))

        except Exception as e:
            logging.error(e)
            logging.error("Could not remove low variance features, some thing went wrong")
            print(e)
            pass

        return dframe, removed_features
Example #10
def test_variance_threshold():
        tpot_obj = TPOT()
        non_feature_columns = ['class', 'group', 'guess']
        training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
        selector = VarianceThreshold(threshold=0)
        selector.fit(training_features)
        mask = selector.get_support(True)
        mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

        assert np.array_equal(tpot_obj._variance_threshold(training_testing_data, 0), training_testing_data[mask_cols])
Example #11
def filter_features(info_humans, info_bots, k=200, scale=False):
    """
    Carry out 2-layer feature filtering
    """
    X, y, features, scaler = get_Xy(info_humans, info_bots, scale=scale)
    
    vt = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X_new = vt.fit_transform(X)
    features_1 = features[vt.get_support()]
    
    skb = SelectKBest(chi2, k=min(k, len(features_1)))
    X_new = skb.fit_transform(X_new, y)
    features_2 = features_1[skb.get_support()]

    return features_1, features_2, vt, skb
Example #12
def feat1(matrix):
	last_column = [row[len(matrix[0])-1] for row in matrix]
	data_class = transform_to_int(last_column, matrix[0][len(matrix[0])-1])
	indices = list(range(len(matrix[0])-1))
	new_list = list(map(operator.itemgetter(*indices), matrix))
	data = np.asarray(new_list)
	data = data.astype(float)
	sel = VarianceThreshold(threshold=(0.35))
	matrix_new =  sel.fit_transform(data)
	data_class = np.array([data_class])
	features_selected = np.concatenate((matrix_new,data_class.T),axis=1)
	indices_resultados = sel.get_support(indices=True)
	features = []	
	for data in indices_resultados:
		features.append(data)
	return features
def calCorrMat():
    '''
    delete variable that has variance lower than threshold. 10 here
    '''
    df = pd.read_csv('183_descs_3763.csv',header=0,index_col=None)
    
    sel = VarianceThreshold(10)
    data = sel.fit_transform(df.values)
    aMask = sel.get_support(True)
    newDf = df.iloc[:,aMask]
    print(newDf.shape)
    input()
    corrMat = newDf.corr(method='pearson')
    corrMat.to_csv('./data/corrNew.csv')
    newDf.to_csv('./data/reducedDescs.csv')  
    return corrMat
Example #14
 def removeZeroVariance(data_frame):
     n_features_originally = data_frame.shape[1]
     selector = VarianceThreshold()
     selector.fit(data_frame)
     # Get the indices of zero variance feats
     feat_ix_keep = selector.get_support(indices=True)
     orig_feat_ix = np.arange(data_frame.columns.size)
     feat_ix_delete = np.delete(orig_feat_ix, feat_ix_keep)
     # Delete zero variance feats from the original pandas data frame
     data_frame = data_frame.drop(labels=data_frame.columns[feat_ix_delete],
                                  axis=1)
     # Print info
     n_features_deleted = feat_ix_delete.size
     print("  - Deleted %s / %s features (~= %.1f %%)" % (
         n_features_deleted, n_features_originally,
         100.0 * (float(n_features_deleted) / n_features_originally)))
     return data_frame
Example #15
def bayes():
    recipeData = getRecipeData()
    sel = VarianceThreshold()
        
    ingredients = sorted(set([e for sublist in map(lambda e: e['ingredients'], recipeData) for e in sublist]))
    labels = [recipe['cuisine'] for recipe in recipeData]
    
    features = sel.fit_transform([buildFeaturesArray(ingredients, recipe) for recipe in recipeData])
    ingredients = [ingredients[i] for i in sel.get_support(True)]    
    
    clf = MultinomialNB()
    clf.fit(features, labels)
    
    testRecipes = getTestData()
    testFeatures = [buildFeaturesArray(ingredients, recipe) for recipe in testRecipes]
    predictions = clf.predict(testFeatures)
    
    outputPercentCorrect(predictions)
    copyAndOutput(predictions, testRecipes)
Example #16
    def test_variance_k_best_random_tree_k_fold(self):
        # Feature Selection
        samples, responses = open_model("models.obj")
        samples = np.array(samples)
        responses = np.array(responses)

        FeatureSelection = True

        if FeatureSelection:
            selection = VarianceThreshold(threshold=0.00)
            selection.fit(samples)
            idxs = selection.get_support(indices=True)
            samples = samples[:, idxs]

        samples = preprocessing.scale(samples)

        # Stratified cross-validation
        scv = StratifiedKFold(responses, n_folds=10)
        sum = 0
        for i, (train, test) in enumerate(scv):
            print('Case %d' % (i))
            # Modeling
            rdmForest = RandomForest_scikit()

            # Train
            init = time()
            rdmForest.train(samples[train, :], responses[train])

            # Test
            a, confusionPre = rdmForest.test(samples[test, :], responses[test], True)
            print('Time: %0.3fs' % (time() - init))

            for idx, fila in enumerate(confusionPre):
                for jdx, entrada in enumerate(fila):
                    if idx != jdx:
                        sum += entrada

        print("Wrong Cases: "+str(sum))
        print(' Full Case ')
        rdmForest = RandomForest_scikit()
        rdmForest.train(samples, responses)
        rdmForest.test(samples, responses, True)
Example #17
def main():
    # shape (#rows,18)
    train_users_raw = pd.read_csv('train_users_pruned.csv',delimiter=',',encoding='utf-8')
    test_users_raw = pd.read_csv('test_users.csv',delimiter=',',encoding='utf-8')


    del train_users_raw['id']
    user_id = test_users_raw['id']
    del test_users_raw['id']

    train_users_raw=train_users_raw.drop(train_users_raw.columns[[0]], axis=1)
    test_users_raw=test_users_raw.drop(test_users_raw.columns[[0]], axis=1)

    country_destination = train_users_raw['country_destination']
    del train_users_raw['country_destination']

    del train_users_raw['year_booked']
    del train_users_raw['month_booked']
    del train_users_raw['date_booked']
    del test_users_raw['year_booked']
    del test_users_raw['month_booked']
    del test_users_raw['date_booked']

    selector = VarianceThreshold(threshold=2.0)
    selector.fit(train_users_raw)
    selected_col_ind = selector.get_support(indices=True)
    selected_col_ind =  np.append(selected_col_ind, train_users_raw.shape[1]-1)
    #print selected_col_ind
    #print train_users_raw.columns.values
    # shape (#rows,11)
    train_users_downsized = train_users_raw.iloc[:, selected_col_ind]
    train_users_downsized['country_destination'] = country_destination
    print(train_users_downsized.columns.values)
    test_users_downsized = test_users_raw.iloc[:, selected_col_ind]
    test_users_downsized['id'] = user_id
    print(test_users_downsized.columns.values)

    train_users_downsized.to_csv('training_data_processed.csv', sep=',', encoding='utf-8')
    test_users_downsized.to_csv('testing_data_processed.csv', sep=',', encoding='utf-8')
# Filter complete null columns
cols = np.where((np.sum(df_2.isnull(), axis=0).values) == df_2.shape[0])[0]
print (cols)
filt_cols = [c for c in df_2.columns if c not in df_2.columns[cols]]
df_3 = df_2[filt_cols]
print ("df_3",df_3.shape)

#Fill na
df_4 = df_3.fillna(value=np.mean(df_3,axis=0),inplace=False,axis=0).values
print ("df_4",df_4.shape)
data=df_4

selector = VarianceThreshold(threshold=(.99 * (1 - .99)))
newdata=selector.fit_transform(data)
idxs = selector.get_support(indices=True)
print(data[:, idxs])
print("indices",idxs)
columnslist=df_2.columns.tolist()
print("lenindex",len(idxs))
for z in range(0,len(columnslist)):
    if z not in idxs:
        print(columnslist[z])
print("after",newdata.shape)
print("initial",data.shape)
print("Headers_FINAL: ", df_2.columns.values.tolist())


# In[27]:

for i in range (0,df_2['diag_3'].size):
Example #19
X_test = bash_testleg_cc.drop('labels', axis=1)
y1 = bash_testleg_cc['labels']
#print(y1)
print(type(y1))
print(type(y))
X_train_T = y.T
y_train = pd.DataFrame(X_train_T)
X_test_T = y1.T
y_test = pd.DataFrame(X_test_T)
#X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 0,stratify = y)
##constant feature removal

constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(X)

print(constant_filter.get_support().sum())

constant_list = [not temp for temp in constant_filter.get_support()]
print(constant_list)
print(X.columns[constant_list])
X_train_filter = constant_filter.transform(X)
X_test_filter = constant_filter.transform(X_test)
print(X_train_filter.shape)
print(X_test_filter.shape)
print(X.shape)

##Quasi constant feature removal
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(X_train_filter)
print(quasi_constant_filter.get_support().sum())
X_train_quasi_filter = quasi_constant_filter.transform(X_train_filter)
# If you want to remove the 2 very low variance features, what would be a
# good variance threshold?
#  A threshold of 1.0e-03 (0.001) will remove the two low variance features.

## Features with low variance

from sklearn.feature_selection import VarianceThreshold

# Create a VarianceThreshold feature selector
sel = VarianceThreshold(threshold=0.001)

# Fit the selector to normalized head_df
sel.fit(head_df / head_df.mean())

# Create a boolean mask
mask = sel.get_support()

# Apply the mask to create a reduced dataframe
reduced_df = head_df.loc[:, mask]

print("Dimensionality reduced from {} to {}.".format(head_df.shape[1],
                                                     reduced_df.shape[1]))
# Dimensionality reduced from 6 to 4
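# A hedged follow-up sketch (assuming head_df and the fitted sel from above): the
# selector's variances_ attribute shows which mean-normalized features fall below the
# 0.001 threshold and are therefore the ones being dropped.
import pandas as pd

feature_variances = pd.Series(sel.variances_, index=head_df.columns)
print(feature_variances[feature_variances < 0.001])  # the two low-variance features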

## Removing features with many missing values

school_df.isna().sum() / len(school_df)

# Create a boolean mask on whether each feature less than 50% missing values.
mask = school_df.isna().sum() / len(school_df) < 0.5
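# A plausible continuation (a sketch, not from the original source): apply the mask to
# keep only the columns of school_df with less than 50% missing values, mirroring the
# head_df reduction above.
reduced_school_df = school_df.loc[:, mask]
print("Kept {} of {} columns.".format(reduced_school_df.shape[1], school_df.shape[1]))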
Example #21
# GSE55145_exprs_B = pd.DataFrame.transpose(GSE55145_exprs_B)
# GSE9782_exprs_B = pd.DataFrame.transpose(GSE9782_exprs_B)

GDSC_exprs_z = pd.DataFrame.transpose(GDSC_exprs_z)
GSE1_exprs_z = pd.DataFrame.transpose(GSE1_exprs_z)
GSE2_exprs_z = pd.DataFrame.transpose(GSE2_exprs_z)
GSE3_exprs_z = pd.DataFrame.transpose(GSE3_exprs_z)
GSE4_exprs_z = pd.DataFrame.transpose(GSE4_exprs_z)
TCGA_exprs_z = pd.DataFrame.transpose(TCGA_exprs_z)
#

# Remove genes with low signal (i.e. below the variance threshold) from expression data

selector = VarianceThreshold(0.05)
selector.fit_transform(GDSC_exprs_z)
GDSC_exprs_z = GDSC_exprs_z[GDSC_exprs_z.columns[selector.get_support(
    indices=True)]]
ls = GSE1_exprs_z.columns.intersection(GDSC_exprs_z.columns)
ls = ls.intersection(GSE2_exprs_z.columns)
ls = ls.intersection(GSE3_exprs_z.columns)
ls = ls.intersection(GSE4_exprs_z.columns)
ls = ls.intersection(TCGA_exprs_z.columns)
GSE1_exprs_z = GSE1_exprs_z.loc[:, ls]
GSE2_exprs_z = GSE2_exprs_z.loc[:, ls]
GSE3_exprs_z = GSE3_exprs_z.loc[:, ls]
GSE4_exprs_z = GSE4_exprs_z.loc[:, ls]
TCGA_exprs_z = TCGA_exprs_z.loc[:, ls]

# Obtain selected genes
GDSC_exprs_z_genes = list(GDSC_exprs_z.columns.values)
GSE1_exprs_z_genes = list(GSE1_exprs_z.columns.values)
GSE2_exprs_z_genes = list(GSE2_exprs_z.columns.values)
def featureSelectionVarianceThreshold(data, probability = 0.8):
    dataRaw = data[:, 2:]
    sel = VarianceThreshold(threshold=(probability*(1 - probability)))
    dataNew = sel.fit_transform(dataRaw)
    fd = open('History.txt','a')
    history = 'Feature Selection: Variance Threshold' + '\n' + 'Selected Feature: ' + str(sel.get_support(True)) + '\n'
    fd.write(history)
    fd.close()
    return np.c_[data[:, :2], dataNew]
sel = VarianceThreshold(threshold=(theIndex* (1 - theIndex)))

data = np.loadtxt("train.nmv.txt")
firstData = data.copy()
tagUno = [ row[-1] for row in data]
tagUno = np.array([tagUno])
#arr = np.concatenate(  (arr , for_arr.T ), axis =1)

'''
Idea : -Save class labels which will be used
after the fit_transform thing 
cuts the poor variance labels down
'''    
data = sel.fit_transform(data)
data = np.concatenate(  (data , tagUno.T ), axis =1)
guillotine = sel.get_support()

prelimData = np.genfromtxt("prelim-nmv-noclass.txt")
prelimData = [i[:-1] for i in prelimData]    
prelimData = np.array(prelimData)

#guillotine = guillotine[:-1]
#assert len(guillotine) == len(prelimData[:-1])
guillotine_full = guillotine.copy()
guillotine = guillotine[:-1]

'''
#assigning this to prelimData verifies that we collapse the
#preliminary test set correctly
prelimData = firstData.copy()[:-1]
'''
def train_test(X_train, X_test):
    try:
        vs_constant = VarianceThreshold(threshold=0)
        # select the numerical columns only.
        numerical_x_train = X_train[X_train.select_dtypes([np.number]).columns]
        # fit the object to our data.
        vs_constant.fit(numerical_x_train)
        # get the constant colum names.
        constant_columns = [
            column for column in numerical_x_train.columns if column not in
            numerical_x_train.columns[vs_constant.get_support()]
        ]
        # detect constant categorical variables.
        constant_cat_columns = [
            column for column in X_train.columns
            if (X_train[column].dtype == "O"
                and len(X_train[column].unique()) == 1)
        ]
        all_constant_columns = constant_cat_columns + constant_columns
        X_train.drop(labels=all_constant_columns, axis=1, inplace=True)
        X_test.drop(labels=all_constant_columns, axis=1, inplace=True)
        print(X_train.shape)

        # threshold value for quasi constant.
        ####### Quasi-Constant Features
        threshold = 0.98
        # create empty list
        quasi_constant_feature = []
        # loop over all the columns
        for feature in X_train.columns:
            # calculate the ratio.
            predominant = (X_train[feature].value_counts() /
                           np.float(len(X_train))).sort_values(
                               ascending=False).values[0]
            # append the column name if it is bigger than the threshold
            if predominant >= threshold:
                quasi_constant_feature.append(feature)
        X_train.drop(labels=quasi_constant_feature, axis=1, inplace=True)
        X_test.drop(labels=quasi_constant_feature, axis=1, inplace=True)
        print(X_train.shape)
        #######Duplicated Features
        # transpose the feature matrice
        train_features_T = X_train.T
        ########  Correlation Filter Methods
        # select the duplicated features columns names
        duplicated_columns = train_features_T[
            train_features_T.duplicated()].index.values
        # drop those columns
        X_train.drop(labels=duplicated_columns, axis=1, inplace=True)
        X_test.drop(labels=duplicated_columns, axis=1, inplace=True)
        print(X_train.shape)
        correlated_features = set()
        correlation_matrix = X_train.corr()
        for i in range(len(correlation_matrix.columns)):
            for j in range(i):
                if abs(correlation_matrix.iloc[i, j]) > 0.8:
                    colname = correlation_matrix.columns[i]
                    correlated_features.add(colname)
        X_train.drop(labels=correlated_features, axis=1, inplace=True)
        X_test.drop(labels=correlated_features, axis=1, inplace=True)
        print(X_train.shape)
        return X_train, X_test
    except:
        print('successfully completed QC')
Example #25
Xs=Xs.fillna(Xs.mean())

#rescale the Xs so that PCA and other algorithms work properly
scaler = StandardScaler()
scaler.fit(Xs)
Xs_res = scaler.transform(Xs)
Xs_res=pd.DataFrame(data=Xs_res,index=Xs.index,columns=Xs.columns)

#variable selection using Minimum Variance
cov=Xs_res.cov()
correls=Xs.corr()
sel = VarianceThreshold(threshold=0.01)
filtered1_Xs=sel.fit_transform(Xs)
filtered1_Xs=pd.DataFrame(data=filtered1_Xs,index=Xs.index,columns=Xs.columns)

sel_cols1=sel.get_support(indices=True).T


#variable reduction using PCA

pca=PCA(n_components=17)
pca.fit(Xs)
evals=pca.explained_variance_                           # corresponds to the eigenvalues
var_expl=pca.explained_variance_ratio_                  # variance explained by each principal component
evecs=pca.components_.T                                 # corresponds to the eigenvectors
loadings=evecs*np.sqrt(evals)
loadings_filt=np.where(np.abs(loadings)>0.3,loadings,float('nan'))
loadings_filt=pd.DataFrame(data=loadings_filt,index=Xs.columns)


#repeat the operation, now with the n factors that explain the desired % of total variance
Example #26
        dataset[min_max_attributes])


minmaxScaling(X_train)
minmaxScaling(X_validation)
minmaxScaling(X_test)

# -------------------------------------------------------------------
# ------------------------ E: Feature Selection ---------------------
# -------------------------------------------------------------------
# Being done only on the train set to determine the features to select

# Filter method = Variance Threshold
filter = VarianceThreshold(threshold=0.2)
filter.fit_transform(X_train)
print(filter.get_support(indices=True))

# Wrapper method = SFS
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SFS(knn,
          k_features=30,
          forward=True,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv=0)
sfs = sfs.fit(X_train, Y_train)
print(sfs.k_feature_idx_)

# -------------------------------------------------------------------
# ------------------------ 5: Saving the prepared data --------------
poke_gen = pd.get_dummies(df['Generation'])
poke_gen.head()


# In[4]:

from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=.15)
vt.fit(poke_gen)


# In[5]:

pd.DataFrame({'variance': vt.variances_,
              'select_feature': vt.get_support()},
            index=poke_gen.columns).T


# In[6]:

poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head()
poke_gen_subset


# # Statistical Methods

# In[7]:

from sklearn.datasets import load_breast_cancer
Example #28
def main():
    set_option('display.width', 2000)
    pd.set_option("display.max_rows", 500, "display.max_columns", 2000)
    set_option('precision', 3)
    pd.options.mode.chained_assignment = None

    input_file = './raw data_edit/data_fd.csv'
    data_input_ori = pd.read_csv(input_file)
    data_input_ori = data_input_ori.drop(columns=['Subject'])

    # Create correlation heatmap
    cols = data_input_ori.keys()
    cols_edit = cols[1:]
    corr = data_input_ori[cols_edit].corr()
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True
    thd = 0.35
    corr_np = corr.values
    corr_edit = np.zeros_like(corr)
    corr_pairs = []
    for i in range(corr_np.shape[0]):
        for j in range(corr_np.shape[1]):
            if j > i:
                if corr_np[i, j] >= thd or corr_np[i, j] <= -thd:
                    corr_edit[i, j] = corr_np[i, j]
                    tmp1 = cols_edit[i]
                    tmp2 = cols_edit[j]
                    tmp = [tmp1, tmp2, corr_np[i, j]]
                    corr_pairs.append(tmp)
    print('Feature pairs with a high correlation (|cc| >=0.35):', corr_pairs)

    plt.figure(1)
    sns.heatmap(corr,
                annot=False,
                vmin=-1,
                vmax=1,
                xticklabels=1,
                yticklabels=1,
                mask=mask,
                cmap='seismic')

    data_input_ori = data_input_ori.drop(
        columns=['INDEPEND', 'TOBAC100', 'TOBAC30'])
    keys = data_input_ori.keys()

    # Remove the features with low variance
    var = 0.10
    sel = VarianceThreshold(threshold=var)
    data_edit = sel.fit_transform(data_input_ori)
    indices = sel.get_support(indices=True)
    keys0 = []  # The features with low variance
    for i in range(len(keys)):
        if i not in indices:
            keys0.append(keys[i])
    print('The features removed due to low variance', keys0)

    keys1 = keys[indices]
    # Exclude the 'sumbox'
    keys1 = keys1[2:]

    features = data_edit[:, 2:]
    label = data_edit[:, 1]

    N1 = 24
    fv, pv = f_regression(features, label)

    indices3 = np.argsort(pv)
    print('keys3 without removing feature', keys1[indices3])
    indices3c = indices3[0:N1]
    print('Features removed due to the high p-value', keys1[indices3[N1:]])
    features_new = features[:, indices3c]
    x_axis = np.linspace(1, len(pv), len(pv))
    pv = sorted(pv)

    # Merge the data together
    label_norm = np.reshape(label, (len(label), 1))
    # Make sure the dimension is the same for the data sets
    data_edit = np.concatenate((features_new, label_norm), axis=1)
    np.save('./raw data_edit/data_ml', data_edit)

    font = {'size': 16}
    plt.rc('font', **font)
    plt.figure(2)
    plt.scatter(x_axis, pv, color='black')
    plt.xlabel("Feature")
    plt.ylabel("P-value")
    plt.show()
data.shape

[col for col in data.columns if data[col].isnull().sum() > 0]

x_train, x_test, y_train, y_test = train_test_split(data.drop(
    labels=["TARGET"], axis=1),
                                                    data["TARGET"],
                                                    test_size=0.3,
                                                    random_state=0)
x_train.shape
x_test.shape

#variance Threshold
sel = VarianceThreshold(threshold=0)
sel.fit(x_train)
sum(sel.get_support())
#another way
len(x_train.columns[sel.get_support()])

print(
    len([
        x for x in x_train.columns
        if x not in x_train.columns[sel.get_support()]
    ]))

[x for x in x_train.columns if x not in x_train.columns[sel.get_support()]]

x_train['ind_var2_0'].unique()

x_train = sel.transform(x_train)
x_test = sel.transform(x_test)
Example #30
def varianceSelection(X, THRESHOLD=10):
    from sklearn.feature_selection import VarianceThreshold
    sel = VarianceThreshold(threshold=THRESHOLD)
    sel.fit_transform(X)
    return X[[c for (s, c) in zip(sel.get_support(), X.columns.values) if s]]
from sklearn.feature_selection import VarianceThreshold
import dataframe

debug = True

# x, y = dataframe.get_dataset_from_file('corrected')
#
# print 'Dataset contains %d instances with %d initial features.' % (len(y), len(x[0]))
#
threshold = 0
threshold_increment = 0.01


def get_transformed_matrix_with_threshold(x, y, threshold):
    sel = VarianceThreshold(threshold)
    return sel.fit_transform(x, y)

if not debug:
    while threshold <= 1.0:
        x, y = dataframe.df_data, dataframe.df_target

        selector = VarianceThreshold(threshold)
        result = selector.fit_transform(x, y)

        print('Threshold = %f, features remaining after fit_transform: %d' % (threshold, len(result[0])))

        threshold += threshold_increment

        print(selector.get_support(indices=True))
Example #32
X_test.drop(labels=constant_features, axis=1, inplace=True)
 
X_train.shape, X_test.shape


# # remove quasi-constant features

# In[7]:


sel = VarianceThreshold(
    threshold=0.01)  # 0.01 indicates that ~99% of the observations share the same value
 
sel.fit(X_train)  # fit finds the features with low variance
 
sum(sel.get_support()) # how many not quasi-constant?


# In[8]:


features_to_keep = X_train.columns[sel.get_support()]


# In[9]:


X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
 
X_train.shape, X_test.shape
            Y.append(1 if d==0 else 0)
    return normalize(np.array(X),norm='l2'),Y

def sample_data(X, Y, value=0):
    XX=[]
    for i in range(len(Y)):
        if Y[i]==value:
            XX.append(X[i])
    return XX

out=open(sys.argv[1],"r")
model=svm.OneClassSVM(kernel='rbf')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sample_data(sel.fit_transform(X),Y, 1))
warning("useful features dim: "+str(len(sel.get_support(True))))
if hasattr(model,'score'):
    warning("accuracy on training set: "+str(model.score(sel.transform(X), Y)))
    if len(sys.argv)>2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: "+str(model.score(sel.transform(X), Y)))

    if len(sys.argv)>3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: "+str(model.score(sel.transform(X), Y)))

if len(sys.argv)>4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X)).tolist()
    Z = (Z-ref.mean())/ref.std()
Example #34
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.01)
sel.fit(x_train)


# In[39]:


### if we sum over get_support, we get the number of features that are not constant


# In[178]:


sum(sel.get_support())


# In[179]:


x_train = sel.transform(x_train)
test = sel.transform(test)


# In[180]:


test.shape

from sklearn.feature_selection import VarianceThreshold
from numpy import genfromtxt, savetxt

dataset = genfromtxt(open('/Users/larryhan/Dropbox/SML Project2/code/Part B/data_4/classification.csv','r'), delimiter=',', dtype='f3')[1:]
title = genfromtxt(open('/Users/larryhan/Dropbox/SML Project2/code/Part B/data_4/classification.csv','r'), delimiter=',',dtype="S5")[0]

target = [x[418] for x in dataset]
train  = [x[0:418] for x in dataset]

sel = VarianceThreshold(0.9*(1-0.9))
sel.fit_transform(train)


support = sel.get_support()

# for i in range(len(support)):
# 	if support[i]:
# 		print(title[i])


sub_title = []
for i in range(len(support)):
	if support[i]:
		sub_title.append(title[i])

Example #36
def main():

    df = joblib.load('modelDataset.pkl')

    # Split dataframe into features and target
    y = df.iloc[:, 1]  # .as_matrix()
    X = df.iloc[:, 2:]  # .as_matrix()
    id = df.iloc[:, 0]

    # Scalings
    sc = StandardScaler()

    # Apply scaler
    colNames = X.columns
    X = sc.fit_transform(X)
    X = pd.DataFrame(X, columns=colNames)

    # Remove features with less than 20% variance
    colNames = X.columns
    sel = VarianceThreshold(threshold=0.16)
    X = sel.fit_transform(X)
    # Get column names back
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform univariate feature selection (ANOVA F-values)
    colNames = X.columns
    selection_Percent = SelectPercentile(percentile=5)
    X = selection_Percent.fit_transform(X, y)
    # Get column names back
    newCols = []
    for remain, col in zip(selection_Percent.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Perform tree-based feature selection
    clf = ExtraTreesRegressor()
    clf = clf.fit(X, y)
    colNames = X.columns
    sel = SelectFromModel(clf, prefit=True)
    X = sel.transform(X)
    newCols = []
    for remain, col in zip(sel.get_support(), colNames):
        if remain == True:
            newCols.append(col)
    X = pd.DataFrame(X, columns=newCols)

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=1555)

    def testRegressor(clf):
        '''
        #RF grid
        param_grid = [{'n_estimators': range(320, 350, 10),
                       'min_samples_split': range(2, 20, 2),
                       'min_samples_leaf': range(2, 20, 2),
                       'max_leaf_nodes': range(140, 170, 5)
                       }]
        grid = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)'''
        '''
        #XGB tuning - concept, not in use
        param_grid = [{'max_depth': range(2, 4, 1),
                       'min_child_weight': range(3, 6, 1),
                       'n_estimators': range(80, 110, 10),
                       'learning_rate': [0.1],
                       'gamma': [0],
                       'subsample': [0.9, 1],
                       'colsample_bytree': [0.7],
                       'reg_alpha': [15, 50, 100, 150, 200],
                       'reg_lambda': [15, 20, 25, 30, 40, 50]}]
        fit_params = {"early_stopping_rounds": 8,
                      "eval_metric": "mae",
                      "eval_set": [[X_test, y_test]],
                      "verbose": False}
        grid = GridSearchCV(clf, param_grid, fit_params=fit_params,
                            cv=3, verbose=1, n_jobs=-1)
        fitted_classifier = grid.fit(X_train, y_train)
        print(grid.best_score_, grid.best_params_)
        predictions = fitted_classifier.predict(X_train)
        '''

        fitted = clf.fit(X_train, y_train)
        scoresCV = cross_val_score(clf,
                                   X_train,
                                   y_train,
                                   cv=3,
                                   verbose=0,
                                   n_jobs=-1)
        trainPredictionsCV = cross_val_predict(clf,
                                               X_train,
                                               y_train,
                                               cv=3,
                                               verbose=0,
                                               n_jobs=-1)

        trainPredictions = clf.predict(X_train)
        testPredictions = clf.predict(X_test)

        score1 = metrics.explained_variance_score(y_test.values,
                                                  testPredictions)
        score2 = metrics.mean_absolute_error(y_test.values, testPredictions)
        score3 = metrics.mean_squared_error(y_test.values, testPredictions)
        score4 = metrics.r2_score(y_test.values, testPredictions)
        print('Train score: ',
              metrics.mean_absolute_error(y_train.values, trainPredictions))
        print('CV score: ', scoresCV)
        print('Explained Variance Score, MAE, MSE, R^2')
        print(score1, score2, score3, score4)

        tempIndex = range(0, len(y_test.values), 1)
        plt.scatter(tempIndex, y_test.values, color='black', s=20, alpha=0.8)
        plt.scatter(tempIndex, testPredictions, color='red', s=20, alpha=0.4)
        plt.show()
        #Results appear to be highly interesting
        #MSE (and thus penalising large errors more) suggests that the model does not deal well with
        #particular categories of retweets where there is a significant difference between true value and predicted
        #Data appears to have high bias in terms of selection, as if tweets were selected from specific pools
        #based on retweet value
        #While the random forest deals well with those particular types of tweets, more analysis is needed
        # Further steps would start by understanding the sampling procedure that produced these tweets
        # From there, features need to be relooked at, dimensionality reduction (such as PCA) might be needed
        # Simpler / more powerful models to then be appropriately applied
        #The target retweets actually seem to be created from a Decision Tree Model
        print('x')

    lr = LinearRegression()
    dt = DecisionTreeRegressor()
    rf = RandomForestRegressor()
    gb = xgboost.XGBRegressor()

    #print('LR')
    #testRegressor(lr)
    #print('DT')
    #testRegressor(dt)
    print('RF')
    testRegressor(rf)
Example #37
class SemiSupervisedFeatureSelection(FeatureSelection):
    def __init__(self, conf):
        FeatureSelection.__init__(self, conf)

    def setBestParameters(self, instances):
        return

    def getFittingInstances(self, instances):
        return instances.getLabeledInstances()

    # Remove instances whose family is too rare (num_instances < k = 3)
    def generateInputLabels(self, instances):
        if self.conf.families_supervision:
            families_count = instances.getFamiliesCount()
            drop_ids = []
            for family, count in families_count.items():
                if count < 3:
                    drop_ids += instances.getFamilyIds(family)
            selected_ids = [i for i in instances.getIds() if i not in drop_ids]
            selected_instances = instances.getInstancesFromIds(selected_ids)
            labels = selected_instances.families
        else:
            selected_instances = instances
            labels = selected_instances.labels
        ## String labels are transformed into integer labels (0 -> num_labels-1).
        ## This format is required by the metric-learn library.
        labels_values = list(set(labels))
        if len(labels_values) < 2:
            raise FewerThanTwoLabels()
        labels = np.array([labels_values.index(x) for x in labels])
        return labels, selected_instances

    def generateInputParameters(self, instances):
        fitting_instances = self.getFittingInstances(instances)
        labels, fitting_instances = self.generateInputLabels(fitting_instances)
        features = self.featuresPreprocessing(fitting_instances)
        return features, labels

    def fit(self, instances):
        features, labels = self.generateInputParameters(instances)
        self.setBestParameters(instances)
        self.createPipeline()
        self.pipeline.fit(features, labels)
        self.setProjectionMatrix()

    def createPipeline(self):
        # Remove features with null variance
        self.var_filter = VarianceThreshold()
        self.pipeline = Pipeline([('var_filter', self.var_filter),
                                  ('projection', self.projection)])

    def getSelectedFeatures(self, features_names):
        non_constant_features = np.array(features_names)[
            self.var_filter.get_support()]
        selected_features = list(
            non_constant_features[self.projection.get_support()])
        return selected_features

    ## The name of the selected features.
    def componentLabels(self, features_names):
        return self.getSelectedFeatures(features_names)
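# A hedged standalone sketch of the two-stage name mapping used in getSelectedFeatures
# above: drop constant features first, fit a second selector on what remains, then map
# the surviving columns back to the original feature names. SelectKBest stands in for
# self.projection here, assuming only that the projection exposes get_support().
import numpy as np
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_classif

rng = np.random.RandomState(0)
X_demo = rng.rand(20, 5)
X_demo[:, 2] = 1.0                      # constant feature, removed by the variance filter
y_demo = rng.randint(0, 2, size=20)
names = np.array(['f0', 'f1', 'f2', 'f3', 'f4'])

var_filter = VarianceThreshold()
projection = SelectKBest(f_classif, k=2)
X_non_constant = var_filter.fit_transform(X_demo)
projection.fit(X_non_constant, y_demo)

non_constant_names = names[var_filter.get_support()]
selected_names = list(non_constant_names[projection.get_support()])
print(selected_names)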
Example #38
# feature selection
# =============================================================================

col = ['formation_energy_ev_natom', 'bandgap_energy_ev']
X = train1.drop(['id'] + col, axis=1)
T = test1.drop(['id'] + col, axis=1)
y = np.log(train1[col] + 1)
plt.hist(y[col[0]])
plt.hist(y[col[1]], color='r')

selector = VarianceThreshold(threshold=0)
selector.fit(X)  # Fit to train without id and target variables

f = np.vectorize(lambda x: not x)  # Function to toggle boolean array elements

v = X.columns[f(selector.get_support())]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))
selected_feat = X.columns.drop(v)

#update
X = X[selected_feat]
T = T[selected_feat]

# RFE Recursive feature elimination
rf = RandomForestRegressor(n_estimators=500, random_state=seed)
selector = RFECV(rf, cv=3, step=5)

y = np.log(train1[col[0]] + 1)  # formation ev
selector = selector.fit(X, y)
selector.support_
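# A small hedged follow-up: map the RFECV support mask back to the column names to see
# which features were kept for the formation-energy target.
rfecv_selected = X.columns[selector.support_]
print('{} features selected by RFECV'.format(len(rfecv_selected)))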
    'XRP' : ['XRP', 'Ripple'],
    'ZEC' : ['ZEC', 'ZCash'],
    'ZRX' : ['ZRX', '0x']
}
if __name__ == '__main__':
    index = {}
    df = pd.read_csv('data/google_trends/trends_all_20101112000000_20190101000000.csv', sep=',', encoding='utf-8', index_col='date', parse_dates=True)
    for sym, columns in COLUMNS.items():
        _df = df.loc[:, columns]
        _df.columns = ['gtrends_{}_{}'.format(sym, c.lower()) for c in _df.columns]
        _df = _df.drop_duplicates().resample('D').mean().fillna(method='ffill')

        sel = VarianceThreshold()
        sel.fit(_df.values)

        sel_columns = [c for c, s in zip(_df.columns, sel.get_support()) if s]
        _df = _df.loc[:, sel_columns]
        print("{}: {} Features, {} Selected".format(sym, len(columns), len(sel_columns)))


        os.makedirs('data/preprocessed/google_trends/csv/', exist_ok=True)
        os.makedirs('data/preprocessed/google_trends/excel/', exist_ok=True)
        csv_path = 'data/preprocessed/google_trends/csv/{}.csv'.format(sym.lower())
        xls_path = 'data/preprocessed/google_trends/excel/{}.xlsx'.format(sym.lower())
        _df.to_csv(csv_path, sep=',', encoding='utf-8', index=True, index_label='Date')
        _df.to_excel(xls_path, index=True, index_label='Date')
        index[sym] = {'csv':csv_path, 'xls':xls_path}
        print('Saved {} in data/preprocessed/google_trends/'.format(sym))

    with open('data/preprocessed/google_trends/index.json', 'w') as f:
        json.dump(index, f, sort_keys=True, indent=4)

def sample_data(X, Y, value=0):
    XX = []
    for i in range(len(Y)):
        if Y[i] == value:
            XX.append(X[i])
    return XX


out = open(sys.argv[1], "r")
model = svm.OneClassSVM(kernel='rbf')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sample_data(sel.fit_transform(X), Y, 1))
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " +
            str(model.score(sel.transform(X), Y)))
    if len(sys.argv) > 2:
        X, Y = read_fea(sys.argv[2])
        warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))

    if len(sys.argv) > 3:
        X, Y = read_fea(sys.argv[3])
        warning("accuracy on dev set: " +
                str(model.score(sel.transform(X), Y)))

if len(sys.argv) > 4:
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
GDSCM = pd.read_csv("GDSC_mutations.Erlotinib.tsv",
                    sep="\t",
                    index_col=0,
                    decimal=",")
GDSCM = pd.DataFrame.transpose(GDSCM)

GDSCC = pd.read_csv("GDSC_CNA.Erlotinib.tsv",
                    sep="\t",
                    index_col=0,
                    decimal=",")
GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)

selector = VarianceThreshold(0.05)
selector.fit_transform(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

ls = GDSCE.columns.intersection(GDSCM.columns)
ls = ls.intersection(GDSCC.columns)
ls = ls.intersection(PDXE.columns)
ls = ls.intersection(PDXM.columns)
ls = ls.intersection(PDXC.columns)
ls2 = GDSCE.index.intersection(GDSCM.index)
ls2 = ls2.intersection(GDSCC.index)
ls3 = PDXE.index.intersection(PDXM.index)
ls3 = ls3.intersection(PDXC.index)
ls = pd.unique(ls)

PDXE = PDXE.loc[ls3, ls]
PDXM = PDXM.loc[ls3, ls]
PDXC = PDXC.loc[ls3, ls]
#2) remove some of the recordings and repeat this a few times (manual k-folding): if the same features get removed
#each time, then we know those features really are unhelpful (see the stability sketch after the variance-threshold step below)


xtrain_aud = sio.loadmat('xtrain_all_aud.mat')
xtrain_aud = xtrain_aud['xtrain']
ytrain_aud = sio.loadmat('ytrain_all_aud.mat')
ytrain_aud = ytrain_aud['ytrain']

# method 1: variance threshold

Var_selector = VarThresh(.5)
# without any parameters passed, VarThresh (VarianceThreshold) defaults to removing only features whose values are all exactly the same;
#  starting here with .5
Var_selector.fit(xtrain_aud)
which_feats = Var_selector.get_support()
x_aud_fitted = Var_selector.transform(xtrain_aud)

print(x_aud_fitted.shape)
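# A hedged sketch of the manual k-fold stability idea described at the top of this
# snippet (np and VarThresh, i.e. VarianceThreshold, are assumed imported as above):
# refit the selector on several random subsets of the recordings and intersect the
# support masks; features that survive every split are the consistently kept ones.
from sklearn.model_selection import KFold

stable_mask = np.ones(xtrain_aud.shape[1], dtype=bool)
for train_idx, _ in KFold(n_splits=5, shuffle=True, random_state=0).split(xtrain_aud):
    fold_selector = VarThresh(.5)
    fold_selector.fit(xtrain_aud[train_idx])
    stable_mask &= fold_selector.get_support()
print(stable_mask.sum(), 'features kept in every fold')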


xtrunclength = sio.loadmat('xtrunclength.mat')
xtrunclength = xtrunclength['xtrunclength']

xtesting = sio.loadmat('xtesting.mat')
xtesting = xtesting['xtesting']

xtesting = xtesting[~np.isnan(xtesting).any(axis=1),:]
xtesting = xtesting[~np.isinf(xtesting).any(axis=1),:]

from CurrentThingsNeededtoRun import FinalClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

df = pd.read_csv("train_label.csv")
constant_filter = VarianceThreshold(threshold=0.05)
constant_filter.fit(df)
len(df.columns[constant_filter.get_support()])


constant_columns = [column for column in df.columns
                    if column not in df.columns[constant_filter.get_support()]]

df.drop(labels=constant_columns, axis=1, inplace=True)
print(constant_columns)
df.to_csv("train_label_p.csv",index=False)
Example #44
df = file[columnList]
df.dropna(axis=1, how='all', thresh=40051, inplace=True)

#drop columns which only contain 1 unique values. (variance = 0)
df = df[[col for col in df if not df[col].nunique() == 1]]

df.to_csv('./cleanData.csv', index=False)

#drop columns with low variance
import pandas
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=1)
sel.fit_transform(data)

m = 0
for i in sel.get_support(True):
    print(data.columns[i])
    m += 1
print(m)

#use selectFromModel
data = pandas.read_csv(
    '/Users/sherry/Desktop/python/energyProject/cleanData.csv', quotechar="'")
data = data.astype('float64', inplace=True)
data.dropna(inplace=True)

feature = data[[col for col in data.columns if col != "GASAMT"]]
lrModel = LinearRegression()
selectFromModel = SelectFromModel(lrModel)
selectFromModel.fit_transform(feature, data['GASAMT'])
Example #45
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)

print(newdf_test['label'].value_counts())

X_Probe = newdf.drop('label', 1)
Y_Probe = newdf.label
X_Probe_test = newdf_test.drop('label', 1)
Y_Probe_test = newdf_test.label

colNames = list(X_Probe)

from sklearn.feature_selection import VarianceThreshold
variance_threshold = VarianceThreshold()
variance_threshold.fit(X_Probe)
true = variance_threshold.get_support()
varcolindex_Probe = [i for i, x in enumerate(true) if x]
varcolname_Probe = list(colNames[i] for i in varcolindex_Probe)
print('Features selected :', varcolname_Probe)

features = newdf[varcolname_Probe].astype(float)
features1 = newdf_test[varcolname_Probe].astype(float)
lab = newdf['label']
lab1 = newdf_test['label']

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
t0 = time()
clf.fit(features, lab)
tt = time() - t0
print("Classifier trained in {} seconds.".format(round(tt, 3)))
Example #46
def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]
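# A minimal usage sketch (the toy DataFrame is illustrative, not from the original
# source): the zero-variance column 'const' is removed while the result keeps its
# pandas column labels.
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3, 4], 'const': [7, 7, 7, 7], 'b': [0.0, 1.5, 0.5, 2.0]})
print(variance_threshold_selector(toy, threshold=0.0).columns.tolist())  # ['a', 'b']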
    for i in normalize.drop(['oferta_id', 'target', 'CONTROL'],
                            axis=1).columns.values.tolist():
        normalize[i] = normalize[i].map(float)
        normalize[i] = StandardScaler().fit_transform(
            normalize[i].values.reshape(-1, 1))

    normal = normalize[normalize['CONTROL'] == 0]
    anormal = normalize[normalize['CONTROL'] == 1]

    del normal['CONTROL']
    del anormal['CONTROL']

    # VARIANCE REDUCTION
    selection = VarianceThreshold(threshold=0.0)
    selection.fit(normal.drop(['oferta_id', 'target'], axis=1))
    features = selection.get_support(indices=True)
    features = list(normal.columns[features]) + ['oferta_id', 'target']

    normal = normal[features]
    test_anormal = anormal[features]

    train, valid, _, _ = train_test_split(normal,
                                          normal,
                                          test_size=0.30,
                                          random_state=42)
    valid, test_normal, _, _ = train_test_split(valid,
                                                valid,
                                                test_size=len(anormal.index),
                                                random_state=42)
    valid = valid.drop(['oferta_id', 'target'], axis=1)
Example #48
# ## Feature selection

# ### Removing features with low or zero variance

# Personally, I prefer to let the classifier algorithm choose which features to keep. But there is one thing that we can do ourselves: removing features with no or very low variance. Sklearn has a handy method to do that: **VarianceThreshold**. By default it removes features with zero variance. This will not be applicable for this competition, as we saw there are no zero-variance variables in the previous steps. But if we were to remove features with less than 1% variance, we would remove 31 variables.

# In[ ]:

selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(
    ['id', 'target'], axis=1))  # Fit to train without id and target variables

f = np.vectorize(lambda x: not x)  # Function to toggle boolean array elements

v = train.drop(['id', 'target'], axis=1).columns[f(selector.get_support())]
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

# We would lose rather a lot of variables if we selected based on variance. But because we do not have so many variables, we'll let the classifier choose. For data sets with many more variables this could reduce the processing time.
#
# Sklearn also comes with other [feature selection methods](http://scikit-learn.org/stable/modules/feature_selection.html). One of these methods is *SelectFromModel* in which you let another classifier select the best features and continue with these. Below I'll show you how to do that with a Random Forest.

# ### Selecting features with a Random Forest and SelectFromModel
# Here we'll base feature selection on the feature importances of a random forest. With Sklearn's SelectFromModel you can then specify how many variables you want to keep. You can set a threshold on the level of feature importance manually. But we'll simply select the top 50% best variables.
#
# > The code in the cell below is borrowed from the [GitHub repo of Sebastian Raschka](https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch04/ch04.ipynb). This repo contains code samples of his book *Python Machine Learning*, which is an absolute must to read.

# In[ ]:

X_train = train.drop(['id', 'target'], axis=1)
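# The original example is truncated here; what follows is a hedged sketch of the
# approach described above (assuming train with a binary 'target' column): fit a
# random forest, then keep the top 50% of features by importance with SelectFromModel,
# using the median importance as the threshold.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

y_train = train['target']
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
sfm = SelectFromModel(rf, threshold='median', prefit=True)
X_train_selected = sfm.transform(X_train)
print('Kept {} of {} features.'.format(X_train_selected.shape[1], X_train.shape[1]))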
Example #49
        total = total + float(tp+tn)/(tp+tn+fp+fn)*100
    return total/len(labels)

# train_text,train_classfi_number,train_classfi,train_feature_name = getTargetData("Breast_train.data")
# test_text,test_classfi_number,test_classfi,test_feature_name = getTargetData("Breast_test.data")

# for i in range(len(train_text)):
#         for j in range(len(train_text[0])):
#             train_text[i][j] = float(train_text[i][j])
#             print type(train_text[i][j] )

# selector = VarianceThreshold()
# data = selector.fit_transform(train_text)
# index = selector.get_support(True)

# train = data
# test = []
# df = pd.DataFrame(test_text)
# for line in index:
# 	test.append(df[line])

# Toy example: columns 0 and 3 are constant, so VarianceThreshold drops them
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)
print selector.get_support()  # boolean mask: columns 0 and 3 are dropped
# clf = DecisionTreeClassifier(max_depth=4)
# clf = SVC(kernel='rbf', probability=True)
# clf.fit(data, train_classfi)
# result = clf.predict(test_text)

Beispiel #50
0
    def variance(self, X, threshold):
        # Bernoulli-style threshold p * (1 - p); keep only the surviving DataFrame columns
        sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
        sel.fit(X)
        X = X[X.columns[sel.get_support(indices=True)]]
        return X
Beispiel #51
0
    def preprocess(self):
        print 'Preprocess...'
        print 'Start: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')        
        data = self.data.copy()
        label = self.label.copy()
        
        m = data.shape[0]
        print data['MarriageStatus'].dtype
        
        #fillna
        for i in data.columns:
            if i!='AppId' and i!='InstallmentStartedOn':
                if data[i].hasnans:
                    t0=pd.DataFrame(np.ones((data.shape[0],1),dtype=np.int),columns=[i+'_Ex'],index=data.index)
                    ind0=data[data[i].isnull()].index
                    t0.ix[ind0]=0
                    data[i+'_Ex']=t0
                    
                    if data[i].dtype==np.object:
                        if data[i].value_counts().sort_values().shape[0]>0:
                            data[i].fillna(data[i].value_counts().sort_values().index[-1],inplace=True,downcast='infer')
                        else:
                            data[i].fillna('0',inplace=True,downcast='infer')
                    else:
                        if np.isnan(data[i].mean())==False:
                            data[i].fillna(data[i].mean(),inplace=True,downcast='infer')
                        else:
                            data[i].fillna(0,inplace=True,downcast='infer')
                            
        train,train_label,test,test_label=self.split(data,label)

        self.raw_train=train.copy()
        self.raw_train_label=train_label.copy()
        self.raw_test=test.copy()
        self.raw_test_label=test_label.copy()
        
        #delete AppId and InstallmentStartedOn
        data.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        train.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        test.drop(['AppId','InstallmentStartedOn'],axis=1,inplace=True)
        
        data.reset_index(inplace=True,drop=True)
        train.reset_index(inplace=True,drop=True)
        test.reset_index(inplace=True,drop=True)
        
        #preprocess 
        enc0=LabelEncoder()
        enc1 = OneHotEncoder()
        scaler = MinMaxScaler()
        
        for i in train.columns:
            if train[i].dtype==np.object:
                t0=enc0.fit_transform(train[i].values.reshape(-1,1))
                t1=enc1.fit_transform(t0.reshape(-1,1)).toarray()
                tf=pd.DataFrame(t1,index=train.index)
                tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True)
                train.drop(i,inplace=True,axis=1)
                train=train.join(tf,how='inner')

                clas = enc0.classes_
                if test[i][~test[i].isin(clas)].size != 0:
                    ind = test[i][~test[i].isin(clas)].index
                    test[i].iloc[ind] = clas[0]
                    
                t0=enc0.transform(test[i].values.reshape(-1,1))
                t1=enc1.transform(t0.reshape(-1,1)).toarray()
                tf=pd.DataFrame(t1,index=test.index)
                tf.rename(columns=lambda x: i+'_'+str(x)+'_E', inplace=True)
                test.drop(i,inplace=True,axis=1)
                test=test.join(tf,how='inner')              
            else:
                tt0=train[i].values.reshape(-1,1)
                tt0_s=scaler.fit_transform(tt0)
                train[i+'_S']=tt0_s
                train.drop(i,inplace=True,axis=1)               
               
                tt2=test[i].values.reshape(-1,1)
                tt2_s=scaler.transform(tt2)      
                test[i+'_S']=tt2_s
                test.drop(i,inplace=True,axis=1)
        
        #feature selection
        sel = VarianceThreshold(threshold=0.0002)
        train_new=sel.fit_transform(train)
        sup=sel.get_support()
        features=train.columns.tolist()
        for i in xrange(train.shape[1]):
            if sup[i]==False:
                features.remove(train.columns[i])
        
        train=pd.DataFrame(train_new,columns=features)
        
        test_new=sel.transform(test)
        test=pd.DataFrame(test_new,columns=features)
        
        self.train=train.copy()
        self.train_label=train_label.copy()
        self.test=test.copy()
        self.test_label=test_label.copy()
        
        print 'End: '+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')        
        return train,train_label,test,test_label
        combo_ctr=0
        feat_arr=[0 for col in range(feat_cnt)]                                         #Initialize feature array
        for idx in range(feat_cnt):
            roll_idx=idx
            feat_space_search(feat_arr, idx)                                           #Recurse
            feat_arr=[0 for col in range(feat_cnt)]                                     #Reset feature array after each iteration
        
        print('# of Feature Combos Tested:', combo_ctr)
        print(best_score, sel_idx, len(data_np[0]))
        print("Wrapper Feat Sel Runtime:", time.time()-start_ts)

    if fs_type ==5:
      print("L2 Regularization")
      sel = SelectFromModel(LogisticRegression(penalty = 'l2', C = 1.0, solver = 'liblinear'))
      sel.fit(data_np, target_np)
      sel_idx = sel.get_support()


    ##2) Get lists of selected and non-selected features (names and indexes) #######
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data_np[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            temp.append(header[i+feat_start])
            temp_idx.append(i)
        else:                                                                       #Indexes of non-selected features get added to delete array
            temp_del.append(i)
    print('Selected', temp)
    print('Features (total/selected):', len(data_np[0]), len(temp))
    print('\n')
                            df_gram,
                            df_edit,
                            df_w2v,
                            df_tdif,
                            df_char_nstem,
                            df_word_nstem,
                            df_gram_nstem,
                            df_edit_nstem,
                            df_w2v_nstem,
                            df_tdif_nstem,
                          ), axis=1).replace(np.inf, 1e20)  # Some infinities appear in the middle of the data...

    del df_w2v, df_char_nstem, df_word_nstem, df_gram_nstem, df_w2v_nstem
    del df_gram, df_edit, df_tdif, df_tdif_nstem, df_char, df_word,
    joblib.dump(df_metrics, 'df_metrics.pkl', compress=9)

    #id_test = df_tudo.iloc[num_train:]['id']
    #y_train = df_tudo.iloc[:num_train].relevance.values
    #joblib.dump(y_train, 'y_train.pkl')
    #joblib.dump(id_test, 'id_test.pkl')
    id_test = joblib.load('id_test.pkl')
    y_train = joblib.load('y_train.pkl')

    var = VarianceThreshold()
    var.fit_transform(df_metrics)
    # select the surviving columns by position rather than by label
    df_val_metrics = df_metrics.iloc[:, var.get_support(indices=True)]
    joblib.dump(df_val_metrics, 'df_val_metrics.pkl', compress=9)

    df_train = df_val_metrics.iloc[:num_train]
    df_test = df_val_metrics.iloc[num_train:]
Beispiel #54
0
# ----------------
nifti_masker = NiftiMasker(standardize=False,
                           smoothing_fwhm=2,
                           memory='nilearn_cache')  # cache options
gm_maps_masked = nifti_masker.fit_transform(gm_imgs_train)

# The features with too low between-subject variance are removed using
# :class:`sklearn.feature_selection.VarianceThreshold`.
from sklearn.feature_selection import VarianceThreshold
variance_threshold = VarianceThreshold(threshold=.01)
gm_maps_thresholded = variance_threshold.fit_transform(gm_maps_masked)
gm_maps_masked = variance_threshold.inverse_transform(gm_maps_thresholded)

# Then we convert the data back to the mask image in order to use it for
# decoding process
mask = nifti_masker.inverse_transform(variance_threshold.get_support())

############################################################################
# Prediction pipeline with ANOVA and SVR using
# :class:`nilearn.decoding.DecoderRegressor` Object

# In nilearn we can benefit from the built-in DecoderRegressor object to
# do ANOVA with SVR instead of manually defining the whole pipeline.
# This estimator also uses cross-validation to select the best models and
# ensemble them. Furthermore, you can pass n_jobs=<some_high_value> to the
# DecoderRegressor class to take advantage of a multi-core system.
# To save time (because these are anatomical images with many voxels), we
# include only the 1 percent of voxels most correlated with the age variable.
# We also set the mask hyperparameter to the mask we just obtained above.

from nilearn.decoding import DecoderRegressor
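
# The snippet is truncated after this import; a minimal sketch of the setup the
# comments above describe could look like the following (the parameter values
# and the age_train variable are assumptions, not taken from the original):

decoder = DecoderRegressor(estimator='svr', mask=mask,
                           scoring='neg_mean_absolute_error',
                           screening_percentile=1,
                           n_jobs=1)
# Fit on the training images, with the subjects' ages as the regression target
decoder.fit(gm_imgs_train, age_train)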
Beispiel #55
0

def spearm_cor_func(expected, pred):
    return spearmanr(expected, pred)[0]

# Folders
submission_filename_prefix = 'sc3_emanuel_phase2_'

# Import data
train_exp, train_cnv, train_ess, leader_exp, leader_cnv, prioritized_genes = read_data_sets()

X_train_pre = train_exp
X_test_pre = leader_exp

var_thres = VarianceThreshold(0.65).fit(X_train_pre)
X_train_pre = X_train_pre.loc[:, var_thres.get_support()]
X_test_pre = X_test_pre.loc[:, var_thres.get_support()]

# Prepare features
features = X_train_pre.columns
important_features = []

for gene in prioritized_genes:
    # Assemble prediction variables
    X_train = X_train_pre
    y_train = train_ess.ix[:, gene]
    X_test = X_test_pre

    # Feature selection
    fs = SelectKBest(f_regression, k=100)
    X_train = fs.fit_transform(X_train, y_train)
Beispiel #56
0
・Embedded Method

Filter Methods fall broadly into three groups:
・based only on the values of the features themselves
・correlation coefficients between features
・statistical evaluation metrics

# Feature values only
・variance of zero => every value is identical => drop the feature

from sklearn.feature_selection import VarianceThreshold
X = desc_df.values
select = VarianceThreshold()
X_new = select.fit_transform(X)

np.array(descs)[select.get_support()==False]  # check which descriptors were dropped

・variance close to zero => inspect the data carefully and decide whether to drop

・a feature is an exact duplicate of another feature


# Correlation coefficients between features
Advantages:
・dropping one of each pair of highly correlated features reduces the dimensionality of the feature space with little impact on accuracy
・it improves the interpretability of linear models.

Pearson correlation coefficient (the ordinary correlation coefficient)

threshold = 0.8  # correlation threshold
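
A minimal sketch of this correlation filter (assuming the same descriptor DataFrame desc_df used above; the variable names are illustrative):

import numpy as np

corr = desc_df.corr().abs()                                        # absolute Pearson correlations
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep only the upper triangle
to_drop = [c for c in upper.columns if (upper[c] > threshold).any()]
desc_df_reduced = desc_df.drop(columns=to_drop)                    # drop one of each highly correlated pair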
def do():
    print 'loading data'
    #X_train, y_train = load_svmlight_file("sparse_input.log")    
    #X_train, y_train = load_svmlight_file("GetDict_80w_new_input.txt")    
    X_train, y_train = load_svmlight_file("GetUniq_100w_new_input.txt")    
    #X_test, y_test = load_svmlight_file("svm_test.log", n_features=X_train.shape[1])

    startime= time.clock()
    print 'fs start'
    # The blocks below are switched on/off by wrapping them in throwaway string literals
    _tree='''
    print 'tree start'
    print '1:',X_train.shape
    clf_tree = ExtraTreesClassifier()
    X_train = clf_tree.fit(X_train.toarray(), y_train).transform(X_train)
    print '2:',X_train.shape
    #X_train_tree = sp.sparse.csr_matrix(X_new_tree)
    #'''
    
    #_threshold='''
    th=0.99
    print 'v start,threshold=',th
    print '1:',X_train.shape

    sel = VarianceThreshold(threshold=(th * (1 - th)))
    X_train=sel.fit_transform(X_train)
    print '2:',X_train.shape
    X_train=X_train.toarray()
    #print sel.variances_

    fet_got = sel.get_support(True)
    print type(fet_got)
    idx_str = ''  # avoid shadowing the built-in str
    for it in fet_got:
        idx_str += '%d,' % it
    print idx_str
    return
    #'''
    _Kbest='''
    num=180
    print '_Kbest,',num
    print X_train.shape
    X_train = SelectKBest(chi2, k=num).fit_transform(X_train, y_train)
    print X_train.shape
    X_train=X_train.toarray()
    #'''
    _Kper='''
    print X_train.shape
    X_train = SelectPercentile(f_classif, percentile=10).fit_transform(X_train, y_train)
    print X_train.shape
    
    #'''

    _svd='''
    print 'PCA+RF 1:',X_train.shape
    svd = TruncatedSVD(n_components=500, random_state=42)
    svd.fit(X_train) 
    TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5,
            random_state=42, tol=0.0)
    print 'var percent=',(svd.explained_variance_ratio_.sum())

    X_train = svd.transform(X_train)
    #X_test = svd.transform(X_test)
    
    print '2:',X_train.shape
    #print '2:',X_test.shape
    #'''
    
    fstime = time.clock()
    print 'fs end,time=%fs'%(fstime-startime)
  
    #_cv='''
    print 'cv start,LSVM'
    #clf = svm.SVC()
    #clf = svm.LinearSVC()
    #clf = tree.DecisionTreeClassifier()
    clf = RandomForestClassifier(n_estimators=10)
    scores=cross_validation.cross_val_score(clf,X_train,y_train,cv=5,scoring="accuracy")
    print(scores,scores.mean())
    #cv end
    #'''

    _svm='''
    print 'SVM trainning start'
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    print 'SVM predicting start'

    y_pred = clf.predict(X_train)
    print y_pred
    print y_train
    print 'SVM predict end'
    print "Accuracy", np.mean(y_pred == y_train)
    #'''

    _dtm='''
    print 'DTM train start'
    clf = tree.DecisionTreeClassifier()
    # PCA output is already dense; otherwise call X.toarray()
    clf.fit(X_train, y_train)
    print 'DTM predict start'
    traintime = time.clock()
    print 'trainning end, time=%fs'%(traintime-startime)
    y_pred = clf.predict(X_train)
    #print y_pred
    print 'DTM predict end'
    print "Accuracy", np.mean(y_pred == y_train)
   # '''
    
    predtime = time.clock()
    print 'predict end,time=%fs'%(predtime-startime)
Beispiel #58
0
    def find_low_variance_features(self, threshold=0.0, skip_columns=[]):
        """
        Wrapper for sklearn VarianceThreshold for use on pandas dataframes.
        """
        df = self.dataset.data
        #print("Finding low-variance features.")
        #try:
        # get list of all the original df columns
        all_columns = df.columns

        # remove `skip_columns`
        remaining_columns = all_columns.drop(skip_columns)

        # get length of new index
        max_index = len(remaining_columns) - 1

        # get indices for `skip_columns`
        skipped_idx = [all_columns.get_loc(column) for column in skip_columns]

        # adjust insert location by the number of columns removed
        # (for non-zero insertion locations) to keep relative
        # locations intact
        for idx, item in enumerate(skipped_idx):
            if item > max_index:
                diff = item - max_index
                skipped_idx[idx] -= diff
            if item == max_index:
                diff = item - len(skip_columns)
                skipped_idx[idx] -= diff
            if idx == 0:
                skipped_idx[idx] = item

        # get values of `skip_columns`
        skipped_values = df.iloc[:, skipped_idx].values

        # get dataframe values
        X = df.loc[:, remaining_columns].values

        # instantiate VarianceThreshold object
        vt = VarianceThreshold(threshold=threshold)

        # fit vt to data
        vt.fit(X)

        # get the indices of the features that are being kept
        feature_indices = vt.get_support(indices=True)

        # remove low-variance columns from index
        feature_names = [
            remaining_columns[idx] for idx, _ in enumerate(remaining_columns)
            if idx in feature_indices
        ]

        # get the columns to be removed
        low_variance_features = list(
            np.setdiff1d(remaining_columns, feature_names))
        self.low_variance_features += low_variance_features

        if (len(low_variance_features)):
            self.log.info(
                "find_low_variance_features: {0} features below {1}.".format(
                    len(low_variance_features), threshold))
        else:
            self.log.info(
                "find_low_variance_features: none found below threshold %s.",
                threshold)

        return low_variance_features
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
ids_tr = train.pop('id').values
ids_te = test.pop('id').values
magic_tr = train.pop('wheezy-copper-turtle-magic').values
magic_te = test.pop('wheezy-copper-turtle-magic').values
target = train.pop('target').values
train = train.values
test = test.values

# informative columns for each magic value
vt = VarianceThreshold(threshold=1.5)
infomative_cols = []
for i in range(MAX_MAGIC_NO):
    vt.fit(train[magic_tr == i])
    infomative_cols.append(vt.get_support(indices=True))

### Step-1 ###
oof_all = []
pred_all = []
for n in range(1, MAX_COMPONENTS + 1):
    oof_n = np.zeros(len(train))
    pred_n = np.zeros(len(test))
    gmm0 = GaussianMixture(n_components=n,
                           covariance_type='full',
                           random_state=RANDOM_SEED)
    gmm1 = GaussianMixture(n_components=n,
                           covariance_type='full',
                           random_state=RANDOM_SEED)
    for i in range(MAX_MAGIC_NO):
        print('.', end='')
Beispiel #60
0
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# ### Constant Feature Removal

# In[12]:

constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)

# In[13]:

constant_filter.get_support().sum()  # number of non-constant features kept

# In[14]:

constant_list = [not temp for temp in constant_filter.get_support()]  # True for constant features
constant_list

# In[15]:

x.columns[constant_list]

# In[16]:

x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)