Ejemplo n.º 1
0
    def check(self, train, dropped_columns_):
        """Verify a submitted list of dropped columns against a reference run.

        The reference is produced by fitting ``SelectKBest(f_classif, k=40)``
        on ``train`` and collecting every feature column the selector zeroes
        out.

        Parameters
        ----------
        train : DataFrame with the feature columns plus 'click_time',
            'attributed_time' and 'is_attributed'.
        dropped_columns_ : array-like of column labels submitted for checking.

        Raises
        ------
        AssertionError
            If the submission does not hold 51 entries, or if it differs from
            the reference list of dropped columns.
        """
        submitted_count = dropped_columns_.shape[0]
        assert submitted_count == 51, "Please choose to keep 40 columns"

        non_feature_cols = ['click_time', 'attributed_time', 'is_attributed']
        candidate_cols = train.columns.drop(non_feature_cols)

        # The selector is fitted on the training data only.
        kbest = SelectKBest(f_classif, k=40)
        kept_values = kbest.fit_transform(train[candidate_cols],
                                          train['is_attributed'])

        # Expanding back to the full width fills rejected features with zeros.
        restored = kbest.inverse_transform(kept_values)
        restored_frame = pd.DataFrame(restored,
                                      index=train.index,
                                      columns=candidate_cols)

        # A column of zeros has zero variance: that marks it as dropped.
        is_constant = restored_frame.var() == 0
        expected_dropped = restored_frame.columns[is_constant]

        message = (
            "Somethings not right with your result. Be sure to use the train "
            "dataset for the feature selection")

        matches = expected_dropped == dropped_columns_
        assert matches.all(), message
Ejemplo n.º 2
0
    def features_selection(self):
        """Return the indices of the feature columns to use.

        Three cases, decided by the instance configuration:

        * ``features_number`` is non-zero and smaller/different from
          ``dataset_features_number``: fit ``SelectKBest`` with
          ``mutual_info_classif`` on the first ``dataset_features_number``
          columns of ``training_set`` and return the selected column indices
          (ascending, as a list of ints).
        * otherwise, ``packets_number == 0``: every column index is returned
          as a list.
        * otherwise: the first ``packets_number`` indices of each half of the
          feature range are returned as an integer NumPy array.

        Returns
        -------
        list[int] or numpy.ndarray
        """
        if (self.features_number != 0
                and self.features_number != self.dataset_features_number):
            selector = SelectKBest(mutual_info_classif, k=self.features_number)
            selector.fit(self.training_set[:, :self.dataset_features_number],
                         self.ground_through)
            # Ask the fitted selector directly which columns it kept. Matching
            # selected columns against a zero-filled reconstruction gives wrong
            # indices for all-zero or duplicated columns and may index past
            # the last column.
            return selector.get_support(indices=True).tolist()

        if self.packets_number == 0:
            return list(range(self.dataset_features_number))

        # Floor division keeps the slice bounds integral; true division would
        # make np.r_ return floats, which are not valid indices.
        half = self.dataset_features_number // 2
        return np.r_[0:self.packets_number, half:half + self.packets_number]
Ejemplo n.º 3
0
	def features_selection(self, node):
		"""Return the indices of the feature columns to use for ``node``.

		When ``node.features_number`` is non-zero and differs from
		``self.dataset_features_number``, a ``SelectKBest`` selector scored
		with ``mutual_info_classif`` is fitted on the node's training rows
		(both features and labels are passed through ``node.encoder.transform``
		first) and the selected column indices are returned as a list of ints.

		Otherwise every column index is returned when ``node.packets_number``
		is 0, else the first ``node.packets_number`` indices of each half of
		the feature range (as an integer NumPy array).
		"""
		if node.features_number != 0 and node.features_number != self.dataset_features_number:
			selector = SelectKBest(mutual_info_classif, k=node.features_number)
			selector.fit(
				node.encoder.transform(self.training_set[node.train_index, :self.dataset_features_number]),
				node.encoder.transform(self.ground_truth[node.train_index, node.level], dtype=int)
			)
			# The selector knows exactly which columns it kept. Comparing the
			# encoded selection against a decoded, zero-filled reconstruction
			# is unreliable and can index past the last column.
			return selector.get_support(indices=True).tolist()

		if node.packets_number == 0:
			return list(range(self.dataset_features_number))

		# Floor division keeps the slice bounds integral; true division would
		# make np.r_ return floats, which are not valid indices.
		half = self.dataset_features_number // 2
		return np.r_[0:node.packets_number, half:half + node.packets_number]
Ejemplo n.º 4
0
	def features_selection(self):
		'''
		Return the indices of the feature columns to use.

		When ``features_number`` is non-zero and differs from
		``dataset_features_number``, SelectKBest (mutual information) is fitted
		on the first ``dataset_features_number`` columns of ``training_set``
		and the selected column indices are returned as a list of ints.

		Otherwise every column index is returned when ``packets_number`` is 0,
		else the first ``packets_number`` indices of each half of the feature
		range (as an integer NumPy array).

		The selected indices are read from the fitted selector, so the result
		no longer relies on the assumption that constant-valued features are
		never selected.
		'''
		if self.features_number != 0 and self.features_number != self.dataset_features_number:
			selector = SelectKBest(mutual_info_classif, k=self.features_number)
			selector.fit(
				self.training_set[:, :self.dataset_features_number], self.ground_through)
			# Matching selected columns against a zero-filled reconstruction
			# gives wrong indices for all-zero or duplicated columns and may
			# index past the last column; the support mask is exact.
			return selector.get_support(indices=True).tolist()

		if self.packets_number == 0:
			return list(range(self.dataset_features_number))

		# Floor division keeps the slice bounds integral; true division would
		# make np.r_ return floats, which are not valid indices.
		half = self.dataset_features_number // 2
		return np.r_[0:self.packets_number, half:half + self.packets_number]
Ejemplo n.º 5
0
	def features_selection(self, node):
		"""Return the indices of the feature columns to use for ``node``.

		When ``node.features_number`` is non-zero and differs from
		``self.dataset_features_number``, a ``SelectKBest`` selector scored
		with ``mutual_info_classif`` is fitted on the node's training rows and
		the labels of the node's level; the selected column indices are
		returned as a list of ints.

		Otherwise every column index is returned when ``node.packets_number``
		is 0, else the first ``node.packets_number`` indices of each half of
		the feature range (as an integer NumPy array).
		"""
		if node.features_number != 0 and node.features_number != self.dataset_features_number:
			selector = SelectKBest(mutual_info_classif, k=node.features_number)
			selector.fit(
				self.training_set[node.train_index, :self.dataset_features_number],
				self.ground_through[node.train_index, node.level])
			# Matching selected columns against a zero-filled reconstruction
			# gives wrong indices for all-zero or duplicated columns and may
			# index past the last column; the support mask is exact.
			return selector.get_support(indices=True).tolist()

		if node.packets_number == 0:
			return list(range(self.dataset_features_number))

		# Floor division keeps the slice bounds integral; true division would
		# make np.r_ return floats, which are not valid indices.
		half = self.dataset_features_number // 2
		return np.r_[0:node.packets_number, half:half + node.packets_number]
Ejemplo n.º 6
0
def feature_selection_univariate(Xtrain, ytrain, keep=5):
    """Split the columns of ``Xtrain`` into selected and dropped features.

    A ``SelectKBest`` selector scored with ``mutual_info_classif`` is fitted
    on the training data and keeps ``keep`` features.

    Parameters
    ----------
    Xtrain : DataFrame of training features.
    ytrain : training target passed to the selector.
    keep : number of features to keep (default 5).

    Returns
    -------
    (selected_columns, dropped_columns) : two pandas Index objects that
        together cover ``Xtrain.columns``.
    """
    selector = SelectKBest(mutual_info_classif, k=keep)
    selector.fit(Xtrain, ytrain)

    # Read the selection from the fitted selector. Inferring it from the
    # variance of a zero-filled reconstruction would report a selected but
    # constant column as dropped.
    kept_mask = selector.get_support()
    selected_columns = Xtrain.columns[kept_mask]
    dropped_columns = Xtrain.columns[~kept_mask]

    return selected_columns, dropped_columns
Ejemplo n.º 7
0
def chiSquare(train_data, train_classes, topK):
    """Keep the ``topK`` chi-squared best features of dict-shaped samples.

    ``train_data`` is vectorized with a ``DictVectorizer``, reduced with
    ``SelectKBest(chi2)`` and mapped back through both inverse transforms.
    ``topK`` is capped at the number of vectorized features.

    Returns whatever ``DictVectorizer.inverse_transform`` yields for the
    zero-filled reconstruction of the reduced matrix.
    """
    dict_vectorizer = DictVectorizer()
    feature_matrix = dict_vectorizer.fit_transform(train_data)

    # Never request more features than the vectorizer produced.
    n_keep = min(topK, feature_matrix.shape[1])

    kbest = SelectKBest(chi2, k=n_keep)
    reduced = kbest.fit_transform(feature_matrix, train_classes)
    restored = kbest.inverse_transform(reduced)

    return dict_vectorizer.inverse_transform(restored)
Ejemplo n.º 8
0
def feature_selectionKbest(data, y, num_feature):
    """Zero out all but the ``num_feature`` best columns of ``data``.

    Both ``data`` and ``y`` are ordered by their 'pid' column before fitting
    ``SelectKBest(f_classif)``; the label values are read from the column
    named by the module-level ``sep``.

    Returns
    -------
    DataFrame with the index and columns of ``data`` sorted by 'pid', in
    which the columns rejected by the selector are filled with zeros.
    """
    # Sort once and reuse the result for the values, the index and the columns.
    sorted_data = data.sort_values('pid')
    xx = sorted_data.values
    xx_label = y.sort_values('pid')[sep].values

    select = SelectKBest(f_classif, k=num_feature).fit(xx, xx_label)

    # transform + inverse_transform restores the original width, with zeros
    # in place of the rejected features.
    new_data = select.inverse_transform(select.transform(xx))
    return pd.DataFrame(new_data,
                        index=sorted_data.index,
                        columns=sorted_data.columns)
Ejemplo n.º 9
0
def feature_selection_anova_f_value(data, features_count):
    """Return the names of the top features ranked by ANOVA F-value.

    :param data: DataFrame holding the candidate features plus a 'TARGET' column
    :param features_count: the number of top features to keep
    :return: pandas Index with the names of the selected feature columns
    """
    feature_cols = data.columns.drop('TARGET')

    selector = SelectKBest(f_classif, k=features_count)
    selector.fit(data[feature_cols], data['TARGET'])

    # Read the selection from the fitted selector. Inferring it from the
    # variance of a zero-filled reconstruction would lose any selected
    # column that happens to be constant.
    return feature_cols[selector.get_support()]
Ejemplo n.º 10
0
def get_data(features=None):
    """Return the requested columns, or the k best features when none are given.

    Relies on the module-level names ``data``, ``X``, ``y`` and ``k``.

    :param features: optional sequence of column names; when non-empty, the
        matching columns of ``data`` are returned unchanged.
    :return: DataFrame with either the requested columns of ``data`` or the
        ``k`` columns of ``X`` chosen by ``SelectKBest(f_classif)``.
    """
    if features is not None and len(features) > 0:
        return data[features]

    # `k` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    selector = SelectKBest(f_classif, k=k)
    selector.fit(X, y)

    # Select columns with the selector's own mask. Indexing the frame with a
    # per-column boolean Series selects rows, not columns.
    return X.loc[:, selector.get_support()]
Ejemplo n.º 11
0
def univariate_selection(X_train, y_train, features, k_feat, method):
    """Return the ``k_feat`` features most related to the target.

    Each feature in ``features`` is scored against the target on its own
    (one feature at a time) and the ``k_feat`` best are kept. ``method`` is
    the scoring function handed to ``SelectKBest``, e.g. ``f_classif``
    (ANOVA F-score), ``chi2`` or ``mutual_info_classif`` (which also
    captures non-linear relationships). Pick a scoring function that matches
    the task: the classification and regression variants differ.

    Parameters
    ----------
    X_train : DataFrame with the training features.
    y_train : training target.
    features : list of column names of ``X_train`` to consider.
    k_feat : number of features to keep.
    method : scoring callable accepted by ``SelectKBest``.

    Returns
    -------
    pandas Index with the names of the columns to keep.
    """
    from sklearn.feature_selection import SelectKBest

    # Create the selector, keeping k features, and fit it on the training data
    selector = SelectKBest(method, k=k_feat)
    selector.fit(X_train[features], y_train)

    # Read the kept columns from the fitted selector. Inferring them from the
    # variance of a zero-filled reconstruction would lose any selected column
    # that happens to be constant.
    keep_columns = pd.Index(features)[selector.get_support()]

    return keep_columns
Ejemplo n.º 12
0
    def SelectKBestFeatures(self, k):
        """Select the ``k`` best features by ANOVA F-value.

        Parameters
        ----------
        k : number of best features wanted from the model.

        Returns
        -------
        The columns of ``self.train`` that were selected (also stored in
        ``self.k_selected_columns``), or ``None`` when ``k`` exceeds
        ``self.track``. The reduced feature matrix is stored in
        ``self.features_new``.
        """
        if k > self.track:
            print(
                'Error, Number of features to be selected are more than the number of features in the dataset'
            )
            return None

        selector = SelectKBest(f_classif, k=k)
        self.features_new = selector.fit_transform(self.features, self.target)

        # Read the selection from the fitted selector. Inferring it from the
        # variance of a zero-filled reconstruction would lose any selected
        # column that happens to be constant.
        kept_names = self.features.columns[selector.get_support()]
        self.k_selected_columns = self.train[kept_names]

        print('Top {} features : \n{}'.format(k,
                                              self.k_selected_columns.columns))
        return self.k_selected_columns
Ejemplo n.º 13
0
def Best125():
    """Load the UCI HAR dataset files and keep the 125 best features.

    All inputs are read from hard-coded paths under
    /Users/user/Desktop/UCI_HAR_Dataset. A ``SelectKBest(f_classif, k=125)``
    selector is fitted on the training data and applied to the test data.
    The shapes of the resulting arrays are printed.

    Returns a 9-tuple:
    (data_train, data_test, output_train, output_test, unscaled_data_test,
    activities_labels, feature_labels, groups_at_training, groups_at_testing).
    ``data_train`` and ``data_test`` are the reduced arrays;
    ``unscaled_data_test`` is the same object as ``data_test``.
    """
    # Activity names, flattened to a plain Python list
    activities_labels = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/activity_labels.csv", header=None)
    activities_labels = pd.DataFrame(activities_labels).to_numpy()
    activities_labels = list(activities_labels.flatten())

    # Feature names; kept as the DataFrame returned by read_csv
    feature_labels = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/features.csv",
        header=None,
        delim_whitespace=True)

    # Subject files for the training rows, reshaped to 1-D
    groups_at_training = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/subject_train.txt",
        header=None)
    groups_at_training = pd.DataFrame(groups_at_training).to_numpy()
    groups_at_training = groups_at_training.reshape(len(groups_at_training), )
    # groups_at_training = list(groups_at_training.flatten())

    # Subject files for the test rows, reshaped to 1-D
    groups_at_testing = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/subject_test.txt",
        header=None)
    groups_at_testing = pd.DataFrame(groups_at_testing).to_numpy()
    groups_at_testing = groups_at_testing.reshape(len(groups_at_testing), )
    # groups_at_testing = list(groups_at_testing.flatten())

    # Specify data
    data_train = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/X_train.csv",
        delim_whitespace=True,
        header=None)
    data_test = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/X_test.csv",
        delim_whitespace=True,
        header=None)
    data_train = pd.DataFrame(data_train).to_numpy()
    data_test = pd.DataFrame(data_test).to_numpy()

    # Targets, reshaped to 1-D
    output_train = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/y_train.csv",
        delim_whitespace=True,
        header=None)
    output_test = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/y_test.csv",
        delim_whitespace=True,
        header=None)
    output_train = pd.DataFrame(output_train).to_numpy()
    output_test = pd.DataFrame(output_test).to_numpy()
    # NOTE(review): the training targets are reshaped with the length of
    # groups_at_training rather than their own length - this only works when
    # both files have the same number of rows; confirm this is intended.
    output_train = output_train.reshape(len(groups_at_training), )
    output_test = output_test.reshape(len(output_test), )

    # Fit the selector on the training data only, then apply it to the test data
    selector = SelectKBest(f_classif, k=125)
    data_train = selector.fit_transform(data_train, output_train)
    data_test = selector.transform(data_test)
    # No scaling is applied below (it is commented out), so this is an alias
    # of data_test, not a copy.
    unscaled_data_test = data_test

    # Full-width reconstruction with the rejected features zeroed out.
    # NOTE(review): this frame is neither used nor returned, and a DataFrame
    # (feature_labels) is passed as `columns` - verify this is what is wanted.
    selected_features = pd.DataFrame(selector.inverse_transform(data_train),
                                     columns=feature_labels)
    #print(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    #selected_columns = selected_features.columns[selected_features.var() != 0]
    #print(selected_columns)

    #scaler = MinMaxScaler()
    #data_train= scaler.fit_transform(data_train)
    #data_test = scaler.transform(data_test)

    # Report the shape of everything that is returned (except feature_labels)
    print("Data_train", np.shape(data_train), "Data_test", np.shape(data_test),
          "Output_train", np.shape(output_train), "Output_test",
          np.shape(output_test), "activities_labels",
          np.shape(activities_labels), "groups_at_training",
          np.shape(groups_at_training), "groups_at_testing",
          np.shape(groups_at_testing))

    print("################################")

    return data_train, data_test, output_train, output_test, unscaled_data_test, activities_labels, feature_labels, groups_at_training, groups_at_testing
# we can plug them together in a *pipeline* that performs the two operations
# successively:
# NOTE(review): `feature_selection`, `clf`, `X`, `y`, `nifti_masker`,
# `plot_haxby`, `np` and `pl` are not defined in this excerpt; they are
# presumably set up earlier in the script.
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])

### Fit and predict ###########################################################

# The pipeline is fitted and then used to predict on the same X.
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

### Visualisation #############################################################

### Look at the discriminating weights
# `svc` is rebound at every step: support vectors -> full feature space ->
# image object returned by the masker.
svc = clf.support_vectors_
# reverse feature selection
svc = feature_selection.inverse_transform(svc)
# reverse masking (only the first support vector, svc[0], is converted)
svc = nifti_masker.inverse_transform(svc[0])

# We use a masked array so that the voxels equal to 0 (the features removed
# by the selection or the mask) are displayed transparently
act = np.ma.masked_array(svc.get_data(), svc.get_data() == 0)

# Plot the masked map and save the current figure in two formats
plot_haxby(act, 'SVC')
pl.savefig('haxby_svm.pdf')
pl.savefig('haxby_svm.eps')

###############################################################################
#                                                                             #
#   Searchlight                                                               #
#                                                                             #
Ejemplo n.º 15
0
    # Best DNetPRO signature
    dnet_signature = dnet.selected_signature

    # print some information
    print('Signature DNetPRO: {}'.format(sorted(dnet_signature)))
    print('DNetPRO score: {:.3f}'.format(dnet.score(X_test, y_test)))
    # features with an index below Ninformative are counted as informative
    print('Informative found: {:d} / {:d}'.format(
        len([x for x in dnet_signature if x < Ninformative]), Ninformative))

    # Compare the obtained results against the Kbest features with K=number of feature in the DNetPRO signature
    # NOTE(review): K is taken from `new_probe`, which is defined outside this
    # excerpt - presumably the size of the DNetPRO signature; confirm.
    filter_kbest = SelectKBest(k=new_probe)
    # extract the filtered datasets
    Kbest_data = filter_kbest.fit_transform(X_train, y_train)
    # set to zero the other features
    Kbest_filtered = filter_kbest.inverse_transform(Kbest_data)
    # now it is easy to extract the selected features as non-zero columns
    # NOTE(review): a selected column whose training values are all zero would
    # be missed here; filter_kbest.get_support(indices=True) would be exact.
    Kbest_signature = set(np.nonzero(Kbest_filtered)[1])

    # Just to be sure that everything goes right...
    if not (len(Kbest_signature) == len(dnet_signature)):
        raise ValueError(
            'Inconsistent length of features between the two methods')

    # print some information
    print('Signature Kbest: {}'.format(sorted(Kbest_signature)))
    # the classifier is fitted on the reduced training data and scored on the
    # test data reduced with the same fitted filter
    print('Kbest score: {:.3f}'.format(
        classifier.fit(Kbest_data,
                       y_train).score(filter_kbest.transform(X_test), y_test)))
    print('Informative found: {:d} / {:d}'.format(
        len([x for x in Kbest_signature if x < Ninformative]), Ninformative))
Ejemplo n.º 16
0
def create_coef_maps(subid, X, ev_labels, func_masker, mask_name, paths,
                     save_img=True, show_img=False, calc_A=False, univariate_fsel_k=None):

    """Create coef maps, output niftis and pngs

    A logistic regression (L2 penalty, C=1) is fitted on ``X`` / ``ev_labels``
    and, for each category, its coefficients (or the activation pattern
    derived from them) are mapped back to image space with ``func_masker``.

    Parameters
    ----------
    subid : str
        Subject ID (e.g., 'ap01')
    X : 2D numpy array
        Selected BOLD data (sample x voxel) for classification
    ev_labels : list/array of strings
        condition labels (length = # of samples)
    func_masker : NiftiMasker object
        created from mask image, use to convert 2D back to 3D
    mask_name : str
        name inserted into the output file names
    paths : dict
        filepaths to the mean functional (to use as background for png) & output directory
    save_img : bool
        save out niftis and png images?
    show_img : bool
        show images? (only honoured when ``save_img`` is False)
    calc_A : bool
        calculate activation patterns? This is just covariance matrix of data multiplied by coefficients. Might crash computer if X is too big?
        http://www.sciencedirect.com/science/article/pii/S1053811913010914
    univariate_fsel_k : int
        include univariate feature selection (using ANOVA) for k best features before training classifier

    Returns
    -------
    d_coef : pandas DataFrame
        one column of classifier coefficients per category (in the feature
        space the classifier was trained on)
    """

    # if using feature selection, do it now
    fsel = None
    if univariate_fsel_k:
        fsel = SelectKBest(f_classif, k=univariate_fsel_k).fit(X, ev_labels)
        X = fsel.transform(X)

    classifier = LogisticRegression(penalty='l2', C=1.)
    classifier.fit(X, ev_labels)

    print(classifier.classes_)

    # A binary problem has a single coefficient row; report it under the
    # first class label.
    if len(classifier.classes_) > 2:
        cats = classifier.classes_
    else:
        cats = [classifier.classes_[0]]

    d_coef = pd.DataFrame(columns=cats)

    print('Categories: ')
    print(cats)

    # The data covariance does not depend on the category: compute it once.
    data_cov = None
    if calc_A: #careful, might crash computer...
        print('computing cov mat...')
        data_cov = np.cov(X.T)

    for category in cats:
        print(category)

        # Get coefficients for the category of interest, save in d_coef df
        if len(classifier.classes_) > 2:
            coef = classifier.coef_[classifier.classes_ == category]
        else:
            coef = classifier.coef_
        d_coef[category] = pd.Series(coef.squeeze())

        # Activation patterns must be computed in the feature space the
        # classifier was trained on: the covariance of the (possibly reduced)
        # X only matches the coefficients *before* feature selection is
        # reversed.
        if calc_A:
            print('multiplying matrices...')
            weights = np.matmul(data_cov, coef.T).T
            filename = "{mask_name}_activationpattern_{category}"
        else:
            weights = coef
            filename = "{mask_name}_coef_{category}"

        # reverse feature selection if necessary
        if fsel is not None:
            weights = fsel.inverse_transform(weights)

        # Transform activation patterns or coefs to native space
        if calc_A:
            print('transforming to epi space')
        weight_img = func_masker.inverse_transform(weights)

        # Save output
        if save_img:
            filepath = paths['outnifti'].format(subid=subid)

            if not op.exists(filepath):
                os.makedirs(filepath)

            weight_img.to_filename(op.join(filepath,
                                           filename + '.nii.gz').format(mask_name=mask_name,
                                                                        category=category))
            plotting.plot_stat_map(weight_img,
                                   paths['meanfile'].format(subid=subid, run_id=1),
                                   title=category,
                                   output_file=op.join(filepath,
                                                       filename + '.png').format(mask_name=mask_name,
                                                                                 category=category))
        elif show_img:
            print('plot stat map')
            plotting.plot_stat_map(weight_img,
                                   paths['meanfile'].format(subid=subid, run_id=1),
                                   title=category)

    return d_coef
Ejemplo n.º 17
0
# Univariate feature selection on `baseline_data`: keep the 5 features with
# the best ANOVA F-value, fitted on the training split only.
feature_cols = baseline_data.columns.drop('outcome')

# Keep 5 features
selector = SelectKBest(f_classif, k=5)

# (same assignment as above, repeated)
feature_cols = baseline_data.columns.drop('outcome')
train, valid, _ = get_data_splits(baseline_data)

## only with train data to avoid data leakage
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
X_new

## To obtain dropped features: 
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(
    selector.inverse_transform(X_new), 
    index=train.index, 
    columns=feature_cols)
selected_features.head()

# This returns a DataFrame with the same index and columns as the training set, 
# but all the dropped columns are filled with zeros.

# Dropped columns have values of all 0s, so var is 0, drop them
# NOTE(review): a selected column that is constant would also have zero
# variance; selector.get_support() gives the selection directly.
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
# (.head() keeps only the first rows of that selection)
valid_new = valid[selected_columns].head()

# L1 reg: 
from sklearn.linear_model import LogisticRegression
Ejemplo n.º 18
0
import pandas as pd
from preprocessing import df_all

"""
A large number of features can lead to overfitting, and optimizing hyperparameters and training algorithms can take longer.
That is why we want to pick the most relevant features from the beginning.

Univariate feature selection: it measures how strongly the target depends on each feature
in the dataset using a statistical test (such as χ2 or the ANOVA F-test).
We use SelectKBest, which supports several scoring functions (its default is the ANOVA F-value, f_classif, which is also used below).
"""

# Candidate features: three columns copied from df_all
df_f_selection = pd.DataFrame()
df_f_selection['Inspired'] = df_all['Inspired']
df_f_selection['Envious'] = df_all['Envious']
df_f_selection['Angry'] = df_all['Angry']

# Target: the 'Age' column, kept as a one-column DataFrame
df_age = pd.DataFrame()
df_age['Age'] = df_all['Age']

# NOTE(review): SelectKBest and f_classif are not imported in this excerpt.
# NOTE(review): k=3 equals the number of candidate columns, so every feature
# is kept and nothing is filtered out.
selector = SelectKBest(f_classif, k=3)

selected_data = selector.fit_transform(df_f_selection, df_age)

print(selected_data)
# Rebuild a DataFrame with the original index and column names from the
# selector output
selected_features = pd.DataFrame(selector.inverse_transform(selected_data), index=df_f_selection.index,
                                 columns=df_f_selection.columns)

selected_columns = selected_features.columns
print(selected_features[selected_columns].head())
# we can plug them together in a *pipeline* that performs the two operations
# successively:
# NOTE(review): `feature_selection`, `clf`, `X`, `y`, `nifti_masker`,
# `mean_img` and `np` are not defined in this excerpt; they are presumably
# set up earlier in the script.
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])

### Fit and predict ###########################################################

# The pipeline is fitted and then used to predict on the same X.
anova_svc.fit(X, y)
y_pred = anova_svc.predict(X)

### Visualisation #############################################################

### Look at the discriminating weights
svc = clf.support_vectors_
# reverse feature selection
svc = feature_selection.inverse_transform(svc)
# reverse masking (only the first support vector, svc[0], is converted)
niimg = nifti_masker.inverse_transform(svc[0])

# We use a masked array so that the voxels equal to 0 (the features removed
# by the selection or the mask) are displayed transparently
act = np.ma.masked_array(niimg.get_data(), niimg.get_data() == 0)

### Create the figure
import pylab as pl
pl.axis('off')
pl.title('SVM vectors')
# Slice 27 of the mean image in gray as background, then the same slice of
# the masked support-vector map in the 'hot' colormap on top
pl.imshow(np.rot90(mean_img[..., 27]), cmap=pl.cm.gray,
          interpolation='nearest')
pl.imshow(np.rot90(act[..., 27]), cmap=pl.cm.hot,
          interpolation='nearest')
Ejemplo n.º 20
0
# We have our predictor (SVR), our feature selection (SelectKBest), and now,
# we can plug them together in a *pipeline* that performs the two operations
# successively:
# NOTE(review): `feature_selection`, `svr`, `gm_maps_masked`, `age`,
# `nifti_masker`, `dataset_files`, `nibabel`, `np` and `plt` are not defined
# in this excerpt; they are presumably set up earlier in the script.
from sklearn.pipeline import Pipeline
anova_svr = Pipeline([('anova', feature_selection), ('svr', svr)])

### Fit and predict
# The pipeline is fitted and then used to predict on the same data.
anova_svr.fit(gm_maps_masked, age)
age_pred = anova_svr.predict(gm_maps_masked)

### Visualisation
### Look at the SVR's discriminating weights
coef = svr.coef_
# reverse feature selection
coef = feature_selection.inverse_transform(coef)
# reverse masking
weight_niimg = nifti_masker.inverse_transform(coef)

# We use a masked array so that the voxels equal to 0 are transparent
weights = np.ma.masked_array(weight_niimg.get_data(),
                             weight_niimg.get_data() == 0)

### Create the figure
background_img = nibabel.load(dataset_files.gray_matter_maps[0]).get_data()
picked_slice = 36
plt.figure(figsize=(5.5, 5.5))
data_for_plot = weights[:, :, picked_slice, 0]
# NOTE(review): max(min, max) always equals the maximum; if a symmetric
# colour range was intended this should probably use -np.min(...) - confirm.
vmax = max(np.min(data_for_plot), np.max(data_for_plot)) * 0.5
plt.imshow(np.rot90(background_img[:, :, picked_slice]), cmap=plt.cm.gray,
          interpolation='nearest')
Ejemplo n.º 21
0
# apply log transformation
# (histogram of log(goal) with 50 bins over the range 0..25)
plt.hist(np.log(ks.goal), range=(0,25), bins=50);

# select the best features based on F-value (k=6 below)
# feature selection should use training data only to avoid leakage
# (1) drop the target column from the dataset
# (2) split the dataset into training, validation and testing
# (3) create a feature selector
# (4) apply the feature selector to the training dataset
# (5) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros
# (6) find selected columns by choosing features with nonzero variance
feature_cols = data.columns.drop('outcome')
train, valid, test = get_data_splits(data)
selector = SelectKBest(f_classif, k=6)
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=train.index, 
                                 columns=feature_cols)
# NOTE(review): a selected column that is constant would also have zero
# variance; selector.get_support() gives the selection directly.
selected_columns = selected_features.columns[selected_features.var()!=0]

# L1 regularization
# feature selection using L1 regularization should use training data only
# (1) split the data into training, validation and testing
# (2) drop the target column
# (3) fit a logistic regression model to the training dataset (the smaller the parameter C the more penalty)
# (4) select the nonzero coefficients using .SelectFromModel method
# (5) select features based on the nonzero coefficients
# (6) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros
# (7) find selected columns by choosing features with nonzero variance
train, valid, test = get_data_splits(data)
X, y = train[train.columns.drop("outcome")], train['outcome']
Ejemplo n.º 22
0
    # # clf = Pipeline([
    # #   ('feature_selection', LinearSVC(penalty="l1",dual=False)),
    # #   ('classification', LinearSVC(penalty='l2'))
    # # ])
    # # clf.fit(X, y)

    # ANOVA F-test pre-selection of the 50 best features
    feature_selection = SelectKBest(f_classif, k=50)  # take the best 50
    # to make life easier we will create a pipeline object
    pipe = Pipeline([('anova', feature_selection), ('svc', clf)])

    # where the magic happens: fit on each training fold, score on the held-out
    # fold, and accumulate the classifier weights mapped back to the full
    # feature space
    for ii, (train, test) in enumerate(cv):
        pipe.fit(X[train], y[train])
        y_pred = pipe.predict(X[test])
        scores[ii] = np.sum(y_pred == y[test]) / float(len(y[test]))
        feature_weights += feature_selection.inverse_transform(clf.coef_)

    # print() with a single pre-formatted argument behaves the same on
    # Python 2 and Python 3
    print('Average prediction accuracy: %0.3f | standard deviation:  %0.3f' %
          (scores.mean(), scores.std()))

# from sklearn import cross_validation
# from sklearn.feature_selection import SelectPercentile, f_classif

# loocv = cross_validation.LeaveOneOut(len(y))
# clf = RandomForestClassifier(n_estimators=500, max_features=np.sqrt(X.shape[1]), max_depth=None, min_samples_split=1, compute_importances=True)

# top_features = 50
# # computing feature importance
# acc = 0
# cnt = .0
# for train_index, test_index in loocv:
Ejemplo n.º 23
0
# we will use an ANOVA f-test to preselect relevant spatio-temporal units
# NOTE(review): `clf`, `cv`, `X`, `y`, `n_splits`, `n_vertices` and `n_times`
# are not defined in this excerpt; presumably set up earlier in the script.
feature_selection = SelectKBest(f_classif, k=500)  # take the best 500
# to make life easier we will create a pipeline object
anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])

# initialize score and feature weights result arrays
scores = np.zeros(n_splits)
feature_weights = np.zeros([n_vertices, n_times])

# hold on, this may take a moment
# For every split: fit on the training part, compute the accuracy on the
# test part, and accumulate the classifier coefficients mapped back to the
# full (n_vertices, n_times) feature grid.
for ii, (train, test) in enumerate(cv):
    anova_svc.fit(X[train], y[train])
    y_pred = anova_svc.predict(X[test])
    y_test = y[test]
    scores[ii] = np.sum(y_pred == y_test) / float(len(y_test))
    feature_weights += feature_selection.inverse_transform(clf.coef_) \
        .reshape(n_vertices, n_times)

print('Average prediction accuracy: %0.3f | standard deviation:  %0.3f' %
      (scores.mean(), scores.std()))

# prepare feature weights for visualization
# (ii + 1 is the number of splits that were iterated)
feature_weights /= (ii + 1)  # create average weights
# create mask to avoid division error
feature_weights = np.ma.masked_array(feature_weights, feature_weights == 0)
# normalize scores for visualization purposes
# (each row is divided by its std first, then its mean is subtracted)
feature_weights /= feature_weights.std(axis=1)[:, None]
feature_weights -= feature_weights.mean(axis=1)[:, None]

# unmask, take absolute values, emulate f-value scale
feature_weights = np.abs(feature_weights.data) * 10
# we will use an ANOVA f-test to preselect relevant spatio-temporal units
# NOTE(review): `clf`, `cv`, `X`, `y`, `n_splits`, `n_vertices` and `n_times`
# are not defined in this excerpt; presumably set up earlier in the script.
feature_selection = SelectKBest(f_classif, k=500)  # take the best 500
# to make life easier we will create a pipeline object
anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)])

# initialize score and feature weights result arrays
scores = np.zeros(n_splits)
feature_weights = np.zeros([n_vertices, n_times])

# hold on, this may take a moment
# For every split: fit on the training part, compute the accuracy on the
# test part, and accumulate the classifier coefficients mapped back to the
# full (n_vertices, n_times) feature grid.
for ii, (train, test) in enumerate(cv):
    anova_svc.fit(X[train], y[train])
    y_pred = anova_svc.predict(X[test])
    y_test = y[test]
    scores[ii] = np.sum(y_pred == y_test) / float(len(y_test))
    feature_weights += feature_selection.inverse_transform(clf.coef_) \
        .reshape(n_vertices, n_times)

print('Average prediction accuracy: %0.3f | standard deviation:  %0.3f'
      % (scores.mean(), scores.std()))

# prepare feature weights for visualization
# (ii + 1 is the number of splits that were iterated)
feature_weights /= (ii + 1)  # create average weights
# create mask to avoid division error
feature_weights = np.ma.masked_array(feature_weights, feature_weights == 0)
# normalize scores for visualization purposes
# (each row is divided by its std first, then its mean is subtracted)
feature_weights /= feature_weights.std(axis=1)[:, None]
feature_weights -= feature_weights.mean(axis=1)[:, None]

# unmask, take absolute values, emulate f-value scale
feature_weights = np.abs(feature_weights.data) * 10
Ejemplo n.º 25
0
    if y[i] == 0:
        plt.scatter(X1[i, 0], X1[i, 1], color='r')
    elif y[i] == 1:
        plt.scatter(X1[i, 0], X1[i, 1], color='b')
plt.title('visualization of data in 2D (  )-> tic-tac toe dataset')
plt.show()
#####################

# Sweep k = 1, 3, 5, 7, 9 selected features and record, for each k, a chi2
# summary of the selected data and the reconstruction error after mapping
# the selection back to the full feature space.
r1=[]   # RMSE between X and its zero-filled reconstruction, per k
x1=[]   # the k values tried
e1=[]   # mean of the chi2() output on the selected features, per k
for i in range(1,10,2):
    sk = SelectKBest(chi2, k=i)
    X1 = sk.fit_transform(X, y)
    # NOTE(review): the whole return value of chi2() is averaged here; if it
    # returns both statistics and p-values (as scikit-learn's chi2 does),
    # this mixes the two - chi2(X1, y)[0] may be what was intended. Confirm.
    e1.append(np.mean(chi2(X1,y)))
    # dropped features come back as zeros, so the error grows with the
    # amount of data discarded
    X2=sk.inverse_transform(X1)
    rmse = np.sqrt(mean_squared_error(X, X2))
    x1.append(i)
    r1.append(rmse)

r1=np.array(r1)
x1=np.array(x1)
e1=np.array(e1)
print(e1)
print(r1)
plt.figure()

# top panel: chi2 summary against the number of selected features
plt.subplot(2,1,1)
plt.plot(x1,e1)
plt.xlabel('number of components')
plt.ylabel('chi squared score')
Ejemplo n.º 26
0
For each feature, measure how strongly the target depends on the feature using a statistical test like χ2 or ANOVA.

From the scikit-learn feature selection module, feature_selection.SelectKBest returns the K best features given some scoring function.
For our classification problem, the module provides three different scoring functions: χ2, ANOVA F-value, and the mutual information
score. The F-value measures the linear dependency between the feature variable and the target.
This means the score might underestimate the relation between a feature and the target if the relationship is nonlinear.
The mutual information score is nonparametric and so can capture nonlinear relationships.

from sklearn.feature_selection import SelectKBest, f_classif
# Univariate selection: fit SelectKBest on the training split only, then
# recover which of the original columns were kept.
feature_cols = baseline_data.columns.drop('outcome')
train, valid, _ = get_data_splits(baseline_data)
# Keep 5 features
selector = SelectKBest(f_classif, k=5)
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
# Get back the features we've kept, zero out all other features
selected_features = pd.DataFrame(selector.inverse_transform(X_new), index=train.index, columns=feature_cols)
# Read the kept columns from the selector's own boolean support mask.
# Testing `selected_features.var() != 0` instead would wrongly discard a kept
# column that happens to be constant in the training split.
selected_columns = feature_cols[selector.get_support()]

=================================================================================
L1 regularization
Univariate methods consider only one feature at a time when making a selection decision.
Instead, we can make our selection using all of the features by including them in a linear model with L1 regularization. 
This type of regularization (sometimes called Lasso) penalizes the absolute magnitude of the coefficients, 
as compared to L2 (Ridge) regression which penalizes the square of the coefficients.
As the strength of regularization is increased, the coefficients of features which are less important for predicting the target are set to 0. 
This allows us to perform feature selection by adjusting the regularization parameter. We choose the parameter 
by finding the best performance on a hold-out set, or decide ahead of time how many features to keep.


from sklearn.linear_model import LogisticRegression
    # # clf = Pipeline([
    # #   ('feature_selection', LinearSVC(penalty="l1",dual=False)),
    # #   ('classification', LinearSVC(penalty='l2'))
    # # ])
    # # clf.fit(X, y)

    feature_selection = SelectKBest(f_classif, k=50)  # take the best 500
    # to make life easier we will create a pipeline object
    pipe = Pipeline([("anova", feature_selection), ("svc", clf)])

    # where the magic happens
    for ii, (train, test) in enumerate(cv):
        pipe.fit(X[train], y[train])
        y_pred = pipe.predict(X[test])
        scores[ii] = np.sum(y_pred == y[test]) / float(len(y[test]))
        feature_weights += feature_selection.inverse_transform(clf.coef_)

    print "Average prediction accuracy: %0.3f | standard deviation:  %0.3f" % (scores.mean(), scores.std())

# from sklearn import cross_validation
# from sklearn.feature_selection import SelectPercentile, f_classif

# loocv = cross_validation.LeaveOneOut(len(y))
# clf = RandomForestClassifier(n_estimators=500, max_features=np.sqrt(X.shape[1]), max_depth=None, min_samples_split=1, compute_importances=True)

# top_features = 50
# # computing feature importance
# acc = 0
# cnt = .0
# for train_index, test_index in loocv:
#     selector = SelectPercentile(f_classif, percentile=10)
# Continuation of the first selection method: `svc_mod` and `x_train_new` are
# bound earlier in the script, outside this excerpt.
#get the selected/most important features and extract from validation set
# inverse_transform restores the full column set; the next line treats every
# zero-variance column as a dropped feature.
# NOTE(review): a kept column that is constant in x_train would also have zero
# variance and be treated as dropped -- confirm this cannot happen here.
selected_feats = pd.DataFrame(svc_mod.inverse_transform(x_train_new),
                              index=x_train.index,
                              columns=x_train.columns)
selected_cols = selected_feats.columns[selected_feats.var() != 0]
x_devel_new = x_devel[selected_cols]

#now train and test a decision tree using these selected features
# (train_eval_tree is defined outside this excerpt)
print('L1 regularization:')
train_eval_tree(x_train_new, y_train, x_devel_new, y_devel)

#METHOD 2: SelectKBest using the f_classif score
# Keep the 10 highest-scoring features, then recover their column names with
# the same zero-variance check used above.
select_feats = SelectKBest(f_classif, k=10)
x_train_new = select_feats.fit_transform(x_train, y_train)
selected_feats = pd.DataFrame(select_feats.inverse_transform(x_train_new),
                              index=x_train.index,
                              columns=x_train.columns)
selected_cols = selected_feats.columns[selected_feats.var() != 0]
x_devel_new = x_devel[selected_cols]
print('Univariate feature selection (f_classif):')
train_eval_tree(x_train_new, y_train, x_devel_new, y_devel)

#METHOD 3: RANDOM FOREST
# Fit a 1000-tree forest and keep the features selected by SelectFromModel
# with threshold=0.10.
# NOTE(review): `prefit` is left at its default, so selector.fit presumably
# trains its own copy of the forest, making the explicit forest.fit redundant
# -- confirm.
forest = RandomForestClassifier(n_estimators=1000, random_state=0)
forest.fit(x_train, y_train)
selector = SelectFromModel(forest, threshold=0.10)
selector.fit(x_train, y_train)
# Print the name of every selected column.
for important_feats in selector.get_support(indices=True):
    print(x_train.columns[important_feats])
x_train_new = selector.transform(x_train)
Ejemplo n.º 29
0
# `anova_svc`, `clf`, `feature_selection`, `masking`, `mask`, `bg_img` and the
# X / y arrays are bound earlier in the script, outside this excerpt.

# predict samples' classes for TRAINING dataset
y_pred = anova_svc.predict(X)
precision_X = precision_score(y, y_pred)
print('train dataset precision: %.2f' % (precision_X))

# predict samples' classes for TESTING dataset
y_pred_t = anova_svc.predict(X_t)
precision_X_t = precision_score(y_t, y_pred_t)
print('test dataset precision: %.2f' % (precision_X_t))

# ### Visualisation (SVC) #####################################################
import numpy as np

# ### Look at the discriminating weights
coef = clf.coef_
# reverse feature selection
# (maps the weights back onto the full, pre-selection feature axis)
coef = feature_selection.inverse_transform(coef)

# reverse masking
# Only the first row of the weights is unmasked.
coef = masking.unmask(coef[0], mask)

# Mask every entry whose weight is exactly 0 -- presumably so that plot_haxby
# renders those voxels transparently; confirm against plot_haxby.
act = np.ma.masked_array(coef, coef == 0)

plot_haxby(act, bg_img, 'SVC', slice=29)

# save statistical map as nifti image
# An identity matrix is passed as the affine.
img = nibabel.Nifti1Image(act, np.eye(4))
img.to_filename('output_stats_svc.nii.gz')
Ejemplo n.º 30
0
def lesson_4():
    print_("Lesson 4: Feature Selection", 0, 1)
    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    cat_features = ['category', 'currency', 'country']
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)

    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        """Return the gap, in hours, between each timestamp and the previous one.

        ``series`` must be a datetime-like pandas Series. The first element
        has no predecessor, so its gap is NaN.
        """
        gaps = series.diff()
        gap_seconds = gaps.dt.total_seconds()
        return gap_seconds / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    timedeltas = timedeltas.fillna(timedeltas.max())

    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        """Split ``dataframe`` by position into train, validation and test parts.

        The validation and test parts are the last two sections of the data,
        each holding ``int(len(dataframe) * valid_fraction)`` rows; everything
        before them is the training part.

        Args:
            dataframe: Any object supporting ``len()`` and positional slicing.
            valid_fraction: Fraction of rows for the validation part (and,
                equally, for the test part).

        Returns:
            A ``(train, valid, test)`` tuple of slices of ``dataframe``.
        """
        n_rows = len(dataframe)
        valid_size = int(n_rows * valid_fraction)

        # Use non-negative boundaries: with negative slicing a valid_size of 0
        # turns ``[:-0]`` into ``[:0]`` and sends every row to the test part.
        valid_start = max(n_rows - 2 * valid_size, 0)
        test_start = max(n_rows - valid_size, 0)

        train = dataframe[:valid_start]
        # valid size == test size, last two sections of the data
        valid = dataframe[valid_start:test_start]
        test = dataframe[test_start:]

        return train, valid, test

    def train_model(train, valid):
        """Train a LightGBM binary classifier and print its validation AUC.

        Every column except ``'outcome'`` is used as a feature and
        ``'outcome'`` is the label. Training runs for at most 1000 boosting
        rounds, with ``early_stopping_rounds=10`` evaluated on ``valid``.

        Args:
            train: DataFrame with feature columns and an ``'outcome'`` column.
            valid: DataFrame with the same columns, used for early stopping
                and for the reported score.

        Returns:
            The trained booster returned by ``lgb.train``.
        """
        feature_cols = train.columns.drop('outcome')
        train_x, train_y = train[feature_cols], train['outcome']
        valid_x, valid_y = valid[feature_cols], valid['outcome']

        train_set = lgb.Dataset(train_x, label=train_y)
        valid_set = lgb.Dataset(valid_x, label=valid_y)

        booster_params = {'num_leaves': 64, 'objective': 'binary',
                          'metric': 'auc', 'seed': 7}

        print("Training model!")
        # NOTE(review): newer LightGBM releases configure early stopping and
        # logging through callbacks -- confirm the pinned version still
        # accepts these two keyword arguments.
        bst = lgb.train(booster_params,
                        train_set,
                        num_boost_round=1000,
                        valid_sets=[valid_set],
                        early_stopping_rounds=10,
                        verbose_eval=False)

        valid_score = metrics.roc_auc_score(valid_y, bst.predict(valid_x))
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the whole
    # dataset we are doing here (which will be fixed next)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    feature_cols = baseline_data.columns.drop('outcome')
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)

    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only (others set at 0)",
        0)
    print_(selected_features.head())

    # Dropped columns have values of all 0s, so var is 0, drop them
    selected_columns = selected_features.columns[selected_features.var() != 0]

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    train, valid, _ = get_data_splits(baseline_data)

    X, y = train[train.columns.drop("outcome")], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1,
                                  penalty="l1",
                                  solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns have values of all 0s, keep other columns
    selected_columns = selected_features.columns[selected_features.var() != 0]
    print_("Rejected columns: {}".format(
        selected_features.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())