def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fdr heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFdr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fdr", param=0.0001).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    def single_fdr(alpha, n_informative, random_state):
        X, y = make_regression(n_samples=150,
                               n_features=20,
                               n_informative=n_informative,
                               shuffle=False,
                               random_state=random_state,
                               noise=10)

        with warnings.catch_warnings(record=True):
            # Warnings can be raised when no features are selected
            # (low alpha or very noisy data)
            univariate_filter = SelectFdr(f_regression, alpha=alpha)
            X_r = univariate_filter.fit(X, y).transform(X)
            X_r2 = GenericUnivariateSelect(f_regression,
                                           mode='fdr',
                                           param=alpha).fit(X, y).transform(X)

        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)

        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate
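single_fdr returns the false discovery proportion of a single draw; a hedged sketch of how such a helper is typically exercised (the alpha grid and seed count below are illustrative assumptions, and numpy is assumed imported as np):

# Sketch: averaged over many seeds, the empirical FDR should stay below alpha.
for alpha in [0.001, 0.01, 0.1]:
    for n_informative in [1, 5, 10]:
        mean_fdr = np.mean([single_fdr(alpha, n_informative, seed)
                            for seed in range(100)])
        assert alpha >= mean_fdr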
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
Example #5
class f_regressionFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with F-value between label/feature for regression tasks. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
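The description above names the Benjamini-Hochberg procedure. As a self-contained sketch (separate from the class above), the cutoff that SelectFdr applies can be reproduced by hand from the univariate p-values:

# Sketch: reproduce SelectFdr's Benjamini-Hochberg cutoff by hand.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFdr, f_regression

X, y = make_regression(n_samples=150, n_features=20, n_informative=5,
                       shuffle=False, noise=10, random_state=0)
alpha = 0.05
sel = SelectFdr(f_regression, alpha=alpha).fit(X, y)

# Benjamini-Hochberg: keep every p-value no larger than the largest sorted
# p-value that falls under the line alpha * rank / n_features.
pvals = sel.pvalues_
m = len(pvals)
sorted_p = np.sort(pvals)
below = sorted_p[sorted_p <= alpha * np.arange(1, m + 1) / m]
mask = np.zeros(m, dtype=bool) if below.size == 0 else pvals <= below.max()
assert np.array_equal(mask, sel.get_support())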
    def gene_univariate_feature_selection(self, alpha=0.01):
        gene_normal_X, gene_normal_Y = self.make_dataset(
            dataset='gene',
            normal_tumor='normal',
            normal_matched=True,
            mirna_gene_matched=True)
        gene_tumor_X, gene_tumor_Y = self.make_dataset(dataset='gene',
                                                       normal_tumor='tumor',
                                                       normal_matched=True,
                                                       mirna_gene_matched=True)

        gene_exp_filter = SelectFdr(f_classif, alpha=alpha)
        gen_exp_new = gene_exp_filter.fit_transform(
            X=pandas.concat([gene_normal_X, gene_tumor_X]),
            y=pandas.concat([gene_normal_Y, gene_tumor_Y]))

        self.gene_symbols = np.asanyarray(
            self.gene_symbols)[gene_exp_filter.get_support(
                indices=True)].tolist()
        self.gene_tumor = self.gene_tumor[
            self.gene_symbols +
            ['patient_barcode', 'pathologic_stage', 'histological_type']]
        self.gene_normal = self.gene_normal[
            self.gene_symbols +
            ['patient_barcode', 'pathologic_stage', 'histological_type']]
Example #8
class UnivariateSelectChiFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with Chi-square. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
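A minimal, self-contained usage sketch for chi2-based FDR selection (chi2 requires non-negative features; the digits data below is only an illustration):

# Sketch: chi2 + SelectFdr on non-negative, count-like features.
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectFdr, chi2

X, y = load_digits(return_X_y=True)  # pixel intensities, all >= 0
X_sel = SelectFdr(chi2, alpha=0.05).fit_transform(X, y)
print(X.shape, '->', X_sel.shape)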
Example #10
def SelectFdr_selector(data, target, sf):
    selector = SelectFdr(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(True)
    new_features = []  # names of the features selected by the FDR test
    for ind in outcome:
        new_features.append(data.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)
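A hypothetical call of the helper above (the toy data and column names are illustrative assumptions):

# Sketch: calling SelectFdr_selector on a small classification frame.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif

X, y = make_classification(n_samples=100, n_features=10, n_informative=3,
                           random_state=0)
data = pd.DataFrame(X, columns=['feat_%d' % i for i in range(10)])
target = pd.DataFrame(y)
selected = SelectFdr_selector(data, target, sf=f_classif)
print(selected.columns.tolist())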
Example #11
def feature_select(labels, features, alfa=0.4):
    dct = DecisionTreeClassifier(random_state=42)
    rfecv1 = RFECV(estimator=dct,
                   step=1,
                   cv=StratifiedKFold(n_splits=6,
                                      shuffle=True,
                                      random_state=42),
                   scoring='recall')
    rfecv2 = RFECV(estimator=dct,
                   step=1,
                   cv=StratifiedKFold(n_splits=6,
                                      shuffle=True,
                                      random_state=42),
                   scoring='precision')
    rfecv1.fit(features, labels)
    rfecv2.fit(features, labels)
    print("Optimal number of features - Recall : %d" % rfecv1.n_features_)
    print("Optimal number of features - Precision : %d" % rfecv2.n_features_)

    BestFeatures = SelectFdr(score_func=f_classif, alpha=alfa)
    BestFeatures.fit_transform(features, labels)
    #    BestFeatures = SelectKBest(score_func=f_classif,k=numbest)
    #    BestFeatures.fit_transform(features,labels)
    feature_scores = BestFeatures.scores_
    feature_pvalues = BestFeatures.pvalues_
    best_feat_indices = BestFeatures.get_support(indices=True)

    best_list = []
    # features_list comes from the enclosing module; the +1 skips its label entry
    for i in range(len(best_feat_indices)):
        best_list.append(features_list[best_feat_indices[i] + 1])

    print('Best features:', best_list)
    feat_ctr = -1
    for index in best_feat_indices:
        feat_ctr += 1
        print(best_list[feat_ctr], 'Score:', feature_scores[index],
              'P-value:', feature_pvalues[index])

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Recall & Precision")
    plt.plot(range(1,
                   len(rfecv1.grid_scores_) + 1),
             rfecv1.grid_scores_,
             label='Recall',
             color='blue')
    plt.plot(range(1,
                   len(rfecv2.grid_scores_) + 1),
             rfecv2.grid_scores_,
             label='Precision',
             color='green')
    plt.legend()
    plt.show()
Example #12
def select_fdr(input_data,
               feature_names=None,
               score_func=f_classif,
               alpha=0.05):
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(
                alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}

    return output_data, feature_names, final_args
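A hedged usage sketch for the function above: calling it with chi2 on 2-D, non-negative data sidesteps the project-specific remove_constant/flatten/make3D helpers, assuming the module's own imports (logging, SelectFdr, f_classif) are in scope:

# Sketch: exercising select_fdr on a 2-D, non-negative dataset.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

X, y = load_digits(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
output, names, args = select_fdr((x_tr, y_tr, x_te, y_te),
                                 score_func=chi2, alpha=0.05)
print(output[0].shape, args)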
Example #13
def select_fdr(df, target_col):
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)
    true_list = list(selector.get_support())
    index = [i for i in range(len(true_list)) if true_list[i]]
    if len(index) == 0:
        print('No features were selected: either the data is too noisy '
              'or the selection test is too strict.')
        return df
    else:
        saved_columns = [list(X.columns)[i] for i in index]
        result = pd.DataFrame(selector.transform(X), columns=saved_columns)
        result[target_col] = y
    return result
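A hypothetical call of this select_fdr variant (chi2 needs non-negative inputs, so the iris measurements are used purely as an illustration):

# Sketch: select_fdr on a small frame with non-negative features.
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
result = select_fdr(df, 'target')
print(result.columns.tolist())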
Example #15
def test_select_fdr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fdr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFdr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fdr',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #16
    def select_f(self, score_function, has_pvalue):
        """ Select features using FDR (False Discovery Rate) or K-best """
        fname = score_function.__name__
        if has_pvalue:
            self._debug(f"Select FDR: '{fname}'")
            select = SelectFdr(score_func=score_function)
        else:
            self._debug(f"Select K-Best: '{fname}'")
            select = SelectKBest(score_func=score_function, k='all')
        self._debug(
            f"Select '{fname}': x.shape={self.x.shape}, y.shape={self.y.shape}"
        )
        select.fit(self.x, self.y)
        keep = select.get_support()
        if has_pvalue:
            return (fname, select.scores_, select.pvalues_, keep)
        else:
            return (fname, select.scores_, None, None)
Example #17
def svm_cv(data, data_target):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data_target)
    print("*" * 79)
    print("Training...")
    # selector = SelectFdr(chi2)
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)
    print("Testing...")
    pred = clf.predict(selector.transform(X_test))
    probs = clf.predict_proba(selector.transform(X_test))
    accuracy_score = metrics.accuracy_score(y_test, pred)
    classification_report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print(support)
    print(accuracy_score)
    print(classification_report)
    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
# impute missing values (Imputer was removed from sklearn; SimpleImputer replaces it)
import numpy as np
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

# feature scaling
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_norm = mms.fit_transform(X)

# Univariate feature selection using false discovery rate
from sklearn.feature_selection import SelectFdr, f_classif
fdr_selector = SelectFdr(f_classif, alpha=0.05).fit(X, y)

# Get indices of selected features
fdr_selector.get_support(indices=True)

# select features using false discovery rate method
X_fdr = SelectFdr(f_classif, alpha=0.05).fit_transform(X, y)
print(X_fdr.shape)

# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_fdr,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# fitting logistic regression to Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
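The excerpt stops after constructing the classifier; a minimal continuation (assumed, not part of the original) to train and evaluate it:

# fitting and evaluating (continuation sketch)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))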
Example #19
    ft_model = load_embedding(FLAGS.embedfile)
    docs = [c.split(' ') for c in comments_text]
    for i in range(len(docs)):
        docs[i] = [t for t in docs[i] if t in ft_model.vocab]

    print('Building dictionary...')
    comments_dictionary = Dictionary(docs)
    comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

    print("Creating tfidf model...")
    model_tfidf = TfidfModel(comments_corpus)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = corpus2csc(comments_tfidf).T

    print('Finding important terms...')
    labelcols = data.columns.tolist()[2:]
    terms = Counter()
    for l in labelcols:
        cl = data[l]
        model_fdr = SelectFdr(chi2, alpha=0.025)
        model_fdr.fit(comments_vecs, cl)
        ids = model_fdr.get_support(indices=True)
        for i in ids:
            terms[comments_dictionary[i]] += model_fdr.scores_[i]

    print('Saving results...')
    with open(FLAGS.chi2file, 'wb') as f:
        pickle.dump(terms, f, protocol=pickle.HIGHEST_PROTOCOL)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)  #### for ANOVA
#X = MinMaxScaler().fit_transform(X)  #### for Chi2

## Select features
fdr = SelectFdr(f_classif, alpha=0.005)  #### for ANOVA
#fdr = SelectFdr(chi2, alpha=0.05)  #### for Chi2
X_sel = fdr.fit_transform(X, y)
idx_sorted = fdr.get_support(indices=True)
fdr_select_features = list(feature_set[i] for i in idx_sorted)
print('Selected features with FDR: ')
print(fdr_select_features)
print('\n')
print(X.shape)
print(X_sel.shape)

X_new = df[fdr_select_features].values
Y=(df['Class'])

le = preprocessing.LabelEncoder()
y=le.fit_transform(Y)


Example #21
def select_features(df,
                    target,
                    featsel_runs=3,
                    max_it=150,
                    w_thr=1e-4,
                    keep=None,
                    n_jobs=1,
                    verbose=0):
    """
    Inputs:
        - df: nxp pandas DataFrame with n data points and p features; to avoid overfitting, only provide data belonging
              to the n training data points.
        - target: n dimensional array with targets corresponding to the data points in df
        - featsel_runs: number of times to perform the feature selection with a random fraction of the data points (int; default: 3)
        - max_it: how many iterations will be performed at most (int; default: 150)
        - w_thr: threshold on the final Lasso model weights to filter the features (float; default: 1e-4)
        - keep: list of features that should be kept no matter what
        - n_jobs: how many jobs to run when selecting the features in parallel (int; default: 1)
        - verbose: verbosity level (int; default: 0)
    Returns:
        - good_cols: list of column names for df with which a regression model can be trained
    """
    if not (len(df) == len(target)):
        raise ValueError("[featsel] df and target dimension mismatch.")
    if keep is None:
        keep = []
    # scale features to have 0 mean and unit std
    if verbose > 0:
        if featsel_runs > df.shape[0]:
            print("[featsel] WARNING: Less data points than featsel runs!!")
        print("[featsel] Scaling data...", end="")
    scaler = StandardScaler()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        df_scaled = pd.DataFrame(scaler.fit_transform(df),
                                 columns=df.columns,
                                 dtype=np.float32)
        target_scaled = scaler.fit_transform(target.reshape(-1, 1)).ravel()
    if verbose > 0:
        print("done.")

    # quick and dirty univariate filtering
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fsel = SelectFdr(f_regression, alpha=0.1).fit(df_scaled, target_scaled)
        cols = keep + [
            df_scaled.columns[i]
            for i in fsel.get_support(True) if df_scaled.columns[i] not in keep
        ]
        if cols:
            df_scaled = df_scaled[cols]
            if verbose > 0:
                print("[featsel] %i/%i features after univariate filtering" %
                      (len(df_scaled.columns), len(fsel.get_support())))

    # select good features in k runs in parallel
    # by doing sort of a cross-validation (i.e., randomly subsample data points)
    def run_select_features(i):
        if verbose > 0:
            print("[featsel] Feature selection run %i/%i" %
                  (i + 1, featsel_runs))
        np.random.seed(i)
        rand_idx = np.random.permutation(
            df_scaled.index)[:max(10, int(0.8 * len(df_scaled)))]
        return _select_features_1run(df_scaled.iloc[rand_idx],
                                     target_scaled[rand_idx],
                                     max_it=max_it,
                                     eps=1e-8,
                                     verbose=verbose - 1)

    good_cols = [c for c in keep]
    if featsel_runs >= 1:
        if n_jobs == 1:
            # only use parallelization code if you actually parallelize
            selected_columns = []
            for i in range(featsel_runs):
                selected_columns.extend(run_select_features(i))
        else:

            def flatten_lists(l):
                return [item for sublist in l for item in sublist]

            selected_columns = flatten_lists(
                Parallel(n_jobs=n_jobs,
                         verbose=100 * verbose)(delayed(run_select_features)(i)
                                                for i in range(featsel_runs)))

        if len(selected_columns) > 1:
            selected_columns = Counter(selected_columns)
            selected_columns = sorted(selected_columns,
                                      key=selected_columns.get,
                                      reverse=True)
            selected_columns = keep + [
                c for c in selected_columns if c not in keep
            ]
            if verbose > 0:
                print("[featsel] %i features after %i feature selection runs" %
                      (len(selected_columns), featsel_runs))
            correlations = df_scaled[selected_columns].corr()
            if not keep:
                good_cols.append(selected_columns[0])
                k = 1
            else:
                k = len(keep)
            for i, c in enumerate(selected_columns[k:], k):
                # only take features that are somewhat uncorrelated with the rest
                if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:
                    good_cols.append(c)
            if verbose > 0:
                print("[featsel] %i features after correlation filtering" %
                      len(good_cols))
        else:
            good_cols += selected_columns
    if not good_cols:
        good_cols = list(df.columns)
    # perform recursive feature elimination on these features
    df_scaled = df_scaled[good_cols]
    X = df_scaled.to_numpy()
    if df_scaled.shape[0] > 50:
        rand_noise = np.random.permutation(X.flatten()).reshape(X.shape)
        X = np.hstack([X, rand_noise])
    rand_noise = np.random.randn(df.shape[0], max(3,
                                                  int(0.5 * len(good_cols))))
    X = np.hstack([X, rand_noise])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reg = lm.LassoLarsCV(eps=1e-16)
        reg.fit(X, target)
    weights = dict(zip(good_cols, reg.coef_[:len(good_cols)]))
    # only include features that are more important than our known noise features
    noise_w_thr = np.max(np.abs(reg.coef_[len(good_cols):]))
    good_cols = [c for c in weights if abs(weights[c]) > noise_w_thr]
    if verbose > 0:
        print("[featsel] %i features after noise filtering" % len(good_cols))
    if not good_cols:
        if verbose > 0:
            print("[featsel] WARNING: Not a single good features was found...")
        return keep
    # train again a regression model, but this time on the original (unscaled) data
    df = df[good_cols]
    X = df.to_numpy()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        reg = lm.LassoLarsCV(eps=1e-16)
        reg.fit(X, target)
        # alphas in CV are generally chosen a bit too small
        reg = lm.LassoLars(alpha=1.5 * reg.alpha_, eps=1e-16)
        reg.fit(X, target)
    weights = dict(zip(list(df.columns), reg.coef_))
    good_cols = [
        c for c in sorted(weights, key=lambda x: abs(weights[x]), reverse=True)
        if abs(weights[c] * df[c].std()) >= w_thr
    ]
    # add keep columns back in
    good_cols = keep + [c for c in good_cols if c not in keep]
    if verbose > 0:
        if not good_cols:
            print("[featsel] WARNING: Not a single good features was found...")
        print(
            "[featsel] %i final features selected (including %i original keep features)."
            % (len(good_cols), len(keep)))
    return good_cols
###### MASKING FOR SELECTED STIMS
targetNames = ['bottle', 'face', 'scissors']  # the stims of interest
stimMask = targetData.labels.isin(targetNames)  # indices for the stim of interest
X_fMRI_selected = X_fMRI[stimMask]   # features (for selected stimuli only)
y = np.array(targetData.labelInd)[stimMask]  # labels




###### FEATURE SELECTION
# FDR feature selector
selector = SelectFdr(f_classif, alpha=0.01)  # FDR selector object
selector.fit(X_fMRI_selected, y)   # learning from the data
X = selector.transform(X_fMRI_selected)   # Selected features only
indVoxels = selector.get_support(indices=True)   # indices of surviving voxels


###### VISUALIZING FEATURE LOCATIONS
# binary vector with 1s indicating selected voxels
bROI = np.zeros(X_fMRI.shape[-1])
bROI[indVoxels] = 1
# reverse masking
bROI_img = masker.inverse_transform(bROI)

# Create the figure
plot_stat_map(bROI_img, imgAnat, title='Voxels surviving FDR')



##### SVM CLASSIFICATION  (LINEAR WITH C=1)
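The excerpt ends at the header above; a hedged sketch of the announced linear SVM with C=1 on the FDR-selected voxels (X and y as defined earlier) could look like:

# sketch: linear SVM (C=1) on the voxels surviving FDR selection
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svm_clf = SVC(kernel='linear', C=1.0)
scores = cross_val_score(svm_clf, X, y, cv=5)
print('Mean CV accuracy: %.3f' % scores.mean())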
Example #23
def remove_drugs_with_low_effect_univariate(
        feat, meta,
        threshold=0.05, fdr=0.05, test_each_dose=False,
        keep_names=['DMSO', 'NoCompound'], return_nonsignificant=False,
        drugname_column = 'drug_type', drugdose_column = 'drug_dose'
        ):
    """
    Remove drugs when the number of features significantly different to DMSO
    for any dose is lower than the threshold.
    The statistical significance of the difference between a compound dose and
    the DMSO is assessed based on individual ANOVA tests for each feature.
    The Benjamini-Hochberg method is used to control the false discovery rate.
    param:
        feat : dataframe
            feature dataframe
        meta : dataframe
            dataframe with metadata
        threshold : float > 0.0 and < 1.0
            fraction of features that must be significantly different for the
            compound to be considered to have a significant effect
        fdr : float > 0.0 and < 1.0
            false discovery rate parameter in the Benjamini-Hochberg method
        test_each_dose : bool, optional
            If true, each dose of each drug is tested for statistical
            significance compared to DMSO, and the drug is considered to
            have a significant effect if any of the doses satisfies the
            conditions set by the fdr and threshold parameters.
            If False, an ANOVA test is performed comparing the DMSO with all
            the doses (as separate classes) and the conditions are checked once
            for each drug.
        keep_names : list or None, optional
            list of names from the drugname_column to keep without checking
            for significance
        return_nonsignificant : bool, optional
            return the names of the drugs that are removed from the
            dataset
        drugname_column : string
            the name of the column in meta that contains the individual
            compound names
        drugdose_column : string
            the name of the column in meta that contains the drug doses
    return:
        feat = feature dataframe with low-potency drugs removed
        meta = metadata dataframe corresponding to the returned feat dataframe
    """
    import numpy as np
    from sklearn.feature_selection import SelectFdr, f_classif
    import pdb

    n_feat = feat.shape[1]
    drug_names = meta[drugname_column].unique()

    significant_drugs = []
    for idrug,drug in enumerate(drug_names):
        if drug in keep_names:
            continue
        # For each dose get significant features using Benjamini-Hochberg
        # method with FDR=fdr
        X = feat[meta[drugname_column].isin([drug,'DMSO'])]
        y = meta.loc[meta[drugname_column].isin([drug,'DMSO']), drugdose_column]

        selector = SelectFdr(score_func=f_classif, alpha=fdr)

        if not test_each_dose:
            try:
                selector.fit(X, y)
            except ValueError:
                pdb.set_trace()
            n_sign_feat = np.sum(selector.get_support())
            if n_sign_feat > threshold*n_feat:
                significant_drugs.append(drug)
        else:
            n_sign_feat = []
            for idose, dose in enumerate(y.unique()):
                if dose == 0:
                    continue
                selector.fit(X[np.isin(y,[0,dose])], y[np.isin(y,[0,dose])])
                n_sign_feat.append(np.sum(selector.get_support()))

            if np.any([n>threshold*n_feat for n in n_sign_feat]):
                significant_drugs.append(drug)

    # If DMSO was in drug list, include it in the final dataframe
    # (default behaviour)
    if keep_names is not None:
        significant_drugs.extend(keep_names)

    feat = feat[meta[drugname_column].isin(significant_drugs)]
    meta = meta[meta[drugname_column].isin(significant_drugs)]

    if return_nonsignificant:
        return feat, meta, list(
            set(drug_names).difference(set(significant_drugs))
            )
    else:
        return feat, meta
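A synthetic usage sketch for the function above; the frame layout (drug_type/drug_dose columns with a DMSO control) follows the docstring, and the data itself is made up for illustration:

# Sketch: one drug with a strong univariate effect, one without.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
feat = pd.DataFrame(rng.randn(60, 10),
                    columns=['ft%d' % i for i in range(10)])
meta = pd.DataFrame({'drug_type': ['DMSO'] * 20 + ['drugA'] * 20 + ['drugB'] * 20,
                     'drug_dose': [0] * 20 + [1] * 20 + [1] * 20})
feat.loc[meta['drug_type'] == 'drugA', 'ft0'] += 5.0  # strong shift on one feature

feat_kept, meta_kept = remove_drugs_with_low_effect_univariate(feat, meta)
print(meta_kept['drug_type'].unique())  # drugB should typically be dropped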