Exemple #1
0
def featureFitting(filename,
                   X,
                   y,
                   featureNames,
                   optimalFlag,
                   kbest=20,
                   alpha=0.05,
                   model=None):
    '''
    Keep the K best features among those that pass an FDR filter.

    X is first reduced with ``SelectFdr(alpha=alpha)``; ``SelectKBest`` is then
    fitted on the reduced matrix. The matching columns of the CSV at
    `filename` (read with its first column as the index) are saved to
    "REDUCED_Feat.csv".

    The K-best support mask only spans the features that survived the FDR
    step, so it is mapped back through the FDR mask before it is applied to
    `featureNames` and to the CSV columns, which both describe the full
    feature set.

    Parameters
    ----------
    filename : path of the CSV holding the full feature matrix.
    X, y : feature matrix and target passed to the selectors.
    featureNames : array-like of names for the columns of X; must support
        boolean-mask indexing.
    optimalFlag, model : accepted for interface compatibility; unused here.
    kbest : number of features to keep after the FDR step; clamped to the
        number of features that survive it.
    alpha : alpha passed to SelectFdr.

    Returns
    -------
    (reduced DataFrame, fitted SelectFdr, fitted SelectKBest)
    '''
    FD = SelectFdr(alpha=alpha)
    fdr_mask = None
    X = FD.fit_transform(X, y)
    fdr_mask = FD.get_support()

    # SelectKBest rejects k larger than the number of available features.
    k = kbest if kbest == 'all' else min(kbest, X.shape[1])
    selectK = SelectKBest(k=k)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()

    # Two-step indexing: full feature set -> FDR survivors -> K best.
    K_featnames = featureNames[fdr_mask][selectK_mask]
    print("K_featnames: %s" % (K_featnames))

    Reduced_df = pd.read_csv(filename, index_col=0)
    # NOTE(review): assumes the CSV columns are in the same order as the
    # columns of the X that was passed in -- confirm against the caller.
    kept_columns = Reduced_df.columns[fdr_mask][selectK_mask]
    Reduced_df = Reduced_df[kept_columns]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
    def single_fdr(alpha, n_informative, random_state):
        """Return the empirical false discovery rate of one SelectFdr run.

        A regression problem with 20 features is generated, SelectFdr is
        checked against GenericUnivariateSelect in 'fdr' mode, and selected
        columns at index >= n_informative are counted as false positives.
        """
        features, target = make_regression(n_samples=150,
                                           n_features=20,
                                           n_informative=n_informative,
                                           shuffle=False,
                                           random_state=random_state,
                                           noise=10)

        # Selecting no feature at all (low alpha or very noisy data) can
        # raise warnings; they are captured here.
        with warnings.catch_warnings(record=True):
            fdr_filter = SelectFdr(f_regression, alpha=alpha)
            reduced = fdr_filter.fit(features, target).transform(features)
            generic_filter = GenericUnivariateSelect(f_regression,
                                                     mode='fdr',
                                                     param=alpha)
            reduced_generic = generic_filter.fit(features,
                                                 target).transform(features)

        assert_array_equal(reduced, reduced_generic)

        selected = fdr_filter.get_support()
        false_pos = np.sum(selected[n_informative:] == 1)
        true_pos = np.sum(selected[:n_informative] == 1)

        if false_pos != 0:
            return false_pos / (true_pos + false_pos)
        return 0.
Exemple #3
0
class f_regressionFDRPrim(primitive):
    """Feature-selection primitive wrapping ``SelectFdr(f_regression)``."""

    def __init__(self, random_state=0):
        # random_state is accepted but not used by this primitive.
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with F-value between label/feature for regression tasks. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when data['X'] has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a fresh SelectFdr(f_regression) on the handled data's X and Y."""
        prepared = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}`` where output['X'] keeps only the selected columns."""
        output = handle_data(data)
        all_columns = list(output['X'].columns)
        support = self.selector.get_support(indices=False)
        kept_columns = [name for name, keep in zip(all_columns, support) if keep]
        reduced = self.selector.transform(output['X'])
        output['X'] = pd.DataFrame(reduced, columns=kept_columns)
        return {0: output}
Exemple #4
0
class UnivariateSelectChiFDRPrim(primitive):
    """Feature-selection primitive wrapping ``SelectFdr(chi2, alpha=0.05)``."""

    def __init__(self, random_state=0):
        # random_state is accepted but not used by this primitive.
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the p-values for an estimated false discovery rate with Chi-square. This uses the Benjamini-Hochberg procedure. alpha is an upper bound on the expected false discovery rate."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_d`` for 'Classification'."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when data['X'] has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Fit a fresh SelectFdr(chi2, alpha=0.05) on the handled data's X and Y."""
        prepared = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}``; on any error the exception is printed and X is left as handled."""
        output = handle_data(data)
        all_columns = list(output['X'].columns)
        try:
            support = self.selector.get_support(indices=False)
            kept_columns = [name for name, keep in zip(all_columns, support) if keep]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_columns)
        except Exception as e:
            # Best effort: report the problem and pass the data through.
            print(e)
        return {0: output}
def test_boundary_case_ch2():
    """Boundary case: every chi2-based selector must keep only the first of two features."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    expected_support = np.array([True, False])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Same order as the individual checks: fdr, k-best, percentile, fpr, fwe.
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def test_select_fdr_classif():
    """
    Check that SelectFdr (false discovery rate heuristic) selects the
    expected features in a simple classification problem and agrees with
    GenericUnivariateSelect in "fdr" mode.
    """
    alpha = 0.0001
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    fdr_filter = SelectFdr(f_classif, alpha=alpha)
    via_fdr = fdr_filter.fit(X, Y).transform(X)
    generic_filter = GenericUnivariateSelect(f_classif, mode="fdr", param=alpha)
    via_generic = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(via_fdr, via_generic)

    # The test expects exactly the first five columns to be selected.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fdr_filter.get_support(), expected)
    def single_fdr(alpha, n_informative, random_state):
        """Run SelectFdr once on generated regression data and return the observed FDR.

        Selected columns at index >= n_informative count as false positives,
        selected columns below it as true positives.
        """
        data, target = make_regression(
            n_samples=150,
            n_features=20,
            n_informative=n_informative,
            shuffle=False,
            random_state=random_state,
            noise=10,
        )

        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data); they are captured here.
        with warnings.catch_warnings(record=True):
            fdr_filter = SelectFdr(f_regression, alpha=alpha)
            kept = fdr_filter.fit(data, target).transform(data)
            generic_filter = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
            kept_generic = generic_filter.fit(data, target).transform(data)

        assert_array_equal(kept, kept_generic)

        chosen = fdr_filter.get_support()
        n_false = np.sum(chosen[n_informative:] == 1)
        n_true = np.sum(chosen[:n_informative] == 1)

        if n_false != 0:
            return n_false / (n_true + n_false)
        return 0.0
def test_boundary_case_ch2():
    """Boundary case in which each chi2-based selector should pick exactly the first feature."""
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    only_first = np.array([True, False])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # (selector class, keyword arguments), checked in the original order.
    cases = [
        (SelectFdr, {"alpha": 0.1}),
        (SelectKBest, {"k": 1}),
        (SelectPercentile, {"percentile": 50}),
        (SelectFpr, {"alpha": 0.1}),
        (SelectFwe, {"alpha": 0.1}),
    ]
    for selector_cls, kwargs in cases:
        fitted = selector_cls(chi2, **kwargs)
        fitted.fit(X, y)
        assert_array_equal(fitted.get_support(), only_first)
    def gene_univariate_feature_selection(self, alpha=0.01):
        """Restrict the gene data to genes selected by SelectFdr(f_classif).

        The filter is fitted on the concatenation of the matched normal and
        tumor gene datasets. ``self.gene_symbols`` is replaced by the selected
        symbols, and ``self.gene_tumor`` / ``self.gene_normal`` are reduced to
        those symbols plus three clinical columns.
        """
        common = dict(dataset='gene',
                      normal_matched=True,
                      mirna_gene_matched=True)
        normal_X, normal_Y = self.make_dataset(normal_tumor='normal', **common)
        tumor_X, tumor_Y = self.make_dataset(normal_tumor='tumor', **common)

        fdr_filter = SelectFdr(f_classif, alpha=alpha)
        # The transformed matrix itself is not used; only the fitted support is.
        fdr_filter.fit_transform(
            X=pandas.concat([normal_X, tumor_X]),
            y=pandas.concat([normal_Y, tumor_Y]))

        kept_idx = fdr_filter.get_support(indices=True)
        self.gene_symbols = np.asanyarray(self.gene_symbols)[kept_idx].tolist()

        clinical_columns = ['patient_barcode', 'pathologic_stage',
                            'histological_type']
        self.gene_tumor = self.gene_tumor[self.gene_symbols + clinical_columns]
        self.gene_normal = self.gene_normal[self.gene_symbols + clinical_columns]
def test_select_fdr_classif():
    """
    Check that SelectFdr (false discovery rate heuristic) gets the correct
    items in a simple classification problem, and that
    GenericUnivariateSelect in 'fdr' mode gives the same result.
    """
    threshold = 0.0001
    problem = dict(n_samples=200,
                   n_features=20,
                   n_informative=3,
                   n_redundant=2,
                   n_repeated=0,
                   n_classes=8,
                   n_clusters_per_class=1,
                   flip_y=0.0,
                   class_sep=10,
                   shuffle=False,
                   random_state=0)
    X, y = make_classification(**problem)

    fdr_filter = SelectFdr(f_classif, alpha=threshold)
    direct = fdr_filter.fit(X, y).transform(X)
    generic = GenericUnivariateSelect(f_classif, mode='fdr', param=threshold)
    assert_array_equal(direct, generic.fit(X, y).transform(X))

    # The test expects exactly the first five columns to be selected.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fdr_filter.get_support(), expected)
Exemple #11
0
def test_pipeline():
    """Compare a lazy ``dl.Pipeline`` against the equivalent scikit-learn pipeline.

    Checks that predict/score return ``di.Value`` objects, that keys and
    computed scores are reproducible, and that the computed score equals the
    one obtained from ``sklearn.pipeline.Pipeline``.
    """
    def build_steps():
        # Fresh estimators for each pipeline so nothing is shared.
        return [("scale", StandardScaler()), ("fdr", SelectFdr()),
                ("svm", LinearSVC())]

    lazy = dl.Pipeline(build_steps())
    lazy = lazy.fit(X, y)
    lazy_pred = lazy.predict(X)
    lazy_score = lazy.score(X, y)

    assert isinstance(lazy_pred, di.Value)
    assert isinstance(lazy_score, di.Value)

    assert isinstance(lazy_score.compute(), float)

    assert lazy.score(X, y).key == lazy.score(X, y).key
    assert lazy_score.compute() == lazy_score.compute()

    computed_pred = lazy_pred.compute()
    assert computed_pred.shape == y.shape
    assert computed_pred.dtype == y.dtype

    eager = sklearn.pipeline.Pipeline(build_steps())
    eager.fit(X, y)
    eager.predict(X)
    eager_score = eager.score(X, y)
    assert eager_score == lazy_score.compute()
Exemple #12
0
def SelectFdr_selector(data, target, sf):
    """Return a DataFrame holding only the columns of `data` kept by SelectFdr.

    `sf` is the score function given to SelectFdr; the selector is fitted on
    ``data.values`` and the flattened ``target.values``.
    """
    fdr = SelectFdr(score_func=sf)
    reduced = fdr.fit_transform(data.values, target.values.ravel())
    kept_idx = fdr.get_support(True)
    # Names of the surviving columns, in selection order.
    kept_names = [data.columns.values[i] for i in kept_idx]
    return pd.DataFrame(reduced, columns=kept_names)
def feature_SelectFdr(x_data, y_data):
    """Return the 20 highest-scoring features as a ('Specs', 'Score') DataFrame.

    Scores come from ``SelectFdr(f_classif, alpha=0.01)`` fitted on the data;
    the ranking uses the raw scores, not the FDR support mask.
    """
    fitted = SelectFdr(f_classif, alpha=0.01).fit(x_data, y_data)
    score_table = pd.concat(
        [pd.DataFrame(x_data.columns), pd.DataFrame(fitted.scores_)],
        axis=1)
    score_table.columns = ['Specs', 'Score']
    return score_table.nlargest(20, 'Score')
Exemple #14
0
def feature_select(labels, features, alfa=0.4):
    """Report feature-selection diagnostics for a labelled dataset.

    Runs two recursive feature eliminations with cross-validation (one scored
    on recall, one on precision) using a decision tree, prints the optimal
    number of features for each, then runs SelectFdr(f_classif, alpha=alfa)
    and prints the selected features with their scores and p-values. Finally
    plots the RFECV grid scores. Nothing is returned.

    NOTE(review): this block uses Python 2 print statements and the
    StratifiedKFold(labels, n_folds=...) call form -- presumably written for
    Python 2 and an old scikit-learn release; confirm before porting.
    """
    dct = DecisionTreeClassifier(random_state=42)
    # Same estimator and CV configuration, scored on recall.
    rfecv1 = RFECV(estimator=dct,
                   step=1,
                   cv=StratifiedKFold(labels,
                                      n_folds=6,
                                      shuffle=True,
                                      random_state=42),
                   scoring='recall')
    # ... and scored on precision.
    rfecv2 = RFECV(estimator=dct,
                   step=1,
                   cv=StratifiedKFold(labels,
                                      n_folds=6,
                                      shuffle=True,
                                      random_state=42),
                   scoring='precision')
    rfecv1.fit(features, labels)
    rfecv2.fit(features, labels)
    print("Optimal number of features - Recall : %d" % rfecv1.n_features_)
    print("Optimal number of features - Precision : %d" % rfecv2.n_features_)

    # The transformed matrix is discarded; only the fitted attributes are used.
    BestFeatures = SelectFdr(score_func=f_classif, alpha=alfa)
    BestFeatures.fit_transform(features, labels)
    #    BestFeatures = SelectKBest(score_func=f_classif,k=numbest)
    #    BestFeatures.fit_transform(features,labels)
    feature_scores = BestFeatures.scores_
    feature_pvalues = BestFeatures.pvalues_
    best_feat_indices = BestFeatures.get_support(indices=True)

    # NOTE(review): features_list is not defined in this function; it must
    # come from an enclosing scope. The +1 offset presumably skips a label
    # entry at position 0 -- verify against where features_list is built.
    best_list = []
    for i in range(len(best_feat_indices)):
        best_list.append(features_list[best_feat_indices[i] + 1])

    print 'Best features:', best_list
    # feat_ctr walks best_list in step with best_feat_indices.
    feat_ctr = -1
    for index in best_feat_indices:
        feat_ctr += 1
        print best_list[feat_ctr], 'Score:', feature_scores[
            index], 'P-value:', feature_pvalues[index]

    # Plot both RFECV score curves against the number of selected features.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Recall & Precision")
    plt.plot(range(1,
                   len(rfecv1.grid_scores_) + 1),
             rfecv1.grid_scores_,
             label='Recall',
             color='blue')
    plt.plot(range(1,
                   len(rfecv2.grid_scores_) + 1),
             rfecv2.grid_scores_,
             label='Precision',
             color='green')
    plt.legend()
    plt.show()
Exemple #15
0
 def test_select_fdr_int(self):
     """Convert a fitted SelectFdr to ONNX with an int64 input and dump data and model."""
     features, target = load_breast_cancer(return_X_y=True)
     selector = SelectFdr()
     selector.fit(features, target)
     input_spec = [("input", Int64TensorType([None, features.shape[1]]))]
     onnx_model = convert_sklearn(
         selector, "select fdr", input_spec, target_opset=TARGET_OPSET)
     self.assertTrue(onnx_model is not None)
     int_features = features.astype(np.int64)
     dump_data_and_model(
         int_features, selector, onnx_model, basename="SklearnSelectFdr")
 def test_select_fdr_int(self):
     """Convert a fitted SelectFdr to ONNX (int64 input, batch size 1) and dump data and model."""
     features, target = load_breast_cancer(return_X_y=True)
     selector = SelectFdr()
     selector.fit(features, target)
     input_spec = [('input', Int64TensorType([1, features.shape[1]]))]
     onnx_model = convert_sklearn(selector, 'select fdr', input_spec)
     self.assertTrue(onnx_model is not None)
     # Expression string handed to dump_data_and_model as allow_failure.
     failure_condition = (
         "StrictVersion(onnx.__version__) < StrictVersion('1.2')")
     dump_data_and_model(
         features.astype(np.int64),
         selector,
         onnx_model,
         basename="SklearnSelectFdr",
         allow_failure=failure_condition)
def select_fdr(input_data,
               feature_names=None,
               score_func=f_classif,
               alpha=0.05):
    """Select features with SelectFdr, relaxing alpha until more than one remains.

    Parameters
    ----------
    input_data : sequence whose first four items are
        (x_train, y_train, x_test, y_test).
    feature_names : optional array of names supporting boolean-mask indexing;
        reduced with the final selector's support mask.
    score_func : score function passed to SelectFdr. When it is f_classif,
        the data is first passed through ``remove_constant``.
    alpha : initial alpha; also the step by which it is raised on each retry.

    Returns
    -------
    (output_data, feature_names, final_args) where output_data is
    (x_train, y_train, x_test, y_test) with reduced x arrays and final_args
    records the score function and the alpha that was finally used.

    Raises
    ------
    ValueError
        If more than one feature cannot be kept even with alpha raised to
        1.0, or if alpha is not positive so it can never be raised. The
        original loop never terminated in these situations.
    """
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data,
                                                       feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    # 3-D inputs are flattened for selection and restored afterwards.
    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    increment = alpha
    while True:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            x_train = temp_x_train
            x_test = temp_x_test
            break

        # alpha is a rate: it cannot usefully exceed 1.0, and a
        # non-positive step would retry the same value forever.
        if increment <= 0 or alpha >= 1.0:
            raise ValueError(
                'SelectFdr could not keep more than one feature '
                '(alpha={}, step={})'.format(alpha, increment))

        next_alpha = min(alpha + increment, 1.0)
        msg = 'Feature selection was too aggresive, '
        msg += 'increasing alpha from {} to {}'.format(alpha, next_alpha)
        alpha = next_alpha
        logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    logging.info('Selected {} features'.format(x_train.shape[1]))

    final_args = {'score_func': score_func, 'alpha': alpha}

    return output_data, feature_names, final_args
Exemple #18
0
    def SelectComorbidTraits(self,FDR,modifyDataset=False,useChi2=True):
        """
        Selects features (symptoms) associated with the dichotomous variable (disease diagnosis) that the ClinicalDatasetSampler is conditioned on, i.e. co-morbid traits.

        Parameters
        ----------

        FDR : float
            False discovery rate cut off for feature selection
        modifyDataset : bool
            If True, features that fail to be selected are dropped from the dataset.
        useChi2 : bool
            By default, the chi-squared test is used as the score function. If False, Fisher's exact test is used instead.

        Returns
        -------
        tuple of arrays
            (Index of selected features, Feature Scores, Feature P-values)

        """

        assert self.sampler.isConditioned==True,"Cannot perform feature selection without being conditioned on some disease of interest"

        # The selection runs on sparse arrays; remember the current type so it
        # can be restored before returning.
        originalArrayType = self.sampler.returnArrays
        if originalArrayType!='Sparse':
            self.sampler.ChangeArrayType('Sparse')

        trainingData=self.sampler.ReturnFullTrainingDataset(randomize=False)
        featureMatrix=trainingData[0]
        diagnosisVec=trainingData[2]

        scoreFunc = fisher_exact if useChi2==False else chi2
        fitted = SelectFdr(scoreFunc, alpha=FDR).fit(featureMatrix,diagnosisVec.toarray())
        selectedIdx=np.where(fitted.get_support()==True)[0]

        if modifyDataset:
            dataset = self.sampler.currentClinicalDataset
            dataset.IncludeOnly([self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[i] for i in selectedIdx])

        if originalArrayType!='Sparse':
            self.sampler.ChangeArrayType(originalArrayType)

        return selectedIdx, fitted.scores_[selectedIdx],fitted.pvalues_[selectedIdx]
def select_fdr(df, target_col):
    """Keep the columns of `df` selected by ``SelectFdr(chi2, alpha=0.01)``.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the target column.
    target_col : name of the target column.

    Returns
    -------
    A new DataFrame with the selected feature columns followed by the target
    column, indexed like `df`. When no feature is selected a message is
    printed and `df` itself is returned unchanged.
    """
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)
    mask = selector.get_support()
    if not mask.any():
        print(
            'No features were selected: either the data is too noisy or the selection Test_data too strict.'
        )
        return df

    saved_columns = list(X.columns[mask])
    # Reuse X's index: with a fresh RangeIndex the target assignment below
    # would be aligned by label and give NaN or misplaced values whenever df
    # does not have a default 0..n-1 index.
    result = pd.DataFrame(selector.transform(X),
                          columns=saved_columns,
                          index=X.index)
    result[target_col] = y
    return result
Exemple #20
0
def build_trained_model(training_data, classifier='svc'):
    """Fit a scaler -> SelectFdr -> grid-searched classifier pipeline.

    Parameters
    ----------
    training_data : DataFrame whose last three columns are not features; the
        third-from-last column is used as the label (cast to int) and all
        columns before it are used as features.
    classifier : 'svc' (default) for a grid-searched SVC with probability
        estimates, or 'ridge' for a grid-searched RidgeClassifier.

    Returns
    -------
    The fitted Pipeline.

    Raises
    ------
    NotImplementedError
        If `classifier` is neither 'svc' nor 'ridge'.
    """
    alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    ridge_params = {'alpha': alpha}

    c_s = [0.01, 0.1, 1.0, 10.0, 100.0]
    gamma = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    svc_params = [{
        'kernel': ['rbf'],
        'gamma': gamma,
        'C': c_s
    }, {
        'kernel': ['linear'],
        'C': c_s
    }]

    if classifier == 'svc':
        clf = GridSearchCV(SVC(probability=True), svc_params, cv=5)
    elif classifier == 'ridge':
        clf = GridSearchCV(RidgeClassifier(), ridge_params, cv=5)
    else:
        raise NotImplementedError(
            "Only 'svc' (default) and 'ridge' classifiers are supported")

    pipe = Pipeline([('standard_scalar', StandardScaler()),
                     ('feature_selection', SelectFdr()),
                     ('classification', clf)])

    # Positional slicing with .iloc; the former .ix indexer has been removed
    # from pandas.
    features = training_data.iloc[:, :-3]
    labels = training_data.iloc[:, -3].astype('int')

    with warnings.catch_warnings():
        # Warnings raised while fitting are deliberately silenced.
        warnings.simplefilter('ignore')
        pipe.fit(features, labels)

    return pipe
def test_select_fdr_regression():
    """
    Check that SelectFdr (false discovery rate heuristic) gets the correct
    items in a simple regression problem and agrees with
    GenericUnivariateSelect in "fdr" mode.
    """
    alpha = 0.01
    X, Y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    fdr_filter = SelectFdr(f_regression, alpha=alpha)
    via_fdr = fdr_filter.fit(X, Y).transform(X)
    generic_filter = GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
    via_generic = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(via_fdr, via_generic)

    # The test expects exactly the first five columns to be selected.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fdr_filter.get_support(), expected)
 def test_select_fdr_float(self):
     """Convert a fitted SelectFdr to ONNX (float input, batch size 1) and dump data and model."""
     features, target = load_breast_cancer(return_X_y=True)
     selector = SelectFdr()
     selector.fit(features, target)
     input_spec = [("input", FloatTensorType([1, features.shape[1]]))]
     onnx_model = convert_sklearn(selector, "select fdr", input_spec)
     self.assertTrue(onnx_model is not None)
     # Expression string handed to dump_data_and_model as allow_failure.
     failure_condition = (
         "StrictVersion(onnx.__version__)"
         " < StrictVersion('1.2') or "
         "StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')")
     dump_data_and_model(
         features.astype(np.float32),
         selector,
         onnx_model,
         basename="SklearnSelectFdr",
         allow_failure=failure_condition,
     )
Exemple #23
0
    def SelectComorbidTraits_ContinuousFeature(self,featureVector,FDR,modifyDataset=False,use_ttest=False):
        """
        Selects features associated with a continuous variable.

        Parameters
        ----------
        featureVector : [float]
            Vector of floating values for feature selection. Must be sorted in the same order as the index for the ClinicalDatasetSampler training dataset.
        FDR : float
            False discovery rate cut off for feature selection
        modifyDataset : bool
            If True, features that fail to be selected are dropped from the dataset.
        use_ttest : bool
            By default, f_regression is used as the score function. If True, T_test is used instead.

        Returns
        -------
        tuple of arrays
            (Index of selected features, Feature Scores, Feature P-values)

        """

        # The selection runs on sparse arrays; remember the current type so it
        # can be restored before returning.
        originalArrayType = self.sampler.returnArrays
        if originalArrayType!='Sparse':
            self.sampler.ChangeArrayType('Sparse')

        trainingData=self.sampler.ReturnFullTrainingDataset(randomize=False)
        featureMatrix=trainingData[0]

        scoreFunc = T_test if use_ttest else f_regression
        fitted = SelectFdr(scoreFunc, alpha=FDR).fit(featureMatrix,featureVector.ravel())
        selectedIdx=np.where(fitted.get_support()==True)[0]

        if modifyDataset:
            dataset = self.sampler.currentClinicalDataset
            dataset.IncludeOnly([self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[i] for i in selectedIdx])

        if originalArrayType!='Sparse':
            self.sampler.ChangeArrayType(originalArrayType)

        return selectedIdx, fitted.scores_[selectedIdx],fitted.pvalues_[selectedIdx]
def test_verbose_output_for_select_select_fdr():
    """Check the verbose message captured for SelectFdr(chi2, alpha=0.5) on the supervised frame."""
    selector = SelectFdr(chi2, alpha=0.5)
    captured = _capture_verbose_output_for_model(selector,
                                                 use_supervised_df=True)

    expected = ("The p-value of column 'B' (1.0000) is above the "
                "specified alpha of 0.5000")
    assert captured == expected
Exemple #25
0
def featureFitting( filename, X, y, featureNames,optimalFlag, kbest=20, alpha=0.05,model=None):
    '''
    Keep the K best features among those that pass an FDR filter.

    X is first reduced with ``SelectFdr(alpha=alpha)``; ``SelectKBest`` is then
    fitted on the reduced matrix. The matching columns of the CSV at
    `filename` (read with its first column as the index) are saved to
    "REDUCED_Feat.csv" and returned as a DataFrame.

    The K-best support mask only spans the features that survived the FDR
    step, so it is mapped back through the FDR mask before it is applied to
    `featureNames` and to the CSV columns, which both describe the full
    feature set.

    `featureNames` must support boolean-mask indexing. `optimalFlag` and
    `model` are accepted for interface compatibility and are unused here.
    `kbest` is clamped to the number of features that survive the FDR step.
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X,y)
    fdr_mask = FD.get_support()

    # SelectKBest rejects k larger than the number of available features.
    k = kbest if kbest == 'all' else min(kbest, X.shape[1])
    selectK = SelectKBest(k=k)
    selectK.fit(X,y)
    selectK_mask = selectK.get_support()

    # Two-step indexing: full feature set -> FDR survivors -> K best.
    K_featnames = featureNames[fdr_mask][selectK_mask]
    print("K_featnames: %s" %(K_featnames))

    Reduced_df = pd.read_csv(filename, index_col=0)
    # NOTE(review): assumes the CSV columns are in the same order as the
    # columns of the X that was passed in -- confirm against the caller.
    kept_columns = Reduced_df.columns[fdr_mask][selectK_mask]
    Reduced_df = Reduced_df[kept_columns]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df
Exemple #26
0
def test_select_fdr_regression():
    """
    Check that SelectFdr (false discovery rate heuristic) gets the correct
    items in a simple regression problem, and that GenericUnivariateSelect
    in 'fdr' mode gives the same result.
    """
    threshold = 0.01
    problem = dict(n_samples=200, n_features=20, n_informative=5,
                   shuffle=False, random_state=0)
    X, y = make_regression(**problem)

    fdr_filter = SelectFdr(f_regression, alpha=threshold)
    direct = fdr_filter.fit(X, y).transform(X)
    generic = GenericUnivariateSelect(f_regression, mode='fdr',
                                      param=threshold)
    assert_array_equal(direct, generic.fit(X, y).transform(X))

    # The test expects exactly the first five columns to be selected.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fdr_filter.get_support(), expected)
Exemple #27
0
def selectFdr(args):
    """Build a scikit-learn SelectFdr selector from a positional argument list.

    Parameters
    ----------
    args : sequence
        ``args[1]`` is alpha (anything ``float()`` accepts); for SelectFdr
        this is the upper bound on the expected false discovery rate.
        ``args[2]`` is the score function name, either "chi2" or
        "f_classif". ``args[0]`` is not used.

    Returns
    -------
    An unfitted SelectFdr instance.

    Raises
    ------
    ValueError
        If ``args[2]`` is not a supported score function name (previously
        this surfaced as an UnboundLocalError on the return statement).
    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "Unsupported score function %r; expected 'chi2' or 'f_classif'"
            % (score_name,))

    return SelectFdr(score_func, alpha=float(args[1]))
Exemple #28
0
def build_trained_model(training_data):
    """Fit a StandardScaler -> SelectFdr -> SVC(probability=True) pipeline.

    `training_data` is a DataFrame whose last three columns are not features:
    the third-from-last column is used as the label (cast to int) and all
    columns before it are used as features. Returns the fitted Pipeline.
    """
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('feature_selection', SelectFdr()),
                     ('classification', SVC(probability=True))])

    # Positional slicing with .iloc; the former .ix indexer has been removed
    # from pandas.
    features = training_data.iloc[:, :-3]
    labels = training_data.iloc[:, -3].astype('int')

    with warnings.catch_warnings():
        # Warnings raised while fitting are deliberately silenced.
        warnings.simplefilter('ignore')
        pipe.fit(features, labels)

    return pipe
Exemple #29
0
def select_fdr(args):
    """Build a SelectFdr selector from an argument dict.

    ``args['alpha']`` defaults to 0.05 when it is None, and
    ``args['score_function']`` is resolved from the names 'chi2' or
    'f_classif' to the scikit-learn functions; any other value is passed
    through unchanged. Both resolved values are written back into `args`.
    """
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html

    from sklearn.feature_selection import f_classif, chi2

    if args['alpha'] is None:
        args['alpha'] = 0.05

    requested = args['score_function']
    if requested == 'chi2':
        args['score_function'] = chi2
    else:
        if requested == 'f_classif':
            args['score_function'] = f_classif

    score_func = args['score_function']
    alpha = args['alpha']
    return SelectFdr(score_func=score_func, alpha=alpha)
Exemple #30
0
def test_pipeline_shares_structure():
    """Check that two fits differing only in ``svm__C`` share part of their task graphs.

    The merged graph of both scores must be at most 75% of the sum of the
    two individual graph sizes, while the score keys must differ.
    """
    steps = [("scale", StandardScaler()), ("fdr", SelectFdr()),
             ("svm", LinearSVC())]
    pipeline = dl.Pipeline(steps)

    first_fit = pipeline.fit(X, y)
    first_score = first_fit.score(X, y)

    second_fit = pipeline.set_params(svm__C=0.1)
    second_fit = second_fit.fit(X, y)
    second_score = second_fit.score(X, y)

    merged_size = len(merge(first_score.dask, second_score.dask))
    separate_size = len(first_score.dask) + len(second_score.dask)
    assert merged_size <= separate_size * 0.75
    assert first_score.key != second_score.key
Exemple #31
0
def feature_selection(df, tgt, mtd, slct=10):
    '''Return the names of the columns of `df` selected for target `tgt`.

    Parameters
    ----------
    df : DataFrame holding the feature columns and the target column.
    tgt : name of the target column.
    mtd : selection method, one of
        'KBest' (SelectKBest with k=slct),
        'Fdr'   (SelectFdr with alpha=0.05),
        'Fwe'   (SelectFwe with alpha=0.05),
        'Pct'   (SelectPercentile with percentile=20);
        all use f_classif as the score function.
    slct : number of features kept by 'KBest'; ignored by the other methods.

    Returns
    -------
    Array of the selected column names.

    Raises
    ------
    ValueError
        If `mtd` is not one of the supported methods (previously this
        surfaced as an UnboundLocalError on the return statement).
    '''
    target = df[tgt]
    features = df.drop([tgt], axis=1)

    # The four methods differ only in the selector they build; the score
    # table and transformed frame built by the earlier per-branch code were
    # never returned and have been dropped.
    if mtd == 'KBest':
        selector = SelectKBest(score_func=f_classif, k=slct)
    elif mtd == 'Fdr':
        selector = SelectFdr(score_func=f_classif, alpha=0.05)
    elif mtd == 'Fwe':
        selector = SelectFwe(score_func=f_classif, alpha=0.05)
    elif mtd == 'Pct':
        selector = SelectPercentile(score_func=f_classif, percentile=20)
    else:
        raise ValueError(
            "Unknown feature selection method %r; expected one of "
            "'KBest', 'Fdr', 'Fwe', 'Pct'" % (mtd,))

    fit = selector.fit(features, target)
    # get_support returns a boolean vector marking the selected columns.
    select_cols = features.columns.values[fit.get_support()]
    return select_cols
Exemple #32
0
def get_fsmethod(fsmethod, n_feats, n_subjs, n_jobs=1):
    """Return a feature-selection estimator and its grid-search parameters.

    Parameters
    ----------
    fsmethod : str
        'stats', or one of 'rfe', 'rfecv', 'univariate', 'fpr', 'fdr',
        'extratrees', 'pca', 'rpca', 'lda'.
    n_feats : int
        Number of input features. Below 10 it bounds the candidate
        feature / component counts placed in the grid.
    n_subjs : int
        Not used by this function; kept so existing callers keep working.
    n_jobs : int, optional
        Forwarded to ExtraTreesClassifier.

    Returns
    -------
    tuple
        ``('stats', None)`` when ``fsmethod == 'stats'``; otherwise
        ``(estimator, param_grid)`` where ``param_grid`` maps parameter
        names to lists of candidate values.

    Raises
    ------
    KeyError
        If ``fsmethod`` is not one of the names listed above.
    """
    if fsmethod == 'stats':
        return 'stats', None

    # Feature selection procedures, stored as factories so that only the
    # requested estimator is instantiated: a constructor that fails for one
    # method can no longer break the lookup of every other method.
    builders = {
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        'rfe': lambda: RFE(estimator=SVC(kernel="linear"), step=0.05,
                           n_features_to_select=2),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        # cv is left at its default here.
        'rfecv': lambda: RFECV(estimator=SVC(kernel="linear"), step=0.05,
                               loss_func=zero_one),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
        'univariate': lambda: SelectPercentile(f_classif, percentile=5),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html
        'fpr': lambda: SelectFpr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html
        'fdr': lambda: SelectFdr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/feature_selection.html
        'extratrees': lambda: ExtraTreesClassifier(n_estimators=50,
                                                   max_features='auto',
                                                   compute_importances=True,
                                                   n_jobs=n_jobs,
                                                   random_state=0),
        'pca': lambda: PCA(n_components='mle'),
        'rpca': lambda: RandomizedPCA(random_state=0),
        'lda': lambda: LDA(),
    }

    # Candidate values for the grid search, materialised as lists so the
    # returned grid has the same type on Python 2 and Python 3.
    if n_feats < 10:
        feats_to_sel = list(range(2, n_feats, 2))
        n_comps = list(range(1, n_feats, 2))
    else:
        feats_to_sel = list(range(2, 20, 4))
        n_comps = list(range(1, 30, 4))

    max_feats = ['auto'] + feats_to_sel
    n_comps_pca = n_comps + ['mle']

    # alpha is a p-value threshold, so its candidates must lie in [0, 1];
    # the percent-style values 1/3/5/10 belong to `percentile` only.
    alpha_grid = [0.01, 0.03, 0.05, 0.10]

    fsgrid = {
        'rfe': dict(estimator_params=[dict(C=0.1), dict(C=1), dict(C=10)],
                    n_features_to_select=feats_to_sel),
        'rfecv': dict(estimator_params=[dict(C=0.1), dict(C=1), dict(C=10)]),
        'univariate': dict(percentile=[1, 3, 5, 10]),
        'fpr': dict(alpha=list(alpha_grid)),
        'fdr': dict(alpha=list(alpha_grid)),
        'extratrees': dict(n_estimators=[1, 3, 5, 10, 30, 50],
                           max_features=max_feats),
        'pca': dict(n_components=n_comps_pca, whiten=[True, False]),
        'rpca': dict(n_components=n_comps, iterated_power=[3, 4, 5],
                     whiten=[True, False]),
        'lda': dict(n_components=n_comps),
    }

    return builders[fsmethod](), fsgrid[fsmethod]
Exemple #33
0
def select_features(data,
                    features,
                    target,
                    feature_selector='SelectKBest',
                    k=10,
                    alpha=0.05,
                    score_func='f_classif'):
    """Fit a univariate selector and return the names of the kept features.

    Parameters
    ----------
    data : mapping or DataFrame-like
        Indexed as ``data[features]`` and ``data[target]``.
    features : sequence
        Feature names; also indexed by position to build the result.
    target : hashable
        Key of the target column in ``data``.
    feature_selector : str
        'SelectKBest', 'SelectFpr' or 'SelectFdr'.
    k : int
        Passed to SelectKBest.
    alpha : float
        Passed to SelectFpr / SelectFdr.
    score_func : str
        'f_classif', 'f_regression', 'chi2', 'mutual_info_classif' or
        'mutual_info_regression'.

    Returns
    -------
    list
        Entries of ``features`` at the positions the fitted selector kept.

    Raises
    ------
    ValueError
        If ``score_func`` or ``feature_selector`` is not a supported name
        (ValueError is an ``Exception`` subclass, so existing handlers that
        catch ``Exception`` still work).
    """
    X = data[features]
    y = data[target]

    # Validate both names up front; score_func is checked first so the
    # error precedence matches the previous behaviour.
    if score_func not in ('f_classif', 'f_regression', 'chi2',
                          'mutual_info_classif', 'mutual_info_regression'):
        raise ValueError('Undefined score_func')
    if feature_selector not in ('SelectKBest', 'SelectFpr', 'SelectFdr'):
        # Previously this branch reported 'Undefined score_func'.
        raise ValueError('Undefined feature_selector')

    # Lazy lookups: only the requested scoring function / selector name is
    # evaluated, exactly as in an if/elif chain.
    scorer = {
        'f_classif': lambda: f_classif,
        'f_regression': lambda: f_regression,
        'chi2': lambda: chi2,
        'mutual_info_classif': lambda: mutual_info_classif,
        'mutual_info_regression': lambda: mutual_info_regression,
    }[score_func]()

    selector = {
        'SelectKBest': lambda: SelectKBest(score_func=scorer, k=k),
        'SelectFpr': lambda: SelectFpr(score_func=scorer, alpha=alpha),
        'SelectFdr': lambda: SelectFdr(score_func=scorer, alpha=alpha),
    }[feature_selector]()

    # Only the support mask is needed, so fitting is enough.
    selector.fit(X, y)
    best_features = [features[i]
                     for i in selector.get_support(indices=True)]

    print('Best features selected are: ' + str(best_features))

    return best_features
Exemple #34
0
def test_no_feature_selected():
    """Strict selectors on pure noise keep no feature and warn on transform.

    Five selectors are fitted on random, uncorrelated data with thresholds
    tight enough to reject everything. Each must report an all-false
    support mask, emit the 'No features were selected' UserWarning when
    transforming, and return a matrix with zero columns.
    """
    n_samples, n_features = 40, 10
    prng = np.random.RandomState(0)

    # Random features and random 4-class labels: nothing is informative.
    X = prng.rand(n_samples, n_features)
    y = prng.randint(0, 4, size=n_samples)

    unfitted = (
        SelectFwe(alpha=0.01),
        SelectFdr(alpha=0.01),
        SelectFpr(alpha=0.01),
        SelectPercentile(percentile=0),
        SelectKBest(k=0),
    )
    fitted_selectors = [candidate.fit(X, y) for candidate in unfitted]

    empty_mask = np.zeros(n_features)
    for fitted in fitted_selectors:
        assert_array_equal(fitted.get_support(), empty_mask)
        reduced = assert_warns_message(
            UserWarning, 'No features were selected', fitted.transform, X)
        assert_equal(reduced.shape, (n_samples, 0))
Exemple #35
0
def feature_sel(x,
                y,
                sel_method='estimator',
                k=None,
                estimator=None,
                score_func=chi2):
    """Fit the requested feature selector and return the reduced data.

    :param x: feature matrix passed to ``fit_transform``
    :param y: target values passed to ``fit_transform``; returned unchanged
    :param sel_method: kbest, fdr, fpr, fwe, estimator, rfecv
    :param k: number of features to keep; required for 'kbest', optional
        for 'estimator' (where it is used as ``max_features`` with the
        importance threshold disabled)
    :param estimator: model required by 'estimator' and 'rfecv'
    :param score_func: univariate scoring function for kbest/fdr/fpr/fwe
    :return: tuple ``(support_mask, x_new, y)``
    :raises ValueError: if a required argument is missing or ``sel_method``
        is unknown
    """
    # Explicit raises instead of `assert`, which is stripped under `python -O`.
    if sel_method == 'kbest':
        if k is None:
            raise ValueError("sel_method 'kbest' requires k.")
        # k is keyword-only in current scikit-learn; the keyword form is
        # accepted by older releases as well.
        selector = SelectKBest(score_func, k=k)
    elif sel_method == 'fdr':
        selector = SelectFdr(score_func, alpha=0.05)
    elif sel_method == 'fpr':
        selector = SelectFpr(score_func, alpha=0.05)
    elif sel_method == 'fwe':
        selector = SelectFwe(score_func, alpha=0.05)
    elif sel_method == 'estimator':
        if estimator is None:
            raise ValueError("sel_method 'estimator' requires an estimator.")
        if k is None:
            selector = SelectFromModel(estimator=estimator)
        else:
            # threshold=-inf disables the importance cut-off, so exactly
            # the top-k features by importance are retained.
            selector = SelectFromModel(estimator=estimator,
                                       max_features=k,
                                       threshold=-np.inf)
    elif sel_method == 'rfecv':
        if estimator is None:
            raise ValueError("sel_method 'rfecv' requires an estimator.")
        selector = RFECV(estimator, step=1, cv=5)
    else:
        raise ValueError('unknown input parameters.')

    x_new = selector.fit_transform(x, y)
    return selector.get_support(), x_new, y
def svm_cv(data, data_target):
    """Train a linear SVM on FDR-selected features and print hold-out metrics.

    The data is split into train and test parts, a SelectFdr(f_classif)
    filter is fitted on the training part only, and a probability-enabled
    linear SVC is trained on the selected columns. The selector's support
    mask, the accuracy and the classification report for the test part are
    printed.

    :param data: feature matrix
    :param data_target: labels aligned with ``data``
    """
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(data, data_target)
    print("*" * 79)
    print("Training...")
    # Alternative scorer left disabled by the author: SelectFdr(chi2)
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)
    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)
    print("Testing...")
    # Reduce the test set once and reuse it for both predictions.
    X_test_selected = selector.transform(X_test)
    pred = clf.predict(X_test_selected)
    # Probabilities come from the classifier, not from the prediction array
    # (the previous code called predict_proba on `pred` and misspelled
    # `transform`, so it always failed here).
    probs = clf.predict_proba(X_test_selected)
    # Local names chosen so they do not shadow the metric functions.
    accuracy = metrics.accuracy_score(y_test, pred)
    report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print(support)
    print(accuracy)
    print(report)
    # Column 1 is used as the positive-class score; the curve values are
    # not consumed within the visible part of this function.
    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
Exemple #37
0
    # Script fragment: load the training feature table, standardize it, apply
    # two univariate feature filters, then run the model grid search.
    # (load_data and ModelParam_GridSearch are defined outside this fragment.)
    Kcv=4 #Number of stratified folds for cross validation. More = slower, more accurate.
    # Leading backslash: the name is appended to the directory by plain
    # string concatenation below, i.e. a Windows-style path is assumed.
    fileName = r'\trainingSetFeatures.csv'

    # filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
    # Directory is asked for interactively instead of the hard-coded path above.
    filePath = str(input('Input DIRRectory containing TrainingData csv '))

    ## features, labels, lb_encoder,featureNames = load_data(filename, 'file')
    # lb_encoder and featureNames are not used again within this fragment.
    features, labels, lb_encoder,featureNames = load_data(filePath+fileName, 'file')

    X, y = features, labels
    # Number of distinct labels, then the shape of the raw feature matrix.
    print('len(set(y)',len(set(y)))
    print(X.shape,"X = samples, features")
    # NOTE(review): copy=False presumably lets the scaler modify X in place
    # where possible - confirm against the scikit-learn version in use.
    scale = StandardScaler(copy=False)
    X = scale.fit_transform(X)

    # Two-stage filter: FDR threshold first, then SelectPercentile on the
    # features that survived it.
    FD = SelectFdr(alpha=0.0005)
    FD_K = SelectPercentile(percentile=70)
    X = FD.fit_transform(X,y)
    print(X.shape,"X post FDR alpha filter")
    # The printed label says "K-best", but the second stage is the
    # SelectPercentile(percentile=70) instance created above.
    X_FD = FD_K.fit_transform(X,y)
    print(X_FD.shape,"X post FDR+K-best alpha filter")

    print("\n BASE X models: \n")
    # NOTE(review): the search receives X (FDR-filtered only); X_FD is only
    # printed within this fragment - confirm that this is intended.
    ModelParam_GridSearch(X,y,cv=Kcv)
    # The string literal below is disabled code (a PCA variant of the same
    # grid search); it is never executed.
    '''
    pca = PCA(n_components='mle')
    X_PCA = pca.fit_transform(X)
    print(X_PCA.shape,"X - PCA,mle")
    ModelParam_GridSearch(X_PCA,y,cv=Kcv)
    '''