Example #1
def feature_select_univariate(feat_df,
                              resp_df,
                              problem='classification',
                              cut_off=0.7,
                              percentile=0.4,
                              p_value=0.05,
                              return_ranking=True):
    '''
    Returns selected features or a sorted feature ranking from feat_df using univariate tests.

    Parameters
    ----------
    feat_df = dataframe with feature data and column names
    resp_df = dataframe or array with response data
    problem = 'classification' default. chi2 is used for 'classification' problems and f_regression for 'regression'
    cut_off = 0.7 default. Cut-off threshold for selecting a variable, applied to the average score across all modes used.
    percentile = 0.4 default. Percent of highest-scoring features to keep; note that sklearn reads this on a 0-100 scale.
    p_value = 0.05 default. P-value threshold used to select variables in the 'fpr', 'fdr' and 'fwe' modes.
    return_ranking = True default. If True, returns a feature ranking; otherwise returns the dataframe
    with the features whose average score exceeds the cut_off threshold.

    Mode definitions:
    * percentile = select features within the given percentile of the highest scores
    * fpr = select features based on a false positive rate test = Number of Rejected True H0 / Total Number of True H0
    * fdr = select features based on an estimated false discovery rate = Number of Rejected True H0 / Total Number of Rejections
    * fwe = select features based on the family-wise error rate = Probability of making at least one Type I error (rejecting a true H0)

    '''
    import pandas as pd
    import numpy as np
    import sklearn.feature_selection as fs
    m_list = ['percentile', 'fpr', 'fdr', 'fwe']
    p_list = [percentile, p_value, p_value, p_value]
    sel_df = pd.DataFrame(index=feat_df.columns)
    # chi2 requires non-negative features; f_regression handles regression targets
    score_func = fs.chi2 if problem == 'classification' else fs.f_regression
    for m, p in zip(m_list, p_list):
        f = fs.GenericUnivariateSelect(score_func, mode=m,
                                       param=p).fit(feat_df, resp_df)
        sel_df[m] = f.get_support()

    sel_df['average'] = sel_df.mean(axis=1)
    if return_ranking:
        return sel_df.sort_values(by='average', ascending=False)
    else:
        return feat_df[sel_df[sel_df['average'] > cut_off].index]
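
A minimal usage sketch, assuming the function above is in scope: the iris data is used because chi2 requires non-negative features, and percentile=50 is passed since sklearn's 'percentile' mode reads param on a 0-100 scale.

from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
X, y = iris.data, iris.target  # non-negative features, as chi2 requires

# percentile=50 keeps the top half of the features in 'percentile' mode
ranking = feature_select_univariate(X, y, problem='classification',
                                    percentile=50, return_ranking=True)
print(ranking)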
Example #2
    def fit_transform(self, data):
        """Fit and transform using feature filtering.

        Fit and transform using several kind of feature filtering methods to
        select features in data.

        :param data: Dataframe. The Pandas dataframe, to be converted.

        :return: Dataframe. The converted dataframe after feature filtering.
        """
        # Removing features with low variance.
        threshold = 0.0
        var_thre = fe.VarianceThreshold(threshold=threshold)
        result = var_thre.fit_transform(data[data.columns.difference(
            [self.target_column])])
        feature_select = data.columns.difference([self.target_column
                                                  ])[var_thre.get_support()]
        result = pd.DataFrame(columns=feature_select, data=result)
        result[self.target_column] = data[self.target_column]
        # Store converter.
        self.variance_threshold = var_thre

        # Univariate feature selection, using univariate statistical tests.
        data = result
        # Note: the 'fwe' mode filters on p-values, so the score function
        # must return them; f_classif does, mutual_info_classif does not.
        univar_select = fe.GenericUnivariateSelect(
            score_func=fe.f_classif, mode='fwe', param=0.05)

        # If the target has at most two distinct values (binary
        # classification), skip the univariate step.
        if len(data[self.target_column].value_counts()) <= 2:
            return result
        # Otherwise run the univariate selection.
        result = univar_select.fit_transform(
            data[data.columns.difference([self.target_column])],
            np.asarray(data[self.target_column]))
        feature_select = data.columns.difference(
            [self.target_column])[univar_select.get_support()]
        result = pd.DataFrame(columns=feature_select, data=result)
        result[self.target_column] = data[self.target_column]
        # Store converter.
        self.univar_select = univar_select

        return result
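
The snippet above is a method of a larger converter class; here is a standalone sketch of the same two-stage filter on synthetic data (the column names and data are illustrative, not from the original).

import numpy as np
import pandas as pd
from sklearn import feature_selection as fe

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((100, 4)), columns=list('abcd'))
df['constant'] = 1.0                                    # zero-variance column
df['target'] = rng.integers(0, 3, 100)                  # 3-class target
df['signal'] = df['target'] + rng.normal(0, 0.3, 100)   # informative feature

# Stage 1: drop zero-variance features.
features = df.columns.difference(['target'])
var_thre = fe.VarianceThreshold(threshold=0.0)
stage1 = var_thre.fit_transform(df[features])
kept = features[var_thre.get_support()]

# Stage 2: keep features whose ANOVA F-test p-value passes a
# family-wise error rate of 0.05.
univar = fe.GenericUnivariateSelect(score_func=fe.f_classif,
                                    mode='fwe', param=0.05)
univar.fit(stage1, df['target'])
print(kept[univar.get_support()])  # 'signal' should survive; noise columns generally do not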
Example #3
def _eval_search_params(params_builder):
    search_params = {}

    for p in params_builder['param_set']:
        search_list = p['sp_list'].strip()
        if search_list == '':
            continue

        param_name = p['sp_name']
        if param_name.lower().endswith(NON_SEARCHABLE):
            print("Warning: `%s` is not eligible for search and was "
                  "omitted!" % param_name)
            continue

        if not search_list.startswith(':'):
            safe_eval = SafeEval(load_scipy=True, load_numpy=True)
            ev = safe_eval(search_list)
            search_params[param_name] = ev
        else:
            # A leading `:` in the search list asks for estimator evaluation
            safe_eval_es = SafeEval(load_estimators=True)
            search_list = search_list[1:].strip()
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(search_list)
            preprocessings = (
                preprocessing.StandardScaler(), preprocessing.Binarizer(),
                preprocessing.MaxAbsScaler(), preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(), feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(), feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0))
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessings[0:35])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessings[0:7])
                elif obj == 'fs_all':
                    newlist.extend(preprocessings[7:14])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessings[14:25])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessings[25:29])
                elif obj == 'reb_all':
                    newlist.extend(preprocessings[30:35])
                elif obj == 'imb_all':
                    newlist.extend(preprocessings[35:54])
                elif type(obj) is int and -1 < obj < len(preprocessings):
                    newlist.append(preprocessings[obj])
                elif hasattr(obj, 'get_params'):  # user uploaded object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported estimator type: %r" % (obj))

            search_params[param_name] = newlist

    return search_params
Example #4
def get_search_params(params_builder):
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (
            len(lst) == 2
        ), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name] = ev
            else:
                # a trailing `-` on the param name asks for estimator evaluation
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6] + '__' +
                                  param_name[:-1]] = ev
        elif param_type != 'final_estimator_p':
            # TODO: maybe add a regular expression check
            ev = safe_eval_es(literal)
            preprocessors = [
                preprocessing.StandardScaler(),
                preprocessing.Binarizer(),
                preprocessing.Imputer(),  # legacy API; removed in sklearn 0.22 in favour of sklearn.impute.SimpleImputer
                preprocessing.MaxAbsScaler(),
                preprocessing.Normalizer(),
                preprocessing.MinMaxScaler(),
                preprocessing.PolynomialFeatures(),
                preprocessing.RobustScaler(),
                feature_selection.SelectKBest(),
                feature_selection.GenericUnivariateSelect(),
                feature_selection.SelectPercentile(),
                feature_selection.SelectFpr(),
                feature_selection.SelectFdr(),
                feature_selection.SelectFwe(),
                feature_selection.VarianceThreshold(),
                decomposition.FactorAnalysis(random_state=0),
                decomposition.FastICA(random_state=0),
                decomposition.IncrementalPCA(),
                decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                decomposition.LatentDirichletAllocation(random_state=0,
                                                        n_jobs=N_JOBS),
                decomposition.MiniBatchDictionaryLearning(random_state=0,
                                                          n_jobs=N_JOBS),
                decomposition.MiniBatchSparsePCA(random_state=0,
                                                 n_jobs=N_JOBS),
                decomposition.NMF(random_state=0),
                decomposition.PCA(random_state=0),
                decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                decomposition.TruncatedSVD(random_state=0),
                kernel_approximation.Nystroem(random_state=0),
                kernel_approximation.RBFSampler(random_state=0),
                kernel_approximation.AdditiveChi2Sampler(),
                kernel_approximation.SkewedChi2Sampler(random_state=0),
                cluster.FeatureAgglomeration(),
                skrebate.ReliefF(n_jobs=N_JOBS),
                skrebate.SURF(n_jobs=N_JOBS),
                skrebate.SURFstar(n_jobs=N_JOBS),
                skrebate.MultiSURF(n_jobs=N_JOBS),
                skrebate.MultiSURFstar(n_jobs=N_JOBS),
                imblearn.under_sampling.ClusterCentroids(random_state=0,
                                                         n_jobs=N_JOBS),
                imblearn.under_sampling.CondensedNearestNeighbour(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.EditedNearestNeighbours(random_state=0,
                                                                n_jobs=N_JOBS),
                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.InstanceHardnessThreshold(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.NearMiss(random_state=0,
                                                 n_jobs=N_JOBS),
                imblearn.under_sampling.NeighbourhoodCleaningRule(
                    random_state=0, n_jobs=N_JOBS),
                imblearn.under_sampling.OneSidedSelection(random_state=0,
                                                          n_jobs=N_JOBS),
                imblearn.under_sampling.RandomUnderSampler(random_state=0),
                imblearn.under_sampling.TomekLinks(random_state=0,
                                                   n_jobs=N_JOBS),
                imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.RandomOverSampler(random_state=0),
                imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                imblearn.over_sampling.BorderlineSMOTE(random_state=0,
                                                       n_jobs=N_JOBS),
                imblearn.over_sampling.SMOTENC(categorical_features=[],
                                               random_state=0,
                                               n_jobs=N_JOBS),
                imblearn.combine.SMOTEENN(random_state=0),
                imblearn.combine.SMOTETomek(random_state=0)
            ]
            newlist = []
            for obj in ev:
                if obj is None:
                    newlist.append(None)
                elif obj == 'all_0':
                    newlist.extend(preprocessors[0:36])
                elif obj == 'sk_prep_all':  # no KernelCenterer()
                    newlist.extend(preprocessors[0:8])
                elif obj == 'fs_all':
                    newlist.extend(preprocessors[8:15])
                elif obj == 'decomp_all':
                    newlist.extend(preprocessors[15:26])
                elif obj == 'k_appr_all':
                    newlist.extend(preprocessors[26:30])
                elif obj == 'reb_all':
                    newlist.extend(preprocessors[31:36])
                elif obj == 'imb_all':
                    newlist.extend(preprocessors[36:55])
                elif type(obj) is int and -1 < obj < len(preprocessors):
                    newlist.append(preprocessors[obj])
                elif hasattr(obj, 'get_params'):  # user object
                    if 'n_jobs' in obj.get_params():
                        newlist.append(obj.set_params(n_jobs=N_JOBS))
                    else:
                        newlist.append(obj)
                else:
                    sys.exit("Unsupported preprocessor type: %r" % (obj))
            search_params['preprocessing_' + param_type[5:6]] = newlist
        else:
            sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params
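
For context, a sketch of where the returned search_params ends up: each instantiated preprocessor in the lists built above becomes one grid candidate for a pipeline step (the step names and data here are illustrative, not from the original).

from sklearn import feature_selection, preprocessing, svm
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True)
pipe = Pipeline([('preprocessing_1', 'passthrough'),
                 ('estimator', svm.LinearSVC(dual=False))])
# search_params maps step names to candidate lists, e.g.:
param_grid = {
    'preprocessing_1': [preprocessing.StandardScaler(),
                        feature_selection.SelectKBest(k=2)],
    'estimator__C': [0.1, 1.0],
}
GridSearchCV(pipe, param_grid, cv=3).fit(X, y)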
Example #5
def get_feature_preprocessor(params):
    fp = None
    d_feat_pre = params['layer_dict_list'][2]

    if params['feat_pre'] == str(
            d_feat_pre['ExtraTreesClassifier']
    ) or params['feat_pre'] == 'ExtraTreesClassifier':
        if params['fp0:criterion'] == '0' or params['fp0:criterion'] == 'gini':
            criterion = 'gini'
        elif params['fp0:criterion'] == '1' or params[
                'fp0:criterion'] == 'entropy':
            criterion = 'entropy'
        max_features = int(float(params['fp0:max_features']))
        min_samples_split = int(float(params['fp0:min_samples_split']))
        min_samples_leaf = int(float(params['fp0:min_samples_leaf']))
        if params['fp0:bootstrap'] == '0' or params['fp0:bootstrap'] == 'True':
            bootstrap = True
        elif params['fp0:bootstrap'] == '1' or params[
                'fp0:bootstrap'] == 'False':
            bootstrap = False
        fp = ExtraTreesClassifier(n_estimators=100,
                                  criterion=criterion,
                                  max_features=max_features,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf,
                                  min_weight_fraction_leaf=0.0,
                                  bootstrap=bootstrap,
                                  class_weight=params['class_weight'])

    elif params['feat_pre'] == str(
            d_feat_pre['FastICA']) or params['feat_pre'] == 'FastICA':
        n_components = int(float(params['fp1:n_components']))
        if params['fp1:algorithm'] == '0' or params[
                'fp1:algorithm'] == 'parallel':
            algorithm = 'parallel'
        elif params['fp1:algorithm'] == '1' or params[
                'fp1:algorithm'] == 'deflation':
            algorithm = 'deflation'
        if params['fp1:whiten'] == '0' or params['fp1:whiten'] == 'True':
            whiten = True
        elif params['fp1:whiten'] == '1' or params['fp1:whiten'] == 'False':
            whiten = False
        if params['fp1:fun'] == '0' or params['fp1:fun'] == 'logcosh':
            fun = 'logcosh'
        elif params['fp1:fun'] == '1' or params['fp1:fun'] == 'exp':
            fun = 'exp'
        elif params['fp1:fun'] == '2' or params['fp1:fun'] == 'cube':
            fun = 'cube'
        fp = FastICA(n_components=n_components,
                     algorithm=algorithm,
                     whiten=whiten,
                     fun=fun)

    elif params['feat_pre'] == str(
            d_feat_pre['FeatureAgglomeration']
    ) or params['feat_pre'] == 'FeatureAgglomeration':
        n_clusters = int(float(params['fp2:n_clusters']))
        if params['fp2:linkage+affinity'] == '0' or params[
                'fp2:linkage+affinity'] == 'ward+euclidean':
            linkage = 'ward'
            affinity = 'euclidean'
        elif params['fp2:linkage+affinity'] == '1' or params[
                'fp2:linkage+affinity'] == 'complete+euclidean':
            linkage = 'complete'
            affinity = 'euclidean'
        elif params['fp2:linkage+affinity'] == '2' or params[
                'fp2:linkage+affinity'] == 'complete+manhattan':
            linkage = 'complete'
            affinity = 'manhattan'
        elif params['fp2:linkage+affinity'] == '3' or params[
                'fp2:linkage+affinity'] == 'complete+cosine':
            linkage = 'complete'
            affinity = 'cosine'
        elif params['fp2:linkage+affinity'] == '4' or params[
                'fp2:linkage+affinity'] == 'average+euclidean':
            linkage = 'average'
            affinity = 'euclidean'
        elif params['fp2:linkage+affinity'] == '5' or params[
                'fp2:linkage+affinity'] == 'average+manhattan':
            linkage = 'average'
            affinity = 'manhattan'
        elif params['fp2:linkage+affinity'] == '6' or params[
                'fp2:linkage+affinity'] == 'average+cosine':
            linkage = 'average'
            affinity = 'cosine'
        if params['fp2:pooling_func'] == '0' or params[
                'fp2:pooling_func'] == 'mean':
            pooling_func = np.mean
        elif params['fp2:pooling_func'] == '1' or params[
                'fp2:pooling_func'] == 'median':
            pooling_func = np.median
        elif params['fp2:pooling_func'] == '2' or params[
                'fp2:pooling_func'] == 'max':
            pooling_func = np.max
        fp = FeatureAgglomeration(n_clusters=n_clusters,
                                  linkage=linkage,
                                  affinity=affinity,
                                  pooling_func=pooling_func)

    elif params['feat_pre'] == str(
            d_feat_pre['KernelPCA']) or params['feat_pre'] == 'KernelPCA':
        n_components = int(float(params['fp3:n_components']))
        degree = 3
        coef0 = 1
        gamma = None
        if 'fp3:rbf.gamma' in params:
            kernel = 'rbf'
            gamma = float(params['fp3:rbf.gamma'])
        elif 'fp3:sigmoid.coef0' in params:
            kernel = 'sigmoid'
            coef0 = float(params['fp3:sigmoid.coef0'])
        elif 'fp3:poly.degree' in params and 'fp3:poly.coef0' in params and 'fp3:poly.gamma' in params:
            kernel = 'poly'
            degree = int(float(params['fp3:poly.degree']))
            coef0 = float(params['fp3:poly.coef0'])
            gamma = float(params['fp3:poly.gamma'])
        elif params['fp3:kernel'] == '0' or params['fp3:kernel'] == 'cosine':
            kernel = 'cosine'
        fp = KernelPCA(n_components=n_components,
                       kernel=kernel,
                       degree=degree,
                       coef0=coef0,
                       gamma=gamma)

    elif params['feat_pre'] == str(
            d_feat_pre['RBFSampler']) or params['feat_pre'] == 'RBFSampler':
        gamma = float(params['fp4:gamma'])
        n_components = int(float(params['fp4:n_components']))
        fp = RBFSampler(gamma=gamma, n_components=n_components)

    elif params['feat_pre'] == str(
            d_feat_pre['LinearSVC']) or params['feat_pre'] == 'LinearSVC':
        tol = float(params['fp5:tol'])
        C = float(params['fp5:C'])
        fp = svm.LinearSVC(penalty='l1',
                           loss='squared_hinge',
                           dual=False,
                           tol=tol,
                           C=C,
                           multi_class='ovr',
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=params['class_weight'])

    elif params['feat_pre'] == str(
            d_feat_pre['None']) or params['feat_pre'] == 'None':
        fp = None

    elif params['feat_pre'] == str(
            d_feat_pre['Nystroem']) or params['feat_pre'] == 'Nystroem':
        n_components = int(float(params['fp7:n_components']))
        degree = 3
        coef0 = 1
        gamma = None
        if 'fp7:rbf.gamma' in params:
            kernel = 'rbf'
            gamma = float(params['fp7:rbf.gamma'])
        elif 'fp7:chi2.gamma' in params:
            kernel = 'chi2'
            gamma = float(params['fp7:chi2.gamma'])
        elif 'fp7:sigmoid.coef0' in params and 'fp7:sigmoid.gamma' in params:
            kernel = 'sigmoid'
            coef0 = float(params['fp7:sigmoid.coef0'])
            gamma = float(params['fp7:sigmoid.gamma'])
        elif 'fp7:poly.degree' in params and 'fp7:poly.coef0' in params and 'fp7:poly.gamma' in params:
            kernel = 'poly'
            degree = int(float(params['fp7:poly.degree']))
            coef0 = float(params['fp7:poly.coef0'])
            gamma = float(params['fp7:poly.gamma'])
        elif params['fp7:kernel'] == '0' or params['fp7:kernel'] == 'cosine':
            kernel = 'cosine'
        fp = Nystroem(n_components=n_components,
                      kernel=kernel,
                      degree=degree,
                      coef0=coef0,
                      gamma=gamma)

    elif params['feat_pre'] == str(
            d_feat_pre['PCA']) or params['feat_pre'] == 'PCA':
        n_components = float(params['fp8:n_components'])
        if params['fp8:whiten'] == '0' or params['fp8:whiten'] == 'True':
            whiten = True
        elif params['fp8:whiten'] == '1' or params['fp8:whiten'] == 'False':
            whiten = False
        fp = PCA(n_components=n_components, whiten=whiten)

    elif params['feat_pre'] == str(
            d_feat_pre['PolynomialFeatures']
    ) or params['feat_pre'] == 'PolynomialFeatures':
        degree = int(float(params['fp9:degree']))
        if params['fp9:interaction_only'] == '0' or params[
                'fp9:interaction_only'] == 'True':
            interaction_only = True
        elif params['fp9:interaction_only'] == '1' or params[
                'fp9:interaction_only'] == 'False':
            interaction_only = False
        if params['fp9:include_bias'] == '0' or params[
                'fp9:include_bias'] == 'True':
            include_bias = True
        elif params['fp9:include_bias'] == '1' or params[
                'fp9:include_bias'] == 'False':
            include_bias = False
        fp = PolynomialFeatures(degree=degree,
                                interaction_only=interaction_only,
                                include_bias=include_bias)

    elif params['feat_pre'] == str(
            d_feat_pre['RandomTreesEmbedding']
    ) or params['feat_pre'] == 'RandomTreesEmbedding':
        n_estimators = int(float(params['fp10:n_estimators']))
        max_depth = int(float(params['fp10:max_depth']))
        min_samples_split = int(float(params['fp10:min_samples_split']))
        min_samples_leaf = int(float(params['fp10:min_samples_leaf']))
        fp = RandomTreesEmbedding(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf,
                                  min_weight_fraction_leaf=0,
                                  sparse_output=False)

    elif params['feat_pre'] == str(
            d_feat_pre['SelectPercentile']
    ) or params['feat_pre'] == 'SelectPercentile':
        percentile = int(float(params['fp11:percentile']))
        if params['fp11:score_func'] == '0' or params[
                'fp11:score_func'] == 'chi2':
            score_func = feature_selection.chi2
        elif params['fp11:score_func'] == '1' or params[
                'fp11:score_func'] == 'f_classif':
            score_func = feature_selection.f_classif
        fp = feature_selection.SelectPercentile(score_func=score_func,
                                                percentile=percentile)

    elif params['feat_pre'] == str(
            d_feat_pre['GenericUnivariateSelect']
    ) or params['feat_pre'] == 'GenericUnivariateSelect':
        param = float(params['fp12:param'])
        if params['fp12:score_func'] == '0' or params[
                'fp12:score_func'] == 'chi2':
            score_func = feature_selection.chi2
        elif params['fp12:score_func'] == '1' or params[
                'fp12:score_func'] == 'f_classif':
            score_func = feature_selection.f_classif
        if params['fp12:mode'] == '0' or params['fp12:mode'] == 'fpr':
            mode = 'fpr'
        elif params['fp12:mode'] == '1' or params['fp12:mode'] == 'fdr':
            mode = 'fdr'
        elif params['fp12:mode'] == '2' or params['fp12:mode'] == 'fwe':
            mode = 'fwe'
        fp = feature_selection.GenericUnivariateSelect(param=param,
                                                       score_func=score_func,
                                                       mode=mode)

    return fp
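
A hypothetical invocation sketch, assuming the function above is in scope: the 'fpN:<name>' keys follow the pattern used in the function, while the d_feat_pre mapping and the 'layer_dict_list' layout are assumptions, since their construction lies outside this snippet.

from sklearn.decomposition import PCA  # needed by the branch exercised here

# assumed name-to-index mapping, ordered as in the branches above
d_feat_pre = {name: i for i, name in enumerate(
    ['ExtraTreesClassifier', 'FastICA', 'FeatureAgglomeration', 'KernelPCA',
     'RBFSampler', 'LinearSVC', 'None', 'Nystroem', 'PCA',
     'PolynomialFeatures', 'RandomTreesEmbedding', 'SelectPercentile',
     'GenericUnivariateSelect'])}
params = {
    'layer_dict_list': [None, None, d_feat_pre],  # assumed layout
    'feat_pre': 'PCA',
    'fp8:n_components': '0.9',
    'fp8:whiten': 'False',
}
fp = get_feature_preprocessor(params)
print(fp)  # -> PCA(n_components=0.9)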
Example #6
def main():

    args = getcliargs()
    #args = getcliargs(['-d','testdata/amm/prepped_trainingdata.csv','-t','-1','-o','SVCout'])
    # args = getcliargs(['-d', 'testdata/cccpmeta/cccpmeta_MIDORI418_Insecta_rand20k_prepped_trainingdata.csv', '-t', '2', '-m', '5000', '-o', 'testruns/cccpmeta_MIDORI418_Insecta_rank20k_full'])
    # args = getcliargs(['-d','testdata/cccpmeta_rand1000/prepped_trainingdata.csv','-t','-1','-o', 'testruns/cccpmeta_rand1000_pipetest'])
    train = pd.read_csv(args.data, index_col=0)
    cls = train.pop('class')
    strat = train.pop('stratum')
    # Quick tally of data points per stratum, for inspection.
    print({s: sum(sv == s for sv in strat) for s in set(strat)})
    # Drop these columns because they are tied to identifying the known invalid data
    dropcols = ['n_stops', 'n_nt_ambig', 'n_aa_ambig']
    train = train.drop([d for d in dropcols if d in train], axis=1)

    print(f"\nLoaded {len(cls)} total training data points with "
          f"{len(train.columns)} features after removal of nonindependent "
          "features\n")

    scorers = {
        'precision_score':
        metrics.make_scorer(metrics.precision_score, zero_division=0),
        'recall_score':
        metrics.make_scorer(metrics.recall_score),
        'accuracy_score':
        metrics.make_scorer(metrics.accuracy_score)
    }
    #location = 'cachedir'
    #memory = Memory(location = location, verbose = 10)

    pipe = pipeline.Pipeline([('reduce_dim', 'passthrough'),
                              ('classify',
                               svm.LinearSVC(dual=False,
                                             max_iter=args.maxiter))])

    linsvc_params = {
        'C': 10.0**np.arange(-7, 3),
        'tol': 10.0**np.arange(-4, 0)
    }

    #    param_grid = [
    #            {'reduce_dim': [feature_selection.GenericUnivariateSelect(
    #                            score_func = feature_selection.f_classif)],
    #             'reduce_dim__mode': ['fpr', 'fdr', 'fwe'],
    #             'reduce_dim__param': [0.01, 0.05, 0.1]},
    #            {'reduce_dim': [decomposition.PCA(iterated_power = 7)],
    #             'reduce_dim__n_components': [0.7, 0.8, 0.9]}
    #            ]

    param_grid = [{
        'reduce_dim': [
            feature_selection.GenericUnivariateSelect(
                score_func=feature_selection.f_classif)
        ],
        'reduce_dim__mode': ['fpr'],
        'reduce_dim__param': [0.01, 0.05]
    }]

    param_grid = [
        dict({f"classify__{k}": v
              for k, v in linsvc_params.items()}, **p) for p in param_grid
    ]

    gscvkwargs = {
        'cv': model_selection.StratifiedShuffleSplit(n_splits=10),
        'return_train_score': False,
        'n_jobs': args.threads,
        'pre_dispatch': '2*n_jobs',
        'scoring': scorers,
        'verbose': 1
    }

    gs = GridSearchCV_custom(pipe, param_grid, refit=False, **gscvkwargs)
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        gs.fit(train, cls)

    print("\nCompleted Grid Search")

    #gs = pickle.load(open(f"{args.output}_grid_search_full.pickle", 'rb'))

    with open(f"{args.output}_grid_search_full.pickle", 'wb') as oh:
        pickle.dump(gs, oh)

    models = analyse_results(gs, scorers, args.output, train, cls, strat)

    with open(f"{args.output}_bestestimators.pickle", 'wb') as oh:
        pickle.dump(models, oh)
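
The dict-merge idiom used to build param_grid above (prefixing the classifier's own parameter ranges into every reduce_dim grid entry) in miniature, with toy values:

linsvc_params = {'C': [0.1, 1.0]}
param_grid = [{'reduce_dim__mode': ['fpr']}]
merged = [dict({f"classify__{k}": v for k, v in linsvc_params.items()}, **p)
          for p in param_grid]
print(merged)
# [{'classify__C': [0.1, 1.0], 'reduce_dim__mode': ['fpr']}]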