Example #1
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
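A quick cross-check of the supports above: SelectFwe applies a Bonferroni-style correction, keeping only features whose p-value falls below alpha divided by the number of features. A minimal sketch using the chi2 p-values computed in this test:

import numpy as np

# Bonferroni criterion behind SelectFwe (a sketch, not the scikit-learn source):
# keep feature i iff pvalues[i] < alpha / n_features
pvalues = np.array([0.04550026, 0.39802472])  # from the chi2 call above
alpha = 0.1
print(pvalues < alpha / len(pvalues))  # [ True False] -- matches support_fwe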
Example #2
def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fwe", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert np.sum(np.abs(support - gtruth)) < 2
Example #3
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(f_classif, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_almost_equal(support, gtruth)
Example #4
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fwe", param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert np.sum(support[5:] == 1) < 2
Example #5
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=0.042)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_fwe(training_testing_data, 0.042), training_testing_data[mask_cols])
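The docstring's "0.001 < alpha < 0.05" condition mirrors the clamp TPOT applies inside _select_fwe (shown in Example #7 below). A minimal sketch of that clamp:

def clamp_alpha(alpha: float) -> float:
    # Mirrors the clamp in TPOT's _select_fwe: force 0.001 <= alpha <= 0.05
    return min(max(alpha, 0.001), 0.05)

assert clamp_alpha(0.042) == 0.042  # inside the range: unchanged
assert clamp_alpha(0.5) == 0.05     # too large: clipped to 0.05
assert clamp_alpha(1e-5) == 0.001   # too small: clipped to 0.001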
Example #6
def test_select_fwe_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fwe heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fwe',
                    param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 2
Example #7
File: tpot.py Project: vsolano/tpot
    def _select_fwe(self, input_df, alpha):
        """ Uses Scikit-learn's SelectFwe feature selection to filter the subset of features
           according to p-values corresponding to Family-wise error rate
        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to perform feature selection on
        alpha: float in the range [0.001, 0.05]
            The highest uncorrected p-value for features to keep

        Returns
        -------
        subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the 'best' features

        """
        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)
        training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values

        # forcing  0.001 <= alpha <= 0.05
        if alpha > 0.05:
            alpha = 0.05
        elif alpha <= 0.001:
            alpha = 0.001


        if len(training_features.columns.values) == 0:
            return input_df.copy()

        with warnings.catch_warnings():
            # Ignore warnings about constant features
            warnings.simplefilter('ignore', category=UserWarning)

            selector = SelectFwe(f_classif, alpha=alpha)
            selector.fit(training_features, training_class_vals)
            mask = selector.get_support(True)

        mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group']
        return input_df[mask_cols].copy()
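A hypothetical usage sketch for _select_fwe, assuming a DataFrame laid out as the docstring describes (feature columns plus 'class', 'group', 'guess'); the data below is made up purely for illustration:

import numpy as np
import pandas as pd

# Toy DataFrame in the layout _select_fwe expects (values are illustrative only)
rng = np.random.RandomState(0)
input_df = pd.DataFrame(rng.rand(20, 3), columns=['feat_a', 'feat_b', 'feat_c'])
input_df['class'] = rng.randint(0, 2, size=20)
input_df['group'] = ['training'] * 15 + ['testing'] * 5
input_df['guess'] = 0

# tpot_obj = TPOT()
# subset_df = tpot_obj._select_fwe(input_df, 0.042)
# subset_df would keep only the surviving feature columns plus 'guess', 'group', 'class'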
Example #8
dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
    RFE(estimator=ExtraTreesClassifier(n_estimators=100))
]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip',
                         sep='\t').sample(frac=1.,
                                          replace=False,
                                          random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
Example #9
def train_predict_and_test(model,
                           target_name,
                           train_features,
                           train_labels,
                           test_features,
                           test_labels,
                           feature_selection=None):
    classification = (target_name == Phenotypes.DIAGNOSED_ASTHMA
                      or target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL)

    # Standardize data
    standardized = False
    if model == Models.MLP or model == Models.SVM:
        print("Standardizing data..")
        standardized = True
        features_mean = train_features.mean()
        features_std = train_features.std()
        train_features = (train_features - features_mean) / features_std
        test_features = (test_features - features_mean) / features_std

        if not classification:
            labels_mean = train_labels.mean()
            labels_std = train_labels.std()
            train_labels = (train_labels - labels_mean) / labels_std
            test_labels = (test_labels - labels_mean) / labels_std

    # Load optimized params
    params = load_optimized_params(model, target_name)

    # Features selection
    feature_selector = VarianceThreshold(threshold=0).fit(
        train_features)  # Removing features with 0 variance
    train_col, test_col = train_features.columns, test_features.columns
    train_features = pd.DataFrame(feature_selector.transform(train_features),
                                  columns=train_col)
    test_features = pd.DataFrame(feature_selector.transform(test_features),
                                 columns=test_col)
    if feature_selection == "fwe":
        print("Selecting features according to Family-Wise Error")
        # alpha = 5e-2
        alpha = 0.3
        if params is not None:
            try:
                alpha = params['transformer_alpha']
            except KeyError:
                print(
                    "Cannot find parameter alpha for FWE feature selector. Using default value"
                )

        features_selector = SelectFwe(f_regression,
                                      alpha=alpha).fit(train_features,
                                                       train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "kbest":
        k = 150
        if params is not None:
            try:
                k = params['k']
            except KeyError:
                print(
                    "Cannot find parameter k for k-best feature selector. Using default value: k=",
                    k)
        print("Selecting k-best features:", k)
        score_func = f_regression
        if classification:
            score_func = f_classif
        features_selector = SelectKBest(score_func=score_func, k=k)
        features_selector = features_selector.fit(train_features, train_labels)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "tree":
        print("Selecting features from RF feature importance")
        clf = RandomForestRegressor(n_estimators=100).fit(
            train_features, train_labels)
        if classification:
            clf = RandomForestClassifier(n_estimators=100).fit(
                train_features, train_labels)
        features_selector = SelectFromModel(clf, prefit=True)
        train_features = features_selector.transform(train_features)
        test_features = features_selector.transform(test_features)
    elif feature_selection == "corr":
        threshold = 0.9  # Recommended default value
        col_corr = set()
        corr_matrix = train_features.corr()
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > threshold:
                    colname = corr_matrix.columns[i]
                    col_corr.add(colname)
        train_features = train_features.drop(col_corr, axis=1)
        test_features = test_features.drop(col_corr, axis=1)

    # Oversampling
    if classification and model != Models.SVM and model != Models.CART and model != Models.ELASTIC:
        print("Oversampling features..")
        if target_name == Phenotypes.DIAGNOSED_ASTHMA:
            sampling_strat = 0.5
        else:
            sampling_strat = {
                0: np.max(np.bincount(train_labels)) // 4,
                1: np.max(np.bincount(train_labels)),
                2: np.max(np.bincount(train_labels)),
                3: np.max(np.bincount(train_labels)) // 2
            }
        oversampler = imblearn.over_sampling.RandomOverSampler(
            sampling_strategy=sampling_strat, random_state=42)
        # oversampler = imblearn.over_sampling.SMOTE(sampling_strategy=1.0,
        #                                          k_neighbors=5,
        #                                          random_state=42)
        train_features, train_labels = oversampler.fit_resample(
            train_features, train_labels)

    if model == Models.RF:
        if target_name == Phenotypes.BODY_MASS_INDEX_CATEGORICAL:
            # Create validation set for threshold optimization
            val_features, test_features, val_labels, test_labels = train_test_split(
                test_features, test_labels, test_size=0.5, random_state=42)
            model, predictions = _predict_rf(target_name, train_features,
                                             train_labels, val_features,
                                             val_labels)
        else:
            model, predictions = _predict_rf(target_name,
                                             train_features,
                                             train_labels,
                                             test_features,
                                             test_labels,
                                             params=params)
    elif model == Models.ELASTIC:
        model, predictions = predict_elastic_net(target_name, train_features,
                                                 train_labels, test_features,
                                                 test_labels)
    elif model == Models.XGB:
        model, predictions = _predict_xgb(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.MLP:
        model, predictions = _predict_mlp(target_name,
                                          train_features,
                                          train_labels,
                                          test_features,
                                          test_labels,
                                          params=params)
    elif model == Models.SVM:
        model, predictions = _predict_svm(target_name, train_features,
                                          train_labels, test_features,
                                          test_labels)
    elif model == Models.CART:
        model, predictions = _predict_cart(target_name, train_features,
                                           train_labels, test_features,
                                           test_labels)
    elif model == Models.NAIVE:
        if not classification:
            predictions = predict_naive(train_features, train_labels,
                                        test_features, test_labels)
        else:
            raise SystemExit("Cannot use naive model on classification task")
    else:
        raise SystemExit("Unknown model:", model)

    # Destandardize results
    if standardized and not classification:
        print("Destandardizing data..")
        predictions = (predictions * labels_std) + labels_mean
        test_labels = (test_labels * labels_std) + labels_mean

    # Print results
    if classification:
        print_classification_metrics(ground_truth=test_labels,
                                     predictions=predictions,
                                     num_classes=test_labels.nunique())
    else:
        print_regression_metrics(ground_truth=test_labels,
                                 predictions=predictions)

    return model, predictions
Example #10
class Voting:
    __n_neighbors = 7
    __feature_methods = {
        "k_chi_3": SelectKBest(chi2, k=3),
        "k_chi_4": SelectKBest(chi2, k=4),
        "k_chi_5": SelectKBest(chi2, k=5),
        "k_fclassif_3": SelectKBest(f_classif, k=3),
        "k_fclassif_4": SelectKBest(f_classif, k=4),
        "k_fclassif_5": SelectKBest(f_classif, k=5),
        "k_mutual_3": SelectKBest(mutual_info_classif, k=3),
        "k_mutual_4": SelectKBest(mutual_info_classif, k=4),
        "k_mutual_5": SelectKBest(mutual_info_classif, k=5),
        "fpr_chi_01": SelectFpr(chi2, alpha=0.1),
        "fpr_chi_005": SelectFpr(chi2, alpha=0.05),
        "fpr_chi_001": SelectFpr(chi2, alpha=0.01),
        "fpr_fclassif_01": SelectFpr(f_classif, alpha=0.1),
        "fpr_fclassif_005": SelectFpr(f_classif, alpha=0.05),
        "fpr_fclassif_001": SelectFpr(f_classif, alpha=0.01),
        "fnr_chi_01": SelectFdr(chi2, alpha=0.1),
        "fnr_chi_005": SelectFdr(chi2, alpha=0.05),
        "fnr_chi_001": SelectFdr(chi2, alpha=0.01),
        "fnr_fclassif_01": SelectFdr(f_classif, alpha=0.1),
        "fnr_fclassif_005": SelectFdr(f_classif, alpha=0.05),
        "fnr_fclassif_001": SelectFdr(f_classif, alpha=0.01),
        "fwe_chi_01": SelectFwe(chi2, alpha=0.1),
        "fwe_chi_005": SelectFwe(chi2, alpha=0.05),
        "fwe_chi_001": SelectFwe(chi2, alpha=0.01),
        "fwe_fclassif_01": SelectFwe(f_classif, alpha=0.1),
        "fwe_fclassif_005": SelectFwe(f_classif, alpha=0.05),
        "fwe_fclassif_001": SelectFwe(f_classif, alpha=0.01),
    }

    def get_feature_method_names(self):
        return self.__feature_methods.keys()

    def learn(self, feature_method_name: str = None) -> List[int]:
        labels, train_array, test_array = LearnUtils.get_learn_data()
        if feature_method_name is not None:
            feature_filter = self.__feature_methods[feature_method_name]
            feature_filter.fit(train_array, labels)
            train_array = feature_filter.transform(train_array)
            test_array = feature_filter.transform(test_array)
        clf = self.__create_classifier()
        clf.fit(train_array, labels)
        return clf.predict(test_array).tolist()

    def cross_validation(self) -> List[float]:
        labels, train_array = LearnUtils.get_cross_val_data()
        clf = self.__create_classifier()
        return cross_val_score(clf, train_array, labels, cv=6)

    def __create_classifier(self):
        knn = KNN().get_classifier()
        svc = SVCMethod().get_classifier()
        random_forest = RandomForest().get_classifier()
        bayes = Bayes().get_classifier()

        return VotingClassifier(estimators=[("knn", knn), ("svc", svc),
                                            ("rf", random_forest),
                                            ("bayes", bayes)],
                                voting="soft")
Example #11
def GetAllPerf (filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)


    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1

        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath) # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        KFilt=None
        KFilt=350  #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM == True:
            # NOTE: despite the "L1 SVM" label, the original uses penalty="l2";
            # class_weight='auto' was renamed 'balanced' in later scikit-learn versions.
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False, class_weight='balanced').fit(X, y)
            X_L1 = svc_L1.transform(X)
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:", X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='balanced')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)

            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'

        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()

        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none

        # print("Dummy classifiers output:")

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
##        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep='\t')  # tab-separated to match the .tsv extension
Example #12
    feature_cols=np.array(feature_cols)


    # In[ ]:

    X=df[feature_cols].values
    y=df.classname.values

    # In[ ]:
    le = LabelEncoder()
    y = le.fit_transform(y)


    # In[ ]:
    print("Orig X -> ",X.shape)
    Fwe = SelectFwe(alpha=0.001).fit(X,y)
    X=Fwe.transform(X)
    print("F-test -> ",X.shape)
    feature_cols=feature_cols[Fwe.get_support()]

# In[ ]:

    rf = RandomForestClassifierWithCoef(max_depth= 9, min_samples_split= 3, min_samples_leaf= 3, n_estimators= 650,  n_jobs= -1, max_features= "auto")


    # In[ ]:

    scores = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.3))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
#    scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1')
#    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
Example #13
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8185185185185185
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.026000000000000002),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="gini",
                         max_features=0.35000000000000003,
                         min_samples_leaf=3,
                         min_samples_split=16,
                         n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #14
def train_classi(model_name, inputs, X_pos, y_pos, X, y, X_neg, y_neg):
    scaler = None
    model_type = inputs['model_type']
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']

    if (model_type == "tpot"):
        logging_info("Training model... %s", str(model_type))

        from sklearn.pipeline import make_pipeline

        if (model_name == "tpot_select"):
            clf = tpot_classi(inputs)
        elif (model_name == "SVM"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.preprocessing import StandardScaler
            #from sklearn.svm import LinearSVC
            from sklearn.svm import SVC

            # Pipeline from tpot
            #clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
            # Cross validate with C vals - default is 1
            # LinearSVC does not have a predict_proba function
            clf = make_pipeline(
                StandardScaler(),
                SVC(kernel='linear',
                    probability=True,
                    random_state=0,
                    tol=1e-5))
        elif (model_name == "estimator_SVM"):

            from sklearn.ensemble import GradientBoostingClassifier
            from sklearn.feature_selection import SelectFwe, f_classif
            from sklearn.linear_model import LogisticRegression
            from sklearn.pipeline import make_pipeline, make_union
            #from sklearn.svm import LinearSVC
            from tpot.builtins import StackingEstimator
            from xgboost import XGBClassifier

            # Score on the training set was:0.968003998605
            #clf = make_pipeline(StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=9, max_features=0.05, min_samples_leaf=2, min_samples_split=17, n_estimators=100, subsample=1.0)),SelectFwe(score_func=f_classif, alpha=0.02),StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True, penalty="l2")),StackingEstimator(estimator=XGBClassifier(learning_rate=0.001, max_depth=7, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.65)),LinearSVC(C=1.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.001))

            clf = make_pipeline(
                StackingEstimator(
                    estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                         max_depth=9,
                                                         max_features=0.05,
                                                         min_samples_leaf=2,
                                                         min_samples_split=17,
                                                         n_estimators=100,
                                                         subsample=1.0)),
                SelectFwe(score_func=f_classif, alpha=0.02),
                StackingEstimator(estimator=LogisticRegression(
                    C=1.0, dual=True, penalty="l2")),
                StackingEstimator(estimator=XGBClassifier(learning_rate=0.001,
                                                          max_depth=7,
                                                          min_child_weight=16,
                                                          n_estimators=100,
                                                          nthread=1,
                                                          subsample=0.65)),
                SVC(kernel='linear', probability=True, C=1.0, tol=0.001))
        elif (model_name == "log_reg"):
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import ExtraTreesClassifier
            from sklearn.linear_model import LogisticRegression
            from tpot.builtins import StackingEstimator, ZeroCount

            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                ZeroCount(),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.6,
                                                   min_samples_leaf=4,
                                                   min_samples_split=6,
                                                   n_estimators=100)),
                LogisticRegression(C=15.0, dual=False, penalty="l2"))

        elif (model_name == "extra_trees"):
            from sklearn.ensemble import ExtraTreesClassifier
            from tpot.builtins import StackingEstimator

            from sklearn.pipeline import make_pipeline, make_union
            from sklearn.preprocessing import Normalizer
            from sklearn.preprocessing import FunctionTransformer
            from copy import copy

            # Score on the training set was:0.948305771055
            clf = make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    make_pipeline(
                        StackingEstimator(estimator=ExtraTreesClassifier(
                            bootstrap=False,
                            criterion="gini",
                            max_features=0.25,
                            min_samples_leaf=8,
                            min_samples_split=11,
                            n_estimators=100)), Normalizer(norm="l1"))),
                StackingEstimator(
                    estimator=ExtraTreesClassifier(bootstrap=False,
                                                   criterion="entropy",
                                                   max_features=0.75,
                                                   min_samples_leaf=15,
                                                   min_samples_split=18,
                                                   n_estimators=100)),
                ExtraTreesClassifier(bootstrap=True,
                                     criterion="entropy",
                                     max_features=0.85,
                                     min_samples_leaf=5,
                                     min_samples_split=4,
                                     n_estimators=100))

        else:  # Random forest
            logging_info("Training model... %s", str(model_name))
            # Imports from tpot output
            from sklearn.ensemble import RandomForestClassifier
            from sklearn.feature_selection import VarianceThreshold
            from sklearn.preprocessing import PolynomialFeatures

            # Pipeline from tpot
            # Score on humap was:0.986160063433
            clf = make_pipeline(
                VarianceThreshold(threshold=0.05),
                PolynomialFeatures(degree=2,
                                   include_bias=False,
                                   interaction_only=False),
                RandomForestClassifier(bootstrap=False,
                                       criterion="entropy",
                                       max_features=0.35,
                                       min_samples_leaf=1,
                                       min_samples_split=11,
                                       n_estimators=100))

        clf.fit(X, y)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        #Training accuracy

        acc_overall_train = clf.score(X, y)
        acc_pos_train = clf.score(X_pos, y_pos)
        acc_neg_train = clf.score(X_neg, y_neg)

        res_pos = clf.predict(X_pos)
        res = clf.predict(X_neg)

        n_pos = len(X_pos)
        n_neg = len(X_neg)

        acc, acc_neg, Recall, Precision, F1_score = calc_metrics(
            res, res_pos, n_neg, n_pos)
        analyze_sizewise_accuracies(
            X_pos, res_pos, X_neg, res,
            out_comp_nm + '_size_wise_accuracies_train.png')
        train_fit_probs = clf.predict_proba(X)[:, 1]
        train_aps = sklearn_metrics_average_precision_score(y, train_fit_probs)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("Training set average precision score = %.3f" % train_aps,
                  file=fid)

        model = clf

        if hasattr(model, 'decision_function'):
            score = model.decision_function(X_neg)
            np_savetxt(out_comp_nm + '_train_neg_score.out', score)
            score = model.decision_function(X_pos)
            np_savetxt(out_comp_nm + '_train_pos_score.out', score)

    elif (model_type == "NN"):

        # Standardizing the feature matrix
        from sklearn import preprocessing
        scaler = preprocessing.StandardScaler().fit(X)

        X = scaler.transform(X)

        # Scaling X_pos and X_neg as well now for testing with them later
        X_pos = scaler.transform(X_pos)
        X_neg = scaler.transform(X_neg)

        import tensorflow as tf
        from tensorflow import keras

        #tf.enable_eager_execution() # Fix ensuing errors

        logging_info("Training model... %s", str(model_type))

        # multi-layer perceptron
        # For most problems, one could probably get decent performance (even
        # without a second optimization step) by setting the hidden-layer
        # configuration with just two rules: (i) use one hidden layer; and
        # (ii) size it as the mean of the input and output layer sizes.
        print()
        dims = X.shape
        n_feats = dims[1]
        n_classes = 2
        logging_info("No. of nodes in input layer = %s", str(n_feats))
        logging_info("No. of nodes in output layer (since softmax) = %s",
                     str(n_classes))
        hidden_nodes = int((n_feats + n_classes) / 2)
        logging_info("No. of nodes in the one hidden layer = %s",
                     str(hidden_nodes))
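        # Worked example of the rule above: with n_feats = 20 inputs and
        # n_classes = 2 softmax outputs, hidden_nodes = (20 + 2) // 2 = 11.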
        model = keras.Sequential([
            keras.layers.Dense(n_feats, activation=tf.nn.relu),
            keras.layers.Dense(hidden_nodes, activation=tf.nn.relu),
            keras.layers.Dense(n_classes, activation=tf.nn.softmax)
        ])
        #model = keras.Sequential([keras.layers.Dense(n_feats, activation = tf.nn.relu), keras.layers.Dense(n_classes, activation = tf.nn.softmax)])
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        N_epochs = 1000
        model.fit(X, y, epochs=N_epochs, verbose=0)
        with open(out_comp_nm + '_metrics.out', "a") as fid:
            print("No. of epochs = ", N_epochs, file=fid)

        logging_info("Finished Training model")
        logging_info("Evaluating training accuracy...")
        loss_overall, acc_overall_train = model.evaluate(X, y, verbose=0)
        loss_pos, acc_pos_train = model.evaluate(X_pos, y_pos, verbose=0)
        loss_neg, acc_neg_train = model.evaluate(X_neg, y_neg, verbose=0)
    else:
        print("Model type not found")

    logging_info("Finished Evaluating training accuracy.")
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("Accuracy overall train = %.3f" % acc_overall_train, file=fid)
        print("Accuracy positive train = %.3f" % acc_pos_train, file=fid)
        print("Accuracy negative train = %.3f" % acc_neg_train, file=fid)
        print("Train Precision = %.3f" % Precision, file=fid)
        print("Train Recall = %.3f" % Recall, file=fid)
        print("Train F1 score = %.3f" % F1_score, file=fid)
    return model, scaler
Example #15
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
                     PolynomialFeatures(), RobustScaler(), StandardScaler(),
                     FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
                     SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
                     SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                     RFE(estimator=ExtraTreesClassifier(n_estimators=100))]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for (preprocessor, C, loss, fit_intercept) in itertools.product(
                preprocessor_list,
                [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1., 10., 50., 100.],
                ['hinge', 'squared_hinge'],
                [True, False]):
        features = input_data.drop('class', axis=1).values.astype(float)
Example #16
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor
# (import block completed; the original snippet's imports were truncated)

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-15.203873985130153
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.026000000000000002),
    StackingEstimator(estimator=LinearSVR(C=10.0,
                                          dual=False,
                                          epsilon=0.001,
                                          loss="squared_epsilon_insensitive",
                                          tol=0.01)),
    StackingEstimator(
        estimator=GradientBoostingRegressor(alpha=0.75,
                                            learning_rate=0.001,
                                            loss="quantile",
                                            max_depth=2,
                                            max_features=0.35000000000000003,
                                            min_samples_leaf=15,
                                            min_samples_split=17,
                                            n_estimators=100,
                                            subsample=0.7500000000000001)),
)
# NOTE: the snippet is truncated here; the pipeline's final estimator and the
# fit/predict calls are missing from the original.
Example #17
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import OneHotEncoder, StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.3656802383316783
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(SelectFwe(score_func=f_classif, alpha=0.008),
                      OneHotEncoder(minimum_fraction=0.1),
                      SelectPercentile(score_func=f_classif, percentile=13)),
        FunctionTransformer(copy)),
    LinearSVC(C=0.1, dual=False, loss="squared_hinge", penalty="l2", tol=0.01))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
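A note on FunctionTransformer(copy) in these exported pipelines: inside make_union it acts as an identity branch, so the final estimator sees the selected/encoded features concatenated with an untouched copy of the originals. A minimal, self-contained sketch of that behavior:

import numpy as np
from copy import copy
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

X = np.arange(6).reshape(3, 2)
union = make_union(FunctionTransformer(copy), FunctionTransformer(copy))
# Each branch passes X through unchanged; FeatureUnion concatenates the results.
print(union.fit_transform(X).shape)  # (3, 4)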
Example #18
]

for eachDataset in datasetNames:
    print(eachDataset)
    X_sparse, y_full = load_svmlight_file("OriginalDatasets/" + eachDataset)
    X_full = X_sparse.toarray()
    methodsNames = ["full", "pca", "chi2", "fs", "et", "fpr", "fdr", "fwe"]
    methodsFS = [
        SelectPercentile(chi2, percentile=100),
        PCA(n_components=0.9),
        SelectPercentile(chi2, percentile=75),
        SelectPercentile(f_classif, percentile=75),
        ExtraTreesClassifier(random_state=0),
        SelectFpr(),
        SelectFdr(),
        SelectFwe()
    ]
    for x in range(0, len(methodsFS)):
        fsm = methodsFS[x]
        fsm.fit(X_full, y_full)
        X_redu = fsm.transform(X_full)

        # Some algorithms fail and select 0 features; let's fix that
        if len(X_redu[0]) < 3:
            tmpMethod = SelectKBest(chi2, k=3)
            tmpMethod.fit(X_full, y_full)
            X_redu = tmpMethod.transform(X_full)

        fileOut = open(
            "ReducedDatasets/" + eachDataset + "_" + methodsNames[x], 'wb')
        dump_svmlight_file(X_redu, y_full, fileOut, zero_based=False)
Example #19
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9822176245306669
exported_pipeline = make_pipeline(
    ZeroCount(), SelectFwe(score_func=f_classif, alpha=0.015),
    KNeighborsClassifier(n_neighbors=5, p=2, weights="uniform"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #20
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
# (import block completed; the original snippet's imports were truncated)

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8255806224382443
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=91),
    StackingEstimator(
        estimator=MLPClassifier(alpha=0.0001, learning_rate_init=0.001)),
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=0.01,
                                             max_depth=7,
                                             max_features=0.6000000000000001,
                                             min_samples_leaf=15,
                                             min_samples_split=4,
                                             n_estimators=100,
                                             subsample=0.05)),
    SelectFwe(score_func=f_classif, alpha=0.035),
    MultinomialNB(alpha=0.1, fit_prior=False))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #21
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MaxAbsScaler, RobustScaler
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy
# (import block completed; the original snippet's imports were truncated)

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:-148.31589276097782
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(MaxAbsScaler(), RobustScaler(), ZeroCount(),
                      SelectFwe(score_func=f_regression, alpha=0.038)),
        FunctionTransformer(copy)),
    XGBRegressor(learning_rate=0.1,
                 max_depth=9,
                 min_child_weight=15,
                 n_estimators=100,
                 nthread=1,
                 subsample=1.0))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #22
filename = "../training_data/ordered_tweets_no_duplicates.txt"
tweets_and_labels = parse_labeled_data(filename)
# print tweets_and_labels
# random.shuffle(tweets_and_labels)
Y, X = get_x_y(tweets_and_labels)
# X, Y = make_moons(noise=0.3, random_state=0)
# print X, Y
# print nX[0], nY[0]
# splitting training and test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# C = regularization parameter (keeps from overfitting): C is the degree of penalty (L1 or L2) (powers of 10)
# penalty sparse = l2 lowers angle so that no unigram can be super weighted, l1 removes features to shift the curve
# TODO: separate into train test eval

fs = SelectFwe(alpha=700.0)  # NOTE: alpha is normally a p-value in (0, 1]; 700.0 is far outside that range
print("Before", x_train.shape)

clf = svm.LinearSVC(C=100, penalty="l2", dual=False)
clf.fit(x_train, y_train)

print("NO FEATURE SELECTION")
print("Training Accuracy")
print(clf.decision_function(x_train))
print(classification_report(y_train, clf.predict(x_train), target_names=target_names))

print("Testing Accuracy")
print(classification_report(y_test, clf.predict(x_test), target_names=target_names))


x_train = fs.fit_transform(x_train, y_train)
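The snippet stops right after fitting the selector on the training split. A natural continuation (an assumption; the original is truncated here) is to apply the same fitted selector to the test split and retrain:

# Continuation sketch (not in the original): reuse the fitted selector on the
# test split, then retrain the classifier on the reduced feature set.
x_test_fs = fs.transform(x_test)

clf_fs = svm.LinearSVC(C=100, penalty="l2", dual=False)
clf_fs.fit(x_train, y_train)  # x_train was reduced by fs.fit_transform above
print("After", x_train.shape)
print(classification_report(y_test, clf_fs.predict(x_test_fs), target_names=target_names))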
Example #23
    'TODO: Allow user to select desired function - CV model, or feature reduction'
    'TODO: Use os.path.join - for file names/locations/dirs..'
    #Set by user input:
    fileName = r'/trainingSetFeatures.csv'
    filePath = str(argv[1])
    X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels

    print(X.shape,"= (samples, features)")

    y_inv = Counter(lb_encoder.inverse_transform(y))
    print("Classes:", y_inv)

    # 'Normalize/Scale features if needed. Our data is standardized by default'
    # X = StandardScaler(copy=False).fit_transform(X)

    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    featureNames=featureNames[Fwe.get_support()]
    print("F-test filter ->",X.shape)

    FeatSelection_SVM=True
    FeatSelection_RandLogReg=False

    if FeatSelection_RandLogReg == True:
        LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
         sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1)
        X = LogRegFeats.fit_transform(X,y)
        featureNames=featureNames[LogRegFeats.get_support()]
        print("RandomizedLogisticRegression Feature Selection ->:",X.shape)

    elif FeatSelection_SVM == True:
Example #24
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from PipeTasks import Get_yPred,balance_weights

# Import some data to play with
#########################################
os.chdir(r'/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile')
##os.chdir(r'/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SP_Cleaved+NP+Neg_Big')

df = pd.read_csv('trainingSetFeatures.csv')
##    df.drop('proteinname',axis=1, inplace=True)
feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']]
X=df[feature_cols].values
y=df.classname.values

Fwe = SelectFwe(alpha=0.01).fit(X,y)
X=Fwe.transform(X)

le = LabelEncoder()
y = le.fit_transform(y)
# Binarize the output
# y = label_binarize(y, classes=[0, 1, 2])
# y = label_binarize(y)

##n_classes = y.shape[1]
n_classes=len(set(y))
target_names=list(le.classes_)
print ("n_classes",n_classes,"target_names",target_names)
# shuffle and split training and test sets
##X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
##                                                    random_state=0)
Example #25
    model_fs = SelectKBest(f_classif,
                           k=15).fit(x_train,
                                     y_train)  # grid search for the parameter

#-- method 2-2 SelectFdr: f_classif
if method == 'select_fdr_f_classif':
    from sklearn.feature_selection import SelectFdr
    from sklearn.feature_selection import f_classif
    model_fs = SelectFdr(f_classif, alpha=1e-7).fit(
        x_train, y_train)  # grid search for the parameter

#-- method 2-3 SelectFwe: f_classif
if method == 'select_fwe_f_classif':
    from sklearn.feature_selection import SelectFwe
    from sklearn.feature_selection import f_classif
    model_fs = SelectFwe(f_classif, alpha=0.0001).fit(x_train, y_train)

#-- method 3 RFECV: SVC
if method == 'rfecv_svc':
    from sklearn.feature_selection import RFECV
    from sklearn.svm import SVC
    svc = SVC(kernel="linear")
    model_fs_pre = RFECV(estimator=svc, step=1, cv=5)
    model_fs = model_fs_pre.fit(x_train, y_train)

#-- method 4-1 select from model: LinearSVC (L1-based)
if method == 'select_from_model_linear_svc':
    from sklearn.svm import LinearSVC
    from sklearn.feature_selection import SelectFromModel
    model_fs_pre = LinearSVC(C=0.01, penalty="l1",
                             dual=False)  # grid search for the parameter
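
The example above breaks off after defining model_fs_pre; a hedged sketch of how an L1-penalized LinearSVC is commonly wrapped with SelectFromModel (the wiring and data here are assumptions, not the original code):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
svc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model_fs = SelectFromModel(svc, prefit=True)  # keeps features with non-zero L1 coefficients
print(model_fs.transform(X).shape)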
Ejemplo n.º 26
0
def GetAllPerf (filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)


    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1

        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        KFilt = None
        KFilt = 350  # Temporary pre-filter for these outputs; saves computation but barely filters compared to the model itself. Set to None to skip.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM:
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False, class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X)  # transform takes X only; passing y would be read as the threshold argument
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:", X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.transform(X)  # rfecv was already fitted above; refitting is unnecessary
            print("X_RFE",X_RFE.shape)

            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

            'Set GetRFEPerf to True (or expose it to the user) if performance of the reduced feature set is wanted'
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'

        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()

        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none

        # print("Dummy classifiers output:")

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        dummy_freq_f1_weighted = '{:.3}'.format(metrics.f1_score(y, y_dummyPred, average='micro'))  # matches f1_scorer's metric; a make_scorer object expects (estimator, X, y), not predictions
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
##        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1  # superseded by dummy_freq_f1_mean below
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
Ejemplo n.º 27
0
def run():

    target_names = ["Self", "Another Person", "General Statement"]
    tweets_and_labels = parse_labeled_data(filename)

    #splitting training and test set
    y_train, x_test, x_train = get_x_y(tweets_and_labels, testdata)

    #Chi-Squared Analysis
    sel = SelectPercentile(chi2, percentile=80)
    sel.fit(x_train, y_train)
    x_train = sel.transform(x_train)
    x_test = sel.transform(x_test)

    #Univariate Feature Selection
    fs = SelectFwe(alpha=0.05)  # alpha must be a p-value threshold in (0, 1]; a value like 150.0 keeps every feature
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)

    #Classifier Fitting
    clf = svm.LinearSVC(C=10,
                        penalty='l2',
                        loss='hinge',  # 'l1' was the pre-0.16 name for the hinge loss
                        dual=True,
                        fit_intercept=False,
                        class_weight='balanced')  # 'auto' was renamed to 'balanced'
    clf.fit(x_train, y_train)

    returned = clf.predict(x_test)
    print(returned)
    #Print relevant usernames & tweets to .csv file
    t = time.strftime("%d_%m_%Y")
    output1 = 'classifications/' + t + '_self.csv'
    output2 = 'classifications/' + t + '_another_person.csv'
    with open(output1, 'w+') as o1:
        wr = csv.writer(o1, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 0:
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)

    with open(output2, 'w+') as o2:
        wr = csv.writer(o2, quoting=csv.QUOTE_ALL)
        for i, val in enumerate(returned):
            if val == 1:
                row = [testdata[i][1], testdata[i][0]]
                wr.writerow(row)

    ########################################################################
    '''Graphing of Data'''
    '''Note, since there is no annotation for test data'''
    '''This is a visual representation of output data, not model accuracy'''
    ########################################################################

    graph = True
    if graph:
        #Graph setup
        X, Y, Z, new_y = graph_setup(clf, x_test, returned)
        #graph Scatter Plot of training data
        graph_scatter(x_train, y_train)
        #Graph 3D Plot of test data
        graph_3d(X, Y, Z, new_y)
        #Graph 2-D Plot of test data
        graph_2d(X, Y, new_y)
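
For context on SelectFwe's alpha parameter in snippets like the one above: SelectFwe applies a Bonferroni-style cut, keeping features whose p-value is below alpha / n_features, so alpha must be a probability in (0, 1]. A sketch (synthetic data assumed):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFwe, f_classif

X, y = make_classification(n_samples=100, n_features=10, n_informative=3, random_state=1)
fwe = SelectFwe(f_classif, alpha=0.05).fit(X, y)
print(fwe.pvalues_ < 0.05 / X.shape[1])  # the mask SelectFwe actually applies
print(fwe.get_support())                 # identical Bonferroni cut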
Ejemplo n.º 28
0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9147335715485821
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.023), MinMaxScaler(),
    LogisticRegression(C=1.0, dual=False, penalty="l1", solver="liblinear"))  # liblinear supports the l1 penalty

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
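
A short usage note continuing the script above (not part of the TPOT export): scoring the refitted pipeline on the held-out split gives a quick sanity check against the reported training score.

from sklearn.metrics import accuracy_score
print(accuracy_score(testing_target, results))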
Ejemplo n.º 29
0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, SelectPercentile, f_regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Normalizer, RobustScaler
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-1.969947855356932
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=85),
    Normalizer(norm="l2"), RobustScaler(),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=2, min_samples_leaf=8, min_samples_split=5)),
    SelectFwe(score_func=f_regression, alpha=0.014), RobustScaler(),
    StackingEstimator(estimator=RidgeCV()),
    StackingEstimator(estimator=DecisionTreeRegressor(
        max_depth=2, min_samples_leaf=2, min_samples_split=11)), RidgeCV())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Ejemplo n.º 30
0
import pandas as pd

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import f_classif
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

training_features = result1.loc[training_indices].drop(['class', 'group', 'guess'], axis=1)
training_class_vals = result1.loc[training_indices, 'class'].values
if len(training_features.columns.values) == 0:
    result1 = result1.copy()
else:
    selector = SelectFwe(f_classif, alpha=0.05)
    selector.fit(training_features.values, training_class_vals)
    mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + ['class']
    result1 = result1[mask_cols]

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(8, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
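
The snippet above calls get_support(True) to obtain integer column indices; called without arguments, get_support() returns the equivalent boolean mask. A minimal comparison (synthetic data assumed):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFwe, f_classif

X, y = make_classification(n_samples=80, n_features=6, n_informative=2, random_state=2)
sel = SelectFwe(f_classif, alpha=0.1).fit(X, y)
print(sel.get_support(indices=True))      # positions of the kept columns
print(np.flatnonzero(sel.get_support()))  # same positions via the boolean mask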
Ejemplo n.º 31
0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.8551331638356954
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(
            estimator=DecisionTreeClassifier(criterion="gini",
                                             max_depth=5,
                                             min_samples_leaf=4,
                                             min_samples_split=14)),
        FunctionTransformer(copy)), SelectFwe(score_func=f_classif,
                                              alpha=0.045),
    KNeighborsClassifier(n_neighbors=60, p=1, weights="distance"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
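
A rough sketch of what the make_union branch above produces: one branch passes the original features through unchanged while the other appends extra columns (here a hand-rolled stand-in for StackingEstimator's appended predictions), after which SelectFwe filters the widened matrix.

import numpy as np
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer

X = np.arange(6, dtype=float).reshape(3, 2)
union = make_union(FunctionTransformer(lambda A: A),  # identity branch
                   FunctionTransformer(lambda A: A.sum(axis=1, keepdims=True)))
print(union.fit_transform(X))  # original columns plus the appended column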
Ejemplo n.º 32
0
def main(args):
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
#        args.train_dir =  r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir =  r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir =  r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    feature_cols = [col for col in df.columns if col not in ['classname','Id','proteinname']]
    feature_cols=numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values

    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    print("F-test -> ",X.shape)
    feature_cols=feature_cols[Fwe.get_support()]
    '''
    FeatSelection_SVM = True
    if FeatSelection_SVM == True:
        svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
        X = svc_L1.transform(X, y)
        print ("L1 SVM Transformed X:",X_L1.shape)
        feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
    '''


    k = SelectKBest(k=255).fit(X,y)
    X=k.transform(X)
    feature_cols=feature_cols[k.get_support()]


    param_dist = {"max_depth": [6,9, None],
                  "max_features": ['auto',0.4],
                  "min_samples_leaf": [1,2,3],
                  "bootstrap": [True, False],
                  'min_samples_split':[2,3],
                  "criterion": [ "gini"],
                  "n_estimators":[100],
                  "n_jobs":[-1]}

    rf = RandomForestClassifierWithCoef(max_depth=7, min_samples_split=2, min_samples_leaf=2, n_estimators=50, n_jobs=2, max_features="auto")  # min_samples_split must be >= 2

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1') #average_precision , recall
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    RFE_ScoreRatio = 100*(cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2),scoring='f1').mean())/scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
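
A minimal RFECV sketch for the call above (a synthetic stand-in, not the original run): step sets how many features are dropped per elimination round, and each candidate subset is scored by cross-validation. Modern scikit-learn accepts any estimator exposing coef_ or feature_importances_, which is likely why the RandomForestClassifierWithCoef wrapper was only needed on older versions.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

X, y = make_classification(n_samples=120, n_features=25, n_informative=5, random_state=3)
rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=50, random_state=3),
              step=5, cv=2, scoring="f1")
X_red = rfecv.fit_transform(X, y)
print(X_red.shape, rfecv.n_features_)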
Ejemplo n.º 33
0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.8158182768942263
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=BernoulliNB(alpha=10.0, fit_prior=True)),
    SelectFwe(score_func=f_classif, alpha=0.043),
    DecisionTreeClassifier(criterion="entropy", max_depth=10, min_samples_leaf=10, min_samples_split=17)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)