def naive_bayes_classifier(data, labels, columns):
    """Grid-search a NumericScaler -> PCA -> GaussianNB pipeline.

    The only tuned hyper-parameter is the number of PCA components, tried
    from 1 up to the number of columns in ``data``.

    Args:
        data: 2-D feature matrix exposing ``.shape``.
        labels: Target labels, one per row of ``data``.
        columns: Column descriptors exposing ``TYPE`` and ``CATEGORIES``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print('Applying Naive Bayes classification')

    # Number of numeric columns without a category list; this count is
    # handed to NumericScaler.
    n_numeric = len([
        c for c in columns
        if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None
    ])
    parameters = dict(
        pca__n_components=list(range(1, data.shape[1] + 1)))

    # Model pipeline.  The random forest / RFE objects the previous version
    # built here were never added to the pipeline, so they are gone.
    pipe = Pipeline(steps=[
        ('ns', NumericScaler(n_numeric, with_std=False)),
        ('pca', decomposition.PCA()),
        ('gnb', GaussianNB()),
    ])

    # Grid search with 10-fold cross validation.
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    # The report below is computed on the same data the search was fitted on.
    pred = clf.predict(data)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set: ')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print('')
    print(classification_report(labels, pred))

    return clf
Beispiel #2
0
def main():
    """Select features with RFE, cross-validate a Huber regressor and report.

    Reads the module-level switches ``use_pca``, ``pca_components`` and
    ``use_ranking``.  Prints the RFE support mask, the mean absolute error of
    the cross-validated predictions, the standard deviation of the targets
    and the coefficients of a final fit.
    """
    company_names, X, Y = load_data_combined("../data/networks/",
                                             "../data/citation_networks/", 17)
    print(X[1, :])

    lr = linear_model.HuberRegressor()
    # Alternatives tried earlier (kept as a note only): RANSACRegressor, and
    # SelectKBest(f_regression, k=8), annotated "3.856" in the old code.
    sel = feature_selection.RFE(lr, n_features_to_select=11)

    X = sel.fit_transform(X, Y)
    # Formatting into one string keeps the output identical on Python 2 and 3.
    print("sup %s" % (sel.get_support(),))

    if use_pca:
        pca = PCA(n_components=pca_components)
        X = pca.fit_transform(X)

    # Run k-fold cross validation and prediction simultaneously.
    Y_pred = cross_val_predict(lr, X, Y, cv=8)
    if use_ranking:
        Y = convert_to_rank(Y)
        Y_pred = convert_to_rank(Y_pred)

    print("Mean Absolute Error: %s" % mean_absolute_error(Y, Y_pred))
    print("Ground Truth StdDev: %s" % np.std(Y))

    # NOTE(review): the final model is fitted against the cross-validated
    # predictions rather than the ground truth Y -- confirm this is intended.
    lr.fit(X, Y_pred)
    print(lr.coef_)
def svm_classifier(data, labels, columns):
    """Grid-search a NumericScaler -> RFE(random forest) -> RBF-SVM pipeline.

    Tunes the SVM's ``C`` and ``gamma`` with 10-fold cross validation.

    Args:
        data: Feature matrix.
        labels: Target labels, one per row of ``data``.
        columns: Column descriptors exposing ``TYPE`` and ``CATEGORIES``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print('Applying SVM classification with RBF kernel')

    # Number of numeric columns without a category list; this count is
    # handed to NumericScaler.
    n_numeric = len([
        c for c in columns
        if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None
    ])
    parameters = dict(
        svm__C=[0.1, 1, 10, 100, 1000],
        svm__gamma=['auto', 1, 0.1, 0.001, 0.0001],
    )

    # Model pipeline: scale, select features with a random-forest-driven
    # RFE, then classify.
    pipe = Pipeline(steps=[
        ('ns', NumericScaler(n_numeric)),
        ('rfe', feature_selection.RFE(RandomForestClassifier())),
        ('svm', SVC(kernel='rbf')),
    ])

    # Grid search with 10-fold cross validation.
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    # The report below is computed on the same data the search was fitted on.
    pred = clf.predict(data)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set: ')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print('')
    print(classification_report(labels, pred))

    return clf
def knn_classifier(data, labels, columns):
    """Grid-search a NumericScaler -> RFE(random forest) -> k-NN pipeline.

    Tunes ``n_neighbors`` over 1..50 with 10-fold cross validation.

    Args:
        data: Feature matrix.
        labels: Target labels, one per row of ``data``.
        columns: Column descriptors exposing ``TYPE`` and ``CATEGORIES``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print('Applying k-nearest neighbor classification')

    # Number of numeric columns without a category list; this count is
    # handed to NumericScaler.
    n_numeric = len([
        c for c in columns
        if c.TYPE is Types.NUMERICAL and c.CATEGORIES is None
    ])
    parameters = dict(knn__n_neighbors=list(range(1, 51)))

    # Model pipeline: scale, select features with a random-forest-driven
    # RFE, then classify.
    pipe = Pipeline(steps=[
        ('ns', NumericScaler(n_numeric)),
        ('rfe', feature_selection.RFE(RandomForestClassifier())),
        ('knn', KNeighborsClassifier()),
    ])

    # Grid search with 10-fold cross validation.
    clf = GridSearchCV(pipe, parameters, cv=10, verbose=1)
    clf.fit(data, labels)
    # The report below is computed on the same data the search was fitted on.
    pred = clf.predict(data)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print('accuracy: %0.3f' % clf.best_score_)
    print('Best parameters set: ')
    best_parameters = clf.best_estimator_.get_params()
    for param_name in sorted(parameters):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))
    print('')
    print(classification_report(labels, pred))

    return clf
Beispiel #5
0
def main():
    """Fit RFE on the training split and save the reduced train/valid/test sets.

    Reads ``data/train_data.csv``, ``data/valid_data.csv`` and
    ``data/test_data.csv``, keeps the columns whose name contains
    ``"feature"``, reduces them to ``reduction_dim`` (module-level) columns
    and writes the three reduced matrices into one ``.npz`` file.
    """
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')

    feature_cols = [f for f in list(df_train) if "feature" in f]
    # The last column of the training frame is used as the target.
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values
    X_valid = df_valid[feature_cols].values
    X_test = df_test[feature_cols].values

    estimator = LogisticRegression(C=10.0)
    rfe = feature_selection.RFE(estimator=estimator,
                                n_features_to_select=reduction_dim,
                                verbose=1)
    print('Fitting Recursive Feature Elimination on data...')
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    # The selector is fitted on the training split only.  Re-fitting it on
    # the validation split (as before) selected a different column subset
    # for valid/test than for train.
    X_valid_rfe = rfe.transform(X_valid)
    X_test_rfe = rfe.transform(X_test)

    print('Saving...')

    save_path = 'data/rfe_selection_data_{}d.npz'.format(reduction_dim)
    np.savez(save_path,
             train=X_train_rfe,
             valid=X_valid_rfe,
             test=X_test_rfe)
def get_rfe(X_train, y_train, step=5):
    """Build the table of RFE selectors paired with their final estimators.

    Each entry maps a name to ``{"model": RFE(...), "estimator": ...}``.
    The two k-NN entries rank features with the random forest while keeping
    the k-NN classifier as the final estimator.

    Args:
        X_train: Training features, forwarded to ``get_best_fitted``.
        y_train: Training targets, forwarded to ``get_best_fitted``.
        step: Forwarded to every ``RFE`` as its ``step`` argument.

    Returns:
        dict keyed by selector name.
    """
    tuned = get_best_fitted(X_train, y_train)

    def _entry(ranker, final_estimator):
        # One table row: the RFE wrapper plus the estimator it is paired with.
        return {
            "model": feature_selection.RFE(ranker, step=step),
            "estimator": final_estimator,
        }

    return {
        "svm-rfe": _entry(CLASSIFIERS['svm'], CLASSIFIERS['svm']),
        "rf-rfe": _entry(CLASSIFIERS['rf'], CLASSIFIERS['rf']),
        "knn-rfe": _entry(CLASSIFIERS['rf'], CLASSIFIERS['knn']),
        "nb-bernoulli-rfe": _entry(CLASSIFIERS['nb-bernoulli'],
                                   CLASSIFIERS['nb-bernoulli']),
        "svm-grid-rfe": _entry(tuned['svm-grid'], tuned['svm-grid']),
        "rf-grid-rfe": _entry(tuned['rf-grid'], tuned['rf-grid']),
        "knn-grid-rfe": _entry(tuned['rf-grid'], tuned['knn-grid']),
    }
Beispiel #7
0
def get_fs_model(model, method, train, target=None, cv=None):
    """Combine ``model`` with the requested feature-selection method.

    For ``"RFE"`` and ``"RFECV"`` the wrapped model is fitted on ``train``
    (and ``target`` when given) and returned.  For every other method an
    unfitted ``Pipeline`` of ``('feature_selection', selector)`` followed by
    ``('data_mining', model)`` is returned; only the ``"fromModel"`` selector
    is fitted beforehand.

    Args:
        model: Estimator to wrap.
        method: One of "RFE", "RFECV", "linearSVC", "fromModel",
            "VarianceThreshold", "SelectPercentile", "SelectFpr",
            "SelectFdr", "SelectFwe", "ch2".
        train: Training features.
        target: Optional training targets.
        cv: Cross-validation spec, used by "RFECV" only.

    Returns:
        A fitted RFE/RFECV object or a ``Pipeline``.

    Exits the process with status 1 when ``method`` is not recognised.
    """
    if method in ("RFE", "RFECV"):
        # Keyword arguments: everything after the estimator is keyword-only
        # in current scikit-learn.  The old positional values were
        # n_features_to_select=2 for RFE and step=3 for RFECV.
        if method == "RFE":
            wrapper = fs_scikit.RFE(model, n_features_to_select=2, step=5)
        else:
            wrapper = fs_scikit.RFECV(model, step=3, cv=cv)
        if target is not None:
            return wrapper.fit(train, target)
        return wrapper.fit(train)

    if method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
    elif method == "fromModel":
        sel = fs_scikit.SelectFromModel(model)
        if target is not None:
            sel.fit(train, target)
        else:
            sel.fit(train)
    # An "Anova" variant (SelectKBest(f_regression, k=5)) existed only as
    # commented-out code and is not supported.
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)

    return Pipeline([('feature_selection', sel), ('data_mining', model)])
Beispiel #8
0
def select_ests(X, y, nfeats, clf):
    """Run RFE with ``clf`` and return the rank-1 entries of its ranking.

    Args:
        X: Feature matrix.
        y: Targets.
        nfeats: Number of features RFE should keep.  This was previously
            ignored in favour of a hard-coded 100.
        clf: Estimator used by RFE to rank features.

    Returns:
        The entries of ``rfe.ranking_`` that equal 1, one per selected
        feature.
    """
    rfe = feature_selection.RFE(estimator=clf,
                                n_features_to_select=nfeats,
                                step=10)
    rfe.fit(X, y)
    ranking = rfe.ranking_
    # Rank 1 marks the features that were kept.
    # NOTE(review): this yields the rank values themselves (all ones), not
    # the indices of the kept features -- confirm what callers expect
    # before changing the return value.
    return ranking[ranking == 1]
Beispiel #9
0
 def logreg_model_result(self, X_train, Y_train, X_test, Y_test, iter):
     """Fit logistic regression inside a 20-feature RFE and score it.

     Args:
         X_train, Y_train: Training features and labels.
         X_test, Y_test: Evaluation features and labels.
         iter: ``max_iter`` for the logistic regression.

     Returns:
         Tuple ``(accuracy, macro F1, macro recall, macro precision)``
         computed on the test split.
     """
     logreg_model = LogisticRegression(max_iter=iter)
     # n_features_to_select is keyword-only in current scikit-learn.
     rfe = feature_selection.RFE(logreg_model, n_features_to_select=20)
     rfe.fit(X_train, Y_train)
     logreg_y_pred = rfe.predict(X_test)
     return (metrics.accuracy_score(Y_test, logreg_y_pred),
             metrics.f1_score(Y_test, logreg_y_pred, average='macro'),
             metrics.recall_score(Y_test, logreg_y_pred, average='macro'),
             metrics.precision_score(Y_test, logreg_y_pred, average='macro'))
Beispiel #10
0
 def learning_by_target_lasso(self, X, y, alpha, input_c=None):
     """Rank features with RFE around an L2 logistic regression.

     Args:
         X: Feature rows; ``len(X[0])`` is reported as the feature count.
         y: Targets.
         alpha: Unused; kept for interface compatibility.
         input_c: Inverse regularisation strength ``C``.  ``None`` falls
             back to 1.0, the scikit-learn default, because
             ``LogisticRegression`` rejects ``C=None``.

     Returns:
         dict mapping each key of ``self.code_dict`` (in iteration order) to
         the RFE rank at the same position.
     """
     print(("Finding the most important half of {} features").format(len(X[0])))
     c_value = 1.0 if input_c is None else input_c
     regr = linear_model.LogisticRegression(penalty="l2", C=c_value,
                                            n_jobs=-1, solver="newton-cg")
     rfe = feature_selection.RFE(regr)
     rfe.fit(X, y)
     return {
         code: rfe.ranking_[index]
         for index, code in enumerate(list(self.code_dict))
     }
Beispiel #11
0
    def __init__(self):
        """Build the RFE -> MultinomialNB pipeline and store it in ``self.clf``."""
        # RFE driven by its own MultinomialNB instance: keep 100000 features,
        # dropping 100 per iteration.
        selector = feature_selection.RFE(
            estimator=MultinomialNB(),
            n_features_to_select=100000,
            step=100,
            verbose=1)

        self.clf = Pipeline([
            ('rfe_feature_selection', selector),
            ('clf', MultinomialNB()),
        ])
Beispiel #12
0
def ref(arr0, target, n_features):
    """Run logistic-regression RFE and return ranking, mask and reduced data.

    Args:
        arr0: Feature rows, converted with ``np.array``.
        target: Targets, converted with ``np.array``.
        n_features: Number of features RFE should keep.

    Returns:
        Tuple ``(ranking, support, reduced)`` where each item is a plain
        list: the RFE ranks, the boolean selection mask and the transformed
        rows.
    """
    from sklearn.linear_model import LogisticRegression

    features = np.array(arr0)
    target = np.array(target)

    selector = feature_selection.RFE(estimator=LogisticRegression(),
                                     n_features_to_select=n_features)
    selector.fit(features, target)

    ranking = selector.ranking_.tolist()
    support = selector.support_.tolist()
    reduced = selector.transform(features).tolist()
    return ranking, support, reduced
def rfe_svm(X, y):
    """Score an SGD hinge classifier while removing one feature at a time.

    Starting from all columns of ``X``, the classifier is cross-validated
    (10 shuffle splits, 10% test), then RFE drops a single feature and the
    process repeats on the reduced matrix until one feature is left.

    This used to recurse once per feature, which exceeds Python's recursion
    limit on wide matrices.  The loop below performs the same calls in the
    same order.

    Args:
        X: 2-D feature matrix exposing ``.shape``.
        y: Binary targets (class weights are given for labels 0 and 1).

    Returns:
        List of rows ordered from 1 feature up to the original feature
        count.  Each row is ``[n_features, mean test precision, mean test
        recall, mean test f1, support, ranking]``; for the 1-feature row the
        last two items are ``[True]`` and ``[1]``.
    """
    rows = []
    while True:
        # Fresh estimator and splitter per level, as each recursive call
        # previously created.
        clf = linear_model.SGDClassifier(loss='hinge',
                                         penalty='elasticnet',
                                         max_iter=1000,
                                         alpha=1e-9,
                                         tol=1e-3,
                                         random_state=123456,
                                         class_weight={
                                             0: 0.044,
                                             1: 1 - 0.044
                                         })
        cv = model_selection.ShuffleSplit(n_splits=10,
                                          test_size=0.1,
                                          random_state=123456)

        nb_features = X.shape[1]
        print(nb_features)

        scores = model_selection.cross_validate(
            clf,
            X,
            y,
            cv=cv,
            scoring=['precision', 'recall', 'f1'],
            return_train_score=True)
        print(scores)

        summary = [
            nb_features,
            np.mean(scores['test_precision']),
            np.mean(scores['test_recall']),
            np.mean(scores['test_f1']),
        ]

        if nb_features <= 1:
            rows.append(summary + [[True], [1]])
            break

        rfe = feature_selection.RFE(clf,
                                    n_features_to_select=nb_features - 1,
                                    step=1)
        rfe.fit(X, y)
        rows.append(summary + [rfe.support_, rfe.ranking_])
        X = rfe.transform(X)

    # Rows were collected widest-first; callers receive them narrowest-first.
    rows.reverse()
    return rows
Beispiel #14
0
def calcFoldScores(nFold=10):
    """Score RFE-wrapped L1 logistic regression for 1..13 kept features per fold.

    Uses the module-level ``X`` and ``y``.

    Args:
        nFold: Number of K-fold splits.

    Returns:
        ``np.array`` built from one list per fold, each holding
        ``(n_features, score)`` pairs.
    """
    # liblinear is named explicitly: it supports the L1 penalty, whereas the
    # lbfgs default of recent scikit-learn releases rejects it.
    lr = sk_lm.LogisticRegression(penalty='l1', C=10000, solver='liblinear')

    foldReturns = []
    for trainIndex, testIndex in sk_ms.KFold(nFold).split(X, y):
        xTrain, yTrain = X[trainIndex], y[trainIndex]
        xTest, yTest = X[testIndex], y[testIndex]

        featureReturns = []
        for nFeatures in range(1, 14):
            rfe = sk_fs.RFE(lr, n_features_to_select=nFeatures)
            rfe.fit(xTrain, yTrain)
            featureReturns.append((nFeatures, rfe.score(xTest, yTest)))
        foldReturns.append(featureReturns)

    return np.array(foldReturns)
Beispiel #15
0
def selectfeatures(dfn, toexclude, topredict):
    """Pick five predictor columns for ``topredict`` with linear-SVR RFE.

    Args:
        dfn: Source ``pandas.DataFrame``; only numeric columns are used.
        toexclude: List of column names to leave out of the candidates.
        topredict: Name of the target column.

    Returns:
        List with the names of the columns ranked 1 by RFE.

    Raises:
        AssertionError: If an argument has the wrong type.
    """
    assert isinstance(dfn, pd.DataFrame), 'Argument of wrong type!'
    assert isinstance(toexclude, list), 'Argument of wrong type!'
    assert isinstance(topredict, str), 'Argument of wrong type!'

    dfn = dfn.select_dtypes(include=[np.number]).copy()
    # NOTE(review): ``topredict`` stays among the candidates unless the
    # caller also lists it in ``toexclude``.
    feature_cols = [
        x for x in dfn.columns.values.tolist() if x not in toexclude
    ]
    print(feature_cols)

    XO = dfn[feature_cols]
    YO = dfn[topredict]
    estimator = svm.SVR(kernel="linear")
    # n_features_to_select is keyword-only in current scikit-learn.
    selector = feature_selection.RFE(estimator,
                                     n_features_to_select=5,
                                     step=1)
    selector = selector.fit(XO, YO)

    # Rank 1 marks the features RFE kept.
    select_features = np.array(feature_cols)[selector.ranking_ == 1].tolist()
    print("Features: ", select_features)
    return select_features
    def rfeCV(self, n_folds=5):
        """Keep only the features that RFE selects in every fold.

        For each fold the rows whose ``kfold`` value differs from the fold
        number are used to fit an RFE around ``self.estimator``.  The
        per-fold support masks are AND-ed together, and ``self.train_X`` and
        ``self.test_df`` are then reduced to the surviving columns.

        Args:
            n_folds: Number of fold ids (0 .. n_folds-1) to iterate over.
        """
        # One flag per column, not counting the ``kfold`` column.
        self.important_features = [True] * (len(self.train_X.columns) - 1)

        for fold in range(n_folds):
            selector = feature_selection.RFE(
                self.estimator, n_features_to_select=self.n_features)

            in_training = self.train_X[self.train_X.kfold != fold]
            row_index = in_training.index
            fold_frame = in_training.copy()
            del fold_frame['kfold']

            fold_targets = self.train_y.loc[row_index, :].values.ravel()
            selector.fit(fold_frame, fold_targets)

            assert (len(self.important_features) == len(selector.get_support()))
            self.important_features = [
                kept and chosen
                for kept, chosen in zip(self.important_features,
                                        selector.get_support())
            ]
            print("Completed for fold {}".format(fold))

        # Translate the boolean mask into column labels of the last fold frame.
        self.important_features = fold_frame.columns[self.important_features]

        self.train_X = self.train_X[self.important_features]
        self.test_df = self.test_df[self.important_features]
Beispiel #17
0
def linear_regressor_test(features, target, testing_data, solutions):
    """Reduce to 7 features with linear-SVC RFE, fit ``LR`` and print the MSE.

    Args:
        features: Training features.
        target: Training targets.
        testing_data: Features to predict on.
        solutions: Expected values for ``testing_data``.

    The printed value is labelled "Accuracy" but is the mean squared error
    between ``solutions`` and the constrained predictions.
    """
    selector = feature_selection.RFE(estimator=SVC(kernel="linear"),
                                     n_features_to_select=7)
    reduced_train = selector.fit_transform(features, target)

    print(selector.n_features_to_select)

    regressor = LR()
    regressor.fit(np.matrix(reduced_train), np.matrix(target))

    reduced_test = selector.transform(testing_data)
    raw_predictions = regressor.predict(np.matrix(reduced_test))

    # Take the first value of every prediction row, then pass each through
    # ``constraints``.
    first_values = [row[0] for row in raw_predictions.tolist()]
    constrained = [constraints(value) for value in first_values]

    score = metrics.mean_squared_error(list(solutions), constrained)
    print("Accuracy: %f" % score)
def recursive_feature_elimination(xs, ys, xnames, cutoff):
    """Keep the ``cutoff`` fraction of features ranked best by a random forest.

    Args:
        xs: 2-D feature rows; ``len(xs[0])`` is taken as the feature count.
        ys: Targets.
        xnames: Feature names, aligned with the columns of ``xs``.
        cutoff: Fraction of features to keep; the count is floored.

    Returns:
        ``(new_xs, new_features)`` as arrays restricted to the selected
        columns.  The inputs are returned unchanged when RFE keeps every
        column or none.
    """
    estimator = ensemble.RandomForestRegressor()
    # int() matters: a float here is read by scikit-learn as a fraction, and
    # math.floor returns a float on Python 2.  The argument is also passed by
    # keyword because it is keyword-only in current releases.
    n_keep = int(math.floor(len(xs[0]) * cutoff))
    selector = feature_selection.RFE(estimator,
                                     n_features_to_select=n_keep,
                                     step=1)
    selector.fit(xs, ys)

    keep = np.asarray(selector.support_, dtype=bool)
    if keep.all() or not keep.any():
        return xs, xnames

    # One boolean-mask selection replaces deleting columns one at a time.
    return np.asarray(xs)[:, keep], np.asarray(xnames)[keep]
Beispiel #19
0
    def get_model(self, resume=False):
        """Create the feature selector for ``self.method`` or reload a saved one.

        Args:
            resume: When true, load the selector from
                ``self.model_save_path`` with joblib instead of building it.

        Returns:
            The selector object.

        Raises:
            ValueError: If ``resume`` is false and ``self.method`` is not one
                of 'variance', 'rfe', 'forward', 'seq_bwd', 'seq_fwd'.
                Previously this case failed later with an unbound-variable
                error.
        """
        if resume:
            selector = joblib.load(self.model_save_path)
        elif self.method == 'variance':  # Unsupervised
            p = .5
            selector = feature_selection.VarianceThreshold(
                threshold=(p * (1 - p)))
        elif self.method == 'rfe':
            estimator = LogisticRegression()
            selector = feature_selection.RFE(
                estimator,
                n_features_to_select=self.feat_limit,
                step=1,
                verbose=0)
        elif self.method == 'forward':
            estimator = ExtraTreesClassifier(n_estimators=100)
            selector = SelectFromModel(estimator)
        elif self.method in ('seq_bwd', 'seq_fwd'):
            # The two sequential variants differ only in search direction.
            estimator = LogisticRegression(solver='lbfgs')
            selector = SFS(estimator,
                           k_features=self.feat_limit,
                           forward=(self.method == 'seq_fwd'),
                           floating=False,
                           scoring='roc_auc',
                           cv=4,
                           n_jobs=-1)
        else:
            raise ValueError(
                "Unknown feature selection method: %r" % (self.method,))

        if self.verbose > 2:
            print(selector)
        return selector
Beispiel #20
0
 def __init__(self, method='skb', clf=None, n_vars=None):
     """Configure the selector for the chosen method.

     Args:
         method: 'sfm', 'rfo', 'rfs', 'skb', 'eli5_rfe' or 'biofes'.
         clf: Optional estimator overriding the method's default one
             (ignored by 'skb').
         n_vars: Number of variables to keep; 'skb' and 'eli5_rfe' default
             it to 10.

     ``self.clf`` is not set for 'biofes' or for an unrecognised method.
     """
     self.n_vars = n_vars
     self.method = method

     # ``is None`` instead of ``== None``: the identity test cannot be
     # affected by an estimator that overrides ``__eq__``.
     if method == 'sfm':
         if clf is None:
             self.clf = sk_en.RandomForestClassifier(n_estimators=100,
                                                     max_features='auto')
         else:
             self.clf = clf
     elif method in ('rfo', 'rfs'):
         # Both methods share the same default estimator.
         self.clf = sk_nb.GaussianNB() if clf is None else clf
     elif method == 'skb':
         if self.n_vars is None:
             self.n_vars = 10
         self.clf = sk_fs.SelectKBest(score_func=sk_fs.f_classif,
                                      k=self.n_vars)
     elif method == 'eli5_rfe':
         if self.n_vars is None:
             self.n_vars = 10
         base_clf = sk_lm.LogisticRegression() if clf is None else clf
         eli5_estimator = eli5.sklearn.PermutationImportance(base_clf,
                                                             cv=10)
         self.clf = sk_fs.RFE(eli5_estimator,
                              n_features_to_select=self.n_vars,
                              step=1)
     elif method == 'biofes':
         pass
def reverse_feature_elimination(df,
                                k,
                                model,
                                target_name=None,
                                y=None,
                                verbose=False):
    """
    Recursively eliminate the least important features with sklearn's RFE
    and return the names of the ``k`` survivors.

    When ``y`` is not supplied, ``df`` is split into features and target via
    ``pml.X_y(df, target_name)``; otherwise ``df`` is used as the feature
    matrix as-is.

    Per the original author's note, features are ranked by the absolute
    value of ``model.coef_`` (p-values are not considered).
    """
    if y is None:
        features, y = pml.X_y(df, target_name)
    else:
        features = df

    eliminator = fs.RFE(model, n_features_to_select=k, step=1, verbose=verbose)
    eliminator.fit(features, y)

    all_names = pml.feature_names(features)
    return all_names[eliminator.support_]
Beispiel #22
0
                           pd_feature_selection.SelectKBest(k=1), True))
# Each entry appended to _feature_selectors is a 3-tuple: a scikit-learn
# selector, the matching pd_feature_selection object, and a boolean flag
# (the flag's meaning is defined by the consumer, outside this fragment).

# Same SelectKBest(k=1) pairing, but the pd_ object is first round-tripped
# through pickle.
_feature_selectors.append(
    (feature_selection.SelectKBest(k=1),
     pickle.loads(pickle.dumps(pd_feature_selection.SelectKBest(k=1))), True))
_feature_selectors.append((feature_selection.SelectKBest(k=2),
                           pd_feature_selection.SelectKBest(k=2), True))
_feature_selectors.append((feature_selection.SelectPercentile(),
                           pd_feature_selection.SelectPercentile(), True))
_feature_selectors.append(
    (feature_selection.SelectFdr(), pd_feature_selection.SelectFdr(), True))
_feature_selectors.append(
    (feature_selection.SelectFwe(), pd_feature_selection.SelectFwe(), True))
# Tmp Ami
# The RFE pairing is disabled: the guard below is a constant False.
if False:
    _feature_selectors.append(
        (feature_selection.RFE(linear_model.LogisticRegression()),
         pd_feature_selection.RFE(pd_linear_model.LogisticRegression()), True))

# Keras-based estimator pairs are registered only when _level > 0; both use
# False as the flag.
_keras_estimators = []
if _level > 0:
    _keras_estimators.append(
        (KerasClassifier(_build_classifier_nn, verbose=0),
         PdKerasClassifier(_build_classifier_nn,
                           _load_iris()[0]['class'].unique(),
                           verbose=0), False))
    _keras_estimators.append((KerasRegressor(_build_regressor_nn, verbose=0),
                              PdKerasRegressor(_build_regressor_nn,
                                               verbose=0), False))


class _EstimatorTest(unittest.TestCase):
# Work on a copy restricted to the numeric columns.
df = data_initialModel.select_dtypes(include=[np.number]).copy()

# Every numeric column except the target is a candidate predictor.
feature_cols = df.columns.values.tolist()
feature_cols.remove('SALE_PRICE')

XO = df[feature_cols]
YO = df['SALE_PRICE']
estimator = svm.SVR(kernel="linear")
# n_features_to_select is keyword-only in current scikit-learn; the previous
# positional 5 is passed by name.
selector = feature_selection.RFE(estimator, n_features_to_select=5, step=1)

selector = selector.fit(XO, YO)
# Rank 1 marks the features RFE kept.
select_features = np.array(feature_cols)[selector.ranking_ == 1].tolist()
print(select_features)


# In[38]:


# Fit a plain linear regression on the selected predictors, holding out 20%
# of the rows.
X = df[select_features]
Y = df['SALE_PRICE']
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2)
lm = linear_model.LinearRegression()
lm.fit(trainX, trainY)
# Inspect the calculated model equations
Beispiel #24
0
    titanic_train[imputable_cont_features])

# Fill missing embarkation values with 'S'.
titanic_train.loc[titanic_train['Embarked'].isnull(), 'Embarked'] = 'S'

# Label-encode the categorical columns in place.
encodable_columns = ['Sex', 'Embarked', 'Pclass']
feature_defs = [(col_name, preprocessing.LabelEncoder())
                for col_name in encodable_columns]
mapper = DataFrameMapper(feature_defs)
mapper.fit(titanic_train)
titanic_train[encodable_columns] = mapper.transform(titanic_train)

# Drop identifier/free-text columns and the target.
titanic_train1 = titanic_train.drop(
    ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Survived'], axis=1)

# One-hot encode the categorical columns.
features = ['Pclass', 'Sex', 'Embarked']
titanic_train2 = pd.get_dummies(titanic_train1, columns=features)

y_train = titanic_train['Survived']
X_train = titanic_train2

dt_estimator = tree.DecisionTreeClassifier(random_state=100)
# n_features_to_select and step are keyword-only in current scikit-learn;
# the previous positional values (5, 1) are passed by name.
rfe = feature_selection.RFE(dt_estimator, n_features_to_select=5, step=1)
rfe.fit(X_train, y_train)
X_new = rfe.transform(X_train)
print(rfe.support_)

# NOTE(review): ``best_est`` is not defined in this fragment; it must be an
# already fitted estimator because prefit=True is used.
model = feature_selection.SelectFromModel(best_est, prefit=True)
X_new = model.transform(X_train)
X_new.shape

#build model on X_new
                                                    penalty='l2',
                                                    multi_class='multinomial',
                                                    solver='lbfgs',
                                                    max_iter=500))])

    # nearest neighbor
    c_1NN = sklnn.KNeighborsClassifier(n_neighbors=1,
                                       algorithm='brute',
                                       metric='correlation')

    # cross-validation scheme
    cv_schem = skms.StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    n_rep = 10  # number of repetitions

    # RFE wrappers
    RFE_pow = skfs.RFE(c_MLR, n_features_to_select=3)
    RFE_FC = skfs.RFE(c_MLR, n_features_to_select=90)

    # record classification performance
    perf = np.zeros([n_bands, n_measures, n_rep, 2])  # (last index: MLR/1NN)
    perf_shuf = np.zeros([n_bands, n_measures, n_rep,
                          2])  # (last index: MLR/1NN)
    conf_matrix = np.zeros([n_bands, n_measures, n_rep, 2, n_motiv,
                            n_motiv])  # (fourth index: MLR/1NN)
    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; the builtin gives the same dtype.
    rk_pow = np.zeros([n_bands, n_rep, N],
                      dtype=int)  # RFE rankings for power (N feature)
    rk_FC = np.zeros(
        [n_bands, 2, n_rep, int(N * (N - 1) / 2)],
        dtype=int)  # RFE rankings for FC-type measures (N(N-1)/2 feature)
    pearson_corr_rk = np.zeros([
        n_bands, n_measures, int(n_rep * (n_rep - 1) / 2)
Beispiel #26
0
def classify(X,
             y,
             verbose=False,
             nfolds=2,
             dim_red=None,
             n_components=[5, 10, 20],
             scale=True,
             fs=None,
             njobs=1,
             LR_C=[.01, .1, 1, 10, 100],
             LR_class_weight=[None, 'balanced'],
             SVC_C=[.01, .1, 1, 10, 100],
             SVC_class_weight=[None, 'balanced'],
             SVC_kernels=['rbf', 'linear', 'poly'],
             n_estimators=[10, 20, 30],
             max_features=['auto', 'log2', None],
             **kwargs):
    """Compare several classifiers with nested shuffle-split cross validation.

    For every model a pipeline of (optional) robust scaling, (optional)
    PCA/ICA, the feature-selection step and the classifier is tuned with
    ``GridSearchCV`` on each outer training split and scored by accuracy on
    the matching outer test split.

    Args:
        X, y: Feature matrix and labels, indexable by integer index arrays.
        verbose: Print the call arguments, grids, best parameters and a
            summary.
        nfolds: Number of splits for both the inner and the outer splitter.
        dim_red: None, 'pca' or 'ica'.
        n_components: Grid used for the PCA/ICA component count and for
            RFE's ``n_features_to_select``.
        scale: Prepend a ``RobustScaler`` step.
        fs: None, 'l1' (SelectFromModel over an L1 LinearSVC), 'rfe' (RFE
            around the current model), or any other value, which is placed
            in the pipeline's 'feat_sel' step unchanged.
        njobs: ``n_jobs`` for the grid search.
        LR_C, LR_class_weight, SVC_C, SVC_class_weight, SVC_kernels,
        n_estimators, max_features: Hyper-parameter grids.  The list defaults
            are only read, never mutated.
        **kwargs: Accepted and ignored.

    Returns:
        ``(results, models)``: one list of outer-fold accuracies per model,
        and the list of ``(name, estimator, grid)`` tuples.

    Raises:
        ValueError: If ``dim_red`` is neither None, 'pca' nor 'ica'.
    """
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print("    %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models
    # all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models = [
        ('LR', LogisticRegression(multi_class='multinomial',
                                  solver='newton-cg'), {
                                      "C": LR_C,
                                      "class_weight": LR_class_weight
                                  }),
        ('LDA', LinearDiscriminantAnalysis(), {}),
        ('RndFor', RandomForestClassifier(), {
            'n_estimators': n_estimators,
            'max_features': max_features
        }),
        ('NB', GaussianNB(), {}),
        ('SVC', SVC(), {
            "C": SVC_C,
            "class_weight": SVC_class_weight,
            'kernel': SVC_kernels
        }),
        ('Most frequent', DummyClassifier(strategy='most_frequent'), {}),
        ('Stratified', DummyClassifier(strategy='stratified'), {}),
    ]

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print('Trying these parameters:')
        for m in models:
            print('%s : %s' % (m[0], m[2]))

    # evaluate each model in turn
    results = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds,
                                          test_size=.1,
                                          random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds,
                                          test_size=.1,
                                          random_state=seed)

        # dict.items() works on Python 2 and 3 (iteritems is Python 2 only).
        pipe_params = dict(
            ('clf__%s' % key, val) for key, val in params.items())

        # Build the selector into a local name.  Overwriting ``fs`` itself
        # (as before) meant only the first model saw 'l1'/'rfe'; later models
        # reused the first model's selector and lost the RFE grid entry.
        if fs == 'l1':
            lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
            selector = feature_selection.SelectFromModel(lsvc)
        elif fs == 'rfe':
            selector = feature_selection.RFE(estimator=model)
            pipe_params['feat_sel__n_features_to_select'] = n_components
        else:
            selector = fs
        steps = [('feat_sel', selector), ('clf', model)]

        if dim_red is not None:
            if dim_red == 'pca':
                dr = decomposition.PCA()
            elif dim_red == 'ica':
                dr = decomposition.FastICA()
            else:
                raise ValueError(
                    "dim_red must be None, 'pca' or 'ica', got %r" %
                    (dim_red,))
            pipe_params['dim_red__n_components'] = n_components
            steps = [('dim_red', dr)] + steps
        if scale:
            steps = [('scale', preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)
        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe,
                                     param_grid=pipe_params,
                                     verbose=0,
                                     n_jobs=njobs,
                                     cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and len(params) > 0:
                # Two spaces before "(" reproduce the old comma-separated
                # print output exactly.
                print('Best paramaters for %s  (%d/%d):' %
                      (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)

    if verbose:
        print('\n======')
        for entry, res in zip(models, results):
            print("%s: %f (%f)" % (entry[0], np.mean(res), np.std(res)))
        print('Chance: %f' % (1 / float(len(np.unique(y)))))
        print('======\n')
    return results, models
Beispiel #27
0
# Fragment of a feature-ranking comparison script; ``X`` and ``size`` are
# defined before this point.
print(X[:, 1])

# Synthetic target built from the first five columns of X.
# NOTE(review): np.random.normal(0, 1) without a size draws ONE scalar, so
# the same noise value is added to every sample -- confirm this is intended.
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4]**5 + np.random.normal(0, 1))
# Columns 10.. become noisy copies of columns 0..3.
X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))

# Fit each model / selector on the same data.
lin = linear_model.LinearRegression()
lin.fit(X, Y)
ridge = Ridge()  # alpha=0.1
ridge.fit(X, Y)
lasso = linear_model.Lasso()  # alpha=0.1
lasso.fit(X, Y)
randLasso = linear_model.RandomizedLasso()
randLasso.fit(X, Y)
rfe = feature_selection.RFE(estimator=linear_model.LinearRegression())
rfe.fit(X=X, y=Y)

rfr = RandomForestRegressor()
rfr.fit(X, Y)
freg = feature_selection.f_regression(X, Y)

# Scale the absolute coefficients by their maximum; the divisor is a list of
# 14 copies of that maximum.
ans_lin = abs(lin.coef_)
mx = [max(ans_lin)] * 14
ans_lin = ans_lin / mx
ans_ridge = abs(ridge.coef_)
mx = [max(ans_ridge)] * 14
ans_ridge = ans_ridge / mx
ans_lasso = abs(lasso.coef_)
# (original comment is mis-encoded; presumably "division by 0" -- the lasso
# coefficients are left unscaled here -- TODO confirm)
ans_randLasso = abs(randLasso.scores_)
Beispiel #28
0
import tabulate
import pickle


# Load the wine data set (no header row), shuffle the rows, and split it into
# the feature matrix X (every column but the first) and the labels y (first
# column).
df_wine = pd.read_csv('wine.data', header=None)
rIndex = sk_utils.shuffle(range(len(df_wine)))
X, y = df_wine.iloc[:, 1:].values[rIndex], df_wine.iloc[:, 0].values[rIndex]


# Disabled experiment: run RFE once for every possible number of selected
# features and dump the resulting rankings as a LaTeX table.
if False:
    # Take the feature count from the data instead of hard-coding 14
    # (assumes the original bound matched the 13 feature columns of
    # wine.data - TODO confirm).
    n_features = X.shape[1]
    data = []
    lr = sk_lm.LogisticRegression(penalty='l1', C=10000)
    for nToSelect in range(1, n_features + 1):
        rfe = sk_fs.RFE(lr, n_features_to_select=nToSelect)
        rfe.fit(X, y)
        # rfe.ranking_ holds the rank of every feature (1 == selected).
        data.append(rfe.ranking_)
    xx = pd.DataFrame(data)
    # Row label == number of features that RFE was asked to keep.
    xx.index = range(1, n_features + 1)
    with open("../tex/RFE_Features.tbl", 'w') as f:
        # f.write works on both Python 2 and 3, unlike `print >> f`.
        f.write(tabulate.tabulate(xx, tablefmt='latex', floatfmt=".3f",
                                  headers="keys") + "\n")
    # Runs once after the loop, so it reports the last iteration only.
    print(" %d Selected Features : %s " % (nToSelect, rfe.ranking_))

# print "Features sorted by their rank:"
# # print sorted(zip(map(lambda x: round(x, 4), rfe.ranking_),))

def calcFoldScores ( nFold = 10):
     # kFold = sk_ms.KFold ( nFold).split(X,y)
# Script fragment: sweep over the number of features kept by RFE and record,
# for each setting, the test score of a linear regression and the mask of the
# selected features. `X`, `Y` and the `skl_*` aliases are defined earlier in
# the script (presumably scikit-learn sub-modules - verify against imports).

# Splitting the dataset into a training subset (70%) and a testing subset (30%)
X_train, X_test, Y_train, Y_test = skl_ms.train_test_split(X,
                                                           Y,
                                                           test_size=0.3,
                                                           random_state=0)

# initialising the result lists (one entry per value of n)
score_list = []
selected_feature_masks = []

# iterating over different numbers of features to be selected
# NOTE: the range stops at len(X.columns) - 1, so the "keep every feature"
# case is never evaluated.
for n in range(1, len(X.columns)):
    # constructing the regression model
    model = skl_lm.LinearRegression()
    # initialising the RFE selector around that model
    rfe_selector = skl_fs.RFE(model, n_features_to_select=n)
    # finding the most relevant features by recursively fitting the "model"
    # object passed in the previous step, and removing the non-selected
    # features from X_train
    X_train_rfe = rfe_selector.fit_transform(X_train, Y_train)
    # removing the non-selected features from X_test
    X_test_rfe = rfe_selector.transform(X_test)
    # fitting the regression model only with the selected features
    model.fit(X_train_rfe, Y_train)
    # scoring the model with the test data
    score = model.score(X_test_rfe, Y_test)
    # storing the score value
    score_list.append(score)
    # storing the feature mask
    selected_feature_masks.append(rfe_selector.support_)

# retrieving the names of the features
features = np.array(X.columns)
Beispiel #30
0
def gen_test_estimators():
    """Write a set of sample estimators into the ``tools/test-data`` folder.

    The folder is resolved relative to this file. Four unfitted models are
    first pickled, then the same four model configurations are stored again
    through ``galaxy_ml.model_persist.dump_model_to_h5``. Finally a
    ``GridSearchCV`` and an ``RFE`` wrapper around a random forest are fitted
    on the tabular regression data found in the same folder and dumped too.

    Note that the ``pipeline10`` path is written twice: once as a pickle and
    afterwards by ``dump_model_to_h5``.
    """
    test_folder = Path(__file__).parent.joinpath('tools', 'test-data')

    # Small factories so every output file gets its own fresh instance.
    def _linear():
        return linear_model.LinearRegression()

    def _forest():
        return ensemble.RandomForestRegressor(n_estimators=10,
                                              random_state=10)

    def _xgb():
        return xgboost.XGBRegressor(learning_rate=0.1,
                                    n_estimators=100,
                                    random_state=0)

    def _ada_pipeline():
        booster = ensemble.AdaBoostRegressor(learning_rate=1.0,
                                             n_estimators=50)
        return pipeline.make_pipeline(booster)

    # 1) Pickled copies, written in this order.
    pickled = (
        ('LinearRegression01.zip', _linear),
        ('RandomForestRegressor01.zip', _forest),
        ('XGBRegressor01.zip', _xgb),
        ('pipeline10', _ada_pipeline),
    )
    for file_name, build in pickled:
        with open(test_folder.joinpath(file_name), 'wb') as handle:
            pickle.dump(build(), handle, pickle.HIGHEST_PROTOCOL)

    # 2) The same configurations, stored via the HDF5 dumper.
    dump_model_to_h5 = try_get_attr('galaxy_ml.model_persist',
                                    'dump_model_to_h5')
    h5_dumped = (
        ('LinearRegression01.h5mlm', _linear),
        ('RandomForestRegressor01.h5mlm', _forest),
        ('XGBRegressor01.h5mlm', _xgb),
        ('pipeline10', _ada_pipeline),
    )
    for file_name, build in h5_dumped:
        dump_model_to_h5(build(), test_folder.joinpath(file_name))

    # 3) Fitted meta-estimators, trained on the bundled regression data.
    import pandas as pd
    X = pd.read_csv(test_folder.joinpath('regression_X.tabular'),
                    sep='\t').values
    y = pd.read_csv(test_folder.joinpath('regression_y.tabular'),
                    sep='\t').values.ravel()

    estimator = _forest()

    # An empty parameter grid: the search evaluates the estimator as given.
    searcher = model_selection.GridSearchCV(estimator, {})
    searcher.fit(X, y)
    dump_model_to_h5(searcher, test_folder.joinpath('GridSearchCV01.h5mlm'))

    # RFE is handed the very same forest object that the grid search received.
    rfe = feature_selection.RFE(estimator)
    rfe.fit(X, y)
    dump_model_to_h5(rfe, test_folder.joinpath('RFE.h5mlm'))