Example #1
def rank_features_rfe(X, y, featureset):
    """Rank features by their importance using recursive feature elimination.

    :param X: A training set of features.
    :param y: A target set (aka class labels for the training set)
    :param featureset: An instance of a featureset (such as Basic9Extractor())
    :rtype: An OrderedDict of the form {K : V}, with K being the feature name
    and V being its importance. This dictionary will be sorted by importance.
    """

    # FIXME: Use an RBF SVC to rank features. The "importance" rankings derived
    # from a LinearSVC are likely similar to those of an RBF-kernel SVM, but,
    # for safety's sake, it is best to assume they are not.

    classifier = LinearSVC()
    classifier.fit(X, y)

    ranker = RFE(classifier, 1, step=1)
    ranker = ranker.fit(X, y)

    # Get the names of the feature columns.
    # FIXME: Duplicate code from rank_features. Make this its own function.
    feat_importance = OrderedDict()
    for index, func in enumerate(featureset.features):
        feat_importance[func] = ranker.ranking_[index]

    return OrderedDict(sorted(feat_importance.items(), key=lambda x: x[1]))
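A minimal usage sketch (not from the original source): it assumes the imports used above (OrderedDict, LinearSVC, RFE) are in scope and targets the older scikit-learn API that still accepts RFE's positional arguments; the hypothetical DummyExtractor only stands in for something like Basic9Extractor, since the function just needs an object with a features sequence.

from sklearn.datasets import make_classification

class DummyExtractor:
    # hypothetical stand-in: any object exposing a `features` sequence works
    features = ["f0", "f1", "f2", "f3"]

X_demo, y_demo = make_classification(n_samples=60, n_features=4, n_informative=2,
                                     random_state=0)
print(rank_features_rfe(X_demo, y_demo, DummyExtractor()))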
def select_features(X, y, random_state, kernel='linear', C=1.0, num_attributes=3):
    """
    Uses a Support Vector Classifier as the estimator to rank features
    with Recursive Feature Elimination (RFE).

    Parameters
    ----------
    X: A pandas.DataFrame. Attributes.
    y: A pandas.DataFrame. Labels.
    random_state: A RandomState instance. Used in SVC().
    kernel: A string. Used in SVC(). Default: "linear".
    C: A float. Used in SVC(). Default: 1.0.
    num_attributes: An int. The number of features to select in RFE. Default: 3.

    Returns
    -------
    A 3-tuple of (RFE, np.ndarray, np.ndarray)
    model: An RFE instance.
    columns: Selected features.
    ranking: The feature ranking. Selected features are assigned rank 1.
    """

    rfe = RFE(svm.SVC(C, kernel, random_state=random_state), num_attributes)
    model = rfe.fit(X, y.values.ravel())
    columns = list()

    for idx, label in enumerate(X):
        if rfe.support_[idx]:
            columns.append(label)

    ranking = rfe.ranking_

    return model, columns, ranking
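A hedged usage sketch with toy pandas data; the column names X_0 .. X_4 and the label column are made up for illustration, svm and RFE are assumed to be imported as in the snippet, and the positional SVC/RFE arguments above target the older scikit-learn API.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(50, 5), columns=["X_{}".format(i) for i in range(5)])
y_demo = pd.DataFrame({"label": rng.randint(0, 2, 50)})

model, columns, ranking = select_features(X_demo, y_demo, random_state=0)
print(columns)   # names of the 3 columns RFE kept
print(ranking)   # rank 1 marks the selected features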
Example #3
def get_best_cols(df):
    """ select best cols with RFE """

    # factors
    cols_to_factor = [
        pd.get_dummies(df.X7),
        pd.get_dummies(df.X8),
        pd.get_dummies(df.X9),
        pd.get_dummies(df.X11),
        pd.get_dummies(df.X12),
        pd.get_dummies(df.X14),
        pd.get_dummies(df.X32),
    ]
    # dataframe with factors blown out
    df_f = pd.concat(cols_to_factor, axis=1)
    # numerics
    RFE_col_list = ["X4", "X5", "X6", "X13", "X21", "X22", "X29", "X30", "X31"]
    # dataframe with numerics
    df_n = df.ix[:, RFE_col_list]
    X = np.asarray(df_n)
    X = StandardScaler().fit_transform(X)
    # add in factors
    X = np.concatenate([X, np.asarray(df_f)], axis=1)
    # leave y alone
    y = df.X1
    # I don't like to guess; yes, this captures only linear relationships
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 40, step=2)
    selector = selector.fit(X, y)
    # make index for merged df (Index.append keeps the column order aligned with X)
    df_index = df_n.columns.append(df_f.columns)
    best_cols = df_index[selector.support_]
    return best_cols
Example #4
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
Example #5
def test_deeply_nested():
    # Render a deeply nested estimator
    rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
    expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
                                                                                                                     class_weight=None,
                                                                                                                     dual=False,
                                                                                                                     fit_intercept=True,
                                                                                                                     intercept_scaling=1,
                                                                                                                     l1_ratio=None,
                                                                                                                     max_iter=100,
                                                                                                                     multi_class='warn',
                                                                                                                     n_jobs=None,
                                                                                                                     penalty='l2',
                                                                                                                     random_state=None,
                                                                                                                     solver='warn',
                                                                                                                     tol=0.0001,
                                                                                                                     verbose=0,
                                                                                                                     warm_start=False),
                                                                                        n_features_to_select=None,
                                                                                        step=1,
                                                                                        verbose=0),
                                                                          n_features_to_select=None,
                                                                          step=1,
                                                                          verbose=0),
                                                            n_features_to_select=None,
                                                            step=1, verbose=0),
                                              n_features_to_select=None, step=1,
                                              verbose=0),
                                n_features_to_select=None, step=1, verbose=0),
                  n_features_to_select=None, step=1, verbose=0),
    n_features_to_select=None, step=1, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert rfe.__repr__() == expected
Example #6
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    
    rfe = RFE(classifier_model,number_of_features)
    transformed_train_data = rfe.fit_transform(train_data,train_labels)
    transformed_test_data = rfe.transform(test_data)
    
    return transformed_train_data,transformed_test_data 
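A short usage sketch with assumed synthetic data; any estimator exposing coef_ or feature_importances_ can be passed as classifier_model, and the positional RFE call above targets the older scikit-learn API.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=100, n_features=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
X_tr_sel, X_te_sel = recursiveFeatureSelector(LogisticRegression(max_iter=500),
                                              X_tr, y_tr, X_te, 3)
print(X_tr_sel.shape, X_te_sel.shape)  # both reduced to 3 columns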
Example #7
 def doRFE(self, X, y):
     # do RFE
     self.numFeatures = X.shape[1]
     svc = SVC(kernel="linear", C=self.C)
     rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
     rfe.fit(X, y)
     self.feature_importances_ = self._getImportances(rfe.ranking_)
def feature_sorting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]

	rotated = convert_list_to_matrix(features_values, rows, columns)
	# print rotated.shape
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	selector = RFE(estimator, 1, step=1)
	selector = selector.fit(rotated, scores)
	features_used = [i for i, x in enumerate(selector.support_) if x]  # indices of the selected features

	return selector.ranking_.tolist()
Example #9
def test_main():
    iris = load_iris()
    x, y = iris.data, iris.target
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 2 , step=1)
    selector = selector.fit(x, y)
    print selector.support_
Example #10
def ref(X, y, n_features_to_select=1, kernel='linear'):
    # specify the desired number of features
    # return the masks and ranking of selected features
    estimator = SVC(kernel=kernel, class_weight='balanced')
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return (selector)
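A brief usage sketch on the iris data; SVC and RFE are assumed to be imported as in the snippet, and the returned selector exposes support_ and ranking_ and can also transform X.

from sklearn.datasets import load_iris

iris = load_iris()
selector = ref(iris.data, iris.target, n_features_to_select=2)
print(selector.support_)   # boolean mask of the two kept features
print(selector.ranking_)   # rank 1 == selected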
Example #11
def recursiveFeatureSelection():
	X = np.array(trainingData, dtype=float)
	y = np.array(trainingDataLabels, dtype=float)
	svc = SVC("linear", 1)
	rfe = RFE(svc, 1, 1)
	rfe.fit(X, y)
	print rfe
def featSelect(label,trainSet,trainObs,cv,numFeat=5,SEED=34,name=''):
	from sklearn.feature_selection import RFE
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import roc_auc_score
	from numpy import zeros
	model = LogisticRegression(random_state=SEED)
	predCv = zeros(len(trainObs))
	rfe = RFE(model, numFeat, step=1)
	rfe.fit(trainSet,trainObs)
	vars = list(trainSet.columns[rfe.ranking_ == 1])
	auc = 0
	for i in range(1,max(rfe.ranking_)):
		for tr, vl in cv:
			model.fit(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += list(trainSet.columns[rfe.ranking_ == i])
	for v in vars:
		for tr, vl in cv:
			model.fit(trainSet[[x for x in vars if x != v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[[x for x in vars if x != v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars.remove(v)
	for v in [x for x in trainSet.columns if x not in vars]:
		for tr, vl in cv:
			model.fit(trainSet[vars + [v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + [v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += [v]
	print name,"Final AUC:  ",auc
	return {label: vars}
 def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
     expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file, ic50_file,drug,normalized=True,trimmed=True,threshold=None)
     scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
     step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
     selector = RFE(self.model,int(target_features),step=step_length)
     selector.fit(scikit_data,scikit_target)
     return [expression_frame.index[i] for i in xrange(0,len(expression_frame.index)) if selector.support_[i]]
Example #14
def LogReg(X_train, X_test, y_train, y_test, Min_N_Feat, Max_N_Feat, mask='None',weights='auto'):
#******************************************************************************

    from sklearn.feature_selection import RFE #import the library to rank features with recursive feature elimination
    from sklearn.linear_model import LogisticRegression as LogR #import the Logistic Regression module
    
    if mask=='None':
        mask = np.zeros((Max_N_Feat-Min_N_Feat+1,int(X_train.shape[1])),dtype='bool') #define the mask to obtain the list of selected features
    #end
    Pred_Train = np.zeros((int(max(y_train.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    Pred_Test = np.zeros((int(max(y_test.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    
    print 'Logistic Regression: Training...' #notify the user about the status of the process    
    for ift in range(Min_N_Feat,Max_N_Feat+1): #iterate across the maximum number of features    
        LogReg_obj = LogR(C=1e3, class_weight=weights) #create the logistic regression model
        if mask=='None':
            rfe = RFE(LogReg_obj, ift) #create the RFE model and select the number of attributes
            rfe = rfe.fit(X_train,y_train) #train the RFE (feature selection) model on the train data sets
            mask[ift-Min_N_Feat,:] = rfe.support_ #apply the best feature mask to the output mask
        #end
        LogReg_obj.fit(X_train[:,mask[ift-Min_N_Feat,:]], y_train) #fit the logistic model to the train data sets
        Pred_Train[:,ift-Min_N_Feat] = LogReg_obj.predict(X_train[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the train dataset
        Pred_Test[:,ift-Min_N_Feat] = LogReg_obj.predict(X_test[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the test dataset
        print 'Logistic Regression: Predicting...', 100*ift/(Max_N_Feat-Min_N_Feat+1), '%' #notify the user about the status of the process 
    #end
        
    print 'Logistic Regression: Completed!' #notify the user about the status of the process
        
    return Pred_Train, Pred_Test, mask
Example #15
def remove_one_feature(X, Y, names):
   lr = LinearRegression()
   rfe = RFE(lr, n_features_to_select=1)
   rfe.fit(X,Y)
   rank = (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
   print(rank)
   return rank[-1][1]
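A usage sketch on a toy regression problem with assumed data; LinearRegression and RFE are assumed to be imported as above, and the function returns the name paired with the worst ranking, i.e. the feature to drop next.

from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=80, n_features=5, noise=0.1, random_state=0)
names = ["feat_{}".format(i) for i in range(5)]
print("drop:", remove_one_feature(X_demo, y_demo, names))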
 def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
   model = svm.LinearSVC(class_weight='auto')
   rfe = RFE(model, numFeatures)
   rfe = rfe.fit(self.instances[:,subset], self.classes)
   # summarize the selection of the attributes
   # print(rfe.get_support(indices=True))
   # print(rfe.ranking_)
   return rfe.get_support(indices=True)
    def buildTree(self,depth):
        #Here, we define the parameters of our tree and use a feature selection algorithm (RFE) to pick out the strongest features.

        self.tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=depth, random_state=0)
        selector = RFE(self.tree, 2, step=1)
        selector = selector.fit(self.X_train, self.Y_train)
        selector.support_
        selector.ranking_
Example #18
def rec_feature_elim(data,num_features=17700):
    X = data.get_gene_exp_matrix()
    y = data.get_labels()
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)
    mask = [1 if x else 0 for x in selector.support_]
    print_genes_nonzero_coeff(data,mask)
Example #19
def build_model(x,y,no_features):
    """
    Build a linear regression model
    """
    model = LinearRegression(normalize=True,fit_intercept=True)
    rfe_model = RFE(estimator=model,n_features_to_select=no_features)
    rfe_model.fit(x,y)
    return rfe_model    
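A short usage sketch on synthetic data; the fitted RFE wrapper predicts directly and reports which features survived. (Note that LinearRegression's normalize argument only exists in older scikit-learn releases.)

from sklearn.datasets import make_regression

x_demo, y_demo = make_regression(n_samples=100, n_features=6, noise=0.5, random_state=1)
rfe_model = build_model(x_demo, y_demo, no_features=3)
print(rfe_model.support_)               # mask of the 3 retained features
print(rfe_model.score(x_demo, y_demo))  # R^2 of the reduced linear model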
Example #20
def recursive_feature_elimination(X, y):
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, 3)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
Example #21
def feature_selection(X, y):
	model = LR()
	rfe = RFE(model, 10)
	fit = rfe.fit(X, y)
	print("Num Features: %d") % fit.n_features_
	print("Selected Features: %s") % fit.support_
	print("Feature Ranking: %s") % fit.ranking_
	print fit.score(X, y)
	return fit.transform(X)
def quick_rfe(estimator, X, y):

    rfe = RFE(estimator = estimator, n_features_to_select = 1)
    rfe.fit(X,y)

    features = X.columns.tolist()
    sorted_features = [f for (rank, f) in sorted(zip(rfe.ranking_, features))]

    return sorted_features, rfe.ranking_
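A usage sketch with a pandas DataFrame, since quick_rfe reads feature names from X.columns; the LogisticRegression estimator is just an example choice, and RFE is assumed to be imported as above.

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
ordered, ranks = quick_rfe(LogisticRegression(max_iter=500), X_df, iris.target)
print(ordered)  # feature names from most to least important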
Example #23
class LogReg:

  """
  Initialization sets the objects model, vectorizer, labels, and corpus
  variables. Initialization also performs the initial training for the model
  and vectorizer using the given reviews.
  """
  def __init__(
      self,
      reviews,
      vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 1,
        ngram_range = (1, 2)),
      model = LogisticRegression()
      ):
    self.model = model
    self.vectorizer = vectorizer
    self.selector = RFE(self.model, step = 100, verbose = 100)

    corpus = []
    labels = []
    for review in reviews:
      corpus += [review[1]["text"]]
      labels += [review[0]]

    #setting variables for the object
    self.corpus = corpus
    self.labels = labels
    self.reviews = reviews

    X = self.vectorizer.fit_transform(self.corpus)
    self.feature_names = self.vectorizer.get_feature_names()
    y = self.labels
    for string in self.feature_names:
      print(string.encode("ascii", 'ignore'))

    #Training the model
    X_new = self.selector.fit_transform(X, self.labels)
    self.model.fit(X_new, self.labels)

  def classify_all(self, all_test_data):
    test_corpus = []
    y = []
    for review in all_test_data:
      test_corpus += [review[1]['text']]
      y += [review[0]]

    #Used transform instead of fit_transform
    #for test data so number of features will match
    X = self.vectorizer.transform(test_corpus)
    X_new = self.selector.transform(X)
    results = self.model.predict(X_new)
    categories = ["spring", "summer", "fall", "winter"]
    for i, category in enumerate(categories):
      top10 = np.argsort(self.model.coef_[i])[-20:]
      for j in top10:
        print("%s: %s" % (category, "".join(self.feature_names[j])))
    return results
Example #24
    def recurrciveFE(self, data):
        """
        Uses Recursive Feature Elimination to determine the right number of
        features before adding more of them leads to overfitting.
        It works by recursively removing attributes and building a model on
        those attributes that remain. It uses the model accuracy to identify
        which attributes (and combinations of attributes) contribute the
        most to predicting the target attribute.

        Parameters
        ----------
        data : DataFrame
            Input data, for which categorical variables should be converted
            response should be in 0 column, predictors in additional

        Returns
        -------
        out : Plot
            A plot of cross-validation score against the number of features,
            used to find the optimal number of features; the most important
            features are then printed to the console.
          
        """
        features_list = data.columns.values[1::]
        predictors = np.asarray(data.values[:, 1::])
        response = np.asarray(data.values[:, 0])
        estimator = SVC(kernel="linear")
        
        ### using cross-validation (RFECV) to determine the number of features
        rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(response, 2), scoring='accuracy')
        rfecv.fit(predictors, response)
        print("Optimal number of features : %d" % rfecv.n_features_)
        
        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.show()        
        
        ##label as optimal #of features
        noffeatures = rfecv.n_features_  
        
        ##use rfe to determine top features
        selector = RFE(estimator,noffeatures , step=1)
        selector = selector.fit(predictors, response)
        ##create index to get names
        index1 = np.where(selector.support_ == False)[0]
        index = np.argsort(selector.ranking_[index1])[::-1]
        feature_list_imp = features_list[index]

        for f in range(index.shape[0]):
            print("%d. feature %d (%s)" % (f + 1, index[f], feature_list_imp[index[f]]))
        print(selector.support_)
        print(selector.ranking_)    
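The cross-validated search above relies on scikit-learn's RFECV; a minimal, self-contained sketch of the same idea on assumed synthetic data (the per-step CV scores live in grid_scores_ in older releases and in cv_results_ in recent ones):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=120, n_features=10, n_informative=4,
                                     random_state=0)
rfecv = RFECV(SVC(kernel="linear"), step=1, cv=StratifiedKFold(2), scoring="accuracy")
rfecv.fit(X_demo, y_demo)
print("Optimal number of features:", rfecv.n_features_)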
def recursive_fs(X, y, clf, num_features):
    # create the RFE model and select num_features attributes
    rfe = RFE(clf, num_features)

    start = time.time()
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    end = time.time()
    print ("Training Time: " + str((end - start)) + "s")
    return rfe
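A usage sketch on the iris data; recursive_fs needs the time module and RFE in scope, clf must expose coef_ or feature_importances_ (a linear-kernel SVC here), and the positional RFE call targets the older scikit-learn API.

import time
from sklearn.datasets import load_iris
from sklearn.svm import SVC

iris = load_iris()
fitted = recursive_fs(iris.data, iris.target, SVC(kernel="linear"), 2)
print(fitted.support_)
print(fitted.ranking_)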
Example #26
 def feature_selection(estimator, x, y):
     """
         Feature support ranking
     """
     selector = RFE(estimator)
     selector.fit(x, y)
     print('RFE selection')
     print(pd.DataFrame(
         {'support': selector.support_, 'ranking': selector.ranking_},
         index=pig_three_feature.columns[1:]))
def trainDesicionTreeClassifier():
    modelDesicionTree=DecisionTreeClassifier(max_depth=5)
    # set the number of features to 10
    rfedecisiontree = RFE(modelDesicionTree, 10)
    rfedecisiontree = rfedecisiontree.fit(X_train, y_train)
    print("Feature Importance of Decision Tree Model")
    print(rfedecisiontree.support_)
    print(rfedecisiontree.ranking_)
    modelDesicionTree.fit(X_train, y_train)
    return modelDesicionTree
def rank(training_set, paradigm_lengths, category_description):

    transfomer = DataTransformer(training_set, paradigm_lengths, category_description)
    headlines, matrix, targets = transfomer.get_training_data_matrix(normalize=True)
    matrix = matrix.toarray()
    estimator = svm.SVC(C=1, kernel='linear')
    selector = RFE(estimator, 1, step=1)
    selector = selector.fit(matrix, targets)
    for i in range(len(headlines)):
        print headlines[i], selector.ranking_[i]
def trainLogisticRegression():
    modelLogisticRegression=LogisticRegression()
    #set the number of features to 10
    rfelogisticReg=RFE(modelLogisticRegression,10)
    rfelogisticReg=rfelogisticReg.fit(X_train, y_train)
    print("Feature Importance of Logistic Regression Model")
    print(rfelogisticReg.support_)
    print(rfelogisticReg.ranking_)
    modelLogisticRegression.fit(X_train, y_train)
    return modelLogisticRegression
Example #30
def select_features(X, y, clf=None, n_features=10):
    if not clf:
        clf = LogisticRegression()
    clf.fit(X, y)
    selector = RFE(clf, n_features_to_select=n_features)
    selector = selector.fit(X, y)
    features = np.array(range(57))
    # print selector.ranking_
    # print selector.support_
    return features[selector.support_]
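A quick usage sketch; the function hard-codes 57 column indices (presumably the spambase layout), so the assumed demo data below uses 57 features to match. LogisticRegression, RFE and numpy are assumed to be imported as in the snippet.

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=57, n_informative=8,
                                     random_state=0)
print(select_features(X_demo, y_demo, n_features=10))  # indices of the 10 selected columns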
Example #31
#########################
transfomers = [DummyTransformer, Normalizer(), StandardScaler()]
transfomers_cfg = {}
transfomers_cfg[DummyTransformer.func.__name__] = {}
transfomers_cfg[Normalizer.__name__] = dict(
    transfomer__norm=['l1', 'l2', 'max'])
transfomers_cfg[StandardScaler.__name__] = {}

###########################
####Dim Reducer, Feat Sel.#
###########################
reducers = [
    DummyTransformer,
    PCA(),
    GenericUnivariateSelect(),
    RFE(ExtraTreesRegressor())
]
reducers_cfg = {}
reducers_cfg[DummyTransformer.func.__name__] = {}
reducers_cfg[PCA.__name__] = dict(
    reducer__n_components=[],
    # reducer__whiten = [True, False],
    reducer__svd_solver=['auto'])
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(reducer__n_features_to_select=[],
                                  reducer__step=[0.1])
#########################
####### Models ##########
Example #32
#                                param_grid=param_grid,
#                                scoring='accuracy',
#                                cv=10,
#                                n_jobs=-1)
#    pred = estimators[k].predict(X_test)
#    print("%s Score: %0.02f" % (k, estimators[k].score(X_test, y_test)))
#    scores = cross_validation.cross_val_score(estimators[k], X, y, cv=5)
#    print("%s Cross Avg. Score: %0.02f (+/- %0.02f)" % (k, scores.mean(), scores.std() * 2))
#    end_time = datetime.datetime.now()
#    time_spend = end_time - start_time
#    print("%s Time: %0.02f" % (k, time_spend.total_seconds()))    
    
    
    
from sklearn.feature_selection import RFE
rfe = RFE(clf, 41)
clf1 = rfe.fit(X, y)
clf1.score(X, y)

yhat_test = clf1.predict_proba(X_test)
clf1.score(X_test, y_test)
#conduct grid search for the models:
#logistic regression
from sklearn.grid_search import GridSearchCV
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]        
tuned_parameters = [{'C': param_range}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
Example #33
# ### Fit a logistic regression model

# In[14]:


lgr = LogisticRegression(C=5)
lgr.fit(X,y)


# ### Select best features using RFE feature selection

# In[42]:


from sklearn.feature_selection import RFE
selector = RFE(lgr, 20)
selector.fit_transform(X, y)

ranks = selector.ranking_
X_names = encoded_df.columns.drop('good_bad')
# print sorted(map(lambda x: round(x, 4), selector.ranking_), names)


# In[110]:


rfe_features = np.column_stack((X_names, ranks))
rfe_cols = rfe_features[np.where(rfe_features[:,1]<10),:2][0]
rfe_col1 = rfe_cols[:,:1]
print(rfe_col1)
Example #34
#testFeatures, testLabels = transformDataset(test_sents)
corpus=[d for (d,c) in documents]
labels=[c for (d,c) in documents]
features=tfidf(corpus)

#print(features[1])

#features,labels=transformDataset(documents)
#vec = DictVectorizer()
#features_new=vec.fit_transform(features).toarray()
#print(features_new.shape)

print(len(features))
print(len(labels))
svc = SVC(kernel="linear", C=1)
clf = RFE(svc, 300, step=1)
fe = clf.fit_transform(features, labels)
#print(fit.scores_)
print(fe.shape)

# summarize selected features
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(fe,labels, test_size=0.33, random_state=42)


print("length of testLabels=",len(testLabels))
#for l in testLabels:
#    print("label=",l)
#print("features=",trainFeatures[1],"label=",trainLabels[1])
#featuresets = [(document_features(d), c) for (d,c) in documents]

var = 1
def classify_one_vs_many(df,
                         model_name,
                         model,
                         feature_to_class,
                         type_class,
                         type_0_class=None):
    GH_df_reduced_one_vs_many = df.copy()
    if type_0_class is None:
        others_df = GH_df_reduced_one_vs_many[(
            GH_df_reduced_one_vs_many[feature_to_class] != type_class)].copy()
        others_df.loc[:, 'ml_type'] = type_0_class = 'others'
    else:
        others_df = GH_df_reduced_one_vs_many[(
            GH_df_reduced_one_vs_many[feature_to_class] == type_0_class
        )].copy()
        others_df.loc[:, 'ml_type'] = type_0_class
    category_df = GH_df_reduced_one_vs_many[
        GH_df_reduced_one_vs_many[feature_to_class] == type_class].copy()
    category_df.loc[:, 'ml_type'] = type_class

    df_merged = pd.concat([others_df, category_df], ignore_index=True)

    # print df_merged.groupby(['ml_type','category'])['analizo_accm_mean'].count()
    X = df_merged.select_dtypes(include=[np.number])
    y = df_merged.loc[:, 'ml_type']
    test_size = 0.2
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size)
    ros = RandomOverSampler(random_state=0)
    if len(y_train.unique()) < 2:
        print('cannot fit for {}'.format(type_class))
        return None
    X_resampled, Y_resampled = ros.fit_resample(X_train, y_train)

    # print('Training target statistics: {}'.format(Counter(y)))
    if Counter(y)[type_class] == 1:
        print('cannot fit for {}'.format(type_class))
        return

    model.fit(X_train, y_train)
    # print model.score(X_test,y_test)
    rfe = RFE(model, 4)
    fit = rfe.fit(X_train, y_train)
    # print "Selected features : " + str(X.columns[fit.support_])
    pred = model.predict(X_test)
    #     print Counter(pred)
    #     df_accurarcy  = set_wrong_type(pred,y, df_merged,type_class)
    # calculate_accurarcy(df_accurarcy,pred,y,type_class)
    fpr = tpr = roc_auc = None
    t = True
    try:

        y_pred = model.predict_proba(X_test)[:, 1]
    except:
        t = False
    if t:
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=type_class)
        roc_auc = auc(fpr, tpr)
    f1 = f1_score(y_test, pred, pos_label=type_class)

    return {
        'model_name': model_name,
        'agent_type': agent_type,
        'feature_importance': fit,
        'model': model,
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc,
        'f1_score': f1,
        'class 0': type_0_class,
        'class 1': type_class
    }
def main(training_input_path, testing_input_path, output_path):

    # LOAD DATA
    train = pd.read_csv(training_input_path, header=0)
    test = pd.read_csv(testing_input_path, header=0)

    # PREPROCESSING
    le = LabelEncoder()
    train["ocean_proximity"] = le.fit_transform(train["ocean_proximity"])
    test["ocean_proximity"] = le.transform(test["ocean_proximity"])

    # SPLIT TRAINING AND TESTING DATA INTO X AND Y
    X_train = train.drop(columns="median_house_value")
    y_train = train['median_house_value']
    X_test = test.drop(columns="median_house_value")
    y_test = test['median_house_value']

    # CREATE A DF THAT EXCLUDES LATITUDE AND LONGITUDE
    X_train_featexc = X_train.drop(columns=["latitude", "longitude"])
    X_test_featexc = X_test.drop(columns=["latitude", "longitude"])

    # CREATE A DF THAT EXCLUDES LATITUDE, LONGITUDE, AND TOTAL BEDROOMS
    X_train_featexc_2 = X_train.drop(
        columns=["latitude", "longitude", "total_bedrooms"])
    X_test_featexc_2 = X_test.drop(
        columns=["latitude", "longitude", "total_bedrooms"])

    # APPLY SCALER
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_train_featexc = scaler.fit_transform(X_train_featexc)
    X_test_featexc = scaler.transform(X_test_featexc)
    X_train_featexc_2 = scaler.fit_transform(X_train_featexc_2)
    X_test_featexc_2 = scaler.transform(X_test_featexc_2)

    # LINEAR REGRESSION WITH FEATURE SELECTION - ALL FEATURES AVAILABLE
    lr_response = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train.shape[1] + 1, 1)):
        lr_response['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train, y_train)
        lr_response['train_error'].append(
            round(1 - rfe_lr.score(X_train, y_train), 3))
        lr_response['test_error'].append(
            round(1 - rfe_lr.score(X_test, y_test), 3))
    pd.DataFrame(lr_response).to_csv(output_path + 'lr_rfe_results_table.csv',
                                     index=False)

    # Plotting LR performance
    data = pd.DataFrame(lr_response).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")).properties(
            title="Recursive Feature Elimination Linear Regression Error",
            width=250,
            height=200)
    plot.save(output_path + 'LR_performace.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE AND LONGITUDE
    lr_response_exc = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train_featexc.shape[1] + 1, 1)):
        lr_response_exc['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train_featexc, y_train)
        lr_response_exc['train_error'].append(
            round(1 - rfe_lr.score(X_train_featexc, y_train), 3))
        lr_response_exc['test_error'].append(
            round(1 - rfe_lr.score(X_test_featexc, y_test), 3))
    pd.DataFrame(lr_response_exc).to_csv(output_path +
                                         'lr_rfe_results_table_exc_feats.csv',
                                         index=False)

    # Plotting LR performance excluding latitude and longitude
    data = pd.DataFrame(lr_response_exc).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude and Longitude",
        width=250,
        height=200)
    plot.save(output_path + 'LR_performace_exc_feats.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE, LONGITUDE, AND TOTAL BEDROOMS
    lr_response_exc_2 = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train_featexc_2.shape[1] + 1, 1)):
        lr_response_exc_2['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train_featexc_2, y_train)
        lr_response_exc_2['train_error'].append(
            round(1 - rfe_lr.score(X_train_featexc_2, y_train), 3))
        lr_response_exc_2['test_error'].append(
            round(1 - rfe_lr.score(X_test_featexc_2, y_test), 3))
    pd.DataFrame(lr_response_exc_2).to_csv(
        output_path + 'lr_rfe_results_table_exc_feats_2.csv', index=False)

    # Plotting LR performance excluding latitude and longitude
    data = pd.DataFrame(lr_response_exc_2).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude, Longitude, and Total Bedrooms",
        width=250,
        height=200)
    plot.save(output_path + 'LR_performace_exc_feats_2.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH FULL DATA INCLUSION
    knn_response = {'n_neighbours': [], 'train_error': [], 'test_error': []}

    for i in list(range(1, 20, 1)):
        knn_response['n_neighbours'].append(i)

        knn = KNeighborsRegressor(n_neighbors=i)
        knn.fit(X_train, y_train)
        knn_response['train_error'].append(
            round(1 - knn.score(X_train, y_train), 3))
        knn_response['test_error'].append(
            round(1 - knn.score(X_test, y_test), 3))
        predictions = knn.predict(X_test)
    pd.DataFrame(knn_response).to_csv(output_path + 'knn_results_table.csv',
                                      index=False)

    # ploting KNN performance
    data = pd.DataFrame(knn_response).melt(
        id_vars='n_neighbours', value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_neighbours:Q', title="Number of Nearest Neighbours"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")).properties(
            title="K-Nearest Neighbour Error when Varying K",
            width=250,
            height=200)
    plot.save(output_path + 'KNN_performace.png')

    # plotting KNN performance compared to actual values
    pred_estimates = pd.merge(
        pd.DataFrame(y_test),
        pd.DataFrame(predictions),
        left_index=True,
        right_index=True).rename(columns={
            0: "prediction",
            "median_house_value": "actual"
        })
    pred_estimates = pd.melt(pred_estimates,
                             value_vars=['actual', 'prediction'])
    plot = alt.Chart(pred_estimates).mark_bar(opacity=0.3).encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=40), title="Median House Value"),
        alt.Y('count()', stack=None, title="Count"),
        alt.Color('variable', title="Value")).properties(
            title="Histogram of Actual and Predicted Median House Values",
            width=400,
            height=200)
    plot.save(output_path + 'KNN_actual_vs_predicted.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH LATITUDE AND LONGITUDE EXCLUSION
    knn_response_exc = {
        'n_neighbours': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, 20, 1)):
        knn_response_exc['n_neighbours'].append(i)

        knn_exc = KNeighborsRegressor(n_neighbors=i)
        knn_exc.fit(X_train_featexc, y_train)
        knn_response_exc['train_error'].append(
            round(1 - knn_exc.score(X_train_featexc, y_train), 3))
        knn_response_exc['test_error'].append(
            round(1 - knn_exc.score(X_test_featexc, y_test), 3))
        predictions = knn_exc.predict(X_test_featexc)
    pd.DataFrame(knn_response_exc).to_csv(output_path +
                                          'knn_results_table_exc_feats.csv',
                                          index=False)

    # ploting KNN performance
    data = pd.DataFrame(knn_response_exc).melt(
        id_vars='n_neighbours', value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_neighbours:Q', title="Number of Nearest Neighbours"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "K-Nearest Neighbour Error when Varying K and Excluding Latitude and Longitude",
        width=250,
        height=200)
    plot.save(output_path + 'KNN_performace_exc_feats.png')

    # plotting KNN performance compared to actual values excluding latitude and longitude
    pred_estimates = pd.merge(
        pd.DataFrame(y_test),
        pd.DataFrame(predictions),
        left_index=True,
        right_index=True).rename(columns={
            0: "prediction",
            "median_house_value": "actual"
        })
    pred_estimates = pd.melt(pred_estimates,
                             value_vars=['actual', 'prediction'])
    plot = alt.Chart(pred_estimates).mark_bar(opacity=0.3).encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=40), title="Median House Value"),
        alt.Y('count()', stack=None, title="Count"),
        alt.Color('variable', title="Value")
    ).properties(
        title=
        "Histogram of Actual and Predicted Median House Values Excluding Latitude and Longitude",
        width=400,
        height=200)
    plot.save(output_path + 'KNN_actual_vs_predicted_exc_feats.png')

    # RANDOM FOREST REGRESSOR
    rfr = RandomForestRegressor(random_state=522)
    gs = GridSearchCV(rfr,
                      param_grid={
                          "max_depth": np.arange(5, 10, 1),
                          "min_samples_leaf": np.arange(1, 4, 1)
                      })
    gs.fit(X_train, y_train)
    rfr = gs.best_estimator_
    rfr_response = {
        'type': ['Random Forest Regressor'],
        'train_error': [round(1 - rfr.score(X_train, y_train), 3)],
        'test_error': [round(1 - rfr.score(X_test, y_test), 3)]
    }
    pd.DataFrame(rfr_response).to_csv(output_path + 'rfr_results_table.csv',
                                      index=False)

    # TESTING
    assert os.path.isfile(output_path + 'rfr_results_table.csv')
    assert os.path.isfile(output_path + 'KNN_performace.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table.csv')
    assert os.path.isfile(output_path + 'LR_performace.png')
    assert os.path.isfile(output_path + 'rfr_results_table.csv')
    assert os.path.isfile(output_path + 'knn_results_table_exc_feats.csv')
    assert os.path.isfile(output_path + 'KNN_performace_exc_feats.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table_exc_feats.csv')
    assert os.path.isfile(output_path + 'LR_performace_exc_feats.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table_exc_feats_2.csv')
    assert os.path.isfile(output_path + 'LR_performace_exc_feats_2.png')
    assert os.path.isfile(output_path + 'KNN_actual_vs_predicted.png')
    assert os.path.isfile(output_path +
                          'KNN_actual_vs_predicted_exc_feats.png')
for o in range(0, 10):

    #split into test and train set
    F_Training_Train, F_Training_Test, Label_Training_Train, Label_Training_Test = train_test_split(
        features_training, label_training, test_size=0.33)
    F_Test_Train, F_Test_Test, Label_Test_Train, Label_Test_Test = train_test_split(
        features_test, label_test, test_size=0.70)

    #classification
    #    clf = SVC(kernel='linear')
    #    clf = LogisticRegression()
    #    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf = GradientBoostingClassifier()

    #recursive feature elimination
    selector = RFE(clf, 1, step=1)
    Label_train = np.ravel(Label_Training_Train)
    Label_test = np.ravel(Label_Test_Test)
    selector = selector.fit(F_Training_Train, Label_train)
    rank = selector.ranking_
    Rank.append(rank)
    rank = np.asarray(rank)

    #create a list that contains the index numbers of the ranked features
    rankedlist = np.zeros((7, 1))

    #finding index of the ranked features and creating new training and test sets with respect to this ranking
    for m in range(1, 8):
        k = np.where(rank == m)
        rankedlist[m - 1] = k[0][0]
        F_Training_Train[:,
Example #38
#    acc = accuracy_score(y_test, y_pred)
#    print("Accuracy: {:.4%}".format(acc))
#    print(classification_report(y_test, y_pred, digits=4))

seeds = 1618  # set the seed of the pseudo-random number generator
confusion_matrixs = []

# FEATURE SELECTION. Method 1
model = ExtraTreesClassifier(random_state=seeds)
model.fit(x_tr, y_tr)
print(model.feature_importances_)

# FEATURE SELECTION. Method 2
model = LogisticRegression(random_state=seeds)
# create the RFE model and select 2 attributes
rfe = RFE(model, 2)
rfe = rfe.fit(x_tr, y_tr)
print(rfe.support_)
print(rfe.ranking_)

# Both feature-selection methods indicate a minimal contribution from components 2 and 4
# for the current stage (classifying the first column of the reference table). They could be excluded, but there are not that many features, so keep them for now.

seeds = 1618  # set the seed of the pseudo-random number generator
''' ==>  LOGISTIC REGRESSION
'''
# Often used for binary classification tasks, but multi-class classification via the "one-vs-all" scheme is also supported.
# An advantage of this algorithm is that, for every object, the output gives the probability of class membership.

model = LogisticRegression(random_state=1618, solver='lbfgs')
model.fit(x_tr, y_tr)
# print(cor_feature)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(solver='saga', penalty="l1", max_iter=1000), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
       '''

    print('--FEATURE SELECTION ON--', '\n')

    ##1) Run Feature Selection #######
    if fs_type == 1:
        #Stepwise Recursive Backwards Feature removal
        if binning == 0:
            clf = DecisionTreeClassifier(criterion='gini',
                                         splitter='best',
                                         max_depth=None,
                                         min_samples_split=3,
                                         min_samples_leaf=1,
                                         max_features=None,
                                         random_state=rand_st)
            sel = RFE(clf, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Decision Tree: ')
        if binning == 1:
            rgr = DecisionTreeClassifier(criterion='gini',
                                         splitter='best',
                                         max_depth=None,
                                         min_samples_split=3,
                                         min_samples_leaf=1,
                                         max_features=None,
                                         random_state=rand_st)
            sel = RFE(rgr, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Decision Tree: ')

        fit_mod = sel.fit(data_np, target_np)
        print(sel.ranking_)
        sel_idx = fit_mod.get_support()
Example #41

colname=loan_train.columns[:]
colname
from sklearn import tree
with open(r"XYZCorp_LendingData.txt", "w") as f:  
    f = tree.export_graphviz(model_Decision_tree, feature_names= colname[:-1],out_file=f)
#generate the file and upload the code in webgraphviz.com to plot the decision tree
    
# feature importance attribute of decision tree
    print(list(zip(colname,model_Decision_tree.feature_importances_)))



from sklearn.feature_selection import RFE 
rfe = RFE(classifier, 20)
model_rfe = rfe.fit(X_train, Y_train)
print("Num Features: ",model_rfe.n_features_)
print("Selected Features: ") 
print(list(zip(loan_train.columns, model_rfe.support_)))
print("Feature Ranking: ", model_rfe.ranking_) 

Y_pred=model_rfe.predict(X_test)


#predicting using the Random_Forest_Classifier
from sklearn.ensemble import RandomForestClassifier

model_RandomForest=RandomForestClassifier(500)

###
Example #42
    for val in l[:-1]:
        j += 1
        data[i][j] = float(val)

X, y = data[:, :-1], data[:, -1]
#y = np.array([y])
#y = np.reshape(y,(y.shape[1],y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

clf = SVC(gamma='auto', kernel='linear')
selector = RFE(clf, 100, step=1)
selector = selector.fit(X_train, y_train)

y_pred = selector.estimator_.predict(X_test.compress(selector.support_,
                                                     axis=1))

curr_pos = curr_neg = inc_pos = inc_neg = 0

for i in range(len(y_test)):
    if y_test[i] == 1:
        if y_pred[i] == 1:
            curr_pos += 1
        else:
            inc_neg += 1
    else:
        if y_pred[i] == 1:
##Link https://medium.com/@aneesha/recursive-feature-elimination-with-scikit-learn-3a2cbdf23fb7
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_
Example #44
model = LogisticRegression(solver='lbfgs', max_iter=500)
for i in range(1, df_X.shape[1]+1):
    fs = sorted_columns[0:i]
    df_X_selected = df_X[fs]
    scores = cross_val_score(model, df_X_selected, df_y, cv=5)
    print(fs.tolist())
    print(np.round(scores.mean(), 4))
    

######################################################################
# Backward elimination (Recursive Feature Elimination)
######################################################################
from sklearn.feature_selection import RFE

model = LogisticRegression(solver='lbfgs', max_iter=500)
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(df_X, df_y)
print("Num Features: %d" % fit.n_features_)
fs = df_X.columns[fit.support_].tolist()   # selected features
print("Selected Features: %s" % fs)
#print("Feature Ranking: %s" % fit.ranking_)

scores = cross_val_score(model, df_X[fs], df_y, cv=5)
print("Acc: "+str(scores.mean()))

######################################################################
# Forward selection 
######################################################################
# please install 'mlxtend' moudle  

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
Example #45
plt.xlabel('Feature1')
plt.ylabel('Frequency of Feature1')
plt.show()
plt.savefig('Frequency of Feature1')

#Feature Selection

data_final_vars = data.columns.values.tolist()
y = ['Sickness', 'ID']
Y = ['Sickness']
X = [i for i in data_final_vars if i not in y]
print(X, y)

logreg = LogisticRegression()

rfe = RFE(logreg, 20)
rfe = rfe.fit(data[X], data[Y])
print(rfe.support_)
print(rfe.ranking_)

cols = [
    "Feature15",
    "Feature23",
    "Feature43",
    "Feature45",
    "Feature64",
    "Feature87",
    "Feature115",
    "Feature127",
    "Feature162",
    "Feature163",
        for rfe_step_idx, rfe_step in enumerate(rfe_step_range):

            print(
                str(count_iter) + '/' + str(
                    len(seed_range) * len(nCoeffs_range) *
                    len(rfe_step_range)))

            for train_index, test_index in skf.split(features,
                                                     labels):  # external CV

                X_train, X_test = features[train_index], features[test_index]
                y_train, y_test = labels[train_index], labels[test_index]

                scaler = MinMaxScaler()
                sv = LinearSVC()
                rfe = RFE(sv, step=rfe_step, n_features_to_select=nCoeffs)

                # Defining scaler + rfe
                pipe = Pipeline([('std_scaler', scaler), ('fs', rfe)])

                clf = GridSearchCV(pipe,
                                   param_grid=param_grid,
                                   cv=inner_folds,
                                   scoring=scoring_fct,
                                   n_jobs=6)
                y_score = clf.fit(X_train, y_train)

                #print(clf.best_params_)

                best_model = clf.best_estimator_
                selector = best_model.named_steps['fs']
           pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"])
           ],axis = 1)


#### Feature Selection ####


## Feature Selection
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Model to Test
classifier = LogisticRegression()
# Select Best X Features
rfe = RFE(classifier, 20)
rfe = rfe.fit(X_train, y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
X_train.columns[rfe.support_]

# New Correlation Matrix
sn.set(style="white")

# Compute the correlation matrix
corr = X_train[X_train.columns[rfe.support_]].corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
Example #48
    return np.amax([val1, val2])

# K=2 TITANIC

pca_titanic = []
ica_titanic = []
rca_titanic = []
rfe_titanic = []

k=2
for dim in range(1, len(tit_cols)+1):
    pca = PCA(n_components=dim)
    ica = FastICA(n_components=dim)
    rca = GaussianRandomProjection(n_components=dim)
    logreg = LogisticRegression()
    rfe = RFE(logreg, n_features_to_select=dim)
    pca_X_train = pca.fit_transform(tit_X_train)
    ica_X_train = ica.fit_transform(tit_X_train)
    rca_X_train = rca.fit_transform(tit_X_train)
    rfe.fit(tit_X_train, tit_y_train)
    rfe_X_train = rfe.transform(tit_X_train)
    em = GaussianMixture(n_components=k)
    em.fit(pca_X_train)
    pca_em_X_train = em.predict(pca_X_train)
    em.fit(ica_X_train)
    ica_em_X_train = em.predict(ica_X_train)
    em.fit(rca_X_train)
    rca_em_X_train = em.predict(rca_X_train)
    em.fit(rfe_X_train)
    rfe_em_X_train = em.predict(rfe_X_train)
Example #49
print("finish")

names = [
    'alloy', 'class', 'delta', 'Hmix', 'Smix', 'Fi', 'RMS', 'VEC', 'r', 'Sc',
    'deltaHmixmax', 'deltaHmixmin', 'rootHmix', 'rootHmix0', 'rootHmix0+',
    'rootHmix0-'
]
data = pd.read_csv('合并数据集-去除重复.csv', header=0, names=names)
Y = data[["class"]]
X = pd.read_csv('generate_feature_1008.csv')
print("finish")

rfc = RandomForestClassifier()
#Y=Y.values
#Y= Y.reshape(c, )
rfe = RFE(estimator=rfc, n_features_to_select=1, step=1)
rfe.fit(X, Y)
ranking = rfe.ranking_
print("RFE ranking:\n", ranking)

list_ranking_index = []
list_ranking_importance = []
for i in range(len(ranking)):
    if ranking[i] <= 100:
        list_ranking_index.append(i)
        list_ranking_importance.append(ranking[i])
print("list_ranking_index:\n", list_ranking_index)
print("list_ranking_importance:\n", list_ranking_importance)
print('finish')

# write to CSV
Example #50
 def RFE(self,estimator,k):
     X=self.X
     Y=self.Y
     rfe=RFE(estimator,n_features_to_select=k)
     res=rfe.fit_transform(X,Y)
     return rfe,res
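Outside the wrapper class, the same fit-and-reduce pattern looks like this; a minimal sketch with assumed synthetic data and a LogisticRegression estimator.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X_demo, Y_demo = make_classification(n_samples=100, n_features=6, random_state=0)
rfe = RFE(LogisticRegression(max_iter=500), n_features_to_select=2)
reduced = rfe.fit_transform(X_demo, Y_demo)
print(reduced.shape)  # (100, 2)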
Example #51
X_train=training_data[['X1','X2','X3','X4','X5','X6','X7','X8']]
y_train=training_data[['Y']]


# step-1: create a cross-validation scheme
folds = KFold(n_splits = 10, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
hyper_params = [{'n_features_to_select': list(range(1, 9))}]


# step-3: perform grid search
# 3.1 specify model
lm = LinearRegression()
rfe = RFE(lm)             

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = rfe, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=False)  


lr= model_cv.fit(X_train,y_train)
y_predict=lr.predict(X_train)
print("The coefficient of determination(r squared) obtained from Linear Regression:\n")
###### score here returns the coefficient of determination (r squared); the closer to 1, the better the model
print(lr.score(X_train,y_train),"\n") 
Example #52
# for p in cv:
#     print p
# print len(cv)
# sys.exit()

''' Logistic regression '''
# w = 'balanced'
# clf = LogisticRegression(class_weight=w, penalty='l1', n_jobs=1)
# parameters = {'C': np.hstack((np.arange(0.0095, 0.02, 0.0001), np.arange(0.02, 0.601, 0.005)))}
# parameters = {'C': [0.005, 0.0075, 0.01]}
# parameters = {'C': [0.005]}

clf = Pipeline([
    # ('rfe', RFE(estimator=LogisticRegression(class_weight='balanced', penalty='l1', C=0.01), n_features_to_select=2,
    ('rfe', RFE(estimator=LogisticRegression(class_weight='balanced', penalty='l1', C=0.001,
                                             solver='liblinear'), n_features_to_select=2,
                step=0.1)),
    # liblinear is needed because the default lbfgs solver does not support the l1 penalty
    ('clf', LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear', n_jobs=1))
])

# parameters = {'clf__C': [0.005, 0.0075, 0.01]}
parameters = {'clf__C': [0.001, 0.01]}

K = 5
R = 1  # repeat cross-validation

auc_limit = 0.55
auc_hat = 1
step_remove = 1

# TODO
# TODO
Beispiel #53
0
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['IntermittentIceCover'])
# check the class counts in the oversampled data
print("length of oversampled data is ",len(os_data_X))
print("Number of annual lakes",len(os_data_y[os_data_y['IntermittentIceCover']==0]))
print("Number of intermittent lakes",len(os_data_y[os_data_y['IntermittentIceCover']==1]))
print("Proportion of annual lakes in oversampled data is ",len(os_data_y[os_data_y['IntermittentIceCover']==0])/len(os_data_X))
print("Proportion of intermittent lakes in oversampled data is ",len(os_data_y[os_data_y['IntermittentIceCover']==1])/len(os_data_X))

dt_vars=dt.columns.values.tolist()
y=['IntermittentIceCover']
X=[i for i in dt_vars if i not in y]

logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)
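
# as an illustrative extra step, the boolean support_ mask can be zipped with the
# column names to get a readable list of the features RFE kept
selected_cols = [c for c, keep in zip(os_data_X.columns, rfe.support_) if keep]
print(selected_cols)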

cols=[ "MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd', 'temp_range']
#cols=[ "Elevation_m", "MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd']

X=os_data_X[cols]
y=os_data_y['IntermittentIceCover']

logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)
def train_svm_k_fold_RFE(matrix,
                         target,
                         gamma,
                         linear=True,
                         nfeatures=15,
                         nsplits=10,
                         penalty="l2",
                         C=1,
                         multi_class="ovr",
                         kernel="rbf",
                         degree=3,
                         probability=False,
                         decision_function_shape="ovr"):
    scores = []
    confusion = []
    features = []
    parameters = {
        "Gamma": gamma,
        "Linear": linear,
        "C": C,
        "Kernel": kernel,
        "Degree": degree,
        "Average": [],
        "Scores": [],
        "Features": [],
        "Macro": [],
        "Micro": [],
        "Weighted": []
    }

    if (linear):
        best_svc = LinearSVC(penalty="l2", C=C, multi_class="ovr")
    else:
        best_svc = SVC(C=C,
                       kernel=kernel,
                       gamma=gamma,
                       degree=degree,
                       probability=probability,
                       decision_function_shape=decision_function_shape)
    cv = KFold(n_splits=nsplits, shuffle=False)  # random_state only applies when shuffle=True
    for train_index, test_index in cv.split(matrix):
        #print("Train Index: ", train_index, "\n")
        #print("Test Index: ", test_index)
        X_train, X_test, y_train, y_test = matrix[train_index], matrix[
            test_index], target[train_index], target[test_index]
        # ---------------- FEATURE SELECTION ------------------------

        rforest = RandomForestClassifier(random_state=101)
        rfe = RFE(estimator=rforest, n_features_to_select=nfeatures)
        rfe.fit(X_train, y_train)
        support = rfe.support_

        j = 0
        indexes = []
        for i in support:
            if i == True:
                indexes.append(j)
            j += 1

        x_train_fs = X_train[:, indexes]

        # --------------- TRAINING ------------------------------
        # Training the model
        best_svc.fit(x_train_fs, y_train)

        #--------------- TESTING -------------------------------
        # Getting the scores of the model on the test set
        svc_predictions = best_svc.predict(X_test[:, indexes])
        # getting accuracy
        scores.append(best_svc.score(X_test[:, indexes], y_test))
        # Macro
        parameters["Macro"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='macro'))
        # Micro
        parameters["Micro"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='micro'))
        # Weighted
        parameters["Weighted"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='weighted'))

        parameters["Features"].append(indexes)

        # getting confusion matrix
        confusion.append(confusion_matrix(y_test, svc_predictions))
    parameters["Scores"].append(scores)
    parameters["Average"] = np.average(scores)
    return (scores, confusion, parameters)
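
# A minimal usage sketch for the function above, on synthetic data; it assumes the
# imports the function itself relies on (numpy and the sklearn model/metric classes)
# are already in scope in the original module.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     n_informative=8, random_state=0)
scores_demo, confusion_demo, params_demo = train_svm_k_fold_RFE(
    X_demo, y_demo, gamma="scale", linear=True, nfeatures=10, nsplits=5)
print("Mean accuracy over folds:", params_demo["Average"])
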
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import (cross_val_score, KFold, cross_validate,
                                     train_test_split)
from sklearn.ensemble import StackingClassifier

data = load_wine()
y = data.target
X = data.data
stc = StandardScaler()
lenc = LabelEncoder()
columns = data.feature_names
df = pd.DataFrame(data=np.hstack(tup=(X, y.reshape(-1, 1))),
                  columns=np.hstack(tup=(columns, ["Class"])))
X_std = stc.fit_transform(df[columns])
pipesvm = Pipeline([("stc", stc), ("selection", RFE(LinearSVC())),
                    ("svm", SVC(kernel="linear"))])
pipelda = Pipeline([("stc", stc), ("svm", LinearDiscriminantAnalysis())])
estimators = [("LDA", pipelda), ("SVM", pipesvm)]
# Using stacked classifiers pays off in multiclass problems, since it can greatly
# improve class prediction by exploiting the predictive power that the individual
# estimators have for certain classes
stacking_classifier = StackingClassifier(estimators=estimators,
                                         final_estimator=GaussianNB())
print("Stacking estimators")
print(
    cross_val_score(X=df[columns],
                    y=y,
                    estimator=stacking_classifier,
                    cv=KFold(5)))
print("Only SVM")
print(
    cross_val_score(X=df[columns],
                    y=y,
                    estimator=pipesvm,
                    cv=KFold(5)))
Beispiel #56
0
del X['target']
del X['id']
X.describe()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y.values.tolist())
label = le.transform(Y)
print(list(le.classes_))
print(label)
noOfFeature = 45
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import timeit
start = timeit.default_timer()
clf = RandomForestClassifier()
rfe = RFE(clf, n_features_to_select=noOfFeature)
fit = rfe.fit(X, label)
print("Time take %.2f " % (timeit.default_timer() - start))
print(("Num Features: %d") % fit.n_features_)
print(("Selected Features: %s") % fit.support_)
print(("Feature Ranking: %s") % fit.ranking_)
features = []
for i, j in zip(X.columns, fit.support_):
    if j == True:
        features.append(str(i))
print(features)
from sklearn.model_selection import cross_val_score
import timeit
from xgboost import XGBClassifier
from statistics import mean
train_csv = pd.read_csv('../input/train.csv')
Beispiel #57
0
    X = fifa.drop('Overall', axis=1)
    y = fifa['Overall']

    lr_model = LinearRegression()

    rfe = RFE(lr_model, n_features_to_select=5)
    rfe.fit(X, y)

    mask = rfe.support_
    top_features = X.columns[mask]
    return list(top_features)


# In[117]:

q4()

# In[118]:

X = fifa.drop('Overall', axis=1)
y = fifa['Overall']
lr_model = LinearRegression()

rfe = RFE(lr_model, n_features_to_select=5)
rfe.fit(X, y)

plt.figure()
plt.title("Feature Importance")
pd.Series(rfe.estimator_.coef_,
          index=X.columns[rfe.support_]).sort_values().plot(kind='barh')
Beispiel #58
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.86
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                              max_depth=3,
                                              min_child_weight=16,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.25)),
    RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                       max_features=0.8,
                                       n_estimators=100),
        step=0.7000000000000001), GaussianNB())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Beispiel #59
0
def logitRegression(data):

    # Feature Selection
    logistic = LogisticRegression()
    rfe = RFE(logistic, n_features_to_select=18)
    rfe = rfe.fit(inputs, winners)  # inputs and winners are expected to exist at module level
    print(rfe.support_)
    print(rfe.ranking_)
    features = rfe.support_
    print("\nFeature index: " + str(np.where(features == True)))

    # creating testing and training set
    X_train, X_test, Y_train, Y_test = train_test_split(inputs,
                                                        winners,
                                                        test_size=0.33)

    # train scikit learn model
    clf = LogisticRegression()
    clf.fit(X_train, Y_train)
    score = round(clf.score(X_test, Y_test), 2)
    print('score Scikit learn: ', score)

    logistic.fit(inputs, winners)
    predicted = logistic.predict(X_test)
    print("Predicted: " + str(predicted))
    plt.figure()
    plt.plot(predicted)

    # Metrics: confusion matrix
    cm = metrics.confusion_matrix(Y_test, predicted)
    print(cm)

    # plot
    plt.figure(figsize=(2, 2))
    sns.heatmap(cm,
                annot=True,
                fmt=".3f",
                linewidths=.5,
                square=True,
                cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(score)
    plt.title(all_sample_title, size=15)
    plt.show()

    # cross validation: mean accuracy of a fresh logistic regression over 10 folds
    from sklearn.model_selection import cross_val_score
    modelCV = LogisticRegression()
    results = cross_val_score(modelCV, X_train, Y_train, cv=10, scoring='accuracy')
    print("\n\n 10-fold cross validation average accuracy: %.3f" %
          (results.mean()))
    print("\n")

    # precision
    print(classification_report(Y_test, predicted))

    # ROC
    logit_roc_auc = roc_auc_score(Y_test, logistic.predict_proba(X_test)[:, 1])  # use probabilities so the AUC matches the plotted curve
    fpr, tpr, thresholds = roc_curve(Y_test,
                                     logistic.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr,
             tpr,
             label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

    # train with selected features
    train_cols = [
        'Action 2', 'Action 9', 'Action 10', 'Action 11', 'Action 12',
        'Action 13', 'Action 14', 'Action 16', 'Action 18', 'Action 24',
        'Action 32', 'Action 41', 'Action 48', 'Action 53', 'Action 57',
        '2gram 10', '3gram 2', '3gram 9'
    ]
    X = data[train_cols]
    #print(X)
    y = data['Winner']
    logit_model = sm.Logit(y.astype(float), X.astype(float))
    result = logit_model.fit(method='bfgs')
    print(result.summary())
# In[ ]:

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

cols = [
    "Age", "Fare", "TravelAlone", "Pclass_1", "Pclass_2", "Embarked_C",
    "Embarked_S", "Sex_male", "IsMinor"
]
X = final_train[cols]
y = final_train['Survived']
# Build a logreg and compute the feature importances
model = LogisticRegression()
# create the RFE model and select 8 attributes
rfe = RFE(model, n_features_to_select=8)
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print('Selected features: %s' % list(X.columns[rfe.support_]))

# <a id="t4.1.2."></a>
# ### 4.1.2. Feature ranking with recursive feature elimination and cross-validation
#
# RFECV performs RFE in a cross-validation loop to find the optimal number of features. Below, recursive feature elimination is applied to a logistic regression, with the number of selected features tuned automatically by cross-validation.
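
# As a self-contained illustration (synthetic data, separate from the Titanic
# features used below), RFECV exposes the optimal feature count it finds as n_features_:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=300, n_features=10,
                                     n_informative=4, random_state=0)
rfecv_demo = RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=5,
                   scoring='accuracy')
rfecv_demo.fit(X_demo, y_demo)
print("Optimal number of features:", rfecv_demo.n_features_)
print("Selected feature mask:", rfecv_demo.support_)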

# In[ ]:

from sklearn.feature_selection import RFECV
# Create the RFE object and compute a cross-validated score.
# The "accuracy" scoring is proportional to the number of correct classifications
rfecv = RFECV(estimator=LogisticRegression(),