Code Example #1
File: feat_select.py Project: pagea/unstyle
def rank_features_rfe(X, y, featureset):
    """Rank features by their importance using recursive feature elimination.

    :param X: A training set of features.
    :param y: A target set (aka class labels for the training set)
    :param featureset: An instance of a featureset (such as Basic9Extractor())
    :rtype: An OrderedDict of the form {K: V}, with K being the feature name
    and V its RFE rank (1 = most important). The dictionary is sorted by rank.
    """

    # FIXME: Use an RBF SVC to rank features. It is likely that the "importance"
    # rankings derived from a LinearSVC are similar as an RBF kernel SVM, but,
    # for safety's sake, it is best to assume they are not.

    classifier = LinearSVC()
    classifier.fit(X, y)

    ranker = RFE(classifier, 1, step=1)
    ranker = ranker.fit(X, y)

    # Get the names of the feature columns.
    # FIXME: Duplicate code from rank_features. Make this its own function.
    feat_importance = OrderedDict()
    for index, func in enumerate(featureset.features):
        feat_importance[func] = ranker.ranking_[index]

    return OrderedDict(sorted(feat_importance.items(), key=lambda x: x[1]))
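A minimal usage sketch for the function above. It assumes the module-level imports the snippet relies on (OrderedDict, LinearSVC, RFE) and an older scikit-learn in which RFE still accepts n_features_to_select positionally; FakeExtractor is a stand-in for the project's Basic9Extractor and is purely hypothetical.

# Hypothetical usage sketch; FakeExtractor mimics the .features attribute of Basic9Extractor.
from collections import OrderedDict
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
import numpy as np

class FakeExtractor:
    features = ["avg_word_length", "comma_rate", "sentence_count"]

X = np.random.rand(20, 3)           # 20 documents, 3 stylometric features
y = np.random.randint(0, 2, 20)     # two candidate authors
ranked = rank_features_rfe(X, y, FakeExtractor())
print(ranked)                       # e.g. OrderedDict([('comma_rate', 1), ...])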
Code Example #2
def select_features(X, y, random_state, kernel='linear', C=1.0, num_attributes=3):
    """
    Uses Support Vector Classifier as the estimator to rank features
    with Recursive Feature Elimination.

    Parameters
    ----------
    X: A pandas.DataFrame. Attributes.
    y: A pandas.DataFrame. Labels.
    random_state: A RandomState instance. Used in SVC().
    kernel: A string. Used in SVC(). Default: "linear".
    C: A float. Used in SVC(). Default: 1.0.
    num_attributes: An int. The number of features to select in RFE. Default: 3.

    Returns
    -------
    A 3-tuple of (RFE, np.ndarray, np.ndarray)
    model: An RFE instance.
    columns: Selected features.
    ranking: The feature ranking. Selected features are assigned rank 1.
    """

    rfe = RFE(svm.SVC(C, kernel, random_state=random_state), num_attributes)
    model = rfe.fit(X, y.values.ravel())
    columns = list()

    for idx, label in enumerate(X):
        if rfe.support_[idx]:
            columns.append(label)

    ranking = rfe.ranking_

    return model, columns, ranking
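A short usage sketch for select_features, assuming the module imports pandas, numpy, sklearn.svm, and RFE, and again an older scikit-learn where RFE's second positional argument is the number of features to keep; the column names and data below are made up.

# Hypothetical call with toy data; column names a-e are invented for illustration.
import numpy as np
import pandas as pd
from sklearn.utils import check_random_state

X = pd.DataFrame(np.random.rand(40, 5), columns=list("abcde"))
y = pd.DataFrame(np.random.randint(0, 2, 40), columns=["label"])
model, columns, ranking = select_features(X, y, random_state=check_random_state(0))
print(columns)   # names of the 3 columns RFE kept
print(ranking)   # selected features carry rank 1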
Code Example #3
File: example.py Project: jonnagel/worksampleanon1
def get_best_cols(df):
    """ select best cols with RFE """

    # factors
    cols_to_factor = [
        pd.get_dummies(df.X7),
        pd.get_dummies(df.X8),
        pd.get_dummies(df.X9),
        pd.get_dummies(df.X11),
        pd.get_dummies(df.X12),
        pd.get_dummies(df.X14),
        pd.get_dummies(df.X32),
    ]
    # dataframe with factors blown out
    df_f = pd.concat(cols_to_factor, axis=1)
    # numerics
    RFE_col_list = ["X4", "X5", "X6", "X13", "X21", "X22", "X29", "X30", "X31"]
    # dataframe with numerics
    df_n = df.loc[:, RFE_col_list]
    X = np.asarray(df_n)
    X = StandardScaler().fit_transform(X)
    # add in factors
    X = np.concatenate([X, np.asarray(df_f)], axis=1)
    # leave y alone
    y = df.X1
    # I don't like to guess yes this is only linear relationships
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 40, step=2)
    selector = selector.fit(X, y)
    # index for the merged df: numeric columns first, then the dummy columns
    df_index = df_n.columns.append(df_f.columns)
    best_cols = df_index[selector.support_]
    return best_cols
Code Example #4
File: scoring_task.py Project: mfomicheva/metric-dev
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
Code Example #5
File: test_pprint.py Project: chrisfilo/scikit-learn
def test_deeply_nested():
    # Render a deeply nested estimator
    rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
    expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
                                                                                                                     class_weight=None,
                                                                                                                     dual=False,
                                                                                                                     fit_intercept=True,
                                                                                                                     intercept_scaling=1,
                                                                                                                     l1_ratio=None,
                                                                                                                     max_iter=100,
                                                                                                                     multi_class='warn',
                                                                                                                     n_jobs=None,
                                                                                                                     penalty='l2',
                                                                                                                     random_state=None,
                                                                                                                     solver='warn',
                                                                                                                     tol=0.0001,
                                                                                                                     verbose=0,
                                                                                                                     warm_start=False),
                                                                                        n_features_to_select=None,
                                                                                        step=1,
                                                                                        verbose=0),
                                                                          n_features_to_select=None,
                                                                          step=1,
                                                                          verbose=0),
                                                            n_features_to_select=None,
                                                            step=1, verbose=0),
                                              n_features_to_select=None, step=1,
                                              verbose=0),
                                n_features_to_select=None, step=1, verbose=0),
                  n_features_to_select=None, step=1, verbose=0),
    n_features_to_select=None, step=1, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert rfe.__repr__() == expected
Code Example #6
File: wine.py Project: rupakc/UCI-Data-Analysis
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    
    rfe = RFE(classifier_model,number_of_features)
    transformed_train_data = rfe.fit_transform(train_data,train_labels)
    transformed_test_data = rfe.transform(test_data)
    
    return transformed_train_data,transformed_test_data 
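A usage sketch for the helper above, under the assumption that RFE is imported in the surrounding module and that the scikit-learn version in use still accepts the feature count as RFE's second positional argument; the iris data is only an illustration.

# Hypothetical usage on the iris data; any estimator exposing coef_ or feature_importances_ works.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

iris = load_iris()
X_tr, X_te, y_tr, y_te = train_test_split(iris.data, iris.target, random_state=0)
train_2d, test_2d = recursiveFeatureSelector(LogisticRegression(max_iter=200), X_tr, y_tr, X_te, 2)
print(train_2d.shape, test_2d.shape)   # both reduced to 2 columns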
Code Example #7
File: RFEWrapper.py Project: jbjorne/CAMDA2014
 def doRFE(self, X, y):
     # do RFE
     self.numFeatures = X.shape[1]
     svc = SVC(kernel="linear", C=self.C)
     rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
     rfe.fit(X, y)
     self.feature_importances_ = self._getImportances(rfe.ranking_)
Code Example #8
def feature_sorting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]

	rotated = convert_list_to_matrix(features_values, rows, columns)
	# print rotated.shape
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	selector = RFE(estimator, 1, step=1)  # n_features_to_select must be at least 1; only ranking_ is used below
	selector = selector.fit(rotated, scores)
	features_used = [i for i, x in enumerate(selector.support_) if x]

	return selector.ranking_.tolist()
Code Example #9
File: main.py Project: chaluemwut/featureselection
def test_main():
    iris = load_iris()
    x, y = iris.data, iris.target
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 2, step=1)
    selector = selector.fit(x, y)
    print(selector.support_)
Code Example #10
File: feature_selection.py Project: wtgme/ohsn
def ref(X, y, n_features_to_select=1, kernel='linear'):
    # specify the desired number of features
    # return the masks and ranking of selected features
    estimator = SVC(kernel=kernel, class_weight='balanced')
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return (selector)
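The function above returns the fitted selector itself, so the mask and ranking are read off its attributes. A small sketch, assuming SVC and RFE are already imported in the module; the data and feature names below are hypothetical.

# Hypothetical follow-up: turn the selector's boolean mask into feature names.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=60, n_features=6, n_informative=3, random_state=0)
names = np.array(["f0", "f1", "f2", "f3", "f4", "f5"])

selector = ref(X, y, n_features_to_select=2)
print(names[selector.support_])   # the 2 features RFE kept
print(selector.ranking_)          # rank 1 marks the selected features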
Code Example #11
File: testing.py Project: quentinperrot/stayalert
def recursiveFeatureSelection():
	X = np.array(trainingData, dtype=float)
	y = np.array(trainingDataLabels, dtype=float)
	svc = SVC(kernel="linear", C=1)
	rfe = RFE(svc, 1, 1)
	rfe.fit(X, y)
	print(rfe)
Code Example #12
def featSelect(label,trainSet,trainObs,cv,numFeat=5,SEED=34,name=''):
	from sklearn.feature_selection import RFE
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import roc_auc_score
	from numpy import zeros
	model = LogisticRegression(random_state=SEED)
	predCv = zeros(len(trainObs))
	rfe = RFE(model, numFeat, step=1)
	rfe.fit(trainSet,trainObs)
	vars = list(trainSet.columns[rfe.ranking_ == 1])
	auc = 0
	for i in range(1,max(rfe.ranking_)):
		for tr, vl in cv:
			model.fit(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += list(trainSet.columns[rfe.ranking_ == i])
	for v in vars:
		for tr, vl in cv:
			model.fit(trainSet[[x for x in vars if x != v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[[x for x in vars if x != v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars.remove(v)
	for v in [x for x in trainSet.columns if x not in vars]:
		for tr, vl in cv:
			model.fit(trainSet[vars + [v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + [v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += [v]
	print name,"Final AUC:  ",auc
	return {label: vars}
Code Example #13
 def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
     expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file, ic50_file,drug,normalized=True,trimmed=True,threshold=None)
     scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
     step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
     selector = RFE(self.model,int(target_features),step=step_length)
     selector.fit(scikit_data,scikit_target)
     return [expression_frame.index[i] for i in xrange(0,len(expression_frame.index)) if selector.support_[i]]
Code Example #14
File: MLearn.py Project: RiB-/Sandy_Damages_NYC
def LogReg(X_train, X_test, y_train, y_test, Min_N_Feat, Max_N_Feat, mask='None',weights='auto'):
#******************************************************************************

    from sklearn.feature_selection import RFE #import the library to rank features with recursive feature elimination
    from sklearn.linear_model import LogisticRegression as LogR #import the Logistic Regression module
    
    if mask=='None':
        mask = np.zeros((Max_N_Feat-Min_N_Feat+1,int(X_train.shape[1])),dtype='bool') #define the mask to obtain the list of selected features
    #end
    Pred_Train = np.zeros((int(max(y_train.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    Pred_Test = np.zeros((int(max(y_test.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    
    print 'Logistic Regression: Training...' #notify the user about the status of the process    
    for ift in range(Min_N_Feat,Max_N_Feat+1): #iterate across the maximum number of features    
        LogReg_obj = LogR(C=1e3, class_weight=weights) #create the logistic regression model
        if mask=='None':
            rfe = RFE(LogReg_obj, ift) #create the RFE model and select the number of attributes
            rfe = rfe.fit(X_train,y_train) #train the RFE (feature selection) model on the train data sets
            mask[ift-Min_N_Feat,:] = rfe.support_ #apply the best feature mask to the output mask
        #end
        LogReg_obj.fit(X_train[:,mask[ift-Min_N_Feat,:]], y_train) #fit the logistic model to the train data sets
        Pred_Train[:,ift-Min_N_Feat] = LogReg_obj.predict(X_train[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the train dataset
        Pred_Test[:,ift-Min_N_Feat] = LogReg_obj.predict(X_test[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the test dataset
        print 'Logistic Regression: Predicting...', 100*ift/(Max_N_Feat-Min_N_Feat+1), '%' #notify the user about the status of the process 
    #end
        
    print 'Logistic Regression: Completed!' #notify the user about the status of the process
        
    return Pred_Train, Pred_Test, mask
Code Example #15
File: selectfeature.py Project: yjia3h695/insight
def remove_one_feature(X, Y, names):
   lr = LinearRegression()
   rfe = RFE(lr, n_features_to_select=1)
   rfe.fit(X,Y)
   rank = (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
   print(rank)
   return rank[-1][1]
Code Example #16
 def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
   model = svm.LinearSVC(class_weight='balanced')
   rfe = RFE(model, numFeatures)
   rfe = rfe.fit(self.instances[:,subset], self.classes)
   # summarize the selection of the attributes
   # print(rfe.get_support(indices=True))
   # print(rfe.ranking_)
   return rfe.get_support(indices=True)
Code Example #17
    def buildTree(self,depth):
        #Here, we define the parameters of our tree and use a feature selection algorithm (RFE) to pick out the strongest features.

        self.tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=depth, random_state=0)
        selector = RFE(self.tree, 2, step=1)
        selector = selector.fit(self.X_train, self.Y_train)
        selector.support_
        selector.ranking_
Code Example #18
File: backwards.py Project: susanctu/229project
def rec_feature_elim(data,num_features=17700):
    X = data.get_gene_exp_matrix()
    y = data.get_labels()
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)
    mask = [1 if x else 0 for x in selector.support_]  # support_ holds numpy bools; test truthiness, not identity
    print_genes_nonzero_coeff(data,mask)
Code Example #19
def build_model(x,y,no_features):
    """
    Build a linear regression model
    """
    model = LinearRegression(normalize=True,fit_intercept=True)
    rfe_model = RFE(estimator=model,n_features_to_select=no_features)
    rfe_model.fit(x,y)
    return rfe_model    
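A regression-flavoured sketch of how build_model might be called, assuming LinearRegression and RFE are imported in the module and an older scikit-learn in which LinearRegression still accepts normalize=True; the diabetes data is illustrative only.

# Hypothetical usage on the diabetes regression data.
from sklearn.datasets import load_diabetes

data = load_diabetes()
rfe_model = build_model(data.data, data.target, no_features=4)
print(rfe_model.n_features_)                     # 4 features retained
print(rfe_model.score(data.data, data.target))   # R^2 on the reduced feature set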
Code Example #20
File: ML_Algs.py Project: hyb148/python_code
def recursive_feature_elimination(X, y):
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, 3)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
Code Example #21
File: predict.py Project: BIDS-collaborative/EDAM
def feature_selection(X, y):
	model = LR()
	rfe = RFE(model, 10)
	fit = rfe.fit(X, y)
	print("Num Features: %d") % fit.n_features_
	print("Selected Features: %s") % fit.support_
	print("Feature Ranking: %s") % fit.ranking_
	print fit.score(X, y)
	return fit.transform(X)
Code Example #22
def quick_rfe(estimator, X, y):

    rfe = RFE(estimator = estimator, n_features_to_select = 1)
    rfe.fit(X,y)

    features = X.columns.tolist()
    sorted_features = [f for (rank, f) in sorted(zip(rfe.ranking_, features))]

    return sorted_features, rfe.ranking_
Code Example #23
File: logreg.py Project: kzh4ng/CS1671-Final_Project
class LogReg:

  """
  Initialization sets the objects model, vectorizer, labels, and corpus
  variables. Initialization also performs the initial training for the model
  and vectorizer using the given reviews.
  """
  def __init__(
      self,
      reviews,
      vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 1,
        ngram_range = (1, 2)),
      model = LogisticRegression()
      ):
    self.model = model
    self.vectorizer = vectorizer
    self.selector = RFE(self.model, step = 100, verbose = 100)

    corpus = []
    labels = []
    for review in reviews:
      corpus += [review[1]["text"]]
      labels += [review[0]]

    #setting variables for the object
    self.corpus = corpus
    self.labels = labels
    self.reviews = reviews

    X = self.vectorizer.fit_transform(self.corpus)
    self.feature_names = self.vectorizer.get_feature_names()
    y = self.labels
    for string in self.feature_names:
      print(string.encode("ascii", 'ignore'))

    #Training the model
    X_new = self.selector.fit_transform(X, self.labels)
    self.model.fit(X_new, self.labels)

  def classify_all(self, all_test_data):
    test_corpus = []
    y = []
    for review in all_test_data:
      test_corpus += [review[1]['text']]
      y += [review[0]]

    #Used transform instead of fit_transform
    #for test data so number of features will match
    X = self.vectorizer.transform(test_corpus)
    X_new = self.selector.transform(X)
    results = self.model.predict(X_new)
    categories = ["spring", "summer", "fall", "winter"]
    for i, category in enumerate(categories):
      top10 = np.argsort(self.model.coef_[i])[-20:]
      for j in top10:
        print("%s: %s" % (category, "".join(self.feature_names[j])))
    return results
Code Example #24
    def recurrciveFE(self, data):
        """
        Uses Recursive Feature Elimination to determine the right number of
        features before adding more of them leads to overfitting.
        It works by recursively removing attributes and building a model on
        those attributes that remain. It uses the model accuracy to identify
        which attributes (and combinations of attributes) contribute the
        most to predicting the target attribute.

        Parameters
        ----------
        data : DataFrame
            Input data, for which categorical variables should be converted
            response should be in 0 column, predictors in additional

        Returns
        -------
        out : Plot
            A plot with the number of optimal number of features,
            which is then used to determine features of most
            importance returned in a print out to console
          
        """
        features_list = data.columns.values[1::]
        predictors = np.asarray(data.values[:, 1::])
        response = np.asarray(data.values[:, 0])
        estimator = SVC(kernel="linear")
        
        ### use cross-validation to determine the optimal number of features
        rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(response, 2), scoring='accuracy')
        rfecv.fit(predictors, response)
        print("Optimal number of features : %d" % rfecv.n_features_)
        
        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.show()        
        
        ##label as optimal #of features
        noffeatures = rfecv.n_features_  
        
        ##use rfe to determine top features
        selector = RFE(estimator,noffeatures , step=1)
        selector = selector.fit(predictors, response)
        ## create an index to map RFE ranks back to feature names
        index = np.argsort(selector.ranking_)
        feature_list_imp = features_list[index]

        for f in range(index.shape[0]):
            print("%d. feature %d (%s)" % (f + 1, index[f], feature_list_imp[f]))
        print(selector.support_)
        print(selector.ranking_)    
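The method above mixes the RFE and RFECV APIs; for reference, here is a standalone sketch of the cross-validated variant it appears to be aiming for, written against the current scikit-learn API (StratifiedKFold without labels) and using only attributes that are stable across versions. The synthetic data is purely illustrative.

# Standalone RFECV sketch on hypothetical data.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=8, n_informative=4, random_state=0)
rfecv = RFECV(SVC(kernel="linear"), step=1, cv=StratifiedKFold(2), scoring="accuracy")
rfecv.fit(X, y)
print("Optimal number of features : %d" % rfecv.n_features_)
print(rfecv.support_)   # mask of the features kept at the optimum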
Code Example #25
def recursive_fs(X, y, clf, num_features):
    # create the RFE model and select 3 attributes
    rfe = RFE(clf, num_features)

    start = time.time()
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    end = time.time()
    print ("Training Time: " + str((end - start)) + "s")
    return rfe
Code Example #26
File: c10.py Project: 3774257/abu
 def feature_selection(estimator, x, y):
     """
         Support ranking
     """
     selector = RFE(estimator)
     selector.fit(x, y)
     print('RFE selection')
     print(pd.DataFrame(
         {'support': selector.support_, 'ranking': selector.ranking_},
         index=pig_three_feature.columns[1:]))
Code Example #27
def trainDesicionTreeClassifier():
    modelDesicionTree=DecisionTreeClassifier(max_depth=5)
    # set the number of features to 10
    rfedecisiontree = RFE(modelDesicionTree, 10)
    rfedecisiontree = rfedecisiontree.fit(X_train, y_train)
    print("Feature Importance of Decision Tree Model")
    print(rfedecisiontree.support_)
    print(rfedecisiontree.ranking_)
    modelDesicionTree.fit(X_train, y_train)
    return modelDesicionTree
Code Example #28
def rank(training_set, paradigm_lengths, category_description):

    transfomer = DataTransformer(training_set, paradigm_lengths, category_description)
    headlines, matrix, targets = transfomer.get_training_data_matrix(normalize=True)
    matrix = matrix.toarray()
    estimator = svm.SVC(C=1, kernel='linear')
    selector = RFE(estimator, 1, step=1)
    selector = selector.fit(matrix, targets)
    for i in range(len(headlines)):
        print headlines[i], selector.ranking_[i]
Code Example #29
def trainLogisticRegression():
    modelLogisticRegression=LogisticRegression()
    #set the number of features to 10
    rfelogisticReg=RFE(modelLogisticRegression,10)
    rfelogisticReg=rfelogisticReg.fit(X_train, y_train)
    print("Feature Importance of Logistic Regression Model")
    print(rfelogisticReg.support_)
    print(rfelogisticReg.ranking_)
    modelLogisticRegression.fit(X_train, y_train)
    return modelLogisticRegression
Code Example #30
def select_features(X, y, clf=None, n_features=10):
    if not clf:
        clf = LogisticRegression()
    clf.fit(X, y)
    selector = RFE(clf, n_features_to_select=n_features)
    selector = selector.fit(X, y)
    features = np.array(range(57))
    # print selector.ranking_
    # print selector.support_
    return features[selector.support_]
Code Example #31
#########################
transfomers = [DummyTransformer, Normalizer(), StandardScaler()]
transfomers_cfg = {}
transfomers_cfg[DummyTransformer.func.__name__] = {}
transfomers_cfg[Normalizer.__name__] = dict(
    transfomer__norm=['l1', 'l2', 'max'])
transfomers_cfg[StandardScaler.__name__] = {}

###########################
####Dim Reducer, Feat Sel.#
###########################
reducers = [
    DummyTransformer,
    PCA(),
    GenericUnivariateSelect(),
    RFE(ExtraTreesRegressor())
]
reducers_cfg = {}
reducers_cfg[DummyTransformer.func.__name__] = {}
reducers_cfg[PCA.__name__] = dict(
    reducer__n_components=[],
    # reducer__whiten = [True, False],
    reducer__svd_solver=['auto'])
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(reducer__n_features_to_select=[],
                                  reducer__step=[0.1])
#########################
####### Models ##########
Code Example #32
#                                param_grid=param_grid,
#                                scoring='accuracy',
#                                cv=10,
#                                n_jobs=-1)
#    pred = estimators[k].predict(X_test)
#    print("%s Score: %0.02f" % (k, estimators[k].score(X_test, y_test)))
#    scores = cross_validation.cross_val_score(estimators[k], X, y, cv=5)
#    print("%s Cross Avg. Score: %0.02f (+/- %0.02f)" % (k, scores.mean(), scores.std() * 2))
#    end_time = datetime.datetime.now()
#    time_spend = end_time - start_time
#    print("%s Time: %0.02f" % (k, time_spend.total_seconds()))    
    
    
    
from sklearn.feature_selection import RFE
rfe = RFE(clf, 41)
clf1 = rfe.fit(X, y)
clf1.score(X, y)

yhat_test = clf1.predict_proba(X_test)
clf1.score(X_test, y_test)
#conduct grid search for the models:
#logistic regression
from sklearn.grid_search import GridSearchCV
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]        
tuned_parameters = [{'C': param_range}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
Code Example #33
# ### Fit a logistic regression model

# In[14]:


lgr = LogisticRegression(C=5)
lgr.fit(X,y)


# ### Select best features using RFE feature selection

# In[42]:


from sklearn.feature_selection import RFE
selector = RFE(lgr, 20)
selector.fit_transform(X, y)

ranks = selector.ranking_
X_names = encoded_df.columns.drop('good_bad')
# print sorted(map(lambda x: round(x, 4), selector.ranking_), names)


# In[110]:


rfe_features = np.column_stack((X_names, ranks))
rfe_cols = rfe_features[np.where(rfe_features[:,1]<10),:2][0]
rfe_col1 = rfe_cols[:,:1]
print(rfe_col1)
Code Example #34
#testFeatures, testLabels = transformDataset(test_sents)
corpus=[d for (d,c) in documents]
labels=[c for (d,c) in documents]
features=tfidf(corpus)

#print(features[1])

#features,labels=transformDataset(documents)
#vec = DictVectorizer()
#features_new=vec.fit_transform(features).toarray()
#print(features_new.shape)

print(len(features))
print(len(labels))
svc = SVC(kernel="linear", C=1)
clf = RFE(svc, 300, step=1)
fe = clf.fit_transform(features, labels)
#print(fit.scores_)
print(fe.shape)

# summarize selected features
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(fe,labels, test_size=0.33, random_state=42)


print("length of testLabels=",len(testLabels))
#for l in testLabels:
#    print("label=",l)
#print("features=",trainFeatures[1],"label=",trainLabels[1])
#featuresets = [(document_features(d), c) for (d,c) in documents]

var = 1
Code Example #35
def classify_one_vs_many(df,
                         model_name,
                         model,
                         feature_to_class,
                         type_class,
                         type_0_class=None):
    GH_df_reduced_one_vs_many = df.copy()
    if type_0_class is None:
        others_df = GH_df_reduced_one_vs_many[(
            GH_df_reduced_one_vs_many[feature_to_class] != type_class)].copy()
        others_df.loc[:, 'ml_type'] = type_0_class = 'others'
    else:
        others_df = GH_df_reduced_one_vs_many[(
            GH_df_reduced_one_vs_many[feature_to_class] == type_0_class
        )].copy()
        others_df.loc[:, 'ml_type'] = type_0_class
    category_df = GH_df_reduced_one_vs_many[
        GH_df_reduced_one_vs_many[feature_to_class] == type_class].copy()
    category_df.loc[:, 'ml_type'] = type_class

    df_merged = pd.concat([others_df, category_df], ignore_index=True)

    # print df_merged.groupby(['ml_type','category'])['analizo_accm_mean'].count()
    X = df_merged.select_dtypes(include=[np.number])
    y = df_merged.loc[:, 'ml_type']
    test_size = 0.2
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size)
    ros = RandomOverSampler(random_state=0)
    if len(y_train.unique()) < 2:
        print('cannot fit for {}'.format(type_class))
        return None
    X_resampled, Y_resampled = ros.fit_resample(X_train, y_train)

    # print('Training target statistics: {}'.format(Counter(y)))
    if Counter(y)[type_class] == 1:
        print('cannot fit for {}'.format(type_class))
        return

    model.fit(X_train, y_train)
    # print model.score(X_test,y_test)
    rfe = RFE(model, 4)
    fit = rfe.fit(X_train, y_train)
    # print "Selected features : " + str(X.columns[fit.support_])
    pred = model.predict(X_test)
    #     print Counter(pred)
    #     df_accurarcy  = set_wrong_type(pred,y, df_merged,type_class)
    # calculate_accurarcy(df_accurarcy,pred,y,type_class)
    fpr = tpr = roc_auc = None
    t = True
    try:

        y_pred = model.predict_proba(X_test)[:, 1]
    except:
        t = False
    if t:
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=type_class)
        roc_auc = auc(fpr, tpr)
    f1 = f1_score(y_test, pred, pos_label=type_class)

    return {
        'model_name': model_name,
        'agent_type': agent_type,
        'feature_importance': fit,
        'model': model,
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc,
        'f1_score': f1,
        'class 0': type_0_class,
        'class 1': type_class
    }
def main(training_input_path, testing_input_path, output_path):

    # LOAD DATA
    train = pd.read_csv(training_input_path, header=0)
    test = pd.read_csv(testing_input_path, header=0)

    # PREPROCESSING
    le = LabelEncoder()
    train["ocean_proximity"] = le.fit_transform(train["ocean_proximity"])
    test["ocean_proximity"] = le.transform(test["ocean_proximity"])

    # SPLIT TRAINING AND TESTING DATA INTO X AND Y
    X_train = train.drop(columns="median_house_value")
    y_train = train['median_house_value']
    X_test = test.drop(columns="median_house_value")
    y_test = test['median_house_value']

    # CREATE A DF THAT EXCLUDES LATITUDE AND LONGITUDE
    X_train_featexc = X_train.drop(columns=["latitude", "longitude"])
    X_test_featexc = X_test.drop(columns=["latitude", "longitude"])

    # CREATE A DF THAT EXCLUDES LATITUDE, LONGITUDE, AND TOTAL BEDROOMS
    X_train_featexc_2 = X_train.drop(
        columns=["latitude", "longitude", "total_bedrooms"])
    X_test_featexc_2 = X_test.drop(
        columns=["latitude", "longitude", "total_bedrooms"])

    # APPLY SCALER
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_train_featexc = scaler.fit_transform(X_train_featexc)
    X_test_featexc = scaler.transform(X_test_featexc)
    X_train_featexc_2 = scaler.fit_transform(X_train_featexc_2)
    X_test_featexc_2 = scaler.transform(X_test_featexc_2)

    # LINEAR REGRESSION WITH FEATURE SELECTION - ALL FEATURES AVAILABLE
    lr_response = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train.shape[1] + 1, 1)):
        lr_response['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train, y_train)
        lr_response['train_error'].append(
            round(1 - rfe_lr.score(X_train, y_train), 3))
        lr_response['test_error'].append(
            round(1 - rfe_lr.score(X_test, y_test), 3))
    pd.DataFrame(lr_response).to_csv(output_path + 'lr_rfe_results_table.csv',
                                     index=False)

    # Plotting LR performance
    data = pd.DataFrame(lr_response).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")).properties(
            title="Recursive Feature Elimination Linear Regression Error",
            width=250,
            height=200)
    plot.save(output_path + 'LR_performace.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE AND LONGITUDE
    lr_response_exc = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train_featexc.shape[1] + 1, 1)):
        lr_response_exc['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train_featexc, y_train)
        lr_response_exc['train_error'].append(
            round(1 - rfe_lr.score(X_train_featexc, y_train), 3))
        lr_response_exc['test_error'].append(
            round(1 - rfe_lr.score(X_test_featexc, y_test), 3))
    pd.DataFrame(lr_response_exc).to_csv(output_path +
                                         'lr_rfe_results_table_exc_feats.csv',
                                         index=False)

    # Plotting LR performance excluding latitude and longitude
    data = pd.DataFrame(lr_response_exc).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude and Longitude",
        width=250,
        height=200)
    plot.save(output_path + 'LR_performace_exc_feats.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE, LONGITUDE, AND TOTAL BEDROOMS
    lr_response_exc_2 = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, X_train_featexc_2.shape[1] + 1, 1)):
        lr_response_exc_2['n_features_to_select'].append(i)

        rfe_lr = RFE(LinearRegression(), n_features_to_select=i)
        rfe_lr.fit(X_train_featexc_2, y_train)
        lr_response_exc_2['train_error'].append(
            round(1 - rfe_lr.score(X_train_featexc_2, y_train), 3))
        lr_response_exc_2['test_error'].append(
            round(1 - rfe_lr.score(X_test_featexc_2, y_test), 3))
    pd.DataFrame(lr_response_exc_2).to_csv(
        output_path + 'lr_rfe_results_table_exc_feats_2.csv', index=False)

    # Plotting LR performance excluding latitude and longitude
    data = pd.DataFrame(lr_response_exc_2).melt(
        id_vars='n_features_to_select',
        value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_features_to_select:Q', title="Number of Features Selected"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude, Longitude, and Total Bedrooms",
        width=250,
        height=200)
    plot.save(output_path + 'LR_performace_exc_feats_2.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH FULL DATA INCLUSION
    knn_response = {'n_neighbours': [], 'train_error': [], 'test_error': []}

    for i in list(range(1, 20, 1)):
        knn_response['n_neighbours'].append(i)

        knn = KNeighborsRegressor(n_neighbors=i)
        knn.fit(X_train, y_train)
        knn_response['train_error'].append(
            round(1 - knn.score(X_train, y_train), 3))
        knn_response['test_error'].append(
            round(1 - knn.score(X_test, y_test), 3))
        predictions = knn.predict(X_test)
    pd.DataFrame(knn_response).to_csv(output_path + 'knn_results_table.csv',
                                      index=False)

    # ploting KNN performance
    data = pd.DataFrame(knn_response).melt(
        id_vars='n_neighbours', value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_neighbours:Q', title="Number of Nearest Neighbours"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")).properties(
            title="K-Nearest Neighbour Error when Varying K",
            width=250,
            height=200)
    plot.save(output_path + 'KNN_performace.png')

    # plotting KNN performance compared to actual values
    pred_estimates = pd.merge(
        pd.DataFrame(y_test),
        pd.DataFrame(predictions),
        left_index=True,
        right_index=True).rename(columns={
            0: "prediction",
            "median_house_value": "actual"
        })
    pred_estimates = pd.melt(pred_estimates,
                             value_vars=['actual', 'prediction'])
    plot = alt.Chart(pred_estimates).mark_bar(opacity=0.3).encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=40), title="Median House Value"),
        alt.Y('count()', stack=None, title="Count"),
        alt.Color('variable', title="Value")).properties(
            title="Histogram of Actual and Predicted Median House Values",
            width=400,
            height=200)
    plot.save(output_path + 'KNN_actual_vs_predicted.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH LATITUDE AND LONGITUDE EXCLUSION
    knn_response_exc = {
        'n_neighbours': [],
        'train_error': [],
        'test_error': []
    }

    for i in list(range(1, 20, 1)):
        knn_response_exc['n_neighbours'].append(i)

        knn_exc = KNeighborsRegressor(n_neighbors=i)
        knn_exc.fit(X_train_featexc, y_train)
        knn_response_exc['train_error'].append(
            round(1 - knn_exc.score(X_train_featexc, y_train), 3))
        knn_response_exc['test_error'].append(
            round(1 - knn_exc.score(X_test_featexc, y_test), 3))
        predictions = knn_exc.predict(X_test_featexc)
    pd.DataFrame(knn_response_exc).to_csv(output_path +
                                          'knn_results_table_exc_feats.csv',
                                          index=False)

    # ploting KNN performance
    data = pd.DataFrame(knn_response_exc).melt(
        id_vars='n_neighbours', value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X('n_neighbours:Q', title="Number of Nearest Neighbours"),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")
    ).properties(
        title=
        "K-Nearest Neighbour Error when Varying K and Excluding Latitude and Longitude",
        width=250,
        height=200)
    plot.save(output_path + 'KNN_performace_exc_feats.png')

    # plotting KNN performance compared to actual values excluding latitude and longitude
    pred_estimates = pd.merge(
        pd.DataFrame(y_test),
        pd.DataFrame(predictions),
        left_index=True,
        right_index=True).rename(columns={
            0: "prediction",
            "median_house_value": "actual"
        })
    pred_estimates = pd.melt(pred_estimates,
                             value_vars=['actual', 'prediction'])
    plot = alt.Chart(pred_estimates).mark_bar(opacity=0.3).encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=40), title="Median House Value"),
        alt.Y('count()', stack=None, title="Count"),
        alt.Color('variable', title="Value")
    ).properties(
        title=
        "Histogram of Actual and Predicted Median House Values Excluding Latitude and Longitude",
        width=400,
        height=200)
    plot.save(output_path + 'KNN_actual_vs_predicted_exc_feats.png')

    # RANDOM FOREST REGRESSOR
    rfr = RandomForestRegressor(random_state=522)
    gs = GridSearchCV(rfr,
                      param_grid={
                          "max_depth": np.arange(5, 10, 1),
                          "min_samples_leaf": np.arange(1, 4, 1)
                      })
    gs.fit(X_train, y_train)
    rfr = gs.best_estimator_
    rfr_response = {
        'type': ['Random Forest Regressor'],
        'train_error': [round(1 - rfr.score(X_train, y_train), 3)],
        'test_error': [round(1 - rfr.score(X_test, y_test), 3)]
    }
    pd.DataFrame(rfr_response).to_csv(output_path + 'rfr_results_table.csv',
                                      index=False)

    # TESTING
    assert os.path.isfile(output_path + 'rfr_results_table.csv')
    assert os.path.isfile(output_path + 'KNN_performace.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table.csv')
    assert os.path.isfile(output_path + 'LR_performace.png')
    assert os.path.isfile(output_path + 'rfr_results_table.csv')
    assert os.path.isfile(output_path + 'knn_results_table_exc_feats.csv')
    assert os.path.isfile(output_path + 'KNN_performace_exc_feats.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table_exc_feats.csv')
    assert os.path.isfile(output_path + 'LR_performace_exc_feats.png')
    assert os.path.isfile(output_path + 'lr_rfe_results_table_exc_feats_2.csv')
    assert os.path.isfile(output_path + 'LR_performace_exc_feats_2.png')
    assert os.path.isfile(output_path + 'KNN_actual_vs_predicted.png')
    assert os.path.isfile(output_path +
                          'KNN_actual_vs_predicted_exc_feats.png')
for o in range(0, 10):

    #split into test and train set
    F_Training_Train, F_Training_Test, Label_Training_Train, Label_Training_Test = train_test_split(
        features_training, label_training, test_size=0.33)
    F_Test_Train, F_Test_Test, Label_Test_Train, Label_Test_Test = train_test_split(
        features_test, label_test, test_size=0.70)

    #classification
    #    clf = SVC(kernel='linear')
    #    clf = LogisticRegression()
    #    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf = GradientBoostingClassifier()

    #recursive feature elimination
    selector = RFE(clf, 1, step=1)
    Label_train = np.ravel(Label_Training_Train)
    Label_test = np.ravel(Label_Test_Test)
    selector = selector.fit(F_Training_Train, Label_train)
    rank = selector.ranking_
    Rank.append(rank)
    rank = np.asarray(rank)

    # create a list that contains the index numbers of the ranked features
    rankedlist = np.zeros((7, 1))

    #finding index of the ranked features and creating new training and test sets with respect to this ranking
    for m in range(1, 8):
        k = np.where(rank == m)
        rankedlist[m - 1] = k[0][0]
        F_Training_Train[:,
Code Example #38
#    acc = accuracy_score(y_test, y_pred)
#    print("Accuracy: {:.4%}".format(acc))
#    print(classification_report(y_test, y_pred, digits=4))

seeds = 1618  # set the pseudo-random number generator seed
confusion_matrixs = []

# FEATURE SELECTION. Method 1
model = ExtraTreesClassifier(random_state=seeds)
model.fit(x_tr, y_tr)
print(model.feature_importances_)

# FEATURE SELECTION. Method 2
model = LogisticRegression(random_state=seeds)
# create the RFE model and select 3 attributes
rfe = RFE(model, 2)
rfe = rfe.fit(x_tr, y_tr)
print(rfe.support_)
print(rfe.ranking_)

# Both feature selection methods point to a minimal contribution from components 2 and 4
# for the current stage (classifying the first column of the reference table). They could be dropped, but there are not many features here, so keep them for now.

seeds = 1618  # set the pseudo-random number generator seed
''' ==>  LOGISTIC REGRESSION
'''
# Often used for binary classification, but multiclass classification via the "one-vs-all" method is also possible.
# An advantage of this algorithm is that, for each object, the output is a probability of class membership.

model = LogisticRegression(random_state=1618, solver='lbfgs')
model.fit(x_tr, y_tr)
Code Example #39
# print(cor_feature)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(solver='saga', penalty="l1", max_iter=1000), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
Code Example #40
       '''

    print('--FEATURE SELECTION ON--', '\n')

    ##1) Run Feature Selection #######
    if fs_type == 1:
        #Stepwise Recursive Backwards Feature removal
        if binning == 0:
            clf = DecisionTreeClassifier(criterion='gini',
                                         splitter='best',
                                         max_depth=None,
                                         min_samples_split=3,
                                         min_samples_leaf=1,
                                         max_features=None,
                                         random_state=rand_st)
            sel = RFE(clf, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Decision Tree: ')
        if binning == 1:
            rgr = DecisionTreeClassifier(criterion='gini',
                                         splitter='best',
                                         max_depth=None,
                                         min_samples_split=3,
                                         min_samples_leaf=1,
                                         max_features=None,
                                         random_state=rand_st)
            sel = RFE(rgr, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Decision Tree: ')

        fit_mod = sel.fit(data_np, target_np)
        print(sel.ranking_)
        sel_idx = fit_mod.get_support()
Code Example #41

colname=loan_train.columns[:]
colname
from sklearn import tree
with open(r"XYZCorp_LendingData.txt", "w") as f:  
    f = tree.export_graphviz(model_Decision_tree, feature_names= colname[:-1],out_file=f)
#generate the file and upload the code in webgraphviz.com to plot the decision tree
    
# feature importance attribute of decision tree
    print(list(zip(colname,model_Decision_tree.feature_importances_)))



from sklearn.feature_selection import RFE 
rfe = RFE(classifier, 20)
model_rfe = rfe.fit(X_train, Y_train)
print("Num Features: ",model_rfe.n_features_)
print("Selected Features: ") 
print(list(zip(loan_train.columns, model_rfe.support_)))
print("Feature Ranking: ", model_rfe.ranking_) 

Y_pred=model_rfe.predict(X_test)


#predicting using the Random_Forest_Classifier
from sklearn.ensemble import RandomForestClassifier

model_RandomForest=RandomForestClassifier(500)

###
Code Example #42
File: rfe.py Project: Dariwala/Succinylation
    for val in l[:-1]:
        j += 1
        data[i][j] = float(val)

X, y = data[:, :-1], data[:, -1]
#y = np.array([y])
#y = np.reshape(y,(y.shape[1],y.shape[0]))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

clf = SVC(gamma='auto', kernel='linear')
selector = RFE(clf, 100, step=1)
selector = selector.fit(X_train, y_train)

y_pred = selector.estimator_.predict(X_test.compress(selector.support_,
                                                     axis=1))

curr_pos = curr_neg = inc_pos = inc_neg = 0

for i in range(len(y_test)):
    if y_test[i] == 1:
        if y_pred[i] == 1:
            curr_pos += 1
        else:
            inc_neg += 1
    else:
        if y_pred[i] == 1:
Code Example #43
##Link https://medium.com/@aneesha/recursive-feature-elimination-with-scikit-learn-3a2cbdf23fb7
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_
Code Example #44
model = LogisticRegression(solver='lbfgs', max_iter=500)
for i in range(1, df_X.shape[1]+1):
    fs = sorted_columns[0:i]
    df_X_selected = df_X[fs]
    scores = cross_val_score(model, df_X_selected, df_y, cv=5)
    print(fs.tolist())
    print(np.round(scores.mean(), 4))
    

######################################################################
# Backward elimination (Recursive Feature Elimination)
######################################################################
from sklearn.feature_selection import RFE

model = LogisticRegression(solver='lbfgs', max_iter=500)
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(df_X, df_y)
print("Num Features: %d" % fit.n_features_)
fs = df_X.columns[fit.support_].tolist()   # selected features
print("Selected Features: %s" % fs)
#print("Feature Ranking: %s" % fit.ranking_)

scores = cross_val_score(model, df_X[fs], df_y, cv=5)
print("Acc: "+str(scores.mean()))

######################################################################
# Forward selection 
######################################################################
# please install the 'mlxtend' module

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
Code Example #45
plt.xlabel('Feature1')
plt.ylabel('Frequency of Feature1')
plt.show()
plt.savefig('Frequency of Feature1')

#Feature Selection

data_final_vars = data.columns.values.tolist()
y = ['Sickness', 'ID']
Y = ['Sickness']
X = [i for i in data_final_vars if i not in y]
print(X, y)

logreg = LogisticRegression()

rfe = RFE(logreg, 20)
rfe = rfe.fit(data[X], data[Y])
print(rfe.support_)
print(rfe.ranking_)

cols = [
    "Feature15",
    "Feature23",
    "Feature43",
    "Feature45",
    "Feature64",
    "Feature87",
    "Feature115",
    "Feature127",
    "Feature162",
    "Feature163",
Code Example #46
        for rfe_step_idx, rfe_step in enumerate(rfe_step_range):

            print(
                str(count_iter) + '/' + str(
                    len(seed_range) * len(nCoeffs_range) *
                    len(rfe_step_range)))

            for train_index, test_index in skf.split(features,
                                                     labels):  # external CV

                X_train, X_test = features[train_index], features[test_index]
                y_train, y_test = labels[train_index], labels[test_index]

                scaler = MinMaxScaler()
                sv = LinearSVC()
                rfe = RFE(sv, step=rfe_step, n_features_to_select=nCoeffs)

                # Defining scaler + rfe
                pipe = Pipeline([('std_scaler', scaler), ('fs', rfe)])

                clf = GridSearchCV(pipe,
                                   param_grid=param_grid,
                                   cv=inner_folds,
                                   scoring=scoring_fct,
                                   n_jobs=6)
                y_score = clf.fit(X_train, y_train)

                #print(clf.best_params_)

                best_model = clf.best_estimator_
                selector = best_model.named_steps['fs']
Code Example #47
           pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"])
           ],axis = 1)


#### Feature Selection ####


## Feature Selection
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Model to Test
classifier = LogisticRegression()
# Select Best X Features
rfe = RFE(classifier, 20)
rfe = rfe.fit(X_train, y_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
X_train.columns[rfe.support_]

# New Correlation Matrix
sn.set(style="white")

# Compute the correlation matrix
corr = X_train[X_train.columns[rfe.support_]].corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
Code Example #48
    return np.amax([val1, val2])

# K=2 TITANIC

pca_titanic = []
ica_titanic = []
rca_titanic = []
rfe_titanic = []

k=2
for dim in range(1, len(tit_cols)+1):
    pca = PCA(n_components=dim)
    ica = FastICA(n_components=dim)
    rca = GaussianRandomProjection(n_components=dim)
    logreg = LogisticRegression()
    rfe = RFE(logreg, n_features_to_select=dim)
    pca_X_train = pca.fit_transform(tit_X_train)
    ica_X_train = ica.fit_transform(tit_X_train)
    rca_X_train = rca.fit_transform(tit_X_train)
    rfe.fit(tit_X_train, tit_y_train)
    rfe_X_train = rfe.transform(tit_X_train)
    em = GaussianMixture(n_components=k)
    em.fit(pca_X_train)
    pca_em_X_train = em.predict(pca_X_train)
    em.fit(ica_X_train)
    ica_em_X_train = em.predict(ica_X_train)
    em.fit(rca_X_train)
    rca_em_X_train = em.predict(rca_X_train)
    em.fit(rfe_X_train)
    rfe_em_X_train = em.predict(rfe_X_train)
Code Example #49
File: HEA-RFE.py Project: CrayT/my-code
print("finish")

names = [
    'alloy', 'class', 'delta', 'Hmix', 'Smix', 'Fi', 'RMS', 'VEC', 'r', 'Sc',
    'deltaHmixmax', 'deltaHmixmin', 'rootHmix', 'rootHmix0', 'rootHmix0+',
    'rootHmix0-'
]
data = pd.read_csv('合并数据集-去除重复.csv', header=0, names=names)
Y = data[["class"]]
X = pd.read_csv('generate_feature_1008.csv')
print("finish")

rfc = RandomForestClassifier()
#Y=Y.values
#Y= Y.reshape(c, )
rfe = RFE(estimator=rfc, n_features_to_select=1, step=1)
rfe.fit(X, Y)
ranking = rfe.ranking_
print("RFE ranking:\n", ranking)

list_ranking_index = []
list_ranking_importance = []
for i in range(len(ranking)):
    if ranking[i] <= 100:
        list_ranking_index.append(i)
        list_ranking_importance.append(ranking[i])
print("list_ranking_index:\n", list_ranking_index)
print("list_ranking_importance:\n", list_ranking_importance)
print('finish')

# write to CSV
Code Example #50
 def RFE(self,estimator,k):
     X=self.X
     Y=self.Y
     rfe=RFE(estimator,n_features_to_select=k)
     res=rfe.fit_transform(X,Y)
     return rfe,res
Code Example #51
X_train=training_data[['X1','X2','X3','X4','X5','X6','X7','X8']]
y_train=training_data[['Y']]


# step-1: create a cross-validation scheme
folds = KFold(n_splits = 10, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
hyper_params = [{'n_features_to_select': list(range(1, 9))}]


# step-3: perform grid search
# 3.1 specify model
lm = LinearRegression()
rfe = RFE(lm)             

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = rfe, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=False)  


lr = model_cv.fit(X_train, y_train)
y_predict = lr.predict(X_train)
print("The coefficient of determination (r squared) obtained from Linear Regression:\n")
# score() returns the coefficient of determination; the closer to 1, the better the fit
print(lr.score(X_train, y_train), "\n")
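A small follow-up (a sketch, assuming the fitted model_cv above and pandas available) shows how to read the winning number of features and the per-setting CV scores out of the grid search:

import pandas as pd

# Sketch: inspect the grid-search results to see which n_features_to_select won.
print("Best number of features:", model_cv.best_params_['n_features_to_select'])
cv_results = pd.DataFrame(model_cv.cv_results_)
print(cv_results[['param_n_features_to_select', 'mean_test_score']])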
Code example #52
# for p in cv:
#     print p
# print len(cv)
# sys.exit()

''' Logistic regression '''
# w = 'balanced'
# clf = LogisticRegression(class_weight=w, penalty='l1', n_jobs=1)
# parameters = {'C': np.hstack((np.arange(0.0095, 0.02, 0.0001), np.arange(0.02, 0.601, 0.005)))}
# parameters = {'C': [0.005, 0.0075, 0.01]}
# parameters = {'C': [0.005]}

clf = Pipeline([
    # ('rfe', RFE(estimator=LogisticRegression(class_weight='balanced', penalty='l1', C=0.01), n_features_to_select=2,
    ('rfe', RFE(estimator=LogisticRegression(class_weight='balanced', penalty='l1', C=0.001,
                                             solver='liblinear'),  # l1 penalty needs the liblinear (or saga) solver
                n_features_to_select=2, step=0.1)),
    ('clf', LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear', n_jobs=1))
])

# parameters = {'clf__C': [0.005, 0.0075, 0.01]}
parameters = {'clf__C': [0.001, 0.01]}

K = 5
R = 1  # repeat cross-validation

auc_limit = 0.55
auc_hat = 1
step_remove = 1

# TODO
Code example #53
File: logreg.py  Project: graliuce/warming
os_data_X, os_data_y = os.fit_resample(X_train, y_train)   # fit_sample was renamed fit_resample in recent imbalanced-learn versions
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['IntermittentIceCover'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of annual lakes",len(os_data_y[os_data_y['IntermittentIceCover']==0]))
print("Number of intermittent lakes",len(os_data_y[os_data_y['IntermittentIceCover']==1]))
print("Proportion of annual lakes in oversampled data is ",len(os_data_y[os_data_y['IntermittentIceCover']==0])/len(os_data_X))
print("Proportion of intermittent lakes in oversampled data is ",len(os_data_y[os_data_y['IntermittentIceCover']==1])/len(os_data_X))

dt_vars=dt.columns.values.tolist()
y=['IntermittentIceCover']
X=[i for i in dt_vars if i not in y]

logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

cols=[ "MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd', 'temp_range']
#cols=[ "Elevation_m", "MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd']

X=os_data_X[cols]
y=os_data_y['IntermittentIceCover']

logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)
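The excerpt ends right after the split; a plausible continuation (a sketch, not from the original file) fits a logistic regression on the reduced feature set and reports hold-out accuracy:

# Sketch of the assumed next step: train on the split created above and evaluate.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
print("Test accuracy: {:.3f}".format(logreg.score(X_test, y_test)))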
Code example #54
def train_svm_k_fold_RFE(matrix,
                         target,
                         gamma,
                         linear=True,
                         nfeatures=15,
                         nsplits=10,
                         penalty="l2",
                         C=1,
                         multi_class="ovr",
                         kernel="rbf",
                         degree=3,
                         probability=False,
                         decision_function_shape="ovr"):
    scores = []
    confusion = []
    features = []
    parameters = {
        "Gamma": gamma,
        "Linear": linear,
        "C": C,
        "Kernel": kernel,
        "Degree": degree,
        "Average": [],
        "Scores": [],
        "Features": [],
        "Macro": [],
        "Micro": [],
        "Weighted": []
    }

    if (linear):
        best_svc = LinearSVC(penalty=penalty, C=C, multi_class=multi_class)  # use the arguments instead of hard-coded values
    else:
        best_svc = SVC(C=C,
                       kernel=kernel,
                       gamma=gamma,
                       degree=degree,
                       probability=probability,
                       decision_function_shape=decision_function_shape)
    cv = KFold(n_splits=nsplits, shuffle=False)  # random_state is not allowed when shuffle=False in recent scikit-learn
    for train_index, test_index in cv.split(matrix):
        #print("Train Index: ", train_index, "\n")
        #print("Test Index: ", test_index)
        X_train, X_test, y_train, y_test = matrix[train_index], matrix[
            test_index], target[train_index], target[test_index]
        # ---------------- FEATURE SELECTION ------------------------

        rforest = RandomForestClassifier(random_state=101)
        rfe = RFE(estimator=rforest, n_features_to_select=nfeatures)
        rfe.fit(X_train, y_train)
        support = rfe.support_

        indexes = [j for j, selected in enumerate(support) if selected]

        x_train_fs = X_train[:, indexes]

        # --------------- TRAINING ------------------------------
        # Training the model
        best_svc.fit(x_train_fs, y_train)

        #--------------- TESTING -------------------------------
        # Getting the scores of the model on the test set
        svc_predictions = best_svc.predict(X_test[:, indexes])
        # getting accuracy
        scores.append(best_svc.score(X_test[:, indexes], y_test))
        # Macro
        parameters["Macro"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='macro'))
        # Micro
        parameters["Micro"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='micro'))
        # Weighted
        parameters["Weighted"].append(
            precision_recall_fscore_support(y_test,
                                            svc_predictions,
                                            average='weighted'))

        parameters["Features"].append(indexes)

        # getting confusion matrix
        confusion.append(confusion_matrix(y_test, svc_predictions))
    parameters["Scores"].append(scores)
    parameters["Average"] = np.average(scores)
    return (scores, confusion, parameters)
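A hypothetical call of the function above (synthetic data and parameter values are illustrative only, and the original file's imports are assumed):

from sklearn.datasets import make_classification

# Sketch: 10-fold CV with RFE down to 15 features and an RBF SVM.
matrix, target = make_classification(n_samples=200, n_features=30, random_state=0)
scores, confusion, parameters = train_svm_k_fold_RFE(matrix, target,
                                                     gamma=0.1, linear=False)
print("Mean accuracy:", parameters["Average"])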
Code example #55
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (cross_val_score, KFold, cross_validate,
                                     train_test_split)
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC, SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier

data = load_wine()
y = data.target
X = data.data
stc = StandardScaler()
lenc = LabelEncoder()
columns = data.feature_names
df = pd.DataFrame(data=np.hstack(tup=(X, y.reshape(-1, 1))),
                  columns=np.hstack(tup=(columns, ["Class"])))
X_std = stc.fit_transform(df[columns])
pipesvm = Pipeline([("stc", stc), ("selection", RFE(LinearSVC())),
                    ("svm", SVC(kernel="linear"))])
pipelda = Pipeline([("stc", stc), ("svm", LinearDiscriminantAnalysis())])
estimators = [("LDA", pipelda), ("SVM", pipesvm)]
# Using stacked classifiers pays off in multiclass problems, since it can
# noticeably improve class predictions by exploiting each estimator's
# predictive power for particular classes.
stacking_classifier = StackingClassifier(estimators=estimators,
                                         final_estimator=GaussianNB())
print("Stacking stimators")
print(
    cross_val_score(X=df[columns],
                    y=y,
                    estimator=stacking_classifier,
                    cv=KFold(5)))
print("Only SVM")
Code example #56
del X['target']
del X['id']
X.describe()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y.values.tolist())
label = le.transform(Y)
print(list(le.classes_))
print(label)
noOfFeature = 45
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import timeit
start = timeit.default_timer()
clf = RandomForestClassifier()
rfe = RFE(clf, n_features_to_select=noOfFeature)
fit = rfe.fit(X, label)
print("Time taken %.2f " % (timeit.default_timer() - start))
print(("Num Features: %d") % fit.n_features_)
print(("Selected Features: %s") % fit.support_)
print(("Feature Ranking: %s") % fit.ranking_)
features = [str(col) for col, selected in zip(X.columns, fit.support_) if selected]
print(features)
from sklearn.model_selection import cross_val_score
import timeit
from xgboost import XGBClassifier
from statistics import mean
train_csv = pd.read_csv('../input/train.csv')
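The notebook is cut after reloading train.csv; one plausible continuation (a sketch, not from the original) cross-validates an XGBoost model on the RFE-selected feature subset:

# Sketch: 5-fold cross-validation of XGBoost on the selected features only.
start = timeit.default_timer()
cv_scores = cross_val_score(XGBClassifier(), X[features], label, cv=5)
print("Mean CV accuracy: %.4f" % cv_scores.mean())
print("Time taken %.2f " % (timeit.default_timer() - start))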
Code example #57
File: main.py  Project: danvr/aceleradev-data-science
    X = fifa.drop('Overall', axis=1)
    y = fifa['Overall']

    lr_model = LinearRegression()

    rfe = RFE(lr_model, n_features_to_select=5)
    rfe.fit(X, y)

    mask = rfe.support_
    top_features = X.columns[mask]
    return list(top_features)


# In[117]:

q4()

# In[118]:

X = fifa.drop('Overall', axis=1)
y = fifa['Overall']
lr_model = LinearRegression()

rfe = RFE(lr_model, n_features_to_select=5)
rfe.fit(X, y)

plt.figure()
plt.title("Feature Importance")
pd.Series(rfe.estimator_.coef_,
          index=X.columns[rfe.support_]).sort_values().plot(kind='barh')
Code example #58
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.86
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1,
                                              max_depth=3,
                                              min_child_weight=16,
                                              n_estimators=100,
                                              nthread=1,
                                              subsample=0.25)),
    RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                       max_features=0.8,
                                       n_estimators=100),
        step=0.7000000000000001), GaussianNB())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
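An optional follow-up (a sketch, not part of the exported TPOT pipeline) checks the hold-out accuracy of the fitted pipeline:

from sklearn.metrics import accuracy_score

print("Hold-out accuracy: %.4f" % accuracy_score(testing_target, results))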
Code example #59
def logitRegression(data):

    # Feature Selection
    logistic = LogisticRegression()
    rfe = RFE(logistic, n_features_to_select=18)
    rfe = rfe.fit(inputs, winners)
    print(rfe.support_)
    print(rfe.ranking_)
    features = rfe.support_
    print("\nFeature index: " + str(np.where(features == True)))

    # creating testing and training set
    X_train, X_test, Y_train, Y_test = train_test_split(inputs,
                                                        winners,
                                                        test_size=0.33)

    # train scikit learn model
    clf = LogisticRegression()
    clf.fit(X_train, Y_train)
    score = round(clf.score(X_test, Y_test), 2)
    print('score Scikit learn: ', score)

    logistic.fit(inputs, winners)   # note: fits on the full data, so the test-set evaluation below is optimistic
    predicted = logistic.predict(X_test)
    print("Predicted: " + str(predicted))
    plt.figure()
    plt.plot(predicted)

    # Metrics: confusion matrix
    cm = metrics.confusion_matrix(Y_test, predicted)
    print(cm)

    # plot
    plt.figure(figsize=(2, 2))
    sns.heatmap(cm,
                annot=True,
                fmt=".3f",
                linewidths=.5,
                square=True,
                cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(score)
    plt.title(all_sample_title, size=15)
    plt.show()

    # cross validation
    from sklearn.model_selection import cross_val_score
    modelCV = LogisticRegression()
    scoring = 'accuracy'
    results = cross_val_score(modelCV, X_train, Y_train, cv=10, scoring=scoring)
    print("\n\n 10-fold cross validation average accuracy: %.3f" %
          (results.mean()))
    print("\n")

    # precision
    print(classification_report(Y_test, predicted))

    # ROC
    logit_roc_auc = roc_auc_score(Y_test, logistic.predict(X_test))
    fpr, tpr, thresholds = roc_curve(Y_test,
                                     logistic.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr,
             tpr,
             label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

    # train with selected features
    train_cols = [
        'Action 2', 'Action 9', 'Action 10', 'Action 11', 'Action 12',
        'Action 13', 'Action 14', 'Action 16', 'Action 18', 'Action 24',
        'Action 32', 'Action 41', 'Action 48', 'Action 53', 'Action 57',
        '2gram 10', '3gram 2', '3gram 9'
    ]
    X = data[train_cols]
    #print(X)
    y = data['Winner']
    logit_model = sm.Logit(y.astype(float), X.astype(float))
    result = logit_model.fit(method='bfgs')
    print(result.summary())
# In[ ]:

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

cols = [
    "Age", "Fare", "TravelAlone", "Pclass_1", "Pclass_2", "Embarked_C",
    "Embarked_S", "Sex_male", "IsMinor"
]
X = final_train[cols]
y = final_train['Survived']
# Build a logreg and compute the feature importances
model = LogisticRegression()
# create the RFE model and select 8 attributes
rfe = RFE(model, n_features_to_select=8)
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print('Selected features: %s' % list(X.columns[rfe.support_]))

# <a id="t4.1.2."></a>
# ### 4.1.2. Feature ranking with recursive feature elimination and cross-validation
#
# RFECV performs RFE in a cross-validation loop to find the optimal number of features. Below, recursive feature elimination is applied to a logistic regression, with the number of selected features tuned automatically by cross-validation.

# In[ ]:

from sklearn.feature_selection import RFECV
# Create the RFE object and compute a cross-validated score.
# The "accuracy" scoring is proportional to the number of correct classifications
rfecv = RFECV(estimator=LogisticRegression(),