Example No. 1
def remove_one_feature(X, Y, names):
   lr = LinearRegression()
   rfe = RFE(lr, n_features_to_select=1)
   rfe.fit(X,Y)
   rank = (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
   print(rank)
   return rank[-1][1]
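A minimal usage sketch for the helper above; the data, the made-up feature names, and the three-feature stopping point are illustrative, and it assumes X is passed as a pandas DataFrame whose columns line up with names (the snippet itself also needs LinearRegression and RFE imported):

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Illustrative data with made-up feature names.
X_arr, Y = make_regression(n_samples=200, n_features=6, noise=0.1, random_state=0)
names = ["f%d" % i for i in range(6)]
X = pd.DataFrame(X_arr, columns=names)

# Backward elimination: drop the weakest-ranked feature until three remain.
remaining = list(names)
while len(remaining) > 3:
    worst = remove_one_feature(X[remaining], Y, remaining)
    remaining.remove(worst)
print("kept:", remaining)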
Example No. 2
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
Example No. 3
 def doRFE(self, X, y):
     # do RFE
     self.numFeatures = X.shape[1]
     svc = SVC(kernel="linear", C=self.C)
     rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
     rfe.fit(X, y)
     self.feature_importances_ = self._getImportances(rfe.ranking_)
 def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
     expression_frame,ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(expression_file, ic50_file,drug,normalized=True,trimmed=True,threshold=None)
     scikit_data,scikit_target = dfm.get_scikit_data_and_target(expression_frame,ic50_series)
     step_length = int(len(scikit_data.tolist()[0]) / 100) + 1
     selector = RFE(self.model,int(target_features),step=step_length)
     selector.fit(scikit_data,scikit_target)
     return [expression_frame.index[i] for i in xrange(0,len(expression_frame.index)) if selector.support_[i]]
Example No. 5
def recursiveFeatureSelection():
    X = np.array(trainingData, dtype=float)
    y = np.array(trainingDataLabels, dtype=float)
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    print(rfe)
Example No. 6
def featSelect(label,trainSet,trainObs,cv,numFeat=5,SEED=34,name=''):
	from sklearn.feature_selection import RFE
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import roc_auc_score
	from numpy import zeros
	model = LogisticRegression(random_state=SEED)
	predCv = zeros(len(trainObs))
	rfe = RFE(model, numFeat, step=1)
	rfe.fit(trainSet,trainObs)
	vars = list(trainSet.columns[rfe.ranking_ == 1])
	auc = 0
	for i in range(1,max(rfe.ranking_)):
		for tr, vl in cv:
			model.fit(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += list(trainSet.columns[rfe.ranking_ == i])
	for v in vars:
		for tr, vl in cv:
			model.fit(trainSet[[x for x in vars if x != v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[[x for x in vars if x != v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars.remove(v)
	for v in [x for x in trainSet.columns if x not in vars]:
		for tr, vl in cv:
			model.fit(trainSet[vars + [v]].ix[tr],trainObs[tr])
			predCv[vl] = model.predict_proba(trainSet[vars + [v]].ix[vl])[:,1]
		if roc_auc_score(trainObs,predCv) > auc:
			auc = roc_auc_score(trainObs,predCv)
			vars += [v]
	print name,"Final AUC:  ",auc
	return {label: vars}
Example No. 7
def build_model(x,y,no_features):
    """
    Build a linear regression model
    """
    model = LinearRegression(normalize=True,fit_intercept=True)
    rfe_model = RFE(estimator=model,n_features_to_select=no_features)
    rfe_model.fit(x,y)
    return rfe_model    
def quick_rfe(estimator, X, y):

    rfe = RFE(estimator = estimator, n_features_to_select = 1)
    rfe.fit(X,y)

    features = X.columns.tolist()
    sorted_features = [f for (rank, f) in sorted(zip(rfe.ranking_, features))]

    return sorted_features, rfe.ranking_
Example No. 9
    def recurrciveFE(self, data):
        """
        Uses Recursive Feature Elimination to determine the right number of
        features before adding additional ones leads to overfitting.
        It works by recursively removing attributes and building a model on those
        attributes that remain. It uses the model accuracy to identify
        which attributes (and combinations of attributes) contribute the
        most to predicting the target attribute.

        Parameters
        ----------
        data : DataFrame
            Input data, for which categorical variables should be converted
            response should be in 0 column, predictors in additional

        Returns
        -------
        out : Plot
            A plot of the cross-validation score against the number of
            features; the optimal number of features is then used to
            determine the most important features, which are printed
            to the console.
          
        """
        features_list = data.columns.values[1::]
        predictors = np.asarray(data.values[:, 1::])
        response = np.asarray(data.values[:, 0])
        estimator = SVC(kernel="linear")
        
        ### use cross-validation to determine the number of features
        rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(response, 2), scoring='accuracy')
        rfecv.fit(predictors, response)
        print("Optimal number of features : %d" % rfecv.n_features_)
        
        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        plt.show()        
        
        ##label as optimal #of features
        noffeatures = rfecv.n_features_  
        
        ##use rfe to determine top features
        selector = RFE(estimator,noffeatures , step=1)
        selector = selector.fit(predictors, response)
        ## create an index to get the names
        index1 = np.where(selector.support_ == False)[0]
        index = np.argsort(selector.ranking_[index1])[::-1]
        feature_list_imp = features_list[index]

        for f in range(index.shape[0]):
            print("%d. feature %d (%s)" % (f + 1, index[f], feature_list_imp[index[f]]))
        print(selector.support_)
        print(selector.ranking_)    
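The snippet above was written against an older scikit-learn API (StratifiedKFold(response, 2), grid_scores_). For comparison, here is a minimal sketch of the same RFECV-then-RFE workflow against a recent scikit-learn (assumed >= 1.2, where grid_scores_ is replaced by cv_results_), on an illustrative dataset:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

predictors, response = load_breast_cancer(return_X_y=True)
estimator = SVC(kernel="linear")

# Cross-validated RFE picks the number of features.
rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring="accuracy")
rfecv.fit(predictors, response)
print("Optimal number of features : %d" % rfecv.n_features_)
print(rfecv.cv_results_["mean_test_score"])  # what grid_scores_ used to hold

# Plain RFE with that count, to read off the selected columns.
selector = RFE(estimator, n_features_to_select=rfecv.n_features_, step=1)
selector.fit(predictors, response)
print(np.flatnonzero(selector.support_))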
Example No. 10
File: c10.py Project: 3774257/abu
 def feature_selection(estimator, x, y):
     """
         Support ranking
     """
     selector = RFE(estimator)
     selector.fit(x, y)
     print('RFE selection')
     print(pd.DataFrame(
         {'support': selector.support_, 'ranking': selector.ranking_},
         index=pig_three_feature.columns[1:]))
Example No. 11
def feature_selection_RFE_draft(fn ,ax=None, sel="all", goal="Linebreak", isclass=True,
        verbosity=0, nf=7):
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print "names:", ",".join(names)
    
    # Create the RFE object and compute a cross-validated score.
    if isclass:
        #estimator = svm.SVC(kernel="linear",C=1.0)
        estimator = get_clf('svm')    
        scoring = 'f1'
        cv = cross_validation.StratifiedKFold(y, 2)
    else:
        if False:
            from sklearn.ensemble import RandomForestRegressor
            if not hasattr(RandomForestRegressor,'coef_'):
                RandomForestRegressor.coef_ = property(lambda self:self.feature_importances_)
            estimator = RandomForestRegressor(n_estimators=100, max_depth=2, min_samples_leaf=2)
        else:
            estimator = linear_model.RidgeCV()
        scoring = 'mean_squared_error'
        cv = 3

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    if True:
        rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)
    else:
        from kgml.rfecv import RFECVp
        f_estimator = get_clf('svm')
        rfecv = RFECVp(estimator=estimator,f_estimator=f_estimator, step=1, cv=cv, scoring=scoring)
        
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    ax.set_xlabel("Number of features selected")
    ax.set_ylabel("Cross validation score ({})".format(scoring))
    ax.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    #print("Optimal number of features : %d" % rfecv.n_features_)
    best = names[rfecv.ranking_==1]

    rfe = RFE(estimator, n_features_to_select=1)
    rfe.fit(X,y)
    ranks = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

    # reorder best using ranks
    best_set = set(best)
    best = [name for (i,name) in ranks if name in best_set]
    #print "The best features:", ', '.join(best)
    assert len(best) == len(best_set)

    return best, ranks
Example No. 12
def subtest(model, XL, YL, XT, YT, feature_names):
	nfeatures = XL.shape[1]
	rfe = RFE(model, nfeatures-1)
	print "BEFORE"
	model.fit(XL, YL)
	print_performance(YT, model.predict(XT))
	print "AFTER"
	rfe.fit(XL, YL)
	print_performance(YT, rfe.predict(XT))
	print "REMOVED FEATURE %s" % (feature_names[np.where(rfe.support_==False)[0][0]])
	print ""
	return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
Example No. 13
def test_rfe_2():
    """Ensure that the TPOT RFE outputs the same result as the sklearn rfe when num_features>no. of features in the dataframe """
    tpot_obj = TPOT()

    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    estimator = LinearSVC()
    rfe = RFE(estimator, 100, step=0.1)
    rfe.fit(training_features, training_classes)
    mask = rfe.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(training_testing_data[mask_cols], tpot_obj._rfe(training_testing_data, 64, 0.1))
    def get_patient_predictions_rfe(self,expression_file,ic50_file,patient_directory,target_features,drug):

        e_data,e_target,p_identifiers,p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(expression_file,ic50_file,patient_directory,1.0,drug)
        step_length = int(len(e_data.tolist()[0]) / 100) + 1

        model = RFE(self.model,target_features,step=step_length)

        model.fit(e_data,e_target)
        predictions = model.predict(p_data)

        all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(dfm.get_cell_line_expression_frame(expression_file),dfm.get_patients_expression_frame(patient_directory))[0]
        top_features = [all_features[i] for i in xrange(0,len(all_features)) if model.support_[i]]
        return p_identifiers, predictions, top_features
Example No. 15
File: ml.py Project: aboSamoor/NLP
 def show_most_informative_features(self, samples):
   X, y = self._fsets2dataset(samples)
   rfe = RFE(self._clf, 1)
   rfe.fit(X, y)
   ranking = rfe.ranking_
   if len(ranking) != len(self._fx):
     logging.error("Both feature ranking and features should have the same "
                    "length %d != %d", len(ranking), len(self._fx))
   fx_ranking = []
   for i in range(len(self._fx)):
     fx_ranking.append((ranking[i], self._fx[i]))
   self._clf.fit(X, y)
   return '\n'.join(['\t'.join([str(y),str(x)]) for x,y in sorted(fx_ranking)])
    def RecursiveFeatureElimination(self, nfeat=None, step=1, inplace=False):
        
        rfe = RFE(self.alg, n_features_to_select=nfeat, step=step)
        
        rfe.fit(self.data_train[self.predictors], self.data_train[self.target])
        
        ranks = pd.Series(rfe.ranking_, index=self.predictors)
        
        selected = ranks.loc[rfe.support_]

        if inplace:
            self.set_predictors(selected.index.tolist())
        
        return selected
Example No. 17
def rank_features_rfe(X, y, featureset):
    """Rank features by their importance using recursive feature elimination.

    :param X: A training set of features.
    :param y: A target set (aka class labels for the training set)
    :param featureset: An instance of a featureset (such as Basic9Extractor())
    :rtype: A list of (feature name, rank) tuples sorted by rank, with rank 1
    marking the most important features.
    """

    # FIXME: Use an RBF SVC to rank features. It is likely that the "importance"
    # rankings derived from a LinearSVC are similar to those from an RBF kernel SVM, but,
    # for safety's sake, it is best to assume they are not.

    classifier = LinearSVC()
    classifier.fit(X, y)

    ranker = RFE(classifier, 1, step=1)
    ranker = ranker.fit(X, y)

    # Get the names of the feature columns.
    # FIXME: Duplicate code from rank_features. Make this its own function.
    feat_importance = OrderedDict()
    for index, func in enumerate(featureset.features):
        feat_importance[func] = ranker.ranking_[index]

    return sorted(feat_importance.items(), key=lambda x: x[1])
Example No. 18
def get_best_cols(df):
    """ select best cols with RFE """

    # factors
    cols_to_factor = [
        pd.get_dummies(df.X7),
        pd.get_dummies(df.X8),
        pd.get_dummies(df.X9),
        pd.get_dummies(df.X11),
        pd.get_dummies(df.X12),
        pd.get_dummies(df.X14),
        pd.get_dummies(df.X12),
        pd.get_dummies(df.X14),
        pd.get_dummies(df.X32),
    ]
    # dataframe with factors blown out
    df_f = pd.concat(cols_to_factor, axis=1)
    # numerics
    RFE_col_list = ["X4", "X5", "X6", "X13", "X21", "X22", "X29", "X30", "X31"]
    # dataframe with numerics
    df_n = df.ix[:, RFE_col_list]
    X = np.asarray(df_n)
    X = StandardScaler().fit_transform(X)
    # add in factors
    X = np.concatenate([X, np.asarray(df_f)], axis=1)
    # leave y alone
    y = df.X1
    # I don't like to guess yes this is only linear relationships
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 40, step=2)
    selector = selector.fit(X, y)
    # make index for merged df, yes this whines
    df_index = df_n.columns + df_f.columns
    best_cols = df_index[selector.support_]
    return best_cols
Example No. 19
def LogReg(X_train, X_test, y_train, y_test, Min_N_Feat, Max_N_Feat, mask='None',weights='auto'):
#******************************************************************************

    from sklearn.feature_selection import RFE #import the library to rank features with recursive feature elimination
    from sklearn.linear_model import LogisticRegression as LogR #import the Logistic Regression module
    
    if mask=='None':
        mask = np.zeros((Max_N_Feat-Min_N_Feat+1,int(X_train.shape[1])),dtype='bool') #define the mask to obtain the list of selected features
    #end
    Pred_Train = np.zeros((int(max(y_train.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    Pred_Test = np.zeros((int(max(y_test.shape)),Max_N_Feat-Min_N_Feat+1),dtype='int') #define the matrix of outputs (each prediction set is stored in a different column)
    
    print 'Logistic Regression: Training...' #notify the user about the status of the process    
    for ift in range(Min_N_Feat,Max_N_Feat+1): #iterate across the maximum number of features    
        LogReg_obj = LogR(C=1e3, class_weight=weights) #create the logistic regression model
        if mask=='None':
            rfe = RFE(LogReg_obj, ift) #create the RFE model and select the number of attributes
            rfe = rfe.fit(X_train,y_train) #train the RFE (feature selection) model on the train data sets
            mask[ift-Min_N_Feat,:] = rfe.support_ #apply the best feature mask to the output mask
        #end
        LogReg_obj.fit(X_train[:,mask[ift-Min_N_Feat,:]], y_train) #fit the logistic model to the train data sets
        Pred_Train[:,ift-1] = LogReg_obj.predict(X_train[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the train dataset
        Pred_Test[:,ift-1] = LogReg_obj.predict(X_test[:,mask[ift-Min_N_Feat,:]]) #apply the logistic model to the test dataset
        print 'Logistic Regression: Predicting...', 100*ift/(Max_N_Feat-Min_N_Feat+1), '%' #notify the user about the status of the process 
    #end
        
    print 'Logistic Regression: Completed!' #notify the user about the status of the process
        
    return Pred_Train, Pred_Test, mask
Example No. 20
def ref(X, y, n_features_to_select=1, kernel='linear'):
    # specify the desired number of features
    # return the masks and ranking of selected features
    estimator = SVC(kernel=kernel, class_weight='balanced')
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(X, y)
    return (selector)
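A short usage sketch for the wrapper above (the iris data and the printed attributes are illustrative; only ref and its arguments come from the snippet):

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
selector = ref(X, y, n_features_to_select=2)
print(selector.support_)   # boolean mask of the selected features
print(selector.ranking_)   # selected features are assigned rank 1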
Example No. 21
def test_main():
    iris = load_iris()
    x, y = iris.data, iris.target
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, 2 , step=1)
    selector = selector.fit(x, y)
    print selector.support_
Example No. 22
def rank_features(clf, x_train, y_train, columns,step=1, numFeatures=1):
    """
    rank features with rfe
    :param clf: estimator
    :param x_train:
    :param y_train:
    :return: the fitted rfe object
    """

    print '========== rank_features ==========='
    rfe = RFE(estimator=clf, n_features_to_select=numFeatures, verbose=2, step=step)
    rfe.fit(x_train, y_train)

    pprint(np.array(columns)[rfe.ranking_-1])

    return rfe
Example No. 23
def select_features(X, y, random_state, kernel='linear', C=1.0, num_attributes=3):
    """
    Uses Support Vector Classifier as the estimator to rank features
    with Recursive Feature Elimination.

    Parameters
    ----------
    X: A pandas.DataFrame. Attributes.
    y: A pandas.DataFrame. Labels.
    random_state: A RandomState instance. Used in SVC().
    kernel: A string. Used in SVC(). Default: "linear".
    C: A float. Used in SVC(). Default: 1.0.
    num_attributes: An int. The number of features to select in RFE. Default: 3.

    Returns
    -------
    A 3-tuple of (RFE, np.ndarray, np.ndarray)
    model: An RFE instance.
    columns: Selected features.
    ranking: The feature ranking. Selected features are assigned rank 1.
    """

    rfe = RFE(svm.SVC(C, kernel, random_state=random_state), num_attributes)
    model = rfe.fit(X, y.values.ravel())
    columns = list()

    for idx, label in enumerate(X):
        if rfe.support_[idx]:
            columns.append(label)

    ranking = rfe.ranking_

    return model, columns, ranking
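A usage sketch for select_features above, assuming a scikit-learn version old enough that the positional SVC and RFE arguments in the snippet are still accepted; the iris DataFrame and labels here are illustrative:

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE

iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)   # attributes
y = pd.DataFrame(iris.target, columns=["species"])        # labels
model, columns, ranking = select_features(X, y, np.random.RandomState(0))
print(columns)   # names of the selected attributes (3 by default)
print(ranking)   # selected attributes are assigned rank 1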
def feature_sorting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
	rows = 0
	while rows_temp > 0:
		rows = rows + 1
		rows_temp = rows_temp - 1

	columns = 0
	while columns_temp > 0:
		columns = columns + 1
		columns_temp = columns_temp - 1

	features_values = [x for x in features_values_temp]
	prediction_values = [y for y in prediction_values_temp]

	rotated = convert_list_to_matrix(features_values, rows, columns)
	# print rotated.shape
	scores = np.array(prediction_values)

	threshold = float(threshold)

	estimator = SVR(kernel=kernel) # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

	selector = RFE(estimator, 0, step=1)
	selector = selector.fit(rotated, scores)
	features_used = [i for i, x in enumerate(selector.support_) if x == True] # i+1 b/c matlab starts indexing from 1

	return selector.ranking_.tolist()
Example No. 25
def select_features(X_train, y_train):
    threshold = 0.90
    vt = VarianceThreshold().fit(X_train)
    feat_var_threshold = X_train.columns[vt.variances_ > threshold * (1 - threshold)]
    # print(feat_var_threshold)
    # print(len(feat_var_threshold))

    # Random Forest feature importance
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    feature_imp = pd.DataFrame(model.feature_importances_, index=X_train.columns, columns=["importance"])
    # print(feature_imp)
    feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(35).index
    # print(feat_imp_20)

    X_minmax = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_train)
    X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, y_train)
    feature_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': X_scored.scores_
    })
    feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(35)['feature'].values
    # print(feat_scored_20)

    rfe = RFE(LogisticRegression(), 20)
    rfe.fit(X_train, y_train)
    feature_rfe_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': rfe.ranking_
    })
    feat_rfe_20 = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values
    # print(feat_rfe_20)

    features = np.hstack([
        feat_var_threshold,
        feat_imp_20,
        feat_scored_20,
        feat_rfe_20
    ])
    # print(features)
    # features = map(str, features)
    features = np.unique(features)
    # print('Final features set:\n')
    # for f in features:
    #     print("\t-{}".format(f))

    return features
Example No. 26
    def buildTree(self,depth):
        #Here, we define the parameters of our tree and use a feature selection algorithm (RFE) to pick out the strongest features.

        self.tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=depth, random_state=0)
        selector = RFE(self.tree, 2, step=1)
        selector = selector.fit(self.X_train, self.Y_train)
        selector.support_
        selector.ranking_
Example No. 27
 def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
   model = svm.LinearSVC(class_weight='auto')
   rfe = RFE(model, numFeatures)
   rfe = rfe.fit(self.instances[:,subset], self.classes)
   # summarize the selection of the attributes
   # print(rfe.get_support(indices=True))
   # print(rfe.ranking_)
   return rfe.get_support(indices=True)
Example No. 28
    def feature_selection(self, **kwargs):
        x, y = kwargs['x'], kwargs['y']
        fiter = self.get_fiter()

        selector = RFE(fiter)
        selector.fit(x, y)

        ZLog.info('RFE selection')
        ZLog.info(pd.DataFrame({'support': selector.support_, 'ranking': selector.ranking_},
                               index=self.df.columns[1:]))

        selector = RFECV(fiter, cv=3, scoring='mean_squared_error')
        selector.fit(x, y)
        ZLog.newline()
        ZLog.info('RFECV selection')
        ZLog.info(pd.DataFrame({'support': selector.support_, 'ranking': selector.ranking_},
                               index=self.df.columns[1:]))
Example No. 29
def rec_feature_elim(data,num_features=17700):
    X = data.get_gene_exp_matrix()
    y = data.get_labels()
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)
    mask = map(lambda x: 1 if x is True else 0,selector.support_)
    print_genes_nonzero_coeff(data,mask)
Example No. 30
def recursive_feature_elimination(X, y):
    model = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model, 3)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
Example No. 31
# selecting x and y variables

cr.shape

cr_x = cr.iloc[:, 0:11]

cr_y = cr.iloc[:, -1]

# feature selection using rfe

import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
svm = LinearSVC()
rfe = RFE(svm, 5)
rfe.fit(cr_x, cr_y)
rfe.transform(cr_x)
rfe.get_support()

imp_variables = pd.DataFrame({
    "Important": list(rfe.get_support()),
    "Feature_Name": list(cr_x.columns)
})
imp_variables

# feature selection using variance threshold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
Example No. 32
mean_squared_error(y_test, pred)
"""#### selecting feature for logistic regression"""

from sklearn.feature_selection import RFE

model_log = LogisticRegression()

selector = RFE(model_log, 3)

x = df_voice.loc[:, :'modindx']
y = df_voice['label']

x.shape

selector = selector.fit(x, y)

selector.support_
"""#### 6,9,13 using feature selection

#### selecting feature for svm
"""

model_svc = SVC(kernel="rbf")

selector2 = RFE(model_svc, 3)

x = df_voice.loc[:, :'modindx']
y = df_voice['label']

selector2 = selector2.fit(x, y)
 def TTest_mRMR_svmRFE_selector(originData):
     selectedFeatutesList = []
     label = originData['label']
     colNames = originData[originData.columns[2:8]].columns
     data = originData[originData.columns[2:8]].fillna(0)
     data = data.astype(np.float64)
     data = StandardScaler().fit_transform(data)
     # minmax_scale = preprocessing.MinMaxScaler().fit(data)
     # data = minmax_scale.transform(data)
     data = pd.DataFrame(data)
     data.columns = colNames
     data['label'] = label
     # balanced Data
     smo = SMOTE(random_state=3)
     X_smote, y_smote = smo.fit_sample(data, data['label'])
     for colName in X_smote.columns[0:-1]:
         # if 'DWI' in colName:
         if levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] > 0.05 and \
                 ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[
                     1] < 0.05:
             selectedFeatutesList.append(colName)
         elif levene(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName])[1] <= 0.05 and \
                 ttest_ind(X_smote[X_smote['label'] == 0][colName], X_smote[X_smote['label'] == 1][colName],
                           equal_var=False)[1] < 0.05:
             selectedFeatutesList.append(colName)
     if 'label' not in selectedFeatutesList:
         selectedFeatutesList = ['label'] + selectedFeatutesList
     # print(index)
     data1 = X_smote[X_smote['label'] == 0][selectedFeatutesList]
     data2 = X_smote[X_smote['label'] == 1][selectedFeatutesList]
     trainData = pd.concat([data1, data2])
     # trainData = shuffle(trainData)
      # trainData.index = range(len(trainData))  # re-number the index after shuffling
     X = trainData[trainData.columns[1:]]
     y = trainData['label']
     # print(X_Smote)
     # mRMR_features = pymrmr.mRMR(X_smote, 'MIQ', 15)
     # define MI_FS feature selection method
     feat_selector = mifs.MutualInformationFeatureSelector(method='JMIM')
     feat_selector.fit(X, y)
     # feat_selector._support_mask
     # feat_selector.ranking_
     # call transform() on X to filter it down to selected features
     # X_filtered = feat_selector.transform(X_smote)
     # X_filtered = pd.DataFrame(X_filtered)
     # print(feat_selector.ranking_)
     # if 'label' not in mRMR_features: mRMR_features = ['label'] + mRMR_features
     X_mRMR = X.loc[:, feat_selector._support_mask]
     colNames = X_mRMR.columns
     clf = LinearSVC()
     # featureNums = len(selectedFeatutesList)
     # print(featureNums)
     model = RFE(clf, n_features_to_select=len(feat_selector.ranking_))
     # print(y)
     # print(X_mRMR)
     model.fit(X_mRMR, y)
     feats = list(np.array(colNames)[model.support_])
     for featureNames in feats:
         print(featureNames)
     print(len(feats))
     X_RFE = X_mRMR[feats]
     return X_RFE, y
Example No. 34
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
extra_trees_model = ExtraTreesClassifier()
extra_trees_model.fit(X_train, y_train)
extra_trees_predicted = extra_trees_model.predict(X_test)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
rfe_model = RFE(model1, 3)
rfe_model = rfe_model.fit(X_train, y_train)
rfe_predicted = rfe_model.predict(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_model_predicted = logistic_model.predict(X_test)

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
gaussian_model = GaussianNB()
gaussian_model.fit(X_train, y_train)
gaussian_model_predicted = gaussian_model.predict(X_test)

from sklearn import metrics
Example No. 35
def multipleregress():

    #IDEAL IS FOR THE INDEPENDENT VARIABLE TO BE CORRELATED WITH THE DEPENDENT VARIABLE BUT NOT
    #WITH EACH OTHER
    #Select the Columns that ONLY Use NUMBERS
    dataTrain = pd.read_csv('./tmdb_5000_train.csv')
    dataTest = pd.read_csv('./tmdb_5000_test.csv')

    x_train = dataTrain[['budget', 'popularity',
                         'vote_count']].values.reshape(-1, 3)
    y_train = dataTrain['revenue']

    x_test = dataTest[['budget', 'popularity',
                       'vote_count']].values.reshape(-1, 3)
    y_test = dataTest['revenue']

    ols = LinearRegression()
    model = ols.fit(x_train, y_train)

    dataTrain = pd.read_csv('./tmdb_5000.csv',
                            usecols=[
                                'budget', 'popularity', 'runtime',
                                'vote_average', 'vote_count', 'IMDB', 'rotten',
                                'metaC', 'revenue'
                            ])
    dataTest = pd.read_csv('./tmdb_5000_test.csv',
                           usecols=[
                               'budget', 'popularity', 'runtime',
                               'vote_average', 'vote_count', 'IMDB', 'rotten',
                               'metaC', 'revenue'
                           ])

    names = dataTrain.columns
    array = dataTrain.values
    X = array[:, 0:8]
    Y = array[:, 2]
    # feature extraction
    model = LinearRegression()
    rfe = RFE(model, 4)
    fit = rfe.fit(X, Y)
    print(fit.n_features_)
    print(fit.support_)
    print(fit.ranking_)
    ranks = fit.support_
    fields = np.where(ranks == True)
    ranks = list()
    for ind in np.nditer(fields):
        ranks.append(names[ind])

    print(ranks)

    x_train = dataTrain[['rotten', 'IMDB',
                         'vote_average']].values.reshape(-1, 3)
    y_train = dataTrain['revenue']

    x_test = dataTest[['rotten', 'IMDB', 'vote_average']].values.reshape(-1, 3)
    y_test = dataTest['revenue']

    ols = LinearRegression()
    model = ols.fit(x_train, y_train)
    params = np.append(model.intercept_, model.coef_)

    predictions = model.predict(x_train)
    print(predictions)

    newX = pd.DataFrame({
        "Constant": np.ones(len(x_test))
    }).join(pd.DataFrame(x_test))
    MSE = (sum((y_train - predictions)**2)) / (len(newX) - len(newX.columns))

    # Note if you don't want to use a DataFrame replace the two lines above with
    # newX = np.append(np.ones((len(X),1)), X, axis=1)
    # MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))

    var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b

    p_values = [
        2 * (1 - stats.t.cdf(np.abs(i), (len(newX) - 1))) for i in ts_b
    ]

    sd_b = np.round(sd_b, 3)
    ts_b = np.round(ts_b, 3)
    p_values = np.round(p_values, 3)
    params = np.round(params, 4)

    myDF3 = pd.DataFrame()
    myDF3["Coefficients"], myDF3["Standard Errors"], myDF3["t values"], myDF3[
        "Probabilites"] = [params, sd_b, ts_b, p_values]
    print(myDF3)

    y_predicted = model.predict(x_train)

    plt.scatter(y_train, y_predicted)
    plt.plot(y_train, y_predicted, 'o')
    plt.show()
Example No. 36
selection.sort()
for i in selection[:20]:
    print(i)

### variable list from RFECV
selected_val0 = X_standardized_train.columns[selector.support_]
print(selected_val0)

# In[19]:

# further reduce by RFE
X_standardized_train1 = X_standardized_train[X_standardized_train.columns[
    selector.support_]]

selector1 = RFE(estimator, n_features_to_select=17, step=1)
selector1 = selector1.fit(X_standardized_train1, y_train)

selection1 = list(
    zip(selector1.ranking_, selector1.support_, X_standardized_train1.columns))
selection1.sort()
for i in selection1[:20]:
    print(i)

selected_val1 = X_standardized_train1.columns[selector1.support_]
print(selected_val1)

# In[201]:

### clean dataset for modeling

#selected_val = selected_val0
Example No. 37
def extractWavFeats():
    featureArr = []
    labelsArr = []
    for i in range(43):
        coughfeat = extract_features("Regen_coofs/cough" + str(i) + ".wav")
        nonCough = extract_features("Regen_coofs/neg" + str(i) + ".wav")
        featureArr.append(coughfeat)
        featureArr.append(nonCough)
        labelsArr.append(0)
        labelsArr.append(1)
    for i in range(30):
        nonCough = extract_features("Regen_coofs/neg" + str(i) + ".wav")
        featureArr.append(nonCough)
        labelsArr.append(1)
    return featureArr, labelsArr


#Optional Load
xArr = np.genfromtxt("x.csv", delimiter=",")
yArr = np.genfromtxt("y.csv", delimiter=",")
x_train, x_test, y_train, y_test = train_test_split(xArr[:],
                                                    yArr[:],
                                                    test_size=0.2,
                                                    random_state=42)
print("Done Splitting")

forest = rf(max_depth=10)
selector = RFE(forest, 10, 1)
fit = selector.fit(x_train, y_train)
print(fit.score(xArr[0:80], yArr[0:80]))
Example No. 38
# But as I have wrangled the data to provide a column that does acknowledge whether or not the user is adopted,
# I am more willing to fit a model on it and conduct feature elimination via that.
# My initial thoughts are to use RFE, or Decision Trees after thinking this through.
# The following features have been removed as they seem redundant in inclusion, or provide no empirical predictive value:
# 'object_id', 'name', and 'email'

# Data selection for model
X = user_df[[
    'creation_time',  #1
    'creation_source',  #3
    'last_session_creation_time',  #1
    'opted_in_to_mailing_list',  #4
    'enabled_for_marketing_drip',  #5
    'org_id',  #1
    'invited_by_user_id'  #2
]]
y = user_df['adopted']

# Decision Tree Model
clf = DecisionTreeClassifier(random_state=0)
# Recursive feature selection
estimator = clf
selector = RFE(estimator, 3, step=1)
selector = selector.fit(X, y)
# Ranking of features
print("Feature Ranking: ", selector.ranking_)
# [1 3 1 4 5 1 2]
# The features that seems to be most important are the 'creation_time', 'last_session_creation_time', and 'org_id'
# The feature that seemed to follow close behind was 'invited_by_user_id'
# These features seem to be the most important when predicting future adoption of users
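A small follow-up sketch that pairs each rank with its column name (it reuses X and selector from the snippet above), which makes the interpretation in the comments explicit:

# Pair every feature with its RFE rank and list them best (rank 1) first.
for rank, feature in sorted(zip(selector.ranking_, X.columns)):
    print(rank, feature)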
Example No. 39
    def fit(self, X, y):

        X_t = X.copy()
        y_t = y.copy()

        print('Filling Nans')
        if self.fill_nan:
            X_t = self.filler.fit_transform(X_t)

        print('Removing outliers')
        if self.contamination > 0:
            method = LocalOutlierFactor(n_neighbors=max(
                50, int(0.1 * X_t.shape[1])),
                                        contamination=self.contamination)
            outlier = method.fit_predict(X_t)
            indices = np.where(outlier == 1)
            X_t = X_t[indices, :][0, :, :]
            y_t = y_t[indices]

        if self.feature_selection == 'RFE':
            print('Removing features with zero variance')
            sel = VarianceThreshold()
            sel.fit(X_t)
            self.not_constant_features.extend(sel.get_support(indices=True))
            X_t = X_t[:, self.not_constant_features]

            print('Removing uniform features')
            if self.bootstrap:
                self.not_uniform_features.extend(
                    pd.read_csv('task1/results/not_uniform.csv',
                                ',').to_numpy().flatten())
            else:
                result = self.find_uniform_features(X_t, y_t)
                pd.DataFrame(result).to_csv('task1/results/not_uniform.csv',
                                            ',',
                                            index=False)
                self.not_uniform_features.extend(result)
            X_t = X_t[:, self.not_uniform_features]

            print('Removing highly correlated features')
            self.correlated_features.extend(
                self.find_correlated_features(0.9, 0.03, X_t, y_t))
            X_t = X_t[:, self.correlated_features]

            print('Running RFE')
            selector = RFE(estimator=self.model,
                           n_features_to_select=self.features_to_select,
                           step=100)
            selector = selector.fit(X_t, y_t)
            support = selector.get_support(indices=True)
            self.RFE_features.extend(support)
            X_t = X_t[:, self.RFE_features]

        print('Final training matrix shape is ' + str(X_t.shape))

        print('Scaling matrix')
        if self.scale:
            X_t = self.scaler.fit_transform(X_t)

        print('Fitting inner model')
        self.model.fit(X_t, y_t)
        print('Finished fitting')
        print()

        return self
def train_model(classifier):
    if (classifier == 'LR'):
        model = LogisticRegression(random_state=seed)
        model.fit(X_train, y_train)
        return model
    if (classifier == 'KNN'):
        print("\n  K TREINO  TESTE")
        print(" -- ------ ------")
        for k in range(1, 130, 2):
            model = KNeighborsClassifier(n_neighbors=k,
                                         weights='uniform',
                                         metric='minkowski',
                                         p=2)
            model = model.fit(X_train, y_train)
            y_resposta_treino = model.predict(X_train)
            y_resposta_teste = model.predict(X_test)
            acuracia_treino = sum(y_resposta_treino == y_train) / len(y_train)
            acuracia_teste = sum(y_resposta_teste == y_test) / len(y_test)
            print("%3d" % k, "%6.1f" % (100 * acuracia_treino),
                  "%6.1f" % (100 * acuracia_teste))
        return model
    if (classifier == 'SV'):
        model = SVC(kernel='linear', random_state=seed)  # kernel = 'rbf'
        model.fit(X_train, y_train)
        return model
    if (classifier == 'NB'):
        model = GaussianNB()
        model.fit(X_train, y_train)
        return model
    if (classifier == 'DT'):
        model = DecisionTreeClassifier(criterion='entropy', random_state=seed)
        model.fit(X_train, y_train)
        return model
    if (classifier == 'RF'):
        # Hyperparameters selected after the search:
        model = RandomForestClassifier(n_estimators=1600,
                                       min_samples_split=2,
                                       min_samples_leaf=4,
                                       max_features='sqrt',
                                       max_depth=10,
                                       bootstrap=True,
                                       random_state=seed)
        model.fit(X_train, y_train)
        print(model.feature_importances_)
        return model
    if (classifier == 'RG'):
        model = RidgeClassifier(alpha=1,
                                class_weight='balanced',
                                solver='auto')
        model.fit(X_train, y_train)
        return model
    if (classifier == 'GBC'):
        # Hyperparameters selected after the search:
        model = GradientBoostingClassifier(
            random_state=seed,
            n_estimators=200,
            min_samples_split=5,
            min_samples_leaf=1,
            max_features='sqrt',
            max_depth=10,
        )
        rfe = RFE(model)
        rfe = rfe.fit(X_train, y_train)
        return rfe
    if (classifier == 'MLP'):
        kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
        cvscores = []
        for treino, teste in kfold.split(X_train, y_train):
            model = tf.keras.models.Sequential()
            model.add(tf.keras.layers.Dense(units=20, activation='relu'))
            model.add(tf.keras.layers.Dense(units=10, activation='relu'))
            model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
            model.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])
            model.fit(X_train, y_train, batch_size=32, epochs=100, verbose=0)
            scores = model.evaluate(X_test, y_test, verbose=0)
            print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
            cvscores.append(scores[1] * 100)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
        model.summary()
        return model
Example No. 41
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(data, mark)
algorithm["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

#lasso
lassocv = LassoCV()
lassocv.fit(data, mark)
#print(lassocv.alpha_)
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(data, mark)
algorithm["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

#rfe
log = LogisticRegression()
rfe = RFE(log, n_features_to_select=10)
rfe.fit(data, mark)
algorithm["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)),
                                names,
                                order=-1)
'''
#F-test (f_classif)
f, pval = f_classif(data, mark)
algorithm["Corr"] = rank_to_dict(f, names)
'''
r = {}
for name in names:
    r[name] = round(
        np.mean([algorithm[method][name] for method in algorithm.keys()]), 4)
methods = sorted(algorithm.keys())
algorithm["Mean"] = r
methods.append("Mean")
Example No. 42
A recursive feature elimination example showing the relevance of pixels in
a digit classification task.

.. note::

    See also :ref:`example_feature_selection_plot_rfe_with_cross_validation.py`

"""
print(__doc__)

from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE

# Load the digits dataset
digits = load_digits()
X = digits.images.reshape((len(digits.images), -1))
y = digits.target

# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)

# Plot pixel ranking
import matplotlib.pyplot as plt
plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
Example No. 43
X_train = X_train_scaled

X_test_scaled = pd.DataFrame(sc_X.transform(X_test))
X_test_scaled.columns = X_test.columns.values
X_test_scaled.index = X_test.index.values
X_test = X_test_scaled


# Feature Selection by using Recursive Feature Elimination

# Model to Test
model = LogisticRegression(random_state = 0)

# Select Best X Features
rfe = RFE(model, 20)
rfe = rfe.fit(X_train, y_train)

# summarize the selection of the attributes
# selected features are assigned True value
rfe.support_
# selected features are assigned rank 1
rfe.ranking_

X_train.columns[rfe.support_]

# Correlation Matrix
sn.set(style="white")

# Compute the correlation matrix
corr = X_train[X_train.columns[rfe.support_]].corr()
Example No. 44
array = dataframe.values

#Split the data into input and target
#There are 73 features in Writeprints Dataset
X = array[:,0:73]

Y = array[:,73]

#np.random.seed(20)
#model = LogisticRegression(np.random.seed(20))
#model=pickle.load(open('model_feature_selection', 'rb'))
#pickle.dump(model, open('model_feature_selection', 'wb'))
model = LogisticRegression(random_state=20)
# create the RFE model and select 3 attributes
rfe = RFE(model, 50)
rfe = rfe.fit(X, Y)

rankings= list(rfe.ranking_)
#print rankings.count(1)
#print rfe.support_
# summarize the selection of the attributes

np.set_printoptions(precision=3)


selected_feature_names=[]



for i in range(0,len(rfe.support_)):
	if rfe.support_[i]==True:
Example No. 45
print(data.describe())

## plotting
import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot')

#data.boxplot()
#data.hist()
#data.groupby('class').hist()
#data.groupby('class').plas.hist(alpha=0.4)
from pandas.tools.plotting import scatter_matrix
#scatter_matrix(data, alpha=0.2, figsize=(16.0, 16.0), diagonal='kde')
#plt.savefig(r"scatter_matrix_pima.png")

# Recursive Feature Elimination
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load the iris datasets
dataset = datasets.load_iris()
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(dataset.data, dataset.target)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
Example No. 46
features = fit.transform(X)

# In[28]:

features[0:20, :]

# In[29]:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# In[30]:

model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
result = fit.transform(X)

print("Num Features:      ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking:   ", fit.ranking_)
print("\n\n\n", result[:20, :])

# In[31]:

from sklearn.decomposition import PCA

# In[32]:

pca = PCA(n_components=3)
# Applying models
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import sklearn.feature_selection
import matplotlib.pyplot as plt

model = ExtraTreesClassifier()
model2 = RandomForestClassifier()
model.fit(X_train_res, y_train_res)
model2.fit(X_train_res, y_train_res)

# Recursive Feature Elimination
# create the RFE model and select 4 attributes
from sklearn.feature_selection import RFE
rfe = RFE(model, 4)
rfe = rfe.fit(X_train_res, y_train_res)
rfe2 = RFE(model2, 4)
rfe2 = rfe2.fit(X_train_res, y_train_res)

# summarize the selection of the attributes
print("ForExtraTreesClassifier:By RFE")
print(rfe.support_)
print(rfe.ranking_)
print("RandomForestClassifier by RFE:")
print(rfe2.support_)
print(rfe2.ranking_)
print("ForExtraTreesClassifier by FE:")
print(model.feature_importances_)
print("RandomForestClassifier by FE:")
print(model2.feature_importances_
      )  #use inbuilt class feature_importances of tree based classifiers
    def run(self):
        loanfreature_df = pd.read_csv(
            processData(loginemail=self.loginemail,
                        loginpassword=self.loginpassword).output().path,
            low_memory=False,
            encoding='ISO-8859-1')
        Y = loanfreature_df.int_rate
        loanfreature_df.drop('int_rate', axis=1, inplace=True)
        cols_to_keep = [
            'loan_amnt', 'term', 'emp_length', 'home_ownership_category',
            'annual_inc', 'verification_status_category', 'purpose',
            'addr_state', 'dti', 'delinq_2yrs', 'last_meanfico',
            'inq_last_6mths', 'open_acc', 'revol_bal', 'revol_util',
            'total_acc', 'mths_since_last_major_derog', 'funded_amnt_inv',
            'installment', 'application_type', 'pub_rec', 'addr_state'
        ]
        loanfreature_df = loanfreature_df[cols_to_keep]
        loanfreature_df = createDummies(loanfreature_df)

        X = loanfreature_df._get_numeric_data()
        names = ["%s" % i for i in X]
        ranks = {}

        lr = LinearRegression(normalize=True)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            lr.fit(X, Y)
            ranks["Linear reg"] = rank_to_dict((lr.coef_), names)

        ridge = Ridge(alpha=7)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            ridge.fit(X, Y)
            ranks["Ridge"] = rank_to_dict((ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            lasso.fit(X, Y)
            ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        rlasso = RandomizedLasso(alpha=0.00)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            rlasso.fit(X, Y)
            ranks["Stability"] = rank_to_dict((rlasso.scores_), names)

        rf = RandomForestRegressor()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            rf.fit(X, Y)
            ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        # stop the search when 15 features are left (they will get equal scores)
        rfe = RFE(lr, n_features_to_select=15)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            rfe.fit(X, Y)
            ranks["RFE"] = rank_to_dict(rfe.ranking_, X.columns, order=-1)

        f, pval = f_regression(X, Y, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        r = {}
        for name in names:
            r[name] = round(
                np.mean([ranks[method][name] for method in ranks.keys()]), 2)
        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")

        #     f_rank = pd.DataFrame()
        print("\t%s" % "\t".join(methods))
        temp = "\t".join(methods)
        f = open("testing.txt", 'w')
        f.write(temp)
        f.write("\n")
        for name in names:
            temp = name + "\t" + " \t".join(
                map(str, [ranks[method][name] for method in methods]))
            f.write(temp)
            f.write("\n")
            print("%s\t%s" % (name, "\t".join(
                map(str, [ranks[method][name] for method in methods]))))
        f.close()
        feature = pd.read_csv('testing.txt', sep='\t')
        feature.to_csv(self.output().path)
Example No. 49
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
 @Time    : 2018/9/29 17:36
@Author  : LI Zhe
"""
import pandas as pd
from sklearn.svm import SVR
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

data_train = pd.read_csv('../data/new_train_feature.csv',
                         low_memory=False,
                         encoding='gbk')

train_x = data_train.iloc[:, :-1]
train_y = data_train.iloc[:, -1]

# Create the RFE object
svr = SVR(kernel="linear", C=1)
rfe = RFE(estimator=svr, n_features_to_select=20, step=1)
rfe.fit(train_x, train_y)
ranking = rfe.ranking_.reshape(train_x[0].shape)

plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
Example No. 50
series = pd.Series.from_csv('./dataset/monthly-car-sales-in-quebec-1960.csv',
                            header=0)
# difference to make the series stationary
diff = series.diff(12)[12:]

# autocorrelation and partial autocorrelation plots
plot_acf(diff)
plot_pacf(diff)

# create a set of lagged features
df = pd.DataFrame()
df['t'] = diff
for i in range(1, 13):
    df['t-{0}'.format(str(i))] = diff.shift(i)
df = df.iloc[12:, :]

# feature importances from a random forest
X = df.values[:, 1:]
y = df.values[:, 0]
model = rfr(500, random_state=1)
model.fit(X, y)
fi = model.feature_importances_
plt.bar(np.arange(1, fi.size + 1), fi)

# feature selection with RFE
rfe = RFE(rfr(500, random_state=1), 4)
fit = rfe.fit(X, y)
print(df.columns[1:][fit.support_])
plt.bar(np.arange(1, fit.support_.size + 1), fit.support_)
plt.bar(np.arange(1, fit.ranking_.size + 1), fit.ranking_)
Example No. 51
    data2 = data.sample(frac=1).reset_index(drop=True)    
    random.shuffle(headers2)
       
    # split the truth data and the descriptors
    y = data2['result']   
    finaldata = data2[headers2]

    F1Scores = pd.DataFrame(columns = ['F1'])  

    #custom recursive feature elimination      
    for nfeat in range(10, 100, 1):
        print('the number of features is ', nfeat)
        
        #RFE works over the entire dataset
        selector = RFE(estimator = logmodel, n_features_to_select = nfeat, step = 10)
        selector = selector.fit(finaldata, y)
        rfe_fits = selector.ranking_
        
        columnNames = finaldata.columns
        rankedColumnns_Raw = pd.DataFrame(data = {'Rank':selector.ranking_, 'Name':columnNames})
        
                
        #data to use
        evalData = list(rankedColumnns_Raw[rankedColumnns_Raw['Rank']==1]['Name'] )
            
        tempSelDesc = rankedColumnns_Raw.loc[rankedColumnns_Raw['Rank']==1].reset_index(drop = True).drop(labels = ['Rank'], axis = 1)
        tempSelDesc = tempSelDesc.rename(columns={'Name': nfeat})
        
        #add the data from this iteration to the existing data
        selectedDesc = pd.concat([selectedDesc,tempSelDesc], ignore_index = True, sort = False, axis = 1)
        
Example No. 52
X = df_final.loc[:, df_final.columns != 'y']
y = df_final.loc[:, df_final.columns == 'y']
#%% start over-sampling by importing SMOTE (Synthetic Minority Oversampling Technique)
from sklearn.model_selection import train_test_split
#train_test_split on predictors X and target Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns #columns is a list of the predictor labels
#%% Recursive feature selection for regression
from sklearn.linear_model import LogisticRegression as LR
df_final_vars=df_final.columns.values.tolist()
y=['y']
X=[i for i in df_final_vars if i not in y]
from sklearn.feature_selection import RFE
logreg = LR(solver='liblinear', max_iter=200)
rfe = RFE(logreg, 20)
rfe = rfe.fit(X_train, y_train.values.ravel())
#%%
rfe_result= pd.DataFrame(list(zip(X_train.columns.values,rfe.support_,rfe.ranking_)),columns=['predictor', 'yes', 'rank'])
rfe_selected=rfe_result[rfe_result['yes']==1].predictor
#%%
rfe_selected=[ele for ele in rfe_selected if ele not in {'marital_unknown', 'default_no', 'default_unknown', 'contact_cellular', 'contact_telephone', 'poutcome_failure', 'poutcome_success', 'poutcome_nonexistent'}]
X=X_train[rfe_selected]
y=y_train['y']
#%%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LR(solver='lbfgs', max_iter=200)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
#%% run Log regression
from sklearn.metrics import confusion_matrix
Example No. 53
dataset1 = dataset.drop(['Unnamed: 0'], axis=1)
trg = dataset1[['Y']]
trn = dataset1.drop(['Y'], axis=1)
Y = np.array(trg, dtype=np.float32)
X = np.array(trn, dtype=np.float32)
normalized_X = preprocessing.normalize(trn)
standardized_X = preprocessing.scale(trn)
model = ExtraTreesClassifier()
model.fit(trn, trg)
print(model.feature_importances_)

model = LinearRegression()
# create the RFE model and select 3 attributes
rfe = RFE(model, 3)
rfe = rfe.fit(trn, trg)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

model = LinearRegression()
model.fit(trn, trg)
print(model)
# make predictions
expected = trg
predicted = model.predict(trn)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

#models = [LinearRegression(),  # ordinary least squares
Example No. 54
    sub_out = np.unique(sub_labels)[it]
    # sub_out = np.unique(sub_labels)[np.random.randint(0, 20, 1)]

    train_inds = org_train_inds[np.logical_not(train_subs == sub_out)]
    test_inds = org_test_inds[np.logical_not(test_subs == sub_out)]

    best_pvalue = 1
    best_acc = 0

    # for k in [80]:
    for k in np.arange(0, 1000, 25)[1:]:
        print('-' * 80)
        print('k=%i' % k)

        selector = RFE(clf, n_features_to_select=k, step=0.5, verbose=00)
        selector = selector.fit(FS_mask[train_inds], labels[train_inds])

        clf.fit(FS_mask[train_inds][:, selector.support_], labels[train_inds])

        acc = clf.score(FS_mask[test_inds][:, selector.support_],
                        labels[test_inds])
        print('Total acc: %.3f' % acc)

        # dump
        meta_space = np.zeros(selector.support_.shape, dtype=np.float32)
        meta_space[selector.support_] = clf.coef_
        brain_coef_nii = meta_mask.inverse_transform(meta_space)
        brain_coef_nii.to_filename(
            'train_verbs_nomen_predict_hand_objects_0.54accuracy.nii.gz')

        meta_space[selector.support_] = clf.coef_
Example No. 55
x_validation = x_use.iloc[454:605, :]
y_validation = y[454:605]
x_test = x_use.iloc[605:757, :]
y_test = y[605:757]

###### normalization
scaler = preprocessing.StandardScaler().fit(x_train)
n_x_train = scaler.transform(x_train)
n_x_validation = scaler.transform(x_validation)
n_x_test = scaler.transform(x_test)

######rfe
svc = SVC(kernel="linear")
#model = LogisticRegression()  # set the algorithm to logistic regression
rfe = RFE(svc, n_features_to_select=100)  # select the 100 best features with RFE
selector = rfe.fit(n_x_train, y_train)  # run the recursive elimination
selector.support_
selector.ranking_
new_x_train = n_x_train[:, selector.support_]
new_x_validation = n_x_validation[:, selector.support_]
new_x_test = n_x_test[:, selector.support_]
new_x_train.shape
new_x_validation.shape
new_x_test.shape

rfe_columns = selector.support_

if name == 'all_subset':
    rfe_baseline = np.array(rfe_columns[1:23])
    rfe_time_frequency = np.array(rfe_columns[23:34])
    rfe_vocal_fold = np.array(rfe_columns[34:56])
Example No. 56



'''machine learning modeling'''

'''feature engineering (find the variables that give the max R2 accuracy score)'''
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

estimator = LinearRegression() #use regression model for regression problem
list_r2=[]
max_r2 = 0
for i in range(1,len(X_scaled.loc[0])+1):
    selector = RFE(estimator, i, step=1)
    selector = selector.fit(X_scaled, y_scaled)
    adj_r2 = 1 - ((len(X_scaled)-1)/(len(X_scaled)-i-1))*(1-selector.score(X_scaled, y_scaled))
    list_r2.append(adj_r2)# mse = 
    if max_r2 < adj_r2:
        sel_features = selector.support_
        max_r2 = adj_r2
       
X_sub = X_scaled.iloc[:,sel_features]
X_sub.columns.tolist() #selected features

#split training  and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_sub,y,random_state=0)


Example No. 57
clf = linear_model.Lasso(alpha=0.1)
res = clf.fit(train_features, train_labels)
score = res.score(test_features, test_labels)
print("LASSO regression has a score of {} out of sample".format(
    score.round(4)))

# #### let's be more strict about features - rank and remove

# In[72]:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
rfe = RFE(logreg, 20)
rfe = rfe.fit(features, labels.values.ravel())
print(rfe.support_)
print(rfe.ranking_)
rfe.score(test_features, test_labels)

# ### remove a few features according to rfe results

# In[73]:

labels = varSelection["defaulted"]
features = varSelection.drop(columns=[
    "defaulted", "loan_status", "application_type_JOINT", "home_ownership_ANY",
    "home_ownership_NONE"
])
features.info()
Example No. 58
from sklearn.feature_selection import RFE
from sklearn.svm import SVR 
import pandas as pd 
from sklearn.linear_model import LinearRegression
import numpy as np 

data = pd.read_csv("python-ml-course-master/datasets/ads/Advertising.csv")
features_cols = ["TV", "Radio", "Newspaper"]
x= data[features_cols]
y = data["Sales"]
estimator = SVR(kernel="linear")  # create a linear model
selector = RFE(estimator, 2, step=1)  # ask RFE (Recursive Feature Elimination) to keep 2 predictor variables
selector = selector.fit(x,y) 
print(selector.support_)
print(selector.ranking_)

X_pred = x[["TV","Radio"]] 
lm = LinearRegression()  # create the linear regression model
lm.fit(X_pred, y)  # fit the model to our data
print(lm.intercept_)  # alpha (intercept)
print(lm.coef_)  # betas (coefficients)
print(lm.score(X_pred, y))  # R2
Example No. 59
# Read contents of the file
dataframe = pandas.read_csv('https://modcom.co.ke/bigdata/datasets/pima.csv')
pandas.set_option('display.max_columns', 9)
print(dataframe)

array = dataframe.values
print(array)
X = array[:, 0:8]
print(X)
y = array[:, 8]
print(y)
# Identify features that won't be good predictors
from sklearn.feature_selection import RFE
rfc = RandomForestClassifier(n_estimators=40)
rfe = RFE(rfc, 5)
fitted = rfe.fit(X, y)
print('Selected columns: ', fitted.support_)
# Create a new dataset for the best predictors
subset = dataframe[([
    'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'
])]
print(subset)
# Obtain the values of the new dataset
subsetArray = subset.values
Xnew = subsetArray[:, 0:5]
print(Xnew)
# Establish the training and testing sets
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Xnew, y, test_size=0.10, random_state=7)
# Pick an algorithm
Example No. 60
Labels = URLS['Result']

Training_Data, Testing_Data = train_test_split(URLS_Without_Labels,
                                               test_size=0.25,
                                               random_state=150)

Training_Labels, Testing_Labels = train_test_split(Labels,
                                                   test_size=0.25,
                                                   random_state=150)

Model = LogisticRegression(random_state=0)

Rfe = RFE(Model, 15)

Fit = Rfe.fit(Training_Data, Training_Labels)

Prediction_Labels = Rfe.predict(Testing_Data)

New_Data = Rfe.transform(URLS_Without_Labels)

# print(label.shape)
df = pd.DataFrame(New_Data)
df.to_csv('RFElogreg.csv')

Confusion_Matrix = confusion_matrix(Testing_Labels, Prediction_Labels)

print("\nNumber Of Features: %d\n" % Fit.n_features_)
print("Selected Features: \n%s\n" % Fit.support_)
print("Feature Ranking: \n%s\n" % Fit.ranking_)
print("Training Accuracy Score Obtained is: {0:.2f}%".format(