def run(self, df, target_label):
    target = df[target_label]
    feature = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()
    # Report any columns containing NaN or inf before fitting.
    for col in feature.columns:
        values = feature[col].values
        if np.any(np.isnan(values)) or np.any(np.isinf(values)):
            print(list(values))
    try:
        clf.fit(feature.values, target.values)
    except Exception:
        # Dump every column to help diagnose the failed fit, then re-raise,
        # since clf.scores_ would not exist below.
        for col in feature.columns:
            print(list(feature[col].values))
        raise
    scores = {col: abs(clf.scores_[i]) for i, col in enumerate(feature.columns)}
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(scores)
    position = {col: i for i, (col, _) in enumerate(scores)}
    print(position)
    return position
def stability(X, Y):
    from sklearn.linear_model import RandomizedLogisticRegression

    clf = RandomizedLogisticRegression(random_state=1)
    clf.fit(X, Y)

    return clf.scores_
Example #3
def feature_select_RLR():
    data_x, data_y, names = get_data()
    rlr = RLR()
    rlr.fit(data_x, data_y)
    return sorted(zip(names, map(lambda x: round(x, 4), rlr.scores_)),
                  key=lambda x: x[1],
                  reverse=True)
Example #4
def lasso_regression(X, y):
    """
	Use Randomized Logistic Regression to select the features based on the coefficient values
	"""

    clf = RandomizedLogisticRegression(C=1.0)
    clf.fit(X, y)
    print('Number of non zero valued coefficients: ', np.sum(clf.scores_ > 0))
    imp_feature_idx = clf.scores_.argsort()  # ascending: lowest-scoring features first

    qualities = []

    X_train, X_test, y_train, y_test = split_examples(X, y)

    for i in range(0, 100, 4):  # drop the i lowest-scoring features each round
        clf = LogisticRegression(C=0.1)
        clf.fit(X_train[:, imp_feature_idx[i:]], y_train)
        q = roc_auc_score(
            y_test,
            clf.predict_proba(X_test[:, imp_feature_idx[i:]])[:, 1])

        qualities.append(q)
    plt.plot(range(0, 100, 4), qualities)
    plt.show()

    return qualities
Example #5
def randomized_Logistic_regression(self):
    X = self.data[:, 1:len(self.data[0])]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    a = randomized_logistic.get_support()
    selected = np.where(a)
    return selected  # indices of the selected features
Example #6
def Feature_sort(Feat_scale, Label, threads=4):  ## rank features with three feature selection methods

    ranks = {}
    ## Univariate feature selection
    Selector = SelectKBest(f_classif, k='all')
    Selector.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(Selector.pvalues_)

    ## RandomizedLogistic regression: a larger n_resampling gives more robust results
    ## From roughly the 1900th feature onward, the resulting ranking looks questionable.
    rlogreg = RandomizedLogisticRegression(n_jobs=1,
                                           n_resampling=2000,
                                           selection_threshold=0,
                                           verbose=False,
                                           random_state=0)
    ## Note: RandomizedLogisticRegression was deprecated in scikit-learn 0.19 and removed
    ## in 0.21, so this code needs scikit-learn <= 0.20 (fitting emits a DeprecationWarning).
    rlogreg.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(rlogreg.scores_))

    ## boruta based on randomforest n_jobs=**
    rf = RandomForestClassifier(random_state=0,
                                n_jobs=threads,
                                max_features='auto')
    feat_selector = BorutaPy(rf, n_estimators='auto', perc=80, random_state=0)
    feat_selector.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(feat_selector.ranking_)

    return (ranks)
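Because RandomizedLogisticRegression was removed in scikit-learn 0.21 (see the deprecation note above), here is a minimal hand-rolled sketch of the same stability-selection idea for newer scikit-learn versions. The helper name stability_scores is invented here, not a library API, and the sketch omits the per-feature penalty rescaling (the scaling parameter) that the original class also randomized.

# Minimal stability-selection sketch, assuming scikit-learn >= 0.21 where
# RandomizedLogisticRegression no longer exists. The selection frequency is
# approximated by refitting an L1-penalized LogisticRegression on random
# subsamples and counting how often each feature's coefficient is non-zero.
import numpy as np
from sklearn.linear_model import LogisticRegression

def stability_scores(X, y, n_resampling=200, sample_fraction=0.75,
                     C=1.0, random_state=0):
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        idx = rng.choice(n_samples, int(sample_fraction * n_samples),
                         replace=False)
        clf = LogisticRegression(penalty='l1', solver='liblinear', C=C)
        clf.fit(X[idx], y[idx])
        counts += (np.abs(clf.coef_).max(axis=0) > 1e-9)
    return counts / n_resampling  # analogous to rlogreg.scores_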
Example #7
def feature_selection(train, test, y):
    print("feature selection")
    clf = RLR(C=10, scaling=0.5, sample_fraction=0.6, n_resampling=200,
              selection_threshold=0.4, n_jobs=3)
    clf.fit(train, y)
    train = clf.transform(train)
    test = clf.transform(test)

    return train, test
Example #9
 def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
     rlr_model = RandomizedLogisticRegression(
         C=self.C,
         selection_threshold=selection_threshold,
         normalize=False,
         sample_fraction=sample_fraction)
     rlr_model.fit(self.data.values, self.target.values)
     return rlr_model
Example #10
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    Xtr, Ytr, Itr, Xte, Yte, Ite = get_test_train(labels, ids, Xs, 5)
    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True,
                                          n_resampling=1000,
                                          n_jobs=16)
    logreg.fit(Xtr, Ytr)
    scores = logreg.scores_
    return {vocab[i]: score for i, score in enumerate(scores)}
Example #11
def rank_features(algorithm, X, y):
    # `features` (the feature names) and R_SEED are assumed to be module-level globals.
    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50,
                                           random_state=R_SEED,
                                           n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for rank, feat in sorted(zip(selector.ranking_, features)):
            print(feat)
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)

        for rank, feat in sorted(zip(selector.ranking_, features)):
            print(feat)
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)

        for score, feat in sorted(zip(map(lambda s: round(s, 4), rlasso.scores_),
                                      features),
                                  reverse=True):
            print(feat)
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        #rlasso = RandomizedLasso(alpha=0.025, random_state=R_SEED)
        rlasso.fit(X, y)

        for score, feat in sorted(zip(map(lambda s: round(s, 4), rlasso.scores_),
                                      features),
                                  reverse=True):
            print(feat)
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))  # numpy's random, assumed imported at module level
        order = lexsort((random_array, pval))  # sort by p-value; ties broken randomly
        for i in order:
            print(features[i])
    else:
        print "Invalid algorithm: %s" % algorithm
        exit(1)
def get_features(X_train, y_train, names, selection_threshold=0.2):
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    randomized_logistic = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    randomized_logistic.fit(X_train, y_train)
    mask = randomized_logistic.get_support()
    features = np.array(names)[mask]
    print('found {} ngrams:'.format(len(features)))
    print(list(features))
    return features
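A hypothetical usage sketch for get_features follows; it assumes the feature matrix comes from a CountVectorizer and scikit-learn <= 0.20, and corpus / y_train are placeholder variables:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(corpus).toarray()  # densified to be safe
names = vectorizer.get_feature_names()
top_ngrams = get_features(X_train, y_train, names, selection_threshold=0.3)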
def get_support_fields(X, Y):
    '''
    Function for getting support fields
    '''
    # NOTE: `data` below is assumed to be a module-level DataFrame.
    rlr = RLR()  # build a randomized logistic regression model to screen variables
    rlr.fit(X, Y)  # train the model
    rlr.get_support()  # selection mask; per-feature scores are in .scores_
    print(rlr.scores_)
    print(u'Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
    X = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features
    return X
Example #14
def predictWithAdaBoost(config, X, Y, testFeatures):
    adaConfig = config.getConfig('model/adaboost')
    if adaConfig.get('useRandomLog', False):
        clf = RandomizedLogisticRegression()
        clf.fit(X, Y)
        X_new = clf.transform(X)
        if not X_new.size == 0:
            X = X_new
            testFeatures = clf.transform(testFeatures)
    clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R')
    clf.fit(X, Y)
    return clf.predict(testFeatures)
Example #15
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    # print "\t\t\tfilling nan values..."
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    # print "\t\t\tfitting LR model..."
    clf.fit(x_imp.T, y_v)
    feature_importances = DataFrame(clf.scores_, index=df_filtered.index, columns=['importance'])
    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances
def classify_logistic():
    print("logistic regression")
    (X_train, y_train), (X_test, y_test) = util.load_all_feat()
    print("original X_train shape", X_train.shape)
    # RandomizedLogisticRegression is a feature selector, not a classifier:
    # use it to pick features, then classify with a plain LogisticRegression.
    selector = RandomizedLogisticRegression(n_jobs=2)
    selector.fit(X_train, y_train)
    clf = LogisticRegression()
    clf.fit(selector.transform(X_train), y_train)
    pred = clf.predict(selector.transform(X_test))
    print("accuracy score:", accuracy_score(y_test, pred))
Example #17
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''

    logiRegObj = RandomizedLogisticRegression()
    logiRegObj.fit(allFeatureParam, allLabelParam)
    ### Output ###
    #print "Model score: ", logiRegObj.scores_
    eligible_indices = logiRegObj.get_support(indices=True)
    return eligible_indices
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the 
    RandomizedLogisticRegression algorithm.
    """
    
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.], 
                                       sample_fraction=0.7,
                                       n_resampling=200, selection_threshold=0.25,
                                       verbose=5, n_jobs=-1, random_state=0)                                   
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)
    
    return rlr.scores_
Example #19
def predictWithQDA(config, X, Y, testFeatures):
    qdaConfig = config.getConfig('model/qda')
    if qdaConfig.get('useRandomLog', False):
        clf = RandomizedLogisticRegression()
        clf.fit(X, Y)
        X_new = clf.transform(X)
        if not X_new.size == 0:
            X = X_new
            testFeatures = clf.transform(testFeatures)

    priors = qdaConfig.get('priors', None)
    clf = QDA(priors = priors)
    clf.fit(X, Y)
    return clf.predict(testFeatures)
Example #20
def test_rflasso():
    train_X, test_X, train_Y, test_Y = train_test_split(index_data,
                                                        index_lable,
                                                        test_size=0.25,
                                                        random_state=1)
    from sklearn.linear_model import RandomizedLogisticRegression
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
Example #21
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting
    the labels in y.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels

    Returns:
        feature_list: List of indices of the selected important features
    '''

    # Get the selection model (stability selection)
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)

    # Use a cross validated logistic regression to choose the importance
    # threshold at which a feature is included
    step_size = 50
    max_weight = int(max(selection_model.scores_)) + 1
    trial_thresholds = [
        i / step_size for i in range(1, max_weight * step_size + 1)
    ]
    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(selection_model.scores_)
            if score > trial
        ]
        if len(selected_features) > 0:
            X_reduced = X[:, selected_features]
            model = LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='newton-cg',
                                       random_state=0,
                                       max_iter=1000)
            scores = cross_val_score(model, X_reduced, y, cv=5)
            score = scores.mean()
            if score >= max_score:
                max_score = score
                threshold = trial  # trial is already on the scores_ scale

    return [
        i for i, score in enumerate(selection_model.scores_)
        if score > threshold
    ]
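A hypothetical usage sketch for select_features on synthetic data; it assumes scikit-learn <= 0.20, where RandomizedLogisticRegression still exists, and uses make_classification only to fabricate inputs:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=40, n_informative=5,
                           random_state=0)
kept = select_features(X, y)
print('selected feature indices:', kept)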
def learning_curves(X, y, clf, params, train_sizes=None, feature_selection=False, n_folds=3, scoring='accuracy'):
    """
    Builds learning curves on test set, with parameters chosen on train and validation set using nested cross validation
    :param X: data
    :param y: labels
    :param clf: classificator
    :param params: parameters for grid search
    :param train_sizes: train sizes for building learning curves
    :param feature_selection: whether to choose features by randomized logistic regression
    :param n_folds: number of outed cv folds
    :param scoring: scoring metric
    :return: train and test curve
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.5, 1.0, 5)

    kf = KFold(X.shape[0], n_folds=n_folds)

    train_curve = np.zeros_like(train_sizes)
    test_curve = np.zeros_like(train_sizes)

    for train_inds, test_inds in kf:
        train_data = X[train_inds]
        test_data = X[test_inds]
        train_labels = y[train_inds]
        test_labels = y[test_inds]

        if feature_selection:
            rlr = RandomizedLogisticRegression()
            rlr.fit(train_data, train_labels)

            inds = [i for i in range(X.shape[1]) if rlr.scores_[i] > 0.0]
            print(len(inds), 'features chosen')
            train_data = train_data[:, inds]
            test_data = test_data[:, inds]  # keep test features consistent with train

        gs = GridSearchCV(clf, params, scoring=scoring, cv=5)
        gs.fit(train_data, train_labels)
        bp = gs.best_params_
        print('chosen params: ', bp)

        for p in bp:
            setattr(clf, p, bp[p])
        lc = learning_curve(clf, test_data, test_labels, scoring=scoring, train_sizes=train_sizes)
        train_curve += lc[1].mean(axis=1)
        test_curve += lc[2].mean(axis=1)

    train_curve /= n_folds
    test_curve /= n_folds
    return train_curve, test_curve
Example #23
def stability_test(x, y, model, names, score_type):
    if score_type != "r2":
        rlasso = RandomizedLogisticRegression()
        rlasso.fit(x, y)
    else:
        rlasso = RandomizedLasso(alpha=0.025)
        rlasso.fit(x, y)
    if sum(rlasso.scores_) == 0:
        return [[0, el] for el in names]
    maxval = max(rlasso.scores_)
    minval = min(rlasso.scores_)
    dist = maxval - minval
    return list(
        zip(map(lambda x: round(x, 4), (rlasso.scores_ - minval) / dist),
            names))
Example #24
def feature_selection_class(predictors, responses, test_predictors,
                            selectFeatTech):
    if (selectFeatTech == 0):
        #t=int(predictors.shape[1]*0.40)
        t = 500  # no of features you want to select
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0),
                                           responses)
        #print model.scores_
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    if (selectFeatTech == 1):
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)

    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new,
                                  index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
def perform_stability_selection(X_train, y_train, round_id=0):
    # Defaults: RandomizedLasso(alpha='aic', scaling=0.5, sample_fraction=0.75, n_resampling=200, n_jobs=1)
    X_train = perform_scaling(X_train, scaling='minmax')

    print("Round%d - Stability selection -" % (round_id))

    # `header` is assumed to be a module-level list of column names.
    rlog = RandomizedLogisticRegression(random_state=30, n_jobs=3, n_resampling=400)
    rlog.fit(X_train, y_train)
    print("Randomized Logistic Feature_Importances: ", rlog.scores_)
    print("Randomized Logistic Feature_Importances: ",
          sorted(zip(map(lambda x: round(x, 5), rlog.scores_), header[1:]), reverse=True))
Example #26
def tipdm_chapter5_test():
    # parameter initialization
    filename = '../../../MyFile/chapter5/data/bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    # feature selection
    rlr = RLR()  # build a randomized logistic regression model to screen variables
    rlr.fit(x, y)  # train the model
    features = rlr.get_support()  # selection mask; per-feature scores are in .scores_
    print(u'Feature screening with randomized logistic regression finished.')
    print(u'Selected features: {0}'.format(','.join(data.columns[features])))
    x = data[data.columns[features]].as_matrix()  # keep the selected features

    # training and test
    lr = LR()  # build a logistic regression model
    lr.fit(x, y)  # train it on the selected feature data
    print(u'Logistic regression model trained.')
    print(u'Mean accuracy of the model: {0}'.format(lr.score(x, y)))
def programmer_1():
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)

    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    rlr = RLR()
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    support_col = data.drop('违约', axis=1).columns[rlr_support]

    print(
        "rlr_support_columns: {columns}".format(columns=','.join(support_col)))
    x = data[support_col].as_matrix()

    lr = LR()
    lr.fit(x, y)

    print("lr: {score}".format(score=lr.score(x, y)))
Example #29
    def data_proc(self):
        self.load_data()
        # iloc is purely position-based: the first slice selects rows, the second selects columns
        x = self.data.iloc[:, :8].as_matrix()
        y = self.data.iloc[:, 8].as_matrix()
        # first screen the attributes with a randomized logistic regression model
        rlr = RLR()
        rlr.fit(x, y)  # train the model
        rlr.get_support()  # selection mask; per-feature scores are in .scores_

        print("Selected features: %s" % ','.join(self.data.columns[rlr.get_support()]))
        x = self.data[self.data.columns[rlr.get_support()]].as_matrix()  # features after screening
        lr = LR(class_weight={
            0: 0.9,
            1: 0.1
        })  # class weights, useful when misclassification costs differ; class_weight='balanced' also works
        # sample_weight sets a per-row importance and must contain one value per training row
        lr.fit(x, y, sample_weight=[1, 2, 3, 5, 4])
        result = lr.predict([[24, 2, 2, 0, 28, 17.3, 1.79, 3.06]])
        print('Model accuracy: %s, prediction: %d' % (lr.score(x, y), result))
Example #30
def stable_select(df, y, rd_reg_columns, threshold=0.2, model='rlr'):
    X = df.loc[:, rd_reg_columns]
    Y = df[y]
    if model == 'rlr':
        rlr = RLR(scaling=0.5, sample_fraction=0.75, n_resampling=300, selection_threshold=threshold)  # randomized logistic regression
        rlr.fit(X, Y)
        scores = rlr.scores_
    elif model == 'rls':
        rls = RLS(scaling=0.5, sample_fraction=0.75, n_resampling=300, selection_threshold=threshold)  # randomized Lasso
        rls.fit(X, Y)
        scores = rls.scores_
    elif model == 'rfr':
        rf = RFR()
        rf.fit(X, Y)
        scores = rf.feature_importances_
    else:
        raise ValueError('unknown model: %s' % model)  # avoid leaving `scores` undefined
    result = pd.Series(dict(zip(X.columns, scores))).rename('score').sort_values(ascending=False)
    plt.figure(figsize=(20, 10))
    result.plot.barh(title='Feature Importances', color='lightblue')
    plt.ylabel('Feature Importance Score')
    return result
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    # Randomized logistic regression used as a feature selector
    randomlr = RandomizedLogisticRegression(C=regp, scaling=alpha, fit_intercept=True,
                                            sample_fraction=0.75, n_resampling=200)

    # Fit on the training data, then reduce all three splits to the selected features
    train_x = randomlr.fit_transform(train_x, train_y)
    cv_x = randomlr.transform(cv_x)
    test_x = randomlr.transform(test_x)

    return train_x, cv_x, test_x
Example #32
def pick_variables(self,
                   descover=True,
                   method="rlr",
                   threshold=0.25,
                   auto_pick=True):  # default threshold 0.25
    # variable-picking helper (feature selection)
    if method == "rlr":
        """
        Top-level feature selection algorithm:
        randomized logistic regression picks variables linearly related to y
        (stability selection). The selection algorithm (rlr) is run on
        different subsets of the data and of the features, and the results
        are aggregated into a final score per feature.
        Stability selection is a relatively recent method that combines
        subsampling with a selection algorithm (regression, SVM, or similar).
        The main idea is to run the selection algorithm repeatedly on
        different data and feature subsets and aggregate the results, e.g.
        by counting how often a feature is judged important (times selected
        divided by the number of rounds in which its subset was tested).
        Ideally, important features score close to 100%, weaker features get
        non-zero scores, and the most useless features score close to 0.
        RandomizedLogisticRegression()
        fit(X, y)	Fit the model using X, y as training data.
        fit_transform(X[, y])	Fit to data, then transform it.
        get_params([deep])	Get parameters for this estimator.
        get_support([indices])	Get a mask, or integer index, of the features selected
        inverse_transform(X)	Reverse the transformation operation
        set_params(**params)	Set the parameters of this estimator.
        transform(X)	Reduce X to the selected features.
        """
        rlr = RandomizedLogisticRegression(
            selection_threshold=threshold)  # randomized logistic regression
        rlr.fit(self.X_train, self.y_train)
        scoretable = pd.DataFrame(rlr.all_scores_,
                                  index=self.X_train.columns,
                                  columns=['var_score'])  # final per-feature scores
        columns_need = list(self.X_train.columns[rlr.get_support()])  # names of the selected features
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]
        columns_need.append("y")
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
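As a tiny numeric illustration of the selection-frequency idea described in the docstring above (the feature names and counts here are invented for illustration): a feature's score is the number of resampling rounds in which it was selected, divided by the number of rounds in which it was tested.

times_selected = {'age': 196, 'income': 158, 'zipcode': 4}  # made-up counts
n_resampling = 200
for name, k in times_selected.items():
    print(name, k / n_resampling)  # -> age 0.98, income 0.79, zipcode 0.02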
Example #33
def programmer_1():
    # parameter initialization
    filename = r'bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].as_matrix()  # reading with pandas means the label column needs no special handling
    y = data.iloc[:, 8].as_matrix()

    rlr = RLR()  # build a randomized logistic regression model for feature selection
    rlr.fit(x, y)  # train the model
    egeList = rlr.get_support()  # get the selected-feature mask
    egeList = np.append(
        egeList, False)  # append a False entry for the label column via np.append(array, ele)
    print("rlr.get_support():")
    print(egeList)
    print(u'Randomized logistic regression feature selection finished!')
    print(u'Selected features: %s' % ','.join(data.columns[egeList]))
    x = data[data.columns[egeList]].as_matrix()  # keep the selected features

    lr = LR()  # build a logistic regression model
    lr.fit(x, y)  # train on the selected features
    print(u'Logistic regression training finished!')
    print(u'Mean model accuracy: %s' % lr.score(x, y))  # 81.4% in this example
def rdlg_variables(X, y, threshold=0.25):  # default threshold 0.25
    """
    Randomized logistic regression picks variables linearly related to y (stability selection).
    The selection algorithm (rlr) is run on different data and feature subsets, and the
    results are aggregated into a final per-feature score.
    Stability selection is a relatively recent method combining subsampling with a selection
    algorithm (regression, SVM, or similar). The main idea is to repeatedly run the selection
    algorithm on different data and feature subsets and aggregate the results, e.g. by
    counting how often a feature is judged important (times selected divided by the number
    of rounds in which its subset was tested).
    Ideally, important features score close to 100%, weaker features score non-zero, and the
    most useless features score close to 0.
    Unlike Lasso, good features do not get a zero score merely because similar, correlated
    features exist. For feature selection tasks, stability selection is often among the
    best-performing methods across many datasets and settings.
    """

    rlr = RandomizedLogisticRegression(selection_threshold=threshold)  # randomized logistic regression
    rlr.fit(X, y)
    scoretable = pd.DataFrame(rlr.all_scores_, index=X.columns)  # final per-feature scores
    scoretable = scoretable.reset_index()
    scoretable = scoretable.rename(columns={'index': 'Col', 0: 'value_ratio'}, copy=False)
    df_score = scoretable[scoretable.value_ratio > threshold]  # drop features scoring below the threshold
    refesh_data = X[list(df_score['Col'])]

    return scoretable, refesh_data
def logistic(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.linear_model import RandomizedLogisticRegression as RLR
    # feature engineering
    rlr = RLR()
    rlr.fit(X_train, y_train)
    print(rlr.get_support())
    x = X_train[X_train.columns[rlr.get_support()]].as_matrix()
    x_test = X_test[X_test.columns[rlr.get_support()]].as_matrix()
    '''
    x=X_train
    x_test=X_test
    '''
    # logistic regression
    lr = LR()
    lr.fit(x, y_train)
    pred_prob_train = lr.predict_proba(x)
    pred_prob = lr.predict_proba(x_test)
    print('logistic')
    predicts = lr.predict(x_test)
    metrics_result(y_test, predicts)

    return pred_prob, pred_prob_train
Example #36
def logistic_regression():
    # parameter initialization
    filename = SRC_PATH + '/data/bankloan.xls'
    data = pd.read_excel(filename)
    print(data.head())
    print(data.tail())

    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    print(x, y)

    rlr = RLR()  # build a randomized logistic regression model to screen variables
    rlr.fit(x, y)  # train the model
    rlr.get_support()  # selection mask; per-feature scores are in .scores_
    print(u'Feature screening with randomized logistic regression finished.')

    # print(u'Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
    # x = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features

    lr = LR()  # build a logistic regression model
    lr.fit(x, y)  # train on the (screened) feature data
    print(u'Logistic regression model trained.')
    print(u'Mean accuracy of the model: %s' % lr.score(x, y))  # 81.4% in this example
Example #37
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40) would keep 40% of the features;
        # a fixed number is used here instead.
        t = 40
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    if selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
Example #38
def select_top_features(X, y, ques_dict, top_count=7):
    '''
    Run RandomizedLogisticRegression and return top number of features
    
    Args:
        X(dataframe) -- features
        y(dataframe) -- outcome
        ques_dict(dict) -- variable name to questions dictionary
        top_count(int) -- number of top features to return
    '''
    rand_log = RandomizedLogisticRegression()
    X_feat = rand_log.fit(X, y)
    questions = features_to_questions(X.columns, ques_dict)
    all_features = sorted(zip(questions, X_feat.scores_), key=lambda tup: tup[1], reverse=True)
    top_features = [f for f in all_features if f[1] > 0][:top_count]
    return top_features
Example #41
    def rank_random_logistic_regression(self,
                                        features_indep_df: PandasDataFrame,
                                        feature_target: List,
                                        n_jobs: int = -1,
                                        **kwargs: Any) -> object:
        """Use Randomized Logistic Regression to rank features.
        Attributes:
        model.scores_
        model.all_scores_

        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param n_jobs: number of CPUs to use during the resampling. If -1, use all the CPUs.
        :param kwargs: C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, tol=0.001,
        fit_intercept=True, verbose=False, normalize=True, random_state=None, pre_dispatch='3*n_jobs'
        :return: the importance ranking model.
        """
        self.__logger.debug("Run Random Logistic Regression.")
        classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
        return classifier.fit(features_indep_df, feature_target)
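A hypothetical call sketch, where ranker is an instance of the enclosing class and df / target are a feature DataFrame and label list (all names invented here):

model = ranker.rank_random_logistic_regression(
    df, target, n_jobs=-1, n_resampling=200, selection_threshold=0.25)
print(model.scores_)  # per-feature stability scores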
Example #42
#-*- coding:utf-8 -*-
# Peishichao
import pandas as pd

filename = '../data/bankloan.xls'
data = pd.read_excel(filename)

x = data.iloc[:, :8].as_matrix()

y = data.iloc[:, 8].as_matrix()

from sklearn.linear_model import LogisticRegression as LR

from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()

rlr.fit(x, y)

rlr.get_support()
print(rlr.get_support())
print('end')

#print('Feature: %s ' % ','.join(data.columns[rlr.get_support()]))

x = data[data.columns[rlr.get_support()]].as_matrix()
print(x)
lr = LR()
lr.fit(x, y)
print('end')
print('accuracy: %s' % lr.score(x, y))
# -*- coding:utf-8 -*-
# Logistic regression: automated modelling
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

data = pd.read_excel("c://mldata//bankloan.xls", header=0)
# x = data.iloc[:, :8].as_matrix()
# y = data.iloc[:, 8].as_matrix()   (either way of reading the data affects precision)
train_data = data.values  # convert the DataFrame to a matrix
train_x = train_data[0::, :8]
train_label = train_data[0::, 8]

rlr = RLR()  # build a randomized logistic regression model to screen variables
rlr.fit(train_x, train_label)  # train the model
rlr.get_support()  # get the feature selection result
print(u"Feature screening finished")
print(u"Selected features: %s" % u', '.join(data.columns[rlr.get_support()]))

x = data[data.columns[rlr.get_support()]].as_matrix()  # the selected features

lr = LR()
lr.fit(x, train_label)  # train on the selected feature data
print(u'Logistic regression training finished')
print(u'Mean accuracy of the model: %s' % lr.score(x, train_label))
Example #44
from __future__ import division
import numpy as np
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import LogisticRegression

X = np.load("../feats/train_formatted.npy")
y = np.load("../feats/train_y.npy")
X_test = np.load("../feats/test_formatted.npy")
y_test = np.load("../feats/test_y.npy")

clf = RandomizedLogisticRegression()
clf.fit(X, y) 
scores = clf.scores_
print('Index    :   score')
sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True)]
top = 30
for i in range(top):
    print(str(sortedIdx[i]) + ' :   ' + str(scores[sortedIdx[i]]))

lr = LogisticRegression()
lr.fit(clf.transform(X), y)
pred = lr.predict(clf.transform(X_test))
accuracy = sum(pred == y_test)/y_test.size
print('Logistic Regression Accuracy: ' + str(accuracy))

Example #45
def evaluate_model(model, X, y, labels, save_features=False, group_ablation=False, feature_output_name="features.csv", ticks=XTICKS):

    model_fs = RandomizedLogisticRegression(C=1, random_state=1)
    # Split into folds using labels
    # label_kfold = LabelKFold(labels, n_folds=10)
    group_kfold = GroupKFold(n_splits=10).split(X,y,groups=labels)
    folds  = []
    
    # For feature analysis
    feat_scores = []

    # For ablation study
    # Group ablation study
    feature_groups = feature_sets.get_all_groups()
    ablated = {key: set() for key in feature_groups.keys()}
    roc_ab  = {key: list() for key in feature_groups.keys()}
    roc_ab['true_roc_score'] = []

    for train_index, test_index in group_kfold:
        print "processing fold: %d" % (len(folds) + 1)

        # Split
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        scores   = []

        for k in ticks:  # honor the `ticks` parameter instead of the XTICKS global
            indices = util.get_top_pearson_features(X_train, y_train, k)

            # Select k features
            X_train_fs = X_train[:, indices]
            X_test_fs  = X_test[:, indices]

            model = model.fit(X_train_fs, y_train)
            # summarize the selection of the attributes
            yhat  = model.predict(X_test_fs)                  # Predict
            scores.append(f1_score(y_test, yhat))     # Save
            if group_ablation:
                true_roc_score = roc_auc_score(y_test, yhat)
                roc_ab['true_roc_score'].append(true_roc_score)

                for group in feature_groups.keys():
                    # Get group features
                    features     = feature_groups[group]
                    features_idx = util.get_column_index(features, X)

                    # Get indices
                    indices_ab      = [i for i in indices if i not in features_idx]
                    removed_indices = [i for i in indices if i in features_idx]

                    # Filter
                    X_train_ab = X_train[:, indices_ab]
                    X_test_ab  = X_test[:, indices_ab]

                    # Fit
                    model_ab = model.fit(X_train_ab, y_train)
                    # Predict
                    yhat_ab  = model_ab.predict(X_test_ab)

                    # Save
                    ablated[group].update(X.columns[removed_indices])
                    roc_ab_score = roc_auc_score(y_test, yhat_ab)
                    roc_ab[group].append(roc_ab_score - true_roc_score)

        # ----- save row -----
        folds.append(scores)
        # ----- save row -----

        # ----- save features -----
        if save_features:
            model_fs = model_fs.fit(X_train, y_train)
            feat_scores.append(model_fs.scores_)
        # --------------------

    if save_features:
        feat_scores = np.asarray(feat_scores)  # convert to np array
        feat_scores = feat_scores.mean(axis=0)  # average the scores across folds

        # Map the fold-averaged scores to feature names and sort by score,
        # with the feature name in the first position
        feat_scores = sorted(zip(X.columns, map(lambda x: round(x, 4), feat_scores)),
                             reverse=True, key=lambda x: x[1])
        feat_scores = pd.DataFrame(feat_scores)

        csv_path = "output/feature_scores/" + feature_output_name
        feat_scores.to_csv(csv_path, index=False)
        util.print_full(feat_scores)

    if group_ablation:
        roc_ab = pd.DataFrame(roc_ab).mean()
        print "======================="
        print "True AUC Score: %f" % roc_ab['true_roc_score']
        print "=======================\n\n"

        for group in ablated.keys():
            print "-----------------------"
            print "Group: %s " % group
            print "Removed: %s" % list(ablated[group])
            print "Change in AUC: %f" % (roc_ab[group])
            print "-----------------------\n"

    folds = np.asarray(folds)
    return folds
# Listing 5-1: logistic regression code

import pandas as pd
# parameter initialization
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()

# logistic regression model
from sklearn.linear_model import LogisticRegression as LR
# randomized logistic regression model
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# build the randomized logistic regression model to screen variables
rlr = RLR()
# train the model
rlr.fit(x, y)
# get the selection result; per-feature scores are available via .scores_
rlr.get_support()
print(u'Feature screening with randomized logistic regression finished.')
print(u'Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
# keep the selected features
x = data[data.columns[rlr.get_support()]].as_matrix()

# build the logistic regression model
lr = LR()
# train it on the selected feature data
lr.fit(x, y)
print(u'Logistic regression model trained.')
# mean accuracy of the model, 81.4% in this example
print(u'Mean accuracy of the model: %s' % lr.score(x, y))
Example #47
import pandas as pda
fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)
# DataFrame.as_matrix: convert the frame to its NumPy-array representation
# DataFrame.iloc: purely integer-location based indexing for selection by position
x = dataf.iloc[:, 1:4].as_matrix()
y = dataf.iloc[:, 0:1].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
r1 = RLR()
r1.fit(x, y)
eff = r1.get_support()  # find the effective features, remove noneffective ones
# print(dataf.columns[1:4][eff])
# the support mask covers only the feature columns, so index into columns[1:4]
t = dataf[dataf.columns[1:4][r1.get_support()]].as_matrix()
r2 = LR()
r2.fit(t, y)
print("training ends")
# score() returns the mean accuracy on the given data and labels;
# it must be called with the same selected features the model was trained on
print("accuracy: " + str(r2.score(t, y)))
Example #48
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression
import numpy as np
import fmriUtils as fm  # custom helper functions

n_folds = 10

f = fm.outTo()  # redirect output to a file
X, y = fm.loadData2()
X2, y2 = fm.loadData2()

y = fm.defineClass(y)

randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
randomized_logistic.fit(X, y)
XX = randomized_logistic.transform(X)
print("============ features remaining after selection ============")
print(XX.shape)

yy = y
cv = StratifiedKFold(yy, n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    cv_scores.append(np.sum(prediction == yy[test]) / float(np.size(yy[test])))

print("======== classification accuracy ========")
print(cv_scores, np.mean(cv_scores))
# -*- coding: utf-8 -*-
# Logistic regression, automated modelling
import pandas as pd

# parameter initialization
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()  # build a randomized logistic regression model to screen variables
rlr.fit(x, y)  # train the model
rlr.get_support()  # selection mask; per-feature scores are in .scores_
print(u'Feature screening with randomized logistic regression finished.')
print(u'Selected features: %s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features

lr = LR()  # build a logistic regression model
lr.fit(x, y)  # train on the selected feature data
print(u'Logistic regression model trained.')
print(u'Mean accuracy of the model: %s' % lr.score(x, y))  # 81.4% in this example